From 6e42e3146cfb91ba395890a0d46cdd68f8ff8fd8 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 12 Sep 2018 12:10:22 +0000 Subject: [PATCH] [AArch64] Implement aarch64_vector_pcs codegen support. This patch adds codegen support for the saving/restoring V8-V23 for functions specified with the aarch64_vector_pcs calling convention attribute, as added in patch D51477. Reviewers: t.p.northover, gberry, thegameg, rengolin, javed.absar, MatzeB Reviewed By: thegameg Differential Revision: https://reviews.llvm.org/D51479 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@342049 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64CallingConvention.td | 8 + lib/Target/AArch64/AArch64FrameLowering.cpp | 119 ++++++++---- lib/Target/AArch64/AArch64RegisterInfo.cpp | 6 +- test/CodeGen/AArch64/aarch64-vector-pcs.mir | 253 +++++++++++++++++++++++++ 4 files changed, 345 insertions(+), 41 deletions(-) create mode 100644 test/CodeGen/AArch64/aarch64-vector-pcs.mir diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 30492003df1..91fe3f237af 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -288,6 +288,12 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, D8, D9, D10, D11, D12, D13, D14, D15)>; +// AArch64 PCS for vector functions (VPCS) +// must (additionally) preserve full Q8-Q23 registers +def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, + X23, X24, X25, X26, X27, X28, + (sequence "Q%u", 8, 23))>; + // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since // 'this' and the pointer return value are both passed in X0 in these cases, // this can be partially modelled by treating X0 as a callee-saved register; @@ -362,5 +368,7 @@ def CSR_AArch64_AAPCS_SwiftError_SCS : CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>; def CSR_AArch64_RT_MostRegs_SCS : 
CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>; +def CSR_AArch64_AAVPCS_SCS + : CalleeSavedRegs<(add CSR_AArch64_AAVPCS, X18)>; def CSR_AArch64_AAPCS_SCS : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>; diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 56e659056c4..40efcbe5278 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -461,12 +461,19 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( NewOpc = AArch64::STPDpre; Scale = 8; break; + case AArch64::STPQi: + NewOpc = AArch64::STPQpre; + Scale = 16; + break; case AArch64::STRXui: NewOpc = AArch64::STRXpre; break; case AArch64::STRDui: NewOpc = AArch64::STRDpre; break; + case AArch64::STRQui: + NewOpc = AArch64::STRQpre; + break; case AArch64::LDPXi: NewOpc = AArch64::LDPXpost; Scale = 8; @@ -475,12 +482,19 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( NewOpc = AArch64::LDPDpost; Scale = 8; break; + case AArch64::LDPQi: + NewOpc = AArch64::LDPQpost; + Scale = 16; + break; case AArch64::LDRXui: NewOpc = AArch64::LDRXpost; break; case AArch64::LDRDui: NewOpc = AArch64::LDRDpost; break; + case AArch64::LDRQui: + NewOpc = AArch64::LDRQpost; + break; } MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); @@ -531,6 +545,12 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, case AArch64::LDRDui: Scale = 8; break; + case AArch64::STPQi: + case AArch64::STRQui: + case AArch64::LDPQi: + case AArch64::LDRQui: + Scale = 16; + break; default: llvm_unreachable("Unexpected callee-save save/restore opcode!"); } @@ -541,7 +561,7 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, // Last operand is immediate offset that needs fixing. MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx); // All generated opcodes have scaled offsets. 
- assert(LocalStackSize % 8 == 0); + assert(LocalStackSize % Scale == 0); OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale); } @@ -1208,7 +1228,7 @@ struct RegPairInfo { unsigned Reg2 = AArch64::NoRegister; int FrameIdx; int Offset; - enum RegType { GPR, FPR64 } Type; + enum RegType { GPR, FPR64, FPR128 } Type; RegPairInfo() = default; @@ -1246,6 +1266,8 @@ static void computeCalleeSaveRegisterPairs( RPI.Type = RegPairInfo::GPR; else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) RPI.Type = RegPairInfo::FPR64; + else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) + RPI.Type = RegPairInfo::FPR128; else llvm_unreachable("Unsupported register class."); @@ -1261,6 +1283,10 @@ static void computeCalleeSaveRegisterPairs( if (AArch64::FPR64RegClass.contains(NextReg)) RPI.Reg2 = NextReg; break; + case RegPairInfo::FPR128: + if (AArch64::FPR128RegClass.contains(NextReg)) + RPI.Reg2 = NextReg; + break; } } @@ -1294,17 +1320,21 @@ static void computeCalleeSaveRegisterPairs( RPI.FrameIdx = CSI[i].getFrameIdx(); - if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) { - // Round up size of non-pair to pair size if we need to pad the - // callee-save area to ensure 16-byte alignment. - Offset -= 16; + int Scale = RPI.Type == RegPairInfo::FPR128 ? 16 : 8; + Offset -= RPI.isPaired() ? 2 * Scale : Scale; + + // Round up size of non-pair to pair size if we need to pad the + // callee-save area to ensure 16-byte alignment. + if (AFI->hasCalleeSaveStackFreeSpace() && + RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) { + Offset -= 8; + assert(Offset % 16 == 0); assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16); MFI.setObjectAlignment(RPI.FrameIdx, 16); - AFI->setCalleeSaveStackHasFreeSpace(true); - } else - Offset -= RPI.isPaired() ? 
16 : 8; - assert(Offset % 8 == 0); - RPI.Offset = Offset / 8; + } + + assert(Offset % Scale == 0); + RPI.Offset = Offset / Scale; assert((RPI.Offset >= -64 && RPI.Offset <= 63) && "Offset out of bounds for LDP/STP immediate"); @@ -1370,6 +1400,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( Size = 8; Align = 8; break; + case RegPairInfo::FPR128: + StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; + Size = 16; + Align = 16; + break; } LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); @@ -1441,6 +1476,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( Size = 8; Align = 8; break; + case RegPairInfo::FPR128: + LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui; + Size = 16; + Align = 16; + break; } LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); @@ -1507,24 +1547,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, ? RegInfo->getBaseRegister() : (unsigned)AArch64::NoRegister; - unsigned SpillEstimate = SavedRegs.count(); - for (unsigned i = 0; CSRegs[i]; ++i) { - unsigned Reg = CSRegs[i]; - unsigned PairedReg = CSRegs[i ^ 1]; - if (Reg == BasePointerReg) - SpillEstimate++; - if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) - SpillEstimate++; - } - SpillEstimate += 2; // Conservatively include FP+LR in the estimate - unsigned StackEstimate = MFI.estimateStackSize(MF) + 8 * SpillEstimate; - - // The frame record needs to be created by saving the appropriate registers - if (hasFP(MF) || windowsRequiresStackProbe(MF, StackEstimate)) { - SavedRegs.set(AArch64::FP); - SavedRegs.set(AArch64::LR); - } - unsigned ExtraCSSpill = 0; // Figure out which callee-saved registers to save/restore. 
for (unsigned i = 0; CSRegs[i]; ++i) { @@ -1548,7 +1570,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // MachO's compact unwind format relies on all registers being stored in // pairs. // FIXME: the usual format is actually better if unwinding isn't needed. - if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) { + if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister && + !SavedRegs.test(PairedReg)) { SavedRegs.set(PairedReg); if (AArch64::GPR64RegClass.contains(PairedReg) && !RegInfo->isReservedReg(MF, PairedReg)) @@ -1556,6 +1579,24 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, } } + // Calculates the callee saved stack size. + unsigned CSStackSize = 0; + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (unsigned Reg : SavedRegs.set_bits()) + CSStackSize += TRI->getRegSizeInBits(Reg, MRI) / 8; + + // Save number of saved regs, so we can easily update CSStackSize later. + unsigned NumSavedRegs = SavedRegs.count(); + + // The frame record needs to be created by saving the appropriate registers + unsigned EstimatedStackSize = MFI.estimateStackSize(MF); + if (hasFP(MF) || + windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) { + SavedRegs.set(AArch64::FP); + SavedRegs.set(AArch64::LR); + } + LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; for (unsigned Reg : SavedRegs.set_bits()) dbgs() @@ -1563,15 +1604,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, dbgs() << "\n";); // If any callee-saved registers are used, the frame cannot be eliminated. - unsigned NumRegsSpilled = SavedRegs.count(); - bool CanEliminateFrame = NumRegsSpilled == 0; + bool CanEliminateFrame = SavedRegs.count() == 0; // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. 
- unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled; - LLVM_DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); - bool BigStack = (CFSize > EstimatedStackSizeLimit); + bool BigStack = (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) AFI->setHasStackFrame(true); @@ -1592,7 +1630,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, if (produceCompactUnwindFrame(MF)) SavedRegs.set(UnspilledCSGPRPaired); ExtraCSSpill = UnspilledCSGPRPaired; - NumRegsSpilled = SavedRegs.count(); } // If we didn't find an extra callee-saved register to spill, create @@ -1609,9 +1646,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, } } + // Adding the size of additional 64bit GPR saves. + CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs); + unsigned AlignedCSStackSize = alignTo(CSStackSize, 16); + LLVM_DEBUG(dbgs() << "Estimated stack frame size: " + << EstimatedStackSize + AlignedCSStackSize + << " bytes.\n"); + // Round up to register pair alignment to avoid additional SP adjustment // instructions. 
- AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16)); + AFI->setCalleeSavedStackSize(AlignedCSStackSize); + AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize); } bool AArch64FrameLowering::enableStackSlotScavenging( diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index ec1925e06f8..fdadcefc1f1 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -50,8 +50,7 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall) - // FIXME: default to AAPCS until we add full support. - return CSR_AArch64_AAPCS_SaveList; + return CSR_AArch64_AAVPCS_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS) return MF->getInfo()->isSplitCSR() ? CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : @@ -102,8 +101,7 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return SCS ? CSR_AArch64_CXX_TLS_Darwin_SCS_RegMask : CSR_AArch64_CXX_TLS_Darwin_RegMask; if (CC == CallingConv::AArch64_VectorCall) - // FIXME: default to AAPCS until we add full support. - return SCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask; + return SCS ? 
CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask; if (MF.getSubtarget().getTargetLowering() ->supportSwiftError() && MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) diff --git a/test/CodeGen/AArch64/aarch64-vector-pcs.mir b/test/CodeGen/AArch64/aarch64-vector-pcs.mir new file mode 100644 index 00000000000..276a726363a --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-vector-pcs.mir @@ -0,0 +1,253 @@ +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s + +# The tests below test the allocation of 128bit callee-saves +# on the stack, specifically their offsets. + +# Padding of GPR64-registers is needed to ensure 16 byte alignment of +# the stack pointer after the GPR64/FPR64 block (which is also needed +# for the FPR128 saves when present). + +# This file also tests whether an emergency stack slot is allocated +# when the stack frame is over a given size, caused by a series of +# FPR128 saves. The alignment can leave a gap that stack slot scavenging can +# reuse, so it is important that the stack size +# is properly estimated. + + +--- | + + ; ModuleID = '<stdin>' + source_filename = "" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-unknown-linux-gnu" + + ; Function Attrs: nounwind + define aarch64_vector_pcs void @test_q10_q11_x19() nounwind { entry: unreachable } + + ; Function Attrs: nounwind + define aarch64_vector_pcs void @test_q10_q11_x19_x20() nounwind { entry: unreachable } + + ; Function Attrs: nounwind + define aarch64_vector_pcs void @test_q10_q11_x19_x20_x21() nounwind { entry: unreachable } + + ; Function Attrs: nounwind + define aarch64_vector_pcs void @test_q8_to_q23_x19_to_x30() nounwind { entry: unreachable } + + ; Function Attrs: nounwind + define aarch64_vector_pcs void @test_q8_to_q23_x19_to_x30_preinc() nounwind { entry: unreachable } + +...
+--- +name: test_q10_q11_x19 +tracksRegLiveness: true +body: | + bb.0.entry: + $x19 = IMPLICIT_DEF + $q10 = IMPLICIT_DEF + $q11 = IMPLICIT_DEF + + ; Check that the alignment gap for the 8-byte x19 is padded + ; with another 8 bytes. The CSR region will look like this: + ; +-------------------+ + ; |/////padding///////| (8 bytes) + ; | X19 | (8 bytes) + ; +-------------------+ <- SP -16 + ; | Q10, Q11 | (32 bytes) + ; +-------------------+ <- SP -48 + + ; CHECK-LABEL: test_q10_q11_x19{{[[:space:]]}} + ; CHECK-DAG: $sp = frame-setup STPQpre killed $q11, killed $q10, $sp, -3 :: (store 16 into %stack.[[Q11:[0-9]+]]), (store 16 into %stack.[[Q10:[0-9]+]]) + ; CHECK-DAG: - { id: [[Q11]], {{.*}}, offset: -48, size: 16, alignment: 16 + ; CHECK-DAG: - { id: [[Q10]], {{.*}}, offset: -32, size: 16, alignment: 16 + ; CHECK-DAG: frame-setup STRXui killed $x19, $sp, 4 :: (store 8 into %stack.[[X19:[0-9]+]]) + ; CHECK-DAG: - { id: [[X19]], {{.*}}, offset: -16, size: 8, alignment: 16 + +... +--- +name: test_q10_q11_x19_x20 +alignment: 2 +tracksRegLiveness: true +body: | + bb.0.entry: + $x19 = IMPLICIT_DEF + $x20 = IMPLICIT_DEF + $q10 = IMPLICIT_DEF + $q11 = IMPLICIT_DEF + + ; +-------------------+ + ; | X19, X20 | (16 bytes) + ; +-------------------+ <- SP -16 + ; | Q10, Q11 | (32 bytes) + ; +-------------------+ <- SP -48 + + ; CHECK-LABEL: test_q10_q11_x19_x20{{[[:space:]]}} + ; CHECK-DAG: $sp = frame-setup STPQpre killed $q11, killed $q10, $sp, -3 :: (store 16 into %stack.[[Q11:[0-9]+]]), (store 16 into %stack.[[Q10:[0-9]+]]) + ; CHECK-DAG: frame-setup STPXi killed $x20, killed $x19, $sp, 4 :: (store 8 into %stack.[[X20:[0-9]+]]), (store 8 into %stack.[[X19:[0-9]+]]) + ; CHECK-DAG: - { id: [[Q11]], {{.*}}, offset: -48, size: 16, alignment: 16 + ; CHECK-DAG: - { id: [[Q10]], {{.*}}, offset: -32, size: 16, alignment: 16 + ; CHECK-DAG: - { id: [[X20]], {{.*}}, offset: -16, size: 8, alignment: 8 + ; CHECK-DAG: - { id: [[X19]], {{.*}}, offset: -8, size: 8, alignment: 8 + +... 
+--- +name: test_q10_q11_x19_x20_x21 +tracksRegLiveness: true +body: | + bb.0.entry: + $x19 = IMPLICIT_DEF + $x20 = IMPLICIT_DEF + $x21 = IMPLICIT_DEF + $q10 = IMPLICIT_DEF + $q11 = IMPLICIT_DEF + + ; Check that the alignment gap is padded with another 8 bytes. + ; The CSR region will look like this: + ; +-------------------+ + ; | X19, X20 | (16 bytes) + ; +-------------------+ <- SP -16 + ; |/////padding///////| (8 bytes) + ; | X21 | (8 bytes) + ; +-------------------+ <- SP -32 + ; | Q10, Q11 | (32 bytes) + ; +-------------------+ <- SP -64 + + ; CHECK-LABEL: test_q10_q11_x19_x20_x21 + ; CHECK-DAG: $sp = frame-setup STPQpre killed $q11, killed $q10, $sp, -4 :: (store 16 into %stack.[[Q11:[0-9]+]]), (store 16 into %stack.[[Q10:[0-9]+]]) + ; CHECK-DAG: frame-setup STRXui killed $x21, $sp, 4 :: (store 8 into %stack.[[X21:[0-9]+]]) + ; CHECK-DAG: frame-setup STPXi killed $x20, killed $x19, $sp, 6 + ; CHECK-DAG: - { id: [[Q11]], {{.*}}, offset: -64, size: 16, alignment: 16 + ; CHECK-DAG: - { id: [[Q10]], {{.*}}, offset: -48, size: 16, alignment: 16 + ; CHECK-DAG: - { id: [[X21]], {{.*}}, offset: -32, size: 8, alignment: 16 + +... +--- +name: test_q8_to_q23_x19_to_x30 +tracksRegLiveness: true +body: | + bb.0.entry: + $x19 = IMPLICIT_DEF + $x20 = IMPLICIT_DEF + $x21 = IMPLICIT_DEF + $x22 = IMPLICIT_DEF + $x23 = IMPLICIT_DEF + $x24 = IMPLICIT_DEF + $x25 = IMPLICIT_DEF + $x26 = IMPLICIT_DEF + $x27 = IMPLICIT_DEF + $x28 = IMPLICIT_DEF + $fp = IMPLICIT_DEF + $lr = IMPLICIT_DEF + $q8 = IMPLICIT_DEF + $q9 = IMPLICIT_DEF + $q10 = IMPLICIT_DEF + $q11 = IMPLICIT_DEF + $q12 = IMPLICIT_DEF + $q13 = IMPLICIT_DEF + $q14 = IMPLICIT_DEF + $q15 = IMPLICIT_DEF + $q16 = IMPLICIT_DEF + $q17 = IMPLICIT_DEF + $q18 = IMPLICIT_DEF + $q19 = IMPLICIT_DEF + $q20 = IMPLICIT_DEF + $q21 = IMPLICIT_DEF + $q22 = IMPLICIT_DEF + $q23 = IMPLICIT_DEF + + ; Test with more callee saves, which triggers 'BigStack' in + ; AArch64FrameLowering which in turn causes an emergency spill + ; slot to be allocated. 
The emergency spill slot is allocated + ; as close as possible to SP, so at SP + 0. + ; +-------------------+ + ; | X19..X30 | (96 bytes) + ; +-------------------+ <- SP -96 + ; | Q8..Q23 | (256 bytes) + ; +-------------------+ <- SP -352 + ; | emergency slot | (16 bytes) + ; +-------------------+ <- SP -368 + + ; CHECK-LABEL: test_q8_to_q23_x19_to_x30 + ; CHECK: $sp = frame-setup SUBXri $sp, 368, 0 + ; CHECK-NEXT: frame-setup STPQi killed $q23, killed $q22, $sp, 1 :: (store 16 into %stack.{{[0-9]+}}), (store 16 into %stack.{{[0-9]+}}) + ; CHECK-NEXT: frame-setup STPQi killed $q21, killed $q20, $sp, 3 + ; CHECK-NEXT: frame-setup STPQi killed $q19, killed $q18, $sp, 5 + ; CHECK-NEXT: frame-setup STPQi killed $q17, killed $q16, $sp, 7 + ; CHECK-NEXT: frame-setup STPQi killed $q15, killed $q14, $sp, 9 + ; CHECK-NEXT: frame-setup STPQi killed $q13, killed $q12, $sp, 11 + ; CHECK-NEXT: frame-setup STPQi killed $q11, killed $q10, $sp, 13 + ; CHECK-NEXT: frame-setup STPQi killed $q9, killed $q8, $sp, 15 + ; CHECK-NEXT: frame-setup STPXi killed $x28, killed $x27, $sp, 34 :: (store 8 into %stack.{{[0-9]+}}), (store 8 into %stack.{{[0-9]+}}) + ; CHECK-NEXT: frame-setup STPXi killed $x26, killed $x25, $sp, 36 + ; CHECK-NEXT: frame-setup STPXi killed $x24, killed $x23, $sp, 38 + ; CHECK-NEXT: frame-setup STPXi killed $x22, killed $x21, $sp, 40 + ; CHECK-NEXT: frame-setup STPXi killed $x20, killed $x19, $sp, 42 + ; CHECK-NEXT: frame-setup STPXi killed $fp, killed $lr, $sp, 44 + +... 
+--- +name: test_q8_to_q23_x19_to_x30_preinc +tracksRegLiveness: true +stack: + - { id: 0, size: 160, alignment: 4, local-offset: 0 } +constants: +body: | + bb.0.entry: + $x19 = IMPLICIT_DEF + $x20 = IMPLICIT_DEF + $x21 = IMPLICIT_DEF + $x22 = IMPLICIT_DEF + $x23 = IMPLICIT_DEF + $x24 = IMPLICIT_DEF + $x25 = IMPLICIT_DEF + $x26 = IMPLICIT_DEF + $x27 = IMPLICIT_DEF + $x28 = IMPLICIT_DEF + $fp = IMPLICIT_DEF + $lr = IMPLICIT_DEF + $q8 = IMPLICIT_DEF + $q9 = IMPLICIT_DEF + $q10 = IMPLICIT_DEF + $q11 = IMPLICIT_DEF + $q12 = IMPLICIT_DEF + $q13 = IMPLICIT_DEF + $q14 = IMPLICIT_DEF + $q15 = IMPLICIT_DEF + $q16 = IMPLICIT_DEF + $q17 = IMPLICIT_DEF + $q18 = IMPLICIT_DEF + $q19 = IMPLICIT_DEF + $q20 = IMPLICIT_DEF + $q21 = IMPLICIT_DEF + $q22 = IMPLICIT_DEF + $q23 = IMPLICIT_DEF + + ; When the total stack size >= 512, it will use the pre-increment + ; rather than the 'sub sp, sp, <size>'. + ; +-------------------+ + ; | X19..X30 | (96 bytes) + ; +-------------------+ <- SP -96 + ; | Q8..Q23 | (256 bytes) + ; +-------------------+ <- SP -352 + ; | 'obj' | (160 bytes) + ; +-------------------+ <- SP -512 + ; | emergency slot | (16 bytes) + ; +-------------------+ <- SP -528 + + ; CHECK-LABEL: test_q8_to_q23_x19_to_x30_preinc + ; CHECK: $sp = frame-setup STPQpre killed $q23, killed $q22, $sp, -22 :: (store 16 into %stack.{{[0-9]+}}), (store 16 into %stack.{{[0-9]+}}) + ; CHECK-NEXT: frame-setup STPQi killed $q21, killed $q20, $sp, 2 + ; CHECK-NEXT: frame-setup STPQi killed $q19, killed $q18, $sp, 4 + ; CHECK-NEXT: frame-setup STPQi killed $q17, killed $q16, $sp, 6 + ; CHECK-NEXT: frame-setup STPQi killed $q15, killed $q14, $sp, 8 + ; CHECK-NEXT: frame-setup STPQi killed $q13, killed $q12, $sp, 10 + ; CHECK-NEXT: frame-setup STPQi killed $q11, killed $q10, $sp, 12 + ; CHECK-NEXT: frame-setup STPQi killed $q9, killed $q8, $sp, 14 + ; CHECK-NEXT: frame-setup STPXi killed $x28, killed $x27, $sp, 32 :: (store 8 into %stack.{{[0-9]+}}), (store 8 into %stack.{{[0-9]+}}) + ; CHECK-NEXT: 
frame-setup STPXi killed $x26, killed $x25, $sp, 34 + ; CHECK-NEXT: frame-setup STPXi killed $x24, killed $x23, $sp, 36 + ; CHECK-NEXT: frame-setup STPXi killed $x22, killed $x21, $sp, 38 + ; CHECK-NEXT: frame-setup STPXi killed $x20, killed $x19, $sp, 40 + ; CHECK-NEXT: frame-setup STPXi killed $fp, killed $lr, $sp, 42 + ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 176, 0 + +... -- 2.11.0