From 4a0b3e170dcf591b0795170f6744d3a17858ee56 Mon Sep 17 00:00:00 2001
From: Dan Gohman
Date: Mon, 21 Sep 2009 18:30:38 +0000
Subject: [PATCH] Add support for rematerializing FsFLD0SS and FsFLD0SD as
 constant-pool loads in order to reduce register pressure.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@82470 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrInfo.cpp       | 52 +++++++++++++------
 lib/Target/X86/X86InstrSSE.td         |  6 ++-
 test/CodeGen/X86/remat-scalar-zero.ll | 95 +++++++++++++++++++++++++++++++++++
 3 files changed, 137 insertions(+), 16 deletions(-)
 create mode 100644 test/CodeGen/X86/remat-scalar-zero.ll

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 859ad57725d..e55aa1e9bbd 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2297,9 +2297,21 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   unsigned Alignment = 0;
   if (LoadMI->hasOneMemOperand())
     Alignment = LoadMI->memoperands_begin()->getAlignment();
-  else if (LoadMI->getOpcode() == X86::V_SET0 ||
-           LoadMI->getOpcode() == X86::V_SETALLONES)
-    Alignment = 16;
+  else
+    switch (LoadMI->getOpcode()) {
+    case X86::V_SET0:
+    case X86::V_SETALLONES:
+      Alignment = 16;
+      break;
+    case X86::FsFLD0SD:
+      Alignment = 8;
+      break;
+    case X86::FsFLD0SS:
+      Alignment = 4;
+      break;
+    default:
+      llvm_unreachable("Don't know how to fold this instruction!");
+    }
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     unsigned NewOpc = 0;
     switch (MI->getOpcode()) {
@@ -2316,8 +2328,11 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       return NULL;
 
   SmallVector<MachineOperand,X86AddrNumOperands> MOs;
-  if (LoadMI->getOpcode() == X86::V_SET0 ||
-      LoadMI->getOpcode() == X86::V_SETALLONES) {
+  switch (LoadMI->getOpcode()) {
+  case X86::V_SET0:
+  case X86::V_SETALLONES:
+  case X86::FsFLD0SD:
+  case X86::FsFLD0SS: {
     // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
     // Create a constant-pool entry and operands to load from it.
 
@@ -2331,17 +2346,22 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       // This doesn't work for several reasons.
       // 1. GlobalBaseReg may have been spilled.
       // 2. It may not be live at MI.
-      return false;
+      return NULL;
     }
 
-    // Create a v4i32 constant-pool entry.
+    // Create a constant-pool entry.
     MachineConstantPool &MCP = *MF.getConstantPool();
-    const VectorType *Ty =
-      VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
-    Constant *C = LoadMI->getOpcode() == X86::V_SET0 ?
-                    Constant::getNullValue(Ty) :
-                    Constant::getAllOnesValue(Ty);
-    unsigned CPI = MCP.getConstantPoolIndex(C, 16);
+    const Type *Ty;
+    if (LoadMI->getOpcode() == X86::FsFLD0SS)
+      Ty = Type::getFloatTy(MF.getFunction()->getContext());
+    else if (LoadMI->getOpcode() == X86::FsFLD0SD)
+      Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+    else
+      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+    Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
+                    Constant::getAllOnesValue(Ty) :
+                    Constant::getNullValue(Ty);
+    unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
 
     // Create operands to load from the constant pool entry.
     MOs.push_back(MachineOperand::CreateReg(PICBase, false));
@@ -2349,11 +2369,15 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     MOs.push_back(MachineOperand::CreateReg(0, false));
     MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
     MOs.push_back(MachineOperand::CreateReg(0, false));
-  } else {
+    break;
+  }
+  default: {
     // Folding a normal load. Just copy the load's address operands.
     unsigned NumOps = LoadMI->getDesc().getNumOperands();
     for (unsigned i = NumOps - X86AddrNumOperands; i != NumOps; ++i)
       MOs.push_back(LoadMI->getOperand(i));
+    break;
+  }
   }
   return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
 }
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index da09a936303..96fc932fc88 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -472,7 +472,8 @@ def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
 // that start with 'Fs'.
 
 // Alias instructions that map fld0 to pxor for sse.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+    canFoldAsLoad = 1 in
 def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins),
                  "pxor\t$dst, $dst", [(set FR32:$dst, fp32imm0)]>,
                Requires<[HasSSE1]>, TB, OpSize;
@@ -1230,7 +1231,8 @@ def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
 // that start with 'Fs'.
 
 // Alias instructions that map fld0 to pxor for sse.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1 in
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+    canFoldAsLoad = 1 in
 def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins),
                  "pxor\t$dst, $dst", [(set FR64:$dst, fpimm0)]>,
                Requires<[HasSSE2]>, TB, OpSize;
diff --git a/test/CodeGen/X86/remat-scalar-zero.ll b/test/CodeGen/X86/remat-scalar-zero.ll
new file mode 100644
index 00000000000..790ae83c2b2
--- /dev/null
+++ b/test/CodeGen/X86/remat-scalar-zero.ll
@@ -0,0 +1,95 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu > %t
+; RUN: not grep xor %t
+; RUN: not grep movap %t
+; RUN: grep {\\.zero} %t
+
+; Remat should be able to fold the zero constant into the div instructions
+; as a constant-pool load.
+
+define void @foo(double* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %tmp1 = load double* %x                         ; <double> [#uses=1]
+  %arrayidx4 = getelementptr inbounds double* %x, i64 1 ; <double*> [#uses=1]
+  %tmp5 = load double* %arrayidx4                 ; <double> [#uses=1]
+  %arrayidx8 = getelementptr inbounds double* %x, i64 2 ; <double*> [#uses=1]
+  %tmp9 = load double* %arrayidx8                 ; <double> [#uses=1]
+  %arrayidx12 = getelementptr inbounds double* %x, i64 3 ; <double*> [#uses=1]
+  %tmp13 = load double* %arrayidx12               ; <double> [#uses=1]
+  %arrayidx16 = getelementptr inbounds double* %x, i64 4 ; <double*> [#uses=1]
+  %tmp17 = load double* %arrayidx16               ; <double> [#uses=1]
+  %arrayidx20 = getelementptr inbounds double* %x, i64 5 ; <double*> [#uses=1]
+  %tmp21 = load double* %arrayidx20               ; <double> [#uses=1]
+  %arrayidx24 = getelementptr inbounds double* %x, i64 6 ; <double*> [#uses=1]
+  %tmp25 = load double* %arrayidx24               ; <double> [#uses=1]
+  %arrayidx28 = getelementptr inbounds double* %x, i64 7 ; <double*> [#uses=1]
+  %tmp29 = load double* %arrayidx28               ; <double> [#uses=1]
+  %arrayidx32 = getelementptr inbounds double* %x, i64 8 ; <double*> [#uses=1]
+  %tmp33 = load double* %arrayidx32               ; <double> [#uses=1]
+  %arrayidx36 = getelementptr inbounds double* %x, i64 9 ; <double*> [#uses=1]
+  %tmp37 = load double* %arrayidx36               ; <double> [#uses=1]
+  %arrayidx40 = getelementptr inbounds double* %x, i64 10 ; <double*> [#uses=1]
+  %tmp41 = load double* %arrayidx40               ; <double> [#uses=1]
+  %arrayidx44 = getelementptr inbounds double* %x, i64 11 ; <double*> [#uses=1]
+  %tmp45 = load double* %arrayidx44               ; <double> [#uses=1]
+  %arrayidx48 = getelementptr inbounds double* %x, i64 12 ; <double*> [#uses=1]
+  %tmp49 = load double* %arrayidx48               ; <double> [#uses=1]
+  %arrayidx52 = getelementptr inbounds double* %x, i64 13 ; <double*> [#uses=1]
+  %tmp53 = load double* %arrayidx52               ; <double> [#uses=1]
+  %arrayidx56 = getelementptr inbounds double* %x, i64 14 ; <double*> [#uses=1]
+  %tmp57 = load double* %arrayidx56               ; <double> [#uses=1]
+  %arrayidx60 = getelementptr inbounds double* %x, i64 15 ; <double*> [#uses=1]
+  %tmp61 = load double* %arrayidx60               ; <double> [#uses=1]
+  %arrayidx64 = getelementptr inbounds double* %x, i64 16 ; <double*> [#uses=1]
+  %tmp65 = load double* %arrayidx64               ; <double> [#uses=1]
+  %div = fdiv double %tmp1, 0.000000e+00          ; <double> [#uses=1]
+  store double %div, double* %y
+  %div70 = fdiv double %tmp5, 2.000000e-01        ; <double> [#uses=1]
+  %arrayidx72 = getelementptr inbounds double* %y, i64 1 ; <double*> [#uses=1]
+  store double %div70, double* %arrayidx72
+  %div74 = fdiv double %tmp9, 2.000000e-01        ; <double> [#uses=1]
+  %arrayidx76 = getelementptr inbounds double* %y, i64 2 ; <double*> [#uses=1]
+  store double %div74, double* %arrayidx76
+  %div78 = fdiv double %tmp13, 2.000000e-01       ; <double> [#uses=1]
+  %arrayidx80 = getelementptr inbounds double* %y, i64 3 ; <double*> [#uses=1]
+  store double %div78, double* %arrayidx80
+  %div82 = fdiv double %tmp17, 2.000000e-01       ; <double> [#uses=1]
+  %arrayidx84 = getelementptr inbounds double* %y, i64 4 ; <double*> [#uses=1]
+  store double %div82, double* %arrayidx84
+  %div86 = fdiv double %tmp21, 2.000000e-01       ; <double> [#uses=1]
+  %arrayidx88 = getelementptr inbounds double* %y, i64 5 ; <double*> [#uses=1]
+  store double %div86, double* %arrayidx88
+  %div90 = fdiv double %tmp25, 2.000000e-01       ; <double> [#uses=1]
+  %arrayidx92 = getelementptr inbounds double* %y, i64 6 ; <double*> [#uses=1]
+  store double %div90, double* %arrayidx92
+  %div94 = fdiv double %tmp29, 2.000000e-01       ; <double> [#uses=1]
+  %arrayidx96 = getelementptr inbounds double* %y, i64 7 ; <double*> [#uses=1]
+  store double %div94, double* %arrayidx96
+  %div98 = fdiv double %tmp33, 2.000000e-01       ; <double> [#uses=1]
+  %arrayidx100 = getelementptr inbounds double* %y, i64 8 ; <double*> [#uses=1]
+  store double %div98, double* %arrayidx100
+  %div102 = fdiv double %tmp37, 2.000000e-01      ; <double> [#uses=1]
+  %arrayidx104 = getelementptr inbounds double* %y, i64 9 ; <double*> [#uses=1]
+  store double %div102, double* %arrayidx104
+  %div106 = fdiv double %tmp41, 2.000000e-01      ; <double> [#uses=1]
+  %arrayidx108 = getelementptr inbounds double* %y, i64 10 ; <double*> [#uses=1]
+  store double %div106, double* %arrayidx108
+  %div110 = fdiv double %tmp45, 2.000000e-01      ; <double> [#uses=1]
+  %arrayidx112 = getelementptr inbounds double* %y, i64 11 ; <double*> [#uses=1]
+  store double %div110, double* %arrayidx112
+  %div114 = fdiv double %tmp49, 2.000000e-01      ; <double> [#uses=1]
+  %arrayidx116 = getelementptr inbounds double* %y, i64 12 ; <double*> [#uses=1]
+  store double %div114, double* %arrayidx116
+  %div118 = fdiv double %tmp53, 2.000000e-01      ; <double> [#uses=1]
+  %arrayidx120 = getelementptr inbounds double* %y, i64 13 ; <double*> [#uses=1]
+  store double %div118, double* %arrayidx120
+  %div122 = fdiv double %tmp57, 2.000000e-01      ; <double> [#uses=1]
+  %arrayidx124 = getelementptr inbounds double* %y, i64 14 ; <double*> [#uses=1]
+  store double %div122, double* %arrayidx124
+  %div126 = fdiv double %tmp61, 2.000000e-01      ; <double> [#uses=1]
+  %arrayidx128 = getelementptr inbounds double* %y, i64 15 ; <double*> [#uses=1]
+  store double %div126, double* %arrayidx128
+  %div130 = fdiv double %tmp65, 0.000000e+00      ; <double> [#uses=1]
+  %arrayidx132 = getelementptr inbounds double* %y, i64 16 ; <double*> [#uses=1]
+  store double %div130, double* %arrayidx132
+  ret void
+}
-- 
2.11.0
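
For readers skimming the diff, the standalone C++ sketch below summarizes the
decision table the X86InstrInfo.cpp hunks introduce. It is illustrative only:
the enum, struct, and classify() function are stand-ins invented for this
sketch, not LLVM types; the real code switches on MachineInstr opcodes and
builds llvm::Constant values for the MachineConstantPool.

    // Sketch of the mapping added by this patch: each foldable pseudo-
    // instruction becomes a constant-pool entry with a type-appropriate
    // size and alignment, so a "zero register" that would otherwise be
    // spilled can instead be re-read from memory at its use.
    #include <cstdio>

    enum Opcode { V_SET0, V_SETALLONES, FsFLD0SD, FsFLD0SS };

    struct PoolEntry {
      const char *Type;   // IR type of the constant-pool entry
      unsigned Alignment; // byte alignment requested for the entry
      bool AllOnes;       // all-ones bit pattern vs. zero
    };

    static PoolEntry classify(Opcode Op) {
      switch (Op) {
      case V_SET0:       return {"<4 x i32>", 16, false};
      case V_SETALLONES: return {"<4 x i32>", 16, true};
      case FsFLD0SD:     return {"double", 8, false};  // new in this patch
      case FsFLD0SS:     return {"float", 4, false};   // new in this patch
      }
      return {"", 0, false}; // unreachable; mirrors llvm_unreachable
    }

    int main() {
      const Opcode Ops[] = {V_SET0, V_SETALLONES, FsFLD0SD, FsFLD0SS};
      for (Opcode Op : Ops) {
        PoolEntry E = classify(Op);
        std::printf("align %2u, %s entry of type %s\n", E.Alignment,
                    E.AllOnes ? "all-ones" : "zero", E.Type);
      }
      return 0;
    }

The test's RUN lines capture the intended end-to-end effect: with
canFoldAsLoad set on FsFLD0SS/FsFLD0SD, a scalar FP zero that does not fit in
a register should be folded into its use as a constant-pool load, so the
generated assembly contains a zero-filled pool entry (the .zero directive the
test greps for) and no pxor or movaps materialization.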