[MachineCombiner] Support for floating-point FMA on ARM64

author Gerolf Hoflehner <ghoflehner@apple.com>

Fri, 22 Apr 2016 02:15:19 +0000 (02:15 +0000)

committer Gerolf Hoflehner <ghoflehner@apple.com>

Fri, 22 Apr 2016 02:15:19 +0000 (02:15 +0000)
author Gerolf Hoflehner <ghoflehner@apple.com>
Fri, 22 Apr 2016 02:15:19 +0000 (02:15 +0000)
committer Gerolf Hoflehner <ghoflehner@apple.com>
Fri, 22 Apr 2016 02:15:19 +0000 (02:15 +0000)
diff --git a/include/llvm/CodeGen/MachineCombinerPattern.h b/include/llvm/CodeGen/MachineCombinerPattern.h

index f389122..1123801 100644 (file)
--- a/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -38,7 +38,40 @@ enum class MachineCombinerPattern {
    MULSUBX_OP1,
    MULSUBX_OP2,
    MULADDXI_OP1,
-  MULSUBXI_OP1
+  MULSUBXI_OP1,
+  // Floating Point
+  FMULADDS_OP1,
+  FMULADDS_OP2,
+  FMULSUBS_OP1,
+  FMULSUBS_OP2,
+  FMULADDD_OP1,
+  FMULADDD_OP2,
+  FMULSUBD_OP1,
+  FMULSUBD_OP2,
+  FMLAv1i32_indexed_OP1,
+  FMLAv1i32_indexed_OP2,
+  FMLAv1i64_indexed_OP1,
+  FMLAv1i64_indexed_OP2,
+  FMLAv2f32_OP2,
+  FMLAv2f32_OP1,
+  FMLAv2f64_OP1,
+  FMLAv2f64_OP2,
+  FMLAv2i32_indexed_OP1,
+  FMLAv2i32_indexed_OP2,
+  FMLAv2i64_indexed_OP1,
+  FMLAv2i64_indexed_OP2,
+  FMLAv4f32_OP1,
+  FMLAv4f32_OP2,
+  FMLAv4i32_indexed_OP1,
+  FMLAv4i32_indexed_OP2,
+  FMLSv1i32_indexed_OP2,
+  FMLSv1i64_indexed_OP2,
+  FMLSv2i32_indexed_OP2,
+  FMLSv2i64_indexed_OP2,
+  FMLSv2f32_OP2,
+  FMLSv2f64_OP2,
+  FMLSv4i32_indexed_OP2,
+  FMLSv4f32_OP2
  };
  
  } // end namespace llvm
diff --git a/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/include/llvm/CodeGen/SelectionDAGTargetInfo.h

index 2e0339b..2fe9e34 100644 (file)
--- a/include/llvm/CodeGen/SelectionDAGTargetInfo.h
+++ b/include/llvm/CodeGen/SelectionDAGTargetInfo.h
@@ -17,6 +17,7 @@
  #define LLVM_CODEGEN_SELECTIONDAGTARGETINFO_H
  
  #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/CodeGen.h"
  
  namespace llvm {
  
@@ -138,6 +139,11 @@ public:
                             MachinePointerInfo SrcPtrInfo) const {
      return std::make_pair(SDValue(), SDValue());
    }
+  // Return true when the decision to generate FMAs (or FMS, FMLA, etc.) rather
+  // than separate FMUL and FADD is delegated to the machine combiner.
+  virtual bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const {
+    return false; // Base implementation never delegates.
+  }
  };
  
  } // end llvm namespace
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h

index 955b5cf..bdb3be2 100644 (file)
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -818,6 +818,11 @@ public:
        MachineInstr &Root,
        SmallVectorImpl<MachineCombinerPattern> &Patterns) const;
  
+  /// Return true when a code sequence can improve throughput. It
+  /// should be called only for instructions in loops.
+  /// \param Pattern - combiner pattern
+  virtual bool isThroughputPattern(MachineCombinerPattern Pattern) const;
+
    /// Return true if the input \P Inst is part of a chain of dependent ops
    /// that are suitable for reassociation, otherwise return false.
    /// If the instruction's operands must be commuted to have a previous
diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp

index 44601d5..6b5c6ba 100644 (file)
--- a/lib/CodeGen/MachineCombiner.cpp
+++ b/lib/CodeGen/MachineCombiner.cpp
@@ -40,6 +40,7 @@ class MachineCombiner : public MachineFunctionPass {
    const TargetRegisterInfo *TRI;
    MCSchedModel SchedModel;
    MachineRegisterInfo *MRI;
+  MachineLoopInfo *MLI; // Current MachineLoopInfo
    MachineTraceMetrics *Traces;
    MachineTraceMetrics::Ensemble *MinInstr;
  
@@ -86,6 +87,7 @@ char &llvm::MachineCombinerID = MachineCombiner::ID;
  
  INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
                        "Machine InstCombiner", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
  INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
  INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
                      false, false)
@@ -93,6 +95,7 @@ INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
  void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
    AU.setPreservesCFG();
    AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
    AU.addPreserved<MachineLoopInfo>();
    AU.addRequired<MachineTraceMetrics>();
    AU.addPreserved<MachineTraceMetrics>();
@@ -354,6 +357,8 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
    DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
  
    auto BlockIter = MBB->begin();
+  // Check if the block is in a loop.
+  const MachineLoop *ML = MLI->getLoopFor(MBB);
  
    while (BlockIter != MBB->end()) {
      auto &MI = *BlockIter++;
@@ -406,11 +411,15 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
        if (!NewInstCount)
          continue;
  
+      bool SubstituteAlways = false;
+      if (ML && TII->isThroughputPattern(P))
+        SubstituteAlways = true;
+
        // Substitute when we optimize for codesize and the new sequence has
        // fewer instructions OR
        // the new sequence neither lengthens the critical path nor increases
        // resource pressure.
-      if (doSubstitute(NewInstCount, OldInstCount) ||
+      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
            (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
                                     InstrIdxForVirtReg, P) &&
             preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
@@ -447,6 +456,7 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
    SchedModel = STI.getSchedModel();
    TSchedModel.init(SchedModel, &STI, TII);
    MRI = &MF.getRegInfo();
+  MLI = &getAnalysis<MachineLoopInfo>();
    Traces = &getAnalysis<MachineTraceMetrics>();
    MinInstr = nullptr;
    OptSize = MF.getFunction()->optForSize();
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index 6c98008..ff3fee2 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -24,6 +24,7 @@
  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Function.h"
@@ -85,6 +86,7 @@ namespace {
  
    class DAGCombiner {
      SelectionDAG &DAG;
+    const SelectionDAGTargetInfo &STI;
      const TargetLowering &TLI;
      CombineLevel Level;
      CodeGenOpt::Level OptLevel;
@@ -469,8 +471,9 @@ namespace {
  
    public:
      DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
-        : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
-          OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
+        : DAG(D), STI(D.getSelectionDAGInfo()), TLI(D.getTargetLoweringInfo()),
+          Level(BeforeLegalizeTypes), OptLevel(OL), LegalOperations(false),
+          LegalTypes(false), AA(A) {
        ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
      }
  
@@ -7715,6 +7718,9 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
    if (!HasFMAD && !HasFMA)
      return SDValue();
  
+  if (AllowFusion && STI.GenerateFMAsInMachineCombiner(OptLevel))
+    return SDValue();
+
    // Always prefer FMAD to FMA for precision.
    unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
    bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
@@ -7898,6 +7904,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
    if (!HasFMAD && !HasFMA)
      return SDValue();
  
+  if (AllowFusion && STI.GenerateFMAsInMachineCombiner(OptLevel))
+    return SDValue();
+
    // Always prefer FMAD to FMA for precision.
    unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
    bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp

index 86517d9..800ad6d 100644 (file)
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -655,7 +655,11 @@ bool TargetInstrInfo::getMachineCombinerPatterns(
  
    return false;
  }
-
+/// Return true when a code sequence can improve loop throughput.
+bool
+TargetInstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+  return false; // Generic implementation: no pattern qualifies.
+}
  /// Attempt the reassociation transformation to reduce critical path length.
  /// See the above comments before getMachineCombinerPatterns().
  void TargetInstrInfo::reassociateOps(
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp

index b0574f2..fd137db 100644 (file)
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2788,37 +2788,75 @@ static bool isCombineInstrCandidate64(unsigned Opc) {
    return false;
  }
  //
+// Return true if Inst is an FP add/sub that may be combined with an FMUL.
+static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
+  switch (Inst.getOpcode()) {
+  case AArch64::FADDSrr:
+  case AArch64::FADDDrr:
+  case AArch64::FADDv2f32:
+  case AArch64::FADDv2f64:
+  case AArch64::FADDv4f32:
+  case AArch64::FSUBSrr:
+  case AArch64::FSUBDrr:
+  case AArch64::FSUBv2f32:
+  case AArch64::FSUBv2f64:
+  case AArch64::FSUBv4f32:
+    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; // only a candidate under unsafe FP math
+  default:
+    break; // any other opcode is not a candidate
+  }
+  return false;
+}
+//
  // Opcodes that can be combined with a MUL
  static bool isCombineInstrCandidate(unsigned Opc) {
    return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
  }
  
-static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
-                              unsigned MulOpc, unsigned ZeroReg) {
+//
+// Utility routine that checks if \param MO is defined by an
+// \param CombineOpc instruction in the basic block \param MBB
+static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
+                       unsigned CombineOpc, unsigned ZeroReg = 0,
+                       bool CheckZeroReg = false) {
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    MachineInstr *MI = nullptr;
-  // We need a virtual register definition.
+
    if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
      MI = MRI.getUniqueVRegDef(MO.getReg());
    // And it needs to be in the trace (otherwise, it won't have a depth).
-  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc)
-    return false;
-
-  assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
-         MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
-         MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
-
-  // The third input reg must be zero.
-  if (MI->getOperand(3).getReg() != ZeroReg)
+  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
      return false;
-
    // Must only used by the user we combine with.
    if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
      return false;
  
+  if (CheckZeroReg) {
+    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
+           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+           MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
+    // The third input reg must be zero.
+    if (MI->getOperand(3).getReg() != ZeroReg)
+      return false;
+  }
+
    return true;
  }
  
+//
+// Is \param MO defined by an integer multiply that can be combined?
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+                              unsigned MulOpc, unsigned ZeroReg) {
+  return canCombine(MBB, MO, MulOpc, ZeroReg, true); // true: operand 3 of the multiply must be ZeroReg
+}
+
+//
+// Is \param MO defined by a floating-point multiply that can be combined?
+static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+                               unsigned MulOpc) {
+  return canCombine(MBB, MO, MulOpc); // defaults: no zero-register check
+}
+
  // TODO: There are many more machine instruction opcodes to match:
  //       1. Other data types (integer, vectors)
  //       2. Other math / logic operations (xor, or)
@@ -2952,7 +2990,230 @@ static bool getMaddPatterns(MachineInstr &Root,
    }
    return Found;
  }
+/// Floating-Point Support
  
+/// Find instructions that can be turned into madd.
+static bool getFMAPatterns(MachineInstr &Root,
+                           SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+
+  if (!isCombineInstrCandidateFP(Root))
+    return 0;
+
+  MachineBasicBlock &MBB = *Root.getParent();
+  bool Found = false;
+
+  switch (Root.getOpcode()) {
+  default:
+    assert(false && "Unsupported FP instruction in combiner\n");
+    break;
+  case AArch64::FADDSrr:
+    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+           "FADDWrr does not have register operands");
+    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv1i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv1i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FADDDrr:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv1i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv1i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FADDv2f32:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                           AArch64::FMULv2i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv2f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv2i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv2f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FADDv2f64:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                           AArch64::FMULv2i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv2f64)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv2i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv2f64)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FADDv4f32:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                           AArch64::FMULv4i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
+                                  AArch64::FMULv4f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv4i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv4f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
+      Found = true;
+    }
+    break;
+
+  case AArch64::FSUBSrr:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv1i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FSUBDrr:
+    if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
+      Found = true;
+    }
+    if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
+      Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv1i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FSUBv2f32:
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv2i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv2f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FSUBv2f64:
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv2i64_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv2f64)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
+      Found = true;
+    }
+    break;
+  case AArch64::FSUBv4f32:
+    if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                           AArch64::FMULv4i32_indexed)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
+      Found = true;
+    } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
+                                  AArch64::FMULv4f32)) {
+      Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
+      Found = true;
+    }
+    break;
+  }
+  return Found;
+}
+
+/// Return true when a code sequence can improve throughput. It
+/// should be called only for instructions in loops.
+/// \param Pattern - combiner pattern to classify
+bool
+AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+  switch (Pattern) {
+  default:
+    break; // Any pattern not listed below yields false.
+  case MachineCombinerPattern::FMULADDS_OP1:
+  case MachineCombinerPattern::FMULADDS_OP2:
+  case MachineCombinerPattern::FMULSUBS_OP1:
+  case MachineCombinerPattern::FMULSUBS_OP2:
+  case MachineCombinerPattern::FMULADDD_OP1:
+  case MachineCombinerPattern::FMULADDD_OP2:
+  case MachineCombinerPattern::FMULSUBD_OP1:
+  case MachineCombinerPattern::FMULSUBD_OP2:
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2f32_OP2:
+  case MachineCombinerPattern::FMLAv2f32_OP1:
+  case MachineCombinerPattern::FMLAv2f64_OP1:
+  case MachineCombinerPattern::FMLAv2f64_OP2:
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMLAv4f32_OP1:
+  case MachineCombinerPattern::FMLAv4f32_OP2:
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2f32_OP2:
+  case MachineCombinerPattern::FMLSv2f64_OP2:
+  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv4f32_OP2:
+    return true; // Every floating-point fused-multiply pattern qualifies.
+  } // end switch (Pattern)
+  return false;
+}
  /// Return true when there is potentially a faster code sequence for an
  /// instruction chain ending in \p Root. All potential patterns are listed in
  /// the \p Pattern vector. Pattern should be sorted in priority order since the
@@ -2961,28 +3222,35 @@ static bool getMaddPatterns(MachineInstr &Root,
  bool AArch64InstrInfo::getMachineCombinerPatterns(
      MachineInstr &Root,
      SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // Integer patterns
    if (getMaddPatterns(Root, Patterns))
      return true;
+  // Floating point patterns
+  if (getFMAPatterns(Root, Patterns))
+    return true;
  
    return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
  }
  
-/// genMadd - Generate madd instruction and combine mul and add.
-/// Example:
-///  MUL I=A,B,0
-///  ADD R,I,C
-///  ==> MADD R,A,B,C
-/// \param Root is the ADD instruction
+enum class FMAInstKind { Default, Indexed, Accumulator };
+/// genFusedMultiply - Generate fused multiply instructions.
+/// This function supports both integer and floating point instructions.
+/// A typical example:
+///  F|MUL I=A,B,0
+///  F|ADD R,I,C
+///  ==> F|MADD R,A,B,C
+/// \param Root is the F|ADD instruction
  /// \param [out] InsInstrs is a vector of machine instructions and will
  /// contain the generated madd instruction
  /// \param IdxMulOpd is index of operand in Root that is the result of
-/// the MUL. In the example above IdxMulOpd is 1.
-/// \param MaddOpc the opcode fo the madd instruction
-static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
-                             const TargetInstrInfo *TII, MachineInstr &Root,
-                             SmallVectorImpl<MachineInstr *> &InsInstrs,
-                             unsigned IdxMulOpd, unsigned MaddOpc,
-                             const TargetRegisterClass *RC) {
+/// the F|MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode of the f|madd instruction
+static MachineInstr *
+genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
+                 const TargetInstrInfo *TII, MachineInstr &Root,
+                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
+                 unsigned MaddOpc, const TargetRegisterClass *RC,
+                 FMAInstKind kind = FMAInstKind::Default) {
    assert(IdxMulOpd == 1 || IdxMulOpd == 2);
  
    unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
@@ -3004,12 +3272,26 @@ static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
    if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
      MRI.constrainRegClass(SrcReg2, RC);
  
-  MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
-                                    ResultReg)
-                                .addReg(SrcReg0, getKillRegState(Src0IsKill))
-                                .addReg(SrcReg1, getKillRegState(Src1IsKill))
-                                .addReg(SrcReg2, getKillRegState(Src2IsKill));
-  // Insert the MADD
+  MachineInstrBuilder MIB;
+  if (kind == FMAInstKind::Default)
+    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+              .addReg(SrcReg0, getKillRegState(Src0IsKill))
+              .addReg(SrcReg1, getKillRegState(Src1IsKill))
+              .addReg(SrcReg2, getKillRegState(Src2IsKill));
+  else if (kind == FMAInstKind::Indexed)
+    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+              .addReg(SrcReg2, getKillRegState(Src2IsKill))
+              .addReg(SrcReg0, getKillRegState(Src0IsKill))
+              .addReg(SrcReg1, getKillRegState(Src1IsKill))
+              .addImm(MUL->getOperand(3).getImm());
+  else if (kind == FMAInstKind::Accumulator)
+    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+              .addReg(SrcReg2, getKillRegState(Src2IsKill))
+              .addReg(SrcReg0, getKillRegState(Src0IsKill))
+              .addReg(SrcReg1, getKillRegState(Src1IsKill));
+  else
+    assert(false && "Invalid FMA instruction kind \n");
+  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
    InsInstrs.push_back(MIB);
    return MUL;
  }
@@ -3097,7 +3379,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
        Opc = AArch64::MADDXrrr;
        RC = &AArch64::GPR64RegClass;
      }
-    MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
      break;
    case MachineCombinerPattern::MULADDW_OP2:
    case MachineCombinerPattern::MULADDX_OP2:
@@ -3112,7 +3394,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
        Opc = AArch64::MADDXrrr;
        RC = &AArch64::GPR64RegClass;
      }
-    MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
      break;
    case MachineCombinerPattern::MULADDWI_OP1:
    case MachineCombinerPattern::MULADDXI_OP1: {
@@ -3204,7 +3486,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
        Opc = AArch64::MSUBXrrr;
        RC = &AArch64::GPR64RegClass;
      }
-    MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
      break;
    case MachineCombinerPattern::MULSUBWI_OP1:
    case MachineCombinerPattern::MULSUBXI_OP1: {
@@ -3249,6 +3531,234 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
      }
      break;
    }
+  // Floating Point Support
+  case MachineCombinerPattern::FMULADDS_OP1:
+  case MachineCombinerPattern::FMULADDD_OP1:
+    // FMUL I=A,B,0
+    // FADD R,I,C
+    // ==> FMADD R,A,B,C
+    // --- Create(FMADD);
+    if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
+      Opc = AArch64::FMADDSrrr;
+      RC = &AArch64::FPR32RegClass;
+    } else {
+      Opc = AArch64::FMADDDrrr;
+      RC = &AArch64::FPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::FMULADDS_OP2:
+  case MachineCombinerPattern::FMULADDD_OP2:
+    // FMUL I=A,B,0
+    // FADD R,C,I
+    // ==> FMADD R,A,B,C
+    // --- Create(FMADD);
+    if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
+      Opc = AArch64::FMADDSrrr;
+      RC = &AArch64::FPR32RegClass;
+    } else {
+      Opc = AArch64::FMADDDrrr;
+      RC = &AArch64::FPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+    Opc = AArch64::FMLAv1i32_indexed;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+    Opc = AArch64::FMLAv1i32_indexed;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+    Opc = AArch64::FMLAv1i64_indexed;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+    Opc = AArch64::FMLAv1i64_indexed;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2f32_OP1:
+    RC = &AArch64::FPR64RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
+      Opc = AArch64::FMLAv2i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv2f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2f32_OP2:
+    RC = &AArch64::FPR64RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
+      Opc = AArch64::FMLAv2i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv2f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2f64_OP1:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
+      Opc = AArch64::FMLAv2i64_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv2f64;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2f64_OP2:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
+      Opc = AArch64::FMLAv2i64_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv2f64;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv4f32_OP1:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
+      Opc = AArch64::FMLAv4i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv4f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv4f32_OP2:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
+      Opc = AArch64::FMLAv4i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLAv4f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMULSUBS_OP1:
+  case MachineCombinerPattern::FMULSUBD_OP1: {
+    // FMUL I=A,B,0
+    // FSUB R,I,C
+    // ==> FNMSUB R,A,B,C // = -C + A*B
+    // --- Create(FNMSUB);
+    if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
+      Opc = AArch64::FNMSUBSrrr;
+      RC = &AArch64::FPR32RegClass;
+    } else {
+      Opc = AArch64::FNMSUBDrrr;
+      RC = &AArch64::FPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  }
+  case MachineCombinerPattern::FMULSUBS_OP2:
+  case MachineCombinerPattern::FMULSUBD_OP2: {
+    // FMUL I=A,B,0
+    // FSUB R,C,I
+    // ==> FMSUB R,A,B,C (computes C - A*B)
+    // --- Create(FMSUB);
+    if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
+      Opc = AArch64::FMSUBSrrr;
+      RC = &AArch64::FPR32RegClass;
+    } else {
+      Opc = AArch64::FMSUBDrrr;
+      RC = &AArch64::FPR64RegClass;
+    }
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+
+  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+    Opc = AArch64::FMLSv1i32_indexed;
+    RC = &AArch64::FPR32RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+
+  case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+    Opc = AArch64::FMLSv1i64_indexed;
+    RC = &AArch64::FPR64RegClass;
+    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                           FMAInstKind::Indexed);
+    break;
+
+  case MachineCombinerPattern::FMLSv2f32_OP2:
+  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+    RC = &AArch64::FPR64RegClass;
+    if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
+      Opc = AArch64::FMLSv2i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLSv2f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMLSv2f64_OP2:
+  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
+      Opc = AArch64::FMLSv2i64_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLSv2f64;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+
+  case MachineCombinerPattern::FMLSv4f32_OP2:
+  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+    RC = &AArch64::FPR128RegClass;
+    if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
+      Opc = AArch64::FMLSv4i32_indexed;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Indexed);
+    } else {
+      Opc = AArch64::FMLSv4f32;
+      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
+                             FMAInstKind::Accumulator);
+    }
+    break;
+  }
    } // end switch (Pattern)
    // Record MUL and ADD/SUB for deletion
    DelInstrs.push_back(MUL);
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h

index a592f91..353ef73 100644 (file)
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -174,6 +174,11 @@ public:
                              unsigned SrcReg2, int CmpMask, int CmpValue,
                              const MachineRegisterInfo *MRI) const override;
    bool optimizeCondBranch(MachineInstr *MI) const override;
+
+  /// Return true when a code sequence can improve throughput. It
+  /// should be called only for instructions in loops.
+  /// \param Pattern - combiner pattern
+  bool isThroughputPattern(MachineCombinerPattern Pattern) const override;
    /// Return true when there is potentially a faster code sequence
    /// for an instruction chain ending in <Root>. All potential patterns are
    /// listed in the <Patterns> array.
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp

index f402930..4e4aaf8 100644 (file)
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -51,3 +51,9 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
    }
    return SDValue();
  }
+// Returns true only when OptLevel is CodeGenOpt::Aggressive or higher;
+// every lower optimization level yields false.
+bool AArch64SelectionDAGInfo::GenerateFMAsInMachineCombiner(
+    CodeGenOpt::Level OptLevel) const {
+  return OptLevel >= CodeGenOpt::Aggressive;
+}
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h

index 8adb030..e61f177 100644 (file)
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -25,6 +25,7 @@ public:
                                    SDValue Dst, SDValue Src, SDValue Size,
                                    unsigned Align, bool isVolatile,
                                    MachinePointerInfo DstPtrInfo) const override;
+  bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
  };
  }
  
diff --git a/test/CodeGen/AArch64/arm64-fma-combines.ll b/test/CodeGen/AArch64/arm64-fma-combines.ll

new file mode 100644 (file)

index 0000000..ab875c0
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -0,0 +1,136 @@
+; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
+; foo_2d (fused multiply-add, double): the FileCheck directives expect two
+; scalar d-register fmul instructions after the %entry label, and then, after
+; the %for.body label, fmla.2d (vector), fmla.2d (by element 0) and fmla.d
+; (by element 0), matched in that order.
+define void @foo_2d(double* %src) {
+; CHECK-LABEL: %entry
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
+  %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
+  %tmp = bitcast double* %arrayidx1 to <2 x double>*
+  %tmp1 = load double, double* %arrayidx2, align 8
+  %tmp2 = load double, double* %arrayidx1, align 8
+  %fmul = fmul fast double %tmp1, %tmp1
+  %fmul2 = fmul fast double %tmp2, 0x3F94AFD6A052BF5B
+  %fadd = fadd fast double %fmul, %fmul2
+  br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next
+  %tmp3 = load double, double* %arrayidx3, align 8
+  %add = fadd fast double %tmp3, %tmp3
+  %mul = fmul fast double %add, %fadd
+  %e1 = insertelement <2 x double> undef, double %add, i32 0
+  %e2 = insertelement <2 x double> %e1, double %add, i32 1
+  %add2 = fadd fast <2 x double> %e2, <double 3.000000e+00, double -3.000000e+00>
+  %e3 = insertelement <2 x double> undef, double %mul, i32 0
+  %e4 = insertelement <2 x double> %e3, double %mul, i32 1
+  %mul2 = fmul fast <2 x double> %add2,<double 3.000000e+00, double -3.000000e+00>
+  %e5 = insertelement <2 x double> undef, double %add, i32 0
+  %e6 = insertelement <2 x double> %e5, double %add, i32 1
+  %add3 = fadd fast  <2 x double> %mul2, <double 3.000000e+00, double -3.000000e+00>
+  %mulx = fmul fast <2 x double> %add2, %e2
+  %addx = fadd fast  <2 x double> %mulx, %e4
+  %e7 = insertelement <2 x double> undef, double %mul, i32 0
+  %e8 = insertelement <2 x double> %e7, double %mul, i32 1
+  %e9 = fmul fast <2 x double>  %addx, %add3
+  store <2 x double> %e9, <2 x double>* %tmp, align 8
+  %e10 = extractelement <2 x double> %add3, i32 0
+  %mul3 = fmul fast double %mul, %e10
+  %add4 = fadd fast double %mul3, %mul
+  store double %add4, double* %arrayidx2, align 8
+  %exitcond = icmp eq i64 %indvars.iv.next, 25
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+; foo_2s (fused multiply-add, <2 x float>): after the %for.body label the
+; FileCheck directives expect fmla.2s (vector), fmla.2s (by element 0) and
+; fmla.s (by element 0), matched in that order.
+define void @foo_2s(float* %src) {
+entry:
+  %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+  %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+  %tmp = bitcast float* %arrayidx1 to <2 x float>*
+  br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+  %tmp1 = load float, float* %arrayidx3, align 8
+  %add = fadd fast float %tmp1, %tmp1
+  %mul = fmul fast float %add, %add
+  %e1 = insertelement <2 x float> undef, float %add, i32 0
+  %e2 = insertelement <2 x float> %e1, float %add, i32 1
+  %add2 = fadd fast <2 x float> %e2, <float 3.000000e+00, float -3.000000e+00>
+  %e3 = insertelement <2 x float> undef, float %mul, i32 0
+  %e4 = insertelement <2 x float> %e3, float %mul, i32 1
+  %mul2 = fmul fast <2 x float> %add2,<float 3.000000e+00, float -3.000000e+00>
+  %e5 = insertelement <2 x float> undef, float %add, i32 0
+  %e6 = insertelement <2 x float> %e5, float %add, i32 1
+  %add3 = fadd fast  <2 x float> %mul2, <float 3.000000e+00, float -3.000000e+00>
+  %mulx = fmul fast <2 x float> %add2, %e2
+  %addx = fadd fast  <2 x float> %mulx, %e4
+  %e7 = insertelement <2 x float> undef, float %mul, i32 0
+  %e8 = insertelement <2 x float> %e7, float %mul, i32 1
+  %e9 = fmul fast <2 x float>  %addx, %add3
+  store <2 x float> %e9, <2 x float>* %tmp, align 8
+  %e10 = extractelement <2 x float> %add3, i32 0
+  %mul3 = fmul fast float %mul, %e10
+  %add4 = fadd fast float %mul3, %mul
+  store float %add4, float* %arrayidx2, align 8
+  %exitcond = icmp eq i64 %indvars.iv.next, 25
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+; foo_4s (fused multiply-add, <4 x float>): after the %for.body label the
+; FileCheck directives expect fmla.4s (vector) followed by fmla.4s (by
+; element 0). Unlike the 2-lane tests there is no scalar by-element check here.
+define void @foo_4s(float* %src) {
+entry:
+  %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+  %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+  %tmp = bitcast float* %arrayidx1 to <4 x float>*
+  br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+  %tmp1 = load float, float* %arrayidx3, align 8
+  %add = fadd fast float %tmp1, %tmp1
+  %mul = fmul fast float %add, %add
+  %e1 = insertelement <4 x float> undef, float %add, i32 0
+  %e2 = insertelement <4 x float> %e1, float %add, i32 1
+  %add2 = fadd fast <4 x float> %e2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+  %e3 = insertelement <4 x float> undef, float %mul, i32 0
+  %e4 = insertelement <4 x float> %e3, float %mul, i32 1
+  %mul2 = fmul fast <4 x float> %add2,<float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+  %e5 = insertelement <4 x float> undef, float %add, i32 0
+  %e6 = insertelement <4 x float> %e5, float %add, i32 1
+  %add3 = fadd fast  <4 x float> %mul2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00> 
+  %mulx = fmul fast <4 x float> %add2, %e2
+  %addx = fadd fast  <4 x float> %mulx, %e4
+  %e7 = insertelement <4 x float> undef, float %mul, i32 0
+  %e8 = insertelement <4 x float> %e7, float %mul, i32 1
+  %e9 = fmul fast <4 x float>  %addx, %add3
+  store <4 x float> %e9, <4 x float>* %tmp, align 8
+  %e10 = extractelement <4 x float> %add3, i32 0
+  %mul3 = fmul fast float %mul, %e10
+  store float %mul3, float* %arrayidx2, align 8
+  %exitcond = icmp eq i64 %indvars.iv.next, 25
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fml-combines.ll b/test/CodeGen/AArch64/arm64-fml-combines.ll

new file mode 100644 (file)

index 0000000..840d1dc
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fml-combines.ll
@@ -0,0 +1,128 @@
+; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios  -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
+; foo_2d (fused multiply-subtract, <2 x double>): after the %for.body label
+; the FileCheck directives expect fmls.2d (vector), fmls.2d (by element 0)
+; and fmls.d (by element 0), matched in that order.
+define void @foo_2d(double* %src) {
+entry:
+  %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
+  %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
+  %tmp = bitcast double* %arrayidx1 to <2 x double>*
+  br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next
+  %tmp1 = load double, double* %arrayidx3, align 8
+  %add = fadd fast double %tmp1, %tmp1
+  %mul = fmul fast double %add, %add
+  %e1 = insertelement <2 x double> undef, double %add, i32 0
+  %e2 = insertelement <2 x double> %e1, double %add, i32 1
+  %sub2 = fsub fast <2 x double> %e2, <double 3.000000e+00, double -3.000000e+00>
+  %e3 = insertelement <2 x double> undef, double %mul, i32 0
+  %e4 = insertelement <2 x double> %e3, double %mul, i32 1
+  %mul2 = fmul fast <2 x double> %sub2,<double 3.000000e+00, double -3.000000e+00>
+  %e5 = insertelement <2 x double> undef, double %add, i32 0
+  %e6 = insertelement <2 x double> %e5, double %add, i32 1
+  %sub3 = fsub fast  <2 x double>  <double 3.000000e+00, double -3.000000e+00>, %mul2
+  %mulx = fmul fast <2 x double> %sub2, %e2
+  %subx = fsub fast  <2 x double> %e4, %mulx
+  %e7 = insertelement <2 x double> undef, double %mul, i32 0
+  %e8 = insertelement <2 x double> %e7, double %mul, i32 1
+  %e9 = fmul fast <2 x double>  %subx, %sub3
+  store <2 x double> %e9, <2 x double>* %tmp, align 8
+  %e10 = extractelement <2 x double> %sub3, i32 0
+  %mul3 = fmul fast double %mul, %e10
+  %sub4 = fsub fast double %mul, %mul3
+  store double %sub4, double* %arrayidx2, align 8
+  %exitcond = icmp eq i64 %indvars.iv.next, 25
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+; foo_2s (fused multiply-subtract, <2 x float>): after the %for.body label
+; the FileCheck directives expect fmls.2s (vector), fmls.2s (by element 0)
+; and fmls.s (by element 0), matched in that order.
+define void @foo_2s(float* %src) {
+entry:
+  %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+  %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+  %tmp = bitcast float* %arrayidx1 to <2 x float>*
+  br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+  %tmp1 = load float, float* %arrayidx3, align 8
+  %add = fadd fast float %tmp1, %tmp1
+  %mul = fmul fast float %add, %add
+  %e1 = insertelement <2 x float> undef, float %add, i32 0
+  %e2 = insertelement <2 x float> %e1, float %add, i32 1
+  %add2 = fsub fast <2 x float> %e2, <float 3.000000e+00, float -3.000000e+00>
+  %e3 = insertelement <2 x float> undef, float %mul, i32 0
+  %e4 = insertelement <2 x float> %e3, float %mul, i32 1
+  %mul2 = fmul fast <2 x float> %add2,<float 3.000000e+00, float -3.000000e+00>
+  %e5 = insertelement <2 x float> undef, float %add, i32 0
+  %e6 = insertelement <2 x float> %e5, float %add, i32 1
+  %add3 = fsub fast  <2 x float>  <float 3.000000e+00, float -3.000000e+00>, %mul2
+  %mulx = fmul fast <2 x float> %add2, %e2
+  %addx = fsub fast  <2 x float> %e4, %mulx
+  %e7 = insertelement <2 x float> undef, float %mul, i32 0
+  %e8 = insertelement <2 x float> %e7, float %mul, i32 1
+  %e9 = fmul fast <2 x float>  %addx, %add3
+  store <2 x float> %e9, <2 x float>* %tmp, align 8
+  %e10 = extractelement <2 x float> %add3, i32 0
+  %mul3 = fmul fast float %mul, %e10
+  %add4 = fsub fast float %mul, %mul3
+  store float %add4, float* %arrayidx2, align 8
+  %exitcond = icmp eq i64 %indvars.iv.next, 25
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+; foo_4s (fused multiply-subtract, <4 x float>): after the %for.body label
+; the FileCheck directives expect fmls.4s (vector) followed by fmls.4s (by
+; element 0). Unlike the 2-lane tests there is no scalar by-element check here.
+define void @foo_4s(float* %src) {
+entry:
+  %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
+  %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
+  %tmp = bitcast float* %arrayidx1 to <4 x float>*
+  br label %for.body
+
+; CHECK-LABEL: %for.body
+; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
+  %tmp1 = load float, float* %arrayidx3, align 8
+  %add = fadd fast float %tmp1, %tmp1
+  %mul = fmul fast float %add, %add
+  %e1 = insertelement <4 x float> undef, float %add, i32 0
+  %e2 = insertelement <4 x float> %e1, float %add, i32 1
+  %add2 = fadd fast <4 x float> %e2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+  %e3 = insertelement <4 x float> undef, float %mul, i32 0
+  %e4 = insertelement <4 x float> %e3, float %mul, i32 1
+  %mul2 = fmul fast <4 x float> %add2,<float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
+  %e5 = insertelement <4 x float> undef, float %add, i32 0
+  %e6 = insertelement <4 x float> %e5, float %add, i32 1
+  %add3 = fsub fast  <4 x float> <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00> , %mul2
+  %mulx = fmul fast <4 x float> %add2, %e2
+  %addx = fsub fast  <4 x float> %e4, %mulx
+  %e7 = insertelement <4 x float> undef, float %mul, i32 0
+  %e8 = insertelement <4 x float> %e7, float %mul, i32 1
+  %e9 = fmul fast <4 x float>  %addx, %add3
+  store <4 x float> %e9, <4 x float>* %tmp, align 8
+  %e10 = extractelement <4 x float> %add3, i32 0
+  %mul3 = fmul fast float %mul, %e10
+  store float %mul3, float* %arrayidx2, align 8
+  %exitcond = icmp eq i64 %indvars.iv.next, 25
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
author	Gerolf Hoflehner <ghoflehner@apple.com>
	Fri, 22 Apr 2016 02:15:19 +0000 (02:15 +0000)
committer	Gerolf Hoflehner <ghoflehner@apple.com>
	Fri, 22 Apr 2016 02:15:19 +0000 (02:15 +0000)
include/llvm/CodeGen/MachineCombinerPattern.h		patch \| blob \| history
include/llvm/CodeGen/SelectionDAGTargetInfo.h		patch \| blob \| history
include/llvm/Target/TargetInstrInfo.h		patch \| blob \| history
lib/CodeGen/MachineCombiner.cpp		patch \| blob \| history
lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
lib/CodeGen/TargetInstrInfo.cpp		patch \| blob \| history
lib/Target/AArch64/AArch64InstrInfo.cpp		patch \| blob \| history
lib/Target/AArch64/AArch64InstrInfo.h		patch \| blob \| history
lib/Target/AArch64/AArch64SelectionDAGInfo.cpp		patch \| blob \| history
lib/Target/AArch64/AArch64SelectionDAGInfo.h		patch \| blob \| history
test/CodeGen/AArch64/arm64-fma-combines.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/AArch64/arm64-fml-combines.ll	[new file with mode: 0644]	patch \| blob