return PrefLoopAlignment;
}
+ /// Should loops be aligned even when the function is marked OptSize (but not
+ /// MinSize).
+ /// Targets override this to return true when PrefLoopAlignment should still
+ /// be honored at -Os (e.g. the ARM M-class override below in this patch);
+ /// the default (false) preserves the old behavior of skipping loop alignment
+ /// for any size-optimized function.
+ virtual bool alignLoopsWithOptSize() const {
+ return false;
+ }
+
/// If the target has a standard location for the stack protector guard,
/// returns the address of that location. Otherwise, returns nullptr.
/// DEPRECATED: please override useLoadStackGuardNode and customize
// exclusively on the loop info here so that we can align backedges in
// unnatural CFGs and backedges that were introduced purely because of the
// loop rotations done during this layout pass.
- if (F->getFunction().optForSize())
+ if (F->getFunction().optForMinSize() ||
+ (F->getFunction().optForSize() && !TLI->alignLoopsWithOptSize()))
return;
BlockChain &FunctionChain = *BlockToChain[&F->front()];
if (FunctionChain.begin() == FunctionChain.end())
def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
"Prefer 32-bit Thumb instrs">;
+// PrefLoopAlignment is log2(bytes): a value of 2 requests 4-byte ("32-bit")
+// alignment for loop headers.
+def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment", "2",
+                                              "Prefer 32-bit alignment for loops">;
+
/// Some instructions update CPSR partially, which can add false dependency for
/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
/// mapped to a separate physical register. Avoid partial CPSR update for these
def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m,
ProcM3,
+ // 4-byte loop alignment, kept even at -Os via alignLoopsWithOptSize().
+ FeaturePrefLoopAlign32,
FeatureHasNoBranchPredictor]>;
def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m,
FeatureVFP4,
FeatureVFPOnlySP,
FeatureD16,
+ // 4-byte loop alignment, kept even at -Os via alignLoopsWithOptSize().
+ FeaturePrefLoopAlign32,
FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-m7", [ARMv7em,
FeatureFPARMv8,
FeatureD16,
FeatureVFPOnlySP,
+ // 4-byte loop alignment, kept even at -Os via alignLoopsWithOptSize().
+ FeaturePrefLoopAlign32,
FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-a32", [ARMv8a,
// Prefer likely predicted branches to selects on out-of-order cores.
PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
+ setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
+
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
Addr});
}
+
+// Keep honoring the preferred loop alignment at -Os on M-class subtargets;
+// every other subtarget falls back to the default of dropping loop alignment
+// in size-optimized functions.
+bool ARMTargetLowering::alignLoopsWithOptSize() const {
+ return Subtarget->isMClass();
+}
+
/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned
bool isLegalInterleavedAccessType(VectorType *VecTy,
const DataLayout &DL) const;
+ /// ARM keeps loop alignment at -Os on M-class cores; see the definition in
+ /// ARMISelLowering.cpp.
+ bool alignLoopsWithOptSize() const override;
+
/// Returns the number of interleaved accesses that will be generated when
/// lowering accesses of the given type.
unsigned getNumInterleavedAccesses(VectorType *VecTy,
/// operand cycle returned by the itinerary data for pre-ISel operands.
int PreISelOperandLatencyAdjustment = 2;
+ /// What alignment is preferred for loop bodies, in log2(bytes).
+ /// Zero means "no preference"; FeaturePrefLoopAlign32 sets this to 2,
+ /// i.e. 4-byte alignment.
+ unsigned PrefLoopAlignment = 0;
+
/// IsLittle - The target is Little Endian
bool IsLittle;
bool allowPositionIndependentMovt() const {
return isROPI() || !isTargetELF();
}
+
+ /// Preferred loop-body alignment in log2(bytes); 0 when the subtarget
+ /// expresses no preference.
+ unsigned getPrefLoopAlignment() const {
+ return PrefLoopAlignment;
+ }
};
} // end namespace llvm
--- /dev/null
+; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s
+
+; With optsize (-Os) the inner loop should still be aligned on these M-class
+; CPUs: ARMTargetLowering::alignLoopsWithOptSize() returns true for M-class,
+; so block placement emits ".p2align 2" (4-byte alignment) for the loop
+; header after the entry block's "movs".
+define void @test_loop_alignment(i32* %in, i32* %out) optsize {
+; CHECK-LABEL: test_loop_alignment:
+; CHECK: movs {{r[0-9]+}}, #0
+; CHECK: .p2align 2
+
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+ %in.addr = getelementptr inbounds i32, i32* %in, i32 %i
+ %lhs = load i32, i32* %in.addr, align 4
+ %res = mul nsw i32 %lhs, 5
+ %out.addr = getelementptr inbounds i32, i32* %out, i32 %i
+ store i32 %res, i32* %out.addr, align 4
+ %i.next = add i32 %i, 1
+ %done = icmp eq i32 %i.next, 1024
+ br i1 %done, label %end, label %loop
+
+end:
+ ret void
+}
+
+; With minsize (-Oz) loop alignment must be suppressed entirely, so no
+; ".p2align" directive may appear after the function label.
+define void @test_loop_alignment_minsize(i32* %in, i32* %out) minsize {
+; CHECK-LABEL: test_loop_alignment_minsize:
+; CHECK: movs {{r[0-9]+}}, #0
+; CHECK-NOT: .p2align
+
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+ %in.addr = getelementptr inbounds i32, i32* %in, i32 %i
+ %lhs = load i32, i32* %in.addr, align 4
+ %res = mul nsw i32 %lhs, 5
+ %out.addr = getelementptr inbounds i32, i32* %out, i32 %i
+ store i32 %res, i32* %out.addr, align 4
+ %i.next = add i32 %i, 1
+ %done = icmp eq i32 %i.next, 1024
+ br i1 %done, label %end, label %loop
+
+end:
+ ret void
+}