merge vector stores into wider vector stores and fix AArch64 misaligned access TLI...

author Sanjay Patel <spatel@rotateright.com>

Fri, 25 Sep 2015 21:49:48 +0000 (21:49 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Fri, 25 Sep 2015 21:49:48 +0000 (21:49 +0000)
author Sanjay Patel <spatel@rotateright.com>
Fri, 25 Sep 2015 21:49:48 +0000 (21:49 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Fri, 25 Sep 2015 21:49:48 +0000 (21:49 +0000)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

index ad60b82..01ca288 100644 (file)
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11072,8 +11072,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
    if (ElementSizeBytes * 8 != MemVT.getSizeInBits())
      return false;
  
-  // Don't merge vectors into wider inputs.
-  if (MemVT.isVector() || !MemVT.isSimple())
+  if (!MemVT.isSimple())
      return false;
  
    // Perform an early exit check. Do not bother looking at stored values that
@@ -11082,9 +11081,16 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
    bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
    bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
                         isa<ConstantFPSDNode>(StoredVal);
-  bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+  bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+                          StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);
  
-  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc)
+  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
+    return false;
+
+  // Don't merge vectors into wider vectors if the source data comes from loads.
+  // TODO: This restriction can be lifted by using logic similar to the
+  // ExtractVecSrc case.
+  if (MemVT.isVector() && IsLoadSrc)
      return false;
  
    // Only look at ends of store sequences.
@@ -11217,29 +11223,36 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
  
    // When extracting multiple vector elements, try to store them
    // in one vector store rather than a sequence of scalar stores.
-  if (IsExtractVecEltSrc) {
-    unsigned NumElem = 0;
+  if (IsExtractVecSrc) {
+    unsigned NumStoresToMerge = 0;
+    bool IsVec = MemVT.isVector();
      for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
        StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
-      SDValue StoredVal = St->getValue();
+      unsigned StoreValOpcode = St->getValue().getOpcode();
        // This restriction could be loosened.
        // Bail out if any stored values are not elements extracted from a vector.
        // It should be possible to handle mixed sources, but load sources need
        // more careful handling (see the block of code below that handles
        // consecutive loads).
-      if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT &&
+          StoreValOpcode != ISD::EXTRACT_SUBVECTOR)
          return false;
  
        // Find a legal type for the vector store.
-      EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
+      unsigned Elts = i + 1;
+      if (IsVec) {
+        // When merging vector stores, get the total number of elements.
+        Elts *= MemVT.getVectorNumElements();
+      }
+      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
        bool IsFast;
        if (TLI.isTypeLegal(Ty) &&
            TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
                                   FirstStoreAlign, &IsFast) && IsFast)
-        NumElem = i + 1;
+        NumStoresToMerge = i + 1;
      }
  
-    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumStoresToMerge,
                                             false, true);
    }
  
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp

index cba7c5b..f27c326 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -796,9 +796,25 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                             bool *Fast) const {
    if (Subtarget->requiresStrictAlign())
      return false;
-  // FIXME: True for Cyclone, but not necessary others.
-  if (Fast)
-    *Fast = true;
+
+  // FIXME: This is mostly true for Cyclone, but not necessarily others.
+  if (Fast) {
+    // FIXME: Define an attribute for slow unaligned accesses instead of
+    // relying on the CPU type as a proxy.
+    // On Cyclone, unaligned 128-bit stores are slow.
+    *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+            // See comments in performSTORECombine() for more details about
+            // these conditions.
+
+            // Code that uses clang vector extensions can mark that it
+            // wants unaligned accesses to be treated as fast by
+            // underspecifying alignment to be 1 or 2.
+            Align <= 2 ||
+
+            // Disregard v2i64. Memcpy lowering produces those and splitting
+            // them regresses performance on micro-benchmarks and olden/bh.
+            VT == MVT::v2i64;
+  }
    return true;
  }
  
@@ -8435,6 +8451,10 @@ static SDValue performSTORECombine(SDNode *N,
    if (S->isVolatile())
      return SDValue();
  
+  // FIXME: The logic for deciding if an unaligned store should be split should
+  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
+  // a call to that function here.
+
    // Cyclone has bad performance on unaligned 16B stores when crossing line and
    // page boundaries. We want to split such stores.
    if (!Subtarget->isCyclone())
diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll

index 18dbad4..86f5edd 100644 (file)
--- a/test/CodeGen/AArch64/merge-store.ll
+++ b/test/CodeGen/AArch64/merge-store.ll
@@ -1,4 +1,5 @@
  ; RUN: llc -march aarch64 %s -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown  -mcpu=cyclone  | FileCheck %s --check-prefix=CYCLONE
  
  @g0 = external global <3 x float>, align 16
  @g1 = external global <3 x float>, align 4
@@ -18,3 +19,32 @@ define void @blam() {
    store float %tmp9, float* %tmp7
    ret void;
  }
+
+
+; PR21711 - Merge vector stores into wider vector stores.
+
+; On Cyclone, the stores should not get merged into a 16-byte store because
+; unaligned 16-byte stores are slow. This test would infinite loop when
+; the fastness of unaligned accesses was not specified correctly.
+
+define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
+  %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
+  %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4
+
+  %shuffle0 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuffle1 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+
+  store <2 x float> %shuffle0, <2 x float>* %idx0, align 8
+  store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
+  ret void
+
+; CHECK-LABEL:    merge_vec_extract_stores
+; CHECK:          stur   q0, [x0, #24]
+; CHECK-NEXT:     ret
+
+; CYCLONE-LABEL:    merge_vec_extract_stores
+; CYCLONE:          ext   v1.16b, v0.16b, v0.16b, #8
+; CYCLONE-NEXT:     str   d0, [x0, #24]
+; CYCLONE-NEXT:     str   d1, [x0, #32]
+; CYCLONE-NEXT:     ret
+}
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll

index c25fdf5..6670bbe 100644 (file)
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -478,10 +478,8 @@ define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x flo
    ret void
  
  ; CHECK-LABEL: merge_vec_extract_stores
-; CHECK:      vmovaps %xmm0, 48(%rdi)
-; CHECK-NEXT: vextractf128 $1, %ymm0, 64(%rdi)
-; CHECK-NEXT: vmovaps %xmm1, 80(%rdi)
-; CHECK-NEXT: vextractf128 $1, %ymm1, 96(%rdi)
+; CHECK:      vmovups %ymm0, 48(%rdi)
+; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
  ; CHECK-NEXT: vzeroupper
  ; CHECK-NEXT: retq
  }
author	Sanjay Patel <spatel@rotateright.com>
	Fri, 25 Sep 2015 21:49:48 +0000 (21:49 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Fri, 25 Sep 2015 21:49:48 +0000 (21:49 +0000)
lib/CodeGen/SelectionDAG/DAGCombiner.cpp		patch \| blob \| history
lib/Target/AArch64/AArch64ISelLowering.cpp		patch \| blob \| history
test/CodeGen/AArch64/merge-store.ll		patch \| blob \| history
test/CodeGen/X86/MergeConsecutiveStores.ll		patch \| blob \| history