From b2fd770136b92637c5f084b743eab29f910288d5 Mon Sep 17 00:00:00 2001
From: Cameron Zwarich <zwarich@apple.com>
Date: Wed, 9 Mar 2011 05:43:05 +0000
Subject: [PATCH] Add support to scalar replacement for partial vector accesses
 of an alloca, e.g. a union of a float, <2 x float>, and <4 x float>. This
 mostly comes up with the use of vector intrinsics, especially in NEON when
 programmers know the layout of the register file. This enables codegen to
 eliminate a lot of the subregister traffic it would otherwise generate.

This commit only enables this for a small number of floating-point cases, but a
lot more integer cases. I assume this is okay for all ports, but I did not do
extensive testing of the quality of code involving i512 vectors and the like. If
there is a use case where this generates worse code than before, let me know and
we can scale it back.

This fixes <rdar://problem/9036264>.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@127317 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/ScalarReplAggregates.cpp | 139 ++++++++++++++++++++++---
 test/Transforms/ScalarRepl/vector_promote.ll   |  59 +++++++++++
 2 files changed, 186 insertions(+), 12 deletions(-)
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index bc6035e1fae..1f64ad2606a 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -295,12 +295,16 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
 /// MergeInType - Add the 'In' type to the accumulated vector type (VectorTy)
 /// so far at the offset specified by Offset (which is specified in bytes).
 ///
-/// There are two cases we handle here:
+/// There are three cases we handle here:
 ///   1) A union of vector types of the same size and potentially its elements.
 ///      Here we turn element accesses into insert/extract element operations.
 ///      This promotes a <4 x float> with a store of float to the third element
 ///      into a <4 x float> that uses insert element.
-///   2) A fully general blob of memory, which we turn into some (potentially
+///   2) A union of vector types with power-of-2 size differences, e.g. a float,
+///      <2 x float> and <4 x float>.  Here we turn element accesses into insert
+///      and extract element operations, and <2 x float> accesses into a cast to
+///      <2 x double>, an extract, and a cast back to <2 x float>.
+///   3) A fully general blob of memory, which we turn into some (potentially
 ///      large) integer type with extract and insert operations where the loads
 ///      and stores would mutate the memory.  We mark this by setting VectorTy
 ///      to VoidTy.
@@ -346,18 +350,68 @@ bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy,
   // Remember if we saw a vector type.
   HadAVector = true;
 
-  if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
-    // If we're storing/loading a vector of the right size, allow it as a
-    // vector.  If this the first vector we see, remember the type so that
-    // we know the element size.  If this is a subsequent access, ignore it
-    // even if it is a differing type but the same size.  Worst case we can
-    // bitcast the resultant vectors.
-    if (VectorTy == 0)
-      VectorTy = VInTy;
+  // TODO: Support nonzero offsets?  Sub-byte vectors would divide by zero.
+  if (Offset != 0 || VInTy->getBitWidth() < 8)
+    return false;
+
+  // Only allow vectors that are a power-of-2 away from the size of the alloca.
+  if (!isPowerOf2_64(AllocaSize / (VInTy->getBitWidth() / 8)))
+    return false;
+
+  // If this is the first vector we see, remember the type so that we know the
+  // element size.
+  if (!VectorTy) {
+    VectorTy = VInTy;
     return true;
   }
 
-  return false;
+  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
+  unsigned InBitWidth = VInTy->getBitWidth();
+
+  // Vectors of the same size can be converted using a simple bitcast.
+  if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8))
+    return true;
+
+  const Type *ElementTy = cast<VectorType>(VectorTy)->getElementType();
+  const Type *InElementTy = VInTy->getElementType();
+
+  // Do not allow mixed integer and floating-point accesses from vectors of
+  // different sizes.
+  if (ElementTy->isFloatingPointTy() != InElementTy->isFloatingPointTy())
+    return false;
+
+  if (ElementTy->isFloatingPointTy()) {
+    // Only allow floating-point vectors of different sizes if they have the
+    // same element type.
+    // TODO: This could be loosened a bit, but would anything benefit?
+    if (ElementTy != InElementTy)
+      return false;
+
+    // There are no arbitrary-precision floating-point types, which limits the
+    // number of legal vector types with larger element types that we can form
+    // to bitcast and extract a subvector.
+    // TODO: We could support some more cases with mixed fp128 and double here.
+    if (!(BitWidth == 64 || BitWidth == 128) ||
+        !(InBitWidth == 64 || InBitWidth == 128))
+      return false;
+  } else {
+    assert(ElementTy->isIntegerTy() && "Vector elements must be either integer "
+                                       "or floating-point.");
+    unsigned EltBitWidth = ElementTy->getPrimitiveSizeInBits();
+    unsigned InEltBitWidth = InElementTy->getPrimitiveSizeInBits();
+
+    // Do not allow integer types smaller than a byte or types whose widths are
+    // not a multiple of a byte.
+    if (EltBitWidth < 8 || InEltBitWidth < 8 ||
+        EltBitWidth % 8 != 0 || InEltBitWidth % 8 != 0)
+      return false;
+  }
+
+  // Pick the largest of the two vector types.
+  if (InBitWidth > BitWidth)
+    VectorTy = VInTy;
+
+  return true;
 }
 
 /// CanConvertToScalar - V is a pointer.  If we can convert the pointee and all
@@ -586,6 +640,26 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
   }
 }
 
+/// getScaledElementType - Return the element type that is Scale times as wide
+/// as OldTy, for a partial vector access of an alloca.  OldTy must be an
+/// integer or float; the result is an integer, float or double.
+static const Type *getScaledElementType(const Type *OldTy, unsigned Scale) {
+  assert((OldTy->isIntegerTy() || OldTy->isFloatTy()) && "Partial vector "
+         "accesses must be scaled from integer or float elements.");
+
+  LLVMContext &Context = OldTy->getContext();
+  unsigned Size = OldTy->getPrimitiveSizeInBits() * Scale;
+
+  if (OldTy->isIntegerTy())
+    return Type::getIntNTy(Context, Size);
+  if (Size == 32)
+    return Type::getFloatTy(Context);
+  if (Size == 64)
+    return Type::getDoubleTy(Context);
+
+  llvm_unreachable("Invalid type for a partial vector access of an alloca!");
+}
+
 /// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
 /// or vector value FromVal, extracting the bits from the offset specified by
 /// Offset.  This returns the value, which is of type ToType.
@@ -606,8 +680,27 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
   // If the result alloca is a vector type, this is either an element
   // access or a bitcast to another vector type of the same size.
   if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) {
-    if (ToType->isVectorTy())
+    if (ToType->isVectorTy()) {
+      // A Scale of 1 is a same-size vector and only needs the bitcast below.
+      unsigned Scale = AllocaSize / TD.getTypeAllocSize(ToType);
+      if (Scale > 1 && isPowerOf2_64(Scale)) {
+        assert(Offset == 0 && "Can't extract a value of a smaller vector type "
+                              "from a nonzero offset.");
+
+        const Type *ToElementTy = cast<VectorType>(ToType)->getElementType();
+        const Type *CastElementTy = getScaledElementType(ToElementTy, Scale);
+        unsigned NumCastVectorElements = VTy->getNumElements() / Scale;
+        LLVMContext &Context = FromVal->getContext();
+        const Type *CastTy = VectorType::get(CastElementTy,
+                                             NumCastVectorElements);
+        Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp");
+        Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get(
+                                          Type::getInt32Ty(Context), 0), "tmp");
+        return Builder.CreateBitCast(Extract, ToType, "tmp");
+      }
+
       return Builder.CreateBitCast(FromVal, ToType, "tmp");
+    }
 
     // Otherwise it must be an element access.
     unsigned Elt = 0;
@@ -728,6 +821,28 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
     if (ValSize == VecSize)
       return Builder.CreateBitCast(SV, AllocaType, "tmp");
 
+    if (SV->getType()->isVectorTy() && isPowerOf2_64(VecSize / ValSize)) {
+      assert(Offset == 0 && "Can't insert a value of a smaller vector type at "
+                            "a nonzero offset.");
+
+      const Type *ToElementTy =
+        cast<VectorType>(SV->getType())->getElementType();
+      unsigned Scale = VecSize / ValSize;
+      const Type *CastElementTy = getScaledElementType(ToElementTy, Scale);
+      unsigned NumCastVectorElements = VTy->getNumElements() / Scale;
+
+      LLVMContext &Context = SV->getContext();
+      const Type *OldCastTy = VectorType::get(CastElementTy,
+                                              NumCastVectorElements);
+      Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp");
+
+      Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp");
+      Value *Insert =
+        Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get(
+                                    Type::getInt32Ty(Context), 0), "tmp");
+      return Builder.CreateBitCast(Insert, AllocaType, "tmp");
+    }
+
     uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
 
     // Must be an element insertion.
diff --git a/test/Transforms/ScalarRepl/vector_promote.ll b/test/Transforms/ScalarRepl/vector_promote.ll
index 37cb49f539d..898cbde3530 100644
--- a/test/Transforms/ScalarRepl/vector_promote.ll
+++ b/test/Transforms/ScalarRepl/vector_promote.ll
@@ -98,3 +98,62 @@ define i64 @test6(<2 x float> %X) {
 ; CHECK: ret i64
 }
 
+define float @test7(<4 x float> %x) {
+	%a = alloca <4 x float>
+	store <4 x float> %x, <4 x float>* %a
+	%p = bitcast <4 x float>* %a to <2 x float>*
+	%b = load <2 x float>* %p
+	%q = getelementptr <4 x float>* %a, i32 0, i32 2
+	%c = load float* %q
+	ret float %c
+; CHECK: @test7
+; CHECK-NOT: alloca
+; CHECK: bitcast <4 x float> %x to <2 x double>
+; CHECK-NEXT: extractelement <2 x double>
+; CHECK-NEXT: bitcast double %tmp4 to <2 x float>
+; CHECK-NEXT: extractelement <4 x float>
+}
+
+define void @test8(<4 x float> %x, <2 x float> %y) {
+	%a = alloca <4 x float>
+	store <4 x float> %x, <4 x float>* %a
+	%p = bitcast <4 x float>* %a to <2 x float>*
+	store <2 x float> %y, <2 x float>* %p
+	ret void
+; CHECK: @test8
+; CHECK-NOT: alloca
+; CHECK: bitcast <4 x float> %x to <2 x double>
+; CHECK-NEXT: bitcast <2 x float> %y to double
+; CHECK-NEXT: insertelement <2 x double>
+; CHECK-NEXT: bitcast <2 x double> %tmp2 to <4 x float>
+}
+
+define i256 @test9(<4 x i256> %x) {
+	%a = alloca <4 x i256>
+	store <4 x i256> %x, <4 x i256>* %a
+	%p = bitcast <4 x i256>* %a to <2 x i256>*
+	%b = load <2 x i256>* %p
+	%q = getelementptr <4 x i256>* %a, i32 0, i32 2
+	%c = load i256* %q
+	ret i256 %c
+; CHECK: @test9
+; CHECK-NOT: alloca
+; CHECK: bitcast <4 x i256> %x to <2 x i512>
+; CHECK-NEXT: extractelement <2 x i512>
+; CHECK-NEXT: bitcast i512 %tmp4 to <2 x i256>
+; CHECK-NEXT: extractelement <4 x i256>
+}
+
+define void @test10(<4 x i256> %x, <2 x i256> %y) {
+	%a = alloca <4 x i256>
+	store <4 x i256> %x, <4 x i256>* %a
+	%p = bitcast <4 x i256>* %a to <2 x i256>*
+	store <2 x i256> %y, <2 x i256>* %p
+	ret void
+; CHECK: @test10
+; CHECK-NOT: alloca
+; CHECK: bitcast <4 x i256> %x to <2 x i512>
+; CHECK-NEXT: bitcast <2 x i256> %y to i512
+; CHECK-NEXT: insertelement <2 x i512>
+; CHECK-NEXT: bitcast <2 x i512> %tmp2 to <4 x i256>
+}
-- 
2.11.0