OSDN Git Service

X86InterleaveAccess: A fix for bug33826
author: Farhana Aleen <farhana.a.aleen@intel.com>
Fri, 21 Jul 2017 21:35:00 +0000 (21:35 +0000)
committer: Farhana Aleen <farhana.a.aleen@intel.com>
Fri, 21 Jul 2017 21:35:00 +0000 (21:35 +0000)
Reviewers: DavidKreitzer

Differential Revision: https://reviews.llvm.org/D35638

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@308784 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/X86/X86InterleavedAccess.cpp
test/Transforms/InterleavedAccess/X86/interleaved-accesses-64bits-avx.ll

index f0ed4bc..de3f672 100644 (file)
@@ -98,18 +98,22 @@ public:
 
 bool X86InterleavedAccessGroup::isSupported() const {
   VectorType *ShuffleVecTy = Shuffles[0]->getType();
-  uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
   Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
+  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
+  unsigned SupportedNumElem = 4;
+  unsigned WideInstSize;
 
   // Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
-  uint64_t ExpectedShuffleVecSize;
-  if (isa<LoadInst>(Inst))
-    ExpectedShuffleVecSize = 256;
-  else
-    ExpectedShuffleVecSize = 1024;
-
-  if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize ||
-      DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
+  if (isa<LoadInst>(Inst)) {
+    if (DL.getTypeSizeInBits(ShuffleVecTy) != SupportedNumElem * ShuffleElemSize)
+      return false;
+
+    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
+  } else
+    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
+
+  if (!Subtarget.hasAVX() || Factor != 4 || ShuffleElemSize != 64 ||
+      WideInstSize != (Factor * ShuffleElemSize * SupportedNumElem))
     return false;
 
   return true;
@@ -137,8 +141,9 @@ void X86InterleavedAccessGroup::decompose(
     for (unsigned i = 0; i < NumSubVectors; ++i)
       DecomposedVectors.push_back(
           cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
-              Op0, Op1, createSequentialMask(Builder, Indices[i],
-                                             SubVecTy->getVectorNumElements(), 0))));
+              Op0, Op1,
+              createSequentialMask(Builder, Indices[i],
+                                   SubVecTy->getVectorNumElements(), 0))));
     return;
   }
 
@@ -219,8 +224,8 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
   // Lower the interleaved stores:
   //   1. Decompose the interleaved wide shuffle into individual shuffle
   //   vectors.
-  decompose(Shuffles[0], Factor,
-            VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors);
+  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
+            DecomposedVectors);
 
   //   2. Transpose the interleaved-vectors into vectors of contiguous
   //      elements.
index dcef57f..1a48be2 100644 (file)
@@ -217,3 +217,20 @@ define void @store_factorf64_4_arbitraryMask(<16 x double>* %ptr, <16 x double>
   store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
   ret void
 }
+
+; This verifies whether the test passes and does not hit any assertions.
+; Today, X86InterleavedAccess could have handled this case and
+; generate transposed sequence by extending the current implementation
+; which would be creating dummy vectors of undef. But it decided not to
+; optimize these cases where the load-size is less than Factor * NumberOfElements.
+; Because a better sequence can easily be generated by CG.
+
+@a = local_unnamed_addr global <4 x double> zeroinitializer, align 32
+; Function Attrs: norecurse nounwind readonly uwtable
+define <4 x double> @test_unhandled(<4 x double> %b) {
+entry:
+  %0 = load <4 x double>, <4 x double>* @a, align 32
+  %1 = shufflevector <4 x double> %0, <4 x double> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x double> %1, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}