bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
-  uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
+  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
+  unsigned SupportedNumElem = 4;
+  unsigned WideInstSize;
  // Currently, lowering is supported for 4-element vectors of 64-bit elements
  // on AVX.
-  uint64_t ExpectedShuffleVecSize;
-  if (isa<LoadInst>(Inst))
-    ExpectedShuffleVecSize = 256;
-  else
-    ExpectedShuffleVecSize = 1024;
-
-  if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize ||
-      DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
+  if (isa<LoadInst>(Inst)) {
+    if (DL.getTypeSizeInBits(ShuffleVecTy) != SupportedNumElem * ShuffleElemSize)
+      return false;
+
+    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
+  } else
+    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
+
+  if (!Subtarget.hasAVX() || Factor != 4 || ShuffleElemSize != 64 ||
+      WideInstSize != (Factor * ShuffleElemSize * SupportedNumElem))
    return false;
  return true;
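  // For illustration only (an editorial sketch, not part of this patch): with
  // Factor == 4 and 64-bit elements, the checks above require AVX, a wide
  // instruction of Factor * ShuffleElemSize * SupportedNumElem = 4 * 64 * 4 =
  // 1024 bits and, for loads, 256-bit shuffle results. In IR terms (the value
  // names here are hypothetical) that corresponds to something like:
  //   %wide = load <16 x double>, <16 x double>* %ptr, align 16
  //   %v0 = shufflevector <16 x double> %wide, <16 x double> undef,
  //                       <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  //   %v1 = shufflevector <16 x double> %wide, <16 x double> undef,
  //                       <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  //   ...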
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
-              Op0, Op1, createSequentialMask(Builder, Indices[i],
-                                             SubVecTy->getVectorNumElements(), 0))));
+              Op0, Op1,
+              createSequentialMask(Builder, Indices[i],
+                                   SubVecTy->getVectorNumElements(), 0))));
    return;
  }
  // Lower the interleaved stores:
  // 1. Decompose the interleaved wide shuffle into individual shuffle
  // vectors.
-  decompose(Shuffles[0], Factor,
-            VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors);
+  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
+            DecomposedVectors);
  // 2. Transpose the interleaved-vectors into vectors of contiguous
  // elements.
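  // A conceptual sketch of step 2 (editorial illustration, not code from this
  // patch): treating the four decomposed 4-element vectors as the rows of a
  // 4x4 matrix of lanes, the transpose makes lane j of output vector i equal
  // to lane i of input vector j:
  //   In:  <x00 x01 x02 x03> <x10 x11 x12 x13> <x20 x21 x22 x23> <x30 x31 x32 x33>
  //   Out: <x00 x10 x20 x30> <x01 x11 x21 x31> <x02 x12 x22 x32> <x03 x13 x23 x33>
  // The transposed vectors can then be concatenated and written back with a
  // single wide store.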
  store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
  ret void
}
+
+; This test only verifies that we do not crash or hit any assertions.
+; X86InterleavedAccess could have handled this case by extending the current
+; implementation to create dummy undef vectors and generate a transposed
+; sequence, but we chose not to optimize cases where the load size is smaller
+; than Factor * NumberOfElements, because codegen can easily produce a better
+; sequence for them.
+
+@a = local_unnamed_addr global <4 x double> zeroinitializer, align 32
+; Function Attrs: norecurse nounwind readonly uwtable
+define <4 x double> @test_unhandled(<4 x double> %b) {
+entry:
+  %0 = load <4 x double>, <4 x double>* @a, align 32
+  %1 = shufflevector <4 x double> %0, <4 x double> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x double> %1, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
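
; A quick check of the numbers (an editorial note, not from the original test
; comments): the load above is <4 x double>, i.e. only 4 * 64 = 256 bits, while
; isSupported() demands a wide instruction of
; Factor * ShuffleElemSize * SupportedNumElem = 4 * 64 * 4 = 1024 bits
; (assuming the detected factor is 4), so this group is rejected and left to
; default codegen.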