-; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
; #include <stdint.h>
;
; return sum;
; }
-; NOSTORE-LABEL: add_red
-; NOSTORE: fmul <4 x float>
-; NOSTORE: shufflevector <4 x float>
+; CHECK-LABEL: add_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
define i32 @add_red(float* %A, i32 %n) {
entry:
; }
; CHECK-LABEL: long_red
-; CHECK: fmul fast <4 x float>
-; CHECK: shufflevector <4 x float>
+; CHECK: fmul fast <8 x float>
+; CHECK: shufflevector <8 x float>
define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
entry:
ret i32 %sum.0.lcssa
}
+; void foo(const float *arg_A, unsigned arg_B, float *array) {
+; for (uint32_t i = 0; i < 6; ++i) {
+; const float *ptr = arg_A + i;
+; float w0 = array[i * 4 + 0];
+; float w1 = array[i * 4 + 1];
+; float w2 = array[i * 4 + 2];
+; float w3 = array[i * 4 + 3];
+;
+; for (unsigned j = 0; j < arg_B; ++j) {
+; const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
+; const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
+; const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
+; const float x4 = x3 + (-4.0f * w2) + w3;
+; w1 = w0;
+; w0 = x1;
+; w3 = w2;
+; w2 = x3;
+; }
+;
+; array[i * 4 + 0] = w0;
+; array[i * 4 + 1] = w1;
+; array[i * 4 + 2] = w2;
+; array[i * 4 + 3] = w3;
+; }
+; }
+
+define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
+; CHECK-LABEL: @foo(
+; CHECK: fmul fast <4 x float>
+; CHECK: shufflevector <4 x float>
+;
+; IR for the C source in the comment above: the outer loop (for.body) runs
+; i = 0..5, loading array[i*4 + 0..3]; the inner loop (for.body16) iterates
+; the recurrence %arg_B times; the four parallel stores in for.cond.cleanup15
+; write the (possibly updated) values back and seed the vectorization.
+entry:
+ %cmp1495 = icmp eq i32 %arg_B, 0
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup15
+ ret void
+
+; %0 = i*4; the 'or' offsets below are equivalent to 'add' because %0 is a
+; multiple of 4, so the low two bits are clear. Loads w0..w3 = array[i*4+0..3].
+for.body: ; preds = %for.cond.cleanup15, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
+ %0 = shl i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds float, float* %array, i64 %0
+ %1 = load float, float* %arrayidx, align 4
+ %2 = or i64 %0, 1
+ %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
+ %3 = load float, float* %arrayidx4, align 4
+ %4 = or i64 %0, 2
+ %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
+ %5 = load float, float* %arrayidx8, align 4
+ %6 = or i64 %0, 3
+ %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
+ %7 = load float, float* %arrayidx12, align 4
+ br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph
+
+; Inner-loop preheader: *ptr (= arg_A[i]) is loop-invariant, loaded once.
+for.body16.lr.ph: ; preds = %for.body
+ %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
+ %8 = load float, float* %add.ptr, align 4
+ br label %for.body16
+
+; Outer-loop latch: merge the live-out w0..w3 (either the unmodified loads
+; when arg_B == 0, or the inner-loop results) and store them back to the
+; same four slots of 'array'. Outer trip count is fixed at 6.
+for.cond.cleanup15: ; preds = %for.body16, %for.body
+ %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
+ %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
+ %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
+ %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
+ store float %w0.0.lcssa, float* %arrayidx, align 4
+ store float %w1.0.lcssa, float* %arrayidx4, align 4
+ store float %w2.0.lcssa, float* %arrayidx8, align 4
+ store float %w3.0.lcssa, float* %arrayidx12, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond109 = icmp eq i64 %indvars.iv.next, 6
+ br i1 %exitcond109, label %for.cond.cleanup, label %for.body
+
+; Inner recurrence, run arg_B times. The hexadecimal float constants are the
+; coefficients from the C source above (1.1, -1.2, 2.1, -2.2, 2.3, 3.1, -3.2
+; widened from single precision); all arithmetic carries 'fast' flags.
+for.body16: ; preds = %for.body16, %for.body16.lr.ph
+ %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
+ %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
+ %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
+ %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
+ %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
+ %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
+ %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
+ %sub92 = fadd fast float %mul17, %mul18.neg
+ %sub19 = fadd fast float %sub92, %8
+ %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
+ %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
+ %mul23 = fmul fast float %w1.099, 0x4002666660000000
+ %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
+ %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
+ %add2293 = fadd fast float %mul27.neg, %mul25
+ %add24 = fadd fast float %add2293, %mul23
+ %sub2694 = fadd fast float %add24, %mul21.neg
+ %sub28 = fadd fast float %sub2694, %mul20
+ %inc = add nuw i32 %j.098, 1
+ %exitcond = icmp eq i32 %inc, %arg_B
+ br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
+}
+
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
+
+; void foo(double * restrict A, double * restrict B, double * restrict C,
+; int n) {
+; for (intptr_t i=0; i < n; ++i) {
+; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1];
+; }
+; }
+
+; STORE-LABEL: store_red_double
+; STORE: fmul fast <2 x double>
+; STORE: extractelement <2 x double>
+; STORE: extractelement <2 x double>
+
+define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
+; IR for the C source in the comment above:
+; C[i] = B[0]*A[i*4] + B[1]*A[i*4+1], for i = 0..n-1 (guarded by n > 0).
+entry:
+ %cmp17 = icmp sgt i32 %n, 0
+ br i1 %cmp17, label %for.body.lr.ph, label %for.end
+
+; Preheader: B[0] and B[1] are loop-invariant, loaded once; trip count is
+; the sign-extended %n.
+for.body.lr.ph:
+ %0 = load double, double* %B, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
+ %1 = load double, double* %arrayidx4, align 8
+ %2 = sext i32 %n to i64
+ br label %for.body
+
+; %mul = i*4; the 'or' is equivalent to 'add 1' because %mul is a multiple
+; of 4. Each iteration computes B[0]*A[4i] + B[1]*A[4i+1] and stores to C[i].
+for.body:
+ %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %mul = shl nsw i64 %i.018, 2
+ %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
+ %3 = load double, double* %arrayidx2, align 8
+ %mul3 = fmul fast double %0, %3
+ %add16 = or i64 %mul, 1
+ %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
+ %4 = load double, double* %arrayidx6, align 8
+ %mul7 = fmul fast double %1, %4
+ %add8 = fadd fast double %mul3, %mul7
+ %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
+ store double %add8, double* %arrayidx9, align 8
+ %inc = add nsw i64 %i.018, 1
+ %exitcond = icmp eq i64 %inc, %2
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
; float sum = 0;
; for (intptr_t i=0; i < n; ++i) {
; return sum;
; }
-; CHECK-LABEL: store_red
-; CHECK: fmul fast <4 x float>
-; CHECK: shufflevector <4 x float>
+; STORE-LABEL: store_red
+; STORE: fmul fast <4 x float>
+; STORE: shufflevector <4 x float>
define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
entry:
ret i32 0
}
-
-; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
-
-; void foo(double * restrict A, double * restrict B, double * restrict C,
-; int n) {
-; for (intptr_t i=0; i < n; ++i) {
-; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1];
-; }
-; }
-
-; STORE-LABEL: store_red_double
-; STORE: fmul fast <2 x double>
-; STORE: extractelement <2 x double>
-; STORE: extractelement <2 x double>
-
-define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
-entry:
- %cmp17 = icmp sgt i32 %n, 0
- br i1 %cmp17, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:
- %0 = load double, double* %B, align 8
- %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
- %1 = load double, double* %arrayidx4, align 8
- %2 = sext i32 %n to i64
- br label %for.body
-
-for.body:
- %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
- %mul = shl nsw i64 %i.018, 2
- %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
- %3 = load double, double* %arrayidx2, align 8
- %mul3 = fmul fast double %0, %3
- %add16 = or i64 %mul, 1
- %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
- %4 = load double, double* %arrayidx6, align 8
- %mul7 = fmul fast double %1, %4
- %add8 = fadd fast double %mul3, %mul7
- %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
- store double %add8, double* %arrayidx9, align 8
- %inc = add nsw i64 %i.018, 1
- %exitcond = icmp eq i64 %inc, %2
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}