; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
%cmp = icmp eq <16 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
%cmp = icmp eq <16 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
%cmp = icmp eq <16 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
%cmp = icmp eq <16 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
%cmp = icmp eq <32 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
%cmp = icmp eq <32 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
%cmp = icmp eq <32 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
%cmp = icmp eq <32 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
%cmp = icmp eq <64 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
%cmp = icmp eq <64 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
%cmp = icmp eq <64 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
%cmp = icmp eq <64 x i8> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
%cmp = icmp eq <8 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
%cmp = icmp eq <32 x i16> %mask, zeroinitializer
; SKX-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.25]
+; SKX-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
; SKX-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.25]
+; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec = load <32 x i16>, <32 x i16>* %vp
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
%cmp = icmp eq <16 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <16 x i32>, <16 x i32>* %vec2p
%shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
-; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i64>, <8 x i64>* %vec2p
%shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
%cmp = icmp eq <2 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
%cmp = icmp eq <2 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
-; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
-; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%cmp = icmp eq <4 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x float>, <4 x float>* %vec2p
%shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
%cmp = icmp eq <2 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
%cmp = icmp eq <2 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <2 x double>, <2 x double>* %vec2p
%shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
%cmp = icmp eq <8 x i64> %mask, zeroinitializer
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x double>, <8 x double>* %vec2p
%shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>