bool HasSSE41 = Subtarget->hasSSE41();
bool HasAVX = Subtarget->hasAVX();
bool HasAVX2 = Subtarget->hasAVX2();
+ bool HasAVX512 = Subtarget->hasAVX512();
+ bool HasVLX = Subtarget->hasVLX();
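+ // AVX512VL (VLX) provides EVEX-encoded versions of the 128-bit and 256-bit
+ // vector moves (the Z128/Z256 opcodes); prefer those when available.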
bool IsNonTemporal = MMO && MMO->isNonTemporal();
// Get opcode and regclass of the output for the given load instruction.
break;
case MVT::f32:
if (X86ScalarSSEf32) {
- Opc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
+ Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
RC = &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
break;
case MVT::f64:
if (X86ScalarSSEf64) {
- Opc = HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
+ Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
RC = &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
return false;
case MVT::v4f32:
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
- Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+ HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
- Opc = HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
+ Opc = HasVLX ? X86::VMOVAPSZ128rm :
+ HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
else
- Opc = HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
+ Opc = HasVLX ? X86::VMOVUPSZ128rm :
+ HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
RC = &X86::VR128RegClass;
break;
case MVT::v2f64:
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
- Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+ HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
- Opc = HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
+ Opc = HasVLX ? X86::VMOVAPDZ128rm :
+ HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
else
- Opc = HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
+ Opc = HasVLX ? X86::VMOVUPDZ128rm :
+ HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
RC = &X86::VR128RegClass;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
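+ // For unmasked loads the element size encoded in the EVEX opcode (e.g.
+ // VMOVDQA64 vs. VMOVDQA32) is irrelevant, so the 64-bit form is used below.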
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
- Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+ HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
else if (Alignment >= 16)
- Opc = HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
+ Opc = HasVLX ? X86::VMOVDQA64Z128rm :
+ HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
else
- Opc = HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
+ Opc = HasVLX ? X86::VMOVDQU64Z128rm :
+ HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
RC = &X86::VR128RegClass;
break;
case MVT::v8f32:
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
- Opc = X86::VMOVNTDQAYrm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+ else if (Alignment >= 32)
+ Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
else
- Opc = (Alignment >= 32) ? X86::VMOVAPSYrm : X86::VMOVUPSYrm;
+ Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
RC = &X86::VR256RegClass;
break;
case MVT::v4f64:
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
- Opc = X86::VMOVNTDQAYrm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+ else if (Alignment >= 32)
+ Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
else
- Opc = (Alignment >= 32) ? X86::VMOVAPDYrm : X86::VMOVUPDYrm;
+ Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
RC = &X86::VR256RegClass;
break;
case MVT::v8i32:
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
- Opc = X86::VMOVNTDQAYrm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+ else if (Alignment >= 32)
+ Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
else
- Opc = (Alignment >= 32) ? X86::VMOVDQAYrm : X86::VMOVDQUYrm;
+ Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
RC = &X86::VR256RegClass;
break;
case MVT::v16f32:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
if (IsNonTemporal && Alignment >= 64)
Opc = X86::VMOVNTDQAZrm;
else
RC = &X86::VR512RegClass;
break;
case MVT::v8f64:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
if (IsNonTemporal && Alignment >= 64)
Opc = X86::VMOVNTDQAZrm;
else
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
// Note: There are a lot more choices based on type with AVX-512, but
// there's really no advantage when the load isn't masked.
if (IsNonTemporal && Alignment >= 64)
bool HasSSE2 = Subtarget->hasSSE2();
bool HasSSE4A = Subtarget->hasSSE4A();
bool HasAVX = Subtarget->hasAVX();
+ bool HasAVX512 = Subtarget->hasAVX512();
+ bool HasVLX = Subtarget->hasVLX();
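+ // As with loads, prefer the EVEX-encoded (Z/Z128/Z256) store forms when
+ // AVX-512 or VLX is available.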
bool IsNonTemporal = MMO && MMO->isNonTemporal();
// Get opcode and regclass of the output for the given store instruction.
if (IsNonTemporal && HasSSE4A)
Opc = X86::MOVNTSS;
else
- Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+ Opc = HasAVX512 ? X86::VMOVSSZmr :
+ HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
} else
Opc = X86::ST_Fp32m;
break;
if (IsNonTemporal && HasSSE4A)
Opc = X86::MOVNTSD;
else
- Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
+ Opc = HasAVX512 ? X86::VMOVSDZmr :
+ HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
} else
Opc = X86::ST_Fp64m;
break;
case MVT::v4f32:
if (Aligned) {
if (IsNonTemporal)
- Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
+ Opc = HasVLX ? X86::VMOVNTPSZ128mr :
+ HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
else
- Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
+ Opc = HasVLX ? X86::VMOVAPSZ128mr :
+ HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
} else
- Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
+ Opc = HasVLX ? X86::VMOVUPSZ128mr :
+ HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
break;
case MVT::v2f64:
if (Aligned) {
if (IsNonTemporal)
- Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
+ Opc = HasVLX ? X86::VMOVNTPDZ128mr :
+ HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
else
- Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
+ Opc = HasVLX ? X86::VMOVAPDZ128mr :
+ HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
} else
- Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
+ Opc = HasVLX ? X86::VMOVUPDZ128mr :
+ HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
break;
case MVT::v4i32:
case MVT::v2i64:
case MVT::v8i16:
case MVT::v16i8:
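+ // Likewise for stores: the element size in VMOVDQA64/VMOVDQU64 only matters
+ // when masking, so the 64-bit form covers all of these types.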
if (Aligned) {
if (IsNonTemporal)
- Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
+ Opc = HasVLX ? X86::VMOVNTDQZ128mr :
+ HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
else
- Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
+ Opc = HasVLX ? X86::VMOVDQA64Z128mr :
+ HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
} else
- Opc = HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
+ Opc = HasVLX ? X86::VMOVDQU64Z128mr :
+ HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
break;
case MVT::v8f32:
assert(HasAVX);
- if (Aligned)
- Opc = IsNonTemporal ? X86::VMOVNTPSYmr : X86::VMOVAPSYmr;
- else
- Opc = X86::VMOVUPSYmr;
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
+ else
+ Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
+ } else
+ Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
break;
case MVT::v4f64:
assert(HasAVX);
if (Aligned) {
- Opc = IsNonTemporal ? X86::VMOVNTPDYmr : X86::VMOVAPDYmr;
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
+ else
+ Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
} else
- Opc = X86::VMOVUPDYmr;
+ Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
break;
case MVT::v8i32:
case MVT::v4i64:
case MVT::v16i16:
case MVT::v32i8:
assert(HasAVX);
- if (Aligned)
- Opc = IsNonTemporal ? X86::VMOVNTDQYmr : X86::VMOVDQAYmr;
- else
- Opc = X86::VMOVDQUYmr;
+ if (Aligned) {
+ if (IsNonTemporal)
+ Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
+ else
+ Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
+ } else
+ Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
break;
case MVT::v16f32:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
if (Aligned)
Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
else
Opc = X86::VMOVUPSZmr;
break;
case MVT::v8f64:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
if (Aligned) {
Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
} else
case MVT::v16i32:
case MVT::v32i16:
case MVT::v64i8:
- assert(Subtarget->hasAVX512());
+ assert(HasAVX512);
// Note: There are a lot more choices based on type with AVX-512, but
// there's really no advantage when the store isn't masked.
if (Aligned)
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v16i8:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v16i8:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v16i8:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v16i8:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <16 x i8>, <16 x i8>* %V, align 16
ret <16 x i8> %0
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8i16:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v8i16:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v8i16:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v8i16:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <8 x i16>, <8 x i16>* %V, align 16
ret <8 x i16> %0
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4i32:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v4i32:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v4i32:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v4i32:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <4 x i32>, <4 x i32>* %V, align 16
ret <4 x i32> %0
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v2i64:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v2i64:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v2i64:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v2i64:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <2 x i64>, <2 x i64>* %V, align 16
ret <2 x i64> %0
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v16i8_unaligned:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v16i8_unaligned:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v16i8_unaligned:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v16i8_unaligned:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqu64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <16 x i8>, <16 x i8>* %V, align 4
ret <16 x i8> %0
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8i16_unaligned:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v8i16_unaligned:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v8i16_unaligned:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v8i16_unaligned:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqu64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <8 x i16>, <8 x i16>* %V, align 4
ret <8 x i16> %0
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4i32_unaligned:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v4i32_unaligned:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v4i32_unaligned:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v4i32_unaligned:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqu64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <4 x i32>, <4 x i32>* %V, align 4
ret <4 x i32> %0
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v2i64_unaligned:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v2i64_unaligned:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v2i64_unaligned:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v2i64_unaligned:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqu64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <2 x i64>, <2 x i64>* %V, align 4
ret <2 x i64> %0
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v16i8_abi_alignment:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v16i8_abi_alignment:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v16i8_abi_alignment:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v16i8_abi_alignment:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <16 x i8>, <16 x i8>* %V
ret <16 x i8> %0
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8i16_abi_alignment:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v8i16_abi_alignment:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v8i16_abi_alignment:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v8i16_abi_alignment:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <8 x i16>, <8 x i16>* %V
ret <8 x i16> %0
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4i32_abi_alignment:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v4i32_abi_alignment:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v4i32_abi_alignment:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v4i32_abi_alignment:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <4 x i32>, <4 x i32>* %V
ret <4 x i32> %0
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v2i64_abi_alignment:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v2i64_abi_alignment:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v2i64_abi_alignment:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v2i64_abi_alignment:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
+; SKX-NEXT: retq
entry:
%0 = load <2 x i64>, <2 x i64>* %V
ret <2 x i64> %0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v32i8:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v32i8:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %ymm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v32i8:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v32i8:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %ymm0
+; SKX-NEXT: retq
entry:
%0 = load <32 x i8>, <32 x i8>* %V, align 32
ret <32 x i8> %0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v16i16:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v16i16:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %ymm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v16i16:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v16i16:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %ymm0
+; SKX-NEXT: retq
entry:
%0 = load <16 x i16>, <16 x i16>* %V, align 32
ret <16 x i16> %0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8i32:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqu (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v8i32:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v8i32:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu (%rdi), %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v8i32:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqu64 (%rdi), %ymm0
+; SKX-NEXT: retq
entry:
%0 = load <8 x i32>, <8 x i32>* %V, align 16
ret <8 x i32> %0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4i64:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v4i64:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %ymm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v4i64:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v4i64:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %ymm0
+; SKX-NEXT: retq
entry:
%0 = load <4 x i64>, <4 x i64>* %V, align 32
ret <4 x i64> %0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v32i8_unaligned:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqu (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v32i8_unaligned:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v32i8_unaligned:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu (%rdi), %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v32i8_unaligned:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqu64 (%rdi), %ymm0
+; SKX-NEXT: retq
entry:
%0 = load <32 x i8>, <32 x i8>* %V, align 4
ret <32 x i8> %0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v16i16_unaligned:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqu (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v16i16_unaligned:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v16i16_unaligned:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu (%rdi), %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v16i16_unaligned:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqu64 (%rdi), %ymm0
+; SKX-NEXT: retq
entry:
%0 = load <16 x i16>, <16 x i16>* %V, align 4
ret <16 x i16> %0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8i32_unaligned:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqu (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v8i32_unaligned:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v8i32_unaligned:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu (%rdi), %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v8i32_unaligned:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqu64 (%rdi), %ymm0
+; SKX-NEXT: retq
entry:
%0 = load <8 x i32>, <8 x i32>* %V, align 4
ret <8 x i32> %0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4i64_unaligned:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqu (%rdi), %ymm0
-; AVX-NEXT: retq
+; AVXONLY-LABEL: test_v4i64_unaligned:
+; AVXONLY: # BB#0: # %entry
+; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0
+; AVXONLY-NEXT: retq
+;
+; KNL-LABEL: test_v4i64_unaligned:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu (%rdi), %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_v4i64_unaligned:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: vmovdqu64 (%rdi), %ymm0
+; SKX-NEXT: retq
entry:
%0 = load <4 x i64>, <4 x i64>* %V, align 4
ret <4 x i64> %0