From 610e45c3d24230238bfd9a5b09decac0ce4d8adc Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 5 Sep 2016 23:58:40 +0000
Subject: [PATCH] [AVX-512] Teach fastisel load/store handling to use EVEX
 encoded instructions for 128/256-bit vectors and scalar single/double.

Still need to fix the register classes to allow the extended range of
registers.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@280682 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FastISel.cpp        | 123 ++++++++----
 test/CodeGen/X86/fast-isel-store.ll   |  44 ++++-
 test/CodeGen/X86/fast-isel-vecload.ll | 360 ++++++++++++++++++++++++++--------
 3 files changed, 395 insertions(+), 132 deletions(-)

diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 74e6e1baa0d..41651d26148 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -351,6 +351,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
   bool HasSSE41 = Subtarget->hasSSE41();
   bool HasAVX = Subtarget->hasAVX();
   bool HasAVX2 = Subtarget->hasAVX2();
+  bool HasAVX512 = Subtarget->hasAVX512();
+  bool HasVLX = Subtarget->hasVLX();
   bool IsNonTemporal = MMO && MMO->isNonTemporal();

   // Get opcode and regclass of the output for the given load instruction.
@@ -378,7 +380,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
     break;
   case MVT::f32:
     if (X86ScalarSSEf32) {
-      Opc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
+      Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
       RC = &X86::FR32RegClass;
     } else {
       Opc = X86::LD_Fp32m;
@@ -387,7 +389,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
     break;
   case MVT::f64:
     if (X86ScalarSSEf64) {
-      Opc = HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
+      Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
       RC = &X86::FR64RegClass;
     } else {
       Opc = X86::LD_Fp64m;
@@ -399,20 +401,26 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
     return false;
   case MVT::v4f32:
     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
-      Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
     else if (Alignment >= 16)
-      Opc = HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
+      Opc = HasVLX ? X86::VMOVAPSZ128rm :
+            HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
     else
-      Opc = HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
+      Opc = HasVLX ? X86::VMOVUPSZ128rm :
+            HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
     RC = &X86::VR128RegClass;
     break;
   case MVT::v2f64:
     if (IsNonTemporal && Alignment >= 16 && HasSSE41)
-      Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
     else if (Alignment >= 16)
-      Opc = HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
+      Opc = HasVLX ? X86::VMOVAPDZ128rm :
+            HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
     else
-      Opc = HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
+      Opc = HasVLX ? X86::VMOVUPDZ128rm :
+            HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
     RC = &X86::VR128RegClass;
     break;
   case MVT::v4i32:
@@ -420,27 +428,34 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
   case MVT::v2i64:
   case MVT::v8i16:
   case MVT::v16i8:
     if (IsNonTemporal && Alignment >= 16)
-      Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+      Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
+            HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
     else if (Alignment >= 16)
-      Opc = HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
+      Opc = HasVLX ? X86::VMOVDQA64Z128rm :
+            HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
     else
-      Opc = HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
+      Opc = HasVLX ? X86::VMOVDQU64Z128rm :
+            HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
     RC = &X86::VR128RegClass;
     break;
   case MVT::v8f32:
     assert(HasAVX);
     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
-      Opc = X86::VMOVNTDQAYrm;
+      Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+    else if (Alignment >= 32)
+      Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
     else
-      Opc = (Alignment >= 32) ? X86::VMOVAPSYrm : X86::VMOVUPSYrm;
+      Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
     RC = &X86::VR256RegClass;
     break;
   case MVT::v4f64:
     assert(HasAVX);
     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
       Opc = X86::VMOVNTDQAYrm;
+    else if (Alignment >= 32)
+      Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
     else
-      Opc = (Alignment >= 32) ? X86::VMOVAPDYrm : X86::VMOVUPDYrm;
+      Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
     RC = &X86::VR256RegClass;
     break;
   case MVT::v8i32:
@@ -450,12 +465,14 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
     assert(HasAVX);
     if (IsNonTemporal && Alignment >= 32 && HasAVX2)
       Opc = X86::VMOVNTDQAYrm;
+    else if (Alignment >= 32)
+      Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
     else
-      Opc = (Alignment >= 32) ? X86::VMOVDQAYrm : X86::VMOVDQUYrm;
+      Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
     RC = &X86::VR256RegClass;
     break;
   case MVT::v16f32:
-    assert(Subtarget->hasAVX512());
+    assert(HasAVX512);
     if (IsNonTemporal && Alignment >= 64)
       Opc = X86::VMOVNTDQAZrm;
     else
@@ -463,7 +480,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
     RC = &X86::VR512RegClass;
     break;
   case MVT::v8f64:
-    assert(Subtarget->hasAVX512());
+    assert(HasAVX512);
     if (IsNonTemporal && Alignment >= 64)
       Opc = X86::VMOVNTDQAZrm;
     else
@@ -474,7 +491,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
   case MVT::v16i32:
   case MVT::v32i16:
   case MVT::v64i8:
-    assert(Subtarget->hasAVX512());
+    assert(HasAVX512);
     // Note: There are a lot more choices based on type with AVX-512, but
     // there's really no advantage when the load isn't masked.
     if (IsNonTemporal && Alignment >= 64)
@@ -504,6 +521,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
   bool HasSSE2 = Subtarget->hasSSE2();
   bool HasSSE4A = Subtarget->hasSSE4A();
   bool HasAVX = Subtarget->hasAVX();
+  bool HasAVX512 = Subtarget->hasAVX512();
+  bool HasVLX = Subtarget->hasVLX();
   bool IsNonTemporal = MMO && MMO->isNonTemporal();

   // Get opcode and regclass of the output for the given store instruction.
@@ -534,7 +553,8 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
       if (IsNonTemporal && HasSSE4A)
         Opc = X86::MOVNTSS;
       else
-        Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+        Opc = HasAVX512 ? X86::VMOVSSZmr :
+              HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
     } else
       Opc = X86::ST_Fp32m;
     break;
@@ -543,27 +563,34 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
       if (IsNonTemporal && HasSSE4A)
         Opc = X86::MOVNTSD;
       else
-        Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
+        Opc = HasAVX512 ? X86::VMOVSDZmr :
+              HasAVX ? X86::VMOVSDmr : X86::MOVSDmr;
     } else
      Opc = X86::ST_Fp64m;
     break;
   case MVT::v4f32:
     if (Aligned) {
       if (IsNonTemporal)
-        Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
+        Opc = HasVLX ? X86::VMOVNTPSZ128mr :
+              HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr;
       else
-        Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
+        Opc = HasVLX ? X86::VMOVAPSZ128mr :
+              HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr;
     } else
-      Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
+      Opc = HasVLX ? X86::VMOVUPSZ128mr :
+            HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr;
     break;
   case MVT::v2f64:
     if (Aligned) {
       if (IsNonTemporal)
-        Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
+        Opc = HasVLX ? X86::VMOVNTPDZ128mr :
+              HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr;
      else
-        Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
+        Opc = HasVLX ? X86::VMOVAPDZ128mr :
+              HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr;
     } else
-      Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
+      Opc = HasVLX ? X86::VMOVUPDZ128mr :
+            HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr;
     break;
   case MVT::v4i32:
   case MVT::v2i64:
@@ -571,45 +598,57 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
   case MVT::v16i8:
     if (Aligned) {
       if (IsNonTemporal)
-        Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
+        Opc = HasVLX ? X86::VMOVNTDQZ128mr :
+              HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr;
       else
-        Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
+        Opc = HasVLX ? X86::VMOVDQA64Z128mr :
+              HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr;
     } else
-      Opc = HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
+      Opc = HasVLX ? X86::VMOVDQU64Z128mr :
+            HasAVX ? X86::VMOVDQUmr : X86::MOVDQUmr;
     break;
   case MVT::v8f32:
     assert(HasAVX);
-    if (Aligned)
-      Opc = IsNonTemporal ? X86::VMOVNTPSYmr : X86::VMOVAPSYmr;
-    else
-      Opc = X86::VMOVUPSYmr;
+    if (Aligned) {
+      if (IsNonTemporal)
+        Opc = HasVLX ? X86::VMOVNTPSZ256mr : X86::VMOVNTPSYmr;
+      else
+        Opc = HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr;
+    } else
+      Opc = HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr;
     break;
   case MVT::v4f64:
     assert(HasAVX);
     if (Aligned) {
-      Opc = IsNonTemporal ? X86::VMOVNTPDYmr : X86::VMOVAPDYmr;
+      if (IsNonTemporal)
+        Opc = HasVLX ? X86::VMOVNTPDZ256mr : X86::VMOVNTPDYmr;
+      else
+        Opc = HasVLX ? X86::VMOVAPDZ256mr : X86::VMOVAPDYmr;
     } else
-      Opc = X86::VMOVUPDYmr;
+      Opc = HasVLX ? X86::VMOVUPDZ256mr : X86::VMOVUPDYmr;
     break;
   case MVT::v8i32:
   case MVT::v4i64:
   case MVT::v16i16:
   case MVT::v32i8:
     assert(HasAVX);
-    if (Aligned)
-      Opc = IsNonTemporal ? X86::VMOVNTDQYmr : X86::VMOVDQAYmr;
-    else
-      Opc = X86::VMOVDQUYmr;
+    if (Aligned) {
+      if (IsNonTemporal)
+        Opc = HasVLX ? X86::VMOVNTDQZ256mr : X86::VMOVNTDQYmr;
+      else
+        Opc = HasVLX ? X86::VMOVDQA64Z256mr : X86::VMOVDQAYmr;
    } else
+      Opc = HasVLX ? X86::VMOVDQU64Z256mr : X86::VMOVDQUYmr;
     break;
   case MVT::v16f32:
-    assert(Subtarget->hasAVX512());
+    assert(HasAVX512);
     if (Aligned)
       Opc = IsNonTemporal ? X86::VMOVNTPSZmr : X86::VMOVAPSZmr;
     else
       Opc = X86::VMOVUPSZmr;
     break;
   case MVT::v8f64:
-    assert(Subtarget->hasAVX512());
+    assert(HasAVX512);
     if (Aligned) {
       Opc = IsNonTemporal ? X86::VMOVNTPDZmr : X86::VMOVAPDZmr;
     } else
@@ -619,7 +658,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
   case MVT::v16i32:
   case MVT::v32i16:
   case MVT::v64i8:
-    assert(Subtarget->hasAVX512());
+    assert(HasAVX512);
     // Note: There are a lot more choices based on type with AVX-512, but
     // there's really no advantage when the store isn't masked.
if (Aligned) diff --git a/test/CodeGen/X86/fast-isel-store.ll b/test/CodeGen/X86/fast-isel-store.ll index 729304a443b..d18e8f99a59 100644 --- a/test/CodeGen/X86/fast-isel-store.ll +++ b/test/CodeGen/X86/fast-isel-store.ll @@ -58,11 +58,11 @@ define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, ; SSE64-NEXT: movdqu %xmm0, (%eax) ; SSE64-NEXT: retl ; -; AVX32-LABEL: test_store_4xi32: -; AVX32: # BB#0: -; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX32-NEXT: vmovdqu %xmm0, (%rdi) -; AVX32-NEXT: retq +; AVXONLY32-LABEL: test_store_4xi32: +; AVXONLY32: # BB#0: +; AVXONLY32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXONLY32-NEXT: vmovdqu %xmm0, (%rdi) +; AVXONLY32-NEXT: retq ; ; AVX64-LABEL: test_store_4xi32: ; AVX64: # BB#0: @@ -70,6 +70,18 @@ define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, ; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX64-NEXT: vmovdqu %xmm0, (%eax) ; AVX64-NEXT: retl +; +; KNL32-LABEL: test_store_4xi32: +; KNL32: # BB#0: +; KNL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; KNL32-NEXT: vmovdqu %xmm0, (%rdi) +; KNL32-NEXT: retq +; +; SKX32-LABEL: test_store_4xi32: +; SKX32: # BB#0: +; SKX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; SKX32-NEXT: vmovdqu64 %xmm0, (%rdi) +; SKX32-NEXT: retq %foo = add <4 x i32> %value, %value2 ; to force integer type on store store <4 x i32> %foo, <4 x i32>* %addr, align 1 ret <4 x i32> %foo @@ -89,11 +101,11 @@ define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> ; SSE64-NEXT: movdqa %xmm0, (%eax) ; SSE64-NEXT: retl ; -; AVX32-LABEL: test_store_4xi32_aligned: -; AVX32: # BB#0: -; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX32-NEXT: vmovdqa %xmm0, (%rdi) -; AVX32-NEXT: retq +; AVXONLY32-LABEL: test_store_4xi32_aligned: +; AVXONLY32: # BB#0: +; AVXONLY32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXONLY32-NEXT: vmovdqa %xmm0, (%rdi) +; AVXONLY32-NEXT: retq ; ; AVX64-LABEL: test_store_4xi32_aligned: ; AVX64: # BB#0: @@ -101,6 +113,18 @@ define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> ; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX64-NEXT: vmovdqa %xmm0, (%eax) ; AVX64-NEXT: retl +; +; KNL32-LABEL: test_store_4xi32_aligned: +; KNL32: # BB#0: +; KNL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; KNL32-NEXT: vmovdqa %xmm0, (%rdi) +; KNL32-NEXT: retq +; +; SKX32-LABEL: test_store_4xi32_aligned: +; SKX32: # BB#0: +; SKX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; SKX32-NEXT: vmovdqa64 %xmm0, (%rdi) +; SKX32-NEXT: retq %foo = add <4 x i32> %value, %value2 ; to force integer type on store store <4 x i32> %foo, <4 x i32>* %addr, align 16 ret <4 x i32> %foo diff --git a/test/CodeGen/X86/fast-isel-vecload.ll b/test/CodeGen/X86/fast-isel-vecload.ll index 0476d83f6cd..f7051b8c8e9 100644 --- a/test/CodeGen/X86/fast-isel-vecload.ll +++ b/test/CodeGen/X86/fast-isel-vecload.ll @@ -13,10 +13,20 @@ define <16 x i8> @test_v16i8(<16 x i8>* %V) { ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i8: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v16i8: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v16i8: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v16i8: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <16 x i8>, <16 x i8>* %V, align 16 ret <16 x i8> %0 @@ -28,10 +38,20 @@ define <8 x i16> @test_v8i16(<8 x i16>* %V) { ; SSE-NEXT: movdqa (%rdi), 
%xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i16: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v8i16: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v8i16: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v8i16: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <8 x i16>, <8 x i16>* %V, align 16 ret <8 x i16> %0 @@ -43,10 +63,20 @@ define <4 x i32> @test_v4i32(<4 x i32>* %V) { ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v4i32: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v4i32: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v4i32: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v4i32: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <4 x i32>, <4 x i32>* %V, align 16 ret <4 x i32> %0 @@ -58,10 +88,20 @@ define <2 x i64> @test_v2i64(<2 x i64>* %V) { ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v2i64: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v2i64: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v2i64: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v2i64: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <2 x i64>, <2 x i64>* %V, align 16 ret <2 x i64> %0 @@ -73,10 +113,20 @@ define <16 x i8> @test_v16i8_unaligned(<16 x i8>* %V) { ; SSE-NEXT: movdqu (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i8_unaligned: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v16i8_unaligned: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v16i8_unaligned: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqu (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v16i8_unaligned: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqu64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <16 x i8>, <16 x i8>* %V, align 4 ret <16 x i8> %0 @@ -88,10 +138,20 @@ define <8 x i16> @test_v8i16_unaligned(<8 x i16>* %V) { ; SSE-NEXT: movdqu (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i16_unaligned: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v8i16_unaligned: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v8i16_unaligned: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqu (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v8i16_unaligned: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqu64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <8 x i16>, <8 x i16>* %V, align 4 ret <8 x i16> %0 @@ -103,10 +163,20 @@ define <4 x i32> @test_v4i32_unaligned(<4 x i32>* %V) { ; SSE-NEXT: movdqu (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v4i32_unaligned: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v4i32_unaligned: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v4i32_unaligned: 
+; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqu (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v4i32_unaligned: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqu64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <4 x i32>, <4 x i32>* %V, align 4 ret <4 x i32> %0 @@ -118,10 +188,20 @@ define <2 x i64> @test_v2i64_unaligned(<2 x i64>* %V) { ; SSE-NEXT: movdqu (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v2i64_unaligned: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v2i64_unaligned: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v2i64_unaligned: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqu (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v2i64_unaligned: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqu64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <2 x i64>, <2 x i64>* %V, align 4 ret <2 x i64> %0 @@ -193,10 +273,20 @@ define <16 x i8> @test_v16i8_abi_alignment(<16 x i8>* %V) { ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i8_abi_alignment: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v16i8_abi_alignment: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v16i8_abi_alignment: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v16i8_abi_alignment: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <16 x i8>, <16 x i8>* %V ret <16 x i8> %0 @@ -208,10 +298,20 @@ define <8 x i16> @test_v8i16_abi_alignment(<8 x i16>* %V) { ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i16_abi_alignment: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v8i16_abi_alignment: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v8i16_abi_alignment: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v8i16_abi_alignment: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <8 x i16>, <8 x i16>* %V ret <8 x i16> %0 @@ -223,10 +323,20 @@ define <4 x i32> @test_v4i32_abi_alignment(<4 x i32>* %V) { ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v4i32_abi_alignment: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v4i32_abi_alignment: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v4i32_abi_alignment: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v4i32_abi_alignment: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <4 x i32>, <4 x i32>* %V ret <4 x i32> %0 @@ -238,10 +348,20 @@ define <2 x i64> @test_v2i64_abi_alignment(<2 x i64>* %V) { ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v2i64_abi_alignment: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v2i64_abi_alignment: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v2i64_abi_alignment: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %xmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: 
test_v2i64_abi_alignment: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %xmm0 +; SKX-NEXT: retq entry: %0 = load <2 x i64>, <2 x i64>* %V ret <2 x i64> %0 @@ -284,10 +404,20 @@ define <32 x i8> @test_v32i8(<32 x i8>* %V) { ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v32i8: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v32i8: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %ymm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v32i8: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v32i8: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %ymm0 +; SKX-NEXT: retq entry: %0 = load <32 x i8>, <32 x i8>* %V, align 32 ret <32 x i8> %0 @@ -300,10 +430,20 @@ define <16 x i16> @test_v16i16(<16 x i16>* %V) { ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i16: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v16i16: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %ymm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v16i16: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v16i16: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %ymm0 +; SKX-NEXT: retq entry: %0 = load <16 x i16>, <16 x i16>* %V, align 32 ret <16 x i16> %0 @@ -316,10 +456,20 @@ define <8 x i32> @test_v8i32(<8 x i32>* %V) { ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i32: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqu (%rdi), %ymm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v8i32: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v8i32: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqu (%rdi), %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v8i32: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 +; SKX-NEXT: retq entry: %0 = load <8 x i32>, <8 x i32>* %V, align 16 ret <8 x i32> %0 @@ -332,10 +482,20 @@ define <4 x i64> @test_v4i64(<4 x i64>* %V) { ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v4i64: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v4i64: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqa (%rdi), %ymm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v4i64: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqa (%rdi), %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v4i64: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqa64 (%rdi), %ymm0 +; SKX-NEXT: retq entry: %0 = load <4 x i64>, <4 x i64>* %V, align 32 ret <4 x i64> %0 @@ -348,10 +508,20 @@ define <32 x i8> @test_v32i8_unaligned(<32 x i8>* %V) { ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v32i8_unaligned: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqu (%rdi), %ymm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v32i8_unaligned: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v32i8_unaligned: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqu (%rdi), %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v32i8_unaligned: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 +; SKX-NEXT: retq entry: %0 = load <32 x i8>, <32 x i8>* %V, align 4 ret <32 x i8> %0 @@ -364,10 +534,20 @@ define <16 x i16> @test_v16i16_unaligned(<16 x i16>* %V) { ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i16_unaligned: -; 
AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqu (%rdi), %ymm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v16i16_unaligned: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v16i16_unaligned: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqu (%rdi), %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v16i16_unaligned: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 +; SKX-NEXT: retq entry: %0 = load <16 x i16>, <16 x i16>* %V, align 4 ret <16 x i16> %0 @@ -380,10 +560,20 @@ define <8 x i32> @test_v8i32_unaligned(<8 x i32>* %V) { ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i32_unaligned: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqu (%rdi), %ymm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v8i32_unaligned: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v8i32_unaligned: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqu (%rdi), %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v8i32_unaligned: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 +; SKX-NEXT: retq entry: %0 = load <8 x i32>, <8 x i32>* %V, align 4 ret <8 x i32> %0 @@ -396,10 +586,20 @@ define <4 x i64> @test_v4i64_unaligned(<4 x i64>* %V) { ; SSE-NEXT: movups 16(%rdi), %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v4i64_unaligned: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqu (%rdi), %ymm0 -; AVX-NEXT: retq +; AVXONLY-LABEL: test_v4i64_unaligned: +; AVXONLY: # BB#0: # %entry +; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0 +; AVXONLY-NEXT: retq +; +; KNL-LABEL: test_v4i64_unaligned: +; KNL: # BB#0: # %entry +; KNL-NEXT: vmovdqu (%rdi), %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: test_v4i64_unaligned: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 +; SKX-NEXT: retq entry: %0 = load <4 x i64>, <4 x i64>* %V, align 4 ret <4 x i64> %0 -- 2.11.0
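
The change in both X86FastEmitLoad and X86FastEmitStore is the same mechanical pattern repeated per value type: each opcode choice gains an extra tier that prefers the EVEX form when the subtarget has AVX-512 (for scalars and 512-bit vectors) or AVX-512VL (for 128/256-bit vectors), before falling back to the VEX and then legacy SSE forms. The sketch below models that selection for the v4f32 load case. It is a minimal standalone illustration, not LLVM code: the Features struct and pickV4F32LoadOpc helper are invented for the example, and the opcodes are returned as strings rather than the real X86:: opcode enums used by the patch.

// Standalone model of the opcode-selection pattern introduced by this patch.
// Illustrative only; see X86FastISel::X86FastEmitLoad for the real logic.
#include <cstdio>

struct Features {
  bool HasSSE41; // Non-temporal vector loads (MOVNTDQA) require SSE4.1.
  bool HasAVX;   // VEX-encoded forms.
  bool HasVLX;   // AVX-512VL: EVEX forms for 128/256-bit vectors.
};

// Mirrors the MVT::v4f32 load case: prefer a non-temporal load when requested
// and legal, otherwise an aligned or unaligned move, always in the newest
// encoding the subtarget supports (EVEX > VEX > legacy SSE).
const char *pickV4F32LoadOpc(const Features &F, unsigned Alignment,
                             bool IsNonTemporal) {
  if (IsNonTemporal && Alignment >= 16 && F.HasSSE41)
    return F.HasVLX ? "VMOVNTDQAZ128rm"
           : F.HasAVX ? "VMOVNTDQArm" : "MOVNTDQArm";
  if (Alignment >= 16)
    return F.HasVLX ? "VMOVAPSZ128rm"
           : F.HasAVX ? "VMOVAPSrm" : "MOVAPSrm";
  return F.HasVLX ? "VMOVUPSZ128rm"
         : F.HasAVX ? "VMOVUPSrm" : "MOVUPSrm";
}

int main() {
  Features SKX{true, true, true};  // AVX-512VL target (e.g. Skylake-AVX512).
  Features SNB{true, true, false}; // AVX-only target.
  std::printf("%s\n", pickV4F32LoadOpc(SKX, 16, false)); // VMOVAPSZ128rm (EVEX)
  std::printf("%s\n", pickV4F32LoadOpc(SNB, 16, false)); // VMOVAPSrm (VEX)
  std::printf("%s\n", pickV4F32LoadOpc(SNB, 4, false));  // VMOVUPSrm (VEX)
  return 0;
}

The new KNL and SKX check prefixes in the tests exercise exactly this split: KNL has AVX-512F but not VLX, so 128/256-bit loads and stores keep their VEX forms (vmovdqa/vmovdqu), while SKX has VLX and switches to the EVEX vmovdqa64/vmovdqu64 forms (AVX-512 only provides element-size-tagged integer moves, and, as the in-code comment notes, without masking the 64-bit-element flavor is as good as any). As the commit message says, the register classes are still the legacy FR32/FR64/VR128/VR256 classes, so these EVEX encodings do not yet give fast-isel access to the extended xmm16-xmm31 registers.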