From 0c9c0d47aa95fda0c095cb60d8a5099ed6743f41 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 19 Feb 2017 17:19:38 +0000
Subject: [PATCH] [X86][SSE] Enable initial support for domain crossing at
 high shuffle combine depths.

As discussed on D27692, this permits another domain to be used to combine a
shuffle at high depths.

We currently set the required depth at 4 or more combined shuffles; this is
probably too high for most targets, but it is a good starting point and
already helps avoid a number of costly variable shuffles.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295608 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp       |  6 +++---
 test/CodeGen/X86/shuffle-vs-trunc-512.ll | 22 +++++++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 88651a2d227..4d660acbc20 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26971,9 +26971,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   unsigned Shuffle, PermuteImm;
 
   // Which shuffle domains are permitted?
-  // TODO - Allow either domain after a threshold depth.
-  bool AllowFloatDomain = FloatDomain;
-  bool AllowIntDomain = !FloatDomain;
+  // Permit domain crossing at higher combine depths.
+  bool AllowFloatDomain = FloatDomain || (Depth > 3);
+  bool AllowIntDomain = !FloatDomain || (Depth > 3);
 
   if (UnaryShuffle) {
     // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 8f9b8c156d3..aaf1ffe8adc 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -100,11 +100,11 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29]
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512F-NEXT:    retq
@@ -113,11 +113,9 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512VL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512VL-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
 ; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512VL-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512VL-NEXT:    retq
@@ -126,9 +124,11 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
 ; AVX512BW-NEXT:    retq
-- 
2.11.0
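Note (not part of the patch): the functional change is the pair of Allow*Domain flags in
combineX86ShuffleChain above. The standalone C++ sketch below only illustrates that
depth-gated domain selection; the helper name allowedDomains and the driver loop are
hypothetical, and only the Depth > 3 test mirrors the patched code.

// Illustrative sketch only (hypothetical names); the real logic lives in
// combineX86ShuffleChain in X86ISelLowering.cpp.
#include <cstdio>

// Below the threshold a shuffle may only be matched against instructions in
// its own domain; from a combine depth of 4 either domain is allowed, so an
// integer shuffle chain may, for example, be folded into a single VSHUFPS.
static void allowedDomains(bool FloatDomain, int Depth, bool &AllowFloat,
                           bool &AllowInt) {
  AllowFloat = FloatDomain || (Depth > 3);
  AllowInt = !FloatDomain || (Depth > 3);
}

int main() {
  for (int Depth = 1; Depth <= 5; ++Depth) {
    bool AllowFloat = false, AllowInt = false;
    allowedDomains(/*FloatDomain=*/false, Depth, AllowFloat, AllowInt);
    std::printf("depth %d: integer shuffle -> float-domain ops %s\n", Depth,
                AllowFloat ? "allowed" : "blocked");
  }
  return 0;
}

The threshold is conservative because moving data between the integer and floating-point
domains can incur a bypass delay on many x86 cores, so crossing is only attempted once it
stands to replace a chain of several (4+) combined shuffles.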