From 566eb704ab3e2684c28f1de3b951e5a6d142a410 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 15 Sep 2017 17:09:00 +0000
Subject: [PATCH] [X86] Add isel pattern infrastructure to begin recognizing
 when we're inserting 0s into the upper portions of a vector register and the
 producing instruction has already produced the zeros.

Currently, if we're inserting 0s into the upper elements of a vector register,
we insert an explicit move of the smaller register to implicitly zero the
upper bits. But if we can prove that they are already zero, we can skip that.
This is based on a similar idea to what we do to avoid emitting explicit zero
extends for GR32->GR64.

Unfortunately, this is harder for vector registers because there are several
opcodes that don't have VEX-equivalent instructions but can write to XMM
registers. Among these are the SHA instructions and an MMX->XMM move. Bitcasts
can also get in the way.

So for now I'm starting with explicitly allowing only VPMADDWD, because we
emit zeros in combineLoopMAddPattern and the explicit zeroing move was placing
an extra instruction into the reduction loop. I'd like to allow PSADBW as well
after D37453, but that's currently blocked by a bitcast. We either need to
peek through bitcasts or canonicalize insert_subvectors with zeros to remove
bitcasts on the value being inserted.

Longer term we should probably have a cleanup pass that removes superfluous
zeroing moves even when the producer is in another basic block, which is
something these isel tricks can't do. See PR32544.

Differential Revision: https://reviews.llvm.org/D37653

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@313365 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrVecCompiler.td | 59 +++++++++++++++++++++++++++++++++++
 test/CodeGen/X86/madd.ll              |  3 ---
 2 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
index 452b40dae12..ca5561acb4f 100644
--- a/lib/Target/X86/X86InstrVecCompiler.td
+++ b/lib/Target/X86/X86InstrVecCompiler.td
@@ -360,3 +360,62 @@ let Predicates = [HasAVX512, NoVLX] in {
   defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32,
                                  loadv4i64, sub_ymm>;
 }
+
+// List of opcodes guaranteed to zero the upper elements of vector regs.
+// TODO: Ideally this would be a blacklist instead of a whitelist. But SHA
+// intrinsics and some MMX->XMM move instructions that aren't VEX encoded make
+// this difficult. So starting with a couple opcodes used by reduction loops
+// where we explicitly insert zeros.
+class veczeroupper<ValueType vt, RegisterClass RC> :
+  PatLeaf<(vt RC:$src), [{
+    return N->getOpcode() == X86ISD::VPMADDWD;
+  }]>;
+
+def zeroupperv2f64 : veczeroupper<v2f64, VR128>;
+def zeroupperv4f32 : veczeroupper<v4f32, VR128>;
+def zeroupperv2i64 : veczeroupper<v2i64, VR128>;
+def zeroupperv4i32 : veczeroupper<v4i32, VR128>;
+def zeroupperv8i16 : veczeroupper<v8i16, VR128>;
+def zeroupperv16i8 : veczeroupper<v16i8, VR128>;
+
+def zeroupperv4f64 : veczeroupper<v4f64, VR256>;
+def zeroupperv8f32 : veczeroupper<v8f32, VR256>;
+def zeroupperv4i64 : veczeroupper<v4i64, VR256>;
+def zeroupperv8i32 : veczeroupper<v8i32, VR256>;
+def zeroupperv16i16 : veczeroupper<v16i16, VR256>;
+def zeroupperv32i8 : veczeroupper<v32i8, VR256>;
+
+
+// If we can guarantee the upper elements have already been zeroed we can elide
+// an explicit zeroing.
+multiclass subvector_zero_ellision<RegisterClass RC, ValueType DstTy,
+                                   ValueType ZeroTy, SubRegIndex SubIdx,
+                                   PatLeaf Zeroupper> {
+  def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
+                                     Zeroupper:$src, (iPTR 0))),
+            (SUBREG_TO_REG (i64 0), RC:$src, SubIdx)>;
+}
+
+// 128->256
+defm: subvector_zero_ellision<VR128, v4f64,  v8i32, sub_xmm, zeroupperv2f64>;
+defm: subvector_zero_ellision<VR128, v8f32,  v8i32, sub_xmm, zeroupperv4f32>;
+defm: subvector_zero_ellision<VR128, v4i64,  v8i32, sub_xmm, zeroupperv2i64>;
+defm: subvector_zero_ellision<VR128, v8i32,  v8i32, sub_xmm, zeroupperv4i32>;
+defm: subvector_zero_ellision<VR128, v16i16, v8i32, sub_xmm, zeroupperv8i16>;
+defm: subvector_zero_ellision<VR128, v32i8,  v8i32, sub_xmm, zeroupperv16i8>;
+
+// 128->512
+defm: subvector_zero_ellision<VR128, v8f64,  v16i32, sub_xmm, zeroupperv2f64>;
+defm: subvector_zero_ellision<VR128, v16f32, v16i32, sub_xmm, zeroupperv4f32>;
+defm: subvector_zero_ellision<VR128, v8i64,  v16i32, sub_xmm, zeroupperv2i64>;
+defm: subvector_zero_ellision<VR128, v16i32, v16i32, sub_xmm, zeroupperv4i32>;
+defm: subvector_zero_ellision<VR128, v32i16, v16i32, sub_xmm, zeroupperv8i16>;
+defm: subvector_zero_ellision<VR128, v64i8,  v16i32, sub_xmm, zeroupperv16i8>;
+
+// 256->512
+defm: subvector_zero_ellision<VR256, v8f64,  v16i32, sub_ymm, zeroupperv4f64>;
+defm: subvector_zero_ellision<VR256, v16f32, v16i32, sub_ymm, zeroupperv8f32>;
+defm: subvector_zero_ellision<VR256, v8i64,  v16i32, sub_ymm, zeroupperv4i64>;
+defm: subvector_zero_ellision<VR256, v16i32, v16i32, sub_ymm, zeroupperv8i32>;
+defm: subvector_zero_ellision<VR256, v32i16, v16i32, sub_ymm, zeroupperv16i16>;
+defm: subvector_zero_ellision<VR256, v64i8,  v16i32, sub_ymm, zeroupperv32i8>;
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index d378727630c..ae0ed8b3d61 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -40,7 +40,6 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX2-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm1
 ; AVX2-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa %xmm1, %xmm1
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    addq $8, %rcx
 ; AVX2-NEXT:    cmpq %rcx, %rax
@@ -65,7 +64,6 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly
 ; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX512-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm1
 ; AVX512-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, %xmm1
 ; AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    addq $8, %rcx
 ; AVX512-NEXT:    cmpq %rcx, %rax
@@ -314,7 +312,6 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
 ; AVX512-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm1
 ; AVX512-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm2
 ; AVX512-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa %ymm1, %ymm1
 ; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    addq $16, %rcx
 ; AVX512-NEXT:    cmpq %rcx, %rax
-- 
2.11.0
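
Note (illustrative, not part of the patch): the new patterns fire on a DAG of
the form insert_subvector(all-zeros vector, X86ISD::VPMADDWD result, 0). As a
rough sketch of the kind of input that produces that shape, the IR below (the
function and value names are made up for this note) computes a 128-bit
pmaddwd and widens it into a zeroed 256-bit value; with these patterns the
widening can be a plain SUBREG_TO_REG instead of the extra vmovdqa that the
old madd.ll checks show.

declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)

define <8 x i32> @pmaddwd_widen_with_zeros(<8 x i16> %a, <8 x i16> %b) {
  ; 128-bit multiply-add; the intrinsic lowers to the whitelisted
  ; X86ISD::VPMADDWD node.
  %madd = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
  ; Widen to 256 bits with zeros in lanes 4-7; this is the
  ; insert-into-zero-vector shape the subvector_zero_ellision patterns match.
  %wide = shufflevector <4 x i32> %madd, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %wide
}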