// trunc i32 -> i1: isolate bit 0 with AND32ri8, move the GPR into a mask
// register with KMOVWkr, then constrain the result to the VK1 class.
def : Pat<(i1 (trunc (i32 GR32:$src))),
(COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>;
+ // trunc i32 -> i1 when the i32 is known zero-extended from i1 (matched by
+ // the assertzext_i1 PatFrag): bit 0 is already the whole value, so a bare
+ // cross-class copy into VK1 suffices -- no AND/KMOV round trip needed.
+ def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
+ (COPY_TO_REGCLASS GR32:$src, VK1)>;
+
// trunc i8 -> i1: insert the GR8 value into a 32-bit register
// (SUBREG_TO_REG), isolate bit 0 with AND32ri8, then KMOVWkr into a mask
// register constrained to VK1.
def : Pat<(i1 (trunc (i8 GR8:$src))),
(COPY_TO_REGCLASS
(KMOVWkr (AND32ri8 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))),
VK1)>;
+
def : Pat<(i1 (trunc (i16 GR16:$src))),
(COPY_TO_REGCLASS
(KMOVWkr (AND32ri8 (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))),
// zext i1 -> i32: read the mask out through VK16 with KMOVWrk, then AND
// with 1 so only bit 0 survives in the GPR result.
def : Pat<(i32 (zext VK1:$src)),
(AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
+
// anyext i1 -> i32: bits above bit 0 are don't-care for anyext, so the new
// pattern replaces the KMOVWrk round trip with a direct cross-class copy
// into GR32.
def : Pat<(i32 (anyext VK1:$src)),
- (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>;
+ (COPY_TO_REGCLASS VK1:$src, GR32)>;
// zext i1 -> i8: KMOVWrk the mask through VK16 into a 32-bit GPR, AND with
// 1 to isolate bit 0, then take the low byte via EXTRACT_SUBREG.
def : Pat<(i8 (zext VK1:$src)),
(EXTRACT_SUBREG
(AND32ri8 (KMOVWrk
(COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
+
// anyext i1 -> i8: upper bits are don't-care, so copy the mask straight
// into a GR32 and extract the low byte -- no KMOVWrk needed.
def : Pat<(i8 (anyext VK1:$src)),
- (EXTRACT_SUBREG
- (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>;
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>;
// zext i1 -> i64: KMOVWrk into a 32-bit GPR, widen to 64 bits with
// SUBREG_TO_REG, then AND64ri8 with 1 to keep only bit 0.
def : Pat<(i64 (zext VK1:$src)),
(AND64ri8 (SUBREG_TO_REG (i64 0),
(KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
+
// anyext i1 -> i64: upper bits are don't-care, so the new pattern copies
// the mask directly into a GR32 (dropping KMOVWrk) before widening with
// SUBREG_TO_REG.
def : Pat<(i64 (anyext VK1:$src)),
(SUBREG_TO_REG (i64 0),
- (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit)>;
+ (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>;
// zext i1 -> i16: KMOVWrk through VK16 into a GR32, AND with 1 to isolate
// bit 0, then take the low 16 bits via EXTRACT_SUBREG.
def : Pat<(i16 (zext VK1:$src)),
(EXTRACT_SUBREG
(AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
sub_16bit)>;
+
// anyext i1 -> i16: upper bits are don't-care, so copy the mask directly
// into a GR32 and extract the low 16 bits -- no KMOVWrk needed.
def : Pat<(i16 (anyext VK1:$src)),
- (EXTRACT_SUBREG
- (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
- sub_16bit)>;
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>;
}
// scalar_to_vector of an i1 mask into v16i1: expressed as a plain
// register-class copy from VK1 to VK16 (no data movement instruction).
def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
(COPY_TO_REGCLASS VK1:$src, VK16)>;
(X86mtruncstore node:$src1, node:$src2, node:$src3), [{
return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
+
+// PatFrag matching an (assertzext x) node whose asserted VT is exactly
+// MVT::i1 -- i.e. the producer guarantees all bits above bit 0 are zero.
+// Used by the trunc-to-i1 patterns to elide the AND/KMOV sequence.
+def assertzext_i1 :
+ PatFrag<(ops node:$src), (assertzext node:$src), [{
+ return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
+}]>;
\ No newline at end of file
define i64 @func2(i1 zeroext %i, i32 %j) {
; CHECK-LABEL: func2:
; CHECK: # BB#0: # %entry
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: je .LBB1_1
; CHECK-NEXT: # BB#2: # %if.then
; CHECK-NEXT: jmp bar # TAILCALL
; CHECK-NEXT: .LBB1_1: # %return
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: orq $-2, %rax
+; CHECK-NEXT: orq $-2, %rdi
+; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: retq
entry:
%tobool = icmp eq i32 %j, 0
;
; AVX512-LABEL: select_cmov_i16:
; AVX512: ## BB#0:
-; AVX512-NEXT: andl $1, %edi
; AVX512-NEXT: kmovw %edi, %k0
; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: cmovew %dx, %si
;
; AVX512-LABEL: select_cmov_i32:
; AVX512: ## BB#0:
-; AVX512-NEXT: andl $1, %edi
; AVX512-NEXT: kmovw %edi, %k0
; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: cmovel %edx, %esi
;
; AVX512-LABEL: select_cmov_i64:
; AVX512: ## BB#0:
-; AVX512-NEXT: andl $1, %edi
; AVX512-NEXT: kmovw %edi, %k0
; AVX512-NEXT: kortestw %k0, %k0
; AVX512-NEXT: cmoveq %rdx, %rsi
-; RUN: llc -mattr=+avx512f < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Note that the kmovs should really *not* appear in the output, this is an
; artifact of the current poor lowering. This is tracked by PR28175.
-; CHECK-LABEL: @foo64
-; CHECK: kmov
-; CHECK: kmov
-; CHECK: orq $-2, %rax
-; CHECK: ret
-define i64 @foo64(i1 zeroext %i, i32 %j) #0 {
+define i64 @foo64(i1 zeroext %i) #0 {
+; CHECK-LABEL: foo64:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT: orq $-2, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
br label %bb
bb:
ret i64 %v
}
-; CHECK-LABEL: @foo16
-; CHECK: kmov
-; CHECK: kmov
-; CHECK: orl $65534, %eax
-; CHECK: retq
-define i16 @foo16(i1 zeroext %i, i32 %j) #0 {
+define i16 @foo16(i1 zeroext %i) #0 {
+; CHECK-LABEL: foo16:
+; CHECK: # BB#0:
+; CHECK-NEXT: orl $65534, %edi # imm = 0xFFFE
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
br label %bb
bb:
end:
ret i16 %v
}
+
+; This code is still not optimal
+; zext i1 -> i16 across a branch, with an extra unused i32 arg so the
+; assertzext-based shortcut does not fire here; the KNL/SKX bodies below
+; were produced by update_llc_test_checks.py -- regenerate, don't hand-edit.
+define i16 @foo16_1(i1 zeroext %i, i32 %j) #0 {
+; KNL-LABEL: foo16_1:
+; KNL: # BB#0:
+; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: orl $2, %eax
+; KNL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: foo16_1:
+; SKX: # BB#0:
+; SKX-NEXT: kmovd %edi, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: orl $2, %eax
+; SKX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq
+ br label %bb
+
+bb:
+ %z = zext i1 %i to i16
+ %v = or i16 %z, 2
+ br label %end
+
+end:
+ ret i16 %v
+}
+
+; zext i1 -> i32 then or -2: with the zeroext attribute on %i the checks
+; show a direct orl on %edi (no and/kmov round trip). Checks autogenerated
+; by update_llc_test_checks.py -- regenerate, don't hand-edit.
+define i32 @foo32(i1 zeroext %i) #0 {
+; CHECK-LABEL: foo32:
+; CHECK: # BB#0:
+; CHECK-NEXT: orl $-2, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ br label %bb
+
+bb:
+ %z = zext i1 %i to i32
+ %v = or i32 %z, -2
+ br label %end
+
+end:
+ ret i32 %v
+}
+
+; zext i1 -> i8 then or -2: with the zeroext attribute on %i the checks
+; show a direct orb on %dil (no and/kmov round trip). Checks autogenerated
+; by update_llc_test_checks.py -- regenerate, don't hand-edit.
+define i8 @foo8(i1 zeroext %i) #0 {
+; CHECK-LABEL: foo8:
+; CHECK: # BB#0:
+; CHECK-NEXT: orb $-2, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ br label %bb
+
+bb:
+ %z = zext i1 %i to i8
+ %v = or i8 %z, -2
+ br label %end
+
+end:
+ ret i8 %v
+}
+
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort=1 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
; RUN: llc -mtriple=x86_64-darwin-unknown -mcpu=knl < %s | FileCheck %s --check-prefix=KNL
; KNL-LABEL: bug27873:
; KNL: ## BB#0:
; KNL-NEXT: andl $1, %esi
+; KNL-NEXT: kmovw %esi, %k0
; KNL-NEXT: movl $160, %ecx
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: mulq %rcx
-; KNL-NEXT: kmovw %esi, %k0
; KNL-NEXT: seto %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: # kill
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; KNL-NEXT: retq
%mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
%mul.overflow = extractvalue { i64, i1 } %mul, 1