From ebbb5912415bf46798f064fea93863a95f32efd8 Mon Sep 17 00:00:00 2001
From: Jim Stichnoth <stichnot@chromium.org>
Date: Mon, 5 Oct 2015 15:12:09 -0700
Subject: [PATCH] Subzero: Improve lowering sequence for "a=b*b".

Originally, the lowering sequence looked like:
  T = b
  T *= b
  a = T
Now it looks like:
  T = b
  T *= T
  a = T

If "b" is assigned a register and its live range ends with this instruction, then the new lowering sequence allows b's register to be reused for "T".  This decreases register pressure and removes an instruction (a register-to-register move) from what could be a critical path.

This optimization actually applies to most arithmetic operations whose source operands are identical, but mul/fmul are the only ones that seem at all likely in practice.

BUG= none
R=kschimpf@google.com

Review URL: https://codereview.chromium.org/1377213004 .
---
 src/IceTargetLoweringX86BaseImpl.h | 13 +++---
 tests_lit/llvm2ice_tests/square.ll | 82 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 6 deletions(-)
 create mode 100644 tests_lit/llvm2ice_tests/square.ll
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 2ebbe170f..677bc98d4 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1492,7 +1492,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
       if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
         Variable *T = makeReg(Dest->getType());
         _movp(T, Src0);
-        _pmull(T, Src1);
+        _pmull(T, Src0 == Src1 ? T : Src1);
         _movp(Dest, T);
       } else if (Dest->getType() == IceType_v4i32) {
         // Lowering sequence:
@@ -1532,9 +1532,10 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
         _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
         _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
         _movp(Dest, T4);
-      } else {
-        assert(Dest->getType() == IceType_v16i8);
+      } else if (Dest->getType() == IceType_v16i8) {
         scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      } else {
+        llvm::report_fatal_error("Invalid vector multiply type");
       }
     } break;
     case InstArithmetic::Shl:
@@ -1561,7 +1562,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
     case InstArithmetic::Fmul: {
       Variable *T = makeReg(Dest->getType());
       _movp(T, Src0);
-      _mulps(T, Src1);
+      _mulps(T, Src0 == Src1 ? T : Src1);
       _movp(Dest, T);
     } break;
     case InstArithmetic::Fdiv: {
@@ -1620,7 +1621,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
     } else {
       _mov(T, Src0);
     }
-    _imul(T, Src1);
+    _imul(T, Src0 == Src1 ? T : Src1);
     _mov(Dest, T);
     break;
   case InstArithmetic::Shl:
@@ -1826,7 +1827,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
     break;
   case InstArithmetic::Fmul:
     _mov(T, Src0);
-    _mulss(T, Src1);
+    _mulss(T, Src0 == Src1 ? T : Src1);
     _mov(Dest, T);
     break;
   case InstArithmetic::Fdiv:
diff --git a/tests_lit/llvm2ice_tests/square.ll b/tests_lit/llvm2ice_tests/square.ll
new file mode 100644
index 000000000..18c5d5884
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/square.ll
@@ -0,0 +1,82 @@
+; Test the a=b*b lowering sequence which can use a single temporary register
+; instead of two registers.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 -mattr=sse4.1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 -mattr=sse4.1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+define float @Square_float(float %a) {
+entry:
+  %result = fmul float %a, %a
+  ret float %result
+}
+; CHECK-LABEL: Square_float
+; CHECK: mulss [[REG:xmm.]],[[REG]]
+
+define double @Square_double(double %a) {
+entry:
+  %result = fmul double %a, %a
+  ret double %result
+}
+; CHECK-LABEL: Square_double
+; CHECK: mulsd [[REG:xmm.]],[[REG]]
+
+define i32 @Square_i32(i32 %a) {
+entry:
+  %result = mul i32 %a, %a
+  ret i32 %result
+}
+; CHECK-LABEL: Square_i32
+; CHECK: imul [[REG:e..]],[[REG]]
+
+define i16 @Square_i16(i16 %a) {
+entry:
+  %result = mul i16 %a, %a
+  ret i16 %result
+}
+; CHECK-LABEL: Square_i16
+; CHECK: imul [[REG:..]],[[REG]]
+
+define i8 @Square_i8(i8 %a) {
+entry:
+  %result = mul i8 %a, %a
+  ret i8 %result
+}
+; CHECK-LABEL: Square_i8
+; CHECK: imul al
+
+define <4 x float> @Square_v4f32(<4 x float> %a) {
+entry:
+  %result = fmul <4 x float> %a, %a
+  ret <4 x float> %result
+}
+; CHECK-LABEL: Square_v4f32
+; CHECK: mulps [[REG:xmm.]],[[REG]]
+
+define <4 x i32> @Square_v4i32(<4 x i32> %a) {
+entry:
+  %result = mul <4 x i32> %a, %a
+  ret <4 x i32> %result
+}
+; CHECK-LABEL: Square_v4i32
+; CHECK: pmulld [[REG:xmm.]],[[REG]]
+
+define <8 x i16> @Square_v8i16(<8 x i16> %a) {
+entry:
+  %result = mul <8 x i16> %a, %a
+  ret <8 x i16> %result
+}
+; CHECK-LABEL: Square_v8i16
+; CHECK: pmullw [[REG:xmm.]],[[REG]]
+
+define <16 x i8> @Square_v16i8(<16 x i8> %a) {
+entry:
+  %result = mul <16 x i8> %a, %a
+  ret <16 x i8> %result
+}
+; CHECK-LABEL: Square_v16i8
+; CHECK-NOT: pmul
-- 
2.11.0