Subzero: Improve lowering sequence for "a=b*b".

author Jim Stichnoth <stichnot@chromium.org>

Mon, 5 Oct 2015 22:12:09 +0000 (15:12 -0700)

committer Jim Stichnoth <stichnot@chromium.org>

Mon, 5 Oct 2015 22:12:09 +0000 (15:12 -0700)
author Jim Stichnoth <stichnot@chromium.org>
Mon, 5 Oct 2015 22:12:09 +0000 (15:12 -0700)
committer Jim Stichnoth <stichnot@chromium.org>
Mon, 5 Oct 2015 22:12:09 +0000 (15:12 -0700)
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h

index 2ebbe17..677bc98 100644 (file)
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1492,7 +1492,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
        if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
          Variable *T = makeReg(Dest->getType());
          _movp(T, Src0);
-        _pmull(T, Src1);
+        _pmull(T, Src0 == Src1 ? T : Src1);
          _movp(Dest, T);
        } else if (Dest->getType() == IceType_v4i32) {
          // Lowering sequence:
@@ -1532,9 +1532,10 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
          _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
          _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
          _movp(Dest, T4);
-      } else {
-        assert(Dest->getType() == IceType_v16i8);
+      } else if (Dest->getType() == IceType_v16i8) {
          scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
+      } else {
+        llvm::report_fatal_error("Invalid vector multiply type");
        }
      } break;
      case InstArithmetic::Shl:
@@ -1561,7 +1562,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
      case InstArithmetic::Fmul: {
        Variable *T = makeReg(Dest->getType());
        _movp(T, Src0);
-      _mulps(T, Src1);
+      _mulps(T, Src0 == Src1 ? T : Src1);
        _movp(Dest, T);
      } break;
      case InstArithmetic::Fdiv: {
@@ -1620,7 +1621,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
      } else {
        _mov(T, Src0);
      }
-    _imul(T, Src1);
+    _imul(T, Src0 == Src1 ? T : Src1);
      _mov(Dest, T);
      break;
    case InstArithmetic::Shl:
@@ -1826,7 +1827,7 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
      break;
    case InstArithmetic::Fmul:
      _mov(T, Src0);
-    _mulss(T, Src1);
+    _mulss(T, Src0 == Src1 ? T : Src1);
      _mov(Dest, T);
      break;
    case InstArithmetic::Fdiv:
diff --git a/tests_lit/llvm2ice_tests/square.ll b/tests_lit/llvm2ice_tests/square.ll

new file mode 100644 (file)

index 0000000..18c5d58
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/square.ll
@@ -0,0 +1,82 @@
+; Test the a=b*b lowering sequence which can use a single temporary register
+; instead of two registers.
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 -mattr=sse4.1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 -mattr=sse4.1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+
+define float @Square_float(float %a) {
+entry:
+  %result = fmul float %a, %a
+  ret float %result
+}
+; CHECK-LABEL: Square_float
+; CHECK: mulss [[REG:xmm.]],[[REG]]
+
+define double @Square_double(double %a) {
+entry:
+  %result = fmul double %a, %a
+  ret double %result
+}
+; CHECK-LABEL: Square_double
+; CHECK: mulsd [[REG:xmm.]],[[REG]]
+
+define i32 @Square_i32(i32 %a) {
+entry:
+  %result = mul i32 %a, %a
+  ret i32 %result
+}
+; CHECK-LABEL: Square_i32
+; CHECK: imul [[REG:e..]],[[REG]]
+
+define i16 @Square_i16(i16 %a) {
+entry:
+  %result = mul i16 %a, %a
+  ret i16 %result
+}
+; CHECK-LABEL: Square_i16
+; CHECK: imul [[REG:..]],[[REG]]
+
+define i8 @Square_i8(i8 %a) {
+entry:
+  %result = mul i8 %a, %a
+  ret i8 %result
+}
+; CHECK-LABEL: Square_i8
+; CHECK: imul al
+
+define <4 x float> @Square_v4f32(<4 x float> %a) {
+entry:
+  %result = fmul <4 x float> %a, %a
+  ret <4 x float> %result
+}
+; CHECK-LABEL: Square_v4f32
+; CHECK: mulps [[REG:xmm.]],[[REG]]
+
+define <4 x i32> @Square_v4i32(<4 x i32> %a) {
+entry:
+  %result = mul <4 x i32> %a, %a
+  ret <4 x i32> %result
+}
+; CHECK-LABEL: Square_v4i32
+; CHECK: pmulld [[REG:xmm.]],[[REG]]
+
+define <8 x i16> @Square_v8i16(<8 x i16> %a) {
+entry:
+  %result = mul <8 x i16> %a, %a
+  ret <8 x i16> %result
+}
+; CHECK-LABEL: Square_v8i16
+; CHECK: pmullw [[REG:xmm.]],[[REG]]
+
+define <16 x i8> @Square_v16i8(<16 x i8> %a) {
+entry:
+  %result = mul <16 x i8> %a, %a
+  ret <16 x i8> %result
+}
+; CHECK-LABEL: Square_v16i8
+; CHECK-NOT: pmul
author	Jim Stichnoth <stichnot@chromium.org>
	Mon, 5 Oct 2015 22:12:09 +0000 (15:12 -0700)
committer	Jim Stichnoth <stichnot@chromium.org>
	Mon, 5 Oct 2015 22:12:09 +0000 (15:12 -0700)
src/IceTargetLoweringX86BaseImpl.h		patch | blob | history
tests_lit/llvm2ice_tests/square.ll	[new file with mode: 0644]	patch | blob