From c77f817f923b2fcbf043da76e1a3066f7729aef2 Mon Sep 17 00:00:00 2001
From: Jim Stichnoth
Date: Sun, 31 May 2015 23:34:44 -0700
Subject: [PATCH] Subzero: Fold the load instruction into the next cast
 instruction.

This is similar to the way a load instruction may be folded into the
next arithmetic instruction.

Usually the effect is to improve a sequence like:
  mov ax, WORD PTR [mem]
  movsx eax, ax
into this:
  movsx eax, WORD PTR [mem]
without actually improving register allocation, though other kinds of
casts may have different improvements.

Existing tests needed to be fixed when they "inadvertently" did a cast
to i32 return type and triggered the optimization when it wasn't
wanted.  These were fixed by inserting a "dummy" instruction between
the load and the cast.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/1152783006
---
 src/IceTargetLoweringX8632.cpp                     | 108 ++++++---
 tests_lit/llvm2ice_tests/8bit.pnacl.ll             |   6 +-
 tests_lit/llvm2ice_tests/load_cast.ll              | 266 +++++++++++++++++++++
 tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll |   6 +-
 4 files changed, 344 insertions(+), 42 deletions(-)
 create mode 100644 tests_lit/llvm2ice_tests/load_cast.ll

diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 7edb2c36f..a1ba9d600 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -883,7 +883,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
     // that stack slot.
     if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
       assert(Var->getWeight().isZero());
-      if (!SpillVar->getLinkedTo()->hasReg()) {
+      if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
         VariablesLinkedToSpillSlots.push_back(Var);
         continue;
       }
@@ -1160,8 +1160,9 @@ void TargetX8632::split64(Variable *Var) {
 }
 
 Operand *TargetX8632::loOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64);
-  if (Operand->getType() != IceType_i64)
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
     return Operand;
   if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
     split64(Var);
@@ -1180,8 +1181,9 @@
 }
 
 Operand *TargetX8632::hiOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64);
-  if (Operand->getType() != IceType_i64)
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
     return Operand;
   if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
     split64(Var);
@@ -2463,20 +2465,25 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
       //   a_lo.i32 = t_lo.i32
       //   t_hi.i32 = hi(s.f64)
       //   a_hi.i32 = t_hi.i32
-      SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(IceType_f64);
-      SpillVar->setLinkedTo(llvm::dyn_cast<Variable>(Src0RM));
-      Variable *Spill = SpillVar;
-      Spill->setWeight(RegWeight::Zero);
-      _movq(Spill, Src0RM);
+      Operand *SpillLo, *SpillHi;
+      if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
+        SpillVariable *SpillVar =
+            Func->makeVariable<SpillVariable>(IceType_f64);
+        SpillVar->setLinkedTo(Src0Var);
+        Variable *Spill = SpillVar;
+        Spill->setWeight(RegWeight::Zero);
+        _movq(Spill, Src0RM);
+        SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
+        SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
+      } else {
+        SpillLo = loOperand(Src0RM);
+        SpillHi = hiOperand(Src0RM);
+      }
       Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
       Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
       Variable *T_Lo = makeReg(IceType_i32);
       Variable *T_Hi = makeReg(IceType_i32);
-      VariableSplit *SpillLo =
-          VariableSplit::create(Func, Spill, VariableSplit::Low);
-      VariableSplit *SpillHi =
-          VariableSplit::create(Func, Spill, VariableSplit::High);
       _mov(T_Lo, SpillLo);
       _mov(DestLo, T_Lo);
@@ -2486,6 +2493,12 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
     case IceType_f64: {
       Src0 = legalize(Src0);
       assert(Src0->getType() == IceType_i64);
+      if (llvm::isa<OperandX8632Mem>(Src0)) {
+        Variable *T = Func->makeVariable(Dest->getType());
+        _movq(T, Src0);
+        _movq(Dest, T);
+        break;
+      }
       // a.f64 = bitcast b.i64 ==>
       //   t_lo.i32 = b_lo.i32
       //   FakeDef(s.f64)
@@ -3955,20 +3968,23 @@ void computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *&Base,
 
 } // anonymous namespace
 
-void TargetX8632::lowerLoad(const InstLoad *Inst) {
+void TargetX8632::lowerLoad(const InstLoad *Load) {
   // A Load instruction can be treated the same as an Assign
   // instruction, after the source operand is transformed into an
   // OperandX8632Mem operand.  Note that the address mode
   // optimization already creates an OperandX8632Mem operand, so it
   // doesn't need another level of transformation.
-  Type Ty = Inst->getDest()->getType();
-  Operand *Src0 = FormMemoryOperand(Inst->getSourceAddress(), Ty);
+  Type Ty = Load->getDest()->getType();
+  Operand *Src0 = FormMemoryOperand(Load->getSourceAddress(), Ty);
 
   // Fuse this load with a subsequent Arithmetic instruction in the
   // following situations:
   //   a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
   //   a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
   //
+  // Fuse this load with a subsequent Cast instruction:
+  //   a=[mem]; b=cast(a) ==> b=cast([mem]) if last use of a
+  //
   // TODO: Clean up and test thoroughly.
   // (E.g., if there is an mfence-all make sure the load ends up on the
   // same side of the fence).
@@ -3979,30 +3995,46 @@ void TargetX8632::lowerLoad(const InstLoad *Inst) {
   // load instruction's dest variable, and that instruction ends that
   // variable's live range, then make the substitution.  Deal with
   // commutativity optimization in the arithmetic instruction lowering.
-  InstArithmetic *NewArith = nullptr;
-  if (InstArithmetic *Arith =
-          llvm::dyn_cast_or_null<InstArithmetic>(Context.getNextInst())) {
-    Variable *DestLoad = Inst->getDest();
-    Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
-    Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
-    if (Src1Arith == DestLoad && Arith->isLastUse(Src1Arith) &&
-        DestLoad != Src0Arith) {
-      NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
-                                        Arith->getSrc(0), Src0);
-    } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
-               Arith->isLastUse(Src0Arith) && DestLoad != Src1Arith) {
-      NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
-                                        Arith->getSrc(1), Src0);
-    }
-    if (NewArith) {
-      Arith->setDeleted();
-      Context.advanceNext();
-      lowerArithmetic(NewArith);
-      return;
+  //
+  // TODO(stichnot): Do load fusing as a separate pass.  Run it before
+  // the bool folding pass.  Modify Ice::Inst to allow src operands to
+  // be replaced, including updating Inst::LiveRangesEnded, to avoid
+  // having to manually mostly clone each instruction type.
+  Inst *NextInst = Context.getNextInst();
+  Variable *DestLoad = Load->getDest();
+  if (NextInst && NextInst->isLastUse(DestLoad)) {
+    if (auto *Arith = llvm::dyn_cast<InstArithmetic>(NextInst)) {
+      InstArithmetic *NewArith = nullptr;
+      Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
+      Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
+      if (Src1Arith == DestLoad && DestLoad != Src0Arith) {
+        NewArith = InstArithmetic::create(
+            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(0), Src0);
+      } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
+                 DestLoad != Src1Arith) {
+        NewArith = InstArithmetic::create(
+            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(1), Src0);
+      }
+      if (NewArith) {
+        Arith->setDeleted();
+        Context.advanceNext();
+        lowerArithmetic(NewArith);
+        return;
+      }
+    } else if (auto *Cast = llvm::dyn_cast<InstCast>(NextInst)) {
+      Variable *Src0Cast = llvm::dyn_cast<Variable>(Cast->getSrc(0));
+      if (Src0Cast == DestLoad) {
+        InstCast *NewCast =
+            InstCast::create(Func, Cast->getCastKind(), Cast->getDest(), Src0);
+        Cast->setDeleted();
+        Context.advanceNext();
+        lowerCast(NewCast);
+        return;
+      }
     }
   }
-  InstAssign *Assign = InstAssign::create(Func, Inst->getDest(), Src0);
+  InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
   lowerAssign(Assign);
 }
diff --git a/tests_lit/llvm2ice_tests/8bit.pnacl.ll b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
index 4987a1d5c..81db96d7f 100644
--- a/tests_lit/llvm2ice_tests/8bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
@@ -335,7 +335,8 @@ define i32 @load_i8(i32 %addr_arg) {
 entry:
   %addr = inttoptr i32 %addr_arg to i8*
   %ret = load i8* %addr, align 1
-  %ret_ext = zext i8 %ret to i32
+  %ret2 = sub i8 %ret, 0
+  %ret_ext = zext i8 %ret2 to i32
   ret i32 %ret_ext
 }
 ; CHECK-LABEL: load_i8
@@ -345,7 +346,8 @@ define i32 @load_i8_global(i32 %addr_arg) {
 entry:
   %addr = bitcast [1 x i8]* @global8 to i8*
   %ret = load i8* %addr, align 1
-  %ret_ext = zext i8 %ret to i32
+  %ret2 = sub i8 %ret, 0
+  %ret_ext = zext i8 %ret2 to i32
   ret i32 %ret_ext
 }
 ; CHECK-LABEL: load_i8_global
diff --git a/tests_lit/llvm2ice_tests/load_cast.ll b/tests_lit/llvm2ice_tests/load_cast.ll
new file mode 100644
index 000000000..5395d04ff
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/load_cast.ll
@@ -0,0 +1,266 @@
+; Tests desired and undesired folding of load instructions into cast
+; instructions.  The folding is only done when liveness analysis is performed,
+; so only O2 is tested.
+
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+
+; Not testing trunc, or 32-bit bitcast, because the lowered code uses pretty
+; much the same mov instructions regardless of whether folding is done.
+
+define internal i32 @zext_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %result = zext i8 %load to i32
+  ret i32 %result
+}
+; CHECK-LABEL: zext_fold
+; CHECK: movzx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal i32 @zext_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %tmp1 = zext i8 %load to i32
+  %tmp2 = zext i8 %load to i32
+  %result = add i32 %tmp1, %tmp2
+  ret i32 %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: zext_nofold
+; CHECK-NOT: movzx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal i32 @sext_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %result = sext i8 %load to i32
+  ret i32 %result
+}
+; CHECK-LABEL: sext_fold
+; CHECK: movsx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal i32 @sext_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %tmp1 = sext i8 %load to i32
+  %tmp2 = sext i8 %load to i32
+  %result = add i32 %tmp1, %tmp2
+  ret i32 %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: sext_nofold
+; CHECK-NOT: movsx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal float @fptrunc_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = fptrunc double %load to float
+  ret float %result
+}
+; CHECK-LABEL: fptrunc_fold
+; CHECK: cvtsd2ss {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal float @fptrunc_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = fptrunc double %load to float
+  %tmp2 = fptrunc double %load to float
+  %result = fadd float %tmp1, %tmp2
+  ret float %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fptrunc_nofold
+; CHECK-NOT: cvtsd2ss {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal double @fpext_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to float*
+  %load = load float* %addr, align 4
+  %result = fpext float %load to double
+  ret double %result
+}
+; CHECK-LABEL: fpext_fold
+; CHECK: cvtss2sd {{.*}},DWORD PTR [{{.*}}+0xc8]
+
+define internal double @fpext_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to float*
+  %load = load float* %addr, align 4
+  %tmp1 = fpext float %load to double
+  %tmp2 = fpext float %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fpext_nofold
+; CHECK-NOT: cvtss2sd {{.*}},DWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptoui_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = fptoui double %load to i16
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; CHECK-LABEL: fptoui_fold
+; CHECK: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptoui_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = fptoui double %load to i16
+  %tmp2 = fptoui double %load to i16
+  %result = add i16 %tmp1, %tmp2
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fptoui_nofold
+; CHECK-NOT: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptosi_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = fptosi double %load to i16
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; CHECK-LABEL: fptosi_fold
+; CHECK: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptosi_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = fptosi double %load to i16
+  %tmp2 = fptosi double %load to i16
+  %result = add i16 %tmp1, %tmp2
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fptosi_nofold
+; CHECK-NOT: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal double @uitofp_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %result = uitofp i16 %load to double
+  ret double %result
+}
+; CHECK-LABEL: uitofp_fold
+; CHECK: movzx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @uitofp_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %tmp1 = uitofp i16 %load to double
+  %tmp2 = uitofp i16 %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: uitofp_nofold
+; CHECK-NOT: movzx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @sitofp_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %result = sitofp i16 %load to double
+  ret double %result
+}
+; CHECK-LABEL: sitofp_fold
+; CHECK: movsx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @sitofp_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %tmp1 = sitofp i16 %load to double
+  %tmp2 = sitofp i16 %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: sitofp_nofold
+; CHECK-NOT: movsx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @bitcast_i64_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i64*
+  %load = load i64* %addr, align 1
+  %result = bitcast i64 %load to double
+  ret double %result
+}
+; CHECK-LABEL: bitcast_i64_fold
+; CHECK: movq {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal double @bitcast_i64_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i64*
+  %load = load i64* %addr, align 1
+  %tmp1 = bitcast i64 %load to double
+  %tmp2 = bitcast i64 %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: bitcast_i64_nofold
+; CHECK-NOT: movq {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i64 @bitcast_double_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = bitcast double %load to i64
+  ret i64 %result
+}
+; CHECK-LABEL: bitcast_double_fold
+; CHECK-NOT: QWORD PTR
+; CHECK: mov {{.*}},DWORD PTR [{{.*}}+0xc8]
+; CHECK: mov {{.*}},DWORD PTR [{{.*}}+0xcc]
+; CHECK-NOT: QWORD PTR
+
+define internal i64 @bitcast_double_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = bitcast double %load to i64
+  %tmp2 = bitcast double %load to i64
+  %result = add i64 %tmp1, %tmp2
+  ret i64 %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: bitcast_double_nofold
+; CHECK: QWORD PTR
+; CHECK: QWORD PTR
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
index a0b8b6c5e..1d24497d1 100644
--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -48,7 +48,8 @@ entry:
   %ptr = inttoptr i32 %iptr to i8*
   ; parameter value "6" is for the sequential consistency memory order.
   %i = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 6)
-  %r = zext i8 %i to i32
+  %i2 = sub i8 %i, 0
+  %r = zext i8 %i2 to i32
   ret i32 %r
 }
 ; CHECK-LABEL: test_atomic_load_8
@@ -59,7 +60,8 @@ define i32 @test_atomic_load_16(i32 %iptr) {
 entry:
   %ptr = inttoptr i32 %iptr to i16*
   %i = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 6)
-  %r = zext i16 %i to i32
+  %i2 = sub i16 %i, 0
+  %r = zext i16 %i2 to i32
   ret i32 %r
 }
 ; CHECK-LABEL: test_atomic_load_16
-- 
2.11.0