Subzero: Improve/refactor folding loads into the next instruction.

author Jim Stichnoth <stichnot@chromium.org>

Wed, 3 Jun 2015 22:58:12 +0000 (15:58 -0700)

committer Jim Stichnoth <stichnot@chromium.org>

Wed, 3 Jun 2015 22:58:12 +0000 (15:58 -0700)
author Jim Stichnoth <stichnot@chromium.org>
Wed, 3 Jun 2015 22:58:12 +0000 (15:58 -0700)
committer Jim Stichnoth <stichnot@chromium.org>
Wed, 3 Jun 2015 22:58:12 +0000 (15:58 -0700)
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp

index f69d3c1..87eee0f 100644 (file)
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -809,8 +809,8 @@ void emitRegisterUsage(Ostream &Str, const Cfg *Func, const CfgNode *Node,
      // familiar order.
      std::sort(LiveRegs.begin(), LiveRegs.end(),
                [](const Variable *V1, const Variable *V2) {
-      return V1->getRegNum() < V2->getRegNum();
-    });
+                return V1->getRegNum() < V2->getRegNum();
+              });
      bool First = true;
      for (Variable *Var : LiveRegs) {
        if (!First)
diff --git a/src/IceELFObjectWriter.cpp b/src/IceELFObjectWriter.cpp

index 9761dde..aab663c 100644 (file)
--- a/src/IceELFObjectWriter.cpp
+++ b/src/IceELFObjectWriter.cpp
@@ -383,8 +383,9 @@ void ELFObjectWriter::writeDataOfType(SectionType ST,
        for (VariableDeclaration::Initializer *Init : Var->getInitializers()) {
          switch (Init->getKind()) {
          case VariableDeclaration::Initializer::DataInitializerKind: {
-          const auto Data = llvm::cast<VariableDeclaration::DataInitializer>(
-                                Init)->getContents();
+          const auto Data =
+              llvm::cast<VariableDeclaration::DataInitializer>(Init)
+                  ->getContents();
            Section->appendData(Str, llvm::StringRef(Data.data(), Data.size()));
            break;
          }
diff --git a/src/IceInst.cpp b/src/IceInst.cpp

index 659d5e5..de082bb 100644 (file)
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -112,6 +112,44 @@ bool Inst::isLastUse(const Operand *TestSrc) const {
    return false;
  }
  
+// Given an instruction like:
+//   a = b + c + [x,y] + e
+// which was created from OrigInst:
+//   a = b + c + d + e
+// with SpliceAssn spliced in:
+//   d = [x,y]
+//
+// Reconstruct this instruction's LiveRangesEnded bitmask by combining
+// the LiveRangesEnded values of OrigInst and SpliceAssn.  The bits
+// below d's position are copied from OrigInst, SpliceAssn's bits are
+// inserted at d's position, and OrigInst's bits above d are shifted
+// up past the variables of the spliced-in operand.  In the example
+// above, OrigInst has variable e in bit position 3, whereas the
+// current instruction has e in bit position 4 because [x,y] consumes
+// 2 bitmask slots while d only consumed 1.
+//
+// Also set HasSideEffects if either OrigInst or SpliceAssn has it set.
+// Aborts with a fatal error if SpliceAssn's dest is not a src of OrigInst.
+void Inst::spliceLivenessInfo(Inst *OrigInst, Inst *SpliceAssn) {
+  HasSideEffects |= OrigInst->HasSideEffects;
+  HasSideEffects |= SpliceAssn->HasSideEffects;
+  // Find SpliceAssn's dest among OrigInst's srcs; Index is its bit position.
+  Variable *SpliceDest = SpliceAssn->getDest();
+  SizeT Index = 0;
+  for (SizeT I = 0; I < OrigInst->getSrcSize(); ++I) {
+    Operand *Src = OrigInst->getSrc(I);
+    if (Src == SpliceDest) {
+      LREndedBits LeftMask = OrigInst->LiveRangesEnded & ((1 << Index) - 1);
+      LREndedBits RightMask = OrigInst->LiveRangesEnded >> (Index + 1);
+      LiveRangesEnded = LeftMask | (SpliceAssn->LiveRangesEnded << Index) |
+                        (RightMask << (Index + getSrc(I)->getNumVars()));
+      return;
+    }
+    Index += getSrc(I)->getNumVars();
+  }
+  llvm::report_fatal_error("Failed to find splice operand");
+}
+
  void Inst::livenessLightweight(Cfg *Func, LivenessBV &Live) {
    assert(!isDeleted());
    resetLastUses();
diff --git a/src/IceInst.h b/src/IceInst.h

index b3a4a6d..948c5db 100644 (file)
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -102,6 +102,7 @@ public:
    }
  
    bool isLastUse(const Operand *Src) const;
+  void spliceLivenessInfo(Inst *OrigInst, Inst *SpliceAssn);
  
    // Returns a list of out-edges corresponding to a terminator
    // instruction, which is the last instruction of the block.
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp

index 326db0a..49c370a 100644 (file)
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -251,9 +251,9 @@ void TargetLowering::sortVarsByAlignment(VarList &Dest,
    // as the buckets, if performance is an issue.
    std::sort(Dest.begin(), Dest.end(),
              [this](const Variable *V1, const Variable *V2) {
-    return typeWidthInBytesOnStack(V1->getType()) >
-           typeWidthInBytesOnStack(V2->getType());
-  });
+              return typeWidthInBytesOnStack(V1->getType()) >
+                     typeWidthInBytesOnStack(V2->getType());
+            });
  }
  
  void TargetLowering::getVarStackSlotParams(
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp

index bdae394..45d6892 100644 (file)
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -482,6 +482,7 @@ void TargetX8632::translateO2() {
      return;
    Func->dump("After x86 address mode opt");
  
+  doLoadOpt();
    Func->genCode();
    if (Func->hasError())
      return;
@@ -572,6 +573,126 @@ void TargetX8632::translateOm1() {
    }
  }
  
+namespace {
+
+// Returns the constant value of Opnd if it is a ConstantInteger32;
+// otherwise returns Intrinsics::MemoryOrderInvalid.
+uint64_t getConstantMemoryOrder(Operand *Opnd) {
+  if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
+    return Integer->getValue();
+  return Intrinsics::MemoryOrderInvalid;
+}
+
+// Determines whether the dest of a Load instruction can be folded
+// into one of the src operands of a 2-operand instruction, which is
+// the case when LoadDest matches exactly one of Src0 and Src1.  On
+// success, the matching operand (Src0 or Src1, passed by reference)
+// is overwritten with LoadSrc; otherwise both are left unchanged.
+bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
+                               Operand *&Src0, Operand *&Src1) {
+  if (Src0 == LoadDest && Src1 != LoadDest) {
+    Src0 = LoadSrc;
+    return true;
+  }
+  if (Src0 != LoadDest && Src1 == LoadDest) {
+    Src1 = LoadSrc;
+    return true;
+  }
+  return false;
+}
+
+} // end of anonymous namespace
+// Folds each load (or equivalent) into the next instruction where possible.
+void TargetX8632::doLoadOpt() {
+  for (CfgNode *Node : Func->getNodes()) {
+    Context.init(Node);
+    while (!Context.atEnd()) {
+      Variable *LoadDest = nullptr;
+      Operand *LoadSrc = nullptr;
+      Inst *CurInst = Context.getCur();
+      Inst *Next = Context.getNextInst();
+      // Determine whether CurInst is a Load instruction or equivalent,
+      // and if so record its dest and its (unlegalized) memory operand.
+      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
+        // An InstLoad always qualifies.
+        LoadDest = Load->getDest();
+        const bool DoLegalize = false;
+        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
+                                    LoadDest->getType(), DoLegalize);
+      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
+        // An AtomicLoad intrinsic qualifies as long as it has a valid
+        // memory ordering, and can be implemented in a single
+        // instruction (i.e., not i64).
+        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
+        if (ID == Intrinsics::AtomicLoad &&
+            Intrin->getDest()->getType() != IceType_i64 &&
+            Intrinsics::isMemoryOrderValid(
+                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
+          LoadDest = Intrin->getDest();
+          const bool DoLegalize = false;
+          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
+                                      DoLegalize);
+        }
+      }
+      // A Load instruction can be folded into the following
+      // instruction only if the following instruction ends the Load's
+      // Dest variable's live range.
+      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
+        assert(LoadSrc);
+        Inst *NewInst = nullptr;
+        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
+          Operand *Src0 = Arith->getSrc(0);
+          Operand *Src1 = Arith->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstArithmetic::create(Func, Arith->getOp(),
+                                             Arith->getDest(), Src0, Src1);
+          }
+        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
+          Operand *Src0 = Icmp->getSrc(0);
+          Operand *Src1 = Icmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
+                                       Icmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
+          Operand *Src0 = Fcmp->getSrc(0);
+          Operand *Src1 = Fcmp->getSrc(1);
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
+                                       Fcmp->getDest(), Src0, Src1);
+          }
+        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
+          Operand *Src0 = Select->getTrueOperand();
+          Operand *Src1 = Select->getFalseOperand();
+          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+            NewInst = InstSelect::create(Func, Select->getDest(),
+                                         Select->getCondition(), Src0, Src1);
+          }
+        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
+          // Only src operand 0 of the Cast is examined; the load is
+          // folded whenever that operand is the load dest.
+          Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
+          if (Src0 == LoadDest) {
+            NewInst = InstCast::create(Func, Cast->getCastKind(),
+                                       Cast->getDest(), LoadSrc);
+          }
+        }
+        if (NewInst) {
+          CurInst->setDeleted();
+          Next->setDeleted();
+          Context.insert(NewInst);
+          // Merge the deleted pair's LiveRangesEnded into NewInst so
+          // target lowering may benefit; also merge HasSideEffects.
+          NewInst->spliceLivenessInfo(Next, CurInst);
+        }
+      }
+      Context.advanceCur();
+      Context.advanceNext();
+    }
+  }
+  Func->dump("After load optimization");
+}
+
  bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
    if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) {
      return Br->optimizeBranch(NextNode);
@@ -804,15 +925,15 @@ void TargetX8632::addProlog(CfgNode *Node) {
    // that stack slot.
    std::function<bool(Variable *)> TargetVarHook =
        [&VariablesLinkedToSpillSlots](Variable *Var) {
-    if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
-      assert(Var->getWeight().isZero());
-      if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
-        VariablesLinkedToSpillSlots.push_back(Var);
-        return true;
-      }
-    }
-    return false;
-  };
+        if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
+          assert(Var->getWeight().isZero());
+          if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
+            VariablesLinkedToSpillSlots.push_back(Var);
+            return true;
+          }
+        }
+        return false;
+      };
  
    // Compute the list of spilled variables and bounds for GlobalsSize, etc.
    getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
@@ -1170,6 +1291,10 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
    Variable *Dest = Inst->getDest();
    Operand *Src0 = legalize(Inst->getSrc(0));
    Operand *Src1 = legalize(Inst->getSrc(1));
+  if (Inst->isCommutative()) {
+    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
+      std::swap(Src0, Src1);
+  }
    if (Dest->getType() == IceType_i64) {
      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
@@ -2891,18 +3016,6 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
    }
  }
  
-namespace {
-
-// Converts a ConstantInteger32 operand into its constant value, or
-// MemoryOrderInvalid if the operand is not a ConstantInteger32.
-uint64_t getConstantMemoryOrder(Operand *Opnd) {
-  if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
-    return Integer->getValue();
-  return Intrinsics::MemoryOrderInvalid;
-}
-
-} // end of anonymous namespace
-
  void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
    switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
    case Intrinsics::AtomicCmpxchg: {
@@ -3006,10 +3119,11 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
        Func->setError("Unexpected memory ordering for AtomicRMW");
        return;
      }
-    lowerAtomicRMW(Instr->getDest(),
-                   static_cast<uint32_t>(llvm::cast<ConstantInteger32>(
-                                             Instr->getArg(0))->getValue()),
-                   Instr->getArg(1), Instr->getArg(2));
+    lowerAtomicRMW(
+        Instr->getDest(),
+        static_cast<uint32_t>(
+            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
+        Instr->getArg(1), Instr->getArg(2));
      return;
    case Intrinsics::AtomicStore: {
      if (!Intrinsics::isMemoryOrderValid(
@@ -3852,66 +3966,9 @@ void TargetX8632::lowerLoad(const InstLoad *Load) {
    // OperandX8632Mem operand.  Note that the address mode
    // optimization already creates an OperandX8632Mem operand, so it
    // doesn't need another level of transformation.
-  Type Ty = Load->getDest()->getType();
-  Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
-
-  // Fuse this load with a subsequent Arithmetic instruction in the
-  // following situations:
-  //   a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
-  //   a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
-  //
-  // Fuse this load with a subsequent Cast instruction:
-  //   a=[mem]; b=cast(a) ==> b=cast([mem]) if last use of a
-  //
-  // TODO: Clean up and test thoroughly.
-  // (E.g., if there is an mfence-all make sure the load ends up on the
-  // same side of the fence).
-  //
-  // TODO: Why limit to Arithmetic instructions?  This could probably be
-  // applied to most any instruction type.  Look at all source operands
-  // in the following instruction, and if there is one instance of the
-  // load instruction's dest variable, and that instruction ends that
-  // variable's live range, then make the substitution.  Deal with
-  // commutativity optimization in the arithmetic instruction lowering.
-  //
-  // TODO(stichnot): Do load fusing as a separate pass.  Run it before
-  // the bool folding pass.  Modify Ice::Inst to allow src operands to
-  // be replaced, including updating Inst::LiveRangesEnded, to avoid
-  // having to manually mostly clone each instruction type.
-  Inst *NextInst = Context.getNextInst();
    Variable *DestLoad = Load->getDest();
-  if (NextInst && NextInst->isLastUse(DestLoad)) {
-    if (auto *Arith = llvm::dyn_cast<InstArithmetic>(NextInst)) {
-      InstArithmetic *NewArith = nullptr;
-      Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
-      Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
-      if (Src1Arith == DestLoad && DestLoad != Src0Arith) {
-        NewArith = InstArithmetic::create(
-            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(0), Src0);
-      } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
-                 DestLoad != Src1Arith) {
-        NewArith = InstArithmetic::create(
-            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(1), Src0);
-      }
-      if (NewArith) {
-        Arith->setDeleted();
-        Context.advanceNext();
-        lowerArithmetic(NewArith);
-        return;
-      }
-    } else if (auto *Cast = llvm::dyn_cast<InstCast>(NextInst)) {
-      Variable *Src0Cast = llvm::dyn_cast<Variable>(Cast->getSrc(0));
-      if (Src0Cast == DestLoad) {
-        InstCast *NewCast =
-            InstCast::create(Func, Cast->getCastKind(), Cast->getDest(), Src0);
-        Cast->setDeleted();
-        Context.advanceNext();
-        lowerCast(NewCast);
-        return;
-      }
-    }
-  }
-
+  Type Ty = DestLoad->getType();
+  Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
    InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
    lowerAssign(Assign);
  }
@@ -4639,7 +4696,8 @@ Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
    return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
  }
  
-OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty) {
+OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty,
+                                                bool DoLegalize) {
    OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand);
    // It may be the case that address mode optimization already creates
    // an OperandX8632Mem, so in that case it wouldn't need another level
@@ -4656,7 +4714,7 @@ OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty) {
      }
      Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
    }
-  return llvm::cast<OperandX8632Mem>(legalize(Mem));
+  return llvm::cast<OperandX8632Mem>(DoLegalize ? legalize(Mem) : Mem);
  }
  
  Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) {
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h

index ff400a1..5773340 100644 (file)
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -101,6 +101,7 @@ public:
  
    void translateOm1() override;
    void translateO2() override;
+  void doLoadOpt();
    bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
  
    SizeT getNumRegisters() const override { return RegX8632::Reg_NUM; }
@@ -229,7 +230,8 @@ protected:
    // Turn a pointer operand into a memory operand that can be
    // used by a real load/store operation. Legalizes the operand as well.
    // This is a nop if the operand is already a legal memory operand.
-  OperandX8632Mem *formMemoryOperand(Operand *Ptr, Type Ty);
+  OperandX8632Mem *formMemoryOperand(Operand *Ptr, Type Ty,
+                                     bool DoLegalize = true);
  
    Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
    static Type stackSlotType();
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll b/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll

index 1b33546..01e2048 100644 (file)
--- a/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll
@@ -14,7 +14,7 @@ declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
  @g32_c = internal global [4 x i8] zeroinitializer, align 4
  @g32_d = internal global [4 x i8] zeroinitializer, align 4
  
-define i32 @test_fused_load_add_a() {
+define i32 @test_fused_load_sub_a() {
  entry:
    %p_alloca = alloca i8, i32 4, align 4
    %p_alloca_bc = bitcast i8* %p_alloca to i32*
@@ -22,39 +22,39 @@ entry:
  
    %p_a = bitcast [4 x i8]* @g32_a to i32*
    %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
-  %l_a2 = add i32 %l_a, 1
+  %l_a2 = sub i32 1, %l_a
    call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
  
    %p_b = bitcast [4 x i8]* @g32_b to i32*
    %l_b = load i32, i32* %p_b, align 1
-  %l_b2 = add i32 %l_b, 1
+  %l_b2 = sub i32 1, %l_b
    store i32 %l_b2, i32* %p_b, align 1
  
    %p_c = bitcast [4 x i8]* @g32_c to i32*
    %l_c = load i32, i32* %p_c, align 1
-  %l_c2 = add i32 %l_c, 1
+  %l_c2 = sub i32 1, %l_c
    call void @llvm.nacl.atomic.fence.all()
    store i32 %l_c2, i32* %p_c, align 1
  
    ret i32 %l_c2
  }
-; CHECK-LABEL: test_fused_load_add_a
+; CHECK-LABEL: test_fused_load_sub_a
  ;    alloca store
  ; CHECK: mov {{.*}},esp
  ; CHECK: mov DWORD PTR {{.*}},0x3e7
  ;    atomic store (w/ its own mfence)
-; The load + add are optimized into one everywhere.
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
+; The load + sub are optimized into one everywhere.
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
  ; CHECK: mov DWORD PTR
  ; CHECK: mfence
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b
  ; CHECK: mov DWORD PTR
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
  ; CHECK: mfence
  ; CHECK: mov DWORD PTR
  
  ; Test with the fence moved up a bit.
-define i32 @test_fused_load_add_b() {
+define i32 @test_fused_load_sub_b() {
  entry:
    %p_alloca = alloca i8, i32 4, align 4
    %p_alloca_bc = bitcast i8* %p_alloca to i32*
@@ -62,40 +62,40 @@ entry:
  
    %p_a = bitcast [4 x i8]* @g32_a to i32*
    %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
-  %l_a2 = add i32 %l_a, 1
+  %l_a2 = sub i32 1, %l_a
    call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
  
    %p_b = bitcast [4 x i8]* @g32_b to i32*
    %l_b = load i32, i32* %p_b, align 1
-  %l_b2 = add i32 %l_b, 1
+  %l_b2 = sub i32 1, %l_b
    store i32 %l_b2, i32* %p_b, align 1
  
    %p_c = bitcast [4 x i8]* @g32_c to i32*
    call void @llvm.nacl.atomic.fence.all()
    %l_c = load i32, i32* %p_c, align 1
-  %l_c2 = add i32 %l_c, 1
+  %l_c2 = sub i32 1, %l_c
    store i32 %l_c2, i32* %p_c, align 1
  
    ret i32 %l_c2
  }
-; CHECK-LABEL: test_fused_load_add_b
+; CHECK-LABEL: test_fused_load_sub_b
  ;    alloca store
  ; CHECK: mov {{.*}},esp
  ; CHECK: mov DWORD PTR {{.*}},0x3e7
  ;    atomic store (w/ its own mfence)
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
  ; CHECK: mov DWORD PTR
  ; CHECK: mfence
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b
  ; CHECK: mov DWORD PTR
  ; CHECK: mfence
-; Load + add can still be optimized into one instruction
+; Load + sub can still be optimized into one instruction
  ; because it is not separated by a fence.
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
  ; CHECK: mov DWORD PTR
  
-; Test with the fence splitting a load/add.
-define i32 @test_fused_load_add_c() {
+; Test with the fence splitting a load/sub.
+define i32 @test_fused_load_sub_c() {
  entry:
    %p_alloca = alloca i8, i32 4, align 4
    %p_alloca_bc = bitcast i8* %p_alloca to i32*
@@ -103,38 +103,39 @@ entry:
  
    %p_a = bitcast [4 x i8]* @g32_a to i32*
    %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
-  %l_a2 = add i32 %l_a, 1
+  %l_a2 = sub i32 1, %l_a
    call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
  
    %p_b = bitcast [4 x i8]* @g32_b to i32*
    %l_b = load i32, i32* %p_b, align 1
    call void @llvm.nacl.atomic.fence.all()
-  %l_b2 = add i32 %l_b, 1
+  %l_b2 = sub i32 1, %l_b
    store i32 %l_b2, i32* %p_b, align 1
  
    %p_c = bitcast [4 x i8]* @g32_c to i32*
    %l_c = load i32, i32* %p_c, align 1
-  %l_c2 = add i32 %l_c, 1
+  %l_c2 = sub i32 1, %l_c
    store i32 %l_c2, i32* %p_c, align 1
  
    ret i32 %l_c2
  }
-; CHECK-LABEL: test_fused_load_add_c
+; CHECK-LABEL: test_fused_load_sub_c
  ;    alloca store
  ; CHECK: mov {{.*}},esp
  ; CHECK: mov DWORD PTR {{.*}},0x3e7
  ;    atomic store (w/ its own mfence)
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
  ; CHECK: mov DWORD PTR
  ; CHECK: mfence
-; This load + add are no longer optimized into one,
+; This load + sub are no longer optimized into one,
  ; though perhaps it should be legal as long as
  ; the load stays on the same side of the fence.
  ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_b
  ; CHECK: mfence
-; CHECK: add {{.*}},0x1
+; CHECK: mov {{.*}},0x1
+; CHECK: sub
  ; CHECK: mov DWORD PTR
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
  ; CHECK: mov DWORD PTR
  
  
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll

index 1d24497..ee81a09 100644 (file)
--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -95,17 +95,17 @@ entry:
  next:
    %ptr = inttoptr i32 %iptr to i32*
    %r = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
-  %r2 = add i32 %r, 32
+  %r2 = sub i32 32, %r
    ret i32 %r2
  }
  ; CHECK-LABEL: test_atomic_load_32_with_arith
  ; CHECK: mov {{.*}},DWORD
  ; The next instruction may be a separate load or folded into an add.
  ;
-; In O2 mode, we know that the load and add are going to be fused.
+; In O2 mode, we know that the load and sub are going to be fused.
  ; O2-LABEL: test_atomic_load_32_with_arith
  ; O2: mov {{.*}},DWORD
-; O2: add {{.*}},DWORD
+; O2: sub {{.*}},DWORD
  
  define i32 @test_atomic_load_32_ignored(i32 %iptr) {
  entry:
author	Jim Stichnoth <stichnot@chromium.org>
	Wed, 3 Jun 2015 22:58:12 +0000 (15:58 -0700)
committer	Jim Stichnoth <stichnot@chromium.org>
	Wed, 3 Jun 2015 22:58:12 +0000 (15:58 -0700)
src/IceCfgNode.cpp		patch \| blob \| history
src/IceELFObjectWriter.cpp		patch \| blob \| history
src/IceInst.cpp		patch \| blob \| history
src/IceInst.h		patch \| blob \| history
src/IceTargetLowering.cpp		patch \| blob \| history
src/IceTargetLoweringX8632.cpp		patch \| blob \| history
src/IceTargetLoweringX8632.h		patch \| blob \| history
tests_lit/llvm2ice_tests/nacl-atomic-fence-all.ll		patch \| blob \| history
tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll		patch \| blob \| history