From 722e9e6d0a5b67d136be40bc015abc5b0b32f97b Mon Sep 17 00:00:00 2001
From: Richard Sandiford
Date: Thu, 27 Jun 2013 09:27:40 +0000
Subject: [PATCH] [SystemZ] Add conditional store patterns

Add pseudo conditional store instructions, so that we use:

    branch foo:
    store
foo:

instead of:

    load
    branch foo:
    move
foo:
    store

z196 has real 32-bit and 64-bit conditional stores, but we don't use
any z196 instructions yet.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185065 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/SystemZ/SystemZISelLowering.cpp |  90 +++++++
 lib/Target/SystemZ/SystemZISelLowering.h   |   3 +
 lib/Target/SystemZ/SystemZInstrFP.td       |   7 +-
 lib/Target/SystemZ/SystemZInstrFormats.td  |  13 +
 lib/Target/SystemZ/SystemZInstrInfo.td     |  20 ++
 lib/Target/SystemZ/SystemZOperators.td     |  24 +-
 lib/Target/SystemZ/SystemZPatterns.td      |   8 +-
 test/CodeGen/SystemZ/cond-store-01.ll      | 396 +++++++++++++++++++++++++++++
 test/CodeGen/SystemZ/cond-store-02.ll      | 396 +++++++++++++++++++++++++++++
 test/CodeGen/SystemZ/cond-store-03.ll      | 322 +++++++++++++++++
 test/CodeGen/SystemZ/cond-store-04.ll      | 214 ++++++++++++++++
 test/CodeGen/SystemZ/cond-store-05.ll      | 213 ++++++++++++++++
 test/CodeGen/SystemZ/cond-store-06.ll      | 213 ++++++++++++++++
 13 files changed, 1910 insertions(+), 9 deletions(-)
 create mode 100644 test/CodeGen/SystemZ/cond-store-01.ll
 create mode 100644 test/CodeGen/SystemZ/cond-store-02.ll
 create mode 100644 test/CodeGen/SystemZ/cond-store-03.ll
 create mode 100644 test/CodeGen/SystemZ/cond-store-04.ll
 create mode 100644 test/CodeGen/SystemZ/cond-store-05.ll
 create mode 100644 test/CodeGen/SystemZ/cond-store-06.ll

diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 0b0dbeaef14..955b88e1630 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1696,6 +1696,59 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
   return JoinMBB;
 }
 
+// Implement EmitInstrWithCustomInserter for pseudo CondStore* instruction MI.
+// StoreOpcode is the store to use and Invert says whether the store should
+// happen when the condition is false rather than true.
+MachineBasicBlock *
+SystemZTargetLowering::emitCondStore(MachineInstr *MI,
+                                     MachineBasicBlock *MBB,
+                                     unsigned StoreOpcode, bool Invert) const {
+  const SystemZInstrInfo *TII = TM.getInstrInfo();
+
+  MachineOperand Base = MI->getOperand(0);
+  int64_t Disp = MI->getOperand(1).getImm();
+  unsigned IndexReg = MI->getOperand(2).getReg();
+  unsigned SrcReg = MI->getOperand(3).getReg();
+  unsigned CCMask = MI->getOperand(4).getImm();
+  DebugLoc DL = MI->getDebugLoc();
+
+  StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp);
+
+  // Get the condition needed to branch around the store.
+  if (!Invert)
+    CCMask = CCMask ^ SystemZ::CCMASK_ANY;
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *JoinMBB = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+
+  // StartMBB:
+  //   BRC CCMask, JoinMBB
+  //   # fallthrough to FalseMBB
+  //
+  // The original DAG glues comparisons to their uses, both to ensure
+  // that no CC-clobbering instructions are inserted between them, and
+  // to ensure that comparison results are not reused. This means that
+  // this CondStore is the sole user of any preceding comparison instruction
+  // and that we can try to use a fused compare and branch instead.
+  MBB = StartMBB;
+  if (!convertPrevCompareToBranch(MBB, MI, CCMask, JoinMBB))
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC)).addImm(CCMask).addMBB(JoinMBB);
+  MBB->addSuccessor(JoinMBB);
+  MBB->addSuccessor(FalseMBB);
+
+  // FalseMBB:
+  //   store %SrcReg, %Disp(%Index,%Base)
+  //   # fallthrough to JoinMBB
+  MBB = FalseMBB;
+  BuildMI(MBB, DL, TII->get(StoreOpcode))
+    .addReg(SrcReg).addOperand(Base).addImm(Disp).addReg(IndexReg);
+  MBB->addSuccessor(JoinMBB);
+
+  MI->eraseFromParent();
+  return JoinMBB;
+}
+
 // Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
 // or ATOMIC_SWAP{,W} instruction MI. BinOpcode is the instruction that
 // performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
@@ -2100,6 +2153,43 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
   case SystemZ::SelectF128:
     return emitSelect(MI, MBB);
 
+  case SystemZ::CondStore8_32:
+    return emitCondStore(MI, MBB, SystemZ::STC32, false);
+  case SystemZ::CondStore8_32Inv:
+    return emitCondStore(MI, MBB, SystemZ::STC32, true);
+  case SystemZ::CondStore16_32:
+    return emitCondStore(MI, MBB, SystemZ::STH32, false);
+  case SystemZ::CondStore16_32Inv:
+    return emitCondStore(MI, MBB, SystemZ::STH32, true);
+  case SystemZ::CondStore32_32:
+    return emitCondStore(MI, MBB, SystemZ::ST32, false);
+  case SystemZ::CondStore32_32Inv:
+    return emitCondStore(MI, MBB, SystemZ::ST32, true);
+  case SystemZ::CondStore8:
+    return emitCondStore(MI, MBB, SystemZ::STC, false);
+  case SystemZ::CondStore8Inv:
+    return emitCondStore(MI, MBB, SystemZ::STC, true);
+  case SystemZ::CondStore16:
+    return emitCondStore(MI, MBB, SystemZ::STH, false);
+  case SystemZ::CondStore16Inv:
+    return emitCondStore(MI, MBB, SystemZ::STH, true);
+  case SystemZ::CondStore32:
+    return emitCondStore(MI, MBB, SystemZ::ST, false);
+  case SystemZ::CondStore32Inv:
+    return emitCondStore(MI, MBB, SystemZ::ST, true);
+  case SystemZ::CondStore64:
+    return emitCondStore(MI, MBB, SystemZ::STG, false);
+  case SystemZ::CondStore64Inv:
+    return emitCondStore(MI, MBB, SystemZ::STG, true);
+  case SystemZ::CondStoreF32:
+    return emitCondStore(MI, MBB, SystemZ::STE, false);
+  case SystemZ::CondStoreF32Inv:
+    return emitCondStore(MI, MBB, SystemZ::STE, true);
+  case SystemZ::CondStoreF64:
+    return emitCondStore(MI, MBB, SystemZ::STD, false);
+  case SystemZ::CondStoreF64Inv:
+    return emitCondStore(MI, MBB, SystemZ::STD, true);
+
   case SystemZ::AEXT128_64:
     return emitExt128(MI, MBB, false, SystemZ::subreg_low);
   case SystemZ::ZEXT128_32:
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index f48cc4f9654..f6c49f066a9 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -203,6 +203,9 @@ private:
   // Implement EmitInstrWithCustomInserter for individual operation types.
MachineBasicBlock *emitSelect(MachineInstr *MI, MachineBasicBlock *BB) const; + MachineBasicBlock *emitCondStore(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned StoreOpcode, bool Invert) const; MachineBasicBlock *emitExt128(MachineInstr *MI, MachineBasicBlock *MBB, bool ClearEven, unsigned SubReg) const; diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 86ef14c69b5..7499d2fb8d9 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Control-flow instructions +// Select instructions //===----------------------------------------------------------------------===// // C's ?: operator for floating-point operands. @@ -16,6 +16,11 @@ def SelectF32 : SelectWrapper; def SelectF64 : SelectWrapper; def SelectF128 : SelectWrapper; +defm CondStoreF32 : CondStores; +defm CondStoreF64 : CondStores; + //===----------------------------------------------------------------------===// // Move instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index ad050fd10cc..ac0300c95e0 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -956,6 +956,19 @@ class SelectWrapper let Uses = [CC]; } +// Stores $new to $addr if $cc is true ("" case) or false (Inv case). +multiclass CondStores { + let Defs = [CC], Uses = [CC], usesCustomInserter = 1 in { + def "" : Pseudo<(outs), (ins mode:$addr, cls:$new, i8imm:$cc), + [(store (z_select_ccmask cls:$new, (load mode:$addr), + imm:$cc), mode:$addr)]>; + def Inv : Pseudo<(outs), (ins mode:$addr, cls:$new, i8imm:$cc), + [(store (z_select_ccmask (load mode:$addr), cls:$new, + imm:$cc), mode:$addr)]>; + } +} + // OPERATOR is ATOMIC_SWAP or an ATOMIC_LOAD_* operation. PAT and OPERAND // describe the second (non-memory) operand. class AtomicLoadBinary; defm AsmJHE : IntCondExtendedMnemonic<10, "he", "nl">; defm AsmJLE : IntCondExtendedMnemonic<12, "le", "nh">; +//===----------------------------------------------------------------------===// +// Select instructions +//===----------------------------------------------------------------------===// + def Select32 : SelectWrapper; def Select64 : SelectWrapper; +defm CondStore8_32 : CondStores; +defm CondStore16_32 : CondStores; +defm CondStore32_32 : CondStores; + +defm CondStore8 : CondStores; +defm CondStore16 : CondStores; +defm CondStore32 : CondStores; +defm CondStore64 : CondStores; + //===----------------------------------------------------------------------===// // Call instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index ab01b2527a8..021824e23c5 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -120,6 +120,20 @@ def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>; def loadf32 : PatFrag<(ops node:$src), (f32 (load node:$src))>; def loadf64 : PatFrag<(ops node:$src), (f64 (load node:$src))>; +// Extending loads in which the extension type doesn't matter. 
+def anyextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{ + return cast(N)->getExtensionType() != ISD::NON_EXTLOAD; +}]>; +def anyextloadi8 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; +def anyextloadi16 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; +def anyextloadi32 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + // Aligned loads. class AlignedLoad : PatFrag<(ops node:$addr), (load node:$addr), [{ @@ -149,7 +163,10 @@ class NonvolatileLoad LoadSDNode *Load = cast(N); return !Load->isVolatile(); }]>; -def nonvolatile_load : NonvolatileLoad; +def nonvolatile_load : NonvolatileLoad; +def nonvolatile_anyextloadi8 : NonvolatileLoad; +def nonvolatile_anyextloadi16 : NonvolatileLoad; +def nonvolatile_anyextloadi32 : NonvolatileLoad; // Non-volatile stores. class NonvolatileStore @@ -157,7 +174,10 @@ class NonvolatileStore StoreSDNode *Store = cast(N); return !Store->isVolatile(); }]>; -def nonvolatile_store : NonvolatileStore; +def nonvolatile_store : NonvolatileStore; +def nonvolatile_truncstorei8 : NonvolatileStore; +def nonvolatile_truncstorei16 : NonvolatileStore; +def nonvolatile_truncstorei32 : NonvolatileStore; // Insertions. def inserti8 : PatFrag<(ops node:$src1, node:$src2), diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td index 3689f74bfd4..fb6c2219df9 100644 --- a/lib/Target/SystemZ/SystemZPatterns.td +++ b/lib/Target/SystemZ/SystemZPatterns.td @@ -50,12 +50,8 @@ class RMWI { - def : RMWI; - def : RMWI; - def : RMWI; - def : RMWI; - def : RMWI; - def : RMWI; + def : RMWI; + def : RMWI; } // Record that INSN performs insertion TYPE into a register of class CLS. diff --git a/test/CodeGen/SystemZ/cond-store-01.ll b/test/CodeGen/SystemZ/cond-store-01.ll new file mode 100644 index 00000000000..fadcae5e90a --- /dev/null +++ b/test/CodeGen/SystemZ/cond-store-01.ll @@ -0,0 +1,396 @@ +; Test 8-bit conditional stores that are presented as selects. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +declare void @foo(i8 *) + +; Test the simple case, with the loaded value first. +define void @f1(i8 *%ptr, i8 %alt, i32 %limit) { +; CHECK: f1: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f2(i8 *%ptr, i8 %alt, i32 %limit) { +; CHECK: f2: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %alt, i8 %orig + store i8 %res, i8 *%ptr + ret void +} + +; Test cases where the value is explicitly sign-extended to 32 bits, with the +; loaded value first. 
+define void @f3(i8 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f3: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %ext = sext i8 %orig to i32 + %res = select i1 %cond, i32 %ext, i32 %alt + %trunc = trunc i32 %res to i8 + store i8 %trunc, i8 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f4(i8 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f4: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %ext = sext i8 %orig to i32 + %res = select i1 %cond, i32 %alt, i32 %ext + %trunc = trunc i32 %res to i8 + store i8 %trunc, i8 *%ptr + ret void +} + +; Test cases where the value is explicitly zero-extended to 32 bits, with the +; loaded value first. +define void @f5(i8 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f5: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %ext = zext i8 %orig to i32 + %res = select i1 %cond, i32 %ext, i32 %alt + %trunc = trunc i32 %res to i8 + store i8 %trunc, i8 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f6(i8 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f6: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %ext = zext i8 %orig to i32 + %res = select i1 %cond, i32 %alt, i32 %ext + %trunc = trunc i32 %res to i8 + store i8 %trunc, i8 *%ptr + ret void +} + +; Test cases where the value is explicitly sign-extended to 64 bits, with the +; loaded value first. +define void @f7(i8 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f7: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %ext = sext i8 %orig to i64 + %res = select i1 %cond, i64 %ext, i64 %alt + %trunc = trunc i64 %res to i8 + store i8 %trunc, i8 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f8(i8 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f8: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %ext = sext i8 %orig to i64 + %res = select i1 %cond, i64 %alt, i64 %ext + %trunc = trunc i64 %res to i8 + store i8 %trunc, i8 *%ptr + ret void +} + +; Test cases where the value is explicitly zero-extended to 64 bits, with the +; loaded value first. 
+define void @f9(i8 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f9: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %ext = zext i8 %orig to i64 + %res = select i1 %cond, i64 %ext, i64 %alt + %trunc = trunc i64 %res to i8 + store i8 %trunc, i8 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f10(i8 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f10: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %ext = zext i8 %orig to i64 + %res = select i1 %cond, i64 %alt, i64 %ext + %trunc = trunc i64 %res to i8 + store i8 %trunc, i8 *%ptr + ret void +} + +; Check the high end of the STC range. +define void @f11(i8 *%base, i8 %alt, i32 %limit) { +; CHECK: f11: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stc %r3, 4095(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i8 *%base, i64 4095 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + ret void +} + +; Check the next byte up, which should use STCY instead of STC. +define void @f12(i8 *%base, i8 %alt, i32 %limit) { +; CHECK: f12: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stcy %r3, 4096(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i8 *%base, i64 4096 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + ret void +} + +; Check the high end of the STCY range. +define void @f13(i8 *%base, i8 %alt, i32 %limit) { +; CHECK: f13: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stcy %r3, 524287(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i8 *%base, i64 524287 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + ret void +} + +; Check the next byte up, which needs separate address logic. +; Other sequences besides this one would be OK. +define void @f14(i8 *%base, i8 %alt, i32 %limit) { +; CHECK: f14: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, 524288 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i8 *%base, i64 524288 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + ret void +} + +; Check the low end of the STCY range. +define void @f15(i8 *%base, i8 %alt, i32 %limit) { +; CHECK: f15: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stcy %r3, -524288(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i8 *%base, i64 -524288 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + ret void +} + +; Check the next byte down, which needs separate address logic. +; Other sequences besides this one would be OK. 
+define void @f16(i8 *%base, i8 %alt, i32 %limit) { +; CHECK: f16: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, -524289 +; CHECK: stc %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i8 *%base, i64 -524289 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + ret void +} + +; Check that STCY allows an index. +define void @f17(i64 %base, i64 %index, i8 %alt, i32 %limit) { +; CHECK: f17: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stcy %r4, 4096(%r3,%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %add1 = add i64 %base, %index + %add2 = add i64 %add1, 4096 + %ptr = inttoptr i64 %add2 to i8 * + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + ret void +} + +; Check that volatile loads are not matched. +define void @f18(i8 *%ptr, i8 %alt, i32 %limit) { +; CHECK: f18: +; CHECK: lb {{%r[0-5]}}, 0(%r2) +; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: [[LABEL]]: +; CHECK: stc {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load volatile i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + ret void +} + +; ...likewise stores. In this case we should have a conditional load into %r3. +define void @f19(i8 *%ptr, i8 %alt, i32 %limit) { +; CHECK: f19: +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK: lb %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: stc %r3, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store volatile i8 %res, i8 *%ptr + ret void +} + +; Check that atomic loads are not matched. The transformation is OK for +; the "unordered" case tested here, but since we don't try to handle atomic +; operations at all in this context, it seems better to assert that than +; to restrict the test to a stronger ordering. +define void @f20(i8 *%ptr, i8 %alt, i32 %limit) { +; FIXME: should use a normal load instead of CS. +; CHECK: f20: +; CHECK: cs {{%r[0-9]+}}, +; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: [[LABEL]]: +; CHECK: stc {{%r[0-9]+}}, +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load atomic i8 *%ptr unordered, align 1 + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + ret void +} + +; ...likewise stores. +define void @f21(i8 *%ptr, i8 %alt, i32 %limit) { +; FIXME: should use a normal store instead of CS. +; CHECK: f21: +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK: lb %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: cs {{%r[0-9]+}}, +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store atomic i8 %res, i8 *%ptr unordered, align 1 + ret void +} + +; Try a frame index base. 
+define void @f22(i8 %alt, i32 %limit) { +; CHECK: f22: +; CHECK: brasl %r14, foo@PLT +; CHECK-NOT: %r15 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r15 +; CHECK: stc {{%r[0-9]+}}, {{[0-9]+}}(%r15) +; CHECK: [[LABEL]]: +; CHECK: brasl %r14, foo@PLT +; CHECK: br %r14 + %ptr = alloca i8 + call void @foo(i8 *%ptr) + %cond = icmp ult i32 %limit, 42 + %orig = load i8 *%ptr + %res = select i1 %cond, i8 %orig, i8 %alt + store i8 %res, i8 *%ptr + call void @foo(i8 *%ptr) + ret void +} diff --git a/test/CodeGen/SystemZ/cond-store-02.ll b/test/CodeGen/SystemZ/cond-store-02.ll new file mode 100644 index 00000000000..51f3ffc0f75 --- /dev/null +++ b/test/CodeGen/SystemZ/cond-store-02.ll @@ -0,0 +1,396 @@ +; Test 16-bit conditional stores that are presented as selects. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +declare void @foo(i16 *) + +; Test the simple case, with the loaded value first. +define void @f1(i16 *%ptr, i16 %alt, i32 %limit) { +; CHECK: f1: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f2(i16 *%ptr, i16 %alt, i32 %limit) { +; CHECK: f2: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %alt, i16 %orig + store i16 %res, i16 *%ptr + ret void +} + +; Test cases where the value is explicitly sign-extended to 32 bits, with the +; loaded value first. +define void @f3(i16 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f3: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %ext = sext i16 %orig to i32 + %res = select i1 %cond, i32 %ext, i32 %alt + %trunc = trunc i32 %res to i16 + store i16 %trunc, i16 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f4(i16 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f4: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %ext = sext i16 %orig to i32 + %res = select i1 %cond, i32 %alt, i32 %ext + %trunc = trunc i32 %res to i16 + store i16 %trunc, i16 *%ptr + ret void +} + +; Test cases where the value is explicitly zero-extended to 32 bits, with the +; loaded value first. 
+define void @f5(i16 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f5: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %ext = zext i16 %orig to i32 + %res = select i1 %cond, i32 %ext, i32 %alt + %trunc = trunc i32 %res to i16 + store i16 %trunc, i16 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f6(i16 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f6: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %ext = zext i16 %orig to i32 + %res = select i1 %cond, i32 %alt, i32 %ext + %trunc = trunc i32 %res to i16 + store i16 %trunc, i16 *%ptr + ret void +} + +; Test cases where the value is explicitly sign-extended to 64 bits, with the +; loaded value first. +define void @f7(i16 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f7: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %ext = sext i16 %orig to i64 + %res = select i1 %cond, i64 %ext, i64 %alt + %trunc = trunc i64 %res to i16 + store i16 %trunc, i16 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f8(i16 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f8: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %ext = sext i16 %orig to i64 + %res = select i1 %cond, i64 %alt, i64 %ext + %trunc = trunc i64 %res to i16 + store i16 %trunc, i16 *%ptr + ret void +} + +; Test cases where the value is explicitly zero-extended to 64 bits, with the +; loaded value first. +define void @f9(i16 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f9: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %ext = zext i16 %orig to i64 + %res = select i1 %cond, i64 %ext, i64 %alt + %trunc = trunc i64 %res to i16 + store i16 %trunc, i16 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f10(i16 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f10: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %ext = zext i16 %orig to i64 + %res = select i1 %cond, i64 %alt, i64 %ext + %trunc = trunc i64 %res to i16 + store i16 %trunc, i16 *%ptr + ret void +} + +; Check the high end of the aligned STH range. +define void @f11(i16 *%base, i16 %alt, i32 %limit) { +; CHECK: f11: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sth %r3, 4094(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i16 *%base, i64 2047 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + ret void +} + +; Check the next halfword up, which should use STHY instead of STH. 
+define void @f12(i16 *%base, i16 %alt, i32 %limit) { +; CHECK: f12: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sthy %r3, 4096(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i16 *%base, i64 2048 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + ret void +} + +; Check the high end of the aligned STHY range. +define void @f13(i16 *%base, i16 %alt, i32 %limit) { +; CHECK: f13: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sthy %r3, 524286(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i16 *%base, i64 262143 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + ret void +} + +; Check the next halfword up, which needs separate address logic. +; Other sequences besides this one would be OK. +define void @f14(i16 *%base, i16 %alt, i32 %limit) { +; CHECK: f14: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, 524288 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i16 *%base, i64 262144 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + ret void +} + +; Check the low end of the STHY range. +define void @f15(i16 *%base, i16 %alt, i32 %limit) { +; CHECK: f15: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sthy %r3, -524288(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i16 *%base, i64 -262144 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + ret void +} + +; Check the next halfword down, which needs separate address logic. +; Other sequences besides this one would be OK. +define void @f16(i16 *%base, i16 %alt, i32 %limit) { +; CHECK: f16: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, -524290 +; CHECK: sth %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i16 *%base, i64 -262145 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + ret void +} + +; Check that STHY allows an index. +define void @f17(i64 %base, i64 %index, i16 %alt, i32 %limit) { +; CHECK: f17: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sthy %r4, 4096(%r3,%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %add1 = add i64 %base, %index + %add2 = add i64 %add1, 4096 + %ptr = inttoptr i64 %add2 to i16 * + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + ret void +} + +; Check that volatile loads are not matched. +define void @f18(i16 *%ptr, i16 %alt, i32 %limit) { +; CHECK: f18: +; CHECK: lh {{%r[0-5]}}, 0(%r2) +; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: [[LABEL]]: +; CHECK: sth {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load volatile i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + ret void +} + +; ...likewise stores. In this case we should have a conditional load into %r3. 
+define void @f19(i16 *%ptr, i16 %alt, i32 %limit) { +; CHECK: f19: +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK: lh %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: sth %r3, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store volatile i16 %res, i16 *%ptr + ret void +} + +; Check that atomic loads are not matched. The transformation is OK for +; the "unordered" case tested here, but since we don't try to handle atomic +; operations at all in this context, it seems better to assert that than +; to restrict the test to a stronger ordering. +define void @f20(i16 *%ptr, i16 %alt, i32 %limit) { +; FIXME: should use a normal load instead of CS. +; CHECK: f20: +; CHECK: cs {{%r[0-9]+}}, +; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: [[LABEL]]: +; CHECK: sth {{%r[0-9]+}}, +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load atomic i16 *%ptr unordered, align 2 + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + ret void +} + +; ...likewise stores. +define void @f21(i16 *%ptr, i16 %alt, i32 %limit) { +; FIXME: should use a normal store instead of CS. +; CHECK: f21: +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK: lh %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: cs {{%r[0-9]+}}, +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store atomic i16 %res, i16 *%ptr unordered, align 2 + ret void +} + +; Try a frame index base. +define void @f22(i16 %alt, i32 %limit) { +; CHECK: f22: +; CHECK: brasl %r14, foo@PLT +; CHECK-NOT: %r15 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r15 +; CHECK: sth {{%r[0-9]+}}, {{[0-9]+}}(%r15) +; CHECK: [[LABEL]]: +; CHECK: brasl %r14, foo@PLT +; CHECK: br %r14 + %ptr = alloca i16 + call void @foo(i16 *%ptr) + %cond = icmp ult i32 %limit, 42 + %orig = load i16 *%ptr + %res = select i1 %cond, i16 %orig, i16 %alt + store i16 %res, i16 *%ptr + call void @foo(i16 *%ptr) + ret void +} diff --git a/test/CodeGen/SystemZ/cond-store-03.ll b/test/CodeGen/SystemZ/cond-store-03.ll new file mode 100644 index 00000000000..6f19fbc3598 --- /dev/null +++ b/test/CodeGen/SystemZ/cond-store-03.ll @@ -0,0 +1,322 @@ +; Test 32-bit conditional stores that are presented as selects. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +declare void @foo(i32 *) + +; Test the simple case, with the loaded value first. +define void @f1(i32 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f1: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: st %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f2(i32 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f2: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: st %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %alt, i32 %orig + store i32 %res, i32 *%ptr + ret void +} + +; Test cases where the value is explicitly sign-extended to 64 bits, with the +; loaded value first. 
+define void @f3(i32 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f3: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: st %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %ext = sext i32 %orig to i64 + %res = select i1 %cond, i64 %ext, i64 %alt + %trunc = trunc i64 %res to i32 + store i32 %trunc, i32 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f4(i32 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f4: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: st %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %ext = sext i32 %orig to i64 + %res = select i1 %cond, i64 %alt, i64 %ext + %trunc = trunc i64 %res to i32 + store i32 %trunc, i32 *%ptr + ret void +} + +; Test cases where the value is explicitly zero-extended to 32 bits, with the +; loaded value first. +define void @f5(i32 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f5: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: st %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %ext = zext i32 %orig to i64 + %res = select i1 %cond, i64 %ext, i64 %alt + %trunc = trunc i64 %res to i32 + store i32 %trunc, i32 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f6(i32 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f6: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: st %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %ext = zext i32 %orig to i64 + %res = select i1 %cond, i64 %alt, i64 %ext + %trunc = trunc i64 %res to i32 + store i32 %trunc, i32 *%ptr + ret void +} + +; Check the high end of the aligned ST range. +define void @f7(i32 *%base, i32 %alt, i32 %limit) { +; CHECK: f7: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: st %r3, 4092(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i32 *%base, i64 1023 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + ret void +} + +; Check the next word up, which should use STY instead of ST. +define void @f8(i32 *%base, i32 %alt, i32 %limit) { +; CHECK: f8: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sty %r3, 4096(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i32 *%base, i64 1024 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + ret void +} + +; Check the high end of the aligned STY range. +define void @f9(i32 *%base, i32 %alt, i32 %limit) { +; CHECK: f9: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sty %r3, 524284(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i32 *%base, i64 131071 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + ret void +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. 
+define void @f10(i32 *%base, i32 %alt, i32 %limit) { +; CHECK: f10: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, 524288 +; CHECK: st %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i32 *%base, i64 131072 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + ret void +} + +; Check the low end of the STY range. +define void @f11(i32 *%base, i32 %alt, i32 %limit) { +; CHECK: f11: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sty %r3, -524288(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i32 *%base, i64 -131072 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + ret void +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define void @f12(i32 *%base, i32 %alt, i32 %limit) { +; CHECK: f12: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, -524292 +; CHECK: st %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i32 *%base, i64 -131073 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + ret void +} + +; Check that STY allows an index. +define void @f13(i64 %base, i64 %index, i32 %alt, i32 %limit) { +; CHECK: f13: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: sty %r4, 4096(%r3,%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %add1 = add i64 %base, %index + %add2 = add i64 %add1, 4096 + %ptr = inttoptr i64 %add2 to i32 * + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + ret void +} + +; Check that volatile loads are not matched. +define void @f14(i32 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f14: +; CHECK: l {{%r[0-5]}}, 0(%r2) +; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: [[LABEL]]: +; CHECK: st {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load volatile i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + ret void +} + +; ...likewise stores. In this case we should have a conditional load into %r3. +define void @f15(i32 *%ptr, i32 %alt, i32 %limit) { +; CHECK: f15: +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK: l %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: st %r3, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store volatile i32 %res, i32 *%ptr + ret void +} + +; Check that atomic loads are not matched. The transformation is OK for +; the "unordered" case tested here, but since we don't try to handle atomic +; operations at all in this context, it seems better to assert that than +; to restrict the test to a stronger ordering. +define void @f16(i32 *%ptr, i32 %alt, i32 %limit) { +; FIXME: should use a normal load instead of CS. +; CHECK: f16: +; CHECK: cs {{%r[0-5]}}, {{%r[0-5]}}, 0(%r2) +; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: [[LABEL]]: +; CHECK: st {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load atomic i32 *%ptr unordered, align 4 + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + ret void +} + +; ...likewise stores. 
+define void @f17(i32 *%ptr, i32 %alt, i32 %limit) { +; FIXME: should use a normal store instead of CS. +; CHECK: f17: +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK: l %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: cs {{%r[0-5]}}, %r3, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store atomic i32 %res, i32 *%ptr unordered, align 4 + ret void +} + +; Try a frame index base. +define void @f18(i32 %alt, i32 %limit) { +; CHECK: f18: +; CHECK: brasl %r14, foo@PLT +; CHECK-NOT: %r15 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r15 +; CHECK: st {{%r[0-9]+}}, {{[0-9]+}}(%r15) +; CHECK: [[LABEL]]: +; CHECK: brasl %r14, foo@PLT +; CHECK: br %r14 + %ptr = alloca i32 + call void @foo(i32 *%ptr) + %cond = icmp ult i32 %limit, 42 + %orig = load i32 *%ptr + %res = select i1 %cond, i32 %orig, i32 %alt + store i32 %res, i32 *%ptr + call void @foo(i32 *%ptr) + ret void +} diff --git a/test/CodeGen/SystemZ/cond-store-04.ll b/test/CodeGen/SystemZ/cond-store-04.ll new file mode 100644 index 00000000000..22f5fd42265 --- /dev/null +++ b/test/CodeGen/SystemZ/cond-store-04.ll @@ -0,0 +1,214 @@ +; Test 64-bit conditional stores that are presented as selects. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +declare void @foo(i64 *) + +; Test with the loaded value first. +define void @f1(i64 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f1: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stg %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i64 *%ptr + %res = select i1 %cond, i64 %orig, i64 %alt + store i64 %res, i64 *%ptr + ret void +} + +; ...and with the loaded value second +define void @f2(i64 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f2: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stg %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i64 *%ptr + %res = select i1 %cond, i64 %alt, i64 %orig + store i64 %res, i64 *%ptr + ret void +} + +; Check the high end of the aligned STG range. +define void @f3(i64 *%base, i64 %alt, i32 %limit) { +; CHECK: f3: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stg %r3, 524280(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i64 *%base, i64 65535 + %cond = icmp ult i32 %limit, 42 + %orig = load i64 *%ptr + %res = select i1 %cond, i64 %orig, i64 %alt + store i64 %res, i64 *%ptr + ret void +} + +; Check the next doubleword up, which needs separate address logic. +; Other sequences besides this one would be OK. +define void @f4(i64 *%base, i64 %alt, i32 %limit) { +; CHECK: f4: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, 524288 +; CHECK: stg %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i64 *%base, i64 65536 + %cond = icmp ult i32 %limit, 42 + %orig = load i64 *%ptr + %res = select i1 %cond, i64 %orig, i64 %alt + store i64 %res, i64 *%ptr + ret void +} + +; Check the low end of the STG range. 
+define void @f5(i64 *%base, i64 %alt, i32 %limit) { +; CHECK: f5: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stg %r3, -524288(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i64 *%base, i64 -65536 + %cond = icmp ult i32 %limit, 42 + %orig = load i64 *%ptr + %res = select i1 %cond, i64 %orig, i64 %alt + store i64 %res, i64 *%ptr + ret void +} + +; Check the next doubleword down, which needs separate address logic. +; Other sequences besides this one would be OK. +define void @f6(i64 *%base, i64 %alt, i32 %limit) { +; CHECK: f6: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, -524296 +; CHECK: stg %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr i64 *%base, i64 -65537 + %cond = icmp ult i32 %limit, 42 + %orig = load i64 *%ptr + %res = select i1 %cond, i64 %orig, i64 %alt + store i64 %res, i64 *%ptr + ret void +} + +; Check that STG allows an index. +define void @f7(i64 %base, i64 %index, i64 %alt, i32 %limit) { +; CHECK: f7: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stg %r4, 524287(%r3,%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %add1 = add i64 %base, %index + %add2 = add i64 %add1, 524287 + %ptr = inttoptr i64 %add2 to i64 * + %cond = icmp ult i32 %limit, 42 + %orig = load i64 *%ptr + %res = select i1 %cond, i64 %orig, i64 %alt + store i64 %res, i64 *%ptr + ret void +} + +; Check that volatile loads are not matched. +define void @f8(i64 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f8: +; CHECK: lg {{%r[0-5]}}, 0(%r2) +; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: [[LABEL]]: +; CHECK: stg {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load volatile i64 *%ptr + %res = select i1 %cond, i64 %orig, i64 %alt + store i64 %res, i64 *%ptr + ret void +} + +; ...likewise stores. In this case we should have a conditional load into %r3. +define void @f9(i64 *%ptr, i64 %alt, i32 %limit) { +; CHECK: f9: +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK: lg %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: stg %r3, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i64 *%ptr + %res = select i1 %cond, i64 %orig, i64 %alt + store volatile i64 %res, i64 *%ptr + ret void +} + +; Check that atomic loads are not matched. The transformation is OK for +; the "unordered" case tested here, but since we don't try to handle atomic +; operations at all in this context, it seems better to assert that than +; to restrict the test to a stronger ordering. +define void @f10(i64 *%ptr, i64 %alt, i32 %limit) { +; FIXME: should use a normal load instead of CSG. +; CHECK: f10: +; CHECK: csg {{%r[0-5]}}, {{%r[0-5]}}, 0(%r2) +; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: [[LABEL]]: +; CHECK: stg {{%r[0-5]}}, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load atomic i64 *%ptr unordered, align 8 + %res = select i1 %cond, i64 %orig, i64 %alt + store i64 %res, i64 *%ptr + ret void +} + +; ...likewise stores. +define void @f11(i64 *%ptr, i64 %alt, i32 %limit) { +; FIXME: should use a normal store instead of CSG. +; CHECK: f11: +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK: lg %r3, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: csg {{%r[0-5]}}, %r3, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load i64 *%ptr + %res = select i1 %cond, i64 %orig, i64 %alt + store atomic i64 %res, i64 *%ptr unordered, align 8 + ret void +} + +; Try a frame index base. 
+define void @f12(i64 %alt, i32 %limit) { +; CHECK: f12: +; CHECK: brasl %r14, foo@PLT +; CHECK-NOT: %r15 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r15 +; CHECK: stg {{%r[0-9]+}}, {{[0-9]+}}(%r15) +; CHECK: [[LABEL]]: +; CHECK: brasl %r14, foo@PLT +; CHECK: br %r14 + %ptr = alloca i64 + call void @foo(i64 *%ptr) + %cond = icmp ult i32 %limit, 42 + %orig = load i64 *%ptr + %res = select i1 %cond, i64 %orig, i64 %alt + store i64 %res, i64 *%ptr + call void @foo(i64 *%ptr) + ret void +} diff --git a/test/CodeGen/SystemZ/cond-store-05.ll b/test/CodeGen/SystemZ/cond-store-05.ll new file mode 100644 index 00000000000..5bcfed0cd4a --- /dev/null +++ b/test/CodeGen/SystemZ/cond-store-05.ll @@ -0,0 +1,213 @@ +; Test f32 conditional stores that are presented as selects. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +declare void @foo(float *) + +; Test with the loaded value first. +define void @f1(float *%ptr, float %alt, i32 %limit) { +; CHECK: f1: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: ste %f0, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %orig, float %alt + store float %res, float *%ptr + ret void +} + +; ...and with the loaded value second +define void @f2(float *%ptr, float %alt, i32 %limit) { +; CHECK: f2: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: ste %f0, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %alt, float %orig + store float %res, float *%ptr + ret void +} + +; Check the high end of the aligned STE range. +define void @f3(float *%base, float %alt, i32 %limit) { +; CHECK: f3: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: ste %f0, 4092(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr float *%base, i64 1023 + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %orig, float %alt + store float %res, float *%ptr + ret void +} + +; Check the next word up, which should use STEY instead of STE. +define void @f4(float *%base, float %alt, i32 %limit) { +; CHECK: f4: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stey %f0, 4096(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr float *%base, i64 1024 + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %orig, float %alt + store float %res, float *%ptr + ret void +} + +; Check the high end of the aligned STEY range. +define void @f5(float *%base, float %alt, i32 %limit) { +; CHECK: f5: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stey %f0, 524284(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr float *%base, i64 131071 + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %orig, float %alt + store float %res, float *%ptr + ret void +} + +; Check the next word up, which needs separate address logic. +; Other sequences besides this one would be OK. 
+define void @f6(float *%base, float %alt, i32 %limit) { +; CHECK: f6: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, 524288 +; CHECK: ste %f0, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr float *%base, i64 131072 + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %orig, float %alt + store float %res, float *%ptr + ret void +} + +; Check the low end of the STEY range. +define void @f7(float *%base, float %alt, i32 %limit) { +; CHECK: f7: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stey %f0, -524288(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr float *%base, i64 -131072 + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %orig, float %alt + store float %res, float *%ptr + ret void +} + +; Check the next word down, which needs separate address logic. +; Other sequences besides this one would be OK. +define void @f8(float *%base, float %alt, i32 %limit) { +; CHECK: f8: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, -524292 +; CHECK: ste %f0, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr float *%base, i64 -131073 + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %orig, float %alt + store float %res, float *%ptr + ret void +} + +; Check that STEY allows an index. +define void @f9(i64 %base, i64 %index, float %alt, i32 %limit) { +; CHECK: f9: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stey %f0, 4096(%r3,%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %add1 = add i64 %base, %index + %add2 = add i64 %add1, 4096 + %ptr = inttoptr i64 %add2 to float * + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %orig, float %alt + store float %res, float *%ptr + ret void +} + +; Check that volatile loads are not matched. +define void @f10(float *%ptr, float %alt, i32 %limit) { +; CHECK: f10: +; CHECK: le {{%f[0-5]}}, 0(%r2) +; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: [[LABEL]]: +; CHECK: ste {{%f[0-5]}}, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load volatile float *%ptr + %res = select i1 %cond, float %orig, float %alt + store float %res, float *%ptr + ret void +} + +; ...likewise stores. In this case we should have a conditional load into %f0. +define void @f11(float *%ptr, float %alt, i32 %limit) { +; CHECK: f11: +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK: le %f0, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: ste %f0, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %orig, float %alt + store volatile float %res, float *%ptr + ret void +} + +; Try a frame index base. 
+define void @f12(float %alt, i32 %limit) { +; CHECK: f12: +; CHECK: brasl %r14, foo@PLT +; CHECK-NOT: %r15 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r15 +; CHECK: ste {{%f[0-9]+}}, {{[0-9]+}}(%r15) +; CHECK: [[LABEL]]: +; CHECK: brasl %r14, foo@PLT +; CHECK: br %r14 + %ptr = alloca float + call void @foo(float *%ptr) + %cond = icmp ult i32 %limit, 42 + %orig = load float *%ptr + %res = select i1 %cond, float %orig, float %alt + store float %res, float *%ptr + call void @foo(float *%ptr) + ret void +} diff --git a/test/CodeGen/SystemZ/cond-store-06.ll b/test/CodeGen/SystemZ/cond-store-06.ll new file mode 100644 index 00000000000..203a0b04fb9 --- /dev/null +++ b/test/CodeGen/SystemZ/cond-store-06.ll @@ -0,0 +1,213 @@ +; Test f64 conditional stores that are presented as selects. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +declare void @foo(double *) + +; Test with the loaded value first. +define void @f1(double *%ptr, double %alt, i32 %limit) { +; CHECK: f1: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: std %f0, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %orig, double %alt + store double %res, double *%ptr + ret void +} + +; ...and with the loaded value second +define void @f2(double *%ptr, double %alt, i32 %limit) { +; CHECK: f2: +; CHECK-NOT: %r2 +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: std %f0, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %alt, double %orig + store double %res, double *%ptr + ret void +} + +; Check the high end of the aligned STD range. +define void @f3(double *%base, double %alt, i32 %limit) { +; CHECK: f3: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: std %f0, 4088(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr double *%base, i64 511 + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %orig, double %alt + store double %res, double *%ptr + ret void +} + +; Check the next doubleword up, which should use STDY instead of STD. +define void @f4(double *%base, double %alt, i32 %limit) { +; CHECK: f4: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stdy %f0, 4096(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr double *%base, i64 512 + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %orig, double %alt + store double %res, double *%ptr + ret void +} + +; Check the high end of the aligned STDY range. +define void @f5(double *%base, double %alt, i32 %limit) { +; CHECK: f5: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stdy %f0, 524280(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr double *%base, i64 65535 + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %orig, double %alt + store double %res, double *%ptr + ret void +} + +; Check the next doubleword up, which needs separate address logic. +; Other sequences besides this one would be OK. 
+define void @f6(double *%base, double %alt, i32 %limit) { +; CHECK: f6: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, 524288 +; CHECK: std %f0, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr double *%base, i64 65536 + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %orig, double %alt + store double %res, double *%ptr + ret void +} + +; Check the low end of the STDY range. +define void @f7(double *%base, double %alt, i32 %limit) { +; CHECK: f7: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stdy %f0, -524288(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr double *%base, i64 -65536 + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %orig, double %alt + store double %res, double *%ptr + ret void +} + +; Check the next doubleword down, which needs separate address logic. +; Other sequences besides this one would be OK. +define void @f8(double *%base, double %alt, i32 %limit) { +; CHECK: f8: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: agfi %r2, -524296 +; CHECK: std %f0, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %ptr = getelementptr double *%base, i64 -65537 + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %orig, double %alt + store double %res, double *%ptr + ret void +} + +; Check that STDY allows an index. +define void @f9(i64 %base, i64 %index, double %alt, i32 %limit) { +; CHECK: f9: +; CHECK-NOT: %r2 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r2 +; CHECK: stdy %f0, 524287(%r3,%r2) +; CHECK: [[LABEL]]: +; CHECK: br %r14 + %add1 = add i64 %base, %index + %add2 = add i64 %add1, 524287 + %ptr = inttoptr i64 %add2 to double * + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %orig, double %alt + store double %res, double *%ptr + ret void +} + +; Check that volatile loads are not matched. +define void @f10(double *%ptr, double %alt, i32 %limit) { +; CHECK: f10: +; CHECK: ld {{%f[0-5]}}, 0(%r2) +; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]] +; CHECK: [[LABEL]]: +; CHECK: std {{%f[0-5]}}, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load volatile double *%ptr + %res = select i1 %cond, double %orig, double %alt + store double %res, double *%ptr + ret void +} + +; ...likewise stores. In this case we should have a conditional load into %f0. +define void @f11(double *%ptr, double %alt, i32 %limit) { +; CHECK: f11: +; CHECK: jnl [[LABEL:[^ ]*]] +; CHECK: ld %f0, 0(%r2) +; CHECK: [[LABEL]]: +; CHECK: std %f0, 0(%r2) +; CHECK: br %r14 + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %orig, double %alt + store volatile double %res, double *%ptr + ret void +} + +; Try a frame index base. +define void @f12(double %alt, i32 %limit) { +; CHECK: f12: +; CHECK: brasl %r14, foo@PLT +; CHECK-NOT: %r15 +; CHECK: jl [[LABEL:[^ ]*]] +; CHECK-NOT: %r15 +; CHECK: std {{%f[0-9]+}}, {{[0-9]+}}(%r15) +; CHECK: [[LABEL]]: +; CHECK: brasl %r14, foo@PLT +; CHECK: br %r14 + %ptr = alloca double + call void @foo(double *%ptr) + %cond = icmp ult i32 %limit, 42 + %orig = load double *%ptr + %res = select i1 %cond, double %orig, double %alt + store double %res, double *%ptr + call void @foo(double *%ptr) + ret void +} -- 2.11.0