From 06eabdeb15ce46ab4a331d6d760ce919fe249a71 Mon Sep 17 00:00:00 2001 From: Scott Michel Date: Sat, 27 Dec 2008 04:51:36 +0000 Subject: [PATCH] - Remove Tilmann's custom truncate lowering: it completely hosed over DAGcombine's ability to find reasons to remove truncates when they were not needed. Consequently, the CellSPU backend would produce correct, but _really slow and horrible_, code. Replaced with instruction sequences that do the equivalent truncation in SPUInstrInfo.td. - Re-examine how unaligned loads and stores work. Generated unaligned load code has been tested on the CellSPU hardware; see the i32operations.c and i64operations.c in CodeGen/CellSPU/useful-harnesses. (While they may be toy test code, they do prove that some real-world code compiles correctly.) - Fix truncating stores in bug 3193 (note: unpack_df.ll will still make llc fault because i64 ult is not yet implemented.) - Add i64 eq and neq for setcc and select/setcc; start a new instruction information file for them in SPU64InstrInfo.td. Additional i64 operations should be added to this file and not to SPUInstrInfo.td. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@61447 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp | 8 +- lib/Target/CellSPU/SPU64InstrInfo.td | 77 ++ lib/Target/CellSPU/SPUISelDAGToDAG.cpp | 66 +- lib/Target/CellSPU/SPUISelLowering.cpp | 797 +++++++++++---------- lib/Target/CellSPU/SPUISelLowering.h | 6 +- lib/Target/CellSPU/SPUInstrFormats.td | 5 +- lib/Target/CellSPU/SPUInstrInfo.cpp | 55 +- lib/Target/CellSPU/SPUInstrInfo.td | 726 ++++++++++++++----- lib/Target/CellSPU/SPUNodes.td | 14 +- lib/Target/CellSPU/SPUOperands.td | 10 +- lib/Target/CellSPU/SPURegisterInfo.cpp | 5 - lib/Target/CellSPU/SPUTargetAsmInfo.cpp | 7 + test/CodeGen/CellSPU/call_indirect.ll | 2 +- test/CodeGen/CellSPU/icmp64.ll | 144 ++++ test/CodeGen/CellSPU/stores.ll | 60 ++ test/CodeGen/CellSPU/struct_1.ll | 2 +- test/CodeGen/CellSPU/trunc.ll | 41 +- .../CellSPU/useful-harnesses/i32operations.c | 69 ++ .../CellSPU/useful-harnesses/i64operations.c | 68 ++ 19 files changed, 1509 insertions(+), 653 deletions(-) create mode 100644 lib/Target/CellSPU/SPU64InstrInfo.td create mode 100644 test/CodeGen/CellSPU/icmp64.ll create mode 100644 test/CodeGen/CellSPU/useful-harnesses/i32operations.c create mode 100644 test/CodeGen/CellSPU/useful-harnesses/i64operations.c
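The useful-harnesses tests referenced above reduce to checks of the following shape (a minimal sketch only; function names and test values are illustrative, not the actual contents of i64operations.c): wrap the interesting i64 comparisons in noinline functions so llc has to emit the new setcc/select sequences, run the binary on SPU hardware, and compare the printed answers against a host build.

/* Minimal sketch in the spirit of i64operations.c; names and values are
 * illustrative, not the actual harness contents. */
#include <stdio.h>
#include <stdint.h>

/* noinline keeps the compiler from constant-folding the comparisons away */
__attribute__((noinline)) int i64_eq(int64_t a, int64_t b)  { return a == b; }
__attribute__((noinline)) int i64_neq(int64_t a, int64_t b) { return a != b; }

int main(void) {
  int64_t a = 0x0001020304050607LL;
  int64_t b = 0x0001020304050607LL;
  int64_t c = 0x1001020304050607LL;   /* differs from a in the high word */

  printf("i64_eq(a, b)  = %d (expect 1)\n", i64_eq(a, b));
  printf("i64_eq(a, c)  = %d (expect 0)\n", i64_eq(a, c));
  printf("i64_neq(a, b) = %d (expect 0)\n", i64_neq(a, b));
  printf("i64_neq(a, c) = %d (expect 1)\n", i64_neq(a, c));
  return 0;
}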
diff --git a/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp b/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp index 589a2600050..98aa084d504 100644 --- a/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp +++ b/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp @@ -117,7 +117,7 @@ namespace { } void - printMemRegImmS7(const MachineInstr *MI, unsigned OpNo) + printShufAddr(const MachineInstr *MI, unsigned OpNo) { char value = MI->getOperand(OpNo).getImm(); O << (int) value; @@ -183,16 +183,16 @@ namespace { } void - printMemRegImmS10(const MachineInstr *MI, unsigned OpNo) + printDFormAddr(const MachineInstr *MI, unsigned OpNo) { const MachineOperand &MO = MI->getOperand(OpNo); assert(MO.isImm() && - "printMemRegImmS10 first operand is not immedate"); + "printDFormAddr first operand is not immediate"); int64_t value = int64_t(MI->getOperand(OpNo).getImm()); int16_t value16 = int16_t(value); assert((value16 >= -(1 << (9+4)) && value16 <= (1 << (9+4)) - 1) && "Invalid dform s10 offset argument"); - O << value16 << "("; + O << (value16 & ~0xf) << "("; printOperand(MI, OpNo+1); O << ")"; } diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td new file mode 100644 index 00000000000..6d679bac724 --- /dev/null +++ b/lib/Target/CellSPU/SPU64InstrInfo.td @@ -0,0 +1,77 @@ +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // 64-bit comparisons: // // 1. The instruction sequences for vector versus scalar differ by a // constant. // // 2. There are no "immediate" forms, since loading a 64-bit constant // may require a constant pool load. // // 3. i64 setcc results are i32, which are subsequently converted to a FSM // mask when used in a select pattern. // // 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask // (TODO) // // M00$E Kan be Pretty N@sTi!!!!! (apologies to Monty!) //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // selb instruction definition for i64. Note that the selection mask is // a vector, produced by various forms of FSM: def SELBr64_cond: SELBInst<(outs R64C:$rT), (ins R64C:$rA, R64C:$rB, VECREG:$rC), [/* no pattern */]>; class CodeFrag<dag frag> { dag Fragment = frag; } class I64SELECTNegCond<PatFrag cond, CodeFrag cmpare>: Pat<(select (i32 (cond R64C:$rA, R64C:$rB)), R64C:$rTrue, R64C:$rFalse), (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 cmpare.Fragment))>; class I64SETCCNegCond<PatFrag cond, CodeFrag cmpare>: Pat<(cond R64C:$rA, R64C:$rB), (XORIr32 cmpare.Fragment, -1)>; // The i64 seteq fragment that does the scalar->vector conversion and // comparison: def CEQr64compare: CodeFrag<(CGTIv4i32 (GBv4i32 (CEQv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))), 0x0000000c)>; // The i64 seteq fragment that does the vector comparison def CEQv2i64compare: CodeFrag<(CGTIv4i32 (GBv4i32 (CEQv4i32 VECREG:$rA, VECREG:$rB)), 0x0000000f)>; // i64 seteq (equality): the setcc result is i32, which is converted to a // vector FSM mask when used in a select pattern. // // v2i64 seteq (equality): the setcc result is v4i32 multiclass CompareEqual64 { // Plain old comparison, converts back to i32 scalar def r64: CodeFrag<(ORi32_v4i32 CEQr64compare.Fragment)>; def v2i64: CodeFrag<(ORi32_v4i32 CEQv2i64compare.Fragment)>; // SELB mask from FSM: def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQr64compare.Fragment))>; def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQv2i64compare.Fragment))>; } defm I64EQ: CompareEqual64; def : Pat<(seteq R64C:$rA, R64C:$rB), I64EQr64.Fragment>; def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), I64EQv2i64.Fragment>; def I64Select: Pat<(select R32C:$rC, R64C:$rB, R64C:$rA), (SELBr64_cond R64C:$rA, R64C:$rB, (FSMr32 R32C:$rC))>; def : I64SETCCNegCond<setne, I64EQr64>; def : I64SELECTNegCond<setne, I64EQr64>; \ No newline at end of file
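For the scalar i64 seteq fragment above, the sequence is: promote both i64 operands into v2i64 (the doubleword lands in word slots 0 and 1), ceq compares the quadwords word-by-word, gb gathers the low bit of each word comparison into a 4-bit value (word 0 becoming the most significant bit), and cgti tests that value against 0x0c. A rough host-side model of that arithmetic follows; it is a sketch under the assumption that the two word slots outside the promoted doubleword hold the same bytes in both operands, so they compare equal exactly when the doublewords match:

/* Host-side model of (CGTI (GB (CEQ ...)), 0x0c) from CEQr64compare.
 * Assumption: the word slots outside the promoted doubleword mirror it,
 * so all four gather bits are set when the doublewords are equal.
 * Slot/bit convention: word 0 supplies gather bit 3 (the leftmost). */
#include <stdint.h>

static int gather_eq_bits(const uint32_t a[4], const uint32_t b[4]) {
  int bits = 0;
  for (int i = 0; i < 4; ++i)            /* ceq word-wise, then gb */
    bits |= (a[i] == b[i]) << (3 - i);
  return bits;                           /* 0..15 */
}

int i64_seteq_model(uint64_t x, uint64_t y) {
  uint32_t a[4] = { (uint32_t)(x >> 32), (uint32_t)x,
                    (uint32_t)(x >> 32), (uint32_t)x };
  uint32_t b[4] = { (uint32_t)(y >> 32), (uint32_t)y,
                    (uint32_t)(y >> 32), (uint32_t)y };
  /* cgti ..., 0x0c: under the mirroring assumption the only gather
     values are 0x0, 0x5, 0xa, 0xf, and only the all-equal 0xf > 0x0c */
  return gather_eq_bits(a, b) > 0x0c;
}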
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 9ac0e2e256c..f51aba2fda6 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -165,24 +165,23 @@ namespace { MVT VT; unsigned ldresult_ins; /// LDRESULT instruction (0 = undefined) bool ldresult_imm; /// LDRESULT instruction requires immediate? - int prefslot_byte; /// Byte offset of the "preferred" slot + unsigned lrinst; /// LR instruction }; const valtype_map_s valtype_map[] = { - { MVT::i1, 0, false, 3 }, - { MVT::i8, SPU::ORBIr8, true, 3 }, - { MVT::i16, SPU::ORHIr16, true, 2 }, - { MVT::i32, SPU::ORIr32, true, 0 }, - { MVT::i64, SPU::ORr64, false, 0 }, - { MVT::f32, SPU::ORf32, false, 0 }, - { MVT::f64, SPU::ORf64, false, 0 }, + { MVT::i8, SPU::ORBIr8, true, SPU::LRr8 }, + { MVT::i16, SPU::ORHIr16, true, SPU::LRr16 }, + { MVT::i32, SPU::ORIr32, true, SPU::LRr32 }, + { MVT::i64, SPU::ORr64, false, SPU::LRr64 }, + { MVT::f32, SPU::ORf32, false, SPU::LRf32 }, + { MVT::f64, SPU::ORf64, false, SPU::LRf64 }, // vector types... (sigh!) - { MVT::v16i8, 0, false, 0 }, - { MVT::v8i16, 0, false, 0 }, - { MVT::v4i32, 0, false, 0 }, - { MVT::v2i64, 0, false, 0 }, - { MVT::v4f32, 0, false, 0 }, - { MVT::v2f64, 0, false, 0 } + { MVT::v16i8, 0, false, SPU::LRv16i8 }, + { MVT::v8i16, 0, false, SPU::LRv8i16 }, + { MVT::v4i32, 0, false, SPU::LRv4i32 }, + { MVT::v2i64, 0, false, SPU::LRv2i64 }, + { MVT::v4f32, 0, false, SPU::LRv4f32 }, + { MVT::v2f64, 0, false, SPU::LRv2f64 } }; const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]); @@ -686,31 +685,32 @@ SPUDAGToDAGISel::Select(SDValue Op) { Result = CurDAG->getTargetNode(Opc, VT, MVT::Other, Arg, Arg, Chain); } - Chain = SDValue(Result, 1); - return Result; } else if (Opc == SPUISD::IndirectAddr) { - SDValue Op0 = Op.getOperand(0); - if (Op0.getOpcode() == SPUISD::LDRESULT) { - /* || Op0.getOpcode() == SPUISD::AFormAddr) */ - // (IndirectAddr (LDRESULT, imm)) - SDValue Op1 = Op.getOperand(1); - MVT VT = Op.getValueType(); - - DEBUG(cerr << "CellSPU: IndirectAddr(LDRESULT, imm):\nOp0 = "); - DEBUG(Op.getOperand(0).getNode()->dump(CurDAG)); - DEBUG(cerr << "\nOp1 = "); - DEBUG(Op.getOperand(1).getNode()->dump(CurDAG)); - DEBUG(cerr << "\n"); - + // Look at the operands: SelectCode() will catch the cases that aren't + // specifically handled here. + // + // SPUInstrInfo catches the following patterns: + // (SPUindirect (SPUhi ...), (SPUlo ...)) + // (SPUindirect $sp, imm) + MVT VT = Op.getValueType(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + RegisterSDNode *RN; + + if ((Op0.getOpcode() != SPUISD::Hi && Op1.getOpcode() != SPUISD::Lo) + || (Op0.getOpcode() == ISD::Register + && ((RN = dyn_cast<RegisterSDNode>(Op0.getNode())) != 0 + && RN->getReg() != SPU::R1))) { + NewOpc = SPU::Ar32; if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *CN = cast<ConstantSDNode>(Op1); - Op1 = CurDAG->getTargetConstant(CN->getZExtValue(), VT); + Op1 = CurDAG->getTargetConstant(CN->getSExtValue(), VT); NewOpc = (isI32IntS10Immediate(CN) ? SPU::AIr32 : SPU::Ar32); - Ops[0] = Op0; - Ops[1] = Op1; - n_ops = 2; } + Ops[0] = Op0; + Ops[1] = Op1; + n_ops = 2; } } diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index c3c31e0f470..e975d0d039c 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -15,6 +15,7 @@ #include "SPUISelLowering.h" #include "SPUTargetMachine.h" #include "SPUFrameInfo.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/VectorExtras.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -77,37 +78,6 @@ namespace { return retval; } - - //! Predicate that returns true if operand is a memory target - /*! - \arg Op Operand to test - \return true if the operand is a memory target (i.e., global - address, external symbol, constant pool) or an A-form - address.
- */ - bool isMemoryOperand(const SDValue &Op) - { - const unsigned Opc = Op.getOpcode(); - return (Opc == ISD::GlobalAddress - || Opc == ISD::GlobalTLSAddress - || Opc == ISD::JumpTable - || Opc == ISD::ConstantPool - || Opc == ISD::ExternalSymbol - || Opc == ISD::TargetGlobalAddress - || Opc == ISD::TargetGlobalTLSAddress - || Opc == ISD::TargetJumpTable - || Opc == ISD::TargetConstantPool - || Opc == ISD::TargetExternalSymbol - || Opc == SPUISD::AFormAddr); - } - - //! Predicate that returns true if the operand is an indirect target - bool isIndirectOperand(const SDValue &Op) - { - const unsigned Opc = Op.getOpcode(); - return (Opc == ISD::Register - || Opc == SPUISD::LDRESULT); - } } SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) @@ -135,20 +105,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); - setTruncStoreAction(MVT::i8, MVT::i8, Custom); - setTruncStoreAction(MVT::i16, MVT::i8, Custom); - setTruncStoreAction(MVT::i32, MVT::i8, Custom); - setTruncStoreAction(MVT::i64, MVT::i8, Custom); - setTruncStoreAction(MVT::i128, MVT::i8, Custom); - - setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); - - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); // SPU constant load actions are custom lowered: setOperationAction(ISD::Constant, MVT::i64, Custom); @@ -160,11 +118,33 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) ++sctype) { MVT VT = (MVT::SimpleValueType)sctype; - setOperationAction(ISD::LOAD, VT, Custom); - setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, Custom); + + for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) { + MVT StoreVT = (MVT::SimpleValueType) stype; + setTruncStoreAction(VT, StoreVT, Expand); + } + } + + for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64; + ++sctype) { + MVT VT = (MVT::SimpleValueType) sctype; + + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + + for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) { + MVT StoreVT = (MVT::SimpleValueType) stype; + setTruncStoreAction(VT, StoreVT, Expand); + } } - // Custom lower BRCOND for i8 to "promote" the result to i16 + // Custom lower BRCOND for i8 to "promote" the result to whatever the result + // operand happens to be: setOperationAction(ISD::BRCOND, MVT::Other, Custom); // Expand the jumptable branches @@ -176,14 +156,12 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SELECT_CC, MVT::i8, Custom); setOperationAction(ISD::SELECT_CC, MVT::i16, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); -#if 0 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); -#endif // SPU has no intrinsics for these particular operations: setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand); - // PowerPC has no SREM/UREM instructions + // SPU has no SREM/UREM 
instructions setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); @@ -232,14 +210,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::MUL, MVT::i32, Custom); setOperationAction(ISD::MUL, MVT::i64, Expand); // libcall - // SMUL_LOHI, UMUL_LOHI -#if 0 - setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); - setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); - setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); -#endif - // Need to custom handle (some) common i8, i64 math ops setOperationAction(ISD::ADD, MVT::i64, Custom); setOperationAction(ISD::SUB, MVT::i8, Custom); @@ -265,12 +235,12 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SELECT, MVT::i8, Legal); setOperationAction(ISD::SELECT, MVT::i16, Legal); setOperationAction(ISD::SELECT, MVT::i32, Legal); - setOperationAction(ISD::SELECT, MVT::i64, Expand); + setOperationAction(ISD::SELECT, MVT::i64, Legal); setOperationAction(ISD::SETCC, MVT::i8, Legal); setOperationAction(ISD::SETCC, MVT::i16, Legal); - setOperationAction(ISD::SETCC, MVT::i32, Legal); - setOperationAction(ISD::SETCC, MVT::i64, Expand); + setOperationAction(ISD::SETCC, MVT::i32, Custom); + setOperationAction(ISD::SETCC, MVT::i64, Custom); // Zero extension and sign extension for i64 have to be // custom legalized @@ -278,10 +248,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom); - // Custom lower truncates - setOperationAction(ISD::TRUNCATE, MVT::i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::i32, Custom); + // Custom lower i128 -> i64 truncates setOperationAction(ISD::TRUNCATE, MVT::i64, Custom); // SPU has a legal FP -> signed INT instruction @@ -292,7 +259,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) // FDIV on SPU requires custom lowering setOperationAction(ISD::FDIV, MVT::f32, Custom); - //setOperationAction(ISD::FDIV, MVT::f64, Custom); + setOperationAction(ISD::FDIV, MVT::f64, Expand); // libcall // SPU has [U|S]INT_TO_FP setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); @@ -402,7 +369,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); setShiftAmountType(MVT::i32); - setBooleanContents(ZeroOrOneBooleanContent); + setBooleanContents(ZeroOrNegativeOneBooleanContent); setStackPointerRegisterToSaveRestore(SPU::R1); @@ -435,7 +402,7 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB"; node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK"; node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; - node_names[(unsigned) SPUISD::PROMOTE_SCALAR] = "SPUISD::PROMOTE_SCALAR"; + node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC"; node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT"; node_names[(unsigned) SPUISD::MPY] = "SPUISD::MPY"; node_names[(unsigned) SPUISD::MPYU] = "SPUISD::MPYU"; @@ -471,9 +438,14 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const return ((i != node_names.end()) ?
i->second : 0); } +//===----------------------------------------------------------------------===// +// Return the Cell SPU's SETCC result type +//===----------------------------------------------------------------------===// + MVT SPUTargetLowering::getSetCCResultType(const SDValue &Op) const { MVT VT = Op.getValueType(); - return (VT.isInteger() ? VT : MVT(MVT::i32)); + // i8, i16 and i32 are valid SETCC result types + return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ? VT : MVT::i32); } //===----------------------------------------------------------------------===// @@ -486,105 +458,6 @@ MVT SPUTargetLowering::getSetCCResultType(const SDValue &Op) const { // LowerOperation implementation //===----------------------------------------------------------------------===// -/// Aligned load common code for CellSPU -/*! - \param[in] Op The SelectionDAG load or store operand - \param[in] DAG The selection DAG - \param[in] ST CellSPU subtarget information structure - \param[in,out] alignment Caller initializes this to the load or store node's - value from getAlignment(), may be updated while generating the aligned load - \param[in,out] alignOffs Aligned offset; set by AlignedLoad to the aligned - offset (divisible by 16, modulo 16 == 0) - \param[in,out] prefSlotOffs Preferred slot offset; set by AlignedLoad to the - offset of the preferred slot (modulo 16 != 0) - \param[in,out] VT Caller initializes this value type to the load or store - node's loaded or stored value type; may be updated if an i1-extended load or - store. - \param[out] was16aligned true if the base pointer had 16-byte alignment, - otherwise false. Can help to determine if the chunk needs to be rotated. - - Both load and store lowering load a block of data aligned on a 16-byte - boundary. This is the common aligned load code shared between both. - */ -static SDValue -AlignedLoad(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST, - LSBaseSDNode *LSN, - unsigned &alignment, int &alignOffs, int &prefSlotOffs, - MVT &VT, bool &was16aligned) -{ - MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - const valtype_map_s *vtm = getValueTypeMapEntry(VT); - SDValue basePtr = LSN->getBasePtr(); - SDValue chain = LSN->getChain(); - - if (basePtr.getOpcode() == ISD::ADD) { - SDValue Op1 = basePtr.getNode()->getOperand(1); - - if (Op1.getOpcode() == ISD::Constant - || Op1.getOpcode() == ISD::TargetConstant) { - const ConstantSDNode *CN = cast<ConstantSDNode>(basePtr.getOperand(1)); - - alignOffs = (int) CN->getZExtValue(); - prefSlotOffs = (int) (alignOffs & 0xf); - - // Adjust the rotation amount to ensure that the final result ends up in - // the preferred slot: - prefSlotOffs -= vtm->prefslot_byte; - basePtr = basePtr.getOperand(0); - - // Loading from memory, can we adjust alignment?
- if (basePtr.getOpcode() == SPUISD::AFormAddr) { - SDValue APtr = basePtr.getOperand(0); - if (APtr.getOpcode() == ISD::TargetGlobalAddress) { - GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(APtr); - alignment = GSDN->getGlobal()->getAlignment(); - } - } - } else { - alignOffs = 0; - prefSlotOffs = -vtm->prefslot_byte; - } - } else if (basePtr.getOpcode() == ISD::FrameIndex) { - FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(basePtr); - alignOffs = int(FIN->getIndex() * SPUFrameInfo::stackSlotSize()); - prefSlotOffs = (int) (alignOffs & 0xf); - prefSlotOffs -= vtm->prefslot_byte; - } else { - alignOffs = 0; - prefSlotOffs = -vtm->prefslot_byte; - } - - if (alignment == 16) { - // Realign the base pointer as a D-Form address: - if (!isMemoryOperand(basePtr) || (alignOffs & ~0xf) != 0) { - basePtr = DAG.getNode(ISD::ADD, PtrVT, - basePtr, - DAG.getConstant((alignOffs & ~0xf), PtrVT)); - } - - // Emit the vector load: - was16aligned = true; - return DAG.getLoad(MVT::v16i8, chain, basePtr, - LSN->getSrcValue(), LSN->getSrcValueOffset(), - LSN->isVolatile(), 16); - } - - // Unaligned load or we're using the "large memory" model, which means that - // we have to be very pessimistic: - if (isMemoryOperand(basePtr) || isIndirectOperand(basePtr)) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, basePtr, - DAG.getConstant(0, PtrVT)); - } - - // Add the offset - basePtr = DAG.getNode(ISD::ADD, PtrVT, basePtr, - DAG.getConstant((alignOffs & ~0xf), PtrVT)); - was16aligned = false; - return DAG.getLoad(MVT::v16i8, chain, basePtr, - LSN->getSrcValue(), LSN->getSrcValueOffset(), - LSN->isVolatile(), 16); -} - /// Custom lower loads for CellSPU /*! All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements @@ -605,43 +478,110 @@ static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { LoadSDNode *LN = cast<LoadSDNode>(Op); SDValue the_chain = LN->getChain(); + MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); MVT InVT = LN->getMemoryVT(); MVT OutVT = Op.getValueType(); ISD::LoadExtType ExtType = LN->getExtensionType(); unsigned alignment = LN->getAlignment(); - SDValue Ops[8]; + const valtype_map_s *vtm = getValueTypeMapEntry(InVT); switch (LN->getAddressingMode()) { case ISD::UNINDEXED: { - int offset, rotamt; - bool was16aligned; - SDValue result = - AlignedLoad(Op, DAG, ST, LN,alignment, offset, rotamt, InVT, - was16aligned); - - if (result.getNode() == 0) - return result; - - the_chain = result.getValue(1); - // Rotate the chunk if necessary - if (rotamt < 0) - rotamt += 16; - if (rotamt != 0 || !was16aligned) { - SDVTList vecvts = DAG.getVTList(MVT::v16i8, MVT::Other); - - Ops[0] = result; - if (was16aligned) { - Ops[1] = DAG.getConstant(rotamt, MVT::i16); + SDValue result; + SDValue basePtr = LN->getBasePtr(); + SDValue rotate; + + if (alignment == 16) { + ConstantSDNode *CN; + + // Special cases for a known aligned load to simplify the base pointer + // and the rotation amount: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast<ConstantSDNode> (basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte); + + if (rotamt < 0) + rotamt += 16; + + rotate = DAG.getConstant(rotamt, MVT::i16); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + if ((offset & ~0xf) > 0) { + basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, + basePtr, + DAG.getConstant((offset & ~0xf), PtrVT)); + } + } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) + || (basePtr.getOpcode()
== SPUISD::IndirectAddr + && basePtr.getOperand(0).getOpcode() == SPUISD::Hi + && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { + // Plain aligned a-form address: rotate into preferred slot + // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) + int64_t rotamt = -vtm->prefslot_byte; + if (rotamt < 0) + rotamt += 16; + rotate = DAG.getConstant(rotamt, MVT::i16); } else { - MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); - LoadSDNode *LN1 = cast<LoadSDNode>(result); - Ops[1] = DAG.getNode(ISD::ADD, PtrVT, LN1->getBasePtr(), + // Offset the rotate amount by the basePtr and the preferred slot + // byte offset + int64_t rotamt = -vtm->prefslot_byte; + if (rotamt < 0) + rotamt += 16; + rotate = DAG.getNode(ISD::ADD, PtrVT, + basePtr, DAG.getConstant(rotamt, PtrVT)); } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa<ConstantSDNode>(Op1)) { + // Convert the (add <ptr>, <const>) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. + basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, VReg, PtrVT); + } else { + // Convert the (add <ptr>, <ptr>) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. + basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } - result = DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v16i8, Ops, 2); + // Offset the rotate amount by the basePtr and the preferred slot + // byte offset + rotate = DAG.getNode(ISD::ADD, PtrVT, + basePtr, + DAG.getConstant(-vtm->prefslot_byte, PtrVT)); } + // Re-emit as a v16i8 vector load + result = DAG.getLoad(MVT::v16i8, the_chain, basePtr, + LN->getSrcValue(), LN->getSrcValueOffset(), + LN->isVolatile(), 16); + + // Update the chain + the_chain = result.getValue(1); + + // Rotate into the preferred slot: + result = DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v16i8, + result.getValue(0), rotate); + // Convert the loaded v16i8 vector to the appropriate vector type // specified by the operand: MVT vecVT = MVT::getVectorVT(InVT, (128 / InVT.getSizeInBits())); @@ -704,23 +644,86 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { switch (SN->getAddressingMode()) { case ISD::UNINDEXED: { - int chunk_offset, slot_offset; - bool was16aligned; - // The vector type we really want to load from the 16-byte chunk.
MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits())), stVecVT = MVT::getVectorVT(StVT, (128 / StVT.getSizeInBits())); - SDValue alignLoadVec = - AlignedLoad(Op, DAG, ST, SN, alignment, - chunk_offset, slot_offset, VT, was16aligned); + SDValue alignLoadVec; + SDValue basePtr = SN->getBasePtr(); + SDValue the_chain = SN->getChain(); + SDValue insertEltOffs; + + if (alignment == 16) { + ConstantSDNode *CN; + + // Special cases for a known aligned load to simplify the base pointer + // and insertion byte: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, PtrVT, + basePtr, + DAG.getConstant((offset & 0xf), PtrVT)); + + if ((offset & ~0xf) > 0) { + basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, + basePtr, + DAG.getConstant((offset & ~0xf), PtrVT)); + } + } else { + // Otherwise, assume it's at byte 0 of basePtr + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa<ConstantSDNode>(Op1)) { + // Convert the (add <ptr>, <const>) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. + basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, VReg, PtrVT); + } else { + // Convert the (add <ptr>, <ptr>) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. + basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Insertion point is solely determined by basePtr's contents + insertEltOffs = DAG.getNode(ISD::ADD, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Re-emit as a v16i8 vector load + alignLoadVec = DAG.getLoad(MVT::v16i8, the_chain, basePtr, + SN->getSrcValue(), SN->getSrcValueOffset(), + SN->isVolatile(), 16); - if (alignLoadVec.getNode() == 0) - return alignLoadVec; + // Update the chain + the_chain = alignLoadVec.getValue(1); LoadSDNode *LN = cast<LoadSDNode>(alignLoadVec); - SDValue basePtr = LN->getBasePtr(); - SDValue the_chain = alignLoadVec.getValue(1); SDValue theValue = SN->getValue(); SDValue result; @@ -732,29 +735,20 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { theValue = theValue.getOperand(0); } - chunk_offset &= 0xf; - - SDValue insertEltOffs = DAG.getConstant(chunk_offset, PtrVT); - SDValue insertEltPtr; - // If the base pointer is already a D-form address, then just create // a new D-form address with a slot offset and the original base pointer. // Otherwise generate a D-form address with the slot offset relative // to the stack pointer, which is always aligned.
- DEBUG(cerr << "CellSPU LowerSTORE: basePtr = "); - DEBUG(basePtr.getNode()->dump(&DAG)); - DEBUG(cerr << "\n"); - - if (basePtr.getOpcode() == SPUISD::IndirectAddr || - (basePtr.getOpcode() == ISD::ADD - && basePtr.getOperand(0).getOpcode() == SPUISD::IndirectAddr)) { - insertEltPtr = basePtr; - } else { - insertEltPtr = DAG.getNode(ISD::ADD, PtrVT, basePtr, insertEltOffs); - } +#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + cerr << "CellSPU LowerSTORE: basePtr = "; + basePtr.getNode()->dump(&DAG); + cerr << "\n"; + } +#endif SDValue insertEltOp = - DAG.getNode(SPUISD::SHUFFLE_MASK, vecVT, insertEltPtr); + DAG.getNode(SPUISD::SHUFFLE_MASK, vecVT, insertEltOffs); SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, vecVT, theValue); @@ -919,22 +913,31 @@ LowerConstantFP(SDValue Op, SelectionDAG &DAG) { return SDValue(); } -//! Lower MVT::i8 brcond to a promoted type (MVT::i32, MVT::i16) static SDValue -LowerBRCOND(SDValue Op, SelectionDAG &DAG) -{ +LowerBRCOND(SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI) { SDValue Cond = Op.getOperand(1); MVT CondVT = Cond.getValueType(); - MVT CondNVT; + unsigned CondOpc; if (CondVT == MVT::i8) { - CondNVT = MVT::i16; + SDValue CondOp0 = Cond.getOperand(0); + if (Cond.getOpcode() == ISD::TRUNCATE) { + // Use the truncate's value type and ANY_EXTEND the condition (DAGcombine + // will then remove the truncate) + CondVT = CondOp0.getValueType(); + CondOpc = ISD::ANY_EXTEND; + } else { + CondVT = MVT::i32; // default to something reasonable + CondOpc = ISD::ZERO_EXTEND; + } + + Cond = DAG.getNode(CondOpc, CondVT, Op.getOperand(1)); + return DAG.getNode(ISD::BRCOND, Op.getValueType(), - Op.getOperand(0), - DAG.getNode(ISD::ZERO_EXTEND, CondNVT, Op.getOperand(1)), - Op.getOperand(2)); - } else - return SDValue(); // Unchanged + Op.getOperand(0), Cond, Op.getOperand(2)); + } + + return SDValue(); // Unchanged } static SDValue @@ -1896,7 +1899,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { case MVT::i64: case MVT::f32: case MVT::f64: - return DAG.getNode(SPUISD::PROMOTE_SCALAR, Op.getValueType(), Op0, Op0); + return DAG.getNode(SPUISD::PREFSLOT2VEC, Op.getValueType(), Op0, Op0); } } @@ -2274,9 +2277,11 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { return result; } -static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) +static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc, + const TargetLowering &TLI) { SDValue N0 = Op.getOperand(0); // Everything has at least one operand + MVT ShiftVT = TLI.getShiftAmountTy(); assert(Op.getValueType() == MVT::i8); switch (Opc) { @@ -2290,11 +2295,11 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) SDValue N1 = Op.getOperand(1); N0 = (N0.getOpcode() != ISD::Constant ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0) - : DAG.getConstant(cast<ConstantSDNode>(N0)->getZExtValue(), + : DAG.getConstant(cast<ConstantSDNode>(N0)->getSExtValue(), MVT::i16)); N1 = (N1.getOpcode() != ISD::Constant ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N1) - : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(), + : DAG.getConstant(cast<ConstantSDNode>(N1)->getSExtValue(), MVT::i16)); return DAG.getNode(ISD::TRUNCATE, MVT::i8, DAG.getNode(Opc, MVT::i16, N0, N1)); @@ -2307,13 +2312,13 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) ? DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, N0) : DAG.getConstant(cast<ConstantSDNode>(N0)->getZExtValue(), MVT::i16)); - N1Opc = N1.getValueType().bitsLT(MVT::i32) + N1Opc = N1.getValueType().bitsLT(ShiftVT) ?
ISD::ZERO_EXTEND : ISD::TRUNCATE; N1 = (N1.getOpcode() != ISD::Constant - ? DAG.getNode(N1Opc, MVT::i32, N1) + ? DAG.getNode(N1Opc, ShiftVT, N1) : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(), - MVT::i32)); + TLI.getShiftAmountTy())); SDValue ExpandArg = DAG.getNode(ISD::OR, MVT::i16, N0, DAG.getNode(ISD::SHL, MVT::i16, @@ -2328,14 +2333,13 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) N0 = (N0.getOpcode() != ISD::Constant ? DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, N0) : DAG.getConstant(cast<ConstantSDNode>(N0)->getZExtValue(), - MVT::i16)); - N1Opc = N1.getValueType().bitsLT(MVT::i16) + MVT::i32)); + N1Opc = N1.getValueType().bitsLT(ShiftVT) ? ISD::ZERO_EXTEND : ISD::TRUNCATE; N1 = (N1.getOpcode() != ISD::Constant - ? DAG.getNode(N1Opc, MVT::i16, N1) - : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(), - MVT::i16)); + ? DAG.getNode(N1Opc, ShiftVT, N1) + : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(), ShiftVT)); return DAG.getNode(ISD::TRUNCATE, MVT::i8, DAG.getNode(Opc, MVT::i16, N0, N1)); } @@ -2344,15 +2348,15 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) unsigned N1Opc; N0 = (N0.getOpcode() != ISD::Constant ? DAG.getNode(ISD::SIGN_EXTEND, MVT::i16, N0) - : DAG.getConstant(cast<ConstantSDNode>(N0)->getZExtValue(), + : DAG.getConstant(cast<ConstantSDNode>(N0)->getSExtValue(), MVT::i16)); - N1Opc = N1.getValueType().bitsLT(MVT::i16) + N1Opc = N1.getValueType().bitsLT(ShiftVT) ? ISD::SIGN_EXTEND : ISD::TRUNCATE; N1 = (N1.getOpcode() != ISD::Constant - ? DAG.getNode(N1Opc, MVT::i16, N1) + ? DAG.getNode(N1Opc, ShiftVT, N1) : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(), - MVT::i16)); + ShiftVT)); return DAG.getNode(ISD::TRUNCATE, MVT::i8, DAG.getNode(Opc, MVT::i16, N0, N1)); } @@ -2366,7 +2370,7 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) N1Opc = N1.getValueType().bitsLT(MVT::i16) ? ISD::SIGN_EXTEND : ISD::TRUNCATE; N1 = (N1.getOpcode() != ISD::Constant ?
DAG.getNode(N1Opc, MVT::i16, N1) - : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(), + : DAG.getConstant(cast<ConstantSDNode>(N1)->getSExtValue(), MVT::i16)); return DAG.getNode(ISD::TRUNCATE, MVT::i8, DAG.getNode(Opc, MVT::i16, N0, N1)); @@ -2397,7 +2401,7 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) DEBUG(cerr << "CellSPU.LowerI64Math: lowering zero/sign/any extend\n"); SDValue PromoteScalar = - DAG.getNode(SPUISD::PROMOTE_SCALAR, Op0VecVT, Op0); + DAG.getNode(SPUISD::PREFSLOT2VEC, Op0VecVT, Op0); if (Opc != ISD::SIGN_EXTEND) { // Use a shuffle to zero extend the i32 to i64 directly: @@ -2438,9 +2442,9 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) // Turn operands into vectors to satisfy type checking (shufb works on // vectors) SDValue Op0 = - DAG.getNode(SPUISD::PROMOTE_SCALAR, MVT::v2i64, Op.getOperand(0)); + DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(0)); SDValue Op1 = - DAG.getNode(SPUISD::PROMOTE_SCALAR, MVT::v2i64, Op.getOperand(1)); + DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(1)); SmallVector<SDValue, 16> ShufBytes; // Create the shuffle mask for "rotating" the borrow up one register slot @@ -2467,9 +2471,9 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) // Turn operands into vectors to satisfy type checking (shufb works on // vectors) SDValue Op0 = - DAG.getNode(SPUISD::PROMOTE_SCALAR, MVT::v2i64, Op.getOperand(0)); + DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(0)); SDValue Op1 = - DAG.getNode(SPUISD::PROMOTE_SCALAR, MVT::v2i64, Op.getOperand(1)); + DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(1)); SmallVector<SDValue, 16> ShufBytes; // Create the shuffle mask for "rotating" the borrow up one register slot @@ -2495,7 +2499,7 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) case ISD::SHL: { SDValue ShiftAmt = Op.getOperand(1); MVT ShiftAmtVT = ShiftAmt.getValueType(); - SDValue Op0Vec = DAG.getNode(SPUISD::PROMOTE_SCALAR, VecVT, Op0); + SDValue Op0Vec = DAG.getNode(SPUISD::PREFSLOT2VEC, VecVT, Op0); SDValue MaskLower = DAG.getNode(SPUISD::SELB, VecVT, Op0Vec, @@ -2540,7 +2544,7 @@ static SDValue LowerI64Math(SDValue Op, SelectionDAG &DAG, unsigned Opc) case ISD::SRA: { // Promote Op0 to vector SDValue Op0 = - DAG.getNode(SPUISD::PROMOTE_SCALAR, MVT::v2i64, Op.getOperand(0)); + DAG.getNode(SPUISD::PREFSLOT2VEC, MVT::v2i64, Op.getOperand(0)); SDValue ShiftAmt = Op.getOperand(1); MVT ShiftVT = ShiftAmt.getValueType(); @@ -2669,7 +2673,7 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { SDValue N = Op.getOperand(0); SDValue Elt0 = DAG.getConstant(0, MVT::i32); - SDValue Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N); + SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, vecVT, N, N); SDValue CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i8, CNTB, Elt0); @@ -2686,7 +2690,7 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16); SDValue Shift1 = DAG.getConstant(8, MVT::i32); - SDValue Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N); + SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, vecVT, N, N); SDValue CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote); // CNTB_result becomes the chain to which all of the virtual registers @@ -2720,7 +2724,7 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { SDValue Shift1 = DAG.getConstant(16, MVT::i32); SDValue Shift2 = DAG.getConstant(8, MVT::i32); - SDValue
Promote = DAG.getNode(SPUISD::PROMOTE_SCALAR, vecVT, N, N); + SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, vecVT, N, N); SDValue CNTB = DAG.getNode(SPUISD::CNTB, vecVT, Promote); // CNTB_result becomes the chain to which all of the virtual registers @@ -2760,6 +2764,32 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { return SDValue(); } +//! Lower ISD::SETCC +/*! + Lower i64 condition code handling. + */ + +static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType(); + SDValue lhs = Op.getOperand(0); + SDValue rhs = Op.getOperand(1); + SDValue condition = Op.getOperand(2); + + if (VT == MVT::i32 && lhs.getValueType() == MVT::i64) { + // Expand the i64 comparisons to what Cell can actually support, + // which is eq, ugt and sgt: +#if 0 + CondCodeSDNode *ccvalue = dyn_cast<CondCodeSDNode>(condition); + + switch (ccvalue->get()) { + case + } +#endif + } + + return SDValue(); +} + //! Lower ISD::SELECT_CC /*! ISD::SELECT_CC can (generally) be implemented directly on the SPU using the @@ -2772,7 +2802,8 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { assumption, given the simplistic uses so far. */ -static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG, + const TargetLowering &TLI) { MVT VT = Op.getValueType(); SDValue lhs = Op.getOperand(0); SDValue rhs = Op.getOperand(1); @@ -2780,12 +2811,20 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) { SDValue falseval = Op.getOperand(3); SDValue condition = Op.getOperand(4); + // NOTE: SELB's arguments: $rA, $rB, $mask + // + // SELB selects bits from $rA where bits in $mask are 0, bits from $rB + // where bits in $mask are 1. CCond will be inverted, having 1s where the + // condition was true and 0s where the condition was false. Hence, the + // arguments to SELB get reversed. + // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up // with another "cannot select select_cc" assert: - SDValue compare = DAG.getNode(ISD::SETCC, VT, lhs, rhs, condition); - return DAG.getNode(SPUISD::SELB, VT, trueval, falseval, compare); + SDValue compare = DAG.getNode(ISD::SETCC, TLI.getSetCCResultType(Op), + lhs, rhs, condition); + return DAG.getNode(SPUISD::SELB, VT, falseval, trueval, compare); } //!
Custom lower ISD::TRUNCATE @@ -2799,89 +2838,29 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) MVT Op0VT = Op0.getValueType(); MVT Op0VecVT = MVT::getVectorVT(Op0VT, (128 / Op0VT.getSizeInBits())); - SDValue PromoteScalar = DAG.getNode(SPUISD::PROMOTE_SCALAR, Op0VecVT, Op0); + // Create shuffle mask + if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) { + // least significant doubleword of quadword + unsigned maskHigh = 0x08090a0b; + unsigned maskLow = 0x0c0d0e0f; + // Use a shuffle to perform the truncation + SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, + DAG.getConstant(maskHigh, MVT::i32), + DAG.getConstant(maskLow, MVT::i32), + DAG.getConstant(maskHigh, MVT::i32), + DAG.getConstant(maskLow, MVT::i32)); - unsigned maskLow; - unsigned maskHigh; - // Create shuffle mask - switch (Op0VT.getSimpleVT()) { - case MVT::i128: - switch (simpleVT) { - case MVT::i64: - // least significant doubleword of quadword - maskHigh = 0x08090a0b; - maskLow = 0x0c0d0e0f; - break; - case MVT::i32: - // least significant word of quadword - maskHigh = maskLow = 0x0c0d0e0f; - break; - case MVT::i16: - // least significant halfword of quadword - maskHigh = maskLow = 0x0e0f0e0f; - break; - case MVT::i8: - // least significant byte of quadword - maskHigh = maskLow = 0x0f0f0f0f; - break; - default: - cerr << "Truncation to illegal type!"; - abort(); - } - break; - case MVT::i64: - switch (simpleVT) { - case MVT::i32: - // least significant word of doubleword - maskHigh = maskLow = 0x04050607; - break; - case MVT::i16: - // least significant halfword of doubleword - maskHigh = maskLow = 0x06070607; - break; - case MVT::i8: - // least significant byte of doubleword - maskHigh = maskLow = 0x07070707; - break; - default: - cerr << "Truncation to illegal type!"; - abort(); - } - break; - case MVT::i32: - case MVT::i16: - switch (simpleVT) { - case MVT::i16: - // least significant halfword of word - maskHigh = maskLow = 0x02030203; - break; - case MVT::i8: - // least significant byte of word/halfword - maskHigh = maskLow = 0x03030303; - break; - default: - cerr << "Truncation to illegal type!"; - abort(); - } - break; - default: - cerr << "Trying to lower truncation from illegal type!"; - abort(); - } + SDValue PromoteScalar = DAG.getNode(SPUISD::PREFSLOT2VEC, Op0VecVT, Op0); - // Use a shuffle to perform the truncation - SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, - DAG.getConstant(maskHigh, MVT::i32), - DAG.getConstant(maskLow, MVT::i32), - DAG.getConstant(maskHigh, MVT::i32), - DAG.getConstant(maskLow, MVT::i32)); + SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, Op0VecVT, + PromoteScalar, PromoteScalar, shufMask); - SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, Op0VecVT, - PromoteScalar, PromoteScalar, shufMask); + return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, + DAG.getNode(ISD::BIT_CONVERT, VecVT, truncShuffle)); + } - return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, - DAG.getNode(ISD::BIT_CONVERT, VecVT, truncShuffle)); + return SDValue(); // Leave the truncate unmolested } //! 
Custom (target-specific) lowering entry point @@ -2921,7 +2900,7 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) case ISD::ConstantFP: return LowerConstantFP(Op, DAG); case ISD::BRCOND: - return LowerBRCOND(Op, DAG); + return LowerBRCOND(Op, DAG, *this); case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex); case ISD::CALL: @@ -2942,7 +2921,7 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) case ISD::SHL: case ISD::SRA: { if (VT == MVT::i8) - return LowerI8Math(Op, DAG, Opc); + return LowerI8Math(Op, DAG, Opc, *this); else if (VT == MVT::i64) return LowerI64Math(Op, DAG, Opc); break; @@ -2971,7 +2950,7 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) if (VT.isVector()) return LowerVectorMUL(Op, DAG); else if (VT == MVT::i8) - return LowerI8Math(Op, DAG, Opc); + return LowerI8Math(Op, DAG, Opc, *this); else return LowerMUL(Op, DAG, VT, Opc); @@ -2990,10 +2969,13 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) return LowerCTPOP(Op, DAG); case ISD::SELECT_CC: - return LowerSELECT_CC(Op, DAG); + return LowerSELECT_CC(Op, DAG, *this); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); + + case ISD::SETCC: + return LowerSETCC(Op, DAG); } return SDValue(); @@ -3036,7 +3018,7 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const SelectionDAG &DAG = DCI.DAG; SDValue Op0 = N->getOperand(0); // everything has at least one operand MVT NodeVT = N->getValueType(0); // The node's value type - MVT Op0VT = Op0.getValueType(); // The first operand's result + MVT Op0VT = Op0.getValueType(); // The first operand's result SDValue Result; // Initially, empty result switch (N->getOpcode()) { default: break; case ISD::ADD: { SDValue Op1 = N->getOperand(1); - if (isa<ConstantSDNode>(Op1) && Op0.getOpcode() == SPUISD::IndirectAddr) { - SDValue Op01 = Op0.getOperand(1); - if (Op01.getOpcode() == ISD::Constant - || Op01.getOpcode() == ISD::TargetConstant) { - // (add <const>, (SPUindirect <arg>, <const>)) -> - // (SPUindirect <arg>, <const + const>) - ConstantSDNode *CN0 = cast<ConstantSDNode>(Op1); - ConstantSDNode *CN1 = cast<ConstantSDNode>(Op01); - SDValue combinedConst = - DAG.getConstant(CN0->getZExtValue() + CN1->getZExtValue(), Op0VT); + if (Op0.getOpcode() == SPUISD::IndirectAddr + || Op1.getOpcode() == SPUISD::IndirectAddr) { + // Normalize the operands to reduce repeated code + SDValue IndirectArg = Op0, AddArg = Op1; + + if (Op1.getOpcode() == SPUISD::IndirectAddr) { + IndirectArg = Op1; + AddArg = Op0; + } + + if (isa<ConstantSDNode>(AddArg)) { + ConstantSDNode *CN0 = cast<ConstantSDNode> (AddArg); + SDValue IndOp1 = IndirectArg.getOperand(1); + + if (CN0->isNullValue()) { + // (add (SPUindirect <arg>, <const>), 0) -> + // (SPUindirect <arg>, <const>) #if !defined(NDEBUG) - if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { cerr << "\n" - << "Replace: (add " << CN0->getZExtValue() << ", " - << "(SPUindirect <arg>, " << CN1->getZExtValue() << "))\n" + << "Replace: (add (SPUindirect <arg>, <const>), 0)\n" + << "With: (SPUindirect <arg>, <const>)\n"; + } +#endif + + return IndirectArg; + } else if (isa<ConstantSDNode>(IndOp1)) { + // (add (SPUindirect <arg>, <const>), <const>) -> + // (SPUindirect <arg>, <const + const>) + ConstantSDNode *CN1 = cast<ConstantSDNode> (IndOp1); + int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue(); + SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT); + +#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + cerr << "\n" + << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue() + << "), " <<
CN0->getSExtValue() << ")\n" << "With: (SPUindirect <arg>, " << combinedConst << ")\n"; } #endif - return DAG.getNode(SPUISD::IndirectAddr, Op0VT, - Op0.getOperand(0), combinedConst); - } - } else if (isa<ConstantSDNode>(Op0) - && Op1.getOpcode() == SPUISD::IndirectAddr) { - SDValue Op11 = Op1.getOperand(1); - if (Op11.getOpcode() == ISD::Constant - || Op11.getOpcode() == ISD::TargetConstant) { - // (add (SPUindirect <arg>, <const>), <const>) -> - // (SPUindirect <arg>, <const + const>) - ConstantSDNode *CN0 = cast<ConstantSDNode>(Op0); - ConstantSDNode *CN1 = cast<ConstantSDNode>(Op11); - SDValue combinedConst = - DAG.getConstant(CN0->getZExtValue() + CN1->getZExtValue(), Op0VT); - - DEBUG(cerr << "Replace: (add " << CN0->getZExtValue() << ", " - << "(SPUindirect <arg>, " << CN1->getZExtValue() << "))\n"); - DEBUG(cerr << "With: (SPUindirect <arg>, " - << CN0->getZExtValue() + CN1->getZExtValue() << ")\n"); - - return DAG.getNode(SPUISD::IndirectAddr, Op1.getValueType(), - Op1.getOperand(0), combinedConst); + return DAG.getNode(SPUISD::IndirectAddr, Op0VT, + IndirectArg, combinedValue); + } } } break; @@ -3127,6 +3113,25 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const return Op0; } + } else if (Op0.getOpcode() == ISD::ADD) { + SDValue Op1 = N->getOperand(1); + if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) { + // (SPUindirect (add <arg>, <arg>), 0) -> + // (SPUindirect <arg>, <arg>) + if (CN1->isNullValue()) { + +#if !defined(NDEBUG) + if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { + cerr << "\n" + << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n" + << "With: (SPUindirect <arg>, <arg>)\n"; + } +#endif + + return DAG.getNode(SPUISD::IndirectAddr, Op0VT, + Op0.getOperand(0), Op0.getOperand(1)); + } + } } break; } @@ -3136,19 +3141,19 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const case SPUISD::VEC_SRL: case SPUISD::VEC_SRA: case SPUISD::ROTQUAD_RZ_BYTES: - case SPUISD::ROTQUAD_RZ_BITS: { + case SPUISD::ROTQUAD_RZ_BITS: + case SPUISD::ROTBYTES_LEFT: { SDValue Op1 = N->getOperand(1); - if (isa<ConstantSDNode>(Op1)) { - // Kill degenerate vector shifts: - ConstantSDNode *CN = cast<ConstantSDNode>(Op1); - if (CN->getZExtValue() == 0) { + // Kill degenerate vector shifts: + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) { + if (CN->isNullValue()) { Result = Op0; } } break; } - case SPUISD::PROMOTE_SCALAR: { + case SPUISD::PREFSLOT2VEC: { switch (Op0.getOpcode()) { default: break; @@ -3263,7 +3268,7 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, case CNTB: #endif - case SPUISD::PROMOTE_SCALAR: { + case SPUISD::PREFSLOT2VEC: { SDValue Op0 = Op.getOperand(0); MVT Op0VT = Op0.getValueType(); unsigned Op0VTBits = Op0VT.getSizeInBits(); @@ -3306,7 +3311,25 @@ SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, #endif } } + +unsigned +SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, + unsigned Depth) const { + switch (Op.getOpcode()) { + default: + return 1; + case ISD::SETCC: { + MVT VT = Op.getValueType(); + + if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) { + VT = MVT::i32; + } + return VT.getSizeInBits(); + } + } +} + // LowerAsmOperandForConstraint void SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op, diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index dd1f97f8d35..8d2e9945455 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -39,7 +39,7 @@ namespace llvm { SHUFB, ///< Vector shuffle (permute) SHUFFLE_MASK, ///< Shuffle mask CNTB, ///< Count leading ones in bytes - PROMOTE_SCALAR, ///<
Promote scalar->vector + PREFSLOT2VEC, ///< Promote scalar->vector VEC2PREFSLOT, ///< Extract element 0 MPY, ///< 16-bit Multiply (low parts of a 32-bit) MPYU, ///< Multiply Unsigned @@ -58,6 +58,7 @@ namespace llvm { ROTBYTES_LEFT_BITS, ///< Rotate bytes left by bit shift count SELECT_MASK, ///< Select Mask (FSM, FSMB, FSMH, FSMBI) SELB, ///< Select bits -> (b & mask) | (a & ~mask) + GATHER_BITS, ///< Gather bits from bytes/words/halfwords ADD_EXTENDED, ///< Add extended, with carry CARRY_GENERATE, ///< Carry generate for ADD_EXTENDED SUB_EXTENDED, ///< Subtract extended, with borrow @@ -120,6 +121,9 @@ namespace llvm { const SelectionDAG &DAG, unsigned Depth = 0) const; + virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op, + unsigned Depth = 0) const; + ConstraintType getConstraintType(const std::string &ConstraintLetter) const; std::pair<unsigned, const TargetRegisterClass*> diff --git a/lib/Target/CellSPU/SPUInstrFormats.td b/lib/Target/CellSPU/SPUInstrFormats.td index f423dfa3420..710196467bc 100644 --- a/lib/Target/CellSPU/SPUInstrFormats.td +++ b/lib/Target/CellSPU/SPUInstrFormats.td @@ -120,9 +120,8 @@ class CVTIntFPForm opcode, dag OOL, dag IOL, string asmstr, } let RA = 0 in { - class BICondForm<bits<11> opcode, string asmstr, list<dag> pattern> - : RRForm + class BICondForm<bits<11> opcode, dag OOL, dag IOL, string asmstr, list<dag> pattern> + : RRForm<opcode, OOL, IOL, asmstr, BranchResolv, pattern> { } let RT = 0 in { diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp index 442d49141b1..37a58705795 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.cpp +++ b/lib/Target/CellSPU/SPUInstrInfo.cpp @@ -34,10 +34,14 @@ namespace { inline bool isCondBranch(const MachineInstr *I) { unsigned opc = I->getOpcode(); - return (opc == SPU::BRNZ - || opc == SPU::BRZ - || opc == SPU::BRHNZ - || opc == SPU::BRHZ); + return (opc == SPU::BRNZr32 + || opc == SPU::BRNZv4i32 + || opc == SPU::BRZr32 + || opc == SPU::BRZv4i32 + || opc == SPU::BRHNZr16 + || opc == SPU::BRHNZv8i16 + || opc == SPU::BRHZr16 + || opc == SPU::BRHZv8i16); } } @@ -103,6 +107,19 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI, return true; } break; + case SPU::LRr8: + case SPU::LRr16: + case SPU::LRr32: + case SPU::LRf32: + case SPU::LRr64: + case SPU::LRf64: + case SPU::LRr128: + case SPU::LRv16i8: + case SPU::LRv8i16: + case SPU::LRv4i32: + case SPU::LRv4f32: + case SPU::LRv2i64: + case SPU::LRv2f64: case SPU::ORv16i8_i8: case SPU::ORv8i16_i16: case SPU::ORv4i32_i32: @@ -114,7 +131,18 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI, case SPU::ORi32_v4i32: case SPU::ORi64_v2i64: case SPU::ORf32_v4f32: - case SPU::ORf64_v2f64: + case SPU::ORf64_v2f64: { + assert(MI.getNumOperands() == 2 && + MI.getOperand(0).isReg() && + MI.getOperand(1).isReg() && + "invalid SPU OR<type>_<type> instruction!"); + if (MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) { + sourceReg = MI.getOperand(0).getReg(); + destReg = MI.getOperand(0).getReg(); + return true; + } + break; + } case SPU::ORv16i8: case SPU::ORv8i16: case SPU::ORv4i32: @@ -198,18 +226,14 @@ SPUInstrInfo::isStoreToStackSlot(const MachineInstr *MI, case SPU::STQDr8: { const MachineOperand MOp1 = MI->getOperand(1); const MachineOperand MOp2 = MI->getOperand(2); - if (MOp1.isImm() - && (MOp2.isFI() - || (MOp2.isReg() && MOp2.getReg() == SPU::R1))) { - if (MOp2.isFI()) - FrameIndex = MOp2.getIndex(); - else - FrameIndex = MOp1.getImm() / SPUFrameInfo::stackSlotSize(); + if (MOp1.isImm() && MOp2.isFI()) { + FrameIndex = MOp2.getIndex(); return MI->getOperand(0).getReg(); } break; } - case SPU::STQXv16i8: +#if 0 + case SPU::STQXv16i8: case SPU::STQXv8i16:
case SPU::STQXv4i32: case SPU::STQXv4f32: case SPU::STQXv2i64: case SPU::STQXv2f64: case SPU::STQXr128: case SPU::STQXr64: case SPU::STQXr32: case SPU::STQXr16: case SPU::STQXr8: { const MachineOperand MOp1 = MI->getOperand(1); const MachineOperand MOp2 = MI->getOperand(2); if (MOp1.isImm() && MOp2.isReg() && MOp2.getReg() == SPU::R1) { FrameIndex = MOp1.getImm() / SPUFrameInfo::stackSlotSize(); return MI->getOperand(0).getReg(); } break; +#endif } return 0; } @@ -292,6 +317,8 @@ SPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, opc = (isValidFrameIdx ? SPU::STQDr16 : SPU::STQXr16); } else if (RC == SPU::R8CRegisterClass) { opc = (isValidFrameIdx ? SPU::STQDr8 : SPU::STQXr8); + } else if (RC == SPU::VECREGRegisterClass) { + opc = (isValidFrameIdx) ? SPU::STQDv16i8 : SPU::STQXv16i8; } else { assert(0 && "Unknown regclass!"); abort(); } @@ -366,6 +393,8 @@ SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, opc = (isValidFrameIdx ? SPU::LQDr16 : SPU::LQXr16); } else if (RC == SPU::R8CRegisterClass) { opc = (isValidFrameIdx ? SPU::LQDr8 : SPU::LQXr8); + } else if (RC == SPU::VECREGRegisterClass) { + opc = (isValidFrameIdx) ? SPU::LQDv16i8 : SPU::LQXv16i8; } else { assert(0 && "Unknown regclass in loadRegFromStackSlot!"); abort(); } diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index 2338a0318ba..08d767684af 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -1,10 +1,10 @@ //==- SPUInstrInfo.td - Describe the Cell SPU Instructions -*- tablegen -*-==// -// +// The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // Cell SPU Instructions: //===----------------------------------------------------------------------===// @@ -49,14 +49,14 @@ def DWARF_LOC : Pseudo<(outs), (ins i32imm:$line, i32imm:$col, i32imm:$fi let canFoldAsLoad = 1 in { class LoadDFormVec<ValueType vectype> - : RI10Form<0b00101100, (outs VECREG:$rT), (ins memri10:$src), + : RI10Form<0b00101100, (outs VECREG:$rT), (ins dformaddr:$src), "lqd\t$rT, $src", LoadStore, [(set (vectype VECREG:$rT), (load dform_addr:$src))]> { } class LoadDForm<RegisterClass rclass> - : RI10Form<0b00101100, (outs rclass:$rT), (ins memri10:$src), + : RI10Form<0b00101100, (outs rclass:$rT), (ins dformaddr:$src), "lqd\t$rT, $src", LoadStore, [(set rclass:$rT, (load dform_addr:$src))]> @@ -161,14 +161,14 @@ let canFoldAsLoad = 1 in { // Stores: //===----------------------------------------------------------------------===// class StoreDFormVec<ValueType vectype> - : RI10Form<0b00100100, (outs), (ins VECREG:$rT, memri10:$src), + : RI10Form<0b00100100, (outs), (ins VECREG:$rT, dformaddr:$src), "stqd\t$rT, $src", LoadStore, [(store (vectype VECREG:$rT), dform_addr:$src)]> { } class StoreDForm<RegisterClass rclass> - : RI10Form<0b00100100, (outs), (ins rclass:$rT, memri10:$src), + : RI10Form<0b00100100, (outs), (ins rclass:$rT, dformaddr:$src), "stqd\t$rT, $src", LoadStore, [(store rclass:$rT, dform_addr:$src)]> @@ -269,7 +269,7 @@ def STQR : RI16Form<0b111000100, (outs), (ins VECREG:$rT, s16imm:$disp), // Generate Controls for Insertion: //===----------------------------------------------------------------------===// -def CBD: RI7Form<0b10101111100, (outs VECREG:$rT), (ins memri7:$src), +def CBD: RI7Form<0b10101111100, (outs VECREG:$rT), (ins shufaddr:$src), "cbd\t$rT, $src", ShuffleOp, [(set (v16i8 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; def CBX: RRForm<0b00101011100, (outs VECREG:$rT), (ins memrr:$src), "cbx\t$rT, $src", ShuffleOp, [(set (v16i8 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; -def CHD: RI7Form<0b10101111100, (outs VECREG:$rT), (ins memri7:$src), +def CHD: RI7Form<0b10101111100, (outs VECREG:$rT),
(ins shufaddr:$src), "chd\t$rT, $src", ShuffleOp, [(set (v8i16 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; @@ -285,7 +285,7 @@ def CHX: RRForm<0b10101011100, (outs VECREG:$rT), (ins memrr:$src), "chx\t$rT, $src", ShuffleOp, [(set (v8i16 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; -def CWD: RI7Form<0b01101111100, (outs VECREG:$rT), (ins memri7:$src), +def CWD: RI7Form<0b01101111100, (outs VECREG:$rT), (ins shufaddr:$src), "cwd\t$rT, $src", ShuffleOp, [(set (v4i32 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; @@ -293,7 +293,7 @@ def CWX: RRForm<0b01101011100, (outs VECREG:$rT), (ins memrr:$src), "cwx\t$rT, $src", ShuffleOp, [(set (v4i32 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; -def CWDf32: RI7Form<0b01101111100, (outs VECREG:$rT), (ins memri7:$src), +def CWDf32: RI7Form<0b01101111100, (outs VECREG:$rT), (ins shufaddr:$src), "cwd\t$rT, $src", ShuffleOp, [(set (v4f32 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; @@ -301,7 +301,7 @@ def CWXf32: RRForm<0b01101011100, (outs VECREG:$rT), (ins memrr:$src), "cwx\t$rT, $src", ShuffleOp, [(set (v4f32 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; -def CDD: RI7Form<0b11101111100, (outs VECREG:$rT), (ins memri7:$src), +def CDD: RI7Form<0b11101111100, (outs VECREG:$rT), (ins shufaddr:$src), "cdd\t$rT, $src", ShuffleOp, [(set (v2i64 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; @@ -309,7 +309,7 @@ def CDX: RRForm<0b11101011100, (outs VECREG:$rT), (ins memrr:$src), "cdx\t$rT, $src", ShuffleOp, [(set (v2i64 VECREG:$rT), (SPUshufmask xform_addr:$src))]>; -def CDDf64: RI7Form<0b11101111100, (outs VECREG:$rT), (ins memri7:$src), +def CDDf64: RI7Form<0b11101111100, (outs VECREG:$rT), (ins shufaddr:$src), "cdd\t$rT, $src", ShuffleOp, [(set (v2f64 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>; @@ -421,6 +421,7 @@ multiclass ImmLoadAddress def f32: ILARegInst; def f64: ILARegInst; + def hi: ILARegInst; def lo: ILARegInst; def lsa: ILAInst<(outs R32C:$rT), (ins symbolLSA:$val), @@ -481,37 +482,77 @@ multiclass FormSelectMaskBytesImm defm FSMBI : FormSelectMaskBytesImm; // fsmb: Form select mask for bytes. N.B. Input operand, $rA, is 16-bits -def FSMB: - RRForm_1<0b01101101100, (outs VECREG:$rT), (ins R16C:$rA), - "fsmb\t$rT, $rA", SelectOp, - [(set (v16i8 VECREG:$rT), (SPUselmask R16C:$rA))]>; +class FSMBInst pattern>: + RRForm_1<0b01101101100, OOL, IOL, "fsmb\t$rT, $rA", SelectOp, + pattern>; + +class FSMBRegInst: + FSMBInst<(outs VECREG:$rT), (ins rclass:$rA), + [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>; + +class FSMBVecInst: + FSMBInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), + (SPUselmask (vectype VECREG:$rA)))]>; + +multiclass FormSelectMaskBits { + def v16i8_r16: FSMBRegInst; + def v16i8: FSMBVecInst; +} + +defm FSMB: FormSelectMaskBits; // fsmh: Form select mask for halfwords. 
N.B., Input operand, $rA, is // only 8-bits wide (even though it's input as 16-bits here) -def FSMH: - RRForm_1<0b10101101100, (outs VECREG:$rT), (ins R16C:$rA), - "fsmh\t$rT, $rA", SelectOp, - [(set (v8i16 VECREG:$rT), (SPUselmask R16C:$rA))]>; + +class FSMHInst pattern>: + RRForm_1<0b10101101100, OOL, IOL, "fsmh\t$rT, $rA", SelectOp, + pattern>; + +class FSMHRegInst: + FSMHInst<(outs VECREG:$rT), (ins rclass:$rA), + [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>; + +class FSMHVecInst: + FSMHInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), + (SPUselmask (vectype VECREG:$rA)))]>; + +multiclass FormSelectMaskHalfword { + def v8i16_r16: FSMHRegInst; + def v8i16: FSMHVecInst; +} + +defm FSMH: FormSelectMaskHalfword; // fsm: Form select mask for words. Like the other fsm* instructions, // only the lower 4 bits of $rA are significant. -class FSMInst: - RRForm_1<0b00101101100, (outs VECREG:$rT), (ins rclass:$rA), - "fsm\t$rT, $rA", - SelectOp, - [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>; + +class FSMInst pattern>: + RRForm_1<0b00101101100, OOL, IOL, "fsm\t$rT, $rA", SelectOp, + pattern>; + +class FSMRegInst: + FSMInst<(outs VECREG:$rT), (ins rclass:$rA), + [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>; + +class FSMVecInst: + FSMInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), (SPUselmask (vectype VECREG:$rA)))]>; multiclass FormSelectMaskWord { - def r32 : FSMInst; - def r16 : FSMInst; + def v4i32: FSMVecInst; + + def r32 : FSMRegInst; + def r16 : FSMRegInst; } defm FSM : FormSelectMaskWord; // Special case when used for i64 math operations multiclass FormSelectMaskWord64 { - def r32 : FSMInst; - def r16 : FSMInst; + def r32 : FSMRegInst; + def r16 : FSMRegInst; } defm FSM64 : FormSelectMaskWord64; @@ -736,7 +777,7 @@ defm BG : BorrowGenerate; // BGX: Borrow generate, extended. def BGXvec: RRForm<0b11000010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, - VECREG:$rCarry), + VECREG:$rCarry), "bgx\t$rT, $rA, $rB", IntegerOp, []>, RegConstraint<"$rCarry = $rT">, @@ -898,20 +939,31 @@ def MPYHHAUr32: []>; // clz: Count leading zeroes -def CLZv4i32: - RRForm_1<0b10100101010, (outs VECREG:$rT), (ins VECREG:$rA), - "clz\t$rT, $rA", IntegerOp, - [/* intrinsic */]>; +class CLZInst pattern>: + RRForm_1<0b10100101010, OOL, IOL, "clz\t$rT, $rA", + IntegerOp, pattern>; -def CLZr32: - RRForm_1<0b10100101010, (outs R32C:$rT), (ins R32C:$rA), - "clz\t$rT, $rA", IntegerOp, - [(set R32C:$rT, (ctlz R32C:$rA))]>; +class CLZRegInst: + CLZInst<(outs rclass:$rT), (ins rclass:$rA), + [(set rclass:$rT, (ctlz rclass:$rA))]>; + +class CLZVecInst: + CLZInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), (ctlz (vectype VECREG:$rA)))]>; + +multiclass CountLeadingZeroes { + def v4i32 : CLZVecInst; + def r32 : CLZRegInst; +} + +defm CLZ : CountLeadingZeroes; // cntb: Count ones in bytes (aka "population count") +// // NOTE: This instruction is really a vector instruction, but the custom // lowering code uses it in unorthodox ways to support CTPOP for other // data types! 
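As a rough C model of that trick (illustrative names, not code from this patch): CTPOP on a wider type is just the sum of the per-byte population counts that cntb computes in parallel.

#include <stdint.h>

/* Model of cntb on a single byte: count its one-bits. */
static unsigned cntb_model(uint8_t byte) {
  unsigned n = 0;
  while (byte) { n += byte & 1u; byte >>= 1; }
  return n;
}

/* CTPOP for an i32, modeled as the sum of its four per-byte counts. */
unsigned ctpop32_model(uint32_t x) {
  unsigned total = 0;
  for (int i = 0; i < 4; ++i)
    total += cntb_model((uint8_t)(x >> (8 * i)));
  return total;
}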
+ def CNTBv16i8: RRForm_1<0b00101101010, (outs VECREG:$rT), (ins VECREG:$rA), "cntb\t$rT, $rA", IntegerOp, @@ -927,26 +979,88 @@ def CNTBv4i32 : "cntb\t$rT, $rA", IntegerOp, [(set (v4i32 VECREG:$rT), (SPUcntb (v4i32 VECREG:$rA)))]>; -// gbb: Gather all low order bits from each byte in $rA into a single 16-bit -// quantity stored into $rT -def GBB: - RRForm_1<0b01001101100, (outs R16C:$rT), (ins VECREG:$rA), - "gbb\t$rT, $rA", GatherOp, - []>; +// gbb: Gather the low order bits from each byte in $rA into a single 16-bit +// quantity stored into $rT's slot 0, upper 16 bits are zeroed, as are +// slots 1-3. +// +// Note: This instruction "pairs" with the fsmb instruction for all of the +// various types defined here. +// +// Note 2: The "VecInst" and "RegInst" forms refer to the result being either +// a vector or register. + +class GBBInst pattern>: + RRForm_1<0b01001101100, OOL, IOL, "gbb\t$rT, $rA", GatherOp, pattern>; + +class GBBRegInst: + GBBInst<(outs rclass:$rT), (ins VECREG:$rA), + [(set rclass:$rT, (SPUgatherbits (vectype VECREG:$rA)))]>; + +class GBBVecInst: + GBBInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), (SPUgatherbits (vectype VECREG:$rA)))]>; + +multiclass GatherBitsFromBytes { + def v16i8_r32: GBBRegInst; + def v16i8_r16: GBBRegInst; + def v16i8: GBBVecInst; +} + +defm GBB: GatherBitsFromBytes; // gbh: Gather all low order bits from each halfword in $rA into a single -// 8-bit quantity stored in $rT -def GBH: - RRForm_1<0b10001101100, (outs R16C:$rT), (ins VECREG:$rA), - "gbh\t$rT, $rA", GatherOp, - []>; +// 8-bit quantity stored in $rT's slot 0, with the upper bits of $rT set to 0 +// and slots 1-3 also set to 0. +// +// See notes for GBBInst, above. + +class GBHInst pattern>: + RRForm_1<0b10001101100, OOL, IOL, "gbh\t$rT, $rA", GatherOp, + pattern>; + +class GBHRegInst: + GBHInst<(outs rclass:$rT), (ins VECREG:$rA), + [(set rclass:$rT, (SPUgatherbits (vectype VECREG:$rA)))]>; + +class GBHVecInst: + GBHInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), + (SPUgatherbits (vectype VECREG:$rA)))]>; + +multiclass GatherBitsHalfword { + def v8i16_r32: GBHRegInst; + def v8i16_r16: GBHRegInst; + def v8i16: GBHVecInst; +} + +defm GBH: GatherBitsHalfword; // gb: Gather all low order bits from each word in $rA into a single -// 4-bit quantity stored in $rT -def GB: - RRForm_1<0b00001101100, (outs R16C:$rT), (ins VECREG:$rA), - "gb\t$rT, $rA", GatherOp, - []>; +// 4-bit quantity stored in $rT's slot 0, upper bits in $rT set to 0, +// as well as slots 1-3. +// +// See notes for gbb, above. 
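A short C sketch of the gather/expand pairing described above (names are illustrative and not part of this patch): gb packs bit 0 of each word into a 4-bit mask in the preferred slot, and fsm is its inverse, expanding each mask bit back into an all-ones or all-zeros word.

#include <stdint.h>

/* gb model: word 0's low bit lands in the most significant of the
   four result bits; the upper 28 bits of the result stay zero. */
uint32_t gb_model(const uint32_t v[4]) {
  uint32_t mask = 0;
  for (int i = 0; i < 4; ++i)
    mask = (mask << 1) | (v[i] & 1u);
  return mask;
}

/* fsm model: expand the low four mask bits to full words. */
void fsm_model(uint32_t mask, uint32_t v[4]) {
  for (int i = 0; i < 4; ++i)
    v[i] = (mask & (1u << (3 - i))) ? 0xFFFFFFFFu : 0u;
}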
+ +class GBInst pattern>: + RRForm_1<0b00001101100, OOL, IOL, "gb\t$rT, $rA", GatherOp, + pattern>; + +class GBRegInst: + GBInst<(outs rclass:$rT), (ins VECREG:$rA), + [(set rclass:$rT, (SPUgatherbits (vectype VECREG:$rA)))]>; + +class GBVecInst: + GBInst<(outs VECREG:$rT), (ins VECREG:$rA), + [(set (vectype VECREG:$rT), + (SPUgatherbits (vectype VECREG:$rA)))]>; + +multiclass GatherBitsWord { + def v4i32_r32: GBRegInst; + def v4i32_r16: GBRegInst; + def v4i32: GBVecInst; +} + +defm GB: GatherBitsWord; // avgb: average bytes def AVGB: @@ -976,30 +1090,26 @@ class XSBHVecInst: XSBHInst<(outs VECREG:$rDst), (ins VECREG:$rSrc), [(set (v8i16 VECREG:$rDst), (sext (vectype VECREG:$rSrc)))]>; -class XSBHRegInst: +class XSBHInRegInst: XSBHInst<(outs rclass:$rDst), (ins rclass:$rSrc), [(set rclass:$rDst, (sext_inreg rclass:$rSrc, i8))]>; multiclass ExtendByteHalfword { def v16i8: XSBHVecInst; - def r16: XSBHRegInst; + def r16: XSBHInRegInst; + def r8: XSBHInst<(outs R16C:$rDst), (ins R8C:$rSrc), + [(set R16C:$rDst, (sext R8C:$rSrc))]>; // 32-bit form for XSBH: used to sign extend 8-bit quantities to 16-bit // quantities to 32-bit quantities via a 32-bit register (see the sext 8->32 // pattern below). Intentionally doesn't match a pattern because we want the // sext 8->32 pattern to do the work for us, namely because we need the extra // XSHWr32. - def r32: XSBHRegInst; + def r32: XSBHInRegInst; } defm XSBH : ExtendByteHalfword; -// Sign-extend, but take an 8-bit register to a 16-bit register (not done as -// sext_inreg) -def XSBHr8: - XSBHInst<(outs R16C:$rDst), (ins R8C:$rSrc), - [(set R16C:$rDst, (sext R8C:$rSrc))]>; - // Sign extend halfwords to words: def XSHWvec: RRForm_1<0b01101101010, (outs VECREG:$rDest), (ins VECREG:$rSrc), @@ -1208,13 +1318,44 @@ class ORRegInst: ORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), [(set rclass:$rT, (or rclass:$rA, rclass:$rB))]>; +// ORCvtForm: OR conversion form +// +// This is used to "convert" the preferred slot to its vector equivalent, as +// well as convert a vector back to its preferred slot. +// +// These are effectively no-ops, but need to exist for proper type conversion +// and type coercion. 
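In C terms, these conversions reinterpret the same 128-bit register under a different type without moving any bits; a minimal sketch with assumed names, where the union stands in for an SPU register:

#include <stdint.h>
#include <string.h>

typedef union {
  uint32_t v4i32[4];   /* vector view */
  uint32_t prefslot;   /* scalar view: word element 0 */
} spu_reg_model;

/* Promote a scalar into the preferred slot: a plain copy. */
spu_reg_model prefslot2vec_model(uint32_t scalar) {
  spu_reg_model r;
  memset(&r, 0, sizeof r);
  r.prefslot = scalar;
  return r;
}

/* Extract the preferred slot: again, no data movement. */
uint32_t vec2prefslot_model(spu_reg_model r) {
  return r.v4i32[0];
}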
+ +class ORCvtForm + : SPUInstr { + bits<7> RA; + bits<7> RT; + + let Pattern = [/* no pattern */]; + + let Inst{0-10} = 0b10000010000; + let Inst{11-17} = RA; + let Inst{18-24} = RA; + let Inst{25-31} = RT; +} + class ORPromoteScalar: - ORInst<(outs VECREG:$rT), (ins rclass:$rA, rclass:$rB), - [/* no pattern */]>; + ORCvtForm<(outs VECREG:$rT), (ins rclass:$rA)>; class ORExtractElt: - ORInst<(outs rclass:$rT), (ins VECREG:$rA, VECREG:$rB), - [/* no pattern */]>; + ORCvtForm<(outs rclass:$rT), (ins VECREG:$rA)>; + +class ORCvtRegGPRC: + ORCvtForm<(outs GPRC:$rT), (ins rclass:$rA)>; + +class ORCvtVecGPRC: + ORCvtForm<(outs GPRC:$rT), (ins VECREG:$rA)>; + +class ORCvtGPRCReg: + ORCvtForm<(outs rclass:$rT), (ins GPRC:$rA)>; + +class ORCvtGPRCVec: + ORCvtForm<(outs VECREG:$rT), (ins GPRC:$rA)>; multiclass BitwiseOr { @@ -1229,7 +1370,7 @@ multiclass BitwiseOr (v4i32 VECREG:$rB)))))]>; def v2f64: ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), - [(set (v2f64 VECREG:$rT), + [(set (v2f64 VECREG:$rT), (v2f64 (bitconvert (or (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)))))]>; @@ -1260,48 +1401,115 @@ multiclass BitwiseOr def i64_v2i64: ORExtractElt; def f32_v4f32: ORExtractElt; def f64_v2f64: ORExtractElt; + + // Conversion from GPRC to register + def i128_r64: ORCvtRegGPRC; + def i128_f64: ORCvtRegGPRC; + def i128_r32: ORCvtRegGPRC; + def i128_f32: ORCvtRegGPRC; + def i128_r16: ORCvtRegGPRC; + def i128_r8: ORCvtRegGPRC; + + // Conversion from GPRC to vector + def i128_vec: ORCvtVecGPRC; + + // Conversion from register to GPRC + def r64_i128: ORCvtGPRCReg; + def f64_i128: ORCvtGPRCReg; + def r32_i128: ORCvtGPRCReg; + def f32_i128: ORCvtGPRCReg; + def r16_i128: ORCvtGPRCReg; + def r8_i128: ORCvtGPRCReg; + + // Conversion from vector to GPRC + def vec_i128: ORCvtGPRCVec; } defm OR : BitwiseOr; -// scalar->vector promotion patterns: -def : Pat<(v16i8 (SPUpromote_scalar R8C:$rA)), - (ORv16i8_i8 R8C:$rA, R8C:$rA)>; +// scalar->vector promotion patterns (preferred slot to vector): +def : Pat<(v16i8 (SPUprefslot2vec R8C:$rA)), + (ORv16i8_i8 R8C:$rA)>; -def : Pat<(v8i16 (SPUpromote_scalar R16C:$rA)), - (ORv8i16_i16 R16C:$rA, R16C:$rA)>; +def : Pat<(v8i16 (SPUprefslot2vec R16C:$rA)), + (ORv8i16_i16 R16C:$rA)>; -def : Pat<(v4i32 (SPUpromote_scalar R32C:$rA)), - (ORv4i32_i32 R32C:$rA, R32C:$rA)>; +def : Pat<(v4i32 (SPUprefslot2vec R32C:$rA)), + (ORv4i32_i32 R32C:$rA)>; -def : Pat<(v2i64 (SPUpromote_scalar R64C:$rA)), - (ORv2i64_i64 R64C:$rA, R64C:$rA)>; +def : Pat<(v2i64 (SPUprefslot2vec R64C:$rA)), + (ORv2i64_i64 R64C:$rA)>; -def : Pat<(v4f32 (SPUpromote_scalar R32FP:$rA)), - (ORv4f32_f32 R32FP:$rA, R32FP:$rA)>; +def : Pat<(v4f32 (SPUprefslot2vec R32FP:$rA)), + (ORv4f32_f32 R32FP:$rA)>; -def : Pat<(v2f64 (SPUpromote_scalar R64FP:$rA)), - (ORv2f64_f64 R64FP:$rA, R64FP:$rA)>; +def : Pat<(v2f64 (SPUprefslot2vec R64FP:$rA)), + (ORv2f64_f64 R64FP:$rA)>; -// ORi*_v*: Used to extract vector element 0 (the preferred slot) +// ORi*_v*: Used to extract vector element 0 (the preferred slot), otherwise +// known as converting the vector back to its preferred slot def : Pat<(SPUvec2prefslot (v16i8 VECREG:$rA)), - (ORi8_v16i8 VECREG:$rA, VECREG:$rA)>; + (ORi8_v16i8 VECREG:$rA)>; def : Pat<(SPUvec2prefslot (v8i16 VECREG:$rA)), - (ORi16_v8i16 VECREG:$rA, VECREG:$rA)>; + (ORi16_v8i16 VECREG:$rA)>; def : Pat<(SPUvec2prefslot (v4i32 VECREG:$rA)), - (ORi32_v4i32 VECREG:$rA, VECREG:$rA)>; + (ORi32_v4i32 VECREG:$rA)>; def : Pat<(SPUvec2prefslot (v2i64 VECREG:$rA)), - (ORi64_v2i64 VECREG:$rA, VECREG:$rA)>; + (ORi64_v2i64 
VECREG:$rA)>; def : Pat<(SPUvec2prefslot (v4f32 VECREG:$rA)), - (ORf32_v4f32 VECREG:$rA, VECREG:$rA)>; + (ORf32_v4f32 VECREG:$rA)>; def : Pat<(SPUvec2prefslot (v2f64 VECREG:$rA)), - (ORf64_v2f64 VECREG:$rA, VECREG:$rA)>; + (ORf64_v2f64 VECREG:$rA)>; + +// Load Register: This is an assembler alias for a bitwise OR of a register +// against itself. It's here because it brings some clarity to assembly +// language output. + +let hasCtrlDep = 1 in { + class LRInst + : SPUInstr { + bits<7> RA; + bits<7> RT; + + let Pattern = [/*no pattern*/]; + + let Inst{0-10} = 0b10000010000; /* It's an OR operation */ + let Inst{11-17} = RA; + let Inst{18-24} = RA; + let Inst{25-31} = RT; + } + + class LRVecInst: + LRInst<(outs VECREG:$rT), (ins VECREG:$rA)>; + + class LRRegInst: + LRInst<(outs rclass:$rT), (ins rclass:$rA)>; + + multiclass LoadRegister { + def v2i64: LRVecInst; + def v2f64: LRVecInst; + def v4i32: LRVecInst; + def v4f32: LRVecInst; + def v8i16: LRVecInst; + def v16i8: LRVecInst; + + def r128: LRRegInst; + def r64: LRRegInst; + def f64: LRRegInst; + def r32: LRRegInst; + def f32: LRRegInst; + def r16: LRRegInst; + def r8: LRRegInst; + } + + defm LR: LoadRegister; +} // ORC: Bitwise "or" with complement (c = a | ~b) @@ -1585,12 +1793,24 @@ class SELBVecInst: (and (vnot (vectype VECREG:$rC)), (vectype VECREG:$rA))))]>; +class SELBVecCondInst: + SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, R32C:$rC), + [(set (vectype VECREG:$rT), + (select R32C:$rC, + (vectype VECREG:$rB), + (vectype VECREG:$rA)))]>; + class SELBRegInst: SELBInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB, rclass:$rC), [(set rclass:$rT, (or (and rclass:$rA, rclass:$rC), (and rclass:$rB, (not rclass:$rC))))]>; +class SELBRegCondInst: + SELBInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB, rcond:$rC), + [(set rclass:$rT, + (select rcond:$rC, rclass:$rB, rclass:$rA))]>; + multiclass SelectBits { def v16i8: SELBVecInst; @@ -1603,6 +1823,16 @@ multiclass SelectBits def r32: SELBRegInst; def r16: SELBRegInst; def r8: SELBRegInst; + + def v16i8_cond: SELBVecCondInst; + def v8i16_cond: SELBVecCondInst; + def v4i32_cond: SELBVecCondInst; + def v2i64_cond: SELBVecCondInst; + + // SELBr64_cond is defined further down, look for i64 comparisons + def r32_cond: SELBRegCondInst; + def r16_cond: SELBRegCondInst; + def r8_cond: SELBRegCondInst; } defm SELB : SelectBits; @@ -1625,14 +1855,6 @@ def : SPUselbPatReg; def : SPUselbPatReg; def : SPUselbPatReg; -class SelectConditional: - Pat<(select rclass:$rCond, rclass:$rTrue, rclass:$rFalse), - (inst rclass:$rFalse, rclass:$rTrue, rclass:$rCond)>; - -def : SelectConditional; -def : SelectConditional; -def : SelectConditional; - // EQV: Equivalence (1 for each same bit, otherwise 0) // // Note: There are a lot of ways to match this bit operator and these patterns @@ -1753,6 +1975,10 @@ class SHUFBVecInst: (resultvec VECREG:$rB), (maskvec VECREG:$rC)))]>; +class SHUFBGPRCInst: + SHUFBInst<(outs VECREG:$rT), (ins GPRC:$rA, GPRC:$rB, VECREG:$rC), + [/* no pattern */]>; + multiclass ShuffleBytes { def v16i8 : SHUFBVecInst; @@ -1769,6 +1995,8 @@ multiclass ShuffleBytes def v2f64 : SHUFBVecInst; def v2f64_m32 : SHUFBVecInst; + + def gprc : SHUFBGPRCInst; } defm SHUFB : ShuffleBytes; @@ -2027,7 +2255,7 @@ defm ROTHI: RotateLeftHalfwordImm; def : Pat<(SPUvec_rotl VECREG:$rA, (i32 uimm7:$val)), (ROTHIv8i16 VECREG:$rA, imm:$val)>; - + //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // Rotate word: 
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ @@ -2207,7 +2435,7 @@ multiclass RotateQuadByBitCount } defm ROTQBI: RotateQuadByBitCount; - + class ROTQBIIInst pattern>: RI7Form<0b00011111100, OOL, IOL, "rotqbii\t$rT, $rA, $val", RotateShift, pattern>; @@ -2298,7 +2526,7 @@ def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i32 imm:$val)), def: Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i16 imm:$val)), (ROTHMIv8i16 VECREG:$rA, imm:$val)>; - + def: Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i8 imm:$val)), (ROTHMIv8i16 VECREG:$rA, imm:$val)>; @@ -2359,7 +2587,7 @@ def ROTMIv4i32: def : Pat<(SPUvec_srl VECREG:$rA, (i16 uimm7:$val)), (ROTMIv4i32 VECREG:$rA, uimm7:$val)>; - + def : Pat<(SPUvec_srl VECREG:$rA, (i8 uimm7:$val)), (ROTMIv4i32 VECREG:$rA, uimm7:$val)>; @@ -2682,7 +2910,7 @@ let isTerminator = 1, isBarrier = 1 in { "hgt\t$rA, $rB", BranchResolv, [/* no pattern to match */]>; - def HGTIr32: + def HGTIr32: RI10Form_2<0b11110010, (outs), (ins R32C:$rA, s10imm:$val), "hgti\t$rA, $val", BranchResolv, [/* no pattern to match */]>; @@ -2698,9 +2926,9 @@ let isTerminator = 1, isBarrier = 1 in { [/* no pattern to match */]>; } -//------------------------------------------------------------------------ -// Comparison operators: -//------------------------------------------------------------------------ +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ +// Comparison operators for i8, i16 and i32: +//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ class CEQBInst pattern> : RRForm<0b00001011110, OOL, IOL, "ceqb\t$rT, $rA, $rB", @@ -2990,8 +3218,14 @@ defm CLGTI : CmpLGtrWordImm; // define a pattern to generate the right code, as a binary operator // (in a manner of speaking.) // -// N.B.: This only matches the setcc set of conditionals. Special pattern -// matching is used for select conditionals. +// Notes: +// 1. This only matches the setcc set of conditionals. Special pattern +// matching is used for select conditionals. +// +// 2. The "DAG" versions of these classes are almost exclusively used for +// i64 comparisons. See the tblgen fundamentals documentation for what +// ".ResultInstrs[0]" means; see TargetSelectionDAG.td and the Pattern +// class for where ResultInstrs originates. 
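The negated-condition classes below synthesize comparisons the SPU lacks by complementing the mask of a comparison it has, e.g. setne as the complement of ceq (an xori with -1). Roughly, in C (illustrative names only):

#include <stdint.h>

/* ceq produces an all-ones/all-zeros mask. */
uint32_t ceq_model(uint32_t a, uint32_t b) {
  return (a == b) ? 0xFFFFFFFFu : 0u;
}

/* setne = complement of the ceq mask (xori rT, rA, -1). */
uint32_t setne_model(uint32_t a, uint32_t b) {
  return ceq_model(a, b) ^ 0xFFFFFFFFu;
}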
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ class SETCCNegCondReg; -def : SETCCNegCondReg; +def : SETCCNegCondReg; def : SETCCNegCondImm; -def : SETCCNegCondReg; +def : SETCCNegCondReg; def : SETCCNegCondImm; def : SETCCNegCondReg; @@ -3128,8 +3362,8 @@ class SELECTBinOpReg: Pat<(select (inttype (cond rclass:$rA, rclass:$rB)), - rclass:$rFalse, rclass:$rTrue), - (selinstr rclass:$rTrue, rclass:$rFalse, + rclass:$rTrue, rclass:$rFalse), + (selinstr rclass:$rFalse, rclass:$rTrue, (binop (cmpOp1 rclass:$rA, rclass:$rB), (cmpOp2 rclass:$rA, rclass:$rB)))>; @@ -3226,54 +3460,129 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { BIForm<0b00010101100, "bi\t$func", [(brind R32C:$func)]>; // Various branches: - def BRNZ: - RI16Form<0b010000100, (outs), (ins R32C:$rCond, brtarget:$dest), - "brnz\t$rCond,$dest", - BranchResolv, - [(brcond R32C:$rCond, bb:$dest)]>; - - def BRZ: - RI16Form<0b000000100, (outs), (ins R32C:$rT, brtarget:$dest), - "brz\t$rT,$dest", - BranchResolv, - [/* no pattern */]>; + class BRNZInst pattern>: + RI16Form<0b010000100, (outs), IOL, "brnz\t$rCond,$dest", + BranchResolv, pattern>; - def BRHNZ: - RI16Form<0b011000100, (outs), (ins R16C:$rCond, brtarget:$dest), - "brhnz\t$rCond,$dest", - BranchResolv, - [(brcond R16C:$rCond, bb:$dest)]>; + class BRNZRegInst: + BRNZInst<(ins rclass:$rCond, brtarget:$dest), + [(brcond rclass:$rCond, bb:$dest)]>; - def BRHZ: - RI16Form<0b001000100, (outs), (ins R16C:$rT, brtarget:$dest), - "brhz\t$rT,$dest", - BranchResolv, - [/* no pattern */]>; - -/* - def BINZ: - BICondForm<0b10010100100, "binz\t$rA, $func", - [(SPUbinz R32C:$rA, R32C:$func)]>; - - def BIZ: - BICondForm<0b00010100100, "biz\t$rA, $func", - [(SPUbiz R32C:$rA, R32C:$func)]>; -*/ + class BRNZVecInst: + BRNZInst<(ins VECREG:$rCond, brtarget:$dest), + [(brcond (vectype VECREG:$rCond), bb:$dest)]>; + + multiclass BranchNotZero { + def v4i32 : BRNZVecInst; + def r32 : BRNZRegInst; + } + + defm BRNZ : BranchNotZero; + + class BRZInst pattern>: + RI16Form<0b000000100, (outs), IOL, "brz\t$rT,$dest", + BranchResolv, pattern>; + + class BRZRegInst: + BRZInst<(ins rclass:$rT, brtarget:$dest), [/* no pattern */]>; + + class BRZVecInst: + BRZInst<(ins VECREG:$rT, brtarget:$dest), [/* no pattern */]>; + + multiclass BranchZero { + def v4i32: BRZVecInst; + def r32: BRZRegInst; + } + + defm BRZ: BranchZero; + + // Note: LLVM doesn't do branch conditional, indirect. 
Otherwise these would + // be useful: + /* + class BINZInst pattern>: + BICondForm<0b10010100100, (outs), IOL, "binz\t$rA, $dest", pattern>; + + class BINZRegInst: + BINZInst<(ins rclass:$rA, brtarget:$dest), + [(brcond rclass:$rA, R32C:$dest)]>; + + class BINZVecInst: + BINZInst<(ins VECREG:$rA, R32C:$dest), + [(brcond (vectype VECREG:$rA), R32C:$dest)]>; + + multiclass BranchNotZeroIndirect { + def v4i32: BINZVecInst; + def r32: BINZRegInst; + } + + defm BINZ: BranchNotZeroIndirect; + + class BIZInst pattern>: + BICondForm<0b00010100100, (outs), IOL, "biz\t$rA, $func", pattern>; + + class BIZRegInst: + BIZInst<(ins rclass:$rA, R32C:$func), [/* no pattern */]>; + + class BIZVecInst: + BIZInst<(ins VECREG:$rA, R32C:$func), [/* no pattern */]>; + + multiclass BranchZeroIndirect { + def v4i32: BIZVecInst; + def r32: BIZRegInst; + } + + defm BIZ: BranchZeroIndirect; + */ + + class BRHNZInst pattern>: + RI16Form<0b011000100, (outs), IOL, "brhnz\t$rCond,$dest", BranchResolv, + pattern>; + + class BRHNZRegInst: + BRHNZInst<(ins rclass:$rCond, brtarget:$dest), + [(brcond rclass:$rCond, bb:$dest)]>; + + class BRHNZVecInst: + BRHNZInst<(ins VECREG:$rCond, brtarget:$dest), [/* no pattern */]>; + + multiclass BranchNotZeroHalfword { + def v8i16: BRHNZVecInst; + def r16: BRHNZRegInst; + } + + defm BRHNZ: BranchNotZeroHalfword; + + class BRHZInst pattern>: + RI16Form<0b001000100, (outs), IOL, "brhz\t$rT,$dest", BranchResolv, + pattern>; + + class BRHZRegInst: + BRHZInst<(ins rclass:$rT, brtarget:$dest), [/* no pattern */]>; + + class BRHZVecInst: + BRHZInst<(ins VECREG:$rT, brtarget:$dest), [/* no pattern */]>; + + multiclass BranchZeroHalfword { + def v8i16: BRHZVecInst; + def r16: BRHZRegInst; + } + + defm BRHZ: BranchZeroHalfword; } //===----------------------------------------------------------------------===// // setcc and brcond patterns: //===----------------------------------------------------------------------===// -def : Pat<(brcond (i16 (seteq R16C:$rA, 0)), bb:$dest), - (BRHZ R16C:$rA, bb:$dest)>; -def : Pat<(brcond (i16 (setne R16C:$rA, 0)), bb:$dest), - (BRHNZ R16C:$rA, bb:$dest)>; +def : Pat<(brcond (i16 (seteq R16C:$rA, 0)), bb:$dest), + (BRHZr16 R16C:$rA, bb:$dest)>; +def : Pat<(brcond (i16 (setne R16C:$rA, 0)), bb:$dest), + (BRHNZr16 R16C:$rA, bb:$dest)>; -def : Pat<(brcond (i32 (seteq R32C:$rA, 0)), bb:$dest), - (BRZ R32C:$rA, bb:$dest)>; -def : Pat<(brcond (i32 (setne R32C:$rA, 0)), bb:$dest), - (BRNZ R32C:$rA, bb:$dest)>; +def : Pat<(brcond (i32 (seteq R32C:$rA, 0)), bb:$dest), + (BRZr32 R32C:$rA, bb:$dest)>; +def : Pat<(brcond (i32 (setne R32C:$rA, 0)), bb:$dest), + (BRNZr32 R32C:$rA, bb:$dest)>; multiclass BranchCondEQ { @@ -3290,8 +3599,8 @@ multiclass BranchCondEQ (brinst32 (CEQr32 R32C:$rA, R32C:$rB), bb:$dest)>; } -defm BRCONDeq : BranchCondEQ; -defm BRCONDne : BranchCondEQ; +defm BRCONDeq : BranchCondEQ; +defm BRCONDne : BranchCondEQ; multiclass BranchCondLGT { @@ -3308,8 +3617,8 @@ multiclass BranchCondLGT (brinst32 (CLGTr32 R32C:$rA, R32C:$rB), bb:$dest)>; } -defm BRCONDugt : BranchCondLGT; -defm BRCONDule : BranchCondLGT; +defm BRCONDugt : BranchCondLGT; +defm BRCONDule : BranchCondLGT; multiclass BranchCondLGTEQ @@ -3335,8 +3644,8 @@ multiclass BranchCondLGTEQ; } -defm BRCONDuge : BranchCondLGTEQ; -defm BRCONDult : BranchCondLGTEQ; +defm BRCONDuge : BranchCondLGTEQ; +defm BRCONDult : BranchCondLGTEQ; multiclass BranchCondGT { @@ -3353,8 +3662,8 @@ multiclass BranchCondGT (brinst32 (CGTr32 R32C:$rA, R32C:$rB), bb:$dest)>; } -defm BRCONDgt : BranchCondGT; -defm BRCONDle : 
BranchCondGT; +defm BRCONDgt : BranchCondGT; +defm BRCONDle : BranchCondGT; multiclass BranchCondGTEQ @@ -3380,8 +3689,8 @@ multiclass BranchCondGTEQ; } -defm BRCONDge : BranchCondGTEQ; -defm BRCONDlt : BranchCondGTEQ; +defm BRCONDge : BranchCondGTEQ; +defm BRCONDlt : BranchCondGTEQ; let isTerminator = 1, isBarrier = 1 in { let isReturn = 1 in { @@ -3397,10 +3706,12 @@ let isTerminator = 1, isBarrier = 1 in { class FAInst pattern>: RRForm<0b01011000100, OOL, IOL, "fa\t$rT, $rA, $rB", SPrecFP, pattern>; + class FAVecInst: FAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB), [(set (vectype VECREG:$rT), (fadd (vectype VECREG:$rA), (vectype VECREG:$rB)))]>; + multiclass SFPAdd { def v4f32: FAVecInst; @@ -3548,7 +3859,7 @@ def FSCRRf32 : // floating reciprocal absolute square root estimate (frsqest) // The following are probably just intrinsics -// status and control register write +// status and control register write // status and control register read //-------------------------------------- @@ -3603,7 +3914,7 @@ def FMSf32 : // = c - a * b // NOTE: subtraction order // fsub a b = a - b -// fs a b = b - a? +// fs a b = b - a? def FNMSf32 : RRRForm<0b1101, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC), "fnms\t$rT, $rA, $rB, $rC", SPrecFP, @@ -3612,9 +3923,9 @@ def FNMSf32 : def FNMSv4f32 : RRRForm<0b1101, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), "fnms\t$rT, $rA, $rB, $rC", SPrecFP, - [(set (v4f32 VECREG:$rT), - (fsub (v4f32 VECREG:$rC), - (fmul (v4f32 VECREG:$rA), + [(set (v4f32 VECREG:$rT), + (fsub (v4f32 VECREG:$rC), + (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB))))]>; //-------------------------------------- @@ -3625,7 +3936,7 @@ def CSiFv4f32: "csflt\t$rT, $rA, 0", SPrecFP, [(set (v4f32 VECREG:$rT), (sint_to_fp (v4i32 VECREG:$rA)))]>; -// Convert signed integer to floating point +// Convert signed integer to floating point def CSiFf32 : CVTIntFPForm<0b0101101110, (outs R32FP:$rT), (ins R32C:$rA), "csflt\t$rT, $rA, 0", SPrecFP, @@ -3642,7 +3953,7 @@ def CUiFf32 : "cuflt\t$rT, $rA, 0", SPrecFP, [(set R32FP:$rT, (uint_to_fp R32C:$rA))]>; -// Convert float to unsigned int +// Convert float to unsigned int // Assume that scale = 0 def CFUiv4f32 : @@ -3655,7 +3966,7 @@ def CFUif32 : "cfltu\t$rT, $rA, 0", SPrecFP, [(set R32C:$rT, (fp_to_uint R32FP:$rA))]>; -// Convert float to signed int +// Convert float to signed int // Assume that scale = 0 def CFSiv4f32 : @@ -3788,9 +4099,9 @@ def FNMSv2f64 : RRForm<0b01111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), "dfnms\t$rT, $rA, $rB", DPrecFP, - [(set (v2f64 VECREG:$rT), - (fsub (v2f64 VECREG:$rC), - (fmul (v2f64 VECREG:$rA), + [(set (v2f64 VECREG:$rT), + (fsub (v2f64 VECREG:$rC), + (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB))))]>, RegConstraint<"$rC = $rT">, NoEncode<"$rC">; @@ -3813,9 +4124,9 @@ def FNMAv2f64 : RRForm<0b11111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC), "dfnma\t$rT, $rA, $rB", DPrecFP, - [(set (v2f64 VECREG:$rT), - (fneg (fadd (v2f64 VECREG:$rC), - (fmul (v2f64 VECREG:$rA), + [(set (v2f64 VECREG:$rT), + (fneg (fadd (v2f64 VECREG:$rC), + (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)))))]>, RegConstraint<"$rC = $rT">, NoEncode<"$rC">; @@ -3825,7 +4136,7 @@ def FNMAv2f64 : //===----------------------------------------------------------------------==// def : Pat<(fneg (v4f32 VECREG:$rA)), - (XORfnegvec (v4f32 VECREG:$rA), + (XORfnegvec (v4f32 VECREG:$rA), (v4f32 (ILHUv4i32 0x8000)))>; def : Pat<(fneg R32FP:$rA), @@ -3944,7 +4255,7 @@ def : Pat<(f32 
fpimm:$imm), def : Pat<(v4i32 v4i32Imm:$imm), (IOHLv4i32 (v4i32 (ILHUv4i32 (HI16_vec v4i32Imm:$imm))), (LO16_vec v4i32Imm:$imm))>; - + // 8-bit constants def : Pat<(i8 imm:$imm), (ILHr8 imm:$imm)>; @@ -4001,6 +4312,69 @@ def : Pat<(i32 (anyext R16C:$rSrc)), (ORIi16i32 R16C:$rSrc, 0)>; //===----------------------------------------------------------------------===// +// Truncates: +// These truncates are for the SPU's supported types (i8, i16, i32). i64 and +// above are custom lowered. +//===----------------------------------------------------------------------===// + +def : Pat<(i8 (trunc GPRC:$src)), + (ORi8_v16i8 + (SHUFBgprc GPRC:$src, GPRC:$src, + (IOHLv4i32 (ILHUv4i32 0x0f0f), 0x0f0f)))>; + +def : Pat<(i8 (trunc R64C:$src)), + (ORi8_v16i8 + (SHUFBv2i64_m32 + (ORv2i64_i64 R64C:$src), + (ORv2i64_i64 R64C:$src), + (IOHLv4i32 (ILHUv4i32 0x0707), 0x0707)))>; + +def : Pat<(i8 (trunc R32C:$src)), + (ORi8_v16i8 + (SHUFBv4i32_m32 + (ORv4i32_i32 R32C:$src), + (ORv4i32_i32 R32C:$src), + (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)))>; + +def : Pat<(i8 (trunc R16C:$src)), + (ORi8_v16i8 + (SHUFBv4i32_m32 + (ORv8i16_i16 R16C:$src), + (ORv8i16_i16 R16C:$src), + (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)))>; + +def : Pat<(i16 (trunc GPRC:$src)), + (ORi16_v8i16 + (SHUFBgprc GPRC:$src, GPRC:$src, + (IOHLv4i32 (ILHUv4i32 0x0e0f), 0x0e0f)))>; + +def : Pat<(i16 (trunc R64C:$src)), + (ORi16_v8i16 + (SHUFBv2i64_m32 + (ORv2i64_i64 R64C:$src), + (ORv2i64_i64 R64C:$src), + (IOHLv4i32 (ILHUv4i32 0x0607), 0x0607)))>; + +def : Pat<(i16 (trunc R32C:$src)), + (ORi16_v8i16 + (SHUFBv4i32_m32 + (ORv4i32_i32 R32C:$src), + (ORv4i32_i32 R32C:$src), + (IOHLv4i32 (ILHUv4i32 0x0203), 0x0203)))>; + +def : Pat<(i32 (trunc GPRC:$src)), + (ORi32_v4i32 + (SHUFBgprc GPRC:$src, GPRC:$src, + (IOHLv4i32 (ILHUv4i32 0x0c0d), 0x0e0f)))>; + +def : Pat<(i32 (trunc R64C:$src)), + (ORi32_v4i32 + (SHUFBv2i64_m32 + (ORv2i64_i64 R64C:$src), + (ORv2i64_i64 R64C:$src), + (IOHLv4i32 (ILHUv4i32 0x0405), 0x0607)))>; + +//===----------------------------------------------------------------------===// // Address generation: SPU, like PPC, has to split addresses into high and // low parts in order to load them into a register. //===----------------------------------------------------------------------===// @@ -4047,3 +4421,5 @@ def : Pat<(add (SPUhi tconstpool:$in, 0), (SPUlo tconstpool:$in, 0)), // Instrinsics: include "CellSDKIntrinsics.td" +// 64-bit "instructions"/support +include "SPU64InstrInfo.td" diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td index 1ed1e3ba51e..b22c6b5d9fe 100644 --- a/lib/Target/CellSPU/SPUNodes.td +++ b/lib/Target/CellSPU/SPUNodes.td @@ -66,6 +66,13 @@ def SPUselb_type: SDTypeProfile<1, 3, [ def SPUvecshift_type: SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisInt<2>]>; +// SPU gather bits: +// This instruction looks at each vector (word|halfword|byte) slot's low bit +// and forms a mask in the low order bits of the first word's preferred slot. 
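Each of the trunc patterns added above builds a shufb control word whose bytes select the low-order bytes of the wider source into the narrow type's preferred slot; a C model of the byte-select behavior (illustrative only, omitting the special control-byte forms that produce constants):

#include <stdint.h>

/* shufb model: control bytes 0x00-0x1f pick one byte out of the
   32-byte concatenation of the two source registers, using the SPU's
   big-endian byte numbering. */
void shufb_model(const uint8_t a[16], const uint8_t b[16],
                 const uint8_t ctl[16], uint8_t out[16]) {
  for (int i = 0; i < 16; ++i) {
    uint8_t c = ctl[i] & 0x1f;
    out[i] = (c < 16) ? a[c] : b[c - 16];
  }
}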
+def SPUgatherbits_type: SDTypeProfile<1, 1, [ + /* no type constraints defined */ +]>; + //===----------------------------------------------------------------------===// // Synthetic/pseudo-instructions //===----------------------------------------------------------------------===// @@ -137,14 +144,17 @@ def SPUselmask: SDNode<"SPUISD::SELECT_MASK", SPUselmask_type, []>; // SPU select bits instruction def SPUselb: SDNode<"SPUISD::SELB", SPUselb_type, []>; +// SPU gather bits instruction: +def SPUgatherbits: SDNode<"SPUISD::GATHER_BITS", SPUgatherbits_type, []>; + // SPU floating point interpolate def SPUinterpolate : SDNode<"SPUISD::FPInterp", SDTFPBinOp, []>; // SPU floating point reciprocal estimate (used for fdiv) def SPUreciprocalEst: SDNode<"SPUISD::FPRecipEst", SDTFPUnaryOp, []>; -def SDTpromote_scalar: SDTypeProfile<1, 1, []>; -def SPUpromote_scalar: SDNode<"SPUISD::PROMOTE_SCALAR", SDTpromote_scalar, []>; +def SDTprefslot2vec: SDTypeProfile<1, 1, []>; +def SPUprefslot2vec: SDNode<"SPUISD::PREFSLOT2VEC", SDTprefslot2vec, []>; def SPU_vec_demote : SDTypeProfile<1, 1, []>; def SPUvec2prefslot: SDNode<"SPUISD::VEC2PREFSLOT", SPU_vec_demote, []>; diff --git a/lib/Target/CellSPU/SPUOperands.td b/lib/Target/CellSPU/SPUOperands.td index d788f837fc0..802628f8996 100644 --- a/lib/Target/CellSPU/SPUOperands.td +++ b/lib/Target/CellSPU/SPUOperands.td @@ -609,15 +609,15 @@ def symbolLSA: Operand { let PrintMethod = "printSymbolLSA"; } -// memory s7imm(reg) operaand -def memri7 : Operand { - let PrintMethod = "printMemRegImmS7"; +// Shuffle address memory operand [s7imm(reg) d-format] +def shufaddr : Operand { + let PrintMethod = "printShufAddr"; let MIOperandInfo = (ops s7imm:$imm, ptr_rc:$reg); } // memory s10imm(reg) operand -def memri10 : Operand { - let PrintMethod = "printMemRegImmS10"; +def dformaddr : Operand { + let PrintMethod = "printDFormAddr"; let MIOperandInfo = (ops s10imm:$imm, ptr_rc:$reg); } diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index beea0dfb02c..cf4089fa29e 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -403,11 +403,6 @@ SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const void SPURegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { -#if 0 - // Save and clear the LR state. - SPUFunctionInfo *FI = MF.getInfo(); - FI->setUsesLR(MF.getRegInfo().isPhysRegUsed(LR)); -#endif // Mark LR and SP unused, since the prolog spills them to stack and // we don't want anyone else to spill them for us. // diff --git a/lib/Target/CellSPU/SPUTargetAsmInfo.cpp b/lib/Target/CellSPU/SPUTargetAsmInfo.cpp index 2bc0ffdb7ef..72752555e49 100644 --- a/lib/Target/CellSPU/SPUTargetAsmInfo.cpp +++ b/lib/Target/CellSPU/SPUTargetAsmInfo.cpp @@ -26,6 +26,13 @@ SPULinuxTargetAsmInfo::SPULinuxTargetAsmInfo(const SPUTargetMachine &TM) : PrivateGlobalPrefix = ".L"; // This corresponds to what the gcc SPU compiler emits, for consistency. 
CStringSection = ".rodata.str"; + + // BSS section needs to be emitted as ".section" + BSSSection = "\t.section\t.bss"; + BSSSection_ = getUnnamedSection("\t.section\t.bss", + SectionFlags::Writeable | SectionFlags::BSS, + true); + } /// PreferredEHDataFormat - This hook allows the target to select data diff --git a/test/CodeGen/CellSPU/call_indirect.ll b/test/CodeGen/CellSPU/call_indirect.ll index 4b0a957feb2..9be714ebc9b 100644 --- a/test/CodeGen/CellSPU/call_indirect.ll +++ b/test/CodeGen/CellSPU/call_indirect.ll @@ -2,7 +2,7 @@ ; RUN: llvm-as -o - %s | llc -march=cellspu -mattr=large_mem > %t2.s ; RUN: grep bisl %t1.s | count 7 ; RUN: grep ila %t1.s | count 1 -; RUN: grep rotqbyi %t1.s | count 4 +; RUN: grep rotqby %t1.s | count 6 ; RUN: grep lqa %t1.s | count 1 ; RUN: grep lqd %t1.s | count 12 ; RUN: grep dispatch_tab %t1.s | count 5 diff --git a/test/CodeGen/CellSPU/icmp64.ll b/test/CodeGen/CellSPU/icmp64.ll new file mode 100644 index 00000000000..d2b4fc096ee --- /dev/null +++ b/test/CodeGen/CellSPU/icmp64.ll @@ -0,0 +1,144 @@ +; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s +; RUN: grep ceq %t1.s | count 4 +; RUN: grep cgti %t1.s | count 4 +; RUN: grep gb %t1.s | count 4 +; RUN: grep fsm %t1.s | count 2 +; RUN: grep xori %t1.s | count 1 +; RUN: grep selb %t1.s | count 2 + +target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128" +target triple = "spu" + +; $3 = %arg1, $4 = %arg2, $5 = %val1, $6 = %val2 +; $3 = %arg1, $4 = %val1, $5 = %val2 +; +; i64 integer comparisons: +define i64 @icmp_eq_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +entry: + %A = icmp eq i64 %arg1, %arg2 + %B = select i1 %A, i64 %val1, i64 %val2 + ret i64 %B +} + +define i1 @icmp_eq_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +entry: + %A = icmp eq i64 %arg1, %arg2 + ret i1 %A +} + +define i64 @icmp_ne_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +entry: + %A = icmp ne i64 %arg1, %arg2 + %B = select i1 %A, i64 %val1, i64 %val2 + ret i64 %B +} + +define i1 @icmp_ne_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +entry: + %A = icmp ne i64 %arg1, %arg2 + ret i1 %A +} + +;; define i64 @icmp_ugt_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp ugt i64 %arg1, %arg2 +;; %B = select i1 %A, i64 %val1, i64 %val2 +;; ret i64 %B +;; } +;; +;; define i1 @icmp_ugt_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp ugt i64 %arg1, %arg2 +;; ret i1 %A +;; } +;; +;; define i64 @icmp_uge_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp uge i64 %arg1, %arg2 +;; %B = select i1 %A, i64 %val1, i64 %val2 +;; ret i64 %B +;; } +;; +;; define i1 @icmp_uge_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp uge i64 %arg1, %arg2 +;; ret i1 %A +;; } +;; +;; define i64 @icmp_ult_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp ult i64 %arg1, %arg2 +;; %B = select i1 %A, i64 %val1, i64 %val2 +;; ret i64 %B +;; } +;; +;; define i1 @icmp_ult_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp ult i64 %arg1, %arg2 +;; ret i1 %A +;; } +;; +;; define i64 @icmp_ule_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp ule i64 %arg1, %arg2 +;; %B = select i1 %A, i64 %val1, i64 %val2 +;; ret i64 %B +;; } +;; +;; 
define i1 @icmp_ule_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp ule i64 %arg1, %arg2 +;; ret i1 %A +;; } +;; +;; define i64 @icmp_sgt_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp sgt i64 %arg1, %arg2 +;; %B = select i1 %A, i64 %val1, i64 %val2 +;; ret i64 %B +;; } +;; +;; define i1 @icmp_sgt_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp sgt i64 %arg1, %arg2 +;; ret i1 %A +;; } +;; +;; define i64 @icmp_sge_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp sge i64 %arg1, %arg2 +;; %B = select i1 %A, i64 %val1, i64 %val2 +;; ret i64 %B +;; } +;; +;; define i1 @icmp_sge_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp sge i64 %arg1, %arg2 +;; ret i1 %A +;; } +;; +;; define i64 @icmp_slt_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp slt i64 %arg1, %arg2 +;; %B = select i1 %A, i64 %val1, i64 %val2 +;; ret i64 %B +;; } +;; +;; define i1 @icmp_slt_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp slt i64 %arg1, %arg2 +;; ret i1 %A +;; } +;; +;; define i64 @icmp_sle_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp sle i64 %arg1, %arg2 +;; %B = select i1 %A, i64 %val1, i64 %val2 +;; ret i64 %B +;; } +;; +;; define i1 @icmp_sle_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind { +;; entry: +;; %A = icmp sle i64 %arg1, %arg2 +;; ret i1 %A +;; } diff --git a/test/CodeGen/CellSPU/stores.ll b/test/CodeGen/CellSPU/stores.ll index 28d2e5b0a89..f2f35ef4dbc 100644 --- a/test/CodeGen/CellSPU/stores.ll +++ b/test/CodeGen/CellSPU/stores.ll @@ -3,8 +3,17 @@ ; RUN: grep {stqd.*16(\$3)} %t1.s | count 4 ; RUN: grep 16256 %t1.s | count 2 ; RUN: grep 16384 %t1.s | count 1 +; RUN: grep 771 %t1.s | count 4 +; RUN: grep 515 %t1.s | count 2 +; RUN: grep 1799 %t1.s | count 2 +; RUN: grep 1543 %t1.s | count 5 +; RUN: grep 1029 %t1.s | count 3 ; RUN: grep {shli.*, 4} %t1.s | count 4 ; RUN: grep stqx %t1.s | count 4 +; RUN: grep ilhu %t1.s | count 11 +; RUN: grep iohl %t1.s | count 8 +; RUN: grep shufb %t1.s | count 15 +; RUN: grep frds %t1.s | count 1 ; ModuleID = 'stores.bc' target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128" @@ -89,3 +98,54 @@ entry: store <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x float>* %arrayidx ret void } + +; Test truncating stores: + +define zeroext i8 @tstore_i16_i8(i16 signext %val, i8* %dest) nounwind { +entry: + %conv = trunc i16 %val to i8 + store i8 %conv, i8* %dest + ret i8 %conv +} + +define zeroext i8 @tstore_i32_i8(i32 %val, i8* %dest) nounwind { +entry: + %conv = trunc i32 %val to i8 + store i8 %conv, i8* %dest + ret i8 %conv +} + +define signext i16 @tstore_i32_i16(i32 %val, i16* %dest) nounwind { +entry: + %conv = trunc i32 %val to i16 + store i16 %conv, i16* %dest + ret i16 %conv +} + +define zeroext i8 @tstore_i64_i8(i64 %val, i8* %dest) nounwind { +entry: + %conv = trunc i64 %val to i8 + store i8 %conv, i8* %dest + ret i8 %conv +} + +define signext i16 @tstore_i64_i16(i64 %val, i16* %dest) nounwind { +entry: + %conv = trunc i64 %val to i16 + store i16 %conv, i16* %dest + ret i16 %conv +} + +define i32 @tstore_i64_i32(i64 %val, i32* %dest) nounwind { +entry: + %conv = trunc i64 %val to 
i32 + store i32 %conv, i32* %dest + ret i32 %conv +} + +define float @tstore_f64_f32(double %val, float* %dest) nounwind { +entry: + %conv = fptrunc double %val to float + store float %conv, float* %dest + ret float %conv +} diff --git a/test/CodeGen/CellSPU/struct_1.ll b/test/CodeGen/CellSPU/struct_1.ll index 3df7267ff27..82d319dd105 100644 --- a/test/CodeGen/CellSPU/struct_1.ll +++ b/test/CodeGen/CellSPU/struct_1.ll @@ -35,7 +35,7 @@ target triple = "spu" ; int i2; // offset 12 [ignored] ; unsigned char c4; // offset 16 [ignored] ; unsigned char c5; // offset 17 [ignored] -; unsigned char c6; // offset 18 [ignored] +; unsigned char c6; // offset 18 (rotate left by 14 bytes to byte 3) ; unsigned char c7; // offset 19 (no rotate, in preferred slot) ; int i3; // offset 20 [ignored] ; int i4; // offset 24 [ignored] diff --git a/test/CodeGen/CellSPU/trunc.ll b/test/CodeGen/CellSPU/trunc.ll index 845feed8b33..1c6e1f6cb14 100644 --- a/test/CodeGen/CellSPU/trunc.ll +++ b/test/CodeGen/CellSPU/trunc.ll @@ -1,16 +1,12 @@ ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s -; RUN: grep shufb %t1.s | count 9 +; RUN: grep shufb %t1.s | count 10 ; RUN: grep {ilhu.*1799} %t1.s | count 1 -; RUN: grep {ilhu.*771} %t1.s | count 3 +; RUN: grep {ilhu.*771} %t1.s | count 1 ; RUN: grep {ilhu.*1543} %t1.s | count 1 ; RUN: grep {ilhu.*1029} %t1.s | count 1 -; RUN: grep {ilhu.*515} %t1.s | count 1 -; RUN: grep {iohl.*1799} %t1.s | count 1 -; RUN: grep {iohl.*771} %t1.s | count 3 -; RUN: grep {iohl.*1543} %t1.s | count 2 -; RUN: grep {iohl.*515} %t1.s | count 1 -; RUN: grep xsbh %t1.s | count 6 -; RUN: grep sfh %t1.s | count 5 +; RUN: grep {ilhu.*515} %t1.s | count 2 +; RUN: grep xsbh %t1.s | count 2 +; RUN: grep sfh %t1.s | count 1 ; ModuleID = 'trunc.bc' target datalayout = "E-p:32:32:128-i1:8:128-i8:8:128-i16:16:128-i32:32:128-i64:32:128-f32:32:128-f64:64:128-v64:64:64-v128:128:128-a0:0:128-s0:128:128" @@ -41,23 +37,22 @@ target triple = "spu" ; ret i64 %0 ;} -define i8 @trunc_i64_i8(i64 %u, i8 %v) nounwind readnone { +define <16 x i8> @trunc_i64_i8(i64 %u, <16 x i8> %v) nounwind readnone { entry: %0 = trunc i64 %u to i8 - %1 = sub i8 %0, %v - ret i8 %1 + %tmp1 = insertelement <16 x i8> %v, i8 %0, i32 10 + ret <16 x i8> %tmp1 } -define i16 @trunc_i64_i16(i64 %u, i16 %v) nounwind readnone { +define <8 x i16> @trunc_i64_i16(i64 %u, <8 x i16> %v) nounwind readnone { entry: %0 = trunc i64 %u to i16 - %1 = sub i16 %0, %v - ret i16 %1 + %tmp1 = insertelement <8 x i16> %v, i16 %0, i32 6 + ret <8 x i16> %tmp1 } define i32 @trunc_i64_i32(i64 %u, i32 %v) nounwind readnone { entry: %0 = trunc i64 %u to i32 - %1 = sub i32 %0, %v - ret i32 %1 + ret i32 %0 } define i8 @trunc_i32_i8(i32 %u, i8 %v) nounwind readnone { @@ -66,16 +61,16 @@ entry: %1 = sub i8 %0, %v ret i8 %1 } -define i16 @trunc_i32_i16(i32 %u, i16 %v) nounwind readnone { +define <8 x i16> @trunc_i32_i16(i32 %u, <8 x i16> %v) nounwind readnone { entry: %0 = trunc i32 %u to i16 - %1 = sub i16 %0, %v - ret i16 %1 + %tmp1 = insertelement <8 x i16> %v, i16 %0, i32 3 + ret <8 x i16> %tmp1 } -define i8 @trunc_i16_i8(i16 %u, i8 %v) nounwind readnone { +define <16 x i8> @trunc_i16_i8(i16 %u, <16 x i8> %v) nounwind readnone { entry: %0 = trunc i16 %u to i8 - %1 = sub i8 %0, %v - ret i8 %1 + %tmp1 = insertelement <16 x i8> %v, i8 %0, i32 5 + ret <16 x i8> %tmp1 } diff --git a/test/CodeGen/CellSPU/useful-harnesses/i32operations.c b/test/CodeGen/CellSPU/useful-harnesses/i32operations.c new file mode 100644 index 00000000000..12fc30bf65d --- /dev/null +++ 
b/test/CodeGen/CellSPU/useful-harnesses/i32operations.c @@ -0,0 +1,69 @@ +#include <stdio.h> + +typedef unsigned int uint32_t; +typedef int int32_t; + +const char *boolstring(int val) { + return val ? "true" : "false"; +} + +int i32_eq(int32_t a, int32_t b) { + return (a == b); +} + +int i32_neq(int32_t a, int32_t b) { + return (a != b); +} + +int32_t i32_eq_select(int32_t a, int32_t b, int32_t c, int32_t d) { + return ((a == b) ? c : d); +} + +int32_t i32_neq_select(int32_t a, int32_t b, int32_t c, int32_t d) { + return ((a != b) ? c : d); +} + +struct pred_s { + const char *name; + int (*predfunc)(int32_t, int32_t); + int (*selfunc)(int32_t, int32_t, int32_t, int32_t); +}; + +struct pred_s preds[] = { + { "eq", i32_eq, i32_eq_select }, + { "neq", i32_neq, i32_neq_select } +}; + +int main(void) { + int i; + int32_t a = 1234567890; + int32_t b = 345678901; + int32_t c = 1234500000; + int32_t d = 10001; + int32_t e = 10000; + + printf("a = %12d (0x%08x)\n", a, a); + printf("b = %12d (0x%08x)\n", b, b); + printf("c = %12d (0x%08x)\n", c, c); + printf("d = %12d (0x%08x)\n", d, d); + printf("e = %12d (0x%08x)\n", e, e); + printf("----------------------------------------\n"); + + for (i = 0; i < sizeof(preds)/sizeof(preds[0]); ++i) { + printf("a %s a = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, a))); + printf("a %s a = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, a))); + printf("a %s b = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, b))); + printf("a %s c = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, c))); + printf("d %s e = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(d, e))); + printf("e %s e = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(e, e))); + + printf("a %s a ? c : d = %d\n", preds[i].name, (*preds[i].selfunc)(a, a, c, d)); + printf("a %s a ? c : d == c (%s)\n", preds[i].name, boolstring((*preds[i].selfunc)(a, a, c, d) == c)); + printf("a %s b ? c : d = %d\n", preds[i].name, (*preds[i].selfunc)(a, b, c, d)); + printf("a %s b ? c : d == d (%s)\n", preds[i].name, boolstring((*preds[i].selfunc)(a, b, c, d) == d)); + + printf("----------------------------------------\n"); + } + + return 0; +} diff --git a/test/CodeGen/CellSPU/useful-harnesses/i64operations.c b/test/CodeGen/CellSPU/useful-harnesses/i64operations.c new file mode 100644 index 00000000000..7b86070095f --- /dev/null +++ b/test/CodeGen/CellSPU/useful-harnesses/i64operations.c @@ -0,0 +1,68 @@ +#include <stdio.h> + +typedef unsigned long long int uint64_t; +typedef long long int int64_t; + +const char *boolstring(int val) { + return val ? "true" : "false"; +} + +int i64_eq(int64_t a, int64_t b) { + return (a == b); +} + +int i64_neq(int64_t a, int64_t b) { + return (a != b); +} + +int64_t i64_eq_select(int64_t a, int64_t b, int64_t c, int64_t d) { + return ((a == b) ? c : d); +} + +int64_t i64_neq_select(int64_t a, int64_t b, int64_t c, int64_t d) { + return ((a != b) ? 
c : d); +} + +struct pred_s { + const char *name; + int (*predfunc)(int64_t, int64_t); + int64_t (*selfunc)(int64_t, int64_t, int64_t, int64_t); +}; + +struct pred_s preds[] = { + { "eq", i64_eq, i64_eq_select }, + { "neq", i64_neq, i64_neq_select } +}; + +int main(void) { + int i; + int64_t a = 1234567890000LL; + int64_t b = 2345678901234LL; + int64_t c = 1234567890001LL; + int64_t d = 10001LL; + int64_t e = 10000LL; + + printf("a = %16lld (0x%016llx)\n", a, a); + printf("b = %16lld (0x%016llx)\n", b, b); + printf("c = %16lld (0x%016llx)\n", c, c); + printf("d = %16lld (0x%016llx)\n", d, d); + printf("e = %16lld (0x%016llx)\n", e, e); + printf("----------------------------------------\n"); + + for (i = 0; i < sizeof(preds)/sizeof(preds[0]); ++i) { + printf("a %s a = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, a))); + printf("a %s b = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, b))); + printf("a %s c = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, c))); + printf("d %s e = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(d, e))); + printf("e %s e = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(e, e))); + + printf("a %s a ? c : d = %lld\n", preds[i].name, (*preds[i].selfunc)(a, a, c, d)); + printf("a %s a ? c : d == c (%s)\n", preds[i].name, boolstring((*preds[i].selfunc)(a, a, c, d) == c)); + printf("a %s b ? c : d = %lld\n", preds[i].name, (*preds[i].selfunc)(a, b, c, d)); + printf("a %s b ? c : d == d (%s)\n", preds[i].name, boolstring((*preds[i].selfunc)(a, b, c, d) == d)); + + printf("----------------------------------------\n"); + } + + return 0; +} -- 2.11.0
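The commented-out cases in icmp64.ll cover the i64 predicates that are not implemented yet; when they land, the harness above can grow the same way. A hypothetical extension for unsigned greater-than (names not part of this patch):

int i64_ugt(int64_t a, int64_t b) {
  return ((uint64_t) a > (uint64_t) b);
}

int64_t i64_ugt_select(int64_t a, int64_t b, int64_t c, int64_t d) {
  return (((uint64_t) a > (uint64_t) b) ? c : d);
}

/* ... plus the matching preds[] entry: { "ugt", i64_ugt, i64_ugt_select }, */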