From f63be7d3959939b2ffaf0bba5519b71216ec9ee6 Mon Sep 17 00:00:00 2001 From: Nate Begeman Date: Wed, 6 Jul 2005 18:59:04 +0000 Subject: [PATCH] First round of support for doing scalar FP using the SSE2 ISA extension and XMM registers. There are many known deficiencies and fixmes, which will be addressed ASAP. The major benefit of this work is that it will allow the LLVM register allocator to allocate FP registers across basic blocks. The x86 backend will still default to x87 style FP. To enable this work, you must pass -enable-sse-scalar-fp and either -sse2 or -sse3 to llc. An example before and after would be for: double foo(double *P) { double Sum = 0; int i; for (i = 0; i < 1000; ++i) Sum += P[i]; return Sum; } The inner loop looks like the following: x87: .LBB_foo_1: # no_exit fldl (%esp) faddl (%eax,%ecx,8) fstpl (%esp) incl %ecx cmpl $1000, %ecx #FP_REG_KILL jne .LBB_foo_1 # no_exit SSE2: addsd (%eax,%ecx,8), %xmm0 incl %ecx cmpl $1000, %ecx #FP_REG_KILL jne .LBB_foo_1 # no_exit git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@22340 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/TargetMachine.cpp | 6 + lib/Target/X86/X86.h | 1 + lib/Target/X86/X86.td | 2 +- lib/Target/X86/X86CodeEmitter.cpp | 14 +- lib/Target/X86/X86ISelPattern.cpp | 395 +++++++++++++++++++++++++++++------- lib/Target/X86/X86InstrInfo.cpp | 2 +- lib/Target/X86/X86InstrInfo.h | 4 + lib/Target/X86/X86InstrInfo.td | 20 +- lib/Target/X86/X86RegisterInfo.cpp | 22 +- lib/Target/X86/X86RegisterInfo.td | 4 +- lib/Target/X86/X86TargetMachine.cpp | 9 +- 11 files changed, 388 insertions(+), 91 deletions(-) diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index f4b9039f482..90f10c1fccb 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -27,6 +27,7 @@ namespace llvm { bool NoExcessFPPrecision; int PatternISelTriState; bool UnsafeFPMath; + bool PICEnabled; }; namespace { cl::opt PrintCode("print-machineinstrs", @@ -52,6 +53,11 @@ namespace { cl::desc("Enable optimizations that may decrease FP precision"), cl::location(UnsafeFPMath), cl::init(false)); + cl::opt + EnablePIC("enable-pic", + cl::desc("Enable generation of position independant code"), + cl::location(PICEnabled), + cl::init(false)); }; //--------------------------------------------------------------------------- diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 1bb4ec2e9c3..2fc022af4b1 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -28,6 +28,7 @@ enum X86VectorEnum { }; extern X86VectorEnum X86Vector; +extern bool X86ScalarSSE; /// createX86SimpleInstructionSelector - This pass converts an LLVM function /// into a machine code representation in a very simple peep-hole fashion. The diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 874391dda9f..afa3ff7b71c 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -61,7 +61,7 @@ def IntelAsmWriter : AsmWriter { def X86 : Target { // Specify the callee saved registers. - let CalleeSavedRegisters = [ESI, EDI, EBX, EBP]; + let CalleeSavedRegisters = [ESI, EDI, EBX, EBP, XMM4, XMM5, XMM6, XMM7]; // Yes, pointers are 32-bits in size. let PointerType = i32; diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index d55edc5841c..789b8e26cef 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -361,8 +361,18 @@ void Emitter::emitInstruction(const MachineInstr &MI) { // Emit the repeat opcode prefix as needed. 
if ((Desc.TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3); - // Emit instruction prefixes if necessary - if (Desc.TSFlags & X86II::OpSize) MCE.emitByte(0x66);// Operand size... + // Emit the operand size opcode prefix as needed. + if (Desc.TSFlags & X86II::OpSize) MCE.emitByte(0x66); + + // Emit the double precision sse fp opcode prefix as needed. + if ((Desc.TSFlags & X86II::Op0Mask) == X86II::XD) { + MCE.emitByte(0xF2); MCE.emitByte(0x0F); + } + + // Emit the double precision sse fp opcode prefix as needed. + if ((Desc.TSFlags & X86II::Op0Mask) == X86II::XS) { + MCE.emitByte(0xF3); MCE.emitByte(0x0F); + } switch (Desc.TSFlags & X86II::Op0Mask) { case X86II::TB: diff --git a/lib/Target/X86/X86ISelPattern.cpp b/lib/Target/X86/X86ISelPattern.cpp index bc728a7a41b..5c561aa1716 100644 --- a/lib/Target/X86/X86ISelPattern.cpp +++ b/lib/Target/X86/X86ISelPattern.cpp @@ -97,15 +97,13 @@ namespace { setShiftAmountFlavor(Mask); // shl X, 32 == shl X, 0 // Set up the register classes. + // FIXME: Eliminate these two classes when legalize can handle promotions + // well. + addRegisterClass(MVT::i1, X86::R8RegisterClass); addRegisterClass(MVT::i8, X86::R8RegisterClass); addRegisterClass(MVT::i16, X86::R16RegisterClass); addRegisterClass(MVT::i32, X86::R32RegisterClass); - addRegisterClass(MVT::f64, X86::RFPRegisterClass); - - // FIXME: Eliminate these two classes when legalize can handle promotions - // well. -/**/ addRegisterClass(MVT::i1, X86::R8RegisterClass); - + setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); setOperationAction(ISD::BRCONDTWOWAY , MVT::Other, Expand); setOperationAction(ISD::MEMMOVE , MVT::Other, Expand); @@ -123,7 +121,7 @@ namespace { setOperationAction(ISD::CTPOP , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Expand); setOperationAction(ISD::CTLZ , MVT::i32 , Expand); - + setOperationAction(ISD::READIO , MVT::i1 , Expand); setOperationAction(ISD::READIO , MVT::i8 , Expand); setOperationAction(ISD::READIO , MVT::i16 , Expand); @@ -132,24 +130,47 @@ namespace { setOperationAction(ISD::WRITEIO , MVT::i8 , Expand); setOperationAction(ISD::WRITEIO , MVT::i16 , Expand); setOperationAction(ISD::WRITEIO , MVT::i32 , Expand); - - if (!UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f64 , Expand); - setOperationAction(ISD::FCOS , MVT::f64 , Expand); - } - + // These should be promoted to a larger select which is supported. -/**/ setOperationAction(ISD::SELECT , MVT::i1 , Promote); + setOperationAction(ISD::SELECT , MVT::i1 , Promote); setOperationAction(ISD::SELECT , MVT::i8 , Promote); - + + if (X86ScalarSSE) { + // Set up the FP register classes. + addRegisterClass(MVT::f32, X86::RXMMRegisterClass); + addRegisterClass(MVT::f64, X86::RXMMRegisterClass); + + setOperationAction(ISD::EXTLOAD, MVT::f32, Expand); + setOperationAction(ISD::ZEXTLOAD, MVT::f32, Expand); + + // We don't support sin/cos/sqrt/fmod + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FABS , MVT::f64, Expand); + setOperationAction(ISD::FNEG , MVT::f64, Expand); + setOperationAction(ISD::SREM , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FABS , MVT::f32, Expand); + setOperationAction(ISD::FNEG , MVT::f32, Expand); + setOperationAction(ISD::SREM , MVT::f32, Expand); + } else { + // Set up the FP register classes. 
+ addRegisterClass(MVT::f64, X86::RFPRegisterClass); + + if (!UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f64 , Expand); + setOperationAction(ISD::FCOS , MVT::f64 , Expand); + } + + addLegalFPImmediate(+0.0); // FLD0 + addLegalFPImmediate(+1.0); // FLD1 + addLegalFPImmediate(-0.0); // FLD0/FCHS + addLegalFPImmediate(-1.0); // FLD1/FCHS + } computeRegisterProperties(); - - addLegalFPImmediate(+0.0); // FLD0 - addLegalFPImmediate(+1.0); // FLD1 - addLegalFPImmediate(-0.0); // FLD0/FCHS - addLegalFPImmediate(-1.0); // FLD1/FCHS } - + // Return the number of bytes that a function should pop when it returns (in // addition to the space used by the return address). // @@ -400,7 +421,10 @@ X86TargetLowering::LowerCCCCallTo(SDOperand Chain, const Type *RetTy, RetVals.push_back(MVT::i32); break; case MVT::f32: - RetVals.push_back(MVT::f64); + if (X86ScalarSSE) + RetVals.push_back(MVT::f32); + else + RetVals.push_back(MVT::f64); break; case MVT::i64: RetVals.push_back(MVT::i32); @@ -805,7 +829,10 @@ X86TargetLowering::LowerFastCCCallTo(SDOperand Chain, const Type *RetTy, RetVals.push_back(MVT::i32); break; case MVT::f32: - RetVals.push_back(MVT::f64); + if (X86ScalarSSE) + RetVals.push_back(MVT::f32); + else + RetVals.push_back(MVT::f64); break; case MVT::i64: RetVals.push_back(MVT::i32); @@ -1041,6 +1068,8 @@ void ISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) { BuildMI(BB, X86::MOV32rr, 1, LI->second).addReg(LI->first); } else if (RC == X86::RFPRegisterClass) { BuildMI(BB, X86::FpMOV, 1, LI->second).addReg(LI->first); + } else if (RC == X86::RXMMRegisterClass) { + BuildMI(BB, X86::MOVAPDrr, 1, LI->second).addReg(LI->first); } else { assert(0 && "Unknown regclass!"); } @@ -1641,6 +1670,11 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT, /*missing*/0, /*missing*/0, X86::FCMOVB , X86::FCMOVBE, X86::FCMOVA , X86::FCMOVAE, X86::FCMOVP , X86::FCMOVNP }; + static const unsigned SSE_CMOVTAB[] = { + 0 /* CMPEQSS */, 4 /* CMPNEQSS */, 1 /* CMPLTSS */, 2 /* CMPLESS */, + 2 /* CMPLESS */, 1 /* CMPLTSS */, /*missing*/0, /*missing*/0, + /*missing*/0, /*missing*/0, /*missing*/0, /*missing*/0 + }; if (SetCCSDNode *SetCC = dyn_cast(Cond)) { if (MVT::isInteger(SetCC->getOperand(0).getValueType())) { @@ -1657,6 +1691,20 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT, case ISD::SETULE: CondCode = BE; break; case ISD::SETUGE: CondCode = AE; break; } + } else if (X86ScalarSSE) { + switch (SetCC->getCondition()) { + default: assert(0 && "Unknown scalar fp comparison!"); + case ISD::SETEQ: CondCode = EQ; break; + case ISD::SETNE: CondCode = NE; break; + case ISD::SETULT: + case ISD::SETLT: CondCode = LT; break; + case ISD::SETULE: + case ISD::SETLE: CondCode = LE; break; + case ISD::SETUGT: + case ISD::SETGT: CondCode = GT; break; + case ISD::SETUGE: + case ISD::SETGE: CondCode = GE; break; + } } else { // On a floating point condition, the flags are set as follows: // ZF PF CF op @@ -1693,6 +1741,79 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT, } } + // There's no SSE equivalent of FCMOVE. In some cases we can fake it up, in + // Others we will have to do the PowerPC thing and generate an MBB for the + // true and false values and select between them with a PHI. + if (X86ScalarSSE) { + if (CondCode != NOT_SET) { + unsigned CMPSOpc = (SVT == MVT::f64) ? 
X86::CMPSDrr : X86::CMPSSrr; + unsigned CMPSImm = SSE_CMOVTAB[CondCode]; + // FIXME check for min + // FIXME check for max + // FIXME check for reverse + unsigned LHS = SelectExpr(Cond.getOperand(0)); + unsigned RHS = SelectExpr(Cond.getOperand(1)); + // emit compare mask + unsigned MaskReg = MakeReg(SVT); + BuildMI(BB, CMPSOpc, 3, MaskReg).addReg(LHS).addReg(RHS).addImm(CMPSImm); + // emit and with mask + unsigned TrueMask = MakeReg(SVT); + unsigned AndOpc = (SVT == MVT::f32) ? X86::ANDPSrr : X86::ANDPDrr; + BuildMI(BB, AndOpc, 2, TrueMask).addReg(RTrue).addReg(MaskReg); + // emit and with inverse mask + unsigned FalseMask = MakeReg(SVT); + unsigned AndnOpc = (SVT == MVT::f32) ? X86::ANDNPSrr : X86::ANDNPDrr; + BuildMI(BB, AndnOpc, 2, FalseMask).addReg(RFalse).addReg(MaskReg); + // emit or into dest reg + unsigned OROpc = (SVT == MVT::f32) ? X86::ORPSrr : X86::ORPDrr; + BuildMI(BB, OROpc, 2, RDest).addReg(TrueMask).addReg(FalseMask); + return; + } else { + // do the test and branch thing + // Get the condition into the zero flag. + unsigned CondReg = SelectExpr(Cond); + BuildMI(BB, X86::TEST8rr, 2).addReg(CondReg).addReg(CondReg); + + // Create an iterator with which to insert the MBB for copying the false + // value and the MBB to hold the PHI instruction for this SetCC. + MachineBasicBlock *thisMBB = BB; + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + ilist::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC sinkMBB + // fallthrough --> copy0MBB + MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB); + BuildMI(BB, X86::JNE, 1).addMBB(sinkMBB); + MachineFunction *F = BB->getParent(); + F->getBasicBlockList().insert(It, copy0MBB); + F->getBasicBlockList().insert(It, sinkMBB); + // Update machine-CFG edges + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BB = sinkMBB; + BuildMI(BB, X86::PHI, 4, RDest).addReg(RFalse) + .addMBB(copy0MBB).addReg(RTrue).addMBB(thisMBB); + } + return; + } + unsigned Opc = 0; if (CondCode != NOT_SET) { switch (SVT) { @@ -1702,7 +1823,7 @@ void ISel::EmitSelectCC(SDOperand Cond, MVT::ValueType SVT, case MVT::f64: Opc = CMOVTABFP[CondCode]; break; } } - + // Finally, if we weren't able to fold this, just emit the condition and test // it. if (CondCode == NOT_SET || Opc == 0) { @@ -1757,8 +1878,8 @@ void ISel::EmitCMP(SDOperand LHS, SDOperand RHS, bool HasOneUse) { return; } } else if (ConstantFPSDNode *CN = dyn_cast(RHS)) { - if (CN->isExactlyValue(+0.0) || - CN->isExactlyValue(-0.0)) { + if (!X86ScalarSSE && (CN->isExactlyValue(+0.0) || + CN->isExactlyValue(-0.0))) { unsigned Reg = SelectExpr(LHS); BuildMI(BB, X86::FTST, 1).addReg(Reg); BuildMI(BB, X86::FNSTSW8r, 0); @@ -1791,7 +1912,8 @@ void ISel::EmitCMP(SDOperand LHS, SDOperand RHS, bool HasOneUse) { case MVT::i8: Opc = X86::CMP8rr; break; case MVT::i16: Opc = X86::CMP16rr; break; case MVT::i32: Opc = X86::CMP32rr; break; - case MVT::f64: Opc = X86::FUCOMIr; break; + case MVT::f32: Opc = X86::UCOMISSrr; break; + case MVT::f64: Opc = X86ScalarSSE ? 
X86::UCOMISDrr : X86::FUCOMIr; break; } unsigned Tmp1, Tmp2; if (getRegPressure(LHS) > getRegPressure(RHS)) { @@ -2040,6 +2162,11 @@ unsigned ISel::SelectExpr(SDOperand N) { default: Node->dump(); assert(0 && "Node not handled!\n"); + case ISD::FP_EXTEND: + assert(X86ScalarSSE && "Scalar SSE FP must be enabled to use f32"); + Tmp1 = SelectExpr(N.getOperand(0)); + BuildMI(BB, X86::CVTSS2SDrr, 1, Result).addReg(Tmp1); + return Result; case ISD::CopyFromReg: Select(N.getOperand(0)); if (Result == 1) { @@ -2212,6 +2339,37 @@ unsigned ISel::SelectExpr(SDOperand N) { case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: { + Tmp1 = SelectExpr(N.getOperand(0)); // Get the operand register + unsigned PromoteOpcode = 0; + + // We can handle any sint to fp, and 8 and 16 uint to fp with the direct + // sse conversion instructions. + if (X86ScalarSSE) { + MVT::ValueType SrcTy = N.getOperand(0).getValueType(); + MVT::ValueType DstTy = N.getValueType(); + switch (SrcTy) { + case MVT::i1: + case MVT::i8: + PromoteOpcode = (N.getOpcode() == ISD::UINT_TO_FP) ? + X86::MOVZX32rr8 : X86::MOVSX32rr8; + break; + case MVT::i16: + PromoteOpcode = (N.getOpcode() == ISD::UINT_TO_FP) ? + X86::MOVZX32rr16 : X86::MOVSX32rr16; + break; + default: + assert(N.getOpcode() != ISD::UINT_TO_FP); + break; + } + if (PromoteOpcode) { + BuildMI(BB, PromoteOpcode, 1, Tmp2).addReg(Tmp1); + Tmp1 = Tmp2; + } + Opc = (DstTy == MVT::f64) ? X86::CVTSI2SDrr : X86::CVTSI2SSrr; + BuildMI(BB, Opc, 1, Result).addReg(Tmp1); + return Result; + } + // FIXME: Most of this grunt work should be done by legalize! ContainsFPCode = true; @@ -2221,8 +2379,6 @@ unsigned ISel::SelectExpr(SDOperand N) { // MVT::ValueType PromoteType = MVT::Other; MVT::ValueType SrcTy = N.getOperand(0).getValueType(); - unsigned PromoteOpcode = 0; - unsigned RealDestReg = Result; switch (SrcTy) { case MVT::i1: case MVT::i8: @@ -2245,8 +2401,6 @@ unsigned ISel::SelectExpr(SDOperand N) { break; } - Tmp1 = SelectExpr(N.getOperand(0)); // Get the operand register - if (PromoteType != MVT::Other) { Tmp2 = MakeReg(PromoteType); BuildMI(BB, PromoteOpcode, 1, Tmp2).addReg(Tmp1); @@ -2272,31 +2426,28 @@ unsigned ISel::SelectExpr(SDOperand N) { break; default: break; // No promotion required. } - - if (Node->getOpcode() == ISD::UINT_TO_FP && Result != RealDestReg) { - // If this is a cast from uint -> double, we need to be careful when if - // the "sign" bit is set. If so, we don't want to make a negative number, - // we want to make a positive number. Emit code to add an offset if the - // sign bit is set. - - // Compute whether the sign bit is set by shifting the reg right 31 bits. - unsigned IsNeg = MakeReg(MVT::i32); - BuildMI(BB, X86::SHR32ri, 2, IsNeg).addReg(Tmp1).addImm(31); - - // Create a CP value that has the offset in one word and 0 in the other. - static ConstantInt *TheOffset = ConstantUInt::get(Type::ULongTy, - 0x4f80000000000000ULL); - unsigned CPI = F->getConstantPool()->getConstantPoolIndex(TheOffset); - BuildMI(BB, X86::FADD32m, 5, RealDestReg).addReg(Result) - .addConstantPoolIndex(CPI).addZImm(4).addReg(IsNeg).addSImm(0); - } - return RealDestReg; + return Result; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { // FIXME: Most of this grunt work should be done by legalize! Tmp1 = SelectExpr(N.getOperand(0)); // Get the operand register + // If the target supports SSE2 and is performing FP operations in SSE regs + // instead of the FP stack, then we can use the efficient CVTSS2SI and + // CVTSD2SI instructions. 
+ if (ISD::FP_TO_SINT == N.getOpcode() && X86ScalarSSE) { + if (MVT::f32 == N.getOperand(0).getValueType()) { + BuildMI(BB, X86::CVTSS2SIrr, 1, Result).addReg(Tmp1); + } else if (MVT::f64 == N.getOperand(0).getValueType()) { + BuildMI(BB, X86::CVTSD2SIrr, 1, Result).addReg(Tmp1); + } else { + assert(0 && "Not an f32 or f64?"); + abort(); + } + return Result; + } + // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. // @@ -2385,9 +2536,15 @@ unsigned ISel::SelectExpr(SDOperand N) { case MVT::i8: Opc = X86::ADD8rm; break; case MVT::i16: Opc = X86::ADD16rm; break; case MVT::i32: Opc = X86::ADD32rm; break; + case MVT::f32: Opc = X86::ADDSSrm; break; case MVT::f64: // For F64, handle promoted load operations (from F32) as well! - Opc = Op1.getOpcode() == ISD::LOAD ? X86::FADD64m : X86::FADD32m; + if (X86ScalarSSE) { + assert(Op1.getOpcode() == ISD::LOAD && "SSE load not promoted"); + Opc = X86::ADDSDrm; + } else { + Opc = Op1.getOpcode() == ISD::LOAD ? X86::FADD64m : X86::FADD32m; + } break; } X86AddressMode AM; @@ -2458,7 +2615,8 @@ unsigned ISel::SelectExpr(SDOperand N) { case MVT::i8: Opc = X86::ADD8rr; break; case MVT::i16: Opc = X86::ADD16rr; break; case MVT::i32: Opc = X86::ADD32rr; break; - case MVT::f64: Opc = X86::FpADD; break; + case MVT::f32: Opc = X86::ADDSSrr; break; + case MVT::f64: Opc = X86ScalarSSE ? X86::ADDSDrr : X86::FpADD; break; } if (getRegPressure(Op0) > getRegPressure(Op1)) { @@ -2472,18 +2630,29 @@ unsigned ISel::SelectExpr(SDOperand N) { BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2); return Result; + case ISD::FSQRT: + Tmp1 = SelectExpr(Node->getOperand(0)); + if (X86ScalarSSE) { + Opc = (N.getValueType() == MVT::f32) ? X86::SQRTSSrr : X86::SQRTSDrr; + BuildMI(BB, Opc, 1, Result).addReg(Tmp1); + } else { + BuildMI(BB, X86::FSQRT, 1, Result).addReg(Tmp1); + } + return Result; + + // FIXME: + // Once we can spill 16 byte constants into the constant pool, we can + // implement SSE equivalents of FABS and FCHS. 
case ISD::FABS: case ISD::FNEG: case ISD::FSIN: case ISD::FCOS: - case ISD::FSQRT: assert(N.getValueType()==MVT::f64 && "Illegal type for this operation"); Tmp1 = SelectExpr(Node->getOperand(0)); switch (N.getOpcode()) { default: assert(0 && "Unreachable!"); case ISD::FABS: BuildMI(BB, X86::FABS, 1, Result).addReg(Tmp1); break; case ISD::FNEG: BuildMI(BB, X86::FCHS, 1, Result).addReg(Tmp1); break; - case ISD::FSQRT: BuildMI(BB, X86::FSQRT, 1, Result).addReg(Tmp1); break; case ISD::FSIN: BuildMI(BB, X86::FSIN, 1, Result).addReg(Tmp1); break; case ISD::FCOS: BuildMI(BB, X86::FCOS, 1, Result).addReg(Tmp1); break; } @@ -2550,11 +2719,21 @@ unsigned ISel::SelectExpr(SDOperand N) { X86::SUB8rm, X86::SUB16rm, X86::SUB32rm, X86::FSUB32m, X86::FSUB64m, X86::SUB8rr, X86::SUB16rr, X86::SUB32rr, X86::FpSUB , X86::FpSUB, }; + static const unsigned SSE_SUBTab[] = { + X86::SUB8ri, X86::SUB16ri, X86::SUB32ri, 0, 0, + X86::SUB8rm, X86::SUB16rm, X86::SUB32rm, X86::SUBSSrm, X86::SUBSDrm, + X86::SUB8rr, X86::SUB16rr, X86::SUB32rr, X86::SUBSSrr, X86::SUBSDrr, + }; static const unsigned MULTab[] = { 0, X86::IMUL16rri, X86::IMUL32rri, 0, 0, 0, X86::IMUL16rm , X86::IMUL32rm, X86::FMUL32m, X86::FMUL64m, 0, X86::IMUL16rr , X86::IMUL32rr, X86::FpMUL , X86::FpMUL, }; + static const unsigned SSE_MULTab[] = { + 0, X86::IMUL16rri, X86::IMUL32rri, 0, 0, + 0, X86::IMUL16rm , X86::IMUL32rm, X86::MULSSrm, X86::MULSDrm, + 0, X86::IMUL16rr , X86::IMUL32rr, X86::MULSSrr, X86::MULSDrr, + }; static const unsigned ANDTab[] = { X86::AND8ri, X86::AND16ri, X86::AND32ri, 0, 0, X86::AND8rm, X86::AND16rm, X86::AND32rm, 0, 0, @@ -2637,8 +2816,8 @@ unsigned ISel::SelectExpr(SDOperand N) { } switch (Node->getOpcode()) { default: assert(0 && "Unreachable!"); - case ISD::SUB: Opc = SUBTab[Opc]; break; - case ISD::MUL: Opc = MULTab[Opc]; break; + case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break; + case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break; case ISD::AND: Opc = ANDTab[Opc]; break; case ISD::OR: Opc = ORTab[Opc]; break; case ISD::XOR: Opc = XORTab[Opc]; break; @@ -2656,7 +2835,7 @@ unsigned ISel::SelectExpr(SDOperand N) { goto FoldOps; } else { // For FP, emit 'reverse' subract, with a memory operand. - if (N.getValueType() == MVT::f64) { + if (N.getValueType() == MVT::f64 && !X86ScalarSSE) { if (Op0.getOpcode() == ISD::EXTLOAD) Opc = X86::FSUBR32m; else @@ -2678,13 +2857,17 @@ unsigned ISel::SelectExpr(SDOperand N) { case MVT::i8: Opc = 5; break; case MVT::i16: Opc = 6; break; case MVT::i32: Opc = 7; break; + case MVT::f32: Opc = 8; break; // For F64, handle promoted load operations (from F32) as well! - case MVT::f64: Opc = Op1.getOpcode() == ISD::LOAD ? 9 : 8; break; + case MVT::f64: + assert((!X86ScalarSSE || Op1.getOpcode() == ISD::LOAD) && + "SSE load should have been promoted"); + Opc = Op1.getOpcode() == ISD::LOAD ? 9 : 8; break; } switch (Node->getOpcode()) { default: assert(0 && "Unreachable!"); - case ISD::SUB: Opc = SUBTab[Opc]; break; - case ISD::MUL: Opc = MULTab[Opc]; break; + case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break; + case ISD::MUL: Opc = X86ScalarSSE ? 
SSE_MULTab[Opc] : MULTab[Opc]; break; case ISD::AND: Opc = ANDTab[Opc]; break; case ISD::OR: Opc = ORTab[Opc]; break; case ISD::XOR: Opc = XORTab[Opc]; break; @@ -2725,8 +2908,8 @@ unsigned ISel::SelectExpr(SDOperand N) { } switch (Node->getOpcode()) { default: assert(0 && "Unreachable!"); - case ISD::SUB: Opc = SUBTab[Opc]; break; - case ISD::MUL: Opc = MULTab[Opc]; break; + case ISD::SUB: Opc = X86ScalarSSE ? SSE_SUBTab[Opc] : SUBTab[Opc]; break; + case ISD::MUL: Opc = X86ScalarSSE ? SSE_MULTab[Opc] : MULTab[Opc]; break; case ISD::AND: Opc = ANDTab[Opc]; break; case ISD::OR: Opc = ORTab[Opc]; break; case ISD::XOR: Opc = XORTab[Opc]; break; @@ -2844,7 +3027,7 @@ unsigned ISel::SelectExpr(SDOperand N) { if (N.getOpcode() == ISD::SDIV) { // We can fold loads into FpDIVs, but not really into any others. - if (N.getValueType() == MVT::f64) { + if (N.getValueType() == MVT::f64 || !X86ScalarSSE) { // Check for reversed and unreversed DIV. if (isFoldableLoad(N.getOperand(0), N.getOperand(1), true)) { if (N.getOperand(0).getOpcode() == ISD::EXTLOAD) @@ -2962,8 +3145,12 @@ unsigned ISel::SelectExpr(SDOperand N) { ClrOpcode = X86::MOV32ri; SExtOpcode = X86::CDQ; break; + case MVT::f32: + BuildMI(BB, X86::DIVSSrr, 2, Result).addReg(Tmp1).addReg(Tmp2); + return Result; case MVT::f64: - BuildMI(BB, X86::FpDIV, 2, Result).addReg(Tmp1).addReg(Tmp2); + Opc = X86ScalarSSE ? X86::DIVSDrr : X86::FpDIV; + BuildMI(BB, Opc, 2, Result).addReg(Tmp1).addReg(Tmp2); return Result; } @@ -3108,7 +3295,15 @@ unsigned ISel::SelectExpr(SDOperand N) { case MVT::i8: Opc = X86::MOV8rm; break; case MVT::i16: Opc = X86::MOV16rm; break; case MVT::i32: Opc = X86::MOV32rm; break; - case MVT::f64: Opc = X86::FLD64m; ContainsFPCode = true; break; + case MVT::f32: Opc = X86::MOVSSrm; break; + case MVT::f64: + if (X86ScalarSSE) { + Opc = X86::MOVSDrm; + } else { + Opc = X86::FLD64m; + ContainsFPCode = true; + } + break; } if (ConstantPoolSDNode *CP = dyn_cast(N.getOperand(1))){ @@ -3385,9 +3580,21 @@ unsigned ISel::SelectExpr(SDOperand N) { BuildMI(BB, X86::MOV32rr, 1, Result+1).addReg(X86::EDX); break; case MVT::f64: // Floating-point return values live in %ST(0) - ContainsFPCode = true; - BuildMI(BB, X86::FpGETRESULT, 1, Result); - break; + if (X86ScalarSSE) { + ContainsFPCode = true; + BuildMI(BB, X86::FpGETRESULT, 1, X86::FP0); + + unsigned Size = MVT::getSizeInBits(MVT::f64)/8; + MachineFunction *F = BB->getParent(); + int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size); + addFrameReference(BuildMI(BB, X86::FST64m, 5), FrameIdx).addReg(X86::FP0); + addFrameReference(BuildMI(BB, X86::MOVSDrm, 4, Result), FrameIdx); + break; + } else { + ContainsFPCode = true; + BuildMI(BB, X86::FpGETRESULT, 1, Result); + break; + } } return Result+N.ResNo-1; } @@ -3977,7 +4184,15 @@ void ISel::Select(SDOperand N) { case MVT::i8: Opc = X86::MOV8rr; break; case MVT::i16: Opc = X86::MOV16rr; break; case MVT::i32: Opc = X86::MOV32rr; break; - case MVT::f64: Opc = X86::FpMOV; ContainsFPCode = true; break; + case MVT::f32: Opc = X86::MOVAPSrr; break; + case MVT::f64: + if (X86ScalarSSE) { + Opc = X86::MOVAPDrr; + } else { + Opc = X86::FpMOV; + ContainsFPCode = true; + } + break; } BuildMI(BB, Opc, 1, Tmp2).addReg(Tmp1); } @@ -4018,12 +4233,38 @@ void ISel::Select(SDOperand N) { } switch (N.getOperand(1).getValueType()) { default: assert(0 && "All other types should have been promoted!!"); + case MVT::f32: + if (X86ScalarSSE) { + // Spill the value to memory and reload it into top of stack. 
+ unsigned Size = MVT::getSizeInBits(MVT::f32)/8; + MachineFunction *F = BB->getParent(); + int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size); + addFrameReference(BuildMI(BB, X86::MOVSSmr, 5), FrameIdx).addReg(Tmp1); + addFrameReference(BuildMI(BB, X86::FLD32m, 4, X86::FP0), FrameIdx); + BuildMI(BB, X86::FpSETRESULT, 1).addReg(X86::FP0); + ContainsFPCode = true; + } else { + assert(0 && "MVT::f32 only legal with scalar sse fp"); + abort(); + } + break; case MVT::f64: - BuildMI(BB, X86::FpSETRESULT, 1).addReg(Tmp1); - break; + if (X86ScalarSSE) { + // Spill the value to memory and reload it into top of stack. + unsigned Size = MVT::getSizeInBits(MVT::f64)/8; + MachineFunction *F = BB->getParent(); + int FrameIdx = F->getFrameInfo()->CreateStackObject(Size, Size); + addFrameReference(BuildMI(BB, X86::MOVSDmr, 5), FrameIdx).addReg(Tmp1); + addFrameReference(BuildMI(BB, X86::FLD64m, 4, X86::FP0), FrameIdx); + BuildMI(BB, X86::FpSETRESULT, 1).addReg(X86::FP0); + ContainsFPCode = true; + } else { + BuildMI(BB, X86::FpSETRESULT, 1).addReg(Tmp1); + } + break; case MVT::i32: - BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1); - break; + BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(Tmp1); + break; } break; case 1: @@ -4144,7 +4385,9 @@ void ISel::Select(SDOperand N) { switch (StoredTy) { default: assert(0 && "Cannot truncstore this type!"); case MVT::i1: Opc = X86::MOV8mr; break; - case MVT::f32: Opc = X86::FST32m; break; + case MVT::f32: + assert(!X86ScalarSSE && "Cannot truncstore scalar SSE regs"); + Opc = X86::FST32m; break; } std::vector > RP; @@ -4176,7 +4419,6 @@ void ISel::Select(SDOperand N) { case MVT::i8: Opc = X86::MOV8mi; break; case MVT::i16: Opc = X86::MOV16mi; break; case MVT::i32: Opc = X86::MOV32mi; break; - case MVT::f64: break; } if (Opc) { if (getRegPressure(N.getOperand(0)) > getRegPressure(N.getOperand(2))) { @@ -4215,7 +4457,8 @@ void ISel::Select(SDOperand N) { case MVT::i8: Opc = X86::MOV8mr; break; case MVT::i16: Opc = X86::MOV16mr; break; case MVT::i32: Opc = X86::MOV32mr; break; - case MVT::f64: Opc = X86::FST64m; break; + case MVT::f32: Opc = X86::MOVSSmr; break; + case MVT::f64: Opc = X86ScalarSSE ? X86::MOVSDmr : X86::FST64m; break; } std::vector > RP; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index bda2cb73f6f..957360b2013 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -28,7 +28,7 @@ bool X86InstrInfo::isMoveInstr(const MachineInstr& MI, unsigned& destReg) const { MachineOpCode oc = MI.getOpcode(); if (oc == X86::MOV8rr || oc == X86::MOV16rr || oc == X86::MOV32rr || - oc == X86::FpMOV) { + oc == X86::FpMOV || oc == X86::MOVAPDrr) { assert(MI.getNumOperands() == 2 && MI.getOperand(0).isRegister() && MI.getOperand(1).isRegister() && diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 5b63ff93f5c..95e8205a00b 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -107,6 +107,10 @@ namespace X86II { DA = 5 << Op0Shift, DB = 6 << Op0Shift, DC = 7 << Op0Shift, DD = 8 << Op0Shift, DE = 9 << Op0Shift, DF = 10 << Op0Shift, + + // XS, XD - These prefix codes are for single and double precision scalar + // floating point operations performed in the SSE registers. + XD = 11 << Op0Shift, XS = 12 << Op0Shift, //===------------------------------------------------------------------===// // This two-bit field describes the size of an immediate operand. 
Zero is diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 39a4317bc2b..1376d8fe8f0 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -187,7 +187,8 @@ def JG : IBr<0x8F, (ops i32imm:$dst), "jg $dst">, TB; // let isCall = 1 in // All calls clobber the non-callee saved registers... - let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0] in { + let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, + XMM0, XMM1, XMM2, XMM3] in { def CALLpcrel32 : I<0xE8, RawFrm, (ops calltarget:$dst), "call $dst">; def CALL32r : I<0xFF, MRM2r, (ops R32:$dst), "call {*}$dst">; def CALL32m : I<0xFF, MRM2m, (ops i32mem:$dst), "call {*}$dst">; @@ -1436,6 +1437,23 @@ def CVTSS2SDrr: I<0x5A, MRMSrcReg, (ops R32:$dst, RXMM:$src), "cvtss2sd {$src, $dst|$dst, $src}">, XD; def CVTSS2SDrm: I<0x5A, MRMSrcMem, (ops R32:$dst, f32mem:$src), "cvtss2sd {$src, $dst|$dst, $src}">, XD; +def CVTSI2SSrr: I<0x2A, MRMSrcReg, (ops R32:$dst, RXMM:$src), + "cvtsi2ss {$src, $dst|$dst, $src}">, XS; +def CVTSI2SSrm: I<0x2A, MRMSrcMem, (ops R32:$dst, f32mem:$src), + "cvtsi2ss {$src, $dst|$dst, $src}">, XS; +def CVTSI2SDrr: I<0x2A, MRMSrcReg, (ops R32:$dst, RXMM:$src), + "cvtsi2sd {$src, $dst|$dst, $src}">, XD; +def CVTSI2SDrm: I<0x2A, MRMSrcMem, (ops R32:$dst, f64mem:$src), + "cvtsi2sd {$src, $dst|$dst, $src}">, XD; + +def SQRTSSrm : I<0x51, MRMSrcMem, (ops RXMM:$dst, f32mem:$src), + "subss {$src, $dst|$dst, $src}">, XS; +def SQRTSSrr : I<0x51, MRMSrcReg, (ops RXMM:$dst, RXMM:$src), + "subss {$src, $dst|$dst, $src}">, XS; +def SQRTSDrm : I<0x51, MRMSrcMem, (ops RXMM:$dst, f64mem:$src), + "subsd {$src, $dst|$dst, $src}">, XD; +def SQRTSDrr : I<0x51, MRMSrcReg, (ops RXMM:$dst, RXMM:$src), + "subsd {$src, $dst|$dst, $src}">, XD; def UCOMISDrr: I<0x2E, MRMSrcReg, (ops RXMM:$dst, RXMM:$src), "ucomisd {$src, $dst|$dst, $src}">, TB, OpSize; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 08920cc2605..230debf7a7a 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -52,6 +52,7 @@ static unsigned getIdx(unsigned SpillSize) { case 32: return 2; case 64: return 3; // FP in 64-bit spill mode. case 80: return 4; // FP in 80-bit spill mode. + case 128: return 5; // XMM reg in 128 bit mode. 
} } @@ -59,18 +60,24 @@ void X86RegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, int FrameIdx) const { static const unsigned Opcode[] = - { X86::MOV8mr, X86::MOV16mr, X86::MOV32mr, X86::FST64m, X86::FSTP80m }; + { X86::MOV8mr, X86::MOV16mr, X86::MOV32mr, X86::FST64m, X86::FSTP80m, + X86::MOVAPDmr }; unsigned Idx = getIdx(getSpillSize(SrcReg)); - addFrameReference(BuildMI(MBB, MI, Opcode[Idx], 5), FrameIdx).addReg(SrcReg); + unsigned Opc = Opcode[Idx]; + if (X86ScalarSSE && Opc == X86::FST64m) Opc = X86::MOVSDmr; + addFrameReference(BuildMI(MBB, MI, Opc, 5), FrameIdx).addReg(SrcReg); } void X86RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIdx)const{ static const unsigned Opcode[] = - { X86::MOV8rm, X86::MOV16rm, X86::MOV32rm, X86::FLD64m, X86::FLD80m }; + { X86::MOV8rm, X86::MOV16rm, X86::MOV32rm, X86::FLD64m, X86::FLD80m, + X86::MOVAPDrm }; unsigned Idx = getIdx(getSpillSize(DestReg)); - addFrameReference(BuildMI(MBB, MI, Opcode[Idx], 4, DestReg), FrameIdx); + unsigned Opc = Opcode[Idx]; + if (X86ScalarSSE && Opc == X86::FLD64m) Opc = X86::MOVSDrm; + addFrameReference(BuildMI(MBB, MI, Opc, 4, DestReg), FrameIdx); } void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB, @@ -78,8 +85,11 @@ void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, const TargetRegisterClass *RC) const { static const unsigned Opcode[] = - { X86::MOV8rr, X86::MOV16rr, X86::MOV32rr, X86::FpMOV, X86::FpMOV }; - BuildMI(MBB, MI, Opcode[getIdx(RC->getSize()*8)], 1, DestReg).addReg(SrcReg); + { X86::MOV8rr, X86::MOV16rr, X86::MOV32rr, X86::FpMOV, X86::FpMOV, + X86::MOVAPDrr }; + unsigned Opc = Opcode[getIdx(RC->getSize()*8)]; + if (X86ScalarSSE && Opc == X86::FpMOV) Opc = X86::MOVAPDrr; + BuildMI(MBB, MI, Opc, 1, DestReg).addReg(SrcReg); } static MachineInstr *MakeMInst(unsigned Opcode, unsigned FrameIndex, diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index da8e612daef..30190fc18c4 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -99,8 +99,8 @@ def R32 : RegisterClass { // FIXME: These registers can contain both integer and fp values. We should // figure out the right way to deal with that. For now, since they'll be used // for scalar FP, they are being declared f64 -def RXMM : RegisterClass; +def RXMM : RegisterClass; // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. This will cause us to spill diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 2330182372c..def4f9cfa49 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -26,6 +26,7 @@ using namespace llvm; X86VectorEnum llvm::X86Vector = NoSSE; +bool llvm::X86ScalarSSE = false; /// X86TargetMachineModule - Note that this is used on hosts that cannot link /// in a library unless there are references into the library. In particular, @@ -41,8 +42,11 @@ namespace { cl::opt DisableOutput("disable-x86-llc-output", cl::Hidden, cl::desc("Disable the X86 asm printer, for use " "when profiling the code generator.")); + cl::opt EnableSSEFP("enable-sse-scalar-fp", + cl::desc("Perform FP math in SSE regs instead of the FP stack"), + cl::location(X86ScalarSSE), + cl::init(false)); -#if 0 // FIXME: This should eventually be handled with target triples and // subtarget support! 
cl::opt @@ -54,7 +58,6 @@ namespace { clEnumValN(SSE3, "sse3", " Enable SSE, SSE2, and SSE3 support"), clEnumValEnd), cl::location(X86Vector), cl::init(NoSSE)); -#endif // Register the target. RegisterTarget X("x86", " IA-32 (Pentium and above)"); @@ -91,6 +94,8 @@ X86TargetMachine::X86TargetMachine(const Module &M, IntrinsicLowering *IL) : TargetMachine("X86", IL, true, 4, 4, 4, 4, 4), FrameInfo(TargetFrameInfo::StackGrowsDown, 8, -4), JITInfo(*this) { + // Scalar SSE FP requires at least SSE2 + X86ScalarSSE &= X86Vector >= SSE2; } -- 2.11.0