src/Reactor/LLVMReactor.cpp

   1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //    http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "Nucleus.hpp"
  16
  17 #include "llvm/Support/IRBuilder.h"
  18 #include "llvm/Function.h"
  19 #include "llvm/GlobalVariable.h"
  20 #include "llvm/Module.h"
  21 #include "llvm/LLVMContext.h"
  22 #include "llvm/Constants.h"
  23 #include "llvm/Intrinsics.h"
  24 #include "llvm/PassManager.h"
  25 #include "llvm/Analysis/LoopPass.h"
  26 #include "llvm/Transforms/Scalar.h"
  27 #include "llvm/Target/TargetData.h"
  28 #include "llvm/Target/TargetOptions.h"
  29 #include "llvm/Support/TargetSelect.h"
  30 #include "../lib/ExecutionEngine/JIT/JIT.h"
  31
  32 #include "LLVMRoutine.hpp"
  33 #include "LLVMRoutineManager.hpp"
  34 #include "x86.hpp"
  35 #include "CPUID.hpp"
  36 #include "Thread.hpp"
  37 #include "Memory.hpp"
  38 #include "MutexLock.hpp"
  39
  40 #include <xmmintrin.h>
  41 #include <fstream>
  42
  43 #if defined(__x86_64__) && defined(_WIN32)
// Stub definition of X86CompilationCallback, only compiled for 64-bit
// Windows builds. It is not implemented: reaching it trips the assertion.
extern "C" void X86CompilationCallback()
{
        assert(false);   // UNIMPLEMENTED
}
  48 #endif
  49
  50 extern "C"
  51 {
  52         bool (*CodeAnalystInitialize)() = 0;
  53         void (*CodeAnalystCompleteJITLog)() = 0;
  54         bool (*CodeAnalystLogJITCode)(const void *jitCodeStartAddr, unsigned int jitCodeSize, const wchar_t *functionName) = 0;
  55 }
  56
namespace llvm
{
        // Flag defined inside LLVM; Nucleus::Nucleus() sets it to false.
        extern bool JITEmitDebugInfo;
}
  61
// File-local code generation state shared by all Nucleus functions.
namespace
{
        // Created by every Nucleus constructor, reset to null by the destructor.
        sw::LLVMRoutineManager *routineManager = nullptr;
        llvm::ExecutionEngine *executionEngine = nullptr;
        // Created lazily by the first Nucleus and then reused.
        llvm::IRBuilder<> *builder = nullptr;
        llvm::LLVMContext *context = nullptr;
        // Module and function currently being generated.
        llvm::Module *module = nullptr;
        llvm::Function *function = nullptr;

        // Held for the whole lifetime of a Nucleus object (locked in the
        // constructor, unlocked in the destructor).
        sw::BackoffLock codegenMutex;
}
  73
  74 namespace sw
  75 {
  76         using namespace llvm;
  77
        // Ordered list of optimization passes consumed by Nucleus::optimize(),
        // which stops at the first Disabled entry (or after 10 entries).
        Optimization optimization[10] = {InstructionCombining, Disabled};
  79
        // Reactor-facing wrapper types. Each derives from the corresponding LLVM
        // class without adding members; the T(), V() and B() helpers in this file
        // reinterpret LLVM pointers as pointers to these types.
        class Type : public llvm::Type {};
        class Value : public llvm::Value {};
        class SwitchCases : public llvm::SwitchInst {};
        class BasicBlock : public llvm::BasicBlock {};
  84
  85         inline Type *T(llvm::Type *t)
  86         {
  87                 return reinterpret_cast<Type*>(t);
  88         }
  89
  90         inline Value *V(llvm::Value *t)
  91         {
  92                 return reinterpret_cast<Value*>(t);
  93         }
  94
  95         inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
  96         {
  97                 return reinterpret_cast<std::vector<llvm::Type*>&>(t);
  98         }
  99
 100         inline BasicBlock *B(llvm::BasicBlock *t)
 101         {
 102                 return reinterpret_cast<BasicBlock*>(t);
 103         }
 104
 105         Nucleus::Nucleus()
 106         {
 107                 ::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
 108
 109                 InitializeNativeTarget();
 110                 JITEmitDebugInfo = false;
 111
 112                 if(!::context)
 113                 {
 114                         ::context = new LLVMContext();
 115                 }
 116
 117                 ::module = new Module("", *::context);
 118                 ::routineManager = new LLVMRoutineManager();
 119
 120                 #if defined(__x86_64__)
 121                         const char *architecture = "x86-64";
 122                 #else
 123                         const char *architecture = "x86";
 124                 #endif
 125
 126                 SmallVector<std::string, 1> MAttrs;
 127                 MAttrs.push_back(CPUID::supportsMMX()    ? "+mmx"   : "-mmx");
 128                 MAttrs.push_back(CPUID::supportsCMOV()   ? "+cmov"  : "-cmov");
 129                 MAttrs.push_back(CPUID::supportsSSE()    ? "+sse"   : "-sse");
 130                 MAttrs.push_back(CPUID::supportsSSE2()   ? "+sse2"  : "-sse2");
 131                 MAttrs.push_back(CPUID::supportsSSE3()   ? "+sse3"  : "-sse3");
 132                 MAttrs.push_back(CPUID::supportsSSSE3()  ? "+ssse3" : "-ssse3");
 133                 MAttrs.push_back(CPUID::supportsSSE4_1() ? "+sse41" : "-sse41");
 134
 135                 std::string error;
 136                 TargetMachine *targetMachine = EngineBuilder::selectTarget(::module, architecture, "", MAttrs, Reloc::Default, CodeModel::JITDefault, &error);
 137                 ::executionEngine = JIT::createJIT(::module, 0, ::routineManager, CodeGenOpt::Aggressive, true, targetMachine);
 138
 139                 if(!::builder)
 140                 {
 141                         ::builder = new IRBuilder<>(*::context);
 142
 143                         #if defined(_WIN32)
 144                                 HMODULE CodeAnalyst = LoadLibrary("CAJitNtfyLib.dll");
 145                                 if(CodeAnalyst)
 146                                 {
 147                                         CodeAnalystInitialize = (bool(*)())GetProcAddress(CodeAnalyst, "CAJIT_Initialize");
 148                                         CodeAnalystCompleteJITLog = (void(*)())GetProcAddress(CodeAnalyst, "CAJIT_CompleteJITLog");
 149                                         CodeAnalystLogJITCode = (bool(*)(const void*, unsigned int, const wchar_t*))GetProcAddress(CodeAnalyst, "CAJIT_LogJITCode");
 150
 151                                         CodeAnalystInitialize();
 152                                 }
 153                         #endif
 154                 }
 155         }
 156
 157         Nucleus::~Nucleus()
 158         {
 159                 delete ::executionEngine;
 160                 ::executionEngine = nullptr;
 161
 162                 ::routineManager = nullptr;
 163                 ::function = nullptr;
 164                 ::module = nullptr;
 165
 166                 ::codegenMutex.unlock();
 167         }
 168
 169         Routine *Nucleus::acquireRoutine(const wchar_t *name, bool runOptimizations)
 170         {
 171                 if(::builder->GetInsertBlock()->empty() || !::builder->GetInsertBlock()->back().isTerminator())
 172                 {
 173                         llvm::Type *type = ::function->getReturnType();
 174
 175                         if(type->isVoidTy())
 176                         {
 177                                 createRetVoid();
 178                         }
 179                         else
 180                         {
 181                                 createRet(V(UndefValue::get(type)));
 182                         }
 183                 }
 184
 185                 if(false)
 186                 {
 187                         std::string error;
 188                         raw_fd_ostream file("llvm-dump-unopt.txt", error);
 189                         ::module->print(file, 0);
 190                 }
 191
 192                 if(runOptimizations)
 193                 {
 194                         optimize();
 195                 }
 196
 197                 if(false)
 198                 {
 199                         std::string error;
 200                         raw_fd_ostream file("llvm-dump-opt.txt", error);
 201                         ::module->print(file, 0);
 202                 }
 203
 204                 void *entry = ::executionEngine->getPointerToFunction(::function);
 205                 LLVMRoutine *routine = ::routineManager->acquireRoutine(entry);
 206
 207                 if(CodeAnalystLogJITCode)
 208                 {
 209                         CodeAnalystLogJITCode(routine->getEntry(), routine->getCodeSize(), name);
 210                 }
 211
 212                 return routine;
 213         }
 214
        // Runs the optimization pipeline over the current module.
        //
        // The pass manager is built on the first call only and then reused, so
        // the contents of the optimization[] array are read just once.
        void Nucleus::optimize()
        {
                static PassManager *passManager = nullptr;

                if(!passManager)
                {
                        passManager = new PassManager();

                        UnsafeFPMath = true;
                //      NoInfsFPMath = true;
                //      NoNaNsFPMath = true;

                        // These two passes are always added, ahead of the configurable ones.
                        passManager->add(new TargetData(*::executionEngine->getTargetData()));
                        passManager->add(createScalarReplAggregatesPass());

                        // Add the configured passes in order, stopping at the first Disabled entry.
                        for(int pass = 0; pass < 10 && optimization[pass] != Disabled; pass++)
                        {
                                switch(optimization[pass])
                                {
                                case Disabled:                                                                 break;
                                case CFGSimplification:    passManager->add(createCFGSimplificationPass());    break;
                                case LICM:                 passManager->add(createLICMPass());                 break;
                                case AggressiveDCE:        passManager->add(createAggressiveDCEPass());        break;
                                case GVN:                  passManager->add(createGVNPass());                  break;
                                case InstructionCombining: passManager->add(createInstructionCombiningPass()); break;
                                case Reassociate:          passManager->add(createReassociatePass());          break;
                                case DeadStoreElimination: passManager->add(createDeadStoreEliminationPass()); break;
                                case SCCP:                 passManager->add(createSCCPPass());                 break;
                                case ScalarReplAggregates: passManager->add(createScalarReplAggregatesPass()); break;
                                default:
                                        assert(false);
                                }
                        }
                }

                passManager->run(*::module);
        }
 252
 253         Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
 254         {
 255                 // Need to allocate it in the entry block for mem2reg to work
 256                 llvm::BasicBlock &entryBlock = ::function->getEntryBlock();
 257
 258                 Instruction *declaration;
 259
 260                 if(arraySize)
 261                 {
 262                         declaration = new AllocaInst(type, Nucleus::createConstantInt(arraySize));
 263                 }
 264                 else
 265                 {
 266                         declaration = new AllocaInst(type, (Value*)0);
 267                 }
 268
 269                 entryBlock.getInstList().push_front(declaration);
 270
 271                 return V(declaration);
 272         }
 273
 274         BasicBlock *Nucleus::createBasicBlock()
 275         {
 276                 return B(BasicBlock::Create(*::context, "", ::function));
 277         }
 278
 279         BasicBlock *Nucleus::getInsertBlock()
 280         {
 281                 return B(::builder->GetInsertBlock());
 282         }
 283
 284         void Nucleus::setInsertBlock(BasicBlock *basicBlock)
 285         {
 286         //      assert(::builder->GetInsertBlock()->back().isTerminator());
 287                 return ::builder->SetInsertPoint(basicBlock);
 288         }
 289
 290         void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
 291         {
 292                 llvm::FunctionType *functionType = llvm::FunctionType::get(ReturnType, T(Params), false);
 293                 ::function = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, "", ::module);
 294                 ::function->setCallingConv(llvm::CallingConv::C);
 295
 296                 ::builder->SetInsertPoint(BasicBlock::Create(*::context, "", ::function));
 297         }
 298
 299         Value *Nucleus::getArgument(unsigned int index)
 300         {
 301                 llvm::Function::arg_iterator args = ::function->arg_begin();
 302
 303                 while(index)
 304                 {
 305                         args++;
 306                         index--;
 307                 }
 308
 309                 return V(&*args);
 310         }
 311
        // Emits a void return, preceded by x86::emms().
        // NOTE(review): presumably emms clears MMX state before returning to the
        // caller - confirm against x86.hpp.
        void Nucleus::createRetVoid()
        {
                x86::emms();

                ::builder->CreateRetVoid();
        }
 318
        // Emits a return of v, preceded by x86::emms().
        // NOTE(review): presumably emms clears MMX state before returning to the
        // caller - confirm against x86.hpp.
        void Nucleus::createRet(Value *v)
        {
                x86::emms();

                ::builder->CreateRet(v);
        }
 325
 326         void Nucleus::createBr(BasicBlock *dest)
 327         {
 328                 ::builder->CreateBr(dest);
 329         }
 330
 331         void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
 332         {
 333                 ::builder->CreateCondBr(cond, ifTrue, ifFalse);
 334         }
 335
 336         Value *Nucleus::createAdd(Value *lhs, Value *rhs)
 337         {
 338                 return V(::builder->CreateAdd(lhs, rhs));
 339         }
 340
 341         Value *Nucleus::createSub(Value *lhs, Value *rhs)
 342         {
 343                 return V(::builder->CreateSub(lhs, rhs));
 344         }
 345
 346         Value *Nucleus::createMul(Value *lhs, Value *rhs)
 347         {
 348                 return V(::builder->CreateMul(lhs, rhs));
 349         }
 350
 351         Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
 352         {
 353                 return V(::builder->CreateUDiv(lhs, rhs));
 354         }
 355
 356         Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
 357         {
 358                 return V(::builder->CreateSDiv(lhs, rhs));
 359         }
 360
 361         Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
 362         {
 363                 return V(::builder->CreateFAdd(lhs, rhs));
 364         }
 365
 366         Value *Nucleus::createFSub(Value *lhs, Value *rhs)
 367         {
 368                 return V(::builder->CreateFSub(lhs, rhs));
 369         }
 370
 371         Value *Nucleus::createFMul(Value *lhs, Value *rhs)
 372         {
 373                 return V(::builder->CreateFMul(lhs, rhs));
 374         }
 375
 376         Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
 377         {
 378                 return V(::builder->CreateFDiv(lhs, rhs));
 379         }
 380
 381         Value *Nucleus::createURem(Value *lhs, Value *rhs)
 382         {
 383                 return V(::builder->CreateURem(lhs, rhs));
 384         }
 385
 386         Value *Nucleus::createSRem(Value *lhs, Value *rhs)
 387         {
 388                 return V(::builder->CreateSRem(lhs, rhs));
 389         }
 390
 391         Value *Nucleus::createFRem(Value *lhs, Value *rhs)
 392         {
 393                 return V(::builder->CreateFRem(lhs, rhs));
 394         }
 395
 396         Value *Nucleus::createShl(Value *lhs, Value *rhs)
 397         {
 398                 return V(::builder->CreateShl(lhs, rhs));
 399         }
 400
 401         Value *Nucleus::createLShr(Value *lhs, Value *rhs)
 402         {
 403                 return V(::builder->CreateLShr(lhs, rhs));
 404         }
 405
 406         Value *Nucleus::createAShr(Value *lhs, Value *rhs)
 407         {
 408                 return V(::builder->CreateAShr(lhs, rhs));
 409         }
 410
 411         Value *Nucleus::createAnd(Value *lhs, Value *rhs)
 412         {
 413                 return V(::builder->CreateAnd(lhs, rhs));
 414         }
 415
 416         Value *Nucleus::createOr(Value *lhs, Value *rhs)
 417         {
 418                 return V(::builder->CreateOr(lhs, rhs));
 419         }
 420
 421         Value *Nucleus::createXor(Value *lhs, Value *rhs)
 422         {
 423                 return V(::builder->CreateXor(lhs, rhs));
 424         }
 425
 426         Value *Nucleus::createNeg(Value *v)
 427         {
 428                 return V(::builder->CreateNeg(v));
 429         }
 430
 431         Value *Nucleus::createFNeg(Value *v)
 432         {
 433                 return V(::builder->CreateFNeg(v));
 434         }
 435
 436         Value *Nucleus::createNot(Value *v)
 437         {
 438                 return V(::builder->CreateNot(v));
 439         }
 440
 441         Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align)
 442         {
 443                 assert(ptr->getType()->getContainedType(0) == type);
 444                 return V(::builder->Insert(new LoadInst(ptr, "", isVolatile, align)));
 445         }
 446
 447         Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align)
 448         {
 449                 assert(ptr->getType()->getContainedType(0) == type);
 450                 ::builder->Insert(new StoreInst(value, ptr, isVolatile, align));
 451                 return value;
 452         }
 453
 454         Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index)
 455         {
 456                 assert(ptr->getType()->getContainedType(0) == type);
 457                 return V(::builder->CreateGEP(ptr, index));
 458         }
 459
 460         Value *Nucleus::createAtomicAdd(Value *ptr, Value *value)
 461         {
 462                 return V(::builder->CreateAtomicRMW(AtomicRMWInst::Add, ptr, value, SequentiallyConsistent));
 463         }
 464
 465         Value *Nucleus::createTrunc(Value *v, Type *destType)
 466         {
 467                 return V(::builder->CreateTrunc(v, destType));
 468         }
 469
 470         Value *Nucleus::createZExt(Value *v, Type *destType)
 471         {
 472                 return V(::builder->CreateZExt(v, destType));
 473         }
 474
 475         Value *Nucleus::createSExt(Value *v, Type *destType)
 476         {
 477                 return V(::builder->CreateSExt(v, destType));
 478         }
 479
 480         Value *Nucleus::createFPToSI(Value *v, Type *destType)
 481         {
 482                 return V(::builder->CreateFPToSI(v, destType));
 483         }
 484
 485         Value *Nucleus::createSIToFP(Value *v, Type *destType)
 486         {
 487                 return V(::builder->CreateSIToFP(v, destType));
 488         }
 489
 490         Value *Nucleus::createFPTrunc(Value *v, Type *destType)
 491         {
 492                 return V(::builder->CreateFPTrunc(v, destType));
 493         }
 494
 495         Value *Nucleus::createFPExt(Value *v, Type *destType)
 496         {
 497                 return V(::builder->CreateFPExt(v, destType));
 498         }
 499
 500         Value *Nucleus::createBitCast(Value *v, Type *destType)
 501         {
 502                 return V(::builder->CreateBitCast(v, destType));
 503         }
 504
 505         Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
 506         {
 507                 return V(::builder->CreateICmpEQ(lhs, rhs));
 508         }
 509
 510         Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
 511         {
 512                 return V(::builder->CreateICmpNE(lhs, rhs));
 513         }
 514
 515         Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
 516         {
 517                 return V(::builder->CreateICmpUGT(lhs, rhs));
 518         }
 519
 520         Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
 521         {
 522                 return V(::builder->CreateICmpUGE(lhs, rhs));
 523         }
 524
 525         Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
 526         {
 527                 return V(::builder->CreateICmpULT(lhs, rhs));
 528         }
 529
 530         Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
 531         {
 532                 return V(::builder->CreateICmpULE(lhs, rhs));
 533         }
 534
 535         Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
 536         {
 537                 return V(::builder->CreateICmpSGT(lhs, rhs));
 538         }
 539
 540         Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
 541         {
 542                 return V(::builder->CreateICmpSGE(lhs, rhs));
 543         }
 544
 545         Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
 546         {
 547                 return V(::builder->CreateICmpSLT(lhs, rhs));
 548         }
 549
 550         Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
 551         {
 552                 return V(::builder->CreateICmpSLE(lhs, rhs));
 553         }
 554
 555         Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
 556         {
 557                 return V(::builder->CreateFCmpOEQ(lhs, rhs));
 558         }
 559
 560         Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
 561         {
 562                 return V(::builder->CreateFCmpOGT(lhs, rhs));
 563         }
 564
 565         Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
 566         {
 567                 return V(::builder->CreateFCmpOGE(lhs, rhs));
 568         }
 569
 570         Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
 571         {
 572                 return V(::builder->CreateFCmpOLT(lhs, rhs));
 573         }
 574
 575         Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
 576         {
 577                 return V(::builder->CreateFCmpOLE(lhs, rhs));
 578         }
 579
 580         Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
 581         {
 582                 return V(::builder->CreateFCmpONE(lhs, rhs));
 583         }
 584
 585         Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
 586         {
 587                 return V(::builder->CreateFCmpORD(lhs, rhs));
 588         }
 589
 590         Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
 591         {
 592                 return V(::builder->CreateFCmpUNO(lhs, rhs));
 593         }
 594
 595         Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
 596         {
 597                 return V(::builder->CreateFCmpUEQ(lhs, rhs));
 598         }
 599
 600         Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
 601         {
 602                 return V(::builder->CreateFCmpUGT(lhs, rhs));
 603         }
 604
 605         Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
 606         {
 607                 return V(::builder->CreateFCmpUGE(lhs, rhs));
 608         }
 609
 610         Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
 611         {
 612                 return V(::builder->CreateFCmpULT(lhs, rhs));
 613         }
 614
 615         Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
 616         {
 617                 return V(::builder->CreateFCmpULE(lhs, rhs));
 618         }
 619
 620         Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
 621         {
 622                 return V(::builder->CreateFCmpULE(lhs, rhs));
 623         }
 624
 625         Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
 626         {
 627                 assert(vector->getType()->getContainedType(0) == type);
 628                 return V(::builder->CreateExtractElement(vector, createConstantInt(index)));
 629         }
 630
 631         Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
 632         {
 633                 return V(::builder->CreateInsertElement(vector, element, createConstantInt(index)));
 634         }
 635
 636         Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
 637         {
 638                 int size = llvm::cast<llvm::VectorType>(V1->getType())->getNumElements();
 639                 const int maxSize = 16;
 640                 llvm::Constant *swizzle[maxSize];
 641                 assert(size <= maxSize);
 642
 643                 for(int i = 0; i < size; i++)
 644                 {
 645                         swizzle[i] = llvm::ConstantInt::get(Type::getInt32Ty(*::context), select[i]);
 646                 }
 647
 648                 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
 649
 650                 return V(::builder->CreateShuffleVector(V1, V2, shuffle));
 651         }
 652
 653         Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
 654         {
 655                 return V(::builder->CreateSelect(C, ifTrue, ifFalse));
 656         }
 657
 658         SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
 659         {
 660                 return reinterpret_cast<SwitchCases*>(::builder->CreateSwitch(control, defaultBranch, numCases));
 661         }
 662
 663         void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
 664         {
 665                 switchCases->addCase(llvm::ConstantInt::get(Type::getInt32Ty(*::context), label, true), branch);
 666         }
 667
        // Emits an 'unreachable' terminator at the current insert point.
        void Nucleus::createUnreachable()
        {
                ::builder->CreateUnreachable();
        }
 672
 673         static Value *createSwizzle4(Value *val, unsigned char select)
 674         {
 675                 int swizzle[4] =
 676                 {
 677                         (select >> 0) & 0x03,
 678                         (select >> 2) & 0x03,
 679                         (select >> 4) & 0x03,
 680                         (select >> 6) & 0x03,
 681                 };
 682
 683                 return Nucleus::createShuffleVector(val, val, swizzle);
 684         }
 685
 686         static Value *createMask4(Value *lhs, Value *rhs, unsigned char select)
 687         {
 688                 bool mask[4] = {false, false, false, false};
 689
 690                 mask[(select >> 0) & 0x03] = true;
 691                 mask[(select >> 2) & 0x03] = true;
 692                 mask[(select >> 4) & 0x03] = true;
 693                 mask[(select >> 6) & 0x03] = true;
 694
 695                 int swizzle[4] =
 696                 {
 697                         mask[0] ? 4 : 0,
 698                         mask[1] ? 5 : 1,
 699                         mask[2] ? 6 : 2,
 700                         mask[3] ? 7 : 3,
 701                 };
 702
 703                 return Nucleus::createShuffleVector(lhs, rhs, swizzle);
 704         }
 705
 706         Type *Nucleus::getPointerType(Type *ElementType)
 707         {
 708                 return T(llvm::PointerType::get(ElementType, 0));
 709         }
 710
 711         Value *Nucleus::createNullValue(Type *Ty)
 712         {
 713                 return V(llvm::Constant::getNullValue(Ty));
 714         }
 715
 716         Value *Nucleus::createConstantLong(int64_t i)
 717         {
 718                 return V(llvm::ConstantInt::get(Type::getInt64Ty(*::context), i, true));
 719         }
 720
 721         Value *Nucleus::createConstantInt(int i)
 722         {
 723                 return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, true));
 724         }
 725
 726         Value *Nucleus::createConstantInt(unsigned int i)
 727         {
 728                 return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, false));
 729         }
 730
 731         Value *Nucleus::createConstantBool(bool b)
 732         {
 733                 return V(llvm::ConstantInt::get(Type::getInt1Ty(*::context), b));
 734         }
 735
 736         Value *Nucleus::createConstantByte(signed char i)
 737         {
 738                 return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, true));
 739         }
 740
 741         Value *Nucleus::createConstantByte(unsigned char i)
 742         {
 743                 return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, false));
 744         }
 745
 746         Value *Nucleus::createConstantShort(short i)
 747         {
 748                 return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, true));
 749         }
 750
 751         Value *Nucleus::createConstantShort(unsigned short i)
 752         {
 753                 return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, false));
 754         }
 755
 756         Value *Nucleus::createConstantFloat(float x)
 757         {
 758                 return V(llvm::ConstantFP::get(Float::getType(), x));
 759         }
 760
 761         Value *Nucleus::createNullPointer(Type *Ty)
 762         {
 763                 return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(Ty, 0)));
 764         }
 765
 766         Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
 767         {
 768                 assert(llvm::isa<VectorType>(type));
 769                 const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
 770                 assert(numConstants <= 16);
 771                 llvm::Constant *constantVector[16];
 772
 773                 for(int i = 0; i < numConstants; i++)
 774                 {
 775                         constantVector[i] = llvm::ConstantInt::get(type->getContainedType(0), constants[i]);
 776                 }
 777
 778                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
 779         }
 780
 781         Value *Nucleus::createConstantVector(const double *constants, Type *type)
 782         {
 783                 assert(llvm::isa<VectorType>(type));
 784                 const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
 785                 assert(numConstants <= 8);
 786                 llvm::Constant *constantVector[8];
 787
 788                 for(int i = 0; i < numConstants; i++)
 789                 {
 790                         constantVector[i] = llvm::ConstantFP::get(type->getContainedType(0), constants[i]);
 791                 }
 792
 793                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
 794         }
 795
 796         Type *Void::getType()
 797         {
 798                 return T(llvm::Type::getVoidTy(*::context));
 799         }
 800
        // Reactor variable type whose getType() returns LLVM's x86_mmx type.
        // Declared locally in this source file.
        class MMX : public LValue<MMX>
        {
        public:
                static Type *getType();
        };
 806
 807         Type *MMX::getType()
 808         {
 809                 return T(llvm::Type::getX86_MMXTy(*::context));
 810         }
 811
 812         Bool::Bool(Argument<Bool> argument)
 813         {
 814                 storeValue(argument.value);
 815         }
 816
        // Default constructor: stores no initial value.
        Bool::Bool()
        {
        }
 820
 821         Bool::Bool(bool x)
 822         {
 823                 storeValue(Nucleus::createConstantBool(x));
 824         }
 825
 826         Bool::Bool(RValue<Bool> rhs)
 827         {
 828                 storeValue(rhs.value);
 829         }
 830
 831         Bool::Bool(const Bool &rhs)
 832         {
 833                 Value *value = rhs.loadValue();
 834                 storeValue(value);
 835         }
 836
 837         Bool::Bool(const Reference<Bool> &rhs)
 838         {
 839                 Value *value = rhs.loadValue();
 840                 storeValue(value);
 841         }
 842
 843         RValue<Bool> Bool::operator=(RValue<Bool> rhs)
 844         {
 845                 storeValue(rhs.value);
 846
 847                 return rhs;
 848         }
 849
 850         RValue<Bool> Bool::operator=(const Bool &rhs)
 851         {
 852                 Value *value = rhs.loadValue();
 853                 storeValue(value);
 854
 855                 return RValue<Bool>(value);
 856         }
 857
 858         RValue<Bool> Bool::operator=(const Reference<Bool> &rhs)
 859         {
 860                 Value *value = rhs.loadValue();
 861                 storeValue(value);
 862
 863                 return RValue<Bool>(value);
 864         }
 865
 866         RValue<Bool> operator!(RValue<Bool> val)
 867         {
 868                 return RValue<Bool>(Nucleus::createNot(val.value));
 869         }
 870
 871         RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs)
 872         {
 873                 return RValue<Bool>(Nucleus::createAnd(lhs.value, rhs.value));
 874         }
 875
 876         RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs)
 877         {
 878                 return RValue<Bool>(Nucleus::createOr(lhs.value, rhs.value));
 879         }
 880
 881         Type *Bool::getType()
 882         {
 883                 return T(llvm::Type::getInt1Ty(*::context));
 884         }
 885
 886         Byte::Byte(Argument<Byte> argument)
 887         {
 888                 storeValue(argument.value);
 889         }
 890
 891         Byte::Byte(RValue<Int> cast)
 892         {
 893                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 894
 895                 storeValue(integer);
 896         }
 897
 898         Byte::Byte(RValue<UInt> cast)
 899         {
 900                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 901
 902                 storeValue(integer);
 903         }
 904
 905         Byte::Byte(RValue<UShort> cast)
 906         {
 907                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 908
 909                 storeValue(integer);
 910         }
 911
 912         Byte::Byte()
 913         {
 914         }
 915
 916         Byte::Byte(int x)
 917         {
 918                 storeValue(Nucleus::createConstantByte((unsigned char)x));
 919         }
 920
 921         Byte::Byte(unsigned char x)
 922         {
 923                 storeValue(Nucleus::createConstantByte(x));
 924         }
 925
 926         Byte::Byte(RValue<Byte> rhs)
 927         {
 928                 storeValue(rhs.value);
 929         }
 930
 931         Byte::Byte(const Byte &rhs)
 932         {
 933                 Value *value = rhs.loadValue();
 934                 storeValue(value);
 935         }
 936
 937         Byte::Byte(const Reference<Byte> &rhs)
 938         {
 939                 Value *value = rhs.loadValue();
 940                 storeValue(value);
 941         }
 942
 943         RValue<Byte> Byte::operator=(RValue<Byte> rhs)
 944         {
 945                 storeValue(rhs.value);
 946
 947                 return rhs;
 948         }
 949
 950         RValue<Byte> Byte::operator=(const Byte &rhs)
 951         {
 952                 Value *value = rhs.loadValue();
 953                 storeValue(value);
 954
 955                 return RValue<Byte>(value);
 956         }
 957
 958         RValue<Byte> Byte::operator=(const Reference<Byte> &rhs)
 959         {
 960                 Value *value = rhs.loadValue();
 961                 storeValue(value);
 962
 963                 return RValue<Byte>(value);
 964         }
 965
 966         RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs)
 967         {
 968                 return RValue<Byte>(Nucleus::createAdd(lhs.value, rhs.value));
 969         }
 970
 971         RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs)
 972         {
 973                 return RValue<Byte>(Nucleus::createSub(lhs.value, rhs.value));
 974         }
 975
 976         RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs)
 977         {
 978                 return RValue<Byte>(Nucleus::createMul(lhs.value, rhs.value));
 979         }
 980
 981         RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs)
 982         {
 983                 return RValue<Byte>(Nucleus::createUDiv(lhs.value, rhs.value));
 984         }
 985
 986         RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs)
 987         {
 988                 return RValue<Byte>(Nucleus::createURem(lhs.value, rhs.value));
 989         }
 990
 991         RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs)
 992         {
 993                 return RValue<Byte>(Nucleus::createAnd(lhs.value, rhs.value));
 994         }
 995
 996         RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs)
 997         {
 998                 return RValue<Byte>(Nucleus::createOr(lhs.value, rhs.value));
 999         }
1000
1001         RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs)
1002         {
1003                 return RValue<Byte>(Nucleus::createXor(lhs.value, rhs.value));
1004         }
1005
1006         RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs)
1007         {
1008                 return RValue<Byte>(Nucleus::createShl(lhs.value, rhs.value));
1009         }
1010
1011         RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs)
1012         {
1013                 return RValue<Byte>(Nucleus::createLShr(lhs.value, rhs.value));
1014         }
1015
1016         RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs)
1017         {
1018                 return lhs = lhs + rhs;
1019         }
1020
1021         RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs)
1022         {
1023                 return lhs = lhs - rhs;
1024         }
1025
1026         RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs)
1027         {
1028                 return lhs = lhs * rhs;
1029         }
1030
1031         RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs)
1032         {
1033                 return lhs = lhs / rhs;
1034         }
1035
1036         RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs)
1037         {
1038                 return lhs = lhs % rhs;
1039         }
1040
1041         RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs)
1042         {
1043                 return lhs = lhs & rhs;
1044         }
1045
1046         RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs)
1047         {
1048                 return lhs = lhs | rhs;
1049         }
1050
1051         RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs)
1052         {
1053                 return lhs = lhs ^ rhs;
1054         }
1055
1056         RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs)
1057         {
1058                 return lhs = lhs << rhs;
1059         }
1060
1061         RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs)
1062         {
1063                 return lhs = lhs >> rhs;
1064         }
1065
1066         RValue<Byte> operator+(RValue<Byte> val)
1067         {
1068                 return val;
1069         }
1070
1071         RValue<Byte> operator-(RValue<Byte> val)
1072         {
1073                 return RValue<Byte>(Nucleus::createNeg(val.value));
1074         }
1075
1076         RValue<Byte> operator~(RValue<Byte> val)
1077         {
1078                 return RValue<Byte>(Nucleus::createNot(val.value));
1079         }
1080
1081         RValue<Byte> operator++(Byte &val, int)   // Post-increment
1082         {
1083                 RValue<Byte> res = val;
1084
1085                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantByte((unsigned char)1)));
1086                 val.storeValue(inc);
1087
1088                 return res;
1089         }
1090
1091         const Byte &operator++(Byte &val)   // Pre-increment
1092         {
1093                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantByte((unsigned char)1)));
1094                 val.storeValue(inc);
1095
1096                 return val;
1097         }
1098
1099         RValue<Byte> operator--(Byte &val, int)   // Post-decrement
1100         {
1101                 RValue<Byte> res = val;
1102
1103                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantByte((unsigned char)1)));
1104                 val.storeValue(inc);
1105
1106                 return res;
1107         }
1108
1109         const Byte &operator--(Byte &val)   // Pre-decrement
1110         {
1111                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantByte((unsigned char)1)));
1112                 val.storeValue(inc);
1113
1114                 return val;
1115         }
1116
1117         RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs)
1118         {
1119                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
1120         }
1121
1122         RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs)
1123         {
1124                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
1125         }
1126
1127         RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs)
1128         {
1129                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
1130         }
1131
1132         RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs)
1133         {
1134                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
1135         }
1136
1137         RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs)
1138         {
1139                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1140         }
1141
1142         RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs)
1143         {
1144                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1145         }
1146
1147         Type *Byte::getType()
1148         {
1149                 return T(llvm::Type::getInt8Ty(*::context));
1150         }
1151
1152         SByte::SByte(Argument<SByte> argument)
1153         {
1154                 storeValue(argument.value);
1155         }
1156
1157         SByte::SByte(RValue<Int> cast)
1158         {
1159                 Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
1160
1161                 storeValue(integer);
1162         }
1163
1164         SByte::SByte(RValue<Short> cast)
1165         {
1166                 Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
1167
1168                 storeValue(integer);
1169         }
1170
1171         SByte::SByte()
1172         {
1173         }
1174
1175         SByte::SByte(signed char x)
1176         {
1177                 storeValue(Nucleus::createConstantByte(x));
1178         }
1179
1180         SByte::SByte(RValue<SByte> rhs)
1181         {
1182                 storeValue(rhs.value);
1183         }
1184
1185         SByte::SByte(const SByte &rhs)
1186         {
1187                 Value *value = rhs.loadValue();
1188                 storeValue(value);
1189         }
1190
1191         SByte::SByte(const Reference<SByte> &rhs)
1192         {
1193                 Value *value = rhs.loadValue();
1194                 storeValue(value);
1195         }
1196
1197         RValue<SByte> SByte::operator=(RValue<SByte> rhs)
1198         {
1199                 storeValue(rhs.value);
1200
1201                 return rhs;
1202         }
1203
1204         RValue<SByte> SByte::operator=(const SByte &rhs)
1205         {
1206                 Value *value = rhs.loadValue();
1207                 storeValue(value);
1208
1209                 return RValue<SByte>(value);
1210         }
1211
1212         RValue<SByte> SByte::operator=(const Reference<SByte> &rhs)
1213         {
1214                 Value *value = rhs.loadValue();
1215                 storeValue(value);
1216
1217                 return RValue<SByte>(value);
1218         }
1219
1220         RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs)
1221         {
1222                 return RValue<SByte>(Nucleus::createAdd(lhs.value, rhs.value));
1223         }
1224
1225         RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs)
1226         {
1227                 return RValue<SByte>(Nucleus::createSub(lhs.value, rhs.value));
1228         }
1229
1230         RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs)
1231         {
1232                 return RValue<SByte>(Nucleus::createMul(lhs.value, rhs.value));
1233         }
1234
1235         RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs)
1236         {
1237                 return RValue<SByte>(Nucleus::createSDiv(lhs.value, rhs.value));
1238         }
1239
1240         RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs)
1241         {
1242                 return RValue<SByte>(Nucleus::createSRem(lhs.value, rhs.value));
1243         }
1244
1245         RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs)
1246         {
1247                 return RValue<SByte>(Nucleus::createAnd(lhs.value, rhs.value));
1248         }
1249
1250         RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs)
1251         {
1252                 return RValue<SByte>(Nucleus::createOr(lhs.value, rhs.value));
1253         }
1254
1255         RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs)
1256         {
1257                 return RValue<SByte>(Nucleus::createXor(lhs.value, rhs.value));
1258         }
1259
1260         RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs)
1261         {
1262                 return RValue<SByte>(Nucleus::createShl(lhs.value, rhs.value));
1263         }
1264
1265         RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs)
1266         {
1267                 return RValue<SByte>(Nucleus::createAShr(lhs.value, rhs.value));
1268         }
1269
1270         RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs)
1271         {
1272                 return lhs = lhs + rhs;
1273         }
1274
1275         RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs)
1276         {
1277                 return lhs = lhs - rhs;
1278         }
1279
1280         RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs)
1281         {
1282                 return lhs = lhs * rhs;
1283         }
1284
1285         RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs)
1286         {
1287                 return lhs = lhs / rhs;
1288         }
1289
1290         RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs)
1291         {
1292                 return lhs = lhs % rhs;
1293         }
1294
1295         RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs)
1296         {
1297                 return lhs = lhs & rhs;
1298         }
1299
1300         RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs)
1301         {
1302                 return lhs = lhs | rhs;
1303         }
1304
1305         RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs)
1306         {
1307                 return lhs = lhs ^ rhs;
1308         }
1309
1310         RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs)
1311         {
1312                 return lhs = lhs << rhs;
1313         }
1314
1315         RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs)
1316         {
1317                 return lhs = lhs >> rhs;
1318         }
1319
1320         RValue<SByte> operator+(RValue<SByte> val)
1321         {
1322                 return val;
1323         }
1324
1325         RValue<SByte> operator-(RValue<SByte> val)
1326         {
1327                 return RValue<SByte>(Nucleus::createNeg(val.value));
1328         }
1329
1330         RValue<SByte> operator~(RValue<SByte> val)
1331         {
1332                 return RValue<SByte>(Nucleus::createNot(val.value));
1333         }
1334
1335         RValue<SByte> operator++(SByte &val, int)   // Post-increment
1336         {
1337                 RValue<SByte> res = val;
1338
1339                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantByte((signed char)1)));
1340                 val.storeValue(inc);
1341
1342                 return res;
1343         }
1344
1345         const SByte &operator++(SByte &val)   // Pre-increment
1346         {
1347                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantByte((signed char)1)));
1348                 val.storeValue(inc);
1349
1350                 return val;
1351         }
1352
1353         RValue<SByte> operator--(SByte &val, int)   // Post-decrement
1354         {
1355                 RValue<SByte> res = val;
1356
1357                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantByte((signed char)1)));
1358                 val.storeValue(inc);
1359
1360                 return res;
1361         }
1362
1363         const SByte &operator--(SByte &val)   // Pre-decrement
1364         {
1365                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantByte((signed char)1)));
1366                 val.storeValue(inc);
1367
1368                 return val;
1369         }
1370
1371         RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs)
1372         {
1373                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
1374         }
1375
1376         RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs)
1377         {
1378                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
1379         }
1380
1381         RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs)
1382         {
1383                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
1384         }
1385
1386         RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs)
1387         {
1388                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
1389         }
1390
1391         RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs)
1392         {
1393                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1394         }
1395
1396         RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs)
1397         {
1398                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1399         }
1400
1401         Type *SByte::getType()
1402         {
1403                 return T(llvm::Type::getInt8Ty(*::context));
1404         }
1405
1406         Short::Short(Argument<Short> argument)
1407         {
1408                 storeValue(argument.value);
1409         }
1410
1411         Short::Short(RValue<Int> cast)
1412         {
1413                 Value *integer = Nucleus::createTrunc(cast.value, Short::getType());
1414
1415                 storeValue(integer);
1416         }
1417
1418         Short::Short()
1419         {
1420         }
1421
1422         Short::Short(short x)
1423         {
1424                 storeValue(Nucleus::createConstantShort(x));
1425         }
1426
1427         Short::Short(RValue<Short> rhs)
1428         {
1429                 storeValue(rhs.value);
1430         }
1431
1432         Short::Short(const Short &rhs)
1433         {
1434                 Value *value = rhs.loadValue();
1435                 storeValue(value);
1436         }
1437
1438         Short::Short(const Reference<Short> &rhs)
1439         {
1440                 Value *value = rhs.loadValue();
1441                 storeValue(value);
1442         }
1443
1444         RValue<Short> Short::operator=(RValue<Short> rhs)
1445         {
1446                 storeValue(rhs.value);
1447
1448                 return rhs;
1449         }
1450
1451         RValue<Short> Short::operator=(const Short &rhs)
1452         {
1453                 Value *value = rhs.loadValue();
1454                 storeValue(value);
1455
1456                 return RValue<Short>(value);
1457         }
1458
1459         RValue<Short> Short::operator=(const Reference<Short> &rhs)
1460         {
1461                 Value *value = rhs.loadValue();
1462                 storeValue(value);
1463
1464                 return RValue<Short>(value);
1465         }
1466
1467         RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs)
1468         {
1469                 return RValue<Short>(Nucleus::createAdd(lhs.value, rhs.value));
1470         }
1471
1472         RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs)
1473         {
1474                 return RValue<Short>(Nucleus::createSub(lhs.value, rhs.value));
1475         }
1476
1477         RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs)
1478         {
1479                 return RValue<Short>(Nucleus::createMul(lhs.value, rhs.value));
1480         }
1481
1482         RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs)
1483         {
1484                 return RValue<Short>(Nucleus::createSDiv(lhs.value, rhs.value));
1485         }
1486
1487         RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs)
1488         {
1489                 return RValue<Short>(Nucleus::createSRem(lhs.value, rhs.value));
1490         }
1491
1492         RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs)
1493         {
1494                 return RValue<Short>(Nucleus::createAnd(lhs.value, rhs.value));
1495         }
1496
1497         RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs)
1498         {
1499                 return RValue<Short>(Nucleus::createOr(lhs.value, rhs.value));
1500         }
1501
1502         RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs)
1503         {
1504                 return RValue<Short>(Nucleus::createXor(lhs.value, rhs.value));
1505         }
1506
1507         RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs)
1508         {
1509                 return RValue<Short>(Nucleus::createShl(lhs.value, rhs.value));
1510         }
1511
1512         RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs)
1513         {
1514                 return RValue<Short>(Nucleus::createAShr(lhs.value, rhs.value));
1515         }
1516
1517         RValue<Short> operator+=(Short &lhs, RValue<Short> rhs)
1518         {
1519                 return lhs = lhs + rhs;
1520         }
1521
1522         RValue<Short> operator-=(Short &lhs, RValue<Short> rhs)
1523         {
1524                 return lhs = lhs - rhs;
1525         }
1526
1527         RValue<Short> operator*=(Short &lhs, RValue<Short> rhs)
1528         {
1529                 return lhs = lhs * rhs;
1530         }
1531
1532         RValue<Short> operator/=(Short &lhs, RValue<Short> rhs)
1533         {
1534                 return lhs = lhs / rhs;
1535         }
1536
1537         RValue<Short> operator%=(Short &lhs, RValue<Short> rhs)
1538         {
1539                 return lhs = lhs % rhs;
1540         }
1541
1542         RValue<Short> operator&=(Short &lhs, RValue<Short> rhs)
1543         {
1544                 return lhs = lhs & rhs;
1545         }
1546
1547         RValue<Short> operator|=(Short &lhs, RValue<Short> rhs)
1548         {
1549                 return lhs = lhs | rhs;
1550         }
1551
1552         RValue<Short> operator^=(Short &lhs, RValue<Short> rhs)
1553         {
1554                 return lhs = lhs ^ rhs;
1555         }
1556
1557         RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs)
1558         {
1559                 return lhs = lhs << rhs;
1560         }
1561
1562         RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs)
1563         {
1564                 return lhs = lhs >> rhs;
1565         }
1566
1567         RValue<Short> operator+(RValue<Short> val)
1568         {
1569                 return val;
1570         }
1571
1572         RValue<Short> operator-(RValue<Short> val)
1573         {
1574                 return RValue<Short>(Nucleus::createNeg(val.value));
1575         }
1576
1577         RValue<Short> operator~(RValue<Short> val)
1578         {
1579                 return RValue<Short>(Nucleus::createNot(val.value));
1580         }
1581
1582         RValue<Short> operator++(Short &val, int)   // Post-increment
1583         {
1584                 RValue<Short> res = val;
1585
1586                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantShort((short)1)));
1587                 val.storeValue(inc);
1588
1589                 return res;
1590         }
1591
1592         const Short &operator++(Short &val)   // Pre-increment
1593         {
1594                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantShort((short)1)));
1595                 val.storeValue(inc);
1596
1597                 return val;
1598         }
1599
1600         RValue<Short> operator--(Short &val, int)   // Post-decrement
1601         {
1602                 RValue<Short> res = val;
1603
1604                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantShort((short)1)));
1605                 val.storeValue(inc);
1606
1607                 return res;
1608         }
1609
1610         const Short &operator--(Short &val)   // Pre-decrement
1611         {
1612                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantShort((short)1)));
1613                 val.storeValue(inc);
1614
1615                 return val;
1616         }
1617
1618         RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs)
1619         {
1620                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
1621         }
1622
1623         RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs)
1624         {
1625                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
1626         }
1627
1628         RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs)
1629         {
1630                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
1631         }
1632
1633         RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs)
1634         {
1635                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
1636         }
1637
1638         RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs)
1639         {
1640                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1641         }
1642
1643         RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs)
1644         {
1645                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1646         }
1647
1648         Type *Short::getType()
1649         {
1650                 return T(llvm::Type::getInt16Ty(*::context));
1651         }
1652
1653         UShort::UShort(Argument<UShort> argument)
1654         {
1655                 storeValue(argument.value);
1656         }
1657
1658         UShort::UShort(RValue<UInt> cast)
1659         {
1660                 Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
1661
1662                 storeValue(integer);
1663         }
1664
1665         UShort::UShort(RValue<Int> cast)
1666         {
1667                 Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
1668
1669                 storeValue(integer);
1670         }
1671
1672         UShort::UShort()
1673         {
1674         }
1675
1676         UShort::UShort(unsigned short x)
1677         {
1678                 storeValue(Nucleus::createConstantShort(x));
1679         }
1680
1681         UShort::UShort(RValue<UShort> rhs)
1682         {
1683                 storeValue(rhs.value);
1684         }
1685
1686         UShort::UShort(const UShort &rhs)
1687         {
1688                 Value *value = rhs.loadValue();
1689                 storeValue(value);
1690         }
1691
1692         UShort::UShort(const Reference<UShort> &rhs)
1693         {
1694                 Value *value = rhs.loadValue();
1695                 storeValue(value);
1696         }
1697
1698         RValue<UShort> UShort::operator=(RValue<UShort> rhs)
1699         {
1700                 storeValue(rhs.value);
1701
1702                 return rhs;
1703         }
1704
1705         RValue<UShort> UShort::operator=(const UShort &rhs)
1706         {
1707                 Value *value = rhs.loadValue();
1708                 storeValue(value);
1709
1710                 return RValue<UShort>(value);
1711         }
1712
1713         RValue<UShort> UShort::operator=(const Reference<UShort> &rhs)
1714         {
1715                 Value *value = rhs.loadValue();
1716                 storeValue(value);
1717
1718                 return RValue<UShort>(value);
1719         }
1720
1721         RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs)
1722         {
1723                 return RValue<UShort>(Nucleus::createAdd(lhs.value, rhs.value));
1724         }
1725
1726         RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs)
1727         {
1728                 return RValue<UShort>(Nucleus::createSub(lhs.value, rhs.value));
1729         }
1730
1731         RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs)
1732         {
1733                 return RValue<UShort>(Nucleus::createMul(lhs.value, rhs.value));
1734         }
1735
1736         RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs)
1737         {
1738                 return RValue<UShort>(Nucleus::createUDiv(lhs.value, rhs.value));
1739         }
1740
1741         RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs)
1742         {
1743                 return RValue<UShort>(Nucleus::createURem(lhs.value, rhs.value));
1744         }
1745
1746         RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs)
1747         {
1748                 return RValue<UShort>(Nucleus::createAnd(lhs.value, rhs.value));
1749         }
1750
1751         RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs)
1752         {
1753                 return RValue<UShort>(Nucleus::createOr(lhs.value, rhs.value));
1754         }
1755
1756         RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs)
1757         {
1758                 return RValue<UShort>(Nucleus::createXor(lhs.value, rhs.value));
1759         }
1760
1761         RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs)
1762         {
1763                 return RValue<UShort>(Nucleus::createShl(lhs.value, rhs.value));
1764         }
1765
1766         RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs)
1767         {
1768                 return RValue<UShort>(Nucleus::createLShr(lhs.value, rhs.value));
1769         }
1770
1771         RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs)
1772         {
1773                 return lhs = lhs + rhs;
1774         }
1775
1776         RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs)
1777         {
1778                 return lhs = lhs - rhs;
1779         }
1780
1781         RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs)
1782         {
1783                 return lhs = lhs * rhs;
1784         }
1785
1786         RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs)
1787         {
1788                 return lhs = lhs / rhs;
1789         }
1790
1791         RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs)
1792         {
1793                 return lhs = lhs % rhs;
1794         }
1795
1796         RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs)
1797         {
1798                 return lhs = lhs & rhs;
1799         }
1800
1801         RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs)
1802         {
1803                 return lhs = lhs | rhs;
1804         }
1805
1806         RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs)
1807         {
1808                 return lhs = lhs ^ rhs;
1809         }
1810
1811         RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs)
1812         {
1813                 return lhs = lhs << rhs;
1814         }
1815
1816         RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs)
1817         {
1818                 return lhs = lhs >> rhs;
1819         }
1820
1821         RValue<UShort> operator+(RValue<UShort> val)
1822         {
1823                 return val;
1824         }
1825
1826         RValue<UShort> operator-(RValue<UShort> val)
1827         {
1828                 return RValue<UShort>(Nucleus::createNeg(val.value));
1829         }
1830
1831         RValue<UShort> operator~(RValue<UShort> val)
1832         {
1833                 return RValue<UShort>(Nucleus::createNot(val.value));
1834         }
1835
1836         RValue<UShort> operator++(UShort &val, int)   // Post-increment
1837         {
1838                 RValue<UShort> res = val;
1839
1840                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantShort((unsigned short)1)));
1841                 val.storeValue(inc);
1842
1843                 return res;
1844         }
1845
1846         const UShort &operator++(UShort &val)   // Pre-increment
1847         {
1848                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantShort((unsigned short)1)));
1849                 val.storeValue(inc);
1850
1851                 return val;
1852         }
1853
1854         RValue<UShort> operator--(UShort &val, int)   // Post-decrement
1855         {
1856                 RValue<UShort> res = val;
1857
1858                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantShort((unsigned short)1)));
1859                 val.storeValue(inc);
1860
1861                 return res;
1862         }
1863
1864         const UShort &operator--(UShort &val)   // Pre-decrement
1865         {
1866                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantShort((unsigned short)1)));
1867                 val.storeValue(inc);
1868
1869                 return val;
1870         }
1871
1872         RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs)
1873         {
1874                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
1875         }
1876
1877         RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs)
1878         {
1879                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
1880         }
1881
1882         RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs)
1883         {
1884                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
1885         }
1886
1887         RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs)
1888         {
1889                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
1890         }
1891
1892         RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs)
1893         {
1894                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1895         }
1896
1897         RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs)
1898         {
1899                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1900         }
1901
1902         Type *UShort::getType()
1903         {
1904                 return T(llvm::Type::getInt16Ty(*::context));
1905         }
1906
1907         Byte4::Byte4(RValue<Byte8> cast)
1908         {
1909         //      xyzw.parent = this;
1910
1911                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), Int::getType()));
1912         }
1913
1914         Byte4::Byte4(const Reference<Byte4> &rhs)
1915         {
1916         //      xyzw.parent = this;
1917
1918                 Value *value = rhs.loadValue();
1919                 storeValue(value);
1920         }
1921
1922         Type *Byte4::getType()
1923         {
1924                 #if 0
1925                         return T(VectorType::get(Byte::getType(), 4));
1926                 #else
1927                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
1928                 #endif
1929         }
1930
1931         Type *SByte4::getType()
1932         {
1933                 #if 0
1934                         return T(VectorType::get(SByte::getType(), 4));
1935                 #else
1936                         return Int::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
1937                 #endif
1938         }
1939
1940         Byte8::Byte8()
1941         {
1942         //      xyzw.parent = this;
1943         }
1944
1945         Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
1946         {
1947         //      xyzw.parent = this;
1948
1949                 int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
1950                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Byte::getType(), 8))));
1951
1952                 storeValue(Nucleus::createBitCast(vector, getType()));
1953         }
1954
1955         Byte8::Byte8(RValue<Byte8> rhs)
1956         {
1957         //      xyzw.parent = this;
1958
1959                 storeValue(rhs.value);
1960         }
1961
1962         Byte8::Byte8(const Byte8 &rhs)
1963         {
1964         //      xyzw.parent = this;
1965
1966                 Value *value = rhs.loadValue();
1967                 storeValue(value);
1968         }
1969
1970         Byte8::Byte8(const Reference<Byte8> &rhs)
1971         {
1972         //      xyzw.parent = this;
1973
1974                 Value *value = rhs.loadValue();
1975                 storeValue(value);
1976         }
1977
1978         RValue<Byte8> Byte8::operator=(RValue<Byte8> rhs)
1979         {
1980                 storeValue(rhs.value);
1981
1982                 return rhs;
1983         }
1984
1985         RValue<Byte8> Byte8::operator=(const Byte8 &rhs)
1986         {
1987                 Value *value = rhs.loadValue();
1988                 storeValue(value);
1989
1990                 return RValue<Byte8>(value);
1991         }
1992
1993         RValue<Byte8> Byte8::operator=(const Reference<Byte8> &rhs)
1994         {
1995                 Value *value = rhs.loadValue();
1996                 storeValue(value);
1997
1998                 return RValue<Byte8>(value);
1999         }
2000
2001         RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs)
2002         {
2003                 if(CPUID::supportsMMX2())
2004                 {
2005                         return x86::paddb(lhs, rhs);
2006                 }
2007                 else
2008                 {
2009                         return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
2010                 }
2011         }
2012
2013         RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs)
2014         {
2015                 if(CPUID::supportsMMX2())
2016                 {
2017                         return x86::psubb(lhs, rhs);
2018                 }
2019                 else
2020                 {
2021                         return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
2022                 }
2023         }
2024
2025 //      RValue<Byte8> operator*(RValue<Byte8> lhs, RValue<Byte8> rhs)
2026 //      {
2027 //              return RValue<Byte8>(Nucleus::createMul(lhs.value, rhs.value));
2028 //      }
2029
2030 //      RValue<Byte8> operator/(RValue<Byte8> lhs, RValue<Byte8> rhs)
2031 //      {
2032 //              return RValue<Byte8>(Nucleus::createUDiv(lhs.value, rhs.value));
2033 //      }
2034
2035 //      RValue<Byte8> operator%(RValue<Byte8> lhs, RValue<Byte8> rhs)
2036 //      {
2037 //              return RValue<Byte8>(Nucleus::createURem(lhs.value, rhs.value));
2038 //      }
2039
2040         RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs)
2041         {
2042                 if(CPUID::supportsMMX2())
2043                 {
2044                         return As<Byte8>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
2045                 }
2046                 else
2047                 {
2048                         return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
2049                 }
2050         }
2051
2052         RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs)
2053         {
2054                 if(CPUID::supportsMMX2())
2055                 {
2056                         return As<Byte8>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
2057                 }
2058                 else
2059                 {
2060                         return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
2061                 }
2062         }
2063
2064         RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs)
2065         {
2066                 if(CPUID::supportsMMX2())
2067                 {
2068                         return As<Byte8>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
2069                 }
2070                 else
2071                 {
2072                         return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
2073                 }
2074         }
2075
2076 //      RValue<Byte8> operator<<(RValue<Byte8> lhs, unsigned char rhs)
2077 //      {
2078 //              return RValue<Byte8>(Nucleus::createShl(lhs.value, rhs.value));
2079 //      }
2080
2081 //      RValue<Byte8> operator>>(RValue<Byte8> lhs, unsigned char rhs)
2082 //      {
2083 //              return RValue<Byte8>(Nucleus::createLShr(lhs.value, rhs.value));
2084 //      }
2085
2086         RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs)
2087         {
2088                 return lhs = lhs + rhs;
2089         }
2090
2091         RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs)
2092         {
2093                 return lhs = lhs - rhs;
2094         }
2095
2096 //      RValue<Byte8> operator*=(Byte8 &lhs, RValue<Byte8> rhs)
2097 //      {
2098 //              return lhs = lhs * rhs;
2099 //      }
2100
2101 //      RValue<Byte8> operator/=(Byte8 &lhs, RValue<Byte8> rhs)
2102 //      {
2103 //              return lhs = lhs / rhs;
2104 //      }
2105
2106 //      RValue<Byte8> operator%=(Byte8 &lhs, RValue<Byte8> rhs)
2107 //      {
2108 //              return lhs = lhs % rhs;
2109 //      }
2110
2111         RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs)
2112         {
2113                 return lhs = lhs & rhs;
2114         }
2115
2116         RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs)
2117         {
2118                 return lhs = lhs | rhs;
2119         }
2120
2121         RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs)
2122         {
2123                 return lhs = lhs ^ rhs;
2124         }
2125
2126 //      RValue<Byte8> operator<<=(Byte8 &lhs, RValue<Byte8> rhs)
2127 //      {
2128 //              return lhs = lhs << rhs;
2129 //      }
2130
2131 //      RValue<Byte8> operator>>=(Byte8 &lhs, RValue<Byte8> rhs)
2132 //      {
2133 //              return lhs = lhs >> rhs;
2134 //      }
2135
2136 //      RValue<Byte8> operator+(RValue<Byte8> val)
2137 //      {
2138 //              return val;
2139 //      }
2140
2141 //      RValue<Byte8> operator-(RValue<Byte8> val)
2142 //      {
2143 //              return RValue<Byte8>(Nucleus::createNeg(val.value));
2144 //      }
2145
2146         RValue<Byte8> operator~(RValue<Byte8> val)
2147         {
2148                 if(CPUID::supportsMMX2())
2149                 {
2150                         return val ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
2151                 }
2152                 else
2153                 {
2154                         return RValue<Byte8>(Nucleus::createNot(val.value));
2155                 }
2156         }
2157
2158         RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
2159         {
2160                 return x86::paddusb(x, y);
2161         }
2162
2163         RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
2164         {
2165                 return x86::psubusb(x, y);
2166         }
2167
2168         RValue<Short4> Unpack(RValue<Byte4> x)
2169         {
2170                 Value *int2 = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), x.value, 0);
2171                 Value *byte8 = Nucleus::createBitCast(int2, Byte8::getType());
2172
2173                 return UnpackLow(RValue<Byte8>(byte8), RValue<Byte8>(byte8));
2174         }
2175
2176         RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
2177         {
2178                 if(CPUID::supportsMMX2())
2179                 {
2180                         return x86::punpcklbw(x, y);
2181                 }
2182                 else
2183                 {
2184                         int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2185                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2186
2187                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2188                 }
2189         }
2190
2191         RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
2192         {
2193                 if(CPUID::supportsMMX2())
2194                 {
2195                         return x86::punpckhbw(x, y);
2196                 }
2197                 else
2198                 {
2199                         int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
2200                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2201
2202                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2203                 }
2204         }
2205
2206         RValue<Int> SignMask(RValue<Byte8> x)
2207         {
2208                 return x86::pmovmskb(x);
2209         }
2210
2211 //      RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
2212 //      {
2213 //              return x86::pcmpgtb(x, y);   // FIXME: Signedness
2214 //      }
2215
2216         RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
2217         {
2218                 return x86::pcmpeqb(x, y);
2219         }
2220
2221         Type *Byte8::getType()
2222         {
2223                 if(CPUID::supportsMMX2())
2224                 {
2225                         return MMX::getType();
2226                 }
2227                 else
2228                 {
2229                         return T(VectorType::get(Byte::getType(), 8));
2230                 }
2231         }
2232
2233         SByte8::SByte8()
2234         {
2235         //      xyzw.parent = this;
2236         }
2237
2238         SByte8::SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
2239         {
2240         //      xyzw.parent = this;
2241
2242                 int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
2243                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(SByte::getType(), 8))));
2244
2245                 storeValue(Nucleus::createBitCast(vector, getType()));
2246         }
2247
2248         SByte8::SByte8(RValue<SByte8> rhs)
2249         {
2250         //      xyzw.parent = this;
2251
2252                 storeValue(rhs.value);
2253         }
2254
2255         SByte8::SByte8(const SByte8 &rhs)
2256         {
2257         //      xyzw.parent = this;
2258
2259                 Value *value = rhs.loadValue();
2260                 storeValue(value);
2261         }
2262
2263         SByte8::SByte8(const Reference<SByte8> &rhs)
2264         {
2265         //      xyzw.parent = this;
2266
2267                 Value *value = rhs.loadValue();
2268                 storeValue(value);
2269         }
2270
2271         RValue<SByte8> SByte8::operator=(RValue<SByte8> rhs)
2272         {
2273                 storeValue(rhs.value);
2274
2275                 return rhs;
2276         }
2277
2278         RValue<SByte8> SByte8::operator=(const SByte8 &rhs)
2279         {
2280                 Value *value = rhs.loadValue();
2281                 storeValue(value);
2282
2283                 return RValue<SByte8>(value);
2284         }
2285
2286         RValue<SByte8> SByte8::operator=(const Reference<SByte8> &rhs)
2287         {
2288                 Value *value = rhs.loadValue();
2289                 storeValue(value);
2290
2291                 return RValue<SByte8>(value);
2292         }
2293
2294         RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs)
2295         {
2296                 if(CPUID::supportsMMX2())
2297                 {
2298                         return As<SByte8>(x86::paddb(As<Byte8>(lhs), As<Byte8>(rhs)));
2299                 }
2300                 else
2301                 {
2302                         return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
2303                 }
2304         }
2305
2306         RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs)
2307         {
2308                 if(CPUID::supportsMMX2())
2309                 {
2310                         return As<SByte8>(x86::psubb(As<Byte8>(lhs), As<Byte8>(rhs)));
2311                 }
2312                 else
2313                 {
2314                         return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
2315                 }
2316         }
2317
2318 //      RValue<SByte8> operator*(RValue<SByte8> lhs, RValue<SByte8> rhs)
2319 //      {
2320 //              return RValue<SByte8>(Nucleus::createMul(lhs.value, rhs.value));
2321 //      }
2322
2323 //      RValue<SByte8> operator/(RValue<SByte8> lhs, RValue<SByte8> rhs)
2324 //      {
2325 //              return RValue<SByte8>(Nucleus::createSDiv(lhs.value, rhs.value));
2326 //      }
2327
2328 //      RValue<SByte8> operator%(RValue<SByte8> lhs, RValue<SByte8> rhs)
2329 //      {
2330 //              return RValue<SByte8>(Nucleus::createSRem(lhs.value, rhs.value));
2331 //      }
2332
2333         RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs)
2334         {
2335                 return RValue<SByte8>(Nucleus::createAnd(lhs.value, rhs.value));
2336         }
2337
2338         RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs)
2339         {
2340                 return RValue<SByte8>(Nucleus::createOr(lhs.value, rhs.value));
2341         }
2342
2343         RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs)
2344         {
2345                 return RValue<SByte8>(Nucleus::createXor(lhs.value, rhs.value));
2346         }
2347
2348 //      RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
2349 //      {
2350 //              return RValue<SByte8>(Nucleus::createShl(lhs.value, rhs.value));
2351 //      }
2352
2353 //      RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
2354 //      {
2355 //              return RValue<SByte8>(Nucleus::createAShr(lhs.value, rhs.value));
2356 //      }
2357
2358         RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs)
2359         {
2360                 return lhs = lhs + rhs;
2361         }
2362
2363         RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs)
2364         {
2365                 return lhs = lhs - rhs;
2366         }
2367
2368 //      RValue<SByte8> operator*=(SByte8 &lhs, RValue<SByte8> rhs)
2369 //      {
2370 //              return lhs = lhs * rhs;
2371 //      }
2372
2373 //      RValue<SByte8> operator/=(SByte8 &lhs, RValue<SByte8> rhs)
2374 //      {
2375 //              return lhs = lhs / rhs;
2376 //      }
2377
2378 //      RValue<SByte8> operator%=(SByte8 &lhs, RValue<SByte8> rhs)
2379 //      {
2380 //              return lhs = lhs % rhs;
2381 //      }
2382
2383         RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs)
2384         {
2385                 return lhs = lhs & rhs;
2386         }
2387
2388         RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs)
2389         {
2390                 return lhs = lhs | rhs;
2391         }
2392
2393         RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs)
2394         {
2395                 return lhs = lhs ^ rhs;
2396         }
2397
2398 //      RValue<SByte8> operator<<=(SByte8 &lhs, RValue<SByte8> rhs)
2399 //      {
2400 //              return lhs = lhs << rhs;
2401 //      }
2402
2403 //      RValue<SByte8> operator>>=(SByte8 &lhs, RValue<SByte8> rhs)
2404 //      {
2405 //              return lhs = lhs >> rhs;
2406 //      }
2407
2408 //      RValue<SByte8> operator+(RValue<SByte8> val)
2409 //      {
2410 //              return val;
2411 //      }
2412
2413 //      RValue<SByte8> operator-(RValue<SByte8> val)
2414 //      {
2415 //              return RValue<SByte8>(Nucleus::createNeg(val.value));
2416 //      }
2417
2418         RValue<SByte8> operator~(RValue<SByte8> val)
2419         {
2420                 if(CPUID::supportsMMX2())
2421                 {
2422                         return val ^ SByte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
2423                 }
2424                 else
2425                 {
2426                         return RValue<SByte8>(Nucleus::createNot(val.value));
2427                 }
2428         }
2429
2430         RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
2431         {
2432                 return x86::paddsb(x, y);
2433         }
2434
2435         RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
2436         {
2437                 return x86::psubsb(x, y);
2438         }
2439
2440         RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
2441         {
2442                 if(CPUID::supportsMMX2())
2443                 {
2444                         return As<Short4>(x86::punpcklbw(As<Byte8>(x), As<Byte8>(y)));
2445                 }
2446                 else
2447                 {
2448                         int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2449                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2450
2451                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2452                 }
2453         }
2454
2455         RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
2456         {
2457                 if(CPUID::supportsMMX2())
2458                 {
2459                         return As<Short4>(x86::punpckhbw(As<Byte8>(x), As<Byte8>(y)));
2460                 }
2461                 else
2462                 {
2463                         int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
2464                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2465
2466                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2467                 }
2468         }
2469
2470         RValue<Int> SignMask(RValue<SByte8> x)
2471         {
2472                 return x86::pmovmskb(As<Byte8>(x));
2473         }
2474
2475         RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
2476         {
2477                 return x86::pcmpgtb(x, y);
2478         }
2479
2480         RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
2481         {
2482                 return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
2483         }
2484
2485         Type *SByte8::getType()
2486         {
2487                 if(CPUID::supportsMMX2())
2488                 {
2489                         return MMX::getType();
2490                 }
2491                 else
2492                 {
2493                         return T(VectorType::get(SByte::getType(), 8));
2494                 }
2495         }
2496
2497         Byte16::Byte16(RValue<Byte16> rhs)
2498         {
2499         //      xyzw.parent = this;
2500
2501                 storeValue(rhs.value);
2502         }
2503
2504         Byte16::Byte16(const Byte16 &rhs)
2505         {
2506         //      xyzw.parent = this;
2507
2508                 Value *value = rhs.loadValue();
2509                 storeValue(value);
2510         }
2511
2512         Byte16::Byte16(const Reference<Byte16> &rhs)
2513         {
2514         //      xyzw.parent = this;
2515
2516                 Value *value = rhs.loadValue();
2517                 storeValue(value);
2518         }
2519
2520         RValue<Byte16> Byte16::operator=(RValue<Byte16> rhs)
2521         {
2522                 storeValue(rhs.value);
2523
2524                 return rhs;
2525         }
2526
2527         RValue<Byte16> Byte16::operator=(const Byte16 &rhs)
2528         {
2529                 Value *value = rhs.loadValue();
2530                 storeValue(value);
2531
2532                 return RValue<Byte16>(value);
2533         }
2534
2535         RValue<Byte16> Byte16::operator=(const Reference<Byte16> &rhs)
2536         {
2537                 Value *value = rhs.loadValue();
2538                 storeValue(value);
2539
2540                 return RValue<Byte16>(value);
2541         }
2542
2543         Type *Byte16::getType()
2544         {
2545                 return T(VectorType::get(Byte::getType(), 16));
2546         }
2547
2548         Type *SByte16::getType()
2549         {
2550                 return T( VectorType::get(SByte::getType(), 16));
2551         }
2552
2553         Short2::Short2(RValue<Short4> cast)
2554         {
2555                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
2556         }
2557
2558         Type *Short2::getType()
2559         {
2560                 #if 0
2561                         return T(VectorType::get(Short::getType(), 2));
2562                 #else
2563                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
2564                 #endif
2565         }
2566
2567         UShort2::UShort2(RValue<UShort4> cast)
2568         {
2569                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
2570         }
2571
2572         Type *UShort2::getType()
2573         {
2574                 #if 0
2575                         return T(VectorType::get(UShort::getType(), 2));
2576                 #else
2577                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
2578                 #endif
2579         }
2580
2581         Short4::Short4(RValue<Int> cast)
2582         {
2583                 Value *extend = Nucleus::createZExt(cast.value, Long::getType());
2584                 Value *swizzle = Swizzle(RValue<Short4>(extend), 0x00).value;
2585
2586                 storeValue(swizzle);
2587         }
2588
2589         Short4::Short4(RValue<Int4> cast)
2590         {
2591                 Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
2592
2593                 #if 0   // FIXME: Check codegen (pshuflw phshufhw pshufd)
2594                         Constant *pack[8];
2595                         pack[0] = Nucleus::createConstantInt(0);
2596                         pack[1] = Nucleus::createConstantInt(2);
2597                         pack[2] = Nucleus::createConstantInt(4);
2598                         pack[3] = Nucleus::createConstantInt(6);
2599
2600                         Value *short4 = Nucleus::createShuffleVector(short8, short8, Nucleus::createConstantVector(pack, 4));
2601                 #else
2602                         Value *packed;
2603
2604                         // FIXME: Use Swizzle<Short8>
2605                         if(!CPUID::supportsSSSE3())
2606                         {
2607                                 int pshuflw[8] = {0, 2, 0, 2, 4, 5, 6, 7};
2608                                 int pshufhw[8] = {0, 1, 2, 3, 4, 6, 4, 6};
2609
2610                                 Value *shuffle1 = Nucleus::createShuffleVector(short8, short8, pshuflw);
2611                                 Value *shuffle2 = Nucleus::createShuffleVector(shuffle1, shuffle1, pshufhw);
2612                                 Value *int4 = Nucleus::createBitCast(shuffle2, Int4::getType());
2613                                 packed = createSwizzle4(int4, 0x88);
2614                         }
2615                         else
2616                         {
2617                                 int pshufb[16] = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
2618                                 Value *byte16 = Nucleus::createBitCast(cast.value, Byte16::getType());
2619                                 packed = Nucleus::createShuffleVector(byte16, byte16, pshufb);
2620                         }
2621
2622                         #if 0   // FIXME: No optimal instruction selection
2623                                 Value *qword2 = Nucleus::createBitCast(packed, T(VectorType::get(Long::getType(), 2)));
2624                                 Value *element = Nucleus::createExtractElement(qword2, 0);
2625                                 Value *short4 = Nucleus::createBitCast(element, Short4::getType());
2626                         #else   // FIXME: Requires SSE
2627                                 Value *int2 = RValue<Int2>(Int2(RValue<Int4>(packed))).value;
2628                                 Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
2629                         #endif
2630                 #endif
2631
2632                 storeValue(short4);
2633         }
2634
2635 //      Short4::Short4(RValue<Float> cast)
2636 //      {
2637 //      }
2638
2639         Short4::Short4(RValue<Float4> cast)
2640         {
2641                 Int4 v4i32 = Int4(cast);
2642                 v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2643
2644                 storeValue(As<Short4>(Int2(v4i32)).value);
2645         }
2646
2647         Short4::Short4()
2648         {
2649         //      xyzw.parent = this;
2650         }
2651
2652         Short4::Short4(short xyzw)
2653         {
2654                 //      xyzw.parent = this;
2655
2656                 int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
2657                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
2658
2659                 storeValue(Nucleus::createBitCast(vector, getType()));
2660         }
2661
2662         Short4::Short4(short x, short y, short z, short w)
2663         {
2664         //      xyzw.parent = this;
2665
2666                 int64_t constantVector[4] = {x, y, z, w};
2667                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
2668
2669                 storeValue(Nucleus::createBitCast(vector, getType()));
2670         }
2671
2672         Short4::Short4(RValue<Short4> rhs)
2673         {
2674         //      xyzw.parent = this;
2675
2676                 storeValue(rhs.value);
2677         }
2678
2679         Short4::Short4(const Short4 &rhs)
2680         {
2681         //      xyzw.parent = this;
2682
2683                 Value *value = rhs.loadValue();
2684                 storeValue(value);
2685         }
2686
2687         Short4::Short4(const Reference<Short4> &rhs)
2688         {
2689         //      xyzw.parent = this;
2690
2691                 Value *value = rhs.loadValue();
2692                 storeValue(value);
2693         }
2694
2695         Short4::Short4(RValue<UShort4> rhs)
2696         {
2697         //      xyzw.parent = this;
2698
2699                 storeValue(rhs.value);
2700         }
2701
2702         Short4::Short4(const UShort4 &rhs)
2703         {
2704         //      xyzw.parent = this;
2705
2706                 storeValue(rhs.loadValue());
2707         }
2708
2709         Short4::Short4(const Reference<UShort4> &rhs)
2710         {
2711         //      xyzw.parent = this;
2712
2713                 storeValue(rhs.loadValue());
2714         }
2715
2716         RValue<Short4> Short4::operator=(RValue<Short4> rhs)
2717         {
2718                 storeValue(rhs.value);
2719
2720                 return rhs;
2721         }
2722
2723         RValue<Short4> Short4::operator=(const Short4 &rhs)
2724         {
2725                 // A single load feeds both the store and the returned r-value.
2726                 Value *copied = rhs.loadValue();
2727                 storeValue(copied);
2728                 return RValue<Short4>(copied);
2729         }
2730
2731         RValue<Short4> Short4::operator=(const Reference<Short4> &rhs)
2732         {
2733                 // Load through the reference once; store and return that same value.
2734                 Value *referenced = rhs.loadValue();
2735                 storeValue(referenced);
2736                 return RValue<Short4>(referenced);
2737         }
2738
2739         RValue<Short4> Short4::operator=(RValue<UShort4> rhs)
2740         {
2741                 Value *bits = rhs.value;       // stored unchanged
2742                 storeValue(bits);
2743                 return RValue<Short4>(rhs);    // result is the operand converted to RValue<Short4>
2744         }
2745
2746         RValue<Short4> Short4::operator=(const UShort4 &rhs)
2747         {
2748                 // Load the UShort4 variable once; store and return that same value.
2749                 Value *loaded = rhs.loadValue();
2750                 storeValue(loaded);
2751                 return RValue<Short4>(loaded);
2752         }
2753
2754         RValue<Short4> Short4::operator=(const Reference<UShort4> &rhs)
2755         {
2756                 // Load through the UShort4 reference once; store and return that value.
2757                 Value *referenced = rhs.loadValue();
2758                 storeValue(referenced);
2759                 return RValue<Short4>(referenced);
2760         }
2761
2762         RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs)
2763         {
2764                 if(CPUID::supportsMMX2())
2765                 {
2766                         return x86::paddw(lhs, rhs);
2767                 }
2768                 else
2769                 {
2770                         return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
2771                 }
2772         }
2773
2774         RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs)
2775         {
2776                 if(CPUID::supportsMMX2())
2777                 {
2778                         return x86::psubw(lhs, rhs);
2779                 }
2780                 else
2781                 {
2782                         return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
2783                 }
2784         }
2785
2786         RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs)
2787         {
2788                 if(CPUID::supportsMMX2())
2789                 {
2790                         return x86::pmullw(lhs, rhs);
2791                 }
2792                 else
2793                 {
2794                         return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
2795                 }
2796         }
2797
2798 //      RValue<Short4> operator/(RValue<Short4> lhs, RValue<Short4> rhs)
2799 //      {
2800 //              return RValue<Short4>(Nucleus::createSDiv(lhs.value, rhs.value));
2801 //      }
2802
2803 //      RValue<Short4> operator%(RValue<Short4> lhs, RValue<Short4> rhs)
2804 //      {
2805 //              return RValue<Short4>(Nucleus::createSRem(lhs.value, rhs.value));
2806 //      }
2807
2808         RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs)
2809         {
2810                 if(CPUID::supportsMMX2())
2811                 {
2812                         return x86::pand(lhs, rhs);
2813                 }
2814                 else
2815                 {
2816                         return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
2817                 }
2818         }
2819
2820         RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs)
2821         {
2822                 if(CPUID::supportsMMX2())
2823                 {
2824                         return x86::por(lhs, rhs);
2825                 }
2826                 else
2827                 {
2828                         return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
2829                 }
2830         }
2831
2832         RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs)
2833         {
2834                 if(CPUID::supportsMMX2())
2835                 {
2836                         return x86::pxor(lhs, rhs);
2837                 }
2838                 else
2839                 {
2840                         return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
2841                 }
2842         }
2843
2844         RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2845         {
2846         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
2847                 RValue<Short4> shifted = x86::psllw(lhs, rhs);   // left shift via the psllw wrapper
2848                 return shifted;
2849         }
2850
2851         RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2852         {
2853         //      return RValue<Short4>(Nucleus::createAShr(lhs.value, rhs.value));
2854                 RValue<Short4> shifted = x86::psraw(lhs, rhs);   // right shift via the psraw wrapper
2855                 return shifted;
2856         }
2857
2858         RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs)
2859         {
                     RValue<Short4> sum = lhs + rhs;   // computed first, then written back into lhs
2860                 return lhs = sum;
2861         }
2862
2863         RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs)
2864         {
                     RValue<Short4> difference = lhs - rhs;   // computed first, then written back into lhs
2865                 return lhs = difference;
2866         }
2867
2868         RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs)
2869         {
                     RValue<Short4> product = lhs * rhs;   // computed first, then written back into lhs
2870                 return lhs = product;
2871         }
2872
2873 //      RValue<Short4> operator/=(Short4 &lhs, RValue<Short4> rhs)
2874 //      {
2875 //              return lhs = lhs / rhs;
2876 //      }
2877
2878 //      RValue<Short4> operator%=(Short4 &lhs, RValue<Short4> rhs)
2879 //      {
2880 //              return lhs = lhs % rhs;
2881 //      }
2882
2883         RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs)
2884         {
                     RValue<Short4> masked = lhs & rhs;   // computed first, then written back into lhs
2885                 return lhs = masked;
2886         }
2887
2888         RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs)
2889         {
                     RValue<Short4> combined = lhs | rhs;   // computed first, then written back into lhs
2890                 return lhs = combined;
2891         }
2892
2893         RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs)
2894         {
                     RValue<Short4> toggled = lhs ^ rhs;   // computed first, then written back into lhs
2895                 return lhs = toggled;
2896         }
2897
2898         RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs)
2899         {
                     RValue<Short4> shifted = lhs << rhs;   // computed first, then written back into lhs
2900                 return lhs = shifted;
2901         }
2902
2903         RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs)
2904         {
                     RValue<Short4> shifted = lhs >> rhs;   // computed first, then written back into lhs
2905                 return lhs = shifted;
2906         }
2907
2908 //      RValue<Short4> operator+(RValue<Short4> val)
2909 //      {
2910 //              return val;
2911 //      }
2912
2913         RValue<Short4> operator-(RValue<Short4> val)
2914         {
2915                 if(CPUID::supportsMMX2())
2916                 {
2917                         return Short4(0, 0, 0, 0) - val;
2918                 }
2919                 else
2920                 {
2921                         return RValue<Short4>(Nucleus::createNeg(val.value));
2922                 }
2923         }
2924
2925         RValue<Short4> operator~(RValue<Short4> val)
2926         {
2927                 if(CPUID::supportsMMX2())
2928                 {
2929                         return val ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu);
2930                 }
2931                 else
2932                 {
2933                         return RValue<Short4>(Nucleus::createNot(val.value));
2934                 }
2935         }
2936
2937         RValue<Short4> RoundShort4(RValue<Float4> cast)
2938         {
2939                 // Convert with cvtps2dq, pack the result with itself, then narrow to Short4.
2940                 RValue<Int4> converted = x86::cvtps2dq(cast);
2941                 RValue<Short8> packed = x86::packssdw(converted, converted);
2942                 return As<Short4>(Int2(As<Int4>(packed)));
2943         }
2944
2945         RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2946         {
                     RValue<Short4> larger = x86::pmaxsw(x, y);   // delegated to the pmaxsw wrapper
2947                 return larger;
2948         }
2949
2950         RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2951         {
                     RValue<Short4> smaller = x86::pminsw(x, y);   // delegated to the pminsw wrapper
2952                 return smaller;
2953         }
2954
2955         RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2956         {
                     RValue<Short4> sum = x86::paddsw(x, y);   // delegated to the paddsw wrapper
2957                 return sum;
2958         }
2959
2960         RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2961         {
                     RValue<Short4> difference = x86::psubsw(x, y);   // delegated to the psubsw wrapper
2962                 return difference;
2963         }
2964
2965         RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2966         {
                     RValue<Short4> highHalf = x86::pmulhw(x, y);   // delegated to the pmulhw wrapper
2967                 return highHalf;
2968         }
2969
2970         RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2971         {
                     RValue<Int2> sums = x86::pmaddwd(x, y);   // delegated to the pmaddwd wrapper
2972                 return sums;
2973         }
2974
2975         RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
2976         {
                     RValue<SByte8> packed = x86::packsswb(x, y);   // delegated to the packsswb wrapper
2977                 return packed;
2978         }
2979
2980         RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
2981         {
2982                 if(CPUID::supportsMMX2())
2983                 {
2984                         return x86::punpcklwd(x, y);
2985                 }
2986                 else
2987                 {
2988                         int shuffle[4] = {0, 4, 1, 5};
2989                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2990
2991                         return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
2992                 }
2993         }
2994
2995         RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
2996         {
2997                 if(CPUID::supportsMMX2())
2998                 {
2999                         return x86::punpckhwd(x, y);
3000                 }
3001                 else
3002                 {
3003                         int shuffle[4] = {2, 6, 3, 7};
3004                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
3005
3006                         return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
3007                 }
3008         }
3009
3010         RValue<Short4> Swizzle(RValue<Short4> x, unsigned char select)
3011         {
3012                 if(CPUID::supportsMMX2())
3013                 {
3014                         return x86::pshufw(x, select);
3015                 }
3016                 else
3017                 {
3018                         return RValue<Short4>(createSwizzle4(x.value, select));
3019                 }
3020         }
3021
3022         RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i)
3023         {
3024                 if(CPUID::supportsMMX2())
3025                 {
3026                         return x86::pinsrw(val, Int(element), i);
3027                 }
3028                 else
3029                 {
3030                         return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
3031                 }
3032         }
3033
3034         RValue<Short> Extract(RValue<Short4> val, int i)
3035         {
3036                 if(CPUID::supportsMMX2())
3037                 {
3038                         return Short(x86::pextrw(val, i));
3039                 }
3040                 else
3041                 {
3042                         return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
3043                 }
3044         }
3045
3046         RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
3047         {
                     RValue<Short4> mask = x86::pcmpgtw(x, y);   // delegated to the pcmpgtw wrapper
3048                 return mask;
3049         }
3050
3051         RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
3052         {
                     RValue<Short4> mask = x86::pcmpeqw(x, y);   // delegated to the pcmpeqw wrapper
3053                 return mask;
3054         }
3055
3056         Type *Short4::getType()
3057         {
3058                 if(CPUID::supportsMMX2())
3059                 {
3060                         return MMX::getType();
3061                 }
3062                 else
3063                 {
3064                         return T(VectorType::get(Short::getType(), 4));
3065                 }
3066         }
3067
3068         UShort4::UShort4(RValue<Int4> cast)
3069         {
3070                 operator=(Short4(cast));   // reuse Short4's construction from Int4, then assign
3071         }
3072
3073         UShort4::UShort4(RValue<Float4> cast, bool saturate)
3074         {
3075                 Float4 sat;
3076
3077                 if(saturate)
3078                 {
3079                         if(CPUID::supportsSSE4_1())
3080                         {
3081                                 sat = Min(cast, Float4(0xFFFF));   // packusdw takes care of 0x0000 saturation
3082                         }
3083                         else
3084                         {
3085                                 sat = Max(Min(cast, Float4(0xFFFF)), Float4(0x0000));
3086                         }
3087                 }
3088                 else
3089                 {
3090                         sat = cast;
3091                 }
3092
3093                 Int4 int4(sat);
3094
3095                 if(!saturate || !CPUID::supportsSSE4_1())
3096                 {
3097                         *this = Short4(Int4(int4));
3098                 }
3099                 else
3100                 {
3101                         *this = As<Short4>(Int2(As<Int4>(x86::packusdw(As<UInt4>(int4), As<UInt4>(int4)))));
3102                 }
3103         }
3104
3105         UShort4::UShort4()   // Default constructor: the body is empty, so no value is stored.
3106         {
3107         //      xyzw.parent = this;
3108         }
3109
3110         UShort4::UShort4(unsigned short xyzw)
3111         {
3112                 //      xyzw.parent = this;
3113
3114                 int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
3115                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
3116
3117                 storeValue(Nucleus::createBitCast(vector, getType()));
3118         }
3119
3120         UShort4::UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
3121         {
3122         //      xyzw.parent = this;
3123
3124                 int64_t constantVector[4] = {x, y, z, w};
3125                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
3126
3127                 storeValue(Nucleus::createBitCast(vector, getType()));
3128         }
3129
3130         UShort4::UShort4(RValue<UShort4> rhs)
3131         {
3132         //      xyzw.parent = this;
3133                 Value *initial = rhs.value;   // value carried by the r-value operand
3134                 storeValue(initial);
3135         }
3136
3137         UShort4::UShort4(const UShort4 &rhs)
3138         {
3139         //      xyzw.parent = this;
3140                 // Copy construction: read the current value of rhs and
3141                 // store it directly into this variable.
3142                 storeValue(rhs.loadValue());
3143         }
3144
3145         UShort4::UShort4(const Reference<UShort4> &rhs)
3146         {
3147         //      xyzw.parent = this;
3148                 // Construct from a reference: load what it currently
3149                 // refers to and store that value here.
3150                 storeValue(rhs.loadValue());
3151         }
3152
3153         UShort4::UShort4(RValue<Short4> rhs)
3154         {
3155         //      xyzw.parent = this;
3156                 Value *bits = rhs.value;   // stored as-is; no conversion is applied here
3157                 storeValue(bits);
3158         }
3159
3160         UShort4::UShort4(const Short4 &rhs)
3161         {
3162         //      xyzw.parent = this;
3163                 // Construct from a Short4 variable: its loaded value is
3164                 // stored here without any conversion step.
3165                 storeValue(rhs.loadValue());
3166         }
3167
3168         UShort4::UShort4(const Reference<Short4> &rhs)
3169         {
3170         //      xyzw.parent = this;
3171                 // Construct from a Short4 reference: the value loaded through
3172                 // it is stored here without any conversion step.
3173                 storeValue(rhs.loadValue());
3174         }
3175
3176         RValue<UShort4> UShort4::operator=(RValue<UShort4> rhs)
3177         {
3178                 Value *assigned = rhs.value;   // new contents for this variable
3179                 storeValue(assigned);
3180                 return rhs;                    // the operand itself is the result
3181         }
3182
3183         RValue<UShort4> UShort4::operator=(const UShort4 &rhs)
3184         {
3185                 // A single load feeds both the store and the returned r-value.
3186                 Value *copied = rhs.loadValue();
3187                 storeValue(copied);
3188                 return RValue<UShort4>(copied);
3189         }
3190
3191         RValue<UShort4> UShort4::operator=(const Reference<UShort4> &rhs)
3192         {
3193                 // Load through the reference once; store and return that same value.
3194                 Value *referenced = rhs.loadValue();
3195                 storeValue(referenced);
3196                 return RValue<UShort4>(referenced);
3197         }
3198
3199         RValue<UShort4> UShort4::operator=(RValue<Short4> rhs)
3200         {
3201                 Value *bits = rhs.value;        // stored unchanged
3202                 storeValue(bits);
3203                 return RValue<UShort4>(rhs);    // result is the operand converted to RValue<UShort4>
3204         }
3205
3206         RValue<UShort4> UShort4::operator=(const Short4 &rhs)
3207         {
3208                 // Load the Short4 variable once; store and return that same value.
3209                 Value *loaded = rhs.loadValue();
3210                 storeValue(loaded);
3211                 return RValue<UShort4>(loaded);
3212         }
3213
3214         RValue<UShort4> UShort4::operator=(const Reference<Short4> &rhs)
3215         {
3216                 // Load through the Short4 reference once; store and return that value.
3217                 Value *referenced = rhs.loadValue();
3218                 storeValue(referenced);
3219                 return RValue<UShort4>(referenced);
3220         }
3221
3222         RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs)
3223         {
3224                 if(CPUID::supportsMMX2())
3225                 {
3226                         return As<UShort4>(x86::paddw(As<Short4>(lhs), As<Short4>(rhs)));
3227                 }
3228                 else
3229                 {
3230                         return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
3231                 }
3232         }
3233
3234         RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs)
3235         {
3236                 if(CPUID::supportsMMX2())
3237                 {
3238                         return As<UShort4>(x86::psubw(As<Short4>(lhs), As<Short4>(rhs)));
3239                 }
3240                 else
3241                 {
3242                         return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
3243                 }
3244         }
3245
3246         RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs)
3247         {
3248                 if(CPUID::supportsMMX2())
3249                 {
3250                         return As<UShort4>(x86::pmullw(As<Short4>(lhs), As<Short4>(rhs)));
3251                 }
3252                 else
3253                 {
3254                         return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
3255                 }
3256         }
3257
3258         RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs)
3259         {
3260                 if(CPUID::supportsMMX2())
3261                 {
3262                         return As<UShort4>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
3263                 }
3264                 else
3265                 {
3266                         return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
3267                 }
3268         }
3269
3270         RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs)
3271         {
3272                 if(CPUID::supportsMMX2())
3273                 {
3274                         return As<UShort4>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
3275                 }
3276                 else
3277                 {
3278                         return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
3279                 }
3280         }
3281
3282         RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs)
3283         {
3284                 if(CPUID::supportsMMX2())
3285                 {
3286                         return As<UShort4>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
3287                 }
3288                 else
3289                 {
3290                         return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
3291                 }
3292         }
3293
3294         RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
3295         {
3296         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
3297                 RValue<Short4> shifted = x86::psllw(As<Short4>(lhs), rhs);   // shift is done on the Short4 view
3298                 return As<UShort4>(shifted);
3299         }
3300
3301         RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
3302         {
3303         //      return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
3304                 RValue<UShort4> shifted = x86::psrlw(lhs, rhs);   // right shift via the psrlw wrapper
3305                 return shifted;
3306         }
3307
3308         RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs)
3309         {
                     RValue<UShort4> shifted = lhs << rhs;   // computed first, then written back into lhs
3310                 return lhs = shifted;
3311         }
3312
3313         RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs)
3314         {
                     RValue<UShort4> shifted = lhs >> rhs;   // computed first, then written back into lhs
3315                 return lhs = shifted;
3316         }
3317
3318         RValue<UShort4> operator~(RValue<UShort4> val)
3319         {
3320                 if(CPUID::supportsMMX2())
3321                 {
3322                         return As<UShort4>(As<Short4>(val) ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu));
3323                 }
3324                 else
3325                 {
3326                         return RValue<UShort4>(Nucleus::createNot(val.value));
3327                 }
3328         }
3329
3330         RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
3331         {
3332                 return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
3333         }
3334
3335         RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
3336         {
3337                 return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
3338         }
3339
3340         RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
3341         {
                     RValue<UShort4> sum = x86::paddusw(x, y);   // delegated to the paddusw wrapper
3342                 return sum;
3343         }
3344
3345         RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
3346         {
                     RValue<UShort4> difference = x86::psubusw(x, y);   // delegated to the psubusw wrapper
3347                 return difference;
3348         }
3349
3350         RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
3351         {
                     RValue<UShort4> highHalf = x86::pmulhuw(x, y);   // delegated to the pmulhuw wrapper
3352                 return highHalf;
3353         }
3354
3355         RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
3356         {
                     RValue<UShort4> mean = x86::pavgw(x, y);   // delegated to the pavgw wrapper
3357                 return mean;
3358         }
3359
3360         RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
3361         {
                     RValue<Byte8> packed = x86::packuswb(x, y);   // delegated to the packuswb wrapper
3362                 return packed;
3363         }
3364
3365         Type *UShort4::getType()
3366         {
3367                 if(CPUID::supportsMMX2())
3368                 {
3369                         return MMX::getType();
3370                 }
3371                 else
3372                 {
3373                         return T(VectorType::get(UShort::getType(), 4));
3374                 }
3375         }
3376
3377         Short8::Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
3378         {
3379                 int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
3380                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3381         }
3382
3383         Short8::Short8(RValue<Short8> rhs)
3384         {
                     Value *initial = rhs.value;   // value carried by the r-value operand
3385                 storeValue(initial);
3386         }
3387
3388         Short8::Short8(const Reference<Short8> &rhs)
3389         {
3390                 // Load what the reference currently refers to and store it here.
3391                 storeValue(rhs.loadValue());
3392         }
3393
3394         Short8::Short8(RValue<Short4> lo, RValue<Short4> hi)
3395         {
3396                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
3397                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
3398
3399                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
3400                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
3401                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
3402                 Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
3403
3404                 storeValue(short8);
3405         }
3406
3407         RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs)
3408         {
                     Value *sum = Nucleus::createAdd(lhs.value, rhs.value);   // generic add, no CPU-specific path
3409                 return RValue<Short8>(sum);
3410         }
3411
3412         RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs)
3413         {
                     Value *masked = Nucleus::createAnd(lhs.value, rhs.value);   // generic AND, no CPU-specific path
3414                 return RValue<Short8>(masked);
3415         }
3416
3417         RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
3418         {
                     RValue<Short8> shifted = x86::psllw(lhs, rhs);   // FIXME: Fallback required
3419                 return shifted;
3420         }
3421
3422         RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
3423         {
                     RValue<Short8> shifted = x86::psraw(lhs, rhs);   // FIXME: Fallback required
3424                 return shifted;
3425         }
3426
3427         RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
3428         {
                     RValue<Int4> sums = x86::pmaddwd(x, y);   // FIXME: Fallback required
3429                 return sums;
3430         }
3431
3432         RValue<Int4> Abs(RValue<Int4> x)
3433         {
3434                 if(CPUID::supportsSSSE3())
3435                 {
3436                         return x86::pabsd(x);
3437                 }
3438                 else
3439                 {
3440                         Int4 mask = (x >> 31);
3441                         return (mask ^ x) - mask;
3442                 }
3443         }
3444
3445         RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
3446         {
                     RValue<Short8> highHalf = x86::pmulhw(x, y);   // FIXME: Fallback required
3447                 return highHalf;
3448         }
3449
3450         Type *Short8::getType()
3451         {
                     Type *eightShorts = T(VectorType::get(Short::getType(), 8));   // vector of eight Short elements
3452                 return eightShorts;
3453         }
3454
3455         UShort8::UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
3456         {
3457                 int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
3458                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3459         }
3460
3461         UShort8::UShort8(RValue<UShort8> rhs)
3462         {
                     Value *initial = rhs.value;   // value carried by the r-value operand
3463                 storeValue(initial);
3464         }
3465
3466         UShort8::UShort8(const Reference<UShort8> &rhs)
3467         {
3468                 // Load what the reference currently refers to and store it here.
3469                 storeValue(rhs.loadValue());
3470         }
3471
3472         UShort8::UShort8(RValue<UShort4> lo, RValue<UShort4> hi)
3473         {
3474                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
3475                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
3476
3477                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
3478                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
3479                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
3480                 Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
3481
3482                 storeValue(short8);
3483         }
3484
3485         RValue<UShort8> UShort8::operator=(RValue<UShort8> rhs)
3486         {
3487                 Value *assigned = rhs.value;   // new contents for this variable
3488                 storeValue(assigned);
3489                 return rhs;                    // the operand itself is the result
3490         }
3491
3492         RValue<UShort8> UShort8::operator=(const UShort8 &rhs)
3493         {
3494                 // A single load feeds both the store and the returned r-value.
3495                 Value *copied = rhs.loadValue();
3496                 storeValue(copied);
3497                 return RValue<UShort8>(copied);
3498         }
3499
3500         RValue<UShort8> UShort8::operator=(const Reference<UShort8> &rhs)
3501         {
3502                 // Load through the reference once; store and return that same value.
3503                 Value *referenced = rhs.loadValue();
3504                 storeValue(referenced);
3505                 return RValue<UShort8>(referenced);
3506         }
3507
3508         RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs)
3509         {
                     Value *masked = Nucleus::createAnd(lhs.value, rhs.value);   // generic AND, no CPU-specific path
3510                 return RValue<UShort8>(masked);
3511         }
3512
3513         RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
3514         {
                     RValue<Short8> shifted = x86::psllw(As<Short8>(lhs), rhs);   // FIXME: Fallback required
3515                 return As<UShort8>(shifted);
3516         }
3517
3518         RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
3519         {
                     RValue<UShort8> shifted = x86::psrlw(lhs, rhs);   // FIXME: Fallback required
3520                 return shifted;
3521         }
3522
3523         RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
3524         {
                     Value *sum = Nucleus::createAdd(lhs.value, rhs.value);   // generic add, no CPU-specific path
3525                 return RValue<UShort8>(sum);
3526         }
3527
3528         RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs)
3529         {
                     Value *product = Nucleus::createMul(lhs.value, rhs.value);   // generic multiply, no CPU-specific path
3530                 return RValue<UShort8>(product);
3531         }
3532
3533         RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs)
3534         {
                     RValue<UShort8> sum = lhs + rhs;   // computed first, then written back into lhs
3535                 return lhs = sum;
3536         }
3537
3538         RValue<UShort8> operator~(RValue<UShort8> val)
3539         {
                     Value *inverted = Nucleus::createNot(val.value);   // generic NOT, no CPU-specific path
3540                 return RValue<UShort8>(inverted);
3541         }
3542
3543         RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
3544         {
3545                 int pshufb[16] =
3546                 {
3547                         select0 + 0,
3548                         select0 + 1,
3549                         select1 + 0,
3550                         select1 + 1,
3551                         select2 + 0,
3552                         select2 + 1,
3553                         select3 + 0,
3554                         select3 + 1,
3555                         select4 + 0,
3556                         select4 + 1,
3557                         select5 + 0,
3558                         select5 + 1,
3559                         select6 + 0,
3560                         select6 + 1,
3561                         select7 + 0,
3562                         select7 + 1,
3563                 };
3564
3565                 Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
3566                 Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
3567                 Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
3568
3569                 return RValue<UShort8>(short8);
3570         }
3571
3572         RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
3573         {
                     RValue<UShort8> highHalf = x86::pmulhuw(x, y);   // FIXME: Fallback required
3574                 return highHalf;
3575         }
3576
3577         Type *UShort8::getType()
3578         {
                     Type *eightUShorts = T(VectorType::get(UShort::getType(), 8));   // vector of eight UShort elements
3579                 return eightUShorts;
3580         }
3581
3582         Int::Int(Argument<Int> argument)
3583         {
3584                 this->storeValue(argument.value);   // initialise from the value held by the argument wrapper
3585         }
3586
3587         Int::Int(RValue<Byte> cast)
3588         {
3589                 // Widen the Byte to Int's type with a zero extension and
3590                 // store the widened value.
3591                 storeValue(Nucleus::createZExt(cast.value, Int::getType()));
3592         }
3593
3594         Int::Int(RValue<SByte> cast)
3595         {
3596                 // Widen the SByte to Int's type with a sign extension and
3597                 // store the widened value.
3598                 storeValue(Nucleus::createSExt(cast.value, Int::getType()));
3599         }
3600
3601         Int::Int(RValue<Short> cast)
3602         {
3603                 // Widen the Short to Int's type with a sign extension and
3604                 // store the widened value.
3605                 storeValue(Nucleus::createSExt(cast.value, Int::getType()));
3606         }
3607
3608         Int::Int(RValue<UShort> cast)
3609         {
3610                 // Widen the UShort to Int's type with a zero extension and
3611                 // store the widened value.
3612                 storeValue(Nucleus::createZExt(cast.value, Int::getType()));
3613         }
3614
3615         Int::Int(RValue<Int2> cast)
3616         {
3617                 operator=(Extract(cast, 0));   // take element 0 of the Int2 and assign it
3618         }
3619
3620         Int::Int(RValue<Long> cast)
3621         {
3622                 // Narrow the Long to Int's type with a truncation and
3623                 // store the narrowed value.
3624                 storeValue(Nucleus::createTrunc(cast.value, Int::getType()));
3625         }
3626
3627         Int::Int(RValue<Float> cast)
3628         {
3629                 // Convert the Float to Int's type with createFPToSI and
3630                 // store the converted value.
3631                 storeValue(Nucleus::createFPToSI(cast.value, Int::getType()));
3632         }
3633
3634         Int::Int()
3635         {
3636         }
3637
3638         Int::Int(int x)
3639         {
3640                 storeValue(Nucleus::createConstantInt(x));
3641         }
3642
3643         Int::Int(RValue<Int> rhs)
3644         {
3645                 storeValue(rhs.value);
3646         }
3647
3648         Int::Int(RValue<UInt> rhs)
3649         {
3650                 storeValue(rhs.value);
3651         }
3652
3653         Int::Int(const Int &rhs)
3654         {
3655                 Value *value = rhs.loadValue();
3656                 storeValue(value);
3657         }
3658
3659         Int::Int(const Reference<Int> &rhs)
3660         {
3661                 Value *value = rhs.loadValue();
3662                 storeValue(value);
3663         }
3664
3665         Int::Int(const UInt &rhs)
3666         {
3667                 Value *value = rhs.loadValue();
3668                 storeValue(value);
3669         }
3670
3671         Int::Int(const Reference<UInt> &rhs)
3672         {
3673                 Value *value = rhs.loadValue();
3674                 storeValue(value);
3675         }
3676
3677         RValue<Int> Int::operator=(int rhs)
3678         {
3679                 return RValue<Int>(storeValue(Nucleus::createConstantInt(rhs)));
3680         }
3681
3682         RValue<Int> Int::operator=(RValue<Int> rhs)
3683         {
3684                 storeValue(rhs.value);
3685
3686                 return rhs;
3687         }
3688
3689         RValue<Int> Int::operator=(RValue<UInt> rhs)
3690         {
3691                 storeValue(rhs.value);
3692
3693                 return RValue<Int>(rhs);
3694         }
3695
3696         RValue<Int> Int::operator=(const Int &rhs)
3697         {
3698                 Value *value = rhs.loadValue();
3699                 storeValue(value);
3700
3701                 return RValue<Int>(value);
3702         }
3703
3704         RValue<Int> Int::operator=(const Reference<Int> &rhs)
3705         {
3706                 Value *value = rhs.loadValue();
3707                 storeValue(value);
3708
3709                 return RValue<Int>(value);
3710         }
3711
3712         RValue<Int> Int::operator=(const UInt &rhs)
3713         {
3714                 Value *value = rhs.loadValue();
3715                 storeValue(value);
3716
3717                 return RValue<Int>(value);
3718         }
3719
3720         RValue<Int> Int::operator=(const Reference<UInt> &rhs)
3721         {
3722                 Value *value = rhs.loadValue();
3723                 storeValue(value);
3724
3725                 return RValue<Int>(value);
3726         }
3727
3728         RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs)
3729         {
3730                 return RValue<Int>(Nucleus::createAdd(lhs.value, rhs.value));
3731         }
3732
3733         RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs)
3734         {
3735                 return RValue<Int>(Nucleus::createSub(lhs.value, rhs.value));
3736         }
3737
3738         RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs)
3739         {
3740                 return RValue<Int>(Nucleus::createMul(lhs.value, rhs.value));
3741         }
3742
3743         RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs)
3744         {
3745                 return RValue<Int>(Nucleus::createSDiv(lhs.value, rhs.value));
3746         }
3747
3748         RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs)
3749         {
3750                 return RValue<Int>(Nucleus::createSRem(lhs.value, rhs.value));
3751         }
3752
3753         RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs)
3754         {
3755                 return RValue<Int>(Nucleus::createAnd(lhs.value, rhs.value));
3756         }
3757
3758         RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs)
3759         {
3760                 return RValue<Int>(Nucleus::createOr(lhs.value, rhs.value));
3761         }
3762
3763         RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs)
3764         {
3765                 return RValue<Int>(Nucleus::createXor(lhs.value, rhs.value));
3766         }
3767
3768         RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs)
3769         {
3770                 return RValue<Int>(Nucleus::createShl(lhs.value, rhs.value));
3771         }
3772
3773         RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs)
3774         {
3775                 return RValue<Int>(Nucleus::createAShr(lhs.value, rhs.value));
3776         }
3777
3778         RValue<Int> operator+=(Int &lhs, RValue<Int> rhs)
3779         {
3780                 return lhs = lhs + rhs;
3781         }
3782
3783         RValue<Int> operator-=(Int &lhs, RValue<Int> rhs)
3784         {
3785                 return lhs = lhs - rhs;
3786         }
3787
3788         RValue<Int> operator*=(Int &lhs, RValue<Int> rhs)
3789         {
3790                 return lhs = lhs * rhs;
3791         }
3792
3793         RValue<Int> operator/=(Int &lhs, RValue<Int> rhs)
3794         {
3795                 return lhs = lhs / rhs;
3796         }
3797
3798         RValue<Int> operator%=(Int &lhs, RValue<Int> rhs)
3799         {
3800                 return lhs = lhs % rhs;
3801         }
3802
3803         RValue<Int> operator&=(Int &lhs, RValue<Int> rhs)
3804         {
3805                 return lhs = lhs & rhs;
3806         }
3807
3808         RValue<Int> operator|=(Int &lhs, RValue<Int> rhs)
3809         {
3810                 return lhs = lhs | rhs;
3811         }
3812
3813         RValue<Int> operator^=(Int &lhs, RValue<Int> rhs)
3814         {
3815                 return lhs = lhs ^ rhs;
3816         }
3817
3818         RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs)
3819         {
3820                 return lhs = lhs << rhs;
3821         }
3822
3823         RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs)
3824         {
3825                 return lhs = lhs >> rhs;
3826         }
3827
3828         RValue<Int> operator+(RValue<Int> val)
3829         {
3830                 return val;
3831         }
3832
3833         RValue<Int> operator-(RValue<Int> val)
3834         {
3835                 return RValue<Int>(Nucleus::createNeg(val.value));
3836         }
3837
3838         RValue<Int> operator~(RValue<Int> val)
3839         {
3840                 return RValue<Int>(Nucleus::createNot(val.value));
3841         }
3842
3843         RValue<Int> operator++(Int &val, int)   // Post-increment
3844         {
3845                 RValue<Int> res = val;
3846
3847                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantInt(1)));
3848                 val.storeValue(inc);
3849
3850                 return res;
3851         }
3852
3853         const Int &operator++(Int &val)   // Pre-increment
3854         {
3855                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantInt(1)));
3856                 val.storeValue(inc);
3857
3858                 return val;
3859         }
3860
3861         RValue<Int> operator--(Int &val, int)   // Post-decrement
3862         {
3863                 RValue<Int> res = val;
3864
3865                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantInt(1)));
3866                 val.storeValue(inc);
3867
3868                 return res;
3869         }
3870
3871         const Int &operator--(Int &val)   // Pre-decrement
3872         {
3873                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantInt(1)));
3874                 val.storeValue(inc);
3875
3876                 return val;
3877         }
3878
3879         RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs)
3880         {
3881                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
3882         }
3883
3884         RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs)
3885         {
3886                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
3887         }
3888
3889         RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs)
3890         {
3891                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
3892         }
3893
3894         RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs)
3895         {
3896                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
3897         }
3898
3899         RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs)
3900         {
3901                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
3902         }
3903
3904         RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs)
3905         {
3906                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
3907         }
3908
3909         RValue<Int> Max(RValue<Int> x, RValue<Int> y)
3910         {
3911                 return IfThenElse(x > y, x, y);
3912         }
3913
3914         RValue<Int> Min(RValue<Int> x, RValue<Int> y)
3915         {
3916                 return IfThenElse(x < y, x, y);
3917         }
3918
3919         RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max)
3920         {
3921                 return Min(Max(x, min), max);
3922         }
3923
3924         RValue<Int> RoundInt(RValue<Float> cast)
3925         {
3926                 return x86::cvtss2si(cast);
3927
3928         //      return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
3929         }
3930
3931         Type *Int::getType()
3932         {
3933                 return T(llvm::Type::getInt32Ty(*::context));
3934         }
3935
3936         Long::Long(RValue<Int> cast)
3937         {
3938                 Value *integer = Nucleus::createSExt(cast.value, Long::getType());
3939
3940                 storeValue(integer);
3941         }
3942
3943         Long::Long(RValue<UInt> cast)
3944         {
3945                 Value *integer = Nucleus::createZExt(cast.value, Long::getType());
3946
3947                 storeValue(integer);
3948         }
3949
3950         Long::Long()
3951         {
3952         }
3953
3954         Long::Long(RValue<Long> rhs)
3955         {
3956                 storeValue(rhs.value);
3957         }
3958
3959         RValue<Long> Long::operator=(int64_t rhs)
3960         {
3961                 return RValue<Long>(storeValue(Nucleus::createConstantLong(rhs)));
3962         }
3963
3964         RValue<Long> Long::operator=(RValue<Long> rhs)
3965         {
3966                 storeValue(rhs.value);
3967
3968                 return rhs;
3969         }
3970
3971         RValue<Long> Long::operator=(const Long &rhs)
3972         {
3973                 Value *value = rhs.loadValue();
3974                 storeValue(value);
3975
3976                 return RValue<Long>(value);
3977         }
3978
3979         RValue<Long> Long::operator=(const Reference<Long> &rhs)
3980         {
3981                 Value *value = rhs.loadValue();
3982                 storeValue(value);
3983
3984                 return RValue<Long>(value);
3985         }
3986
3987         RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs)
3988         {
3989                 return RValue<Long>(Nucleus::createAdd(lhs.value, rhs.value));
3990         }
3991
3992         RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs)
3993         {
3994                 return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
3995         }
3996
3997         RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
3998         {
3999                 return lhs = lhs + rhs;
4000         }
4001
4002         RValue<Long> operator-=(Long &lhs, RValue<Long> rhs)
4003         {
4004                 return lhs = lhs - rhs;
4005         }
4006
4007         RValue<Long> AddAtomic(RValue<Pointer<Long> > x, RValue<Long> y)
4008         {
4009                 return RValue<Long>(Nucleus::createAtomicAdd(x.value, y.value));
4010         }
4011
4012         Type *Long::getType()
4013         {
4014                 return T(llvm::Type::getInt64Ty(*::context));
4015         }
4016
4017         UInt::UInt(Argument<UInt> argument)
4018         {
4019                 storeValue(argument.value);
4020         }
4021
4022         UInt::UInt(RValue<UShort> cast)
4023         {
4024                 Value *integer = Nucleus::createZExt(cast.value, UInt::getType());
4025
4026                 storeValue(integer);
4027         }
4028
4029         UInt::UInt(RValue<Long> cast)
4030         {
4031                 Value *integer = Nucleus::createTrunc(cast.value, UInt::getType());
4032
4033                 storeValue(integer);
4034         }
4035
4036         UInt::UInt(RValue<Float> cast)
4037         {
4038                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
4039                 // Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
4040
4041                 // Smallest positive value representable in UInt, but not in Int
4042                 const unsigned int ustart = 0x80000000u;
4043                 const float ustartf = float(ustart);
4044
4045                 // If the value is negative, store 0, otherwise store the result of the conversion
4046                 storeValue((~(As<Int>(cast) >> 31) &
4047                 // Check if the value can be represented as an Int
4048                         IfThenElse(cast >= ustartf,
4049                 // If the value is too large, subtract ustart and re-add it after conversion.
4050                                 As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
4051                 // Otherwise, just convert normally
4052                                 Int(cast))).value);
4053         }
4054
4055         UInt::UInt()
4056         {
4057         }
4058
4059         UInt::UInt(int x)
4060         {
4061                 storeValue(Nucleus::createConstantInt(x));
4062         }
4063
4064         UInt::UInt(unsigned int x)
4065         {
4066                 storeValue(Nucleus::createConstantInt(x));
4067         }
4068
4069         UInt::UInt(RValue<UInt> rhs)
4070         {
4071                 storeValue(rhs.value);
4072         }
4073
4074         UInt::UInt(RValue<Int> rhs)
4075         {
4076                 storeValue(rhs.value);
4077         }
4078
4079         UInt::UInt(const UInt &rhs)
4080         {
4081                 Value *value = rhs.loadValue();
4082                 storeValue(value);
4083         }
4084
4085         UInt::UInt(const Reference<UInt> &rhs)
4086         {
4087                 Value *value = rhs.loadValue();
4088                 storeValue(value);
4089         }
4090
4091         UInt::UInt(const Int &rhs)
4092         {
4093                 Value *value = rhs.loadValue();
4094                 storeValue(value);
4095         }
4096
4097         UInt::UInt(const Reference<Int> &rhs)
4098         {
4099                 Value *value = rhs.loadValue();
4100                 storeValue(value);
4101         }
4102
4103         RValue<UInt> UInt::operator=(unsigned int rhs)
4104         {
4105                 return RValue<UInt>(storeValue(Nucleus::createConstantInt(rhs)));
4106         }
4107
4108         RValue<UInt> UInt::operator=(RValue<UInt> rhs)
4109         {
4110                 storeValue(rhs.value);
4111
4112                 return rhs;
4113         }
4114
4115         RValue<UInt> UInt::operator=(RValue<Int> rhs)
4116         {
4117                 storeValue(rhs.value);
4118
4119                 return RValue<UInt>(rhs);
4120         }
4121
4122         RValue<UInt> UInt::operator=(const UInt &rhs)
4123         {
4124                 Value *value = rhs.loadValue();
4125                 storeValue(value);
4126
4127                 return RValue<UInt>(value);
4128         }
4129
4130         RValue<UInt> UInt::operator=(const Reference<UInt> &rhs)
4131         {
4132                 Value *value = rhs.loadValue();
4133                 storeValue(value);
4134
4135                 return RValue<UInt>(value);
4136         }
4137
4138         RValue<UInt> UInt::operator=(const Int &rhs)
4139         {
4140                 Value *value = rhs.loadValue();
4141                 storeValue(value);
4142
4143                 return RValue<UInt>(value);
4144         }
4145
4146         RValue<UInt> UInt::operator=(const Reference<Int> &rhs)
4147         {
4148                 Value *value = rhs.loadValue();
4149                 storeValue(value);
4150
4151                 return RValue<UInt>(value);
4152         }
4153
4154         RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs)
4155         {
4156                 return RValue<UInt>(Nucleus::createAdd(lhs.value, rhs.value));
4157         }
4158
4159         RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs)
4160         {
4161                 return RValue<UInt>(Nucleus::createSub(lhs.value, rhs.value));
4162         }
4163
4164         RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs)
4165         {
4166                 return RValue<UInt>(Nucleus::createMul(lhs.value, rhs.value));
4167         }
4168
4169         RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs)
4170         {
4171                 return RValue<UInt>(Nucleus::createUDiv(lhs.value, rhs.value));
4172         }
4173
4174         RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs)
4175         {
4176                 return RValue<UInt>(Nucleus::createURem(lhs.value, rhs.value));
4177         }
4178
4179         RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs)
4180         {
4181                 return RValue<UInt>(Nucleus::createAnd(lhs.value, rhs.value));
4182         }
4183
4184         RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs)
4185         {
4186                 return RValue<UInt>(Nucleus::createOr(lhs.value, rhs.value));
4187         }
4188
4189         RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs)
4190         {
4191                 return RValue<UInt>(Nucleus::createXor(lhs.value, rhs.value));
4192         }
4193
4194         RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs)
4195         {
4196                 return RValue<UInt>(Nucleus::createShl(lhs.value, rhs.value));
4197         }
4198
4199         RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs)
4200         {
4201                 return RValue<UInt>(Nucleus::createLShr(lhs.value, rhs.value));
4202         }
4203
4204         RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs)
4205         {
4206                 return lhs = lhs + rhs;
4207         }
4208
4209         RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs)
4210         {
4211                 return lhs = lhs - rhs;
4212         }
4213
4214         RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs)
4215         {
4216                 return lhs = lhs * rhs;
4217         }
4218
4219         RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs)
4220         {
4221                 return lhs = lhs / rhs;
4222         }
4223
4224         RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs)
4225         {
4226                 return lhs = lhs % rhs;
4227         }
4228
4229         RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs)
4230         {
4231                 return lhs = lhs & rhs;
4232         }
4233
4234         RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs)
4235         {
4236                 return lhs = lhs | rhs;
4237         }
4238
4239         RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs)
4240         {
4241                 return lhs = lhs ^ rhs;
4242         }
4243
4244         RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs)
4245         {
4246                 return lhs = lhs << rhs;
4247         }
4248
4249         RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs)
4250         {
4251                 return lhs = lhs >> rhs;
4252         }
4253
4254         RValue<UInt> operator+(RValue<UInt> val)
4255         {
4256                 return val;
4257         }
4258
4259         RValue<UInt> operator-(RValue<UInt> val)
4260         {
4261                 return RValue<UInt>(Nucleus::createNeg(val.value));
4262         }
4263
4264         RValue<UInt> operator~(RValue<UInt> val)
4265         {
4266                 return RValue<UInt>(Nucleus::createNot(val.value));
4267         }
4268
4269         RValue<UInt> operator++(UInt &val, int)   // Post-increment
4270         {
4271                 RValue<UInt> res = val;
4272
4273                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantInt(1)));
4274                 val.storeValue(inc);
4275
4276                 return res;
4277         }
4278
4279         const UInt &operator++(UInt &val)   // Pre-increment
4280         {
4281                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantInt(1)));
4282                 val.storeValue(inc);
4283
4284                 return val;
4285         }
4286
4287         RValue<UInt> operator--(UInt &val, int)   // Post-decrement
4288         {
4289                 RValue<UInt> res = val;
4290
4291                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantInt(1)));
4292                 val.storeValue(inc);
4293
4294                 return res;
4295         }
4296
4297         const UInt &operator--(UInt &val)   // Pre-decrement
4298         {
4299                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantInt(1)));
4300                 val.storeValue(inc);
4301
4302                 return val;
4303         }
4304
4305         RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y)
4306         {
4307                 return IfThenElse(x > y, x, y);
4308         }
4309
4310         RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y)
4311         {
4312                 return IfThenElse(x < y, x, y);
4313         }
4314
4315         RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max)
4316         {
4317                 return Min(Max(x, min), max);
4318         }
4319
4320         RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs)
4321         {
4322                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
4323         }
4324
4325         RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs)
4326         {
4327                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
4328         }
4329
4330         RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs)
4331         {
4332                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
4333         }
4334
4335         RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs)
4336         {
4337                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
4338         }
4339
4340         RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs)
4341         {
4342                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
4343         }
4344
4345         RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs)
4346         {
4347                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
4348         }
4349
4350 //      RValue<UInt> RoundUInt(RValue<Float> cast)
4351 //      {
4352 //              return x86::cvtss2si(val);   // FIXME: Unsigned
4353 //
4354 //      //      return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
4355 //      }
4356
4357         Type *UInt::getType()
4358         {
4359                 return T(llvm::Type::getInt32Ty(*::context));
4360         }
4361
4362 //      Int2::Int2(RValue<Int> cast)
4363 //      {
4364 //              Value *extend = Nucleus::createZExt(cast.value, Long::getType());
4365 //              Value *vector = Nucleus::createBitCast(extend, Int2::getType());
4366 //
4367 //              int shuffle[2] = {0, 0};
4368 //              Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
4369 //
4370 //              storeValue(replicate);
4371 //      }
4372
4373         Int2::Int2(RValue<Int4> cast)
4374         {
4375                 Value *long2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
4376                 Value *element = Nucleus::createExtractElement(long2, Long::getType(), 0);
4377                 Value *int2 = Nucleus::createBitCast(element, Int2::getType());
4378
4379                 storeValue(int2);
4380         }
4381
4382         Int2::Int2()
4383         {
4384         //      xy.parent = this;
4385         }
4386
4387         Int2::Int2(int x, int y)
4388         {
4389         //      xy.parent = this;
4390
4391                 int64_t constantVector[2] = {x, y};
4392                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Int::getType(), 2))));
4393
4394                 storeValue(Nucleus::createBitCast(vector, getType()));
4395         }
4396
4397         Int2::Int2(RValue<Int2> rhs)
4398         {
4399         //      xy.parent = this;
4400
4401                 storeValue(rhs.value);
4402         }
4403
4404         Int2::Int2(const Int2 &rhs)
4405         {
4406         //      xy.parent = this;
4407
4408                 Value *value = rhs.loadValue();
4409                 storeValue(value);
4410         }
4411
4412         Int2::Int2(const Reference<Int2> &rhs)
4413         {
4414         //      xy.parent = this;
4415
4416                 Value *value = rhs.loadValue();
4417                 storeValue(value);
4418         }
4419
4420         Int2::Int2(RValue<Int> lo, RValue<Int> hi)
4421         {
4422                 if(CPUID::supportsMMX2())
4423                 {
4424                         // movd mm0, lo
4425                         // movd mm1, hi
4426                         // punpckldq mm0, mm1
4427
4428                         Value *loLong = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), lo.value, 0);
4429                         loLong = Nucleus::createInsertElement(loLong, V(ConstantInt::get(Int::getType(), 0)), 1);
4430                         Value *hiLong = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), hi.value, 0);
4431                         hiLong = Nucleus::createInsertElement(hiLong, V(ConstantInt::get(Int::getType(), 0)), 1);
4432
4433                         storeValue(As<Int2>(UnpackLow(As<Int2>(loLong), As<Int2>(hiLong))).value);
4434                 }
4435                 else
4436                 {
4437                         int shuffle[2] = {0, 1};
4438                         Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, T(VectorType::get(Int::getType(), 1))), Nucleus::createBitCast(hi.value, T(VectorType::get(Int::getType(), 1))), shuffle);
4439
4440                         storeValue(Nucleus::createBitCast(packed, Int2::getType()));
4441                 }
4442         }
4443
4444         RValue<Int2> Int2::operator=(RValue<Int2> rhs)
4445         {
4446                 storeValue(rhs.value);
4447
4448                 return rhs;
4449         }
4450
4451         RValue<Int2> Int2::operator=(const Int2 &rhs)
4452         {
4453                 Value *value = rhs.loadValue();
4454                 storeValue(value);
4455
4456                 return RValue<Int2>(value);
4457         }
4458
4459         RValue<Int2> Int2::operator=(const Reference<Int2> &rhs)
4460         {
4461                 Value *value = rhs.loadValue();
4462                 storeValue(value);
4463
4464                 return RValue<Int2>(value);
4465         }
4466
4467         RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs)
4468         {
4469                 if(CPUID::supportsMMX2())
4470                 {
4471                         return x86::paddd(lhs, rhs);
4472                 }
4473                 else
4474                 {
4475                         return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
4476                 }
4477         }
4478
4479         RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs)
4480         {
4481                 if(CPUID::supportsMMX2())
4482                 {
4483                         return x86::psubd(lhs, rhs);
4484                 }
4485                 else
4486                 {
4487                         return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
4488                 }
4489         }
4490
4491 //      RValue<Int2> operator*(RValue<Int2> lhs, RValue<Int2> rhs)
4492 //      {
4493 //              return RValue<Int2>(Nucleus::createMul(lhs.value, rhs.value));
4494 //      }
4495
4496 //      RValue<Int2> operator/(RValue<Int2> lhs, RValue<Int2> rhs)
4497 //      {
4498 //              return RValue<Int2>(Nucleus::createSDiv(lhs.value, rhs.value));
4499 //      }
4500
4501 //      RValue<Int2> operator%(RValue<Int2> lhs, RValue<Int2> rhs)
4502 //      {
4503 //              return RValue<Int2>(Nucleus::createSRem(lhs.value, rhs.value));
4504 //      }
4505
4506         RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs)
4507         {
4508                 if(CPUID::supportsMMX2())
4509                 {
4510                         return As<Int2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
4511                 }
4512                 else
4513                 {
4514                         return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
4515                 }
4516         }
4517
4518         RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs)
4519         {
4520                 if(CPUID::supportsMMX2())
4521                 {
4522                         return As<Int2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
4523                 }
4524                 else
4525                 {
4526                         return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
4527                 }
4528         }
4529
4530         RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs)
4531         {
4532                 if(CPUID::supportsMMX2())
4533                 {
4534                         return As<Int2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
4535                 }
4536                 else
4537                 {
4538                         return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
4539                 }
4540         }
4541
4542         RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
4543         {
4544         //      return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
4545
4546                 return x86::pslld(lhs, rhs);
4547         }
4548
4549         RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
4550         {
4551         //      return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
4552
4553                 return x86::psrad(lhs, rhs);
4554         }
4555
4556         RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs)
4557         {
4558                 return lhs = lhs + rhs;
4559         }
4560
4561         RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs)
4562         {
4563                 return lhs = lhs - rhs;
4564         }
4565
4566 //      RValue<Int2> operator*=(Int2 &lhs, RValue<Int2> rhs)
4567 //      {
4568 //              return lhs = lhs * rhs;
4569 //      }
4570
4571 //      RValue<Int2> operator/=(Int2 &lhs, RValue<Int2> rhs)
4572 //      {
4573 //              return lhs = lhs / rhs;
4574 //      }
4575
4576 //      RValue<Int2> operator%=(Int2 &lhs, RValue<Int2> rhs)
4577 //      {
4578 //              return lhs = lhs % rhs;
4579 //      }
4580
4581         RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs)
4582         {
4583                 return lhs = lhs & rhs;
4584         }
4585
4586         RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs)
4587         {
4588                 return lhs = lhs | rhs;
4589         }
4590
4591         RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs)
4592         {
4593                 return lhs = lhs ^ rhs;
4594         }
4595
4596         RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs)
4597         {
4598                 return lhs = lhs << rhs;
4599         }
4600
4601         RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs)
4602         {
4603                 return lhs = lhs >> rhs;
4604         }
4605
4606 //      RValue<Int2> operator+(RValue<Int2> val)
4607 //      {
4608 //              return val;
4609 //      }
4610
4611 //      RValue<Int2> operator-(RValue<Int2> val)
4612 //      {
4613 //              return RValue<Int2>(Nucleus::createNeg(val.value));
4614 //      }
4615
4616         RValue<Int2> operator~(RValue<Int2> val)
4617         {
4618                 if(CPUID::supportsMMX2())
4619                 {
4620                         return val ^ Int2(0xFFFFFFFF, 0xFFFFFFFF);
4621                 }
4622                 else
4623                 {
4624                         return RValue<Int2>(Nucleus::createNot(val.value));
4625                 }
4626         }
4627
4628         RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y)
4629         {
4630                 if(CPUID::supportsMMX2())
4631                 {
4632                         return x86::punpckldq(x, y);
4633                 }
4634                 else
4635                 {
4636                         int shuffle[2] = {0, 2};
4637                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
4638
4639                         return As<Short4>(packed);
4640                 }
4641         }
4642
4643         RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
4644         {
4645                 if(CPUID::supportsMMX2())
4646                 {
4647                         return x86::punpckhdq(x, y);
4648                 }
4649                 else
4650                 {
4651                         int shuffle[2] = {1, 3};
4652                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
4653
4654                         return As<Short4>(packed);
4655                 }
4656         }
4657
4658         RValue<Int> Extract(RValue<Int2> val, int i)
4659         {
4660                 if(false)   // FIXME: LLVM does not generate optimal code
4661                 {
4662                         return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
4663                 }
4664                 else
4665                 {
4666                         if(i == 0)
4667                         {
4668                                 return RValue<Int>(Nucleus::createExtractElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), Int::getType(), 0));
4669                         }
4670                         else
4671                         {
4672                                 Int2 val2 = As<Int2>(UnpackHigh(val, val));
4673
4674                                 return Extract(val2, 0);
4675                         }
4676                 }
4677         }
4678
4679         RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i)
4680         {
4681                 return RValue<Int2>(Nucleus::createBitCast(Nucleus::createInsertElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), element.value, i), Int2::getType()));
4682         }
4683
4684         Type *Int2::getType()
4685         {
4686                 if(CPUID::supportsMMX2())
4687                 {
4688                         return MMX::getType();
4689                 }
4690                 else
4691                 {
4692                         return T(VectorType::get(Int::getType(), 2));
4693                 }
4694         }
4695
        // Default constructor. The body does nothing, so no initial value is stored;
        // the only statement is a disabled parent-pointer assignment.
        UInt2::UInt2()
        {
        //      xy.parent = this;
        }
4700
4701         UInt2::UInt2(unsigned int x, unsigned int y)
4702         {
4703         //      xy.parent = this;
4704
4705                 int64_t constantVector[2] = {x, y};
4706                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UInt::getType(), 2))));
4707
4708                 storeValue(Nucleus::createBitCast(vector, getType()));
4709         }
4710
        // Constructs a UInt2 by storing the value carried by rhs.
        UInt2::UInt2(RValue<UInt2> rhs)
        {
        //      xy.parent = this;

                storeValue(rhs.value);
        }
4717
4718         UInt2::UInt2(const UInt2 &rhs)
4719         {
4720         //      xy.parent = this;
4721
4722                 Value *value = rhs.loadValue();
4723                 storeValue(value);
4724         }
4725
4726         UInt2::UInt2(const Reference<UInt2> &rhs)
4727         {
4728         //      xy.parent = this;
4729
4730                 Value *value = rhs.loadValue();
4731                 storeValue(value);
4732         }
4733
        // Assigns the value carried by rhs to this variable and returns rhs unchanged.
        RValue<UInt2> UInt2::operator=(RValue<UInt2> rhs)
        {
                storeValue(rhs.value);

                return rhs;
        }
4740
4741         RValue<UInt2> UInt2::operator=(const UInt2 &rhs)
4742         {
4743                 Value *value = rhs.loadValue();
4744                 storeValue(value);
4745
4746                 return RValue<UInt2>(value);
4747         }
4748
4749         RValue<UInt2> UInt2::operator=(const Reference<UInt2> &rhs)
4750         {
4751                 Value *value = rhs.loadValue();
4752                 storeValue(value);
4753
4754                 return RValue<UInt2>(value);
4755         }
4756
4757         RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs)
4758         {
4759                 if(CPUID::supportsMMX2())
4760                 {
4761                         return As<UInt2>(x86::paddd(As<Int2>(lhs), As<Int2>(rhs)));
4762                 }
4763                 else
4764                 {
4765                         return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
4766                 }
4767         }
4768
4769         RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs)
4770         {
4771                 if(CPUID::supportsMMX2())
4772                 {
4773                         return As<UInt2>(x86::psubd(As<Int2>(lhs), As<Int2>(rhs)));
4774                 }
4775                 else
4776                 {
4777                         return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
4778                 }
4779         }
4780
4781 //      RValue<UInt2> operator*(RValue<UInt2> lhs, RValue<UInt2> rhs)
4782 //      {
4783 //              return RValue<UInt2>(Nucleus::createMul(lhs.value, rhs.value));
4784 //      }
4785
4786 //      RValue<UInt2> operator/(RValue<UInt2> lhs, RValue<UInt2> rhs)
4787 //      {
4788 //              return RValue<UInt2>(Nucleus::createUDiv(lhs.value, rhs.value));
4789 //      }
4790
4791 //      RValue<UInt2> operator%(RValue<UInt2> lhs, RValue<UInt2> rhs)
4792 //      {
4793 //              return RValue<UInt2>(Nucleus::createURem(lhs.value, rhs.value));
4794 //      }
4795
4796         RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs)
4797         {
4798                 if(CPUID::supportsMMX2())
4799                 {
4800                         return As<UInt2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
4801                 }
4802                 else
4803                 {
4804                         return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
4805                 }
4806         }
4807
4808         RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs)
4809         {
4810                 if(CPUID::supportsMMX2())
4811                 {
4812                         return As<UInt2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
4813                 }
4814                 else
4815                 {
4816                         return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
4817                 }
4818         }
4819
4820         RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs)
4821         {
4822                 if(CPUID::supportsMMX2())
4823                 {
4824                         return As<UInt2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
4825                 }
4826                 else
4827                 {
4828                         return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
4829                 }
4830         }
4831
        // Left shift of a UInt2 by the immediate count rhs. The operand is
        // reinterpreted as Int2 so the shared x86::pslld wrapper can be used, and
        // the result is reinterpreted back. The generic createShl form is kept
        // below, disabled.
        RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
        {
        //      return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));

                return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
        }
4838
        // Right shift of a UInt2 by the immediate count rhs, always emitted through
        // x86::psrld. The generic (logical, createLShr) form is kept below, disabled.
        RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
        {
        //      return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));

                return x86::psrld(lhs, rhs);
        }
4845
4846         RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs)
4847         {
4848                 return lhs = lhs + rhs;
4849         }
4850
4851         RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs)
4852         {
4853                 return lhs = lhs - rhs;
4854         }
4855
4856 //      RValue<UInt2> operator*=(UInt2 &lhs, RValue<UInt2> rhs)
4857 //      {
4858 //              return lhs = lhs * rhs;
4859 //      }
4860
4861 //      RValue<UInt2> operator/=(UInt2 &lhs, RValue<UInt2> rhs)
4862 //      {
4863 //              return lhs = lhs / rhs;
4864 //      }
4865
4866 //      RValue<UInt2> operator%=(UInt2 &lhs, RValue<UInt2> rhs)
4867 //      {
4868 //              return lhs = lhs % rhs;
4869 //      }
4870
4871         RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs)
4872         {
4873                 return lhs = lhs & rhs;
4874         }
4875
4876         RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs)
4877         {
4878                 return lhs = lhs | rhs;
4879         }
4880
4881         RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs)
4882         {
4883                 return lhs = lhs ^ rhs;
4884         }
4885
4886         RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs)
4887         {
4888                 return lhs = lhs << rhs;
4889         }
4890
4891         RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs)
4892         {
4893                 return lhs = lhs >> rhs;
4894         }
4895
4896 //      RValue<UInt2> operator+(RValue<UInt2> val)
4897 //      {
4898 //              return val;
4899 //      }
4900
4901 //      RValue<UInt2> operator-(RValue<UInt2> val)
4902 //      {
4903 //              return RValue<UInt2>(Nucleus::createNeg(val.value));
4904 //      }
4905
4906         RValue<UInt2> operator~(RValue<UInt2> val)
4907         {
4908                 if(CPUID::supportsMMX2())
4909                 {
4910                         return val ^ UInt2(0xFFFFFFFF, 0xFFFFFFFF);
4911                 }
4912                 else
4913                 {
4914                         return RValue<UInt2>(Nucleus::createNot(val.value));
4915                 }
4916         }
4917
4918         Type *UInt2::getType()
4919         {
4920                 if(CPUID::supportsMMX2())
4921                 {
4922                         return MMX::getType();
4923                 }
4924                 else
4925                 {
4926                         return T(VectorType::get(UInt::getType(), 2));
4927                 }
4928         }
4929
4930         Int4::Int4(RValue<Byte4> cast)
4931         {
4932                 Value *x = Nucleus::createBitCast(cast.value, Int::getType());
4933                 Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
4934
4935                 Value *e;
4936
4937                 if (CPUID::supportsSSE4_1())
4938                 {
4939                         e = x86::pmovzxbd(RValue<Int4>(a)).value;
4940                 }
4941                 else
4942                 {
4943                         int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
4944                         Value *b = Nucleus::createBitCast(a, Byte16::getType());
4945                         Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
4946
4947                         int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4948                         Value *d = Nucleus::createBitCast(c, Short8::getType());
4949                         e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
4950                 }
4951
4952                 Value *f = Nucleus::createBitCast(e, Int4::getType());
4953                 storeValue(f);
4954         }
4955
4956         Int4::Int4(RValue<SByte4> cast)
4957         {
4958                 Value *x = Nucleus::createBitCast(cast.value, Int::getType());
4959                 Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
4960
4961                 Value *g;
4962
4963                 if (CPUID::supportsSSE4_1())
4964                 {
4965                         g = x86::pmovsxbd(RValue<Int4>(a)).value;
4966                 }
4967                 else
4968                 {
4969                         int     swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
4970                         Value *b = Nucleus::createBitCast(a, Byte16::getType());
4971                         Value *c = Nucleus::createShuffleVector(b, b, swizzle);
4972
4973                         int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
4974                         Value *d = Nucleus::createBitCast(c, Short8::getType());
4975                         Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
4976
4977                         Value *f = Nucleus::createBitCast(e, Int4::getType());
4978                         //      g = Nucleus::createAShr(f, Nucleus::createConstantInt(24));
4979                         g = x86::psrad(RValue<Int4>(f), 24).value;
4980                 }
4981
4982                 storeValue(g);
4983         }
4984
4985         Int4::Int4(RValue<Float4> cast)
4986         {
4987         //      xyzw.parent = this;
4988
4989                 Value *xyzw = Nucleus::createFPToSI(cast.value, Int4::getType());
4990
4991                 storeValue(xyzw);
4992         }
4993
4994         Int4::Int4(RValue<Short4> cast)
4995         {
4996                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
4997                 Value *element = Nucleus::createBitCast(cast.value, Long::getType());
4998                 long2 = Nucleus::createInsertElement(long2, element, 0);
4999                 RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
5000
5001                 if(CPUID::supportsSSE4_1())
5002                 {
5003                         storeValue(x86::pmovsxwd(vector).value);
5004                 }
5005                 else
5006                 {
5007                         Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
5008
5009                         int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
5010                         Value *c = Nucleus::createShuffleVector(b, b, swizzle);
5011                         Value *d = Nucleus::createBitCast(c, Int4::getType());
5012                         storeValue(d);
5013
5014                         // Each Short is packed into each Int in the (Short | Short) format.
5015                         // Shifting by 16 will retrieve the original Short value.
5016                         // Shitfing an Int will propagate the sign bit, which will work
5017                         // for both positive and negative values of a Short.
5018                         *this >>= 16;
5019                 }
5020         }
5021
5022         Int4::Int4(RValue<UShort4> cast)
5023         {
5024                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5025                 Value *element = Nucleus::createBitCast(cast.value, Long::getType());
5026                 long2 = Nucleus::createInsertElement(long2, element, 0);
5027                 RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
5028
5029                 if(CPUID::supportsSSE4_1())
5030                 {
5031                         storeValue(x86::pmovzxwd(RValue<Int4>(vector)).value);
5032                 }
5033                 else
5034                 {
5035                         Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
5036
5037                         int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5038                         Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Short8::getType())), swizzle);
5039                         Value *d = Nucleus::createBitCast(c, Int4::getType());
5040                         storeValue(d);
5041                 }
5042         }
5043
        // Default constructor. The body does nothing, so no initial value is stored;
        // the only statement is a disabled parent-pointer assignment.
        Int4::Int4()
        {
        //      xyzw.parent = this;
        }
5048
        // Constructs the constant vector (xyzw, xyzw, xyzw, xyzw).
        Int4::Int4(int xyzw)
        {
                constant(xyzw, xyzw, xyzw, xyzw);
        }
5053
        // Constructs the constant vector (x, yzw, yzw, yzw).
        Int4::Int4(int x, int yzw)
        {
                constant(x, yzw, yzw, yzw);
        }
5058
        // Constructs the constant vector (x, y, zw, zw).
        Int4::Int4(int x, int y, int zw)
        {
                constant(x, y, zw, zw);
        }
5063
        // Constructs the constant vector (x, y, z, w).
        Int4::Int4(int x, int y, int z, int w)
        {
                constant(x, y, z, w);
        }
5068
5069         void Int4::constant(int x, int y, int z, int w)
5070         {
5071         //      xyzw.parent = this;
5072
5073                 int64_t constantVector[4] = {x, y, z, w};
5074                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
5075         }
5076
        // Constructs an Int4 by storing the value carried by rhs.
        Int4::Int4(RValue<Int4> rhs)
        {
        //      xyzw.parent = this;

                storeValue(rhs.value);
        }
5083
5084         Int4::Int4(const Int4 &rhs)
5085         {
5086         //      xyzw.parent = this;
5087
5088                 Value *value = rhs.loadValue();
5089                 storeValue(value);
5090         }
5091
5092         Int4::Int4(const Reference<Int4> &rhs)
5093         {
5094         //      xyzw.parent = this;
5095
5096                 Value *value = rhs.loadValue();
5097                 storeValue(value);
5098         }
5099
        // Constructs an Int4 from a UInt4 value. The value is stored as-is; no
        // conversion is emitted here.
        Int4::Int4(RValue<UInt4> rhs)
        {
        //      xyzw.parent = this;

                storeValue(rhs.value);
        }
5106
5107         Int4::Int4(const UInt4 &rhs)
5108         {
5109         //      xyzw.parent = this;
5110
5111                 Value *value = rhs.loadValue();
5112                 storeValue(value);
5113         }
5114
5115         Int4::Int4(const Reference<UInt4> &rhs)
5116         {
5117         //      xyzw.parent = this;
5118
5119                 Value *value = rhs.loadValue();
5120                 storeValue(value);
5121         }
5122
5123         Int4::Int4(RValue<Int2> lo, RValue<Int2> hi)
5124         {
5125         //      xyzw.parent = this;
5126
5127                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
5128                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
5129
5130                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5131                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
5132                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
5133                 Value *int4 = Nucleus::createBitCast(long2, Int4::getType());
5134
5135                 storeValue(int4);
5136         }
5137
5138         Int4::Int4(RValue<Int> rhs)
5139         {
5140         //      xyzw.parent = this;
5141
5142                 Value *vector = loadValue();
5143                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
5144
5145                 int swizzle[4] = {0, 0, 0, 0};
5146                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
5147
5148                 storeValue(replicate);
5149         }
5150
        // Constructs an Int4 from a scalar Int variable by loading it and assigning
        // the loaded value, wrapped as RValue<Int>, to *this.
        // NOTE(review): the assignment presumably goes through the
        // Int4(RValue<Int>) constructor, replicating the scalar - confirm
        // against the class declaration.
        Int4::Int4(const Int &rhs)
        {
        //      xyzw.parent = this;

                *this = RValue<Int>(rhs.loadValue());
        }
5157
        // Constructs an Int4 from a reference to a scalar Int by loading it and
        // assigning the loaded value, wrapped as RValue<Int>, to *this.
        // NOTE(review): the assignment presumably goes through the
        // Int4(RValue<Int>) constructor, replicating the scalar - confirm
        // against the class declaration.
        Int4::Int4(const Reference<Int> &rhs)
        {
        //      xyzw.parent = this;

                *this = RValue<Int>(rhs.loadValue());
        }
5164
        // Assigns the value carried by rhs to this variable and returns rhs unchanged.
        RValue<Int4> Int4::operator=(RValue<Int4> rhs)
        {
                storeValue(rhs.value);

                return rhs;
        }
5171
5172         RValue<Int4> Int4::operator=(const Int4 &rhs)
5173         {
5174                 Value *value = rhs.loadValue();
5175                 storeValue(value);
5176
5177                 return RValue<Int4>(value);
5178         }
5179
5180         RValue<Int4> Int4::operator=(const Reference<Int4> &rhs)
5181         {
5182                 Value *value = rhs.loadValue();
5183                 storeValue(value);
5184
5185                 return RValue<Int4>(value);
5186         }
5187
5188         RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs)
5189         {
5190                 return RValue<Int4>(Nucleus::createAdd(lhs.value, rhs.value));
5191         }
5192
5193         RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs)
5194         {
5195                 return RValue<Int4>(Nucleus::createSub(lhs.value, rhs.value));
5196         }
5197
5198         RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs)
5199         {
5200                 return RValue<Int4>(Nucleus::createMul(lhs.value, rhs.value));
5201         }
5202
5203         RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs)
5204         {
5205                 return RValue<Int4>(Nucleus::createSDiv(lhs.value, rhs.value));
5206         }
5207
5208         RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs)
5209         {
5210                 return RValue<Int4>(Nucleus::createSRem(lhs.value, rhs.value));
5211         }
5212
5213         RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs)
5214         {
5215                 return RValue<Int4>(Nucleus::createAnd(lhs.value, rhs.value));
5216         }
5217
5218         RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs)
5219         {
5220                 return RValue<Int4>(Nucleus::createOr(lhs.value, rhs.value));
5221         }
5222
5223         RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs)
5224         {
5225                 return RValue<Int4>(Nucleus::createXor(lhs.value, rhs.value));
5226         }
5227
        // Left shift of an Int4 by the immediate count rhs, emitted through x86::pslld.
        RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
        {
                return x86::pslld(lhs, rhs);
        }
5232
        // Right shift of an Int4 by the immediate count rhs, emitted through x86::psrad.
        RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
        {
                return x86::psrad(lhs, rhs);
        }
5237
5238         RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
5239         {
5240                 return RValue<Int4>(Nucleus::createShl(lhs.value, rhs.value));
5241         }
5242
5243         RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs)
5244         {
5245                 return RValue<Int4>(Nucleus::createAShr(lhs.value, rhs.value));
5246         }
5247
5248         RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs)
5249         {
5250                 return lhs = lhs + rhs;
5251         }
5252
5253         RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs)
5254         {
5255                 return lhs = lhs - rhs;
5256         }
5257
5258         RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs)
5259         {
5260                 return lhs = lhs * rhs;
5261         }
5262
5263 //      RValue<Int4> operator/=(Int4 &lhs, RValue<Int4> rhs)
5264 //      {
5265 //              return lhs = lhs / rhs;
5266 //      }
5267
5268 //      RValue<Int4> operator%=(Int4 &lhs, RValue<Int4> rhs)
5269 //      {
5270 //              return lhs = lhs % rhs;
5271 //      }
5272
5273         RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs)
5274         {
5275                 return lhs = lhs & rhs;
5276         }
5277
5278         RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs)
5279         {
5280                 return lhs = lhs | rhs;
5281         }
5282
5283         RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs)
5284         {
5285                 return lhs = lhs ^ rhs;
5286         }
5287
5288         RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs)
5289         {
5290                 return lhs = lhs << rhs;
5291         }
5292
5293         RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs)
5294         {
5295                 return lhs = lhs >> rhs;
5296         }
5297
        // Unary plus: returns val unchanged; nothing is emitted.
        RValue<Int4> operator+(RValue<Int4> val)
        {
                return val;
        }
5302
5303         RValue<Int4> operator-(RValue<Int4> val)
5304         {
5305                 return RValue<Int4>(Nucleus::createNeg(val.value));
5306         }
5307
5308         RValue<Int4> operator~(RValue<Int4> val)
5309         {
5310                 return RValue<Int4>(Nucleus::createNot(val.value));
5311         }
5312
        // Comparison x == y on Int4 values, returned as an Int4 mask. Because of
        // the LLVM issue described below, the result is built as the inverse of
        // the sign-extended "not equal" comparison (XOR with an all-ones constant).
        RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
        {
                // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
                //        Restore the following line when LLVM is updated to a version where this issue is fixed.
                // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
                return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
        }
5320
5321         RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
5322         {
5323                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
5324         }
5325
        // Signed comparison x <= y on Int4 values, returned as an Int4 mask.
        // Because of the LLVM issue described below, the result is built as the
        // inverse of the sign-extended "greater than" comparison (XOR with an
        // all-ones constant).
        RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
        {
                // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
                //        Restore the following line when LLVM is updated to a version where this issue is fixed.
                // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
                return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
        }
5333
5334         RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
5335         {
5336                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
5337         }
5338
        // Signed comparison "not less than" (x >= y) on Int4 values, returned as an
        // Int4 mask. Because of the LLVM issue described below, the result is
        // built as the inverse of the sign-extended "less than" comparison (XOR
        // with an all-ones constant).
        RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
        {
                // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
                //        Restore the following line when LLVM is updated to a version where this issue is fixed.
                // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
                return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
        }
5346
5347         RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
5348         {
5349                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
5350         }
5351
5352         RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
5353         {
5354                 if(CPUID::supportsSSE4_1())
5355                 {
5356                         return x86::pmaxsd(x, y);
5357                 }
5358                 else
5359                 {
5360                         RValue<Int4> greater = CmpNLE(x, y);
5361                         return x & greater | y & ~greater;
5362                 }
5363         }
5364
5365         RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
5366         {
5367                 if(CPUID::supportsSSE4_1())
5368                 {
5369                         return x86::pminsd(x, y);
5370                 }
5371                 else
5372                 {
5373                         RValue<Int4> less = CmpLT(x, y);
5374                         return x & less | y & ~less;
5375                 }
5376         }
5377
        // Converts a Float4 to an Int4 through the x86::cvtps2dq wrapper.
        // NOTE(review): the rounding behaviour is whatever cvtps2dq provides
        // (presumably the current SSE rounding mode) - confirm in x86.hpp.
        RValue<Int4> RoundInt(RValue<Float4> cast)
        {
                return x86::cvtps2dq(cast);
        }
5382
        // Packs two Int4 values into one Short8 through the x86::packssdw wrapper.
        // NOTE(review): packssdw presumably saturates to the signed 16-bit range -
        // confirm in x86.hpp.
        RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
        {
                return x86::packssdw(x, y);
        }
5387
5388         RValue<Int> Extract(RValue<Int4> x, int i)
5389         {
5390                 return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
5391         }
5392
5393         RValue<Int4> Insert(RValue<Int4> x, RValue<Int> element, int i)
5394         {
5395                 return RValue<Int4>(Nucleus::createInsertElement(x.value, element.value, i));
5396         }
5397
        // Returns an Int produced by x86::movmskps applied to x reinterpreted as Float4.
        // NOTE(review): presumably one bit per element, taken from each sign bit -
        // confirm in x86.hpp.
        RValue<Int> SignMask(RValue<Int4> x)
        {
                return x86::movmskps(As<Float4>(x));
        }
5402
        // Rearranges the elements of x as described by select; the work is done by
        // createSwizzle4, which defines how select is encoded.
        RValue<Int4> Swizzle(RValue<Int4> x, unsigned char select)
        {
                return RValue<Int4>(createSwizzle4(x.value, select));
        }
5407
        // Returns the backend type used for Int4: a four-element vector of Int.
        Type *Int4::getType()
        {
                return T(VectorType::get(Int::getType(), 4));
        }
5412
        // Constructs a UInt4 from a Float4 using only signed float-to-int
        // conversions. Inputs at or above 2^31 are converted after subtracting
        // 2^31, which is added back afterwards; inputs with the sign bit set
        // produce 0.
        UInt4::UInt4(RValue<Float4> cast)
        {
        //      xyzw.parent = this;

                // Note: createFPToUI is broken, must perform conversion using createFPtoSI
                // Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());

                // Smallest positive value representable in UInt, but not in Int
                const unsigned int ustart = 0x80000000u;
                const float ustartf = float(ustart);

                // Mask of the elements that are NOT representable as an Int, i.e. where cast >= ustartf
                // (presumably all-ones per element where the comparison holds - see CmpNLT for Float4)
                Int4 uiValue = CmpNLT(cast, Float4(ustartf));
                // If the value is too large, subtract ustart and re-add it after conversion.
                uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
                // Otherwise, just convert normally
                          (~uiValue & Int4(cast));
                // If the value is negative, store 0, otherwise store the result of the conversion.
                // (As<Int4>(cast) >> 31) spreads the float's sign bit over the whole element.
                storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
        }
5433
        // Default constructor. The body does nothing, so no initial value is stored;
        // the only statement is a disabled parent-pointer assignment.
        UInt4::UInt4()
        {
        //      xyzw.parent = this;
        }
5438
        // Constructs the constant vector (xyzw, xyzw, xyzw, xyzw).
        UInt4::UInt4(int xyzw)
        {
                constant(xyzw, xyzw, xyzw, xyzw);
        }
5443
        // Constructs the constant vector (x, yzw, yzw, yzw).
        UInt4::UInt4(int x, int yzw)
        {
                constant(x, yzw, yzw, yzw);
        }
5448
        // Constructs the constant vector (x, y, zw, zw).
        UInt4::UInt4(int x, int y, int zw)
        {
                constant(x, y, zw, zw);
        }
5453
        // Constructs the constant vector (x, y, z, w).
        UInt4::UInt4(int x, int y, int z, int w)
        {
                constant(x, y, z, w);
        }
5458
5459         void UInt4::constant(int x, int y, int z, int w)
5460         {
5461         //      xyzw.parent = this;
5462
5463                 int64_t constantVector[4] = {x, y, z, w};
5464                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
5465         }
5466
5467         UInt4::UInt4(RValue<UInt4> rhs)
5468         {
5469         //      xyzw.parent = this;
5470
5471                 storeValue(rhs.value);
5472         }
5473
5474         UInt4::UInt4(const UInt4 &rhs)
5475         {
5476         //      xyzw.parent = this;
5477
5478                 Value *value = rhs.loadValue();
5479                 storeValue(value);
5480         }
5481
5482         UInt4::UInt4(const Reference<UInt4> &rhs)
5483         {
5484         //      xyzw.parent = this;
5485
5486                 Value *value = rhs.loadValue();
5487                 storeValue(value);
5488         }
5489
5490         UInt4::UInt4(RValue<Int4> rhs)
5491         {
5492         //      xyzw.parent = this;
5493
5494                 storeValue(rhs.value);
5495         }
5496
5497         UInt4::UInt4(const Int4 &rhs)
5498         {
5499         //      xyzw.parent = this;
5500
5501                 Value *value = rhs.loadValue();
5502                 storeValue(value);
5503         }
5504
5505         UInt4::UInt4(const Reference<Int4> &rhs)
5506         {
5507         //      xyzw.parent = this;
5508
5509                 Value *value = rhs.loadValue();
5510                 storeValue(value);
5511         }
5512
5513         UInt4::UInt4(RValue<UInt2> lo, RValue<UInt2> hi)
5514         {
5515                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
5516                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
5517
5518                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5519                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
5520                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
5521                 Value *uint4 = Nucleus::createBitCast(long2, Int4::getType());
5522
5523                 storeValue(uint4);
5524         }
5525
5526         RValue<UInt4> UInt4::operator=(RValue<UInt4> rhs)
5527         {
5528                 storeValue(rhs.value);
5529
5530                 return rhs;
5531         }
5532
5533         RValue<UInt4> UInt4::operator=(const UInt4 &rhs)
5534         {
5535                 Value *value = rhs.loadValue();
5536                 storeValue(value);
5537
5538                 return RValue<UInt4>(value);
5539         }
5540
5541         RValue<UInt4> UInt4::operator=(const Reference<UInt4> &rhs)
5542         {
5543                 Value *value = rhs.loadValue();
5544                 storeValue(value);
5545
5546                 return RValue<UInt4>(value);
5547         }
5548
5549         RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs)
5550         {
5551                 return RValue<UInt4>(Nucleus::createAdd(lhs.value, rhs.value));
5552         }
5553
5554         RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs)
5555         {
5556                 return RValue<UInt4>(Nucleus::createSub(lhs.value, rhs.value));
5557         }
5558
5559         RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs)
5560         {
5561                 return RValue<UInt4>(Nucleus::createMul(lhs.value, rhs.value));
5562         }
5563
5564         RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs)
5565         {
5566                 return RValue<UInt4>(Nucleus::createUDiv(lhs.value, rhs.value));
5567         }
5568
5569         RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs)
5570         {
5571                 return RValue<UInt4>(Nucleus::createURem(lhs.value, rhs.value));
5572         }
5573
5574         RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs)
5575         {
5576                 return RValue<UInt4>(Nucleus::createAnd(lhs.value, rhs.value));
5577         }
5578
5579         RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs)
5580         {
5581                 return RValue<UInt4>(Nucleus::createOr(lhs.value, rhs.value));
5582         }
5583
5584         RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs)
5585         {
5586                 return RValue<UInt4>(Nucleus::createXor(lhs.value, rhs.value));
5587         }
5588
5589         RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
5590         {
5591                 return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
5592         }
5593
5594         RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
5595         {
5596                 return x86::psrld(lhs, rhs);
5597         }
5598
5599         RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
5600         {
5601                 return RValue<UInt4>(Nucleus::createShl(lhs.value, rhs.value));
5602         }
5603
5604         RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs)
5605         {
5606                 return RValue<UInt4>(Nucleus::createLShr(lhs.value, rhs.value));
5607         }
5608
5609         RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs)
5610         {
5611                 return lhs = lhs + rhs;
5612         }
5613
5614         RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs)
5615         {
5616                 return lhs = lhs - rhs;
5617         }
5618
5619         RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs)
5620         {
5621                 return lhs = lhs * rhs;
5622         }
5623
5624 //      RValue<UInt4> operator/=(UInt4 &lhs, RValue<UInt4> rhs)
5625 //      {
5626 //              return lhs = lhs / rhs;
5627 //      }
5628
5629 //      RValue<UInt4> operator%=(UInt4 &lhs, RValue<UInt4> rhs)
5630 //      {
5631 //              return lhs = lhs % rhs;
5632 //      }
5633
5634         RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs)
5635         {
5636                 return lhs = lhs & rhs;
5637         }
5638
5639         RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs)
5640         {
5641                 return lhs = lhs | rhs;
5642         }
5643
5644         RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs)
5645         {
5646                 return lhs = lhs ^ rhs;
5647         }
5648
5649         RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs)
5650         {
5651                 return lhs = lhs << rhs;
5652         }
5653
5654         RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs)
5655         {
5656                 return lhs = lhs >> rhs;
5657         }
5658
5659         RValue<UInt4> operator+(RValue<UInt4> val)
5660         {
5661                 return val;
5662         }
5663
5664         RValue<UInt4> operator-(RValue<UInt4> val)
5665         {
5666                 return RValue<UInt4>(Nucleus::createNeg(val.value));
5667         }
5668
5669         RValue<UInt4> operator~(RValue<UInt4> val)
5670         {
5671                 return RValue<UInt4>(Nucleus::createNot(val.value));
5672         }
5673
5674         RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
5675         {
5676                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5677                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5678                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
5679                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5680         }
5681
5682         RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
5683         {
5684                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
5685         }
5686
5687         RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
5688         {
5689                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5690                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5691                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
5692                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5693         }
5694
5695         RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
5696         {
5697                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
5698         }
5699
5700         RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
5701         {
5702                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5703                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5704                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
5705                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5706         }
5707
5708         RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
5709         {
5710                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
5711         }
5712
5713         RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
5714         {
5715                 if(CPUID::supportsSSE4_1())
5716                 {
5717                         return x86::pmaxud(x, y);
5718                 }
5719                 else
5720                 {
5721                         RValue<UInt4> greater = CmpNLE(x, y);
5722                         return x & greater | y & ~greater;
5723                 }
5724         }
5725
5726         RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
5727         {
5728                 if(CPUID::supportsSSE4_1())
5729                 {
5730                         return x86::pminud(x, y);
5731                 }
5732                 else
5733                 {
5734                         RValue<UInt4> less = CmpLT(x, y);
5735                         return x & less | y & ~less;
5736                 }
5737         }
5738
5739         RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
5740         {
5741                 return x86::packusdw(x, y);   // FIXME: Fallback required
5742         }
5743
5744         Type *UInt4::getType()
5745         {
5746                 return T(VectorType::get(UInt::getType(), 4));
5747         }
5748
5749         Float::Float(RValue<Int> cast)
5750         {
5751                 Value *integer = Nucleus::createSIToFP(cast.value, Float::getType());
5752
5753                 storeValue(integer);
5754         }
5755
5756         Float::Float()
5757         {
5758
5759         }
5760
5761         Float::Float(float x)
5762         {
5763                 storeValue(Nucleus::createConstantFloat(x));
5764         }
5765
5766         Float::Float(RValue<Float> rhs)
5767         {
5768                 storeValue(rhs.value);
5769         }
5770
5771         Float::Float(const Float &rhs)
5772         {
5773                 Value *value = rhs.loadValue();
5774                 storeValue(value);
5775         }
5776
5777         Float::Float(const Reference<Float> &rhs)
5778         {
5779                 Value *value = rhs.loadValue();
5780                 storeValue(value);
5781         }
5782
5783         RValue<Float> Float::operator=(RValue<Float> rhs)
5784         {
5785                 storeValue(rhs.value);
5786
5787                 return rhs;
5788         }
5789
5790         RValue<Float> Float::operator=(const Float &rhs)
5791         {
5792                 Value *value = rhs.loadValue();
5793                 storeValue(value);
5794
5795                 return RValue<Float>(value);
5796         }
5797
5798         RValue<Float> Float::operator=(const Reference<Float> &rhs)
5799         {
5800                 Value *value = rhs.loadValue();
5801                 storeValue(value);
5802
5803                 return RValue<Float>(value);
5804         }
5805
5806         RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs)
5807         {
5808                 return RValue<Float>(Nucleus::createFAdd(lhs.value, rhs.value));
5809         }
5810
5811         RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs)
5812         {
5813                 return RValue<Float>(Nucleus::createFSub(lhs.value, rhs.value));
5814         }
5815
5816         RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs)
5817         {
5818                 return RValue<Float>(Nucleus::createFMul(lhs.value, rhs.value));
5819         }
5820
5821         RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs)
5822         {
5823                 return RValue<Float>(Nucleus::createFDiv(lhs.value, rhs.value));
5824         }
5825
5826         RValue<Float> operator+=(Float &lhs, RValue<Float> rhs)
5827         {
5828                 return lhs = lhs + rhs;
5829         }
5830
5831         RValue<Float> operator-=(Float &lhs, RValue<Float> rhs)
5832         {
5833                 return lhs = lhs - rhs;
5834         }
5835
5836         RValue<Float> operator*=(Float &lhs, RValue<Float> rhs)
5837         {
5838                 return lhs = lhs * rhs;
5839         }
5840
5841         RValue<Float> operator/=(Float &lhs, RValue<Float> rhs)
5842         {
5843                 return lhs = lhs / rhs;
5844         }
5845
5846         RValue<Float> operator+(RValue<Float> val)
5847         {
5848                 return val;
5849         }
5850
5851         RValue<Float> operator-(RValue<Float> val)
5852         {
5853                 return RValue<Float>(Nucleus::createFNeg(val.value));
5854         }
5855
5856         RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs)
5857         {
5858                 return RValue<Bool>(Nucleus::createFCmpOLT(lhs.value, rhs.value));
5859         }
5860
5861         RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs)
5862         {
5863                 return RValue<Bool>(Nucleus::createFCmpOLE(lhs.value, rhs.value));
5864         }
5865
5866         RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs)
5867         {
5868                 return RValue<Bool>(Nucleus::createFCmpOGT(lhs.value, rhs.value));
5869         }
5870
5871         RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs)
5872         {
5873                 return RValue<Bool>(Nucleus::createFCmpOGE(lhs.value, rhs.value));
5874         }
5875
5876         RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs)
5877         {
5878                 return RValue<Bool>(Nucleus::createFCmpONE(lhs.value, rhs.value));
5879         }
5880
5881         RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs)
5882         {
5883                 return RValue<Bool>(Nucleus::createFCmpOEQ(lhs.value, rhs.value));
5884         }
5885
5886         RValue<Float> Abs(RValue<Float> x)
5887         {
5888                 return IfThenElse(x > 0.0f, x, -x);
5889         }
5890
5891         RValue<Float> Max(RValue<Float> x, RValue<Float> y)
5892         {
5893                 return IfThenElse(x > y, x, y);
5894         }
5895
5896         RValue<Float> Min(RValue<Float> x, RValue<Float> y)
5897         {
5898                 return IfThenElse(x < y, x, y);
5899         }
5900
5901         RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
5902         {
5903                 if(exactAtPow2)
5904                 {
5905                         // rcpss uses a piecewise-linear approximation which minimizes the relative error
5906                         // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
5907                         return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
5908                 }
5909                 else
5910                 {
5911                         return x86::rcpss(x);
5912                 }
5913         }
5914
5915         RValue<Float> RcpSqrt_pp(RValue<Float> x)
5916         {
5917                 return x86::rsqrtss(x);
5918         }
5919
5920         RValue<Float> Sqrt(RValue<Float> x)
5921         {
5922                 return x86::sqrtss(x);
5923         }
5924
5925         RValue<Float> Round(RValue<Float> x)
5926         {
5927                 if(CPUID::supportsSSE4_1())
5928                 {
5929                         return x86::roundss(x, 0);
5930                 }
5931                 else
5932                 {
5933                         return Float4(Round(Float4(x))).x;
5934                 }
5935         }
5936
5937         RValue<Float> Trunc(RValue<Float> x)
5938         {
5939                 if(CPUID::supportsSSE4_1())
5940                 {
5941                         return x86::roundss(x, 3);
5942                 }
5943                 else
5944                 {
5945                         return Float(Int(x));   // Rounded toward zero
5946                 }
5947         }
5948
5949         RValue<Float> Frac(RValue<Float> x)
5950         {
5951                 if(CPUID::supportsSSE4_1())
5952                 {
5953                         return x - x86::floorss(x);
5954                 }
5955                 else
5956                 {
5957                         return Float4(Frac(Float4(x))).x;
5958                 }
5959         }
5960
5961         RValue<Float> Floor(RValue<Float> x)
5962         {
5963                 if(CPUID::supportsSSE4_1())
5964                 {
5965                         return x86::floorss(x);
5966                 }
5967                 else
5968                 {
5969                         return Float4(Floor(Float4(x))).x;
5970                 }
5971         }
5972
5973         RValue<Float> Ceil(RValue<Float> x)
5974         {
5975                 if(CPUID::supportsSSE4_1())
5976                 {
5977                         return x86::ceilss(x);
5978                 }
5979                 else
5980                 {
5981                         return Float4(Ceil(Float4(x))).x;
5982                 }
5983         }
5984
5985         Type *Float::getType()
5986         {
5987                 return T(llvm::Type::getFloatTy(*::context));
5988         }
5989
5990         Float2::Float2(RValue<Float4> cast)
5991         {
5992         //      xyzw.parent = this;
5993
5994                 Value *int64x2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
5995                 Value *int64 = Nucleus::createExtractElement(int64x2, Long::getType(), 0);
5996                 Value *float2 = Nucleus::createBitCast(int64, Float2::getType());
5997
5998                 storeValue(float2);
5999         }
6000
6001         Type *Float2::getType()
6002         {
6003                 return T(VectorType::get(Float::getType(), 2));
6004         }
6005
6006         Float4::Float4(RValue<Byte4> cast)
6007         {
6008                 xyzw.parent = this;
6009
6010                 #if 0
6011                         Value *xyzw = Nucleus::createUIToFP(cast.value, Float4::getType());   // FIXME: Crashes
6012                 #elif 0
6013                         Value *vector = loadValue();
6014
6015                         Value *i8x = Nucleus::createExtractElement(cast.value, 0);
6016                         Value *f32x = Nucleus::createUIToFP(i8x, Float::getType());
6017                         Value *x = Nucleus::createInsertElement(vector, f32x, 0);
6018
6019                         Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
6020                         Value *f32y = Nucleus::createUIToFP(i8y, Float::getType());
6021                         Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
6022
6023                         Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
6024                         Value *f32z = Nucleus::createUIToFP(i8z, Float::getType());
6025                         Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
6026
6027                         Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
6028                         Value *f32w = Nucleus::createUIToFP(i8w, Float::getType());
6029                         Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
6030                 #else
6031                         Value *a = Int4(cast).loadValue();
6032                         Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
6033                 #endif
6034
6035                 storeValue(xyzw);
6036         }
6037
6038         Float4::Float4(RValue<SByte4> cast)
6039         {
6040                 xyzw.parent = this;
6041
6042                 #if 0
6043                         Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());   // FIXME: Crashes
6044                 #elif 0
6045                         Value *vector = loadValue();
6046
6047                         Value *i8x = Nucleus::createExtractElement(cast.value, 0);
6048                         Value *f32x = Nucleus::createSIToFP(i8x, Float::getType());
6049                         Value *x = Nucleus::createInsertElement(vector, f32x, 0);
6050
6051                         Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
6052                         Value *f32y = Nucleus::createSIToFP(i8y, Float::getType());
6053                         Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
6054
6055                         Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
6056                         Value *f32z = Nucleus::createSIToFP(i8z, Float::getType());
6057                         Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
6058
6059                         Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
6060                         Value *f32w = Nucleus::createSIToFP(i8w, Float::getType());
6061                         Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
6062                 #else
6063                         Value *a = Int4(cast).loadValue();
6064                         Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
6065                 #endif
6066
6067                 storeValue(xyzw);
6068         }
6069
6070         Float4::Float4(RValue<Short4> cast)
6071         {
6072                 xyzw.parent = this;
6073
6074                 Int4 c(cast);
6075                 storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
6076         }
6077
6078         Float4::Float4(RValue<UShort4> cast)
6079         {
6080                 xyzw.parent = this;
6081
6082                 Int4 c(cast);
6083                 storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
6084         }
6085
6086         Float4::Float4(RValue<Int4> cast)
6087         {
6088                 xyzw.parent = this;
6089
6090                 Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());
6091
6092                 storeValue(xyzw);
6093         }
6094
6095         Float4::Float4(RValue<UInt4> cast)
6096         {
6097                 xyzw.parent = this;
6098
6099                 RValue<Float4> result = Float4(Int4(cast & UInt4(0x7FFFFFFF))) +
6100                                         As<Float4>((As<Int4>(cast) >> 31) & As<Int4>(Float4(0x80000000u)));
6101
6102                 storeValue(result.value);
6103         }
6104
6105         Float4::Float4()
6106         {
6107                 xyzw.parent = this;
6108         }
6109
6110         Float4::Float4(float xyzw)
6111         {
6112                 constant(xyzw, xyzw, xyzw, xyzw);
6113         }
6114
6115         Float4::Float4(float x, float yzw)
6116         {
6117                 constant(x, yzw, yzw, yzw);
6118         }
6119
6120         Float4::Float4(float x, float y, float zw)
6121         {
6122                 constant(x, y, zw, zw);
6123         }
6124
6125         Float4::Float4(float x, float y, float z, float w)
6126         {
6127                 constant(x, y, z, w);
6128         }
6129
6130         void Float4::constant(float x, float y, float z, float w)
6131         {
6132                 xyzw.parent = this;
6133
6134                 double constantVector[4] = {x, y, z, w};
6135                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
6136         }
6137
6138         Float4::Float4(RValue<Float4> rhs)
6139         {
6140                 xyzw.parent = this;
6141
6142                 storeValue(rhs.value);
6143         }
6144
6145         Float4::Float4(const Float4 &rhs)
6146         {
6147                 xyzw.parent = this;
6148
6149                 Value *value = rhs.loadValue();
6150                 storeValue(value);
6151         }
6152
6153         Float4::Float4(const Reference<Float4> &rhs)
6154         {
6155                 xyzw.parent = this;
6156
6157                 Value *value = rhs.loadValue();
6158                 storeValue(value);
6159         }
6160
6161         Float4::Float4(RValue<Float> rhs)
6162         {
6163                 xyzw.parent = this;
6164
6165                 Value *vector = loadValue();
6166                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
6167
6168                 int swizzle[4] = {0, 0, 0, 0};
6169                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
6170
6171                 storeValue(replicate);
6172         }
6173
6174         Float4::Float4(const Float &rhs)
6175         {
6176                 xyzw.parent = this;
6177
6178                 *this = RValue<Float>(rhs.loadValue());
6179         }
6180
6181         Float4::Float4(const Reference<Float> &rhs)
6182         {
6183                 xyzw.parent = this;
6184
6185                 *this = RValue<Float>(rhs.loadValue());
6186         }
6187
6188         RValue<Float4> Float4::operator=(float x)
6189         {
6190                 return *this = Float4(x, x, x, x);
6191         }
6192
6193         RValue<Float4> Float4::operator=(RValue<Float4> rhs)
6194         {
6195                 storeValue(rhs.value);
6196
6197                 return rhs;
6198         }
6199
6200         RValue<Float4> Float4::operator=(const Float4 &rhs)
6201         {
6202                 Value *value = rhs.loadValue();
6203                 storeValue(value);
6204
6205                 return RValue<Float4>(value);
6206         }
6207
6208         RValue<Float4> Float4::operator=(const Reference<Float4> &rhs)
6209         {
6210                 Value *value = rhs.loadValue();
6211                 storeValue(value);
6212
6213                 return RValue<Float4>(value);
6214         }
6215
6216         RValue<Float4> Float4::operator=(RValue<Float> rhs)
6217         {
6218                 return *this = Float4(rhs);
6219         }
6220
6221         RValue<Float4> Float4::operator=(const Float &rhs)
6222         {
6223                 return *this = Float4(rhs);
6224         }
6225
6226         RValue<Float4> Float4::operator=(const Reference<Float> &rhs)
6227         {
6228                 return *this = Float4(rhs);
6229         }
6230
6231         RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs)
6232         {
6233                 return RValue<Float4>(Nucleus::createFAdd(lhs.value, rhs.value));
6234         }
6235
6236         RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs)
6237         {
6238                 return RValue<Float4>(Nucleus::createFSub(lhs.value, rhs.value));
6239         }
6240
6241         RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs)
6242         {
6243                 return RValue<Float4>(Nucleus::createFMul(lhs.value, rhs.value));
6244         }
6245
6246         RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs)
6247         {
6248                 return RValue<Float4>(Nucleus::createFDiv(lhs.value, rhs.value));
6249         }
6250
6251         RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
6252         {
6253                 return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
6254         }
6255
6256         RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs)
6257         {
6258                 return lhs = lhs + rhs;
6259         }
6260
6261         RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs)
6262         {
6263                 return lhs = lhs - rhs;
6264         }
6265
6266         RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs)
6267         {
6268                 return lhs = lhs * rhs;
6269         }
6270
6271         RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs)
6272         {
6273                 return lhs = lhs / rhs;
6274         }
6275
6276         RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs)
6277         {
6278                 return lhs = lhs % rhs;
6279         }
6280
6281         RValue<Float4> operator+(RValue<Float4> val)
6282         {
6283                 return val;
6284         }
6285
6286         RValue<Float4> operator-(RValue<Float4> val)
6287         {
6288                 return RValue<Float4>(Nucleus::createFNeg(val.value));
6289         }
6290
6291         RValue<Float4> Abs(RValue<Float4> x)
6292         {
6293                 Value *vector = Nucleus::createBitCast(x.value, Int4::getType());
6294                 int64_t constantVector[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
6295                 Value *result = Nucleus::createAnd(vector, V(Nucleus::createConstantVector(constantVector, Int4::getType())));
6296
6297                 return RValue<Float4>(Nucleus::createBitCast(result, Float4::getType()));
6298         }
6299
6300         RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
6301         {
6302                 return x86::maxps(x, y);
6303         }
6304
6305         RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
6306         {
6307                 return x86::minps(x, y);
6308         }
6309
6310         RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
6311         {
6312                 if(exactAtPow2)
6313                 {
6314                         // rcpps uses a piecewise-linear approximation which minimizes the relative error
6315                         // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
6316                         return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
6317                 }
6318                 else
6319                 {
6320                         return x86::rcpps(x);
6321                 }
6322         }
6323
6324         RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
6325         {
6326                 return x86::rsqrtps(x);
6327         }
6328
6329         RValue<Float4> Sqrt(RValue<Float4> x)
6330         {
6331                 return x86::sqrtps(x);
6332         }
6333
6334         RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i)
6335         {
6336                 return RValue<Float4>(Nucleus::createInsertElement(val.value, element.value, i));
6337         }
6338
6339         RValue<Float> Extract(RValue<Float4> x, int i)
6340         {
6341                 return RValue<Float>(Nucleus::createExtractElement(x.value, Float::getType(), i));
6342         }
6343
6344         RValue<Float4> Swizzle(RValue<Float4> x, unsigned char select)
6345         {
6346                 return RValue<Float4>(createSwizzle4(x.value, select));
6347         }
6348
6349         RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
6350         {
6351                 int shuffle[4] =
6352                 {
6353                         ((imm >> 0) & 0x03) + 0,
6354                         ((imm >> 2) & 0x03) + 0,
6355                         ((imm >> 4) & 0x03) + 4,
6356                         ((imm >> 6) & 0x03) + 4,
6357                 };
6358
6359                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6360         }
6361
6362         RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y)
6363         {
6364                 int shuffle[4] = {0, 4, 1, 5};
6365                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6366         }
6367
6368         RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y)
6369         {
6370                 int shuffle[4] = {2, 6, 3, 7};
6371                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6372         }
6373
6374         RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, unsigned char select)
6375         {
6376                 Value *vector = lhs.loadValue();
6377                 Value *shuffle = createMask4(vector, rhs.value, select);
6378                 lhs.storeValue(shuffle);
6379
6380                 return RValue<Float4>(shuffle);
6381         }
6382
6383         RValue<Int> SignMask(RValue<Float4> x)
6384         {
6385                 return x86::movmskps(x);
6386         }
6387
6388         RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
6389         {
6390         //      return As<Int4>(x86::cmpeqps(x, y));
6391                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
6392         }
6393
6394         RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
6395         {
6396         //      return As<Int4>(x86::cmpltps(x, y));
6397                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
6398         }
6399
6400         RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
6401         {
6402         //      return As<Int4>(x86::cmpleps(x, y));
6403                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
6404         }
6405
6406         RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
6407         {
6408         //      return As<Int4>(x86::cmpneqps(x, y));
6409                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
6410         }
6411
6412         RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
6413         {
6414         //      return As<Int4>(x86::cmpnltps(x, y));
6415                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
6416         }
6417
6418         RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
6419         {
6420         //      return As<Int4>(x86::cmpnleps(x, y));
6421                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
6422         }
6423
6424         RValue<Float4> Round(RValue<Float4> x)
6425         {
6426                 if(CPUID::supportsSSE4_1())
6427                 {
6428                         return x86::roundps(x, 0);
6429                 }
6430                 else
6431                 {
6432                         return Float4(RoundInt(x));
6433                 }
6434         }
6435
6436         RValue<Float4> Trunc(RValue<Float4> x)
6437         {
6438                 if(CPUID::supportsSSE4_1())
6439                 {
6440                         return x86::roundps(x, 3);
6441                 }
6442                 else
6443                 {
6444                         return Float4(Int4(x));   // Rounded toward zero
6445                 }
6446         }
6447
6448         RValue<Float4> Frac(RValue<Float4> x)
6449         {
6450                 if(CPUID::supportsSSE4_1())
6451                 {
6452                         return x - x86::floorps(x);
6453                 }
6454                 else
6455                 {
6456                         Float4 frc = x - Float4(Int4(x));   // Signed fractional part
6457
6458                         return frc + As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));
6459                 }
6460         }
6461
6462         RValue<Float4> Floor(RValue<Float4> x)
6463         {
6464                 if(CPUID::supportsSSE4_1())
6465                 {
6466                         return x86::floorps(x);
6467                 }
6468                 else
6469                 {
6470                         return x - Frac(x);
6471                 }
6472         }
6473
6474         RValue<Float4> Ceil(RValue<Float4> x)
6475         {
6476                 if(CPUID::supportsSSE4_1())
6477                 {
6478                         return x86::ceilps(x);
6479                 }
6480                 else
6481                 {
6482                         return -Floor(-x);
6483                 }
6484         }
6485
6486         Type *Float4::getType()
6487         {
6488                 return T(VectorType::get(Float::getType(), 4));
6489         }
6490
6491         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
6492         {
6493                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), V(Nucleus::createConstantInt(offset))));
6494         }
6495
6496         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
6497         {
6498                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value));
6499         }
6500
6501         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
6502         {
6503                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value));
6504         }
6505
6506         RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset)
6507         {
6508                 return lhs = lhs + offset;
6509         }
6510
6511         RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset)
6512         {
6513                 return lhs = lhs + offset;
6514         }
6515
6516         RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset)
6517         {
6518                 return lhs = lhs + offset;
6519         }
6520
6521         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset)
6522         {
6523                 return lhs + -offset;
6524         }
6525
6526         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
6527         {
6528                 return lhs + -offset;
6529         }
6530
6531         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
6532         {
6533                 return lhs + -offset;
6534         }
6535
6536         RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset)
6537         {
6538                 return lhs = lhs - offset;
6539         }
6540
6541         RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset)
6542         {
6543                 return lhs = lhs - offset;
6544         }
6545
6546         RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset)
6547         {
6548                 return lhs = lhs - offset;
6549         }
6550
6551         void Return()
6552         {
6553                 Nucleus::createRetVoid();
6554                 Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6555                 Nucleus::createUnreachable();
6556         }
6557
6558         void Return(RValue<Int> ret)
6559         {
6560                 Nucleus::createRet(ret.value);
6561                 Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6562                 Nucleus::createUnreachable();
6563         }
6564
6565         bool branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
6566         {
6567                 Nucleus::createCondBr(cmp.value, bodyBB, endBB);
6568                 Nucleus::setInsertBlock(bodyBB);
6569
6570                 return true;
6571         }
6572
6573         RValue<Long> Ticks()
6574         {
6575                 llvm::Function *rdtsc = Intrinsic::getDeclaration(::module, Intrinsic::readcyclecounter);
6576
6577                 return RValue<Long>(V(::builder->CreateCall(rdtsc)));
6578         }
6579 }
6580
6581 namespace sw
6582 {
6583         namespace x86
6584         {
6585                 RValue<Int> cvtss2si(RValue<Float> val)
6586                 {
6587                         llvm::Function *cvtss2si = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtss2si);
6588
6589                         Float4 vector;
6590                         vector.x = val;
6591
6592                         return RValue<Int>(V(::builder->CreateCall(cvtss2si, RValue<Float4>(vector).value)));
6593                 }
6594
6595                 RValue<Int2> cvtps2pi(RValue<Float4> val)
6596                 {
6597                         llvm::Function *cvtps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtps2pi);
6598
6599                         return RValue<Int2>(V(::builder->CreateCall(cvtps2pi, val.value)));
6600                 }
6601
6602                 RValue<Int2> cvttps2pi(RValue<Float4> val)
6603                 {
6604                         llvm::Function *cvttps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvttps2pi);
6605
6606                         return RValue<Int2>(V(::builder->CreateCall(cvttps2pi, val.value)));
6607                 }
6608
6609                 RValue<Int4> cvtps2dq(RValue<Float4> val)
6610                 {
6611                         if(CPUID::supportsSSE2())
6612                         {
6613                                 llvm::Function *cvtps2dq = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_cvtps2dq);
6614
6615                                 return RValue<Int4>(V(::builder->CreateCall(cvtps2dq, val.value)));
6616                         }
6617                         else
6618                         {
6619                                 Int2 lo = x86::cvtps2pi(val);
6620                                 Int2 hi = x86::cvtps2pi(Swizzle(val, 0xEE));
6621
6622                                 return Int4(lo, hi);
6623                         }
6624                 }
6625
6626                 RValue<Float> rcpss(RValue<Float> val)
6627                 {
6628                         llvm::Function *rcpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ss);
6629
6630                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6631
6632                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rcpss, vector)), Float::getType(), 0));
6633                 }
6634
6635                 RValue<Float> sqrtss(RValue<Float> val)
6636                 {
6637                         llvm::Function *sqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ss);
6638
6639                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6640
6641                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(sqrtss, vector)), Float::getType(), 0));
6642                 }
6643
6644                 RValue<Float> rsqrtss(RValue<Float> val)
6645                 {
6646                         llvm::Function *rsqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ss);
6647
6648                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6649
6650                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rsqrtss, vector)), Float::getType(), 0));
6651                 }
6652
6653                 RValue<Float4> rcpps(RValue<Float4> val)
6654                 {
6655                         llvm::Function *rcpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ps);
6656
6657                         return RValue<Float4>(V(::builder->CreateCall(rcpps, val.value)));
6658                 }
6659
6660                 RValue<Float4> sqrtps(RValue<Float4> val)
6661                 {
6662                         llvm::Function *sqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ps);
6663
6664                         return RValue<Float4>(V(::builder->CreateCall(sqrtps, val.value)));
6665                 }
6666
6667                 RValue<Float4> rsqrtps(RValue<Float4> val)
6668                 {
6669                         llvm::Function *rsqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ps);
6670
6671                         return RValue<Float4>(V(::builder->CreateCall(rsqrtps, val.value)));
6672                 }
6673
6674                 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
6675                 {
6676                         llvm::Function *maxps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_max_ps);
6677
6678                         return RValue<Float4>(V(::builder->CreateCall2(maxps, x.value, y.value)));
6679                 }
6680
6681                 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
6682                 {
6683                         llvm::Function *minps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_min_ps);
6684
6685                         return RValue<Float4>(V(::builder->CreateCall2(minps, x.value, y.value)));
6686                 }
6687
6688                 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
6689                 {
6690                         llvm::Function *roundss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ss);
6691
6692                         Value *undef = V(UndefValue::get(Float4::getType()));
6693                         Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
6694
6695                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(roundss, undef, vector, V(Nucleus::createConstantInt(imm)))), Float::getType(), 0));
6696                 }
6697
6698                 RValue<Float> floorss(RValue<Float> val)
6699                 {
6700                         return roundss(val, 1);
6701                 }
6702
6703                 RValue<Float> ceilss(RValue<Float> val)
6704                 {
6705                         return roundss(val, 2);
6706                 }
6707
6708                 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
6709                 {
6710                         llvm::Function *roundps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ps);
6711
6712                         return RValue<Float4>(V(::builder->CreateCall2(roundps, val.value, V(Nucleus::createConstantInt(imm)))));
6713                 }
6714
6715                 RValue<Float4> floorps(RValue<Float4> val)
6716                 {
6717                         return roundps(val, 1);
6718                 }
6719
6720                 RValue<Float4> ceilps(RValue<Float4> val)
6721                 {
6722                         return roundps(val, 2);
6723                 }
6724
6725                 RValue<Float4> cmpps(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
6726                 {
6727                         llvm::Function *cmpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ps);
6728
6729                         return RValue<Float4>(V(::builder->CreateCall3(cmpps, x.value, y.value, V(Nucleus::createConstantByte(imm)))));
6730                 }
6731
6732                 RValue<Float4> cmpeqps(RValue<Float4> x, RValue<Float4> y)
6733                 {
6734                         return cmpps(x, y, 0);
6735                 }
6736
6737                 RValue<Float4> cmpltps(RValue<Float4> x, RValue<Float4> y)
6738                 {
6739                         return cmpps(x, y, 1);
6740                 }
6741
6742                 RValue<Float4> cmpleps(RValue<Float4> x, RValue<Float4> y)
6743                 {
6744                         return cmpps(x, y, 2);
6745                 }
6746
6747                 RValue<Float4> cmpunordps(RValue<Float4> x, RValue<Float4> y)
6748                 {
6749                         return cmpps(x, y, 3);
6750                 }
6751
6752                 RValue<Float4> cmpneqps(RValue<Float4> x, RValue<Float4> y)
6753                 {
6754                         return cmpps(x, y, 4);
6755                 }
6756
6757                 RValue<Float4> cmpnltps(RValue<Float4> x, RValue<Float4> y)
6758                 {
6759                         return cmpps(x, y, 5);
6760                 }
6761
6762                 RValue<Float4> cmpnleps(RValue<Float4> x, RValue<Float4> y)
6763                 {
6764                         return cmpps(x, y, 6);
6765                 }
6766
6767                 RValue<Float4> cmpordps(RValue<Float4> x, RValue<Float4> y)
6768                 {
6769                         return cmpps(x, y, 7);
6770                 }
6771
6772                 RValue<Float> cmpss(RValue<Float> x, RValue<Float> y, unsigned char imm)
6773                 {
6774                         llvm::Function *cmpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ss);
6775
6776                         Value *vector1 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), x.value, 0);
6777                         Value *vector2 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), y.value, 0);
6778
6779                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(cmpss, vector1, vector2, V(Nucleus::createConstantByte(imm)))), Float::getType(), 0));
6780                 }
6781
6782                 RValue<Float> cmpeqss(RValue<Float> x, RValue<Float> y)
6783                 {
6784                         return cmpss(x, y, 0);
6785                 }
6786
6787                 RValue<Float> cmpltss(RValue<Float> x, RValue<Float> y)
6788                 {
6789                         return cmpss(x, y, 1);
6790                 }
6791
6792                 RValue<Float> cmpless(RValue<Float> x, RValue<Float> y)
6793                 {
6794                         return cmpss(x, y, 2);
6795                 }
6796
6797                 RValue<Float> cmpunordss(RValue<Float> x, RValue<Float> y)
6798                 {
6799                         return cmpss(x, y, 3);
6800                 }
6801
6802                 RValue<Float> cmpneqss(RValue<Float> x, RValue<Float> y)
6803                 {
6804                         return cmpss(x, y, 4);
6805                 }
6806
6807                 RValue<Float> cmpnltss(RValue<Float> x, RValue<Float> y)
6808                 {
6809                         return cmpss(x, y, 5);
6810                 }
6811
6812                 RValue<Float> cmpnless(RValue<Float> x, RValue<Float> y)
6813                 {
6814                         return cmpss(x, y, 6);
6815                 }
6816
6817                 RValue<Float> cmpordss(RValue<Float> x, RValue<Float> y)
6818                 {
6819                         return cmpss(x, y, 7);
6820                 }
6821
6822                 RValue<Int4> pabsd(RValue<Int4> x)
6823                 {
6824                         llvm::Function *pabsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_ssse3_pabs_d_128);
6825
6826                         return RValue<Int4>(V(::builder->CreateCall(pabsd, x.value)));
6827                 }
6828
6829                 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
6830                 {
6831                         llvm::Function *paddsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_w);
6832
6833                         return As<Short4>(V(::builder->CreateCall2(paddsw, As<MMX>(x).value, As<MMX>(y).value)));
6834                 }
6835
6836                 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
6837                 {
6838                         llvm::Function *psubsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_w);
6839
6840                         return As<Short4>(V(::builder->CreateCall2(psubsw, As<MMX>(x).value, As<MMX>(y).value)));
6841                 }
6842
6843                 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
6844                 {
6845                         llvm::Function *paddusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_w);
6846
6847                         return As<UShort4>(V(::builder->CreateCall2(paddusw, As<MMX>(x).value, As<MMX>(y).value)));
6848                 }
6849
6850                 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
6851                 {
6852                         llvm::Function *psubusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_w);
6853
6854                         return As<UShort4>(V(::builder->CreateCall2(psubusw, As<MMX>(x).value, As<MMX>(y).value)));
6855                 }
6856
6857                 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
6858                 {
6859                         llvm::Function *paddsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_b);
6860
6861                         return As<SByte8>(V(::builder->CreateCall2(paddsb, As<MMX>(x).value, As<MMX>(y).value)));
6862                 }
6863
6864                 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
6865                 {
6866                         llvm::Function *psubsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_b);
6867
6868                         return As<SByte8>(V(::builder->CreateCall2(psubsb, As<MMX>(x).value, As<MMX>(y).value)));
6869                 }
6870
6871                 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
6872                 {
6873                         llvm::Function *paddusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_b);
6874
6875                         return As<Byte8>(V(::builder->CreateCall2(paddusb, As<MMX>(x).value, As<MMX>(y).value)));
6876                 }
6877
6878                 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
6879                 {
6880                         llvm::Function *psubusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_b);
6881
6882                         return As<Byte8>(V(::builder->CreateCall2(psubusb, As<MMX>(x).value, As<MMX>(y).value)));
6883                 }
6884
6885                 RValue<Short4> paddw(RValue<Short4> x, RValue<Short4> y)
6886                 {
6887                         llvm::Function *paddw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_w);
6888
6889                         return As<Short4>(V(::builder->CreateCall2(paddw, As<MMX>(x).value, As<MMX>(y).value)));
6890                 }
6891
6892                 RValue<Short4> psubw(RValue<Short4> x, RValue<Short4> y)
6893                 {
6894                         llvm::Function *psubw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_w);
6895
6896                         return As<Short4>(V(::builder->CreateCall2(psubw, As<MMX>(x).value, As<MMX>(y).value)));
6897                 }
6898
6899                 RValue<Short4> pmullw(RValue<Short4> x, RValue<Short4> y)
6900                 {
6901                         llvm::Function *pmullw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmull_w);
6902
6903                         return As<Short4>(V(::builder->CreateCall2(pmullw, As<MMX>(x).value, As<MMX>(y).value)));
6904                 }
6905
6906                 RValue<Short4> pand(RValue<Short4> x, RValue<Short4> y)
6907                 {
6908                         llvm::Function *pand = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pand);
6909
6910                         return As<Short4>(V(::builder->CreateCall2(pand, As<MMX>(x).value, As<MMX>(y).value)));
6911                 }
6912
6913                 RValue<Short4> por(RValue<Short4> x, RValue<Short4> y)
6914                 {
6915                         llvm::Function *por = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_por);
6916
6917                         return As<Short4>(V(::builder->CreateCall2(por, As<MMX>(x).value, As<MMX>(y).value)));
6918                 }
6919
6920                 RValue<Short4> pxor(RValue<Short4> x, RValue<Short4> y)
6921                 {
6922                         llvm::Function *pxor = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pxor);
6923
6924                         return As<Short4>(V(::builder->CreateCall2(pxor, As<MMX>(x).value, As<MMX>(y).value)));
6925                 }
6926
6927                 RValue<Short4> pshufw(RValue<Short4> x, unsigned char y)
6928                 {
6929                         llvm::Function *pshufw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_pshuf_w);
6930
6931                         return As<Short4>(V(::builder->CreateCall2(pshufw, As<MMX>(x).value, V(Nucleus::createConstantByte(y)))));
6932                 }
6933
6934                 RValue<Int2> punpcklwd(RValue<Short4> x, RValue<Short4> y)
6935                 {
6936                         llvm::Function *punpcklwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklwd);
6937
6938                         return As<Int2>(V(::builder->CreateCall2(punpcklwd, As<MMX>(x).value, As<MMX>(y).value)));
6939                 }
6940
6941                 RValue<Int2> punpckhwd(RValue<Short4> x, RValue<Short4> y)
6942                 {
6943                         llvm::Function *punpckhwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhwd);
6944
6945                         return As<Int2>(V(::builder->CreateCall2(punpckhwd, As<MMX>(x).value, As<MMX>(y).value)));
6946                 }
6947
6948                 RValue<Short4> pinsrw(RValue<Short4> x, RValue<Int> y, unsigned int i)
6949                 {
6950                         llvm::Function *pinsrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pinsr_w);
6951
6952                         return As<Short4>(V(::builder->CreateCall3(pinsrw, As<MMX>(x).value, y.value, V(Nucleus::createConstantInt(i)))));
6953                 }
6954
6955                 RValue<Int> pextrw(RValue<Short4> x, unsigned int i)
6956                 {
6957                         llvm::Function *pextrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pextr_w);
6958
6959                         return RValue<Int>(V(::builder->CreateCall2(pextrw, As<MMX>(x).value, V(Nucleus::createConstantInt(i)))));
6960                 }
6961
6962                 RValue<Short4> punpckldq(RValue<Int2> x, RValue<Int2> y)
6963                 {
6964                         llvm::Function *punpckldq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckldq);
6965
6966                         return As<Short4>(V(::builder->CreateCall2(punpckldq, As<MMX>(x).value, As<MMX>(y).value)));
6967                 }
6968
6969                 RValue<Short4> punpckhdq(RValue<Int2> x, RValue<Int2> y)
6970                 {
6971                         llvm::Function *punpckhdq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhdq);
6972
6973                         return As<Short4>(V(::builder->CreateCall2(punpckhdq, As<MMX>(x).value, As<MMX>(y).value)));
6974                 }
6975
6976                 RValue<Short4> punpcklbw(RValue<Byte8> x, RValue<Byte8> y)
6977                 {
6978                         llvm::Function *punpcklbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklbw);
6979
6980                         return As<Short4>(V(::builder->CreateCall2(punpcklbw, As<MMX>(x).value, As<MMX>(y).value)));
6981                 }
6982
6983                 RValue<Short4> punpckhbw(RValue<Byte8> x, RValue<Byte8> y)
6984                 {
6985                         llvm::Function *punpckhbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhbw);
6986
6987                         return As<Short4>(V(::builder->CreateCall2(punpckhbw, As<MMX>(x).value, As<MMX>(y).value)));
6988                 }
6989
6990                 RValue<Byte8> paddb(RValue<Byte8> x, RValue<Byte8> y)
6991                 {
6992                         llvm::Function *paddb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_b);
6993
6994                         return As<Byte8>(V(::builder->CreateCall2(paddb, As<MMX>(x).value, As<MMX>(y).value)));
6995                 }
6996
6997                 RValue<Byte8> psubb(RValue<Byte8> x, RValue<Byte8> y)
6998                 {
6999                         llvm::Function *psubb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_b);
7000
7001                         return As<Byte8>(V(::builder->CreateCall2(psubb, As<MMX>(x).value, As<MMX>(y).value)));
7002                 }
7003
7004                 RValue<Int2> paddd(RValue<Int2> x, RValue<Int2> y)
7005                 {
7006                         llvm::Function *paddd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_d);
7007
7008                         return As<Int2>(V(::builder->CreateCall2(paddd, As<MMX>(x).value, As<MMX>(y).value)));
7009                 }
7010
7011                 RValue<Int2> psubd(RValue<Int2> x, RValue<Int2> y)
7012                 {
7013                         llvm::Function *psubd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_d);
7014
7015                         return As<Int2>(V(::builder->CreateCall2(psubd, As<MMX>(x).value, As<MMX>(y).value)));
7016                 }
7017
7018                 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
7019                 {
7020                         llvm::Function *pavgw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pavg_w);
7021
7022                         return As<UShort4>(V(::builder->CreateCall2(pavgw, As<MMX>(x).value, As<MMX>(y).value)));
7023                 }
7024
7025                 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
7026                 {
7027                         llvm::Function *pmaxsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmaxs_w);
7028
7029                         return As<Short4>(V(::builder->CreateCall2(pmaxsw, As<MMX>(x).value, As<MMX>(y).value)));
7030                 }
7031
7032                 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
7033                 {
7034                         llvm::Function *pminsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmins_w);
7035
7036                         return As<Short4>(V(::builder->CreateCall2(pminsw, As<MMX>(x).value, As<MMX>(y).value)));
7037                 }
7038
7039                 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
7040                 {
7041                         llvm::Function *pcmpgtw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_w);
7042
7043                         return As<Short4>(V(::builder->CreateCall2(pcmpgtw, As<MMX>(x).value, As<MMX>(y).value)));
7044                 }
7045
7046                 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
7047                 {
7048                         llvm::Function *pcmpeqw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_w);
7049
7050                         return As<Short4>(V(::builder->CreateCall2(pcmpeqw, As<MMX>(x).value, As<MMX>(y).value)));
7051                 }
7052
7053                 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
7054                 {
7055                         llvm::Function *pcmpgtb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_b);
7056
7057                         return As<Byte8>(V(::builder->CreateCall2(pcmpgtb, As<MMX>(x).value, As<MMX>(y).value)));
7058                 }
7059
7060                 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
7061                 {
7062                         llvm::Function *pcmpeqb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_b);
7063
7064                         return As<Byte8>(V(::builder->CreateCall2(pcmpeqb, As<MMX>(x).value, As<MMX>(y).value)));
7065                 }
7066
7067                 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
7068                 {
7069                         llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packssdw);
7070
7071                         return As<Short4>(V(::builder->CreateCall2(packssdw, As<MMX>(x).value, As<MMX>(y).value)));
7072                 }
7073
7074                 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
7075                 {
7076                         if(CPUID::supportsSSE2())
7077                         {
7078                                 llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_packssdw_128);
7079
7080                                 return RValue<Short8>(V(::builder->CreateCall2(packssdw, x.value, y.value)));
7081                         }
7082                         else
7083                         {
7084                                 Int2 loX = Int2(x);
7085                                 Int2 hiX = Int2(Swizzle(x, 0xEE));
7086
7087                                 Int2 loY = Int2(y);
7088                                 Int2 hiY = Int2(Swizzle(y, 0xEE));
7089
7090                                 Short4 lo = x86::packssdw(loX, hiX);
7091                                 Short4 hi = x86::packssdw(loY, hiY);
7092
7093                                 return Short8(lo, hi);
7094                         }
7095                 }
7096
7097                 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
7098                 {
7099                         llvm::Function *packsswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packsswb);
7100
7101                         return As<SByte8>(V(::builder->CreateCall2(packsswb, As<MMX>(x).value, As<MMX>(y).value)));
7102                 }
7103
7104                 RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y)
7105                 {
7106                         llvm::Function *packuswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packuswb);
7107
7108                         return As<Byte8>(V(::builder->CreateCall2(packuswb, As<MMX>(x).value, As<MMX>(y).value)));
7109                 }
7110
7111                 RValue<UShort8> packusdw(RValue<UInt4> x, RValue<UInt4> y)
7112                 {
7113                         if(CPUID::supportsSSE4_1())
7114                         {
7115                                 llvm::Function *packusdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_packusdw);
7116
7117                                 return RValue<UShort8>(V(::builder->CreateCall2(packusdw, x.value, y.value)));
7118                         }
7119                         else
7120                         {
7121                                 // FIXME: Not an exact replacement!
7122                                 return As<UShort8>(packssdw(As<Int4>(x - UInt4(0x00008000, 0x00008000, 0x00008000, 0x00008000)), As<Int4>(y - UInt4(0x00008000, 0x00008000, 0x00008000, 0x00008000))) + Short8(0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u));
7123                         }
7124                 }
7125
7126                 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
7127                 {
7128                         llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_w);
7129
7130                         return As<UShort4>(V(::builder->CreateCall2(psrlw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7131                 }
7132
7133                 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
7134                 {
7135                         llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_w);
7136
7137                         return RValue<UShort8>(V(::builder->CreateCall2(psrlw, x.value, V(Nucleus::createConstantInt(y)))));
7138                 }
7139
7140                 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
7141                 {
7142                         llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_w);
7143
7144                         return As<Short4>(V(::builder->CreateCall2(psraw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7145                 }
7146
7147                 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
7148                 {
7149                         llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_w);
7150
7151                         return RValue<Short8>(V(::builder->CreateCall2(psraw, x.value, V(Nucleus::createConstantInt(y)))));
7152                 }
7153
7154                 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
7155                 {
7156                         llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_w);
7157
7158                         return As<Short4>(V(::builder->CreateCall2(psllw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7159                 }
7160
7161                 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
7162                 {
7163                         llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_w);
7164
7165                         return RValue<Short8>(V(::builder->CreateCall2(psllw, x.value, V(Nucleus::createConstantInt(y)))));
7166                 }
7167
7168                 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
7169                 {
7170                         llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_d);
7171
7172                         return As<Int2>(V(::builder->CreateCall2(pslld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7173                 }
7174
7175                 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
7176                 {
7177                         if(CPUID::supportsSSE2())
7178                         {
7179                                 llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_d);
7180
7181                                 return RValue<Int4>(V(::builder->CreateCall2(pslld, x.value, V(Nucleus::createConstantInt(y)))));
7182                         }
7183                         else
7184                         {
7185                                 Int2 lo = Int2(x);
7186                                 Int2 hi = Int2(Swizzle(x, 0xEE));
7187
7188                                 lo = x86::pslld(lo, y);
7189                                 hi = x86::pslld(hi, y);
7190
7191                                 return Int4(lo, hi);
7192                         }
7193                 }
7194
7195                 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
7196                 {
7197                         llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_d);
7198
7199                         return As<Int2>(V(::builder->CreateCall2(psrad, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7200                 }
7201
7202                 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
7203                 {
7204                         if(CPUID::supportsSSE2())
7205                         {
7206                                 llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_d);
7207
7208                                 return RValue<Int4>(V(::builder->CreateCall2(psrad, x.value, V(Nucleus::createConstantInt(y)))));
7209                         }
7210                         else
7211                         {
7212                                 Int2 lo = Int2(x);
7213                                 Int2 hi = Int2(Swizzle(x, 0xEE));
7214
7215                                 lo = x86::psrad(lo, y);
7216                                 hi = x86::psrad(hi, y);
7217
7218                                 return Int4(lo, hi);
7219                         }
7220                 }
7221
7222                 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
7223                 {
7224                         llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_d);
7225
7226                         return As<UInt2>(V(::builder->CreateCall2(psrld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7227                 }
7228
7229                 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
7230                 {
7231                         if(CPUID::supportsSSE2())
7232                         {
7233                                 llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_d);
7234
7235                                 return RValue<UInt4>(V(::builder->CreateCall2(psrld, x.value, V(Nucleus::createConstantInt(y)))));
7236                         }
7237                         else
7238                         {
7239                                 UInt2 lo = As<UInt2>(Int2(As<Int4>(x)));
7240                                 UInt2 hi = As<UInt2>(Int2(Swizzle(As<Int4>(x), 0xEE)));
7241
7242                                 lo = x86::psrld(lo, y);
7243                                 hi = x86::psrld(hi, y);
7244
7245                                 return UInt4(lo, hi);
7246                         }
7247                 }
7248
7249                 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
7250                 {
7251                         llvm::Function *pmaxsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxsd);
7252
7253                         return RValue<Int4>(V(::builder->CreateCall2(pmaxsd, x.value, y.value)));
7254                 }
7255
7256                 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
7257                 {
7258                         llvm::Function *pminsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminsd);
7259
7260                         return RValue<Int4>(V(::builder->CreateCall2(pminsd, x.value, y.value)));
7261                 }
7262
7263                 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
7264                 {
7265                         llvm::Function *pmaxud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxud);
7266
7267                         return RValue<UInt4>(V(::builder->CreateCall2(pmaxud, x.value, y.value)));
7268                 }
7269
7270                 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
7271                 {
7272                         llvm::Function *pminud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminud);
7273
7274                         return RValue<UInt4>(V(::builder->CreateCall2(pminud, x.value, y.value)));
7275                 }
7276
7277                 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
7278                 {
7279                         llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulh_w);
7280
7281                         return As<Short4>(V(::builder->CreateCall2(pmulhw, As<MMX>(x).value, As<MMX>(y).value)));
7282                 }
7283
7284                 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
7285                 {
7286                         llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulhu_w);
7287
7288                         return As<UShort4>(V(::builder->CreateCall2(pmulhuw, As<MMX>(x).value, As<MMX>(y).value)));
7289                 }
7290
7291                 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
7292                 {
7293                         llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmadd_wd);
7294
7295                         return As<Int2>(V(::builder->CreateCall2(pmaddwd, As<MMX>(x).value, As<MMX>(y).value)));
7296                 }
7297
7298                 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
7299                 {
7300                         llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulh_w);
7301
7302                         return RValue<Short8>(V(::builder->CreateCall2(pmulhw, x.value, y.value)));
7303                 }
7304
7305                 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
7306                 {
7307                         llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulhu_w);
7308
7309                         return RValue<UShort8>(V(::builder->CreateCall2(pmulhuw, x.value, y.value)));
7310                 }
7311
7312                 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
7313                 {
7314                         llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmadd_wd);
7315
7316                         return RValue<Int4>(V(::builder->CreateCall2(pmaddwd, x.value, y.value)));
7317                 }
7318
7319                 RValue<Int> movmskps(RValue<Float4> x)
7320                 {
7321                         llvm::Function *movmskps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_movmsk_ps);
7322
7323                         return RValue<Int>(V(::builder->CreateCall(movmskps, x.value)));
7324                 }
7325
7326                 RValue<Int> pmovmskb(RValue<Byte8> x)
7327                 {
7328                         llvm::Function *pmovmskb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmovmskb);
7329
7330                         return RValue<Int>(V(::builder->CreateCall(pmovmskb, As<MMX>(x).value)));
7331                 }
7332
7333                 //RValue<Int2> movd(RValue<Pointer<Int>> x)
7334                 //{
7335                 //      Value *element = Nucleus::createLoad(x.value);
7336
7337                 ////    Value *int2 = UndefValue::get(Int2::getType());
7338                 ////    int2 = Nucleus::createInsertElement(int2, element, ConstantInt::get(Int::getType(), 0));
7339
7340                 //      Value *int2 = Nucleus::createBitCast(Nucleus::createZExt(element, Long::getType()), Int2::getType());
7341
7342                 //      return RValue<Int2>(int2);
7343                 //}
7344
7345                 //RValue<Int2> movdq2q(RValue<Int4> x)
7346                 //{
7347                 //      Value *long2 = Nucleus::createBitCast(x.value, T(VectorType::get(Long::getType(), 2)));
7348                 //      Value *element = Nucleus::createExtractElement(long2, ConstantInt::get(Int::getType(), 0));
7349
7350                 //      return RValue<Int2>(Nucleus::createBitCast(element, Int2::getType()));
7351                 //}
7352
7353                 RValue<Int4> pmovzxbd(RValue<Int4> x)
7354                 {
7355                         llvm::Function *pmovzxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxbd);
7356
7357                         return RValue<Int4>(V(::builder->CreateCall(pmovzxbd, Nucleus::createBitCast(x.value, Byte16::getType()))));
7358                 }
7359
7360                 RValue<Int4> pmovsxbd(RValue<Int4> x)
7361                 {
7362                         llvm::Function *pmovsxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxbd);
7363
7364                         return RValue<Int4>(V(::builder->CreateCall(pmovsxbd, Nucleus::createBitCast(x.value, SByte16::getType()))));
7365                 }
7366
7367                 RValue<Int4> pmovzxwd(RValue<Int4> x)
7368                 {
7369                         llvm::Function *pmovzxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxwd);
7370
7371                         return RValue<Int4>(V(::builder->CreateCall(pmovzxwd, Nucleus::createBitCast(x.value, UShort8::getType()))));
7372                 }
7373
7374                 RValue<Int4> pmovsxwd(RValue<Int4> x)
7375                 {
7376                         llvm::Function *pmovsxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxwd);
7377
7378                         return RValue<Int4>(V(::builder->CreateCall(pmovsxwd, Nucleus::createBitCast(x.value, Short8::getType()))));
7379                 }
7380
7381                 void emms()
7382                 {
7383                         llvm::Function *emms = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_emms);
7384
7385                         V(::builder->CreateCall(emms));
7386                 }
7387         }
7388 }