src/Reactor/LLVMReactor.cpp

   1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //    http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "Nucleus.hpp"
  16
  17 #include "llvm/Support/IRBuilder.h"
  18 #include "llvm/Function.h"
  19 #include "llvm/GlobalVariable.h"
  20 #include "llvm/Module.h"
  21 #include "llvm/LLVMContext.h"
  22 #include "llvm/Constants.h"
  23 #include "llvm/Intrinsics.h"
  24 #include "llvm/PassManager.h"
  25 #include "llvm/Analysis/LoopPass.h"
  26 #include "llvm/Transforms/Scalar.h"
  27 #include "llvm/Target/TargetData.h"
  28 #include "llvm/Target/TargetOptions.h"
  29 #include "llvm/Support/TargetSelect.h"
  30 #include "../lib/ExecutionEngine/JIT/JIT.h"
  31
  32 #include "LLVMRoutine.hpp"
  33 #include "LLVMRoutineManager.hpp"
  34 #include "x86.hpp"
  35 #include "CPUID.hpp"
  36 #include "Thread.hpp"
  37 #include "Memory.hpp"
  38 #include "MutexLock.hpp"
  39
  40 #include <xmmintrin.h>
  41 #include <fstream>
  42
  43 #if defined(__x86_64__) && defined(_WIN32)
  44 extern "C" void X86CompilationCallback()
  45 {
  46         assert(false);   // UNIMPLEMENTED
  47 }
  48 #endif
  49
  50 extern "C"
  51 {
  52         bool (*CodeAnalystInitialize)() = 0;
  53         void (*CodeAnalystCompleteJITLog)() = 0;
  54         bool (*CodeAnalystLogJITCode)(const void *jitCodeStartAddr, unsigned int jitCodeSize, const wchar_t *functionName) = 0;
  55 }
  56
  57 namespace llvm
  58 {
  59         extern bool JITEmitDebugInfo;
  60 }
  61
  62 namespace
  63 {
  64         sw::LLVMRoutineManager *routineManager = nullptr;
  65         llvm::ExecutionEngine *executionEngine = nullptr;
  66         llvm::IRBuilder<> *builder = nullptr;
  67         llvm::LLVMContext *context = nullptr;
  68         llvm::Module *module = nullptr;
  69         llvm::Function *function = nullptr;
  70
  71         sw::BackoffLock codegenMutex;
  72
  73         sw::BasicBlock *falseBB = nullptr;
  74 }
  75
  76 namespace sw
  77 {
  78         using namespace llvm;
  79
  80         Optimization optimization[10] = {InstructionCombining, Disabled};
  81
  82         class Type : public llvm::Type {};
  83         class Value : public llvm::Value {};
  84         class BasicBlock : public llvm::BasicBlock {};
  85
  86         inline Type *T(llvm::Type *t)
  87         {
  88                 return reinterpret_cast<Type*>(t);
  89         }
  90
  91         inline Value *V(llvm::Value *t)
  92         {
  93                 return reinterpret_cast<Value*>(t);
  94         }
  95
  96         inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
  97         {
  98                 return reinterpret_cast<std::vector<llvm::Type*>&>(t);
  99         }
 100
 101         inline BasicBlock *B(llvm::BasicBlock *t)
 102         {
 103                 return reinterpret_cast<BasicBlock*>(t);
 104         }
 105
 106         Nucleus::Nucleus()
 107         {
 108                 ::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
 109
 110                 InitializeNativeTarget();
 111                 JITEmitDebugInfo = false;
 112
 113                 if(!::context)
 114                 {
 115                         ::context = new LLVMContext();
 116                 }
 117
 118                 ::module = new Module("", *::context);
 119                 ::routineManager = new LLVMRoutineManager();
 120
 121                 #if defined(__x86_64__)
 122                         const char *architecture = "x86-64";
 123                 #else
 124                         const char *architecture = "x86";
 125                 #endif
 126
 127                 SmallVector<std::string, 1> MAttrs;
 128                 MAttrs.push_back(CPUID::supportsMMX()    ? "+mmx"   : "-mmx");
 129                 MAttrs.push_back(CPUID::supportsCMOV()   ? "+cmov"  : "-cmov");
 130                 MAttrs.push_back(CPUID::supportsSSE()    ? "+sse"   : "-sse");
 131                 MAttrs.push_back(CPUID::supportsSSE2()   ? "+sse2"  : "-sse2");
 132                 MAttrs.push_back(CPUID::supportsSSE3()   ? "+sse3"  : "-sse3");
 133                 MAttrs.push_back(CPUID::supportsSSSE3()  ? "+ssse3" : "-ssse3");
 134                 MAttrs.push_back(CPUID::supportsSSE4_1() ? "+sse41" : "-sse41");
 135
 136                 std::string error;
 137                 TargetMachine *targetMachine = EngineBuilder::selectTarget(::module, architecture, "", MAttrs, Reloc::Default, CodeModel::JITDefault, &error);
 138                 ::executionEngine = JIT::createJIT(::module, 0, ::routineManager, CodeGenOpt::Aggressive, true, targetMachine);
 139
 140                 if(!::builder)
 141                 {
 142                         ::builder = new IRBuilder<>(*::context);
 143
 144                         #if defined(_WIN32)
 145                                 HMODULE CodeAnalyst = LoadLibrary("CAJitNtfyLib.dll");
 146                                 if(CodeAnalyst)
 147                                 {
 148                                         CodeAnalystInitialize = (bool(*)())GetProcAddress(CodeAnalyst, "CAJIT_Initialize");
 149                                         CodeAnalystCompleteJITLog = (void(*)())GetProcAddress(CodeAnalyst, "CAJIT_CompleteJITLog");
 150                                         CodeAnalystLogJITCode = (bool(*)(const void*, unsigned int, const wchar_t*))GetProcAddress(CodeAnalyst, "CAJIT_LogJITCode");
 151
 152                                         CodeAnalystInitialize();
 153                                 }
 154                         #endif
 155                 }
 156         }
 157
 158         Nucleus::~Nucleus()
 159         {
 160                 delete ::executionEngine;
 161                 ::executionEngine = nullptr;
 162
 163                 ::routineManager = nullptr;
 164                 ::function = nullptr;
 165                 ::module = nullptr;
 166
 167                 ::codegenMutex.unlock();
 168         }
 169
 170         Routine *Nucleus::acquireRoutine(const wchar_t *name, bool runOptimizations)
 171         {
 172                 if(::builder->GetInsertBlock()->empty() || !::builder->GetInsertBlock()->back().isTerminator())
 173                 {
 174                         llvm::Type *type = ::function->getReturnType();
 175
 176                         if(type->isVoidTy())
 177                         {
 178                                 createRetVoid();
 179                         }
 180                         else
 181                         {
 182                                 createRet(V(UndefValue::get(type)));
 183                         }
 184                 }
 185
 186                 if(false)
 187                 {
 188                         std::string error;
 189                         raw_fd_ostream file("llvm-dump-unopt.txt", error);
 190                         ::module->print(file, 0);
 191                 }
 192
 193                 if(runOptimizations)
 194                 {
 195                         optimize();
 196                 }
 197
 198                 if(false)
 199                 {
 200                         std::string error;
 201                         raw_fd_ostream file("llvm-dump-opt.txt", error);
 202                         ::module->print(file, 0);
 203                 }
 204
 205                 void *entry = ::executionEngine->getPointerToFunction(::function);
 206                 LLVMRoutine *routine = ::routineManager->acquireRoutine(entry);
 207
 208                 if(CodeAnalystLogJITCode)
 209                 {
 210                         CodeAnalystLogJITCode(routine->getEntry(), routine->getCodeSize(), name);
 211                 }
 212
 213                 return routine;
 214         }
 215
 216         void Nucleus::optimize()
 217         {
 218                 static PassManager *passManager = nullptr;
 219
 220                 if(!passManager)
 221                 {
 222                         passManager = new PassManager();
 223
 224                         UnsafeFPMath = true;
 225                 //      NoInfsFPMath = true;
 226                 //      NoNaNsFPMath = true;
 227
 228                         passManager->add(new TargetData(*::executionEngine->getTargetData()));
 229                         passManager->add(createScalarReplAggregatesPass());
 230
 231                         for(int pass = 0; pass < 10 && optimization[pass] != Disabled; pass++)
 232                         {
 233                                 switch(optimization[pass])
 234                                 {
 235                                 case Disabled:                                                                 break;
 236                                 case CFGSimplification:    passManager->add(createCFGSimplificationPass());    break;
 237                                 case LICM:                 passManager->add(createLICMPass());                 break;
 238                                 case AggressiveDCE:        passManager->add(createAggressiveDCEPass());        break;
 239                                 case GVN:                  passManager->add(createGVNPass());                  break;
 240                                 case InstructionCombining: passManager->add(createInstructionCombiningPass()); break;
 241                                 case Reassociate:          passManager->add(createReassociatePass());          break;
 242                                 case DeadStoreElimination: passManager->add(createDeadStoreEliminationPass()); break;
 243                                 case SCCP:                 passManager->add(createSCCPPass());                 break;
 244                                 case ScalarReplAggregates: passManager->add(createScalarReplAggregatesPass()); break;
 245                                 default:
 246                                         assert(false);
 247                                 }
 248                         }
 249                 }
 250
 251                 passManager->run(*::module);
 252         }
 253
 254         Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
 255         {
 256                 // Need to allocate it in the entry block for mem2reg to work
 257                 llvm::BasicBlock &entryBlock = ::function->getEntryBlock();
 258
 259                 Instruction *declaration;
 260
 261                 if(arraySize)
 262                 {
 263                         declaration = new AllocaInst(type, Nucleus::createConstantInt(arraySize));
 264                 }
 265                 else
 266                 {
 267                         declaration = new AllocaInst(type, (Value*)0);
 268                 }
 269
 270                 entryBlock.getInstList().push_front(declaration);
 271
 272                 return V(declaration);
 273         }
 274
 275         BasicBlock *Nucleus::createBasicBlock()
 276         {
 277                 return B(BasicBlock::Create(*::context, "", ::function));
 278         }
 279
 280         BasicBlock *Nucleus::getInsertBlock()
 281         {
 282                 return B(::builder->GetInsertBlock());
 283         }
 284
 285         void Nucleus::setInsertBlock(BasicBlock *basicBlock)
 286         {
 287         //      assert(::builder->GetInsertBlock()->back().isTerminator());
 288                 return ::builder->SetInsertPoint(basicBlock);
 289         }
 290
 291         void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
 292         {
 293                 llvm::FunctionType *functionType = llvm::FunctionType::get(ReturnType, T(Params), false);
 294                 ::function = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, "", ::module);
 295                 ::function->setCallingConv(llvm::CallingConv::C);
 296
 297                 ::builder->SetInsertPoint(BasicBlock::Create(*::context, "", ::function));
 298         }
 299
 300         Value *Nucleus::getArgument(unsigned int index)
 301         {
 302                 llvm::Function::arg_iterator args = ::function->arg_begin();
 303
 304                 while(index)
 305                 {
 306                         args++;
 307                         index--;
 308                 }
 309
 310                 return V(&*args);
 311         }
 312
 313         void Nucleus::createRetVoid()
 314         {
 315                 x86::emms();
 316
 317                 ::builder->CreateRetVoid();
 318         }
 319
 320         void Nucleus::createRet(Value *v)
 321         {
 322                 x86::emms();
 323
 324                 ::builder->CreateRet(v);
 325         }
 326
 327         void Nucleus::createBr(BasicBlock *dest)
 328         {
 329                 ::builder->CreateBr(dest);
 330         }
 331
 332         void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
 333         {
 334                 ::builder->CreateCondBr(cond, ifTrue, ifFalse);
 335         }
 336
 337         Value *Nucleus::createAdd(Value *lhs, Value *rhs)
 338         {
 339                 return V(::builder->CreateAdd(lhs, rhs));
 340         }
 341
 342         Value *Nucleus::createSub(Value *lhs, Value *rhs)
 343         {
 344                 return V(::builder->CreateSub(lhs, rhs));
 345         }
 346
 347         Value *Nucleus::createMul(Value *lhs, Value *rhs)
 348         {
 349                 return V(::builder->CreateMul(lhs, rhs));
 350         }
 351
 352         Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
 353         {
 354                 return V(::builder->CreateUDiv(lhs, rhs));
 355         }
 356
 357         Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
 358         {
 359                 return V(::builder->CreateSDiv(lhs, rhs));
 360         }
 361
 362         Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
 363         {
 364                 return V(::builder->CreateFAdd(lhs, rhs));
 365         }
 366
 367         Value *Nucleus::createFSub(Value *lhs, Value *rhs)
 368         {
 369                 return V(::builder->CreateFSub(lhs, rhs));
 370         }
 371
 372         Value *Nucleus::createFMul(Value *lhs, Value *rhs)
 373         {
 374                 return V(::builder->CreateFMul(lhs, rhs));
 375         }
 376
 377         Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
 378         {
 379                 return V(::builder->CreateFDiv(lhs, rhs));
 380         }
 381
 382         Value *Nucleus::createURem(Value *lhs, Value *rhs)
 383         {
 384                 return V(::builder->CreateURem(lhs, rhs));
 385         }
 386
 387         Value *Nucleus::createSRem(Value *lhs, Value *rhs)
 388         {
 389                 return V(::builder->CreateSRem(lhs, rhs));
 390         }
 391
 392         Value *Nucleus::createFRem(Value *lhs, Value *rhs)
 393         {
 394                 return V(::builder->CreateFRem(lhs, rhs));
 395         }
 396
 397         Value *Nucleus::createShl(Value *lhs, Value *rhs)
 398         {
 399                 return V(::builder->CreateShl(lhs, rhs));
 400         }
 401
 402         Value *Nucleus::createLShr(Value *lhs, Value *rhs)
 403         {
 404                 return V(::builder->CreateLShr(lhs, rhs));
 405         }
 406
 407         Value *Nucleus::createAShr(Value *lhs, Value *rhs)
 408         {
 409                 return V(::builder->CreateAShr(lhs, rhs));
 410         }
 411
 412         Value *Nucleus::createAnd(Value *lhs, Value *rhs)
 413         {
 414                 return V(::builder->CreateAnd(lhs, rhs));
 415         }
 416
 417         Value *Nucleus::createOr(Value *lhs, Value *rhs)
 418         {
 419                 return V(::builder->CreateOr(lhs, rhs));
 420         }
 421
 422         Value *Nucleus::createXor(Value *lhs, Value *rhs)
 423         {
 424                 return V(::builder->CreateXor(lhs, rhs));
 425         }
 426
 427         Value *Nucleus::createNeg(Value *v)
 428         {
 429                 return V(::builder->CreateNeg(v));
 430         }
 431
 432         Value *Nucleus::createFNeg(Value *v)
 433         {
 434                 return V(::builder->CreateFNeg(v));
 435         }
 436
 437         Value *Nucleus::createNot(Value *v)
 438         {
 439                 return V(::builder->CreateNot(v));
 440         }
 441
 442         Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align)
 443         {
 444                 assert(ptr->getType()->getContainedType(0) == type);
 445                 return V(::builder->Insert(new LoadInst(ptr, "", isVolatile, align)));
 446         }
 447
 448         Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align)
 449         {
 450                 assert(ptr->getType()->getContainedType(0) == type);
 451                 ::builder->Insert(new StoreInst(value, ptr, isVolatile, align));
 452                 return value;
 453         }
 454
 455         Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index)
 456         {
 457                 assert(ptr->getType()->getContainedType(0) == type);
 458                 return V(::builder->CreateGEP(ptr, index));
 459         }
 460
 461         Value *Nucleus::createAtomicAdd(Value *ptr, Value *value)
 462         {
 463                 return V(::builder->CreateAtomicRMW(AtomicRMWInst::Add, ptr, value, SequentiallyConsistent));
 464         }
 465
 466         Value *Nucleus::createTrunc(Value *v, Type *destType)
 467         {
 468                 return V(::builder->CreateTrunc(v, destType));
 469         }
 470
 471         Value *Nucleus::createZExt(Value *v, Type *destType)
 472         {
 473                 return V(::builder->CreateZExt(v, destType));
 474         }
 475
 476         Value *Nucleus::createSExt(Value *v, Type *destType)
 477         {
 478                 return V(::builder->CreateSExt(v, destType));
 479         }
 480
 481         Value *Nucleus::createFPToSI(Value *v, Type *destType)
 482         {
 483                 return V(::builder->CreateFPToSI(v, destType));
 484         }
 485
 486         Value *Nucleus::createUIToFP(Value *v, Type *destType)
 487         {
 488                 return V(::builder->CreateUIToFP(v, destType));
 489         }
 490
 491         Value *Nucleus::createSIToFP(Value *v, Type *destType)
 492         {
 493                 return V(::builder->CreateSIToFP(v, destType));
 494         }
 495
 496         Value *Nucleus::createFPTrunc(Value *v, Type *destType)
 497         {
 498                 return V(::builder->CreateFPTrunc(v, destType));
 499         }
 500
 501         Value *Nucleus::createFPExt(Value *v, Type *destType)
 502         {
 503                 return V(::builder->CreateFPExt(v, destType));
 504         }
 505
 506         Value *Nucleus::createBitCast(Value *v, Type *destType)
 507         {
 508                 return V(::builder->CreateBitCast(v, destType));
 509         }
 510
 511         Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
 512         {
 513                 return V(::builder->CreateICmpEQ(lhs, rhs));
 514         }
 515
 516         Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
 517         {
 518                 return V(::builder->CreateICmpNE(lhs, rhs));
 519         }
 520
 521         Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
 522         {
 523                 return V(::builder->CreateICmpUGT(lhs, rhs));
 524         }
 525
 526         Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
 527         {
 528                 return V(::builder->CreateICmpUGE(lhs, rhs));
 529         }
 530
 531         Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
 532         {
 533                 return V(::builder->CreateICmpULT(lhs, rhs));
 534         }
 535
 536         Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
 537         {
 538                 return V(::builder->CreateICmpULE(lhs, rhs));
 539         }
 540
 541         Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
 542         {
 543                 return V(::builder->CreateICmpSGT(lhs, rhs));
 544         }
 545
 546         Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
 547         {
 548                 return V(::builder->CreateICmpSGE(lhs, rhs));
 549         }
 550
 551         Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
 552         {
 553                 return V(::builder->CreateICmpSLT(lhs, rhs));
 554         }
 555
 556         Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
 557         {
 558                 return V(::builder->CreateICmpSLE(lhs, rhs));
 559         }
 560
 561         Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
 562         {
 563                 return V(::builder->CreateFCmpOEQ(lhs, rhs));
 564         }
 565
 566         Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
 567         {
 568                 return V(::builder->CreateFCmpOGT(lhs, rhs));
 569         }
 570
 571         Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
 572         {
 573                 return V(::builder->CreateFCmpOGE(lhs, rhs));
 574         }
 575
 576         Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
 577         {
 578                 return V(::builder->CreateFCmpOLT(lhs, rhs));
 579         }
 580
 581         Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
 582         {
 583                 return V(::builder->CreateFCmpOLE(lhs, rhs));
 584         }
 585
 586         Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
 587         {
 588                 return V(::builder->CreateFCmpONE(lhs, rhs));
 589         }
 590
 591         Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
 592         {
 593                 return V(::builder->CreateFCmpORD(lhs, rhs));
 594         }
 595
 596         Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
 597         {
 598                 return V(::builder->CreateFCmpUNO(lhs, rhs));
 599         }
 600
 601         Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
 602         {
 603                 return V(::builder->CreateFCmpUEQ(lhs, rhs));
 604         }
 605
 606         Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
 607         {
 608                 return V(::builder->CreateFCmpUGT(lhs, rhs));
 609         }
 610
 611         Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
 612         {
 613                 return V(::builder->CreateFCmpUGE(lhs, rhs));
 614         }
 615
 616         Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
 617         {
 618                 return V(::builder->CreateFCmpULT(lhs, rhs));
 619         }
 620
 621         Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
 622         {
 623                 return V(::builder->CreateFCmpULE(lhs, rhs));
 624         }
 625
 626         Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
 627         {
 628                 return V(::builder->CreateFCmpULE(lhs, rhs));
 629         }
 630
 631         Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
 632         {
 633                 assert(vector->getType()->getContainedType(0) == type);
 634                 return V(::builder->CreateExtractElement(vector, createConstantInt(index)));
 635         }
 636
 637         Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
 638         {
 639                 return V(::builder->CreateInsertElement(vector, element, createConstantInt(index)));
 640         }
 641
 642         Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
 643         {
 644                 int size = llvm::cast<llvm::VectorType>(V1->getType())->getNumElements();
 645                 const int maxSize = 16;
 646                 llvm::Constant *swizzle[maxSize];
 647                 assert(size <= maxSize);
 648
 649                 for(int i = 0; i < size; i++)
 650                 {
 651                         swizzle[i] = llvm::ConstantInt::get(Type::getInt32Ty(*::context), select[i]);
 652                 }
 653
 654                 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
 655
 656                 return V(::builder->CreateShuffleVector(V1, V2, shuffle));
 657         }
 658
 659         Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
 660         {
 661                 return V(::builder->CreateSelect(C, ifTrue, ifFalse));
 662         }
 663
 664         Value *Nucleus::createSwitch(Value *v, BasicBlock *Dest, unsigned NumCases)
 665         {
 666                 return V(::builder->CreateSwitch(v, Dest, NumCases));
 667         }
 668
 669         void Nucleus::addSwitchCase(Value *Switch, int Case, BasicBlock *Branch)
 670         {
 671                 reinterpret_cast<SwitchInst*>(Switch)->addCase(llvm::ConstantInt::get(Type::getInt32Ty(*::context), Case, true), Branch);
 672         }
 673
 674         void Nucleus::createUnreachable()
 675         {
 676                 ::builder->CreateUnreachable();
 677         }
 678
 679         static Value *createSwizzle4(Value *val, unsigned char select)
 680         {
 681                 int swizzle[4] =
 682                 {
 683                         (select >> 0) & 0x03,
 684                         (select >> 2) & 0x03,
 685                         (select >> 4) & 0x03,
 686                         (select >> 6) & 0x03,
 687                 };
 688
 689                 return Nucleus::createShuffleVector(val, val, swizzle);
 690         }
 691
 692         static Value *createMask4(Value *lhs, Value *rhs, unsigned char select)
 693         {
 694                 bool mask[4] = {false, false, false, false};
 695
 696                 mask[(select >> 0) & 0x03] = true;
 697                 mask[(select >> 2) & 0x03] = true;
 698                 mask[(select >> 4) & 0x03] = true;
 699                 mask[(select >> 6) & 0x03] = true;
 700
 701                 int swizzle[4] =
 702                 {
 703                         mask[0] ? 4 : 0,
 704                         mask[1] ? 5 : 1,
 705                         mask[2] ? 6 : 2,
 706                         mask[3] ? 7 : 3,
 707                 };
 708
 709                 Value *shuffle = Nucleus::createShuffleVector(lhs, rhs, swizzle);
 710
 711                 return shuffle;
 712         }
 713
 714         Value *Nucleus::createConstantPointer(const void *address, Type *Ty, unsigned int align)
 715         {
 716                 const GlobalValue *existingGlobal = ::executionEngine->getGlobalValueAtAddress(const_cast<void*>(address));   // FIXME: Const
 717
 718                 if(existingGlobal)
 719                 {
 720                         return (Value*)existingGlobal;
 721                 }
 722
 723                 llvm::GlobalValue *global = new llvm::GlobalVariable(*::module, Ty, true, llvm::GlobalValue::ExternalLinkage, 0, "");
 724                 global->setAlignment(align);
 725
 726                 ::executionEngine->addGlobalMapping(global, const_cast<void*>(address));
 727
 728                 return V(global);
 729         }
 730
 731         Type *Nucleus::getPointerType(Type *ElementType)
 732         {
 733                 return T(llvm::PointerType::get(ElementType, 0));
 734         }
 735
 736         Value *Nucleus::createNullValue(Type *Ty)
 737         {
 738                 return V(llvm::Constant::getNullValue(Ty));
 739         }
 740
 741         Value *Nucleus::createConstantLong(int64_t i)
 742         {
 743                 return V(llvm::ConstantInt::get(Type::getInt64Ty(*::context), i, true));
 744         }
 745
 746         Value *Nucleus::createConstantInt(int i)
 747         {
 748                 return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, true));
 749         }
 750
 751         Value *Nucleus::createConstantInt(unsigned int i)
 752         {
 753                 return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, false));
 754         }
 755
 756         Value *Nucleus::createConstantBool(bool b)
 757         {
 758                 return V(llvm::ConstantInt::get(Type::getInt1Ty(*::context), b));
 759         }
 760
 761         Value *Nucleus::createConstantByte(signed char i)
 762         {
 763                 return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, true));
 764         }
 765
 766         Value *Nucleus::createConstantByte(unsigned char i)
 767         {
 768                 return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, false));
 769         }
 770
 771         Value *Nucleus::createConstantShort(short i)
 772         {
 773                 return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, true));
 774         }
 775
 776         Value *Nucleus::createConstantShort(unsigned short i)
 777         {
 778                 return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, false));
 779         }
 780
 781         Value *Nucleus::createConstantFloat(float x)
 782         {
 783                 return V(llvm::ConstantFP::get(Float::getType(), x));
 784         }
 785
 786         Value *Nucleus::createNullPointer(Type *Ty)
 787         {
 788                 return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(Ty, 0)));
 789         }
 790
 791         Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
 792         {
 793                 assert(llvm::isa<VectorType>(type));
 794                 const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
 795                 assert(numConstants <= 16);
 796                 llvm::Constant *constantVector[16];
 797
 798                 for(int i = 0; i < numConstants; i++)
 799                 {
 800                         constantVector[i] = llvm::ConstantInt::get(type->getContainedType(0), constants[i]);
 801                 }
 802
 803                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
 804         }
 805
 806         Value *Nucleus::createConstantVector(const double *constants, Type *type)
 807         {
 808                 assert(llvm::isa<VectorType>(type));
 809                 const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
 810                 assert(numConstants <= 8);
 811                 llvm::Constant *constantVector[8];
 812
 813                 for(int i = 0; i < numConstants; i++)
 814                 {
 815                         constantVector[i] = llvm::ConstantFP::get(type->getContainedType(0), constants[i]);
 816                 }
 817
 818                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
 819         }
 820
 821         Type *Void::getType()
 822         {
 823                 return T(llvm::Type::getVoidTy(*::context));
 824         }
 825
 826         class MMX : public Variable<MMX>
 827         {
 828         public:
 829                 static Type *getType();
 830         };
 831
 832         Type *MMX::getType()
 833         {
 834                 return T(llvm::Type::getX86_MMXTy(*::context));
 835         }
 836
 837         Bool::Bool(Argument<Bool> argument)
 838         {
 839                 storeValue(argument.value);
 840         }
 841
 842         Bool::Bool()
 843         {
 844         }
 845
 846         Bool::Bool(bool x)
 847         {
 848                 storeValue(Nucleus::createConstantBool(x));
 849         }
 850
 851         Bool::Bool(RValue<Bool> rhs)
 852         {
 853                 storeValue(rhs.value);
 854         }
 855
 856         Bool::Bool(const Bool &rhs)
 857         {
 858                 Value *value = rhs.loadValue();
 859                 storeValue(value);
 860         }
 861
 862         Bool::Bool(const Reference<Bool> &rhs)
 863         {
 864                 Value *value = rhs.loadValue();
 865                 storeValue(value);
 866         }
 867
 868         RValue<Bool> Bool::operator=(RValue<Bool> rhs) const
 869         {
 870                 storeValue(rhs.value);
 871
 872                 return rhs;
 873         }
 874
 875         RValue<Bool> Bool::operator=(const Bool &rhs) const
 876         {
 877                 Value *value = rhs.loadValue();
 878                 storeValue(value);
 879
 880                 return RValue<Bool>(value);
 881         }
 882
 883         RValue<Bool> Bool::operator=(const Reference<Bool> &rhs) const
 884         {
 885                 Value *value = rhs.loadValue();
 886                 storeValue(value);
 887
 888                 return RValue<Bool>(value);
 889         }
 890
 891         RValue<Bool> operator!(RValue<Bool> val)
 892         {
 893                 return RValue<Bool>(Nucleus::createNot(val.value));
 894         }
 895
 896         RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs)
 897         {
 898                 return RValue<Bool>(Nucleus::createAnd(lhs.value, rhs.value));
 899         }
 900
 901         RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs)
 902         {
 903                 return RValue<Bool>(Nucleus::createOr(lhs.value, rhs.value));
 904         }
 905
 906         Type *Bool::getType()
 907         {
 908                 return T(llvm::Type::getInt1Ty(*::context));
 909         }
 910
 911         Byte::Byte(Argument<Byte> argument)
 912         {
 913                 storeValue(argument.value);
 914         }
 915
 916         Byte::Byte(RValue<Int> cast)
 917         {
 918                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 919
 920                 storeValue(integer);
 921         }
 922
 923         Byte::Byte(RValue<UInt> cast)
 924         {
 925                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 926
 927                 storeValue(integer);
 928         }
 929
 930         Byte::Byte(RValue<UShort> cast)
 931         {
 932                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 933
 934                 storeValue(integer);
 935         }
 936
 937         Byte::Byte()
 938         {
 939         }
 940
 941         Byte::Byte(int x)
 942         {
 943                 storeValue(Nucleus::createConstantByte((unsigned char)x));
 944         }
 945
 946         Byte::Byte(unsigned char x)
 947         {
 948                 storeValue(Nucleus::createConstantByte(x));
 949         }
 950
 951         Byte::Byte(RValue<Byte> rhs)
 952         {
 953                 storeValue(rhs.value);
 954         }
 955
 956         Byte::Byte(const Byte &rhs)
 957         {
 958                 Value *value = rhs.loadValue();
 959                 storeValue(value);
 960         }
 961
 962         Byte::Byte(const Reference<Byte> &rhs)
 963         {
 964                 Value *value = rhs.loadValue();
 965                 storeValue(value);
 966         }
 967
 968         RValue<Byte> Byte::operator=(RValue<Byte> rhs) const
 969         {
 970                 storeValue(rhs.value);
 971
 972                 return rhs;
 973         }
 974
 975         RValue<Byte> Byte::operator=(const Byte &rhs) const
 976         {
 977                 Value *value = rhs.loadValue();
 978                 storeValue(value);
 979
 980                 return RValue<Byte>(value);
 981         }
 982
 983         RValue<Byte> Byte::operator=(const Reference<Byte> &rhs) const
 984         {
 985                 Value *value = rhs.loadValue();
 986                 storeValue(value);
 987
 988                 return RValue<Byte>(value);
 989         }
 990
 991         RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs)
 992         {
 993                 return RValue<Byte>(Nucleus::createAdd(lhs.value, rhs.value));
 994         }
 995
 996         RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs)
 997         {
 998                 return RValue<Byte>(Nucleus::createSub(lhs.value, rhs.value));
 999         }
1000
1001         RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs)
1002         {
1003                 return RValue<Byte>(Nucleus::createMul(lhs.value, rhs.value));
1004         }
1005
1006         RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs)
1007         {
1008                 return RValue<Byte>(Nucleus::createUDiv(lhs.value, rhs.value));
1009         }
1010
1011         RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs)
1012         {
1013                 return RValue<Byte>(Nucleus::createURem(lhs.value, rhs.value));
1014         }
1015
1016         RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs)
1017         {
1018                 return RValue<Byte>(Nucleus::createAnd(lhs.value, rhs.value));
1019         }
1020
1021         RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs)
1022         {
1023                 return RValue<Byte>(Nucleus::createOr(lhs.value, rhs.value));
1024         }
1025
1026         RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs)
1027         {
1028                 return RValue<Byte>(Nucleus::createXor(lhs.value, rhs.value));
1029         }
1030
1031         RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs)
1032         {
1033                 return RValue<Byte>(Nucleus::createShl(lhs.value, rhs.value));
1034         }
1035
1036         RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs)
1037         {
1038                 return RValue<Byte>(Nucleus::createLShr(lhs.value, rhs.value));
1039         }
1040
1041         RValue<Byte> operator+=(const Byte &lhs, RValue<Byte> rhs)
1042         {
1043                 return lhs = lhs + rhs;
1044         }
1045
1046         RValue<Byte> operator-=(const Byte &lhs, RValue<Byte> rhs)
1047         {
1048                 return lhs = lhs - rhs;
1049         }
1050
1051         RValue<Byte> operator*=(const Byte &lhs, RValue<Byte> rhs)
1052         {
1053                 return lhs = lhs * rhs;
1054         }
1055
1056         RValue<Byte> operator/=(const Byte &lhs, RValue<Byte> rhs)
1057         {
1058                 return lhs = lhs / rhs;
1059         }
1060
1061         RValue<Byte> operator%=(const Byte &lhs, RValue<Byte> rhs)
1062         {
1063                 return lhs = lhs % rhs;
1064         }
1065
1066         RValue<Byte> operator&=(const Byte &lhs, RValue<Byte> rhs)
1067         {
1068                 return lhs = lhs & rhs;
1069         }
1070
1071         RValue<Byte> operator|=(const Byte &lhs, RValue<Byte> rhs)
1072         {
1073                 return lhs = lhs | rhs;
1074         }
1075
1076         RValue<Byte> operator^=(const Byte &lhs, RValue<Byte> rhs)
1077         {
1078                 return lhs = lhs ^ rhs;
1079         }
1080
1081         RValue<Byte> operator<<=(const Byte &lhs, RValue<Byte> rhs)
1082         {
1083                 return lhs = lhs << rhs;
1084         }
1085
1086         RValue<Byte> operator>>=(const Byte &lhs, RValue<Byte> rhs)
1087         {
1088                 return lhs = lhs >> rhs;
1089         }
1090
1091         RValue<Byte> operator+(RValue<Byte> val)
1092         {
1093                 return val;
1094         }
1095
1096         RValue<Byte> operator-(RValue<Byte> val)
1097         {
1098                 return RValue<Byte>(Nucleus::createNeg(val.value));
1099         }
1100
1101         RValue<Byte> operator~(RValue<Byte> val)
1102         {
1103                 return RValue<Byte>(Nucleus::createNot(val.value));
1104         }
1105
1106         RValue<Byte> operator++(const Byte &val, int)   // Post-increment
1107         {
1108                 RValue<Byte> res = val;
1109
1110                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantByte((unsigned char)1)));
1111                 val.storeValue(inc);
1112
1113                 return res;
1114         }
1115
1116         const Byte &operator++(const Byte &val)   // Pre-increment
1117         {
1118                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantByte((unsigned char)1)));
1119                 val.storeValue(inc);
1120
1121                 return val;
1122         }
1123
1124         RValue<Byte> operator--(const Byte &val, int)   // Post-decrement
1125         {
1126                 RValue<Byte> res = val;
1127
1128                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantByte((unsigned char)1)));
1129                 val.storeValue(inc);
1130
1131                 return res;
1132         }
1133
1134         const Byte &operator--(const Byte &val)   // Pre-decrement
1135         {
1136                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantByte((unsigned char)1)));
1137                 val.storeValue(inc);
1138
1139                 return val;
1140         }
1141
1142         RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs)
1143         {
1144                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
1145         }
1146
1147         RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs)
1148         {
1149                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
1150         }
1151
1152         RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs)
1153         {
1154                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
1155         }
1156
1157         RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs)
1158         {
1159                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
1160         }
1161
1162         RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs)
1163         {
1164                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1165         }
1166
1167         RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs)
1168         {
1169                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1170         }
1171
1172         Type *Byte::getType()
1173         {
1174                 return T(llvm::Type::getInt8Ty(*::context));
1175         }
1176
1177         SByte::SByte(Argument<SByte> argument)
1178         {
1179                 storeValue(argument.value);
1180         }
1181
1182         SByte::SByte(RValue<Int> cast)
1183         {
1184                 Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
1185
1186                 storeValue(integer);
1187         }
1188
1189         SByte::SByte(RValue<Short> cast)
1190         {
1191                 Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
1192
1193                 storeValue(integer);
1194         }
1195
1196         SByte::SByte()
1197         {
1198         }
1199
1200         SByte::SByte(signed char x)
1201         {
1202                 storeValue(Nucleus::createConstantByte(x));
1203         }
1204
1205         SByte::SByte(RValue<SByte> rhs)
1206         {
1207                 storeValue(rhs.value);
1208         }
1209
1210         SByte::SByte(const SByte &rhs)
1211         {
1212                 Value *value = rhs.loadValue();
1213                 storeValue(value);
1214         }
1215
1216         SByte::SByte(const Reference<SByte> &rhs)
1217         {
1218                 Value *value = rhs.loadValue();
1219                 storeValue(value);
1220         }
1221
1222         RValue<SByte> SByte::operator=(RValue<SByte> rhs) const
1223         {
1224                 storeValue(rhs.value);
1225
1226                 return rhs;
1227         }
1228
1229         RValue<SByte> SByte::operator=(const SByte &rhs) const
1230         {
1231                 Value *value = rhs.loadValue();
1232                 storeValue(value);
1233
1234                 return RValue<SByte>(value);
1235         }
1236
1237         RValue<SByte> SByte::operator=(const Reference<SByte> &rhs) const
1238         {
1239                 Value *value = rhs.loadValue();
1240                 storeValue(value);
1241
1242                 return RValue<SByte>(value);
1243         }
1244
1245         RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs)
1246         {
1247                 return RValue<SByte>(Nucleus::createAdd(lhs.value, rhs.value));
1248         }
1249
1250         RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs)
1251         {
1252                 return RValue<SByte>(Nucleus::createSub(lhs.value, rhs.value));
1253         }
1254
1255         RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs)
1256         {
1257                 return RValue<SByte>(Nucleus::createMul(lhs.value, rhs.value));
1258         }
1259
1260         RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs)
1261         {
1262                 return RValue<SByte>(Nucleus::createSDiv(lhs.value, rhs.value));
1263         }
1264
1265         RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs)
1266         {
1267                 return RValue<SByte>(Nucleus::createSRem(lhs.value, rhs.value));
1268         }
1269
1270         RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs)
1271         {
1272                 return RValue<SByte>(Nucleus::createAnd(lhs.value, rhs.value));
1273         }
1274
1275         RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs)
1276         {
1277                 return RValue<SByte>(Nucleus::createOr(lhs.value, rhs.value));
1278         }
1279
1280         RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs)
1281         {
1282                 return RValue<SByte>(Nucleus::createXor(lhs.value, rhs.value));
1283         }
1284
1285         RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs)
1286         {
1287                 return RValue<SByte>(Nucleus::createShl(lhs.value, rhs.value));
1288         }
1289
1290         RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs)
1291         {
1292                 return RValue<SByte>(Nucleus::createAShr(lhs.value, rhs.value));
1293         }
1294
1295         RValue<SByte> operator+=(const SByte &lhs, RValue<SByte> rhs)
1296         {
1297                 return lhs = lhs + rhs;
1298         }
1299
1300         RValue<SByte> operator-=(const SByte &lhs, RValue<SByte> rhs)
1301         {
1302                 return lhs = lhs - rhs;
1303         }
1304
1305         RValue<SByte> operator*=(const SByte &lhs, RValue<SByte> rhs)
1306         {
1307                 return lhs = lhs * rhs;
1308         }
1309
1310         RValue<SByte> operator/=(const SByte &lhs, RValue<SByte> rhs)
1311         {
1312                 return lhs = lhs / rhs;
1313         }
1314
1315         RValue<SByte> operator%=(const SByte &lhs, RValue<SByte> rhs)
1316         {
1317                 return lhs = lhs % rhs;
1318         }
1319
1320         RValue<SByte> operator&=(const SByte &lhs, RValue<SByte> rhs)
1321         {
1322                 return lhs = lhs & rhs;
1323         }
1324
1325         RValue<SByte> operator|=(const SByte &lhs, RValue<SByte> rhs)
1326         {
1327                 return lhs = lhs | rhs;
1328         }
1329
1330         RValue<SByte> operator^=(const SByte &lhs, RValue<SByte> rhs)
1331         {
1332                 return lhs = lhs ^ rhs;
1333         }
1334
1335         RValue<SByte> operator<<=(const SByte &lhs, RValue<SByte> rhs)
1336         {
1337                 return lhs = lhs << rhs;
1338         }
1339
1340         RValue<SByte> operator>>=(const SByte &lhs, RValue<SByte> rhs)
1341         {
1342                 return lhs = lhs >> rhs;
1343         }
1344
1345         RValue<SByte> operator+(RValue<SByte> val)
1346         {
1347                 return val;
1348         }
1349
1350         RValue<SByte> operator-(RValue<SByte> val)
1351         {
1352                 return RValue<SByte>(Nucleus::createNeg(val.value));
1353         }
1354
1355         RValue<SByte> operator~(RValue<SByte> val)
1356         {
1357                 return RValue<SByte>(Nucleus::createNot(val.value));
1358         }
1359
1360         RValue<SByte> operator++(const SByte &val, int)   // Post-increment
1361         {
1362                 RValue<SByte> res = val;
1363
1364                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantByte((signed char)1)));
1365                 val.storeValue(inc);
1366
1367                 return res;
1368         }
1369
1370         const SByte &operator++(const SByte &val)   // Pre-increment
1371         {
1372                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantByte((signed char)1)));
1373                 val.storeValue(inc);
1374
1375                 return val;
1376         }
1377
1378         RValue<SByte> operator--(const SByte &val, int)   // Post-decrement
1379         {
1380                 RValue<SByte> res = val;
1381
1382                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantByte((signed char)1)));
1383                 val.storeValue(inc);
1384
1385                 return res;
1386         }
1387
1388         const SByte &operator--(const SByte &val)   // Pre-decrement
1389         {
1390                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantByte((signed char)1)));
1391                 val.storeValue(inc);
1392
1393                 return val;
1394         }
1395
1396         RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs)
1397         {
1398                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
1399         }
1400
1401         RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs)
1402         {
1403                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
1404         }
1405
1406         RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs)
1407         {
1408                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
1409         }
1410
1411         RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs)
1412         {
1413                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
1414         }
1415
1416         RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs)
1417         {
1418                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1419         }
1420
1421         RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs)
1422         {
1423                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1424         }
1425
1426         Type *SByte::getType()
1427         {
1428                 return T(llvm::Type::getInt8Ty(*::context));
1429         }
1430
1431         Short::Short(Argument<Short> argument)
1432         {
1433                 storeValue(argument.value);
1434         }
1435
1436         Short::Short(RValue<Int> cast)
1437         {
1438                 Value *integer = Nucleus::createTrunc(cast.value, Short::getType());
1439
1440                 storeValue(integer);
1441         }
1442
1443         Short::Short()
1444         {
1445         }
1446
1447         Short::Short(short x)
1448         {
1449                 storeValue(Nucleus::createConstantShort(x));
1450         }
1451
1452         Short::Short(RValue<Short> rhs)
1453         {
1454                 storeValue(rhs.value);
1455         }
1456
1457         Short::Short(const Short &rhs)
1458         {
1459                 Value *value = rhs.loadValue();
1460                 storeValue(value);
1461         }
1462
1463         Short::Short(const Reference<Short> &rhs)
1464         {
1465                 Value *value = rhs.loadValue();
1466                 storeValue(value);
1467         }
1468
1469         RValue<Short> Short::operator=(RValue<Short> rhs) const
1470         {
1471                 storeValue(rhs.value);
1472
1473                 return rhs;
1474         }
1475
1476         RValue<Short> Short::operator=(const Short &rhs) const
1477         {
1478                 Value *value = rhs.loadValue();
1479                 storeValue(value);
1480
1481                 return RValue<Short>(value);
1482         }
1483
1484         RValue<Short> Short::operator=(const Reference<Short> &rhs) const
1485         {
1486                 Value *value = rhs.loadValue();
1487                 storeValue(value);
1488
1489                 return RValue<Short>(value);
1490         }
1491
1492         RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs)
1493         {
1494                 return RValue<Short>(Nucleus::createAdd(lhs.value, rhs.value));
1495         }
1496
1497         RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs)
1498         {
1499                 return RValue<Short>(Nucleus::createSub(lhs.value, rhs.value));
1500         }
1501
1502         RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs)
1503         {
1504                 return RValue<Short>(Nucleus::createMul(lhs.value, rhs.value));
1505         }
1506
1507         RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs)
1508         {
1509                 return RValue<Short>(Nucleus::createSDiv(lhs.value, rhs.value));
1510         }
1511
1512         RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs)
1513         {
1514                 return RValue<Short>(Nucleus::createSRem(lhs.value, rhs.value));
1515         }
1516
1517         RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs)
1518         {
1519                 return RValue<Short>(Nucleus::createAnd(lhs.value, rhs.value));
1520         }
1521
1522         RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs)
1523         {
1524                 return RValue<Short>(Nucleus::createOr(lhs.value, rhs.value));
1525         }
1526
1527         RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs)
1528         {
1529                 return RValue<Short>(Nucleus::createXor(lhs.value, rhs.value));
1530         }
1531
1532         RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs)
1533         {
1534                 return RValue<Short>(Nucleus::createShl(lhs.value, rhs.value));
1535         }
1536
1537         RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs)
1538         {
1539                 return RValue<Short>(Nucleus::createAShr(lhs.value, rhs.value));
1540         }
1541
1542         RValue<Short> operator+=(const Short &lhs, RValue<Short> rhs)
1543         {
1544                 return lhs = lhs + rhs;
1545         }
1546
1547         RValue<Short> operator-=(const Short &lhs, RValue<Short> rhs)
1548         {
1549                 return lhs = lhs - rhs;
1550         }
1551
1552         RValue<Short> operator*=(const Short &lhs, RValue<Short> rhs)
1553         {
1554                 return lhs = lhs * rhs;
1555         }
1556
1557         RValue<Short> operator/=(const Short &lhs, RValue<Short> rhs)
1558         {
1559                 return lhs = lhs / rhs;
1560         }
1561
1562         RValue<Short> operator%=(const Short &lhs, RValue<Short> rhs)
1563         {
1564                 return lhs = lhs % rhs;
1565         }
1566
1567         RValue<Short> operator&=(const Short &lhs, RValue<Short> rhs)
1568         {
1569                 return lhs = lhs & rhs;
1570         }
1571
1572         RValue<Short> operator|=(const Short &lhs, RValue<Short> rhs)
1573         {
1574                 return lhs = lhs | rhs;
1575         }
1576
1577         RValue<Short> operator^=(const Short &lhs, RValue<Short> rhs)
1578         {
1579                 return lhs = lhs ^ rhs;
1580         }
1581
1582         RValue<Short> operator<<=(const Short &lhs, RValue<Short> rhs)
1583         {
1584                 return lhs = lhs << rhs;
1585         }
1586
1587         RValue<Short> operator>>=(const Short &lhs, RValue<Short> rhs)
1588         {
1589                 return lhs = lhs >> rhs;
1590         }
1591
1592         RValue<Short> operator+(RValue<Short> val)
1593         {
1594                 return val;
1595         }
1596
1597         RValue<Short> operator-(RValue<Short> val)
1598         {
1599                 return RValue<Short>(Nucleus::createNeg(val.value));
1600         }
1601
1602         RValue<Short> operator~(RValue<Short> val)
1603         {
1604                 return RValue<Short>(Nucleus::createNot(val.value));
1605         }
1606
1607         RValue<Short> operator++(const Short &val, int)   // Post-increment
1608         {
1609                 RValue<Short> res = val;
1610
1611                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantShort((short)1)));
1612                 val.storeValue(inc);
1613
1614                 return res;
1615         }
1616
1617         const Short &operator++(const Short &val)   // Pre-increment
1618         {
1619                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantShort((short)1)));
1620                 val.storeValue(inc);
1621
1622                 return val;
1623         }
1624
1625         RValue<Short> operator--(const Short &val, int)   // Post-decrement
1626         {
1627                 RValue<Short> res = val;
1628
1629                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantShort((short)1)));
1630                 val.storeValue(inc);
1631
1632                 return res;
1633         }
1634
1635         const Short &operator--(const Short &val)   // Pre-decrement
1636         {
1637                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantShort((short)1)));
1638                 val.storeValue(inc);
1639
1640                 return val;
1641         }
1642
1643         RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs)
1644         {
1645                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
1646         }
1647
1648         RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs)
1649         {
1650                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
1651         }
1652
1653         RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs)
1654         {
1655                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
1656         }
1657
1658         RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs)
1659         {
1660                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
1661         }
1662
1663         RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs)
1664         {
1665                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1666         }
1667
1668         RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs)
1669         {
1670                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1671         }
1672
1673         Type *Short::getType()
1674         {
1675                 return T(llvm::Type::getInt16Ty(*::context));
1676         }
1677
1678         UShort::UShort(Argument<UShort> argument)
1679         {
1680                 storeValue(argument.value);
1681         }
1682
1683         UShort::UShort(RValue<UInt> cast)
1684         {
1685                 Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
1686
1687                 storeValue(integer);
1688         }
1689
1690         UShort::UShort(RValue<Int> cast)
1691         {
1692                 Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
1693
1694                 storeValue(integer);
1695         }
1696
1697         UShort::UShort()
1698         {
1699         }
1700
1701         UShort::UShort(unsigned short x)
1702         {
1703                 storeValue(Nucleus::createConstantShort(x));
1704         }
1705
1706         UShort::UShort(RValue<UShort> rhs)
1707         {
1708                 storeValue(rhs.value);
1709         }
1710
1711         UShort::UShort(const UShort &rhs)
1712         {
1713                 Value *value = rhs.loadValue();
1714                 storeValue(value);
1715         }
1716
1717         UShort::UShort(const Reference<UShort> &rhs)
1718         {
1719                 Value *value = rhs.loadValue();
1720                 storeValue(value);
1721         }
1722
1723         RValue<UShort> UShort::operator=(RValue<UShort> rhs) const
1724         {
1725                 storeValue(rhs.value);
1726
1727                 return rhs;
1728         }
1729
1730         RValue<UShort> UShort::operator=(const UShort &rhs) const
1731         {
1732                 Value *value = rhs.loadValue();
1733                 storeValue(value);
1734
1735                 return RValue<UShort>(value);
1736         }
1737
1738         RValue<UShort> UShort::operator=(const Reference<UShort> &rhs) const
1739         {
1740                 Value *value = rhs.loadValue();
1741                 storeValue(value);
1742
1743                 return RValue<UShort>(value);
1744         }
1745
1746         RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs)
1747         {
1748                 return RValue<UShort>(Nucleus::createAdd(lhs.value, rhs.value));
1749         }
1750
1751         RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs)
1752         {
1753                 return RValue<UShort>(Nucleus::createSub(lhs.value, rhs.value));
1754         }
1755
1756         RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs)
1757         {
1758                 return RValue<UShort>(Nucleus::createMul(lhs.value, rhs.value));
1759         }
1760
1761         RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs)
1762         {
1763                 return RValue<UShort>(Nucleus::createUDiv(lhs.value, rhs.value));
1764         }
1765
1766         RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs)
1767         {
1768                 return RValue<UShort>(Nucleus::createURem(lhs.value, rhs.value));
1769         }
1770
1771         RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs)
1772         {
1773                 return RValue<UShort>(Nucleus::createAnd(lhs.value, rhs.value));
1774         }
1775
1776         RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs)
1777         {
1778                 return RValue<UShort>(Nucleus::createOr(lhs.value, rhs.value));
1779         }
1780
1781         RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs)
1782         {
1783                 return RValue<UShort>(Nucleus::createXor(lhs.value, rhs.value));
1784         }
1785
1786         RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs)
1787         {
1788                 return RValue<UShort>(Nucleus::createShl(lhs.value, rhs.value));
1789         }
1790
1791         RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs)
1792         {
1793                 return RValue<UShort>(Nucleus::createLShr(lhs.value, rhs.value));
1794         }
1795
1796         RValue<UShort> operator+=(const UShort &lhs, RValue<UShort> rhs)
1797         {
1798                 return lhs = lhs + rhs;
1799         }
1800
1801         RValue<UShort> operator-=(const UShort &lhs, RValue<UShort> rhs)
1802         {
1803                 return lhs = lhs - rhs;
1804         }
1805
1806         RValue<UShort> operator*=(const UShort &lhs, RValue<UShort> rhs)
1807         {
1808                 return lhs = lhs * rhs;
1809         }
1810
1811         RValue<UShort> operator/=(const UShort &lhs, RValue<UShort> rhs)
1812         {
1813                 return lhs = lhs / rhs;
1814         }
1815
1816         RValue<UShort> operator%=(const UShort &lhs, RValue<UShort> rhs)
1817         {
1818                 return lhs = lhs % rhs;
1819         }
1820
1821         RValue<UShort> operator&=(const UShort &lhs, RValue<UShort> rhs)
1822         {
1823                 return lhs = lhs & rhs;
1824         }
1825
1826         RValue<UShort> operator|=(const UShort &lhs, RValue<UShort> rhs)
1827         {
1828                 return lhs = lhs | rhs;
1829         }
1830
1831         RValue<UShort> operator^=(const UShort &lhs, RValue<UShort> rhs)
1832         {
1833                 return lhs = lhs ^ rhs;
1834         }
1835
1836         RValue<UShort> operator<<=(const UShort &lhs, RValue<UShort> rhs)
1837         {
1838                 return lhs = lhs << rhs;
1839         }
1840
1841         RValue<UShort> operator>>=(const UShort &lhs, RValue<UShort> rhs)
1842         {
1843                 return lhs = lhs >> rhs;
1844         }
1845
1846         RValue<UShort> operator+(RValue<UShort> val)
1847         {
1848                 return val;
1849         }
1850
1851         RValue<UShort> operator-(RValue<UShort> val)
1852         {
1853                 return RValue<UShort>(Nucleus::createNeg(val.value));
1854         }
1855
1856         RValue<UShort> operator~(RValue<UShort> val)
1857         {
1858                 return RValue<UShort>(Nucleus::createNot(val.value));
1859         }
1860
1861         RValue<UShort> operator++(const UShort &val, int)   // Post-increment
1862         {
1863                 RValue<UShort> res = val;
1864
1865                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantShort((unsigned short)1)));
1866                 val.storeValue(inc);
1867
1868                 return res;
1869         }
1870
1871         const UShort &operator++(const UShort &val)   // Pre-increment
1872         {
1873                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantShort((unsigned short)1)));
1874                 val.storeValue(inc);
1875
1876                 return val;
1877         }
1878
1879         RValue<UShort> operator--(const UShort &val, int)   // Post-decrement
1880         {
1881                 RValue<UShort> res = val;
1882
1883                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantShort((unsigned short)1)));
1884                 val.storeValue(inc);
1885
1886                 return res;
1887         }
1888
1889         const UShort &operator--(const UShort &val)   // Pre-decrement
1890         {
1891                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantShort((unsigned short)1)));
1892                 val.storeValue(inc);
1893
1894                 return val;
1895         }
1896
1897         RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs)
1898         {
1899                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
1900         }
1901
1902         RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs)
1903         {
1904                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
1905         }
1906
1907         RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs)
1908         {
1909                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
1910         }
1911
1912         RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs)
1913         {
1914                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
1915         }
1916
1917         RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs)
1918         {
1919                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1920         }
1921
1922         RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs)
1923         {
1924                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1925         }
1926
1927         Type *UShort::getType()
1928         {
1929                 return T(llvm::Type::getInt16Ty(*::context));
1930         }
1931
1932         Byte4::Byte4(RValue<Byte8> cast)
1933         {
1934         //      xyzw.parent = this;
1935
1936                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), Int::getType()));
1937         }
1938
1939         Byte4::Byte4(const Reference<Byte4> &rhs)
1940         {
1941         //      xyzw.parent = this;
1942
1943                 Value *value = rhs.loadValue();
1944                 storeValue(value);
1945         }
1946
1947         Type *Byte4::getType()
1948         {
1949                 #if 0
1950                         return T(VectorType::get(Byte::getType(), 4));
1951                 #else
1952                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
1953                 #endif
1954         }
1955
1956         Type *SByte4::getType()
1957         {
1958                 #if 0
1959                         return T(VectorType::get(SByte::getType(), 4));
1960                 #else
1961                         return Int::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
1962                 #endif
1963         }
1964
        // Default constructor: the body stores no initial value.
        Byte8::Byte8()
        {
        //      xyzw.parent = this;
        }
1969
1970         Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
1971         {
1972         //      xyzw.parent = this;
1973
1974                 int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
1975                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Byte::getType(), 8))));
1976
1977                 storeValue(Nucleus::createBitCast(vector, getType()));
1978         }
1979
1980         Byte8::Byte8(RValue<Byte8> rhs)
1981         {
1982         //      xyzw.parent = this;
1983
1984                 storeValue(rhs.value);
1985         }
1986
1987         Byte8::Byte8(const Byte8 &rhs)
1988         {
1989         //      xyzw.parent = this;
1990
1991                 Value *value = rhs.loadValue();
1992                 storeValue(value);
1993         }
1994
1995         Byte8::Byte8(const Reference<Byte8> &rhs)
1996         {
1997         //      xyzw.parent = this;
1998
1999                 Value *value = rhs.loadValue();
2000                 storeValue(value);
2001         }
2002
2003         RValue<Byte8> Byte8::operator=(RValue<Byte8> rhs) const
2004         {
2005                 storeValue(rhs.value);
2006
2007                 return rhs;
2008         }
2009
2010         RValue<Byte8> Byte8::operator=(const Byte8 &rhs) const
2011         {
2012                 Value *value = rhs.loadValue();
2013                 storeValue(value);
2014
2015                 return RValue<Byte8>(value);
2016         }
2017
2018         RValue<Byte8> Byte8::operator=(const Reference<Byte8> &rhs) const
2019         {
2020                 Value *value = rhs.loadValue();
2021                 storeValue(value);
2022
2023                 return RValue<Byte8>(value);
2024         }
2025
2026         RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs)
2027         {
2028                 if(CPUID::supportsMMX2())
2029                 {
2030                         return x86::paddb(lhs, rhs);
2031                 }
2032                 else
2033                 {
2034                         return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
2035                 }
2036         }
2037
2038         RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs)
2039         {
2040                 if(CPUID::supportsMMX2())
2041                 {
2042                         return x86::psubb(lhs, rhs);
2043                 }
2044                 else
2045                 {
2046                         return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
2047                 }
2048         }
2049
2050 //      RValue<Byte8> operator*(RValue<Byte8> lhs, RValue<Byte8> rhs)
2051 //      {
2052 //              return RValue<Byte8>(Nucleus::createMul(lhs.value, rhs.value));
2053 //      }
2054
2055 //      RValue<Byte8> operator/(RValue<Byte8> lhs, RValue<Byte8> rhs)
2056 //      {
2057 //              return RValue<Byte8>(Nucleus::createUDiv(lhs.value, rhs.value));
2058 //      }
2059
2060 //      RValue<Byte8> operator%(RValue<Byte8> lhs, RValue<Byte8> rhs)
2061 //      {
2062 //              return RValue<Byte8>(Nucleus::createURem(lhs.value, rhs.value));
2063 //      }
2064
2065         RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs)
2066         {
2067                 if(CPUID::supportsMMX2())
2068                 {
2069                         return As<Byte8>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
2070                 }
2071                 else
2072                 {
2073                         return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
2074                 }
2075         }
2076
2077         RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs)
2078         {
2079                 if(CPUID::supportsMMX2())
2080                 {
2081                         return As<Byte8>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
2082                 }
2083                 else
2084                 {
2085                         return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
2086                 }
2087         }
2088
2089         RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs)
2090         {
2091                 if(CPUID::supportsMMX2())
2092                 {
2093                         return As<Byte8>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
2094                 }
2095                 else
2096                 {
2097                         return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
2098                 }
2099         }
2100
2101 //      RValue<Byte8> operator<<(RValue<Byte8> lhs, unsigned char rhs)
2102 //      {
2103 //              return RValue<Byte8>(Nucleus::createShl(lhs.value, rhs.value));
2104 //      }
2105
2106 //      RValue<Byte8> operator>>(RValue<Byte8> lhs, unsigned char rhs)
2107 //      {
2108 //              return RValue<Byte8>(Nucleus::createLShr(lhs.value, rhs.value));
2109 //      }
2110
2111         RValue<Byte8> operator+=(const Byte8 &lhs, RValue<Byte8> rhs)
2112         {
2113                 return lhs = lhs + rhs;
2114         }
2115
2116         RValue<Byte8> operator-=(const Byte8 &lhs, RValue<Byte8> rhs)
2117         {
2118                 return lhs = lhs - rhs;
2119         }
2120
2121 //      RValue<Byte8> operator*=(const Byte8 &lhs, RValue<Byte8> rhs)
2122 //      {
2123 //              return lhs = lhs * rhs;
2124 //      }
2125
2126 //      RValue<Byte8> operator/=(const Byte8 &lhs, RValue<Byte8> rhs)
2127 //      {
2128 //              return lhs = lhs / rhs;
2129 //      }
2130
2131 //      RValue<Byte8> operator%=(const Byte8 &lhs, RValue<Byte8> rhs)
2132 //      {
2133 //              return lhs = lhs % rhs;
2134 //      }
2135
2136         RValue<Byte8> operator&=(const Byte8 &lhs, RValue<Byte8> rhs)
2137         {
2138                 return lhs = lhs & rhs;
2139         }
2140
2141         RValue<Byte8> operator|=(const Byte8 &lhs, RValue<Byte8> rhs)
2142         {
2143                 return lhs = lhs | rhs;
2144         }
2145
2146         RValue<Byte8> operator^=(const Byte8 &lhs, RValue<Byte8> rhs)
2147         {
2148                 return lhs = lhs ^ rhs;
2149         }
2150
2151 //      RValue<Byte8> operator<<=(const Byte8 &lhs, RValue<Byte8> rhs)
2152 //      {
2153 //              return lhs = lhs << rhs;
2154 //      }
2155
2156 //      RValue<Byte8> operator>>=(const Byte8 &lhs, RValue<Byte8> rhs)
2157 //      {
2158 //              return lhs = lhs >> rhs;
2159 //      }
2160
2161 //      RValue<Byte8> operator+(RValue<Byte8> val)
2162 //      {
2163 //              return val;
2164 //      }
2165
2166 //      RValue<Byte8> operator-(RValue<Byte8> val)
2167 //      {
2168 //              return RValue<Byte8>(Nucleus::createNeg(val.value));
2169 //      }
2170
2171         RValue<Byte8> operator~(RValue<Byte8> val)
2172         {
2173                 if(CPUID::supportsMMX2())
2174                 {
2175                         return val ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
2176                 }
2177                 else
2178                 {
2179                         return RValue<Byte8>(Nucleus::createNot(val.value));
2180                 }
2181         }
2182
2183         RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
2184         {
2185                 return x86::paddusb(x, y);
2186         }
2187
2188         RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
2189         {
2190                 return x86::psubusb(x, y);
2191         }
2192
2193         RValue<Short4> Unpack(RValue<Byte4> x)
2194         {
2195                 Value *int2 = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), x.value, 0);
2196                 Value *byte8 = Nucleus::createBitCast(int2, Byte8::getType());
2197
2198                 return UnpackLow(RValue<Byte8>(byte8), RValue<Byte8>(byte8));
2199         }
2200
2201         RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
2202         {
2203                 if(CPUID::supportsMMX2())
2204                 {
2205                         return x86::punpcklbw(x, y);
2206                 }
2207                 else
2208                 {
2209                         int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2210                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2211
2212                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2213                 }
2214         }
2215
2216         RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
2217         {
2218                 if(CPUID::supportsMMX2())
2219                 {
2220                         return x86::punpckhbw(x, y);
2221                 }
2222                 else
2223                 {
2224                         int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
2225                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2226
2227                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2228                 }
2229         }
2230
2231         RValue<Int> SignMask(RValue<Byte8> x)
2232         {
2233                 return x86::pmovmskb(x);
2234         }
2235
2236 //      RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
2237 //      {
2238 //              return x86::pcmpgtb(x, y);   // FIXME: Signedness
2239 //      }
2240
2241         RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
2242         {
2243                 return x86::pcmpeqb(x, y);
2244         }
2245
2246         Type *Byte8::getType()
2247         {
2248                 if(CPUID::supportsMMX2())
2249                 {
2250                         return MMX::getType();
2251                 }
2252                 else
2253                 {
2254                         return T(VectorType::get(Byte::getType(), 8));
2255                 }
2256         }
2257
        // Default constructor: the body stores no initial value.
        SByte8::SByte8()
        {
        //      xyzw.parent = this;
        }
2262
2263         SByte8::SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
2264         {
2265         //      xyzw.parent = this;
2266
2267                 int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
2268                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(SByte::getType(), 8))));
2269
2270                 storeValue(Nucleus::createBitCast(vector, getType()));
2271         }
2272
2273         SByte8::SByte8(RValue<SByte8> rhs)
2274         {
2275         //      xyzw.parent = this;
2276
2277                 storeValue(rhs.value);
2278         }
2279
2280         SByte8::SByte8(const SByte8 &rhs)
2281         {
2282         //      xyzw.parent = this;
2283
2284                 Value *value = rhs.loadValue();
2285                 storeValue(value);
2286         }
2287
2288         SByte8::SByte8(const Reference<SByte8> &rhs)
2289         {
2290         //      xyzw.parent = this;
2291
2292                 Value *value = rhs.loadValue();
2293                 storeValue(value);
2294         }
2295
2296         RValue<SByte8> SByte8::operator=(RValue<SByte8> rhs) const
2297         {
2298                 storeValue(rhs.value);
2299
2300                 return rhs;
2301         }
2302
2303         RValue<SByte8> SByte8::operator=(const SByte8 &rhs) const
2304         {
2305                 Value *value = rhs.loadValue();
2306                 storeValue(value);
2307
2308                 return RValue<SByte8>(value);
2309         }
2310
2311         RValue<SByte8> SByte8::operator=(const Reference<SByte8> &rhs) const
2312         {
2313                 Value *value = rhs.loadValue();
2314                 storeValue(value);
2315
2316                 return RValue<SByte8>(value);
2317         }
2318
2319         RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs)
2320         {
2321                 if(CPUID::supportsMMX2())
2322                 {
2323                         return As<SByte8>(x86::paddb(As<Byte8>(lhs), As<Byte8>(rhs)));
2324                 }
2325                 else
2326                 {
2327                         return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
2328                 }
2329         }
2330
2331         RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs)
2332         {
2333                 if(CPUID::supportsMMX2())
2334                 {
2335                         return As<SByte8>(x86::psubb(As<Byte8>(lhs), As<Byte8>(rhs)));
2336                 }
2337                 else
2338                 {
2339                         return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
2340                 }
2341         }
2342
2343 //      RValue<SByte8> operator*(RValue<SByte8> lhs, RValue<SByte8> rhs)
2344 //      {
2345 //              return RValue<SByte8>(Nucleus::createMul(lhs.value, rhs.value));
2346 //      }
2347
2348 //      RValue<SByte8> operator/(RValue<SByte8> lhs, RValue<SByte8> rhs)
2349 //      {
2350 //              return RValue<SByte8>(Nucleus::createSDiv(lhs.value, rhs.value));
2351 //      }
2352
2353 //      RValue<SByte8> operator%(RValue<SByte8> lhs, RValue<SByte8> rhs)
2354 //      {
2355 //              return RValue<SByte8>(Nucleus::createSRem(lhs.value, rhs.value));
2356 //      }
2357
2358         RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs)
2359         {
2360                 return RValue<SByte8>(Nucleus::createAnd(lhs.value, rhs.value));
2361         }
2362
2363         RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs)
2364         {
2365                 return RValue<SByte8>(Nucleus::createOr(lhs.value, rhs.value));
2366         }
2367
2368         RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs)
2369         {
2370                 return RValue<SByte8>(Nucleus::createXor(lhs.value, rhs.value));
2371         }
2372
2373 //      RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
2374 //      {
2375 //              return RValue<SByte8>(Nucleus::createShl(lhs.value, rhs.value));
2376 //      }
2377
2378 //      RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
2379 //      {
2380 //              return RValue<SByte8>(Nucleus::createAShr(lhs.value, rhs.value));
2381 //      }
2382
2383         RValue<SByte8> operator+=(const SByte8 &lhs, RValue<SByte8> rhs)
2384         {
2385                 return lhs = lhs + rhs;
2386         }
2387
2388         RValue<SByte8> operator-=(const SByte8 &lhs, RValue<SByte8> rhs)
2389         {
2390                 return lhs = lhs - rhs;
2391         }
2392
2393 //      RValue<SByte8> operator*=(const SByte8 &lhs, RValue<SByte8> rhs)
2394 //      {
2395 //              return lhs = lhs * rhs;
2396 //      }
2397
2398 //      RValue<SByte8> operator/=(const SByte8 &lhs, RValue<SByte8> rhs)
2399 //      {
2400 //              return lhs = lhs / rhs;
2401 //      }
2402
2403 //      RValue<SByte8> operator%=(const SByte8 &lhs, RValue<SByte8> rhs)
2404 //      {
2405 //              return lhs = lhs % rhs;
2406 //      }
2407
2408         RValue<SByte8> operator&=(const SByte8 &lhs, RValue<SByte8> rhs)
2409         {
2410                 return lhs = lhs & rhs;
2411         }
2412
2413         RValue<SByte8> operator|=(const SByte8 &lhs, RValue<SByte8> rhs)
2414         {
2415                 return lhs = lhs | rhs;
2416         }
2417
2418         RValue<SByte8> operator^=(const SByte8 &lhs, RValue<SByte8> rhs)
2419         {
2420                 return lhs = lhs ^ rhs;
2421         }
2422
2423 //      RValue<SByte8> operator<<=(const SByte8 &lhs, RValue<SByte8> rhs)
2424 //      {
2425 //              return lhs = lhs << rhs;
2426 //      }
2427
2428 //      RValue<SByte8> operator>>=(const SByte8 &lhs, RValue<SByte8> rhs)
2429 //      {
2430 //              return lhs = lhs >> rhs;
2431 //      }
2432
2433 //      RValue<SByte8> operator+(RValue<SByte8> val)
2434 //      {
2435 //              return val;
2436 //      }
2437
2438 //      RValue<SByte8> operator-(RValue<SByte8> val)
2439 //      {
2440 //              return RValue<SByte8>(Nucleus::createNeg(val.value));
2441 //      }
2442
2443         RValue<SByte8> operator~(RValue<SByte8> val)
2444         {
2445                 if(CPUID::supportsMMX2())
2446                 {
2447                         return val ^ SByte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
2448                 }
2449                 else
2450                 {
2451                         return RValue<SByte8>(Nucleus::createNot(val.value));
2452                 }
2453         }
2454
2455         RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
2456         {
2457                 return x86::paddsb(x, y);
2458         }
2459
2460         RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
2461         {
2462                 return x86::psubsb(x, y);
2463         }
2464
2465         RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
2466         {
2467                 if(CPUID::supportsMMX2())
2468                 {
2469                         return As<Short4>(x86::punpcklbw(As<Byte8>(x), As<Byte8>(y)));
2470                 }
2471                 else
2472                 {
2473                         int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2474                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2475
2476                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2477                 }
2478         }
2479
2480         RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
2481         {
2482                 if(CPUID::supportsMMX2())
2483                 {
2484                         return As<Short4>(x86::punpckhbw(As<Byte8>(x), As<Byte8>(y)));
2485                 }
2486                 else
2487                 {
2488                         int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
2489                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2490
2491                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2492                 }
2493         }
2494
2495         RValue<Int> SignMask(RValue<SByte8> x)
2496         {
2497                 return x86::pmovmskb(As<Byte8>(x));
2498         }
2499
2500         RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
2501         {
2502                 return x86::pcmpgtb(x, y);
2503         }
2504
2505         RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
2506         {
2507                 return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
2508         }
2509
2510         Type *SByte8::getType()
2511         {
2512                 if(CPUID::supportsMMX2())
2513                 {
2514                         return MMX::getType();
2515                 }
2516                 else
2517                 {
2518                         return T(VectorType::get(SByte::getType(), 8));
2519                 }
2520         }
2521
2522         Byte16::Byte16(RValue<Byte16> rhs)
2523         {
2524         //      xyzw.parent = this;
2525
2526                 storeValue(rhs.value);
2527         }
2528
2529         Byte16::Byte16(const Byte16 &rhs)
2530         {
2531         //      xyzw.parent = this;
2532
2533                 Value *value = rhs.loadValue();
2534                 storeValue(value);
2535         }
2536
2537         Byte16::Byte16(const Reference<Byte16> &rhs)
2538         {
2539         //      xyzw.parent = this;
2540
2541                 Value *value = rhs.loadValue();
2542                 storeValue(value);
2543         }
2544
2545         RValue<Byte16> Byte16::operator=(RValue<Byte16> rhs) const
2546         {
2547                 storeValue(rhs.value);
2548
2549                 return rhs;
2550         }
2551
2552         RValue<Byte16> Byte16::operator=(const Byte16 &rhs) const
2553         {
2554                 Value *value = rhs.loadValue();
2555                 storeValue(value);
2556
2557                 return RValue<Byte16>(value);
2558         }
2559
2560         RValue<Byte16> Byte16::operator=(const Reference<Byte16> &rhs) const
2561         {
2562                 Value *value = rhs.loadValue();
2563                 storeValue(value);
2564
2565                 return RValue<Byte16>(value);
2566         }
2567
2568         Type *Byte16::getType()
2569         {
2570                 return T(VectorType::get(Byte::getType(), 16));
2571         }
2572
2573         Type *SByte16::getType()
2574         {
2575                 return T( VectorType::get(SByte::getType(), 16));
2576         }
2577
2578         Short2::Short2(RValue<Short4> cast)
2579         {
2580                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
2581         }
2582
2583         Type *Short2::getType()
2584         {
2585                 #if 0
2586                         return T(VectorType::get(Short::getType(), 2));
2587                 #else
2588                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
2589                 #endif
2590         }
2591
2592         UShort2::UShort2(RValue<UShort4> cast)
2593         {
2594                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
2595         }
2596
2597         Type *UShort2::getType()
2598         {
2599                 #if 0
2600                         return T(VectorType::get(UShort::getType(), 2));
2601                 #else
2602                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
2603                 #endif
2604         }
2605
2606         Short4::Short4(RValue<Int> cast)
2607         {
2608                 Value *extend = Nucleus::createZExt(cast.value, Long::getType());
2609                 Value *swizzle = Swizzle(RValue<Short4>(extend), 0x00).value;
2610
2611                 storeValue(swizzle);
2612         }
2613
2614         Short4::Short4(RValue<Int4> cast)
2615         {
2616                 Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
2617
2618                 #if 0   // FIXME: Check codegen (pshuflw phshufhw pshufd)
2619                         Constant *pack[8];
2620                         pack[0] = Nucleus::createConstantInt(0);
2621                         pack[1] = Nucleus::createConstantInt(2);
2622                         pack[2] = Nucleus::createConstantInt(4);
2623                         pack[3] = Nucleus::createConstantInt(6);
2624
2625                         Value *short4 = Nucleus::createShuffleVector(short8, short8, Nucleus::createConstantVector(pack, 4));
2626                 #else
2627                         Value *packed;
2628
2629                         // FIXME: Use Swizzle<Short8>
2630                         if(!CPUID::supportsSSSE3())
2631                         {
2632                                 int pshuflw[8] = {0, 2, 0, 2, 4, 5, 6, 7};
2633                                 int pshufhw[8] = {0, 1, 2, 3, 4, 6, 4, 6};
2634
2635                                 Value *shuffle1 = Nucleus::createShuffleVector(short8, short8, pshuflw);
2636                                 Value *shuffle2 = Nucleus::createShuffleVector(shuffle1, shuffle1, pshufhw);
2637                                 Value *int4 = Nucleus::createBitCast(shuffle2, Int4::getType());
2638                                 packed = createSwizzle4(int4, 0x88);
2639                         }
2640                         else
2641                         {
2642                                 int pshufb[16] = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
2643                                 Value *byte16 = Nucleus::createBitCast(cast.value, Byte16::getType());
2644                                 packed = Nucleus::createShuffleVector(byte16, byte16, pshufb);
2645                         }
2646
2647                         #if 0   // FIXME: No optimal instruction selection
2648                                 Value *qword2 = Nucleus::createBitCast(packed, T(VectorType::get(Long::getType(), 2)));
2649                                 Value *element = Nucleus::createExtractElement(qword2, 0);
2650                                 Value *short4 = Nucleus::createBitCast(element, Short4::getType());
2651                         #else   // FIXME: Requires SSE
2652                                 Value *int2 = RValue<Int2>(Int2(RValue<Int4>(packed))).value;
2653                                 Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
2654                         #endif
2655                 #endif
2656
2657                 storeValue(short4);
2658         }
2659
2660 //      Short4::Short4(RValue<Float> cast)
2661 //      {
2662 //      }
2663
2664         Short4::Short4(RValue<Float4> cast)
2665         {
2666                 Int4 v4i32 = Int4(cast);
2667                 v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2668
2669                 storeValue(As<Short4>(Int2(v4i32)).value);
2670         }
2671
        // Default constructor: the body stores no initial value.
        Short4::Short4()
        {
        //      xyzw.parent = this;
        }
2676
2677         Short4::Short4(short xyzw)
2678         {
2679                 //      xyzw.parent = this;
2680
2681                 int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
2682                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
2683
2684                 storeValue(Nucleus::createBitCast(vector, getType()));
2685         }
2686
2687         Short4::Short4(short x, short y, short z, short w)
2688         {
2689         //      xyzw.parent = this;
2690
2691                 int64_t constantVector[4] = {x, y, z, w};
2692                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
2693
2694                 storeValue(Nucleus::createBitCast(vector, getType()));
2695         }
2696
2697         Short4::Short4(RValue<Short4> rhs)
2698         {
2699         //      xyzw.parent = this;
2700
2701                 storeValue(rhs.value);
2702         }
2703
2704         Short4::Short4(const Short4 &rhs)
2705         {
2706         //      xyzw.parent = this;
2707
2708                 Value *value = rhs.loadValue();
2709                 storeValue(value);
2710         }
2711
2712         Short4::Short4(const Reference<Short4> &rhs)
2713         {
2714         //      xyzw.parent = this;
2715
2716                 Value *value = rhs.loadValue();
2717                 storeValue(value);
2718         }
2719
2720         Short4::Short4(RValue<UShort4> rhs)
2721         {
2722         //      xyzw.parent = this;
2723
2724                 storeValue(rhs.value);
2725         }
2726
2727         Short4::Short4(const UShort4 &rhs)
2728         {
2729         //      xyzw.parent = this;
2730
2731                 storeValue(rhs.loadValue());
2732         }
2733
2734         Short4::Short4(const Reference<UShort4> &rhs)
2735         {
2736         //      xyzw.parent = this;
2737
2738                 storeValue(rhs.loadValue());
2739         }
2740
2741         RValue<Short4> Short4::operator=(RValue<Short4> rhs) const
2742         {
2743                 storeValue(rhs.value);
2744
2745                 return rhs;
2746         }
2747
2748         RValue<Short4> Short4::operator=(const Short4 &rhs) const
2749         {
2750                 Value *value = rhs.loadValue();
2751                 storeValue(value);
2752
2753                 return RValue<Short4>(value);
2754         }
2755
2756         RValue<Short4> Short4::operator=(const Reference<Short4> &rhs) const
2757         {
2758                 Value *value = rhs.loadValue();
2759                 storeValue(value);
2760
2761                 return RValue<Short4>(value);
2762         }
2763
2764         RValue<Short4> Short4::operator=(RValue<UShort4> rhs) const
2765         {
2766                 storeValue(rhs.value);
2767
2768                 return RValue<Short4>(rhs);
2769         }
2770
2771         RValue<Short4> Short4::operator=(const UShort4 &rhs) const
2772         {
2773                 Value *value = rhs.loadValue();
2774                 storeValue(value);
2775
2776                 return RValue<Short4>(value);
2777         }
2778
2779         RValue<Short4> Short4::operator=(const Reference<UShort4> &rhs) const
2780         {
2781                 Value *value = rhs.loadValue();
2782                 storeValue(value);
2783
2784                 return RValue<Short4>(value);
2785         }
2786
2787         RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs)
2788         {
2789                 if(CPUID::supportsMMX2())
2790                 {
2791                         return x86::paddw(lhs, rhs);
2792                 }
2793                 else
2794                 {
2795                         return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
2796                 }
2797         }
2798
2799         RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs)
2800         {
2801                 if(CPUID::supportsMMX2())
2802                 {
2803                         return x86::psubw(lhs, rhs);
2804                 }
2805                 else
2806                 {
2807                         return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
2808                 }
2809         }
2810
2811         RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs)
2812         {
2813                 if(CPUID::supportsMMX2())
2814                 {
2815                         return x86::pmullw(lhs, rhs);
2816                 }
2817                 else
2818                 {
2819                         return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
2820                 }
2821         }
2822
2823 //      RValue<Short4> operator/(RValue<Short4> lhs, RValue<Short4> rhs)
2824 //      {
2825 //              return RValue<Short4>(Nucleus::createSDiv(lhs.value, rhs.value));
2826 //      }
2827
2828 //      RValue<Short4> operator%(RValue<Short4> lhs, RValue<Short4> rhs)
2829 //      {
2830 //              return RValue<Short4>(Nucleus::createSRem(lhs.value, rhs.value));
2831 //      }
2832
2833         RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs)
2834         {
2835                 if(CPUID::supportsMMX2())
2836                 {
2837                         return x86::pand(lhs, rhs);
2838                 }
2839                 else
2840                 {
2841                         return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
2842                 }
2843         }
2844
2845         RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs)
2846         {
2847                 if(CPUID::supportsMMX2())
2848                 {
2849                         return x86::por(lhs, rhs);
2850                 }
2851                 else
2852                 {
2853                         return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
2854                 }
2855         }
2856
2857         RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs)
2858         {
2859                 if(CPUID::supportsMMX2())
2860                 {
2861                         return x86::pxor(lhs, rhs);
2862                 }
2863                 else
2864                 {
2865                         return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
2866                 }
2867         }
2868
2869         RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2870         {
2871         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
2872
2873                 return x86::psllw(lhs, rhs);
2874         }
2875
2876         RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2877         {
2878         //      return RValue<Short4>(Nucleus::createAShr(lhs.value, rhs.value));
2879
2880                 return x86::psraw(lhs, rhs);
2881         }
2882
2883         RValue<Short4> operator<<(RValue<Short4> lhs, RValue<Long1> rhs)
2884         {
2885         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
2886
2887                 return x86::psllw(lhs, rhs);
2888         }
2889
2890         RValue<Short4> operator>>(RValue<Short4> lhs, RValue<Long1> rhs)
2891         {
2892         //      return RValue<Short4>(Nucleus::createAShr(lhs.value, rhs.value));
2893
2894                 return x86::psraw(lhs, rhs);
2895         }
2896
2897         RValue<Short4> operator+=(const Short4 &lhs, RValue<Short4> rhs)
2898         {
2899                 return lhs = lhs + rhs;
2900         }
2901
2902         RValue<Short4> operator-=(const Short4 &lhs, RValue<Short4> rhs)
2903         {
2904                 return lhs = lhs - rhs;
2905         }
2906
2907         RValue<Short4> operator*=(const Short4 &lhs, RValue<Short4> rhs)
2908         {
2909                 return lhs = lhs * rhs;
2910         }
2911
2912 //      RValue<Short4> operator/=(const Short4 &lhs, RValue<Short4> rhs)
2913 //      {
2914 //              return lhs = lhs / rhs;
2915 //      }
2916
2917 //      RValue<Short4> operator%=(const Short4 &lhs, RValue<Short4> rhs)
2918 //      {
2919 //              return lhs = lhs % rhs;
2920 //      }
2921
2922         RValue<Short4> operator&=(const Short4 &lhs, RValue<Short4> rhs)
2923         {
2924                 return lhs = lhs & rhs;
2925         }
2926
2927         RValue<Short4> operator|=(const Short4 &lhs, RValue<Short4> rhs)
2928         {
2929                 return lhs = lhs | rhs;
2930         }
2931
2932         RValue<Short4> operator^=(const Short4 &lhs, RValue<Short4> rhs)
2933         {
2934                 return lhs = lhs ^ rhs;
2935         }
2936
2937         RValue<Short4> operator<<=(const Short4 &lhs, unsigned char rhs)
2938         {
2939                 return lhs = lhs << rhs;
2940         }
2941
2942         RValue<Short4> operator>>=(const Short4 &lhs, unsigned char rhs)
2943         {
2944                 return lhs = lhs >> rhs;
2945         }
2946
2947         RValue<Short4> operator<<=(const Short4 &lhs, RValue<Long1> rhs)
2948         {
2949                 return lhs = lhs << rhs;
2950         }
2951
2952         RValue<Short4> operator>>=(const Short4 &lhs, RValue<Long1> rhs)
2953         {
2954                 return lhs = lhs >> rhs;
2955         }
2956
2957 //      RValue<Short4> operator+(RValue<Short4> val)
2958 //      {
2959 //              return val;
2960 //      }
2961
2962         RValue<Short4> operator-(RValue<Short4> val)
2963         {
2964                 if(CPUID::supportsMMX2())
2965                 {
2966                         return Short4(0, 0, 0, 0) - val;
2967                 }
2968                 else
2969                 {
2970                         return RValue<Short4>(Nucleus::createNeg(val.value));
2971                 }
2972         }
2973
2974         RValue<Short4> operator~(RValue<Short4> val)
2975         {
2976                 if(CPUID::supportsMMX2())
2977                 {
2978                         return val ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu);
2979                 }
2980                 else
2981                 {
2982                         return RValue<Short4>(Nucleus::createNot(val.value));
2983                 }
2984         }
2985
2986         RValue<Short4> RoundShort4(RValue<Float4> cast)
2987         {
2988                 RValue<Int4> v4i32 = x86::cvtps2dq(cast);
2989                 RValue<Short8> v8i16 = x86::packssdw(v4i32, v4i32);
2990
2991                 return As<Short4>(Int2(As<Int4>(v8i16)));
2992         }
2993
2994         RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2995         {
2996                 return x86::pmaxsw(x, y);
2997         }
2998
2999         RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
3000         {
3001                 return x86::pminsw(x, y);
3002         }
3003
3004         RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
3005         {
3006                 return x86::paddsw(x, y);
3007         }
3008
3009         RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
3010         {
3011                 return x86::psubsw(x, y);
3012         }
3013
3014         RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
3015         {
3016                 return x86::pmulhw(x, y);
3017         }
3018
3019         RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
3020         {
3021                 return x86::pmaddwd(x, y);
3022         }
3023
3024         RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
3025         {
3026                 return x86::packsswb(x, y);
3027         }
3028
3029         RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
3030         {
3031                 if(CPUID::supportsMMX2())
3032                 {
3033                         return x86::punpcklwd(x, y);
3034                 }
3035                 else
3036                 {
3037                         int shuffle[4] = {0, 4, 1, 5};
3038                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
3039
3040                         return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
3041                 }
3042         }
3043
3044         RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
3045         {
3046                 if(CPUID::supportsMMX2())
3047                 {
3048                         return x86::punpckhwd(x, y);
3049                 }
3050                 else
3051                 {
3052                         int shuffle[4] = {2, 6, 3, 7};
3053                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
3054
3055                         return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
3056                 }
3057         }
3058
3059         RValue<Short4> Swizzle(RValue<Short4> x, unsigned char select)
3060         {
3061                 if(CPUID::supportsMMX2())
3062                 {
3063                         return x86::pshufw(x, select);
3064                 }
3065                 else
3066                 {
3067                         return RValue<Short4>(createSwizzle4(x.value, select));
3068                 }
3069         }
3070
3071         RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i)
3072         {
3073                 if(CPUID::supportsMMX2())
3074                 {
3075                         return x86::pinsrw(val, Int(element), i);
3076                 }
3077                 else
3078                 {
3079                         return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
3080                 }
3081         }
3082
3083         RValue<Short> Extract(RValue<Short4> val, int i)
3084         {
3085                 if(CPUID::supportsMMX2())
3086                 {
3087                         return Short(x86::pextrw(val, i));
3088                 }
3089                 else
3090                 {
3091                         return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
3092                 }
3093         }
3094
3095         RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
3096         {
3097                 return x86::pcmpgtw(x, y);
3098         }
3099
3100         RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
3101         {
3102                 return x86::pcmpeqw(x, y);
3103         }
3104
3105         Type *Short4::getType()
3106         {
3107                 if(CPUID::supportsMMX2())
3108                 {
3109                         return MMX::getType();
3110                 }
3111                 else
3112                 {
3113                         return T(VectorType::get(Short::getType(), 4));
3114                 }
3115         }
3116
3117         UShort4::UShort4(RValue<Int4> cast)
3118         {
3119                 *this = Short4(cast);
3120         }
3121
3122         UShort4::UShort4(RValue<Float4> cast, bool saturate)
3123         {
3124                 Float4 sat;
3125
3126                 if(saturate)
3127                 {
3128                         if(CPUID::supportsSSE4_1())
3129                         {
3130                                 sat = Min(cast, Float4(0xFFFF));   // packusdw takes care of 0x0000 saturation
3131                         }
3132                         else
3133                         {
3134                                 sat = Max(Min(cast, Float4(0xFFFF)), Float4(0x0000));
3135                         }
3136                 }
3137                 else
3138                 {
3139                         sat = cast;
3140                 }
3141
3142                 Int4 int4(sat);
3143
3144                 if(!saturate || !CPUID::supportsSSE4_1())
3145                 {
3146                         *this = Short4(Int4(int4));
3147                 }
3148                 else
3149                 {
3150                         *this = As<Short4>(Int2(As<Int4>(x86::packusdw(As<UInt4>(int4), As<UInt4>(int4)))));
3151                 }
3152         }
3153
3154         UShort4::UShort4()
3155         {
3156         //      xyzw.parent = this;
3157         }
3158
3159         UShort4::UShort4(unsigned short xyzw)
3160         {
3161                 //      xyzw.parent = this;
3162
3163                 int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
3164                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
3165
3166                 storeValue(Nucleus::createBitCast(vector, getType()));
3167         }
3168
3169         UShort4::UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
3170         {
3171         //      xyzw.parent = this;
3172
3173                 int64_t constantVector[4] = {x, y, z, w};
3174                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
3175
3176                 storeValue(Nucleus::createBitCast(vector, getType()));
3177         }
3178
3179         UShort4::UShort4(RValue<UShort4> rhs)
3180         {
3181         //      xyzw.parent = this;
3182
3183                 storeValue(rhs.value);
3184         }
3185
3186         UShort4::UShort4(const UShort4 &rhs)
3187         {
3188         //      xyzw.parent = this;
3189
3190                 Value *value = rhs.loadValue();
3191                 storeValue(value);
3192         }
3193
3194         UShort4::UShort4(const Reference<UShort4> &rhs)
3195         {
3196         //      xyzw.parent = this;
3197
3198                 Value *value = rhs.loadValue();
3199                 storeValue(value);
3200         }
3201
3202         UShort4::UShort4(RValue<Short4> rhs)
3203         {
3204         //      xyzw.parent = this;
3205
3206                 storeValue(rhs.value);
3207         }
3208
3209         UShort4::UShort4(const Short4 &rhs)
3210         {
3211         //      xyzw.parent = this;
3212
3213                 Value *value = rhs.loadValue();
3214                 storeValue(value);
3215         }
3216
3217         UShort4::UShort4(const Reference<Short4> &rhs)
3218         {
3219         //      xyzw.parent = this;
3220
3221                 Value *value = rhs.loadValue();
3222                 storeValue(value);
3223         }
3224
3225         RValue<UShort4> UShort4::operator=(RValue<UShort4> rhs) const
3226         {
3227                 storeValue(rhs.value);
3228
3229                 return rhs;
3230         }
3231
3232         RValue<UShort4> UShort4::operator=(const UShort4 &rhs) const
3233         {
3234                 Value *value = rhs.loadValue();
3235                 storeValue(value);
3236
3237                 return RValue<UShort4>(value);
3238         }
3239
3240         RValue<UShort4> UShort4::operator=(const Reference<UShort4> &rhs) const
3241         {
3242                 Value *value = rhs.loadValue();
3243                 storeValue(value);
3244
3245                 return RValue<UShort4>(value);
3246         }
3247
3248         RValue<UShort4> UShort4::operator=(RValue<Short4> rhs) const
3249         {
3250                 storeValue(rhs.value);
3251
3252                 return RValue<UShort4>(rhs);
3253         }
3254
3255         RValue<UShort4> UShort4::operator=(const Short4 &rhs) const
3256         {
3257                 Value *value = rhs.loadValue();
3258                 storeValue(value);
3259
3260                 return RValue<UShort4>(value);
3261         }
3262
3263         RValue<UShort4> UShort4::operator=(const Reference<Short4> &rhs) const
3264         {
3265                 Value *value = rhs.loadValue();
3266                 storeValue(value);
3267
3268                 return RValue<UShort4>(value);
3269         }
3270
3271         RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs)
3272         {
3273                 if(CPUID::supportsMMX2())
3274                 {
3275                         return As<UShort4>(x86::paddw(As<Short4>(lhs), As<Short4>(rhs)));
3276                 }
3277                 else
3278                 {
3279                         return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
3280                 }
3281         }
3282
3283         RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs)
3284         {
3285                 if(CPUID::supportsMMX2())
3286                 {
3287                         return As<UShort4>(x86::psubw(As<Short4>(lhs), As<Short4>(rhs)));
3288                 }
3289                 else
3290                 {
3291                         return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
3292                 }
3293         }
3294
3295         RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs)
3296         {
3297                 if(CPUID::supportsMMX2())
3298                 {
3299                         return As<UShort4>(x86::pmullw(As<Short4>(lhs), As<Short4>(rhs)));
3300                 }
3301                 else
3302                 {
3303                         return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
3304                 }
3305         }
3306
3307         RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs)
3308         {
3309                 if(CPUID::supportsMMX2())
3310                 {
3311                         return As<UShort4>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
3312                 }
3313                 else
3314                 {
3315                         return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
3316                 }
3317         }
3318
3319         RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs)
3320         {
3321                 if(CPUID::supportsMMX2())
3322                 {
3323                         return As<UShort4>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
3324                 }
3325                 else
3326                 {
3327                         return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
3328                 }
3329         }
3330
3331         RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs)
3332         {
3333                 if(CPUID::supportsMMX2())
3334                 {
3335                         return As<UShort4>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
3336                 }
3337                 else
3338                 {
3339                         return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
3340                 }
3341         }
3342
3343         RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
3344         {
3345         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
3346
3347                 return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
3348         }
3349
3350         RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
3351         {
3352         //      return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
3353
3354                 return x86::psrlw(lhs, rhs);
3355         }
3356
3357         RValue<UShort4> operator<<(RValue<UShort4> lhs, RValue<Long1> rhs)
3358         {
3359         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
3360
3361                 return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
3362         }
3363
3364         RValue<UShort4> operator>>(RValue<UShort4> lhs, RValue<Long1> rhs)
3365         {
3366         //      return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
3367
3368                 return x86::psrlw(lhs, rhs);
3369         }
3370
3371         RValue<UShort4> operator<<=(const UShort4 &lhs, unsigned char rhs)
3372         {
3373                 return lhs = lhs << rhs;
3374         }
3375
3376         RValue<UShort4> operator>>=(const UShort4 &lhs, unsigned char rhs)
3377         {
3378                 return lhs = lhs >> rhs;
3379         }
3380
3381         RValue<UShort4> operator<<=(const UShort4 &lhs, RValue<Long1> rhs)
3382         {
3383                 return lhs = lhs << rhs;
3384         }
3385
3386         RValue<UShort4> operator>>=(const UShort4 &lhs, RValue<Long1> rhs)
3387         {
3388                 return lhs = lhs >> rhs;
3389         }
3390
3391         RValue<UShort4> operator~(RValue<UShort4> val)
3392         {
3393                 if(CPUID::supportsMMX2())
3394                 {
3395                         return As<UShort4>(As<Short4>(val) ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu));
3396                 }
3397                 else
3398                 {
3399                         return RValue<UShort4>(Nucleus::createNot(val.value));
3400                 }
3401         }
3402
3403         RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
3404         {
3405                 return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
3406         }
3407
3408         RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
3409         {
3410                 return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
3411         }
3412
3413         RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
3414         {
3415                 return x86::paddusw(x, y);
3416         }
3417
3418         RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
3419         {
3420                 return x86::psubusw(x, y);
3421         }
3422
3423         RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
3424         {
3425                 return x86::pmulhuw(x, y);
3426         }
3427
3428         RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
3429         {
3430                 return x86::pavgw(x, y);
3431         }
3432
3433         RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
3434         {
3435                 return x86::packuswb(x, y);
3436         }
3437
3438         Type *UShort4::getType()
3439         {
3440                 if(CPUID::supportsMMX2())
3441                 {
3442                         return MMX::getType();
3443                 }
3444                 else
3445                 {
3446                         return T(VectorType::get(UShort::getType(), 4));
3447                 }
3448         }
3449
3450         Short8::Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
3451         {
3452                 int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
3453                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3454         }
3455
3456         Short8::Short8(RValue<Short8> rhs)
3457         {
3458                 storeValue(rhs.value);
3459         }
3460
3461         Short8::Short8(const Reference<Short8> &rhs)
3462         {
3463                 Value *value = rhs.loadValue();
3464                 storeValue(value);
3465         }
3466
3467         Short8::Short8(RValue<Short4> lo, RValue<Short4> hi)
3468         {
3469                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
3470                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
3471
3472                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
3473                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
3474                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
3475                 Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
3476
3477                 storeValue(short8);
3478         }
3479
3480         RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs)
3481         {
3482                 return RValue<Short8>(Nucleus::createAdd(lhs.value, rhs.value));
3483         }
3484
3485         RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs)
3486         {
3487                 return RValue<Short8>(Nucleus::createAnd(lhs.value, rhs.value));
3488         }
3489
3490         RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
3491         {
3492                 return x86::psllw(lhs, rhs);   // FIXME: Fallback required
3493         }
3494
3495         RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
3496         {
3497                 return x86::psraw(lhs, rhs);   // FIXME: Fallback required
3498         }
3499
3500         RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
3501         {
3502                 return x86::pmaddwd(x, y);   // FIXME: Fallback required
3503         }
3504
3505         RValue<Int4> Abs(RValue<Int4> x)
3506         {
3507                 if(CPUID::supportsSSSE3())
3508                 {
3509                         return x86::pabsd(x);
3510                 }
3511                 else
3512                 {
3513                         Int4 mask = (x >> 31);
3514                         return (mask ^ x) - mask;
3515                 }
3516         }
3517
3518         RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
3519         {
3520                 return x86::pmulhw(x, y);   // FIXME: Fallback required
3521         }
3522
3523         Type *Short8::getType()
3524         {
3525                 return T(VectorType::get(Short::getType(), 8));
3526         }
3527
3528         UShort8::UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
3529         {
3530                 int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
3531                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3532         }
3533
3534         UShort8::UShort8(RValue<UShort8> rhs)
3535         {
3536                 storeValue(rhs.value);
3537         }
3538
3539         UShort8::UShort8(const Reference<UShort8> &rhs)
3540         {
3541                 Value *value = rhs.loadValue();
3542                 storeValue(value);
3543         }
3544
3545         UShort8::UShort8(RValue<UShort4> lo, RValue<UShort4> hi)
3546         {
3547                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
3548                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
3549
3550                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
3551                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
3552                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
3553                 Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
3554
3555                 storeValue(short8);
3556         }
3557
3558         RValue<UShort8> UShort8::operator=(RValue<UShort8> rhs) const
3559         {
3560                 storeValue(rhs.value);
3561
3562                 return rhs;
3563         }
3564
3565         RValue<UShort8> UShort8::operator=(const UShort8 &rhs) const
3566         {
3567                 Value *value = rhs.loadValue();
3568                 storeValue(value);
3569
3570                 return RValue<UShort8>(value);
3571         }
3572
3573         RValue<UShort8> UShort8::operator=(const Reference<UShort8> &rhs) const
3574         {
3575                 Value *value = rhs.loadValue();
3576                 storeValue(value);
3577
3578                 return RValue<UShort8>(value);
3579         }
3580
3581         RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs)
3582         {
3583                 return RValue<UShort8>(Nucleus::createAnd(lhs.value, rhs.value));
3584         }
3585
3586         RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
3587         {
3588                 return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));   // FIXME: Fallback required
3589         }
3590
3591         RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
3592         {
3593                 return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
3594         }
3595
3596         RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
3597         {
3598                 return RValue<UShort8>(Nucleus::createAdd(lhs.value, rhs.value));
3599         }
3600
3601         RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs)
3602         {
3603                 return RValue<UShort8>(Nucleus::createMul(lhs.value, rhs.value));
3604         }
3605
3606         RValue<UShort8> operator+=(const UShort8 &lhs, RValue<UShort8> rhs)
3607         {
3608                 return lhs = lhs + rhs;
3609         }
3610
3611         RValue<UShort8> operator~(RValue<UShort8> val)
3612         {
3613                 return RValue<UShort8>(Nucleus::createNot(val.value));
3614         }
3615
3616         RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
3617         {
3618                 int pshufb[16] =
3619                 {
3620                         select0 + 0,
3621                         select0 + 1,
3622                         select1 + 0,
3623                         select1 + 1,
3624                         select2 + 0,
3625                         select2 + 1,
3626                         select3 + 0,
3627                         select3 + 1,
3628                         select4 + 0,
3629                         select4 + 1,
3630                         select5 + 0,
3631                         select5 + 1,
3632                         select6 + 0,
3633                         select6 + 1,
3634                         select7 + 0,
3635                         select7 + 1,
3636                 };
3637
3638                 Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
3639                 Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
3640                 Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
3641
3642                 return RValue<UShort8>(short8);
3643         }
3644
3645         RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
3646         {
3647                 return x86::pmulhuw(x, y);   // FIXME: Fallback required
3648         }
3649
3650         Type *UShort8::getType()
3651         {
3652                 return T(VectorType::get(UShort::getType(), 8));
3653         }
3654
3655         Int::Int(Argument<Int> argument)
3656         {
3657                 storeValue(argument.value);
3658         }
3659
3660         Int::Int(RValue<Byte> cast)
3661         {
3662                 Value *integer = Nucleus::createZExt(cast.value, Int::getType());
3663
3664                 storeValue(integer);
3665         }
3666
3667         Int::Int(RValue<SByte> cast)
3668         {
3669                 Value *integer = Nucleus::createSExt(cast.value, Int::getType());
3670
3671                 storeValue(integer);
3672         }
3673
3674         Int::Int(RValue<Short> cast)
3675         {
3676                 Value *integer = Nucleus::createSExt(cast.value, Int::getType());
3677
3678                 storeValue(integer);
3679         }
3680
3681         Int::Int(RValue<UShort> cast)
3682         {
3683                 Value *integer = Nucleus::createZExt(cast.value, Int::getType());
3684
3685                 storeValue(integer);
3686         }
3687
3688         Int::Int(RValue<Int2> cast)
3689         {
3690                 *this = Extract(cast, 0);
3691         }
3692
3693         Int::Int(RValue<Long> cast)
3694         {
3695                 Value *integer = Nucleus::createTrunc(cast.value, Int::getType());
3696
3697                 storeValue(integer);
3698         }
3699
3700         Int::Int(RValue<Float> cast)
3701         {
3702                 Value *integer = Nucleus::createFPToSI(cast.value, Int::getType());
3703
3704                 storeValue(integer);
3705         }
3706
3707         Int::Int()
3708         {
3709         }
3710
3711         Int::Int(int x)
3712         {
3713                 storeValue(Nucleus::createConstantInt(x));
3714         }
3715
3716         Int::Int(RValue<Int> rhs)
3717         {
3718                 storeValue(rhs.value);
3719         }
3720
3721         Int::Int(RValue<UInt> rhs)
3722         {
3723                 storeValue(rhs.value);
3724         }
3725
3726         Int::Int(const Int &rhs)
3727         {
3728                 Value *value = rhs.loadValue();
3729                 storeValue(value);
3730         }
3731
3732         Int::Int(const Reference<Int> &rhs)
3733         {
3734                 Value *value = rhs.loadValue();
3735                 storeValue(value);
3736         }
3737
3738         Int::Int(const UInt &rhs)
3739         {
3740                 Value *value = rhs.loadValue();
3741                 storeValue(value);
3742         }
3743
3744         Int::Int(const Reference<UInt> &rhs)
3745         {
3746                 Value *value = rhs.loadValue();
3747                 storeValue(value);
3748         }
3749
3750         RValue<Int> Int::operator=(int rhs) const
3751         {
3752                 return RValue<Int>(storeValue(Nucleus::createConstantInt(rhs)));
3753         }
3754
3755         RValue<Int> Int::operator=(RValue<Int> rhs) const
3756         {
3757                 storeValue(rhs.value);
3758
3759                 return rhs;
3760         }
3761
3762         RValue<Int> Int::operator=(RValue<UInt> rhs) const
3763         {
3764                 storeValue(rhs.value);
3765
3766                 return RValue<Int>(rhs);
3767         }
3768
3769         RValue<Int> Int::operator=(const Int &rhs) const
3770         {
3771                 Value *value = rhs.loadValue();
3772                 storeValue(value);
3773
3774                 return RValue<Int>(value);
3775         }
3776
3777         RValue<Int> Int::operator=(const Reference<Int> &rhs) const
3778         {
3779                 Value *value = rhs.loadValue();
3780                 storeValue(value);
3781
3782                 return RValue<Int>(value);
3783         }
3784
3785         RValue<Int> Int::operator=(const UInt &rhs) const
3786         {
3787                 Value *value = rhs.loadValue();
3788                 storeValue(value);
3789
3790                 return RValue<Int>(value);
3791         }
3792
3793         RValue<Int> Int::operator=(const Reference<UInt> &rhs) const
3794         {
3795                 Value *value = rhs.loadValue();
3796                 storeValue(value);
3797
3798                 return RValue<Int>(value);
3799         }
3800
3801         RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs)
3802         {
3803                 return RValue<Int>(Nucleus::createAdd(lhs.value, rhs.value));
3804         }
3805
3806         RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs)
3807         {
3808                 return RValue<Int>(Nucleus::createSub(lhs.value, rhs.value));
3809         }
3810
3811         RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs)
3812         {
3813                 return RValue<Int>(Nucleus::createMul(lhs.value, rhs.value));
3814         }
3815
3816         RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs)
3817         {
3818                 return RValue<Int>(Nucleus::createSDiv(lhs.value, rhs.value));
3819         }
3820
3821         RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs)
3822         {
3823                 return RValue<Int>(Nucleus::createSRem(lhs.value, rhs.value));
3824         }
3825
3826         RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs)
3827         {
3828                 return RValue<Int>(Nucleus::createAnd(lhs.value, rhs.value));
3829         }
3830
3831         RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs)
3832         {
3833                 return RValue<Int>(Nucleus::createOr(lhs.value, rhs.value));
3834         }
3835
3836         RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs)
3837         {
3838                 return RValue<Int>(Nucleus::createXor(lhs.value, rhs.value));
3839         }
3840
3841         RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs)
3842         {
3843                 return RValue<Int>(Nucleus::createShl(lhs.value, rhs.value));
3844         }
3845
3846         RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs)
3847         {
3848                 return RValue<Int>(Nucleus::createAShr(lhs.value, rhs.value));
3849         }
3850
3851         RValue<Int> operator+=(const Int &lhs, RValue<Int> rhs)
3852         {
3853                 return lhs = lhs + rhs;
3854         }
3855
3856         RValue<Int> operator-=(const Int &lhs, RValue<Int> rhs)
3857         {
3858                 return lhs = lhs - rhs;
3859         }
3860
3861         RValue<Int> operator*=(const Int &lhs, RValue<Int> rhs)
3862         {
3863                 return lhs = lhs * rhs;
3864         }
3865
3866         RValue<Int> operator/=(const Int &lhs, RValue<Int> rhs)
3867         {
3868                 return lhs = lhs / rhs;
3869         }
3870
3871         RValue<Int> operator%=(const Int &lhs, RValue<Int> rhs)
3872         {
3873                 return lhs = lhs % rhs;
3874         }
3875
3876         RValue<Int> operator&=(const Int &lhs, RValue<Int> rhs)
3877         {
3878                 return lhs = lhs & rhs;
3879         }
3880
3881         RValue<Int> operator|=(const Int &lhs, RValue<Int> rhs)
3882         {
3883                 return lhs = lhs | rhs;
3884         }
3885
3886         RValue<Int> operator^=(const Int &lhs, RValue<Int> rhs)
3887         {
3888                 return lhs = lhs ^ rhs;
3889         }
3890
3891         RValue<Int> operator<<=(const Int &lhs, RValue<Int> rhs)
3892         {
3893                 return lhs = lhs << rhs;
3894         }
3895
3896         RValue<Int> operator>>=(const Int &lhs, RValue<Int> rhs)
3897         {
3898                 return lhs = lhs >> rhs;
3899         }
3900
3901         RValue<Int> operator+(RValue<Int> val)
3902         {
3903                 return val;
3904         }
3905
3906         RValue<Int> operator-(RValue<Int> val)
3907         {
3908                 return RValue<Int>(Nucleus::createNeg(val.value));
3909         }
3910
3911         RValue<Int> operator~(RValue<Int> val)
3912         {
3913                 return RValue<Int>(Nucleus::createNot(val.value));
3914         }
3915
3916         RValue<Int> operator++(const Int &val, int)   // Post-increment
3917         {
3918                 RValue<Int> res = val;
3919
3920                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantInt(1)));
3921                 val.storeValue(inc);
3922
3923                 return res;
3924         }
3925
3926         const Int &operator++(const Int &val)   // Pre-increment
3927         {
3928                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantInt(1)));
3929                 val.storeValue(inc);
3930
3931                 return val;
3932         }
3933
3934         RValue<Int> operator--(const Int &val, int)   // Post-decrement
3935         {
3936                 RValue<Int> res = val;
3937
3938                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantInt(1)));
3939                 val.storeValue(inc);
3940
3941                 return res;
3942         }
3943
3944         const Int &operator--(const Int &val)   // Pre-decrement
3945         {
3946                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantInt(1)));
3947                 val.storeValue(inc);
3948
3949                 return val;
3950         }
3951
3952         RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs)
3953         {
3954                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
3955         }
3956
3957         RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs)
3958         {
3959                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
3960         }
3961
3962         RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs)
3963         {
3964                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
3965         }
3966
3967         RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs)
3968         {
3969                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
3970         }
3971
3972         RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs)
3973         {
3974                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
3975         }
3976
3977         RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs)
3978         {
3979                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
3980         }
3981
3982         RValue<Int> Max(RValue<Int> x, RValue<Int> y)
3983         {
3984                 return IfThenElse(x > y, x, y);
3985         }
3986
3987         RValue<Int> Min(RValue<Int> x, RValue<Int> y)
3988         {
3989                 return IfThenElse(x < y, x, y);
3990         }
3991
3992         RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max)
3993         {
3994                 return Min(Max(x, min), max);
3995         }
3996
3997         RValue<Int> RoundInt(RValue<Float> cast)
3998         {
3999                 return x86::cvtss2si(cast);
4000
4001         //      return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
4002         }
4003
4004         Type *Int::getType()
4005         {
4006                 return T(llvm::Type::getInt32Ty(*::context));
4007         }
4008
4009         Long::Long(RValue<Int> cast)
4010         {
4011                 Value *integer = Nucleus::createSExt(cast.value, Long::getType());
4012
4013                 storeValue(integer);
4014         }
4015
4016         Long::Long(RValue<UInt> cast)
4017         {
4018                 Value *integer = Nucleus::createZExt(cast.value, Long::getType());
4019
4020                 storeValue(integer);
4021         }
4022
4023         Long::Long()
4024         {
4025         }
4026
4027         Long::Long(RValue<Long> rhs)
4028         {
4029                 storeValue(rhs.value);
4030         }
4031
4032         RValue<Long> Long::operator=(int64_t rhs) const
4033         {
4034                 return RValue<Long>(storeValue(Nucleus::createConstantLong(rhs)));
4035         }
4036
4037         RValue<Long> Long::operator=(RValue<Long> rhs) const
4038         {
4039                 storeValue(rhs.value);
4040
4041                 return rhs;
4042         }
4043
4044         RValue<Long> Long::operator=(const Long &rhs) const
4045         {
4046                 Value *value = rhs.loadValue();
4047                 storeValue(value);
4048
4049                 return RValue<Long>(value);
4050         }
4051
4052         RValue<Long> Long::operator=(const Reference<Long> &rhs) const
4053         {
4054                 Value *value = rhs.loadValue();
4055                 storeValue(value);
4056
4057                 return RValue<Long>(value);
4058         }
4059
4060         RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs)
4061         {
4062                 return RValue<Long>(Nucleus::createAdd(lhs.value, rhs.value));
4063         }
4064
4065         RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs)
4066         {
4067                 return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
4068         }
4069
4070         RValue<Long> operator+=(const Long &lhs, RValue<Long> rhs)
4071         {
4072                 return lhs = lhs + rhs;
4073         }
4074
4075         RValue<Long> operator-=(const Long &lhs, RValue<Long> rhs)
4076         {
4077                 return lhs = lhs - rhs;
4078         }
4079
4080         RValue<Long> AddAtomic(RValue<Pointer<Long> > x, RValue<Long> y)
4081         {
4082                 return RValue<Long>(Nucleus::createAtomicAdd(x.value, y.value));
4083         }
4084
4085         Type *Long::getType()
4086         {
4087                 return T(llvm::Type::getInt64Ty(*::context));
4088         }
4089
4090         Long1::Long1(const RValue<UInt> cast)
4091         {
4092                 Value *undefCast = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), cast.value, 0);
4093                 Value *zeroCast = Nucleus::createInsertElement(undefCast, V(Nucleus::createConstantInt(0)), 1);
4094
4095                 storeValue(Nucleus::createBitCast(zeroCast, Long1::getType()));
4096         }
4097
4098         Long1::Long1(RValue<Long1> rhs)
4099         {
4100                 storeValue(rhs.value);
4101         }
4102
4103         Type *Long1::getType()
4104         {
4105                 if(CPUID::supportsMMX2())
4106                 {
4107                         return MMX::getType();
4108                 }
4109                 else
4110                 {
4111                         return T(VectorType::get(Long::getType(), 1));
4112                 }
4113         }
4114
4115         UInt::UInt(Argument<UInt> argument)
4116         {
4117                 storeValue(argument.value);
4118         }
4119
4120         UInt::UInt(RValue<UShort> cast)
4121         {
4122                 Value *integer = Nucleus::createZExt(cast.value, UInt::getType());
4123
4124                 storeValue(integer);
4125         }
4126
4127         UInt::UInt(RValue<Long> cast)
4128         {
4129                 Value *integer = Nucleus::createTrunc(cast.value, UInt::getType());
4130
4131                 storeValue(integer);
4132         }
4133
4134         UInt::UInt(RValue<Float> cast)
4135         {
4136                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
4137                 // Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
4138
4139                 // Smallest positive value representable in UInt, but not in Int
4140                 const unsigned int ustart = 0x80000000u;
4141                 const float ustartf = float(ustart);
4142
4143                 // If the value is negative, store 0, otherwise store the result of the conversion
4144                 storeValue((~(As<Int>(cast) >> 31) &
4145                 // Check if the value can be represented as an Int
4146                         IfThenElse(cast >= ustartf,
4147                 // If the value is too large, subtract ustart and re-add it after conversion.
4148                                 As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
4149                 // Otherwise, just convert normally
4150                                 Int(cast))).value);
4151         }
4152
4153         UInt::UInt()
4154         {
4155         }
4156
4157         UInt::UInt(int x)
4158         {
4159                 storeValue(Nucleus::createConstantInt(x));
4160         }
4161
4162         UInt::UInt(unsigned int x)
4163         {
4164                 storeValue(Nucleus::createConstantInt(x));
4165         }
4166
4167         UInt::UInt(RValue<UInt> rhs)
4168         {
4169                 storeValue(rhs.value);
4170         }
4171
4172         UInt::UInt(RValue<Int> rhs)
4173         {
4174                 storeValue(rhs.value);
4175         }
4176
4177         UInt::UInt(const UInt &rhs)
4178         {
4179                 Value *value = rhs.loadValue();
4180                 storeValue(value);
4181         }
4182
4183         UInt::UInt(const Reference<UInt> &rhs)
4184         {
4185                 Value *value = rhs.loadValue();
4186                 storeValue(value);
4187         }
4188
4189         UInt::UInt(const Int &rhs)
4190         {
4191                 Value *value = rhs.loadValue();
4192                 storeValue(value);
4193         }
4194
4195         UInt::UInt(const Reference<Int> &rhs)
4196         {
4197                 Value *value = rhs.loadValue();
4198                 storeValue(value);
4199         }
4200
4201         RValue<UInt> UInt::operator=(unsigned int rhs) const
4202         {
4203                 return RValue<UInt>(storeValue(Nucleus::createConstantInt(rhs)));
4204         }
4205
4206         RValue<UInt> UInt::operator=(RValue<UInt> rhs) const
4207         {
4208                 storeValue(rhs.value);
4209
4210                 return rhs;
4211         }
4212
4213         RValue<UInt> UInt::operator=(RValue<Int> rhs) const
4214         {
4215                 storeValue(rhs.value);
4216
4217                 return RValue<UInt>(rhs);
4218         }
4219
4220         RValue<UInt> UInt::operator=(const UInt &rhs) const
4221         {
4222                 Value *value = rhs.loadValue();
4223                 storeValue(value);
4224
4225                 return RValue<UInt>(value);
4226         }
4227
4228         RValue<UInt> UInt::operator=(const Reference<UInt> &rhs) const
4229         {
4230                 Value *value = rhs.loadValue();
4231                 storeValue(value);
4232
4233                 return RValue<UInt>(value);
4234         }
4235
4236         RValue<UInt> UInt::operator=(const Int &rhs) const
4237         {
4238                 Value *value = rhs.loadValue();
4239                 storeValue(value);
4240
4241                 return RValue<UInt>(value);
4242         }
4243
4244         RValue<UInt> UInt::operator=(const Reference<Int> &rhs) const
4245         {
4246                 Value *value = rhs.loadValue();
4247                 storeValue(value);
4248
4249                 return RValue<UInt>(value);
4250         }
4251
4252         RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs)
4253         {
4254                 return RValue<UInt>(Nucleus::createAdd(lhs.value, rhs.value));
4255         }
4256
4257         RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs)
4258         {
4259                 return RValue<UInt>(Nucleus::createSub(lhs.value, rhs.value));
4260         }
4261
4262         RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs)
4263         {
4264                 return RValue<UInt>(Nucleus::createMul(lhs.value, rhs.value));
4265         }
4266
4267         RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs)
4268         {
4269                 return RValue<UInt>(Nucleus::createUDiv(lhs.value, rhs.value));
4270         }
4271
4272         RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs)
4273         {
4274                 return RValue<UInt>(Nucleus::createURem(lhs.value, rhs.value));
4275         }
4276
4277         RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs)
4278         {
4279                 return RValue<UInt>(Nucleus::createAnd(lhs.value, rhs.value));
4280         }
4281
4282         RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs)
4283         {
4284                 return RValue<UInt>(Nucleus::createOr(lhs.value, rhs.value));
4285         }
4286
4287         RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs)
4288         {
4289                 return RValue<UInt>(Nucleus::createXor(lhs.value, rhs.value));
4290         }
4291
4292         RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs)
4293         {
4294                 return RValue<UInt>(Nucleus::createShl(lhs.value, rhs.value));
4295         }
4296
4297         RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs)
4298         {
4299                 return RValue<UInt>(Nucleus::createLShr(lhs.value, rhs.value));
4300         }
4301
4302         RValue<UInt> operator+=(const UInt &lhs, RValue<UInt> rhs)
4303         {
4304                 return lhs = lhs + rhs;
4305         }
4306
4307         RValue<UInt> operator-=(const UInt &lhs, RValue<UInt> rhs)
4308         {
4309                 return lhs = lhs - rhs;
4310         }
4311
4312         RValue<UInt> operator*=(const UInt &lhs, RValue<UInt> rhs)
4313         {
4314                 return lhs = lhs * rhs;
4315         }
4316
4317         RValue<UInt> operator/=(const UInt &lhs, RValue<UInt> rhs)
4318         {
4319                 return lhs = lhs / rhs;
4320         }
4321
4322         RValue<UInt> operator%=(const UInt &lhs, RValue<UInt> rhs)
4323         {
4324                 return lhs = lhs % rhs;
4325         }
4326
4327         RValue<UInt> operator&=(const UInt &lhs, RValue<UInt> rhs)
4328         {
4329                 return lhs = lhs & rhs;
4330         }
4331
4332         RValue<UInt> operator|=(const UInt &lhs, RValue<UInt> rhs)
4333         {
4334                 return lhs = lhs | rhs;
4335         }
4336
4337         RValue<UInt> operator^=(const UInt &lhs, RValue<UInt> rhs)
4338         {
4339                 return lhs = lhs ^ rhs;
4340         }
4341
4342         RValue<UInt> operator<<=(const UInt &lhs, RValue<UInt> rhs)
4343         {
4344                 return lhs = lhs << rhs;
4345         }
4346
4347         RValue<UInt> operator>>=(const UInt &lhs, RValue<UInt> rhs)
4348         {
4349                 return lhs = lhs >> rhs;
4350         }
4351
4352         RValue<UInt> operator+(RValue<UInt> val)
4353         {
4354                 return val;
4355         }
4356
4357         RValue<UInt> operator-(RValue<UInt> val)
4358         {
4359                 return RValue<UInt>(Nucleus::createNeg(val.value));
4360         }
4361
4362         RValue<UInt> operator~(RValue<UInt> val)
4363         {
4364                 return RValue<UInt>(Nucleus::createNot(val.value));
4365         }
4366
4367         RValue<UInt> operator++(const UInt &val, int)   // Post-increment
4368         {
4369                 RValue<UInt> res = val;
4370
4371                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantInt(1)));
4372                 val.storeValue(inc);
4373
4374                 return res;
4375         }
4376
4377         const UInt &operator++(const UInt &val)   // Pre-increment
4378         {
4379                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantInt(1)));
4380                 val.storeValue(inc);
4381
4382                 return val;
4383         }
4384
4385         RValue<UInt> operator--(const UInt &val, int)   // Post-decrement
4386         {
4387                 RValue<UInt> res = val;
4388
4389                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantInt(1)));
4390                 val.storeValue(inc);
4391
4392                 return res;
4393         }
4394
4395         const UInt &operator--(const UInt &val)   // Pre-decrement
4396         {
4397                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantInt(1)));
4398                 val.storeValue(inc);
4399
4400                 return val;
4401         }
4402
4403         RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y)
4404         {
4405                 return IfThenElse(x > y, x, y);
4406         }
4407
4408         RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y)
4409         {
4410                 return IfThenElse(x < y, x, y);
4411         }
4412
4413         RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max)
4414         {
4415                 return Min(Max(x, min), max);
4416         }
4417
4418         RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs)
4419         {
4420                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
4421         }
4422
4423         RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs)
4424         {
4425                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
4426         }
4427
4428         RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs)
4429         {
4430                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
4431         }
4432
4433         RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs)
4434         {
4435                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
4436         }
4437
4438         RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs)
4439         {
4440                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
4441         }
4442
4443         RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs)
4444         {
4445                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
4446         }
4447
4448 //      RValue<UInt> RoundUInt(RValue<Float> cast)
4449 //      {
4450 //              return x86::cvtss2si(val);   // FIXME: Unsigned
4451 //
4452 //      //      return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
4453 //      }
4454
4455         Type *UInt::getType()
4456         {
4457                 return T(llvm::Type::getInt32Ty(*::context));
4458         }
4459
4460 //      Int2::Int2(RValue<Int> cast)
4461 //      {
4462 //              Value *extend = Nucleus::createZExt(cast.value, Long::getType());
4463 //              Value *vector = Nucleus::createBitCast(extend, Int2::getType());
4464 //
4465 //              int shuffle[2] = {0, 0};
4466 //              Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
4467 //
4468 //              storeValue(replicate);
4469 //      }
4470
4471         Int2::Int2(RValue<Int4> cast)
4472         {
4473                 Value *long2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
4474                 Value *element = Nucleus::createExtractElement(long2, Long::getType(), 0);
4475                 Value *int2 = Nucleus::createBitCast(element, Int2::getType());
4476
4477                 storeValue(int2);
4478         }
4479
4480         Int2::Int2()
4481         {
4482         //      xy.parent = this;
4483         }
4484
4485         Int2::Int2(int x, int y)
4486         {
4487         //      xy.parent = this;
4488
4489                 int64_t constantVector[2] = {x, y};
4490                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Int::getType(), 2))));
4491
4492                 storeValue(Nucleus::createBitCast(vector, getType()));
4493         }
4494
4495         Int2::Int2(RValue<Int2> rhs)
4496         {
4497         //      xy.parent = this;
4498
4499                 storeValue(rhs.value);
4500         }
4501
4502         Int2::Int2(const Int2 &rhs)
4503         {
4504         //      xy.parent = this;
4505
4506                 Value *value = rhs.loadValue();
4507                 storeValue(value);
4508         }
4509
4510         Int2::Int2(const Reference<Int2> &rhs)
4511         {
4512         //      xy.parent = this;
4513
4514                 Value *value = rhs.loadValue();
4515                 storeValue(value);
4516         }
4517
4518         Int2::Int2(RValue<Int> lo, RValue<Int> hi)
4519         {
4520                 if(CPUID::supportsMMX2())
4521                 {
4522                         // movd mm0, lo
4523                         // movd mm1, hi
4524                         // punpckldq mm0, mm1
4525                         storeValue(As<Int2>(UnpackLow(As<Int2>(Long1(RValue<UInt>(lo))), As<Int2>(Long1(RValue<UInt>(hi))))).value);
4526                 }
4527                 else
4528                 {
4529                         int shuffle[2] = {0, 1};
4530                         Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, T(VectorType::get(Int::getType(), 1))), Nucleus::createBitCast(hi.value, T(VectorType::get(Int::getType(), 1))), shuffle);
4531
4532                         storeValue(Nucleus::createBitCast(packed, Int2::getType()));
4533                 }
4534         }
4535
4536         RValue<Int2> Int2::operator=(RValue<Int2> rhs) const
4537         {
4538                 storeValue(rhs.value);
4539
4540                 return rhs;
4541         }
4542
4543         RValue<Int2> Int2::operator=(const Int2 &rhs) const
4544         {
4545                 Value *value = rhs.loadValue();
4546                 storeValue(value);
4547
4548                 return RValue<Int2>(value);
4549         }
4550
4551         RValue<Int2> Int2::operator=(const Reference<Int2> &rhs) const
4552         {
4553                 Value *value = rhs.loadValue();
4554                 storeValue(value);
4555
4556                 return RValue<Int2>(value);
4557         }
4558
4559         RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs)
4560         {
4561                 if(CPUID::supportsMMX2())
4562                 {
4563                         return x86::paddd(lhs, rhs);
4564                 }
4565                 else
4566                 {
4567                         return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
4568                 }
4569         }
4570
4571         RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs)
4572         {
4573                 if(CPUID::supportsMMX2())
4574                 {
4575                         return x86::psubd(lhs, rhs);
4576                 }
4577                 else
4578                 {
4579                         return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
4580                 }
4581         }
4582
4583 //      RValue<Int2> operator*(RValue<Int2> lhs, RValue<Int2> rhs)
4584 //      {
4585 //              return RValue<Int2>(Nucleus::createMul(lhs.value, rhs.value));
4586 //      }
4587
4588 //      RValue<Int2> operator/(RValue<Int2> lhs, RValue<Int2> rhs)
4589 //      {
4590 //              return RValue<Int2>(Nucleus::createSDiv(lhs.value, rhs.value));
4591 //      }
4592
4593 //      RValue<Int2> operator%(RValue<Int2> lhs, RValue<Int2> rhs)
4594 //      {
4595 //              return RValue<Int2>(Nucleus::createSRem(lhs.value, rhs.value));
4596 //      }
4597
4598         RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs)
4599         {
4600                 if(CPUID::supportsMMX2())
4601                 {
4602                         return As<Int2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
4603                 }
4604                 else
4605                 {
4606                         return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
4607                 }
4608         }
4609
4610         RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs)
4611         {
4612                 if(CPUID::supportsMMX2())
4613                 {
4614                         return As<Int2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
4615                 }
4616                 else
4617                 {
4618                         return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
4619                 }
4620         }
4621
4622         RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs)
4623         {
4624                 if(CPUID::supportsMMX2())
4625                 {
4626                         return As<Int2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
4627                 }
4628                 else
4629                 {
4630                         return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
4631                 }
4632         }
4633
4634         RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
4635         {
4636         //      return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
4637
4638                 return x86::pslld(lhs, rhs);
4639         }
4640
4641         RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
4642         {
4643         //      return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
4644
4645                 return x86::psrad(lhs, rhs);
4646         }
4647
4648         RValue<Int2> operator<<(RValue<Int2> lhs, RValue<Long1> rhs)
4649         {
4650         //      return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
4651
4652                 return x86::pslld(lhs, rhs);
4653         }
4654
4655         RValue<Int2> operator>>(RValue<Int2> lhs, RValue<Long1> rhs)
4656         {
4657         //      return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
4658
4659                 return x86::psrad(lhs, rhs);
4660         }
4661
4662         RValue<Int2> operator+=(const Int2 &lhs, RValue<Int2> rhs)
4663         {
4664                 return lhs = lhs + rhs;
4665         }
4666
4667         RValue<Int2> operator-=(const Int2 &lhs, RValue<Int2> rhs)
4668         {
4669                 return lhs = lhs - rhs;
4670         }
4671
4672 //      RValue<Int2> operator*=(const Int2 &lhs, RValue<Int2> rhs)
4673 //      {
4674 //              return lhs = lhs * rhs;
4675 //      }
4676
4677 //      RValue<Int2> operator/=(const Int2 &lhs, RValue<Int2> rhs)
4678 //      {
4679 //              return lhs = lhs / rhs;
4680 //      }
4681
4682 //      RValue<Int2> operator%=(const Int2 &lhs, RValue<Int2> rhs)
4683 //      {
4684 //              return lhs = lhs % rhs;
4685 //      }
4686
4687         RValue<Int2> operator&=(const Int2 &lhs, RValue<Int2> rhs)
4688         {
4689                 return lhs = lhs & rhs;
4690         }
4691
4692         RValue<Int2> operator|=(const Int2 &lhs, RValue<Int2> rhs)
4693         {
4694                 return lhs = lhs | rhs;
4695         }
4696
4697         RValue<Int2> operator^=(const Int2 &lhs, RValue<Int2> rhs)
4698         {
4699                 return lhs = lhs ^ rhs;
4700         }
4701
4702         RValue<Int2> operator<<=(const Int2 &lhs, unsigned char rhs)
4703         {
4704                 return lhs = lhs << rhs;
4705         }
4706
4707         RValue<Int2> operator>>=(const Int2 &lhs, unsigned char rhs)
4708         {
4709                 return lhs = lhs >> rhs;
4710         }
4711
4712         RValue<Int2> operator<<=(const Int2 &lhs, RValue<Long1> rhs)
4713         {
4714                 return lhs = lhs << rhs;
4715         }
4716
4717         RValue<Int2> operator>>=(const Int2 &lhs, RValue<Long1> rhs)
4718         {
4719                 return lhs = lhs >> rhs;
4720         }
4721
4722 //      RValue<Int2> operator+(RValue<Int2> val)
4723 //      {
4724 //              return val;
4725 //      }
4726
4727 //      RValue<Int2> operator-(RValue<Int2> val)
4728 //      {
4729 //              return RValue<Int2>(Nucleus::createNeg(val.value));
4730 //      }
4731
4732         RValue<Int2> operator~(RValue<Int2> val)
4733         {
4734                 if(CPUID::supportsMMX2())
4735                 {
4736                         return val ^ Int2(0xFFFFFFFF, 0xFFFFFFFF);
4737                 }
4738                 else
4739                 {
4740                         return RValue<Int2>(Nucleus::createNot(val.value));
4741                 }
4742         }
4743
4744         RValue<Long1> UnpackLow(RValue<Int2> x, RValue<Int2> y)
4745         {
4746                 if(CPUID::supportsMMX2())
4747                 {
4748                         return x86::punpckldq(x, y);
4749                 }
4750                 else
4751                 {
4752                         int shuffle[2] = {0, 2};
4753                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
4754
4755                         return RValue<Long1>(Nucleus::createBitCast(packed, Long1::getType()));
4756                 }
4757         }
4758
4759         RValue<Long1> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
4760         {
4761                 if(CPUID::supportsMMX2())
4762                 {
4763                         return x86::punpckhdq(x, y);
4764                 }
4765                 else
4766                 {
4767                         int shuffle[2] = {1, 3};
4768                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
4769
4770                         return RValue<Long1>(Nucleus::createBitCast(packed, Long1::getType()));
4771                 }
4772         }
4773
4774         RValue<Int> Extract(RValue<Int2> val, int i)
4775         {
4776                 if(false)   // FIXME: LLVM does not generate optimal code
4777                 {
4778                         return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
4779                 }
4780                 else
4781                 {
4782                         if(i == 0)
4783                         {
4784                                 return RValue<Int>(Nucleus::createExtractElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), Int::getType(), 0));
4785                         }
4786                         else
4787                         {
4788                                 Int2 val2 = As<Int2>(UnpackHigh(val, val));
4789
4790                                 return Extract(val2, 0);
4791                         }
4792                 }
4793         }
4794
4795         RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i)
4796         {
4797                 return RValue<Int2>(Nucleus::createBitCast(Nucleus::createInsertElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), element.value, i), Int2::getType()));
4798         }
4799
4800         Type *Int2::getType()
4801         {
4802                 if(CPUID::supportsMMX2())
4803                 {
4804                         return MMX::getType();
4805                 }
4806                 else
4807                 {
4808                         return T(VectorType::get(Int::getType(), 2));
4809                 }
4810         }
4811
4812         UInt2::UInt2()
4813         {
4814         //      xy.parent = this;
4815         }
4816
4817         UInt2::UInt2(unsigned int x, unsigned int y)
4818         {
4819         //      xy.parent = this;
4820
4821                 int64_t constantVector[2] = {x, y};
4822                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UInt::getType(), 2))));
4823
4824                 storeValue(Nucleus::createBitCast(vector, getType()));
4825         }
4826
4827         UInt2::UInt2(RValue<UInt2> rhs)
4828         {
4829         //      xy.parent = this;
4830
4831                 storeValue(rhs.value);
4832         }
4833
4834         UInt2::UInt2(const UInt2 &rhs)
4835         {
4836         //      xy.parent = this;
4837
4838                 Value *value = rhs.loadValue();
4839                 storeValue(value);
4840         }
4841
4842         UInt2::UInt2(const Reference<UInt2> &rhs)
4843         {
4844         //      xy.parent = this;
4845
4846                 Value *value = rhs.loadValue();
4847                 storeValue(value);
4848         }
4849
4850         RValue<UInt2> UInt2::operator=(RValue<UInt2> rhs) const
4851         {
4852                 storeValue(rhs.value);
4853
4854                 return rhs;
4855         }
4856
4857         RValue<UInt2> UInt2::operator=(const UInt2 &rhs) const
4858         {
4859                 Value *value = rhs.loadValue();
4860                 storeValue(value);
4861
4862                 return RValue<UInt2>(value);
4863         }
4864
4865         RValue<UInt2> UInt2::operator=(const Reference<UInt2> &rhs) const
4866         {
4867                 Value *value = rhs.loadValue();
4868                 storeValue(value);
4869
4870                 return RValue<UInt2>(value);
4871         }
4872
4873         RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs)
4874         {
4875                 if(CPUID::supportsMMX2())
4876                 {
4877                         return As<UInt2>(x86::paddd(As<Int2>(lhs), As<Int2>(rhs)));
4878                 }
4879                 else
4880                 {
4881                         return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
4882                 }
4883         }
4884
4885         RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs)
4886         {
4887                 if(CPUID::supportsMMX2())
4888                 {
4889                         return As<UInt2>(x86::psubd(As<Int2>(lhs), As<Int2>(rhs)));
4890                 }
4891                 else
4892                 {
4893                         return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
4894                 }
4895         }
4896
4897 //      RValue<UInt2> operator*(RValue<UInt2> lhs, RValue<UInt2> rhs)
4898 //      {
4899 //              return RValue<UInt2>(Nucleus::createMul(lhs.value, rhs.value));
4900 //      }
4901
4902 //      RValue<UInt2> operator/(RValue<UInt2> lhs, RValue<UInt2> rhs)
4903 //      {
4904 //              return RValue<UInt2>(Nucleus::createUDiv(lhs.value, rhs.value));
4905 //      }
4906
4907 //      RValue<UInt2> operator%(RValue<UInt2> lhs, RValue<UInt2> rhs)
4908 //      {
4909 //              return RValue<UInt2>(Nucleus::createURem(lhs.value, rhs.value));
4910 //      }
4911
4912         RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs)
4913         {
4914                 if(CPUID::supportsMMX2())
4915                 {
4916                         return As<UInt2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
4917                 }
4918                 else
4919                 {
4920                         return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
4921                 }
4922         }
4923
4924         RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs)
4925         {
4926                 if(CPUID::supportsMMX2())
4927                 {
4928                         return As<UInt2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
4929                 }
4930                 else
4931                 {
4932                         return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
4933                 }
4934         }
4935
4936         RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs)
4937         {
4938                 if(CPUID::supportsMMX2())
4939                 {
4940                         return As<UInt2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
4941                 }
4942                 else
4943                 {
4944                         return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
4945                 }
4946         }
4947
4948         RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
4949         {
4950         //      return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
4951
4952                 return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
4953         }
4954
4955         RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
4956         {
4957         //      return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
4958
4959                 return x86::psrld(lhs, rhs);
4960         }
4961
4962         RValue<UInt2> operator<<(RValue<UInt2> lhs, RValue<Long1> rhs)
4963         {
4964         //      return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
4965
4966                 return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
4967         }
4968
4969         RValue<UInt2> operator>>(RValue<UInt2> lhs, RValue<Long1> rhs)
4970         {
4971         //      return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
4972
4973                 return x86::psrld(lhs, rhs);
4974         }
4975
4976         RValue<UInt2> operator+=(const UInt2 &lhs, RValue<UInt2> rhs)
4977         {
4978                 return lhs = lhs + rhs;
4979         }
4980
4981         RValue<UInt2> operator-=(const UInt2 &lhs, RValue<UInt2> rhs)
4982         {
4983                 return lhs = lhs - rhs;
4984         }
4985
4986 //      RValue<UInt2> operator*=(const UInt2 &lhs, RValue<UInt2> rhs)
4987 //      {
4988 //              return lhs = lhs * rhs;
4989 //      }
4990
4991 //      RValue<UInt2> operator/=(const UInt2 &lhs, RValue<UInt2> rhs)
4992 //      {
4993 //              return lhs = lhs / rhs;
4994 //      }
4995
4996 //      RValue<UInt2> operator%=(const UInt2 &lhs, RValue<UInt2> rhs)
4997 //      {
4998 //              return lhs = lhs % rhs;
4999 //      }
5000
5001         RValue<UInt2> operator&=(const UInt2 &lhs, RValue<UInt2> rhs)
5002         {
5003                 return lhs = lhs & rhs;
5004         }
5005
5006         RValue<UInt2> operator|=(const UInt2 &lhs, RValue<UInt2> rhs)
5007         {
5008                 return lhs = lhs | rhs;
5009         }
5010
5011         RValue<UInt2> operator^=(const UInt2 &lhs, RValue<UInt2> rhs)
5012         {
5013                 return lhs = lhs ^ rhs;
5014         }
5015
5016         RValue<UInt2> operator<<=(const UInt2 &lhs, unsigned char rhs)
5017         {
5018                 return lhs = lhs << rhs;
5019         }
5020
5021         RValue<UInt2> operator>>=(const UInt2 &lhs, unsigned char rhs)
5022         {
5023                 return lhs = lhs >> rhs;
5024         }
5025
5026         RValue<UInt2> operator<<=(const UInt2 &lhs, RValue<Long1> rhs)
5027         {
5028                 return lhs = lhs << rhs;
5029         }
5030
5031         RValue<UInt2> operator>>=(const UInt2 &lhs, RValue<Long1> rhs)
5032         {
5033                 return lhs = lhs >> rhs;
5034         }
5035
5036 //      RValue<UInt2> operator+(RValue<UInt2> val)
5037 //      {
5038 //              return val;
5039 //      }
5040
5041 //      RValue<UInt2> operator-(RValue<UInt2> val)
5042 //      {
5043 //              return RValue<UInt2>(Nucleus::createNeg(val.value));
5044 //      }
5045
5046         RValue<UInt2> operator~(RValue<UInt2> val)
5047         {
5048                 if(CPUID::supportsMMX2())
5049                 {
5050                         return val ^ UInt2(0xFFFFFFFF, 0xFFFFFFFF);
5051                 }
5052                 else
5053                 {
5054                         return RValue<UInt2>(Nucleus::createNot(val.value));
5055                 }
5056         }
5057
5058         Type *UInt2::getType()
5059         {
5060                 if(CPUID::supportsMMX2())
5061                 {
5062                         return MMX::getType();
5063                 }
5064                 else
5065                 {
5066                         return T(VectorType::get(UInt::getType(), 2));
5067                 }
5068         }
5069
5070         Int4::Int4(RValue<Byte4> cast)
5071         {
5072                 Value *x = Nucleus::createBitCast(cast.value, Int::getType());
5073                 Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
5074
5075                 Value *e;
5076
5077                 if (CPUID::supportsSSE4_1())
5078                 {
5079                         e = x86::pmovzxbd(RValue<Int4>(a)).value;
5080                 }
5081                 else
5082                 {
5083                         int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
5084                         Value *b = Nucleus::createBitCast(a, Byte16::getType());
5085                         Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
5086
5087                         int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5088                         Value *d = Nucleus::createBitCast(c, Short8::getType());
5089                         e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
5090                 }
5091
5092                 Value *f = Nucleus::createBitCast(e, Int4::getType());
5093                 storeValue(f);
5094         }
5095
5096         Int4::Int4(RValue<SByte4> cast)
5097         {
5098                 Value *x = Nucleus::createBitCast(cast.value, Int::getType());
5099                 Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
5100
5101                 Value *g;
5102
5103                 if (CPUID::supportsSSE4_1())
5104                 {
5105                         g = x86::pmovsxbd(RValue<Int4>(a)).value;
5106                 }
5107                 else
5108                 {
5109                         int     swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
5110                         Value *b = Nucleus::createBitCast(a, Byte16::getType());
5111                         Value *c = Nucleus::createShuffleVector(b, b, swizzle);
5112
5113                         int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
5114                         Value *d = Nucleus::createBitCast(c, Short8::getType());
5115                         Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
5116
5117                         Value *f = Nucleus::createBitCast(e, Int4::getType());
5118                         //      g = Nucleus::createAShr(f, Nucleus::createConstantInt(24));
5119                         g = x86::psrad(RValue<Int4>(f), 24).value;
5120                 }
5121
5122                 storeValue(g);
5123         }
5124
5125         Int4::Int4(RValue<Float4> cast)
5126         {
5127         //      xyzw.parent = this;
5128
5129                 Value *xyzw = Nucleus::createFPToSI(cast.value, Int4::getType());
5130
5131                 storeValue(xyzw);
5132         }
5133
5134         Int4::Int4(RValue<Short4> cast)
5135         {
5136                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5137                 Value *element = Nucleus::createBitCast(cast.value, Long::getType());
5138                 long2 = Nucleus::createInsertElement(long2, element, 0);
5139                 RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
5140
5141                 if(CPUID::supportsSSE4_1())
5142                 {
5143                         storeValue(x86::pmovsxwd(vector).value);
5144                 }
5145                 else
5146                 {
5147                         Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
5148
5149                         int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
5150                         Value *c = Nucleus::createShuffleVector(b, b, swizzle);
5151                         Value *d = Nucleus::createBitCast(c, Int4::getType());
5152                         storeValue(d);
5153
5154                         // Each Short is packed into each Int in the (Short | Short) format.
5155                         // Shifting by 16 will retrieve the original Short value.
5156                         // Shitfing an Int will propagate the sign bit, which will work
5157                         // for both positive and negative values of a Short.
5158                         *this >>= 16;
5159                 }
5160         }
5161
5162         Int4::Int4(RValue<UShort4> cast)
5163         {
5164                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5165                 Value *element = Nucleus::createBitCast(cast.value, Long::getType());
5166                 long2 = Nucleus::createInsertElement(long2, element, 0);
5167                 RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
5168
5169                 if(CPUID::supportsSSE4_1())
5170                 {
5171                         storeValue(x86::pmovzxwd(RValue<Int4>(vector)).value);
5172                 }
5173                 else
5174                 {
5175                         Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
5176
5177                         int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5178                         Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Short8::getType())), swizzle);
5179                         Value *d = Nucleus::createBitCast(c, Int4::getType());
5180                         storeValue(d);
5181                 }
5182         }
5183
5184         Int4::Int4()
5185         {
5186         //      xyzw.parent = this;
5187         }
5188
5189         Int4::Int4(int xyzw)
5190         {
5191                 constant(xyzw, xyzw, xyzw, xyzw);
5192         }
5193
5194         Int4::Int4(int x, int yzw)
5195         {
5196                 constant(x, yzw, yzw, yzw);
5197         }
5198
5199         Int4::Int4(int x, int y, int zw)
5200         {
5201                 constant(x, y, zw, zw);
5202         }
5203
5204         Int4::Int4(int x, int y, int z, int w)
5205         {
5206                 constant(x, y, z, w);
5207         }
5208
5209         void Int4::constant(int x, int y, int z, int w)
5210         {
5211         //      xyzw.parent = this;
5212
5213                 int64_t constantVector[4] = {x, y, z, w};
5214                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
5215         }
5216
5217         Int4::Int4(RValue<Int4> rhs)
5218         {
5219         //      xyzw.parent = this;
5220
5221                 storeValue(rhs.value);
5222         }
5223
5224         Int4::Int4(const Int4 &rhs)
5225         {
5226         //      xyzw.parent = this;
5227
5228                 Value *value = rhs.loadValue();
5229                 storeValue(value);
5230         }
5231
5232         Int4::Int4(const Reference<Int4> &rhs)
5233         {
5234         //      xyzw.parent = this;
5235
5236                 Value *value = rhs.loadValue();
5237                 storeValue(value);
5238         }
5239
5240         Int4::Int4(RValue<UInt4> rhs)
5241         {
5242         //      xyzw.parent = this;
5243
5244                 storeValue(rhs.value);
5245         }
5246
5247         Int4::Int4(const UInt4 &rhs)
5248         {
5249         //      xyzw.parent = this;
5250
5251                 Value *value = rhs.loadValue();
5252                 storeValue(value);
5253         }
5254
5255         Int4::Int4(const Reference<UInt4> &rhs)
5256         {
5257         //      xyzw.parent = this;
5258
5259                 Value *value = rhs.loadValue();
5260                 storeValue(value);
5261         }
5262
5263         Int4::Int4(RValue<Int2> lo, RValue<Int2> hi)
5264         {
5265         //      xyzw.parent = this;
5266
5267                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
5268                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
5269
5270                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5271                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
5272                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
5273                 Value *int4 = Nucleus::createBitCast(long2, Int4::getType());
5274
5275                 storeValue(int4);
5276         }
5277
5278         Int4::Int4(RValue<Int> rhs)
5279         {
5280         //      xyzw.parent = this;
5281
5282                 Value *vector = loadValue();
5283                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
5284
5285                 int swizzle[4] = {0, 0, 0, 0};
5286                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
5287
5288                 storeValue(replicate);
5289         }
5290
5291         Int4::Int4(const Int &rhs)
5292         {
5293         //      xyzw.parent = this;
5294
5295                 *this = RValue<Int>(rhs.loadValue());
5296         }
5297
5298         Int4::Int4(const Reference<Int> &rhs)
5299         {
5300         //      xyzw.parent = this;
5301
5302                 *this = RValue<Int>(rhs.loadValue());
5303         }
5304
5305         RValue<Int4> Int4::operator=(RValue<Int4> rhs) const
5306         {
5307                 storeValue(rhs.value);
5308
5309                 return rhs;
5310         }
5311
5312         RValue<Int4> Int4::operator=(const Int4 &rhs) const
5313         {
5314                 Value *value = rhs.loadValue();
5315                 storeValue(value);
5316
5317                 return RValue<Int4>(value);
5318         }
5319
5320         RValue<Int4> Int4::operator=(const Reference<Int4> &rhs) const
5321         {
5322                 Value *value = rhs.loadValue();
5323                 storeValue(value);
5324
5325                 return RValue<Int4>(value);
5326         }
5327
5328         RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs)
5329         {
5330                 return RValue<Int4>(Nucleus::createAdd(lhs.value, rhs.value));
5331         }
5332
5333         RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs)
5334         {
5335                 return RValue<Int4>(Nucleus::createSub(lhs.value, rhs.value));
5336         }
5337
5338         RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs)
5339         {
5340                 return RValue<Int4>(Nucleus::createMul(lhs.value, rhs.value));
5341         }
5342
5343         RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs)
5344         {
5345                 return RValue<Int4>(Nucleus::createSDiv(lhs.value, rhs.value));
5346         }
5347
5348         RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs)
5349         {
5350                 return RValue<Int4>(Nucleus::createSRem(lhs.value, rhs.value));
5351         }
5352
5353         RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs)
5354         {
5355                 return RValue<Int4>(Nucleus::createAnd(lhs.value, rhs.value));
5356         }
5357
5358         RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs)
5359         {
5360                 return RValue<Int4>(Nucleus::createOr(lhs.value, rhs.value));
5361         }
5362
5363         RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs)
5364         {
5365                 return RValue<Int4>(Nucleus::createXor(lhs.value, rhs.value));
5366         }
5367
5368         RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
5369         {
5370                 return x86::pslld(lhs, rhs);
5371         }
5372
5373         RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
5374         {
5375                 return x86::psrad(lhs, rhs);
5376         }
5377
5378         RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
5379         {
5380                 return RValue<Int4>(Nucleus::createShl(lhs.value, rhs.value));
5381         }
5382
5383         RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs)
5384         {
5385                 return RValue<Int4>(Nucleus::createAShr(lhs.value, rhs.value));
5386         }
5387
5388         RValue<Int4> operator+=(const Int4 &lhs, RValue<Int4> rhs)
5389         {
5390                 return lhs = lhs + rhs;
5391         }
5392
5393         RValue<Int4> operator-=(const Int4 &lhs, RValue<Int4> rhs)
5394         {
5395                 return lhs = lhs - rhs;
5396         }
5397
5398         RValue<Int4> operator*=(const Int4 &lhs, RValue<Int4> rhs)
5399         {
5400                 return lhs = lhs * rhs;
5401         }
5402
5403 //      RValue<Int4> operator/=(const Int4 &lhs, RValue<Int4> rhs)
5404 //      {
5405 //              return lhs = lhs / rhs;
5406 //      }
5407
5408 //      RValue<Int4> operator%=(const Int4 &lhs, RValue<Int4> rhs)
5409 //      {
5410 //              return lhs = lhs % rhs;
5411 //      }
5412
5413         RValue<Int4> operator&=(const Int4 &lhs, RValue<Int4> rhs)
5414         {
5415                 return lhs = lhs & rhs;
5416         }
5417
5418         RValue<Int4> operator|=(const Int4 &lhs, RValue<Int4> rhs)
5419         {
5420                 return lhs = lhs | rhs;
5421         }
5422
5423         RValue<Int4> operator^=(const Int4 &lhs, RValue<Int4> rhs)
5424         {
5425                 return lhs = lhs ^ rhs;
5426         }
5427
5428         RValue<Int4> operator<<=(const Int4 &lhs, unsigned char rhs)
5429         {
5430                 return lhs = lhs << rhs;
5431         }
5432
5433         RValue<Int4> operator>>=(const Int4 &lhs, unsigned char rhs)
5434         {
5435                 return lhs = lhs >> rhs;
5436         }
5437
5438         RValue<Int4> operator+(RValue<Int4> val)
5439         {
5440                 return val;
5441         }
5442
5443         RValue<Int4> operator-(RValue<Int4> val)
5444         {
5445                 return RValue<Int4>(Nucleus::createNeg(val.value));
5446         }
5447
5448         RValue<Int4> operator~(RValue<Int4> val)
5449         {
5450                 return RValue<Int4>(Nucleus::createNot(val.value));
5451         }
5452
5453         RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
5454         {
5455                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5456                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5457                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
5458                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5459         }
5460
5461         RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
5462         {
5463                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
5464         }
5465
5466         RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
5467         {
5468                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5469                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5470                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
5471                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5472         }
5473
5474         RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
5475         {
5476                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
5477         }
5478
5479         RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
5480         {
5481                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5482                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5483                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
5484                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5485         }
5486
5487         RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
5488         {
5489                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
5490         }
5491
5492         RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
5493         {
5494                 if(CPUID::supportsSSE4_1())
5495                 {
5496                         return x86::pmaxsd(x, y);
5497                 }
5498                 else
5499                 {
5500                         RValue<Int4> greater = CmpNLE(x, y);
5501                         return x & greater | y & ~greater;
5502                 }
5503         }
5504
5505         RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
5506         {
5507                 if(CPUID::supportsSSE4_1())
5508                 {
5509                         return x86::pminsd(x, y);
5510                 }
5511                 else
5512                 {
5513                         RValue<Int4> less = CmpLT(x, y);
5514                         return x & less | y & ~less;
5515                 }
5516         }
5517
5518         RValue<Int4> RoundInt(RValue<Float4> cast)
5519         {
5520                 return x86::cvtps2dq(cast);
5521         }
5522
5523         RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
5524         {
5525                 return x86::packssdw(x, y);
5526         }
5527
5528         RValue<Int> Extract(RValue<Int4> x, int i)
5529         {
5530                 return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
5531         }
5532
5533         RValue<Int4> Insert(RValue<Int4> x, RValue<Int> element, int i)
5534         {
5535                 return RValue<Int4>(Nucleus::createInsertElement(x.value, element.value, i));
5536         }
5537
5538         RValue<Int> SignMask(RValue<Int4> x)
5539         {
5540                 return x86::movmskps(As<Float4>(x));
5541         }
5542
5543         RValue<Int4> Swizzle(RValue<Int4> x, unsigned char select)
5544         {
5545                 return RValue<Int4>(createSwizzle4(x.value, select));
5546         }
5547
5548         Type *Int4::getType()
5549         {
5550                 return T(VectorType::get(Int::getType(), 4));
5551         }
5552
5553         UInt4::UInt4(RValue<Float4> cast)
5554         {
5555         //      xyzw.parent = this;
5556
5557                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
5558                 // Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
5559
5560                 // Smallest positive value representable in UInt, but not in Int
5561                 const unsigned int ustart = 0x80000000u;
5562                 const float ustartf = float(ustart);
5563
5564                 // Check if the value can be represented as an Int
5565                 Int4 uiValue = CmpNLT(cast, Float4(ustartf));
5566                 // If the value is too large, subtract ustart and re-add it after conversion.
5567                 uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
5568                 // Otherwise, just convert normally
5569                           (~uiValue & Int4(cast));
5570                 // If the value is negative, store 0, otherwise store the result of the conversion
5571                 storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
5572         }
5573
5574         UInt4::UInt4()
5575         {
5576         //      xyzw.parent = this;
5577         }
5578
5579         UInt4::UInt4(int xyzw)
5580         {
5581                 constant(xyzw, xyzw, xyzw, xyzw);
5582         }
5583
5584         UInt4::UInt4(int x, int yzw)
5585         {
5586                 constant(x, yzw, yzw, yzw);
5587         }
5588
5589         UInt4::UInt4(int x, int y, int zw)
5590         {
5591                 constant(x, y, zw, zw);
5592         }
5593
5594         UInt4::UInt4(int x, int y, int z, int w)
5595         {
5596                 constant(x, y, z, w);
5597         }
5598
5599         void UInt4::constant(int x, int y, int z, int w)
5600         {
5601         //      xyzw.parent = this;
5602
5603                 int64_t constantVector[4] = {x, y, z, w};
5604                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
5605         }
5606
5607         UInt4::UInt4(RValue<UInt4> rhs)
5608         {
5609         //      xyzw.parent = this;
5610
5611                 storeValue(rhs.value);
5612         }
5613
5614         UInt4::UInt4(const UInt4 &rhs)
5615         {
5616         //      xyzw.parent = this;
5617
5618                 Value *value = rhs.loadValue();
5619                 storeValue(value);
5620         }
5621
5622         UInt4::UInt4(const Reference<UInt4> &rhs)
5623         {
5624         //      xyzw.parent = this;
5625
5626                 Value *value = rhs.loadValue();
5627                 storeValue(value);
5628         }
5629
5630         UInt4::UInt4(RValue<Int4> rhs)
5631         {
5632         //      xyzw.parent = this;
5633
5634                 storeValue(rhs.value);
5635         }
5636
5637         UInt4::UInt4(const Int4 &rhs)
5638         {
5639         //      xyzw.parent = this;
5640
5641                 Value *value = rhs.loadValue();
5642                 storeValue(value);
5643         }
5644
5645         UInt4::UInt4(const Reference<Int4> &rhs)
5646         {
5647         //      xyzw.parent = this;
5648
5649                 Value *value = rhs.loadValue();
5650                 storeValue(value);
5651         }
5652
5653         UInt4::UInt4(RValue<UInt2> lo, RValue<UInt2> hi)
5654         {
5655                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
5656                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
5657
5658                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5659                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
5660                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
5661                 Value *uint4 = Nucleus::createBitCast(long2, Int4::getType());
5662
5663                 storeValue(uint4);
5664         }
5665
5666         RValue<UInt4> UInt4::operator=(RValue<UInt4> rhs) const
5667         {
5668                 storeValue(rhs.value);
5669
5670                 return rhs;
5671         }
5672
5673         RValue<UInt4> UInt4::operator=(const UInt4 &rhs) const
5674         {
5675                 Value *value = rhs.loadValue();
5676                 storeValue(value);
5677
5678                 return RValue<UInt4>(value);
5679         }
5680
5681         RValue<UInt4> UInt4::operator=(const Reference<UInt4> &rhs) const
5682         {
5683                 Value *value = rhs.loadValue();
5684                 storeValue(value);
5685
5686                 return RValue<UInt4>(value);
5687         }
5688
5689         RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs)
5690         {
5691                 return RValue<UInt4>(Nucleus::createAdd(lhs.value, rhs.value));
5692         }
5693
5694         RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs)
5695         {
5696                 return RValue<UInt4>(Nucleus::createSub(lhs.value, rhs.value));
5697         }
5698
5699         RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs)
5700         {
5701                 return RValue<UInt4>(Nucleus::createMul(lhs.value, rhs.value));
5702         }
5703
5704         RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs)
5705         {
5706                 return RValue<UInt4>(Nucleus::createUDiv(lhs.value, rhs.value));
5707         }
5708
5709         RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs)
5710         {
5711                 return RValue<UInt4>(Nucleus::createURem(lhs.value, rhs.value));
5712         }
5713
5714         RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs)
5715         {
5716                 return RValue<UInt4>(Nucleus::createAnd(lhs.value, rhs.value));
5717         }
5718
5719         RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs)
5720         {
5721                 return RValue<UInt4>(Nucleus::createOr(lhs.value, rhs.value));
5722         }
5723
5724         RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs)
5725         {
5726                 return RValue<UInt4>(Nucleus::createXor(lhs.value, rhs.value));
5727         }
5728
5729         RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
5730         {
5731                 return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
5732         }
5733
5734         RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
5735         {
5736                 return x86::psrld(lhs, rhs);
5737         }
5738
5739         RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
5740         {
5741                 return RValue<UInt4>(Nucleus::createShl(lhs.value, rhs.value));
5742         }
5743
5744         RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs)
5745         {
5746                 return RValue<UInt4>(Nucleus::createLShr(lhs.value, rhs.value));
5747         }
5748
5749         RValue<UInt4> operator+=(const UInt4 &lhs, RValue<UInt4> rhs)
5750         {
5751                 return lhs = lhs + rhs;
5752         }
5753
5754         RValue<UInt4> operator-=(const UInt4 &lhs, RValue<UInt4> rhs)
5755         {
5756                 return lhs = lhs - rhs;
5757         }
5758
5759         RValue<UInt4> operator*=(const UInt4 &lhs, RValue<UInt4> rhs)
5760         {
5761                 return lhs = lhs * rhs;
5762         }
5763
5764 //      RValue<UInt4> operator/=(const UInt4 &lhs, RValue<UInt4> rhs)
5765 //      {
5766 //              return lhs = lhs / rhs;
5767 //      }
5768
5769 //      RValue<UInt4> operator%=(const UInt4 &lhs, RValue<UInt4> rhs)
5770 //      {
5771 //              return lhs = lhs % rhs;
5772 //      }
5773
5774         RValue<UInt4> operator&=(const UInt4 &lhs, RValue<UInt4> rhs)
5775         {
5776                 return lhs = lhs & rhs;
5777         }
5778
5779         RValue<UInt4> operator|=(const UInt4 &lhs, RValue<UInt4> rhs)
5780         {
5781                 return lhs = lhs | rhs;
5782         }
5783
5784         RValue<UInt4> operator^=(const UInt4 &lhs, RValue<UInt4> rhs)
5785         {
5786                 return lhs = lhs ^ rhs;
5787         }
5788
5789         RValue<UInt4> operator<<=(const UInt4 &lhs, unsigned char rhs)
5790         {
5791                 return lhs = lhs << rhs;
5792         }
5793
5794         RValue<UInt4> operator>>=(const UInt4 &lhs, unsigned char rhs)
5795         {
5796                 return lhs = lhs >> rhs;
5797         }
5798
5799         RValue<UInt4> operator+(RValue<UInt4> val)
5800         {
5801                 return val;
5802         }
5803
5804         RValue<UInt4> operator-(RValue<UInt4> val)
5805         {
5806                 return RValue<UInt4>(Nucleus::createNeg(val.value));
5807         }
5808
5809         RValue<UInt4> operator~(RValue<UInt4> val)
5810         {
5811                 return RValue<UInt4>(Nucleus::createNot(val.value));
5812         }
5813
5814         RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
5815         {
5816                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5817                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5818                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
5819                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5820         }
5821
5822         RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
5823         {
5824                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
5825         }
5826
5827         RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
5828         {
5829                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5830                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5831                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
5832                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5833         }
5834
5835         RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
5836         {
5837                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
5838         }
5839
5840         RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
5841         {
5842                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5843                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5844                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
5845                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5846         }
5847
5848         RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
5849         {
5850                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
5851         }
5852
5853         RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
5854         {
5855                 if(CPUID::supportsSSE4_1())
5856                 {
5857                         return x86::pmaxud(x, y);
5858                 }
5859                 else
5860                 {
5861                         RValue<UInt4> greater = CmpNLE(x, y);
5862                         return x & greater | y & ~greater;
5863                 }
5864         }
5865
5866         RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
5867         {
5868                 if(CPUID::supportsSSE4_1())
5869                 {
5870                         return x86::pminud(x, y);
5871                 }
5872                 else
5873                 {
5874                         RValue<UInt4> less = CmpLT(x, y);
5875                         return x & less | y & ~less;
5876                 }
5877         }
5878
5879         RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
5880         {
5881                 return x86::packusdw(x, y);   // FIXME: Fallback required
5882         }
5883
5884         Type *UInt4::getType()
5885         {
5886                 return T(VectorType::get(UInt::getType(), 4));
5887         }
5888
5889         Float::Float(RValue<Int> cast)
5890         {
5891                 Value *integer = Nucleus::createSIToFP(cast.value, Float::getType());
5892
5893                 storeValue(integer);
5894         }
5895
5896         Float::Float()
5897         {
5898
5899         }
5900
5901         Float::Float(float x)
5902         {
5903                 storeValue(Nucleus::createConstantFloat(x));
5904         }
5905
5906         Float::Float(RValue<Float> rhs)
5907         {
5908                 storeValue(rhs.value);
5909         }
5910
5911         Float::Float(const Float &rhs)
5912         {
5913                 Value *value = rhs.loadValue();
5914                 storeValue(value);
5915         }
5916
5917         Float::Float(const Reference<Float> &rhs)
5918         {
5919                 Value *value = rhs.loadValue();
5920                 storeValue(value);
5921         }
5922
5923         RValue<Float> Float::operator=(RValue<Float> rhs) const
5924         {
5925                 storeValue(rhs.value);
5926
5927                 return rhs;
5928         }
5929
5930         RValue<Float> Float::operator=(const Float &rhs) const
5931         {
5932                 Value *value = rhs.loadValue();
5933                 storeValue(value);
5934
5935                 return RValue<Float>(value);
5936         }
5937
5938         RValue<Float> Float::operator=(const Reference<Float> &rhs) const
5939         {
5940                 Value *value = rhs.loadValue();
5941                 storeValue(value);
5942
5943                 return RValue<Float>(value);
5944         }
5945
5946         RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs)
5947         {
5948                 return RValue<Float>(Nucleus::createFAdd(lhs.value, rhs.value));
5949         }
5950
5951         RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs)
5952         {
5953                 return RValue<Float>(Nucleus::createFSub(lhs.value, rhs.value));
5954         }
5955
5956         RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs)
5957         {
5958                 return RValue<Float>(Nucleus::createFMul(lhs.value, rhs.value));
5959         }
5960
5961         RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs)
5962         {
5963                 return RValue<Float>(Nucleus::createFDiv(lhs.value, rhs.value));
5964         }
5965
5966         RValue<Float> operator+=(const Float &lhs, RValue<Float> rhs)
5967         {
5968                 return lhs = lhs + rhs;
5969         }
5970
5971         RValue<Float> operator-=(const Float &lhs, RValue<Float> rhs)
5972         {
5973                 return lhs = lhs - rhs;
5974         }
5975
5976         RValue<Float> operator*=(const Float &lhs, RValue<Float> rhs)
5977         {
5978                 return lhs = lhs * rhs;
5979         }
5980
5981         RValue<Float> operator/=(const Float &lhs, RValue<Float> rhs)
5982         {
5983                 return lhs = lhs / rhs;
5984         }
5985
5986         RValue<Float> operator+(RValue<Float> val)
5987         {
5988                 return val;
5989         }
5990
5991         RValue<Float> operator-(RValue<Float> val)
5992         {
5993                 return RValue<Float>(Nucleus::createFNeg(val.value));
5994         }
5995
5996         RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs)
5997         {
5998                 return RValue<Bool>(Nucleus::createFCmpOLT(lhs.value, rhs.value));
5999         }
6000
6001         RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs)
6002         {
6003                 return RValue<Bool>(Nucleus::createFCmpOLE(lhs.value, rhs.value));
6004         }
6005
6006         RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs)
6007         {
6008                 return RValue<Bool>(Nucleus::createFCmpOGT(lhs.value, rhs.value));
6009         }
6010
6011         RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs)
6012         {
6013                 return RValue<Bool>(Nucleus::createFCmpOGE(lhs.value, rhs.value));
6014         }
6015
6016         RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs)
6017         {
6018                 return RValue<Bool>(Nucleus::createFCmpONE(lhs.value, rhs.value));
6019         }
6020
6021         RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs)
6022         {
6023                 return RValue<Bool>(Nucleus::createFCmpOEQ(lhs.value, rhs.value));
6024         }
6025
6026         RValue<Float> Abs(RValue<Float> x)
6027         {
6028                 return IfThenElse(x > 0.0f, x, -x);
6029         }
6030
6031         RValue<Float> Max(RValue<Float> x, RValue<Float> y)
6032         {
6033                 return IfThenElse(x > y, x, y);
6034         }
6035
6036         RValue<Float> Min(RValue<Float> x, RValue<Float> y)
6037         {
6038                 return IfThenElse(x < y, x, y);
6039         }
6040
6041         RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
6042         {
6043                 if(exactAtPow2)
6044                 {
6045                         // rcpss uses a piecewise-linear approximation which minimizes the relative error
6046                         // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
6047                         return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
6048                 }
6049                 else
6050                 {
6051                         return x86::rcpss(x);
6052                 }
6053         }
6054
6055         RValue<Float> RcpSqrt_pp(RValue<Float> x)
6056         {
6057                 return x86::rsqrtss(x);
6058         }
6059
6060         RValue<Float> Sqrt(RValue<Float> x)
6061         {
6062                 return x86::sqrtss(x);
6063         }
6064
6065         RValue<Float> Round(RValue<Float> x)
6066         {
6067                 if(CPUID::supportsSSE4_1())
6068                 {
6069                         return x86::roundss(x, 0);
6070                 }
6071                 else
6072                 {
6073                         return Float4(Round(Float4(x))).x;
6074                 }
6075         }
6076
6077         RValue<Float> Trunc(RValue<Float> x)
6078         {
6079                 if(CPUID::supportsSSE4_1())
6080                 {
6081                         return x86::roundss(x, 3);
6082                 }
6083                 else
6084                 {
6085                         return Float(Int(x));   // Rounded toward zero
6086                 }
6087         }
6088
6089         RValue<Float> Frac(RValue<Float> x)
6090         {
6091                 if(CPUID::supportsSSE4_1())
6092                 {
6093                         return x - x86::floorss(x);
6094                 }
6095                 else
6096                 {
6097                         return Float4(Frac(Float4(x))).x;
6098                 }
6099         }
6100
6101         RValue<Float> Floor(RValue<Float> x)
6102         {
6103                 if(CPUID::supportsSSE4_1())
6104                 {
6105                         return x86::floorss(x);
6106                 }
6107                 else
6108                 {
6109                         return Float4(Floor(Float4(x))).x;
6110                 }
6111         }
6112
6113         RValue<Float> Ceil(RValue<Float> x)
6114         {
6115                 if(CPUID::supportsSSE4_1())
6116                 {
6117                         return x86::ceilss(x);
6118                 }
6119                 else
6120                 {
6121                         return Float4(Ceil(Float4(x))).x;
6122                 }
6123         }
6124
6125         Type *Float::getType()
6126         {
6127                 return T(llvm::Type::getFloatTy(*::context));
6128         }
6129
6130         Float2::Float2(RValue<Float4> cast)
6131         {
6132         //      xyzw.parent = this;
6133
6134                 Value *int64x2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
6135                 Value *int64 = Nucleus::createExtractElement(int64x2, Long::getType(), 0);
6136                 Value *float2 = Nucleus::createBitCast(int64, Float2::getType());
6137
6138                 storeValue(float2);
6139         }
6140
6141         Type *Float2::getType()
6142         {
6143                 return T(VectorType::get(Float::getType(), 2));
6144         }
6145
6146         Float4::Float4(RValue<Byte4> cast)
6147         {
6148                 xyzw.parent = this;
6149
6150                 #if 0
6151                         Value *xyzw = Nucleus::createUIToFP(cast.value, Float4::getType());   // FIXME: Crashes
6152                 #elif 0
6153                         Value *vector = loadValue();
6154
6155                         Value *i8x = Nucleus::createExtractElement(cast.value, 0);
6156                         Value *f32x = Nucleus::createUIToFP(i8x, Float::getType());
6157                         Value *x = Nucleus::createInsertElement(vector, f32x, 0);
6158
6159                         Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
6160                         Value *f32y = Nucleus::createUIToFP(i8y, Float::getType());
6161                         Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
6162
6163                         Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
6164                         Value *f32z = Nucleus::createUIToFP(i8z, Float::getType());
6165                         Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
6166
6167                         Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
6168                         Value *f32w = Nucleus::createUIToFP(i8w, Float::getType());
6169                         Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
6170                 #else
6171                         Value *a = Int4(cast).loadValue();
6172                         Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
6173                 #endif
6174
6175                 storeValue(xyzw);
6176         }
6177
6178         Float4::Float4(RValue<SByte4> cast)
6179         {
6180                 xyzw.parent = this;
6181
6182                 #if 0
6183                         Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());   // FIXME: Crashes
6184                 #elif 0
6185                         Value *vector = loadValue();
6186
6187                         Value *i8x = Nucleus::createExtractElement(cast.value, 0);
6188                         Value *f32x = Nucleus::createSIToFP(i8x, Float::getType());
6189                         Value *x = Nucleus::createInsertElement(vector, f32x, 0);
6190
6191                         Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
6192                         Value *f32y = Nucleus::createSIToFP(i8y, Float::getType());
6193                         Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
6194
6195                         Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
6196                         Value *f32z = Nucleus::createSIToFP(i8z, Float::getType());
6197                         Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
6198
6199                         Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
6200                         Value *f32w = Nucleus::createSIToFP(i8w, Float::getType());
6201                         Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
6202                 #else
6203                         Value *a = Int4(cast).loadValue();
6204                         Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
6205                 #endif
6206
6207                 storeValue(xyzw);
6208         }
6209
6210         Float4::Float4(RValue<Short4> cast)
6211         {
6212                 xyzw.parent = this;
6213
6214                 Int4 c(cast);
6215                 storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
6216         }
6217
6218         Float4::Float4(RValue<UShort4> cast)
6219         {
6220                 xyzw.parent = this;
6221
6222                 Int4 c(cast);
6223                 storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
6224         }
6225
6226         Float4::Float4(RValue<Int4> cast)
6227         {
6228                 xyzw.parent = this;
6229
6230                 Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());
6231
6232                 storeValue(xyzw);
6233         }
6234
6235         Float4::Float4(RValue<UInt4> cast)
6236         {
6237                 xyzw.parent = this;
6238
6239                 Value *xyzw = Nucleus::createUIToFP(cast.value, Float4::getType());
6240
6241                 storeValue(xyzw);
6242         }
6243
6244         Float4::Float4()
6245         {
6246                 xyzw.parent = this;
6247         }
6248
6249         Float4::Float4(float xyzw)
6250         {
6251                 constant(xyzw, xyzw, xyzw, xyzw);
6252         }
6253
6254         Float4::Float4(float x, float yzw)
6255         {
6256                 constant(x, yzw, yzw, yzw);
6257         }
6258
6259         Float4::Float4(float x, float y, float zw)
6260         {
6261                 constant(x, y, zw, zw);
6262         }
6263
6264         Float4::Float4(float x, float y, float z, float w)
6265         {
6266                 constant(x, y, z, w);
6267         }
6268
6269         void Float4::constant(float x, float y, float z, float w)
6270         {
6271                 xyzw.parent = this;
6272
6273                 double constantVector[4] = {x, y, z, w};
6274                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
6275         }
6276
6277         Float4::Float4(RValue<Float4> rhs)
6278         {
6279                 xyzw.parent = this;
6280
6281                 storeValue(rhs.value);
6282         }
6283
6284         Float4::Float4(const Float4 &rhs)
6285         {
6286                 xyzw.parent = this;
6287
6288                 Value *value = rhs.loadValue();
6289                 storeValue(value);
6290         }
6291
6292         Float4::Float4(const Reference<Float4> &rhs)
6293         {
6294                 xyzw.parent = this;
6295
6296                 Value *value = rhs.loadValue();
6297                 storeValue(value);
6298         }
6299
6300         Float4::Float4(RValue<Float> rhs)
6301         {
6302                 xyzw.parent = this;
6303
6304                 Value *vector = loadValue();
6305                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
6306
6307                 int swizzle[4] = {0, 0, 0, 0};
6308                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
6309
6310                 storeValue(replicate);
6311         }
6312
6313         Float4::Float4(const Float &rhs)
6314         {
6315                 xyzw.parent = this;
6316
6317                 *this = RValue<Float>(rhs.loadValue());
6318         }
6319
6320         Float4::Float4(const Reference<Float> &rhs)
6321         {
6322                 xyzw.parent = this;
6323
6324                 *this = RValue<Float>(rhs.loadValue());
6325         }
6326
6327         RValue<Float4> Float4::operator=(float x) const
6328         {
6329                 return *this = Float4(x, x, x, x);
6330         }
6331
6332         RValue<Float4> Float4::operator=(RValue<Float4> rhs) const
6333         {
6334                 storeValue(rhs.value);
6335
6336                 return rhs;
6337         }
6338
6339         RValue<Float4> Float4::operator=(const Float4 &rhs) const
6340         {
6341                 Value *value = rhs.loadValue();
6342                 storeValue(value);
6343
6344                 return RValue<Float4>(value);
6345         }
6346
6347         RValue<Float4> Float4::operator=(const Reference<Float4> &rhs) const
6348         {
6349                 Value *value = rhs.loadValue();
6350                 storeValue(value);
6351
6352                 return RValue<Float4>(value);
6353         }
6354
6355         RValue<Float4> Float4::operator=(RValue<Float> rhs) const
6356         {
6357                 return *this = Float4(rhs);
6358         }
6359
6360         RValue<Float4> Float4::operator=(const Float &rhs) const
6361         {
6362                 return *this = Float4(rhs);
6363         }
6364
6365         RValue<Float4> Float4::operator=(const Reference<Float> &rhs) const
6366         {
6367                 return *this = Float4(rhs);
6368         }
6369
6370         RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs)
6371         {
6372                 return RValue<Float4>(Nucleus::createFAdd(lhs.value, rhs.value));
6373         }
6374
6375         RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs)
6376         {
6377                 return RValue<Float4>(Nucleus::createFSub(lhs.value, rhs.value));
6378         }
6379
6380         RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs)
6381         {
6382                 return RValue<Float4>(Nucleus::createFMul(lhs.value, rhs.value));
6383         }
6384
6385         RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs)
6386         {
6387                 return RValue<Float4>(Nucleus::createFDiv(lhs.value, rhs.value));
6388         }
6389
6390         RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
6391         {
6392                 return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
6393         }
6394
6395         RValue<Float4> operator+=(const Float4 &lhs, RValue<Float4> rhs)
6396         {
6397                 return lhs = lhs + rhs;
6398         }
6399
6400         RValue<Float4> operator-=(const Float4 &lhs, RValue<Float4> rhs)
6401         {
6402                 return lhs = lhs - rhs;
6403         }
6404
6405         RValue<Float4> operator*=(const Float4 &lhs, RValue<Float4> rhs)
6406         {
6407                 return lhs = lhs * rhs;
6408         }
6409
6410         RValue<Float4> operator/=(const Float4 &lhs, RValue<Float4> rhs)
6411         {
6412                 return lhs = lhs / rhs;
6413         }
6414
6415         RValue<Float4> operator%=(const Float4 &lhs, RValue<Float4> rhs)
6416         {
6417                 return lhs = lhs % rhs;
6418         }
6419
6420         RValue<Float4> operator+(RValue<Float4> val)
6421         {
6422                 return val;
6423         }
6424
6425         RValue<Float4> operator-(RValue<Float4> val)
6426         {
6427                 return RValue<Float4>(Nucleus::createFNeg(val.value));
6428         }
6429
6430         RValue<Float4> Abs(RValue<Float4> x)
6431         {
6432                 Value *vector = Nucleus::createBitCast(x.value, Int4::getType());
6433                 int64_t constantVector[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
6434                 Value *result = Nucleus::createAnd(vector, V(Nucleus::createConstantVector(constantVector, Int4::getType())));
6435
6436                 return RValue<Float4>(Nucleus::createBitCast(result, Float4::getType()));
6437         }
6438
6439         RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
6440         {
6441                 return x86::maxps(x, y);
6442         }
6443
6444         RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
6445         {
6446                 return x86::minps(x, y);
6447         }
6448
6449         RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
6450         {
6451                 if(exactAtPow2)
6452                 {
6453                         // rcpps uses a piecewise-linear approximation which minimizes the relative error
6454                         // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
6455                         return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
6456                 }
6457                 else
6458                 {
6459                         return x86::rcpps(x);
6460                 }
6461         }
6462
6463         RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
6464         {
6465                 return x86::rsqrtps(x);
6466         }
6467
6468         RValue<Float4> Sqrt(RValue<Float4> x)
6469         {
6470                 return x86::sqrtps(x);
6471         }
6472
6473         RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i)
6474         {
6475                 return RValue<Float4>(Nucleus::createInsertElement(val.value, element.value, i));
6476         }
6477
6478         RValue<Float> Extract(RValue<Float4> x, int i)
6479         {
6480                 return RValue<Float>(Nucleus::createExtractElement(x.value, Float::getType(), i));
6481         }
6482
6483         RValue<Float4> Swizzle(RValue<Float4> x, unsigned char select)
6484         {
6485                 return RValue<Float4>(createSwizzle4(x.value, select));
6486         }
6487
6488         RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
6489         {
6490                 int shuffle[4] =
6491                 {
6492                         ((imm >> 0) & 0x03) + 0,
6493                         ((imm >> 2) & 0x03) + 0,
6494                         ((imm >> 4) & 0x03) + 4,
6495                         ((imm >> 6) & 0x03) + 4,
6496                 };
6497
6498                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6499         }
6500
6501         RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y)
6502         {
6503                 int shuffle[4] = {0, 4, 1, 5};
6504                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6505         }
6506
6507         RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y)
6508         {
6509                 int shuffle[4] = {2, 6, 3, 7};
6510                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6511         }
6512
6513         RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, unsigned char select)
6514         {
6515                 Value *vector = lhs.loadValue();
6516                 Value *shuffle = createMask4(vector, rhs.value, select);
6517                 lhs.storeValue(shuffle);
6518
6519                 return RValue<Float4>(shuffle);
6520         }
6521
6522         RValue<Int> SignMask(RValue<Float4> x)
6523         {
6524                 return x86::movmskps(x);
6525         }
6526
6527         RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
6528         {
6529         //      return As<Int4>(x86::cmpeqps(x, y));
6530                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
6531         }
6532
6533         RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
6534         {
6535         //      return As<Int4>(x86::cmpltps(x, y));
6536                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
6537         }
6538
6539         RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
6540         {
6541         //      return As<Int4>(x86::cmpleps(x, y));
6542                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
6543         }
6544
6545         RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
6546         {
6547         //      return As<Int4>(x86::cmpneqps(x, y));
6548                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
6549         }
6550
6551         RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
6552         {
6553         //      return As<Int4>(x86::cmpnltps(x, y));
6554                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
6555         }
6556
6557         RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
6558         {
6559         //      return As<Int4>(x86::cmpnleps(x, y));
6560                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
6561         }
6562
6563         RValue<Float4> Round(RValue<Float4> x)
6564         {
6565                 if(CPUID::supportsSSE4_1())
6566                 {
6567                         return x86::roundps(x, 0);
6568                 }
6569                 else
6570                 {
6571                         return Float4(RoundInt(x));
6572                 }
6573         }
6574
6575         RValue<Float4> Trunc(RValue<Float4> x)
6576         {
6577                 if(CPUID::supportsSSE4_1())
6578                 {
6579                         return x86::roundps(x, 3);
6580                 }
6581                 else
6582                 {
6583                         return Float4(Int4(x));   // Rounded toward zero
6584                 }
6585         }
6586
6587         RValue<Float4> Frac(RValue<Float4> x)
6588         {
6589                 if(CPUID::supportsSSE4_1())
6590                 {
6591                         return x - x86::floorps(x);
6592                 }
6593                 else
6594                 {
6595                         Float4 frc = x - Float4(Int4(x));   // Signed fractional part
6596
6597                         return frc + As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));
6598                 }
6599         }
6600
6601         RValue<Float4> Floor(RValue<Float4> x)
6602         {
6603                 if(CPUID::supportsSSE4_1())
6604                 {
6605                         return x86::floorps(x);
6606                 }
6607                 else
6608                 {
6609                         return x - Frac(x);
6610                 }
6611         }
6612
6613         RValue<Float4> Ceil(RValue<Float4> x)
6614         {
6615                 if(CPUID::supportsSSE4_1())
6616                 {
6617                         return x86::ceilps(x);
6618                 }
6619                 else
6620                 {
6621                         return -Floor(-x);
6622                 }
6623         }
6624
6625         Type *Float4::getType()
6626         {
6627                 return T(VectorType::get(Float::getType(), 4));
6628         }
6629
6630         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
6631         {
6632                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), V(Nucleus::createConstantInt(offset))));
6633         }
6634
6635         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
6636         {
6637                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value));
6638         }
6639
6640         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
6641         {
6642                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value));
6643         }
6644
6645         RValue<Pointer<Byte>> operator+=(const Pointer<Byte> &lhs, int offset)
6646         {
6647                 return lhs = lhs + offset;
6648         }
6649
6650         RValue<Pointer<Byte>> operator+=(const Pointer<Byte> &lhs, RValue<Int> offset)
6651         {
6652                 return lhs = lhs + offset;
6653         }
6654
6655         RValue<Pointer<Byte>> operator+=(const Pointer<Byte> &lhs, RValue<UInt> offset)
6656         {
6657                 return lhs = lhs + offset;
6658         }
6659
6660         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset)
6661         {
6662                 return lhs + -offset;
6663         }
6664
6665         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
6666         {
6667                 return lhs + -offset;
6668         }
6669
6670         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
6671         {
6672                 return lhs + -offset;
6673         }
6674
6675         RValue<Pointer<Byte>> operator-=(const Pointer<Byte> &lhs, int offset)
6676         {
6677                 return lhs = lhs - offset;
6678         }
6679
6680         RValue<Pointer<Byte>> operator-=(const Pointer<Byte> &lhs, RValue<Int> offset)
6681         {
6682                 return lhs = lhs - offset;
6683         }
6684
6685         RValue<Pointer<Byte>> operator-=(const Pointer<Byte> &lhs, RValue<UInt> offset)
6686         {
6687                 return lhs = lhs - offset;
6688         }
6689
6690         void Return()
6691         {
6692                 Nucleus::createRetVoid();
6693                 Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6694                 Nucleus::createUnreachable();
6695         }
6696
6697         void Return(bool ret)
6698         {
6699                 Nucleus::createRet(V(Nucleus::createConstantBool(ret)));
6700                 Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6701                 Nucleus::createUnreachable();
6702         }
6703
6704         void Return(const Int &ret)
6705         {
6706                 Nucleus::createRet(ret.loadValue());
6707                 Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6708                 Nucleus::createUnreachable();
6709         }
6710
6711         bool branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
6712         {
6713                 Nucleus::createCondBr(cmp.value, bodyBB, endBB);
6714                 Nucleus::setInsertBlock(bodyBB);
6715
6716                 return true;
6717         }
6718
6719         void endIf(BasicBlock *falseBB)
6720         {
6721                 ::falseBB = falseBB;
6722         }
6723
6724         bool elseBlock(BasicBlock *falseBB)
6725         {
6726                 assert(falseBB && "Else not preceded by If");
6727                 falseBB->back().eraseFromParent();
6728                 Nucleus::setInsertBlock(falseBB);
6729
6730                 return true;
6731         }
6732
6733         BasicBlock *beginElse()
6734         {
6735                 BasicBlock *falseBB = ::falseBB;
6736                 ::falseBB = nullptr;
6737
6738                 return falseBB;
6739         }
6740
6741         RValue<Long> Ticks()
6742         {
6743                 llvm::Function *rdtsc = Intrinsic::getDeclaration(::module, Intrinsic::readcyclecounter);
6744
6745                 return RValue<Long>(V(::builder->CreateCall(rdtsc)));
6746         }
6747 }
6748
6749 namespace sw
6750 {
6751         namespace x86
6752         {
6753                 RValue<Int> cvtss2si(RValue<Float> val)
6754                 {
6755                         llvm::Function *cvtss2si = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtss2si);
6756
6757                         Float4 vector;
6758                         vector.x = val;
6759
6760                         return RValue<Int>(V(::builder->CreateCall(cvtss2si, RValue<Float4>(vector).value)));
6761                 }
6762
6763                 RValue<Int2> cvtps2pi(RValue<Float4> val)
6764                 {
6765                         llvm::Function *cvtps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtps2pi);
6766
6767                         return RValue<Int2>(V(::builder->CreateCall(cvtps2pi, val.value)));
6768                 }
6769
6770                 RValue<Int2> cvttps2pi(RValue<Float4> val)
6771                 {
6772                         llvm::Function *cvttps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvttps2pi);
6773
6774                         return RValue<Int2>(V(::builder->CreateCall(cvttps2pi, val.value)));
6775                 }
6776
6777                 RValue<Int4> cvtps2dq(RValue<Float4> val)
6778                 {
6779                         if(CPUID::supportsSSE2())
6780                         {
6781                                 llvm::Function *cvtps2dq = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_cvtps2dq);
6782
6783                                 return RValue<Int4>(V(::builder->CreateCall(cvtps2dq, val.value)));
6784                         }
6785                         else
6786                         {
6787                                 Int2 lo = x86::cvtps2pi(val);
6788                                 Int2 hi = x86::cvtps2pi(Swizzle(val, 0xEE));
6789
6790                                 return Int4(lo, hi);
6791                         }
6792                 }
6793
6794                 RValue<Float> rcpss(RValue<Float> val)
6795                 {
6796                         llvm::Function *rcpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ss);
6797
6798                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6799
6800                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rcpss, vector)), Float::getType(), 0));
6801                 }
6802
6803                 RValue<Float> sqrtss(RValue<Float> val)
6804                 {
6805                         llvm::Function *sqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ss);
6806
6807                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6808
6809                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(sqrtss, vector)), Float::getType(), 0));
6810                 }
6811
6812                 RValue<Float> rsqrtss(RValue<Float> val)
6813                 {
6814                         llvm::Function *rsqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ss);
6815
6816                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6817
6818                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rsqrtss, vector)), Float::getType(), 0));
6819                 }
6820
6821                 RValue<Float4> rcpps(RValue<Float4> val)
6822                 {
6823                         llvm::Function *rcpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ps);
6824
6825                         return RValue<Float4>(V(::builder->CreateCall(rcpps, val.value)));
6826                 }
6827
6828                 RValue<Float4> sqrtps(RValue<Float4> val)
6829                 {
6830                         llvm::Function *sqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ps);
6831
6832                         return RValue<Float4>(V(::builder->CreateCall(sqrtps, val.value)));
6833                 }
6834
6835                 RValue<Float4> rsqrtps(RValue<Float4> val)
6836                 {
6837                         llvm::Function *rsqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ps);
6838
6839                         return RValue<Float4>(V(::builder->CreateCall(rsqrtps, val.value)));
6840                 }
6841
6842                 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
6843                 {
6844                         llvm::Function *maxps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_max_ps);
6845
6846                         return RValue<Float4>(V(::builder->CreateCall2(maxps, x.value, y.value)));
6847                 }
6848
6849                 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
6850                 {
6851                         llvm::Function *minps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_min_ps);
6852
6853                         return RValue<Float4>(V(::builder->CreateCall2(minps, x.value, y.value)));
6854                 }
6855
6856                 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
6857                 {
6858                         llvm::Function *roundss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ss);
6859
6860                         Value *undef = V(UndefValue::get(Float4::getType()));
6861                         Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
6862
6863                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(roundss, undef, vector, V(Nucleus::createConstantInt(imm)))), Float::getType(), 0));
6864                 }
6865
6866                 RValue<Float> floorss(RValue<Float> val)
6867                 {
6868                         return roundss(val, 1);
6869                 }
6870
6871                 RValue<Float> ceilss(RValue<Float> val)
6872                 {
6873                         return roundss(val, 2);
6874                 }
6875
6876                 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
6877                 {
6878                         llvm::Function *roundps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ps);
6879
6880                         return RValue<Float4>(V(::builder->CreateCall2(roundps, val.value, V(Nucleus::createConstantInt(imm)))));
6881                 }
6882
6883                 RValue<Float4> floorps(RValue<Float4> val)
6884                 {
6885                         return roundps(val, 1);
6886                 }
6887
6888                 RValue<Float4> ceilps(RValue<Float4> val)
6889                 {
6890                         return roundps(val, 2);
6891                 }
6892
6893                 RValue<Float4> cmpps(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
6894                 {
6895                         llvm::Function *cmpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ps);
6896
6897                         return RValue<Float4>(V(::builder->CreateCall3(cmpps, x.value, y.value, V(Nucleus::createConstantByte(imm)))));
6898                 }
6899
6900                 RValue<Float4> cmpeqps(RValue<Float4> x, RValue<Float4> y)
6901                 {
6902                         return cmpps(x, y, 0);
6903                 }
6904
6905                 RValue<Float4> cmpltps(RValue<Float4> x, RValue<Float4> y)
6906                 {
6907                         return cmpps(x, y, 1);
6908                 }
6909
6910                 RValue<Float4> cmpleps(RValue<Float4> x, RValue<Float4> y)
6911                 {
6912                         return cmpps(x, y, 2);
6913                 }
6914
6915                 RValue<Float4> cmpunordps(RValue<Float4> x, RValue<Float4> y)
6916                 {
6917                         return cmpps(x, y, 3);
6918                 }
6919
6920                 RValue<Float4> cmpneqps(RValue<Float4> x, RValue<Float4> y)
6921                 {
6922                         return cmpps(x, y, 4);
6923                 }
6924
6925                 RValue<Float4> cmpnltps(RValue<Float4> x, RValue<Float4> y)
6926                 {
6927                         return cmpps(x, y, 5);
6928                 }
6929
6930                 RValue<Float4> cmpnleps(RValue<Float4> x, RValue<Float4> y)
6931                 {
6932                         return cmpps(x, y, 6);
6933                 }
6934
6935                 RValue<Float4> cmpordps(RValue<Float4> x, RValue<Float4> y)
6936                 {
6937                         return cmpps(x, y, 7);
6938                 }
6939
6940                 RValue<Float> cmpss(RValue<Float> x, RValue<Float> y, unsigned char imm)
6941                 {
6942                         llvm::Function *cmpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ss);
6943
6944                         Value *vector1 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), x.value, 0);
6945                         Value *vector2 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), y.value, 0);
6946
6947                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(cmpss, vector1, vector2, V(Nucleus::createConstantByte(imm)))), Float::getType(), 0));
6948                 }
6949
6950                 RValue<Float> cmpeqss(RValue<Float> x, RValue<Float> y)
6951                 {
6952                         return cmpss(x, y, 0);
6953                 }
6954
6955                 RValue<Float> cmpltss(RValue<Float> x, RValue<Float> y)
6956                 {
6957                         return cmpss(x, y, 1);
6958                 }
6959
6960                 RValue<Float> cmpless(RValue<Float> x, RValue<Float> y)
6961                 {
6962                         return cmpss(x, y, 2);
6963                 }
6964
6965                 RValue<Float> cmpunordss(RValue<Float> x, RValue<Float> y)
6966                 {
6967                         return cmpss(x, y, 3);
6968                 }
6969
6970                 RValue<Float> cmpneqss(RValue<Float> x, RValue<Float> y)
6971                 {
6972                         return cmpss(x, y, 4);
6973                 }
6974
6975                 RValue<Float> cmpnltss(RValue<Float> x, RValue<Float> y)
6976                 {
6977                         return cmpss(x, y, 5);
6978                 }
6979
6980                 RValue<Float> cmpnless(RValue<Float> x, RValue<Float> y)
6981                 {
6982                         return cmpss(x, y, 6);
6983                 }
6984
6985                 RValue<Float> cmpordss(RValue<Float> x, RValue<Float> y)
6986                 {
6987                         return cmpss(x, y, 7);
6988                 }
6989
6990                 RValue<Int4> pabsd(RValue<Int4> x)
6991                 {
6992                         llvm::Function *pabsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_ssse3_pabs_d_128);
6993
6994                         return RValue<Int4>(V(::builder->CreateCall(pabsd, x.value)));
6995                 }
6996
6997                 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
6998                 {
6999                         llvm::Function *paddsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_w);
7000
7001                         return As<Short4>(V(::builder->CreateCall2(paddsw, As<MMX>(x).value, As<MMX>(y).value)));
7002                 }
7003
7004                 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
7005                 {
7006                         llvm::Function *psubsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_w);
7007
7008                         return As<Short4>(V(::builder->CreateCall2(psubsw, As<MMX>(x).value, As<MMX>(y).value)));
7009                 }
7010
7011                 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
7012                 {
7013                         llvm::Function *paddusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_w);
7014
7015                         return As<UShort4>(V(::builder->CreateCall2(paddusw, As<MMX>(x).value, As<MMX>(y).value)));
7016                 }
7017
7018                 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
7019                 {
7020                         llvm::Function *psubusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_w);
7021
7022                         return As<UShort4>(V(::builder->CreateCall2(psubusw, As<MMX>(x).value, As<MMX>(y).value)));
7023                 }
7024
7025                 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
7026                 {
7027                         llvm::Function *paddsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_b);
7028
7029                         return As<SByte8>(V(::builder->CreateCall2(paddsb, As<MMX>(x).value, As<MMX>(y).value)));
7030                 }
7031
7032                 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
7033                 {
7034                         llvm::Function *psubsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_b);
7035
7036                         return As<SByte8>(V(::builder->CreateCall2(psubsb, As<MMX>(x).value, As<MMX>(y).value)));
7037                 }
7038
7039                 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
7040                 {
7041                         llvm::Function *paddusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_b);
7042
7043                         return As<Byte8>(V(::builder->CreateCall2(paddusb, As<MMX>(x).value, As<MMX>(y).value)));
7044                 }
7045
7046                 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
7047                 {
7048                         llvm::Function *psubusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_b);
7049
7050                         return As<Byte8>(V(::builder->CreateCall2(psubusb, As<MMX>(x).value, As<MMX>(y).value)));
7051                 }
7052
7053                 RValue<Short4> paddw(RValue<Short4> x, RValue<Short4> y)
7054                 {
7055                         llvm::Function *paddw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_w);
7056
7057                         return As<Short4>(V(::builder->CreateCall2(paddw, As<MMX>(x).value, As<MMX>(y).value)));
7058                 }
7059
7060                 RValue<Short4> psubw(RValue<Short4> x, RValue<Short4> y)
7061                 {
7062                         llvm::Function *psubw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_w);
7063
7064                         return As<Short4>(V(::builder->CreateCall2(psubw, As<MMX>(x).value, As<MMX>(y).value)));
7065                 }
7066
7067                 RValue<Short4> pmullw(RValue<Short4> x, RValue<Short4> y)
7068                 {
7069                         llvm::Function *pmullw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmull_w);
7070
7071                         return As<Short4>(V(::builder->CreateCall2(pmullw, As<MMX>(x).value, As<MMX>(y).value)));
7072                 }
7073
7074                 RValue<Short4> pand(RValue<Short4> x, RValue<Short4> y)
7075                 {
7076                         llvm::Function *pand = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pand);
7077
7078                         return As<Short4>(V(::builder->CreateCall2(pand, As<MMX>(x).value, As<MMX>(y).value)));
7079                 }
7080
7081                 RValue<Short4> por(RValue<Short4> x, RValue<Short4> y)
7082                 {
7083                         llvm::Function *por = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_por);
7084
7085                         return As<Short4>(V(::builder->CreateCall2(por, As<MMX>(x).value, As<MMX>(y).value)));
7086                 }
7087
7088                 RValue<Short4> pxor(RValue<Short4> x, RValue<Short4> y)
7089                 {
7090                         llvm::Function *pxor = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pxor);
7091
7092                         return As<Short4>(V(::builder->CreateCall2(pxor, As<MMX>(x).value, As<MMX>(y).value)));
7093                 }
7094
7095                 RValue<Short4> pshufw(RValue<Short4> x, unsigned char y)
7096                 {
7097                         llvm::Function *pshufw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_pshuf_w);
7098
7099                         return As<Short4>(V(::builder->CreateCall2(pshufw, As<MMX>(x).value, V(Nucleus::createConstantByte(y)))));
7100                 }
7101
7102                 RValue<Int2> punpcklwd(RValue<Short4> x, RValue<Short4> y)
7103                 {
7104                         llvm::Function *punpcklwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklwd);
7105
7106                         return As<Int2>(V(::builder->CreateCall2(punpcklwd, As<MMX>(x).value, As<MMX>(y).value)));
7107                 }
7108
7109                 RValue<Int2> punpckhwd(RValue<Short4> x, RValue<Short4> y)
7110                 {
7111                         llvm::Function *punpckhwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhwd);
7112
7113                         return As<Int2>(V(::builder->CreateCall2(punpckhwd, As<MMX>(x).value, As<MMX>(y).value)));
7114                 }
7115
7116                 RValue<Short4> pinsrw(RValue<Short4> x, RValue<Int> y, unsigned int i)
7117                 {
7118                         llvm::Function *pinsrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pinsr_w);
7119
7120                         return As<Short4>(V(::builder->CreateCall3(pinsrw, As<MMX>(x).value, y.value, V(Nucleus::createConstantInt(i)))));
7121                 }
7122
7123                 RValue<Int> pextrw(RValue<Short4> x, unsigned int i)
7124                 {
7125                         llvm::Function *pextrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pextr_w);
7126
7127                         return RValue<Int>(V(::builder->CreateCall2(pextrw, As<MMX>(x).value, V(Nucleus::createConstantInt(i)))));
7128                 }
7129
7130                 RValue<Long1> punpckldq(RValue<Int2> x, RValue<Int2> y)
7131                 {
7132                         llvm::Function *punpckldq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckldq);
7133
7134                         return As<Long1>(V(::builder->CreateCall2(punpckldq, As<MMX>(x).value, As<MMX>(y).value)));
7135                 }
7136
7137                 RValue<Long1> punpckhdq(RValue<Int2> x, RValue<Int2> y)
7138                 {
7139                         llvm::Function *punpckhdq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhdq);
7140
7141                         return As<Long1>(V(::builder->CreateCall2(punpckhdq, As<MMX>(x).value, As<MMX>(y).value)));
7142                 }
7143
7144                 RValue<Short4> punpcklbw(RValue<Byte8> x, RValue<Byte8> y)
7145                 {
7146                         llvm::Function *punpcklbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklbw);
7147
7148                         return As<Short4>(V(::builder->CreateCall2(punpcklbw, As<MMX>(x).value, As<MMX>(y).value)));
7149                 }
7150
7151                 RValue<Short4> punpckhbw(RValue<Byte8> x, RValue<Byte8> y)
7152                 {
7153                         llvm::Function *punpckhbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhbw);
7154
7155                         return As<Short4>(V(::builder->CreateCall2(punpckhbw, As<MMX>(x).value, As<MMX>(y).value)));
7156                 }
7157
7158                 RValue<Byte8> paddb(RValue<Byte8> x, RValue<Byte8> y)
7159                 {
7160                         llvm::Function *paddb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_b);
7161
7162                         return As<Byte8>(V(::builder->CreateCall2(paddb, As<MMX>(x).value, As<MMX>(y).value)));
7163                 }
7164
7165                 RValue<Byte8> psubb(RValue<Byte8> x, RValue<Byte8> y)
7166                 {
7167                         llvm::Function *psubb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_b);
7168
7169                         return As<Byte8>(V(::builder->CreateCall2(psubb, As<MMX>(x).value, As<MMX>(y).value)));
7170                 }
7171
7172                 RValue<Int2> paddd(RValue<Int2> x, RValue<Int2> y)
7173                 {
7174                         llvm::Function *paddd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_d);
7175
7176                         return As<Int2>(V(::builder->CreateCall2(paddd, As<MMX>(x).value, As<MMX>(y).value)));
7177                 }
7178
7179                 RValue<Int2> psubd(RValue<Int2> x, RValue<Int2> y)
7180                 {
7181                         llvm::Function *psubd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_d);
7182
7183                         return As<Int2>(V(::builder->CreateCall2(psubd, As<MMX>(x).value, As<MMX>(y).value)));
7184                 }
7185
7186                 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
7187                 {
7188                         llvm::Function *pavgw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pavg_w);
7189
7190                         return As<UShort4>(V(::builder->CreateCall2(pavgw, As<MMX>(x).value, As<MMX>(y).value)));
7191                 }
7192
7193                 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
7194                 {
7195                         llvm::Function *pmaxsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmaxs_w);
7196
7197                         return As<Short4>(V(::builder->CreateCall2(pmaxsw, As<MMX>(x).value, As<MMX>(y).value)));
7198                 }
7199
7200                 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
7201                 {
7202                         llvm::Function *pminsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmins_w);
7203
7204                         return As<Short4>(V(::builder->CreateCall2(pminsw, As<MMX>(x).value, As<MMX>(y).value)));
7205                 }
7206
7207                 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
7208                 {
7209                         llvm::Function *pcmpgtw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_w);
7210
7211                         return As<Short4>(V(::builder->CreateCall2(pcmpgtw, As<MMX>(x).value, As<MMX>(y).value)));
7212                 }
7213
7214                 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
7215                 {
7216                         llvm::Function *pcmpeqw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_w);
7217
7218                         return As<Short4>(V(::builder->CreateCall2(pcmpeqw, As<MMX>(x).value, As<MMX>(y).value)));
7219                 }
7220
7221                 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
7222                 {
7223                         llvm::Function *pcmpgtb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_b);
7224
7225                         return As<Byte8>(V(::builder->CreateCall2(pcmpgtb, As<MMX>(x).value, As<MMX>(y).value)));
7226                 }
7227
7228                 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
7229                 {
7230                         llvm::Function *pcmpeqb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_b);
7231
7232                         return As<Byte8>(V(::builder->CreateCall2(pcmpeqb, As<MMX>(x).value, As<MMX>(y).value)));
7233                 }
7234
7235                 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
7236                 {
7237                         llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packssdw);
7238
7239                         return As<Short4>(V(::builder->CreateCall2(packssdw, As<MMX>(x).value, As<MMX>(y).value)));
7240                 }
7241
7242                 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
7243                 {
7244                         if(CPUID::supportsSSE2())
7245                         {
7246                                 llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_packssdw_128);
7247
7248                                 return RValue<Short8>(V(::builder->CreateCall2(packssdw, x.value, y.value)));
7249                         }
7250                         else
7251                         {
7252                                 Int2 loX = Int2(x);
7253                                 Int2 hiX = Int2(Swizzle(x, 0xEE));
7254
7255                                 Int2 loY = Int2(y);
7256                                 Int2 hiY = Int2(Swizzle(y, 0xEE));
7257
7258                                 Short4 lo = x86::packssdw(loX, hiX);
7259                                 Short4 hi = x86::packssdw(loY, hiY);
7260
7261                                 return Short8(lo, hi);
7262                         }
7263                 }
7264
7265                 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
7266                 {
7267                         llvm::Function *packsswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packsswb);
7268
7269                         return As<SByte8>(V(::builder->CreateCall2(packsswb, As<MMX>(x).value, As<MMX>(y).value)));
7270                 }
7271
7272                 RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y)
7273                 {
7274                         llvm::Function *packuswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packuswb);
7275
7276                         return As<Byte8>(V(::builder->CreateCall2(packuswb, As<MMX>(x).value, As<MMX>(y).value)));
7277                 }
7278
7279                 RValue<UShort8> packusdw(RValue<UInt4> x, RValue<UInt4> y)
7280                 {
7281                         if(CPUID::supportsSSE4_1())
7282                         {
7283                                 llvm::Function *packusdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_packusdw);
7284
7285                                 return RValue<UShort8>(V(::builder->CreateCall2(packusdw, x.value, y.value)));
7286                         }
7287                         else
7288                         {
7289                                 // FIXME: Not an exact replacement!
7290                                 return As<UShort8>(packssdw(As<Int4>(x - UInt4(0x00008000, 0x00008000, 0x00008000, 0x00008000)), As<Int4>(y - UInt4(0x00008000, 0x00008000, 0x00008000, 0x00008000))) + Short8(0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u));
7291                         }
7292                 }
7293
7294                 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
7295                 {
7296                         llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_w);
7297
7298                         return As<UShort4>(V(::builder->CreateCall2(psrlw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7299                 }
7300
7301                 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
7302                 {
7303                         llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_w);
7304
7305                         return RValue<UShort8>(V(::builder->CreateCall2(psrlw, x.value, V(Nucleus::createConstantInt(y)))));
7306                 }
7307
7308                 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
7309                 {
7310                         llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_w);
7311
7312                         return As<Short4>(V(::builder->CreateCall2(psraw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7313                 }
7314
7315                 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
7316                 {
7317                         llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_w);
7318
7319                         return RValue<Short8>(V(::builder->CreateCall2(psraw, x.value, V(Nucleus::createConstantInt(y)))));
7320                 }
7321
7322                 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
7323                 {
7324                         llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_w);
7325
7326                         return As<Short4>(V(::builder->CreateCall2(psllw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7327                 }
7328
7329                 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
7330                 {
7331                         llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_w);
7332
7333                         return RValue<Short8>(V(::builder->CreateCall2(psllw, x.value, V(Nucleus::createConstantInt(y)))));
7334                 }
7335
7336                 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
7337                 {
7338                         llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_d);
7339
7340                         return As<Int2>(V(::builder->CreateCall2(pslld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7341                 }
7342
7343                 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
7344                 {
7345                         if(CPUID::supportsSSE2())
7346                         {
7347                                 llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_d);
7348
7349                                 return RValue<Int4>(V(::builder->CreateCall2(pslld, x.value, V(Nucleus::createConstantInt(y)))));
7350                         }
7351                         else
7352                         {
7353                                 Int2 lo = Int2(x);
7354                                 Int2 hi = Int2(Swizzle(x, 0xEE));
7355
7356                                 lo = x86::pslld(lo, y);
7357                                 hi = x86::pslld(hi, y);
7358
7359                                 return Int4(lo, hi);
7360                         }
7361                 }
7362
7363                 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
7364                 {
7365                         llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_d);
7366
7367                         return As<Int2>(V(::builder->CreateCall2(psrad, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7368                 }
7369
7370                 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
7371                 {
7372                         if(CPUID::supportsSSE2())
7373                         {
7374                                 llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_d);
7375
7376                                 return RValue<Int4>(V(::builder->CreateCall2(psrad, x.value, V(Nucleus::createConstantInt(y)))));
7377                         }
7378                         else
7379                         {
7380                                 Int2 lo = Int2(x);
7381                                 Int2 hi = Int2(Swizzle(x, 0xEE));
7382
7383                                 lo = x86::psrad(lo, y);
7384                                 hi = x86::psrad(hi, y);
7385
7386                                 return Int4(lo, hi);
7387                         }
7388                 }
7389
7390                 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
7391                 {
7392                         llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_d);
7393
7394                         return As<UInt2>(V(::builder->CreateCall2(psrld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7395                 }
7396
7397                 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
7398                 {
7399                         if(CPUID::supportsSSE2())
7400                         {
7401                                 llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_d);
7402
7403                                 return RValue<UInt4>(V(::builder->CreateCall2(psrld, x.value, V(Nucleus::createConstantInt(y)))));
7404                         }
7405                         else
7406                         {
7407                                 UInt2 lo = As<UInt2>(Int2(As<Int4>(x)));
7408                                 UInt2 hi = As<UInt2>(Int2(Swizzle(As<Int4>(x), 0xEE)));
7409
7410                                 lo = x86::psrld(lo, y);
7411                                 hi = x86::psrld(hi, y);
7412
7413                                 return UInt4(lo, hi);
7414                         }
7415                 }
7416
7417                 RValue<UShort4> psrlw(RValue<UShort4> x, RValue<Long1> y)
7418                 {
7419                         llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrl_w);
7420
7421                         return As<UShort4>(V(::builder->CreateCall2(psrlw, As<MMX>(x).value, As<MMX>(y).value)));
7422                 }
7423
7424                 RValue<Short4> psraw(RValue<Short4> x, RValue<Long1> y)
7425                 {
7426                         llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psra_w);
7427
7428                         return As<Short4>(V(::builder->CreateCall2(psraw, As<MMX>(x).value, As<MMX>(y).value)));
7429                 }
7430
7431                 RValue<Short4> psllw(RValue<Short4> x, RValue<Long1> y)
7432                 {
7433                         llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psll_w);
7434
7435                         return As<Short4>(V(::builder->CreateCall2(psllw, As<MMX>(x).value, As<MMX>(y).value)));
7436                 }
7437
7438                 RValue<Int2> pslld(RValue<Int2> x, RValue<Long1> y)
7439                 {
7440                         llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psll_d);
7441
7442                         return As<Int2>(V(::builder->CreateCall2(pslld, As<MMX>(x).value, As<MMX>(y).value)));
7443                 }
7444
7445                 RValue<UInt2> psrld(RValue<UInt2> x, RValue<Long1> y)
7446                 {
7447                         llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrl_d);
7448
7449                         return As<UInt2>(V(::builder->CreateCall2(psrld, As<MMX>(x).value, As<MMX>(y).value)));
7450                 }
7451
7452                 RValue<Int2> psrad(RValue<Int2> x, RValue<Long1> y)
7453                 {
7454                         llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psra_d);
7455
7456                         return As<Int2>(V(::builder->CreateCall2(psrld, As<MMX>(x).value, As<MMX>(y).value)));
7457                 }
7458
7459                 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
7460                 {
7461                         llvm::Function *pmaxsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxsd);
7462
7463                         return RValue<Int4>(V(::builder->CreateCall2(pmaxsd, x.value, y.value)));
7464                 }
7465
7466                 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
7467                 {
7468                         llvm::Function *pminsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminsd);
7469
7470                         return RValue<Int4>(V(::builder->CreateCall2(pminsd, x.value, y.value)));
7471                 }
7472
7473                 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
7474                 {
7475                         llvm::Function *pmaxud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxud);
7476
7477                         return RValue<UInt4>(V(::builder->CreateCall2(pmaxud, x.value, y.value)));
7478                 }
7479
7480                 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
7481                 {
7482                         llvm::Function *pminud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminud);
7483
7484                         return RValue<UInt4>(V(::builder->CreateCall2(pminud, x.value, y.value)));
7485                 }
7486
7487                 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
7488                 {
7489                         llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulh_w);
7490
7491                         return As<Short4>(V(::builder->CreateCall2(pmulhw, As<MMX>(x).value, As<MMX>(y).value)));
7492                 }
7493
7494                 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
7495                 {
7496                         llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulhu_w);
7497
7498                         return As<UShort4>(V(::builder->CreateCall2(pmulhuw, As<MMX>(x).value, As<MMX>(y).value)));
7499                 }
7500
7501                 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
7502                 {
7503                         llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmadd_wd);
7504
7505                         return As<Int2>(V(::builder->CreateCall2(pmaddwd, As<MMX>(x).value, As<MMX>(y).value)));
7506                 }
7507
7508                 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
7509                 {
7510                         llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulh_w);
7511
7512                         return RValue<Short8>(V(::builder->CreateCall2(pmulhw, x.value, y.value)));
7513                 }
7514
7515                 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
7516                 {
7517                         llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulhu_w);
7518
7519                         return RValue<UShort8>(V(::builder->CreateCall2(pmulhuw, x.value, y.value)));
7520                 }
7521
7522                 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
7523                 {
7524                         llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmadd_wd);
7525
7526                         return RValue<Int4>(V(::builder->CreateCall2(pmaddwd, x.value, y.value)));
7527                 }
7528
7529                 RValue<Int> movmskps(RValue<Float4> x)
7530                 {
7531                         llvm::Function *movmskps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_movmsk_ps);
7532
7533                         return RValue<Int>(V(::builder->CreateCall(movmskps, x.value)));
7534                 }
7535
7536                 RValue<Int> pmovmskb(RValue<Byte8> x)
7537                 {
7538                         llvm::Function *pmovmskb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmovmskb);
7539
7540                         return RValue<Int>(V(::builder->CreateCall(pmovmskb, As<MMX>(x).value)));
7541                 }
7542
7543                 //RValue<Int2> movd(RValue<Pointer<Int>> x)
7544                 //{
7545                 //      Value *element = Nucleus::createLoad(x.value);
7546
7547                 ////    Value *int2 = UndefValue::get(Int2::getType());
7548                 ////    int2 = Nucleus::createInsertElement(int2, element, ConstantInt::get(Int::getType(), 0));
7549
7550                 //      Value *int2 = Nucleus::createBitCast(Nucleus::createZExt(element, Long::getType()), Int2::getType());
7551
7552                 //      return RValue<Int2>(int2);
7553                 //}
7554
7555                 //RValue<Int2> movdq2q(RValue<Int4> x)
7556                 //{
7557                 //      Value *long2 = Nucleus::createBitCast(x.value, T(VectorType::get(Long::getType(), 2)));
7558                 //      Value *element = Nucleus::createExtractElement(long2, ConstantInt::get(Int::getType(), 0));
7559
7560                 //      return RValue<Int2>(Nucleus::createBitCast(element, Int2::getType()));
7561                 //}
7562
7563                 RValue<Int4> pmovzxbd(RValue<Int4> x)
7564                 {
7565                         llvm::Function *pmovzxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxbd);
7566
7567                         return RValue<Int4>(V(::builder->CreateCall(pmovzxbd, Nucleus::createBitCast(x.value, Byte16::getType()))));
7568                 }
7569
7570                 RValue<Int4> pmovsxbd(RValue<Int4> x)
7571                 {
7572                         llvm::Function *pmovsxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxbd);
7573
7574                         return RValue<Int4>(V(::builder->CreateCall(pmovsxbd, Nucleus::createBitCast(x.value, SByte16::getType()))));
7575                 }
7576
7577                 RValue<Int4> pmovzxwd(RValue<Int4> x)
7578                 {
7579                         llvm::Function *pmovzxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxwd);
7580
7581                         return RValue<Int4>(V(::builder->CreateCall(pmovzxwd, Nucleus::createBitCast(x.value, UShort8::getType()))));
7582                 }
7583
7584                 RValue<Int4> pmovsxwd(RValue<Int4> x)
7585                 {
7586                         llvm::Function *pmovsxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxwd);
7587
7588                         return RValue<Int4>(V(::builder->CreateCall(pmovsxwd, Nucleus::createBitCast(x.value, Short8::getType()))));
7589                 }
7590
7591                 void emms()
7592                 {
7593                         llvm::Function *emms = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_emms);
7594
7595                         V(::builder->CreateCall(emms));
7596                 }
7597         }
7598 }