src/Reactor/LLVMReactor.cpp

   1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //    http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "Nucleus.hpp"
  16
  17 #include "llvm/Support/IRBuilder.h"
  18 #include "llvm/Function.h"
  19 #include "llvm/GlobalVariable.h"
  20 #include "llvm/Module.h"
  21 #include "llvm/LLVMContext.h"
  22 #include "llvm/Constants.h"
  23 #include "llvm/Intrinsics.h"
  24 #include "llvm/PassManager.h"
  25 #include "llvm/Analysis/LoopPass.h"
  26 #include "llvm/Transforms/Scalar.h"
  27 #include "llvm/Target/TargetData.h"
  28 #include "llvm/Target/TargetOptions.h"
  29 #include "llvm/Support/TargetSelect.h"
  30 #include "../lib/ExecutionEngine/JIT/JIT.h"
  31
  32 #include "LLVMRoutine.hpp"
  33 #include "LLVMRoutineManager.hpp"
  34 #include "x86.hpp"
  35 #include "CPUID.hpp"
  36 #include "Thread.hpp"
  37 #include "Memory.hpp"
  38 #include "MutexLock.hpp"
  39
  40 #include <xmmintrin.h>
  41 #include <fstream>
  42
  43 #if defined(__x86_64__) && defined(_WIN32)
  44 extern "C" void X86CompilationCallback()
  45 {
  46         assert(false);   // UNIMPLEMENTED
  47 }
  48 #endif
  49
  50 extern "C"
  51 {
  52         bool (*CodeAnalystInitialize)() = 0;
  53         void (*CodeAnalystCompleteJITLog)() = 0;
  54         bool (*CodeAnalystLogJITCode)(const void *jitCodeStartAddr, unsigned int jitCodeSize, const wchar_t *functionName) = 0;
  55 }
  56
  57 namespace llvm
  58 {
  59         extern bool JITEmitDebugInfo;
  60 }
  61
  62 namespace
  63 {
  64         sw::LLVMRoutineManager *routineManager = nullptr;
  65         llvm::ExecutionEngine *executionEngine = nullptr;
  66         llvm::IRBuilder<> *builder = nullptr;
  67         llvm::LLVMContext *context = nullptr;
  68         llvm::Module *module = nullptr;
  69         llvm::Function *function = nullptr;
  70
  71         sw::BackoffLock codegenMutex;
  72 }
  73
  74 namespace sw
  75 {
  76         using namespace llvm;
  77
  78         Optimization optimization[10] = {InstructionCombining, Disabled};
  79
  80         class Type : public llvm::Type {};
  81         class Value : public llvm::Value {};
  82         class SwitchCases : public llvm::SwitchInst {};
  83         class BasicBlock : public llvm::BasicBlock {};
  84
  85         inline Type *T(llvm::Type *t)
  86         {
  87                 return reinterpret_cast<Type*>(t);
  88         }
  89
  90         inline Value *V(llvm::Value *t)
  91         {
  92                 return reinterpret_cast<Value*>(t);
  93         }
  94
  95         inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
  96         {
  97                 return reinterpret_cast<std::vector<llvm::Type*>&>(t);
  98         }
  99
 100         inline BasicBlock *B(llvm::BasicBlock *t)
 101         {
 102                 return reinterpret_cast<BasicBlock*>(t);
 103         }
 104
 105         Nucleus::Nucleus()
 106         {
 107                 ::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
 108
 109                 InitializeNativeTarget();
 110                 JITEmitDebugInfo = false;
 111
 112                 if(!::context)
 113                 {
 114                         ::context = new LLVMContext();
 115                 }
 116
 117                 ::module = new Module("", *::context);
 118                 ::routineManager = new LLVMRoutineManager();
 119
 120                 #if defined(__x86_64__)
 121                         const char *architecture = "x86-64";
 122                 #else
 123                         const char *architecture = "x86";
 124                 #endif
 125
 126                 SmallVector<std::string, 1> MAttrs;
 127                 MAttrs.push_back(CPUID::supportsMMX()    ? "+mmx"   : "-mmx");
 128                 MAttrs.push_back(CPUID::supportsCMOV()   ? "+cmov"  : "-cmov");
 129                 MAttrs.push_back(CPUID::supportsSSE()    ? "+sse"   : "-sse");
 130                 MAttrs.push_back(CPUID::supportsSSE2()   ? "+sse2"  : "-sse2");
 131                 MAttrs.push_back(CPUID::supportsSSE3()   ? "+sse3"  : "-sse3");
 132                 MAttrs.push_back(CPUID::supportsSSSE3()  ? "+ssse3" : "-ssse3");
 133                 MAttrs.push_back(CPUID::supportsSSE4_1() ? "+sse41" : "-sse41");
 134
 135                 std::string error;
 136                 TargetMachine *targetMachine = EngineBuilder::selectTarget(::module, architecture, "", MAttrs, Reloc::Default, CodeModel::JITDefault, &error);
 137                 ::executionEngine = JIT::createJIT(::module, 0, ::routineManager, CodeGenOpt::Aggressive, true, targetMachine);
 138
 139                 if(!::builder)
 140                 {
 141                         ::builder = new IRBuilder<>(*::context);
 142
 143                         #if defined(_WIN32)
 144                                 HMODULE CodeAnalyst = LoadLibrary("CAJitNtfyLib.dll");
 145                                 if(CodeAnalyst)
 146                                 {
 147                                         CodeAnalystInitialize = (bool(*)())GetProcAddress(CodeAnalyst, "CAJIT_Initialize");
 148                                         CodeAnalystCompleteJITLog = (void(*)())GetProcAddress(CodeAnalyst, "CAJIT_CompleteJITLog");
 149                                         CodeAnalystLogJITCode = (bool(*)(const void*, unsigned int, const wchar_t*))GetProcAddress(CodeAnalyst, "CAJIT_LogJITCode");
 150
 151                                         CodeAnalystInitialize();
 152                                 }
 153                         #endif
 154                 }
 155         }
 156
 157         Nucleus::~Nucleus()
 158         {
 159                 delete ::executionEngine;
 160                 ::executionEngine = nullptr;
 161
 162                 ::routineManager = nullptr;
 163                 ::function = nullptr;
 164                 ::module = nullptr;
 165
 166                 ::codegenMutex.unlock();
 167         }
 168
 169         Routine *Nucleus::acquireRoutine(const wchar_t *name, bool runOptimizations)
 170         {
 171                 if(::builder->GetInsertBlock()->empty() || !::builder->GetInsertBlock()->back().isTerminator())
 172                 {
 173                         llvm::Type *type = ::function->getReturnType();
 174
 175                         if(type->isVoidTy())
 176                         {
 177                                 createRetVoid();
 178                         }
 179                         else
 180                         {
 181                                 createRet(V(UndefValue::get(type)));
 182                         }
 183                 }
 184
 185                 if(false)
 186                 {
 187                         std::string error;
 188                         raw_fd_ostream file("llvm-dump-unopt.txt", error);
 189                         ::module->print(file, 0);
 190                 }
 191
 192                 if(runOptimizations)
 193                 {
 194                         optimize();
 195                 }
 196
 197                 if(false)
 198                 {
 199                         std::string error;
 200                         raw_fd_ostream file("llvm-dump-opt.txt", error);
 201                         ::module->print(file, 0);
 202                 }
 203
 204                 void *entry = ::executionEngine->getPointerToFunction(::function);
 205                 LLVMRoutine *routine = ::routineManager->acquireRoutine(entry);
 206
 207                 if(CodeAnalystLogJITCode)
 208                 {
 209                         CodeAnalystLogJITCode(routine->getEntry(), routine->getCodeSize(), name);
 210                 }
 211
 212                 return routine;
 213         }
 214
 215         void Nucleus::optimize()
 216         {
 217                 static PassManager *passManager = nullptr;
 218
 219                 if(!passManager)
 220                 {
 221                         passManager = new PassManager();
 222
 223                         UnsafeFPMath = true;
 224                 //      NoInfsFPMath = true;
 225                 //      NoNaNsFPMath = true;
 226
 227                         passManager->add(new TargetData(*::executionEngine->getTargetData()));
 228                         passManager->add(createScalarReplAggregatesPass());
 229
 230                         for(int pass = 0; pass < 10 && optimization[pass] != Disabled; pass++)
 231                         {
 232                                 switch(optimization[pass])
 233                                 {
 234                                 case Disabled:                                                                 break;
 235                                 case CFGSimplification:    passManager->add(createCFGSimplificationPass());    break;
 236                                 case LICM:                 passManager->add(createLICMPass());                 break;
 237                                 case AggressiveDCE:        passManager->add(createAggressiveDCEPass());        break;
 238                                 case GVN:                  passManager->add(createGVNPass());                  break;
 239                                 case InstructionCombining: passManager->add(createInstructionCombiningPass()); break;
 240                                 case Reassociate:          passManager->add(createReassociatePass());          break;
 241                                 case DeadStoreElimination: passManager->add(createDeadStoreEliminationPass()); break;
 242                                 case SCCP:                 passManager->add(createSCCPPass());                 break;
 243                                 case ScalarReplAggregates: passManager->add(createScalarReplAggregatesPass()); break;
 244                                 default:
 245                                         assert(false);
 246                                 }
 247                         }
 248                 }
 249
 250                 passManager->run(*::module);
 251         }
 252
 253         Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
 254         {
 255                 // Need to allocate it in the entry block for mem2reg to work
 256                 llvm::BasicBlock &entryBlock = ::function->getEntryBlock();
 257
 258                 Instruction *declaration;
 259
 260                 if(arraySize)
 261                 {
 262                         declaration = new AllocaInst(type, Nucleus::createConstantInt(arraySize));
 263                 }
 264                 else
 265                 {
 266                         declaration = new AllocaInst(type, (Value*)0);
 267                 }
 268
 269                 entryBlock.getInstList().push_front(declaration);
 270
 271                 return V(declaration);
 272         }
 273
 274         BasicBlock *Nucleus::createBasicBlock()
 275         {
 276                 return B(BasicBlock::Create(*::context, "", ::function));
 277         }
 278
 279         BasicBlock *Nucleus::getInsertBlock()
 280         {
 281                 return B(::builder->GetInsertBlock());
 282         }
 283
 284         void Nucleus::setInsertBlock(BasicBlock *basicBlock)
 285         {
 286         //      assert(::builder->GetInsertBlock()->back().isTerminator());
 287                 return ::builder->SetInsertPoint(basicBlock);
 288         }
 289
 290         void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
 291         {
 292                 llvm::FunctionType *functionType = llvm::FunctionType::get(ReturnType, T(Params), false);
 293                 ::function = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, "", ::module);
 294                 ::function->setCallingConv(llvm::CallingConv::C);
 295
 296                 ::builder->SetInsertPoint(BasicBlock::Create(*::context, "", ::function));
 297         }
 298
 299         Value *Nucleus::getArgument(unsigned int index)
 300         {
 301                 llvm::Function::arg_iterator args = ::function->arg_begin();
 302
 303                 while(index)
 304                 {
 305                         args++;
 306                         index--;
 307                 }
 308
 309                 return V(&*args);
 310         }
 311
 312         void Nucleus::createRetVoid()
 313         {
 314                 x86::emms();
 315
 316                 ::builder->CreateRetVoid();
 317         }
 318
 319         void Nucleus::createRet(Value *v)
 320         {
 321                 x86::emms();
 322
 323                 ::builder->CreateRet(v);
 324         }
 325
 326         void Nucleus::createBr(BasicBlock *dest)
 327         {
 328                 ::builder->CreateBr(dest);
 329         }
 330
 331         void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
 332         {
 333                 ::builder->CreateCondBr(cond, ifTrue, ifFalse);
 334         }
 335
 336         Value *Nucleus::createAdd(Value *lhs, Value *rhs)
 337         {
 338                 return V(::builder->CreateAdd(lhs, rhs));
 339         }
 340
 341         Value *Nucleus::createSub(Value *lhs, Value *rhs)
 342         {
 343                 return V(::builder->CreateSub(lhs, rhs));
 344         }
 345
 346         Value *Nucleus::createMul(Value *lhs, Value *rhs)
 347         {
 348                 return V(::builder->CreateMul(lhs, rhs));
 349         }
 350
 351         Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
 352         {
 353                 return V(::builder->CreateUDiv(lhs, rhs));
 354         }
 355
 356         Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
 357         {
 358                 return V(::builder->CreateSDiv(lhs, rhs));
 359         }
 360
 361         Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
 362         {
 363                 return V(::builder->CreateFAdd(lhs, rhs));
 364         }
 365
 366         Value *Nucleus::createFSub(Value *lhs, Value *rhs)
 367         {
 368                 return V(::builder->CreateFSub(lhs, rhs));
 369         }
 370
 371         Value *Nucleus::createFMul(Value *lhs, Value *rhs)
 372         {
 373                 return V(::builder->CreateFMul(lhs, rhs));
 374         }
 375
 376         Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
 377         {
 378                 return V(::builder->CreateFDiv(lhs, rhs));
 379         }
 380
 381         Value *Nucleus::createURem(Value *lhs, Value *rhs)
 382         {
 383                 return V(::builder->CreateURem(lhs, rhs));
 384         }
 385
 386         Value *Nucleus::createSRem(Value *lhs, Value *rhs)
 387         {
 388                 return V(::builder->CreateSRem(lhs, rhs));
 389         }
 390
 391         Value *Nucleus::createFRem(Value *lhs, Value *rhs)
 392         {
 393                 return V(::builder->CreateFRem(lhs, rhs));
 394         }
 395
 396         Value *Nucleus::createShl(Value *lhs, Value *rhs)
 397         {
 398                 return V(::builder->CreateShl(lhs, rhs));
 399         }
 400
 401         Value *Nucleus::createLShr(Value *lhs, Value *rhs)
 402         {
 403                 return V(::builder->CreateLShr(lhs, rhs));
 404         }
 405
 406         Value *Nucleus::createAShr(Value *lhs, Value *rhs)
 407         {
 408                 return V(::builder->CreateAShr(lhs, rhs));
 409         }
 410
 411         Value *Nucleus::createAnd(Value *lhs, Value *rhs)
 412         {
 413                 return V(::builder->CreateAnd(lhs, rhs));
 414         }
 415
 416         Value *Nucleus::createOr(Value *lhs, Value *rhs)
 417         {
 418                 return V(::builder->CreateOr(lhs, rhs));
 419         }
 420
 421         Value *Nucleus::createXor(Value *lhs, Value *rhs)
 422         {
 423                 return V(::builder->CreateXor(lhs, rhs));
 424         }
 425
 426         Value *Nucleus::createNeg(Value *v)
 427         {
 428                 return V(::builder->CreateNeg(v));
 429         }
 430
 431         Value *Nucleus::createFNeg(Value *v)
 432         {
 433                 return V(::builder->CreateFNeg(v));
 434         }
 435
 436         Value *Nucleus::createNot(Value *v)
 437         {
 438                 return V(::builder->CreateNot(v));
 439         }
 440
 441         Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align)
 442         {
 443                 assert(ptr->getType()->getContainedType(0) == type);
 444                 return V(::builder->Insert(new LoadInst(ptr, "", isVolatile, align)));
 445         }
 446
 447         Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align)
 448         {
 449                 assert(ptr->getType()->getContainedType(0) == type);
 450                 ::builder->Insert(new StoreInst(value, ptr, isVolatile, align));
 451                 return value;
 452         }
 453
 454         Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index)
 455         {
 456                 assert(ptr->getType()->getContainedType(0) == type);
 457                 return V(::builder->CreateGEP(ptr, index));
 458         }
 459
 460         Value *Nucleus::createAtomicAdd(Value *ptr, Value *value)
 461         {
 462                 return V(::builder->CreateAtomicRMW(AtomicRMWInst::Add, ptr, value, SequentiallyConsistent));
 463         }
 464
 465         Value *Nucleus::createTrunc(Value *v, Type *destType)
 466         {
 467                 return V(::builder->CreateTrunc(v, destType));
 468         }
 469
 470         Value *Nucleus::createZExt(Value *v, Type *destType)
 471         {
 472                 return V(::builder->CreateZExt(v, destType));
 473         }
 474
 475         Value *Nucleus::createSExt(Value *v, Type *destType)
 476         {
 477                 return V(::builder->CreateSExt(v, destType));
 478         }
 479
 480         Value *Nucleus::createFPToSI(Value *v, Type *destType)
 481         {
 482                 return V(::builder->CreateFPToSI(v, destType));
 483         }
 484
 485         Value *Nucleus::createSIToFP(Value *v, Type *destType)
 486         {
 487                 return V(::builder->CreateSIToFP(v, destType));
 488         }
 489
 490         Value *Nucleus::createFPTrunc(Value *v, Type *destType)
 491         {
 492                 return V(::builder->CreateFPTrunc(v, destType));
 493         }
 494
 495         Value *Nucleus::createFPExt(Value *v, Type *destType)
 496         {
 497                 return V(::builder->CreateFPExt(v, destType));
 498         }
 499
 500         Value *Nucleus::createBitCast(Value *v, Type *destType)
 501         {
 502                 return V(::builder->CreateBitCast(v, destType));
 503         }
 504
 505         Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
 506         {
 507                 return V(::builder->CreateICmpEQ(lhs, rhs));
 508         }
 509
 510         Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
 511         {
 512                 return V(::builder->CreateICmpNE(lhs, rhs));
 513         }
 514
 515         Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
 516         {
 517                 return V(::builder->CreateICmpUGT(lhs, rhs));
 518         }
 519
 520         Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
 521         {
 522                 return V(::builder->CreateICmpUGE(lhs, rhs));
 523         }
 524
 525         Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
 526         {
 527                 return V(::builder->CreateICmpULT(lhs, rhs));
 528         }
 529
 530         Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
 531         {
 532                 return V(::builder->CreateICmpULE(lhs, rhs));
 533         }
 534
 535         Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
 536         {
 537                 return V(::builder->CreateICmpSGT(lhs, rhs));
 538         }
 539
 540         Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
 541         {
 542                 return V(::builder->CreateICmpSGE(lhs, rhs));
 543         }
 544
 545         Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
 546         {
 547                 return V(::builder->CreateICmpSLT(lhs, rhs));
 548         }
 549
 550         Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
 551         {
 552                 return V(::builder->CreateICmpSLE(lhs, rhs));
 553         }
 554
 555         Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
 556         {
 557                 return V(::builder->CreateFCmpOEQ(lhs, rhs));
 558         }
 559
 560         Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
 561         {
 562                 return V(::builder->CreateFCmpOGT(lhs, rhs));
 563         }
 564
 565         Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
 566         {
 567                 return V(::builder->CreateFCmpOGE(lhs, rhs));
 568         }
 569
 570         Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
 571         {
 572                 return V(::builder->CreateFCmpOLT(lhs, rhs));
 573         }
 574
 575         Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
 576         {
 577                 return V(::builder->CreateFCmpOLE(lhs, rhs));
 578         }
 579
 580         Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
 581         {
 582                 return V(::builder->CreateFCmpONE(lhs, rhs));
 583         }
 584
 585         Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
 586         {
 587                 return V(::builder->CreateFCmpORD(lhs, rhs));
 588         }
 589
 590         Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
 591         {
 592                 return V(::builder->CreateFCmpUNO(lhs, rhs));
 593         }
 594
 595         Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
 596         {
 597                 return V(::builder->CreateFCmpUEQ(lhs, rhs));
 598         }
 599
 600         Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
 601         {
 602                 return V(::builder->CreateFCmpUGT(lhs, rhs));
 603         }
 604
 605         Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
 606         {
 607                 return V(::builder->CreateFCmpUGE(lhs, rhs));
 608         }
 609
 610         Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
 611         {
 612                 return V(::builder->CreateFCmpULT(lhs, rhs));
 613         }
 614
 615         Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
 616         {
 617                 return V(::builder->CreateFCmpULE(lhs, rhs));
 618         }
 619
 620         Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
 621         {
 622                 return V(::builder->CreateFCmpULE(lhs, rhs));
 623         }
 624
 625         Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
 626         {
 627                 assert(vector->getType()->getContainedType(0) == type);
 628                 return V(::builder->CreateExtractElement(vector, createConstantInt(index)));
 629         }
 630
 631         Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
 632         {
 633                 return V(::builder->CreateInsertElement(vector, element, createConstantInt(index)));
 634         }
 635
 636         Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
 637         {
 638                 int size = llvm::cast<llvm::VectorType>(V1->getType())->getNumElements();
 639                 const int maxSize = 16;
 640                 llvm::Constant *swizzle[maxSize];
 641                 assert(size <= maxSize);
 642
 643                 for(int i = 0; i < size; i++)
 644                 {
 645                         swizzle[i] = llvm::ConstantInt::get(Type::getInt32Ty(*::context), select[i]);
 646                 }
 647
 648                 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
 649
 650                 return V(::builder->CreateShuffleVector(V1, V2, shuffle));
 651         }
 652
 653         Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
 654         {
 655                 return V(::builder->CreateSelect(C, ifTrue, ifFalse));
 656         }
 657
 658         SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
 659         {
 660                 return reinterpret_cast<SwitchCases*>(::builder->CreateSwitch(control, defaultBranch, numCases));
 661         }
 662
 663         void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
 664         {
 665                 switchCases->addCase(llvm::ConstantInt::get(Type::getInt32Ty(*::context), label, true), branch);
 666         }
 667
 668         void Nucleus::createUnreachable()
 669         {
 670                 ::builder->CreateUnreachable();
 671         }
 672
 673         static Value *createSwizzle4(Value *val, unsigned char select)
 674         {
 675                 int swizzle[4] =
 676                 {
 677                         (select >> 0) & 0x03,
 678                         (select >> 2) & 0x03,
 679                         (select >> 4) & 0x03,
 680                         (select >> 6) & 0x03,
 681                 };
 682
 683                 return Nucleus::createShuffleVector(val, val, swizzle);
 684         }
 685
 686         static Value *createMask4(Value *lhs, Value *rhs, unsigned char select)
 687         {
 688                 bool mask[4] = {false, false, false, false};
 689
 690                 mask[(select >> 0) & 0x03] = true;
 691                 mask[(select >> 2) & 0x03] = true;
 692                 mask[(select >> 4) & 0x03] = true;
 693                 mask[(select >> 6) & 0x03] = true;
 694
 695                 int swizzle[4] =
 696                 {
 697                         mask[0] ? 4 : 0,
 698                         mask[1] ? 5 : 1,
 699                         mask[2] ? 6 : 2,
 700                         mask[3] ? 7 : 3,
 701                 };
 702
 703                 return Nucleus::createShuffleVector(lhs, rhs, swizzle);
 704         }
 705
 706         Type *Nucleus::getPointerType(Type *ElementType)
 707         {
 708                 return T(llvm::PointerType::get(ElementType, 0));
 709         }
 710
 711         Value *Nucleus::createNullValue(Type *Ty)
 712         {
 713                 return V(llvm::Constant::getNullValue(Ty));
 714         }
 715
 716         Value *Nucleus::createConstantLong(int64_t i)
 717         {
 718                 return V(llvm::ConstantInt::get(Type::getInt64Ty(*::context), i, true));
 719         }
 720
 721         Value *Nucleus::createConstantInt(int i)
 722         {
 723                 return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, true));
 724         }
 725
 726         Value *Nucleus::createConstantInt(unsigned int i)
 727         {
 728                 return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, false));
 729         }
 730
 731         Value *Nucleus::createConstantBool(bool b)
 732         {
 733                 return V(llvm::ConstantInt::get(Type::getInt1Ty(*::context), b));
 734         }
 735
 736         Value *Nucleus::createConstantByte(signed char i)
 737         {
 738                 return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, true));
 739         }
 740
 741         Value *Nucleus::createConstantByte(unsigned char i)
 742         {
 743                 return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, false));
 744         }
 745
 746         Value *Nucleus::createConstantShort(short i)
 747         {
 748                 return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, true));
 749         }
 750
 751         Value *Nucleus::createConstantShort(unsigned short i)
 752         {
 753                 return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, false));
 754         }
 755
 756         Value *Nucleus::createConstantFloat(float x)
 757         {
 758                 return V(llvm::ConstantFP::get(Float::getType(), x));
 759         }
 760
 761         Value *Nucleus::createNullPointer(Type *Ty)
 762         {
 763                 return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(Ty, 0)));
 764         }
 765
 766         Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
 767         {
 768                 assert(llvm::isa<VectorType>(type));
 769                 const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
 770                 assert(numConstants <= 16);
 771                 llvm::Constant *constantVector[16];
 772
 773                 for(int i = 0; i < numConstants; i++)
 774                 {
 775                         constantVector[i] = llvm::ConstantInt::get(type->getContainedType(0), constants[i]);
 776                 }
 777
 778                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
 779         }
 780
 781         Value *Nucleus::createConstantVector(const double *constants, Type *type)
 782         {
 783                 assert(llvm::isa<VectorType>(type));
 784                 const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
 785                 assert(numConstants <= 8);
 786                 llvm::Constant *constantVector[8];
 787
 788                 for(int i = 0; i < numConstants; i++)
 789                 {
 790                         constantVector[i] = llvm::ConstantFP::get(type->getContainedType(0), constants[i]);
 791                 }
 792
 793                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
 794         }
 795
 796         Type *Void::getType()
 797         {
 798                 return T(llvm::Type::getVoidTy(*::context));
 799         }
 800
 801         class MMX : public LValue<MMX>
 802         {
 803         public:
 804                 static Type *getType();
 805         };
 806
 807         Type *MMX::getType()
 808         {
 809                 return T(llvm::Type::getX86_MMXTy(*::context));
 810         }
 811
 812         Bool::Bool(Argument<Bool> argument)
 813         {
 814                 storeValue(argument.value);
 815         }
 816
 817         Bool::Bool(bool x)
 818         {
 819                 storeValue(Nucleus::createConstantBool(x));
 820         }
 821
 822         Bool::Bool(RValue<Bool> rhs)
 823         {
 824                 storeValue(rhs.value);
 825         }
 826
 827         Bool::Bool(const Bool &rhs)
 828         {
 829                 Value *value = rhs.loadValue();
 830                 storeValue(value);
 831         }
 832
 833         Bool::Bool(const Reference<Bool> &rhs)
 834         {
 835                 Value *value = rhs.loadValue();
 836                 storeValue(value);
 837         }
 838
 839         RValue<Bool> Bool::operator=(RValue<Bool> rhs)
 840         {
 841                 storeValue(rhs.value);
 842
 843                 return rhs;
 844         }
 845
 846         RValue<Bool> Bool::operator=(const Bool &rhs)
 847         {
 848                 Value *value = rhs.loadValue();
 849                 storeValue(value);
 850
 851                 return RValue<Bool>(value);
 852         }
 853
 854         RValue<Bool> Bool::operator=(const Reference<Bool> &rhs)
 855         {
 856                 Value *value = rhs.loadValue();
 857                 storeValue(value);
 858
 859                 return RValue<Bool>(value);
 860         }
 861
 862         RValue<Bool> operator!(RValue<Bool> val)
 863         {
 864                 return RValue<Bool>(Nucleus::createNot(val.value));
 865         }
 866
 867         RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs)
 868         {
 869                 return RValue<Bool>(Nucleus::createAnd(lhs.value, rhs.value));
 870         }
 871
 872         RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs)
 873         {
 874                 return RValue<Bool>(Nucleus::createOr(lhs.value, rhs.value));
 875         }
 876
 877         Type *Bool::getType()
 878         {
 879                 return T(llvm::Type::getInt1Ty(*::context));
 880         }
 881
 882         Byte::Byte(Argument<Byte> argument)
 883         {
 884                 storeValue(argument.value);
 885         }
 886
 887         Byte::Byte(RValue<Int> cast)
 888         {
 889                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 890
 891                 storeValue(integer);
 892         }
 893
 894         Byte::Byte(RValue<UInt> cast)
 895         {
 896                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 897
 898                 storeValue(integer);
 899         }
 900
 901         Byte::Byte(RValue<UShort> cast)
 902         {
 903                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
 904
 905                 storeValue(integer);
 906         }
 907
 908         Byte::Byte(int x)
 909         {
 910                 storeValue(Nucleus::createConstantByte((unsigned char)x));
 911         }
 912
 913         Byte::Byte(unsigned char x)
 914         {
 915                 storeValue(Nucleus::createConstantByte(x));
 916         }
 917
 918         Byte::Byte(RValue<Byte> rhs)
 919         {
 920                 storeValue(rhs.value);
 921         }
 922
 923         Byte::Byte(const Byte &rhs)
 924         {
 925                 Value *value = rhs.loadValue();
 926                 storeValue(value);
 927         }
 928
 929         Byte::Byte(const Reference<Byte> &rhs)
 930         {
 931                 Value *value = rhs.loadValue();
 932                 storeValue(value);
 933         }
 934
 935         RValue<Byte> Byte::operator=(RValue<Byte> rhs)
 936         {
 937                 storeValue(rhs.value);
 938
 939                 return rhs;
 940         }
 941
 942         RValue<Byte> Byte::operator=(const Byte &rhs)
 943         {
 944                 Value *value = rhs.loadValue();
 945                 storeValue(value);
 946
 947                 return RValue<Byte>(value);
 948         }
 949
 950         RValue<Byte> Byte::operator=(const Reference<Byte> &rhs)
 951         {
 952                 Value *value = rhs.loadValue();
 953                 storeValue(value);
 954
 955                 return RValue<Byte>(value);
 956         }
 957
 958         RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs)
 959         {
 960                 return RValue<Byte>(Nucleus::createAdd(lhs.value, rhs.value));
 961         }
 962
 963         RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs)
 964         {
 965                 return RValue<Byte>(Nucleus::createSub(lhs.value, rhs.value));
 966         }
 967
 968         RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs)
 969         {
 970                 return RValue<Byte>(Nucleus::createMul(lhs.value, rhs.value));
 971         }
 972
 973         RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs)
 974         {
 975                 return RValue<Byte>(Nucleus::createUDiv(lhs.value, rhs.value));
 976         }
 977
 978         RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs)
 979         {
 980                 return RValue<Byte>(Nucleus::createURem(lhs.value, rhs.value));
 981         }
 982
 983         RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs)
 984         {
 985                 return RValue<Byte>(Nucleus::createAnd(lhs.value, rhs.value));
 986         }
 987
 988         RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs)
 989         {
 990                 return RValue<Byte>(Nucleus::createOr(lhs.value, rhs.value));
 991         }
 992
 993         RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs)
 994         {
 995                 return RValue<Byte>(Nucleus::createXor(lhs.value, rhs.value));
 996         }
 997
 998         RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs)
 999         {
1000                 return RValue<Byte>(Nucleus::createShl(lhs.value, rhs.value));
1001         }
1002
1003         RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs)
1004         {
1005                 return RValue<Byte>(Nucleus::createLShr(lhs.value, rhs.value));
1006         }
1007
1008         RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs)
1009         {
1010                 return lhs = lhs + rhs;
1011         }
1012
1013         RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs)
1014         {
1015                 return lhs = lhs - rhs;
1016         }
1017
1018         RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs)
1019         {
1020                 return lhs = lhs * rhs;
1021         }
1022
1023         RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs)
1024         {
1025                 return lhs = lhs / rhs;
1026         }
1027
1028         RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs)
1029         {
1030                 return lhs = lhs % rhs;
1031         }
1032
1033         RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs)
1034         {
1035                 return lhs = lhs & rhs;
1036         }
1037
1038         RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs)
1039         {
1040                 return lhs = lhs | rhs;
1041         }
1042
1043         RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs)
1044         {
1045                 return lhs = lhs ^ rhs;
1046         }
1047
1048         RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs)
1049         {
1050                 return lhs = lhs << rhs;
1051         }
1052
1053         RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs)
1054         {
1055                 return lhs = lhs >> rhs;
1056         }
1057
1058         RValue<Byte> operator+(RValue<Byte> val)
1059         {
1060                 return val;
1061         }
1062
1063         RValue<Byte> operator-(RValue<Byte> val)
1064         {
1065                 return RValue<Byte>(Nucleus::createNeg(val.value));
1066         }
1067
1068         RValue<Byte> operator~(RValue<Byte> val)
1069         {
1070                 return RValue<Byte>(Nucleus::createNot(val.value));
1071         }
1072
1073         RValue<Byte> operator++(Byte &val, int)   // Post-increment
1074         {
1075                 RValue<Byte> res = val;
1076
1077                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantByte((unsigned char)1)));
1078                 val.storeValue(inc);
1079
1080                 return res;
1081         }
1082
1083         const Byte &operator++(Byte &val)   // Pre-increment
1084         {
1085                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantByte((unsigned char)1)));
1086                 val.storeValue(inc);
1087
1088                 return val;
1089         }
1090
1091         RValue<Byte> operator--(Byte &val, int)   // Post-decrement
1092         {
1093                 RValue<Byte> res = val;
1094
1095                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantByte((unsigned char)1)));
1096                 val.storeValue(inc);
1097
1098                 return res;
1099         }
1100
1101         const Byte &operator--(Byte &val)   // Pre-decrement
1102         {
1103                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantByte((unsigned char)1)));
1104                 val.storeValue(inc);
1105
1106                 return val;
1107         }
1108
1109         RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs)
1110         {
1111                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
1112         }
1113
1114         RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs)
1115         {
1116                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
1117         }
1118
1119         RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs)
1120         {
1121                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
1122         }
1123
1124         RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs)
1125         {
1126                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
1127         }
1128
1129         RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs)
1130         {
1131                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1132         }
1133
1134         RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs)
1135         {
1136                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1137         }
1138
1139         Type *Byte::getType()
1140         {
1141                 return T(llvm::Type::getInt8Ty(*::context));
1142         }
1143
1144         SByte::SByte(Argument<SByte> argument)
1145         {
1146                 storeValue(argument.value);
1147         }
1148
1149         SByte::SByte(RValue<Int> cast)
1150         {
1151                 Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
1152
1153                 storeValue(integer);
1154         }
1155
1156         SByte::SByte(RValue<Short> cast)
1157         {
1158                 Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
1159
1160                 storeValue(integer);
1161         }
1162
1163         SByte::SByte(signed char x)
1164         {
1165                 storeValue(Nucleus::createConstantByte(x));
1166         }
1167
1168         SByte::SByte(RValue<SByte> rhs)
1169         {
1170                 storeValue(rhs.value);
1171         }
1172
1173         SByte::SByte(const SByte &rhs)
1174         {
1175                 Value *value = rhs.loadValue();
1176                 storeValue(value);
1177         }
1178
1179         SByte::SByte(const Reference<SByte> &rhs)
1180         {
1181                 Value *value = rhs.loadValue();
1182                 storeValue(value);
1183         }
1184
1185         RValue<SByte> SByte::operator=(RValue<SByte> rhs)
1186         {
1187                 storeValue(rhs.value);
1188
1189                 return rhs;
1190         }
1191
1192         RValue<SByte> SByte::operator=(const SByte &rhs)
1193         {
1194                 Value *value = rhs.loadValue();
1195                 storeValue(value);
1196
1197                 return RValue<SByte>(value);
1198         }
1199
1200         RValue<SByte> SByte::operator=(const Reference<SByte> &rhs)
1201         {
1202                 Value *value = rhs.loadValue();
1203                 storeValue(value);
1204
1205                 return RValue<SByte>(value);
1206         }
1207
1208         RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs)
1209         {
1210                 return RValue<SByte>(Nucleus::createAdd(lhs.value, rhs.value));
1211         }
1212
1213         RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs)
1214         {
1215                 return RValue<SByte>(Nucleus::createSub(lhs.value, rhs.value));
1216         }
1217
1218         RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs)
1219         {
1220                 return RValue<SByte>(Nucleus::createMul(lhs.value, rhs.value));
1221         }
1222
1223         RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs)
1224         {
1225                 return RValue<SByte>(Nucleus::createSDiv(lhs.value, rhs.value));
1226         }
1227
1228         RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs)
1229         {
1230                 return RValue<SByte>(Nucleus::createSRem(lhs.value, rhs.value));
1231         }
1232
1233         RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs)
1234         {
1235                 return RValue<SByte>(Nucleus::createAnd(lhs.value, rhs.value));
1236         }
1237
1238         RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs)
1239         {
1240                 return RValue<SByte>(Nucleus::createOr(lhs.value, rhs.value));
1241         }
1242
1243         RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs)
1244         {
1245                 return RValue<SByte>(Nucleus::createXor(lhs.value, rhs.value));
1246         }
1247
1248         RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs)
1249         {
1250                 return RValue<SByte>(Nucleus::createShl(lhs.value, rhs.value));
1251         }
1252
1253         RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs)
1254         {
1255                 return RValue<SByte>(Nucleus::createAShr(lhs.value, rhs.value));
1256         }
1257
1258         RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs)
1259         {
1260                 return lhs = lhs + rhs;
1261         }
1262
1263         RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs)
1264         {
1265                 return lhs = lhs - rhs;
1266         }
1267
1268         RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs)
1269         {
1270                 return lhs = lhs * rhs;
1271         }
1272
1273         RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs)
1274         {
1275                 return lhs = lhs / rhs;
1276         }
1277
1278         RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs)
1279         {
1280                 return lhs = lhs % rhs;
1281         }
1282
1283         RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs)
1284         {
1285                 return lhs = lhs & rhs;
1286         }
1287
1288         RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs)
1289         {
1290                 return lhs = lhs | rhs;
1291         }
1292
1293         RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs)
1294         {
1295                 return lhs = lhs ^ rhs;
1296         }
1297
1298         RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs)
1299         {
1300                 return lhs = lhs << rhs;
1301         }
1302
1303         RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs)
1304         {
1305                 return lhs = lhs >> rhs;
1306         }
1307
1308         RValue<SByte> operator+(RValue<SByte> val)
1309         {
1310                 return val;
1311         }
1312
1313         RValue<SByte> operator-(RValue<SByte> val)
1314         {
1315                 return RValue<SByte>(Nucleus::createNeg(val.value));
1316         }
1317
1318         RValue<SByte> operator~(RValue<SByte> val)
1319         {
1320                 return RValue<SByte>(Nucleus::createNot(val.value));
1321         }
1322
1323         RValue<SByte> operator++(SByte &val, int)   // Post-increment
1324         {
1325                 RValue<SByte> res = val;
1326
1327                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantByte((signed char)1)));
1328                 val.storeValue(inc);
1329
1330                 return res;
1331         }
1332
1333         const SByte &operator++(SByte &val)   // Pre-increment
1334         {
1335                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantByte((signed char)1)));
1336                 val.storeValue(inc);
1337
1338                 return val;
1339         }
1340
1341         RValue<SByte> operator--(SByte &val, int)   // Post-decrement
1342         {
1343                 RValue<SByte> res = val;
1344
1345                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantByte((signed char)1)));
1346                 val.storeValue(inc);
1347
1348                 return res;
1349         }
1350
1351         const SByte &operator--(SByte &val)   // Pre-decrement
1352         {
1353                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantByte((signed char)1)));
1354                 val.storeValue(inc);
1355
1356                 return val;
1357         }
1358
1359         RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs)
1360         {
1361                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
1362         }
1363
1364         RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs)
1365         {
1366                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
1367         }
1368
1369         RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs)
1370         {
1371                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
1372         }
1373
1374         RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs)
1375         {
1376                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
1377         }
1378
1379         RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs)
1380         {
1381                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1382         }
1383
1384         RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs)
1385         {
1386                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1387         }
1388
1389         Type *SByte::getType()
1390         {
1391                 return T(llvm::Type::getInt8Ty(*::context));
1392         }
1393
1394         Short::Short(Argument<Short> argument)
1395         {
1396                 storeValue(argument.value);
1397         }
1398
1399         Short::Short(RValue<Int> cast)
1400         {
1401                 Value *integer = Nucleus::createTrunc(cast.value, Short::getType());
1402
1403                 storeValue(integer);
1404         }
1405
1406         Short::Short(short x)
1407         {
1408                 storeValue(Nucleus::createConstantShort(x));
1409         }
1410
1411         Short::Short(RValue<Short> rhs)
1412         {
1413                 storeValue(rhs.value);
1414         }
1415
1416         Short::Short(const Short &rhs)
1417         {
1418                 Value *value = rhs.loadValue();
1419                 storeValue(value);
1420         }
1421
1422         Short::Short(const Reference<Short> &rhs)
1423         {
1424                 Value *value = rhs.loadValue();
1425                 storeValue(value);
1426         }
1427
1428         RValue<Short> Short::operator=(RValue<Short> rhs)
1429         {
1430                 storeValue(rhs.value);
1431
1432                 return rhs;
1433         }
1434
1435         RValue<Short> Short::operator=(const Short &rhs)
1436         {
1437                 Value *value = rhs.loadValue();
1438                 storeValue(value);
1439
1440                 return RValue<Short>(value);
1441         }
1442
1443         RValue<Short> Short::operator=(const Reference<Short> &rhs)
1444         {
1445                 Value *value = rhs.loadValue();
1446                 storeValue(value);
1447
1448                 return RValue<Short>(value);
1449         }
1450
1451         RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs)
1452         {
1453                 return RValue<Short>(Nucleus::createAdd(lhs.value, rhs.value));
1454         }
1455
1456         RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs)
1457         {
1458                 return RValue<Short>(Nucleus::createSub(lhs.value, rhs.value));
1459         }
1460
1461         RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs)
1462         {
1463                 return RValue<Short>(Nucleus::createMul(lhs.value, rhs.value));
1464         }
1465
1466         RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs)
1467         {
1468                 return RValue<Short>(Nucleus::createSDiv(lhs.value, rhs.value));
1469         }
1470
1471         RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs)
1472         {
1473                 return RValue<Short>(Nucleus::createSRem(lhs.value, rhs.value));
1474         }
1475
1476         RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs)
1477         {
1478                 return RValue<Short>(Nucleus::createAnd(lhs.value, rhs.value));
1479         }
1480
1481         RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs)
1482         {
1483                 return RValue<Short>(Nucleus::createOr(lhs.value, rhs.value));
1484         }
1485
1486         RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs)
1487         {
1488                 return RValue<Short>(Nucleus::createXor(lhs.value, rhs.value));
1489         }
1490
1491         RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs)
1492         {
1493                 return RValue<Short>(Nucleus::createShl(lhs.value, rhs.value));
1494         }
1495
1496         RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs)
1497         {
1498                 return RValue<Short>(Nucleus::createAShr(lhs.value, rhs.value));
1499         }
1500
1501         RValue<Short> operator+=(Short &lhs, RValue<Short> rhs)
1502         {
1503                 return lhs = lhs + rhs;
1504         }
1505
1506         RValue<Short> operator-=(Short &lhs, RValue<Short> rhs)
1507         {
1508                 return lhs = lhs - rhs;
1509         }
1510
1511         RValue<Short> operator*=(Short &lhs, RValue<Short> rhs)
1512         {
1513                 return lhs = lhs * rhs;
1514         }
1515
1516         RValue<Short> operator/=(Short &lhs, RValue<Short> rhs)
1517         {
1518                 return lhs = lhs / rhs;
1519         }
1520
1521         RValue<Short> operator%=(Short &lhs, RValue<Short> rhs)
1522         {
1523                 return lhs = lhs % rhs;
1524         }
1525
1526         RValue<Short> operator&=(Short &lhs, RValue<Short> rhs)
1527         {
1528                 return lhs = lhs & rhs;
1529         }
1530
1531         RValue<Short> operator|=(Short &lhs, RValue<Short> rhs)
1532         {
1533                 return lhs = lhs | rhs;
1534         }
1535
1536         RValue<Short> operator^=(Short &lhs, RValue<Short> rhs)
1537         {
1538                 return lhs = lhs ^ rhs;
1539         }
1540
1541         RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs)
1542         {
1543                 return lhs = lhs << rhs;
1544         }
1545
1546         RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs)
1547         {
1548                 return lhs = lhs >> rhs;
1549         }
1550
1551         RValue<Short> operator+(RValue<Short> val)
1552         {
1553                 return val;
1554         }
1555
1556         RValue<Short> operator-(RValue<Short> val)
1557         {
1558                 return RValue<Short>(Nucleus::createNeg(val.value));
1559         }
1560
1561         RValue<Short> operator~(RValue<Short> val)
1562         {
1563                 return RValue<Short>(Nucleus::createNot(val.value));
1564         }
1565
1566         RValue<Short> operator++(Short &val, int)   // Post-increment
1567         {
1568                 RValue<Short> res = val;
1569
1570                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantShort((short)1)));
1571                 val.storeValue(inc);
1572
1573                 return res;
1574         }
1575
1576         const Short &operator++(Short &val)   // Pre-increment
1577         {
1578                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantShort((short)1)));
1579                 val.storeValue(inc);
1580
1581                 return val;
1582         }
1583
1584         RValue<Short> operator--(Short &val, int)   // Post-decrement
1585         {
1586                 RValue<Short> res = val;
1587
1588                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantShort((short)1)));
1589                 val.storeValue(inc);
1590
1591                 return res;
1592         }
1593
1594         const Short &operator--(Short &val)   // Pre-decrement
1595         {
1596                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantShort((short)1)));
1597                 val.storeValue(inc);
1598
1599                 return val;
1600         }
1601
1602         RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs)
1603         {
1604                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
1605         }
1606
1607         RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs)
1608         {
1609                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
1610         }
1611
1612         RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs)
1613         {
1614                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
1615         }
1616
1617         RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs)
1618         {
1619                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
1620         }
1621
1622         RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs)
1623         {
1624                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1625         }
1626
1627         RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs)
1628         {
1629                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1630         }
1631
1632         Type *Short::getType()
1633         {
1634                 return T(llvm::Type::getInt16Ty(*::context));
1635         }
1636
1637         UShort::UShort(Argument<UShort> argument)
1638         {
1639                 storeValue(argument.value);
1640         }
1641
1642         UShort::UShort(RValue<UInt> cast)
1643         {
1644                 Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
1645
1646                 storeValue(integer);
1647         }
1648
1649         UShort::UShort(RValue<Int> cast)
1650         {
1651                 Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
1652
1653                 storeValue(integer);
1654         }
1655
1656         UShort::UShort(unsigned short x)
1657         {
1658                 storeValue(Nucleus::createConstantShort(x));
1659         }
1660
1661         UShort::UShort(RValue<UShort> rhs)
1662         {
1663                 storeValue(rhs.value);
1664         }
1665
1666         UShort::UShort(const UShort &rhs)
1667         {
1668                 Value *value = rhs.loadValue();
1669                 storeValue(value);
1670         }
1671
1672         UShort::UShort(const Reference<UShort> &rhs)
1673         {
1674                 Value *value = rhs.loadValue();
1675                 storeValue(value);
1676         }
1677
1678         RValue<UShort> UShort::operator=(RValue<UShort> rhs)
1679         {
1680                 storeValue(rhs.value);
1681
1682                 return rhs;
1683         }
1684
1685         RValue<UShort> UShort::operator=(const UShort &rhs)
1686         {
1687                 Value *value = rhs.loadValue();
1688                 storeValue(value);
1689
1690                 return RValue<UShort>(value);
1691         }
1692
1693         RValue<UShort> UShort::operator=(const Reference<UShort> &rhs)
1694         {
1695                 Value *value = rhs.loadValue();
1696                 storeValue(value);
1697
1698                 return RValue<UShort>(value);
1699         }
1700
1701         RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs)
1702         {
1703                 return RValue<UShort>(Nucleus::createAdd(lhs.value, rhs.value));
1704         }
1705
1706         RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs)
1707         {
1708                 return RValue<UShort>(Nucleus::createSub(lhs.value, rhs.value));
1709         }
1710
1711         RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs)
1712         {
1713                 return RValue<UShort>(Nucleus::createMul(lhs.value, rhs.value));
1714         }
1715
1716         RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs)
1717         {
1718                 return RValue<UShort>(Nucleus::createUDiv(lhs.value, rhs.value));
1719         }
1720
1721         RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs)
1722         {
1723                 return RValue<UShort>(Nucleus::createURem(lhs.value, rhs.value));
1724         }
1725
1726         RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs)
1727         {
1728                 return RValue<UShort>(Nucleus::createAnd(lhs.value, rhs.value));
1729         }
1730
1731         RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs)
1732         {
1733                 return RValue<UShort>(Nucleus::createOr(lhs.value, rhs.value));
1734         }
1735
1736         RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs)
1737         {
1738                 return RValue<UShort>(Nucleus::createXor(lhs.value, rhs.value));
1739         }
1740
1741         RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs)
1742         {
1743                 return RValue<UShort>(Nucleus::createShl(lhs.value, rhs.value));
1744         }
1745
1746         RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs)
1747         {
1748                 return RValue<UShort>(Nucleus::createLShr(lhs.value, rhs.value));
1749         }
1750
1751         RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs)
1752         {
1753                 return lhs = lhs + rhs;
1754         }
1755
1756         RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs)
1757         {
1758                 return lhs = lhs - rhs;
1759         }
1760
1761         RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs)
1762         {
1763                 return lhs = lhs * rhs;
1764         }
1765
1766         RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs)
1767         {
1768                 return lhs = lhs / rhs;
1769         }
1770
1771         RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs)
1772         {
1773                 return lhs = lhs % rhs;
1774         }
1775
1776         RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs)
1777         {
1778                 return lhs = lhs & rhs;
1779         }
1780
1781         RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs)
1782         {
1783                 return lhs = lhs | rhs;
1784         }
1785
1786         RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs)
1787         {
1788                 return lhs = lhs ^ rhs;
1789         }
1790
1791         RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs)
1792         {
1793                 return lhs = lhs << rhs;
1794         }
1795
1796         RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs)
1797         {
1798                 return lhs = lhs >> rhs;
1799         }
1800
1801         RValue<UShort> operator+(RValue<UShort> val)
1802         {
1803                 return val;
1804         }
1805
1806         RValue<UShort> operator-(RValue<UShort> val)
1807         {
1808                 return RValue<UShort>(Nucleus::createNeg(val.value));
1809         }
1810
1811         RValue<UShort> operator~(RValue<UShort> val)
1812         {
1813                 return RValue<UShort>(Nucleus::createNot(val.value));
1814         }
1815
1816         RValue<UShort> operator++(UShort &val, int)   // Post-increment
1817         {
1818                 RValue<UShort> res = val;
1819
1820                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantShort((unsigned short)1)));
1821                 val.storeValue(inc);
1822
1823                 return res;
1824         }
1825
1826         const UShort &operator++(UShort &val)   // Pre-increment
1827         {
1828                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantShort((unsigned short)1)));
1829                 val.storeValue(inc);
1830
1831                 return val;
1832         }
1833
1834         RValue<UShort> operator--(UShort &val, int)   // Post-decrement
1835         {
1836                 RValue<UShort> res = val;
1837
1838                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantShort((unsigned short)1)));
1839                 val.storeValue(inc);
1840
1841                 return res;
1842         }
1843
1844         const UShort &operator--(UShort &val)   // Pre-decrement
1845         {
1846                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantShort((unsigned short)1)));
1847                 val.storeValue(inc);
1848
1849                 return val;
1850         }
1851
1852         RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs)
1853         {
1854                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
1855         }
1856
1857         RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs)
1858         {
1859                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
1860         }
1861
1862         RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs)
1863         {
1864                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
1865         }
1866
1867         RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs)
1868         {
1869                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
1870         }
1871
1872         RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs)
1873         {
1874                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1875         }
1876
1877         RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs)
1878         {
1879                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1880         }
1881
1882         Type *UShort::getType()
1883         {
1884                 return T(llvm::Type::getInt16Ty(*::context));
1885         }
1886
1887         Byte4::Byte4(RValue<Byte8> cast)
1888         {
1889                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), Int::getType()));
1890         }
1891
1892         Byte4::Byte4(const Reference<Byte4> &rhs)
1893         {
1894                 Value *value = rhs.loadValue();
1895                 storeValue(value);
1896         }
1897
1898         Type *Byte4::getType()
1899         {
1900                 #if 0
1901                         return T(VectorType::get(Byte::getType(), 4));
1902                 #else
1903                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
1904                 #endif
1905         }
1906
1907         Type *SByte4::getType()
1908         {
1909                 #if 0
1910                         return T(VectorType::get(SByte::getType(), 4));
1911                 #else
1912                         return Int::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
1913                 #endif
1914         }
1915
1916         Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
1917         {
1918                 int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
1919                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Byte::getType(), 8))));
1920
1921                 storeValue(Nucleus::createBitCast(vector, getType()));
1922         }
1923
1924         Byte8::Byte8(RValue<Byte8> rhs)
1925         {
1926                 storeValue(rhs.value);
1927         }
1928
1929         Byte8::Byte8(const Byte8 &rhs)
1930         {
1931                 Value *value = rhs.loadValue();
1932                 storeValue(value);
1933         }
1934
1935         Byte8::Byte8(const Reference<Byte8> &rhs)
1936         {
1937                 Value *value = rhs.loadValue();
1938                 storeValue(value);
1939         }
1940
1941         RValue<Byte8> Byte8::operator=(RValue<Byte8> rhs)
1942         {
1943                 storeValue(rhs.value);
1944
1945                 return rhs;
1946         }
1947
1948         RValue<Byte8> Byte8::operator=(const Byte8 &rhs)
1949         {
1950                 Value *value = rhs.loadValue();
1951                 storeValue(value);
1952
1953                 return RValue<Byte8>(value);
1954         }
1955
1956         RValue<Byte8> Byte8::operator=(const Reference<Byte8> &rhs)
1957         {
1958                 Value *value = rhs.loadValue();
1959                 storeValue(value);
1960
1961                 return RValue<Byte8>(value);
1962         }
1963
1964         RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs)
1965         {
1966                 if(CPUID::supportsMMX2())
1967                 {
1968                         return x86::paddb(lhs, rhs);
1969                 }
1970                 else
1971                 {
1972                         return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
1973                 }
1974         }
1975
1976         RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs)
1977         {
1978                 if(CPUID::supportsMMX2())
1979                 {
1980                         return x86::psubb(lhs, rhs);
1981                 }
1982                 else
1983                 {
1984                         return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
1985                 }
1986         }
1987
1988 //      RValue<Byte8> operator*(RValue<Byte8> lhs, RValue<Byte8> rhs)
1989 //      {
1990 //              return RValue<Byte8>(Nucleus::createMul(lhs.value, rhs.value));
1991 //      }
1992
1993 //      RValue<Byte8> operator/(RValue<Byte8> lhs, RValue<Byte8> rhs)
1994 //      {
1995 //              return RValue<Byte8>(Nucleus::createUDiv(lhs.value, rhs.value));
1996 //      }
1997
1998 //      RValue<Byte8> operator%(RValue<Byte8> lhs, RValue<Byte8> rhs)
1999 //      {
2000 //              return RValue<Byte8>(Nucleus::createURem(lhs.value, rhs.value));
2001 //      }
2002
2003         RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs)
2004         {
2005                 if(CPUID::supportsMMX2())
2006                 {
2007                         return As<Byte8>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
2008                 }
2009                 else
2010                 {
2011                         return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
2012                 }
2013         }
2014
2015         RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs)
2016         {
2017                 if(CPUID::supportsMMX2())
2018                 {
2019                         return As<Byte8>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
2020                 }
2021                 else
2022                 {
2023                         return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
2024                 }
2025         }
2026
2027         RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs)
2028         {
2029                 if(CPUID::supportsMMX2())
2030                 {
2031                         return As<Byte8>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
2032                 }
2033                 else
2034                 {
2035                         return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
2036                 }
2037         }
2038
2039 //      RValue<Byte8> operator<<(RValue<Byte8> lhs, unsigned char rhs)
2040 //      {
2041 //              return RValue<Byte8>(Nucleus::createShl(lhs.value, rhs.value));
2042 //      }
2043
2044 //      RValue<Byte8> operator>>(RValue<Byte8> lhs, unsigned char rhs)
2045 //      {
2046 //              return RValue<Byte8>(Nucleus::createLShr(lhs.value, rhs.value));
2047 //      }
2048
2049         RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs)
2050         {
2051                 return lhs = lhs + rhs;
2052         }
2053
2054         RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs)
2055         {
2056                 return lhs = lhs - rhs;
2057         }
2058
2059 //      RValue<Byte8> operator*=(Byte8 &lhs, RValue<Byte8> rhs)
2060 //      {
2061 //              return lhs = lhs * rhs;
2062 //      }
2063
2064 //      RValue<Byte8> operator/=(Byte8 &lhs, RValue<Byte8> rhs)
2065 //      {
2066 //              return lhs = lhs / rhs;
2067 //      }
2068
2069 //      RValue<Byte8> operator%=(Byte8 &lhs, RValue<Byte8> rhs)
2070 //      {
2071 //              return lhs = lhs % rhs;
2072 //      }
2073
2074         RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs)
2075         {
2076                 return lhs = lhs & rhs;
2077         }
2078
2079         RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs)
2080         {
2081                 return lhs = lhs | rhs;
2082         }
2083
2084         RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs)
2085         {
2086                 return lhs = lhs ^ rhs;
2087         }
2088
2089 //      RValue<Byte8> operator<<=(Byte8 &lhs, RValue<Byte8> rhs)
2090 //      {
2091 //              return lhs = lhs << rhs;
2092 //      }
2093
2094 //      RValue<Byte8> operator>>=(Byte8 &lhs, RValue<Byte8> rhs)
2095 //      {
2096 //              return lhs = lhs >> rhs;
2097 //      }
2098
2099 //      RValue<Byte8> operator+(RValue<Byte8> val)
2100 //      {
2101 //              return val;
2102 //      }
2103
2104 //      RValue<Byte8> operator-(RValue<Byte8> val)
2105 //      {
2106 //              return RValue<Byte8>(Nucleus::createNeg(val.value));
2107 //      }
2108
2109         RValue<Byte8> operator~(RValue<Byte8> val)
2110         {
2111                 if(CPUID::supportsMMX2())
2112                 {
2113                         return val ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
2114                 }
2115                 else
2116                 {
2117                         return RValue<Byte8>(Nucleus::createNot(val.value));
2118                 }
2119         }
2120
2121         RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
2122         {
2123                 return x86::paddusb(x, y);
2124         }
2125
2126         RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
2127         {
2128                 return x86::psubusb(x, y);
2129         }
2130
2131         RValue<Short4> Unpack(RValue<Byte4> x)
2132         {
2133                 Value *int2 = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), x.value, 0);
2134                 Value *byte8 = Nucleus::createBitCast(int2, Byte8::getType());
2135
2136                 return UnpackLow(RValue<Byte8>(byte8), RValue<Byte8>(byte8));
2137         }
2138
2139         RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
2140         {
2141                 if(CPUID::supportsMMX2())
2142                 {
2143                         return x86::punpcklbw(x, y);
2144                 }
2145                 else
2146                 {
2147                         int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2148                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2149
2150                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2151                 }
2152         }
2153
2154         RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
2155         {
2156                 if(CPUID::supportsMMX2())
2157                 {
2158                         return x86::punpckhbw(x, y);
2159                 }
2160                 else
2161                 {
2162                         int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
2163                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2164
2165                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2166                 }
2167         }
2168
2169         RValue<Int> SignMask(RValue<Byte8> x)
2170         {
2171                 return x86::pmovmskb(x);
2172         }
2173
2174 //      RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
2175 //      {
2176 //              return x86::pcmpgtb(x, y);   // FIXME: Signedness
2177 //      }
2178
2179         RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
2180         {
2181                 return x86::pcmpeqb(x, y);
2182         }
2183
2184         Type *Byte8::getType()
2185         {
2186                 if(CPUID::supportsMMX2())
2187                 {
2188                         return MMX::getType();
2189                 }
2190                 else
2191                 {
2192                         return T(VectorType::get(Byte::getType(), 8));
2193                 }
2194         }
2195
2196         SByte8::SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
2197         {
2198                 int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
2199                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(SByte::getType(), 8))));
2200
2201                 storeValue(Nucleus::createBitCast(vector, getType()));
2202         }
2203
2204         SByte8::SByte8(RValue<SByte8> rhs)
2205         {
2206                 storeValue(rhs.value);
2207         }
2208
2209         SByte8::SByte8(const SByte8 &rhs)
2210         {
2211                 Value *value = rhs.loadValue();
2212                 storeValue(value);
2213         }
2214
2215         SByte8::SByte8(const Reference<SByte8> &rhs)
2216         {
2217                 Value *value = rhs.loadValue();
2218                 storeValue(value);
2219         }
2220
2221         RValue<SByte8> SByte8::operator=(RValue<SByte8> rhs)
2222         {
2223                 storeValue(rhs.value);
2224
2225                 return rhs;
2226         }
2227
2228         RValue<SByte8> SByte8::operator=(const SByte8 &rhs)
2229         {
2230                 Value *value = rhs.loadValue();
2231                 storeValue(value);
2232
2233                 return RValue<SByte8>(value);
2234         }
2235
2236         RValue<SByte8> SByte8::operator=(const Reference<SByte8> &rhs)
2237         {
2238                 Value *value = rhs.loadValue();
2239                 storeValue(value);
2240
2241                 return RValue<SByte8>(value);
2242         }
2243
2244         RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs)
2245         {
2246                 if(CPUID::supportsMMX2())
2247                 {
2248                         return As<SByte8>(x86::paddb(As<Byte8>(lhs), As<Byte8>(rhs)));
2249                 }
2250                 else
2251                 {
2252                         return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
2253                 }
2254         }
2255
2256         RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs)
2257         {
2258                 if(CPUID::supportsMMX2())
2259                 {
2260                         return As<SByte8>(x86::psubb(As<Byte8>(lhs), As<Byte8>(rhs)));
2261                 }
2262                 else
2263                 {
2264                         return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
2265                 }
2266         }
2267
2268 //      RValue<SByte8> operator*(RValue<SByte8> lhs, RValue<SByte8> rhs)
2269 //      {
2270 //              return RValue<SByte8>(Nucleus::createMul(lhs.value, rhs.value));
2271 //      }
2272
2273 //      RValue<SByte8> operator/(RValue<SByte8> lhs, RValue<SByte8> rhs)
2274 //      {
2275 //              return RValue<SByte8>(Nucleus::createSDiv(lhs.value, rhs.value));
2276 //      }
2277
2278 //      RValue<SByte8> operator%(RValue<SByte8> lhs, RValue<SByte8> rhs)
2279 //      {
2280 //              return RValue<SByte8>(Nucleus::createSRem(lhs.value, rhs.value));
2281 //      }
2282
2283         RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs)
2284         {
2285                 return RValue<SByte8>(Nucleus::createAnd(lhs.value, rhs.value));
2286         }
2287
2288         RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs)
2289         {
2290                 return RValue<SByte8>(Nucleus::createOr(lhs.value, rhs.value));
2291         }
2292
2293         RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs)
2294         {
2295                 return RValue<SByte8>(Nucleus::createXor(lhs.value, rhs.value));
2296         }
2297
2298 //      RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
2299 //      {
2300 //              return RValue<SByte8>(Nucleus::createShl(lhs.value, rhs.value));
2301 //      }
2302
2303 //      RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
2304 //      {
2305 //              return RValue<SByte8>(Nucleus::createAShr(lhs.value, rhs.value));
2306 //      }
2307
2308         RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs)
2309         {
2310                 return lhs = lhs + rhs;
2311         }
2312
2313         RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs)
2314         {
2315                 return lhs = lhs - rhs;
2316         }
2317
2318 //      RValue<SByte8> operator*=(SByte8 &lhs, RValue<SByte8> rhs)
2319 //      {
2320 //              return lhs = lhs * rhs;
2321 //      }
2322
2323 //      RValue<SByte8> operator/=(SByte8 &lhs, RValue<SByte8> rhs)
2324 //      {
2325 //              return lhs = lhs / rhs;
2326 //      }
2327
2328 //      RValue<SByte8> operator%=(SByte8 &lhs, RValue<SByte8> rhs)
2329 //      {
2330 //              return lhs = lhs % rhs;
2331 //      }
2332
2333         RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs)
2334         {
2335                 return lhs = lhs & rhs;
2336         }
2337
2338         RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs)
2339         {
2340                 return lhs = lhs | rhs;
2341         }
2342
2343         RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs)
2344         {
2345                 return lhs = lhs ^ rhs;
2346         }
2347
2348 //      RValue<SByte8> operator<<=(SByte8 &lhs, RValue<SByte8> rhs)
2349 //      {
2350 //              return lhs = lhs << rhs;
2351 //      }
2352
2353 //      RValue<SByte8> operator>>=(SByte8 &lhs, RValue<SByte8> rhs)
2354 //      {
2355 //              return lhs = lhs >> rhs;
2356 //      }
2357
2358 //      RValue<SByte8> operator+(RValue<SByte8> val)
2359 //      {
2360 //              return val;
2361 //      }
2362
2363 //      RValue<SByte8> operator-(RValue<SByte8> val)
2364 //      {
2365 //              return RValue<SByte8>(Nucleus::createNeg(val.value));
2366 //      }
2367
2368         RValue<SByte8> operator~(RValue<SByte8> val)
2369         {
2370                 if(CPUID::supportsMMX2())
2371                 {
2372                         return val ^ SByte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
2373                 }
2374                 else
2375                 {
2376                         return RValue<SByte8>(Nucleus::createNot(val.value));
2377                 }
2378         }
2379
2380         RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
2381         {
2382                 return x86::paddsb(x, y);
2383         }
2384
2385         RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
2386         {
2387                 return x86::psubsb(x, y);
2388         }
2389
2390         RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
2391         {
2392                 if(CPUID::supportsMMX2())
2393                 {
2394                         return As<Short4>(x86::punpcklbw(As<Byte8>(x), As<Byte8>(y)));
2395                 }
2396                 else
2397                 {
2398                         int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2399                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2400
2401                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2402                 }
2403         }
2404
2405         RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
2406         {
2407                 if(CPUID::supportsMMX2())
2408                 {
2409                         return As<Short4>(x86::punpckhbw(As<Byte8>(x), As<Byte8>(y)));
2410                 }
2411                 else
2412                 {
2413                         int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
2414                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2415
2416                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2417                 }
2418         }
2419
2420         RValue<Int> SignMask(RValue<SByte8> x)
2421         {
2422                 return x86::pmovmskb(As<Byte8>(x));
2423         }
2424
2425         RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
2426         {
2427                 return x86::pcmpgtb(x, y);
2428         }
2429
2430         RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
2431         {
2432                 return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
2433         }
2434
2435         Type *SByte8::getType()
2436         {
2437                 if(CPUID::supportsMMX2())
2438                 {
2439                         return MMX::getType();
2440                 }
2441                 else
2442                 {
2443                         return T(VectorType::get(SByte::getType(), 8));
2444                 }
2445         }
2446
2447         Byte16::Byte16(RValue<Byte16> rhs)
2448         {
2449                 storeValue(rhs.value);
2450         }
2451
2452         Byte16::Byte16(const Byte16 &rhs)
2453         {
2454                 Value *value = rhs.loadValue();
2455                 storeValue(value);
2456         }
2457
2458         Byte16::Byte16(const Reference<Byte16> &rhs)
2459         {
2460                 Value *value = rhs.loadValue();
2461                 storeValue(value);
2462         }
2463
2464         RValue<Byte16> Byte16::operator=(RValue<Byte16> rhs)
2465         {
2466                 storeValue(rhs.value);
2467
2468                 return rhs;
2469         }
2470
2471         RValue<Byte16> Byte16::operator=(const Byte16 &rhs)
2472         {
2473                 Value *value = rhs.loadValue();
2474                 storeValue(value);
2475
2476                 return RValue<Byte16>(value);
2477         }
2478
2479         RValue<Byte16> Byte16::operator=(const Reference<Byte16> &rhs)
2480         {
2481                 Value *value = rhs.loadValue();
2482                 storeValue(value);
2483
2484                 return RValue<Byte16>(value);
2485         }
2486
2487         Type *Byte16::getType()
2488         {
2489                 return T(VectorType::get(Byte::getType(), 16));
2490         }
2491
2492         Type *SByte16::getType()
2493         {
2494                 return T( VectorType::get(SByte::getType(), 16));
2495         }
2496
2497         Short2::Short2(RValue<Short4> cast)
2498         {
2499                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
2500         }
2501
2502         Type *Short2::getType()
2503         {
2504                 #if 0
2505                         return T(VectorType::get(Short::getType(), 2));
2506                 #else
2507                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
2508                 #endif
2509         }
2510
2511         UShort2::UShort2(RValue<UShort4> cast)
2512         {
2513                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
2514         }
2515
2516         Type *UShort2::getType()
2517         {
2518                 #if 0
2519                         return T(VectorType::get(UShort::getType(), 2));
2520                 #else
2521                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
2522                 #endif
2523         }
2524
2525         Short4::Short4(RValue<Int> cast)
2526         {
2527                 Value *extend = Nucleus::createZExt(cast.value, Long::getType());
2528                 Value *swizzle = Swizzle(RValue<Short4>(extend), 0x00).value;
2529
2530                 storeValue(swizzle);
2531         }
2532
        // Builds a Short4 from an Int4 by shuffling the 16-bit lanes 0, 2, 4 and 6 of the
        // Short8 view of the input together and storing the result as a Short4.
        // Two shuffle strategies exist, selected at code-generation time by CPUID::supportsSSSE3().
        // The '#if 0' sections are disabled alternatives kept for reference.
        Short4::Short4(RValue<Int4> cast)
        {
                // View the four 32-bit elements as eight 16-bit lanes.
                Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());

                #if 0   // FIXME: Check codegen (pshuflw phshufhw pshufd)
                        Constant *pack[8];
                        pack[0] = Nucleus::createConstantInt(0);
                        pack[1] = Nucleus::createConstantInt(2);
                        pack[2] = Nucleus::createConstantInt(4);
                        pack[3] = Nucleus::createConstantInt(6);

                        Value *short4 = Nucleus::createShuffleVector(short8, short8, Nucleus::createConstantVector(pack, 4));
                #else
                        Value *packed;

                        // FIXME: Use Swizzle<Short8>
                        if(!CPUID::supportsSSSE3())
                        {
                                // Two 16-bit shuffles: the first rearranges lanes 0..3 (picking lanes 0 and 2),
                                // the second rearranges lanes 4..7 (picking lanes 4 and 6).
                                int pshuflw[8] = {0, 2, 0, 2, 4, 5, 6, 7};
                                int pshufhw[8] = {0, 1, 2, 3, 4, 6, 4, 6};

                                Value *shuffle1 = Nucleus::createShuffleVector(short8, short8, pshuflw);
                                Value *shuffle2 = Nucleus::createShuffleVector(shuffle1, shuffle1, pshufhw);
                                Value *int4 = Nucleus::createBitCast(shuffle2, Int4::getType());
                                // NOTE(review): presumably 0x88 selects 32-bit elements 0 and 2 so the picked
                                // lanes end up adjacent - confirm against createSwizzle4.
                                packed = createSwizzle4(int4, 0x88);
                        }
                        else
                        {
                                // Single byte shuffle selecting bytes 0-1, 4-5, 8-9 and 12-13 of the 16-byte view
                                // (the selection is repeated for the upper eight result bytes).
                                int pshufb[16] = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
                                Value *byte16 = Nucleus::createBitCast(cast.value, Byte16::getType());
                                packed = Nucleus::createShuffleVector(byte16, byte16, pshufb);
                        }

                        #if 0   // FIXME: No optimal instruction selection
                                Value *qword2 = Nucleus::createBitCast(packed, T(VectorType::get(Long::getType(), 2)));
                                Value *element = Nucleus::createExtractElement(qword2, 0);
                                Value *short4 = Nucleus::createBitCast(element, Short4::getType());
                        #else   // FIXME: Requires SSE
                                // Narrow the packed 128-bit value through the Int2(RValue<Int4>) conversion,
                                // then reinterpret it as Short4.
                                Value *int2 = RValue<Int2>(Int2(RValue<Int4>(packed))).value;
                                Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
                        #endif
                #endif

                storeValue(short4);
        }
2578
2579 //      Short4::Short4(RValue<Float> cast)
2580 //      {
2581 //      }
2582
2583         Short4::Short4(RValue<Float4> cast)
2584         {
2585                 Int4 v4i32 = Int4(cast);
2586                 v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2587
2588                 storeValue(As<Short4>(Int2(v4i32)).value);
2589         }
2590
2591         Short4::Short4(short xyzw)
2592         {
2593                 int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
2594                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
2595
2596                 storeValue(Nucleus::createBitCast(vector, getType()));
2597         }
2598
2599         Short4::Short4(short x, short y, short z, short w)
2600         {
2601                 int64_t constantVector[4] = {x, y, z, w};
2602                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
2603
2604                 storeValue(Nucleus::createBitCast(vector, getType()));
2605         }
2606
2607         Short4::Short4(RValue<Short4> rhs)
2608         {
2609                 storeValue(rhs.value);
2610         }
2611
2612         Short4::Short4(const Short4 &rhs)
2613         {
2614                 Value *value = rhs.loadValue();
2615                 storeValue(value);
2616         }
2617
2618         Short4::Short4(const Reference<Short4> &rhs)
2619         {
2620                 Value *value = rhs.loadValue();
2621                 storeValue(value);
2622         }
2623
2624         Short4::Short4(RValue<UShort4> rhs)
2625         {
2626                 storeValue(rhs.value);
2627         }
2628
2629         Short4::Short4(const UShort4 &rhs)
2630         {
2631                 storeValue(rhs.loadValue());
2632         }
2633
2634         Short4::Short4(const Reference<UShort4> &rhs)
2635         {
2636                 storeValue(rhs.loadValue());
2637         }
2638
2639         RValue<Short4> Short4::operator=(RValue<Short4> rhs)
2640         {
2641                 storeValue(rhs.value);
2642
2643                 return rhs;
2644         }
2645
2646         RValue<Short4> Short4::operator=(const Short4 &rhs)
2647         {
2648                 Value *value = rhs.loadValue();
2649                 storeValue(value);
2650
2651                 return RValue<Short4>(value);
2652         }
2653
2654         RValue<Short4> Short4::operator=(const Reference<Short4> &rhs)
2655         {
2656                 Value *value = rhs.loadValue();
2657                 storeValue(value);
2658
2659                 return RValue<Short4>(value);
2660         }
2661
2662         RValue<Short4> Short4::operator=(RValue<UShort4> rhs)
2663         {
2664                 storeValue(rhs.value);
2665
2666                 return RValue<Short4>(rhs);
2667         }
2668
2669         RValue<Short4> Short4::operator=(const UShort4 &rhs)
2670         {
2671                 Value *value = rhs.loadValue();
2672                 storeValue(value);
2673
2674                 return RValue<Short4>(value);
2675         }
2676
2677         RValue<Short4> Short4::operator=(const Reference<UShort4> &rhs)
2678         {
2679                 Value *value = rhs.loadValue();
2680                 storeValue(value);
2681
2682                 return RValue<Short4>(value);
2683         }
2684
2685         RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs)
2686         {
2687                 if(CPUID::supportsMMX2())
2688                 {
2689                         return x86::paddw(lhs, rhs);
2690                 }
2691                 else
2692                 {
2693                         return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
2694                 }
2695         }
2696
2697         RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs)
2698         {
2699                 if(CPUID::supportsMMX2())
2700                 {
2701                         return x86::psubw(lhs, rhs);
2702                 }
2703                 else
2704                 {
2705                         return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
2706                 }
2707         }
2708
2709         RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs)
2710         {
2711                 if(CPUID::supportsMMX2())
2712                 {
2713                         return x86::pmullw(lhs, rhs);
2714                 }
2715                 else
2716                 {
2717                         return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
2718                 }
2719         }
2720
2721 //      RValue<Short4> operator/(RValue<Short4> lhs, RValue<Short4> rhs)
2722 //      {
2723 //              return RValue<Short4>(Nucleus::createSDiv(lhs.value, rhs.value));
2724 //      }
2725
2726 //      RValue<Short4> operator%(RValue<Short4> lhs, RValue<Short4> rhs)
2727 //      {
2728 //              return RValue<Short4>(Nucleus::createSRem(lhs.value, rhs.value));
2729 //      }
2730
2731         RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs)
2732         {
2733                 if(CPUID::supportsMMX2())
2734                 {
2735                         return x86::pand(lhs, rhs);
2736                 }
2737                 else
2738                 {
2739                         return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
2740                 }
2741         }
2742
2743         RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs)
2744         {
2745                 if(CPUID::supportsMMX2())
2746                 {
2747                         return x86::por(lhs, rhs);
2748                 }
2749                 else
2750                 {
2751                         return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
2752                 }
2753         }
2754
2755         RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs)
2756         {
2757                 if(CPUID::supportsMMX2())
2758                 {
2759                         return x86::pxor(lhs, rhs);
2760                 }
2761                 else
2762                 {
2763                         return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
2764                 }
2765         }
2766
2767         RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2768         {
2769         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
2770
2771                 return x86::psllw(lhs, rhs);
2772         }
2773
2774         RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2775         {
2776         //      return RValue<Short4>(Nucleus::createAShr(lhs.value, rhs.value));
2777
2778                 return x86::psraw(lhs, rhs);
2779         }
2780
2781         RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs)
2782         {
2783                 return lhs = lhs + rhs;
2784         }
2785
2786         RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs)
2787         {
2788                 return lhs = lhs - rhs;
2789         }
2790
2791         RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs)
2792         {
2793                 return lhs = lhs * rhs;
2794         }
2795
2796 //      RValue<Short4> operator/=(Short4 &lhs, RValue<Short4> rhs)
2797 //      {
2798 //              return lhs = lhs / rhs;
2799 //      }
2800
2801 //      RValue<Short4> operator%=(Short4 &lhs, RValue<Short4> rhs)
2802 //      {
2803 //              return lhs = lhs % rhs;
2804 //      }
2805
2806         RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs)
2807         {
2808                 return lhs = lhs & rhs;
2809         }
2810
2811         RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs)
2812         {
2813                 return lhs = lhs | rhs;
2814         }
2815
2816         RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs)
2817         {
2818                 return lhs = lhs ^ rhs;
2819         }
2820
2821         RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs)
2822         {
2823                 return lhs = lhs << rhs;
2824         }
2825
2826         RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs)
2827         {
2828                 return lhs = lhs >> rhs;
2829         }
2830
2831 //      RValue<Short4> operator+(RValue<Short4> val)
2832 //      {
2833 //              return val;
2834 //      }
2835
2836         RValue<Short4> operator-(RValue<Short4> val)
2837         {
2838                 if(CPUID::supportsMMX2())
2839                 {
2840                         return Short4(0, 0, 0, 0) - val;
2841                 }
2842                 else
2843                 {
2844                         return RValue<Short4>(Nucleus::createNeg(val.value));
2845                 }
2846         }
2847
2848         RValue<Short4> operator~(RValue<Short4> val)
2849         {
2850                 if(CPUID::supportsMMX2())
2851                 {
2852                         return val ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu);
2853                 }
2854                 else
2855                 {
2856                         return RValue<Short4>(Nucleus::createNot(val.value));
2857                 }
2858         }
2859
2860         RValue<Short4> RoundShort4(RValue<Float4> cast)
2861         {
2862                 RValue<Int4> v4i32 = x86::cvtps2dq(cast);
2863                 RValue<Short8> v8i16 = x86::packssdw(v4i32, v4i32);
2864
2865                 return As<Short4>(Int2(As<Int4>(v8i16)));
2866         }
2867
2868         RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2869         {
2870                 return x86::pmaxsw(x, y);
2871         }
2872
2873         RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2874         {
2875                 return x86::pminsw(x, y);
2876         }
2877
2878         RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2879         {
2880                 return x86::paddsw(x, y);
2881         }
2882
2883         RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2884         {
2885                 return x86::psubsw(x, y);
2886         }
2887
2888         RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2889         {
2890                 return x86::pmulhw(x, y);
2891         }
2892
2893         RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2894         {
2895                 return x86::pmaddwd(x, y);
2896         }
2897
2898         RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
2899         {
2900                 return x86::packsswb(x, y);
2901         }
2902
2903         RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
2904         {
2905                 if(CPUID::supportsMMX2())
2906                 {
2907                         return x86::punpcklwd(x, y);
2908                 }
2909                 else
2910                 {
2911                         int shuffle[4] = {0, 4, 1, 5};
2912                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2913
2914                         return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
2915                 }
2916         }
2917
2918         RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
2919         {
2920                 if(CPUID::supportsMMX2())
2921                 {
2922                         return x86::punpckhwd(x, y);
2923                 }
2924                 else
2925                 {
2926                         int shuffle[4] = {2, 6, 3, 7};
2927                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2928
2929                         return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
2930                 }
2931         }
2932
2933         RValue<Short4> Swizzle(RValue<Short4> x, unsigned char select)
2934         {
2935                 if(CPUID::supportsMMX2())
2936                 {
2937                         return x86::pshufw(x, select);
2938                 }
2939                 else
2940                 {
2941                         return RValue<Short4>(createSwizzle4(x.value, select));
2942                 }
2943         }
2944
2945         RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i)
2946         {
2947                 if(CPUID::supportsMMX2())
2948                 {
2949                         return x86::pinsrw(val, Int(element), i);
2950                 }
2951                 else
2952                 {
2953                         return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
2954                 }
2955         }
2956
2957         RValue<Short> Extract(RValue<Short4> val, int i)
2958         {
2959                 if(CPUID::supportsMMX2())
2960                 {
2961                         return Short(x86::pextrw(val, i));
2962                 }
2963                 else
2964                 {
2965                         return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
2966                 }
2967         }
2968
2969         RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2970         {
2971                 return x86::pcmpgtw(x, y);
2972         }
2973
2974         RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2975         {
2976                 return x86::pcmpeqw(x, y);
2977         }
2978
2979         Type *Short4::getType()
2980         {
2981                 if(CPUID::supportsMMX2())
2982                 {
2983                         return MMX::getType();
2984                 }
2985                 else
2986                 {
2987                         return T(VectorType::get(Short::getType(), 4));
2988                 }
2989         }
2990
2991         UShort4::UShort4(RValue<Int4> cast)
2992         {
2993                 *this = Short4(cast);
2994         }
2995
2996         UShort4::UShort4(RValue<Float4> cast, bool saturate)
2997         {
2998                 Float4 sat;
2999
3000                 if(saturate)
3001                 {
3002                         if(CPUID::supportsSSE4_1())
3003                         {
3004                                 sat = Min(cast, Float4(0xFFFF));   // packusdw takes care of 0x0000 saturation
3005                         }
3006                         else
3007                         {
3008                                 sat = Max(Min(cast, Float4(0xFFFF)), Float4(0x0000));
3009                         }
3010                 }
3011                 else
3012                 {
3013                         sat = cast;
3014                 }
3015
3016                 Int4 int4(sat);
3017
3018                 if(!saturate || !CPUID::supportsSSE4_1())
3019                 {
3020                         *this = Short4(int4);
3021                 }
3022                 else
3023                 {
3024                         *this = As<Short4>(Int2(As<Int4>(x86::packusdw(int4, int4))));
3025                 }
3026         }
3027
3028         UShort4::UShort4(unsigned short xyzw)
3029         {
3030                 int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
3031                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
3032
3033                 storeValue(Nucleus::createBitCast(vector, getType()));
3034         }
3035
3036         UShort4::UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
3037         {
3038                 int64_t constantVector[4] = {x, y, z, w};
3039                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
3040
3041                 storeValue(Nucleus::createBitCast(vector, getType()));
3042         }
3043
3044         UShort4::UShort4(RValue<UShort4> rhs)
3045         {
3046                 storeValue(rhs.value);
3047         }
3048
3049         UShort4::UShort4(const UShort4 &rhs)
3050         {
3051                 Value *value = rhs.loadValue();
3052                 storeValue(value);
3053         }
3054
3055         UShort4::UShort4(const Reference<UShort4> &rhs)
3056         {
3057                 Value *value = rhs.loadValue();
3058                 storeValue(value);
3059         }
3060
3061         UShort4::UShort4(RValue<Short4> rhs)
3062         {
3063                 storeValue(rhs.value);
3064         }
3065
3066         UShort4::UShort4(const Short4 &rhs)
3067         {
3068                 Value *value = rhs.loadValue();
3069                 storeValue(value);
3070         }
3071
3072         UShort4::UShort4(const Reference<Short4> &rhs)
3073         {
3074                 Value *value = rhs.loadValue();
3075                 storeValue(value);
3076         }
3077
3078         RValue<UShort4> UShort4::operator=(RValue<UShort4> rhs)
3079         {
3080                 storeValue(rhs.value);
3081
3082                 return rhs;
3083         }
3084
3085         RValue<UShort4> UShort4::operator=(const UShort4 &rhs)
3086         {
3087                 Value *value = rhs.loadValue();
3088                 storeValue(value);
3089
3090                 return RValue<UShort4>(value);
3091         }
3092
3093         RValue<UShort4> UShort4::operator=(const Reference<UShort4> &rhs)
3094         {
3095                 Value *value = rhs.loadValue();
3096                 storeValue(value);
3097
3098                 return RValue<UShort4>(value);
3099         }
3100
3101         RValue<UShort4> UShort4::operator=(RValue<Short4> rhs)
3102         {
3103                 storeValue(rhs.value);
3104
3105                 return RValue<UShort4>(rhs);
3106         }
3107
3108         RValue<UShort4> UShort4::operator=(const Short4 &rhs)
3109         {
3110                 Value *value = rhs.loadValue();
3111                 storeValue(value);
3112
3113                 return RValue<UShort4>(value);
3114         }
3115
3116         RValue<UShort4> UShort4::operator=(const Reference<Short4> &rhs)
3117         {
3118                 Value *value = rhs.loadValue();
3119                 storeValue(value);
3120
3121                 return RValue<UShort4>(value);
3122         }
3123
3124         RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs)
3125         {
3126                 if(CPUID::supportsMMX2())
3127                 {
3128                         return As<UShort4>(x86::paddw(As<Short4>(lhs), As<Short4>(rhs)));
3129                 }
3130                 else
3131                 {
3132                         return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
3133                 }
3134         }
3135
3136         RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs)
3137         {
3138                 if(CPUID::supportsMMX2())
3139                 {
3140                         return As<UShort4>(x86::psubw(As<Short4>(lhs), As<Short4>(rhs)));
3141                 }
3142                 else
3143                 {
3144                         return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
3145                 }
3146         }
3147
3148         RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs)
3149         {
3150                 if(CPUID::supportsMMX2())
3151                 {
3152                         return As<UShort4>(x86::pmullw(As<Short4>(lhs), As<Short4>(rhs)));
3153                 }
3154                 else
3155                 {
3156                         return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
3157                 }
3158         }
3159
3160         RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs)
3161         {
3162                 if(CPUID::supportsMMX2())
3163                 {
3164                         return As<UShort4>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
3165                 }
3166                 else
3167                 {
3168                         return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
3169                 }
3170         }
3171
3172         RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs)
3173         {
3174                 if(CPUID::supportsMMX2())
3175                 {
3176                         return As<UShort4>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
3177                 }
3178                 else
3179                 {
3180                         return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
3181                 }
3182         }
3183
3184         RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs)
3185         {
3186                 if(CPUID::supportsMMX2())
3187                 {
3188                         return As<UShort4>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
3189                 }
3190                 else
3191                 {
3192                         return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
3193                 }
3194         }
3195
3196         RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
3197         {
3198         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
3199
3200                 return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
3201         }
3202
3203         RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
3204         {
3205         //      return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
3206
3207                 return x86::psrlw(lhs, rhs);
3208         }
3209
3210         RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs)
3211         {
3212                 return lhs = lhs << rhs;
3213         }
3214
3215         RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs)
3216         {
3217                 return lhs = lhs >> rhs;
3218         }
3219
3220         RValue<UShort4> operator~(RValue<UShort4> val)
3221         {
3222                 if(CPUID::supportsMMX2())
3223                 {
3224                         return As<UShort4>(As<Short4>(val) ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu));
3225                 }
3226                 else
3227                 {
3228                         return RValue<UShort4>(Nucleus::createNot(val.value));
3229                 }
3230         }
3231
3232         RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
3233         {
3234                 return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
3235         }
3236
3237         RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
3238         {
3239                 return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
3240         }
3241
3242         RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
3243         {
3244                 return x86::paddusw(x, y);
3245         }
3246
3247         RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
3248         {
3249                 return x86::psubusw(x, y);
3250         }
3251
3252         RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
3253         {
3254                 return x86::pmulhuw(x, y);
3255         }
3256
3257         RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
3258         {
3259                 return x86::pavgw(x, y);
3260         }
3261
3262         RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
3263         {
3264                 return x86::packuswb(x, y);
3265         }
3266
3267         Type *UShort4::getType()
3268         {
3269                 if(CPUID::supportsMMX2())
3270                 {
3271                         return MMX::getType();
3272                 }
3273                 else
3274                 {
3275                         return T(VectorType::get(UShort::getType(), 4));
3276                 }
3277         }
3278
3279         Short8::Short8(short c)
3280         {
3281                 int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
3282                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3283         }
3284
3285         Short8::Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
3286         {
3287                 int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
3288                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3289         }
3290
3291         Short8::Short8(RValue<Short8> rhs)
3292         {
3293                 storeValue(rhs.value);
3294         }
3295
3296         Short8::Short8(const Reference<Short8> &rhs)
3297         {
3298                 Value *value = rhs.loadValue();
3299                 storeValue(value);
3300         }
3301
3302         Short8::Short8(RValue<Short4> lo, RValue<Short4> hi)
3303         {
3304                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
3305                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
3306
3307                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
3308                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
3309                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
3310                 Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
3311
3312                 storeValue(short8);
3313         }
3314
3315         RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs)
3316         {
3317                 return RValue<Short8>(Nucleus::createAdd(lhs.value, rhs.value));
3318         }
3319
3320         RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs)
3321         {
3322                 return RValue<Short8>(Nucleus::createAnd(lhs.value, rhs.value));
3323         }
3324
3325         RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
3326         {
3327                 return x86::psllw(lhs, rhs);   // FIXME: Fallback required
3328         }
3329
3330         RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
3331         {
3332                 return x86::psraw(lhs, rhs);   // FIXME: Fallback required
3333         }
3334
3335         RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
3336         {
3337                 return x86::pmaddwd(x, y);   // FIXME: Fallback required
3338         }
3339
3340         RValue<Int4> Abs(RValue<Int4> x)
3341         {
3342                 if(CPUID::supportsSSSE3())
3343                 {
3344                         return x86::pabsd(x);
3345                 }
3346                 else
3347                 {
3348                         Int4 mask = (x >> 31);
3349                         return (mask ^ x) - mask;
3350                 }
3351         }
3352
3353         RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
3354         {
3355                 return x86::pmulhw(x, y);   // FIXME: Fallback required
3356         }
3357
3358         Type *Short8::getType()
3359         {
3360                 return T(VectorType::get(Short::getType(), 8));
3361         }
3362
3363         UShort8::UShort8(unsigned short c)
3364         {
3365                 int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
3366                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3367         }
3368
3369         UShort8::UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
3370         {
3371                 int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
3372                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3373         }
3374
3375         UShort8::UShort8(RValue<UShort8> rhs)
3376         {
3377                 storeValue(rhs.value);
3378         }
3379
3380         UShort8::UShort8(const Reference<UShort8> &rhs)
3381         {
3382                 Value *value = rhs.loadValue();
3383                 storeValue(value);
3384         }
3385
3386         UShort8::UShort8(RValue<UShort4> lo, RValue<UShort4> hi)
3387         {
3388                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
3389                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
3390
3391                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
3392                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
3393                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
3394                 Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
3395
3396                 storeValue(short8);
3397         }
3398
3399         RValue<UShort8> UShort8::operator=(RValue<UShort8> rhs)
3400         {
3401                 storeValue(rhs.value);
3402
3403                 return rhs;
3404         }
3405
3406         RValue<UShort8> UShort8::operator=(const UShort8 &rhs)
3407         {
3408                 Value *value = rhs.loadValue();
3409                 storeValue(value);
3410
3411                 return RValue<UShort8>(value);
3412         }
3413
3414         RValue<UShort8> UShort8::operator=(const Reference<UShort8> &rhs)
3415         {
3416                 Value *value = rhs.loadValue();
3417                 storeValue(value);
3418
3419                 return RValue<UShort8>(value);
3420         }
3421
3422         RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs)
3423         {
3424                 return RValue<UShort8>(Nucleus::createAnd(lhs.value, rhs.value));
3425         }
3426
3427         RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
3428         {
3429                 return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));   // FIXME: Fallback required
3430         }
3431
3432         RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
3433         {
3434                 return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
3435         }
3436
3437         RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
3438         {
3439                 return RValue<UShort8>(Nucleus::createAdd(lhs.value, rhs.value));
3440         }
3441
3442         RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs)
3443         {
3444                 return RValue<UShort8>(Nucleus::createMul(lhs.value, rhs.value));
3445         }
3446
3447         RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs)
3448         {
3449                 return lhs = lhs + rhs;
3450         }
3451
3452         RValue<UShort8> operator~(RValue<UShort8> val)
3453         {
3454                 return RValue<UShort8>(Nucleus::createNot(val.value));
3455         }
3456
3457         RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
3458         {
3459                 int pshufb[16] =
3460                 {
3461                         select0 + 0,
3462                         select0 + 1,
3463                         select1 + 0,
3464                         select1 + 1,
3465                         select2 + 0,
3466                         select2 + 1,
3467                         select3 + 0,
3468                         select3 + 1,
3469                         select4 + 0,
3470                         select4 + 1,
3471                         select5 + 0,
3472                         select5 + 1,
3473                         select6 + 0,
3474                         select6 + 1,
3475                         select7 + 0,
3476                         select7 + 1,
3477                 };
3478
3479                 Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
3480                 Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
3481                 Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
3482
3483                 return RValue<UShort8>(short8);
3484         }
3485
3486         RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
3487         {
3488                 return x86::pmulhuw(x, y);   // FIXME: Fallback required
3489         }
3490
3491         Type *UShort8::getType()
3492         {
3493                 return T(VectorType::get(UShort::getType(), 8));
3494         }
3495
3496         Int::Int(Argument<Int> argument)
3497         {
3498                 storeValue(argument.value);
3499         }
3500
3501         Int::Int(RValue<Byte> cast)
3502         {
3503                 Value *integer = Nucleus::createZExt(cast.value, Int::getType());
3504
3505                 storeValue(integer);
3506         }
3507
3508         Int::Int(RValue<SByte> cast)
3509         {
3510                 Value *integer = Nucleus::createSExt(cast.value, Int::getType());
3511
3512                 storeValue(integer);
3513         }
3514
3515         Int::Int(RValue<Short> cast)
3516         {
3517                 Value *integer = Nucleus::createSExt(cast.value, Int::getType());
3518
3519                 storeValue(integer);
3520         }
3521
3522         Int::Int(RValue<UShort> cast)
3523         {
3524                 Value *integer = Nucleus::createZExt(cast.value, Int::getType());
3525
3526                 storeValue(integer);
3527         }
3528
3529         Int::Int(RValue<Int2> cast)
3530         {
3531                 *this = Extract(cast, 0);
3532         }
3533
3534         Int::Int(RValue<Long> cast)
3535         {
3536                 Value *integer = Nucleus::createTrunc(cast.value, Int::getType());
3537
3538                 storeValue(integer);
3539         }
3540
3541         Int::Int(RValue<Float> cast)
3542         {
3543                 Value *integer = Nucleus::createFPToSI(cast.value, Int::getType());
3544
3545                 storeValue(integer);
3546         }
3547
3548         Int::Int(int x)
3549         {
3550                 storeValue(Nucleus::createConstantInt(x));
3551         }
3552
3553         Int::Int(RValue<Int> rhs)
3554         {
3555                 storeValue(rhs.value);
3556         }
3557
3558         Int::Int(RValue<UInt> rhs)
3559         {
3560                 storeValue(rhs.value);
3561         }
3562
3563         Int::Int(const Int &rhs)
3564         {
3565                 Value *value = rhs.loadValue();
3566                 storeValue(value);
3567         }
3568
3569         Int::Int(const Reference<Int> &rhs)
3570         {
3571                 Value *value = rhs.loadValue();
3572                 storeValue(value);
3573         }
3574
3575         Int::Int(const UInt &rhs)
3576         {
3577                 Value *value = rhs.loadValue();
3578                 storeValue(value);
3579         }
3580
3581         Int::Int(const Reference<UInt> &rhs)
3582         {
3583                 Value *value = rhs.loadValue();
3584                 storeValue(value);
3585         }
3586
3587         RValue<Int> Int::operator=(int rhs)
3588         {
3589                 return RValue<Int>(storeValue(Nucleus::createConstantInt(rhs)));
3590         }
3591
3592         RValue<Int> Int::operator=(RValue<Int> rhs)
3593         {
3594                 storeValue(rhs.value);
3595
3596                 return rhs;
3597         }
3598
3599         RValue<Int> Int::operator=(RValue<UInt> rhs)
3600         {
3601                 storeValue(rhs.value);
3602
3603                 return RValue<Int>(rhs);
3604         }
3605
3606         RValue<Int> Int::operator=(const Int &rhs)
3607         {
3608                 Value *value = rhs.loadValue();
3609                 storeValue(value);
3610
3611                 return RValue<Int>(value);
3612         }
3613
3614         RValue<Int> Int::operator=(const Reference<Int> &rhs)
3615         {
3616                 Value *value = rhs.loadValue();
3617                 storeValue(value);
3618
3619                 return RValue<Int>(value);
3620         }
3621
3622         RValue<Int> Int::operator=(const UInt &rhs)
3623         {
3624                 Value *value = rhs.loadValue();
3625                 storeValue(value);
3626
3627                 return RValue<Int>(value);
3628         }
3629
3630         RValue<Int> Int::operator=(const Reference<UInt> &rhs)
3631         {
3632                 Value *value = rhs.loadValue();
3633                 storeValue(value);
3634
3635                 return RValue<Int>(value);
3636         }
3637
3638         RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs)
3639         {
3640                 return RValue<Int>(Nucleus::createAdd(lhs.value, rhs.value));
3641         }
3642
3643         RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs)
3644         {
3645                 return RValue<Int>(Nucleus::createSub(lhs.value, rhs.value));
3646         }
3647
3648         RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs)
3649         {
3650                 return RValue<Int>(Nucleus::createMul(lhs.value, rhs.value));
3651         }
3652
3653         RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs)
3654         {
3655                 return RValue<Int>(Nucleus::createSDiv(lhs.value, rhs.value));
3656         }
3657
3658         RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs)
3659         {
3660                 return RValue<Int>(Nucleus::createSRem(lhs.value, rhs.value));
3661         }
3662
3663         RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs)
3664         {
3665                 return RValue<Int>(Nucleus::createAnd(lhs.value, rhs.value));
3666         }
3667
3668         RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs)
3669         {
3670                 return RValue<Int>(Nucleus::createOr(lhs.value, rhs.value));
3671         }
3672
3673         RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs)
3674         {
3675                 return RValue<Int>(Nucleus::createXor(lhs.value, rhs.value));
3676         }
3677
3678         RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs)
3679         {
3680                 return RValue<Int>(Nucleus::createShl(lhs.value, rhs.value));
3681         }
3682
3683         RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs)
3684         {
3685                 return RValue<Int>(Nucleus::createAShr(lhs.value, rhs.value));
3686         }
3687
3688         RValue<Int> operator+=(Int &lhs, RValue<Int> rhs)
3689         {
3690                 return lhs = lhs + rhs;
3691         }
3692
3693         RValue<Int> operator-=(Int &lhs, RValue<Int> rhs)
3694         {
3695                 return lhs = lhs - rhs;
3696         }
3697
3698         RValue<Int> operator*=(Int &lhs, RValue<Int> rhs)
3699         {
3700                 return lhs = lhs * rhs;
3701         }
3702
3703         RValue<Int> operator/=(Int &lhs, RValue<Int> rhs)
3704         {
3705                 return lhs = lhs / rhs;
3706         }
3707
3708         RValue<Int> operator%=(Int &lhs, RValue<Int> rhs)
3709         {
3710                 return lhs = lhs % rhs;
3711         }
3712
3713         RValue<Int> operator&=(Int &lhs, RValue<Int> rhs)
3714         {
3715                 return lhs = lhs & rhs;
3716         }
3717
3718         RValue<Int> operator|=(Int &lhs, RValue<Int> rhs)
3719         {
3720                 return lhs = lhs | rhs;
3721         }
3722
3723         RValue<Int> operator^=(Int &lhs, RValue<Int> rhs)
3724         {
3725                 return lhs = lhs ^ rhs;
3726         }
3727
3728         RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs)
3729         {
3730                 return lhs = lhs << rhs;
3731         }
3732
3733         RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs)
3734         {
3735                 return lhs = lhs >> rhs;
3736         }
3737
3738         RValue<Int> operator+(RValue<Int> val)
3739         {
3740                 return val;
3741         }
3742
3743         RValue<Int> operator-(RValue<Int> val)
3744         {
3745                 return RValue<Int>(Nucleus::createNeg(val.value));
3746         }
3747
3748         RValue<Int> operator~(RValue<Int> val)
3749         {
3750                 return RValue<Int>(Nucleus::createNot(val.value));
3751         }
3752
3753         RValue<Int> operator++(Int &val, int)   // Post-increment
3754         {
3755                 RValue<Int> res = val;
3756
3757                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantInt(1)));
3758                 val.storeValue(inc);
3759
3760                 return res;
3761         }
3762
3763         const Int &operator++(Int &val)   // Pre-increment
3764         {
3765                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantInt(1)));
3766                 val.storeValue(inc);
3767
3768                 return val;
3769         }
3770
3771         RValue<Int> operator--(Int &val, int)   // Post-decrement
3772         {
3773                 RValue<Int> res = val;
3774
3775                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantInt(1)));
3776                 val.storeValue(inc);
3777
3778                 return res;
3779         }
3780
3781         const Int &operator--(Int &val)   // Pre-decrement
3782         {
3783                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantInt(1)));
3784                 val.storeValue(inc);
3785
3786                 return val;
3787         }
3788
3789         RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs)
3790         {
3791                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
3792         }
3793
3794         RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs)
3795         {
3796                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
3797         }
3798
3799         RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs)
3800         {
3801                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
3802         }
3803
3804         RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs)
3805         {
3806                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
3807         }
3808
3809         RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs)
3810         {
3811                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
3812         }
3813
3814         RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs)
3815         {
3816                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
3817         }
3818
3819         RValue<Int> Max(RValue<Int> x, RValue<Int> y)
3820         {
3821                 return IfThenElse(x > y, x, y);
3822         }
3823
3824         RValue<Int> Min(RValue<Int> x, RValue<Int> y)
3825         {
3826                 return IfThenElse(x < y, x, y);
3827         }
3828
3829         RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max)
3830         {
3831                 return Min(Max(x, min), max);
3832         }
3833
3834         RValue<Int> RoundInt(RValue<Float> cast)
3835         {
3836                 return x86::cvtss2si(cast);
3837
3838         //      return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
3839         }
3840
3841         Type *Int::getType()
3842         {
3843                 return T(llvm::Type::getInt32Ty(*::context));
3844         }
3845
3846         Long::Long(RValue<Int> cast)
3847         {
3848                 Value *integer = Nucleus::createSExt(cast.value, Long::getType());
3849
3850                 storeValue(integer);
3851         }
3852
3853         Long::Long(RValue<UInt> cast)
3854         {
3855                 Value *integer = Nucleus::createZExt(cast.value, Long::getType());
3856
3857                 storeValue(integer);
3858         }
3859
3860         Long::Long(RValue<Long> rhs)
3861         {
3862                 storeValue(rhs.value);
3863         }
3864
3865         RValue<Long> Long::operator=(int64_t rhs)
3866         {
3867                 return RValue<Long>(storeValue(Nucleus::createConstantLong(rhs)));
3868         }
3869
3870         RValue<Long> Long::operator=(RValue<Long> rhs)
3871         {
3872                 storeValue(rhs.value);
3873
3874                 return rhs;
3875         }
3876
3877         RValue<Long> Long::operator=(const Long &rhs)
3878         {
3879                 Value *value = rhs.loadValue();
3880                 storeValue(value);
3881
3882                 return RValue<Long>(value);
3883         }
3884
3885         RValue<Long> Long::operator=(const Reference<Long> &rhs)
3886         {
3887                 Value *value = rhs.loadValue();
3888                 storeValue(value);
3889
3890                 return RValue<Long>(value);
3891         }
3892
3893         RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs)
3894         {
3895                 return RValue<Long>(Nucleus::createAdd(lhs.value, rhs.value));
3896         }
3897
3898         RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs)
3899         {
3900                 return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
3901         }
3902
3903         RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
3904         {
3905                 return lhs = lhs + rhs;
3906         }
3907
3908         RValue<Long> operator-=(Long &lhs, RValue<Long> rhs)
3909         {
3910                 return lhs = lhs - rhs;
3911         }
3912
3913         RValue<Long> AddAtomic(RValue<Pointer<Long> > x, RValue<Long> y)
3914         {
3915                 return RValue<Long>(Nucleus::createAtomicAdd(x.value, y.value));
3916         }
3917
3918         Type *Long::getType()
3919         {
3920                 return T(llvm::Type::getInt64Ty(*::context));
3921         }
3922
3923         UInt::UInt(Argument<UInt> argument)
3924         {
3925                 storeValue(argument.value);
3926         }
3927
3928         UInt::UInt(RValue<UShort> cast)
3929         {
3930                 Value *integer = Nucleus::createZExt(cast.value, UInt::getType());
3931
3932                 storeValue(integer);
3933         }
3934
3935         UInt::UInt(RValue<Long> cast)
3936         {
3937                 Value *integer = Nucleus::createTrunc(cast.value, UInt::getType());
3938
3939                 storeValue(integer);
3940         }
3941
3942         UInt::UInt(RValue<Float> cast)
3943         {
3944                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
3945                 // Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
3946
3947                 // Smallest positive value representable in UInt, but not in Int
3948                 const unsigned int ustart = 0x80000000u;
3949                 const float ustartf = float(ustart);
3950
3951                 // If the value is negative, store 0, otherwise store the result of the conversion
3952                 storeValue((~(As<Int>(cast) >> 31) &
3953                 // Check if the value can be represented as an Int
3954                         IfThenElse(cast >= ustartf,
3955                 // If the value is too large, subtract ustart and re-add it after conversion.
3956                                 As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
3957                 // Otherwise, just convert normally
3958                                 Int(cast))).value);
3959         }
3960
3961         UInt::UInt(int x)
3962         {
3963                 storeValue(Nucleus::createConstantInt(x));
3964         }
3965
3966         UInt::UInt(unsigned int x)
3967         {
3968                 storeValue(Nucleus::createConstantInt(x));
3969         }
3970
3971         UInt::UInt(RValue<UInt> rhs)
3972         {
3973                 storeValue(rhs.value);
3974         }
3975
3976         UInt::UInt(RValue<Int> rhs)
3977         {
3978                 storeValue(rhs.value);
3979         }
3980
3981         UInt::UInt(const UInt &rhs)
3982         {
3983                 Value *value = rhs.loadValue();
3984                 storeValue(value);
3985         }
3986
3987         UInt::UInt(const Reference<UInt> &rhs)
3988         {
3989                 Value *value = rhs.loadValue();
3990                 storeValue(value);
3991         }
3992
3993         UInt::UInt(const Int &rhs)
3994         {
3995                 Value *value = rhs.loadValue();
3996                 storeValue(value);
3997         }
3998
3999         UInt::UInt(const Reference<Int> &rhs)
4000         {
4001                 Value *value = rhs.loadValue();
4002                 storeValue(value);
4003         }
4004
4005         RValue<UInt> UInt::operator=(unsigned int rhs)
4006         {
4007                 return RValue<UInt>(storeValue(Nucleus::createConstantInt(rhs)));
4008         }
4009
4010         RValue<UInt> UInt::operator=(RValue<UInt> rhs)
4011         {
4012                 storeValue(rhs.value);
4013
4014                 return rhs;
4015         }
4016
4017         RValue<UInt> UInt::operator=(RValue<Int> rhs)
4018         {
4019                 storeValue(rhs.value);
4020
4021                 return RValue<UInt>(rhs);
4022         }
4023
4024         RValue<UInt> UInt::operator=(const UInt &rhs)
4025         {
4026                 Value *value = rhs.loadValue();
4027                 storeValue(value);
4028
4029                 return RValue<UInt>(value);
4030         }
4031
4032         RValue<UInt> UInt::operator=(const Reference<UInt> &rhs)
4033         {
4034                 Value *value = rhs.loadValue();
4035                 storeValue(value);
4036
4037                 return RValue<UInt>(value);
4038         }
4039
4040         RValue<UInt> UInt::operator=(const Int &rhs)
4041         {
4042                 Value *value = rhs.loadValue();
4043                 storeValue(value);
4044
4045                 return RValue<UInt>(value);
4046         }
4047
4048         RValue<UInt> UInt::operator=(const Reference<Int> &rhs)
4049         {
4050                 Value *value = rhs.loadValue();
4051                 storeValue(value);
4052
4053                 return RValue<UInt>(value);
4054         }
4055
4056         RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs)
4057         {
4058                 return RValue<UInt>(Nucleus::createAdd(lhs.value, rhs.value));
4059         }
4060
4061         RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs)
4062         {
4063                 return RValue<UInt>(Nucleus::createSub(lhs.value, rhs.value));
4064         }
4065
4066         RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs)
4067         {
4068                 return RValue<UInt>(Nucleus::createMul(lhs.value, rhs.value));
4069         }
4070
4071         RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs)
4072         {
4073                 return RValue<UInt>(Nucleus::createUDiv(lhs.value, rhs.value));
4074         }
4075
4076         RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs)
4077         {
4078                 return RValue<UInt>(Nucleus::createURem(lhs.value, rhs.value));
4079         }
4080
4081         RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs)
4082         {
4083                 return RValue<UInt>(Nucleus::createAnd(lhs.value, rhs.value));
4084         }
4085
4086         RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs)
4087         {
4088                 return RValue<UInt>(Nucleus::createOr(lhs.value, rhs.value));
4089         }
4090
4091         RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs)
4092         {
4093                 return RValue<UInt>(Nucleus::createXor(lhs.value, rhs.value));
4094         }
4095
4096         RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs)
4097         {
4098                 return RValue<UInt>(Nucleus::createShl(lhs.value, rhs.value));
4099         }
4100
4101         RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs)
4102         {
4103                 return RValue<UInt>(Nucleus::createLShr(lhs.value, rhs.value));
4104         }
4105
4106         RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs)
4107         {
4108                 return lhs = lhs + rhs;
4109         }
4110
4111         RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs)
4112         {
4113                 return lhs = lhs - rhs;
4114         }
4115
4116         RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs)
4117         {
4118                 return lhs = lhs * rhs;
4119         }
4120
4121         RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs)
4122         {
4123                 return lhs = lhs / rhs;
4124         }
4125
4126         RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs)
4127         {
4128                 return lhs = lhs % rhs;
4129         }
4130
4131         RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs)
4132         {
4133                 return lhs = lhs & rhs;
4134         }
4135
4136         RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs)
4137         {
4138                 return lhs = lhs | rhs;
4139         }
4140
4141         RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs)
4142         {
4143                 return lhs = lhs ^ rhs;
4144         }
4145
4146         RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs)
4147         {
4148                 return lhs = lhs << rhs;
4149         }
4150
4151         RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs)
4152         {
4153                 return lhs = lhs >> rhs;
4154         }
4155
4156         RValue<UInt> operator+(RValue<UInt> val)
4157         {
4158                 return val;
4159         }
4160
4161         RValue<UInt> operator-(RValue<UInt> val)
4162         {
4163                 return RValue<UInt>(Nucleus::createNeg(val.value));
4164         }
4165
4166         RValue<UInt> operator~(RValue<UInt> val)
4167         {
4168                 return RValue<UInt>(Nucleus::createNot(val.value));
4169         }
4170
4171         RValue<UInt> operator++(UInt &val, int)   // Post-increment
4172         {
4173                 RValue<UInt> res = val;
4174
4175                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantInt(1)));
4176                 val.storeValue(inc);
4177
4178                 return res;
4179         }
4180
4181         const UInt &operator++(UInt &val)   // Pre-increment
4182         {
4183                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantInt(1)));
4184                 val.storeValue(inc);
4185
4186                 return val;
4187         }
4188
4189         RValue<UInt> operator--(UInt &val, int)   // Post-decrement
4190         {
4191                 RValue<UInt> res = val;
4192
4193                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantInt(1)));
4194                 val.storeValue(inc);
4195
4196                 return res;
4197         }
4198
4199         const UInt &operator--(UInt &val)   // Pre-decrement
4200         {
4201                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantInt(1)));
4202                 val.storeValue(inc);
4203
4204                 return val;
4205         }
4206
4207         RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y)
4208         {
4209                 return IfThenElse(x > y, x, y);
4210         }
4211
4212         RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y)
4213         {
4214                 return IfThenElse(x < y, x, y);
4215         }
4216
4217         RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max)
4218         {
4219                 return Min(Max(x, min), max);
4220         }
4221
4222         RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs)
4223         {
4224                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
4225         }
4226
4227         RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs)
4228         {
4229                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
4230         }
4231
4232         RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs)
4233         {
4234                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
4235         }
4236
4237         RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs)
4238         {
4239                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
4240         }
4241
4242         RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs)
4243         {
4244                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
4245         }
4246
4247         RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs)
4248         {
4249                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
4250         }
4251
4252 //      RValue<UInt> RoundUInt(RValue<Float> cast)
4253 //      {
4254 //              return x86::cvtss2si(val);   // FIXME: Unsigned
4255 //
4256 //      //      return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
4257 //      }
4258
4259         Type *UInt::getType()
4260         {
4261                 return T(llvm::Type::getInt32Ty(*::context));
4262         }
4263
4264 //      Int2::Int2(RValue<Int> cast)
4265 //      {
4266 //              Value *extend = Nucleus::createZExt(cast.value, Long::getType());
4267 //              Value *vector = Nucleus::createBitCast(extend, Int2::getType());
4268 //
4269 //              int shuffle[2] = {0, 0};
4270 //              Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
4271 //
4272 //              storeValue(replicate);
4273 //      }
4274
4275         Int2::Int2(RValue<Int4> cast)
4276         {
4277                 Value *long2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
4278                 Value *element = Nucleus::createExtractElement(long2, Long::getType(), 0);
4279                 Value *int2 = Nucleus::createBitCast(element, Int2::getType());
4280
4281                 storeValue(int2);
4282         }
4283
4284         Int2::Int2(int x, int y)
4285         {
4286                 int64_t constantVector[2] = {x, y};
4287                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Int::getType(), 2))));
4288
4289                 storeValue(Nucleus::createBitCast(vector, getType()));
4290         }
4291
4292         Int2::Int2(RValue<Int2> rhs)
4293         {
4294                 storeValue(rhs.value);
4295         }
4296
4297         Int2::Int2(const Int2 &rhs)
4298         {
4299                 Value *value = rhs.loadValue();
4300                 storeValue(value);
4301         }
4302
4303         Int2::Int2(const Reference<Int2> &rhs)
4304         {
4305                 Value *value = rhs.loadValue();
4306                 storeValue(value);
4307         }
4308
4309         Int2::Int2(RValue<Int> lo, RValue<Int> hi)
4310         {
4311                 if(CPUID::supportsMMX2())
4312                 {
4313                         // movd mm0, lo
4314                         // movd mm1, hi
4315                         // punpckldq mm0, mm1
4316
4317                         Value *loLong = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), lo.value, 0);
4318                         loLong = Nucleus::createInsertElement(loLong, V(ConstantInt::get(Int::getType(), 0)), 1);
4319                         Value *hiLong = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), hi.value, 0);
4320                         hiLong = Nucleus::createInsertElement(hiLong, V(ConstantInt::get(Int::getType(), 0)), 1);
4321
4322                         storeValue(As<Int2>(UnpackLow(As<Int2>(loLong), As<Int2>(hiLong))).value);
4323                 }
4324                 else
4325                 {
4326                         int shuffle[2] = {0, 1};
4327                         Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, T(VectorType::get(Int::getType(), 1))), Nucleus::createBitCast(hi.value, T(VectorType::get(Int::getType(), 1))), shuffle);
4328
4329                         storeValue(Nucleus::createBitCast(packed, Int2::getType()));
4330                 }
4331         }
4332
4333         RValue<Int2> Int2::operator=(RValue<Int2> rhs)
4334         {
4335                 storeValue(rhs.value);
4336
4337                 return rhs;
4338         }
4339
4340         RValue<Int2> Int2::operator=(const Int2 &rhs)
4341         {
4342                 Value *value = rhs.loadValue();
4343                 storeValue(value);
4344
4345                 return RValue<Int2>(value);
4346         }
4347
4348         RValue<Int2> Int2::operator=(const Reference<Int2> &rhs)
4349         {
4350                 Value *value = rhs.loadValue();
4351                 storeValue(value);
4352
4353                 return RValue<Int2>(value);
4354         }
4355
4356         RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs)
4357         {
4358                 if(CPUID::supportsMMX2())
4359                 {
4360                         return x86::paddd(lhs, rhs);
4361                 }
4362                 else
4363                 {
4364                         return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
4365                 }
4366         }
4367
4368         RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs)
4369         {
4370                 if(CPUID::supportsMMX2())
4371                 {
4372                         return x86::psubd(lhs, rhs);
4373                 }
4374                 else
4375                 {
4376                         return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
4377                 }
4378         }
4379
4380 //      RValue<Int2> operator*(RValue<Int2> lhs, RValue<Int2> rhs)
4381 //      {
4382 //              return RValue<Int2>(Nucleus::createMul(lhs.value, rhs.value));
4383 //      }
4384
4385 //      RValue<Int2> operator/(RValue<Int2> lhs, RValue<Int2> rhs)
4386 //      {
4387 //              return RValue<Int2>(Nucleus::createSDiv(lhs.value, rhs.value));
4388 //      }
4389
4390 //      RValue<Int2> operator%(RValue<Int2> lhs, RValue<Int2> rhs)
4391 //      {
4392 //              return RValue<Int2>(Nucleus::createSRem(lhs.value, rhs.value));
4393 //      }
4394
4395         RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs)
4396         {
4397                 if(CPUID::supportsMMX2())
4398                 {
4399                         return As<Int2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
4400                 }
4401                 else
4402                 {
4403                         return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
4404                 }
4405         }
4406
4407         RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs)
4408         {
4409                 if(CPUID::supportsMMX2())
4410                 {
4411                         return As<Int2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
4412                 }
4413                 else
4414                 {
4415                         return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
4416                 }
4417         }
4418
4419         RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs)
4420         {
4421                 if(CPUID::supportsMMX2())
4422                 {
4423                         return As<Int2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
4424                 }
4425                 else
4426                 {
4427                         return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
4428                 }
4429         }
4430
4431         RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
4432         {
4433         //      return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
4434
4435                 return x86::pslld(lhs, rhs);
4436         }
4437
4438         RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
4439         {
4440         //      return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
4441
4442                 return x86::psrad(lhs, rhs);
4443         }
4444
4445         RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs)
4446         {
4447                 return lhs = lhs + rhs;
4448         }
4449
4450         RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs)
4451         {
4452                 return lhs = lhs - rhs;
4453         }
4454
4455 //      RValue<Int2> operator*=(Int2 &lhs, RValue<Int2> rhs)
4456 //      {
4457 //              return lhs = lhs * rhs;
4458 //      }
4459
4460 //      RValue<Int2> operator/=(Int2 &lhs, RValue<Int2> rhs)
4461 //      {
4462 //              return lhs = lhs / rhs;
4463 //      }
4464
4465 //      RValue<Int2> operator%=(Int2 &lhs, RValue<Int2> rhs)
4466 //      {
4467 //              return lhs = lhs % rhs;
4468 //      }
4469
4470         RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs)
4471         {
4472                 return lhs = lhs & rhs;
4473         }
4474
4475         RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs)
4476         {
4477                 return lhs = lhs | rhs;
4478         }
4479
4480         RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs)
4481         {
4482                 return lhs = lhs ^ rhs;
4483         }
4484
4485         RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs)
4486         {
4487                 return lhs = lhs << rhs;
4488         }
4489
4490         RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs)
4491         {
4492                 return lhs = lhs >> rhs;
4493         }
4494
4495 //      RValue<Int2> operator+(RValue<Int2> val)
4496 //      {
4497 //              return val;
4498 //      }
4499
4500 //      RValue<Int2> operator-(RValue<Int2> val)
4501 //      {
4502 //              return RValue<Int2>(Nucleus::createNeg(val.value));
4503 //      }
4504
4505         RValue<Int2> operator~(RValue<Int2> val)
4506         {
4507                 if(CPUID::supportsMMX2())
4508                 {
4509                         return val ^ Int2(0xFFFFFFFF, 0xFFFFFFFF);
4510                 }
4511                 else
4512                 {
4513                         return RValue<Int2>(Nucleus::createNot(val.value));
4514                 }
4515         }
4516
4517         RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y)
4518         {
4519                 if(CPUID::supportsMMX2())
4520                 {
4521                         return x86::punpckldq(x, y);
4522                 }
4523                 else
4524                 {
4525                         int shuffle[2] = {0, 2};
4526                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
4527
4528                         return As<Short4>(packed);
4529                 }
4530         }
4531
4532         RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
4533         {
4534                 if(CPUID::supportsMMX2())
4535                 {
4536                         return x86::punpckhdq(x, y);
4537                 }
4538                 else
4539                 {
4540                         int shuffle[2] = {1, 3};
4541                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
4542
4543                         return As<Short4>(packed);
4544                 }
4545         }
4546
4547         RValue<Int> Extract(RValue<Int2> val, int i)
4548         {
4549                 if(false)   // FIXME: LLVM does not generate optimal code
4550                 {
4551                         return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
4552                 }
4553                 else
4554                 {
4555                         if(i == 0)
4556                         {
4557                                 return RValue<Int>(Nucleus::createExtractElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), Int::getType(), 0));
4558                         }
4559                         else
4560                         {
4561                                 Int2 val2 = As<Int2>(UnpackHigh(val, val));
4562
4563                                 return Extract(val2, 0);
4564                         }
4565                 }
4566         }
4567
4568         RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i)
4569         {
4570                 return RValue<Int2>(Nucleus::createBitCast(Nucleus::createInsertElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), element.value, i), Int2::getType()));
4571         }
4572
4573         Type *Int2::getType()
4574         {
4575                 if(CPUID::supportsMMX2())
4576                 {
4577                         return MMX::getType();
4578                 }
4579                 else
4580                 {
4581                         return T(VectorType::get(Int::getType(), 2));
4582                 }
4583         }
4584
4585         UInt2::UInt2(unsigned int x, unsigned int y)
4586         {
4587                 int64_t constantVector[2] = {x, y};
4588                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UInt::getType(), 2))));
4589
4590                 storeValue(Nucleus::createBitCast(vector, getType()));
4591         }
4592
4593         UInt2::UInt2(RValue<UInt2> rhs)
4594         {
4595                 storeValue(rhs.value);
4596         }
4597
4598         UInt2::UInt2(const UInt2 &rhs)
4599         {
4600                 Value *value = rhs.loadValue();
4601                 storeValue(value);
4602         }
4603
4604         UInt2::UInt2(const Reference<UInt2> &rhs)
4605         {
4606                 Value *value = rhs.loadValue();
4607                 storeValue(value);
4608         }
4609
4610         RValue<UInt2> UInt2::operator=(RValue<UInt2> rhs)
4611         {
4612                 storeValue(rhs.value);
4613
4614                 return rhs;
4615         }
4616
4617         RValue<UInt2> UInt2::operator=(const UInt2 &rhs)
4618         {
4619                 Value *value = rhs.loadValue();
4620                 storeValue(value);
4621
4622                 return RValue<UInt2>(value);
4623         }
4624
4625         RValue<UInt2> UInt2::operator=(const Reference<UInt2> &rhs)
4626         {
4627                 Value *value = rhs.loadValue();
4628                 storeValue(value);
4629
4630                 return RValue<UInt2>(value);
4631         }
4632
4633         RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs)
4634         {
4635                 if(CPUID::supportsMMX2())
4636                 {
4637                         return As<UInt2>(x86::paddd(As<Int2>(lhs), As<Int2>(rhs)));
4638                 }
4639                 else
4640                 {
4641                         return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
4642                 }
4643         }
4644
4645         RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs)
4646         {
4647                 if(CPUID::supportsMMX2())
4648                 {
4649                         return As<UInt2>(x86::psubd(As<Int2>(lhs), As<Int2>(rhs)));
4650                 }
4651                 else
4652                 {
4653                         return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
4654                 }
4655         }
4656
4657 //      RValue<UInt2> operator*(RValue<UInt2> lhs, RValue<UInt2> rhs)
4658 //      {
4659 //              return RValue<UInt2>(Nucleus::createMul(lhs.value, rhs.value));
4660 //      }
4661
4662 //      RValue<UInt2> operator/(RValue<UInt2> lhs, RValue<UInt2> rhs)
4663 //      {
4664 //              return RValue<UInt2>(Nucleus::createUDiv(lhs.value, rhs.value));
4665 //      }
4666
4667 //      RValue<UInt2> operator%(RValue<UInt2> lhs, RValue<UInt2> rhs)
4668 //      {
4669 //              return RValue<UInt2>(Nucleus::createURem(lhs.value, rhs.value));
4670 //      }
4671
4672         RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs)
4673         {
4674                 if(CPUID::supportsMMX2())
4675                 {
4676                         return As<UInt2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
4677                 }
4678                 else
4679                 {
4680                         return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
4681                 }
4682         }
4683
4684         RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs)
4685         {
4686                 if(CPUID::supportsMMX2())
4687                 {
4688                         return As<UInt2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
4689                 }
4690                 else
4691                 {
4692                         return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
4693                 }
4694         }
4695
4696         RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs)
4697         {
4698                 if(CPUID::supportsMMX2())
4699                 {
4700                         return As<UInt2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
4701                 }
4702                 else
4703                 {
4704                         return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
4705                 }
4706         }
4707
4708         RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
4709         {
4710         //      return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
4711
4712                 return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
4713         }
4714
4715         RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
4716         {
4717         //      return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
4718
4719                 return x86::psrld(lhs, rhs);
4720         }
4721
4722         RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs)
4723         {
4724                 return lhs = lhs + rhs;
4725         }
4726
4727         RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs)
4728         {
4729                 return lhs = lhs - rhs;
4730         }
4731
4732 //      RValue<UInt2> operator*=(UInt2 &lhs, RValue<UInt2> rhs)
4733 //      {
4734 //              return lhs = lhs * rhs;
4735 //      }
4736
4737 //      RValue<UInt2> operator/=(UInt2 &lhs, RValue<UInt2> rhs)
4738 //      {
4739 //              return lhs = lhs / rhs;
4740 //      }
4741
4742 //      RValue<UInt2> operator%=(UInt2 &lhs, RValue<UInt2> rhs)
4743 //      {
4744 //              return lhs = lhs % rhs;
4745 //      }
4746
4747         RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs)
4748         {
4749                 return lhs = lhs & rhs;
4750         }
4751
4752         RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs)
4753         {
4754                 return lhs = lhs | rhs;
4755         }
4756
4757         RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs)
4758         {
4759                 return lhs = lhs ^ rhs;
4760         }
4761
4762         RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs)
4763         {
4764                 return lhs = lhs << rhs;
4765         }
4766
4767         RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs)
4768         {
4769                 return lhs = lhs >> rhs;
4770         }
4771
4772 //      RValue<UInt2> operator+(RValue<UInt2> val)
4773 //      {
4774 //              return val;
4775 //      }
4776
4777 //      RValue<UInt2> operator-(RValue<UInt2> val)
4778 //      {
4779 //              return RValue<UInt2>(Nucleus::createNeg(val.value));
4780 //      }
4781
4782         RValue<UInt2> operator~(RValue<UInt2> val)
4783         {
4784                 if(CPUID::supportsMMX2())
4785                 {
4786                         return val ^ UInt2(0xFFFFFFFF, 0xFFFFFFFF);
4787                 }
4788                 else
4789                 {
4790                         return RValue<UInt2>(Nucleus::createNot(val.value));
4791                 }
4792         }
4793
4794         Type *UInt2::getType()
4795         {
4796                 if(CPUID::supportsMMX2())
4797                 {
4798                         return MMX::getType();
4799                 }
4800                 else
4801                 {
4802                         return T(VectorType::get(UInt::getType(), 2));
4803                 }
4804         }
4805
4806         Int4::Int4(RValue<Byte4> cast)
4807         {
4808                 Value *x = Nucleus::createBitCast(cast.value, Int::getType());
4809                 Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
4810
4811                 Value *e;
4812
4813                 if (CPUID::supportsSSE4_1())
4814                 {
4815                         e = x86::pmovzxbd(RValue<Int4>(a)).value;
4816                 }
4817                 else
4818                 {
4819                         int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
4820                         Value *b = Nucleus::createBitCast(a, Byte16::getType());
4821                         Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
4822
4823                         int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4824                         Value *d = Nucleus::createBitCast(c, Short8::getType());
4825                         e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
4826                 }
4827
4828                 Value *f = Nucleus::createBitCast(e, Int4::getType());
4829                 storeValue(f);
4830         }
4831
4832         Int4::Int4(RValue<SByte4> cast)
4833         {
4834                 Value *x = Nucleus::createBitCast(cast.value, Int::getType());
4835                 Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
4836
4837                 Value *g;
4838
4839                 if (CPUID::supportsSSE4_1())
4840                 {
4841                         g = x86::pmovsxbd(RValue<Int4>(a)).value;
4842                 }
4843                 else
4844                 {
4845                         int     swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
4846                         Value *b = Nucleus::createBitCast(a, Byte16::getType());
4847                         Value *c = Nucleus::createShuffleVector(b, b, swizzle);
4848
4849                         int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
4850                         Value *d = Nucleus::createBitCast(c, Short8::getType());
4851                         Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
4852
4853                         Value *f = Nucleus::createBitCast(e, Int4::getType());
4854                         //      g = Nucleus::createAShr(f, Nucleus::createConstantInt(24));
4855                         g = x86::psrad(RValue<Int4>(f), 24).value;
4856                 }
4857
4858                 storeValue(g);
4859         }
4860
4861         Int4::Int4(RValue<Float4> cast)
4862         {
4863                 Value *xyzw = Nucleus::createFPToSI(cast.value, Int4::getType());
4864
4865                 storeValue(xyzw);
4866         }
4867
4868         Int4::Int4(RValue<Short4> cast)
4869         {
4870                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
4871                 Value *element = Nucleus::createBitCast(cast.value, Long::getType());
4872                 long2 = Nucleus::createInsertElement(long2, element, 0);
4873                 RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
4874
4875                 if(CPUID::supportsSSE4_1())
4876                 {
4877                         storeValue(x86::pmovsxwd(vector).value);
4878                 }
4879                 else
4880                 {
4881                         Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
4882
4883                         int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
4884                         Value *c = Nucleus::createShuffleVector(b, b, swizzle);
4885                         Value *d = Nucleus::createBitCast(c, Int4::getType());
4886                         storeValue(d);
4887
4888                         // Each Short is packed into each Int in the (Short | Short) format.
4889                         // Shifting by 16 will retrieve the original Short value.
4890                         // Shifting an Int will propagate the sign bit, which will work
4891                         // for both positive and negative values of a Short.
4892                         *this >>= 16;
4893                 }
4894         }
4895
4896         Int4::Int4(RValue<UShort4> cast)
4897         {
4898                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
4899                 Value *element = Nucleus::createBitCast(cast.value, Long::getType());
4900                 long2 = Nucleus::createInsertElement(long2, element, 0);
4901                 RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
4902
4903                 if(CPUID::supportsSSE4_1())
4904                 {
4905                         storeValue(x86::pmovzxwd(RValue<Int4>(vector)).value);
4906                 }
4907                 else
4908                 {
4909                         Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
4910
4911                         int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4912                         Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Short8::getType())), swizzle);
4913                         Value *d = Nucleus::createBitCast(c, Int4::getType());
4914                         storeValue(d);
4915                 }
4916         }
4917
4918         Int4::Int4(int xyzw)
4919         {
4920                 constant(xyzw, xyzw, xyzw, xyzw);
4921         }
4922
4923         Int4::Int4(int x, int yzw)
4924         {
4925                 constant(x, yzw, yzw, yzw);
4926         }
4927
4928         Int4::Int4(int x, int y, int zw)
4929         {
4930                 constant(x, y, zw, zw);
4931         }
4932
4933         Int4::Int4(int x, int y, int z, int w)
4934         {
4935                 constant(x, y, z, w);
4936         }
4937
4938         void Int4::constant(int x, int y, int z, int w)
4939         {
4940                 int64_t constantVector[4] = {x, y, z, w};
4941                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
4942         }
4943
4944         Int4::Int4(RValue<Int4> rhs)
4945         {
4946                 storeValue(rhs.value);
4947         }
4948
4949         Int4::Int4(const Int4 &rhs)
4950         {
4951                 Value *value = rhs.loadValue();
4952                 storeValue(value);
4953         }
4954
4955         Int4::Int4(const Reference<Int4> &rhs)
4956         {
4957                 Value *value = rhs.loadValue();
4958                 storeValue(value);
4959         }
4960
4961         Int4::Int4(RValue<UInt4> rhs)
4962         {
4963                 storeValue(rhs.value);
4964         }
4965
4966         Int4::Int4(const UInt4 &rhs)
4967         {
4968                 Value *value = rhs.loadValue();
4969                 storeValue(value);
4970         }
4971
4972         Int4::Int4(const Reference<UInt4> &rhs)
4973         {
4974                 Value *value = rhs.loadValue();
4975                 storeValue(value);
4976         }
4977
4978         Int4::Int4(RValue<Int2> lo, RValue<Int2> hi)
4979         {
4980                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
4981                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
4982
4983                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
4984                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
4985                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
4986                 Value *int4 = Nucleus::createBitCast(long2, Int4::getType());
4987
4988                 storeValue(int4);
4989         }
4990
4991         Int4::Int4(RValue<Int> rhs)
4992         {
4993                 Value *vector = loadValue();
4994                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
4995
4996                 int swizzle[4] = {0, 0, 0, 0};
4997                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
4998
4999                 storeValue(replicate);
5000         }
5001
5002         Int4::Int4(const Int &rhs)
5003         {
5004                 *this = RValue<Int>(rhs.loadValue());
5005         }
5006
5007         Int4::Int4(const Reference<Int> &rhs)
5008         {
5009                 *this = RValue<Int>(rhs.loadValue());
5010         }
5011
5012         RValue<Int4> Int4::operator=(RValue<Int4> rhs)
5013         {
5014                 storeValue(rhs.value);
5015
5016                 return rhs;
5017         }
5018
5019         RValue<Int4> Int4::operator=(const Int4 &rhs)
5020         {
5021                 Value *value = rhs.loadValue();
5022                 storeValue(value);
5023
5024                 return RValue<Int4>(value);
5025         }
5026
5027         RValue<Int4> Int4::operator=(const Reference<Int4> &rhs)
5028         {
5029                 Value *value = rhs.loadValue();
5030                 storeValue(value);
5031
5032                 return RValue<Int4>(value);
5033         }
5034
5035         RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs)
5036         {
5037                 return RValue<Int4>(Nucleus::createAdd(lhs.value, rhs.value));
5038         }
5039
5040         RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs)
5041         {
5042                 return RValue<Int4>(Nucleus::createSub(lhs.value, rhs.value));
5043         }
5044
5045         RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs)
5046         {
5047                 return RValue<Int4>(Nucleus::createMul(lhs.value, rhs.value));
5048         }
5049
5050         RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs)
5051         {
5052                 return RValue<Int4>(Nucleus::createSDiv(lhs.value, rhs.value));
5053         }
5054
5055         RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs)
5056         {
5057                 return RValue<Int4>(Nucleus::createSRem(lhs.value, rhs.value));
5058         }
5059
5060         RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs)
5061         {
5062                 return RValue<Int4>(Nucleus::createAnd(lhs.value, rhs.value));
5063         }
5064
5065         RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs)
5066         {
5067                 return RValue<Int4>(Nucleus::createOr(lhs.value, rhs.value));
5068         }
5069
5070         RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs)
5071         {
5072                 return RValue<Int4>(Nucleus::createXor(lhs.value, rhs.value));
5073         }
5074
5075         RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
5076         {
5077                 return x86::pslld(lhs, rhs);
5078         }
5079
5080         RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
5081         {
5082                 return x86::psrad(lhs, rhs);
5083         }
5084
5085         RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
5086         {
5087                 return RValue<Int4>(Nucleus::createShl(lhs.value, rhs.value));
5088         }
5089
5090         RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs)
5091         {
5092                 return RValue<Int4>(Nucleus::createAShr(lhs.value, rhs.value));
5093         }
5094
5095         RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs)
5096         {
5097                 return lhs = lhs + rhs;
5098         }
5099
5100         RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs)
5101         {
5102                 return lhs = lhs - rhs;
5103         }
5104
5105         RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs)
5106         {
5107                 return lhs = lhs * rhs;
5108         }
5109
5110 //      RValue<Int4> operator/=(Int4 &lhs, RValue<Int4> rhs)
5111 //      {
5112 //              return lhs = lhs / rhs;
5113 //      }
5114
5115 //      RValue<Int4> operator%=(Int4 &lhs, RValue<Int4> rhs)
5116 //      {
5117 //              return lhs = lhs % rhs;
5118 //      }
5119
5120         RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs)
5121         {
5122                 return lhs = lhs & rhs;
5123         }
5124
5125         RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs)
5126         {
5127                 return lhs = lhs | rhs;
5128         }
5129
5130         RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs)
5131         {
5132                 return lhs = lhs ^ rhs;
5133         }
5134
5135         RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs)
5136         {
5137                 return lhs = lhs << rhs;
5138         }
5139
5140         RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs)
5141         {
5142                 return lhs = lhs >> rhs;
5143         }
5144
5145         RValue<Int4> operator+(RValue<Int4> val)
5146         {
5147                 return val;
5148         }
5149
5150         RValue<Int4> operator-(RValue<Int4> val)
5151         {
5152                 return RValue<Int4>(Nucleus::createNeg(val.value));
5153         }
5154
5155         RValue<Int4> operator~(RValue<Int4> val)
5156         {
5157                 return RValue<Int4>(Nucleus::createNot(val.value));
5158         }
5159
5160         RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
5161         {
5162                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5163                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5164                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
5165                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5166         }
5167
5168         RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
5169         {
5170                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
5171         }
5172
5173         RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
5174         {
5175                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5176                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5177                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
5178                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5179         }
5180
5181         RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
5182         {
5183                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
5184         }
5185
5186         RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
5187         {
5188                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5189                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5190                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
5191                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5192         }
5193
5194         RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
5195         {
5196                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
5197         }
5198
5199         RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
5200         {
5201                 if(CPUID::supportsSSE4_1())
5202                 {
5203                         return x86::pmaxsd(x, y);
5204                 }
5205                 else
5206                 {
5207                         RValue<Int4> greater = CmpNLE(x, y);
5208                         return x & greater | y & ~greater;
5209                 }
5210         }
5211
5212         RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
5213         {
5214                 if(CPUID::supportsSSE4_1())
5215                 {
5216                         return x86::pminsd(x, y);
5217                 }
5218                 else
5219                 {
5220                         RValue<Int4> less = CmpLT(x, y);
5221                         return x & less | y & ~less;
5222                 }
5223         }
5224
5225         RValue<Int4> RoundInt(RValue<Float4> cast)
5226         {
5227                 return x86::cvtps2dq(cast);
5228         }
5229
5230         RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
5231         {
5232                 return x86::packssdw(x, y);
5233         }
5234
5235         RValue<Int> Extract(RValue<Int4> x, int i)
5236         {
5237                 return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
5238         }
5239
5240         RValue<Int4> Insert(RValue<Int4> x, RValue<Int> element, int i)
5241         {
5242                 return RValue<Int4>(Nucleus::createInsertElement(x.value, element.value, i));
5243         }
5244
5245         RValue<Int> SignMask(RValue<Int4> x)
5246         {
5247                 return x86::movmskps(As<Float4>(x));
5248         }
5249
5250         RValue<Int4> Swizzle(RValue<Int4> x, unsigned char select)
5251         {
5252                 return RValue<Int4>(createSwizzle4(x.value, select));
5253         }
5254
5255         Type *Int4::getType()
5256         {
5257                 return T(VectorType::get(Int::getType(), 4));
5258         }
5259
5260         UInt4::UInt4(RValue<Float4> cast)
5261         {
5262                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
5263                 // Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
5264
5265                 // Smallest positive value representable in UInt, but not in Int
5266                 const unsigned int ustart = 0x80000000u;
5267                 const float ustartf = float(ustart);
5268
5269                 // Check if the value can be represented as an Int
5270                 Int4 uiValue = CmpNLT(cast, Float4(ustartf));
5271                 // If the value is too large, subtract ustart and re-add it after conversion.
5272                 uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
5273                 // Otherwise, just convert normally
5274                           (~uiValue & Int4(cast));
5275                 // If the value is negative, store 0, otherwise store the result of the conversion
5276                 storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
5277         }
5278
5279         UInt4::UInt4(int xyzw)
5280         {
5281                 constant(xyzw, xyzw, xyzw, xyzw);
5282         }
5283
5284         UInt4::UInt4(int x, int yzw)
5285         {
5286                 constant(x, yzw, yzw, yzw);
5287         }
5288
5289         UInt4::UInt4(int x, int y, int zw)
5290         {
5291                 constant(x, y, zw, zw);
5292         }
5293
5294         UInt4::UInt4(int x, int y, int z, int w)
5295         {
5296                 constant(x, y, z, w);
5297         }
5298
5299         void UInt4::constant(int x, int y, int z, int w)
5300         {
5301                 int64_t constantVector[4] = {x, y, z, w};
5302                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
5303         }
5304
5305         UInt4::UInt4(RValue<UInt4> rhs)
5306         {
5307                 storeValue(rhs.value);
5308         }
5309
5310         UInt4::UInt4(const UInt4 &rhs)
5311         {
5312                 Value *value = rhs.loadValue();
5313                 storeValue(value);
5314         }
5315
5316         UInt4::UInt4(const Reference<UInt4> &rhs)
5317         {
5318                 Value *value = rhs.loadValue();
5319                 storeValue(value);
5320         }
5321
5322         UInt4::UInt4(RValue<Int4> rhs)
5323         {
5324                 storeValue(rhs.value);
5325         }
5326
5327         UInt4::UInt4(const Int4 &rhs)
5328         {
5329                 Value *value = rhs.loadValue();
5330                 storeValue(value);
5331         }
5332
5333         UInt4::UInt4(const Reference<Int4> &rhs)
5334         {
5335                 Value *value = rhs.loadValue();
5336                 storeValue(value);
5337         }
5338
5339         UInt4::UInt4(RValue<UInt2> lo, RValue<UInt2> hi)
5340         {
5341                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
5342                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
5343
5344                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5345                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
5346                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
5347                 Value *uint4 = Nucleus::createBitCast(long2, Int4::getType());
5348
5349                 storeValue(uint4);
5350         }
5351
5352         RValue<UInt4> UInt4::operator=(RValue<UInt4> rhs)
5353         {
5354                 storeValue(rhs.value);
5355
5356                 return rhs;
5357         }
5358
5359         RValue<UInt4> UInt4::operator=(const UInt4 &rhs)
5360         {
5361                 Value *value = rhs.loadValue();
5362                 storeValue(value);
5363
5364                 return RValue<UInt4>(value);
5365         }
5366
5367         RValue<UInt4> UInt4::operator=(const Reference<UInt4> &rhs)
5368         {
5369                 Value *value = rhs.loadValue();
5370                 storeValue(value);
5371
5372                 return RValue<UInt4>(value);
5373         }
5374
5375         RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs)
5376         {
5377                 return RValue<UInt4>(Nucleus::createAdd(lhs.value, rhs.value));
5378         }
5379
5380         RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs)
5381         {
5382                 return RValue<UInt4>(Nucleus::createSub(lhs.value, rhs.value));
5383         }
5384
5385         RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs)
5386         {
5387                 return RValue<UInt4>(Nucleus::createMul(lhs.value, rhs.value));
5388         }
5389
5390         RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs)
5391         {
5392                 return RValue<UInt4>(Nucleus::createUDiv(lhs.value, rhs.value));
5393         }
5394
5395         RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs)
5396         {
5397                 return RValue<UInt4>(Nucleus::createURem(lhs.value, rhs.value));
5398         }
5399
5400         RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs)
5401         {
5402                 return RValue<UInt4>(Nucleus::createAnd(lhs.value, rhs.value));
5403         }
5404
5405         RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs)
5406         {
5407                 return RValue<UInt4>(Nucleus::createOr(lhs.value, rhs.value));
5408         }
5409
5410         RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs)
5411         {
5412                 return RValue<UInt4>(Nucleus::createXor(lhs.value, rhs.value));
5413         }
5414
5415         RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
5416         {
5417                 return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
5418         }
5419
5420         RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
5421         {
5422                 return x86::psrld(lhs, rhs);
5423         }
5424
5425         RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
5426         {
5427                 return RValue<UInt4>(Nucleus::createShl(lhs.value, rhs.value));
5428         }
5429
5430         RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs)
5431         {
5432                 return RValue<UInt4>(Nucleus::createLShr(lhs.value, rhs.value));
5433         }
5434
5435         RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs)
5436         {
5437                 return lhs = lhs + rhs;
5438         }
5439
5440         RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs)
5441         {
5442                 return lhs = lhs - rhs;
5443         }
5444
5445         RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs)
5446         {
5447                 return lhs = lhs * rhs;
5448         }
5449
5450 //      RValue<UInt4> operator/=(UInt4 &lhs, RValue<UInt4> rhs)
5451 //      {
5452 //              return lhs = lhs / rhs;
5453 //      }
5454
5455 //      RValue<UInt4> operator%=(UInt4 &lhs, RValue<UInt4> rhs)
5456 //      {
5457 //              return lhs = lhs % rhs;
5458 //      }
5459
5460         RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs)
5461         {
5462                 return lhs = lhs & rhs;
5463         }
5464
5465         RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs)
5466         {
5467                 return lhs = lhs | rhs;
5468         }
5469
5470         RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs)
5471         {
5472                 return lhs = lhs ^ rhs;
5473         }
5474
5475         RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs)
5476         {
5477                 return lhs = lhs << rhs;
5478         }
5479
5480         RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs)
5481         {
5482                 return lhs = lhs >> rhs;
5483         }
5484
        // Unary plus: returns the operand unchanged; no instruction is created here.
        RValue<UInt4> operator+(RValue<UInt4> val)
        {
                return val;
        }
5489
5490         RValue<UInt4> operator-(RValue<UInt4> val)
5491         {
5492                 return RValue<UInt4>(Nucleus::createNeg(val.value));
5493         }
5494
5495         RValue<UInt4> operator~(RValue<UInt4> val)
5496         {
5497                 return RValue<UInt4>(Nucleus::createNot(val.value));
5498         }
5499
5500         RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
5501         {
5502                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5503                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5504                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
5505                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5506         }
5507
5508         RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
5509         {
5510                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
5511         }
5512
5513         RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
5514         {
5515                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5516                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5517                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
5518                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5519         }
5520
5521         RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
5522         {
5523                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
5524         }
5525
5526         RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
5527         {
5528                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5529                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5530                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
5531                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5532         }
5533
5534         RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
5535         {
5536                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
5537         }
5538
5539         RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
5540         {
5541                 if(CPUID::supportsSSE4_1())
5542                 {
5543                         return x86::pmaxud(x, y);
5544                 }
5545                 else
5546                 {
5547                         RValue<UInt4> greater = CmpNLE(x, y);
5548                         return x & greater | y & ~greater;
5549                 }
5550         }
5551
5552         RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
5553         {
5554                 if(CPUID::supportsSSE4_1())
5555                 {
5556                         return x86::pminud(x, y);
5557                 }
5558                 else
5559                 {
5560                         RValue<UInt4> less = CmpLT(x, y);
5561                         return x & less | y & ~less;
5562                 }
5563         }
5564
5565         RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
5566         {
5567                 return x86::packusdw(As<Int4>(x), As<Int4>(y));
5568         }
5569
5570         Type *UInt4::getType()
5571         {
5572                 return T(VectorType::get(UInt::getType(), 4));
5573         }
5574
5575         Float::Float(RValue<Int> cast)
5576         {
5577                 Value *integer = Nucleus::createSIToFP(cast.value, Float::getType());
5578
5579                 storeValue(integer);
5580         }
5581
5582         Float::Float(float x)
5583         {
5584                 storeValue(Nucleus::createConstantFloat(x));
5585         }
5586
5587         Float::Float(RValue<Float> rhs)
5588         {
5589                 storeValue(rhs.value);
5590         }
5591
5592         Float::Float(const Float &rhs)
5593         {
5594                 Value *value = rhs.loadValue();
5595                 storeValue(value);
5596         }
5597
5598         Float::Float(const Reference<Float> &rhs)
5599         {
5600                 Value *value = rhs.loadValue();
5601                 storeValue(value);
5602         }
5603
5604         RValue<Float> Float::operator=(RValue<Float> rhs)
5605         {
5606                 storeValue(rhs.value);
5607
5608                 return rhs;
5609         }
5610
5611         RValue<Float> Float::operator=(const Float &rhs)
5612         {
5613                 Value *value = rhs.loadValue();
5614                 storeValue(value);
5615
5616                 return RValue<Float>(value);
5617         }
5618
5619         RValue<Float> Float::operator=(const Reference<Float> &rhs)
5620         {
5621                 Value *value = rhs.loadValue();
5622                 storeValue(value);
5623
5624                 return RValue<Float>(value);
5625         }
5626
5627         RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs)
5628         {
5629                 return RValue<Float>(Nucleus::createFAdd(lhs.value, rhs.value));
5630         }
5631
5632         RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs)
5633         {
5634                 return RValue<Float>(Nucleus::createFSub(lhs.value, rhs.value));
5635         }
5636
5637         RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs)
5638         {
5639                 return RValue<Float>(Nucleus::createFMul(lhs.value, rhs.value));
5640         }
5641
5642         RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs)
5643         {
5644                 return RValue<Float>(Nucleus::createFDiv(lhs.value, rhs.value));
5645         }
5646
5647         RValue<Float> operator+=(Float &lhs, RValue<Float> rhs)
5648         {
5649                 return lhs = lhs + rhs;
5650         }
5651
5652         RValue<Float> operator-=(Float &lhs, RValue<Float> rhs)
5653         {
5654                 return lhs = lhs - rhs;
5655         }
5656
5657         RValue<Float> operator*=(Float &lhs, RValue<Float> rhs)
5658         {
5659                 return lhs = lhs * rhs;
5660         }
5661
5662         RValue<Float> operator/=(Float &lhs, RValue<Float> rhs)
5663         {
5664                 return lhs = lhs / rhs;
5665         }
5666
        // Unary plus: returns the operand unchanged; no instruction is created here.
        RValue<Float> operator+(RValue<Float> val)
        {
                return val;
        }
5671
5672         RValue<Float> operator-(RValue<Float> val)
5673         {
5674                 return RValue<Float>(Nucleus::createFNeg(val.value));
5675         }
5676
5677         RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs)
5678         {
5679                 return RValue<Bool>(Nucleus::createFCmpOLT(lhs.value, rhs.value));
5680         }
5681
5682         RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs)
5683         {
5684                 return RValue<Bool>(Nucleus::createFCmpOLE(lhs.value, rhs.value));
5685         }
5686
5687         RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs)
5688         {
5689                 return RValue<Bool>(Nucleus::createFCmpOGT(lhs.value, rhs.value));
5690         }
5691
5692         RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs)
5693         {
5694                 return RValue<Bool>(Nucleus::createFCmpOGE(lhs.value, rhs.value));
5695         }
5696
5697         RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs)
5698         {
5699                 return RValue<Bool>(Nucleus::createFCmpONE(lhs.value, rhs.value));
5700         }
5701
5702         RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs)
5703         {
5704                 return RValue<Bool>(Nucleus::createFCmpOEQ(lhs.value, rhs.value));
5705         }
5706
5707         RValue<Float> Abs(RValue<Float> x)
5708         {
5709                 return IfThenElse(x > 0.0f, x, -x);
5710         }
5711
5712         RValue<Float> Max(RValue<Float> x, RValue<Float> y)
5713         {
5714                 return IfThenElse(x > y, x, y);
5715         }
5716
5717         RValue<Float> Min(RValue<Float> x, RValue<Float> y)
5718         {
5719                 return IfThenElse(x < y, x, y);
5720         }
5721
5722         RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
5723         {
5724                 if(exactAtPow2)
5725                 {
5726                         // rcpss uses a piecewise-linear approximation which minimizes the relative error
5727                         // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
5728                         return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
5729                 }
5730                 else
5731                 {
5732                         return x86::rcpss(x);
5733                 }
5734         }
5735
5736         RValue<Float> RcpSqrt_pp(RValue<Float> x)
5737         {
5738                 return x86::rsqrtss(x);
5739         }
5740
5741         RValue<Float> Sqrt(RValue<Float> x)
5742         {
5743                 return x86::sqrtss(x);
5744         }
5745
5746         RValue<Float> Round(RValue<Float> x)
5747         {
5748                 if(CPUID::supportsSSE4_1())
5749                 {
5750                         return x86::roundss(x, 0);
5751                 }
5752                 else
5753                 {
5754                         return Float4(Round(Float4(x))).x;
5755                 }
5756         }
5757
5758         RValue<Float> Trunc(RValue<Float> x)
5759         {
5760                 if(CPUID::supportsSSE4_1())
5761                 {
5762                         return x86::roundss(x, 3);
5763                 }
5764                 else
5765                 {
5766                         return Float(Int(x));   // Rounded toward zero
5767                 }
5768         }
5769
5770         RValue<Float> Frac(RValue<Float> x)
5771         {
5772                 if(CPUID::supportsSSE4_1())
5773                 {
5774                         return x - x86::floorss(x);
5775                 }
5776                 else
5777                 {
5778                         return Float4(Frac(Float4(x))).x;
5779                 }
5780         }
5781
5782         RValue<Float> Floor(RValue<Float> x)
5783         {
5784                 if(CPUID::supportsSSE4_1())
5785                 {
5786                         return x86::floorss(x);
5787                 }
5788                 else
5789                 {
5790                         return Float4(Floor(Float4(x))).x;
5791                 }
5792         }
5793
5794         RValue<Float> Ceil(RValue<Float> x)
5795         {
5796                 if(CPUID::supportsSSE4_1())
5797                 {
5798                         return x86::ceilss(x);
5799                 }
5800                 else
5801                 {
5802                         return Float4(Ceil(Float4(x))).x;
5803                 }
5804         }
5805
5806         Type *Float::getType()
5807         {
5808                 return T(llvm::Type::getFloatTy(*::context));
5809         }
5810
5811         Float2::Float2(RValue<Float4> cast)
5812         {
5813                 Value *int64x2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
5814                 Value *int64 = Nucleus::createExtractElement(int64x2, Long::getType(), 0);
5815                 Value *float2 = Nucleus::createBitCast(int64, Float2::getType());
5816
5817                 storeValue(float2);
5818         }
5819
5820         Type *Float2::getType()
5821         {
5822                 return T(VectorType::get(Float::getType(), 2));
5823         }
5824
5825         Float4::Float4(RValue<Byte4> cast) : FloatXYZW(this)
5826         {
5827                 #if 0
5828                         Value *xyzw = Nucleus::createUIToFP(cast.value, Float4::getType());   // FIXME: Crashes
5829                 #elif 0
5830                         Value *vector = loadValue();
5831
5832                         Value *i8x = Nucleus::createExtractElement(cast.value, 0);
5833                         Value *f32x = Nucleus::createUIToFP(i8x, Float::getType());
5834                         Value *x = Nucleus::createInsertElement(vector, f32x, 0);
5835
5836                         Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
5837                         Value *f32y = Nucleus::createUIToFP(i8y, Float::getType());
5838                         Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
5839
5840                         Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
5841                         Value *f32z = Nucleus::createUIToFP(i8z, Float::getType());
5842                         Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
5843
5844                         Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
5845                         Value *f32w = Nucleus::createUIToFP(i8w, Float::getType());
5846                         Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
5847                 #else
5848                         Value *a = Int4(cast).loadValue();
5849                         Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
5850                 #endif
5851
5852                 storeValue(xyzw);
5853         }
5854
5855         Float4::Float4(RValue<SByte4> cast) : FloatXYZW(this)
5856         {
5857                 #if 0
5858                         Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());   // FIXME: Crashes
5859                 #elif 0
5860                         Value *vector = loadValue();
5861
5862                         Value *i8x = Nucleus::createExtractElement(cast.value, 0);
5863                         Value *f32x = Nucleus::createSIToFP(i8x, Float::getType());
5864                         Value *x = Nucleus::createInsertElement(vector, f32x, 0);
5865
5866                         Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
5867                         Value *f32y = Nucleus::createSIToFP(i8y, Float::getType());
5868                         Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
5869
5870                         Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
5871                         Value *f32z = Nucleus::createSIToFP(i8z, Float::getType());
5872                         Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
5873
5874                         Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
5875                         Value *f32w = Nucleus::createSIToFP(i8w, Float::getType());
5876                         Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
5877                 #else
5878                         Value *a = Int4(cast).loadValue();
5879                         Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
5880                 #endif
5881
5882                 storeValue(xyzw);
5883         }
5884
5885         Float4::Float4(RValue<Short4> cast) : FloatXYZW(this)
5886         {
5887                 Int4 c(cast);
5888                 storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
5889         }
5890
5891         Float4::Float4(RValue<UShort4> cast) : FloatXYZW(this)
5892         {
5893                 Int4 c(cast);
5894                 storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
5895         }
5896
5897         Float4::Float4(RValue<Int4> cast) : FloatXYZW(this)
5898         {
5899                 Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());
5900
5901                 storeValue(xyzw);
5902         }
5903
5904         Float4::Float4(RValue<UInt4> cast) : FloatXYZW(this)
5905         {
5906                 RValue<Float4> result = Float4(Int4(cast & UInt4(0x7FFFFFFF))) +
5907                                         As<Float4>((As<Int4>(cast) >> 31) & As<Int4>(Float4(0x80000000u)));
5908
5909                 storeValue(result.value);
5910         }
5911
        // Default constructor: only initializes the FloatXYZW base with this
        // object; the body stores no value.
        Float4::Float4() : FloatXYZW(this)
        {
        }
5915
5916         Float4::Float4(float xyzw) : FloatXYZW(this)
5917         {
5918                 constant(xyzw, xyzw, xyzw, xyzw);
5919         }
5920
5921         Float4::Float4(float x, float yzw) : FloatXYZW(this)
5922         {
5923                 constant(x, yzw, yzw, yzw);
5924         }
5925
5926         Float4::Float4(float x, float y, float zw) : FloatXYZW(this)
5927         {
5928                 constant(x, y, zw, zw);
5929         }
5930
5931         Float4::Float4(float x, float y, float z, float w) : FloatXYZW(this)
5932         {
5933                 constant(x, y, z, w);
5934         }
5935
5936         void Float4::constant(float x, float y, float z, float w)
5937         {
5938                 double constantVector[4] = {x, y, z, w};
5939                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
5940         }
5941
5942         Float4::Float4(RValue<Float4> rhs) : FloatXYZW(this)
5943         {
5944                 storeValue(rhs.value);
5945         }
5946
5947         Float4::Float4(const Float4 &rhs) : FloatXYZW(this)
5948         {
5949                 Value *value = rhs.loadValue();
5950                 storeValue(value);
5951         }
5952
5953         Float4::Float4(const Reference<Float4> &rhs) : FloatXYZW(this)
5954         {
5955                 Value *value = rhs.loadValue();
5956                 storeValue(value);
5957         }
5958
5959         Float4::Float4(RValue<Float> rhs) : FloatXYZW(this)
5960         {
5961                 Value *vector = loadValue();
5962                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
5963
5964                 int swizzle[4] = {0, 0, 0, 0};
5965                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
5966
5967                 storeValue(replicate);
5968         }
5969
5970         Float4::Float4(const Float &rhs) : FloatXYZW(this)
5971         {
5972                 *this = RValue<Float>(rhs.loadValue());
5973         }
5974
5975         Float4::Float4(const Reference<Float> &rhs) : FloatXYZW(this)
5976         {
5977                 *this = RValue<Float>(rhs.loadValue());
5978         }
5979
5980         RValue<Float4> Float4::operator=(float x)
5981         {
5982                 return *this = Float4(x, x, x, x);
5983         }
5984
5985         RValue<Float4> Float4::operator=(RValue<Float4> rhs)
5986         {
5987                 storeValue(rhs.value);
5988
5989                 return rhs;
5990         }
5991
5992         RValue<Float4> Float4::operator=(const Float4 &rhs)
5993         {
5994                 Value *value = rhs.loadValue();
5995                 storeValue(value);
5996
5997                 return RValue<Float4>(value);
5998         }
5999
6000         RValue<Float4> Float4::operator=(const Reference<Float4> &rhs)
6001         {
6002                 Value *value = rhs.loadValue();
6003                 storeValue(value);
6004
6005                 return RValue<Float4>(value);
6006         }
6007
6008         RValue<Float4> Float4::operator=(RValue<Float> rhs)
6009         {
6010                 return *this = Float4(rhs);
6011         }
6012
6013         RValue<Float4> Float4::operator=(const Float &rhs)
6014         {
6015                 return *this = Float4(rhs);
6016         }
6017
6018         RValue<Float4> Float4::operator=(const Reference<Float> &rhs)
6019         {
6020                 return *this = Float4(rhs);
6021         }
6022
6023         RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs)
6024         {
6025                 return RValue<Float4>(Nucleus::createFAdd(lhs.value, rhs.value));
6026         }
6027
6028         RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs)
6029         {
6030                 return RValue<Float4>(Nucleus::createFSub(lhs.value, rhs.value));
6031         }
6032
6033         RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs)
6034         {
6035                 return RValue<Float4>(Nucleus::createFMul(lhs.value, rhs.value));
6036         }
6037
6038         RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs)
6039         {
6040                 return RValue<Float4>(Nucleus::createFDiv(lhs.value, rhs.value));
6041         }
6042
6043         RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
6044         {
6045                 return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
6046         }
6047
6048         RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs)
6049         {
6050                 return lhs = lhs + rhs;
6051         }
6052
6053         RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs)
6054         {
6055                 return lhs = lhs - rhs;
6056         }
6057
6058         RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs)
6059         {
6060                 return lhs = lhs * rhs;
6061         }
6062
6063         RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs)
6064         {
6065                 return lhs = lhs / rhs;
6066         }
6067
6068         RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs)
6069         {
6070                 return lhs = lhs % rhs;
6071         }
6072
        // Unary plus: returns the operand unchanged; no instruction is created here.
        RValue<Float4> operator+(RValue<Float4> val)
        {
                return val;
        }
6077
6078         RValue<Float4> operator-(RValue<Float4> val)
6079         {
6080                 return RValue<Float4>(Nucleus::createFNeg(val.value));
6081         }
6082
6083         RValue<Float4> Abs(RValue<Float4> x)
6084         {
6085                 Value *vector = Nucleus::createBitCast(x.value, Int4::getType());
6086                 int64_t constantVector[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
6087                 Value *result = Nucleus::createAnd(vector, V(Nucleus::createConstantVector(constantVector, Int4::getType())));
6088
6089                 return RValue<Float4>(Nucleus::createBitCast(result, Float4::getType()));
6090         }
6091
6092         RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
6093         {
6094                 return x86::maxps(x, y);
6095         }
6096
6097         RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
6098         {
6099                 return x86::minps(x, y);
6100         }
6101
6102         RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
6103         {
6104                 if(exactAtPow2)
6105                 {
6106                         // rcpps uses a piecewise-linear approximation which minimizes the relative error
6107                         // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
6108                         return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
6109                 }
6110                 else
6111                 {
6112                         return x86::rcpps(x);
6113                 }
6114         }
6115
6116         RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
6117         {
6118                 return x86::rsqrtps(x);
6119         }
6120
6121         RValue<Float4> Sqrt(RValue<Float4> x)
6122         {
6123                 return x86::sqrtps(x);
6124         }
6125
6126         RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i)
6127         {
6128                 return RValue<Float4>(Nucleus::createInsertElement(val.value, element.value, i));
6129         }
6130
6131         RValue<Float> Extract(RValue<Float4> x, int i)
6132         {
6133                 return RValue<Float>(Nucleus::createExtractElement(x.value, Float::getType(), i));
6134         }
6135
6136         RValue<Float4> Swizzle(RValue<Float4> x, unsigned char select)
6137         {
6138                 return RValue<Float4>(createSwizzle4(x.value, select));
6139         }
6140
6141         RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
6142         {
6143                 int shuffle[4] =
6144                 {
6145                         ((imm >> 0) & 0x03) + 0,
6146                         ((imm >> 2) & 0x03) + 0,
6147                         ((imm >> 4) & 0x03) + 4,
6148                         ((imm >> 6) & 0x03) + 4,
6149                 };
6150
6151                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6152         }
6153
6154         RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y)
6155         {
6156                 int shuffle[4] = {0, 4, 1, 5};
6157                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6158         }
6159
6160         RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y)
6161         {
6162                 int shuffle[4] = {2, 6, 3, 7};
6163                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6164         }
6165
6166         RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, unsigned char select)
6167         {
6168                 Value *vector = lhs.loadValue();
6169                 Value *shuffle = createMask4(vector, rhs.value, select);
6170                 lhs.storeValue(shuffle);
6171
6172                 return RValue<Float4>(shuffle);
6173         }
6174
6175         RValue<Int> SignMask(RValue<Float4> x)
6176         {
6177                 return x86::movmskps(x);
6178         }
6179
6180         RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
6181         {
6182         //      return As<Int4>(x86::cmpeqps(x, y));
6183                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
6184         }
6185
6186         RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
6187         {
6188         //      return As<Int4>(x86::cmpltps(x, y));
6189                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
6190         }
6191
6192         RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
6193         {
6194         //      return As<Int4>(x86::cmpleps(x, y));
6195                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
6196         }
6197
6198         RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
6199         {
6200         //      return As<Int4>(x86::cmpneqps(x, y));
6201                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
6202         }
6203
6204         RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
6205         {
6206         //      return As<Int4>(x86::cmpnltps(x, y));
6207                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
6208         }
6209
6210         RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
6211         {
6212         //      return As<Int4>(x86::cmpnleps(x, y));
6213                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
6214         }
6215
6216         RValue<Float4> Round(RValue<Float4> x)
6217         {
6218                 if(CPUID::supportsSSE4_1())
6219                 {
6220                         return x86::roundps(x, 0);
6221                 }
6222                 else
6223                 {
6224                         return Float4(RoundInt(x));
6225                 }
6226         }
6227
6228         RValue<Float4> Trunc(RValue<Float4> x)
6229         {
6230                 if(CPUID::supportsSSE4_1())
6231                 {
6232                         return x86::roundps(x, 3);
6233                 }
6234                 else
6235                 {
6236                         return Float4(Int4(x));   // Rounded toward zero
6237                 }
6238         }
6239
6240         RValue<Float4> Frac(RValue<Float4> x)
6241         {
6242                 if(CPUID::supportsSSE4_1())
6243                 {
6244                         return x - x86::floorps(x);
6245                 }
6246                 else
6247                 {
6248                         Float4 frc = x - Float4(Int4(x));   // Signed fractional part
6249
6250                         return frc + As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));
6251                 }
6252         }
6253
6254         RValue<Float4> Floor(RValue<Float4> x)
6255         {
6256                 if(CPUID::supportsSSE4_1())
6257                 {
6258                         return x86::floorps(x);
6259                 }
6260                 else
6261                 {
6262                         return x - Frac(x);
6263                 }
6264         }
6265
6266         RValue<Float4> Ceil(RValue<Float4> x)
6267         {
6268                 if(CPUID::supportsSSE4_1())
6269                 {
6270                         return x86::ceilps(x);
6271                 }
6272                 else
6273                 {
6274                         return -Floor(-x);
6275                 }
6276         }
6277
6278         Type *Float4::getType()
6279         {
6280                 return T(VectorType::get(Float::getType(), 4));
6281         }
6282
6283         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
6284         {
6285                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), V(Nucleus::createConstantInt(offset))));
6286         }
6287
6288         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
6289         {
6290                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value));
6291         }
6292
6293         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
6294         {
6295                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value));
6296         }
6297
6298         RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset)
6299         {
6300                 return lhs = lhs + offset;
6301         }
6302
6303         RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset)
6304         {
6305                 return lhs = lhs + offset;
6306         }
6307
6308         RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset)
6309         {
6310                 return lhs = lhs + offset;
6311         }
6312
6313         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset)
6314         {
6315                 return lhs + -offset;
6316         }
6317
6318         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
6319         {
6320                 return lhs + -offset;
6321         }
6322
6323         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
6324         {
6325                 return lhs + -offset;
6326         }
6327
6328         RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset)
6329         {
6330                 return lhs = lhs - offset;
6331         }
6332
6333         RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset)
6334         {
6335                 return lhs = lhs - offset;
6336         }
6337
6338         RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset)
6339         {
6340                 return lhs = lhs - offset;
6341         }
6342
6343         void Return()
6344         {
6345                 Nucleus::createRetVoid();
6346                 Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6347                 Nucleus::createUnreachable();
6348         }
6349
6350         void Return(RValue<Int> ret)
6351         {
6352                 Nucleus::createRet(ret.value);
6353                 Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6354                 Nucleus::createUnreachable();
6355         }
6356
6357         bool branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
6358         {
6359                 Nucleus::createCondBr(cmp.value, bodyBB, endBB);
6360                 Nucleus::setInsertBlock(bodyBB);
6361
6362                 return true;
6363         }
6364
6365         RValue<Long> Ticks()
6366         {
6367                 llvm::Function *rdtsc = Intrinsic::getDeclaration(::module, Intrinsic::readcyclecounter);
6368
6369                 return RValue<Long>(V(::builder->CreateCall(rdtsc)));
6370         }
6371 }
6372
6373 namespace sw
6374 {
6375         namespace x86
6376         {
6377                 RValue<Int> cvtss2si(RValue<Float> val)
6378                 {
6379                         llvm::Function *cvtss2si = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtss2si);
6380
6381                         Float4 vector;
6382                         vector.x = val;
6383
6384                         return RValue<Int>(V(::builder->CreateCall(cvtss2si, RValue<Float4>(vector).value)));
6385                 }
6386
6387                 RValue<Int2> cvtps2pi(RValue<Float4> val)
6388                 {
6389                         llvm::Function *cvtps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtps2pi);
6390
6391                         return RValue<Int2>(V(::builder->CreateCall(cvtps2pi, val.value)));
6392                 }
6393
6394                 RValue<Int2> cvttps2pi(RValue<Float4> val)
6395                 {
6396                         llvm::Function *cvttps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvttps2pi);
6397
6398                         return RValue<Int2>(V(::builder->CreateCall(cvttps2pi, val.value)));
6399                 }
6400
6401                 RValue<Int4> cvtps2dq(RValue<Float4> val)
6402                 {
6403                         if(CPUID::supportsSSE2())
6404                         {
6405                                 llvm::Function *cvtps2dq = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_cvtps2dq);
6406
6407                                 return RValue<Int4>(V(::builder->CreateCall(cvtps2dq, val.value)));
6408                         }
6409                         else
6410                         {
6411                                 Int2 lo = x86::cvtps2pi(val);
6412                                 Int2 hi = x86::cvtps2pi(Swizzle(val, 0xEE));
6413
6414                                 return Int4(lo, hi);
6415                         }
6416                 }
6417
6418                 RValue<Float> rcpss(RValue<Float> val)
6419                 {
6420                         llvm::Function *rcpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ss);
6421
6422                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6423
6424                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rcpss, vector)), Float::getType(), 0));
6425                 }
6426
6427                 RValue<Float> sqrtss(RValue<Float> val)
6428                 {
6429                         llvm::Function *sqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ss);
6430
6431                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6432
6433                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(sqrtss, vector)), Float::getType(), 0));
6434                 }
6435
6436                 RValue<Float> rsqrtss(RValue<Float> val)
6437                 {
6438                         llvm::Function *rsqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ss);
6439
6440                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6441
6442                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rsqrtss, vector)), Float::getType(), 0));
6443                 }
6444
6445                 RValue<Float4> rcpps(RValue<Float4> val)
6446                 {
6447                         llvm::Function *rcpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ps);
6448
6449                         return RValue<Float4>(V(::builder->CreateCall(rcpps, val.value)));
6450                 }
6451
6452                 RValue<Float4> sqrtps(RValue<Float4> val)
6453                 {
6454                         llvm::Function *sqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ps);
6455
6456                         return RValue<Float4>(V(::builder->CreateCall(sqrtps, val.value)));
6457                 }
6458
6459                 RValue<Float4> rsqrtps(RValue<Float4> val)
6460                 {
6461                         llvm::Function *rsqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ps);
6462
6463                         return RValue<Float4>(V(::builder->CreateCall(rsqrtps, val.value)));
6464                 }
6465
6466                 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
6467                 {
6468                         llvm::Function *maxps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_max_ps);
6469
6470                         return RValue<Float4>(V(::builder->CreateCall2(maxps, x.value, y.value)));
6471                 }
6472
6473                 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
6474                 {
6475                         llvm::Function *minps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_min_ps);
6476
6477                         return RValue<Float4>(V(::builder->CreateCall2(minps, x.value, y.value)));
6478                 }
6479
6480                 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
6481                 {
6482                         llvm::Function *roundss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ss);
6483
6484                         Value *undef = V(UndefValue::get(Float4::getType()));
6485                         Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
6486
6487                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(roundss, undef, vector, V(Nucleus::createConstantInt(imm)))), Float::getType(), 0));
6488                 }
6489
6490                 RValue<Float> floorss(RValue<Float> val)
6491                 {
6492                         return roundss(val, 1);
6493                 }
6494
6495                 RValue<Float> ceilss(RValue<Float> val)
6496                 {
6497                         return roundss(val, 2);
6498                 }
6499
6500                 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
6501                 {
6502                         llvm::Function *roundps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ps);
6503
6504                         return RValue<Float4>(V(::builder->CreateCall2(roundps, val.value, V(Nucleus::createConstantInt(imm)))));
6505                 }
6506
6507                 RValue<Float4> floorps(RValue<Float4> val)
6508                 {
6509                         return roundps(val, 1);
6510                 }
6511
6512                 RValue<Float4> ceilps(RValue<Float4> val)
6513                 {
6514                         return roundps(val, 2);
6515                 }
6516
6517                 RValue<Float4> cmpps(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
6518                 {
6519                         llvm::Function *cmpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ps);
6520
6521                         return RValue<Float4>(V(::builder->CreateCall3(cmpps, x.value, y.value, V(Nucleus::createConstantByte(imm)))));
6522                 }
6523
6524                 RValue<Float4> cmpeqps(RValue<Float4> x, RValue<Float4> y)
6525                 {
6526                         return cmpps(x, y, 0);
6527                 }
6528
6529                 RValue<Float4> cmpltps(RValue<Float4> x, RValue<Float4> y)
6530                 {
6531                         return cmpps(x, y, 1);
6532                 }
6533
6534                 RValue<Float4> cmpleps(RValue<Float4> x, RValue<Float4> y)
6535                 {
6536                         return cmpps(x, y, 2);
6537                 }
6538
6539                 RValue<Float4> cmpunordps(RValue<Float4> x, RValue<Float4> y)
6540                 {
6541                         return cmpps(x, y, 3);
6542                 }
6543
6544                 RValue<Float4> cmpneqps(RValue<Float4> x, RValue<Float4> y)
6545                 {
6546                         return cmpps(x, y, 4);
6547                 }
6548
6549                 RValue<Float4> cmpnltps(RValue<Float4> x, RValue<Float4> y)
6550                 {
6551                         return cmpps(x, y, 5);
6552                 }
6553
6554                 RValue<Float4> cmpnleps(RValue<Float4> x, RValue<Float4> y)
6555                 {
6556                         return cmpps(x, y, 6);
6557                 }
6558
6559                 RValue<Float4> cmpordps(RValue<Float4> x, RValue<Float4> y)
6560                 {
6561                         return cmpps(x, y, 7);
6562                 }
6563
6564                 RValue<Float> cmpss(RValue<Float> x, RValue<Float> y, unsigned char imm)
6565                 {
6566                         llvm::Function *cmpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ss);
6567
6568                         Value *vector1 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), x.value, 0);
6569                         Value *vector2 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), y.value, 0);
6570
6571                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(cmpss, vector1, vector2, V(Nucleus::createConstantByte(imm)))), Float::getType(), 0));
6572                 }
6573
6574                 RValue<Float> cmpeqss(RValue<Float> x, RValue<Float> y)
6575                 {
6576                         return cmpss(x, y, 0);
6577                 }
6578
6579                 RValue<Float> cmpltss(RValue<Float> x, RValue<Float> y)
6580                 {
6581                         return cmpss(x, y, 1);
6582                 }
6583
6584                 RValue<Float> cmpless(RValue<Float> x, RValue<Float> y)
6585                 {
6586                         return cmpss(x, y, 2);
6587                 }
6588
6589                 RValue<Float> cmpunordss(RValue<Float> x, RValue<Float> y)
6590                 {
6591                         return cmpss(x, y, 3);
6592                 }
6593
6594                 RValue<Float> cmpneqss(RValue<Float> x, RValue<Float> y)
6595                 {
6596                         return cmpss(x, y, 4);
6597                 }
6598
6599                 RValue<Float> cmpnltss(RValue<Float> x, RValue<Float> y)
6600                 {
6601                         return cmpss(x, y, 5);
6602                 }
6603
6604                 RValue<Float> cmpnless(RValue<Float> x, RValue<Float> y)
6605                 {
6606                         return cmpss(x, y, 6);
6607                 }
6608
6609                 RValue<Float> cmpordss(RValue<Float> x, RValue<Float> y)
6610                 {
6611                         return cmpss(x, y, 7);
6612                 }
6613
6614                 RValue<Int4> pabsd(RValue<Int4> x)
6615                 {
6616                         llvm::Function *pabsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_ssse3_pabs_d_128);
6617
6618                         return RValue<Int4>(V(::builder->CreateCall(pabsd, x.value)));
6619                 }
6620
6621                 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
6622                 {
6623                         llvm::Function *paddsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_w);
6624
6625                         return As<Short4>(V(::builder->CreateCall2(paddsw, As<MMX>(x).value, As<MMX>(y).value)));
6626                 }
6627
6628                 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
6629                 {
6630                         llvm::Function *psubsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_w);
6631
6632                         return As<Short4>(V(::builder->CreateCall2(psubsw, As<MMX>(x).value, As<MMX>(y).value)));
6633                 }
6634
6635                 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
6636                 {
6637                         llvm::Function *paddusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_w);
6638
6639                         return As<UShort4>(V(::builder->CreateCall2(paddusw, As<MMX>(x).value, As<MMX>(y).value)));
6640                 }
6641
6642                 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
6643                 {
6644                         llvm::Function *psubusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_w);
6645
6646                         return As<UShort4>(V(::builder->CreateCall2(psubusw, As<MMX>(x).value, As<MMX>(y).value)));
6647                 }
6648
6649                 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
6650                 {
6651                         llvm::Function *paddsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_b);
6652
6653                         return As<SByte8>(V(::builder->CreateCall2(paddsb, As<MMX>(x).value, As<MMX>(y).value)));
6654                 }
6655
6656                 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
6657                 {
6658                         llvm::Function *psubsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_b);
6659
6660                         return As<SByte8>(V(::builder->CreateCall2(psubsb, As<MMX>(x).value, As<MMX>(y).value)));
6661                 }
6662
6663                 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
6664                 {
6665                         llvm::Function *paddusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_b);
6666
6667                         return As<Byte8>(V(::builder->CreateCall2(paddusb, As<MMX>(x).value, As<MMX>(y).value)));
6668                 }
6669
6670                 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
6671                 {
6672                         llvm::Function *psubusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_b);
6673
6674                         return As<Byte8>(V(::builder->CreateCall2(psubusb, As<MMX>(x).value, As<MMX>(y).value)));
6675                 }
6676
6677                 RValue<Short4> paddw(RValue<Short4> x, RValue<Short4> y)
6678                 {
6679                         llvm::Function *paddw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_w);
6680
6681                         return As<Short4>(V(::builder->CreateCall2(paddw, As<MMX>(x).value, As<MMX>(y).value)));
6682                 }
6683
6684                 RValue<Short4> psubw(RValue<Short4> x, RValue<Short4> y)
6685                 {
6686                         llvm::Function *psubw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_w);
6687
6688                         return As<Short4>(V(::builder->CreateCall2(psubw, As<MMX>(x).value, As<MMX>(y).value)));
6689                 }
6690
6691                 RValue<Short4> pmullw(RValue<Short4> x, RValue<Short4> y)
6692                 {
6693                         llvm::Function *pmullw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmull_w);
6694
6695                         return As<Short4>(V(::builder->CreateCall2(pmullw, As<MMX>(x).value, As<MMX>(y).value)));
6696                 }
6697
6698                 RValue<Short4> pand(RValue<Short4> x, RValue<Short4> y)
6699                 {
6700                         llvm::Function *pand = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pand);
6701
6702                         return As<Short4>(V(::builder->CreateCall2(pand, As<MMX>(x).value, As<MMX>(y).value)));
6703                 }
6704
6705                 RValue<Short4> por(RValue<Short4> x, RValue<Short4> y)
6706                 {
6707                         llvm::Function *por = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_por);
6708
6709                         return As<Short4>(V(::builder->CreateCall2(por, As<MMX>(x).value, As<MMX>(y).value)));
6710                 }
6711
6712                 RValue<Short4> pxor(RValue<Short4> x, RValue<Short4> y)
6713                 {
6714                         llvm::Function *pxor = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pxor);
6715
6716                         return As<Short4>(V(::builder->CreateCall2(pxor, As<MMX>(x).value, As<MMX>(y).value)));
6717                 }
6718
6719                 RValue<Short4> pshufw(RValue<Short4> x, unsigned char y)
6720                 {
6721                         llvm::Function *pshufw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_pshuf_w);
6722
6723                         return As<Short4>(V(::builder->CreateCall2(pshufw, As<MMX>(x).value, V(Nucleus::createConstantByte(y)))));
6724                 }
6725
6726                 RValue<Int2> punpcklwd(RValue<Short4> x, RValue<Short4> y)
6727                 {
6728                         llvm::Function *punpcklwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklwd);
6729
6730                         return As<Int2>(V(::builder->CreateCall2(punpcklwd, As<MMX>(x).value, As<MMX>(y).value)));
6731                 }
6732
6733                 RValue<Int2> punpckhwd(RValue<Short4> x, RValue<Short4> y)
6734                 {
6735                         llvm::Function *punpckhwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhwd);
6736
6737                         return As<Int2>(V(::builder->CreateCall2(punpckhwd, As<MMX>(x).value, As<MMX>(y).value)));
6738                 }
6739
6740                 RValue<Short4> pinsrw(RValue<Short4> x, RValue<Int> y, unsigned int i)
6741                 {
6742                         llvm::Function *pinsrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pinsr_w);
6743
6744                         return As<Short4>(V(::builder->CreateCall3(pinsrw, As<MMX>(x).value, y.value, V(Nucleus::createConstantInt(i)))));
6745                 }
6746
6747                 RValue<Int> pextrw(RValue<Short4> x, unsigned int i)
6748                 {
6749                         llvm::Function *pextrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pextr_w);
6750
6751                         return RValue<Int>(V(::builder->CreateCall2(pextrw, As<MMX>(x).value, V(Nucleus::createConstantInt(i)))));
6752                 }
6753
6754                 RValue<Short4> punpckldq(RValue<Int2> x, RValue<Int2> y)
6755                 {
6756                         llvm::Function *punpckldq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckldq);
6757
6758                         return As<Short4>(V(::builder->CreateCall2(punpckldq, As<MMX>(x).value, As<MMX>(y).value)));
6759                 }
6760
6761                 RValue<Short4> punpckhdq(RValue<Int2> x, RValue<Int2> y)
6762                 {
6763                         llvm::Function *punpckhdq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhdq);
6764
6765                         return As<Short4>(V(::builder->CreateCall2(punpckhdq, As<MMX>(x).value, As<MMX>(y).value)));
6766                 }
6767
6768                 RValue<Short4> punpcklbw(RValue<Byte8> x, RValue<Byte8> y)
6769                 {
6770                         llvm::Function *punpcklbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklbw);
6771
6772                         return As<Short4>(V(::builder->CreateCall2(punpcklbw, As<MMX>(x).value, As<MMX>(y).value)));
6773                 }
6774
6775                 RValue<Short4> punpckhbw(RValue<Byte8> x, RValue<Byte8> y)
6776                 {
6777                         llvm::Function *punpckhbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhbw);
6778
6779                         return As<Short4>(V(::builder->CreateCall2(punpckhbw, As<MMX>(x).value, As<MMX>(y).value)));
6780                 }
6781
6782                 RValue<Byte8> paddb(RValue<Byte8> x, RValue<Byte8> y)
6783                 {
6784                         llvm::Function *paddb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_b);
6785
6786                         return As<Byte8>(V(::builder->CreateCall2(paddb, As<MMX>(x).value, As<MMX>(y).value)));
6787                 }
6788
6789                 RValue<Byte8> psubb(RValue<Byte8> x, RValue<Byte8> y)
6790                 {
6791                         llvm::Function *psubb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_b);
6792
6793                         return As<Byte8>(V(::builder->CreateCall2(psubb, As<MMX>(x).value, As<MMX>(y).value)));
6794                 }
6795
6796                 RValue<Int2> paddd(RValue<Int2> x, RValue<Int2> y)
6797                 {
6798                         llvm::Function *paddd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_d);
6799
6800                         return As<Int2>(V(::builder->CreateCall2(paddd, As<MMX>(x).value, As<MMX>(y).value)));
6801                 }
6802
6803                 RValue<Int2> psubd(RValue<Int2> x, RValue<Int2> y)
6804                 {
6805                         llvm::Function *psubd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_d);
6806
6807                         return As<Int2>(V(::builder->CreateCall2(psubd, As<MMX>(x).value, As<MMX>(y).value)));
6808                 }
6809
6810                 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
6811                 {
6812                         llvm::Function *pavgw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pavg_w);
6813
6814                         return As<UShort4>(V(::builder->CreateCall2(pavgw, As<MMX>(x).value, As<MMX>(y).value)));
6815                 }
6816
6817                 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
6818                 {
6819                         llvm::Function *pmaxsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmaxs_w);
6820
6821                         return As<Short4>(V(::builder->CreateCall2(pmaxsw, As<MMX>(x).value, As<MMX>(y).value)));
6822                 }
6823
6824                 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
6825                 {
6826                         llvm::Function *pminsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmins_w);
6827
6828                         return As<Short4>(V(::builder->CreateCall2(pminsw, As<MMX>(x).value, As<MMX>(y).value)));
6829                 }
6830
6831                 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
6832                 {
6833                         llvm::Function *pcmpgtw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_w);
6834
6835                         return As<Short4>(V(::builder->CreateCall2(pcmpgtw, As<MMX>(x).value, As<MMX>(y).value)));
6836                 }
6837
6838                 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
6839                 {
6840                         llvm::Function *pcmpeqw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_w);
6841
6842                         return As<Short4>(V(::builder->CreateCall2(pcmpeqw, As<MMX>(x).value, As<MMX>(y).value)));
6843                 }
6844
6845                 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
6846                 {
6847                         llvm::Function *pcmpgtb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_b);
6848
6849                         return As<Byte8>(V(::builder->CreateCall2(pcmpgtb, As<MMX>(x).value, As<MMX>(y).value)));
6850                 }
6851
6852                 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
6853                 {
6854                         llvm::Function *pcmpeqb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_b);
6855
6856                         return As<Byte8>(V(::builder->CreateCall2(pcmpeqb, As<MMX>(x).value, As<MMX>(y).value)));
6857                 }
6858
6859                 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
6860                 {
6861                         llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packssdw);
6862
6863                         return As<Short4>(V(::builder->CreateCall2(packssdw, As<MMX>(x).value, As<MMX>(y).value)));
6864                 }
6865
6866                 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
6867                 {
6868                         if(CPUID::supportsSSE2())
6869                         {
6870                                 llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_packssdw_128);
6871
6872                                 return RValue<Short8>(V(::builder->CreateCall2(packssdw, x.value, y.value)));
6873                         }
6874                         else
6875                         {
6876                                 Int2 loX = Int2(x);
6877                                 Int2 hiX = Int2(Swizzle(x, 0xEE));
6878
6879                                 Int2 loY = Int2(y);
6880                                 Int2 hiY = Int2(Swizzle(y, 0xEE));
6881
6882                                 Short4 lo = x86::packssdw(loX, hiX);
6883                                 Short4 hi = x86::packssdw(loY, hiY);
6884
6885                                 return Short8(lo, hi);
6886                         }
6887                 }
6888
6889                 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
6890                 {
6891                         llvm::Function *packsswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packsswb);
6892
6893                         return As<SByte8>(V(::builder->CreateCall2(packsswb, As<MMX>(x).value, As<MMX>(y).value)));
6894                 }
6895
6896                 RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y)
6897                 {
6898                         llvm::Function *packuswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packuswb);
6899
6900                         return As<Byte8>(V(::builder->CreateCall2(packuswb, As<MMX>(x).value, As<MMX>(y).value)));
6901                 }
6902
6903                 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
6904                 {
6905                         if(CPUID::supportsSSE4_1())
6906                         {
6907                                 llvm::Function *packusdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_packusdw);
6908
6909                                 return RValue<UShort8>(V(::builder->CreateCall2(packusdw, x.value, y.value)));
6910                         }
6911                         else
6912                         {
6913                                 RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
6914                                 RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
6915
6916                                 return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
6917                         }
6918                 }
6919
6920                 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
6921                 {
6922                         llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_w);
6923
6924                         return As<UShort4>(V(::builder->CreateCall2(psrlw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
6925                 }
6926
6927                 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
6928                 {
6929                         llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_w);
6930
6931                         return RValue<UShort8>(V(::builder->CreateCall2(psrlw, x.value, V(Nucleus::createConstantInt(y)))));
6932                 }
6933
6934                 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
6935                 {
6936                         llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_w);
6937
6938                         return As<Short4>(V(::builder->CreateCall2(psraw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
6939                 }
6940
6941                 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
6942                 {
6943                         llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_w);
6944
6945                         return RValue<Short8>(V(::builder->CreateCall2(psraw, x.value, V(Nucleus::createConstantInt(y)))));
6946                 }
6947
6948                 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
6949                 {
6950                         llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_w);
6951
6952                         return As<Short4>(V(::builder->CreateCall2(psllw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
6953                 }
6954
6955                 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
6956                 {
6957                         llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_w);
6958
6959                         return RValue<Short8>(V(::builder->CreateCall2(psllw, x.value, V(Nucleus::createConstantInt(y)))));
6960                 }
6961
6962                 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
6963                 {
6964                         llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_d);
6965
6966                         return As<Int2>(V(::builder->CreateCall2(pslld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
6967                 }
6968
6969                 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
6970                 {
6971                         if(CPUID::supportsSSE2())
6972                         {
6973                                 llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_d);
6974
6975                                 return RValue<Int4>(V(::builder->CreateCall2(pslld, x.value, V(Nucleus::createConstantInt(y)))));
6976                         }
6977                         else
6978                         {
6979                                 Int2 lo = Int2(x);
6980                                 Int2 hi = Int2(Swizzle(x, 0xEE));
6981
6982                                 lo = x86::pslld(lo, y);
6983                                 hi = x86::pslld(hi, y);
6984
6985                                 return Int4(lo, hi);
6986                         }
6987                 }
6988
6989                 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
6990                 {
6991                         llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_d);
6992
6993                         return As<Int2>(V(::builder->CreateCall2(psrad, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
6994                 }
6995
6996                 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
6997                 {
6998                         if(CPUID::supportsSSE2())
6999                         {
7000                                 llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_d);
7001
7002                                 return RValue<Int4>(V(::builder->CreateCall2(psrad, x.value, V(Nucleus::createConstantInt(y)))));
7003                         }
7004                         else
7005                         {
7006                                 Int2 lo = Int2(x);
7007                                 Int2 hi = Int2(Swizzle(x, 0xEE));
7008
7009                                 lo = x86::psrad(lo, y);
7010                                 hi = x86::psrad(hi, y);
7011
7012                                 return Int4(lo, hi);
7013                         }
7014                 }
7015
7016                 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
7017                 {
7018                         llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_d);
7019
7020                         return As<UInt2>(V(::builder->CreateCall2(psrld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7021                 }
7022
7023                 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
7024                 {
7025                         if(CPUID::supportsSSE2())
7026                         {
7027                                 llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_d);
7028
7029                                 return RValue<UInt4>(V(::builder->CreateCall2(psrld, x.value, V(Nucleus::createConstantInt(y)))));
7030                         }
7031                         else
7032                         {
7033                                 UInt2 lo = As<UInt2>(Int2(As<Int4>(x)));
7034                                 UInt2 hi = As<UInt2>(Int2(Swizzle(As<Int4>(x), 0xEE)));
7035
7036                                 lo = x86::psrld(lo, y);
7037                                 hi = x86::psrld(hi, y);
7038
7039                                 return UInt4(lo, hi);
7040                         }
7041                 }
7042
7043                 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
7044                 {
7045                         llvm::Function *pmaxsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxsd);
7046
7047                         return RValue<Int4>(V(::builder->CreateCall2(pmaxsd, x.value, y.value)));
7048                 }
7049
7050                 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
7051                 {
7052                         llvm::Function *pminsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminsd);
7053
7054                         return RValue<Int4>(V(::builder->CreateCall2(pminsd, x.value, y.value)));
7055                 }
7056
7057                 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
7058                 {
7059                         llvm::Function *pmaxud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxud);
7060
7061                         return RValue<UInt4>(V(::builder->CreateCall2(pmaxud, x.value, y.value)));
7062                 }
7063
7064                 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
7065                 {
7066                         llvm::Function *pminud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminud);
7067
7068                         return RValue<UInt4>(V(::builder->CreateCall2(pminud, x.value, y.value)));
7069                 }
7070
7071                 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
7072                 {
7073                         llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulh_w);
7074
7075                         return As<Short4>(V(::builder->CreateCall2(pmulhw, As<MMX>(x).value, As<MMX>(y).value)));
7076                 }
7077
7078                 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
7079                 {
7080                         llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulhu_w);
7081
7082                         return As<UShort4>(V(::builder->CreateCall2(pmulhuw, As<MMX>(x).value, As<MMX>(y).value)));
7083                 }
7084
7085                 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
7086                 {
7087                         llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmadd_wd);
7088
7089                         return As<Int2>(V(::builder->CreateCall2(pmaddwd, As<MMX>(x).value, As<MMX>(y).value)));
7090                 }
7091
7092                 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
7093                 {
7094                         llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulh_w);
7095
7096                         return RValue<Short8>(V(::builder->CreateCall2(pmulhw, x.value, y.value)));
7097                 }
7098
7099                 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
7100                 {
7101                         llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulhu_w);
7102
7103                         return RValue<UShort8>(V(::builder->CreateCall2(pmulhuw, x.value, y.value)));
7104                 }
7105
7106                 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
7107                 {
7108                         llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmadd_wd);
7109
7110                         return RValue<Int4>(V(::builder->CreateCall2(pmaddwd, x.value, y.value)));
7111                 }
7112
7113                 RValue<Int> movmskps(RValue<Float4> x)
7114                 {
7115                         llvm::Function *movmskps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_movmsk_ps);
7116
7117                         return RValue<Int>(V(::builder->CreateCall(movmskps, x.value)));
7118                 }
7119
7120                 RValue<Int> pmovmskb(RValue<Byte8> x)
7121                 {
7122                         llvm::Function *pmovmskb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmovmskb);
7123
7124                         return RValue<Int>(V(::builder->CreateCall(pmovmskb, As<MMX>(x).value)));
7125                 }
7126
7127                 //RValue<Int2> movd(RValue<Pointer<Int>> x)
7128                 //{
7129                 //      Value *element = Nucleus::createLoad(x.value);
7130
7131                 ////    Value *int2 = UndefValue::get(Int2::getType());
7132                 ////    int2 = Nucleus::createInsertElement(int2, element, ConstantInt::get(Int::getType(), 0));
7133
7134                 //      Value *int2 = Nucleus::createBitCast(Nucleus::createZExt(element, Long::getType()), Int2::getType());
7135
7136                 //      return RValue<Int2>(int2);
7137                 //}
7138
7139                 //RValue<Int2> movdq2q(RValue<Int4> x)
7140                 //{
7141                 //      Value *long2 = Nucleus::createBitCast(x.value, T(VectorType::get(Long::getType(), 2)));
7142                 //      Value *element = Nucleus::createExtractElement(long2, ConstantInt::get(Int::getType(), 0));
7143
7144                 //      return RValue<Int2>(Nucleus::createBitCast(element, Int2::getType()));
7145                 //}
7146
7147                 RValue<Int4> pmovzxbd(RValue<Int4> x)
7148                 {
7149                         llvm::Function *pmovzxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxbd);
7150
7151                         return RValue<Int4>(V(::builder->CreateCall(pmovzxbd, Nucleus::createBitCast(x.value, Byte16::getType()))));
7152                 }
7153
7154                 RValue<Int4> pmovsxbd(RValue<Int4> x)
7155                 {
7156                         llvm::Function *pmovsxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxbd);
7157
7158                         return RValue<Int4>(V(::builder->CreateCall(pmovsxbd, Nucleus::createBitCast(x.value, SByte16::getType()))));
7159                 }
7160
7161                 RValue<Int4> pmovzxwd(RValue<Int4> x)
7162                 {
7163                         llvm::Function *pmovzxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxwd);
7164
7165                         return RValue<Int4>(V(::builder->CreateCall(pmovzxwd, Nucleus::createBitCast(x.value, UShort8::getType()))));
7166                 }
7167
7168                 RValue<Int4> pmovsxwd(RValue<Int4> x)
7169                 {
7170                         llvm::Function *pmovsxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxwd);
7171
7172                         return RValue<Int4>(V(::builder->CreateCall(pmovsxwd, Nucleus::createBitCast(x.value, Short8::getType()))));
7173                 }
7174
7175                 void emms()
7176                 {
7177                         llvm::Function *emms = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_emms);
7178
7179                         V(::builder->CreateCall(emms));
7180                 }
7181         }
7182 }