OSDN Git Service

5dd4a8b14d249d226e0c0537f9044077a9435215
[android-x86/external-swiftshader.git] / src / Reactor / LLVMReactor.cpp
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "Nucleus.hpp"
16
17 #include "llvm/Support/IRBuilder.h"
18 #include "llvm/Function.h"
19 #include "llvm/GlobalVariable.h"
20 #include "llvm/Module.h"
21 #include "llvm/LLVMContext.h"
22 #include "llvm/Constants.h"
23 #include "llvm/Intrinsics.h"
24 #include "llvm/PassManager.h"
25 #include "llvm/Analysis/LoopPass.h"
26 #include "llvm/Transforms/Scalar.h"
27 #include "llvm/Target/TargetData.h"
28 #include "llvm/Target/TargetOptions.h"
29 #include "llvm/Support/TargetSelect.h"
30 #include "../lib/ExecutionEngine/JIT/JIT.h"
31
32 #include "LLVMRoutine.hpp"
33 #include "LLVMRoutineManager.hpp"
34 #include "x86.hpp"
35 #include "CPUID.hpp"
36 #include "Thread.hpp"
37 #include "Memory.hpp"
38 #include "MutexLock.hpp"
39
40 #include <fstream>
41
42 #if defined(__i386__) || defined(__x86_64__)
43 #include <xmmintrin.h>
44 #endif
45
46 #if defined(__x86_64__) && defined(_WIN32)
47 extern "C" void X86CompilationCallback()
48 {
49         assert(false);   // UNIMPLEMENTED
50 }
51 #endif
52
// Optional CodeAnalyst JIT logging entry points. They stay null unless the
// Nucleus constructor resolves them from CAJitNtfyLib.dll (Windows only).
extern "C"
{
        bool (*CodeAnalystInitialize)() = nullptr;
        void (*CodeAnalystCompleteJITLog)() = nullptr;
        bool (*CodeAnalystLogJITCode)(const void *jitCodeStartAddr, unsigned int jitCodeSize, const wchar_t *functionName) = nullptr;
}
59
// LLVM flag defined outside this file; the Nucleus constructor clears it.
namespace llvm
{
        extern bool JITEmitDebugInfo;
}
64
65 namespace
66 {
67         sw::LLVMRoutineManager *routineManager = nullptr;
68         llvm::ExecutionEngine *executionEngine = nullptr;
69         llvm::IRBuilder<> *builder = nullptr;
70         llvm::LLVMContext *context = nullptr;
71         llvm::Module *module = nullptr;
72         llvm::Function *function = nullptr;
73
74         sw::MutexLock codegenMutex;
75 }
76
77 namespace sw
78 {
79         using namespace llvm;
80
81         Optimization optimization[10] = {InstructionCombining, Disabled};
82
83         class Type : public llvm::Type {};
84         class Value : public llvm::Value {};
85         class SwitchCases : public llvm::SwitchInst {};
86         class BasicBlock : public llvm::BasicBlock {};
87
88         inline Type *T(llvm::Type *t)
89         {
90                 return reinterpret_cast<Type*>(t);
91         }
92
93         inline Value *V(llvm::Value *t)
94         {
95                 return reinterpret_cast<Value*>(t);
96         }
97
98         inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
99         {
100                 return reinterpret_cast<std::vector<llvm::Type*>&>(t);
101         }
102
103         inline BasicBlock *B(llvm::BasicBlock *t)
104         {
105                 return reinterpret_cast<BasicBlock*>(t);
106         }
107
108         Nucleus::Nucleus()
109         {
110                 ::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
111
112                 InitializeNativeTarget();
113                 JITEmitDebugInfo = false;
114
115                 if(!::context)
116                 {
117                         ::context = new LLVMContext();
118                 }
119
120                 ::module = new Module("", *::context);
121                 ::routineManager = new LLVMRoutineManager();
122
123                 #if defined(__x86_64__)
124                         const char *architecture = "x86-64";
125                 #else
126                         const char *architecture = "x86";
127                 #endif
128
129                 SmallVector<std::string, 1> MAttrs;
130                 MAttrs.push_back(CPUID::supportsMMX()    ? "+mmx"   : "-mmx");
131                 MAttrs.push_back(CPUID::supportsCMOV()   ? "+cmov"  : "-cmov");
132                 MAttrs.push_back(CPUID::supportsSSE()    ? "+sse"   : "-sse");
133                 MAttrs.push_back(CPUID::supportsSSE2()   ? "+sse2"  : "-sse2");
134                 MAttrs.push_back(CPUID::supportsSSE3()   ? "+sse3"  : "-sse3");
135                 MAttrs.push_back(CPUID::supportsSSSE3()  ? "+ssse3" : "-ssse3");
136                 MAttrs.push_back(CPUID::supportsSSE4_1() ? "+sse41" : "-sse41");
137
138                 std::string error;
139                 TargetMachine *targetMachine = EngineBuilder::selectTarget(::module, architecture, "", MAttrs, Reloc::Default, CodeModel::JITDefault, &error);
140                 ::executionEngine = JIT::createJIT(::module, 0, ::routineManager, CodeGenOpt::Aggressive, true, targetMachine);
141
142                 if(!::builder)
143                 {
144                         ::builder = new IRBuilder<>(*::context);
145
146                         #if defined(_WIN32)
147                                 HMODULE CodeAnalyst = LoadLibrary("CAJitNtfyLib.dll");
148                                 if(CodeAnalyst)
149                                 {
150                                         CodeAnalystInitialize = (bool(*)())GetProcAddress(CodeAnalyst, "CAJIT_Initialize");
151                                         CodeAnalystCompleteJITLog = (void(*)())GetProcAddress(CodeAnalyst, "CAJIT_CompleteJITLog");
152                                         CodeAnalystLogJITCode = (bool(*)(const void*, unsigned int, const wchar_t*))GetProcAddress(CodeAnalyst, "CAJIT_LogJITCode");
153
154                                         CodeAnalystInitialize();
155                                 }
156                         #endif
157                 }
158         }
159
160         Nucleus::~Nucleus()
161         {
162                 delete ::executionEngine;
163                 ::executionEngine = nullptr;
164
165                 ::routineManager = nullptr;
166                 ::function = nullptr;
167                 ::module = nullptr;
168
169                 ::codegenMutex.unlock();
170         }
171
172         Routine *Nucleus::acquireRoutine(const wchar_t *name, bool runOptimizations)
173         {
174                 if(::builder->GetInsertBlock()->empty() || !::builder->GetInsertBlock()->back().isTerminator())
175                 {
176                         llvm::Type *type = ::function->getReturnType();
177
178                         if(type->isVoidTy())
179                         {
180                                 createRetVoid();
181                         }
182                         else
183                         {
184                                 createRet(V(UndefValue::get(type)));
185                         }
186                 }
187
188                 if(false)
189                 {
190                         std::string error;
191                         raw_fd_ostream file("llvm-dump-unopt.txt", error);
192                         ::module->print(file, 0);
193                 }
194
195                 if(runOptimizations)
196                 {
197                         optimize();
198                 }
199
200                 if(false)
201                 {
202                         std::string error;
203                         raw_fd_ostream file("llvm-dump-opt.txt", error);
204                         ::module->print(file, 0);
205                 }
206
207                 void *entry = ::executionEngine->getPointerToFunction(::function);
208                 LLVMRoutine *routine = ::routineManager->acquireRoutine(entry);
209
210                 if(CodeAnalystLogJITCode)
211                 {
212                         CodeAnalystLogJITCode(routine->getEntry(), routine->getCodeSize(), name);
213                 }
214
215                 return routine;
216         }
217
218         void Nucleus::optimize()
219         {
220                 static PassManager *passManager = nullptr;
221
222                 if(!passManager)
223                 {
224                         passManager = new PassManager();
225
226                         UnsafeFPMath = true;
227                 //      NoInfsFPMath = true;
228                 //      NoNaNsFPMath = true;
229
230                         passManager->add(new TargetData(*::executionEngine->getTargetData()));
231                         passManager->add(createScalarReplAggregatesPass());
232
233                         for(int pass = 0; pass < 10 && optimization[pass] != Disabled; pass++)
234                         {
235                                 switch(optimization[pass])
236                                 {
237                                 case Disabled:                                                                 break;
238                                 case CFGSimplification:    passManager->add(createCFGSimplificationPass());    break;
239                                 case LICM:                 passManager->add(createLICMPass());                 break;
240                                 case AggressiveDCE:        passManager->add(createAggressiveDCEPass());        break;
241                                 case GVN:                  passManager->add(createGVNPass());                  break;
242                                 case InstructionCombining: passManager->add(createInstructionCombiningPass()); break;
243                                 case Reassociate:          passManager->add(createReassociatePass());          break;
244                                 case DeadStoreElimination: passManager->add(createDeadStoreEliminationPass()); break;
245                                 case SCCP:                 passManager->add(createSCCPPass());                 break;
246                                 case ScalarReplAggregates: passManager->add(createScalarReplAggregatesPass()); break;
247                                 default:
248                                         assert(false);
249                                 }
250                         }
251                 }
252
253                 passManager->run(*::module);
254         }
255
256         Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
257         {
258                 // Need to allocate it in the entry block for mem2reg to work
259                 llvm::BasicBlock &entryBlock = ::function->getEntryBlock();
260
261                 Instruction *declaration;
262
263                 if(arraySize)
264                 {
265                         declaration = new AllocaInst(type, Nucleus::createConstantInt(arraySize));
266                 }
267                 else
268                 {
269                         declaration = new AllocaInst(type, (Value*)0);
270                 }
271
272                 entryBlock.getInstList().push_front(declaration);
273
274                 return V(declaration);
275         }
276
277         BasicBlock *Nucleus::createBasicBlock()
278         {
279                 return B(BasicBlock::Create(*::context, "", ::function));
280         }
281
282         BasicBlock *Nucleus::getInsertBlock()
283         {
284                 return B(::builder->GetInsertBlock());
285         }
286
287         void Nucleus::setInsertBlock(BasicBlock *basicBlock)
288         {
289         //      assert(::builder->GetInsertBlock()->back().isTerminator());
290                 return ::builder->SetInsertPoint(basicBlock);
291         }
292
293         void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
294         {
295                 llvm::FunctionType *functionType = llvm::FunctionType::get(ReturnType, T(Params), false);
296                 ::function = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, "", ::module);
297                 ::function->setCallingConv(llvm::CallingConv::C);
298
299                 ::builder->SetInsertPoint(BasicBlock::Create(*::context, "", ::function));
300         }
301
302         Value *Nucleus::getArgument(unsigned int index)
303         {
304                 llvm::Function::arg_iterator args = ::function->arg_begin();
305
306                 while(index)
307                 {
308                         args++;
309                         index--;
310                 }
311
312                 return V(&*args);
313         }
314
315         void Nucleus::createRetVoid()
316         {
317                 x86::emms();
318
319                 ::builder->CreateRetVoid();
320         }
321
322         void Nucleus::createRet(Value *v)
323         {
324                 x86::emms();
325
326                 ::builder->CreateRet(v);
327         }
328
329         void Nucleus::createBr(BasicBlock *dest)
330         {
331                 ::builder->CreateBr(dest);
332         }
333
334         void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
335         {
336                 ::builder->CreateCondBr(cond, ifTrue, ifFalse);
337         }
338
339         Value *Nucleus::createAdd(Value *lhs, Value *rhs)
340         {
341                 return V(::builder->CreateAdd(lhs, rhs));
342         }
343
344         Value *Nucleus::createSub(Value *lhs, Value *rhs)
345         {
346                 return V(::builder->CreateSub(lhs, rhs));
347         }
348
349         Value *Nucleus::createMul(Value *lhs, Value *rhs)
350         {
351                 return V(::builder->CreateMul(lhs, rhs));
352         }
353
354         Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
355         {
356                 return V(::builder->CreateUDiv(lhs, rhs));
357         }
358
359         Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
360         {
361                 return V(::builder->CreateSDiv(lhs, rhs));
362         }
363
364         Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
365         {
366                 return V(::builder->CreateFAdd(lhs, rhs));
367         }
368
369         Value *Nucleus::createFSub(Value *lhs, Value *rhs)
370         {
371                 return V(::builder->CreateFSub(lhs, rhs));
372         }
373
374         Value *Nucleus::createFMul(Value *lhs, Value *rhs)
375         {
376                 return V(::builder->CreateFMul(lhs, rhs));
377         }
378
379         Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
380         {
381                 return V(::builder->CreateFDiv(lhs, rhs));
382         }
383
384         Value *Nucleus::createURem(Value *lhs, Value *rhs)
385         {
386                 return V(::builder->CreateURem(lhs, rhs));
387         }
388
389         Value *Nucleus::createSRem(Value *lhs, Value *rhs)
390         {
391                 return V(::builder->CreateSRem(lhs, rhs));
392         }
393
394         Value *Nucleus::createFRem(Value *lhs, Value *rhs)
395         {
396                 return V(::builder->CreateFRem(lhs, rhs));
397         }
398
399         Value *Nucleus::createShl(Value *lhs, Value *rhs)
400         {
401                 return V(::builder->CreateShl(lhs, rhs));
402         }
403
404         Value *Nucleus::createLShr(Value *lhs, Value *rhs)
405         {
406                 return V(::builder->CreateLShr(lhs, rhs));
407         }
408
409         Value *Nucleus::createAShr(Value *lhs, Value *rhs)
410         {
411                 return V(::builder->CreateAShr(lhs, rhs));
412         }
413
414         Value *Nucleus::createAnd(Value *lhs, Value *rhs)
415         {
416                 return V(::builder->CreateAnd(lhs, rhs));
417         }
418
419         Value *Nucleus::createOr(Value *lhs, Value *rhs)
420         {
421                 return V(::builder->CreateOr(lhs, rhs));
422         }
423
424         Value *Nucleus::createXor(Value *lhs, Value *rhs)
425         {
426                 return V(::builder->CreateXor(lhs, rhs));
427         }
428
429         Value *Nucleus::createNeg(Value *v)
430         {
431                 return V(::builder->CreateNeg(v));
432         }
433
434         Value *Nucleus::createFNeg(Value *v)
435         {
436                 return V(::builder->CreateFNeg(v));
437         }
438
439         Value *Nucleus::createNot(Value *v)
440         {
441                 return V(::builder->CreateNot(v));
442         }
443
444         Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align)
445         {
446                 assert(ptr->getType()->getContainedType(0) == type);
447                 return V(::builder->Insert(new LoadInst(ptr, "", isVolatile, align)));
448         }
449
450         Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align)
451         {
452                 assert(ptr->getType()->getContainedType(0) == type);
453                 ::builder->Insert(new StoreInst(value, ptr, isVolatile, align));
454                 return value;
455         }
456
457         Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
458         {
459                 if(unsignedIndex && sizeof(void*) == 8)
460                 {
461                         index = createZExt(index, Long::getType());
462                 }
463
464                 assert(ptr->getType()->getContainedType(0) == type);
465                 return V(::builder->CreateGEP(ptr, index));
466         }
467
468         Value *Nucleus::createAtomicAdd(Value *ptr, Value *value)
469         {
470                 return V(::builder->CreateAtomicRMW(AtomicRMWInst::Add, ptr, value, SequentiallyConsistent));
471         }
472
473         Value *Nucleus::createTrunc(Value *v, Type *destType)
474         {
475                 return V(::builder->CreateTrunc(v, destType));
476         }
477
478         Value *Nucleus::createZExt(Value *v, Type *destType)
479         {
480                 return V(::builder->CreateZExt(v, destType));
481         }
482
483         Value *Nucleus::createSExt(Value *v, Type *destType)
484         {
485                 return V(::builder->CreateSExt(v, destType));
486         }
487
488         Value *Nucleus::createFPToSI(Value *v, Type *destType)
489         {
490                 return V(::builder->CreateFPToSI(v, destType));
491         }
492
493         Value *Nucleus::createSIToFP(Value *v, Type *destType)
494         {
495                 return V(::builder->CreateSIToFP(v, destType));
496         }
497
498         Value *Nucleus::createFPTrunc(Value *v, Type *destType)
499         {
500                 return V(::builder->CreateFPTrunc(v, destType));
501         }
502
503         Value *Nucleus::createFPExt(Value *v, Type *destType)
504         {
505                 return V(::builder->CreateFPExt(v, destType));
506         }
507
508         Value *Nucleus::createBitCast(Value *v, Type *destType)
509         {
510                 return V(::builder->CreateBitCast(v, destType));
511         }
512
513         Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
514         {
515                 return V(::builder->CreateICmpEQ(lhs, rhs));
516         }
517
518         Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
519         {
520                 return V(::builder->CreateICmpNE(lhs, rhs));
521         }
522
523         Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
524         {
525                 return V(::builder->CreateICmpUGT(lhs, rhs));
526         }
527
528         Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
529         {
530                 return V(::builder->CreateICmpUGE(lhs, rhs));
531         }
532
533         Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
534         {
535                 return V(::builder->CreateICmpULT(lhs, rhs));
536         }
537
538         Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
539         {
540                 return V(::builder->CreateICmpULE(lhs, rhs));
541         }
542
543         Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
544         {
545                 return V(::builder->CreateICmpSGT(lhs, rhs));
546         }
547
548         Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
549         {
550                 return V(::builder->CreateICmpSGE(lhs, rhs));
551         }
552
553         Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
554         {
555                 return V(::builder->CreateICmpSLT(lhs, rhs));
556         }
557
558         Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
559         {
560                 return V(::builder->CreateICmpSLE(lhs, rhs));
561         }
562
563         Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
564         {
565                 return V(::builder->CreateFCmpOEQ(lhs, rhs));
566         }
567
568         Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
569         {
570                 return V(::builder->CreateFCmpOGT(lhs, rhs));
571         }
572
573         Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
574         {
575                 return V(::builder->CreateFCmpOGE(lhs, rhs));
576         }
577
578         Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
579         {
580                 return V(::builder->CreateFCmpOLT(lhs, rhs));
581         }
582
583         Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
584         {
585                 return V(::builder->CreateFCmpOLE(lhs, rhs));
586         }
587
588         Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
589         {
590                 return V(::builder->CreateFCmpONE(lhs, rhs));
591         }
592
593         Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
594         {
595                 return V(::builder->CreateFCmpORD(lhs, rhs));
596         }
597
598         Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
599         {
600                 return V(::builder->CreateFCmpUNO(lhs, rhs));
601         }
602
603         Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
604         {
605                 return V(::builder->CreateFCmpUEQ(lhs, rhs));
606         }
607
608         Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
609         {
610                 return V(::builder->CreateFCmpUGT(lhs, rhs));
611         }
612
613         Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
614         {
615                 return V(::builder->CreateFCmpUGE(lhs, rhs));
616         }
617
618         Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
619         {
620                 return V(::builder->CreateFCmpULT(lhs, rhs));
621         }
622
623         Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
624         {
625                 return V(::builder->CreateFCmpULE(lhs, rhs));
626         }
627
628         Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
629         {
630                 return V(::builder->CreateFCmpULE(lhs, rhs));
631         }
632
633         Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
634         {
635                 assert(vector->getType()->getContainedType(0) == type);
636                 return V(::builder->CreateExtractElement(vector, createConstantInt(index)));
637         }
638
639         Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
640         {
641                 return V(::builder->CreateInsertElement(vector, element, createConstantInt(index)));
642         }
643
644         Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
645         {
646                 int size = llvm::cast<llvm::VectorType>(V1->getType())->getNumElements();
647                 const int maxSize = 16;
648                 llvm::Constant *swizzle[maxSize];
649                 assert(size <= maxSize);
650
651                 for(int i = 0; i < size; i++)
652                 {
653                         swizzle[i] = llvm::ConstantInt::get(Type::getInt32Ty(*::context), select[i]);
654                 }
655
656                 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
657
658                 return V(::builder->CreateShuffleVector(V1, V2, shuffle));
659         }
660
661         Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
662         {
663                 return V(::builder->CreateSelect(C, ifTrue, ifFalse));
664         }
665
666         SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
667         {
668                 return reinterpret_cast<SwitchCases*>(::builder->CreateSwitch(control, defaultBranch, numCases));
669         }
670
671         void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
672         {
673                 switchCases->addCase(llvm::ConstantInt::get(Type::getInt32Ty(*::context), label, true), branch);
674         }
675
676         void Nucleus::createUnreachable()
677         {
678                 ::builder->CreateUnreachable();
679         }
680
681         static Value *createSwizzle4(Value *val, unsigned char select)
682         {
683                 int swizzle[4] =
684                 {
685                         (select >> 0) & 0x03,
686                         (select >> 2) & 0x03,
687                         (select >> 4) & 0x03,
688                         (select >> 6) & 0x03,
689                 };
690
691                 return Nucleus::createShuffleVector(val, val, swizzle);
692         }
693
694         static Value *createMask4(Value *lhs, Value *rhs, unsigned char select)
695         {
696                 bool mask[4] = {false, false, false, false};
697
698                 mask[(select >> 0) & 0x03] = true;
699                 mask[(select >> 2) & 0x03] = true;
700                 mask[(select >> 4) & 0x03] = true;
701                 mask[(select >> 6) & 0x03] = true;
702
703                 int swizzle[4] =
704                 {
705                         mask[0] ? 4 : 0,
706                         mask[1] ? 5 : 1,
707                         mask[2] ? 6 : 2,
708                         mask[3] ? 7 : 3,
709                 };
710
711                 return Nucleus::createShuffleVector(lhs, rhs, swizzle);
712         }
713
714         Type *Nucleus::getPointerType(Type *ElementType)
715         {
716                 return T(llvm::PointerType::get(ElementType, 0));
717         }
718
719         Value *Nucleus::createNullValue(Type *Ty)
720         {
721                 return V(llvm::Constant::getNullValue(Ty));
722         }
723
724         Value *Nucleus::createConstantLong(int64_t i)
725         {
726                 return V(llvm::ConstantInt::get(Type::getInt64Ty(*::context), i, true));
727         }
728
729         Value *Nucleus::createConstantInt(int i)
730         {
731                 return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, true));
732         }
733
734         Value *Nucleus::createConstantInt(unsigned int i)
735         {
736                 return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, false));
737         }
738
739         Value *Nucleus::createConstantBool(bool b)
740         {
741                 return V(llvm::ConstantInt::get(Type::getInt1Ty(*::context), b));
742         }
743
744         Value *Nucleus::createConstantByte(signed char i)
745         {
746                 return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, true));
747         }
748
749         Value *Nucleus::createConstantByte(unsigned char i)
750         {
751                 return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, false));
752         }
753
754         Value *Nucleus::createConstantShort(short i)
755         {
756                 return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, true));
757         }
758
759         Value *Nucleus::createConstantShort(unsigned short i)
760         {
761                 return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, false));
762         }
763
764         Value *Nucleus::createConstantFloat(float x)
765         {
766                 return V(llvm::ConstantFP::get(Float::getType(), x));
767         }
768
769         Value *Nucleus::createNullPointer(Type *Ty)
770         {
771                 return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(Ty, 0)));
772         }
773
774         Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
775         {
776                 assert(llvm::isa<VectorType>(type));
777                 const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
778                 assert(numConstants <= 16);
779                 llvm::Constant *constantVector[16];
780
781                 for(int i = 0; i < numConstants; i++)
782                 {
783                         constantVector[i] = llvm::ConstantInt::get(type->getContainedType(0), constants[i]);
784                 }
785
786                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
787         }
788
789         Value *Nucleus::createConstantVector(const double *constants, Type *type)
790         {
791                 assert(llvm::isa<VectorType>(type));
792                 const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
793                 assert(numConstants <= 8);
794                 llvm::Constant *constantVector[8];
795
796                 for(int i = 0; i < numConstants; i++)
797                 {
798                         constantVector[i] = llvm::ConstantFP::get(type->getContainedType(0), constants[i]);
799                 }
800
801                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
802         }
803
804         Type *Void::getType()
805         {
806                 return T(llvm::Type::getVoidTy(*::context));
807         }
808
809         class MMX : public LValue<MMX>
810         {
811         public:
812                 static Type *getType();
813         };
814
815         Type *MMX::getType()
816         {
817                 return T(llvm::Type::getX86_MMXTy(*::context));
818         }
819
820         Bool::Bool(Argument<Bool> argument)
821         {
822                 storeValue(argument.value);
823         }
824
825         Bool::Bool(bool x)
826         {
827                 storeValue(Nucleus::createConstantBool(x));
828         }
829
830         Bool::Bool(RValue<Bool> rhs)
831         {
832                 storeValue(rhs.value);
833         }
834
835         Bool::Bool(const Bool &rhs)
836         {
837                 Value *value = rhs.loadValue();
838                 storeValue(value);
839         }
840
841         Bool::Bool(const Reference<Bool> &rhs)
842         {
843                 Value *value = rhs.loadValue();
844                 storeValue(value);
845         }
846
847         RValue<Bool> Bool::operator=(RValue<Bool> rhs)
848         {
849                 storeValue(rhs.value);
850
851                 return rhs;
852         }
853
854         RValue<Bool> Bool::operator=(const Bool &rhs)
855         {
856                 Value *value = rhs.loadValue();
857                 storeValue(value);
858
859                 return RValue<Bool>(value);
860         }
861
862         RValue<Bool> Bool::operator=(const Reference<Bool> &rhs)
863         {
864                 Value *value = rhs.loadValue();
865                 storeValue(value);
866
867                 return RValue<Bool>(value);
868         }
869
870         RValue<Bool> operator!(RValue<Bool> val)
871         {
872                 return RValue<Bool>(Nucleus::createNot(val.value));
873         }
874
875         RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs)
876         {
877                 return RValue<Bool>(Nucleus::createAnd(lhs.value, rhs.value));
878         }
879
880         RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs)
881         {
882                 return RValue<Bool>(Nucleus::createOr(lhs.value, rhs.value));
883         }
884
885         Type *Bool::getType()
886         {
887                 return T(llvm::Type::getInt1Ty(*::context));
888         }
889
890         Byte::Byte(Argument<Byte> argument)
891         {
892                 storeValue(argument.value);
893         }
894
895         Byte::Byte(RValue<Int> cast)
896         {
897                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
898
899                 storeValue(integer);
900         }
901
902         Byte::Byte(RValue<UInt> cast)
903         {
904                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
905
906                 storeValue(integer);
907         }
908
909         Byte::Byte(RValue<UShort> cast)
910         {
911                 Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
912
913                 storeValue(integer);
914         }
915
916         Byte::Byte(int x)
917         {
918                 storeValue(Nucleus::createConstantByte((unsigned char)x));
919         }
920
921         Byte::Byte(unsigned char x)
922         {
923                 storeValue(Nucleus::createConstantByte(x));
924         }
925
926         Byte::Byte(RValue<Byte> rhs)
927         {
928                 storeValue(rhs.value);
929         }
930
931         Byte::Byte(const Byte &rhs)
932         {
933                 Value *value = rhs.loadValue();
934                 storeValue(value);
935         }
936
937         Byte::Byte(const Reference<Byte> &rhs)
938         {
939                 Value *value = rhs.loadValue();
940                 storeValue(value);
941         }
942
943         RValue<Byte> Byte::operator=(RValue<Byte> rhs)
944         {
945                 storeValue(rhs.value);
946
947                 return rhs;
948         }
949
950         RValue<Byte> Byte::operator=(const Byte &rhs)
951         {
952                 Value *value = rhs.loadValue();
953                 storeValue(value);
954
955                 return RValue<Byte>(value);
956         }
957
958         RValue<Byte> Byte::operator=(const Reference<Byte> &rhs)
959         {
960                 Value *value = rhs.loadValue();
961                 storeValue(value);
962
963                 return RValue<Byte>(value);
964         }
965
966         RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs)
967         {
968                 return RValue<Byte>(Nucleus::createAdd(lhs.value, rhs.value));
969         }
970
971         RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs)
972         {
973                 return RValue<Byte>(Nucleus::createSub(lhs.value, rhs.value));
974         }
975
976         RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs)
977         {
978                 return RValue<Byte>(Nucleus::createMul(lhs.value, rhs.value));
979         }
980
981         RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs)
982         {
983                 return RValue<Byte>(Nucleus::createUDiv(lhs.value, rhs.value));
984         }
985
986         RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs)
987         {
988                 return RValue<Byte>(Nucleus::createURem(lhs.value, rhs.value));
989         }
990
991         RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs)
992         {
993                 return RValue<Byte>(Nucleus::createAnd(lhs.value, rhs.value));
994         }
995
996         RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs)
997         {
998                 return RValue<Byte>(Nucleus::createOr(lhs.value, rhs.value));
999         }
1000
1001         RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs)
1002         {
1003                 return RValue<Byte>(Nucleus::createXor(lhs.value, rhs.value));
1004         }
1005
1006         RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs)
1007         {
1008                 return RValue<Byte>(Nucleus::createShl(lhs.value, rhs.value));
1009         }
1010
1011         RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs)
1012         {
1013                 return RValue<Byte>(Nucleus::createLShr(lhs.value, rhs.value));
1014         }
1015
1016         RValue<Byte> operator+=(Byte &lhs, RValue<Byte> rhs)
1017         {
1018                 return lhs = lhs + rhs;
1019         }
1020
1021         RValue<Byte> operator-=(Byte &lhs, RValue<Byte> rhs)
1022         {
1023                 return lhs = lhs - rhs;
1024         }
1025
1026         RValue<Byte> operator*=(Byte &lhs, RValue<Byte> rhs)
1027         {
1028                 return lhs = lhs * rhs;
1029         }
1030
1031         RValue<Byte> operator/=(Byte &lhs, RValue<Byte> rhs)
1032         {
1033                 return lhs = lhs / rhs;
1034         }
1035
1036         RValue<Byte> operator%=(Byte &lhs, RValue<Byte> rhs)
1037         {
1038                 return lhs = lhs % rhs;
1039         }
1040
1041         RValue<Byte> operator&=(Byte &lhs, RValue<Byte> rhs)
1042         {
1043                 return lhs = lhs & rhs;
1044         }
1045
1046         RValue<Byte> operator|=(Byte &lhs, RValue<Byte> rhs)
1047         {
1048                 return lhs = lhs | rhs;
1049         }
1050
1051         RValue<Byte> operator^=(Byte &lhs, RValue<Byte> rhs)
1052         {
1053                 return lhs = lhs ^ rhs;
1054         }
1055
1056         RValue<Byte> operator<<=(Byte &lhs, RValue<Byte> rhs)
1057         {
1058                 return lhs = lhs << rhs;
1059         }
1060
1061         RValue<Byte> operator>>=(Byte &lhs, RValue<Byte> rhs)
1062         {
1063                 return lhs = lhs >> rhs;
1064         }
1065
1066         RValue<Byte> operator+(RValue<Byte> val)
1067         {
1068                 return val;
1069         }
1070
1071         RValue<Byte> operator-(RValue<Byte> val)
1072         {
1073                 return RValue<Byte>(Nucleus::createNeg(val.value));
1074         }
1075
1076         RValue<Byte> operator~(RValue<Byte> val)
1077         {
1078                 return RValue<Byte>(Nucleus::createNot(val.value));
1079         }
1080
1081         RValue<Byte> operator++(Byte &val, int)   // Post-increment
1082         {
1083                 RValue<Byte> res = val;
1084
1085                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantByte((unsigned char)1)));
1086                 val.storeValue(inc);
1087
1088                 return res;
1089         }
1090
1091         const Byte &operator++(Byte &val)   // Pre-increment
1092         {
1093                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantByte((unsigned char)1)));
1094                 val.storeValue(inc);
1095
1096                 return val;
1097         }
1098
1099         RValue<Byte> operator--(Byte &val, int)   // Post-decrement
1100         {
1101                 RValue<Byte> res = val;
1102
1103                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantByte((unsigned char)1)));
1104                 val.storeValue(inc);
1105
1106                 return res;
1107         }
1108
1109         const Byte &operator--(Byte &val)   // Pre-decrement
1110         {
1111                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantByte((unsigned char)1)));
1112                 val.storeValue(inc);
1113
1114                 return val;
1115         }
1116
1117         RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs)
1118         {
1119                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
1120         }
1121
1122         RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs)
1123         {
1124                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
1125         }
1126
1127         RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs)
1128         {
1129                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
1130         }
1131
1132         RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs)
1133         {
1134                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
1135         }
1136
1137         RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs)
1138         {
1139                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1140         }
1141
1142         RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs)
1143         {
1144                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1145         }
1146
1147         Type *Byte::getType()
1148         {
1149                 return T(llvm::Type::getInt8Ty(*::context));
1150         }
1151
1152         SByte::SByte(Argument<SByte> argument)
1153         {
1154                 storeValue(argument.value);
1155         }
1156
1157         SByte::SByte(RValue<Int> cast)
1158         {
1159                 Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
1160
1161                 storeValue(integer);
1162         }
1163
1164         SByte::SByte(RValue<Short> cast)
1165         {
1166                 Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
1167
1168                 storeValue(integer);
1169         }
1170
1171         SByte::SByte(signed char x)
1172         {
1173                 storeValue(Nucleus::createConstantByte(x));
1174         }
1175
1176         SByte::SByte(RValue<SByte> rhs)
1177         {
1178                 storeValue(rhs.value);
1179         }
1180
1181         SByte::SByte(const SByte &rhs)
1182         {
1183                 Value *value = rhs.loadValue();
1184                 storeValue(value);
1185         }
1186
1187         SByte::SByte(const Reference<SByte> &rhs)
1188         {
1189                 Value *value = rhs.loadValue();
1190                 storeValue(value);
1191         }
1192
1193         RValue<SByte> SByte::operator=(RValue<SByte> rhs)
1194         {
1195                 storeValue(rhs.value);
1196
1197                 return rhs;
1198         }
1199
1200         RValue<SByte> SByte::operator=(const SByte &rhs)
1201         {
1202                 Value *value = rhs.loadValue();
1203                 storeValue(value);
1204
1205                 return RValue<SByte>(value);
1206         }
1207
1208         RValue<SByte> SByte::operator=(const Reference<SByte> &rhs)
1209         {
1210                 Value *value = rhs.loadValue();
1211                 storeValue(value);
1212
1213                 return RValue<SByte>(value);
1214         }
1215
1216         RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs)
1217         {
1218                 return RValue<SByte>(Nucleus::createAdd(lhs.value, rhs.value));
1219         }
1220
1221         RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs)
1222         {
1223                 return RValue<SByte>(Nucleus::createSub(lhs.value, rhs.value));
1224         }
1225
1226         RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs)
1227         {
1228                 return RValue<SByte>(Nucleus::createMul(lhs.value, rhs.value));
1229         }
1230
1231         RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs)
1232         {
1233                 return RValue<SByte>(Nucleus::createSDiv(lhs.value, rhs.value));
1234         }
1235
1236         RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs)
1237         {
1238                 return RValue<SByte>(Nucleus::createSRem(lhs.value, rhs.value));
1239         }
1240
1241         RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs)
1242         {
1243                 return RValue<SByte>(Nucleus::createAnd(lhs.value, rhs.value));
1244         }
1245
1246         RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs)
1247         {
1248                 return RValue<SByte>(Nucleus::createOr(lhs.value, rhs.value));
1249         }
1250
1251         RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs)
1252         {
1253                 return RValue<SByte>(Nucleus::createXor(lhs.value, rhs.value));
1254         }
1255
1256         RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs)
1257         {
1258                 return RValue<SByte>(Nucleus::createShl(lhs.value, rhs.value));
1259         }
1260
1261         RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs)
1262         {
1263                 return RValue<SByte>(Nucleus::createAShr(lhs.value, rhs.value));
1264         }
1265
1266         RValue<SByte> operator+=(SByte &lhs, RValue<SByte> rhs)
1267         {
1268                 return lhs = lhs + rhs;
1269         }
1270
1271         RValue<SByte> operator-=(SByte &lhs, RValue<SByte> rhs)
1272         {
1273                 return lhs = lhs - rhs;
1274         }
1275
1276         RValue<SByte> operator*=(SByte &lhs, RValue<SByte> rhs)
1277         {
1278                 return lhs = lhs * rhs;
1279         }
1280
1281         RValue<SByte> operator/=(SByte &lhs, RValue<SByte> rhs)
1282         {
1283                 return lhs = lhs / rhs;
1284         }
1285
1286         RValue<SByte> operator%=(SByte &lhs, RValue<SByte> rhs)
1287         {
1288                 return lhs = lhs % rhs;
1289         }
1290
1291         RValue<SByte> operator&=(SByte &lhs, RValue<SByte> rhs)
1292         {
1293                 return lhs = lhs & rhs;
1294         }
1295
1296         RValue<SByte> operator|=(SByte &lhs, RValue<SByte> rhs)
1297         {
1298                 return lhs = lhs | rhs;
1299         }
1300
1301         RValue<SByte> operator^=(SByte &lhs, RValue<SByte> rhs)
1302         {
1303                 return lhs = lhs ^ rhs;
1304         }
1305
1306         RValue<SByte> operator<<=(SByte &lhs, RValue<SByte> rhs)
1307         {
1308                 return lhs = lhs << rhs;
1309         }
1310
1311         RValue<SByte> operator>>=(SByte &lhs, RValue<SByte> rhs)
1312         {
1313                 return lhs = lhs >> rhs;
1314         }
1315
1316         RValue<SByte> operator+(RValue<SByte> val)
1317         {
1318                 return val;
1319         }
1320
1321         RValue<SByte> operator-(RValue<SByte> val)
1322         {
1323                 return RValue<SByte>(Nucleus::createNeg(val.value));
1324         }
1325
1326         RValue<SByte> operator~(RValue<SByte> val)
1327         {
1328                 return RValue<SByte>(Nucleus::createNot(val.value));
1329         }
1330
1331         RValue<SByte> operator++(SByte &val, int)   // Post-increment
1332         {
1333                 RValue<SByte> res = val;
1334
1335                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantByte((signed char)1)));
1336                 val.storeValue(inc);
1337
1338                 return res;
1339         }
1340
1341         const SByte &operator++(SByte &val)   // Pre-increment
1342         {
1343                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantByte((signed char)1)));
1344                 val.storeValue(inc);
1345
1346                 return val;
1347         }
1348
1349         RValue<SByte> operator--(SByte &val, int)   // Post-decrement
1350         {
1351                 RValue<SByte> res = val;
1352
1353                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantByte((signed char)1)));
1354                 val.storeValue(inc);
1355
1356                 return res;
1357         }
1358
1359         const SByte &operator--(SByte &val)   // Pre-decrement
1360         {
1361                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantByte((signed char)1)));
1362                 val.storeValue(inc);
1363
1364                 return val;
1365         }
1366
1367         RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs)
1368         {
1369                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
1370         }
1371
1372         RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs)
1373         {
1374                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
1375         }
1376
1377         RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs)
1378         {
1379                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
1380         }
1381
1382         RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs)
1383         {
1384                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
1385         }
1386
1387         RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs)
1388         {
1389                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1390         }
1391
1392         RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs)
1393         {
1394                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1395         }
1396
1397         Type *SByte::getType()
1398         {
1399                 return T(llvm::Type::getInt8Ty(*::context));
1400         }
1401
1402         Short::Short(Argument<Short> argument)
1403         {
1404                 storeValue(argument.value);
1405         }
1406
1407         Short::Short(RValue<Int> cast)
1408         {
1409                 Value *integer = Nucleus::createTrunc(cast.value, Short::getType());
1410
1411                 storeValue(integer);
1412         }
1413
1414         Short::Short(short x)
1415         {
1416                 storeValue(Nucleus::createConstantShort(x));
1417         }
1418
1419         Short::Short(RValue<Short> rhs)
1420         {
1421                 storeValue(rhs.value);
1422         }
1423
1424         Short::Short(const Short &rhs)
1425         {
1426                 Value *value = rhs.loadValue();
1427                 storeValue(value);
1428         }
1429
1430         Short::Short(const Reference<Short> &rhs)
1431         {
1432                 Value *value = rhs.loadValue();
1433                 storeValue(value);
1434         }
1435
1436         RValue<Short> Short::operator=(RValue<Short> rhs)
1437         {
1438                 storeValue(rhs.value);
1439
1440                 return rhs;
1441         }
1442
1443         RValue<Short> Short::operator=(const Short &rhs)
1444         {
1445                 Value *value = rhs.loadValue();
1446                 storeValue(value);
1447
1448                 return RValue<Short>(value);
1449         }
1450
1451         RValue<Short> Short::operator=(const Reference<Short> &rhs)
1452         {
1453                 Value *value = rhs.loadValue();
1454                 storeValue(value);
1455
1456                 return RValue<Short>(value);
1457         }
1458
1459         RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs)
1460         {
1461                 return RValue<Short>(Nucleus::createAdd(lhs.value, rhs.value));
1462         }
1463
1464         RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs)
1465         {
1466                 return RValue<Short>(Nucleus::createSub(lhs.value, rhs.value));
1467         }
1468
1469         RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs)
1470         {
1471                 return RValue<Short>(Nucleus::createMul(lhs.value, rhs.value));
1472         }
1473
1474         RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs)
1475         {
1476                 return RValue<Short>(Nucleus::createSDiv(lhs.value, rhs.value));
1477         }
1478
1479         RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs)
1480         {
1481                 return RValue<Short>(Nucleus::createSRem(lhs.value, rhs.value));
1482         }
1483
1484         RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs)
1485         {
1486                 return RValue<Short>(Nucleus::createAnd(lhs.value, rhs.value));
1487         }
1488
1489         RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs)
1490         {
1491                 return RValue<Short>(Nucleus::createOr(lhs.value, rhs.value));
1492         }
1493
1494         RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs)
1495         {
1496                 return RValue<Short>(Nucleus::createXor(lhs.value, rhs.value));
1497         }
1498
1499         RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs)
1500         {
1501                 return RValue<Short>(Nucleus::createShl(lhs.value, rhs.value));
1502         }
1503
1504         RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs)
1505         {
1506                 return RValue<Short>(Nucleus::createAShr(lhs.value, rhs.value));
1507         }
1508
1509         RValue<Short> operator+=(Short &lhs, RValue<Short> rhs)
1510         {
1511                 return lhs = lhs + rhs;
1512         }
1513
1514         RValue<Short> operator-=(Short &lhs, RValue<Short> rhs)
1515         {
1516                 return lhs = lhs - rhs;
1517         }
1518
1519         RValue<Short> operator*=(Short &lhs, RValue<Short> rhs)
1520         {
1521                 return lhs = lhs * rhs;
1522         }
1523
1524         RValue<Short> operator/=(Short &lhs, RValue<Short> rhs)
1525         {
1526                 return lhs = lhs / rhs;
1527         }
1528
1529         RValue<Short> operator%=(Short &lhs, RValue<Short> rhs)
1530         {
1531                 return lhs = lhs % rhs;
1532         }
1533
1534         RValue<Short> operator&=(Short &lhs, RValue<Short> rhs)
1535         {
1536                 return lhs = lhs & rhs;
1537         }
1538
1539         RValue<Short> operator|=(Short &lhs, RValue<Short> rhs)
1540         {
1541                 return lhs = lhs | rhs;
1542         }
1543
1544         RValue<Short> operator^=(Short &lhs, RValue<Short> rhs)
1545         {
1546                 return lhs = lhs ^ rhs;
1547         }
1548
1549         RValue<Short> operator<<=(Short &lhs, RValue<Short> rhs)
1550         {
1551                 return lhs = lhs << rhs;
1552         }
1553
1554         RValue<Short> operator>>=(Short &lhs, RValue<Short> rhs)
1555         {
1556                 return lhs = lhs >> rhs;
1557         }
1558
1559         RValue<Short> operator+(RValue<Short> val)
1560         {
1561                 return val;
1562         }
1563
1564         RValue<Short> operator-(RValue<Short> val)
1565         {
1566                 return RValue<Short>(Nucleus::createNeg(val.value));
1567         }
1568
1569         RValue<Short> operator~(RValue<Short> val)
1570         {
1571                 return RValue<Short>(Nucleus::createNot(val.value));
1572         }
1573
1574         RValue<Short> operator++(Short &val, int)   // Post-increment
1575         {
1576                 RValue<Short> res = val;
1577
1578                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantShort((short)1)));
1579                 val.storeValue(inc);
1580
1581                 return res;
1582         }
1583
1584         const Short &operator++(Short &val)   // Pre-increment
1585         {
1586                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantShort((short)1)));
1587                 val.storeValue(inc);
1588
1589                 return val;
1590         }
1591
1592         RValue<Short> operator--(Short &val, int)   // Post-decrement
1593         {
1594                 RValue<Short> res = val;
1595
1596                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantShort((short)1)));
1597                 val.storeValue(inc);
1598
1599                 return res;
1600         }
1601
1602         const Short &operator--(Short &val)   // Pre-decrement
1603         {
1604                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantShort((short)1)));
1605                 val.storeValue(inc);
1606
1607                 return val;
1608         }
1609
1610         RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs)
1611         {
1612                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
1613         }
1614
1615         RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs)
1616         {
1617                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
1618         }
1619
1620         RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs)
1621         {
1622                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
1623         }
1624
1625         RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs)
1626         {
1627                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
1628         }
1629
1630         RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs)
1631         {
1632                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1633         }
1634
1635         RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs)
1636         {
1637                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1638         }
1639
1640         Type *Short::getType()
1641         {
1642                 return T(llvm::Type::getInt16Ty(*::context));
1643         }
1644
1645         UShort::UShort(Argument<UShort> argument)
1646         {
1647                 storeValue(argument.value);
1648         }
1649
1650         UShort::UShort(RValue<UInt> cast)
1651         {
1652                 Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
1653
1654                 storeValue(integer);
1655         }
1656
1657         UShort::UShort(RValue<Int> cast)
1658         {
1659                 Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
1660
1661                 storeValue(integer);
1662         }
1663
1664         UShort::UShort(unsigned short x)
1665         {
1666                 storeValue(Nucleus::createConstantShort(x));
1667         }
1668
1669         UShort::UShort(RValue<UShort> rhs)
1670         {
1671                 storeValue(rhs.value);
1672         }
1673
1674         UShort::UShort(const UShort &rhs)
1675         {
1676                 Value *value = rhs.loadValue();
1677                 storeValue(value);
1678         }
1679
1680         UShort::UShort(const Reference<UShort> &rhs)
1681         {
1682                 Value *value = rhs.loadValue();
1683                 storeValue(value);
1684         }
1685
1686         RValue<UShort> UShort::operator=(RValue<UShort> rhs)
1687         {
1688                 storeValue(rhs.value);
1689
1690                 return rhs;
1691         }
1692
1693         RValue<UShort> UShort::operator=(const UShort &rhs)
1694         {
1695                 Value *value = rhs.loadValue();
1696                 storeValue(value);
1697
1698                 return RValue<UShort>(value);
1699         }
1700
1701         RValue<UShort> UShort::operator=(const Reference<UShort> &rhs)
1702         {
1703                 Value *value = rhs.loadValue();
1704                 storeValue(value);
1705
1706                 return RValue<UShort>(value);
1707         }
1708
1709         RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs)
1710         {
1711                 return RValue<UShort>(Nucleus::createAdd(lhs.value, rhs.value));
1712         }
1713
1714         RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs)
1715         {
1716                 return RValue<UShort>(Nucleus::createSub(lhs.value, rhs.value));
1717         }
1718
1719         RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs)
1720         {
1721                 return RValue<UShort>(Nucleus::createMul(lhs.value, rhs.value));
1722         }
1723
1724         RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs)
1725         {
1726                 return RValue<UShort>(Nucleus::createUDiv(lhs.value, rhs.value));
1727         }
1728
1729         RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs)
1730         {
1731                 return RValue<UShort>(Nucleus::createURem(lhs.value, rhs.value));
1732         }
1733
1734         RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs)
1735         {
1736                 return RValue<UShort>(Nucleus::createAnd(lhs.value, rhs.value));
1737         }
1738
1739         RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs)
1740         {
1741                 return RValue<UShort>(Nucleus::createOr(lhs.value, rhs.value));
1742         }
1743
1744         RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs)
1745         {
1746                 return RValue<UShort>(Nucleus::createXor(lhs.value, rhs.value));
1747         }
1748
1749         RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs)
1750         {
1751                 return RValue<UShort>(Nucleus::createShl(lhs.value, rhs.value));
1752         }
1753
1754         RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs)
1755         {
1756                 return RValue<UShort>(Nucleus::createLShr(lhs.value, rhs.value));
1757         }
1758
1759         RValue<UShort> operator+=(UShort &lhs, RValue<UShort> rhs)
1760         {
1761                 return lhs = lhs + rhs;
1762         }
1763
1764         RValue<UShort> operator-=(UShort &lhs, RValue<UShort> rhs)
1765         {
1766                 return lhs = lhs - rhs;
1767         }
1768
1769         RValue<UShort> operator*=(UShort &lhs, RValue<UShort> rhs)
1770         {
1771                 return lhs = lhs * rhs;
1772         }
1773
1774         RValue<UShort> operator/=(UShort &lhs, RValue<UShort> rhs)
1775         {
1776                 return lhs = lhs / rhs;
1777         }
1778
1779         RValue<UShort> operator%=(UShort &lhs, RValue<UShort> rhs)
1780         {
1781                 return lhs = lhs % rhs;
1782         }
1783
1784         RValue<UShort> operator&=(UShort &lhs, RValue<UShort> rhs)
1785         {
1786                 return lhs = lhs & rhs;
1787         }
1788
1789         RValue<UShort> operator|=(UShort &lhs, RValue<UShort> rhs)
1790         {
1791                 return lhs = lhs | rhs;
1792         }
1793
1794         RValue<UShort> operator^=(UShort &lhs, RValue<UShort> rhs)
1795         {
1796                 return lhs = lhs ^ rhs;
1797         }
1798
1799         RValue<UShort> operator<<=(UShort &lhs, RValue<UShort> rhs)
1800         {
1801                 return lhs = lhs << rhs;
1802         }
1803
1804         RValue<UShort> operator>>=(UShort &lhs, RValue<UShort> rhs)
1805         {
1806                 return lhs = lhs >> rhs;
1807         }
1808
1809         RValue<UShort> operator+(RValue<UShort> val)
1810         {
1811                 return val;
1812         }
1813
1814         RValue<UShort> operator-(RValue<UShort> val)
1815         {
1816                 return RValue<UShort>(Nucleus::createNeg(val.value));
1817         }
1818
1819         RValue<UShort> operator~(RValue<UShort> val)
1820         {
1821                 return RValue<UShort>(Nucleus::createNot(val.value));
1822         }
1823
1824         RValue<UShort> operator++(UShort &val, int)   // Post-increment
1825         {
1826                 RValue<UShort> res = val;
1827
1828                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantShort((unsigned short)1)));
1829                 val.storeValue(inc);
1830
1831                 return res;
1832         }
1833
1834         const UShort &operator++(UShort &val)   // Pre-increment
1835         {
1836                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantShort((unsigned short)1)));
1837                 val.storeValue(inc);
1838
1839                 return val;
1840         }
1841
1842         RValue<UShort> operator--(UShort &val, int)   // Post-decrement
1843         {
1844                 RValue<UShort> res = val;
1845
1846                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantShort((unsigned short)1)));
1847                 val.storeValue(inc);
1848
1849                 return res;
1850         }
1851
1852         const UShort &operator--(UShort &val)   // Pre-decrement
1853         {
1854                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantShort((unsigned short)1)));
1855                 val.storeValue(inc);
1856
1857                 return val;
1858         }
1859
1860         RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs)
1861         {
1862                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
1863         }
1864
1865         RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs)
1866         {
1867                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
1868         }
1869
1870         RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs)
1871         {
1872                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
1873         }
1874
1875         RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs)
1876         {
1877                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
1878         }
1879
1880         RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs)
1881         {
1882                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1883         }
1884
1885         RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs)
1886         {
1887                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1888         }
1889
1890         Type *UShort::getType()
1891         {
1892                 return T(llvm::Type::getInt16Ty(*::context));
1893         }
1894
1895         Byte4::Byte4(RValue<Byte8> cast)
1896         {
1897                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), Int::getType()));
1898         }
1899
1900         Byte4::Byte4(const Reference<Byte4> &rhs)
1901         {
1902                 Value *value = rhs.loadValue();
1903                 storeValue(value);
1904         }
1905
1906         Type *Byte4::getType()
1907         {
1908                 #if 0
1909                         return T(VectorType::get(Byte::getType(), 4));
1910                 #else
1911                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
1912                 #endif
1913         }
1914
1915         Type *SByte4::getType()
1916         {
1917                 #if 0
1918                         return T(VectorType::get(SByte::getType(), 4));
1919                 #else
1920                         return Int::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
1921                 #endif
1922         }
1923
1924         Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
1925         {
1926                 int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
1927                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Byte::getType(), 8))));
1928
1929                 storeValue(Nucleus::createBitCast(vector, getType()));
1930         }
1931
1932         Byte8::Byte8(RValue<Byte8> rhs)
1933         {
1934                 storeValue(rhs.value);
1935         }
1936
1937         Byte8::Byte8(const Byte8 &rhs)
1938         {
1939                 Value *value = rhs.loadValue();
1940                 storeValue(value);
1941         }
1942
1943         Byte8::Byte8(const Reference<Byte8> &rhs)
1944         {
1945                 Value *value = rhs.loadValue();
1946                 storeValue(value);
1947         }
1948
1949         RValue<Byte8> Byte8::operator=(RValue<Byte8> rhs)
1950         {
1951                 storeValue(rhs.value);
1952
1953                 return rhs;
1954         }
1955
1956         RValue<Byte8> Byte8::operator=(const Byte8 &rhs)
1957         {
1958                 Value *value = rhs.loadValue();
1959                 storeValue(value);
1960
1961                 return RValue<Byte8>(value);
1962         }
1963
1964         RValue<Byte8> Byte8::operator=(const Reference<Byte8> &rhs)
1965         {
1966                 Value *value = rhs.loadValue();
1967                 storeValue(value);
1968
1969                 return RValue<Byte8>(value);
1970         }
1971
1972         RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs)
1973         {
1974                 if(CPUID::supportsMMX2())
1975                 {
1976                         return x86::paddb(lhs, rhs);
1977                 }
1978                 else
1979                 {
1980                         return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
1981                 }
1982         }
1983
1984         RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs)
1985         {
1986                 if(CPUID::supportsMMX2())
1987                 {
1988                         return x86::psubb(lhs, rhs);
1989                 }
1990                 else
1991                 {
1992                         return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
1993                 }
1994         }
1995
1996 //      RValue<Byte8> operator*(RValue<Byte8> lhs, RValue<Byte8> rhs)
1997 //      {
1998 //              return RValue<Byte8>(Nucleus::createMul(lhs.value, rhs.value));
1999 //      }
2000
2001 //      RValue<Byte8> operator/(RValue<Byte8> lhs, RValue<Byte8> rhs)
2002 //      {
2003 //              return RValue<Byte8>(Nucleus::createUDiv(lhs.value, rhs.value));
2004 //      }
2005
2006 //      RValue<Byte8> operator%(RValue<Byte8> lhs, RValue<Byte8> rhs)
2007 //      {
2008 //              return RValue<Byte8>(Nucleus::createURem(lhs.value, rhs.value));
2009 //      }
2010
2011         RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs)
2012         {
2013                 if(CPUID::supportsMMX2())
2014                 {
2015                         return As<Byte8>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
2016                 }
2017                 else
2018                 {
2019                         return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
2020                 }
2021         }
2022
2023         RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs)
2024         {
2025                 if(CPUID::supportsMMX2())
2026                 {
2027                         return As<Byte8>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
2028                 }
2029                 else
2030                 {
2031                         return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
2032                 }
2033         }
2034
2035         RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs)
2036         {
2037                 if(CPUID::supportsMMX2())
2038                 {
2039                         return As<Byte8>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
2040                 }
2041                 else
2042                 {
2043                         return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
2044                 }
2045         }
2046
2047 //      RValue<Byte8> operator<<(RValue<Byte8> lhs, unsigned char rhs)
2048 //      {
2049 //              return RValue<Byte8>(Nucleus::createShl(lhs.value, rhs.value));
2050 //      }
2051
2052 //      RValue<Byte8> operator>>(RValue<Byte8> lhs, unsigned char rhs)
2053 //      {
2054 //              return RValue<Byte8>(Nucleus::createLShr(lhs.value, rhs.value));
2055 //      }
2056
2057         RValue<Byte8> operator+=(Byte8 &lhs, RValue<Byte8> rhs)
2058         {
2059                 return lhs = lhs + rhs;
2060         }
2061
2062         RValue<Byte8> operator-=(Byte8 &lhs, RValue<Byte8> rhs)
2063         {
2064                 return lhs = lhs - rhs;
2065         }
2066
2067 //      RValue<Byte8> operator*=(Byte8 &lhs, RValue<Byte8> rhs)
2068 //      {
2069 //              return lhs = lhs * rhs;
2070 //      }
2071
2072 //      RValue<Byte8> operator/=(Byte8 &lhs, RValue<Byte8> rhs)
2073 //      {
2074 //              return lhs = lhs / rhs;
2075 //      }
2076
2077 //      RValue<Byte8> operator%=(Byte8 &lhs, RValue<Byte8> rhs)
2078 //      {
2079 //              return lhs = lhs % rhs;
2080 //      }
2081
2082         RValue<Byte8> operator&=(Byte8 &lhs, RValue<Byte8> rhs)
2083         {
2084                 return lhs = lhs & rhs;
2085         }
2086
2087         RValue<Byte8> operator|=(Byte8 &lhs, RValue<Byte8> rhs)
2088         {
2089                 return lhs = lhs | rhs;
2090         }
2091
2092         RValue<Byte8> operator^=(Byte8 &lhs, RValue<Byte8> rhs)
2093         {
2094                 return lhs = lhs ^ rhs;
2095         }
2096
2097 //      RValue<Byte8> operator<<=(Byte8 &lhs, RValue<Byte8> rhs)
2098 //      {
2099 //              return lhs = lhs << rhs;
2100 //      }
2101
2102 //      RValue<Byte8> operator>>=(Byte8 &lhs, RValue<Byte8> rhs)
2103 //      {
2104 //              return lhs = lhs >> rhs;
2105 //      }
2106
2107 //      RValue<Byte8> operator+(RValue<Byte8> val)
2108 //      {
2109 //              return val;
2110 //      }
2111
2112 //      RValue<Byte8> operator-(RValue<Byte8> val)
2113 //      {
2114 //              return RValue<Byte8>(Nucleus::createNeg(val.value));
2115 //      }
2116
2117         RValue<Byte8> operator~(RValue<Byte8> val)
2118         {
2119                 if(CPUID::supportsMMX2())
2120                 {
2121                         return val ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
2122                 }
2123                 else
2124                 {
2125                         return RValue<Byte8>(Nucleus::createNot(val.value));
2126                 }
2127         }
2128
2129         RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
2130         {
2131                 return x86::paddusb(x, y);
2132         }
2133
2134         RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
2135         {
2136                 return x86::psubusb(x, y);
2137         }
2138
2139         RValue<Short4> Unpack(RValue<Byte4> x)
2140         {
2141                 Value *int2 = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), x.value, 0);
2142                 Value *byte8 = Nucleus::createBitCast(int2, Byte8::getType());
2143
2144                 return UnpackLow(RValue<Byte8>(byte8), RValue<Byte8>(byte8));
2145         }
2146
2147         RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y)
2148         {
2149                 Value *xx = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), x.value, 0);
2150                 Value *yy = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), y.value, 0);
2151
2152                 return UnpackLow(As<Byte8>(xx), As<Byte8>(yy));
2153         }
2154
2155         RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
2156         {
2157                 if(CPUID::supportsMMX2())
2158                 {
2159                         return x86::punpcklbw(x, y);
2160                 }
2161                 else
2162                 {
2163                         int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2164                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2165
2166                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2167                 }
2168         }
2169
2170         RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
2171         {
2172                 if(CPUID::supportsMMX2())
2173                 {
2174                         return x86::punpckhbw(x, y);
2175                 }
2176                 else
2177                 {
2178                         int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
2179                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2180
2181                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2182                 }
2183         }
2184
2185         RValue<Int> SignMask(RValue<Byte8> x)
2186         {
2187                 return x86::pmovmskb(x);
2188         }
2189
2190 //      RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
2191 //      {
2192 //              return x86::pcmpgtb(x, y);   // FIXME: Signedness
2193 //      }
2194
2195         RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
2196         {
2197                 return x86::pcmpeqb(x, y);
2198         }
2199
2200         Type *Byte8::getType()
2201         {
2202                 if(CPUID::supportsMMX2())
2203                 {
2204                         return MMX::getType();
2205                 }
2206                 else
2207                 {
2208                         return T(VectorType::get(Byte::getType(), 8));
2209                 }
2210         }
2211
2212         SByte8::SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
2213         {
2214                 int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
2215                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(SByte::getType(), 8))));
2216
2217                 storeValue(Nucleus::createBitCast(vector, getType()));
2218         }
2219
2220         SByte8::SByte8(RValue<SByte8> rhs)
2221         {
2222                 storeValue(rhs.value);
2223         }
2224
2225         SByte8::SByte8(const SByte8 &rhs)
2226         {
2227                 Value *value = rhs.loadValue();
2228                 storeValue(value);
2229         }
2230
2231         SByte8::SByte8(const Reference<SByte8> &rhs)
2232         {
2233                 Value *value = rhs.loadValue();
2234                 storeValue(value);
2235         }
2236
2237         RValue<SByte8> SByte8::operator=(RValue<SByte8> rhs)
2238         {
2239                 storeValue(rhs.value);
2240
2241                 return rhs;
2242         }
2243
2244         RValue<SByte8> SByte8::operator=(const SByte8 &rhs)
2245         {
2246                 Value *value = rhs.loadValue();
2247                 storeValue(value);
2248
2249                 return RValue<SByte8>(value);
2250         }
2251
2252         RValue<SByte8> SByte8::operator=(const Reference<SByte8> &rhs)
2253         {
2254                 Value *value = rhs.loadValue();
2255                 storeValue(value);
2256
2257                 return RValue<SByte8>(value);
2258         }
2259
2260         RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs)
2261         {
2262                 if(CPUID::supportsMMX2())
2263                 {
2264                         return As<SByte8>(x86::paddb(As<Byte8>(lhs), As<Byte8>(rhs)));
2265                 }
2266                 else
2267                 {
2268                         return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
2269                 }
2270         }
2271
2272         RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs)
2273         {
2274                 if(CPUID::supportsMMX2())
2275                 {
2276                         return As<SByte8>(x86::psubb(As<Byte8>(lhs), As<Byte8>(rhs)));
2277                 }
2278                 else
2279                 {
2280                         return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
2281                 }
2282         }
2283
2284 //      RValue<SByte8> operator*(RValue<SByte8> lhs, RValue<SByte8> rhs)
2285 //      {
2286 //              return RValue<SByte8>(Nucleus::createMul(lhs.value, rhs.value));
2287 //      }
2288
2289 //      RValue<SByte8> operator/(RValue<SByte8> lhs, RValue<SByte8> rhs)
2290 //      {
2291 //              return RValue<SByte8>(Nucleus::createSDiv(lhs.value, rhs.value));
2292 //      }
2293
2294 //      RValue<SByte8> operator%(RValue<SByte8> lhs, RValue<SByte8> rhs)
2295 //      {
2296 //              return RValue<SByte8>(Nucleus::createSRem(lhs.value, rhs.value));
2297 //      }
2298
2299         RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs)
2300         {
2301                 return RValue<SByte8>(Nucleus::createAnd(lhs.value, rhs.value));
2302         }
2303
2304         RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs)
2305         {
2306                 return RValue<SByte8>(Nucleus::createOr(lhs.value, rhs.value));
2307         }
2308
2309         RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs)
2310         {
2311                 return RValue<SByte8>(Nucleus::createXor(lhs.value, rhs.value));
2312         }
2313
2314 //      RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
2315 //      {
2316 //              return RValue<SByte8>(Nucleus::createShl(lhs.value, rhs.value));
2317 //      }
2318
2319 //      RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
2320 //      {
2321 //              return RValue<SByte8>(Nucleus::createAShr(lhs.value, rhs.value));
2322 //      }
2323
2324         RValue<SByte8> operator+=(SByte8 &lhs, RValue<SByte8> rhs)
2325         {
2326                 return lhs = lhs + rhs;
2327         }
2328
2329         RValue<SByte8> operator-=(SByte8 &lhs, RValue<SByte8> rhs)
2330         {
2331                 return lhs = lhs - rhs;
2332         }
2333
2334 //      RValue<SByte8> operator*=(SByte8 &lhs, RValue<SByte8> rhs)
2335 //      {
2336 //              return lhs = lhs * rhs;
2337 //      }
2338
2339 //      RValue<SByte8> operator/=(SByte8 &lhs, RValue<SByte8> rhs)
2340 //      {
2341 //              return lhs = lhs / rhs;
2342 //      }
2343
2344 //      RValue<SByte8> operator%=(SByte8 &lhs, RValue<SByte8> rhs)
2345 //      {
2346 //              return lhs = lhs % rhs;
2347 //      }
2348
2349         RValue<SByte8> operator&=(SByte8 &lhs, RValue<SByte8> rhs)
2350         {
2351                 return lhs = lhs & rhs;
2352         }
2353
2354         RValue<SByte8> operator|=(SByte8 &lhs, RValue<SByte8> rhs)
2355         {
2356                 return lhs = lhs | rhs;
2357         }
2358
2359         RValue<SByte8> operator^=(SByte8 &lhs, RValue<SByte8> rhs)
2360         {
2361                 return lhs = lhs ^ rhs;
2362         }
2363
2364 //      RValue<SByte8> operator<<=(SByte8 &lhs, RValue<SByte8> rhs)
2365 //      {
2366 //              return lhs = lhs << rhs;
2367 //      }
2368
2369 //      RValue<SByte8> operator>>=(SByte8 &lhs, RValue<SByte8> rhs)
2370 //      {
2371 //              return lhs = lhs >> rhs;
2372 //      }
2373
2374 //      RValue<SByte8> operator+(RValue<SByte8> val)
2375 //      {
2376 //              return val;
2377 //      }
2378
2379 //      RValue<SByte8> operator-(RValue<SByte8> val)
2380 //      {
2381 //              return RValue<SByte8>(Nucleus::createNeg(val.value));
2382 //      }
2383
2384         RValue<SByte8> operator~(RValue<SByte8> val)
2385         {
2386                 if(CPUID::supportsMMX2())
2387                 {
2388                         return val ^ SByte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
2389                 }
2390                 else
2391                 {
2392                         return RValue<SByte8>(Nucleus::createNot(val.value));
2393                 }
2394         }
2395
2396         RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
2397         {
2398                 return x86::paddsb(x, y);
2399         }
2400
2401         RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
2402         {
2403                 return x86::psubsb(x, y);
2404         }
2405
2406         RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
2407         {
2408                 if(CPUID::supportsMMX2())
2409                 {
2410                         return As<Short4>(x86::punpcklbw(As<Byte8>(x), As<Byte8>(y)));
2411                 }
2412                 else
2413                 {
2414                         int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2415                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2416
2417                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2418                 }
2419         }
2420
2421         RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
2422         {
2423                 if(CPUID::supportsMMX2())
2424                 {
2425                         return As<Short4>(x86::punpckhbw(As<Byte8>(x), As<Byte8>(y)));
2426                 }
2427                 else
2428                 {
2429                         int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
2430                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2431
2432                         return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2433                 }
2434         }
2435
2436         RValue<Int> SignMask(RValue<SByte8> x)
2437         {
2438                 return x86::pmovmskb(As<Byte8>(x));
2439         }
2440
2441         RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
2442         {
2443                 return x86::pcmpgtb(x, y);
2444         }
2445
2446         RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
2447         {
2448                 return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
2449         }
2450
2451         Type *SByte8::getType()
2452         {
2453                 if(CPUID::supportsMMX2())
2454                 {
2455                         return MMX::getType();
2456                 }
2457                 else
2458                 {
2459                         return T(VectorType::get(SByte::getType(), 8));
2460                 }
2461         }
2462
2463         Byte16::Byte16(RValue<Byte16> rhs)
2464         {
2465                 storeValue(rhs.value);
2466         }
2467
2468         Byte16::Byte16(const Byte16 &rhs)
2469         {
2470                 Value *value = rhs.loadValue();
2471                 storeValue(value);
2472         }
2473
2474         Byte16::Byte16(const Reference<Byte16> &rhs)
2475         {
2476                 Value *value = rhs.loadValue();
2477                 storeValue(value);
2478         }
2479
2480         RValue<Byte16> Byte16::operator=(RValue<Byte16> rhs)
2481         {
2482                 storeValue(rhs.value);
2483
2484                 return rhs;
2485         }
2486
2487         RValue<Byte16> Byte16::operator=(const Byte16 &rhs)
2488         {
2489                 Value *value = rhs.loadValue();
2490                 storeValue(value);
2491
2492                 return RValue<Byte16>(value);
2493         }
2494
2495         RValue<Byte16> Byte16::operator=(const Reference<Byte16> &rhs)
2496         {
2497                 Value *value = rhs.loadValue();
2498                 storeValue(value);
2499
2500                 return RValue<Byte16>(value);
2501         }
2502
2503         Type *Byte16::getType()
2504         {
2505                 return T(VectorType::get(Byte::getType(), 16));
2506         }
2507
2508         Type *SByte16::getType()
2509         {
2510                 return T( VectorType::get(SByte::getType(), 16));
2511         }
2512
2513         Short2::Short2(RValue<Short4> cast)
2514         {
2515                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
2516         }
2517
2518         Type *Short2::getType()
2519         {
2520                 #if 0
2521                         return T(VectorType::get(Short::getType(), 2));
2522                 #else
2523                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
2524                 #endif
2525         }
2526
2527         UShort2::UShort2(RValue<UShort4> cast)
2528         {
2529                 storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
2530         }
2531
2532         Type *UShort2::getType()
2533         {
2534                 #if 0
2535                         return T(VectorType::get(UShort::getType(), 2));
2536                 #else
2537                         return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
2538                 #endif
2539         }
2540
2541         Short4::Short4(RValue<Int> cast)
2542         {
2543                 Value *extend = Nucleus::createZExt(cast.value, Long::getType());
2544                 Value *swizzle = Swizzle(As<Short4>(extend), 0x00).value;
2545
2546                 storeValue(swizzle);
2547         }
2548
2549         Short4::Short4(RValue<Int4> cast)
2550         {
2551                 Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
2552
2553                 #if 0   // FIXME: Check codegen (pshuflw phshufhw pshufd)
2554                         Constant *pack[8];
2555                         pack[0] = Nucleus::createConstantInt(0);
2556                         pack[1] = Nucleus::createConstantInt(2);
2557                         pack[2] = Nucleus::createConstantInt(4);
2558                         pack[3] = Nucleus::createConstantInt(6);
2559
2560                         Value *short4 = Nucleus::createShuffleVector(short8, short8, Nucleus::createConstantVector(pack, 4));
2561                 #else
2562                         Value *packed;
2563
2564                         // FIXME: Use Swizzle<Short8>
2565                         if(!CPUID::supportsSSSE3())
2566                         {
2567                                 int pshuflw[8] = {0, 2, 0, 2, 4, 5, 6, 7};
2568                                 int pshufhw[8] = {0, 1, 2, 3, 4, 6, 4, 6};
2569
2570                                 Value *shuffle1 = Nucleus::createShuffleVector(short8, short8, pshuflw);
2571                                 Value *shuffle2 = Nucleus::createShuffleVector(shuffle1, shuffle1, pshufhw);
2572                                 Value *int4 = Nucleus::createBitCast(shuffle2, Int4::getType());
2573                                 packed = createSwizzle4(int4, 0x88);
2574                         }
2575                         else
2576                         {
2577                                 int pshufb[16] = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
2578                                 Value *byte16 = Nucleus::createBitCast(cast.value, Byte16::getType());
2579                                 packed = Nucleus::createShuffleVector(byte16, byte16, pshufb);
2580                         }
2581
2582                         #if 0   // FIXME: No optimal instruction selection
2583                                 Value *qword2 = Nucleus::createBitCast(packed, T(VectorType::get(Long::getType(), 2)));
2584                                 Value *element = Nucleus::createExtractElement(qword2, 0);
2585                                 Value *short4 = Nucleus::createBitCast(element, Short4::getType());
2586                         #else   // FIXME: Requires SSE
2587                                 Value *int2 = RValue<Int2>(Int2(As<Int4>(packed))).value;
2588                                 Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
2589                         #endif
2590                 #endif
2591
2592                 storeValue(short4);
2593         }
2594
2595 //      Short4::Short4(RValue<Float> cast)
2596 //      {
2597 //      }
2598
2599         Short4::Short4(RValue<Float4> cast)
2600         {
2601                 Int4 v4i32 = Int4(cast);
2602                 v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2603
2604                 storeValue(As<Short4>(Int2(v4i32)).value);
2605         }
2606
2607         Short4::Short4(short xyzw)
2608         {
2609                 int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
2610                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
2611
2612                 storeValue(Nucleus::createBitCast(vector, getType()));
2613         }
2614
2615         Short4::Short4(short x, short y, short z, short w)
2616         {
2617                 int64_t constantVector[4] = {x, y, z, w};
2618                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
2619
2620                 storeValue(Nucleus::createBitCast(vector, getType()));
2621         }
2622
2623         Short4::Short4(RValue<Short4> rhs)
2624         {
2625                 storeValue(rhs.value);
2626         }
2627
2628         Short4::Short4(const Short4 &rhs)
2629         {
2630                 Value *value = rhs.loadValue();
2631                 storeValue(value);
2632         }
2633
2634         Short4::Short4(const Reference<Short4> &rhs)
2635         {
2636                 Value *value = rhs.loadValue();
2637                 storeValue(value);
2638         }
2639
2640         Short4::Short4(RValue<UShort4> rhs)
2641         {
2642                 storeValue(rhs.value);
2643         }
2644
2645         Short4::Short4(const UShort4 &rhs)
2646         {
2647                 storeValue(rhs.loadValue());
2648         }
2649
2650         Short4::Short4(const Reference<UShort4> &rhs)
2651         {
2652                 storeValue(rhs.loadValue());
2653         }
2654
2655         RValue<Short4> Short4::operator=(RValue<Short4> rhs)
2656         {
2657                 storeValue(rhs.value);
2658
2659                 return rhs;
2660         }
2661
2662         RValue<Short4> Short4::operator=(const Short4 &rhs)
2663         {
2664                 Value *value = rhs.loadValue();
2665                 storeValue(value);
2666
2667                 return RValue<Short4>(value);
2668         }
2669
2670         RValue<Short4> Short4::operator=(const Reference<Short4> &rhs)
2671         {
2672                 Value *value = rhs.loadValue();
2673                 storeValue(value);
2674
2675                 return RValue<Short4>(value);
2676         }
2677
2678         RValue<Short4> Short4::operator=(RValue<UShort4> rhs)
2679         {
2680                 storeValue(rhs.value);
2681
2682                 return RValue<Short4>(rhs);
2683         }
2684
2685         RValue<Short4> Short4::operator=(const UShort4 &rhs)
2686         {
2687                 Value *value = rhs.loadValue();
2688                 storeValue(value);
2689
2690                 return RValue<Short4>(value);
2691         }
2692
2693         RValue<Short4> Short4::operator=(const Reference<UShort4> &rhs)
2694         {
2695                 Value *value = rhs.loadValue();
2696                 storeValue(value);
2697
2698                 return RValue<Short4>(value);
2699         }
2700
2701         RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs)
2702         {
2703                 if(CPUID::supportsMMX2())
2704                 {
2705                         return x86::paddw(lhs, rhs);
2706                 }
2707                 else
2708                 {
2709                         return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
2710                 }
2711         }
2712
2713         RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs)
2714         {
2715                 if(CPUID::supportsMMX2())
2716                 {
2717                         return x86::psubw(lhs, rhs);
2718                 }
2719                 else
2720                 {
2721                         return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
2722                 }
2723         }
2724
2725         RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs)
2726         {
2727                 if(CPUID::supportsMMX2())
2728                 {
2729                         return x86::pmullw(lhs, rhs);
2730                 }
2731                 else
2732                 {
2733                         return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
2734                 }
2735         }
2736
2737 //      RValue<Short4> operator/(RValue<Short4> lhs, RValue<Short4> rhs)
2738 //      {
2739 //              return RValue<Short4>(Nucleus::createSDiv(lhs.value, rhs.value));
2740 //      }
2741
2742 //      RValue<Short4> operator%(RValue<Short4> lhs, RValue<Short4> rhs)
2743 //      {
2744 //              return RValue<Short4>(Nucleus::createSRem(lhs.value, rhs.value));
2745 //      }
2746
2747         RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs)
2748         {
2749                 if(CPUID::supportsMMX2())
2750                 {
2751                         return x86::pand(lhs, rhs);
2752                 }
2753                 else
2754                 {
2755                         return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
2756                 }
2757         }
2758
2759         RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs)
2760         {
2761                 if(CPUID::supportsMMX2())
2762                 {
2763                         return x86::por(lhs, rhs);
2764                 }
2765                 else
2766                 {
2767                         return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
2768                 }
2769         }
2770
2771         RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs)
2772         {
2773                 if(CPUID::supportsMMX2())
2774                 {
2775                         return x86::pxor(lhs, rhs);
2776                 }
2777                 else
2778                 {
2779                         return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
2780                 }
2781         }
2782
2783         RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2784         {
2785         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
2786
2787                 return x86::psllw(lhs, rhs);
2788         }
2789
2790         RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2791         {
2792         //      return RValue<Short4>(Nucleus::createAShr(lhs.value, rhs.value));
2793
2794                 return x86::psraw(lhs, rhs);
2795         }
2796
2797         RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs)
2798         {
2799                 return lhs = lhs + rhs;
2800         }
2801
2802         RValue<Short4> operator-=(Short4 &lhs, RValue<Short4> rhs)
2803         {
2804                 return lhs = lhs - rhs;
2805         }
2806
2807         RValue<Short4> operator*=(Short4 &lhs, RValue<Short4> rhs)
2808         {
2809                 return lhs = lhs * rhs;
2810         }
2811
2812 //      RValue<Short4> operator/=(Short4 &lhs, RValue<Short4> rhs)
2813 //      {
2814 //              return lhs = lhs / rhs;
2815 //      }
2816
2817 //      RValue<Short4> operator%=(Short4 &lhs, RValue<Short4> rhs)
2818 //      {
2819 //              return lhs = lhs % rhs;
2820 //      }
2821
2822         RValue<Short4> operator&=(Short4 &lhs, RValue<Short4> rhs)
2823         {
2824                 return lhs = lhs & rhs;
2825         }
2826
2827         RValue<Short4> operator|=(Short4 &lhs, RValue<Short4> rhs)
2828         {
2829                 return lhs = lhs | rhs;
2830         }
2831
2832         RValue<Short4> operator^=(Short4 &lhs, RValue<Short4> rhs)
2833         {
2834                 return lhs = lhs ^ rhs;
2835         }
2836
2837         RValue<Short4> operator<<=(Short4 &lhs, unsigned char rhs)
2838         {
2839                 return lhs = lhs << rhs;
2840         }
2841
2842         RValue<Short4> operator>>=(Short4 &lhs, unsigned char rhs)
2843         {
2844                 return lhs = lhs >> rhs;
2845         }
2846
2847 //      RValue<Short4> operator+(RValue<Short4> val)
2848 //      {
2849 //              return val;
2850 //      }
2851
2852         RValue<Short4> operator-(RValue<Short4> val)
2853         {
2854                 if(CPUID::supportsMMX2())
2855                 {
2856                         return Short4(0, 0, 0, 0) - val;
2857                 }
2858                 else
2859                 {
2860                         return RValue<Short4>(Nucleus::createNeg(val.value));
2861                 }
2862         }
2863
2864         RValue<Short4> operator~(RValue<Short4> val)
2865         {
2866                 if(CPUID::supportsMMX2())
2867                 {
2868                         return val ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu);
2869                 }
2870                 else
2871                 {
2872                         return RValue<Short4>(Nucleus::createNot(val.value));
2873                 }
2874         }
2875
2876         RValue<Short4> RoundShort4(RValue<Float4> cast)
2877         {
2878                 RValue<Int4> v4i32 = x86::cvtps2dq(cast);
2879                 RValue<Short8> v8i16 = x86::packssdw(v4i32, v4i32);
2880
2881                 return As<Short4>(Int2(As<Int4>(v8i16)));
2882         }
2883
2884         RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2885         {
2886                 return x86::pmaxsw(x, y);
2887         }
2888
2889         RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2890         {
2891                 return x86::pminsw(x, y);
2892         }
2893
2894         RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2895         {
2896                 return x86::paddsw(x, y);
2897         }
2898
2899         RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2900         {
2901                 return x86::psubsw(x, y);
2902         }
2903
2904         RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2905         {
2906                 return x86::pmulhw(x, y);
2907         }
2908
2909         RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2910         {
2911                 return x86::pmaddwd(x, y);
2912         }
2913
2914         RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
2915         {
2916                 return x86::packsswb(x, y);
2917         }
2918
2919         RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
2920         {
2921                 if(CPUID::supportsMMX2())
2922                 {
2923                         return x86::punpcklwd(x, y);
2924                 }
2925                 else
2926                 {
2927                         int shuffle[4] = {0, 4, 1, 5};
2928                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2929
2930                         return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
2931                 }
2932         }
2933
2934         RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
2935         {
2936                 if(CPUID::supportsMMX2())
2937                 {
2938                         return x86::punpckhwd(x, y);
2939                 }
2940                 else
2941                 {
2942                         int shuffle[4] = {2, 6, 3, 7};
2943                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2944
2945                         return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
2946                 }
2947         }
2948
2949         RValue<Short4> Swizzle(RValue<Short4> x, unsigned char select)
2950         {
2951                 if(CPUID::supportsMMX2())
2952                 {
2953                         return x86::pshufw(x, select);
2954                 }
2955                 else
2956                 {
2957                         return RValue<Short4>(createSwizzle4(x.value, select));
2958                 }
2959         }
2960
2961         RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i)
2962         {
2963                 if(CPUID::supportsMMX2())
2964                 {
2965                         return x86::pinsrw(val, Int(element), i);
2966                 }
2967                 else
2968                 {
2969                         return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
2970                 }
2971         }
2972
2973         RValue<Short> Extract(RValue<Short4> val, int i)
2974         {
2975                 if(CPUID::supportsMMX2())
2976                 {
2977                         return Short(x86::pextrw(val, i));
2978                 }
2979                 else
2980                 {
2981                         return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
2982                 }
2983         }
2984
2985         RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2986         {
2987                 return x86::pcmpgtw(x, y);
2988         }
2989
2990         RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2991         {
2992                 return x86::pcmpeqw(x, y);
2993         }
2994
2995         Type *Short4::getType()
2996         {
2997                 if(CPUID::supportsMMX2())
2998                 {
2999                         return MMX::getType();
3000                 }
3001                 else
3002                 {
3003                         return T(VectorType::get(Short::getType(), 4));
3004                 }
3005         }
3006
3007         UShort4::UShort4(RValue<Int4> cast)
3008         {
3009                 *this = Short4(cast);
3010         }
3011
3012         UShort4::UShort4(RValue<Float4> cast, bool saturate)
3013         {
3014                 Float4 sat;
3015
3016                 if(saturate)
3017                 {
3018                         if(CPUID::supportsSSE4_1())
3019                         {
3020                                 sat = Min(cast, Float4(0xFFFF));   // packusdw takes care of 0x0000 saturation
3021                         }
3022                         else
3023                         {
3024                                 sat = Max(Min(cast, Float4(0xFFFF)), Float4(0x0000));
3025                         }
3026                 }
3027                 else
3028                 {
3029                         sat = cast;
3030                 }
3031
3032                 Int4 int4(sat);
3033
3034                 if(!saturate || !CPUID::supportsSSE4_1())
3035                 {
3036                         *this = Short4(int4);
3037                 }
3038                 else
3039                 {
3040                         *this = As<Short4>(Int2(As<Int4>(x86::packusdw(int4, int4))));
3041                 }
3042         }
3043
3044         UShort4::UShort4(unsigned short xyzw)
3045         {
3046                 int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
3047                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
3048
3049                 storeValue(Nucleus::createBitCast(vector, getType()));
3050         }
3051
3052         UShort4::UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
3053         {
3054                 int64_t constantVector[4] = {x, y, z, w};
3055                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
3056
3057                 storeValue(Nucleus::createBitCast(vector, getType()));
3058         }
3059
3060         UShort4::UShort4(RValue<UShort4> rhs)
3061         {
3062                 storeValue(rhs.value);
3063         }
3064
3065         UShort4::UShort4(const UShort4 &rhs)
3066         {
3067                 Value *value = rhs.loadValue();
3068                 storeValue(value);
3069         }
3070
3071         UShort4::UShort4(const Reference<UShort4> &rhs)
3072         {
3073                 Value *value = rhs.loadValue();
3074                 storeValue(value);
3075         }
3076
3077         UShort4::UShort4(RValue<Short4> rhs)
3078         {
3079                 storeValue(rhs.value);
3080         }
3081
3082         UShort4::UShort4(const Short4 &rhs)
3083         {
3084                 Value *value = rhs.loadValue();
3085                 storeValue(value);
3086         }
3087
3088         UShort4::UShort4(const Reference<Short4> &rhs)
3089         {
3090                 Value *value = rhs.loadValue();
3091                 storeValue(value);
3092         }
3093
3094         RValue<UShort4> UShort4::operator=(RValue<UShort4> rhs)
3095         {
3096                 storeValue(rhs.value);
3097
3098                 return rhs;
3099         }
3100
3101         RValue<UShort4> UShort4::operator=(const UShort4 &rhs)
3102         {
3103                 Value *value = rhs.loadValue();
3104                 storeValue(value);
3105
3106                 return RValue<UShort4>(value);
3107         }
3108
3109         RValue<UShort4> UShort4::operator=(const Reference<UShort4> &rhs)
3110         {
3111                 Value *value = rhs.loadValue();
3112                 storeValue(value);
3113
3114                 return RValue<UShort4>(value);
3115         }
3116
3117         RValue<UShort4> UShort4::operator=(RValue<Short4> rhs)
3118         {
3119                 storeValue(rhs.value);
3120
3121                 return RValue<UShort4>(rhs);
3122         }
3123
3124         RValue<UShort4> UShort4::operator=(const Short4 &rhs)
3125         {
3126                 Value *value = rhs.loadValue();
3127                 storeValue(value);
3128
3129                 return RValue<UShort4>(value);
3130         }
3131
3132         RValue<UShort4> UShort4::operator=(const Reference<Short4> &rhs)
3133         {
3134                 Value *value = rhs.loadValue();
3135                 storeValue(value);
3136
3137                 return RValue<UShort4>(value);
3138         }
3139
3140         RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs)
3141         {
3142                 if(CPUID::supportsMMX2())
3143                 {
3144                         return As<UShort4>(x86::paddw(As<Short4>(lhs), As<Short4>(rhs)));
3145                 }
3146                 else
3147                 {
3148                         return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
3149                 }
3150         }
3151
3152         RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs)
3153         {
3154                 if(CPUID::supportsMMX2())
3155                 {
3156                         return As<UShort4>(x86::psubw(As<Short4>(lhs), As<Short4>(rhs)));
3157                 }
3158                 else
3159                 {
3160                         return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
3161                 }
3162         }
3163
3164         RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs)
3165         {
3166                 if(CPUID::supportsMMX2())
3167                 {
3168                         return As<UShort4>(x86::pmullw(As<Short4>(lhs), As<Short4>(rhs)));
3169                 }
3170                 else
3171                 {
3172                         return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
3173                 }
3174         }
3175
3176         RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs)
3177         {
3178                 if(CPUID::supportsMMX2())
3179                 {
3180                         return As<UShort4>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
3181                 }
3182                 else
3183                 {
3184                         return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
3185                 }
3186         }
3187
3188         RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs)
3189         {
3190                 if(CPUID::supportsMMX2())
3191                 {
3192                         return As<UShort4>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
3193                 }
3194                 else
3195                 {
3196                         return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
3197                 }
3198         }
3199
3200         RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs)
3201         {
3202                 if(CPUID::supportsMMX2())
3203                 {
3204                         return As<UShort4>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
3205                 }
3206                 else
3207                 {
3208                         return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
3209                 }
3210         }
3211
3212         RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
3213         {
3214         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
3215
3216                 return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
3217         }
3218
3219         RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
3220         {
3221         //      return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
3222
3223                 return x86::psrlw(lhs, rhs);
3224         }
3225
3226         RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs)
3227         {
3228                 return lhs = lhs << rhs;
3229         }
3230
3231         RValue<UShort4> operator>>=(UShort4 &lhs, unsigned char rhs)
3232         {
3233                 return lhs = lhs >> rhs;
3234         }
3235
3236         RValue<UShort4> operator~(RValue<UShort4> val)
3237         {
3238                 if(CPUID::supportsMMX2())
3239                 {
3240                         return As<UShort4>(As<Short4>(val) ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu));
3241                 }
3242                 else
3243                 {
3244                         return RValue<UShort4>(Nucleus::createNot(val.value));
3245                 }
3246         }
3247
3248         RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
3249         {
3250                 return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
3251         }
3252
3253         RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
3254         {
3255                 return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
3256         }
3257
3258         RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
3259         {
3260                 return x86::paddusw(x, y);
3261         }
3262
3263         RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
3264         {
3265                 return x86::psubusw(x, y);
3266         }
3267
3268         RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
3269         {
3270                 return x86::pmulhuw(x, y);
3271         }
3272
3273         RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
3274         {
3275                 return x86::pavgw(x, y);
3276         }
3277
3278         RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
3279         {
3280                 return x86::packuswb(x, y);
3281         }
3282
3283         Type *UShort4::getType()
3284         {
3285                 if(CPUID::supportsMMX2())
3286                 {
3287                         return MMX::getType();
3288                 }
3289                 else
3290                 {
3291                         return T(VectorType::get(UShort::getType(), 4));
3292                 }
3293         }
3294
3295         Short8::Short8(short c)
3296         {
3297                 int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
3298                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3299         }
3300
3301         Short8::Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
3302         {
3303                 int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
3304                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3305         }
3306
3307         Short8::Short8(RValue<Short8> rhs)
3308         {
3309                 storeValue(rhs.value);
3310         }
3311
3312         Short8::Short8(const Reference<Short8> &rhs)
3313         {
3314                 Value *value = rhs.loadValue();
3315                 storeValue(value);
3316         }
3317
3318         Short8::Short8(RValue<Short4> lo, RValue<Short4> hi)
3319         {
3320                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
3321                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
3322
3323                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
3324                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
3325                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
3326                 Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
3327
3328                 storeValue(short8);
3329         }
3330
3331         RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs)
3332         {
3333                 return RValue<Short8>(Nucleus::createAdd(lhs.value, rhs.value));
3334         }
3335
3336         RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs)
3337         {
3338                 return RValue<Short8>(Nucleus::createAnd(lhs.value, rhs.value));
3339         }
3340
3341         RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
3342         {
3343                 return x86::psllw(lhs, rhs);   // FIXME: Fallback required
3344         }
3345
3346         RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
3347         {
3348                 return x86::psraw(lhs, rhs);   // FIXME: Fallback required
3349         }
3350
3351         RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
3352         {
3353                 return x86::pmaddwd(x, y);   // FIXME: Fallback required
3354         }
3355
3356         RValue<Int4> Abs(RValue<Int4> x)
3357         {
3358                 if(CPUID::supportsSSSE3())
3359                 {
3360                         return x86::pabsd(x);
3361                 }
3362                 else
3363                 {
3364                         Int4 mask = (x >> 31);
3365                         return (mask ^ x) - mask;
3366                 }
3367         }
3368
3369         RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
3370         {
3371                 return x86::pmulhw(x, y);   // FIXME: Fallback required
3372         }
3373
3374         Type *Short8::getType()
3375         {
3376                 return T(VectorType::get(Short::getType(), 8));
3377         }
3378
3379         UShort8::UShort8(unsigned short c)
3380         {
3381                 int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
3382                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3383         }
3384
3385         UShort8::UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
3386         {
3387                 int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
3388                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
3389         }
3390
3391         UShort8::UShort8(RValue<UShort8> rhs)
3392         {
3393                 storeValue(rhs.value);
3394         }
3395
3396         UShort8::UShort8(const Reference<UShort8> &rhs)
3397         {
3398                 Value *value = rhs.loadValue();
3399                 storeValue(value);
3400         }
3401
3402         UShort8::UShort8(RValue<UShort4> lo, RValue<UShort4> hi)
3403         {
3404                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
3405                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
3406
3407                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
3408                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
3409                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
3410                 Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
3411
3412                 storeValue(short8);
3413         }
3414
3415         RValue<UShort8> UShort8::operator=(RValue<UShort8> rhs)
3416         {
3417                 storeValue(rhs.value);
3418
3419                 return rhs;
3420         }
3421
3422         RValue<UShort8> UShort8::operator=(const UShort8 &rhs)
3423         {
3424                 Value *value = rhs.loadValue();
3425                 storeValue(value);
3426
3427                 return RValue<UShort8>(value);
3428         }
3429
3430         RValue<UShort8> UShort8::operator=(const Reference<UShort8> &rhs)
3431         {
3432                 Value *value = rhs.loadValue();
3433                 storeValue(value);
3434
3435                 return RValue<UShort8>(value);
3436         }
3437
3438         RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs)
3439         {
3440                 return RValue<UShort8>(Nucleus::createAnd(lhs.value, rhs.value));
3441         }
3442
3443         RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
3444         {
3445                 return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));   // FIXME: Fallback required
3446         }
3447
3448         RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
3449         {
3450                 return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
3451         }
3452
3453         RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
3454         {
3455                 return RValue<UShort8>(Nucleus::createAdd(lhs.value, rhs.value));
3456         }
3457
3458         RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs)
3459         {
3460                 return RValue<UShort8>(Nucleus::createMul(lhs.value, rhs.value));
3461         }
3462
3463         RValue<UShort8> operator+=(UShort8 &lhs, RValue<UShort8> rhs)
3464         {
3465                 return lhs = lhs + rhs;
3466         }
3467
3468         RValue<UShort8> operator~(RValue<UShort8> val)
3469         {
3470                 return RValue<UShort8>(Nucleus::createNot(val.value));
3471         }
3472
3473         RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
3474         {
3475                 int pshufb[16] =
3476                 {
3477                         select0 + 0,
3478                         select0 + 1,
3479                         select1 + 0,
3480                         select1 + 1,
3481                         select2 + 0,
3482                         select2 + 1,
3483                         select3 + 0,
3484                         select3 + 1,
3485                         select4 + 0,
3486                         select4 + 1,
3487                         select5 + 0,
3488                         select5 + 1,
3489                         select6 + 0,
3490                         select6 + 1,
3491                         select7 + 0,
3492                         select7 + 1,
3493                 };
3494
3495                 Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
3496                 Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
3497                 Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
3498
3499                 return RValue<UShort8>(short8);
3500         }
3501
3502         RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
3503         {
3504                 return x86::pmulhuw(x, y);   // FIXME: Fallback required
3505         }
3506
3507         Type *UShort8::getType()
3508         {
3509                 return T(VectorType::get(UShort::getType(), 8));
3510         }
3511
3512         Int::Int(Argument<Int> argument)
3513         {
3514                 storeValue(argument.value);
3515         }
3516
3517         Int::Int(RValue<Byte> cast)
3518         {
3519                 Value *integer = Nucleus::createZExt(cast.value, Int::getType());
3520
3521                 storeValue(integer);
3522         }
3523
3524         Int::Int(RValue<SByte> cast)
3525         {
3526                 Value *integer = Nucleus::createSExt(cast.value, Int::getType());
3527
3528                 storeValue(integer);
3529         }
3530
3531         Int::Int(RValue<Short> cast)
3532         {
3533                 Value *integer = Nucleus::createSExt(cast.value, Int::getType());
3534
3535                 storeValue(integer);
3536         }
3537
3538         Int::Int(RValue<UShort> cast)
3539         {
3540                 Value *integer = Nucleus::createZExt(cast.value, Int::getType());
3541
3542                 storeValue(integer);
3543         }
3544
3545         Int::Int(RValue<Int2> cast)
3546         {
3547                 *this = Extract(cast, 0);
3548         }
3549
3550         Int::Int(RValue<Long> cast)
3551         {
3552                 Value *integer = Nucleus::createTrunc(cast.value, Int::getType());
3553
3554                 storeValue(integer);
3555         }
3556
3557         Int::Int(RValue<Float> cast)
3558         {
3559                 Value *integer = Nucleus::createFPToSI(cast.value, Int::getType());
3560
3561                 storeValue(integer);
3562         }
3563
3564         Int::Int(int x)
3565         {
3566                 storeValue(Nucleus::createConstantInt(x));
3567         }
3568
3569         Int::Int(RValue<Int> rhs)
3570         {
3571                 storeValue(rhs.value);
3572         }
3573
3574         Int::Int(RValue<UInt> rhs)
3575         {
3576                 storeValue(rhs.value);
3577         }
3578
3579         Int::Int(const Int &rhs)
3580         {
3581                 Value *value = rhs.loadValue();
3582                 storeValue(value);
3583         }
3584
3585         Int::Int(const Reference<Int> &rhs)
3586         {
3587                 Value *value = rhs.loadValue();
3588                 storeValue(value);
3589         }
3590
3591         Int::Int(const UInt &rhs)
3592         {
3593                 Value *value = rhs.loadValue();
3594                 storeValue(value);
3595         }
3596
3597         Int::Int(const Reference<UInt> &rhs)
3598         {
3599                 Value *value = rhs.loadValue();
3600                 storeValue(value);
3601         }
3602
3603         RValue<Int> Int::operator=(int rhs)
3604         {
3605                 return RValue<Int>(storeValue(Nucleus::createConstantInt(rhs)));
3606         }
3607
3608         RValue<Int> Int::operator=(RValue<Int> rhs)
3609         {
3610                 storeValue(rhs.value);
3611
3612                 return rhs;
3613         }
3614
3615         RValue<Int> Int::operator=(RValue<UInt> rhs)
3616         {
3617                 storeValue(rhs.value);
3618
3619                 return RValue<Int>(rhs);
3620         }
3621
3622         RValue<Int> Int::operator=(const Int &rhs)
3623         {
3624                 Value *value = rhs.loadValue();
3625                 storeValue(value);
3626
3627                 return RValue<Int>(value);
3628         }
3629
3630         RValue<Int> Int::operator=(const Reference<Int> &rhs)
3631         {
3632                 Value *value = rhs.loadValue();
3633                 storeValue(value);
3634
3635                 return RValue<Int>(value);
3636         }
3637
3638         RValue<Int> Int::operator=(const UInt &rhs)
3639         {
3640                 Value *value = rhs.loadValue();
3641                 storeValue(value);
3642
3643                 return RValue<Int>(value);
3644         }
3645
3646         RValue<Int> Int::operator=(const Reference<UInt> &rhs)
3647         {
3648                 Value *value = rhs.loadValue();
3649                 storeValue(value);
3650
3651                 return RValue<Int>(value);
3652         }
3653
3654         RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs)
3655         {
3656                 return RValue<Int>(Nucleus::createAdd(lhs.value, rhs.value));
3657         }
3658
3659         RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs)
3660         {
3661                 return RValue<Int>(Nucleus::createSub(lhs.value, rhs.value));
3662         }
3663
3664         RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs)
3665         {
3666                 return RValue<Int>(Nucleus::createMul(lhs.value, rhs.value));
3667         }
3668
3669         RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs)
3670         {
3671                 return RValue<Int>(Nucleus::createSDiv(lhs.value, rhs.value));
3672         }
3673
3674         RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs)
3675         {
3676                 return RValue<Int>(Nucleus::createSRem(lhs.value, rhs.value));
3677         }
3678
3679         RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs)
3680         {
3681                 return RValue<Int>(Nucleus::createAnd(lhs.value, rhs.value));
3682         }
3683
3684         RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs)
3685         {
3686                 return RValue<Int>(Nucleus::createOr(lhs.value, rhs.value));
3687         }
3688
3689         RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs)
3690         {
3691                 return RValue<Int>(Nucleus::createXor(lhs.value, rhs.value));
3692         }
3693
3694         RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs)
3695         {
3696                 return RValue<Int>(Nucleus::createShl(lhs.value, rhs.value));
3697         }
3698
3699         RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs)
3700         {
3701                 return RValue<Int>(Nucleus::createAShr(lhs.value, rhs.value));
3702         }
3703
3704         RValue<Int> operator+=(Int &lhs, RValue<Int> rhs)
3705         {
3706                 return lhs = lhs + rhs;
3707         }
3708
3709         RValue<Int> operator-=(Int &lhs, RValue<Int> rhs)
3710         {
3711                 return lhs = lhs - rhs;
3712         }
3713
3714         RValue<Int> operator*=(Int &lhs, RValue<Int> rhs)
3715         {
3716                 return lhs = lhs * rhs;
3717         }
3718
3719         RValue<Int> operator/=(Int &lhs, RValue<Int> rhs)
3720         {
3721                 return lhs = lhs / rhs;
3722         }
3723
3724         RValue<Int> operator%=(Int &lhs, RValue<Int> rhs)
3725         {
3726                 return lhs = lhs % rhs;
3727         }
3728
3729         RValue<Int> operator&=(Int &lhs, RValue<Int> rhs)
3730         {
3731                 return lhs = lhs & rhs;
3732         }
3733
3734         RValue<Int> operator|=(Int &lhs, RValue<Int> rhs)
3735         {
3736                 return lhs = lhs | rhs;
3737         }
3738
3739         RValue<Int> operator^=(Int &lhs, RValue<Int> rhs)
3740         {
3741                 return lhs = lhs ^ rhs;
3742         }
3743
3744         RValue<Int> operator<<=(Int &lhs, RValue<Int> rhs)
3745         {
3746                 return lhs = lhs << rhs;
3747         }
3748
3749         RValue<Int> operator>>=(Int &lhs, RValue<Int> rhs)
3750         {
3751                 return lhs = lhs >> rhs;
3752         }
3753
3754         RValue<Int> operator+(RValue<Int> val)
3755         {
3756                 return val;
3757         }
3758
3759         RValue<Int> operator-(RValue<Int> val)
3760         {
3761                 return RValue<Int>(Nucleus::createNeg(val.value));
3762         }
3763
3764         RValue<Int> operator~(RValue<Int> val)
3765         {
3766                 return RValue<Int>(Nucleus::createNot(val.value));
3767         }
3768
3769         RValue<Int> operator++(Int &val, int)   // Post-increment
3770         {
3771                 RValue<Int> res = val;
3772
3773                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantInt(1)));
3774                 val.storeValue(inc);
3775
3776                 return res;
3777         }
3778
3779         const Int &operator++(Int &val)   // Pre-increment
3780         {
3781                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantInt(1)));
3782                 val.storeValue(inc);
3783
3784                 return val;
3785         }
3786
3787         RValue<Int> operator--(Int &val, int)   // Post-decrement
3788         {
3789                 RValue<Int> res = val;
3790
3791                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantInt(1)));
3792                 val.storeValue(inc);
3793
3794                 return res;
3795         }
3796
3797         const Int &operator--(Int &val)   // Pre-decrement
3798         {
3799                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantInt(1)));
3800                 val.storeValue(inc);
3801
3802                 return val;
3803         }
3804
3805         RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs)
3806         {
3807                 return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
3808         }
3809
3810         RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs)
3811         {
3812                 return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
3813         }
3814
3815         RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs)
3816         {
3817                 return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
3818         }
3819
3820         RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs)
3821         {
3822                 return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
3823         }
3824
3825         RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs)
3826         {
3827                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
3828         }
3829
3830         RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs)
3831         {
3832                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
3833         }
3834
3835         RValue<Int> Max(RValue<Int> x, RValue<Int> y)
3836         {
3837                 return IfThenElse(x > y, x, y);
3838         }
3839
3840         RValue<Int> Min(RValue<Int> x, RValue<Int> y)
3841         {
3842                 return IfThenElse(x < y, x, y);
3843         }
3844
3845         RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max)
3846         {
3847                 return Min(Max(x, min), max);
3848         }
3849
3850         RValue<Int> RoundInt(RValue<Float> cast)
3851         {
3852                 return x86::cvtss2si(cast);
3853
3854         //      return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
3855         }
3856
3857         Type *Int::getType()
3858         {
3859                 return T(llvm::Type::getInt32Ty(*::context));
3860         }
3861
3862         Long::Long(RValue<Int> cast)
3863         {
3864                 Value *integer = Nucleus::createSExt(cast.value, Long::getType());
3865
3866                 storeValue(integer);
3867         }
3868
3869         Long::Long(RValue<UInt> cast)
3870         {
3871                 Value *integer = Nucleus::createZExt(cast.value, Long::getType());
3872
3873                 storeValue(integer);
3874         }
3875
3876         Long::Long(RValue<Long> rhs)
3877         {
3878                 storeValue(rhs.value);
3879         }
3880
3881         RValue<Long> Long::operator=(int64_t rhs)
3882         {
3883                 return RValue<Long>(storeValue(Nucleus::createConstantLong(rhs)));
3884         }
3885
3886         RValue<Long> Long::operator=(RValue<Long> rhs)
3887         {
3888                 storeValue(rhs.value);
3889
3890                 return rhs;
3891         }
3892
3893         RValue<Long> Long::operator=(const Long &rhs)
3894         {
3895                 Value *value = rhs.loadValue();
3896                 storeValue(value);
3897
3898                 return RValue<Long>(value);
3899         }
3900
3901         RValue<Long> Long::operator=(const Reference<Long> &rhs)
3902         {
3903                 Value *value = rhs.loadValue();
3904                 storeValue(value);
3905
3906                 return RValue<Long>(value);
3907         }
3908
3909         RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs)
3910         {
3911                 return RValue<Long>(Nucleus::createAdd(lhs.value, rhs.value));
3912         }
3913
3914         RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs)
3915         {
3916                 return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
3917         }
3918
3919         RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
3920         {
3921                 return lhs = lhs + rhs;
3922         }
3923
3924         RValue<Long> operator-=(Long &lhs, RValue<Long> rhs)
3925         {
3926                 return lhs = lhs - rhs;
3927         }
3928
3929         RValue<Long> AddAtomic(RValue<Pointer<Long> > x, RValue<Long> y)
3930         {
3931                 return RValue<Long>(Nucleus::createAtomicAdd(x.value, y.value));
3932         }
3933
3934         Type *Long::getType()
3935         {
3936                 return T(llvm::Type::getInt64Ty(*::context));
3937         }
3938
3939         UInt::UInt(Argument<UInt> argument)
3940         {
3941                 storeValue(argument.value);
3942         }
3943
3944         UInt::UInt(RValue<UShort> cast)
3945         {
3946                 Value *integer = Nucleus::createZExt(cast.value, UInt::getType());
3947
3948                 storeValue(integer);
3949         }
3950
3951         UInt::UInt(RValue<Long> cast)
3952         {
3953                 Value *integer = Nucleus::createTrunc(cast.value, UInt::getType());
3954
3955                 storeValue(integer);
3956         }
3957
3958         UInt::UInt(RValue<Float> cast)
3959         {
3960                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
3961                 // Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
3962
3963                 // Smallest positive value representable in UInt, but not in Int
3964                 const unsigned int ustart = 0x80000000u;
3965                 const float ustartf = float(ustart);
3966
3967                 // If the value is negative, store 0, otherwise store the result of the conversion
3968                 storeValue((~(As<Int>(cast) >> 31) &
3969                 // Check if the value can be represented as an Int
3970                         IfThenElse(cast >= ustartf,
3971                 // If the value is too large, subtract ustart and re-add it after conversion.
3972                                 As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
3973                 // Otherwise, just convert normally
3974                                 Int(cast))).value);
3975         }
3976
3977         UInt::UInt(int x)
3978         {
3979                 storeValue(Nucleus::createConstantInt(x));
3980         }
3981
3982         UInt::UInt(unsigned int x)
3983         {
3984                 storeValue(Nucleus::createConstantInt(x));
3985         }
3986
3987         UInt::UInt(RValue<UInt> rhs)
3988         {
3989                 storeValue(rhs.value);
3990         }
3991
3992         UInt::UInt(RValue<Int> rhs)
3993         {
3994                 storeValue(rhs.value);
3995         }
3996
3997         UInt::UInt(const UInt &rhs)
3998         {
3999                 Value *value = rhs.loadValue();
4000                 storeValue(value);
4001         }
4002
4003         UInt::UInt(const Reference<UInt> &rhs)
4004         {
4005                 Value *value = rhs.loadValue();
4006                 storeValue(value);
4007         }
4008
4009         UInt::UInt(const Int &rhs)
4010         {
4011                 Value *value = rhs.loadValue();
4012                 storeValue(value);
4013         }
4014
4015         UInt::UInt(const Reference<Int> &rhs)
4016         {
4017                 Value *value = rhs.loadValue();
4018                 storeValue(value);
4019         }
4020
4021         RValue<UInt> UInt::operator=(unsigned int rhs)
4022         {
4023                 return RValue<UInt>(storeValue(Nucleus::createConstantInt(rhs)));
4024         }
4025
4026         RValue<UInt> UInt::operator=(RValue<UInt> rhs)
4027         {
4028                 storeValue(rhs.value);
4029
4030                 return rhs;
4031         }
4032
4033         RValue<UInt> UInt::operator=(RValue<Int> rhs)
4034         {
4035                 storeValue(rhs.value);
4036
4037                 return RValue<UInt>(rhs);
4038         }
4039
4040         RValue<UInt> UInt::operator=(const UInt &rhs)
4041         {
4042                 Value *value = rhs.loadValue();
4043                 storeValue(value);
4044
4045                 return RValue<UInt>(value);
4046         }
4047
4048         RValue<UInt> UInt::operator=(const Reference<UInt> &rhs)
4049         {
4050                 Value *value = rhs.loadValue();
4051                 storeValue(value);
4052
4053                 return RValue<UInt>(value);
4054         }
4055
4056         RValue<UInt> UInt::operator=(const Int &rhs)
4057         {
4058                 Value *value = rhs.loadValue();
4059                 storeValue(value);
4060
4061                 return RValue<UInt>(value);
4062         }
4063
4064         RValue<UInt> UInt::operator=(const Reference<Int> &rhs)
4065         {
4066                 Value *value = rhs.loadValue();
4067                 storeValue(value);
4068
4069                 return RValue<UInt>(value);
4070         }
4071
4072         RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs)
4073         {
4074                 return RValue<UInt>(Nucleus::createAdd(lhs.value, rhs.value));
4075         }
4076
4077         RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs)
4078         {
4079                 return RValue<UInt>(Nucleus::createSub(lhs.value, rhs.value));
4080         }
4081
4082         RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs)
4083         {
4084                 return RValue<UInt>(Nucleus::createMul(lhs.value, rhs.value));
4085         }
4086
4087         RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs)
4088         {
4089                 return RValue<UInt>(Nucleus::createUDiv(lhs.value, rhs.value));
4090         }
4091
4092         RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs)
4093         {
4094                 return RValue<UInt>(Nucleus::createURem(lhs.value, rhs.value));
4095         }
4096
4097         RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs)
4098         {
4099                 return RValue<UInt>(Nucleus::createAnd(lhs.value, rhs.value));
4100         }
4101
4102         RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs)
4103         {
4104                 return RValue<UInt>(Nucleus::createOr(lhs.value, rhs.value));
4105         }
4106
4107         RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs)
4108         {
4109                 return RValue<UInt>(Nucleus::createXor(lhs.value, rhs.value));
4110         }
4111
4112         RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs)
4113         {
4114                 return RValue<UInt>(Nucleus::createShl(lhs.value, rhs.value));
4115         }
4116
4117         RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs)
4118         {
4119                 return RValue<UInt>(Nucleus::createLShr(lhs.value, rhs.value));
4120         }
4121
4122         RValue<UInt> operator+=(UInt &lhs, RValue<UInt> rhs)
4123         {
4124                 return lhs = lhs + rhs;
4125         }
4126
4127         RValue<UInt> operator-=(UInt &lhs, RValue<UInt> rhs)
4128         {
4129                 return lhs = lhs - rhs;
4130         }
4131
4132         RValue<UInt> operator*=(UInt &lhs, RValue<UInt> rhs)
4133         {
4134                 return lhs = lhs * rhs;
4135         }
4136
4137         RValue<UInt> operator/=(UInt &lhs, RValue<UInt> rhs)
4138         {
4139                 return lhs = lhs / rhs;
4140         }
4141
4142         RValue<UInt> operator%=(UInt &lhs, RValue<UInt> rhs)
4143         {
4144                 return lhs = lhs % rhs;
4145         }
4146
4147         RValue<UInt> operator&=(UInt &lhs, RValue<UInt> rhs)
4148         {
4149                 return lhs = lhs & rhs;
4150         }
4151
4152         RValue<UInt> operator|=(UInt &lhs, RValue<UInt> rhs)
4153         {
4154                 return lhs = lhs | rhs;
4155         }
4156
4157         RValue<UInt> operator^=(UInt &lhs, RValue<UInt> rhs)
4158         {
4159                 return lhs = lhs ^ rhs;
4160         }
4161
4162         RValue<UInt> operator<<=(UInt &lhs, RValue<UInt> rhs)
4163         {
4164                 return lhs = lhs << rhs;
4165         }
4166
4167         RValue<UInt> operator>>=(UInt &lhs, RValue<UInt> rhs)
4168         {
4169                 return lhs = lhs >> rhs;
4170         }
4171
4172         RValue<UInt> operator+(RValue<UInt> val)
4173         {
4174                 return val;
4175         }
4176
4177         RValue<UInt> operator-(RValue<UInt> val)
4178         {
4179                 return RValue<UInt>(Nucleus::createNeg(val.value));
4180         }
4181
4182         RValue<UInt> operator~(RValue<UInt> val)
4183         {
4184                 return RValue<UInt>(Nucleus::createNot(val.value));
4185         }
4186
4187         RValue<UInt> operator++(UInt &val, int)   // Post-increment
4188         {
4189                 RValue<UInt> res = val;
4190
4191                 Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantInt(1)));
4192                 val.storeValue(inc);
4193
4194                 return res;
4195         }
4196
4197         const UInt &operator++(UInt &val)   // Pre-increment
4198         {
4199                 Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantInt(1)));
4200                 val.storeValue(inc);
4201
4202                 return val;
4203         }
4204
4205         RValue<UInt> operator--(UInt &val, int)   // Post-decrement
4206         {
4207                 RValue<UInt> res = val;
4208
4209                 Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantInt(1)));
4210                 val.storeValue(inc);
4211
4212                 return res;
4213         }
4214
4215         const UInt &operator--(UInt &val)   // Pre-decrement
4216         {
4217                 Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantInt(1)));
4218                 val.storeValue(inc);
4219
4220                 return val;
4221         }
4222
4223         RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y)
4224         {
4225                 return IfThenElse(x > y, x, y);
4226         }
4227
4228         RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y)
4229         {
4230                 return IfThenElse(x < y, x, y);
4231         }
4232
4233         RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max)
4234         {
4235                 return Min(Max(x, min), max);
4236         }
4237
4238         RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs)
4239         {
4240                 return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
4241         }
4242
4243         RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs)
4244         {
4245                 return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
4246         }
4247
4248         RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs)
4249         {
4250                 return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
4251         }
4252
4253         RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs)
4254         {
4255                 return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
4256         }
4257
4258         RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs)
4259         {
4260                 return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
4261         }
4262
4263         RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs)
4264         {
4265                 return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
4266         }
4267
4268 //      RValue<UInt> RoundUInt(RValue<Float> cast)
4269 //      {
4270 //              return x86::cvtss2si(val);   // FIXME: Unsigned
4271 //
4272 //      //      return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
4273 //      }
4274
4275         Type *UInt::getType()
4276         {
4277                 return T(llvm::Type::getInt32Ty(*::context));
4278         }
4279
4280 //      Int2::Int2(RValue<Int> cast)
4281 //      {
4282 //              Value *extend = Nucleus::createZExt(cast.value, Long::getType());
4283 //              Value *vector = Nucleus::createBitCast(extend, Int2::getType());
4284 //
4285 //              int shuffle[2] = {0, 0};
4286 //              Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
4287 //
4288 //              storeValue(replicate);
4289 //      }
4290
4291         Int2::Int2(RValue<Int4> cast)
4292         {
4293                 Value *long2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
4294                 Value *element = Nucleus::createExtractElement(long2, Long::getType(), 0);
4295                 Value *int2 = Nucleus::createBitCast(element, Int2::getType());
4296
4297                 storeValue(int2);
4298         }
4299
4300         Int2::Int2(int x, int y)
4301         {
4302                 int64_t constantVector[2] = {x, y};
4303                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Int::getType(), 2))));
4304
4305                 storeValue(Nucleus::createBitCast(vector, getType()));
4306         }
4307
4308         Int2::Int2(RValue<Int2> rhs)
4309         {
4310                 storeValue(rhs.value);
4311         }
4312
4313         Int2::Int2(const Int2 &rhs)
4314         {
4315                 Value *value = rhs.loadValue();
4316                 storeValue(value);
4317         }
4318
4319         Int2::Int2(const Reference<Int2> &rhs)
4320         {
4321                 Value *value = rhs.loadValue();
4322                 storeValue(value);
4323         }
4324
4325         Int2::Int2(RValue<Int> lo, RValue<Int> hi)
4326         {
4327                 if(CPUID::supportsMMX2())
4328                 {
4329                         // movd mm0, lo
4330                         // movd mm1, hi
4331                         // punpckldq mm0, mm1
4332
4333                         Value *loLong = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), lo.value, 0);
4334                         loLong = Nucleus::createInsertElement(loLong, V(ConstantInt::get(Int::getType(), 0)), 1);
4335                         Value *hiLong = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), hi.value, 0);
4336                         hiLong = Nucleus::createInsertElement(hiLong, V(ConstantInt::get(Int::getType(), 0)), 1);
4337
4338                         storeValue(As<Int2>(UnpackLow(As<Int2>(loLong), As<Int2>(hiLong))).value);
4339                 }
4340                 else
4341                 {
4342                         int shuffle[2] = {0, 1};
4343                         Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, T(VectorType::get(Int::getType(), 1))), Nucleus::createBitCast(hi.value, T(VectorType::get(Int::getType(), 1))), shuffle);
4344
4345                         storeValue(Nucleus::createBitCast(packed, Int2::getType()));
4346                 }
4347         }
4348
4349         RValue<Int2> Int2::operator=(RValue<Int2> rhs)
4350         {
4351                 storeValue(rhs.value);
4352
4353                 return rhs;
4354         }
4355
4356         RValue<Int2> Int2::operator=(const Int2 &rhs)
4357         {
4358                 Value *value = rhs.loadValue();
4359                 storeValue(value);
4360
4361                 return RValue<Int2>(value);
4362         }
4363
4364         RValue<Int2> Int2::operator=(const Reference<Int2> &rhs)
4365         {
4366                 Value *value = rhs.loadValue();
4367                 storeValue(value);
4368
4369                 return RValue<Int2>(value);
4370         }
4371
4372         RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs)
4373         {
4374                 if(CPUID::supportsMMX2())
4375                 {
4376                         return x86::paddd(lhs, rhs);
4377                 }
4378                 else
4379                 {
4380                         return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
4381                 }
4382         }
4383
4384         RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs)
4385         {
4386                 if(CPUID::supportsMMX2())
4387                 {
4388                         return x86::psubd(lhs, rhs);
4389                 }
4390                 else
4391                 {
4392                         return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
4393                 }
4394         }
4395
4396 //      RValue<Int2> operator*(RValue<Int2> lhs, RValue<Int2> rhs)
4397 //      {
4398 //              return RValue<Int2>(Nucleus::createMul(lhs.value, rhs.value));
4399 //      }
4400
4401 //      RValue<Int2> operator/(RValue<Int2> lhs, RValue<Int2> rhs)
4402 //      {
4403 //              return RValue<Int2>(Nucleus::createSDiv(lhs.value, rhs.value));
4404 //      }
4405
4406 //      RValue<Int2> operator%(RValue<Int2> lhs, RValue<Int2> rhs)
4407 //      {
4408 //              return RValue<Int2>(Nucleus::createSRem(lhs.value, rhs.value));
4409 //      }
4410
4411         RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs)
4412         {
4413                 if(CPUID::supportsMMX2())
4414                 {
4415                         return As<Int2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
4416                 }
4417                 else
4418                 {
4419                         return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
4420                 }
4421         }
4422
4423         RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs)
4424         {
4425                 if(CPUID::supportsMMX2())
4426                 {
4427                         return As<Int2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
4428                 }
4429                 else
4430                 {
4431                         return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
4432                 }
4433         }
4434
4435         RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs)
4436         {
4437                 if(CPUID::supportsMMX2())
4438                 {
4439                         return As<Int2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
4440                 }
4441                 else
4442                 {
4443                         return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
4444                 }
4445         }
4446
4447         RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
4448         {
4449         //      return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
4450
4451                 return x86::pslld(lhs, rhs);
4452         }
4453
4454         RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
4455         {
4456         //      return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
4457
4458                 return x86::psrad(lhs, rhs);
4459         }
4460
4461         RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs)
4462         {
4463                 return lhs = lhs + rhs;
4464         }
4465
4466         RValue<Int2> operator-=(Int2 &lhs, RValue<Int2> rhs)
4467         {
4468                 return lhs = lhs - rhs;
4469         }
4470
4471 //      RValue<Int2> operator*=(Int2 &lhs, RValue<Int2> rhs)
4472 //      {
4473 //              return lhs = lhs * rhs;
4474 //      }
4475
4476 //      RValue<Int2> operator/=(Int2 &lhs, RValue<Int2> rhs)
4477 //      {
4478 //              return lhs = lhs / rhs;
4479 //      }
4480
4481 //      RValue<Int2> operator%=(Int2 &lhs, RValue<Int2> rhs)
4482 //      {
4483 //              return lhs = lhs % rhs;
4484 //      }
4485
4486         RValue<Int2> operator&=(Int2 &lhs, RValue<Int2> rhs)
4487         {
4488                 return lhs = lhs & rhs;
4489         }
4490
4491         RValue<Int2> operator|=(Int2 &lhs, RValue<Int2> rhs)
4492         {
4493                 return lhs = lhs | rhs;
4494         }
4495
4496         RValue<Int2> operator^=(Int2 &lhs, RValue<Int2> rhs)
4497         {
4498                 return lhs = lhs ^ rhs;
4499         }
4500
4501         RValue<Int2> operator<<=(Int2 &lhs, unsigned char rhs)
4502         {
4503                 return lhs = lhs << rhs;
4504         }
4505
4506         RValue<Int2> operator>>=(Int2 &lhs, unsigned char rhs)
4507         {
4508                 return lhs = lhs >> rhs;
4509         }
4510
4511 //      RValue<Int2> operator+(RValue<Int2> val)
4512 //      {
4513 //              return val;
4514 //      }
4515
4516 //      RValue<Int2> operator-(RValue<Int2> val)
4517 //      {
4518 //              return RValue<Int2>(Nucleus::createNeg(val.value));
4519 //      }
4520
4521         RValue<Int2> operator~(RValue<Int2> val)
4522         {
4523                 if(CPUID::supportsMMX2())
4524                 {
4525                         return val ^ Int2(0xFFFFFFFF, 0xFFFFFFFF);
4526                 }
4527                 else
4528                 {
4529                         return RValue<Int2>(Nucleus::createNot(val.value));
4530                 }
4531         }
4532
4533         RValue<Short4> UnpackLow(RValue<Int2> x, RValue<Int2> y)
4534         {
4535                 if(CPUID::supportsMMX2())
4536                 {
4537                         return x86::punpckldq(x, y);
4538                 }
4539                 else
4540                 {
4541                         int shuffle[2] = {0, 2};
4542                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
4543
4544                         return As<Short4>(packed);
4545                 }
4546         }
4547
4548         RValue<Short4> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
4549         {
4550                 if(CPUID::supportsMMX2())
4551                 {
4552                         return x86::punpckhdq(x, y);
4553                 }
4554                 else
4555                 {
4556                         int shuffle[2] = {1, 3};
4557                         Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
4558
4559                         return As<Short4>(packed);
4560                 }
4561         }
4562
4563         RValue<Int> Extract(RValue<Int2> val, int i)
4564         {
4565                 if(false)   // FIXME: LLVM does not generate optimal code
4566                 {
4567                         return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
4568                 }
4569                 else
4570                 {
4571                         if(i == 0)
4572                         {
4573                                 return RValue<Int>(Nucleus::createExtractElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), Int::getType(), 0));
4574                         }
4575                         else
4576                         {
4577                                 Int2 val2 = As<Int2>(UnpackHigh(val, val));
4578
4579                                 return Extract(val2, 0);
4580                         }
4581                 }
4582         }
4583
4584         RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i)
4585         {
4586                 return RValue<Int2>(Nucleus::createBitCast(Nucleus::createInsertElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), element.value, i), Int2::getType()));
4587         }
4588
4589         Type *Int2::getType()
4590         {
4591                 if(CPUID::supportsMMX2())
4592                 {
4593                         return MMX::getType();
4594                 }
4595                 else
4596                 {
4597                         return T(VectorType::get(Int::getType(), 2));
4598                 }
4599         }
4600
4601         UInt2::UInt2(unsigned int x, unsigned int y)
4602         {
4603                 int64_t constantVector[2] = {x, y};
4604                 Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UInt::getType(), 2))));
4605
4606                 storeValue(Nucleus::createBitCast(vector, getType()));
4607         }
4608
4609         UInt2::UInt2(RValue<UInt2> rhs)
4610         {
4611                 storeValue(rhs.value);
4612         }
4613
4614         UInt2::UInt2(const UInt2 &rhs)
4615         {
4616                 Value *value = rhs.loadValue();
4617                 storeValue(value);
4618         }
4619
4620         UInt2::UInt2(const Reference<UInt2> &rhs)
4621         {
4622                 Value *value = rhs.loadValue();
4623                 storeValue(value);
4624         }
4625
4626         RValue<UInt2> UInt2::operator=(RValue<UInt2> rhs)
4627         {
4628                 storeValue(rhs.value);
4629
4630                 return rhs;
4631         }
4632
4633         RValue<UInt2> UInt2::operator=(const UInt2 &rhs)
4634         {
4635                 Value *value = rhs.loadValue();
4636                 storeValue(value);
4637
4638                 return RValue<UInt2>(value);
4639         }
4640
4641         RValue<UInt2> UInt2::operator=(const Reference<UInt2> &rhs)
4642         {
4643                 Value *value = rhs.loadValue();
4644                 storeValue(value);
4645
4646                 return RValue<UInt2>(value);
4647         }
4648
4649         RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs)
4650         {
4651                 if(CPUID::supportsMMX2())
4652                 {
4653                         return As<UInt2>(x86::paddd(As<Int2>(lhs), As<Int2>(rhs)));
4654                 }
4655                 else
4656                 {
4657                         return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
4658                 }
4659         }
4660
4661         RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs)
4662         {
4663                 if(CPUID::supportsMMX2())
4664                 {
4665                         return As<UInt2>(x86::psubd(As<Int2>(lhs), As<Int2>(rhs)));
4666                 }
4667                 else
4668                 {
4669                         return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
4670                 }
4671         }
4672
4673 //      RValue<UInt2> operator*(RValue<UInt2> lhs, RValue<UInt2> rhs)
4674 //      {
4675 //              return RValue<UInt2>(Nucleus::createMul(lhs.value, rhs.value));
4676 //      }
4677
4678 //      RValue<UInt2> operator/(RValue<UInt2> lhs, RValue<UInt2> rhs)
4679 //      {
4680 //              return RValue<UInt2>(Nucleus::createUDiv(lhs.value, rhs.value));
4681 //      }
4682
4683 //      RValue<UInt2> operator%(RValue<UInt2> lhs, RValue<UInt2> rhs)
4684 //      {
4685 //              return RValue<UInt2>(Nucleus::createURem(lhs.value, rhs.value));
4686 //      }
4687
4688         RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs)
4689         {
4690                 if(CPUID::supportsMMX2())
4691                 {
4692                         return As<UInt2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
4693                 }
4694                 else
4695                 {
4696                         return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
4697                 }
4698         }
4699
4700         RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs)
4701         {
4702                 if(CPUID::supportsMMX2())
4703                 {
4704                         return As<UInt2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
4705                 }
4706                 else
4707                 {
4708                         return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
4709                 }
4710         }
4711
4712         RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs)
4713         {
4714                 if(CPUID::supportsMMX2())
4715                 {
4716                         return As<UInt2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
4717                 }
4718                 else
4719                 {
4720                         return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
4721                 }
4722         }
4723
4724         RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
4725         {
4726         //      return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
4727
4728                 return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
4729         }
4730
4731         RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
4732         {
4733         //      return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
4734
4735                 return x86::psrld(lhs, rhs);
4736         }
4737
4738         RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs)
4739         {
4740                 return lhs = lhs + rhs;
4741         }
4742
4743         RValue<UInt2> operator-=(UInt2 &lhs, RValue<UInt2> rhs)
4744         {
4745                 return lhs = lhs - rhs;
4746         }
4747
4748 //      RValue<UInt2> operator*=(UInt2 &lhs, RValue<UInt2> rhs)
4749 //      {
4750 //              return lhs = lhs * rhs;
4751 //      }
4752
4753 //      RValue<UInt2> operator/=(UInt2 &lhs, RValue<UInt2> rhs)
4754 //      {
4755 //              return lhs = lhs / rhs;
4756 //      }
4757
4758 //      RValue<UInt2> operator%=(UInt2 &lhs, RValue<UInt2> rhs)
4759 //      {
4760 //              return lhs = lhs % rhs;
4761 //      }
4762
4763         RValue<UInt2> operator&=(UInt2 &lhs, RValue<UInt2> rhs)
4764         {
4765                 return lhs = lhs & rhs;
4766         }
4767
4768         RValue<UInt2> operator|=(UInt2 &lhs, RValue<UInt2> rhs)
4769         {
4770                 return lhs = lhs | rhs;
4771         }
4772
4773         RValue<UInt2> operator^=(UInt2 &lhs, RValue<UInt2> rhs)
4774         {
4775                 return lhs = lhs ^ rhs;
4776         }
4777
4778         RValue<UInt2> operator<<=(UInt2 &lhs, unsigned char rhs)
4779         {
4780                 return lhs = lhs << rhs;
4781         }
4782
4783         RValue<UInt2> operator>>=(UInt2 &lhs, unsigned char rhs)
4784         {
4785                 return lhs = lhs >> rhs;
4786         }
4787
4788 //      RValue<UInt2> operator+(RValue<UInt2> val)
4789 //      {
4790 //              return val;
4791 //      }
4792
4793 //      RValue<UInt2> operator-(RValue<UInt2> val)
4794 //      {
4795 //              return RValue<UInt2>(Nucleus::createNeg(val.value));
4796 //      }
4797
4798         RValue<UInt2> operator~(RValue<UInt2> val)
4799         {
4800                 if(CPUID::supportsMMX2())
4801                 {
4802                         return val ^ UInt2(0xFFFFFFFF, 0xFFFFFFFF);
4803                 }
4804                 else
4805                 {
4806                         return RValue<UInt2>(Nucleus::createNot(val.value));
4807                 }
4808         }
4809
4810         Type *UInt2::getType()
4811         {
4812                 if(CPUID::supportsMMX2())
4813                 {
4814                         return MMX::getType();
4815                 }
4816                 else
4817                 {
4818                         return T(VectorType::get(UInt::getType(), 2));
4819                 }
4820         }
4821
4822         Int4::Int4(RValue<Byte4> cast)
4823         {
4824                 Value *x = Nucleus::createBitCast(cast.value, Int::getType());
4825                 Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
4826
4827                 Value *e;
4828
4829                 if (CPUID::supportsSSE4_1())
4830                 {
4831                         e = x86::pmovzxbd(RValue<Int4>(a)).value;
4832                 }
4833                 else
4834                 {
4835                         int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
4836                         Value *b = Nucleus::createBitCast(a, Byte16::getType());
4837                         Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
4838
4839                         int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4840                         Value *d = Nucleus::createBitCast(c, Short8::getType());
4841                         e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
4842                 }
4843
4844                 Value *f = Nucleus::createBitCast(e, Int4::getType());
4845                 storeValue(f);
4846         }
4847
4848         Int4::Int4(RValue<SByte4> cast)
4849         {
4850                 Value *x = Nucleus::createBitCast(cast.value, Int::getType());
4851                 Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
4852
4853                 Value *g;
4854
4855                 if (CPUID::supportsSSE4_1())
4856                 {
4857                         g = x86::pmovsxbd(RValue<Int4>(a)).value;
4858                 }
4859                 else
4860                 {
4861                         int     swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
4862                         Value *b = Nucleus::createBitCast(a, Byte16::getType());
4863                         Value *c = Nucleus::createShuffleVector(b, b, swizzle);
4864
4865                         int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
4866                         Value *d = Nucleus::createBitCast(c, Short8::getType());
4867                         Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
4868
4869                         Value *f = Nucleus::createBitCast(e, Int4::getType());
4870                         //      g = Nucleus::createAShr(f, Nucleus::createConstantInt(24));
4871                         g = x86::psrad(RValue<Int4>(f), 24).value;
4872                 }
4873
4874                 storeValue(g);
4875         }
4876
4877         Int4::Int4(RValue<Float4> cast)
4878         {
4879                 Value *xyzw = Nucleus::createFPToSI(cast.value, Int4::getType());
4880
4881                 storeValue(xyzw);
4882         }
4883
4884         Int4::Int4(RValue<Short4> cast)
4885         {
4886                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
4887                 Value *element = Nucleus::createBitCast(cast.value, Long::getType());
4888                 long2 = Nucleus::createInsertElement(long2, element, 0);
4889                 RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
4890
4891                 if(CPUID::supportsSSE4_1())
4892                 {
4893                         storeValue(x86::pmovsxwd(vector).value);
4894                 }
4895                 else
4896                 {
4897                         Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
4898
4899                         int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
4900                         Value *c = Nucleus::createShuffleVector(b, b, swizzle);
4901                         Value *d = Nucleus::createBitCast(c, Int4::getType());
4902                         storeValue(d);
4903
4904                         // Each Short is packed into each Int in the (Short | Short) format.
4905                         // Shifting by 16 will retrieve the original Short value.
4906                         // Shifting an Int will propagate the sign bit, which will work
4907                         // for both positive and negative values of a Short.
4908                         *this >>= 16;
4909                 }
4910         }
4911
4912         Int4::Int4(RValue<UShort4> cast)
4913         {
4914                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
4915                 Value *element = Nucleus::createBitCast(cast.value, Long::getType());
4916                 long2 = Nucleus::createInsertElement(long2, element, 0);
4917                 RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
4918
4919                 if(CPUID::supportsSSE4_1())
4920                 {
4921                         storeValue(x86::pmovzxwd(RValue<Int4>(vector)).value);
4922                 }
4923                 else
4924                 {
4925                         Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
4926
4927                         int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
4928                         Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Short8::getType())), swizzle);
4929                         Value *d = Nucleus::createBitCast(c, Int4::getType());
4930                         storeValue(d);
4931                 }
4932         }
4933
4934         Int4::Int4(int xyzw)
4935         {
4936                 constant(xyzw, xyzw, xyzw, xyzw);
4937         }
4938
4939         Int4::Int4(int x, int yzw)
4940         {
4941                 constant(x, yzw, yzw, yzw);
4942         }
4943
4944         Int4::Int4(int x, int y, int zw)
4945         {
4946                 constant(x, y, zw, zw);
4947         }
4948
4949         Int4::Int4(int x, int y, int z, int w)
4950         {
4951                 constant(x, y, z, w);
4952         }
4953
4954         void Int4::constant(int x, int y, int z, int w)
4955         {
4956                 int64_t constantVector[4] = {x, y, z, w};
4957                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
4958         }
4959
4960         Int4::Int4(RValue<Int4> rhs)
4961         {
4962                 storeValue(rhs.value);
4963         }
4964
4965         Int4::Int4(const Int4 &rhs)
4966         {
4967                 Value *value = rhs.loadValue();
4968                 storeValue(value);
4969         }
4970
4971         Int4::Int4(const Reference<Int4> &rhs)
4972         {
4973                 Value *value = rhs.loadValue();
4974                 storeValue(value);
4975         }
4976
4977         Int4::Int4(RValue<UInt4> rhs)
4978         {
4979                 storeValue(rhs.value);
4980         }
4981
4982         Int4::Int4(const UInt4 &rhs)
4983         {
4984                 Value *value = rhs.loadValue();
4985                 storeValue(value);
4986         }
4987
4988         Int4::Int4(const Reference<UInt4> &rhs)
4989         {
4990                 Value *value = rhs.loadValue();
4991                 storeValue(value);
4992         }
4993
4994         Int4::Int4(RValue<Int2> lo, RValue<Int2> hi)
4995         {
4996                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
4997                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
4998
4999                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5000                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
5001                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
5002                 Value *int4 = Nucleus::createBitCast(long2, Int4::getType());
5003
5004                 storeValue(int4);
5005         }
5006
5007         Int4::Int4(RValue<Int> rhs)
5008         {
5009                 Value *vector = loadValue();
5010                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
5011
5012                 int swizzle[4] = {0, 0, 0, 0};
5013                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
5014
5015                 storeValue(replicate);
5016         }
5017
5018         Int4::Int4(const Int &rhs)
5019         {
5020                 *this = RValue<Int>(rhs.loadValue());
5021         }
5022
5023         Int4::Int4(const Reference<Int> &rhs)
5024         {
5025                 *this = RValue<Int>(rhs.loadValue());
5026         }
5027
5028         RValue<Int4> Int4::operator=(RValue<Int4> rhs)
5029         {
5030                 storeValue(rhs.value);
5031
5032                 return rhs;
5033         }
5034
5035         RValue<Int4> Int4::operator=(const Int4 &rhs)
5036         {
5037                 Value *value = rhs.loadValue();
5038                 storeValue(value);
5039
5040                 return RValue<Int4>(value);
5041         }
5042
5043         RValue<Int4> Int4::operator=(const Reference<Int4> &rhs)
5044         {
5045                 Value *value = rhs.loadValue();
5046                 storeValue(value);
5047
5048                 return RValue<Int4>(value);
5049         }
5050
5051         RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs)
5052         {
5053                 return RValue<Int4>(Nucleus::createAdd(lhs.value, rhs.value));
5054         }
5055
5056         RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs)
5057         {
5058                 return RValue<Int4>(Nucleus::createSub(lhs.value, rhs.value));
5059         }
5060
5061         RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs)
5062         {
5063                 return RValue<Int4>(Nucleus::createMul(lhs.value, rhs.value));
5064         }
5065
5066         RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs)
5067         {
5068                 return RValue<Int4>(Nucleus::createSDiv(lhs.value, rhs.value));
5069         }
5070
5071         RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs)
5072         {
5073                 return RValue<Int4>(Nucleus::createSRem(lhs.value, rhs.value));
5074         }
5075
5076         RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs)
5077         {
5078                 return RValue<Int4>(Nucleus::createAnd(lhs.value, rhs.value));
5079         }
5080
5081         RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs)
5082         {
5083                 return RValue<Int4>(Nucleus::createOr(lhs.value, rhs.value));
5084         }
5085
5086         RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs)
5087         {
5088                 return RValue<Int4>(Nucleus::createXor(lhs.value, rhs.value));
5089         }
5090
5091         RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
5092         {
5093                 return x86::pslld(lhs, rhs);
5094         }
5095
5096         RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
5097         {
5098                 return x86::psrad(lhs, rhs);
5099         }
5100
5101         RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
5102         {
5103                 return RValue<Int4>(Nucleus::createShl(lhs.value, rhs.value));
5104         }
5105
5106         RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs)
5107         {
5108                 return RValue<Int4>(Nucleus::createAShr(lhs.value, rhs.value));
5109         }
5110
5111         RValue<Int4> operator+=(Int4 &lhs, RValue<Int4> rhs)
5112         {
5113                 return lhs = lhs + rhs;
5114         }
5115
5116         RValue<Int4> operator-=(Int4 &lhs, RValue<Int4> rhs)
5117         {
5118                 return lhs = lhs - rhs;
5119         }
5120
5121         RValue<Int4> operator*=(Int4 &lhs, RValue<Int4> rhs)
5122         {
5123                 return lhs = lhs * rhs;
5124         }
5125
5126 //      RValue<Int4> operator/=(Int4 &lhs, RValue<Int4> rhs)
5127 //      {
5128 //              return lhs = lhs / rhs;
5129 //      }
5130
5131 //      RValue<Int4> operator%=(Int4 &lhs, RValue<Int4> rhs)
5132 //      {
5133 //              return lhs = lhs % rhs;
5134 //      }
5135
5136         RValue<Int4> operator&=(Int4 &lhs, RValue<Int4> rhs)
5137         {
5138                 return lhs = lhs & rhs;
5139         }
5140
5141         RValue<Int4> operator|=(Int4 &lhs, RValue<Int4> rhs)
5142         {
5143                 return lhs = lhs | rhs;
5144         }
5145
5146         RValue<Int4> operator^=(Int4 &lhs, RValue<Int4> rhs)
5147         {
5148                 return lhs = lhs ^ rhs;
5149         }
5150
5151         RValue<Int4> operator<<=(Int4 &lhs, unsigned char rhs)
5152         {
5153                 return lhs = lhs << rhs;
5154         }
5155
5156         RValue<Int4> operator>>=(Int4 &lhs, unsigned char rhs)
5157         {
5158                 return lhs = lhs >> rhs;
5159         }
5160
5161         RValue<Int4> operator+(RValue<Int4> val)
5162         {
5163                 return val;
5164         }
5165
5166         RValue<Int4> operator-(RValue<Int4> val)
5167         {
5168                 return RValue<Int4>(Nucleus::createNeg(val.value));
5169         }
5170
5171         RValue<Int4> operator~(RValue<Int4> val)
5172         {
5173                 return RValue<Int4>(Nucleus::createNot(val.value));
5174         }
5175
5176         RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
5177         {
5178                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5179                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5180                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
5181                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5182         }
5183
5184         RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
5185         {
5186                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
5187         }
5188
5189         RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
5190         {
5191                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5192                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5193                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
5194                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5195         }
5196
5197         RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
5198         {
5199                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
5200         }
5201
5202         RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
5203         {
5204                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5205                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5206                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
5207                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5208         }
5209
5210         RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
5211         {
5212                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
5213         }
5214
5215         RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
5216         {
5217                 if(CPUID::supportsSSE4_1())
5218                 {
5219                         return x86::pmaxsd(x, y);
5220                 }
5221                 else
5222                 {
5223                         RValue<Int4> greater = CmpNLE(x, y);
5224                         return (x & greater) | (y & ~greater);
5225                 }
5226         }
5227
5228         RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
5229         {
5230                 if(CPUID::supportsSSE4_1())
5231                 {
5232                         return x86::pminsd(x, y);
5233                 }
5234                 else
5235                 {
5236                         RValue<Int4> less = CmpLT(x, y);
5237                         return (x & less) | (y & ~less);
5238                 }
5239         }
5240
5241         RValue<Int4> RoundInt(RValue<Float4> cast)
5242         {
5243                 return x86::cvtps2dq(cast);
5244         }
5245
5246         RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
5247         {
5248                 return x86::packssdw(x, y);
5249         }
5250
5251         RValue<Int> Extract(RValue<Int4> x, int i)
5252         {
5253                 return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
5254         }
5255
5256         RValue<Int4> Insert(RValue<Int4> x, RValue<Int> element, int i)
5257         {
5258                 return RValue<Int4>(Nucleus::createInsertElement(x.value, element.value, i));
5259         }
5260
5261         RValue<Int> SignMask(RValue<Int4> x)
5262         {
5263                 return x86::movmskps(As<Float4>(x));
5264         }
5265
5266         RValue<Int4> Swizzle(RValue<Int4> x, unsigned char select)
5267         {
5268                 return RValue<Int4>(createSwizzle4(x.value, select));
5269         }
5270
5271         Type *Int4::getType()
5272         {
5273                 return T(VectorType::get(Int::getType(), 4));
5274         }
5275
5276         UInt4::UInt4(RValue<Float4> cast)
5277         {
5278                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
5279                 // Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
5280
5281                 // Smallest positive value representable in UInt, but not in Int
5282                 const unsigned int ustart = 0x80000000u;
5283                 const float ustartf = float(ustart);
5284
5285                 // Check if the value can be represented as an Int
5286                 Int4 uiValue = CmpNLT(cast, Float4(ustartf));
5287                 // If the value is too large, subtract ustart and re-add it after conversion.
5288                 uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
5289                 // Otherwise, just convert normally
5290                           (~uiValue & Int4(cast));
5291                 // If the value is negative, store 0, otherwise store the result of the conversion
5292                 storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
5293         }
5294
5295         UInt4::UInt4(int xyzw)
5296         {
5297                 constant(xyzw, xyzw, xyzw, xyzw);
5298         }
5299
5300         UInt4::UInt4(int x, int yzw)
5301         {
5302                 constant(x, yzw, yzw, yzw);
5303         }
5304
5305         UInt4::UInt4(int x, int y, int zw)
5306         {
5307                 constant(x, y, zw, zw);
5308         }
5309
5310         UInt4::UInt4(int x, int y, int z, int w)
5311         {
5312                 constant(x, y, z, w);
5313         }
5314
5315         void UInt4::constant(int x, int y, int z, int w)
5316         {
5317                 int64_t constantVector[4] = {x, y, z, w};
5318                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
5319         }
5320
5321         UInt4::UInt4(RValue<UInt4> rhs)
5322         {
5323                 storeValue(rhs.value);
5324         }
5325
5326         UInt4::UInt4(const UInt4 &rhs)
5327         {
5328                 Value *value = rhs.loadValue();
5329                 storeValue(value);
5330         }
5331
5332         UInt4::UInt4(const Reference<UInt4> &rhs)
5333         {
5334                 Value *value = rhs.loadValue();
5335                 storeValue(value);
5336         }
5337
5338         UInt4::UInt4(RValue<Int4> rhs)
5339         {
5340                 storeValue(rhs.value);
5341         }
5342
5343         UInt4::UInt4(const Int4 &rhs)
5344         {
5345                 Value *value = rhs.loadValue();
5346                 storeValue(value);
5347         }
5348
5349         UInt4::UInt4(const Reference<Int4> &rhs)
5350         {
5351                 Value *value = rhs.loadValue();
5352                 storeValue(value);
5353         }
5354
5355         UInt4::UInt4(RValue<UInt2> lo, RValue<UInt2> hi)
5356         {
5357                 Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
5358                 Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
5359
5360                 Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5361                 long2 = Nucleus::createInsertElement(long2, loLong, 0);
5362                 long2 = Nucleus::createInsertElement(long2, hiLong, 1);
5363                 Value *uint4 = Nucleus::createBitCast(long2, Int4::getType());
5364
5365                 storeValue(uint4);
5366         }
5367
5368         RValue<UInt4> UInt4::operator=(RValue<UInt4> rhs)
5369         {
5370                 storeValue(rhs.value);
5371
5372                 return rhs;
5373         }
5374
5375         RValue<UInt4> UInt4::operator=(const UInt4 &rhs)
5376         {
5377                 Value *value = rhs.loadValue();
5378                 storeValue(value);
5379
5380                 return RValue<UInt4>(value);
5381         }
5382
5383         RValue<UInt4> UInt4::operator=(const Reference<UInt4> &rhs)
5384         {
5385                 Value *value = rhs.loadValue();
5386                 storeValue(value);
5387
5388                 return RValue<UInt4>(value);
5389         }
5390
5391         RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs)
5392         {
5393                 return RValue<UInt4>(Nucleus::createAdd(lhs.value, rhs.value));
5394         }
5395
5396         RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs)
5397         {
5398                 return RValue<UInt4>(Nucleus::createSub(lhs.value, rhs.value));
5399         }
5400
5401         RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs)
5402         {
5403                 return RValue<UInt4>(Nucleus::createMul(lhs.value, rhs.value));
5404         }
5405
5406         RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs)
5407         {
5408                 return RValue<UInt4>(Nucleus::createUDiv(lhs.value, rhs.value));
5409         }
5410
5411         RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs)
5412         {
5413                 return RValue<UInt4>(Nucleus::createURem(lhs.value, rhs.value));
5414         }
5415
5416         RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs)
5417         {
5418                 return RValue<UInt4>(Nucleus::createAnd(lhs.value, rhs.value));
5419         }
5420
5421         RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs)
5422         {
5423                 return RValue<UInt4>(Nucleus::createOr(lhs.value, rhs.value));
5424         }
5425
5426         RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs)
5427         {
5428                 return RValue<UInt4>(Nucleus::createXor(lhs.value, rhs.value));
5429         }
5430
5431         RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
5432         {
5433                 return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
5434         }
5435
5436         RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
5437         {
5438                 return x86::psrld(lhs, rhs);
5439         }
5440
5441         RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
5442         {
5443                 return RValue<UInt4>(Nucleus::createShl(lhs.value, rhs.value));
5444         }
5445
5446         RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs)
5447         {
5448                 return RValue<UInt4>(Nucleus::createLShr(lhs.value, rhs.value));
5449         }
5450
5451         RValue<UInt4> operator+=(UInt4 &lhs, RValue<UInt4> rhs)
5452         {
5453                 return lhs = lhs + rhs;
5454         }
5455
5456         RValue<UInt4> operator-=(UInt4 &lhs, RValue<UInt4> rhs)
5457         {
5458                 return lhs = lhs - rhs;
5459         }
5460
5461         RValue<UInt4> operator*=(UInt4 &lhs, RValue<UInt4> rhs)
5462         {
5463                 return lhs = lhs * rhs;
5464         }
5465
5466 //      RValue<UInt4> operator/=(UInt4 &lhs, RValue<UInt4> rhs)
5467 //      {
5468 //              return lhs = lhs / rhs;
5469 //      }
5470
5471 //      RValue<UInt4> operator%=(UInt4 &lhs, RValue<UInt4> rhs)
5472 //      {
5473 //              return lhs = lhs % rhs;
5474 //      }
5475
5476         RValue<UInt4> operator&=(UInt4 &lhs, RValue<UInt4> rhs)
5477         {
5478                 return lhs = lhs & rhs;
5479         }
5480
5481         RValue<UInt4> operator|=(UInt4 &lhs, RValue<UInt4> rhs)
5482         {
5483                 return lhs = lhs | rhs;
5484         }
5485
5486         RValue<UInt4> operator^=(UInt4 &lhs, RValue<UInt4> rhs)
5487         {
5488                 return lhs = lhs ^ rhs;
5489         }
5490
5491         RValue<UInt4> operator<<=(UInt4 &lhs, unsigned char rhs)
5492         {
5493                 return lhs = lhs << rhs;
5494         }
5495
5496         RValue<UInt4> operator>>=(UInt4 &lhs, unsigned char rhs)
5497         {
5498                 return lhs = lhs >> rhs;
5499         }
5500
5501         RValue<UInt4> operator+(RValue<UInt4> val)
5502         {
5503                 return val;
5504         }
5505
5506         RValue<UInt4> operator-(RValue<UInt4> val)
5507         {
5508                 return RValue<UInt4>(Nucleus::createNeg(val.value));
5509         }
5510
5511         RValue<UInt4> operator~(RValue<UInt4> val)
5512         {
5513                 return RValue<UInt4>(Nucleus::createNot(val.value));
5514         }
5515
5516         RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
5517         {
5518                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5519                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5520                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
5521                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5522         }
5523
5524         RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
5525         {
5526                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
5527         }
5528
5529         RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
5530         {
5531                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5532                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5533                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
5534                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5535         }
5536
5537         RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
5538         {
5539                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
5540         }
5541
5542         RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
5543         {
5544                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5545                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
5546                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
5547                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5548         }
5549
5550         RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
5551         {
5552                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
5553         }
5554
5555         RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
5556         {
5557                 if(CPUID::supportsSSE4_1())
5558                 {
5559                         return x86::pmaxud(x, y);
5560                 }
5561                 else
5562                 {
5563                         RValue<UInt4> greater = CmpNLE(x, y);
5564                         return (x & greater) | (y & ~greater);
5565                 }
5566         }
5567
5568         RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
5569         {
5570                 if(CPUID::supportsSSE4_1())
5571                 {
5572                         return x86::pminud(x, y);
5573                 }
5574                 else
5575                 {
5576                         RValue<UInt4> less = CmpLT(x, y);
5577                         return (x & less) | (y & ~less);
5578                 }
5579         }
5580
5581         RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
5582         {
5583                 return x86::packusdw(As<Int4>(x), As<Int4>(y));
5584         }
5585
5586         Type *UInt4::getType()
5587         {
5588                 return T(VectorType::get(UInt::getType(), 4));
5589         }
5590
5591         Float::Float(RValue<Int> cast)
5592         {
5593                 Value *integer = Nucleus::createSIToFP(cast.value, Float::getType());
5594
5595                 storeValue(integer);
5596         }
5597
5598         Float::Float(RValue<UInt> cast)
5599         {
5600                 RValue<Float> result = Float(Int(cast & UInt(0x7FFFFFFF))) +
5601                                        As<Float>((As<Int>(cast) >> 31) & As<Int>(Float(0x80000000u)));
5602
5603                 storeValue(result.value);
5604         }
5605
5606         Float::Float(float x)
5607         {
5608                 storeValue(Nucleus::createConstantFloat(x));
5609         }
5610
5611         Float::Float(RValue<Float> rhs)
5612         {
5613                 storeValue(rhs.value);
5614         }
5615
5616         Float::Float(const Float &rhs)
5617         {
5618                 Value *value = rhs.loadValue();
5619                 storeValue(value);
5620         }
5621
5622         Float::Float(const Reference<Float> &rhs)
5623         {
5624                 Value *value = rhs.loadValue();
5625                 storeValue(value);
5626         }
5627
5628         RValue<Float> Float::operator=(RValue<Float> rhs)
5629         {
5630                 storeValue(rhs.value);
5631
5632                 return rhs;
5633         }
5634
5635         RValue<Float> Float::operator=(const Float &rhs)
5636         {
5637                 Value *value = rhs.loadValue();
5638                 storeValue(value);
5639
5640                 return RValue<Float>(value);
5641         }
5642
5643         RValue<Float> Float::operator=(const Reference<Float> &rhs)
5644         {
5645                 Value *value = rhs.loadValue();
5646                 storeValue(value);
5647
5648                 return RValue<Float>(value);
5649         }
5650
5651         RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs)
5652         {
5653                 return RValue<Float>(Nucleus::createFAdd(lhs.value, rhs.value));
5654         }
5655
5656         RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs)
5657         {
5658                 return RValue<Float>(Nucleus::createFSub(lhs.value, rhs.value));
5659         }
5660
5661         RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs)
5662         {
5663                 return RValue<Float>(Nucleus::createFMul(lhs.value, rhs.value));
5664         }
5665
5666         RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs)
5667         {
5668                 return RValue<Float>(Nucleus::createFDiv(lhs.value, rhs.value));
5669         }
5670
5671         RValue<Float> operator+=(Float &lhs, RValue<Float> rhs)
5672         {
5673                 return lhs = lhs + rhs;
5674         }
5675
5676         RValue<Float> operator-=(Float &lhs, RValue<Float> rhs)
5677         {
5678                 return lhs = lhs - rhs;
5679         }
5680
5681         RValue<Float> operator*=(Float &lhs, RValue<Float> rhs)
5682         {
5683                 return lhs = lhs * rhs;
5684         }
5685
5686         RValue<Float> operator/=(Float &lhs, RValue<Float> rhs)
5687         {
5688                 return lhs = lhs / rhs;
5689         }
5690
5691         RValue<Float> operator+(RValue<Float> val)
5692         {
5693                 return val;
5694         }
5695
5696         RValue<Float> operator-(RValue<Float> val)
5697         {
5698                 return RValue<Float>(Nucleus::createFNeg(val.value));
5699         }
5700
5701         RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs)
5702         {
5703                 return RValue<Bool>(Nucleus::createFCmpOLT(lhs.value, rhs.value));
5704         }
5705
5706         RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs)
5707         {
5708                 return RValue<Bool>(Nucleus::createFCmpOLE(lhs.value, rhs.value));
5709         }
5710
5711         RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs)
5712         {
5713                 return RValue<Bool>(Nucleus::createFCmpOGT(lhs.value, rhs.value));
5714         }
5715
5716         RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs)
5717         {
5718                 return RValue<Bool>(Nucleus::createFCmpOGE(lhs.value, rhs.value));
5719         }
5720
5721         RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs)
5722         {
5723                 return RValue<Bool>(Nucleus::createFCmpONE(lhs.value, rhs.value));
5724         }
5725
5726         RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs)
5727         {
5728                 return RValue<Bool>(Nucleus::createFCmpOEQ(lhs.value, rhs.value));
5729         }
5730
5731         RValue<Float> Abs(RValue<Float> x)
5732         {
5733                 return IfThenElse(x > 0.0f, x, -x);
5734         }
5735
5736         RValue<Float> Max(RValue<Float> x, RValue<Float> y)
5737         {
5738                 return IfThenElse(x > y, x, y);
5739         }
5740
5741         RValue<Float> Min(RValue<Float> x, RValue<Float> y)
5742         {
5743                 return IfThenElse(x < y, x, y);
5744         }
5745
5746         RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
5747         {
5748                 #if defined(__i386__) || defined(__x86_64__)
5749                         if(exactAtPow2)
5750                         {
5751                                 // rcpss uses a piecewise-linear approximation which minimizes the relative error
5752                                 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
5753                                 return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
5754                         }
5755                 #endif
5756
5757                 return x86::rcpss(x);
5758         }
5759
5760         RValue<Float> RcpSqrt_pp(RValue<Float> x)
5761         {
5762                 return x86::rsqrtss(x);
5763         }
5764
5765         RValue<Float> Sqrt(RValue<Float> x)
5766         {
5767                 return x86::sqrtss(x);
5768         }
5769
5770         RValue<Float> Round(RValue<Float> x)
5771         {
5772                 if(CPUID::supportsSSE4_1())
5773                 {
5774                         return x86::roundss(x, 0);
5775                 }
5776                 else
5777                 {
5778                         return Float4(Round(Float4(x))).x;
5779                 }
5780         }
5781
5782         RValue<Float> Trunc(RValue<Float> x)
5783         {
5784                 if(CPUID::supportsSSE4_1())
5785                 {
5786                         return x86::roundss(x, 3);
5787                 }
5788                 else
5789                 {
5790                         return Float(Int(x));   // Rounded toward zero
5791                 }
5792         }
5793
5794         RValue<Float> Frac(RValue<Float> x)
5795         {
5796                 if(CPUID::supportsSSE4_1())
5797                 {
5798                         return x - x86::floorss(x);
5799                 }
5800                 else
5801                 {
5802                         return Float4(Frac(Float4(x))).x;
5803                 }
5804         }
5805
5806         RValue<Float> Floor(RValue<Float> x)
5807         {
5808                 if(CPUID::supportsSSE4_1())
5809                 {
5810                         return x86::floorss(x);
5811                 }
5812                 else
5813                 {
5814                         return Float4(Floor(Float4(x))).x;
5815                 }
5816         }
5817
5818         RValue<Float> Ceil(RValue<Float> x)
5819         {
5820                 if(CPUID::supportsSSE4_1())
5821                 {
5822                         return x86::ceilss(x);
5823                 }
5824                 else
5825                 {
5826                         return Float4(Ceil(Float4(x))).x;
5827                 }
5828         }
5829
5830         Type *Float::getType()
5831         {
5832                 return T(llvm::Type::getFloatTy(*::context));
5833         }
5834
5835         Float2::Float2(RValue<Float4> cast)
5836         {
5837                 Value *int64x2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
5838                 Value *int64 = Nucleus::createExtractElement(int64x2, Long::getType(), 0);
5839                 Value *float2 = Nucleus::createBitCast(int64, Float2::getType());
5840
5841                 storeValue(float2);
5842         }
5843
5844         Type *Float2::getType()
5845         {
5846                 return T(VectorType::get(Float::getType(), 2));
5847         }
5848
5849         Float4::Float4(RValue<Byte4> cast) : FloatXYZW(this)
5850         {
5851                 #if 0
5852                         Value *xyzw = Nucleus::createUIToFP(cast.value, Float4::getType());   // FIXME: Crashes
5853                 #elif 0
5854                         Value *vector = loadValue();
5855
5856                         Value *i8x = Nucleus::createExtractElement(cast.value, 0);
5857                         Value *f32x = Nucleus::createUIToFP(i8x, Float::getType());
5858                         Value *x = Nucleus::createInsertElement(vector, f32x, 0);
5859
5860                         Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
5861                         Value *f32y = Nucleus::createUIToFP(i8y, Float::getType());
5862                         Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
5863
5864                         Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
5865                         Value *f32z = Nucleus::createUIToFP(i8z, Float::getType());
5866                         Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
5867
5868                         Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
5869                         Value *f32w = Nucleus::createUIToFP(i8w, Float::getType());
5870                         Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
5871                 #else
5872                         Value *a = Int4(cast).loadValue();
5873                         Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
5874                 #endif
5875
5876                 storeValue(xyzw);
5877         }
5878
5879         Float4::Float4(RValue<SByte4> cast) : FloatXYZW(this)
5880         {
5881                 #if 0
5882                         Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());   // FIXME: Crashes
5883                 #elif 0
5884                         Value *vector = loadValue();
5885
5886                         Value *i8x = Nucleus::createExtractElement(cast.value, 0);
5887                         Value *f32x = Nucleus::createSIToFP(i8x, Float::getType());
5888                         Value *x = Nucleus::createInsertElement(vector, f32x, 0);
5889
5890                         Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
5891                         Value *f32y = Nucleus::createSIToFP(i8y, Float::getType());
5892                         Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
5893
5894                         Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
5895                         Value *f32z = Nucleus::createSIToFP(i8z, Float::getType());
5896                         Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
5897
5898                         Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
5899                         Value *f32w = Nucleus::createSIToFP(i8w, Float::getType());
5900                         Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
5901                 #else
5902                         Value *a = Int4(cast).loadValue();
5903                         Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
5904                 #endif
5905
5906                 storeValue(xyzw);
5907         }
5908
5909         Float4::Float4(RValue<Short4> cast) : FloatXYZW(this)
5910         {
5911                 Int4 c(cast);
5912                 storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
5913         }
5914
5915         Float4::Float4(RValue<UShort4> cast) : FloatXYZW(this)
5916         {
5917                 Int4 c(cast);
5918                 storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
5919         }
5920
5921         Float4::Float4(RValue<Int4> cast) : FloatXYZW(this)
5922         {
5923                 Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());
5924
5925                 storeValue(xyzw);
5926         }
5927
5928         Float4::Float4(RValue<UInt4> cast) : FloatXYZW(this)
5929         {
5930                 RValue<Float4> result = Float4(Int4(cast & UInt4(0x7FFFFFFF))) +
5931                                         As<Float4>((As<Int4>(cast) >> 31) & As<Int4>(Float4(0x80000000u)));
5932
5933                 storeValue(result.value);
5934         }
5935
5936         Float4::Float4() : FloatXYZW(this)
5937         {
5938         }
5939
5940         Float4::Float4(float xyzw) : FloatXYZW(this)
5941         {
5942                 constant(xyzw, xyzw, xyzw, xyzw);
5943         }
5944
5945         Float4::Float4(float x, float yzw) : FloatXYZW(this)
5946         {
5947                 constant(x, yzw, yzw, yzw);
5948         }
5949
5950         Float4::Float4(float x, float y, float zw) : FloatXYZW(this)
5951         {
5952                 constant(x, y, zw, zw);
5953         }
5954
5955         Float4::Float4(float x, float y, float z, float w) : FloatXYZW(this)
5956         {
5957                 constant(x, y, z, w);
5958         }
5959
5960         void Float4::constant(float x, float y, float z, float w)
5961         {
5962                 double constantVector[4] = {x, y, z, w};
5963                 storeValue(Nucleus::createConstantVector(constantVector, getType()));
5964         }
5965
5966         Float4::Float4(RValue<Float4> rhs) : FloatXYZW(this)
5967         {
5968                 storeValue(rhs.value);
5969         }
5970
5971         Float4::Float4(const Float4 &rhs) : FloatXYZW(this)
5972         {
5973                 Value *value = rhs.loadValue();
5974                 storeValue(value);
5975         }
5976
5977         Float4::Float4(const Reference<Float4> &rhs) : FloatXYZW(this)
5978         {
5979                 Value *value = rhs.loadValue();
5980                 storeValue(value);
5981         }
5982
5983         Float4::Float4(RValue<Float> rhs) : FloatXYZW(this)
5984         {
5985                 Value *vector = loadValue();
5986                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
5987
5988                 int swizzle[4] = {0, 0, 0, 0};
5989                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
5990
5991                 storeValue(replicate);
5992         }
5993
5994         Float4::Float4(const Float &rhs) : FloatXYZW(this)
5995         {
5996                 *this = RValue<Float>(rhs.loadValue());
5997         }
5998
5999         Float4::Float4(const Reference<Float> &rhs) : FloatXYZW(this)
6000         {
6001                 *this = RValue<Float>(rhs.loadValue());
6002         }
6003
6004         RValue<Float4> Float4::operator=(float x)
6005         {
6006                 return *this = Float4(x, x, x, x);
6007         }
6008
6009         RValue<Float4> Float4::operator=(RValue<Float4> rhs)
6010         {
6011                 storeValue(rhs.value);
6012
6013                 return rhs;
6014         }
6015
6016         RValue<Float4> Float4::operator=(const Float4 &rhs)
6017         {
6018                 Value *value = rhs.loadValue();
6019                 storeValue(value);
6020
6021                 return RValue<Float4>(value);
6022         }
6023
6024         RValue<Float4> Float4::operator=(const Reference<Float4> &rhs)
6025         {
6026                 Value *value = rhs.loadValue();
6027                 storeValue(value);
6028
6029                 return RValue<Float4>(value);
6030         }
6031
6032         RValue<Float4> Float4::operator=(RValue<Float> rhs)
6033         {
6034                 return *this = Float4(rhs);
6035         }
6036
6037         RValue<Float4> Float4::operator=(const Float &rhs)
6038         {
6039                 return *this = Float4(rhs);
6040         }
6041
6042         RValue<Float4> Float4::operator=(const Reference<Float> &rhs)
6043         {
6044                 return *this = Float4(rhs);
6045         }
6046
6047         RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs)
6048         {
6049                 return RValue<Float4>(Nucleus::createFAdd(lhs.value, rhs.value));
6050         }
6051
6052         RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs)
6053         {
6054                 return RValue<Float4>(Nucleus::createFSub(lhs.value, rhs.value));
6055         }
6056
6057         RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs)
6058         {
6059                 return RValue<Float4>(Nucleus::createFMul(lhs.value, rhs.value));
6060         }
6061
6062         RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs)
6063         {
6064                 return RValue<Float4>(Nucleus::createFDiv(lhs.value, rhs.value));
6065         }
6066
6067         RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
6068         {
6069                 return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
6070         }
6071
6072         RValue<Float4> operator+=(Float4 &lhs, RValue<Float4> rhs)
6073         {
6074                 return lhs = lhs + rhs;
6075         }
6076
6077         RValue<Float4> operator-=(Float4 &lhs, RValue<Float4> rhs)
6078         {
6079                 return lhs = lhs - rhs;
6080         }
6081
6082         RValue<Float4> operator*=(Float4 &lhs, RValue<Float4> rhs)
6083         {
6084                 return lhs = lhs * rhs;
6085         }
6086
6087         RValue<Float4> operator/=(Float4 &lhs, RValue<Float4> rhs)
6088         {
6089                 return lhs = lhs / rhs;
6090         }
6091
6092         RValue<Float4> operator%=(Float4 &lhs, RValue<Float4> rhs)
6093         {
6094                 return lhs = lhs % rhs;
6095         }
6096
6097         RValue<Float4> operator+(RValue<Float4> val)
6098         {
6099                 return val;
6100         }
6101
6102         RValue<Float4> operator-(RValue<Float4> val)
6103         {
6104                 return RValue<Float4>(Nucleus::createFNeg(val.value));
6105         }
6106
6107         RValue<Float4> Abs(RValue<Float4> x)
6108         {
6109                 Value *vector = Nucleus::createBitCast(x.value, Int4::getType());
6110                 int64_t constantVector[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
6111                 Value *result = Nucleus::createAnd(vector, V(Nucleus::createConstantVector(constantVector, Int4::getType())));
6112
6113                 return RValue<Float4>(Nucleus::createBitCast(result, Float4::getType()));
6114         }
6115
6116         RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
6117         {
6118                 return x86::maxps(x, y);
6119         }
6120
6121         RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
6122         {
6123                 return x86::minps(x, y);
6124         }
6125
6126         RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
6127         {
6128                 #if defined(__i386__) || defined(__x86_64__)
6129                         if(exactAtPow2)
6130                         {
6131                                 // rcpps uses a piecewise-linear approximation which minimizes the relative error
6132                                 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
6133                                 return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
6134                         }
6135                 #endif
6136
6137                 return x86::rcpps(x);
6138         }
6139
6140         RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
6141         {
6142                 return x86::rsqrtps(x);
6143         }
6144
6145         RValue<Float4> Sqrt(RValue<Float4> x)
6146         {
6147                 return x86::sqrtps(x);
6148         }
6149
6150         RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i)
6151         {
6152                 return RValue<Float4>(Nucleus::createInsertElement(val.value, element.value, i));
6153         }
6154
6155         RValue<Float> Extract(RValue<Float4> x, int i)
6156         {
6157                 return RValue<Float>(Nucleus::createExtractElement(x.value, Float::getType(), i));
6158         }
6159
6160         RValue<Float4> Swizzle(RValue<Float4> x, unsigned char select)
6161         {
6162                 return RValue<Float4>(createSwizzle4(x.value, select));
6163         }
6164
6165         RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
6166         {
6167                 int shuffle[4] =
6168                 {
6169                         ((imm >> 0) & 0x03) + 0,
6170                         ((imm >> 2) & 0x03) + 0,
6171                         ((imm >> 4) & 0x03) + 4,
6172                         ((imm >> 6) & 0x03) + 4,
6173                 };
6174
6175                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6176         }
6177
6178         RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y)
6179         {
6180                 int shuffle[4] = {0, 4, 1, 5};
6181                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6182         }
6183
6184         RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y)
6185         {
6186                 int shuffle[4] = {2, 6, 3, 7};
6187                 return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6188         }
6189
6190         RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, unsigned char select)
6191         {
6192                 Value *vector = lhs.loadValue();
6193                 Value *shuffle = createMask4(vector, rhs.value, select);
6194                 lhs.storeValue(shuffle);
6195
6196                 return RValue<Float4>(shuffle);
6197         }
6198
6199         RValue<Int> SignMask(RValue<Float4> x)
6200         {
6201                 return x86::movmskps(x);
6202         }
6203
6204         RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
6205         {
6206         //      return As<Int4>(x86::cmpeqps(x, y));
6207                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
6208         }
6209
6210         RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
6211         {
6212         //      return As<Int4>(x86::cmpltps(x, y));
6213                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
6214         }
6215
6216         RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
6217         {
6218         //      return As<Int4>(x86::cmpleps(x, y));
6219                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
6220         }
6221
6222         RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
6223         {
6224         //      return As<Int4>(x86::cmpneqps(x, y));
6225                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
6226         }
6227
6228         RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
6229         {
6230         //      return As<Int4>(x86::cmpnltps(x, y));
6231                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
6232         }
6233
6234         RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
6235         {
6236         //      return As<Int4>(x86::cmpnleps(x, y));
6237                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
6238         }
6239
6240         RValue<Float4> Round(RValue<Float4> x)
6241         {
6242                 if(CPUID::supportsSSE4_1())
6243                 {
6244                         return x86::roundps(x, 0);
6245                 }
6246                 else
6247                 {
6248                         return Float4(RoundInt(x));
6249                 }
6250         }
6251
6252         RValue<Float4> Trunc(RValue<Float4> x)
6253         {
6254                 if(CPUID::supportsSSE4_1())
6255                 {
6256                         return x86::roundps(x, 3);
6257                 }
6258                 else
6259                 {
6260                         return Float4(Int4(x));   // Rounded toward zero
6261                 }
6262         }
6263
6264         RValue<Float4> Frac(RValue<Float4> x)
6265         {
6266                 Float4 frc;
6267
6268                 if(CPUID::supportsSSE4_1())
6269                 {
6270                         frc = x - x86::floorps(x);
6271                 }
6272                 else
6273                 {
6274                         frc = x - Float4(Int4(x));   // Signed fractional part.
6275
6276                         frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));   // Add 1.0 if negative.
6277                 }
6278
6279                 // x - floor(x) can be 1.0 for very small negative x.
6280                 // Clamp against the value just below 1.0.
6281                 return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
6282         }
6283
6284         RValue<Float4> Floor(RValue<Float4> x)
6285         {
6286                 if(CPUID::supportsSSE4_1())
6287                 {
6288                         return x86::floorps(x);
6289                 }
6290                 else
6291                 {
6292                         return x - Frac(x);
6293                 }
6294         }
6295
6296         RValue<Float4> Ceil(RValue<Float4> x)
6297         {
6298                 if(CPUID::supportsSSE4_1())
6299                 {
6300                         return x86::ceilps(x);
6301                 }
6302                 else
6303                 {
6304                         return -Floor(-x);
6305                 }
6306         }
6307
6308         Type *Float4::getType()
6309         {
6310                 return T(VectorType::get(Float::getType(), 4));
6311         }
6312
6313         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
6314         {
6315                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), V(Nucleus::createConstantInt(offset)), false));
6316         }
6317
6318         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
6319         {
6320                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, false));
6321         }
6322
6323         RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
6324         {
6325                 return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, true));
6326         }
6327
6328         RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset)
6329         {
6330                 return lhs = lhs + offset;
6331         }
6332
6333         RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<Int> offset)
6334         {
6335                 return lhs = lhs + offset;
6336         }
6337
6338         RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, RValue<UInt> offset)
6339         {
6340                 return lhs = lhs + offset;
6341         }
6342
6343         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset)
6344         {
6345                 return lhs + -offset;
6346         }
6347
6348         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
6349         {
6350                 return lhs + -offset;
6351         }
6352
6353         RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
6354         {
6355                 return lhs + -offset;
6356         }
6357
6358         RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, int offset)
6359         {
6360                 return lhs = lhs - offset;
6361         }
6362
6363         RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<Int> offset)
6364         {
6365                 return lhs = lhs - offset;
6366         }
6367
6368         RValue<Pointer<Byte>> operator-=(Pointer<Byte> &lhs, RValue<UInt> offset)
6369         {
6370                 return lhs = lhs - offset;
6371         }
6372
6373         void Return()
6374         {
6375                 Nucleus::createRetVoid();
6376                 Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6377                 Nucleus::createUnreachable();
6378         }
6379
6380         void Return(RValue<Int> ret)
6381         {
6382                 Nucleus::createRet(ret.value);
6383                 Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6384                 Nucleus::createUnreachable();
6385         }
6386
6387         void branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
6388         {
6389                 Nucleus::createCondBr(cmp.value, bodyBB, endBB);
6390                 Nucleus::setInsertBlock(bodyBB);
6391         }
6392
6393         RValue<Long> Ticks()
6394         {
6395                 llvm::Function *rdtsc = Intrinsic::getDeclaration(::module, Intrinsic::readcyclecounter);
6396
6397                 return RValue<Long>(V(::builder->CreateCall(rdtsc)));
6398         }
6399 }
6400
6401 namespace sw
6402 {
6403         namespace x86
6404         {
6405                 RValue<Int> cvtss2si(RValue<Float> val)
6406                 {
6407                         llvm::Function *cvtss2si = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtss2si);
6408
6409                         Float4 vector;
6410                         vector.x = val;
6411
6412                         return RValue<Int>(V(::builder->CreateCall(cvtss2si, RValue<Float4>(vector).value)));
6413                 }
6414
6415                 RValue<Int2> cvtps2pi(RValue<Float4> val)
6416                 {
6417                         llvm::Function *cvtps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtps2pi);
6418
6419                         return RValue<Int2>(V(::builder->CreateCall(cvtps2pi, val.value)));
6420                 }
6421
6422                 RValue<Int2> cvttps2pi(RValue<Float4> val)
6423                 {
6424                         llvm::Function *cvttps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvttps2pi);
6425
6426                         return RValue<Int2>(V(::builder->CreateCall(cvttps2pi, val.value)));
6427                 }
6428
6429                 RValue<Int4> cvtps2dq(RValue<Float4> val)
6430                 {
6431                         if(CPUID::supportsSSE2())
6432                         {
6433                                 llvm::Function *cvtps2dq = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_cvtps2dq);
6434
6435                                 return RValue<Int4>(V(::builder->CreateCall(cvtps2dq, val.value)));
6436                         }
6437                         else
6438                         {
6439                                 Int2 lo = x86::cvtps2pi(val);
6440                                 Int2 hi = x86::cvtps2pi(Swizzle(val, 0xEE));
6441
6442                                 return Int4(lo, hi);
6443                         }
6444                 }
6445
6446                 RValue<Float> rcpss(RValue<Float> val)
6447                 {
6448                         llvm::Function *rcpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ss);
6449
6450                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6451
6452                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rcpss, vector)), Float::getType(), 0));
6453                 }
6454
6455                 RValue<Float> sqrtss(RValue<Float> val)
6456                 {
6457                         llvm::Function *sqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ss);
6458
6459                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6460
6461                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(sqrtss, vector)), Float::getType(), 0));
6462                 }
6463
6464                 RValue<Float> rsqrtss(RValue<Float> val)
6465                 {
6466                         llvm::Function *rsqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ss);
6467
6468                         Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6469
6470                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rsqrtss, vector)), Float::getType(), 0));
6471                 }
6472
6473                 RValue<Float4> rcpps(RValue<Float4> val)
6474                 {
6475                         llvm::Function *rcpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ps);
6476
6477                         return RValue<Float4>(V(::builder->CreateCall(rcpps, val.value)));
6478                 }
6479
6480                 RValue<Float4> sqrtps(RValue<Float4> val)
6481                 {
6482                         llvm::Function *sqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ps);
6483
6484                         return RValue<Float4>(V(::builder->CreateCall(sqrtps, val.value)));
6485                 }
6486
6487                 RValue<Float4> rsqrtps(RValue<Float4> val)
6488                 {
6489                         llvm::Function *rsqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ps);
6490
6491                         return RValue<Float4>(V(::builder->CreateCall(rsqrtps, val.value)));
6492                 }
6493
6494                 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
6495                 {
6496                         llvm::Function *maxps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_max_ps);
6497
6498                         return RValue<Float4>(V(::builder->CreateCall2(maxps, x.value, y.value)));
6499                 }
6500
6501                 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
6502                 {
6503                         llvm::Function *minps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_min_ps);
6504
6505                         return RValue<Float4>(V(::builder->CreateCall2(minps, x.value, y.value)));
6506                 }
6507
6508                 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
6509                 {
6510                         llvm::Function *roundss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ss);
6511
6512                         Value *undef = V(UndefValue::get(Float4::getType()));
6513                         Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
6514
6515                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(roundss, undef, vector, V(Nucleus::createConstantInt(imm)))), Float::getType(), 0));
6516                 }
6517
6518                 RValue<Float> floorss(RValue<Float> val)
6519                 {
6520                         return roundss(val, 1);
6521                 }
6522
6523                 RValue<Float> ceilss(RValue<Float> val)
6524                 {
6525                         return roundss(val, 2);
6526                 }
6527
6528                 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
6529                 {
6530                         llvm::Function *roundps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ps);
6531
6532                         return RValue<Float4>(V(::builder->CreateCall2(roundps, val.value, V(Nucleus::createConstantInt(imm)))));
6533                 }
6534
6535                 RValue<Float4> floorps(RValue<Float4> val)
6536                 {
6537                         return roundps(val, 1);
6538                 }
6539
6540                 RValue<Float4> ceilps(RValue<Float4> val)
6541                 {
6542                         return roundps(val, 2);
6543                 }
6544
6545                 RValue<Float4> cmpps(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
6546                 {
6547                         llvm::Function *cmpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ps);
6548
6549                         return RValue<Float4>(V(::builder->CreateCall3(cmpps, x.value, y.value, V(Nucleus::createConstantByte(imm)))));
6550                 }
6551
6552                 RValue<Float4> cmpeqps(RValue<Float4> x, RValue<Float4> y)
6553                 {
6554                         return cmpps(x, y, 0);
6555                 }
6556
6557                 RValue<Float4> cmpltps(RValue<Float4> x, RValue<Float4> y)
6558                 {
6559                         return cmpps(x, y, 1);
6560                 }
6561
6562                 RValue<Float4> cmpleps(RValue<Float4> x, RValue<Float4> y)
6563                 {
6564                         return cmpps(x, y, 2);
6565                 }
6566
6567                 RValue<Float4> cmpunordps(RValue<Float4> x, RValue<Float4> y)
6568                 {
6569                         return cmpps(x, y, 3);
6570                 }
6571
6572                 RValue<Float4> cmpneqps(RValue<Float4> x, RValue<Float4> y)
6573                 {
6574                         return cmpps(x, y, 4);
6575                 }
6576
6577                 RValue<Float4> cmpnltps(RValue<Float4> x, RValue<Float4> y)
6578                 {
6579                         return cmpps(x, y, 5);
6580                 }
6581
6582                 RValue<Float4> cmpnleps(RValue<Float4> x, RValue<Float4> y)
6583                 {
6584                         return cmpps(x, y, 6);
6585                 }
6586
6587                 RValue<Float4> cmpordps(RValue<Float4> x, RValue<Float4> y)
6588                 {
6589                         return cmpps(x, y, 7);
6590                 }
6591
6592                 RValue<Float> cmpss(RValue<Float> x, RValue<Float> y, unsigned char imm)
6593                 {
6594                         llvm::Function *cmpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ss);
6595
6596                         Value *vector1 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), x.value, 0);
6597                         Value *vector2 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), y.value, 0);
6598
6599                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(cmpss, vector1, vector2, V(Nucleus::createConstantByte(imm)))), Float::getType(), 0));
6600                 }
6601
6602                 RValue<Float> cmpeqss(RValue<Float> x, RValue<Float> y)
6603                 {
6604                         return cmpss(x, y, 0);
6605                 }
6606
6607                 RValue<Float> cmpltss(RValue<Float> x, RValue<Float> y)
6608                 {
6609                         return cmpss(x, y, 1);
6610                 }
6611
6612                 RValue<Float> cmpless(RValue<Float> x, RValue<Float> y)
6613                 {
6614                         return cmpss(x, y, 2);
6615                 }
6616
6617                 RValue<Float> cmpunordss(RValue<Float> x, RValue<Float> y)
6618                 {
6619                         return cmpss(x, y, 3);
6620                 }
6621
6622                 RValue<Float> cmpneqss(RValue<Float> x, RValue<Float> y)
6623                 {
6624                         return cmpss(x, y, 4);
6625                 }
6626
6627                 RValue<Float> cmpnltss(RValue<Float> x, RValue<Float> y)
6628                 {
6629                         return cmpss(x, y, 5);
6630                 }
6631
6632                 RValue<Float> cmpnless(RValue<Float> x, RValue<Float> y)
6633                 {
6634                         return cmpss(x, y, 6);
6635                 }
6636
6637                 RValue<Float> cmpordss(RValue<Float> x, RValue<Float> y)
6638                 {
6639                         return cmpss(x, y, 7);
6640                 }
6641
6642                 RValue<Int4> pabsd(RValue<Int4> x)
6643                 {
6644                         llvm::Function *pabsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_ssse3_pabs_d_128);
6645
6646                         return RValue<Int4>(V(::builder->CreateCall(pabsd, x.value)));
6647                 }
6648
6649                 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
6650                 {
6651                         llvm::Function *paddsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_w);
6652
6653                         return As<Short4>(V(::builder->CreateCall2(paddsw, As<MMX>(x).value, As<MMX>(y).value)));
6654                 }
6655
6656                 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
6657                 {
6658                         llvm::Function *psubsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_w);
6659
6660                         return As<Short4>(V(::builder->CreateCall2(psubsw, As<MMX>(x).value, As<MMX>(y).value)));
6661                 }
6662
6663                 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
6664                 {
6665                         llvm::Function *paddusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_w);
6666
6667                         return As<UShort4>(V(::builder->CreateCall2(paddusw, As<MMX>(x).value, As<MMX>(y).value)));
6668                 }
6669
6670                 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
6671                 {
6672                         llvm::Function *psubusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_w);
6673
6674                         return As<UShort4>(V(::builder->CreateCall2(psubusw, As<MMX>(x).value, As<MMX>(y).value)));
6675                 }
6676
6677                 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
6678                 {
6679                         llvm::Function *paddsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_b);
6680
6681                         return As<SByte8>(V(::builder->CreateCall2(paddsb, As<MMX>(x).value, As<MMX>(y).value)));
6682                 }
6683
6684                 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
6685                 {
6686                         llvm::Function *psubsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_b);
6687
6688                         return As<SByte8>(V(::builder->CreateCall2(psubsb, As<MMX>(x).value, As<MMX>(y).value)));
6689                 }
6690
6691                 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
6692                 {
6693                         llvm::Function *paddusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_b);
6694
6695                         return As<Byte8>(V(::builder->CreateCall2(paddusb, As<MMX>(x).value, As<MMX>(y).value)));
6696                 }
6697
6698                 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
6699                 {
6700                         llvm::Function *psubusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_b);
6701
6702                         return As<Byte8>(V(::builder->CreateCall2(psubusb, As<MMX>(x).value, As<MMX>(y).value)));
6703                 }
6704
6705                 RValue<Short4> paddw(RValue<Short4> x, RValue<Short4> y)
6706                 {
6707                         llvm::Function *paddw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_w);
6708
6709                         return As<Short4>(V(::builder->CreateCall2(paddw, As<MMX>(x).value, As<MMX>(y).value)));
6710                 }
6711
6712                 RValue<Short4> psubw(RValue<Short4> x, RValue<Short4> y)
6713                 {
6714                         llvm::Function *psubw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_w);
6715
6716                         return As<Short4>(V(::builder->CreateCall2(psubw, As<MMX>(x).value, As<MMX>(y).value)));
6717                 }
6718
6719                 RValue<Short4> pmullw(RValue<Short4> x, RValue<Short4> y)
6720                 {
6721                         llvm::Function *pmullw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmull_w);
6722
6723                         return As<Short4>(V(::builder->CreateCall2(pmullw, As<MMX>(x).value, As<MMX>(y).value)));
6724                 }
6725
6726                 RValue<Short4> pand(RValue<Short4> x, RValue<Short4> y)
6727                 {
6728                         llvm::Function *pand = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pand);
6729
6730                         return As<Short4>(V(::builder->CreateCall2(pand, As<MMX>(x).value, As<MMX>(y).value)));
6731                 }
6732
6733                 RValue<Short4> por(RValue<Short4> x, RValue<Short4> y)
6734                 {
6735                         llvm::Function *por = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_por);
6736
6737                         return As<Short4>(V(::builder->CreateCall2(por, As<MMX>(x).value, As<MMX>(y).value)));
6738                 }
6739
6740                 RValue<Short4> pxor(RValue<Short4> x, RValue<Short4> y)
6741                 {
6742                         llvm::Function *pxor = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pxor);
6743
6744                         return As<Short4>(V(::builder->CreateCall2(pxor, As<MMX>(x).value, As<MMX>(y).value)));
6745                 }
6746
6747                 RValue<Short4> pshufw(RValue<Short4> x, unsigned char y)
6748                 {
6749                         llvm::Function *pshufw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_pshuf_w);
6750
6751                         return As<Short4>(V(::builder->CreateCall2(pshufw, As<MMX>(x).value, V(Nucleus::createConstantByte(y)))));
6752                 }
6753
6754                 RValue<Int2> punpcklwd(RValue<Short4> x, RValue<Short4> y)
6755                 {
6756                         llvm::Function *punpcklwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklwd);
6757
6758                         return As<Int2>(V(::builder->CreateCall2(punpcklwd, As<MMX>(x).value, As<MMX>(y).value)));
6759                 }
6760
6761                 RValue<Int2> punpckhwd(RValue<Short4> x, RValue<Short4> y)
6762                 {
6763                         llvm::Function *punpckhwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhwd);
6764
6765                         return As<Int2>(V(::builder->CreateCall2(punpckhwd, As<MMX>(x).value, As<MMX>(y).value)));
6766                 }
6767
6768                 RValue<Short4> pinsrw(RValue<Short4> x, RValue<Int> y, unsigned int i)
6769                 {
6770                         llvm::Function *pinsrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pinsr_w);
6771
6772                         return As<Short4>(V(::builder->CreateCall3(pinsrw, As<MMX>(x).value, y.value, V(Nucleus::createConstantInt(i)))));
6773                 }
6774
6775                 RValue<Int> pextrw(RValue<Short4> x, unsigned int i)
6776                 {
6777                         llvm::Function *pextrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pextr_w);
6778
6779                         return RValue<Int>(V(::builder->CreateCall2(pextrw, As<MMX>(x).value, V(Nucleus::createConstantInt(i)))));
6780                 }
6781
6782                 RValue<Short4> punpckldq(RValue<Int2> x, RValue<Int2> y)
6783                 {
6784                         llvm::Function *punpckldq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckldq);
6785
6786                         return As<Short4>(V(::builder->CreateCall2(punpckldq, As<MMX>(x).value, As<MMX>(y).value)));
6787                 }
6788
6789                 RValue<Short4> punpckhdq(RValue<Int2> x, RValue<Int2> y)
6790                 {
6791                         llvm::Function *punpckhdq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhdq);
6792
6793                         return As<Short4>(V(::builder->CreateCall2(punpckhdq, As<MMX>(x).value, As<MMX>(y).value)));
6794                 }
6795
6796                 RValue<Short4> punpcklbw(RValue<Byte8> x, RValue<Byte8> y)
6797                 {
6798                         llvm::Function *punpcklbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklbw);
6799
6800                         return As<Short4>(V(::builder->CreateCall2(punpcklbw, As<MMX>(x).value, As<MMX>(y).value)));
6801                 }
6802
6803                 RValue<Short4> punpckhbw(RValue<Byte8> x, RValue<Byte8> y)
6804                 {
6805                         llvm::Function *punpckhbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhbw);
6806
6807                         return As<Short4>(V(::builder->CreateCall2(punpckhbw, As<MMX>(x).value, As<MMX>(y).value)));
6808                 }
6809
6810                 RValue<Byte8> paddb(RValue<Byte8> x, RValue<Byte8> y)
6811                 {
6812                         llvm::Function *paddb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_b);
6813
6814                         return As<Byte8>(V(::builder->CreateCall2(paddb, As<MMX>(x).value, As<MMX>(y).value)));
6815                 }
6816
6817                 RValue<Byte8> psubb(RValue<Byte8> x, RValue<Byte8> y)
6818                 {
6819                         llvm::Function *psubb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_b);
6820
6821                         return As<Byte8>(V(::builder->CreateCall2(psubb, As<MMX>(x).value, As<MMX>(y).value)));
6822                 }
6823
6824                 RValue<Int2> paddd(RValue<Int2> x, RValue<Int2> y)
6825                 {
6826                         llvm::Function *paddd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_d);
6827
6828                         return As<Int2>(V(::builder->CreateCall2(paddd, As<MMX>(x).value, As<MMX>(y).value)));
6829                 }
6830
6831                 RValue<Int2> psubd(RValue<Int2> x, RValue<Int2> y)
6832                 {
6833                         llvm::Function *psubd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_d);
6834
6835                         return As<Int2>(V(::builder->CreateCall2(psubd, As<MMX>(x).value, As<MMX>(y).value)));
6836                 }
6837
6838                 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
6839                 {
6840                         llvm::Function *pavgw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pavg_w);
6841
6842                         return As<UShort4>(V(::builder->CreateCall2(pavgw, As<MMX>(x).value, As<MMX>(y).value)));
6843                 }
6844
6845                 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
6846                 {
6847                         llvm::Function *pmaxsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmaxs_w);
6848
6849                         return As<Short4>(V(::builder->CreateCall2(pmaxsw, As<MMX>(x).value, As<MMX>(y).value)));
6850                 }
6851
6852                 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
6853                 {
6854                         llvm::Function *pminsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmins_w);
6855
6856                         return As<Short4>(V(::builder->CreateCall2(pminsw, As<MMX>(x).value, As<MMX>(y).value)));
6857                 }
6858
6859                 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
6860                 {
6861                         llvm::Function *pcmpgtw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_w);
6862
6863                         return As<Short4>(V(::builder->CreateCall2(pcmpgtw, As<MMX>(x).value, As<MMX>(y).value)));
6864                 }
6865
6866                 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
6867                 {
6868                         llvm::Function *pcmpeqw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_w);
6869
6870                         return As<Short4>(V(::builder->CreateCall2(pcmpeqw, As<MMX>(x).value, As<MMX>(y).value)));
6871                 }
6872
6873                 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
6874                 {
6875                         llvm::Function *pcmpgtb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_b);
6876
6877                         return As<Byte8>(V(::builder->CreateCall2(pcmpgtb, As<MMX>(x).value, As<MMX>(y).value)));
6878                 }
6879
6880                 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
6881                 {
6882                         llvm::Function *pcmpeqb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_b);
6883
6884                         return As<Byte8>(V(::builder->CreateCall2(pcmpeqb, As<MMX>(x).value, As<MMX>(y).value)));
6885                 }
6886
6887                 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
6888                 {
6889                         llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packssdw);
6890
6891                         return As<Short4>(V(::builder->CreateCall2(packssdw, As<MMX>(x).value, As<MMX>(y).value)));
6892                 }
6893
6894                 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
6895                 {
6896                         if(CPUID::supportsSSE2())
6897                         {
6898                                 llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_packssdw_128);
6899
6900                                 return RValue<Short8>(V(::builder->CreateCall2(packssdw, x.value, y.value)));
6901                         }
6902                         else
6903                         {
6904                                 Int2 loX = Int2(x);
6905                                 Int2 hiX = Int2(Swizzle(x, 0xEE));
6906
6907                                 Int2 loY = Int2(y);
6908                                 Int2 hiY = Int2(Swizzle(y, 0xEE));
6909
6910                                 Short4 lo = x86::packssdw(loX, hiX);
6911                                 Short4 hi = x86::packssdw(loY, hiY);
6912
6913                                 return Short8(lo, hi);
6914                         }
6915                 }
6916
6917                 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
6918                 {
6919                         llvm::Function *packsswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packsswb);
6920
6921                         return As<SByte8>(V(::builder->CreateCall2(packsswb, As<MMX>(x).value, As<MMX>(y).value)));
6922                 }
6923
6924                 RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y)
6925                 {
6926                         llvm::Function *packuswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packuswb);
6927
6928                         return As<Byte8>(V(::builder->CreateCall2(packuswb, As<MMX>(x).value, As<MMX>(y).value)));
6929                 }
6930
6931                 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
6932                 {
6933                         if(CPUID::supportsSSE4_1())
6934                         {
6935                                 llvm::Function *packusdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_packusdw);
6936
6937                                 return RValue<UShort8>(V(::builder->CreateCall2(packusdw, x.value, y.value)));
6938                         }
6939                         else
6940                         {
6941                                 RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
6942                                 RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
6943
6944                                 return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
6945                         }
6946                 }
6947
6948                 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
6949                 {
6950                         llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_w);
6951
6952                         return As<UShort4>(V(::builder->CreateCall2(psrlw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
6953                 }
6954
6955                 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
6956                 {
6957                         llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_w);
6958
6959                         return RValue<UShort8>(V(::builder->CreateCall2(psrlw, x.value, V(Nucleus::createConstantInt(y)))));
6960                 }
6961
6962                 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
6963                 {
6964                         llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_w);
6965
6966                         return As<Short4>(V(::builder->CreateCall2(psraw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
6967                 }
6968
6969                 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
6970                 {
6971                         llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_w);
6972
6973                         return RValue<Short8>(V(::builder->CreateCall2(psraw, x.value, V(Nucleus::createConstantInt(y)))));
6974                 }
6975
6976                 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
6977                 {
6978                         llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_w);
6979
6980                         return As<Short4>(V(::builder->CreateCall2(psllw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
6981                 }
6982
6983                 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
6984                 {
6985                         llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_w);
6986
6987                         return RValue<Short8>(V(::builder->CreateCall2(psllw, x.value, V(Nucleus::createConstantInt(y)))));
6988                 }
6989
6990                 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
6991                 {
6992                         llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_d);
6993
6994                         return As<Int2>(V(::builder->CreateCall2(pslld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
6995                 }
6996
6997                 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
6998                 {
6999                         if(CPUID::supportsSSE2())
7000                         {
7001                                 llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_d);
7002
7003                                 return RValue<Int4>(V(::builder->CreateCall2(pslld, x.value, V(Nucleus::createConstantInt(y)))));
7004                         }
7005                         else
7006                         {
7007                                 Int2 lo = Int2(x);
7008                                 Int2 hi = Int2(Swizzle(x, 0xEE));
7009
7010                                 lo = x86::pslld(lo, y);
7011                                 hi = x86::pslld(hi, y);
7012
7013                                 return Int4(lo, hi);
7014                         }
7015                 }
7016
7017                 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
7018                 {
7019                         llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_d);
7020
7021                         return As<Int2>(V(::builder->CreateCall2(psrad, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7022                 }
7023
7024                 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
7025                 {
7026                         if(CPUID::supportsSSE2())
7027                         {
7028                                 llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_d);
7029
7030                                 return RValue<Int4>(V(::builder->CreateCall2(psrad, x.value, V(Nucleus::createConstantInt(y)))));
7031                         }
7032                         else
7033                         {
7034                                 Int2 lo = Int2(x);
7035                                 Int2 hi = Int2(Swizzle(x, 0xEE));
7036
7037                                 lo = x86::psrad(lo, y);
7038                                 hi = x86::psrad(hi, y);
7039
7040                                 return Int4(lo, hi);
7041                         }
7042                 }
7043
7044                 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
7045                 {
7046                         llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_d);
7047
7048                         return As<UInt2>(V(::builder->CreateCall2(psrld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7049                 }
7050
7051                 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
7052                 {
7053                         if(CPUID::supportsSSE2())
7054                         {
7055                                 llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_d);
7056
7057                                 return RValue<UInt4>(V(::builder->CreateCall2(psrld, x.value, V(Nucleus::createConstantInt(y)))));
7058                         }
7059                         else
7060                         {
7061                                 UInt2 lo = As<UInt2>(Int2(As<Int4>(x)));
7062                                 UInt2 hi = As<UInt2>(Int2(Swizzle(As<Int4>(x), 0xEE)));
7063
7064                                 lo = x86::psrld(lo, y);
7065                                 hi = x86::psrld(hi, y);
7066
7067                                 return UInt4(lo, hi);
7068                         }
7069                 }
7070
7071                 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
7072                 {
7073                         llvm::Function *pmaxsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxsd);
7074
7075                         return RValue<Int4>(V(::builder->CreateCall2(pmaxsd, x.value, y.value)));
7076                 }
7077
7078                 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
7079                 {
7080                         llvm::Function *pminsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminsd);
7081
7082                         return RValue<Int4>(V(::builder->CreateCall2(pminsd, x.value, y.value)));
7083                 }
7084
7085                 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
7086                 {
7087                         llvm::Function *pmaxud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxud);
7088
7089                         return RValue<UInt4>(V(::builder->CreateCall2(pmaxud, x.value, y.value)));
7090                 }
7091
7092                 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
7093                 {
7094                         llvm::Function *pminud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminud);
7095
7096                         return RValue<UInt4>(V(::builder->CreateCall2(pminud, x.value, y.value)));
7097                 }
7098
7099                 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
7100                 {
7101                         llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulh_w);
7102
7103                         return As<Short4>(V(::builder->CreateCall2(pmulhw, As<MMX>(x).value, As<MMX>(y).value)));
7104                 }
7105
7106                 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
7107                 {
7108                         llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulhu_w);
7109
7110                         return As<UShort4>(V(::builder->CreateCall2(pmulhuw, As<MMX>(x).value, As<MMX>(y).value)));
7111                 }
7112
7113                 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
7114                 {
7115                         llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmadd_wd);
7116
7117                         return As<Int2>(V(::builder->CreateCall2(pmaddwd, As<MMX>(x).value, As<MMX>(y).value)));
7118                 }
7119
7120                 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
7121                 {
7122                         llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulh_w);
7123
7124                         return RValue<Short8>(V(::builder->CreateCall2(pmulhw, x.value, y.value)));
7125                 }
7126
7127                 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
7128                 {
7129                         llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulhu_w);
7130
7131                         return RValue<UShort8>(V(::builder->CreateCall2(pmulhuw, x.value, y.value)));
7132                 }
7133
7134                 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
7135                 {
7136                         llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmadd_wd);
7137
7138                         return RValue<Int4>(V(::builder->CreateCall2(pmaddwd, x.value, y.value)));
7139                 }
7140
7141                 RValue<Int> movmskps(RValue<Float4> x)
7142                 {
7143                         llvm::Function *movmskps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_movmsk_ps);
7144
7145                         return RValue<Int>(V(::builder->CreateCall(movmskps, x.value)));
7146                 }
7147
7148                 RValue<Int> pmovmskb(RValue<Byte8> x)
7149                 {
7150                         llvm::Function *pmovmskb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmovmskb);
7151
7152                         return RValue<Int>(V(::builder->CreateCall(pmovmskb, As<MMX>(x).value)));
7153                 }
7154
7155                 //RValue<Int2> movd(RValue<Pointer<Int>> x)
7156                 //{
7157                 //      Value *element = Nucleus::createLoad(x.value);
7158
7159                 ////    Value *int2 = UndefValue::get(Int2::getType());
7160                 ////    int2 = Nucleus::createInsertElement(int2, element, ConstantInt::get(Int::getType(), 0));
7161
7162                 //      Value *int2 = Nucleus::createBitCast(Nucleus::createZExt(element, Long::getType()), Int2::getType());
7163
7164                 //      return RValue<Int2>(int2);
7165                 //}
7166
7167                 //RValue<Int2> movdq2q(RValue<Int4> x)
7168                 //{
7169                 //      Value *long2 = Nucleus::createBitCast(x.value, T(VectorType::get(Long::getType(), 2)));
7170                 //      Value *element = Nucleus::createExtractElement(long2, ConstantInt::get(Int::getType(), 0));
7171
7172                 //      return RValue<Int2>(Nucleus::createBitCast(element, Int2::getType()));
7173                 //}
7174
7175                 RValue<Int4> pmovzxbd(RValue<Int4> x)
7176                 {
7177                         llvm::Function *pmovzxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxbd);
7178
7179                         return RValue<Int4>(V(::builder->CreateCall(pmovzxbd, Nucleus::createBitCast(x.value, Byte16::getType()))));
7180                 }
7181
7182                 RValue<Int4> pmovsxbd(RValue<Int4> x)
7183                 {
7184                         llvm::Function *pmovsxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxbd);
7185
7186                         return RValue<Int4>(V(::builder->CreateCall(pmovsxbd, Nucleus::createBitCast(x.value, SByte16::getType()))));
7187                 }
7188
7189                 RValue<Int4> pmovzxwd(RValue<Int4> x)
7190                 {
7191                         llvm::Function *pmovzxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxwd);
7192
7193                         return RValue<Int4>(V(::builder->CreateCall(pmovzxwd, Nucleus::createBitCast(x.value, UShort8::getType()))));
7194                 }
7195
7196                 RValue<Int4> pmovsxwd(RValue<Int4> x)
7197                 {
7198                         llvm::Function *pmovsxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxwd);
7199
7200                         return RValue<Int4>(V(::builder->CreateCall(pmovsxwd, Nucleus::createBitCast(x.value, Short8::getType()))));
7201                 }
7202
7203                 void emms()
7204                 {
7205                         llvm::Function *emms = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_emms);
7206
7207                         V(::builder->CreateCall(emms));
7208                 }
7209         }
7210 }