OSDN Git Service

624af832f8f719c159556db77351f30b51f90f6f
[android-x86/external-swiftshader.git] / src / Reactor / LLVMReactor.cpp
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "Reactor.hpp"
16
17 #include "x86.hpp"
18 #include "CPUID.hpp"
19 #include "Thread.hpp"
20 #include "ExecutableMemory.hpp"
21 #include "MutexLock.hpp"
22
23 #undef min
24 #undef max
25
26 #if REACTOR_LLVM_VERSION < 7
27         #include "llvm/Analysis/LoopPass.h"
28         #include "llvm/Constants.h"
29         #include "llvm/Function.h"
30         #include "llvm/GlobalVariable.h"
31         #include "llvm/Intrinsics.h"
32         #include "llvm/LLVMContext.h"
33         #include "llvm/Module.h"
34         #include "llvm/PassManager.h"
35         #include "llvm/Support/IRBuilder.h"
36         #include "llvm/Support/TargetSelect.h"
37         #include "llvm/Target/TargetData.h"
38         #include "llvm/Target/TargetOptions.h"
39         #include "llvm/Transforms/Scalar.h"
40         #include "../lib/ExecutionEngine/JIT/JIT.h"
41
42         #include "LLVMRoutine.hpp"
43         #include "LLVMRoutineManager.hpp"
44
45         #define ARGS(...) __VA_ARGS__
46 #else
47         #include "llvm/Analysis/LoopPass.h"
48         #include "llvm/ExecutionEngine/ExecutionEngine.h"
49         #include "llvm/ExecutionEngine/JITSymbol.h"
50         #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
51         #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
52         #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
53         #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
54         #include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
55         #include "llvm/ExecutionEngine/SectionMemoryManager.h"
56         #include "llvm/IR/Constants.h"
57         #include "llvm/IR/DataLayout.h"
58         #include "llvm/IR/Function.h"
59         #include "llvm/IR/GlobalVariable.h"
60         #include "llvm/IR/IRBuilder.h"
61         #include "llvm/IR/Intrinsics.h"
62         #include "llvm/IR/LLVMContext.h"
63         #include "llvm/IR/LegacyPassManager.h"
64         #include "llvm/IR/Mangler.h"
65         #include "llvm/IR/Module.h"
66         #include "llvm/Support/Error.h"
67         #include "llvm/Support/TargetSelect.h"
68         #include "llvm/Target/TargetOptions.h"
69         #include "llvm/Transforms/InstCombine/InstCombine.h"
70         #include "llvm/Transforms/Scalar.h"
71         #include "llvm/Transforms/Scalar/GVN.h"
72
73         #include "LLVMRoutine.hpp"
74
75         #define ARGS(...) {__VA_ARGS__}
76         #define CreateCall2 CreateCall
77         #define CreateCall3 CreateCall
78
79         #include <unordered_map>
80 #endif
81
82 #include <fstream>
83 #include <numeric>
84 #include <thread>
85
86 #if defined(__i386__) || defined(__x86_64__)
87 #include <xmmintrin.h>
88 #endif
89
90 #include <math.h>
91
92 #if defined(__x86_64__) && defined(_WIN32)
// Stub with C linkage that must never actually run: it asserts as soon as it
// is called.
// NOTE(review): presumably this only exists to satisfy a symbol referenced by
// the legacy LLVM x86 JIT on 64-bit Windows - confirm against the linker setup.
extern "C" void X86CompilationCallback()
{
        assert(false);   // UNIMPLEMENTED
}
97 #endif
98
99 #if REACTOR_LLVM_VERSION < 7
100 namespace llvm
101 {
102         extern bool JITEmitDebugInfo;
103 }
104 #endif
105
106 namespace rr
107 {
108         class LLVMReactorJIT;
109 }
110
111 namespace
112 {
        // JIT back end. The class is only forward-declared at this point; its
        // definition follows in namespace rr.
        rr::LLVMReactorJIT *reactorJIT = nullptr;
        // Instruction builder that the lowering helpers in this namespace emit
        // their IR through.
        llvm::IRBuilder<> *builder = nullptr;
        // LLVM context in which LLVMReactorJIT::startSession() creates modules.
        llvm::LLVMContext *context = nullptr;
        // Module of the current session: set by startSession() and reset to
        // nullptr by endSession().
        llvm::Module *module = nullptr;
        // Function currently being built; reset to nullptr by endSession().
        llvm::Function *function = nullptr;

        // NOTE(review): presumably serializes code generation, since the state
        // above is global - its lock sites are not visible in this part of the file.
        rr::MutexLock codegenMutex;
120
121 #ifdef ENABLE_RR_PRINT
122         std::string replace(std::string str, const std::string& substr, const std::string& replacement)
123         {
124                 size_t pos = 0;
125                 while((pos = str.find(substr, pos)) != std::string::npos) {
126                         str.replace(pos, substr.length(), replacement);
127                         pos += replacement.length();
128                 }
129                 return str;
130         }
131 #endif // ENABLE_RR_PRINT
132
133 #if REACTOR_LLVM_VERSION >= 7
134         llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
135         {
136                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
137
138                 llvm::VectorType *extTy =
139                         llvm::VectorType::getExtendedElementVectorType(ty);
140                 x = ::builder->CreateZExt(x, extTy);
141                 y = ::builder->CreateZExt(y, extTy);
142
143                 // (x + y + 1) >> 1
144                 llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
145                 llvm::Value *res = ::builder->CreateAdd(x, y);
146                 res = ::builder->CreateAdd(res, one);
147                 res = ::builder->CreateLShr(res, one);
148                 return ::builder->CreateTrunc(res, ty);
149         }
150
151         llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
152                                   llvm::ICmpInst::Predicate pred)
153         {
154                 return ::builder->CreateSelect(::builder->CreateICmp(pred, x, y), x, y);
155         }
156
157         llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
158                                llvm::Value *y, llvm::Type *dstTy)
159         {
160                 return ::builder->CreateSExt(::builder->CreateICmp(pred, x, y), dstTy, "");
161         }
162
163 #if defined(__i386__) || defined(__x86_64__)
164         llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
165         {
166                 llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
167                 llvm::VectorType *dstTy = llvm::cast<llvm::VectorType>(dstType);
168
169                 llvm::Value *undef = llvm::UndefValue::get(srcTy);
170                 llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
171                 std::iota(mask.begin(), mask.end(), 0);
172                 llvm::Value *v = ::builder->CreateShuffleVector(op, undef, mask);
173
174                 return sext ? ::builder->CreateSExt(v, dstTy)
175                             : ::builder->CreateZExt(v, dstTy);
176         }
177
178         llvm::Value *lowerPABS(llvm::Value *v)
179         {
180                 llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
181                 llvm::Value *cmp = ::builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
182                 llvm::Value *neg = ::builder->CreateNeg(v);
183                 return ::builder->CreateSelect(cmp, v, neg);
184         }
185 #endif  // defined(__i386__) || defined(__x86_64__)
186
187 #if !defined(__i386__) && !defined(__x86_64__)
188         llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
189                                    llvm::FCmpInst::Predicate pred)
190         {
191                 return ::builder->CreateSelect(::builder->CreateFCmp(pred, x, y), x, y);
192         }
193
194         llvm::Value *lowerRound(llvm::Value *x)
195         {
196                 llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
197                         ::module, llvm::Intrinsic::nearbyint, {x->getType()});
198                 return ::builder->CreateCall(nearbyint, ARGS(x));
199         }
200
201         llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
202         {
203                 return ::builder->CreateFPToSI(lowerRound(x), ty);
204         }
205
206         llvm::Value *lowerFloor(llvm::Value *x)
207         {
208                 llvm::Function *floor = llvm::Intrinsic::getDeclaration(
209                         ::module, llvm::Intrinsic::floor, {x->getType()});
210                 return ::builder->CreateCall(floor, ARGS(x));
211         }
212
213         llvm::Value *lowerTrunc(llvm::Value *x)
214         {
215                 llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
216                         ::module, llvm::Intrinsic::trunc, {x->getType()});
217                 return ::builder->CreateCall(trunc, ARGS(x));
218         }
219
220         // Packed add/sub saturatation
221         llvm::Value *lowerPSAT(llvm::Value *x, llvm::Value *y, bool isAdd, bool isSigned)
222         {
223                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
224                 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
225
226                 unsigned numBits = ty->getScalarSizeInBits();
227
228                 llvm::Value *max, *min, *extX, *extY;
229                 if (isSigned)
230                 {
231                         max = llvm::ConstantInt::get(extTy, (1LL << (numBits - 1)) - 1, true);
232                         min = llvm::ConstantInt::get(extTy, (-1LL << (numBits - 1)), true);
233                         extX = ::builder->CreateSExt(x, extTy);
234                         extY = ::builder->CreateSExt(y, extTy);
235                 }
236                 else
237                 {
238                         assert(numBits <= 64);
239                         uint64_t maxVal = (numBits == 64) ? ~0ULL : (1ULL << numBits) - 1;
240                         max = llvm::ConstantInt::get(extTy, maxVal, false);
241                         min = llvm::ConstantInt::get(extTy, 0, false);
242                         extX = ::builder->CreateZExt(x, extTy);
243                         extY = ::builder->CreateZExt(y, extTy);
244                 }
245
246                 llvm::Value *res = isAdd ? ::builder->CreateAdd(extX, extY)
247                                          : ::builder->CreateSub(extX, extY);
248
249                 res = lowerPMINMAX(res, min, llvm::ICmpInst::ICMP_SGT);
250                 res = lowerPMINMAX(res, max, llvm::ICmpInst::ICMP_SLT);
251
252                 return ::builder->CreateTrunc(res, ty);
253         }
254
255         llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
256         {
257                 return lowerPSAT(x, y, true, false);
258         }
259
260         llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
261         {
262                 return lowerPSAT(x, y, true, true);
263         }
264
265         llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
266         {
267                 return lowerPSAT(x, y, false, false);
268         }
269
270         llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
271         {
272                 return lowerPSAT(x, y, false, true);
273         }
274
275         llvm::Value *lowerSQRT(llvm::Value *x)
276         {
277                 llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
278                         ::module, llvm::Intrinsic::sqrt, {x->getType()});
279                 return ::builder->CreateCall(sqrt, ARGS(x));
280         }
281
282         llvm::Value *lowerRCP(llvm::Value *x)
283         {
284                 llvm::Type *ty = x->getType();
285                 llvm::Constant *one;
286                 if (llvm::VectorType *vectorTy = llvm::dyn_cast<llvm::VectorType>(ty))
287                 {
288                         one = llvm::ConstantVector::getSplat(
289                                 vectorTy->getNumElements(),
290                                 llvm::ConstantFP::get(vectorTy->getElementType(), 1));
291                 }
292                 else
293                 {
294                         one = llvm::ConstantFP::get(ty, 1);
295                 }
296                 return ::builder->CreateFDiv(one, x);
297         }
298
299         llvm::Value *lowerRSQRT(llvm::Value *x)
300         {
301                 return lowerRCP(lowerSQRT(x));
302         }
303
304         llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
305         {
306                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
307                 llvm::Value *y = llvm::ConstantVector::getSplat(
308                         ty->getNumElements(),
309                         llvm::ConstantInt::get(ty->getElementType(), scalarY));
310                 return ::builder->CreateShl(x, y);
311         }
312
313         llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
314         {
315                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
316                 llvm::Value *y = llvm::ConstantVector::getSplat(
317                         ty->getNumElements(),
318                         llvm::ConstantInt::get(ty->getElementType(), scalarY));
319                 return ::builder->CreateAShr(x, y);
320         }
321
322         llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
323         {
324                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
325                 llvm::Value *y = llvm::ConstantVector::getSplat(
326                         ty->getNumElements(),
327                         llvm::ConstantInt::get(ty->getElementType(), scalarY));
328                 return ::builder->CreateLShr(x, y);
329         }
330
331         llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
332         {
333                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
334                 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
335
336                 llvm::Value *extX = ::builder->CreateSExt(x, extTy);
337                 llvm::Value *extY = ::builder->CreateSExt(y, extTy);
338                 llvm::Value *mult = ::builder->CreateMul(extX, extY);
339
340                 llvm::Value *undef = llvm::UndefValue::get(extTy);
341
342                 llvm::SmallVector<uint32_t, 16> evenIdx;
343                 llvm::SmallVector<uint32_t, 16> oddIdx;
344                 for (uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
345                 {
346                         evenIdx.push_back(i);
347                         oddIdx.push_back(i + 1);
348                 }
349
350                 llvm::Value *lhs = ::builder->CreateShuffleVector(mult, undef, evenIdx);
351                 llvm::Value *rhs = ::builder->CreateShuffleVector(mult, undef, oddIdx);
352                 return ::builder->CreateAdd(lhs, rhs);
353         }
354
355         llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
356         {
357                 llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(x->getType());
358                 llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
359
360                 llvm::IntegerType *dstElemTy =
361                         llvm::cast<llvm::IntegerType>(dstTy->getElementType());
362
363                 uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
364                 assert(truncNumBits < 64 && "shift 64 must be handled separately");
365                 llvm::Constant *max, *min;
366                 if (isSigned)
367                 {
368                         max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
369                         min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
370                 }
371                 else
372                 {
373                         max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
374                         min = llvm::ConstantInt::get(srcTy, 0, false);
375                 }
376
377                 x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
378                 x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
379                 y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
380                 y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
381
382                 x = ::builder->CreateTrunc(x, dstTy);
383                 y = ::builder->CreateTrunc(y, dstTy);
384
385                 llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
386                 std::iota(index.begin(), index.end(), 0);
387
388                 return ::builder->CreateShuffleVector(x, y, index);
389         }
390
391         llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
392         {
393                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
394                 llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
395                 llvm::Value *cmp = ::builder->CreateICmpSLT(x, zero);
396
397                 llvm::Value *ret = ::builder->CreateZExt(
398                         ::builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
399                 for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
400                 {
401                         llvm::Value *elem = ::builder->CreateZExt(
402                                 ::builder->CreateExtractElement(cmp, i), retTy);
403                         ret = ::builder->CreateOr(ret, ::builder->CreateShl(elem, i));
404                 }
405                 return ret;
406         }
407
408         llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
409         {
410                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
411                 llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
412                 llvm::Value *cmp = ::builder->CreateFCmpULT(x, zero);
413
414                 llvm::Value *ret = ::builder->CreateZExt(
415                         ::builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
416                 for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
417                 {
418                         llvm::Value *elem = ::builder->CreateZExt(
419                                 ::builder->CreateExtractElement(cmp, i), retTy);
420                         ret = ::builder->CreateOr(ret, ::builder->CreateShl(elem, i));
421                 }
422                 return ret;
423         }
424 #endif  // !defined(__i386__) && !defined(__x86_64__)
425 #endif  // REACTOR_LLVM_VERSION >= 7
426
427         llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
428         {
429                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
430                 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
431
432                 llvm::Value *extX, *extY;
433                 if (sext)
434                 {
435                         extX = ::builder->CreateSExt(x, extTy);
436                         extY = ::builder->CreateSExt(y, extTy);
437                 }
438                 else
439                 {
440                         extX = ::builder->CreateZExt(x, extTy);
441                         extY = ::builder->CreateZExt(y, extTy);
442                 }
443
444                 llvm::Value *mult = ::builder->CreateMul(extX, extY);
445
446                 llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
447                 llvm::Value *mulh = ::builder->CreateAShr(mult, intTy->getBitWidth());
448                 return ::builder->CreateTrunc(mulh, ty);
449         }
450 }
451
452 namespace rr
453 {
454 #if REACTOR_LLVM_VERSION < 7
455         class LLVMReactorJIT
456         {
457         private:
458                 std::string arch;
459                 llvm::SmallVector<std::string, 16> mattrs;
460                 llvm::ExecutionEngine *executionEngine;
461                 LLVMRoutineManager *routineManager;
462
463         public:
464                 LLVMReactorJIT(const std::string &arch_,
465                                const llvm::SmallVectorImpl<std::string> &mattrs_) :
466                         arch(arch_),
467                         mattrs(mattrs_.begin(), mattrs_.end()),
468                         executionEngine(nullptr),
469                         routineManager(nullptr)
470                 {
471                 }
472
473                 void startSession()
474                 {
475                         std::string error;
476
477                         ::module = new llvm::Module("", *::context);
478
479                         routineManager = new LLVMRoutineManager();
480
481                         llvm::TargetMachine *targetMachine =
482                                 llvm::EngineBuilder::selectTarget(
483                                         ::module, arch, "", mattrs, llvm::Reloc::Default,
484                                         llvm::CodeModel::JITDefault, &error);
485
486                         executionEngine = llvm::JIT::createJIT(
487                                 ::module, &error, routineManager, llvm::CodeGenOpt::Aggressive,
488                                 true, targetMachine);
489                 }
490
491                 void endSession()
492                 {
493                         delete executionEngine;
494                         executionEngine = nullptr;
495                         routineManager = nullptr;
496
497                         ::function = nullptr;
498                         ::module = nullptr;
499                 }
500
501                 LLVMRoutine *acquireRoutine(llvm::Function *func)
502                 {
503                         void *entry = executionEngine->getPointerToFunction(::function);
504                         return routineManager->acquireRoutine(entry);
505                 }
506
507                 void optimize(llvm::Module *module)
508                 {
509                         static llvm::PassManager *passManager = nullptr;
510
511                         if(!passManager)
512                         {
513                                 passManager = new llvm::PassManager();
514
515                                 passManager->add(new llvm::TargetData(*executionEngine->getTargetData()));
516                                 passManager->add(llvm::createScalarReplAggregatesPass());
517
518                                 for(int pass = 0; pass < 10 && optimization[pass] != Disabled; pass++)
519                                 {
520                                         switch(optimization[pass])
521                                         {
522                                         case Disabled:                                                                       break;
523                                         case CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
524                                         case LICM:                 passManager->add(llvm::createLICMPass());                 break;
525                                         case AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
526                                         case GVN:                  passManager->add(llvm::createGVNPass());                  break;
527                                         case InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
528                                         case Reassociate:          passManager->add(llvm::createReassociatePass());          break;
529                                         case DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
530                                         case SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
531                                         case ScalarReplAggregates: passManager->add(llvm::createScalarReplAggregatesPass()); break;
532                                         default:
533                                                 assert(false);
534                                         }
535                                 }
536                         }
537
538                         passManager->run(*::module);
539                 }
540         };
541 #else
542         class ExternalFunctionSymbolResolver
543         {
544         private:
545                 using FunctionMap = std::unordered_map<std::string, void *>;
546                 FunctionMap func_;
547
548         public:
549                 ExternalFunctionSymbolResolver()
550                 {
551                         func_.emplace("floorf", reinterpret_cast<void*>(floorf));
552                         func_.emplace("nearbyintf", reinterpret_cast<void*>(nearbyintf));
553                         func_.emplace("truncf", reinterpret_cast<void*>(truncf));
554                         func_.emplace("printf", reinterpret_cast<void*>(printf));
555                         func_.emplace("puts", reinterpret_cast<void*>(puts));
556                         func_.emplace("fmodf", reinterpret_cast<void*>(fmodf));
557                 }
558
559                 void *findSymbol(const std::string &name) const
560                 {
561                         // Trim off any underscores from the start of the symbol. LLVM likes
562                         // to append these on macOS.
563                         const char* trimmed = name.c_str();
564                         while (trimmed[0] == '_') { trimmed++; }
565
566                         FunctionMap::const_iterator it = func_.find(trimmed);
567                         assert(it != func_.end()); // Missing functions will likely make the module fail in exciting non-obvious ways.
568                         return it->second;
569                 }
570         };
571
572         class LLVMReactorJIT
573         {
574         private:
575                 using ObjLayer = llvm::orc::RTDyldObjectLinkingLayer;
576                 using CompileLayer = llvm::orc::IRCompileLayer<ObjLayer, llvm::orc::SimpleCompiler>;
577
578                 llvm::orc::ExecutionSession session;
579                 ExternalFunctionSymbolResolver externalSymbolResolver;
580                 std::shared_ptr<llvm::orc::SymbolResolver> resolver;
581                 std::unique_ptr<llvm::TargetMachine> targetMachine;
582                 const llvm::DataLayout dataLayout;
583                 ObjLayer objLayer;
584                 CompileLayer compileLayer;
585                 size_t emittedFunctionsNum;
586
587         public:
588                 LLVMReactorJIT(const char *arch, const llvm::SmallVectorImpl<std::string>& mattrs,
589                                            const llvm::TargetOptions &targetOpts):
590                         resolver(createLegacyLookupResolver(
591                                 session,
592                                 [this](const std::string &name) {
593                                         void *func = externalSymbolResolver.findSymbol(name);
594                                         if (func != nullptr)
595                                         {
596                                                 return llvm::JITSymbol(
597                                                         reinterpret_cast<uintptr_t>(func), llvm::JITSymbolFlags::Absolute);
598                                         }
599
600                                         return objLayer.findSymbol(name, true);
601                                 },
602                                 [](llvm::Error err) {
603                                         if (err)
604                                         {
605                                                 // TODO: Log the symbol resolution errors.
606                                                 return;
607                                         }
608                                 })),
609                         targetMachine(llvm::EngineBuilder()
610                                 .setMArch(arch)
611                                 .setMAttrs(mattrs)
612                                 .setTargetOptions(targetOpts)
613                                 .selectTarget()),
614                         dataLayout(targetMachine->createDataLayout()),
615                         objLayer(
616                                 session,
617                                 [this](llvm::orc::VModuleKey) {
618                                         return ObjLayer::Resources{
619                                                 std::make_shared<llvm::SectionMemoryManager>(),
620                                                 resolver};
621                                 }),
622                         compileLayer(objLayer, llvm::orc::SimpleCompiler(*targetMachine)),
623                         emittedFunctionsNum(0)
624                 {
625                 }
626
627                 void startSession()
628                 {
629                         ::module = new llvm::Module("", *::context);
630                 }
631
632                 void endSession()
633                 {
634                         ::function = nullptr;
635                         ::module = nullptr;
636                 }
637
638                 LLVMRoutine *acquireRoutine(llvm::Function *func)
639                 {
640                         std::string name = "f" + llvm::Twine(emittedFunctionsNum++).str();
641                         func->setName(name);
642                         func->setLinkage(llvm::GlobalValue::ExternalLinkage);
643                         func->setDoesNotThrow();
644
645                         std::unique_ptr<llvm::Module> mod(::module);
646                         ::module = nullptr;
647                         mod->setDataLayout(dataLayout);
648
649                         auto moduleKey = session.allocateVModule();
650                         llvm::cantFail(compileLayer.addModule(moduleKey, std::move(mod)));
651
652                         std::string mangledName;
653                         {
654                                 llvm::raw_string_ostream mangledNameStream(mangledName);
655                                 llvm::Mangler::getNameWithPrefix(mangledNameStream, name, dataLayout);
656                         }
657
658                         llvm::JITSymbol symbol = compileLayer.findSymbolIn(moduleKey, mangledName, false);
659
660                         llvm::Expected<llvm::JITTargetAddress> expectAddr = symbol.getAddress();
661                         if(!expectAddr)
662                         {
663                                 return nullptr;
664                         }
665
666                         void *addr = reinterpret_cast<void *>(static_cast<intptr_t>(expectAddr.get()));
667                         return new LLVMRoutine(addr, releaseRoutineCallback, this, moduleKey);
668                 }
669
670                 void optimize(llvm::Module *module)
671                 {
672                         std::unique_ptr<llvm::legacy::PassManager> passManager(
673                                 new llvm::legacy::PassManager());
674
675                         passManager->add(llvm::createSROAPass());
676
677                         for(int pass = 0; pass < 10 && optimization[pass] != Disabled; pass++)
678                         {
679                                 switch(optimization[pass])
680                                 {
681                                 case Disabled:                                                                       break;
682                                 case CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
683                                 case LICM:                 passManager->add(llvm::createLICMPass());                 break;
684                                 case AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
685                                 case GVN:                  passManager->add(llvm::createGVNPass());                  break;
686                                 case InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
687                                 case Reassociate:          passManager->add(llvm::createReassociatePass());          break;
688                                 case DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
689                                 case SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
690                                 case ScalarReplAggregates: passManager->add(llvm::createSROAPass());                 break;
691                                 default:
692                                                            assert(false);
693                                 }
694                         }
695
696                         passManager->run(*::module);
697                 }
698
699         private:
700                 void releaseRoutineModule(llvm::orc::VModuleKey moduleKey)
701                 {
702                         llvm::cantFail(compileLayer.removeModule(moduleKey));
703                 }
704
705                 static void releaseRoutineCallback(LLVMReactorJIT *jit, uint64_t moduleKey)
706                 {
707                         jit->releaseRoutineModule(moduleKey);
708                 }
709         };
710 #endif
711
712         Optimization optimization[10] = {InstructionCombining, Disabled};
713
714         // The abstract Type* types are implemented as LLVM types, except that
715         // 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
716         // and VFP in ARM, and eliminate the overhead of converting them to explicit
717         // 128-bit ones. LLVM types are pointers, so we can represent emulated types
718         // as abstract pointers with small enum values.
719         enum InternalType : uintptr_t
720         {
721                 // Emulated types:
722                 Type_v2i32,
723                 Type_v4i16,
724                 Type_v2i16,
725                 Type_v8i8,
726                 Type_v4i8,
727                 Type_v2f32,
728                 EmulatedTypeCount,
729                 // Returned by asInternalType() to indicate that the abstract Type*
730                 // should be interpreted as LLVM type pointer:
731                 Type_LLVM
732         };
733
734         inline InternalType asInternalType(Type *type)
735         {
736                 InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
737                 return (t < EmulatedTypeCount) ? t : Type_LLVM;
738         }
739
740         llvm::Type *T(Type *t)
741         {
742                 // Use 128-bit vectors to implement logically shorter ones.
743                 switch(asInternalType(t))
744                 {
745                 case Type_v2i32: return T(Int4::getType());
746                 case Type_v4i16: return T(Short8::getType());
747                 case Type_v2i16: return T(Short8::getType());
748                 case Type_v8i8:  return T(Byte16::getType());
749                 case Type_v4i8:  return T(Byte16::getType());
750                 case Type_v2f32: return T(Float4::getType());
751                 case Type_LLVM:  return reinterpret_cast<llvm::Type*>(t);
752                 default: assert(false); return nullptr;
753                 }
754         }
755
756         inline Type *T(llvm::Type *t)
757         {
758                 return reinterpret_cast<Type*>(t);
759         }
760
761         Type *T(InternalType t)
762         {
763                 return reinterpret_cast<Type*>(t);
764         }
765
766         inline llvm::Value *V(Value *t)
767         {
768                 return reinterpret_cast<llvm::Value*>(t);
769         }
770
771         inline Value *V(llvm::Value *t)
772         {
773                 return reinterpret_cast<Value*>(t);
774         }
775
776         inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
777         {
778                 return reinterpret_cast<std::vector<llvm::Type*>&>(t);
779         }
780
781         inline llvm::BasicBlock *B(BasicBlock *t)
782         {
783                 return reinterpret_cast<llvm::BasicBlock*>(t);
784         }
785
786         inline BasicBlock *B(llvm::BasicBlock *t)
787         {
788                 return reinterpret_cast<BasicBlock*>(t);
789         }
790
791         static size_t typeSize(Type *type)
792         {
793                 switch(asInternalType(type))
794                 {
795                 case Type_v2i32: return 8;
796                 case Type_v4i16: return 8;
797                 case Type_v2i16: return 4;
798                 case Type_v8i8:  return 8;
799                 case Type_v4i8:  return 4;
800                 case Type_v2f32: return 8;
801                 case Type_LLVM:
802                         {
803                                 llvm::Type *t = T(type);
804
805                                 if(t->isPointerTy())
806                                 {
807                                         return sizeof(void*);
808                                 }
809
810                                 // At this point we should only have LLVM 'primitive' types.
811                                 unsigned int bits = t->getPrimitiveSizeInBits();
812                                 assert(bits != 0);
813
814                                 // TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
815                                 // but are typically stored as one byte. The DataLayout structure should
816                                 // be used here and many other places if this assumption fails.
817                                 return (bits + 7) / 8;
818                         }
819                         break;
820                 default:
821                         assert(false);
822                         return 0;
823                 }
824         }
825
826         static unsigned int elementCount(Type *type)
827         {
828                 switch(asInternalType(type))
829                 {
830                 case Type_v2i32: return 2;
831                 case Type_v4i16: return 4;
832                 case Type_v2i16: return 2;
833                 case Type_v8i8:  return 8;
834                 case Type_v4i8:  return 4;
835                 case Type_v2f32: return 2;
836                 case Type_LLVM:  return llvm::cast<llvm::VectorType>(T(type))->getNumElements();
837                 default: assert(false); return 0;
838                 }
839         }
840
841         static llvm::AtomicOrdering atomicOrdering(bool atomic, std::memory_order memoryOrder)
842         {
843                 #if REACTOR_LLVM_VERSION < 7
844                         return llvm::AtomicOrdering::NotAtomic;
845                 #endif
846
847                 if(!atomic)
848                 {
849                         return llvm::AtomicOrdering::NotAtomic;
850                 }
851
852                 switch(memoryOrder)
853                 {
854                 case std::memory_order_relaxed: return llvm::AtomicOrdering::Monotonic;  // https://llvm.org/docs/Atomics.html#monotonic
855                 case std::memory_order_consume: return llvm::AtomicOrdering::Acquire;    // https://llvm.org/docs/Atomics.html#acquire: "It should also be used for C++11/C11 memory_order_consume."
856                 case std::memory_order_acquire: return llvm::AtomicOrdering::Acquire;
857                 case std::memory_order_release: return llvm::AtomicOrdering::Release;
858                 case std::memory_order_acq_rel: return llvm::AtomicOrdering::AcquireRelease;
859                 case std::memory_order_seq_cst: return llvm::AtomicOrdering::SequentiallyConsistent;
860                 default: assert(false);         return llvm::AtomicOrdering::AcquireRelease;
861                 }
862         }
863
864         Nucleus::Nucleus()
865         {
866                 ::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
867
868                 llvm::InitializeNativeTarget();
869
870 #if REACTOR_LLVM_VERSION >= 7
871                 llvm::InitializeNativeTargetAsmPrinter();
872                 llvm::InitializeNativeTargetAsmParser();
873 #endif
874
875                 if(!::context)
876                 {
877                         ::context = new llvm::LLVMContext();
878                 }
879
880                 #if defined(__x86_64__)
881                         static const char arch[] = "x86-64";
882                 #elif defined(__i386__)
883                         static const char arch[] = "x86";
884                 #elif defined(__aarch64__)
885                         static const char arch[] = "arm64";
886                 #elif defined(__arm__)
887                         static const char arch[] = "arm";
888                 #elif defined(__mips__)
889                         #if defined(__mips64)
890                             static const char arch[] = "mips64el";
891                         #else
892                             static const char arch[] = "mipsel";
893                         #endif
894                 #else
895                 #error "unknown architecture"
896                 #endif
897
898                 llvm::SmallVector<std::string, 1> mattrs;
899 #if defined(__i386__) || defined(__x86_64__)
900                 mattrs.push_back(CPUID::supportsMMX()    ? "+mmx"    : "-mmx");
901                 mattrs.push_back(CPUID::supportsCMOV()   ? "+cmov"   : "-cmov");
902                 mattrs.push_back(CPUID::supportsSSE()    ? "+sse"    : "-sse");
903                 mattrs.push_back(CPUID::supportsSSE2()   ? "+sse2"   : "-sse2");
904                 mattrs.push_back(CPUID::supportsSSE3()   ? "+sse3"   : "-sse3");
905                 mattrs.push_back(CPUID::supportsSSSE3()  ? "+ssse3"  : "-ssse3");
906 #if REACTOR_LLVM_VERSION < 7
907                 mattrs.push_back(CPUID::supportsSSE4_1() ? "+sse41"  : "-sse41");
908 #else
909                 mattrs.push_back(CPUID::supportsSSE4_1() ? "+sse4.1" : "-sse4.1");
910 #endif
911 #elif defined(__arm__)
912 #if __ARM_ARCH >= 8
913                 mattrs.push_back("+armv8-a");
914 #else
915                 // armv7-a requires compiler-rt routines; otherwise, compiled kernel
916                 // might fail to link.
917 #endif
918 #endif
919
920 #if REACTOR_LLVM_VERSION < 7
921                 llvm::JITEmitDebugInfo = false;
922                 llvm::UnsafeFPMath = true;
923                 // llvm::NoInfsFPMath = true;
924                 // llvm::NoNaNsFPMath = true;
925 #else
926                 llvm::TargetOptions targetOpts;
927                 targetOpts.UnsafeFPMath = false;
928                 // targetOpts.NoInfsFPMath = true;
929                 // targetOpts.NoNaNsFPMath = true;
930 #endif
931
932                 if(!::reactorJIT)
933                 {
934 #if REACTOR_LLVM_VERSION < 7
935                         ::reactorJIT = new LLVMReactorJIT(arch, mattrs);
936 #else
937                         ::reactorJIT = new LLVMReactorJIT(arch, mattrs, targetOpts);
938 #endif
939                 }
940
941                 ::reactorJIT->startSession();
942
943                 if(!::builder)
944                 {
945                         ::builder = new llvm::IRBuilder<>(*::context);
946                 }
947         }
948
949         Nucleus::~Nucleus()
950         {
951                 ::reactorJIT->endSession();
952
953                 ::codegenMutex.unlock();
954         }
955
956         Routine *Nucleus::acquireRoutine(const char *name, bool runOptimizations)
957         {
958                 if(::builder->GetInsertBlock()->empty() || !::builder->GetInsertBlock()->back().isTerminator())
959                 {
960                         llvm::Type *type = ::function->getReturnType();
961
962                         if(type->isVoidTy())
963                         {
964                                 createRetVoid();
965                         }
966                         else
967                         {
968                                 createRet(V(llvm::UndefValue::get(type)));
969                         }
970                 }
971
972                 if(false)
973                 {
974                         #if REACTOR_LLVM_VERSION < 7
975                                 std::string error;
976                                 llvm::raw_fd_ostream file((std::string(name) + "-llvm-dump-unopt.txt").c_str(), error);
977                         #else
978                                 std::error_code error;
979                                 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
980                         #endif
981
982                         ::module->print(file, 0);
983                 }
984
985                 if(runOptimizations)
986                 {
987                         optimize();
988                 }
989
990                 if(false)
991                 {
992                         #if REACTOR_LLVM_VERSION < 7
993                                 std::string error;
994                                 llvm::raw_fd_ostream file((std::string(name) + "-llvm-dump-opt.txt").c_str(), error);
995                         #else
996                                 std::error_code error;
997                                 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
998                         #endif
999
1000                         ::module->print(file, 0);
1001                 }
1002
1003                 LLVMRoutine *routine = ::reactorJIT->acquireRoutine(::function);
1004
1005                 return routine;
1006         }
1007
1008         void Nucleus::optimize()
1009         {
1010                 ::reactorJIT->optimize(::module);
1011         }
1012
1013         Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
1014         {
1015                 // Need to allocate it in the entry block for mem2reg to work
1016                 llvm::BasicBlock &entryBlock = ::function->getEntryBlock();
1017
1018                 llvm::Instruction *declaration;
1019
1020                 if(arraySize)
1021                 {
1022 #if REACTOR_LLVM_VERSION < 7
1023                         declaration = new llvm::AllocaInst(T(type), V(Nucleus::createConstantInt(arraySize)));
1024 #else
1025                         declaration = new llvm::AllocaInst(T(type), 0, V(Nucleus::createConstantInt(arraySize)));
1026 #endif
1027                 }
1028                 else
1029                 {
1030 #if REACTOR_LLVM_VERSION < 7
1031                         declaration = new llvm::AllocaInst(T(type), (llvm::Value*)nullptr);
1032 #else
1033                         declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value*)nullptr);
1034 #endif
1035                 }
1036
1037                 entryBlock.getInstList().push_front(declaration);
1038
1039                 return V(declaration);
1040         }
1041
1042         BasicBlock *Nucleus::createBasicBlock()
1043         {
1044                 return B(llvm::BasicBlock::Create(*::context, "", ::function));
1045         }
1046
1047         BasicBlock *Nucleus::getInsertBlock()
1048         {
1049                 return B(::builder->GetInsertBlock());
1050         }
1051
1052         void Nucleus::setInsertBlock(BasicBlock *basicBlock)
1053         {
1054         //      assert(::builder->GetInsertBlock()->back().isTerminator());
1055                 ::builder->SetInsertPoint(B(basicBlock));
1056         }
1057
1058         void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
1059         {
1060                 llvm::FunctionType *functionType = llvm::FunctionType::get(T(ReturnType), T(Params), false);
1061                 ::function = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, "", ::module);
1062                 ::function->setCallingConv(llvm::CallingConv::C);
1063
1064                 #if defined(_WIN32) && REACTOR_LLVM_VERSION >= 7
1065                         // FIXME(capn):
1066                         // On Windows, stack memory is committed in increments of 4 kB pages, with the last page
1067                         // having a trap which allows the OS to grow the stack. For functions with a stack frame
1068                         // larger than 4 kB this can cause an issue when a variable is accessed beyond the guard
1069                         // page. Therefore the compiler emits a call to __chkstk in the function prolog to probe
1070                         // the stack and ensure all pages have been committed. This is currently broken in LLVM
1071                         // JIT, but we can prevent emitting the stack probe call:
1072                         ::function->addFnAttr("stack-probe-size", "1048576");
1073                 #endif
1074
1075                 ::builder->SetInsertPoint(llvm::BasicBlock::Create(*::context, "", ::function));
1076         }
1077
1078         Value *Nucleus::getArgument(unsigned int index)
1079         {
1080                 llvm::Function::arg_iterator args = ::function->arg_begin();
1081
1082                 while(index)
1083                 {
1084                         args++;
1085                         index--;
1086                 }
1087
1088                 return V(&*args);
1089         }
1090
1091         void Nucleus::createRetVoid()
1092         {
1093                 ::builder->CreateRetVoid();
1094         }
1095
1096         void Nucleus::createRet(Value *v)
1097         {
1098                 ::builder->CreateRet(V(v));
1099         }
1100
1101         void Nucleus::createBr(BasicBlock *dest)
1102         {
1103                 ::builder->CreateBr(B(dest));
1104         }
1105
1106         void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
1107         {
1108                 ::builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
1109         }
1110
1111         Value *Nucleus::createAdd(Value *lhs, Value *rhs)
1112         {
1113                 return V(::builder->CreateAdd(V(lhs), V(rhs)));
1114         }
1115
1116         Value *Nucleus::createSub(Value *lhs, Value *rhs)
1117         {
1118                 return V(::builder->CreateSub(V(lhs), V(rhs)));
1119         }
1120
1121         Value *Nucleus::createMul(Value *lhs, Value *rhs)
1122         {
1123                 return V(::builder->CreateMul(V(lhs), V(rhs)));
1124         }
1125
1126         Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
1127         {
1128                 return V(::builder->CreateUDiv(V(lhs), V(rhs)));
1129         }
1130
1131         Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
1132         {
1133                 return V(::builder->CreateSDiv(V(lhs), V(rhs)));
1134         }
1135
1136         Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
1137         {
1138                 return V(::builder->CreateFAdd(V(lhs), V(rhs)));
1139         }
1140
1141         Value *Nucleus::createFSub(Value *lhs, Value *rhs)
1142         {
1143                 return V(::builder->CreateFSub(V(lhs), V(rhs)));
1144         }
1145
1146         Value *Nucleus::createFMul(Value *lhs, Value *rhs)
1147         {
1148                 return V(::builder->CreateFMul(V(lhs), V(rhs)));
1149         }
1150
1151         Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
1152         {
1153                 return V(::builder->CreateFDiv(V(lhs), V(rhs)));
1154         }
1155
1156         Value *Nucleus::createURem(Value *lhs, Value *rhs)
1157         {
1158                 return V(::builder->CreateURem(V(lhs), V(rhs)));
1159         }
1160
1161         Value *Nucleus::createSRem(Value *lhs, Value *rhs)
1162         {
1163                 return V(::builder->CreateSRem(V(lhs), V(rhs)));
1164         }
1165
1166         Value *Nucleus::createFRem(Value *lhs, Value *rhs)
1167         {
1168                 return V(::builder->CreateFRem(V(lhs), V(rhs)));
1169         }
1170
1171         Value *Nucleus::createShl(Value *lhs, Value *rhs)
1172         {
1173                 return V(::builder->CreateShl(V(lhs), V(rhs)));
1174         }
1175
1176         Value *Nucleus::createLShr(Value *lhs, Value *rhs)
1177         {
1178                 return V(::builder->CreateLShr(V(lhs), V(rhs)));
1179         }
1180
1181         Value *Nucleus::createAShr(Value *lhs, Value *rhs)
1182         {
1183                 return V(::builder->CreateAShr(V(lhs), V(rhs)));
1184         }
1185
1186         Value *Nucleus::createAnd(Value *lhs, Value *rhs)
1187         {
1188                 return V(::builder->CreateAnd(V(lhs), V(rhs)));
1189         }
1190
1191         Value *Nucleus::createOr(Value *lhs, Value *rhs)
1192         {
1193                 return V(::builder->CreateOr(V(lhs), V(rhs)));
1194         }
1195
1196         Value *Nucleus::createXor(Value *lhs, Value *rhs)
1197         {
1198                 return V(::builder->CreateXor(V(lhs), V(rhs)));
1199         }
1200
1201         Value *Nucleus::createNeg(Value *v)
1202         {
1203                 return V(::builder->CreateNeg(V(v)));
1204         }
1205
1206         Value *Nucleus::createFNeg(Value *v)
1207         {
1208                 return V(::builder->CreateFNeg(V(v)));
1209         }
1210
1211         Value *Nucleus::createNot(Value *v)
1212         {
1213                 return V(::builder->CreateNot(V(v)));
1214         }
1215
1216         Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
1217         {
1218                 switch(asInternalType(type))
1219                 {
1220                 case Type_v2i32:
1221                 case Type_v4i16:
1222                 case Type_v8i8:
1223                 case Type_v2f32:
1224                         return createBitCast(
1225                                 createInsertElement(
1226                                         V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2))),
1227                                         createLoad(createBitCast(ptr, Pointer<Long>::getType()), Long::getType(), isVolatile, alignment, atomic, memoryOrder),
1228                                         0),
1229                                 type);
1230                 case Type_v2i16:
1231                 case Type_v4i8:
1232                         if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
1233                         {
1234                                 Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2)));
1235                                 Value *i = createLoad(createBitCast(ptr, Pointer<Int>::getType()), Int::getType(), isVolatile, alignment, atomic, memoryOrder);
1236                                 i = createZExt(i, Long::getType());
1237                                 Value *v = createInsertElement(u, i, 0);
1238                                 return createBitCast(v, type);
1239                         }
1240                         // Fallthrough to non-emulated case.
1241                 case Type_LLVM:
1242                         {
1243                                 assert(V(ptr)->getType()->getContainedType(0) == T(type));
1244                                 auto load = new llvm::LoadInst(V(ptr), "", isVolatile, alignment);
1245                                 load->setAtomic(atomicOrdering(atomic, memoryOrder));
1246
1247                                 return V(::builder->Insert(load));
1248                         }
1249                 default:
1250                         assert(false); return nullptr;
1251                 }
1252         }
1253
1254         Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
1255         {
1256                 switch(asInternalType(type))
1257                 {
1258                 case Type_v2i32:
1259                 case Type_v4i16:
1260                 case Type_v8i8:
1261                 case Type_v2f32:
1262                         createStore(
1263                                 createExtractElement(
1264                                         createBitCast(value, T(llvm::VectorType::get(T(Long::getType()), 2))), Long::getType(), 0),
1265                                 createBitCast(ptr, Pointer<Long>::getType()),
1266                                 Long::getType(), isVolatile, alignment, atomic, memoryOrder);
1267                         return value;
1268                 case Type_v2i16:
1269                 case Type_v4i8:
1270                         if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
1271                         {
1272                                 createStore(
1273                                         createExtractElement(createBitCast(value, Int4::getType()), Int::getType(), 0),
1274                                         createBitCast(ptr, Pointer<Int>::getType()),
1275                                         Int::getType(), isVolatile, alignment, atomic, memoryOrder);
1276                                 return value;
1277                         }
1278                         // Fallthrough to non-emulated case.
1279                 case Type_LLVM:
1280                         {
1281                                 assert(V(ptr)->getType()->getContainedType(0) == T(type));
1282                                 auto store = ::builder->Insert(new llvm::StoreInst(V(value), V(ptr), isVolatile, alignment));
1283                                 store->setAtomic(atomicOrdering(atomic, memoryOrder));
1284
1285                                 return value;
1286                         }
1287                 default:
1288                         assert(false); return nullptr;
1289                 }
1290         }
1291
1292         Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1293         {
1294                 assert(V(ptr)->getType()->getContainedType(0) == T(type));
1295
1296                 if(sizeof(void*) == 8)
1297                 {
1298                         // LLVM manual: "When indexing into an array, pointer or vector,
1299                         // integers of any width are allowed, and they are not required to
1300                         // be constant. These integers are treated as signed values where
1301                         // relevant."
1302                         //
1303                         // Thus if we want indexes to be treated as unsigned we have to
1304                         // zero-extend them ourselves.
1305                         //
1306                         // Note that this is not because we want to address anywhere near
1307                         // 4 GB of data. Instead this is important for performance because
1308                         // x86 supports automatic zero-extending of 32-bit registers to
1309                         // 64-bit. Thus when indexing into an array using a uint32 is
1310                         // actually faster than an int32.
1311                         index = unsignedIndex ?
1312                                 createZExt(index, Long::getType()) :
1313                                 createSExt(index, Long::getType());
1314                 }
1315
1316                 // For non-emulated types we can rely on LLVM's GEP to calculate the
1317                 // effective address correctly.
1318                 if(asInternalType(type) == Type_LLVM)
1319                 {
1320                         return V(::builder->CreateGEP(V(ptr), V(index)));
1321                 }
1322
1323                 // For emulated types we have to multiply the index by the intended
1324                 // type size ourselves to obain the byte offset.
1325                 index = (sizeof(void*) == 8) ?
1326                         createMul(index, createConstantLong((int64_t)typeSize(type))) :
1327                         createMul(index, createConstantInt((int)typeSize(type)));
1328
1329                 // Cast to a byte pointer, apply the byte offset, and cast back to the
1330                 // original pointer type.
1331                 return createBitCast(
1332                         V(::builder->CreateGEP(V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::getType()), 0)))), V(index))),
1333                         T(llvm::PointerType::get(T(type), 0)));
1334         }
1335
1336         Value *Nucleus::createAtomicAdd(Value *ptr, Value *value)
1337         {
1338                 return V(::builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value), llvm::AtomicOrdering::SequentiallyConsistent));
1339         }
1340
1341         Value *Nucleus::createTrunc(Value *v, Type *destType)
1342         {
1343                 return V(::builder->CreateTrunc(V(v), T(destType)));
1344         }
1345
1346         Value *Nucleus::createZExt(Value *v, Type *destType)
1347         {
1348                 return V(::builder->CreateZExt(V(v), T(destType)));
1349         }
1350
1351         Value *Nucleus::createSExt(Value *v, Type *destType)
1352         {
1353                 return V(::builder->CreateSExt(V(v), T(destType)));
1354         }
1355
1356         Value *Nucleus::createFPToSI(Value *v, Type *destType)
1357         {
1358                 return V(::builder->CreateFPToSI(V(v), T(destType)));
1359         }
1360
1361         Value *Nucleus::createSIToFP(Value *v, Type *destType)
1362         {
1363                 return V(::builder->CreateSIToFP(V(v), T(destType)));
1364         }
1365
1366         Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1367         {
1368                 return V(::builder->CreateFPTrunc(V(v), T(destType)));
1369         }
1370
1371         Value *Nucleus::createFPExt(Value *v, Type *destType)
1372         {
1373                 return V(::builder->CreateFPExt(V(v), T(destType)));
1374         }
1375
1376         Value *Nucleus::createBitCast(Value *v, Type *destType)
1377         {
1378                 // Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1379                 // support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1380                 // reading back as the destination type.
1381                 if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1382                 {
1383                         Value *readAddress = allocateStackVariable(destType);
1384                         Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1385                         createStore(v, writeAddress, T(V(v)->getType()));
1386                         return createLoad(readAddress, destType);
1387                 }
1388                 else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1389                 {
1390                         Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1391                         createStore(v, writeAddress, T(V(v)->getType()));
1392                         Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1393                         return createLoad(readAddress, destType);
1394                 }
1395
1396                 return V(::builder->CreateBitCast(V(v), T(destType)));
1397         }
1398
1399         Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1400         {
1401                 return V(::builder->CreateICmpEQ(V(lhs), V(rhs)));
1402         }
1403
1404         Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1405         {
1406                 return V(::builder->CreateICmpNE(V(lhs), V(rhs)));
1407         }
1408
1409         Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1410         {
1411                 return V(::builder->CreateICmpUGT(V(lhs), V(rhs)));
1412         }
1413
1414         Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1415         {
1416                 return V(::builder->CreateICmpUGE(V(lhs), V(rhs)));
1417         }
1418
1419         Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1420         {
1421                 return V(::builder->CreateICmpULT(V(lhs), V(rhs)));
1422         }
1423
1424         Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1425         {
1426                 return V(::builder->CreateICmpULE(V(lhs), V(rhs)));
1427         }
1428
1429         Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1430         {
1431                 return V(::builder->CreateICmpSGT(V(lhs), V(rhs)));
1432         }
1433
1434         Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1435         {
1436                 return V(::builder->CreateICmpSGE(V(lhs), V(rhs)));
1437         }
1438
1439         Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1440         {
1441                 return V(::builder->CreateICmpSLT(V(lhs), V(rhs)));
1442         }
1443
1444         Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1445         {
1446                 return V(::builder->CreateICmpSLE(V(lhs), V(rhs)));
1447         }
1448
1449         Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1450         {
1451                 return V(::builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1452         }
1453
1454         Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1455         {
1456                 return V(::builder->CreateFCmpOGT(V(lhs), V(rhs)));
1457         }
1458
1459         Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1460         {
1461                 return V(::builder->CreateFCmpOGE(V(lhs), V(rhs)));
1462         }
1463
1464         Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1465         {
1466                 return V(::builder->CreateFCmpOLT(V(lhs), V(rhs)));
1467         }
1468
1469         Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1470         {
1471                 return V(::builder->CreateFCmpOLE(V(lhs), V(rhs)));
1472         }
1473
1474         Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1475         {
1476                 return V(::builder->CreateFCmpONE(V(lhs), V(rhs)));
1477         }
1478
1479         Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1480         {
1481                 return V(::builder->CreateFCmpORD(V(lhs), V(rhs)));
1482         }
1483
1484         Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1485         {
1486                 return V(::builder->CreateFCmpUNO(V(lhs), V(rhs)));
1487         }
1488
1489         Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1490         {
1491                 return V(::builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1492         }
1493
1494         Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1495         {
1496                 return V(::builder->CreateFCmpUGT(V(lhs), V(rhs)));
1497         }
1498
1499         Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1500         {
1501                 return V(::builder->CreateFCmpUGE(V(lhs), V(rhs)));
1502         }
1503
1504         Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1505         {
1506                 return V(::builder->CreateFCmpULT(V(lhs), V(rhs)));
1507         }
1508
1509         Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1510         {
1511                 return V(::builder->CreateFCmpULE(V(lhs), V(rhs)));
1512         }
1513
1514         Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1515         {
1516                 return V(::builder->CreateFCmpUNE(V(lhs), V(rhs)));
1517         }
1518
1519         Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1520         {
1521                 assert(V(vector)->getType()->getContainedType(0) == T(type));
1522                 return V(::builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1523         }
1524
1525         Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1526         {
1527                 return V(::builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1528         }
1529
1530         Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
1531         {
1532                 int size = llvm::cast<llvm::VectorType>(V(v1)->getType())->getNumElements();
1533                 const int maxSize = 16;
1534                 llvm::Constant *swizzle[maxSize];
1535                 assert(size <= maxSize);
1536
1537                 for(int i = 0; i < size; i++)
1538                 {
1539                         swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), select[i]);
1540                 }
1541
1542                 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
1543
1544                 return V(::builder->CreateShuffleVector(V(v1), V(v2), shuffle));
1545         }
1546
1547         Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1548         {
1549                 return V(::builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1550         }
1551
1552         SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1553         {
1554                 return reinterpret_cast<SwitchCases*>(::builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1555         }
1556
1557         void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1558         {
1559                 llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1560                 sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), label, true), B(branch));
1561         }
1562
1563         void Nucleus::createUnreachable()
1564         {
1565                 ::builder->CreateUnreachable();
1566         }
1567
1568         Type *Nucleus::getPointerType(Type *ElementType)
1569         {
1570                 return T(llvm::PointerType::get(T(ElementType), 0));
1571         }
1572
1573         Value *Nucleus::createNullValue(Type *Ty)
1574         {
1575                 return V(llvm::Constant::getNullValue(T(Ty)));
1576         }
1577
1578         Value *Nucleus::createConstantLong(int64_t i)
1579         {
1580                 return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*::context), i, true));
1581         }
1582
1583         Value *Nucleus::createConstantInt(int i)
1584         {
1585                 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), i, true));
1586         }
1587
1588         Value *Nucleus::createConstantInt(unsigned int i)
1589         {
1590                 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), i, false));
1591         }
1592
1593         Value *Nucleus::createConstantBool(bool b)
1594         {
1595                 return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*::context), b));
1596         }
1597
1598         Value *Nucleus::createConstantByte(signed char i)
1599         {
1600                 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*::context), i, true));
1601         }
1602
1603         Value *Nucleus::createConstantByte(unsigned char i)
1604         {
1605                 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*::context), i, false));
1606         }
1607
1608         Value *Nucleus::createConstantShort(short i)
1609         {
1610                 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*::context), i, true));
1611         }
1612
1613         Value *Nucleus::createConstantShort(unsigned short i)
1614         {
1615                 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*::context), i, false));
1616         }
1617
1618         Value *Nucleus::createConstantFloat(float x)
1619         {
1620                 return V(llvm::ConstantFP::get(T(Float::getType()), x));
1621         }
1622
1623         Value *Nucleus::createNullPointer(Type *Ty)
1624         {
1625                 return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1626         }
1627
1628         Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1629         {
1630                 assert(llvm::isa<llvm::VectorType>(T(type)));
1631                 const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
1632                 const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
1633                 assert(numElements <= 16 && numConstants <= numElements);
1634                 llvm::Constant *constantVector[16];
1635
1636                 for(int i = 0; i < numElements; i++)
1637                 {
1638                         constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
1639                 }
1640
1641                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
1642         }
1643
1644         Value *Nucleus::createConstantVector(const double *constants, Type *type)
1645         {
1646                 assert(llvm::isa<llvm::VectorType>(T(type)));
1647                 const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
1648                 const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
1649                 assert(numElements <= 8 && numConstants <= numElements);
1650                 llvm::Constant *constantVector[8];
1651
1652                 for(int i = 0; i < numElements; i++)
1653                 {
1654                         constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
1655                 }
1656
1657                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
1658         }
1659
1660         Type *Void::getType()
1661         {
1662                 return T(llvm::Type::getVoidTy(*::context));
1663         }
1664
1665         Type *Bool::getType()
1666         {
1667                 return T(llvm::Type::getInt1Ty(*::context));
1668         }
1669
1670         Type *Byte::getType()
1671         {
1672                 return T(llvm::Type::getInt8Ty(*::context));
1673         }
1674
1675         Type *SByte::getType()
1676         {
1677                 return T(llvm::Type::getInt8Ty(*::context));
1678         }
1679
1680         Type *Short::getType()
1681         {
1682                 return T(llvm::Type::getInt16Ty(*::context));
1683         }
1684
1685         Type *UShort::getType()
1686         {
1687                 return T(llvm::Type::getInt16Ty(*::context));
1688         }
1689
1690         Type *Byte4::getType()
1691         {
1692                 return T(Type_v4i8);
1693         }
1694
1695         Type *SByte4::getType()
1696         {
1697                 return T(Type_v4i8);
1698         }
1699
1700         RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1701         {
1702 #if defined(__i386__) || defined(__x86_64__)
1703                 return x86::paddusb(x, y);
1704 #else
1705                 return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
1706 #endif
1707         }
1708
1709         RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1710         {
1711 #if defined(__i386__) || defined(__x86_64__)
1712                 return x86::psubusb(x, y);
1713 #else
1714                 return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
1715 #endif
1716         }
1717
1718         RValue<Int> SignMask(RValue<Byte8> x)
1719         {
1720 #if defined(__i386__) || defined(__x86_64__)
1721                 return x86::pmovmskb(x);
1722 #else
1723                 return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
1724 #endif
1725         }
1726
1727 //      RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1728 //      {
1729 //#if defined(__i386__) || defined(__x86_64__)
1730 //              return x86::pcmpgtb(x, y);   // FIXME: Signedness
1731 //#else
1732 //              return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
1733 //#endif
1734 //      }
1735
1736         RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1737         {
1738 #if defined(__i386__) || defined(__x86_64__)
1739                 return x86::pcmpeqb(x, y);
1740 #else
1741                 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
1742 #endif
1743         }
1744
1745         Type *Byte8::getType()
1746         {
1747                 return T(Type_v8i8);
1748         }
1749
1750         RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1751         {
1752 #if defined(__i386__) || defined(__x86_64__)
1753                 return x86::paddsb(x, y);
1754 #else
1755                 return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
1756 #endif
1757         }
1758
1759         RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1760         {
1761 #if defined(__i386__) || defined(__x86_64__)
1762                 return x86::psubsb(x, y);
1763 #else
1764                 return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
1765 #endif
1766         }
1767
1768         RValue<Int> SignMask(RValue<SByte8> x)
1769         {
1770 #if defined(__i386__) || defined(__x86_64__)
1771                 return x86::pmovmskb(As<Byte8>(x));
1772 #else
1773                 return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
1774 #endif
1775         }
1776
1777         RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1778         {
1779 #if defined(__i386__) || defined(__x86_64__)
1780                 return x86::pcmpgtb(x, y);
1781 #else
1782                 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
1783 #endif
1784         }
1785
1786         RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1787         {
1788 #if defined(__i386__) || defined(__x86_64__)
1789                 return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1790 #else
1791                 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
1792 #endif
1793         }
1794
1795         Type *SByte8::getType()
1796         {
1797                 return T(Type_v8i8);
1798         }
1799
1800         Type *Byte16::getType()
1801         {
1802                 return T(llvm::VectorType::get(T(Byte::getType()), 16));
1803         }
1804
1805         Type *SByte16::getType()
1806         {
1807                 return T(llvm::VectorType::get(T(SByte::getType()), 16));
1808         }
1809
1810         Type *Short2::getType()
1811         {
1812                 return T(Type_v2i16);
1813         }
1814
1815         Type *UShort2::getType()
1816         {
1817                 return T(Type_v2i16);
1818         }
1819
1820         Short4::Short4(RValue<Int4> cast)
1821         {
1822                 int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
1823                 Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
1824
1825                 Value *packed = Nucleus::createShuffleVector(short8, short8, select);
1826                 Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value;
1827
1828                 storeValue(short4);
1829         }
1830
1831 //      Short4::Short4(RValue<Float> cast)
1832 //      {
1833 //      }
1834
1835         Short4::Short4(RValue<Float4> cast)
1836         {
1837                 Int4 v4i32 = Int4(cast);
1838 #if defined(__i386__) || defined(__x86_64__)
1839                 v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
1840 #else
1841                 Value *v = v4i32.loadValue();
1842                 v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
1843 #endif
1844
1845                 storeValue(As<Short4>(Int2(v4i32)).value);
1846         }
1847
1848         RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
1849         {
1850 #if defined(__i386__) || defined(__x86_64__)
1851         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
1852
1853                 return x86::psllw(lhs, rhs);
1854 #else
1855                 return As<Short4>(V(lowerVectorShl(V(lhs.value), rhs)));
1856 #endif
1857         }
1858
1859         RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
1860         {
1861 #if defined(__i386__) || defined(__x86_64__)
1862                 return x86::psraw(lhs, rhs);
1863 #else
1864                 return As<Short4>(V(lowerVectorAShr(V(lhs.value), rhs)));
1865 #endif
1866         }
1867
1868         RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
1869         {
1870 #if defined(__i386__) || defined(__x86_64__)
1871                 return x86::pmaxsw(x, y);
1872 #else
1873                 return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
1874 #endif
1875         }
1876
1877         RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
1878         {
1879 #if defined(__i386__) || defined(__x86_64__)
1880                 return x86::pminsw(x, y);
1881 #else
1882                 return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
1883 #endif
1884         }
1885
1886         RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
1887         {
1888 #if defined(__i386__) || defined(__x86_64__)
1889                 return x86::paddsw(x, y);
1890 #else
1891                 return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
1892 #endif
1893         }
1894
1895         RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
1896         {
1897 #if defined(__i386__) || defined(__x86_64__)
1898                 return x86::psubsw(x, y);
1899 #else
1900                 return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
1901 #endif
1902         }
1903
1904         RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
1905         {
1906 #if defined(__i386__) || defined(__x86_64__)
1907                 return x86::pmulhw(x, y);
1908 #else
1909                 return As<Short4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
1910 #endif
1911         }
1912
1913         RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
1914         {
1915 #if defined(__i386__) || defined(__x86_64__)
1916                 return x86::pmaddwd(x, y);
1917 #else
1918                 return As<Int2>(V(lowerMulAdd(V(x.value), V(y.value))));
1919 #endif
1920         }
1921
1922         RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
1923         {
1924 #if defined(__i386__) || defined(__x86_64__)
1925                 auto result = x86::packsswb(x, y);
1926 #else
1927                 auto result = V(lowerPack(V(x.value), V(y.value), true));
1928 #endif
1929                 return As<SByte8>(Swizzle(As<Int4>(result), 0x88));
1930         }
1931
1932         RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
1933         {
1934 #if defined(__i386__) || defined(__x86_64__)
1935                 auto result = x86::packuswb(x, y);
1936 #else
1937                 auto result = V(lowerPack(V(x.value), V(y.value), false));
1938 #endif
1939                 return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
1940         }
1941
1942         RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
1943         {
1944 #if defined(__i386__) || defined(__x86_64__)
1945                 return x86::pcmpgtw(x, y);
1946 #else
1947                 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
1948 #endif
1949         }
1950
1951         RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
1952         {
1953 #if defined(__i386__) || defined(__x86_64__)
1954                 return x86::pcmpeqw(x, y);
1955 #else
1956                 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
1957 #endif
1958         }
1959
1960         Type *Short4::getType()
1961         {
1962                 return T(Type_v4i16);
1963         }
1964
1965         UShort4::UShort4(RValue<Float4> cast, bool saturate)
1966         {
1967                 if(saturate)
1968                 {
1969 #if defined(__i386__) || defined(__x86_64__)
1970                         if(CPUID::supportsSSE4_1())
1971                         {
1972                                 Int4 int4(Min(cast, Float4(0xFFFF)));   // packusdw takes care of 0x0000 saturation
1973                                 *this = As<Short4>(PackUnsigned(int4, int4));
1974                         }
1975                         else
1976 #endif
1977                         {
1978                                 *this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
1979                         }
1980                 }
1981                 else
1982                 {
1983                         *this = Short4(Int4(cast));
1984                 }
1985         }
1986
1987         RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
1988         {
1989 #if defined(__i386__) || defined(__x86_64__)
1990         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
1991
1992                 return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
1993 #else
1994                 return As<UShort4>(V(lowerVectorShl(V(lhs.value), rhs)));
1995 #endif
1996         }
1997
1998         RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
1999         {
2000 #if defined(__i386__) || defined(__x86_64__)
2001         //      return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
2002
2003                 return x86::psrlw(lhs, rhs);
2004 #else
2005                 return As<UShort4>(V(lowerVectorLShr(V(lhs.value), rhs)));
2006 #endif
2007         }
2008
2009         RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2010         {
2011                 return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2012         }
2013
2014         RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2015         {
2016                 return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2017         }
2018
2019         RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2020         {
2021 #if defined(__i386__) || defined(__x86_64__)
2022                 return x86::paddusw(x, y);
2023 #else
2024                 return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
2025 #endif
2026         }
2027
2028         RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2029         {
2030 #if defined(__i386__) || defined(__x86_64__)
2031                 return x86::psubusw(x, y);
2032 #else
2033                 return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
2034 #endif
2035         }
2036
2037         RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2038         {
2039 #if defined(__i386__) || defined(__x86_64__)
2040                 return x86::pmulhuw(x, y);
2041 #else
2042                 return As<UShort4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2043 #endif
2044         }
2045
2046         RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2047         {
2048 #if defined(__i386__) || defined(__x86_64__)
2049                 return x86::pavgw(x, y);
2050 #else
2051                 return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
2052 #endif
2053         }
2054
2055         Type *UShort4::getType()
2056         {
2057                 return T(Type_v4i16);
2058         }
2059
2060         RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2061         {
2062 #if defined(__i386__) || defined(__x86_64__)
2063                 return x86::psllw(lhs, rhs);
2064 #else
2065                 return As<Short8>(V(lowerVectorShl(V(lhs.value), rhs)));
2066 #endif
2067         }
2068
2069         RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2070         {
2071 #if defined(__i386__) || defined(__x86_64__)
2072                 return x86::psraw(lhs, rhs);
2073 #else
2074                 return As<Short8>(V(lowerVectorAShr(V(lhs.value), rhs)));
2075 #endif
2076         }
2077
2078         RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2079         {
2080 #if defined(__i386__) || defined(__x86_64__)
2081                 return x86::pmaddwd(x, y);
2082 #else
2083                 return As<Int4>(V(lowerMulAdd(V(x.value), V(y.value))));
2084 #endif
2085         }
2086
2087         RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2088         {
2089 #if defined(__i386__) || defined(__x86_64__)
2090                 return x86::pmulhw(x, y);
2091 #else
2092                 return As<Short8>(V(lowerMulHigh(V(x.value), V(y.value), true)));
2093 #endif
2094         }
2095
2096         Type *Short8::getType()
2097         {
2098                 return T(llvm::VectorType::get(T(Short::getType()), 8));
2099         }
2100
2101         RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2102         {
2103 #if defined(__i386__) || defined(__x86_64__)
2104                 return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2105 #else
2106                 return As<UShort8>(V(lowerVectorShl(V(lhs.value), rhs)));
2107 #endif
2108         }
2109
2110         RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2111         {
2112 #if defined(__i386__) || defined(__x86_64__)
2113                 return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
2114 #else
2115                 return As<UShort8>(V(lowerVectorLShr(V(lhs.value), rhs)));
2116 #endif
2117         }
2118
2119         RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
2120         {
2121                 int pshufb[16] =
2122                 {
2123                         select0 + 0,
2124                         select0 + 1,
2125                         select1 + 0,
2126                         select1 + 1,
2127                         select2 + 0,
2128                         select2 + 1,
2129                         select3 + 0,
2130                         select3 + 1,
2131                         select4 + 0,
2132                         select4 + 1,
2133                         select5 + 0,
2134                         select5 + 1,
2135                         select6 + 0,
2136                         select6 + 1,
2137                         select7 + 0,
2138                         select7 + 1,
2139                 };
2140
2141                 Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
2142                 Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
2143                 Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
2144
2145                 return RValue<UShort8>(short8);
2146         }
2147
2148         RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2149         {
2150 #if defined(__i386__) || defined(__x86_64__)
2151                 return x86::pmulhuw(x, y);
2152 #else
2153                 return As<UShort8>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2154 #endif
2155         }
2156
2157         Type *UShort8::getType()
2158         {
2159                 return T(llvm::VectorType::get(T(UShort::getType()), 8));
2160         }
2161
2162         RValue<Int> operator++(Int &val, int)   // Post-increment
2163         {
2164                 RValue<Int> res = val;
2165
2166                 Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
2167                 val.storeValue(inc);
2168
2169                 return res;
2170         }
2171
2172         const Int &operator++(Int &val)   // Pre-increment
2173         {
2174                 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2175                 val.storeValue(inc);
2176
2177                 return val;
2178         }
2179
2180         RValue<Int> operator--(Int &val, int)   // Post-decrement
2181         {
2182                 RValue<Int> res = val;
2183
2184                 Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
2185                 val.storeValue(inc);
2186
2187                 return res;
2188         }
2189
2190         const Int &operator--(Int &val)   // Pre-decrement
2191         {
2192                 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2193                 val.storeValue(inc);
2194
2195                 return val;
2196         }
2197
2198         RValue<Int> RoundInt(RValue<Float> cast)
2199         {
2200 #if defined(__i386__) || defined(__x86_64__)
2201                 return x86::cvtss2si(cast);
2202 #else
2203                 return RValue<Int>(V(lowerRoundInt(V(cast.value), T(Int::getType()))));
2204 #endif
2205         }
2206
2207         Type *Int::getType()
2208         {
2209                 return T(llvm::Type::getInt32Ty(*::context));
2210         }
2211
2212         Type *Long::getType()
2213         {
2214                 return T(llvm::Type::getInt64Ty(*::context));
2215         }
2216
2217         UInt::UInt(RValue<Float> cast)
2218         {
2219                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
2220                 // Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
2221
2222                 // Smallest positive value representable in UInt, but not in Int
2223                 const unsigned int ustart = 0x80000000u;
2224                 const float ustartf = float(ustart);
2225
2226                 // If the value is negative, store 0, otherwise store the result of the conversion
2227                 storeValue((~(As<Int>(cast) >> 31) &
2228                 // Check if the value can be represented as an Int
2229                         IfThenElse(cast >= ustartf,
2230                 // If the value is too large, subtract ustart and re-add it after conversion.
2231                                 As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
2232                 // Otherwise, just convert normally
2233                                 Int(cast))).value);
2234         }
2235
2236         RValue<UInt> operator++(UInt &val, int)   // Post-increment
2237         {
2238                 RValue<UInt> res = val;
2239
2240                 Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
2241                 val.storeValue(inc);
2242
2243                 return res;
2244         }
2245
2246         const UInt &operator++(UInt &val)   // Pre-increment
2247         {
2248                 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2249                 val.storeValue(inc);
2250
2251                 return val;
2252         }
2253
2254         RValue<UInt> operator--(UInt &val, int)   // Post-decrement
2255         {
2256                 RValue<UInt> res = val;
2257
2258                 Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
2259                 val.storeValue(inc);
2260
2261                 return res;
2262         }
2263
2264         const UInt &operator--(UInt &val)   // Pre-decrement
2265         {
2266                 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2267                 val.storeValue(inc);
2268
2269                 return val;
2270         }
2271
2272 //      RValue<UInt> RoundUInt(RValue<Float> cast)
2273 //      {
2274 //#if defined(__i386__) || defined(__x86_64__)
2275 //              return x86::cvtss2si(val);   // FIXME: Unsigned
2276 //#else
2277 //              return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2278 //#endif
2279 //      }
2280
2281         Type *UInt::getType()
2282         {
2283                 return T(llvm::Type::getInt32Ty(*::context));
2284         }
2285
2286 //      Int2::Int2(RValue<Int> cast)
2287 //      {
2288 //              Value *extend = Nucleus::createZExt(cast.value, Long::getType());
2289 //              Value *vector = Nucleus::createBitCast(extend, Int2::getType());
2290 //
2291 //              int shuffle[2] = {0, 0};
2292 //              Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2293 //
2294 //              storeValue(replicate);
2295 //      }
2296
2297         RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2298         {
2299 #if defined(__i386__) || defined(__x86_64__)
2300         //      return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
2301
2302                 return x86::pslld(lhs, rhs);
2303 #else
2304                 return As<Int2>(V(lowerVectorShl(V(lhs.value), rhs)));
2305 #endif
2306         }
2307
2308         RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2309         {
2310 #if defined(__i386__) || defined(__x86_64__)
2311         //      return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
2312
2313                 return x86::psrad(lhs, rhs);
2314 #else
2315                 return As<Int2>(V(lowerVectorAShr(V(lhs.value), rhs)));
2316 #endif
2317         }
2318
2319         Type *Int2::getType()
2320         {
2321                 return T(Type_v2i32);
2322         }
2323
2324         RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2325         {
2326 #if defined(__i386__) || defined(__x86_64__)
2327         //      return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
2328
2329                 return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2330 #else
2331                 return As<UInt2>(V(lowerVectorShl(V(lhs.value), rhs)));
2332 #endif
2333         }
2334
2335         RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2336         {
2337 #if defined(__i386__) || defined(__x86_64__)
2338         //      return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
2339
2340                 return x86::psrld(lhs, rhs);
2341 #else
2342                 return As<UInt2>(V(lowerVectorLShr(V(lhs.value), rhs)));
2343 #endif
2344         }
2345
2346         Type *UInt2::getType()
2347         {
2348                 return T(Type_v2i32);
2349         }
2350
2351         Int4::Int4(RValue<Byte4> cast) : XYZW(this)
2352         {
2353 #if defined(__i386__) || defined(__x86_64__)
2354                 if(CPUID::supportsSSE4_1())
2355                 {
2356                         *this = x86::pmovzxbd(As<Byte16>(cast));
2357                 }
2358                 else
2359 #endif
2360                 {
2361                         int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
2362                         Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
2363                         Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::getType()), swizzle);
2364
2365                         int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2366                         Value *c = Nucleus::createBitCast(b, Short8::getType());
2367                         Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::getType()), swizzle2);
2368
2369                         *this = As<Int4>(d);
2370                 }
2371         }
2372
2373         Int4::Int4(RValue<SByte4> cast) : XYZW(this)
2374         {
2375 #if defined(__i386__) || defined(__x86_64__)
2376                 if(CPUID::supportsSSE4_1())
2377                 {
2378                         *this = x86::pmovsxbd(As<SByte16>(cast));
2379                 }
2380                 else
2381 #endif
2382                 {
2383                         int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
2384                         Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
2385                         Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2386
2387                         int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
2388                         Value *c = Nucleus::createBitCast(b, Short8::getType());
2389                         Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2390
2391                         *this = As<Int4>(d) >> 24;
2392                 }
2393         }
2394
2395         Int4::Int4(RValue<Short4> cast) : XYZW(this)
2396         {
2397 #if defined(__i386__) || defined(__x86_64__)
2398                 if(CPUID::supportsSSE4_1())
2399                 {
2400                         *this = x86::pmovsxwd(As<Short8>(cast));
2401                 }
2402                 else
2403 #endif
2404                 {
2405                         int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
2406                         Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
2407                         *this = As<Int4>(c) >> 16;
2408                 }
2409         }
2410
2411         Int4::Int4(RValue<UShort4> cast) : XYZW(this)
2412         {
2413 #if defined(__i386__) || defined(__x86_64__)
2414                 if(CPUID::supportsSSE4_1())
2415                 {
2416                         *this = x86::pmovzxwd(As<UShort8>(cast));
2417                 }
2418                 else
2419 #endif
2420                 {
2421                         int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2422                         Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2423                         *this = As<Int4>(c);
2424                 }
2425         }
2426
2427         Int4::Int4(RValue<Int> rhs) : XYZW(this)
2428         {
2429                 Value *vector = loadValue();
2430                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
2431
2432                 int swizzle[4] = {0, 0, 0, 0};
2433                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2434
2435                 storeValue(replicate);
2436         }
2437
2438         RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2439         {
2440 #if defined(__i386__) || defined(__x86_64__)
2441                 return x86::pslld(lhs, rhs);
2442 #else
2443                 return As<Int4>(V(lowerVectorShl(V(lhs.value), rhs)));
2444 #endif
2445         }
2446
2447         RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2448         {
2449 #if defined(__i386__) || defined(__x86_64__)
2450                 return x86::psrad(lhs, rhs);
2451 #else
2452                 return As<Int4>(V(lowerVectorAShr(V(lhs.value), rhs)));
2453 #endif
2454         }
2455
2456         RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2457         {
2458                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2459                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2460                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
2461                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2462         }
2463
2464         RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2465         {
2466                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2467                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2468                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
2469                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2470         }
2471
2472         RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2473         {
2474                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2475                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2476                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
2477                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2478         }
2479
2480         RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2481         {
2482                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2483                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2484                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
2485                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2486         }
2487
2488         RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2489         {
2490                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2491                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2492                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
2493                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2494         }
2495
2496         RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2497         {
2498                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2499                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2500                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
2501                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2502         }
2503
2504         RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2505         {
2506 #if defined(__i386__) || defined(__x86_64__)
2507                 if(CPUID::supportsSSE4_1())
2508                 {
2509                         return x86::pmaxsd(x, y);
2510                 }
2511                 else
2512 #endif
2513                 {
2514                         RValue<Int4> greater = CmpNLE(x, y);
2515                         return (x & greater) | (y & ~greater);
2516                 }
2517         }
2518
2519         RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2520         {
2521 #if defined(__i386__) || defined(__x86_64__)
2522                 if(CPUID::supportsSSE4_1())
2523                 {
2524                         return x86::pminsd(x, y);
2525                 }
2526                 else
2527 #endif
2528                 {
2529                         RValue<Int4> less = CmpLT(x, y);
2530                         return (x & less) | (y & ~less);
2531                 }
2532         }
2533
2534         RValue<Int4> RoundInt(RValue<Float4> cast)
2535         {
2536 #if defined(__i386__) || defined(__x86_64__)
2537                 return x86::cvtps2dq(cast);
2538 #else
2539                 return As<Int4>(V(lowerRoundInt(V(cast.value), T(Int4::getType()))));
2540 #endif
2541         }
2542
2543         RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2544         {
2545                 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2546                 return As<Int4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
2547         }
2548
2549         RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2550         {
2551                 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2552                 return As<UInt4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2553         }
2554
2555         RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2556         {
2557 #if defined(__i386__) || defined(__x86_64__)
2558                 return x86::packssdw(x, y);
2559 #else
2560                 return As<Short8>(V(lowerPack(V(x.value), V(y.value), true)));
2561 #endif
2562         }
2563
2564         RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2565         {
2566 #if defined(__i386__) || defined(__x86_64__)
2567                 return x86::packusdw(x, y);
2568 #else
2569                 return As<UShort8>(V(lowerPack(V(x.value), V(y.value), false)));
2570 #endif
2571         }
2572
2573         RValue<Int> SignMask(RValue<Int4> x)
2574         {
2575 #if defined(__i386__) || defined(__x86_64__)
2576                 return x86::movmskps(As<Float4>(x));
2577 #else
2578                 return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
2579 #endif
2580         }
2581
2582         Type *Int4::getType()
2583         {
2584                 return T(llvm::VectorType::get(T(Int::getType()), 4));
2585         }
2586
2587         UInt4::UInt4(RValue<Float4> cast) : XYZW(this)
2588         {
2589                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
2590                 // Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
2591
2592                 // Smallest positive value representable in UInt, but not in Int
2593                 const unsigned int ustart = 0x80000000u;
2594                 const float ustartf = float(ustart);
2595
2596                 // Check if the value can be represented as an Int
2597                 Int4 uiValue = CmpNLT(cast, Float4(ustartf));
2598                 // If the value is too large, subtract ustart and re-add it after conversion.
2599                 uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
2600                 // Otherwise, just convert normally
2601                           (~uiValue & Int4(cast));
2602                 // If the value is negative, store 0, otherwise store the result of the conversion
2603                 storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
2604         }
2605
2606         RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2607         {
2608 #if defined(__i386__) || defined(__x86_64__)
2609                 return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2610 #else
2611                 return As<UInt4>(V(lowerVectorShl(V(lhs.value), rhs)));
2612 #endif
2613         }
2614
2615         RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2616         {
2617 #if defined(__i386__) || defined(__x86_64__)
2618                 return x86::psrld(lhs, rhs);
2619 #else
2620                 return As<UInt4>(V(lowerVectorLShr(V(lhs.value), rhs)));
2621 #endif
2622         }
2623
2624         RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2625         {
2626                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2627                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2628                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
2629                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
2630         }
2631
2632         RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2633         {
2634                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
2635         }
2636
2637         RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2638         {
2639                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2640                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2641                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
2642                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
2643         }
2644
2645         RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2646         {
2647                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
2648         }
2649
2650         RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2651         {
2652                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2653                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2654                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
2655                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
2656         }
2657
2658         RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2659         {
2660                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
2661         }
2662
2663         RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2664         {
2665 #if defined(__i386__) || defined(__x86_64__)
2666                 if(CPUID::supportsSSE4_1())
2667                 {
2668                         return x86::pmaxud(x, y);
2669                 }
2670                 else
2671 #endif
2672                 {
2673                         RValue<UInt4> greater = CmpNLE(x, y);
2674                         return (x & greater) | (y & ~greater);
2675                 }
2676         }
2677
2678         RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2679         {
2680 #if defined(__i386__) || defined(__x86_64__)
2681                 if(CPUID::supportsSSE4_1())
2682                 {
2683                         return x86::pminud(x, y);
2684                 }
2685                 else
2686 #endif
2687                 {
2688                         RValue<UInt4> less = CmpLT(x, y);
2689                         return (x & less) | (y & ~less);
2690                 }
2691         }
2692
2693         Type *UInt4::getType()
2694         {
2695                 return T(llvm::VectorType::get(T(UInt::getType()), 4));
2696         }
2697
2698         Type *Half::getType()
2699         {
2700                 return T(llvm::Type::getInt16Ty(*::context));
2701         }
2702
2703         RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
2704         {
2705 #if defined(__i386__) || defined(__x86_64__)
2706                 if(exactAtPow2)
2707                 {
2708                         // rcpss uses a piecewise-linear approximation which minimizes the relative error
2709                         // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2710                         return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2711                 }
2712                 return x86::rcpss(x);
2713 #else
2714                 return As<Float>(V(lowerRCP(V(x.value))));
2715 #endif
2716         }
2717
2718         RValue<Float> RcpSqrt_pp(RValue<Float> x)
2719         {
2720 #if defined(__i386__) || defined(__x86_64__)
2721                 return x86::rsqrtss(x);
2722 #else
2723                 return As<Float>(V(lowerRSQRT(V(x.value))));
2724 #endif
2725         }
2726
2727         RValue<Float> Sqrt(RValue<Float> x)
2728         {
2729 #if defined(__i386__) || defined(__x86_64__)
2730                 return x86::sqrtss(x);
2731 #else
2732                 return As<Float>(V(lowerSQRT(V(x.value))));
2733 #endif
2734         }
2735
2736         RValue<Float> Round(RValue<Float> x)
2737         {
2738 #if defined(__i386__) || defined(__x86_64__)
2739                 if(CPUID::supportsSSE4_1())
2740                 {
2741                         return x86::roundss(x, 0);
2742                 }
2743                 else
2744                 {
2745                         return Float4(Round(Float4(x))).x;
2746                 }
2747 #else
2748                 return RValue<Float>(V(lowerRound(V(x.value))));
2749 #endif
2750         }
2751
2752         RValue<Float> Trunc(RValue<Float> x)
2753         {
2754 #if defined(__i386__) || defined(__x86_64__)
2755                 if(CPUID::supportsSSE4_1())
2756                 {
2757                         return x86::roundss(x, 3);
2758                 }
2759                 else
2760                 {
2761                         return Float(Int(x));   // Rounded toward zero
2762                 }
2763 #else
2764                 return RValue<Float>(V(lowerTrunc(V(x.value))));
2765 #endif
2766         }
2767
2768         RValue<Float> Frac(RValue<Float> x)
2769         {
2770 #if defined(__i386__) || defined(__x86_64__)
2771                 if(CPUID::supportsSSE4_1())
2772                 {
2773                         return x - x86::floorss(x);
2774                 }
2775                 else
2776                 {
2777                         return Float4(Frac(Float4(x))).x;
2778                 }
2779 #else
2780                 // x - floor(x) can be 1.0 for very small negative x.
2781                 // Clamp against the value just below 1.0.
2782                 return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
2783 #endif
2784         }
2785
2786         RValue<Float> Floor(RValue<Float> x)
2787         {
2788 #if defined(__i386__) || defined(__x86_64__)
2789                 if(CPUID::supportsSSE4_1())
2790                 {
2791                         return x86::floorss(x);
2792                 }
2793                 else
2794                 {
2795                         return Float4(Floor(Float4(x))).x;
2796                 }
2797 #else
2798                 return RValue<Float>(V(lowerFloor(V(x.value))));
2799 #endif
2800         }
2801
2802         RValue<Float> Ceil(RValue<Float> x)
2803         {
2804 #if defined(__i386__) || defined(__x86_64__)
2805                 if(CPUID::supportsSSE4_1())
2806                 {
2807                         return x86::ceilss(x);
2808                 }
2809                 else
2810 #endif
2811                 {
2812                         return Float4(Ceil(Float4(x))).x;
2813                 }
2814         }
2815
2816         Type *Float::getType()
2817         {
2818                 return T(llvm::Type::getFloatTy(*::context));
2819         }
2820
2821         Type *Float2::getType()
2822         {
2823                 return T(Type_v2f32);
2824         }
2825
2826         Float4::Float4(RValue<Float> rhs) : XYZW(this)
2827         {
2828                 Value *vector = loadValue();
2829                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
2830
2831                 int swizzle[4] = {0, 0, 0, 0};
2832                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2833
2834                 storeValue(replicate);
2835         }
2836
2837         RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
2838         {
2839 #if defined(__i386__) || defined(__x86_64__)
2840                 return x86::maxps(x, y);
2841 #else
2842                 return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OGT)));
2843 #endif
2844         }
2845
2846         RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
2847         {
2848 #if defined(__i386__) || defined(__x86_64__)
2849                 return x86::minps(x, y);
2850 #else
2851                 return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OLT)));
2852 #endif
2853         }
2854
2855         RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
2856         {
2857 #if defined(__i386__) || defined(__x86_64__)
2858                 if(exactAtPow2)
2859                 {
2860                         // rcpps uses a piecewise-linear approximation which minimizes the relative error
2861                         // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2862                         return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2863                 }
2864                 return x86::rcpps(x);
2865 #else
2866                 return As<Float4>(V(lowerRCP(V(x.value))));
2867 #endif
2868         }
2869
2870         RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
2871         {
2872 #if defined(__i386__) || defined(__x86_64__)
2873                 return x86::rsqrtps(x);
2874 #else
2875                 return As<Float4>(V(lowerRSQRT(V(x.value))));
2876 #endif
2877         }
2878
2879         RValue<Float4> Sqrt(RValue<Float4> x)
2880         {
2881 #if defined(__i386__) || defined(__x86_64__)
2882                 return x86::sqrtps(x);
2883 #else
2884                 return As<Float4>(V(lowerSQRT(V(x.value))));
2885 #endif
2886         }
2887
2888         RValue<Int> SignMask(RValue<Float4> x)
2889         {
2890 #if defined(__i386__) || defined(__x86_64__)
2891                 return x86::movmskps(x);
2892 #else
2893                 return As<Int>(V(lowerFPSignMask(V(x.value), T(Int::getType()))));
2894 #endif
2895         }
2896
2897         RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
2898         {
2899         //      return As<Int4>(x86::cmpeqps(x, y));
2900                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
2901         }
2902
2903         RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
2904         {
2905         //      return As<Int4>(x86::cmpltps(x, y));
2906                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
2907         }
2908
2909         RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
2910         {
2911         //      return As<Int4>(x86::cmpleps(x, y));
2912                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
2913         }
2914
2915         RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
2916         {
2917         //      return As<Int4>(x86::cmpneqps(x, y));
2918                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
2919         }
2920
2921         RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
2922         {
2923         //      return As<Int4>(x86::cmpnltps(x, y));
2924                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
2925         }
2926
2927         RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
2928         {
2929         //      return As<Int4>(x86::cmpnleps(x, y));
2930                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
2931         }
2932
2933         RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
2934         {
2935                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value, y.value), Int4::getType()));
2936         }
2937
2938         RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
2939         {
2940                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value, y.value), Int4::getType()));
2941         }
2942
2943         RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
2944         {
2945                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value, y.value), Int4::getType()));
2946         }
2947
2948         RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
2949         {
2950                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value, y.value), Int4::getType()));
2951         }
2952
2953         RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
2954         {
2955                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value, y.value), Int4::getType()));
2956         }
2957
2958         RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
2959         {
2960                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value, y.value), Int4::getType()));
2961         }
2962
2963         RValue<Float4> Round(RValue<Float4> x)
2964         {
2965 #if defined(__i386__) || defined(__x86_64__)
2966                 if(CPUID::supportsSSE4_1())
2967                 {
2968                         return x86::roundps(x, 0);
2969                 }
2970                 else
2971                 {
2972                         return Float4(RoundInt(x));
2973                 }
2974 #else
2975                 return RValue<Float4>(V(lowerRound(V(x.value))));
2976 #endif
2977         }
2978
2979         RValue<Float4> Trunc(RValue<Float4> x)
2980         {
2981 #if defined(__i386__) || defined(__x86_64__)
2982                 if(CPUID::supportsSSE4_1())
2983                 {
2984                         return x86::roundps(x, 3);
2985                 }
2986                 else
2987                 {
2988                         return Float4(Int4(x));
2989                 }
2990 #else
2991                 return RValue<Float4>(V(lowerTrunc(V(x.value))));
2992 #endif
2993         }
2994
2995         RValue<Float4> Frac(RValue<Float4> x)
2996         {
2997                 Float4 frc;
2998
2999 #if defined(__i386__) || defined(__x86_64__)
3000                 if(CPUID::supportsSSE4_1())
3001                 {
3002                         frc = x - Floor(x);
3003                 }
3004                 else
3005                 {
3006                         frc = x - Float4(Int4(x));   // Signed fractional part.
3007
3008                         frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));   // Add 1.0 if negative.
3009                 }
3010 #else
3011                 frc = x - Floor(x);
3012 #endif
3013
3014                 // x - floor(x) can be 1.0 for very small negative x.
3015                 // Clamp against the value just below 1.0.
3016                 return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3017         }
3018
3019         RValue<Float4> Floor(RValue<Float4> x)
3020         {
3021 #if defined(__i386__) || defined(__x86_64__)
3022                 if(CPUID::supportsSSE4_1())
3023                 {
3024                         return x86::floorps(x);
3025                 }
3026                 else
3027                 {
3028                         return x - Frac(x);
3029                 }
3030 #else
3031                 return RValue<Float4>(V(lowerFloor(V(x.value))));
3032 #endif
3033         }
3034
3035         RValue<Float4> Ceil(RValue<Float4> x)
3036         {
3037 #if defined(__i386__) || defined(__x86_64__)
3038                 if(CPUID::supportsSSE4_1())
3039                 {
3040                         return x86::ceilps(x);
3041                 }
3042                 else
3043 #endif
3044                 {
3045                         return -Floor(-x);
3046                 }
3047         }
3048
3049         Type *Float4::getType()
3050         {
3051                 return T(llvm::VectorType::get(T(Float::getType()), 4));
3052         }
3053
3054         RValue<Long> Ticks()
3055         {
3056                 llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::readcyclecounter);
3057
3058                 return RValue<Long>(V(::builder->CreateCall(rdtsc)));
3059         }
3060 }
3061
3062 namespace rr
3063 {
3064 #if defined(__i386__) || defined(__x86_64__)
3065         namespace x86
3066         {
3067                 RValue<Int> cvtss2si(RValue<Float> val)
3068                 {
3069                         llvm::Function *cvtss2si = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_cvtss2si);
3070
3071                         Float4 vector;
3072                         vector.x = val;
3073
3074                         return RValue<Int>(V(::builder->CreateCall(cvtss2si, ARGS(V(RValue<Float4>(vector).value)))));
3075                 }
3076
3077                 RValue<Int4> cvtps2dq(RValue<Float4> val)
3078                 {
3079                         llvm::Function *cvtps2dq = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_cvtps2dq);
3080
3081                         return RValue<Int4>(V(::builder->CreateCall(cvtps2dq, ARGS(V(val.value)))));
3082                 }
3083
3084                 RValue<Float> rcpss(RValue<Float> val)
3085                 {
3086                         llvm::Function *rcpss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rcp_ss);
3087
3088                         Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
3089
3090                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rcpss, ARGS(V(vector)))), Float::getType(), 0));
3091                 }
3092
3093                 RValue<Float> sqrtss(RValue<Float> val)
3094                 {
3095 #if REACTOR_LLVM_VERSION < 7
3096                         llvm::Function *sqrtss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_sqrt_ss);
3097                         Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
3098
3099                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(sqrtss, ARGS(V(vector)))), Float::getType(), 0));
3100 #else
3101                         llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::sqrt, {V(val.value)->getType()});
3102                         return RValue<Float>(V(::builder->CreateCall(sqrt, ARGS(V(val.value)))));
3103 #endif
3104                 }
3105
3106                 RValue<Float> rsqrtss(RValue<Float> val)
3107                 {
3108                         llvm::Function *rsqrtss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rsqrt_ss);
3109
3110                         Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
3111
3112                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rsqrtss, ARGS(V(vector)))), Float::getType(), 0));
3113                 }
3114
3115                 RValue<Float4> rcpps(RValue<Float4> val)
3116                 {
3117                         llvm::Function *rcpps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rcp_ps);
3118
3119                         return RValue<Float4>(V(::builder->CreateCall(rcpps, ARGS(V(val.value)))));
3120                 }
3121
3122                 RValue<Float4> sqrtps(RValue<Float4> val)
3123                 {
3124 #if REACTOR_LLVM_VERSION < 7
3125                         llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_sqrt_ps);
3126 #else
3127                         llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::sqrt, {V(val.value)->getType()});
3128 #endif
3129
3130                         return RValue<Float4>(V(::builder->CreateCall(sqrtps, ARGS(V(val.value)))));
3131                 }
3132
3133                 RValue<Float4> rsqrtps(RValue<Float4> val)
3134                 {
3135                         llvm::Function *rsqrtps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rsqrt_ps);
3136
3137                         return RValue<Float4>(V(::builder->CreateCall(rsqrtps, ARGS(V(val.value)))));
3138                 }
3139
3140                 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3141                 {
3142                         llvm::Function *maxps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_max_ps);
3143
3144                         return RValue<Float4>(V(::builder->CreateCall2(maxps, ARGS(V(x.value), V(y.value)))));
3145                 }
3146
3147                 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3148                 {
3149                         llvm::Function *minps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_min_ps);
3150
3151                         return RValue<Float4>(V(::builder->CreateCall2(minps, ARGS(V(x.value), V(y.value)))));
3152                 }
3153
3154                 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3155                 {
3156                         llvm::Function *roundss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_round_ss);
3157
3158                         Value *undef = V(llvm::UndefValue::get(T(Float4::getType())));
3159                         Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
3160
3161                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(roundss, ARGS(V(undef), V(vector), V(Nucleus::createConstantInt(imm))))), Float::getType(), 0));
3162                 }
3163
3164                 RValue<Float> floorss(RValue<Float> val)
3165                 {
3166                         return roundss(val, 1);
3167                 }
3168
3169                 RValue<Float> ceilss(RValue<Float> val)
3170                 {
3171                         return roundss(val, 2);
3172                 }
3173
3174                 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3175                 {
3176                         llvm::Function *roundps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_round_ps);
3177
3178                         return RValue<Float4>(V(::builder->CreateCall2(roundps, ARGS(V(val.value), V(Nucleus::createConstantInt(imm))))));
3179                 }
3180
3181                 RValue<Float4> floorps(RValue<Float4> val)
3182                 {
3183                         return roundps(val, 1);
3184                 }
3185
3186                 RValue<Float4> ceilps(RValue<Float4> val)
3187                 {
3188                         return roundps(val, 2);
3189                 }
3190
3191                 RValue<Int4> pabsd(RValue<Int4> x)
3192                 {
3193 #if REACTOR_LLVM_VERSION < 7
3194                         llvm::Function *pabsd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_ssse3_pabs_d_128);
3195
3196                         return RValue<Int4>(V(::builder->CreateCall(pabsd, ARGS(V(x.value)))));
3197 #else
3198                         return RValue<Int4>(V(lowerPABS(V(x.value))));
3199 #endif
3200                 }
3201
3202                 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3203                 {
3204                         llvm::Function *paddsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_padds_w);
3205
3206                         return As<Short4>(V(::builder->CreateCall2(paddsw, ARGS(V(x.value), V(y.value)))));
3207                 }
3208
3209                 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3210                 {
3211                         llvm::Function *psubsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubs_w);
3212
3213                         return As<Short4>(V(::builder->CreateCall2(psubsw, ARGS(V(x.value), V(y.value)))));
3214                 }
3215
3216                 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3217                 {
3218                         llvm::Function *paddusw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_paddus_w);
3219
3220                         return As<UShort4>(V(::builder->CreateCall2(paddusw, ARGS(V(x.value), V(y.value)))));
3221                 }
3222
3223                 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3224                 {
3225                         llvm::Function *psubusw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubus_w);
3226
3227                         return As<UShort4>(V(::builder->CreateCall2(psubusw, ARGS(V(x.value), V(y.value)))));
3228                 }
3229
3230                 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3231                 {
3232                         llvm::Function *paddsb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_padds_b);
3233
3234                         return As<SByte8>(V(::builder->CreateCall2(paddsb, ARGS(V(x.value), V(y.value)))));
3235                 }
3236
3237                 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3238                 {
3239                         llvm::Function *psubsb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubs_b);
3240
3241                         return As<SByte8>(V(::builder->CreateCall2(psubsb, ARGS(V(x.value), V(y.value)))));
3242                 }
3243
3244                 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3245                 {
3246                         llvm::Function *paddusb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_paddus_b);
3247
3248                         return As<Byte8>(V(::builder->CreateCall2(paddusb, ARGS(V(x.value), V(y.value)))));
3249                 }
3250
3251                 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3252                 {
3253                         llvm::Function *psubusb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubus_b);
3254
3255                         return As<Byte8>(V(::builder->CreateCall2(psubusb, ARGS(V(x.value), V(y.value)))));
3256                 }
3257
3258                 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3259                 {
3260 #if REACTOR_LLVM_VERSION < 7
3261                         llvm::Function *pavgw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pavg_w);
3262
3263                         return As<UShort4>(V(::builder->CreateCall2(pavgw, ARGS(V(x.value), V(y.value)))));
3264 #else
3265                         return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
3266 #endif
3267                 }
3268
3269                 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3270                 {
3271 #if REACTOR_LLVM_VERSION < 7
3272                         llvm::Function *pmaxsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmaxs_w);
3273
3274                         return As<Short4>(V(::builder->CreateCall2(pmaxsw, ARGS(V(x.value), V(y.value)))));
3275 #else
3276                         return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
3277 #endif
3278                 }
3279
3280                 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3281                 {
3282 #if REACTOR_LLVM_VERSION < 7
3283                         llvm::Function *pminsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmins_w);
3284
3285                         return As<Short4>(V(::builder->CreateCall2(pminsw, ARGS(V(x.value), V(y.value)))));
3286 #else
3287                         return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
3288 #endif
3289                 }
3290
3291                 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3292                 {
3293 #if REACTOR_LLVM_VERSION < 7
3294                         llvm::Function *pcmpgtw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpgt_w);
3295
3296                         return As<Short4>(V(::builder->CreateCall2(pcmpgtw, ARGS(V(x.value), V(y.value)))));
3297 #else
3298                         return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
3299 #endif
3300                 }
3301
3302                 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3303                 {
3304 #if REACTOR_LLVM_VERSION < 7
3305                         llvm::Function *pcmpeqw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpeq_w);
3306
3307                         return As<Short4>(V(::builder->CreateCall2(pcmpeqw, ARGS(V(x.value), V(y.value)))));
3308 #else
3309                         return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
3310 #endif
3311                 }
3312
3313                 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3314                 {
3315 #if REACTOR_LLVM_VERSION < 7
3316                         llvm::Function *pcmpgtb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpgt_b);
3317
3318                         return As<Byte8>(V(::builder->CreateCall2(pcmpgtb, ARGS(V(x.value), V(y.value)))));
3319 #else
3320                         return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
3321 #endif
3322                 }
3323
3324                 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3325                 {
3326 #if REACTOR_LLVM_VERSION < 7
3327                         llvm::Function *pcmpeqb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpeq_b);
3328
3329                         return As<Byte8>(V(::builder->CreateCall2(pcmpeqb, ARGS(V(x.value), V(y.value)))));
3330 #else
3331                         return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
3332 #endif
3333                 }
3334
3335                 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3336                 {
3337                         llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packssdw_128);
3338
3339                         return As<Short4>(V(::builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
3340                 }
3341
3342                 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3343                 {
3344                         llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packssdw_128);
3345
3346                         return RValue<Short8>(V(::builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
3347                 }
3348
3349                 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3350                 {
3351                         llvm::Function *packsswb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packsswb_128);
3352
3353                         return As<SByte8>(V(::builder->CreateCall2(packsswb, ARGS(V(x.value), V(y.value)))));
3354                 }
3355
3356                 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3357                 {
3358                         llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packuswb_128);
3359
3360                         return As<Byte8>(V(::builder->CreateCall2(packuswb, ARGS(V(x.value), V(y.value)))));
3361                 }
3362
3363                 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3364                 {
3365                         if(CPUID::supportsSSE4_1())
3366                         {
3367                                 llvm::Function *packusdw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_packusdw);
3368
3369                                 return RValue<UShort8>(V(::builder->CreateCall2(packusdw, ARGS(V(x.value), V(y.value)))));
3370                         }
3371                         else
3372                         {
3373                                 RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3374                                 RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3375
3376                                 return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3377                         }
3378                 }
3379
3380                 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3381                 {
3382                         llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_w);
3383
3384                         return As<UShort4>(V(::builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3385                 }
3386
3387                 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3388                 {
3389                         llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_w);
3390
3391                         return RValue<UShort8>(V(::builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3392                 }
3393
3394                 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3395                 {
3396                         llvm::Function *psraw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_w);
3397
3398                         return As<Short4>(V(::builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3399                 }
3400
3401                 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3402                 {
3403                         llvm::Function *psraw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_w);
3404
3405                         return RValue<Short8>(V(::builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3406                 }
3407
3408                 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3409                 {
3410                         llvm::Function *psllw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_w);
3411
3412                         return As<Short4>(V(::builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3413                 }
3414
3415                 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3416                 {
3417                         llvm::Function *psllw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_w);
3418
3419                         return RValue<Short8>(V(::builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3420                 }
3421
3422                 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3423                 {
3424                         llvm::Function *pslld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_d);
3425
3426                         return As<Int2>(V(::builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3427                 }
3428
3429                 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3430                 {
3431                         llvm::Function *pslld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_d);
3432
3433                         return RValue<Int4>(V(::builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3434                 }
3435
3436                 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3437                 {
3438                         llvm::Function *psrad = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_d);
3439
3440                         return As<Int2>(V(::builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3441                 }
3442
3443                 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3444                 {
3445                         llvm::Function *psrad = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_d);
3446
3447                         return RValue<Int4>(V(::builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3448                 }
3449
3450                 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3451                 {
3452                         llvm::Function *psrld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_d);
3453
3454                         return As<UInt2>(V(::builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3455                 }
3456
3457                 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3458                 {
3459                         llvm::Function *psrld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_d);
3460
3461                         return RValue<UInt4>(V(::builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3462                 }
3463
3464                 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3465                 {
3466 #if REACTOR_LLVM_VERSION < 7
3467                         llvm::Function *pmaxsd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmaxsd);
3468
3469                         return RValue<Int4>(V(::builder->CreateCall2(pmaxsd, ARGS(V(x.value), V(y.value)))));
3470 #else
3471                         return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
3472 #endif
3473                 }
3474
3475                 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3476                 {
3477 #if REACTOR_LLVM_VERSION < 7
3478                         llvm::Function *pminsd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pminsd);
3479
3480                         return RValue<Int4>(V(::builder->CreateCall2(pminsd, ARGS(V(x.value), V(y.value)))));
3481 #else
3482                         return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
3483 #endif
3484                 }
3485
3486                 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3487                 {
3488 #if REACTOR_LLVM_VERSION < 7
3489                         llvm::Function *pmaxud = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmaxud);
3490
3491                         return RValue<UInt4>(V(::builder->CreateCall2(pmaxud, ARGS(V(x.value), V(y.value)))));
3492 #else
3493                         return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_UGT)));
3494 #endif
3495                 }
3496
3497                 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3498                 {
3499 #if REACTOR_LLVM_VERSION < 7
3500                         llvm::Function *pminud = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pminud);
3501
3502                         return RValue<UInt4>(V(::builder->CreateCall2(pminud, ARGS(V(x.value), V(y.value)))));
3503 #else
3504                         return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_ULT)));
3505 #endif
3506                 }
3507
3508                 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3509                 {
3510                         llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulh_w);
3511
3512                         return As<Short4>(V(::builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
3513                 }
3514
3515                 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3516                 {
3517                         llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulhu_w);
3518
3519                         return As<UShort4>(V(::builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
3520                 }
3521
3522                 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3523                 {
3524                         llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmadd_wd);
3525
3526                         return As<Int2>(V(::builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
3527                 }
3528
3529                 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3530                 {
3531                         llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulh_w);
3532
3533                         return RValue<Short8>(V(::builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
3534                 }
3535
3536                 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3537                 {
3538                         llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulhu_w);
3539
3540                         return RValue<UShort8>(V(::builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
3541                 }
3542
3543                 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
3544                 {
3545                         llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmadd_wd);
3546
3547                         return RValue<Int4>(V(::builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
3548                 }
3549
3550                 RValue<Int> movmskps(RValue<Float4> x)
3551                 {
3552                         llvm::Function *movmskps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_movmsk_ps);
3553
3554                         return RValue<Int>(V(::builder->CreateCall(movmskps, ARGS(V(x.value)))));
3555                 }
3556
3557                 RValue<Int> pmovmskb(RValue<Byte8> x)
3558                 {
3559                         llvm::Function *pmovmskb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmovmskb_128);
3560
3561                         return RValue<Int>(V(::builder->CreateCall(pmovmskb, ARGS(V(x.value))))) & 0xFF;
3562                 }
3563
3564                 RValue<Int4> pmovzxbd(RValue<Byte16> x)
3565                 {
3566 #if REACTOR_LLVM_VERSION < 7
3567                         llvm::Function *pmovzxbd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovzxbd);
3568
3569                         return RValue<Int4>(V(::builder->CreateCall(pmovzxbd, ARGS(V(x.value)))));
3570 #else
3571                         return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
3572 #endif
3573                 }
3574
3575                 RValue<Int4> pmovsxbd(RValue<SByte16> x)
3576                 {
3577 #if REACTOR_LLVM_VERSION < 7
3578                         llvm::Function *pmovsxbd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovsxbd);
3579
3580                         return RValue<Int4>(V(::builder->CreateCall(pmovsxbd, ARGS(V(x.value)))));
3581 #else
3582                         return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
3583 #endif
3584                 }
3585
3586                 RValue<Int4> pmovzxwd(RValue<UShort8> x)
3587                 {
3588 #if REACTOR_LLVM_VERSION < 7
3589                         llvm::Function *pmovzxwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovzxwd);
3590
3591                         return RValue<Int4>(V(::builder->CreateCall(pmovzxwd, ARGS(V(x.value)))));
3592 #else
3593                         return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
3594 #endif
3595                 }
3596
3597                 RValue<Int4> pmovsxwd(RValue<Short8> x)
3598                 {
3599 #if REACTOR_LLVM_VERSION < 7
3600                         llvm::Function *pmovsxwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovsxwd);
3601
3602                         return RValue<Int4>(V(::builder->CreateCall(pmovsxwd, ARGS(V(x.value)))));
3603 #else
3604                         return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
3605 #endif
3606                 }
3607         }
3608 #endif  // defined(__i386__) || defined(__x86_64__)
3609
3610 #ifdef ENABLE_RR_PRINT
3611         // extractAll returns a vector containing the extracted n scalar value of
3612         // the vector vec.
3613         static std::vector<Value*> extractAll(Value* vec, int n)
3614         {
3615                 std::vector<Value*> elements;
3616                 elements.reserve(n);
3617                 for (int i = 0; i < n; i++)
3618                 {
3619                         auto el = V(::builder->CreateExtractElement(V(vec), i));
3620                         elements.push_back(el);
3621                 }
3622                 return elements;
3623         }
3624
3625         // toDouble returns all the float values in vals extended to doubles.
3626         static std::vector<Value*> toDouble(const std::vector<Value*>& vals)
3627         {
3628                 auto doubleTy = ::llvm::Type::getDoubleTy(*::context);
3629                 std::vector<Value*> elements;
3630                 elements.reserve(vals.size());
3631                 for (auto v : vals)
3632                 {
3633                         elements.push_back(V(::builder->CreateFPExt(V(v), doubleTy)));
3634                 }
3635                 return elements;
3636         }
3637
3638         std::vector<Value*> PrintValue::Ty<Byte4>::val(const RValue<Byte4>& v) { return extractAll(v.value, 4); }
3639         std::vector<Value*> PrintValue::Ty<Int4>::val(const RValue<Int4>& v) { return extractAll(v.value, 4); }
3640         std::vector<Value*> PrintValue::Ty<UInt4>::val(const RValue<UInt4>& v) { return extractAll(v.value, 4); }
3641         std::vector<Value*> PrintValue::Ty<Short4>::val(const RValue<Short4>& v) { return extractAll(v.value, 4); }
3642         std::vector<Value*> PrintValue::Ty<UShort4>::val(const RValue<UShort4>& v) { return extractAll(v.value, 4); }
3643         std::vector<Value*> PrintValue::Ty<Float>::val(const RValue<Float>& v) { return toDouble({v.value}); }
3644         std::vector<Value*> PrintValue::Ty<Float4>::val(const RValue<Float4>& v) { return toDouble(extractAll(v.value, 4)); }
3645
3646         void Printv(const char* function, const char* file, int line, const char* fmt, std::initializer_list<PrintValue> args)
3647         {
3648                 // LLVM types used below.
3649                 auto i32Ty = ::llvm::Type::getInt32Ty(*::context);
3650                 auto intTy = ::llvm::Type::getInt64Ty(*::context); // TODO: Natural int width.
3651                 auto i8PtrTy = ::llvm::Type::getInt8PtrTy(*::context);
3652                 auto funcTy = ::llvm::FunctionType::get(i32Ty, {i8PtrTy}, true);
3653
3654                 auto func = ::module->getOrInsertFunction("printf", funcTy);
3655
3656                 // Build the printf format message string.
3657                 std::string str;
3658                 if (file != nullptr) { str += (line > 0) ? "%s:%d " : "%s "; }
3659                 if (function != nullptr) { str += "%s "; }
3660                 str += fmt;
3661
3662                 // Perform subsitution on all '{n}' bracketed indices in the format
3663                 // message.
3664                 int i = 0;
3665                 for (const PrintValue& arg : args)
3666                 {
3667                         str = replace(str, "{" + std::to_string(i++) + "}", arg.format);
3668                 }
3669
3670                 ::llvm::SmallVector<::llvm::Value*, 8> vals;
3671
3672                 // The format message is always the first argument.
3673                 vals.push_back(::builder->CreateGlobalStringPtr(str));
3674
3675                 // Add optional file, line and function info if provided.
3676                 if (file != nullptr)
3677                 {
3678                         vals.push_back(::builder->CreateGlobalStringPtr(file));
3679                         if (line > 0)
3680                         {
3681                                 vals.push_back(::llvm::ConstantInt::get(intTy, line));
3682                         }
3683                 }
3684                 if (function != nullptr)
3685                 {
3686                         vals.push_back(::builder->CreateGlobalStringPtr(function));
3687                 }
3688
3689                 // Add all format arguments.
3690                 for (const PrintValue& arg : args)
3691                 {
3692                         for (auto val : arg.values)
3693                         {
3694                                 vals.push_back(V(val));
3695                         }
3696                 }
3697
3698                 ::builder->CreateCall(func, vals);
3699         }
3700 #endif // ENABLE_RR_PRINT
3701
3702 }