src/Reactor/LLVMReactor.cpp

   1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //    http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "Reactor.hpp"
  16
  17 #include "x86.hpp"
  18 #include "CPUID.hpp"
  19 #include "Thread.hpp"
  20 #include "ExecutableMemory.hpp"
  21 #include "MutexLock.hpp"
  22
  23 #undef min
  24 #undef max
  25
  26 #if REACTOR_LLVM_VERSION < 7
  27         #include "llvm/Analysis/LoopPass.h"
  28         #include "llvm/Constants.h"
  29         #include "llvm/Function.h"
  30         #include "llvm/GlobalVariable.h"
  31         #include "llvm/Intrinsics.h"
  32         #include "llvm/LLVMContext.h"
  33         #include "llvm/Module.h"
  34         #include "llvm/PassManager.h"
  35         #include "llvm/Support/IRBuilder.h"
  36         #include "llvm/Support/TargetSelect.h"
  37         #include "llvm/Target/TargetData.h"
  38         #include "llvm/Target/TargetOptions.h"
  39         #include "llvm/Transforms/Scalar.h"
  40         #include "../lib/ExecutionEngine/JIT/JIT.h"
  41
  42         #include "LLVMRoutine.hpp"
  43         #include "LLVMRoutineManager.hpp"
  44
  45         #define ARGS(...) __VA_ARGS__
  46 #else
  47         #include "llvm/Analysis/LoopPass.h"
  48         #include "llvm/ExecutionEngine/ExecutionEngine.h"
  49         #include "llvm/ExecutionEngine/JITSymbol.h"
  50         #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
  51         #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
  52         #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
  53         #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
  54         #include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
  55         #include "llvm/ExecutionEngine/SectionMemoryManager.h"
  56         #include "llvm/IR/Constants.h"
  57         #include "llvm/IR/DataLayout.h"
  58         #include "llvm/IR/Function.h"
  59         #include "llvm/IR/GlobalVariable.h"
  60         #include "llvm/IR/IRBuilder.h"
  61         #include "llvm/IR/Intrinsics.h"
  62         #include "llvm/IR/LLVMContext.h"
  63         #include "llvm/IR/LegacyPassManager.h"
  64         #include "llvm/IR/Mangler.h"
  65         #include "llvm/IR/Module.h"
  66         #include "llvm/Support/Error.h"
  67         #include "llvm/Support/TargetSelect.h"
  68         #include "llvm/Target/TargetOptions.h"
  69         #include "llvm/Transforms/InstCombine/InstCombine.h"
  70         #include "llvm/Transforms/Scalar.h"
  71         #include "llvm/Transforms/Scalar/GVN.h"
  72
  73         #include "LLVMRoutine.hpp"
  74
  75         #define ARGS(...) {__VA_ARGS__}
  76         #define CreateCall2 CreateCall
  77         #define CreateCall3 CreateCall
  78
  79         #include <unordered_map>
  80 #endif
  81
  82 #include <fstream>
  83 #include <numeric>
  84 #include <thread>
  85
  86 #if defined(__i386__) || defined(__x86_64__)
  87 #include <xmmintrin.h>
  88 #endif
  89
  90 #include <math.h>
  91
  92 #if defined(__x86_64__) && defined(_WIN32)
// Stub definition of X86CompilationCallback. Its body only asserts, so
// reaching it in a build with assertions enabled aborts; with assertions
// compiled out it returns without doing anything.
// NOTE(review): presumably provided to satisfy a symbol expected by LLVM's
// x86 JIT on 64-bit Windows - confirm.
extern "C" void X86CompilationCallback()
{
        assert(false);   // UNIMPLEMENTED
}
  97 #endif
  98
  99 #if REACTOR_LLVM_VERSION < 7
 100 namespace llvm
 101 {
 102         extern bool JITEmitDebugInfo;
 103 }
 104 #endif
 105
 106 namespace rr
 107 {
 108         class LLVMReactorJIT;
 109 }
 110
 111 namespace
 112 {
        // File-local code generation state. Every pointer starts out null.

        // The JIT instance; its type is only forward-declared at this point.
        rr::LLVMReactorJIT *reactorJIT = nullptr;
        // IR builder through which the lower*() helpers below emit instructions.
        llvm::IRBuilder<> *builder = nullptr;
        // LLVM context used when LLVMReactorJIT::startSession() creates a module.
        llvm::LLVMContext *context = nullptr;
        // Module of the current session: allocated by startSession() and reset
        // to null by endSession().
        llvm::Module *module = nullptr;
        // Function currently being generated; reset to null by endSession().
        llvm::Function *function = nullptr;

        // NOTE(review): presumably serializes access to the state above across
        // threads - confirm against the code that locks it.
        rr::MutexLock codegenMutex;
 120
 121 #ifdef ENABLE_RR_PRINT
 122         std::string replace(std::string str, const std::string& substr, const std::string& replacement)
 123         {
 124                 size_t pos = 0;
 125                 while((pos = str.find(substr, pos)) != std::string::npos) {
 126                         str.replace(pos, substr.length(), replacement);
 127                         pos += replacement.length();
 128                 }
 129                 return str;
 130         }
 131 #endif // ENABLE_RR_PRINT
 132
 133 #if REACTOR_LLVM_VERSION >= 7
 134         llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
 135         {
 136                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 137
 138                 llvm::VectorType *extTy =
 139                         llvm::VectorType::getExtendedElementVectorType(ty);
 140                 x = ::builder->CreateZExt(x, extTy);
 141                 y = ::builder->CreateZExt(y, extTy);
 142
 143                 // (x + y + 1) >> 1
 144                 llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
 145                 llvm::Value *res = ::builder->CreateAdd(x, y);
 146                 res = ::builder->CreateAdd(res, one);
 147                 res = ::builder->CreateLShr(res, one);
 148                 return ::builder->CreateTrunc(res, ty);
 149         }
 150
 151         llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
 152                                   llvm::ICmpInst::Predicate pred)
 153         {
 154                 return ::builder->CreateSelect(::builder->CreateICmp(pred, x, y), x, y);
 155         }
 156
 157         llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
 158                                llvm::Value *y, llvm::Type *dstTy)
 159         {
 160                 return ::builder->CreateSExt(::builder->CreateICmp(pred, x, y), dstTy, "");
 161         }
 162
 163 #if defined(__i386__) || defined(__x86_64__)
 164         llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
 165         {
 166                 llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
 167                 llvm::VectorType *dstTy = llvm::cast<llvm::VectorType>(dstType);
 168
 169                 llvm::Value *undef = llvm::UndefValue::get(srcTy);
 170                 llvm::SmallVector<uint32_t, 16> mask(dstTy->getNumElements());
 171                 std::iota(mask.begin(), mask.end(), 0);
 172                 llvm::Value *v = ::builder->CreateShuffleVector(op, undef, mask);
 173
 174                 return sext ? ::builder->CreateSExt(v, dstTy)
 175                             : ::builder->CreateZExt(v, dstTy);
 176         }
 177
 178         llvm::Value *lowerPABS(llvm::Value *v)
 179         {
 180                 llvm::Value *zero = llvm::Constant::getNullValue(v->getType());
 181                 llvm::Value *cmp = ::builder->CreateICmp(llvm::ICmpInst::ICMP_SGT, v, zero);
 182                 llvm::Value *neg = ::builder->CreateNeg(v);
 183                 return ::builder->CreateSelect(cmp, v, neg);
 184         }
 185 #endif  // defined(__i386__) || defined(__x86_64__)
 186
 187 #if !defined(__i386__) && !defined(__x86_64__)
 188         llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
 189                                    llvm::FCmpInst::Predicate pred)
 190         {
 191                 return ::builder->CreateSelect(::builder->CreateFCmp(pred, x, y), x, y);
 192         }
 193
 194         llvm::Value *lowerRound(llvm::Value *x)
 195         {
 196                 llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
 197                         ::module, llvm::Intrinsic::nearbyint, {x->getType()});
 198                 return ::builder->CreateCall(nearbyint, ARGS(x));
 199         }
 200
 201         llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
 202         {
 203                 return ::builder->CreateFPToSI(lowerRound(x), ty);
 204         }
 205
 206         llvm::Value *lowerFloor(llvm::Value *x)
 207         {
 208                 llvm::Function *floor = llvm::Intrinsic::getDeclaration(
 209                         ::module, llvm::Intrinsic::floor, {x->getType()});
 210                 return ::builder->CreateCall(floor, ARGS(x));
 211         }
 212
 213         llvm::Value *lowerTrunc(llvm::Value *x)
 214         {
 215                 llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
 216                         ::module, llvm::Intrinsic::trunc, {x->getType()});
 217                 return ::builder->CreateCall(trunc, ARGS(x));
 218         }
 219
 220         // Packed add/sub saturatation
 221         llvm::Value *lowerPSAT(llvm::Value *x, llvm::Value *y, bool isAdd, bool isSigned)
 222         {
 223                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 224                 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
 225
 226                 unsigned numBits = ty->getScalarSizeInBits();
 227
 228                 llvm::Value *max, *min, *extX, *extY;
 229                 if (isSigned)
 230                 {
 231                         max = llvm::ConstantInt::get(extTy, (1LL << (numBits - 1)) - 1, true);
 232                         min = llvm::ConstantInt::get(extTy, (-1LL << (numBits - 1)), true);
 233                         extX = ::builder->CreateSExt(x, extTy);
 234                         extY = ::builder->CreateSExt(y, extTy);
 235                 }
 236                 else
 237                 {
 238                         assert(numBits <= 64);
 239                         uint64_t maxVal = (numBits == 64) ? ~0ULL : (1ULL << numBits) - 1;
 240                         max = llvm::ConstantInt::get(extTy, maxVal, false);
 241                         min = llvm::ConstantInt::get(extTy, 0, false);
 242                         extX = ::builder->CreateZExt(x, extTy);
 243                         extY = ::builder->CreateZExt(y, extTy);
 244                 }
 245
 246                 llvm::Value *res = isAdd ? ::builder->CreateAdd(extX, extY)
 247                                          : ::builder->CreateSub(extX, extY);
 248
 249                 res = lowerPMINMAX(res, min, llvm::ICmpInst::ICMP_SGT);
 250                 res = lowerPMINMAX(res, max, llvm::ICmpInst::ICMP_SLT);
 251
 252                 return ::builder->CreateTrunc(res, ty);
 253         }
 254
 255         llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
 256         {
 257                 return lowerPSAT(x, y, true, false);
 258         }
 259
 260         llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
 261         {
 262                 return lowerPSAT(x, y, true, true);
 263         }
 264
 265         llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
 266         {
 267                 return lowerPSAT(x, y, false, false);
 268         }
 269
 270         llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
 271         {
 272                 return lowerPSAT(x, y, false, true);
 273         }
 274
 275         llvm::Value *lowerSQRT(llvm::Value *x)
 276         {
 277                 llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
 278                         ::module, llvm::Intrinsic::sqrt, {x->getType()});
 279                 return ::builder->CreateCall(sqrt, ARGS(x));
 280         }
 281
 282         llvm::Value *lowerRCP(llvm::Value *x)
 283         {
 284                 llvm::Type *ty = x->getType();
 285                 llvm::Constant *one;
 286                 if (llvm::VectorType *vectorTy = llvm::dyn_cast<llvm::VectorType>(ty))
 287                 {
 288                         one = llvm::ConstantVector::getSplat(
 289                                 vectorTy->getNumElements(),
 290                                 llvm::ConstantFP::get(vectorTy->getElementType(), 1));
 291                 }
 292                 else
 293                 {
 294                         one = llvm::ConstantFP::get(ty, 1);
 295                 }
 296                 return ::builder->CreateFDiv(one, x);
 297         }
 298
 299         llvm::Value *lowerRSQRT(llvm::Value *x)
 300         {
 301                 return lowerRCP(lowerSQRT(x));
 302         }
 303
 304         llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
 305         {
 306                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 307                 llvm::Value *y = llvm::ConstantVector::getSplat(
 308                         ty->getNumElements(),
 309                         llvm::ConstantInt::get(ty->getElementType(), scalarY));
 310                 return ::builder->CreateShl(x, y);
 311         }
 312
 313         llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
 314         {
 315                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 316                 llvm::Value *y = llvm::ConstantVector::getSplat(
 317                         ty->getNumElements(),
 318                         llvm::ConstantInt::get(ty->getElementType(), scalarY));
 319                 return ::builder->CreateAShr(x, y);
 320         }
 321
 322         llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
 323         {
 324                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 325                 llvm::Value *y = llvm::ConstantVector::getSplat(
 326                         ty->getNumElements(),
 327                         llvm::ConstantInt::get(ty->getElementType(), scalarY));
 328                 return ::builder->CreateLShr(x, y);
 329         }
 330
 331         llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
 332         {
 333                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 334                 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
 335
 336                 llvm::Value *extX = ::builder->CreateSExt(x, extTy);
 337                 llvm::Value *extY = ::builder->CreateSExt(y, extTy);
 338                 llvm::Value *mult = ::builder->CreateMul(extX, extY);
 339
 340                 llvm::Value *undef = llvm::UndefValue::get(extTy);
 341
 342                 llvm::SmallVector<uint32_t, 16> evenIdx;
 343                 llvm::SmallVector<uint32_t, 16> oddIdx;
 344                 for (uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
 345                 {
 346                         evenIdx.push_back(i);
 347                         oddIdx.push_back(i + 1);
 348                 }
 349
 350                 llvm::Value *lhs = ::builder->CreateShuffleVector(mult, undef, evenIdx);
 351                 llvm::Value *rhs = ::builder->CreateShuffleVector(mult, undef, oddIdx);
 352                 return ::builder->CreateAdd(lhs, rhs);
 353         }
 354
 355         llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
 356         {
 357                 llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(x->getType());
 358                 llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
 359
 360                 llvm::IntegerType *dstElemTy =
 361                         llvm::cast<llvm::IntegerType>(dstTy->getElementType());
 362
 363                 uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
 364                 assert(truncNumBits < 64 && "shift 64 must be handled separately");
 365                 llvm::Constant *max, *min;
 366                 if (isSigned)
 367                 {
 368                         max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
 369                         min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
 370                 }
 371                 else
 372                 {
 373                         max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
 374                         min = llvm::ConstantInt::get(srcTy, 0, false);
 375                 }
 376
 377                 x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
 378                 x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
 379                 y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
 380                 y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
 381
 382                 x = ::builder->CreateTrunc(x, dstTy);
 383                 y = ::builder->CreateTrunc(y, dstTy);
 384
 385                 llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
 386                 std::iota(index.begin(), index.end(), 0);
 387
 388                 return ::builder->CreateShuffleVector(x, y, index);
 389         }
 390
 391         llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
 392         {
 393                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 394                 llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
 395                 llvm::Value *cmp = ::builder->CreateICmpSLT(x, zero);
 396
 397                 llvm::Value *ret = ::builder->CreateZExt(
 398                         ::builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
 399                 for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
 400                 {
 401                         llvm::Value *elem = ::builder->CreateZExt(
 402                                 ::builder->CreateExtractElement(cmp, i), retTy);
 403                         ret = ::builder->CreateOr(ret, ::builder->CreateShl(elem, i));
 404                 }
 405                 return ret;
 406         }
 407
 408         llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
 409         {
 410                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 411                 llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
 412                 llvm::Value *cmp = ::builder->CreateFCmpULT(x, zero);
 413
 414                 llvm::Value *ret = ::builder->CreateZExt(
 415                         ::builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
 416                 for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
 417                 {
 418                         llvm::Value *elem = ::builder->CreateZExt(
 419                                 ::builder->CreateExtractElement(cmp, i), retTy);
 420                         ret = ::builder->CreateOr(ret, ::builder->CreateShl(elem, i));
 421                 }
 422                 return ret;
 423         }
 424 #endif  // !defined(__i386__) && !defined(__x86_64__)
 425 #endif  // REACTOR_LLVM_VERSION >= 7
 426
 427         llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
 428         {
 429                 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
 430                 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
 431
 432                 llvm::Value *extX, *extY;
 433                 if (sext)
 434                 {
 435                         extX = ::builder->CreateSExt(x, extTy);
 436                         extY = ::builder->CreateSExt(y, extTy);
 437                 }
 438                 else
 439                 {
 440                         extX = ::builder->CreateZExt(x, extTy);
 441                         extY = ::builder->CreateZExt(y, extTy);
 442                 }
 443
 444                 llvm::Value *mult = ::builder->CreateMul(extX, extY);
 445
 446                 llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
 447                 llvm::Value *mulh = ::builder->CreateAShr(mult, intTy->getBitWidth());
 448                 return ::builder->CreateTrunc(mulh, ty);
 449         }
 450 }
 451
 452 namespace rr
 453 {
 454 #if REACTOR_LLVM_VERSION < 7
 455         class LLVMReactorJIT
 456         {
 457         private:
 458                 std::string arch;
 459                 llvm::SmallVector<std::string, 16> mattrs;
 460                 llvm::ExecutionEngine *executionEngine;
 461                 LLVMRoutineManager *routineManager;
 462
 463         public:
 464                 LLVMReactorJIT(const std::string &arch_,
 465                                const llvm::SmallVectorImpl<std::string> &mattrs_) :
 466                         arch(arch_),
 467                         mattrs(mattrs_.begin(), mattrs_.end()),
 468                         executionEngine(nullptr),
 469                         routineManager(nullptr)
 470                 {
 471                 }
 472
 473                 void startSession()
 474                 {
 475                         std::string error;
 476
 477                         ::module = new llvm::Module("", *::context);
 478
 479                         routineManager = new LLVMRoutineManager();
 480
 481                         llvm::TargetMachine *targetMachine =
 482                                 llvm::EngineBuilder::selectTarget(
 483                                         ::module, arch, "", mattrs, llvm::Reloc::Default,
 484                                         llvm::CodeModel::JITDefault, &error);
 485
 486                         executionEngine = llvm::JIT::createJIT(
 487                                 ::module, &error, routineManager, llvm::CodeGenOpt::Aggressive,
 488                                 true, targetMachine);
 489                 }
 490
 491                 void endSession()
 492                 {
 493                         delete executionEngine;
 494                         executionEngine = nullptr;
 495                         routineManager = nullptr;
 496
 497                         ::function = nullptr;
 498                         ::module = nullptr;
 499                 }
 500
 501                 LLVMRoutine *acquireRoutine(llvm::Function *func)
 502                 {
 503                         void *entry = executionEngine->getPointerToFunction(::function);
 504                         return routineManager->acquireRoutine(entry);
 505                 }
 506
 507                 void optimize(llvm::Module *module)
 508                 {
 509                         static llvm::PassManager *passManager = nullptr;
 510
 511                         if(!passManager)
 512                         {
 513                                 passManager = new llvm::PassManager();
 514
 515                                 passManager->add(new llvm::TargetData(*executionEngine->getTargetData()));
 516                                 passManager->add(llvm::createScalarReplAggregatesPass());
 517
 518                                 for(int pass = 0; pass < 10 && optimization[pass] != Disabled; pass++)
 519                                 {
 520                                         switch(optimization[pass])
 521                                         {
 522                                         case Disabled:                                                                       break;
 523                                         case CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
 524                                         case LICM:                 passManager->add(llvm::createLICMPass());                 break;
 525                                         case AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
 526                                         case GVN:                  passManager->add(llvm::createGVNPass());                  break;
 527                                         case InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
 528                                         case Reassociate:          passManager->add(llvm::createReassociatePass());          break;
 529                                         case DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
 530                                         case SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
 531                                         case ScalarReplAggregates: passManager->add(llvm::createScalarReplAggregatesPass()); break;
 532                                         default:
 533                                                 assert(false);
 534                                         }
 535                                 }
 536                         }
 537
 538                         passManager->run(*::module);
 539                 }
 540         };
 541 #else
 542         class ExternalFunctionSymbolResolver
 543         {
 544         private:
 545                 using FunctionMap = std::unordered_map<std::string, void *>;
 546                 FunctionMap func_;
 547
 548         public:
 549                 ExternalFunctionSymbolResolver()
 550                 {
 551                         func_.emplace("floorf", reinterpret_cast<void*>(floorf));
 552                         func_.emplace("nearbyintf", reinterpret_cast<void*>(nearbyintf));
 553                         func_.emplace("truncf", reinterpret_cast<void*>(truncf));
 554                         func_.emplace("printf", reinterpret_cast<void*>(printf));
 555                         func_.emplace("puts", reinterpret_cast<void*>(puts));
 556                         func_.emplace("fmodf", reinterpret_cast<void*>(fmodf));
 557                 }
 558
 559                 void *findSymbol(const std::string &name) const
 560                 {
 561                         // Trim off any underscores from the start of the symbol. LLVM likes
 562                         // to append these on macOS.
 563                         const char* trimmed = name.c_str();
 564                         while (trimmed[0] == '_') { trimmed++; }
 565
 566                         FunctionMap::const_iterator it = func_.find(trimmed);
 567                         assert(it != func_.end()); // Missing functions will likely make the module fail in exciting non-obvious ways.
 568                         return it->second;
 569                 }
 570         };
 571
 572         class LLVMReactorJIT
 573         {
 574         private:
 575                 using ObjLayer = llvm::orc::RTDyldObjectLinkingLayer;
 576                 using CompileLayer = llvm::orc::IRCompileLayer<ObjLayer, llvm::orc::SimpleCompiler>;
 577
 578                 llvm::orc::ExecutionSession session;
 579                 ExternalFunctionSymbolResolver externalSymbolResolver;
 580                 std::shared_ptr<llvm::orc::SymbolResolver> resolver;
 581                 std::unique_ptr<llvm::TargetMachine> targetMachine;
 582                 const llvm::DataLayout dataLayout;
 583                 ObjLayer objLayer;
 584                 CompileLayer compileLayer;
 585                 size_t emittedFunctionsNum;
 586
 587         public:
 588                 LLVMReactorJIT(const char *arch, const llvm::SmallVectorImpl<std::string>& mattrs,
 589                                            const llvm::TargetOptions &targetOpts):
 590                         resolver(createLegacyLookupResolver(
 591                                 session,
 592                                 [this](const std::string &name) {
 593                                         void *func = externalSymbolResolver.findSymbol(name);
 594                                         if (func != nullptr)
 595                                         {
 596                                                 return llvm::JITSymbol(
 597                                                         reinterpret_cast<uintptr_t>(func), llvm::JITSymbolFlags::Absolute);
 598                                         }
 599
 600                                         return objLayer.findSymbol(name, true);
 601                                 },
 602                                 [](llvm::Error err) {
 603                                         if (err)
 604                                         {
 605                                                 // TODO: Log the symbol resolution errors.
 606                                                 return;
 607                                         }
 608                                 })),
 609                         targetMachine(llvm::EngineBuilder()
 610                                 .setMArch(arch)
 611                                 .setMAttrs(mattrs)
 612                                 .setTargetOptions(targetOpts)
 613                                 .selectTarget()),
 614                         dataLayout(targetMachine->createDataLayout()),
 615                         objLayer(
 616                                 session,
 617                                 [this](llvm::orc::VModuleKey) {
 618                                         return ObjLayer::Resources{
 619                                                 std::make_shared<llvm::SectionMemoryManager>(),
 620                                                 resolver};
 621                                 }),
 622                         compileLayer(objLayer, llvm::orc::SimpleCompiler(*targetMachine)),
 623                         emittedFunctionsNum(0)
 624                 {
 625                 }
 626
 627                 void startSession()
 628                 {
 629                         ::module = new llvm::Module("", *::context);
 630                 }
 631
 632                 void endSession()
 633                 {
 634                         ::function = nullptr;
 635                         ::module = nullptr;
 636                 }
 637
 638                 LLVMRoutine *acquireRoutine(llvm::Function *func)
 639                 {
 640                         std::string name = "f" + llvm::Twine(emittedFunctionsNum++).str();
 641                         func->setName(name);
 642                         func->setLinkage(llvm::GlobalValue::ExternalLinkage);
 643                         func->setDoesNotThrow();
 644
 645                         std::unique_ptr<llvm::Module> mod(::module);
 646                         ::module = nullptr;
 647                         mod->setDataLayout(dataLayout);
 648
 649                         auto moduleKey = session.allocateVModule();
 650                         llvm::cantFail(compileLayer.addModule(moduleKey, std::move(mod)));
 651
 652                         std::string mangledName;
 653                         {
 654                                 llvm::raw_string_ostream mangledNameStream(mangledName);
 655                                 llvm::Mangler::getNameWithPrefix(mangledNameStream, name, dataLayout);
 656                         }
 657
 658                         llvm::JITSymbol symbol = compileLayer.findSymbolIn(moduleKey, mangledName, false);
 659
 660                         llvm::Expected<llvm::JITTargetAddress> expectAddr = symbol.getAddress();
 661                         if(!expectAddr)
 662                         {
 663                                 return nullptr;
 664                         }
 665
 666                         void *addr = reinterpret_cast<void *>(static_cast<intptr_t>(expectAddr.get()));
 667                         return new LLVMRoutine(addr, releaseRoutineCallback, this, moduleKey);
 668                 }
 669
 670                 void optimize(llvm::Module *module)
 671                 {
 672                         std::unique_ptr<llvm::legacy::PassManager> passManager(
 673                                 new llvm::legacy::PassManager());
 674
 675                         passManager->add(llvm::createSROAPass());
 676
 677                         for(int pass = 0; pass < 10 && optimization[pass] != Disabled; pass++)
 678                         {
 679                                 switch(optimization[pass])
 680                                 {
 681                                 case Disabled:                                                                       break;
 682                                 case CFGSimplification:    passManager->add(llvm::createCFGSimplificationPass());    break;
 683                                 case LICM:                 passManager->add(llvm::createLICMPass());                 break;
 684                                 case AggressiveDCE:        passManager->add(llvm::createAggressiveDCEPass());        break;
 685                                 case GVN:                  passManager->add(llvm::createGVNPass());                  break;
 686                                 case InstructionCombining: passManager->add(llvm::createInstructionCombiningPass()); break;
 687                                 case Reassociate:          passManager->add(llvm::createReassociatePass());          break;
 688                                 case DeadStoreElimination: passManager->add(llvm::createDeadStoreEliminationPass()); break;
 689                                 case SCCP:                 passManager->add(llvm::createSCCPPass());                 break;
 690                                 case ScalarReplAggregates: passManager->add(llvm::createSROAPass());                 break;
 691                                 default:
 692                                                            assert(false);
 693                                 }
 694                         }
 695
 696                         passManager->run(*::module);
 697                 }
 698
 699         private:
 700                 void releaseRoutineModule(llvm::orc::VModuleKey moduleKey)
 701                 {
 702                         llvm::cantFail(compileLayer.removeModule(moduleKey));
 703                 }
 704
 705                 static void releaseRoutineCallback(LLVMReactorJIT *jit, uint64_t moduleKey)
 706                 {
 707                         jit->releaseRoutineModule(moduleKey);
 708                 }
 709         };
 710 #endif
 711
 712         Optimization optimization[10] = {InstructionCombining, Disabled};
 713
 714         // The abstract Type* types are implemented as LLVM types, except that
 715         // 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
 716         // and VFP in ARM, and eliminate the overhead of converting them to explicit
 717         // 128-bit ones. LLVM types are pointers, so we can represent emulated types
 718         // as abstract pointers with small enum values.
 719         enum InternalType : uintptr_t
 720         {
 721                 // Emulated types:
 722                 Type_v2i32,
 723                 Type_v4i16,
 724                 Type_v2i16,
 725                 Type_v8i8,
 726                 Type_v4i8,
 727                 Type_v2f32,
 728                 EmulatedTypeCount,
 729                 // Returned by asInternalType() to indicate that the abstract Type*
 730                 // should be interpreted as LLVM type pointer:
 731                 Type_LLVM
 732         };
 733
 734         inline InternalType asInternalType(Type *type)
 735         {
 736                 InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
 737                 return (t < EmulatedTypeCount) ? t : Type_LLVM;
 738         }
 739
 740         llvm::Type *T(Type *t)
 741         {
 742                 // Use 128-bit vectors to implement logically shorter ones.
 743                 switch(asInternalType(t))
 744                 {
 745                 case Type_v2i32: return T(Int4::getType());
 746                 case Type_v4i16: return T(Short8::getType());
 747                 case Type_v2i16: return T(Short8::getType());
 748                 case Type_v8i8:  return T(Byte16::getType());
 749                 case Type_v4i8:  return T(Byte16::getType());
 750                 case Type_v2f32: return T(Float4::getType());
 751                 case Type_LLVM:  return reinterpret_cast<llvm::Type*>(t);
 752                 default: assert(false); return nullptr;
 753                 }
 754         }
 755
 756         inline Type *T(llvm::Type *t)
 757         {
 758                 return reinterpret_cast<Type*>(t);
 759         }
 760
 761         Type *T(InternalType t)
 762         {
 763                 return reinterpret_cast<Type*>(t);
 764         }
 765
 766         inline llvm::Value *V(Value *t)
 767         {
 768                 return reinterpret_cast<llvm::Value*>(t);
 769         }
 770
 771         inline Value *V(llvm::Value *t)
 772         {
 773                 return reinterpret_cast<Value*>(t);
 774         }
 775
 776         inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
 777         {
 778                 return reinterpret_cast<std::vector<llvm::Type*>&>(t);
 779         }
 780
 781         inline llvm::BasicBlock *B(BasicBlock *t)
 782         {
 783                 return reinterpret_cast<llvm::BasicBlock*>(t);
 784         }
 785
 786         inline BasicBlock *B(llvm::BasicBlock *t)
 787         {
 788                 return reinterpret_cast<BasicBlock*>(t);
 789         }
 790
 791         static size_t typeSize(Type *type)
 792         {
 793                 switch(asInternalType(type))
 794                 {
 795                 case Type_v2i32: return 8;
 796                 case Type_v4i16: return 8;
 797                 case Type_v2i16: return 4;
 798                 case Type_v8i8:  return 8;
 799                 case Type_v4i8:  return 4;
 800                 case Type_v2f32: return 8;
 801                 case Type_LLVM:
 802                         {
 803                                 llvm::Type *t = T(type);
 804
 805                                 if(t->isPointerTy())
 806                                 {
 807                                         return sizeof(void*);
 808                                 }
 809
 810                                 // At this point we should only have LLVM 'primitive' types.
 811                                 unsigned int bits = t->getPrimitiveSizeInBits();
 812                                 assert(bits != 0);
 813
 814                                 // TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
 815                                 // but are typically stored as one byte. The DataLayout structure should
 816                                 // be used here and many other places if this assumption fails.
 817                                 return (bits + 7) / 8;
 818                         }
 819                         break;
 820                 default:
 821                         assert(false);
 822                         return 0;
 823                 }
 824         }
 825
 826         static unsigned int elementCount(Type *type)
 827         {
 828                 switch(asInternalType(type))
 829                 {
 830                 case Type_v2i32: return 2;
 831                 case Type_v4i16: return 4;
 832                 case Type_v2i16: return 2;
 833                 case Type_v8i8:  return 8;
 834                 case Type_v4i8:  return 4;
 835                 case Type_v2f32: return 2;
 836                 case Type_LLVM:  return llvm::cast<llvm::VectorType>(T(type))->getNumElements();
 837                 default: assert(false); return 0;
 838                 }
 839         }
 840
 841         static llvm::AtomicOrdering atomicOrdering(bool atomic, std::memory_order memoryOrder)
 842         {
 843                 #if REACTOR_LLVM_VERSION < 7
 844                         return llvm::AtomicOrdering::NotAtomic;
 845                 #endif
 846
 847                 if(!atomic)
 848                 {
 849                         return llvm::AtomicOrdering::NotAtomic;
 850                 }
 851
 852                 switch(memoryOrder)
 853                 {
 854                 case std::memory_order_relaxed: return llvm::AtomicOrdering::Monotonic;  // https://llvm.org/docs/Atomics.html#monotonic
 855                 case std::memory_order_consume: return llvm::AtomicOrdering::Acquire;    // https://llvm.org/docs/Atomics.html#acquire: "It should also be used for C++11/C11 memory_order_consume."
 856                 case std::memory_order_acquire: return llvm::AtomicOrdering::Acquire;
 857                 case std::memory_order_release: return llvm::AtomicOrdering::Release;
 858                 case std::memory_order_acq_rel: return llvm::AtomicOrdering::AcquireRelease;
 859                 case std::memory_order_seq_cst: return llvm::AtomicOrdering::SequentiallyConsistent;
 860                 default: assert(false);         return llvm::AtomicOrdering::AcquireRelease;
 861                 }
 862         }
 863
 864         Nucleus::Nucleus()
 865         {
 866                 ::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
 867
 868                 llvm::InitializeNativeTarget();
 869
 870 #if REACTOR_LLVM_VERSION >= 7
 871                 llvm::InitializeNativeTargetAsmPrinter();
 872                 llvm::InitializeNativeTargetAsmParser();
 873 #endif
 874
 875                 if(!::context)
 876                 {
 877                         ::context = new llvm::LLVMContext();
 878                 }
 879
 880                 #if defined(__x86_64__)
 881                         static const char arch[] = "x86-64";
 882                 #elif defined(__i386__)
 883                         static const char arch[] = "x86";
 884                 #elif defined(__aarch64__)
 885                         static const char arch[] = "arm64";
 886                 #elif defined(__arm__)
 887                         static const char arch[] = "arm";
 888                 #elif defined(__mips__)
 889                         #if defined(__mips64)
 890                             static const char arch[] = "mips64el";
 891                         #else
 892                             static const char arch[] = "mipsel";
 893                         #endif
 894                 #else
 895                 #error "unknown architecture"
 896                 #endif
 897
 898                 llvm::SmallVector<std::string, 1> mattrs;
 899 #if defined(__i386__) || defined(__x86_64__)
 900                 mattrs.push_back(CPUID::supportsMMX()    ? "+mmx"    : "-mmx");
 901                 mattrs.push_back(CPUID::supportsCMOV()   ? "+cmov"   : "-cmov");
 902                 mattrs.push_back(CPUID::supportsSSE()    ? "+sse"    : "-sse");
 903                 mattrs.push_back(CPUID::supportsSSE2()   ? "+sse2"   : "-sse2");
 904                 mattrs.push_back(CPUID::supportsSSE3()   ? "+sse3"   : "-sse3");
 905                 mattrs.push_back(CPUID::supportsSSSE3()  ? "+ssse3"  : "-ssse3");
 906 #if REACTOR_LLVM_VERSION < 7
 907                 mattrs.push_back(CPUID::supportsSSE4_1() ? "+sse41"  : "-sse41");
 908 #else
 909                 mattrs.push_back(CPUID::supportsSSE4_1() ? "+sse4.1" : "-sse4.1");
 910 #endif
 911 #elif defined(__arm__)
 912 #if __ARM_ARCH >= 8
 913                 mattrs.push_back("+armv8-a");
 914 #else
 915                 // armv7-a requires compiler-rt routines; otherwise, compiled kernel
 916                 // might fail to link.
 917 #endif
 918 #endif
 919
 920 #if REACTOR_LLVM_VERSION < 7
 921                 llvm::JITEmitDebugInfo = false;
 922                 llvm::UnsafeFPMath = true;
 923                 // llvm::NoInfsFPMath = true;
 924                 // llvm::NoNaNsFPMath = true;
 925 #else
 926                 llvm::TargetOptions targetOpts;
 927                 targetOpts.UnsafeFPMath = false;
 928                 // targetOpts.NoInfsFPMath = true;
 929                 // targetOpts.NoNaNsFPMath = true;
 930 #endif
 931
 932                 if(!::reactorJIT)
 933                 {
 934 #if REACTOR_LLVM_VERSION < 7
 935                         ::reactorJIT = new LLVMReactorJIT(arch, mattrs);
 936 #else
 937                         ::reactorJIT = new LLVMReactorJIT(arch, mattrs, targetOpts);
 938 #endif
 939                 }
 940
 941                 ::reactorJIT->startSession();
 942
 943                 if(!::builder)
 944                 {
 945                         ::builder = new llvm::IRBuilder<>(*::context);
 946                 }
 947         }
 948
 949         Nucleus::~Nucleus()
 950         {
 951                 ::reactorJIT->endSession();
 952
 953                 ::codegenMutex.unlock();
 954         }
 955
 956         Routine *Nucleus::acquireRoutine(const char *name, bool runOptimizations)
 957         {
 958                 if(::builder->GetInsertBlock()->empty() || !::builder->GetInsertBlock()->back().isTerminator())
 959                 {
 960                         llvm::Type *type = ::function->getReturnType();
 961
 962                         if(type->isVoidTy())
 963                         {
 964                                 createRetVoid();
 965                         }
 966                         else
 967                         {
 968                                 createRet(V(llvm::UndefValue::get(type)));
 969                         }
 970                 }
 971
 972                 if(false)
 973                 {
 974                         #if REACTOR_LLVM_VERSION < 7
 975                                 std::string error;
 976                                 llvm::raw_fd_ostream file((std::string(name) + "-llvm-dump-unopt.txt").c_str(), error);
 977                         #else
 978                                 std::error_code error;
 979                                 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
 980                         #endif
 981
 982                         ::module->print(file, 0);
 983                 }
 984
 985                 if(runOptimizations)
 986                 {
 987                         optimize();
 988                 }
 989
 990                 if(false)
 991                 {
 992                         #if REACTOR_LLVM_VERSION < 7
 993                                 std::string error;
 994                                 llvm::raw_fd_ostream file((std::string(name) + "-llvm-dump-opt.txt").c_str(), error);
 995                         #else
 996                                 std::error_code error;
 997                                 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
 998                         #endif
 999
1000                         ::module->print(file, 0);
1001                 }
1002
1003                 LLVMRoutine *routine = ::reactorJIT->acquireRoutine(::function);
1004
1005                 return routine;
1006         }
1007
1008         void Nucleus::optimize()
1009         {
1010                 ::reactorJIT->optimize(::module);
1011         }
1012
1013         Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
1014         {
1015                 // Need to allocate it in the entry block for mem2reg to work
1016                 llvm::BasicBlock &entryBlock = ::function->getEntryBlock();
1017
1018                 llvm::Instruction *declaration;
1019
1020                 if(arraySize)
1021                 {
1022 #if REACTOR_LLVM_VERSION < 7
1023                         declaration = new llvm::AllocaInst(T(type), V(Nucleus::createConstantInt(arraySize)));
1024 #else
1025                         declaration = new llvm::AllocaInst(T(type), 0, V(Nucleus::createConstantInt(arraySize)));
1026 #endif
1027                 }
1028                 else
1029                 {
1030 #if REACTOR_LLVM_VERSION < 7
1031                         declaration = new llvm::AllocaInst(T(type), (llvm::Value*)nullptr);
1032 #else
1033                         declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value*)nullptr);
1034 #endif
1035                 }
1036
1037                 entryBlock.getInstList().push_front(declaration);
1038
1039                 return V(declaration);
1040         }
1041
1042         BasicBlock *Nucleus::createBasicBlock()
1043         {
1044                 return B(llvm::BasicBlock::Create(*::context, "", ::function));
1045         }
1046
1047         BasicBlock *Nucleus::getInsertBlock()
1048         {
1049                 return B(::builder->GetInsertBlock());
1050         }
1051
1052         void Nucleus::setInsertBlock(BasicBlock *basicBlock)
1053         {
1054         //      assert(::builder->GetInsertBlock()->back().isTerminator());
1055                 ::builder->SetInsertPoint(B(basicBlock));
1056         }
1057
1058         void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
1059         {
1060                 llvm::FunctionType *functionType = llvm::FunctionType::get(T(ReturnType), T(Params), false);
1061                 ::function = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, "", ::module);
1062                 ::function->setCallingConv(llvm::CallingConv::C);
1063
1064                 #if defined(_WIN32) && REACTOR_LLVM_VERSION >= 7
1065                         // FIXME(capn):
1066                         // On Windows, stack memory is committed in increments of 4 kB pages, with the last page
1067                         // having a trap which allows the OS to grow the stack. For functions with a stack frame
1068                         // larger than 4 kB this can cause an issue when a variable is accessed beyond the guard
1069                         // page. Therefore the compiler emits a call to __chkstk in the function prolog to probe
1070                         // the stack and ensure all pages have been committed. This is currently broken in LLVM
1071                         // JIT, but we can prevent emitting the stack probe call:
1072                         ::function->addFnAttr("stack-probe-size", "1048576");
1073                 #endif
1074
1075                 ::builder->SetInsertPoint(llvm::BasicBlock::Create(*::context, "", ::function));
1076         }
1077
1078         Value *Nucleus::getArgument(unsigned int index)
1079         {
1080                 llvm::Function::arg_iterator args = ::function->arg_begin();
1081
1082                 while(index)
1083                 {
1084                         args++;
1085                         index--;
1086                 }
1087
1088                 return V(&*args);
1089         }
1090
1091         void Nucleus::createRetVoid()
1092         {
1093                 ::builder->CreateRetVoid();
1094         }
1095
1096         void Nucleus::createRet(Value *v)
1097         {
1098                 ::builder->CreateRet(V(v));
1099         }
1100
1101         void Nucleus::createBr(BasicBlock *dest)
1102         {
1103                 ::builder->CreateBr(B(dest));
1104         }
1105
1106         void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
1107         {
1108                 ::builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
1109         }
1110
1111         Value *Nucleus::createAdd(Value *lhs, Value *rhs)
1112         {
1113                 return V(::builder->CreateAdd(V(lhs), V(rhs)));
1114         }
1115
1116         Value *Nucleus::createSub(Value *lhs, Value *rhs)
1117         {
1118                 return V(::builder->CreateSub(V(lhs), V(rhs)));
1119         }
1120
1121         Value *Nucleus::createMul(Value *lhs, Value *rhs)
1122         {
1123                 return V(::builder->CreateMul(V(lhs), V(rhs)));
1124         }
1125
1126         Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
1127         {
1128                 return V(::builder->CreateUDiv(V(lhs), V(rhs)));
1129         }
1130
1131         Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
1132         {
1133                 return V(::builder->CreateSDiv(V(lhs), V(rhs)));
1134         }
1135
1136         Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
1137         {
1138                 return V(::builder->CreateFAdd(V(lhs), V(rhs)));
1139         }
1140
1141         Value *Nucleus::createFSub(Value *lhs, Value *rhs)
1142         {
1143                 return V(::builder->CreateFSub(V(lhs), V(rhs)));
1144         }
1145
1146         Value *Nucleus::createFMul(Value *lhs, Value *rhs)
1147         {
1148                 return V(::builder->CreateFMul(V(lhs), V(rhs)));
1149         }
1150
1151         Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
1152         {
1153                 return V(::builder->CreateFDiv(V(lhs), V(rhs)));
1154         }
1155
1156         Value *Nucleus::createURem(Value *lhs, Value *rhs)
1157         {
1158                 return V(::builder->CreateURem(V(lhs), V(rhs)));
1159         }
1160
1161         Value *Nucleus::createSRem(Value *lhs, Value *rhs)
1162         {
1163                 return V(::builder->CreateSRem(V(lhs), V(rhs)));
1164         }
1165
1166         Value *Nucleus::createFRem(Value *lhs, Value *rhs)
1167         {
1168                 return V(::builder->CreateFRem(V(lhs), V(rhs)));
1169         }
1170
1171         Value *Nucleus::createShl(Value *lhs, Value *rhs)
1172         {
1173                 return V(::builder->CreateShl(V(lhs), V(rhs)));
1174         }
1175
1176         Value *Nucleus::createLShr(Value *lhs, Value *rhs)
1177         {
1178                 return V(::builder->CreateLShr(V(lhs), V(rhs)));
1179         }
1180
1181         Value *Nucleus::createAShr(Value *lhs, Value *rhs)
1182         {
1183                 return V(::builder->CreateAShr(V(lhs), V(rhs)));
1184         }
1185
1186         Value *Nucleus::createAnd(Value *lhs, Value *rhs)
1187         {
1188                 return V(::builder->CreateAnd(V(lhs), V(rhs)));
1189         }
1190
1191         Value *Nucleus::createOr(Value *lhs, Value *rhs)
1192         {
1193                 return V(::builder->CreateOr(V(lhs), V(rhs)));
1194         }
1195
1196         Value *Nucleus::createXor(Value *lhs, Value *rhs)
1197         {
1198                 return V(::builder->CreateXor(V(lhs), V(rhs)));
1199         }
1200
1201         Value *Nucleus::createNeg(Value *v)
1202         {
1203                 return V(::builder->CreateNeg(V(v)));
1204         }
1205
1206         Value *Nucleus::createFNeg(Value *v)
1207         {
1208                 return V(::builder->CreateFNeg(V(v)));
1209         }
1210
1211         Value *Nucleus::createNot(Value *v)
1212         {
1213                 return V(::builder->CreateNot(V(v)));
1214         }
1215
1216         Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
1217         {
1218                 switch(asInternalType(type))
1219                 {
1220                 case Type_v2i32:
1221                 case Type_v4i16:
1222                 case Type_v8i8:
1223                 case Type_v2f32:
1224                         return createBitCast(
1225                                 createInsertElement(
1226                                         V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2))),
1227                                         createLoad(createBitCast(ptr, Pointer<Long>::getType()), Long::getType(), isVolatile, alignment, atomic, memoryOrder),
1228                                         0),
1229                                 type);
1230                 case Type_v2i16:
1231                 case Type_v4i8:
1232                         if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
1233                         {
1234                                 Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::getType()), 2)));
1235                                 Value *i = createLoad(createBitCast(ptr, Pointer<Int>::getType()), Int::getType(), isVolatile, alignment, atomic, memoryOrder);
1236                                 i = createZExt(i, Long::getType());
1237                                 Value *v = createInsertElement(u, i, 0);
1238                                 return createBitCast(v, type);
1239                         }
1240                         // Fallthrough to non-emulated case.
1241                 case Type_LLVM:
1242                         {
1243                                 assert(V(ptr)->getType()->getContainedType(0) == T(type));
1244                                 auto load = new llvm::LoadInst(V(ptr), "", isVolatile, alignment);
1245                                 load->setAtomic(atomicOrdering(atomic, memoryOrder));
1246
1247                                 return V(::builder->Insert(load));
1248                         }
1249                 default:
1250                         assert(false); return nullptr;
1251                 }
1252         }
1253
1254         Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
1255         {
1256                 switch(asInternalType(type))
1257                 {
1258                 case Type_v2i32:
1259                 case Type_v4i16:
1260                 case Type_v8i8:
1261                 case Type_v2f32:
1262                         createStore(
1263                                 createExtractElement(
1264                                         createBitCast(value, T(llvm::VectorType::get(T(Long::getType()), 2))), Long::getType(), 0),
1265                                 createBitCast(ptr, Pointer<Long>::getType()),
1266                                 Long::getType(), isVolatile, alignment, atomic, memoryOrder);
1267                         return value;
1268                 case Type_v2i16:
1269                 case Type_v4i8:
1270                         if(alignment != 0)   // Not a local variable (all vectors are 128-bit).
1271                         {
1272                                 createStore(
1273                                         createExtractElement(createBitCast(value, Int4::getType()), Int::getType(), 0),
1274                                         createBitCast(ptr, Pointer<Int>::getType()),
1275                                         Int::getType(), isVolatile, alignment, atomic, memoryOrder);
1276                                 return value;
1277                         }
1278                         // Fallthrough to non-emulated case.
1279                 case Type_LLVM:
1280                         {
1281                                 assert(V(ptr)->getType()->getContainedType(0) == T(type));
1282                                 auto store = ::builder->Insert(new llvm::StoreInst(V(value), V(ptr), isVolatile, alignment));
1283                                 store->setAtomic(atomicOrdering(atomic, memoryOrder));
1284
1285                                 return value;
1286                         }
1287                 default:
1288                         assert(false); return nullptr;
1289                 }
1290         }
1291
1292         Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1293         {
1294                 assert(V(ptr)->getType()->getContainedType(0) == T(type));
1295
1296                 if(sizeof(void*) == 8)
1297                 {
1298                         // LLVM manual: "When indexing into an array, pointer or vector,
1299                         // integers of any width are allowed, and they are not required to
1300                         // be constant. These integers are treated as signed values where
1301                         // relevant."
1302                         //
1303                         // Thus if we want indexes to be treated as unsigned we have to
1304                         // zero-extend them ourselves.
1305                         //
1306                         // Note that this is not because we want to address anywhere near
1307                         // 4 GB of data. Instead this is important for performance because
1308                         // x86 supports automatic zero-extending of 32-bit registers to
1309                         // 64-bit. Thus when indexing into an array using a uint32 is
1310                         // actually faster than an int32.
1311                         index = unsignedIndex ?
1312                                 createZExt(index, Long::getType()) :
1313                                 createSExt(index, Long::getType());
1314                 }
1315
1316                 // For non-emulated types we can rely on LLVM's GEP to calculate the
1317                 // effective address correctly.
1318                 if(asInternalType(type) == Type_LLVM)
1319                 {
1320                         return V(::builder->CreateGEP(V(ptr), V(index)));
1321                 }
1322
1323                 // For emulated types we have to multiply the index by the intended
1324                 // type size ourselves to obain the byte offset.
1325                 index = (sizeof(void*) == 8) ?
1326                         createMul(index, createConstantLong((int64_t)typeSize(type))) :
1327                         createMul(index, createConstantInt((int)typeSize(type)));
1328
1329                 // Cast to a byte pointer, apply the byte offset, and cast back to the
1330                 // original pointer type.
1331                 return createBitCast(
1332                         V(::builder->CreateGEP(V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::getType()), 0)))), V(index))),
1333                         T(llvm::PointerType::get(T(type), 0)));
1334         }
1335
1336         Value *Nucleus::createAtomicAdd(Value *ptr, Value *value)
1337         {
1338                 return V(::builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value), llvm::AtomicOrdering::SequentiallyConsistent));
1339         }
1340
1341         Value *Nucleus::createTrunc(Value *v, Type *destType)
1342         {
1343                 return V(::builder->CreateTrunc(V(v), T(destType)));
1344         }
1345
1346         Value *Nucleus::createZExt(Value *v, Type *destType)
1347         {
1348                 return V(::builder->CreateZExt(V(v), T(destType)));
1349         }
1350
1351         Value *Nucleus::createSExt(Value *v, Type *destType)
1352         {
1353                 return V(::builder->CreateSExt(V(v), T(destType)));
1354         }
1355
1356         Value *Nucleus::createFPToSI(Value *v, Type *destType)
1357         {
1358                 return V(::builder->CreateFPToSI(V(v), T(destType)));
1359         }
1360
1361         Value *Nucleus::createSIToFP(Value *v, Type *destType)
1362         {
1363                 return V(::builder->CreateSIToFP(V(v), T(destType)));
1364         }
1365
1366         Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1367         {
1368                 return V(::builder->CreateFPTrunc(V(v), T(destType)));
1369         }
1370
1371         Value *Nucleus::createFPExt(Value *v, Type *destType)
1372         {
1373                 return V(::builder->CreateFPExt(V(v), T(destType)));
1374         }
1375
1376         Value *Nucleus::createBitCast(Value *v, Type *destType)
1377         {
1378                 // Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1379                 // support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1380                 // reading back as the destination type.
1381                 if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1382                 {
1383                         Value *readAddress = allocateStackVariable(destType);
1384                         Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1385                         createStore(v, writeAddress, T(V(v)->getType()));
1386                         return createLoad(readAddress, destType);
1387                 }
1388                 else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1389                 {
1390                         Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1391                         createStore(v, writeAddress, T(V(v)->getType()));
1392                         Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1393                         return createLoad(readAddress, destType);
1394                 }
1395
1396                 return V(::builder->CreateBitCast(V(v), T(destType)));
1397         }
1398
1399         Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1400         {
1401                 return V(::builder->CreateICmpEQ(V(lhs), V(rhs)));
1402         }
1403
1404         Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1405         {
1406                 return V(::builder->CreateICmpNE(V(lhs), V(rhs)));
1407         }
1408
1409         Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1410         {
1411                 return V(::builder->CreateICmpUGT(V(lhs), V(rhs)));
1412         }
1413
1414         Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1415         {
1416                 return V(::builder->CreateICmpUGE(V(lhs), V(rhs)));
1417         }
1418
1419         Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1420         {
1421                 return V(::builder->CreateICmpULT(V(lhs), V(rhs)));
1422         }
1423
1424         Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1425         {
1426                 return V(::builder->CreateICmpULE(V(lhs), V(rhs)));
1427         }
1428
1429         Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1430         {
1431                 return V(::builder->CreateICmpSGT(V(lhs), V(rhs)));
1432         }
1433
1434         Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1435         {
1436                 return V(::builder->CreateICmpSGE(V(lhs), V(rhs)));
1437         }
1438
1439         Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1440         {
1441                 return V(::builder->CreateICmpSLT(V(lhs), V(rhs)));
1442         }
1443
1444         Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1445         {
1446                 return V(::builder->CreateICmpSLE(V(lhs), V(rhs)));
1447         }
1448
1449         Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1450         {
1451                 return V(::builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1452         }
1453
1454         Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1455         {
1456                 return V(::builder->CreateFCmpOGT(V(lhs), V(rhs)));
1457         }
1458
1459         Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1460         {
1461                 return V(::builder->CreateFCmpOGE(V(lhs), V(rhs)));
1462         }
1463
1464         Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1465         {
1466                 return V(::builder->CreateFCmpOLT(V(lhs), V(rhs)));
1467         }
1468
1469         Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1470         {
1471                 return V(::builder->CreateFCmpOLE(V(lhs), V(rhs)));
1472         }
1473
1474         Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1475         {
1476                 return V(::builder->CreateFCmpONE(V(lhs), V(rhs)));
1477         }
1478
1479         Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1480         {
1481                 return V(::builder->CreateFCmpORD(V(lhs), V(rhs)));
1482         }
1483
1484         Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1485         {
1486                 return V(::builder->CreateFCmpUNO(V(lhs), V(rhs)));
1487         }
1488
1489         Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1490         {
1491                 return V(::builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1492         }
1493
1494         Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1495         {
1496                 return V(::builder->CreateFCmpUGT(V(lhs), V(rhs)));
1497         }
1498
1499         Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1500         {
1501                 return V(::builder->CreateFCmpUGE(V(lhs), V(rhs)));
1502         }
1503
1504         Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1505         {
1506                 return V(::builder->CreateFCmpULT(V(lhs), V(rhs)));
1507         }
1508
1509         Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1510         {
1511                 return V(::builder->CreateFCmpULE(V(lhs), V(rhs)));
1512         }
1513
1514         Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1515         {
1516                 return V(::builder->CreateFCmpUNE(V(lhs), V(rhs)));
1517         }
1518
1519         Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1520         {
1521                 assert(V(vector)->getType()->getContainedType(0) == T(type));
1522                 return V(::builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1523         }
1524
1525         Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1526         {
1527                 return V(::builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1528         }
1529
1530         Value *Nucleus::createShuffleVector(Value *v1, Value *v2, const int *select)
1531         {
1532                 int size = llvm::cast<llvm::VectorType>(V(v1)->getType())->getNumElements();
1533                 const int maxSize = 16;
1534                 llvm::Constant *swizzle[maxSize];
1535                 assert(size <= maxSize);
1536
1537                 for(int i = 0; i < size; i++)
1538                 {
1539                         swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), select[i]);
1540                 }
1541
1542                 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
1543
1544                 return V(::builder->CreateShuffleVector(V(v1), V(v2), shuffle));
1545         }
1546
1547         Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1548         {
1549                 return V(::builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1550         }
1551
1552         SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1553         {
1554                 return reinterpret_cast<SwitchCases*>(::builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1555         }
1556
1557         void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1558         {
1559                 llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1560                 sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), label, true), B(branch));
1561         }
1562
1563         void Nucleus::createUnreachable()
1564         {
1565                 ::builder->CreateUnreachable();
1566         }
1567
1568         Type *Nucleus::getPointerType(Type *ElementType)
1569         {
1570                 return T(llvm::PointerType::get(T(ElementType), 0));
1571         }
1572
1573         Value *Nucleus::createNullValue(Type *Ty)
1574         {
1575                 return V(llvm::Constant::getNullValue(T(Ty)));
1576         }
1577
1578         Value *Nucleus::createConstantLong(int64_t i)
1579         {
1580                 return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*::context), i, true));
1581         }
1582
1583         Value *Nucleus::createConstantInt(int i)
1584         {
1585                 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), i, true));
1586         }
1587
1588         Value *Nucleus::createConstantInt(unsigned int i)
1589         {
1590                 return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*::context), i, false));
1591         }
1592
1593         Value *Nucleus::createConstantBool(bool b)
1594         {
1595                 return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*::context), b));
1596         }
1597
1598         Value *Nucleus::createConstantByte(signed char i)
1599         {
1600                 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*::context), i, true));
1601         }
1602
1603         Value *Nucleus::createConstantByte(unsigned char i)
1604         {
1605                 return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*::context), i, false));
1606         }
1607
1608         Value *Nucleus::createConstantShort(short i)
1609         {
1610                 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*::context), i, true));
1611         }
1612
1613         Value *Nucleus::createConstantShort(unsigned short i)
1614         {
1615                 return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*::context), i, false));
1616         }
1617
1618         Value *Nucleus::createConstantFloat(float x)
1619         {
1620                 return V(llvm::ConstantFP::get(T(Float::getType()), x));
1621         }
1622
1623         Value *Nucleus::createNullPointer(Type *Ty)
1624         {
1625                 return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1626         }
1627
1628         Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1629         {
1630                 assert(llvm::isa<llvm::VectorType>(T(type)));
1631                 const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
1632                 const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
1633                 assert(numElements <= 16 && numConstants <= numElements);
1634                 llvm::Constant *constantVector[16];
1635
1636                 for(int i = 0; i < numElements; i++)
1637                 {
1638                         constantVector[i] = llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]);
1639                 }
1640
1641                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
1642         }
1643
1644         Value *Nucleus::createConstantVector(const double *constants, Type *type)
1645         {
1646                 assert(llvm::isa<llvm::VectorType>(T(type)));
1647                 const int numConstants = elementCount(type);                                       // Number of provided constants for the (emulated) type.
1648                 const int numElements = llvm::cast<llvm::VectorType>(T(type))->getNumElements();   // Number of elements of the underlying vector type.
1649                 assert(numElements <= 8 && numConstants <= numElements);
1650                 llvm::Constant *constantVector[8];
1651
1652                 for(int i = 0; i < numElements; i++)
1653                 {
1654                         constantVector[i] = llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]);
1655                 }
1656
1657                 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numElements)));
1658         }
1659
1660         Type *Void::getType()
1661         {
1662                 return T(llvm::Type::getVoidTy(*::context));
1663         }
1664
1665         Type *Bool::getType()
1666         {
1667                 return T(llvm::Type::getInt1Ty(*::context));
1668         }
1669
1670         Type *Byte::getType()
1671         {
1672                 return T(llvm::Type::getInt8Ty(*::context));
1673         }
1674
1675         Type *SByte::getType()
1676         {
1677                 return T(llvm::Type::getInt8Ty(*::context));
1678         }
1679
1680         Type *Short::getType()
1681         {
1682                 return T(llvm::Type::getInt16Ty(*::context));
1683         }
1684
1685         Type *UShort::getType()
1686         {
1687                 return T(llvm::Type::getInt16Ty(*::context));
1688         }
1689
1690         Type *Byte4::getType()
1691         {
1692                 return T(Type_v4i8);
1693         }
1694
1695         Type *SByte4::getType()
1696         {
1697                 return T(Type_v4i8);
1698         }
1699
1700         RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1701         {
1702 #if defined(__i386__) || defined(__x86_64__)
1703                 return x86::paddusb(x, y);
1704 #else
1705                 return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
1706 #endif
1707         }
1708
1709         RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1710         {
1711 #if defined(__i386__) || defined(__x86_64__)
1712                 return x86::psubusb(x, y);
1713 #else
1714                 return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
1715 #endif
1716         }
1717
1718         RValue<Int> SignMask(RValue<Byte8> x)
1719         {
1720 #if defined(__i386__) || defined(__x86_64__)
1721                 return x86::pmovmskb(x);
1722 #else
1723                 return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
1724 #endif
1725         }
1726
1727 //      RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1728 //      {
1729 //#if defined(__i386__) || defined(__x86_64__)
1730 //              return x86::pcmpgtb(x, y);   // FIXME: Signedness
1731 //#else
1732 //              return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
1733 //#endif
1734 //      }
1735
1736         RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1737         {
1738 #if defined(__i386__) || defined(__x86_64__)
1739                 return x86::pcmpeqb(x, y);
1740 #else
1741                 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
1742 #endif
1743         }
1744
1745         Type *Byte8::getType()
1746         {
1747                 return T(Type_v8i8);
1748         }
1749
1750         RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1751         {
1752 #if defined(__i386__) || defined(__x86_64__)
1753                 return x86::paddsb(x, y);
1754 #else
1755                 return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
1756 #endif
1757         }
1758
1759         RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1760         {
1761 #if defined(__i386__) || defined(__x86_64__)
1762                 return x86::psubsb(x, y);
1763 #else
1764                 return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
1765 #endif
1766         }
1767
1768         RValue<Int> SignMask(RValue<SByte8> x)
1769         {
1770 #if defined(__i386__) || defined(__x86_64__)
1771                 return x86::pmovmskb(As<Byte8>(x));
1772 #else
1773                 return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
1774 #endif
1775         }
1776
1777         RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1778         {
1779 #if defined(__i386__) || defined(__x86_64__)
1780                 return x86::pcmpgtb(x, y);
1781 #else
1782                 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
1783 #endif
1784         }
1785
1786         RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1787         {
1788 #if defined(__i386__) || defined(__x86_64__)
1789                 return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1790 #else
1791                 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
1792 #endif
1793         }
1794
1795         Type *SByte8::getType()
1796         {
1797                 return T(Type_v8i8);
1798         }
1799
1800         Type *Byte16::getType()
1801         {
1802                 return T(llvm::VectorType::get(T(Byte::getType()), 16));
1803         }
1804
1805         Type *SByte16::getType()
1806         {
1807                 return T(llvm::VectorType::get(T(SByte::getType()), 16));
1808         }
1809
1810         Type *Short2::getType()
1811         {
1812                 return T(Type_v2i16);
1813         }
1814
1815         Type *UShort2::getType()
1816         {
1817                 return T(Type_v2i16);
1818         }
1819
1820         Short4::Short4(RValue<Int4> cast)
1821         {
1822                 int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
1823                 Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
1824
1825                 Value *packed = Nucleus::createShuffleVector(short8, short8, select);
1826                 Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value;
1827
1828                 storeValue(short4);
1829         }
1830
1831 //      Short4::Short4(RValue<Float> cast)
1832 //      {
1833 //      }
1834
1835         Short4::Short4(RValue<Float4> cast)
1836         {
1837                 Int4 v4i32 = Int4(cast);
1838 #if defined(__i386__) || defined(__x86_64__)
1839                 v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
1840 #else
1841                 Value *v = v4i32.loadValue();
1842                 v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
1843 #endif
1844
1845                 storeValue(As<Short4>(Int2(v4i32)).value);
1846         }
1847
1848         RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
1849         {
1850 #if defined(__i386__) || defined(__x86_64__)
1851         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
1852
1853                 return x86::psllw(lhs, rhs);
1854 #else
1855                 return As<Short4>(V(lowerVectorShl(V(lhs.value), rhs)));
1856 #endif
1857         }
1858
1859         RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
1860         {
1861 #if defined(__i386__) || defined(__x86_64__)
1862                 return x86::psraw(lhs, rhs);
1863 #else
1864                 return As<Short4>(V(lowerVectorAShr(V(lhs.value), rhs)));
1865 #endif
1866         }
1867
1868         RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
1869         {
1870 #if defined(__i386__) || defined(__x86_64__)
1871                 return x86::pmaxsw(x, y);
1872 #else
1873                 return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
1874 #endif
1875         }
1876
1877         RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
1878         {
1879 #if defined(__i386__) || defined(__x86_64__)
1880                 return x86::pminsw(x, y);
1881 #else
1882                 return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
1883 #endif
1884         }
1885
1886         RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
1887         {
1888 #if defined(__i386__) || defined(__x86_64__)
1889                 return x86::paddsw(x, y);
1890 #else
1891                 return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
1892 #endif
1893         }
1894
1895         RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
1896         {
1897 #if defined(__i386__) || defined(__x86_64__)
1898                 return x86::psubsw(x, y);
1899 #else
1900                 return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
1901 #endif
1902         }
1903
1904         RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
1905         {
1906 #if defined(__i386__) || defined(__x86_64__)
1907                 return x86::pmulhw(x, y);
1908 #else
1909                 return As<Short4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
1910 #endif
1911         }
1912
1913         RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
1914         {
1915 #if defined(__i386__) || defined(__x86_64__)
1916                 return x86::pmaddwd(x, y);
1917 #else
1918                 return As<Int2>(V(lowerMulAdd(V(x.value), V(y.value))));
1919 #endif
1920         }
1921
1922         RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
1923         {
1924 #if defined(__i386__) || defined(__x86_64__)
1925                 auto result = x86::packsswb(x, y);
1926 #else
1927                 auto result = V(lowerPack(V(x.value), V(y.value), true));
1928 #endif
1929                 return As<SByte8>(Swizzle(As<Int4>(result), 0x88));
1930         }
1931
1932         RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
1933         {
1934 #if defined(__i386__) || defined(__x86_64__)
1935                 auto result = x86::packuswb(x, y);
1936 #else
1937                 auto result = V(lowerPack(V(x.value), V(y.value), false));
1938 #endif
1939                 return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
1940         }
1941
1942         RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
1943         {
1944 #if defined(__i386__) || defined(__x86_64__)
1945                 return x86::pcmpgtw(x, y);
1946 #else
1947                 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
1948 #endif
1949         }
1950
1951         RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
1952         {
1953 #if defined(__i386__) || defined(__x86_64__)
1954                 return x86::pcmpeqw(x, y);
1955 #else
1956                 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
1957 #endif
1958         }
1959
1960         Type *Short4::getType()
1961         {
1962                 return T(Type_v4i16);
1963         }
1964
1965         UShort4::UShort4(RValue<Float4> cast, bool saturate)
1966         {
1967                 if(saturate)
1968                 {
1969 #if defined(__i386__) || defined(__x86_64__)
1970                         if(CPUID::supportsSSE4_1())
1971                         {
1972                                 Int4 int4(Min(cast, Float4(0xFFFF)));   // packusdw takes care of 0x0000 saturation
1973                                 *this = As<Short4>(PackUnsigned(int4, int4));
1974                         }
1975                         else
1976 #endif
1977                         {
1978                                 *this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
1979                         }
1980                 }
1981                 else
1982                 {
1983                         *this = Short4(Int4(cast));
1984                 }
1985         }
1986
1987         RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
1988         {
1989 #if defined(__i386__) || defined(__x86_64__)
1990         //      return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
1991
1992                 return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
1993 #else
1994                 return As<UShort4>(V(lowerVectorShl(V(lhs.value), rhs)));
1995 #endif
1996         }
1997
1998         RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
1999         {
2000 #if defined(__i386__) || defined(__x86_64__)
2001         //      return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
2002
2003                 return x86::psrlw(lhs, rhs);
2004 #else
2005                 return As<UShort4>(V(lowerVectorLShr(V(lhs.value), rhs)));
2006 #endif
2007         }
2008
2009         RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2010         {
2011                 return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2012         }
2013
2014         RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2015         {
2016                 return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2017         }
2018
2019         RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2020         {
2021 #if defined(__i386__) || defined(__x86_64__)
2022                 return x86::paddusw(x, y);
2023 #else
2024                 return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
2025 #endif
2026         }
2027
2028         RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2029         {
2030 #if defined(__i386__) || defined(__x86_64__)
2031                 return x86::psubusw(x, y);
2032 #else
2033                 return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
2034 #endif
2035         }
2036
2037         RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2038         {
2039 #if defined(__i386__) || defined(__x86_64__)
2040                 return x86::pmulhuw(x, y);
2041 #else
2042                 return As<UShort4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2043 #endif
2044         }
2045
2046         RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2047         {
2048 #if defined(__i386__) || defined(__x86_64__)
2049                 return x86::pavgw(x, y);
2050 #else
2051                 return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
2052 #endif
2053         }
2054
2055         Type *UShort4::getType()
2056         {
2057                 return T(Type_v4i16);
2058         }
2059
2060         RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2061         {
2062 #if defined(__i386__) || defined(__x86_64__)
2063                 return x86::psllw(lhs, rhs);
2064 #else
2065                 return As<Short8>(V(lowerVectorShl(V(lhs.value), rhs)));
2066 #endif
2067         }
2068
2069         RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2070         {
2071 #if defined(__i386__) || defined(__x86_64__)
2072                 return x86::psraw(lhs, rhs);
2073 #else
2074                 return As<Short8>(V(lowerVectorAShr(V(lhs.value), rhs)));
2075 #endif
2076         }
2077
2078         RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2079         {
2080 #if defined(__i386__) || defined(__x86_64__)
2081                 return x86::pmaddwd(x, y);
2082 #else
2083                 return As<Int4>(V(lowerMulAdd(V(x.value), V(y.value))));
2084 #endif
2085         }
2086
2087         RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2088         {
2089 #if defined(__i386__) || defined(__x86_64__)
2090                 return x86::pmulhw(x, y);
2091 #else
2092                 return As<Short8>(V(lowerMulHigh(V(x.value), V(y.value), true)));
2093 #endif
2094         }
2095
2096         Type *Short8::getType()
2097         {
2098                 return T(llvm::VectorType::get(T(Short::getType()), 8));
2099         }
2100
2101         RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2102         {
2103 #if defined(__i386__) || defined(__x86_64__)
2104                 return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2105 #else
2106                 return As<UShort8>(V(lowerVectorShl(V(lhs.value), rhs)));
2107 #endif
2108         }
2109
2110         RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2111         {
2112 #if defined(__i386__) || defined(__x86_64__)
2113                 return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
2114 #else
2115                 return As<UShort8>(V(lowerVectorLShr(V(lhs.value), rhs)));
2116 #endif
2117         }
2118
2119         RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
2120         {
2121                 int pshufb[16] =
2122                 {
2123                         select0 + 0,
2124                         select0 + 1,
2125                         select1 + 0,
2126                         select1 + 1,
2127                         select2 + 0,
2128                         select2 + 1,
2129                         select3 + 0,
2130                         select3 + 1,
2131                         select4 + 0,
2132                         select4 + 1,
2133                         select5 + 0,
2134                         select5 + 1,
2135                         select6 + 0,
2136                         select6 + 1,
2137                         select7 + 0,
2138                         select7 + 1,
2139                 };
2140
2141                 Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
2142                 Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
2143                 Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
2144
2145                 return RValue<UShort8>(short8);
2146         }
2147
2148         RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2149         {
2150 #if defined(__i386__) || defined(__x86_64__)
2151                 return x86::pmulhuw(x, y);
2152 #else
2153                 return As<UShort8>(V(lowerMulHigh(V(x.value), V(y.value), false)));
2154 #endif
2155         }
2156
2157         Type *UShort8::getType()
2158         {
2159                 return T(llvm::VectorType::get(T(UShort::getType()), 8));
2160         }
2161
2162         RValue<Int> operator++(Int &val, int)   // Post-increment
2163         {
2164                 RValue<Int> res = val;
2165
2166                 Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
2167                 val.storeValue(inc);
2168
2169                 return res;
2170         }
2171
2172         const Int &operator++(Int &val)   // Pre-increment
2173         {
2174                 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2175                 val.storeValue(inc);
2176
2177                 return val;
2178         }
2179
2180         RValue<Int> operator--(Int &val, int)   // Post-decrement
2181         {
2182                 RValue<Int> res = val;
2183
2184                 Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
2185                 val.storeValue(inc);
2186
2187                 return res;
2188         }
2189
2190         const Int &operator--(Int &val)   // Pre-decrement
2191         {
2192                 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2193                 val.storeValue(inc);
2194
2195                 return val;
2196         }
2197
2198         RValue<Int> RoundInt(RValue<Float> cast)
2199         {
2200 #if defined(__i386__) || defined(__x86_64__)
2201                 return x86::cvtss2si(cast);
2202 #else
2203                 return RValue<Int>(V(lowerRoundInt(V(cast.value), T(Int::getType()))));
2204 #endif
2205         }
2206
2207         Type *Int::getType()
2208         {
2209                 return T(llvm::Type::getInt32Ty(*::context));
2210         }
2211
2212         Type *Long::getType()
2213         {
2214                 return T(llvm::Type::getInt64Ty(*::context));
2215         }
2216
2217         UInt::UInt(RValue<Float> cast)   // Float to UInt conversion assembled from Float to Int conversions
2218         {
2219                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
2220                 // Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());
2221
2222                 // Smallest positive value representable in UInt, but not in Int
2223                 const unsigned int ustart = 0x80000000u;
2224                 const float ustartf = float(ustart);
2225
2226                 // If the value is negative, store 0, otherwise store the result of the conversion (mask from As<Int>(cast) >> 31)
2227                 storeValue((~(As<Int>(cast) >> 31) &
2228                 // Check if the value can be represented as an Int (the condition selects values >= ustartf)
2229                         IfThenElse(cast >= ustartf,
2230                 // If the value is too large, subtract ustart and re-add it after conversion.
2231                                 As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
2232                 // Otherwise, just convert normally
2233                                 Int(cast))).value);
2234         }
2235
2236         RValue<UInt> operator++(UInt &val, int)   // Post-increment
2237         {
2238                 RValue<UInt> res = val;
2239
2240                 Value *inc = Nucleus::createAdd(res.value, Nucleus::createConstantInt(1));
2241                 val.storeValue(inc);
2242
2243                 return res;
2244         }
2245
2246         const UInt &operator++(UInt &val)   // Pre-increment
2247         {
2248                 // Add one to the current value and write the result straight back.
2249                 val.storeValue(Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1)));
2250
2251                 return val;   // Refers to the variable, which now holds the incremented value
2252         }
2253
2254         RValue<UInt> operator--(UInt &val, int)   // Post-decrement
2255         {
2256                 RValue<UInt> res = val;
2257
2258                 Value *inc = Nucleus::createSub(res.value, Nucleus::createConstantInt(1));
2259                 val.storeValue(inc);
2260
2261                 return res;
2262         }
2263
2264         const UInt &operator--(UInt &val)   // Pre-decrement
2265         {
2266                 // Subtract one from the current value and write the result straight back.
2267                 val.storeValue(Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1)));
2268
2269                 return val;   // Refers to the variable, which now holds the decremented value
2270         }
2271
2272 //      RValue<UInt> RoundUInt(RValue<Float> cast)
2273 //      {
2274 //#if defined(__i386__) || defined(__x86_64__)
2275 //              return x86::cvtss2si(val);   // FIXME: Unsigned
2276 //#else
2277 //              return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2278 //#endif
2279 //      }
2280
2281         Type *UInt::getType()   // Reactor type handle for UInt
2282         {
2283                 return T(llvm::IntegerType::get(*::context, 32));   // 32-bit LLVM integer, same width as Int
2284         }
2285
2286 //      Int2::Int2(RValue<Int> cast)
2287 //      {
2288 //              Value *extend = Nucleus::createZExt(cast.value, Long::getType());
2289 //              Value *vector = Nucleus::createBitCast(extend, Int2::getType());
2290 //
2291 //              int shuffle[2] = {0, 0};
2292 //              Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2293 //
2294 //              storeValue(replicate);
2295 //      }
2296
2297         RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)   // Left shift; rhs is a plain host-side count
2298         {
2299 #if !defined(__i386__) && !defined(__x86_64__)
2300                 // Targets other than x86 go through the generic vector shift lowering.
2301                 return As<Int2>(V(lowerVectorShl(V(lhs.value), rhs)));
2302 #else
2303         //      return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
2304                 return x86::pslld(lhs, rhs);
2305 #endif
2306         }
2307
2308         RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)   // Right shift; rhs is a plain host-side count
2309         {
2310 #if !defined(__i386__) && !defined(__x86_64__)
2311                 // Targets other than x86 go through the generic arithmetic-shift lowering (lowerVectorAShr).
2312                 return As<Int2>(V(lowerVectorAShr(V(lhs.value), rhs)));
2313 #else
2314         //      return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
2315                 return x86::psrad(lhs, rhs);
2316 #endif
2317         }
2318
2319         Type *Int2::getType()   // Reactor type handle for Int2
2320         {
2321                 return T(Type_v2i32);   // Wraps the Type_v2i32 identifier; UInt2::getType() returns the same handle
2322         }
2323
2324         RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)   // Left shift; rhs is a plain host-side count
2325         {
2326 #if !defined(__i386__) && !defined(__x86_64__)
2327                 // Targets other than x86 go through the generic vector shift lowering.
2328                 return As<UInt2>(V(lowerVectorShl(V(lhs.value), rhs)));
2329 #else
2330         //      return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
2331                 return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));   // pslld is called on the Int2 reinterpretation
2332 #endif
2333         }
2334
2335         RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)   // Right shift; rhs is a plain host-side count
2336         {
2337 #if !defined(__i386__) && !defined(__x86_64__)
2338                 // Targets other than x86 go through the generic logical-shift lowering (lowerVectorLShr).
2339                 return As<UInt2>(V(lowerVectorLShr(V(lhs.value), rhs)));
2340 #else
2341         //      return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
2342                 return x86::psrld(lhs, rhs);
2343 #endif
2344         }
2345
2346         Type *UInt2::getType()   // Reactor type handle for UInt2
2347         {
2348                 return T(Type_v2i32);   // Wraps the Type_v2i32 identifier; Int2::getType() returns the same handle
2349         }
2350
2351         Int4::Int4(RValue<Byte4> cast) : XYZW(this)
2352         {
2353 #if defined(__i386__) || defined(__x86_64__)
2354                 if(CPUID::supportsSSE4_1())
2355                 {
2356                         *this = x86::pmovzxbd(As<Byte16>(cast));
2357                 }
2358                 else
2359 #endif
2360                 {
2361                         int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
2362                         Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
2363                         Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::getType()), swizzle);
2364
2365                         int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2366                         Value *c = Nucleus::createBitCast(b, Short8::getType());
2367                         Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::getType()), swizzle2);
2368
2369                         *this = As<Int4>(d);
2370                 }
2371         }
2372
2373         Int4::Int4(RValue<SByte4> cast) : XYZW(this)
2374         {
2375 #if defined(__i386__) || defined(__x86_64__)
2376                 if(CPUID::supportsSSE4_1())
2377                 {
2378                         *this = x86::pmovsxbd(As<SByte16>(cast));
2379                 }
2380                 else
2381 #endif
2382                 {
2383                         int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
2384                         Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
2385                         Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2386
2387                         int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
2388                         Value *c = Nucleus::createBitCast(b, Short8::getType());
2389                         Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2390
2391                         *this = As<Int4>(d) >> 24;
2392                 }
2393         }
2394
2395         Int4::Int4(RValue<Short4> cast) : XYZW(this)
2396         {
2397 #if defined(__i386__) || defined(__x86_64__)
2398                 if(CPUID::supportsSSE4_1())
2399                 {
2400                         *this = x86::pmovsxwd(As<Short8>(cast));
2401                 }
2402                 else
2403 #endif
2404                 {
2405                         int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
2406                         Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
2407                         *this = As<Int4>(c) >> 16;
2408                 }
2409         }
2410
2411         Int4::Int4(RValue<UShort4> cast) : XYZW(this)
2412         {
2413 #if defined(__i386__) || defined(__x86_64__)
2414                 if(CPUID::supportsSSE4_1())
2415                 {
2416                         *this = x86::pmovzxwd(As<UShort8>(cast));
2417                 }
2418                 else
2419 #endif
2420                 {
2421                         int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2422                         Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2423                         *this = As<Int4>(c);
2424                 }
2425         }
2426
2427         Int4::Int4(RValue<Int> rhs) : XYZW(this)
2428         {
2429                 Value *vector = loadValue();
2430                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
2431
2432                 int swizzle[4] = {0, 0, 0, 0};
2433                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2434
2435                 storeValue(replicate);
2436         }
2437
2438         RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)   // Left shift; rhs is a plain host-side count
2439         {
2440 #if !defined(__i386__) && !defined(__x86_64__)
2441                 return As<Int4>(V(lowerVectorShl(V(lhs.value), rhs)));   // Generic lowering
2442 #else
2443                 return x86::pslld(lhs, rhs);   // x86 builds go through the pslld wrapper
2444 #endif
2445         }
2446
2447         RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)   // Right shift; rhs is a plain host-side count
2448         {
2449 #if !defined(__i386__) && !defined(__x86_64__)
2450                 return As<Int4>(V(lowerVectorAShr(V(lhs.value), rhs)));   // Generic arithmetic-shift lowering
2451 #else
2452                 return x86::psrad(lhs, rhs);   // x86 builds go through the psrad wrapper
2453 #endif
2454         }
2455
2456         RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2457         {
2458                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2459                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2460                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
2461                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2462         }
2463
2464         RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2465         {
2466                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2467                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2468                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
2469                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2470         }
2471
2472         RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2473         {
2474                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2475                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2476                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
2477                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2478         }
2479
2480         RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2481         {
2482                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2483                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2484                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
2485                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2486         }
2487
2488         RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2489         {
2490                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2491                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2492                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
2493                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2494         }
2495
2496         RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2497         {
2498                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2499                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2500                 // return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
2501                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
2502         }
2503
2504         RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2505         {
2506 #if defined(__i386__) || defined(__x86_64__)
2507                 if(CPUID::supportsSSE4_1())
2508                 {
2509                         return x86::pmaxsd(x, y);
2510                 }
2511                 else
2512 #endif
2513                 {
2514                         RValue<Int4> greater = CmpNLE(x, y);
2515                         return (x & greater) | (y & ~greater);
2516                 }
2517         }
2518
2519         RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2520         {
2521 #if defined(__i386__) || defined(__x86_64__)
2522                 if(CPUID::supportsSSE4_1())
2523                 {
2524                         return x86::pminsd(x, y);
2525                 }
2526                 else
2527 #endif
2528                 {
2529                         RValue<Int4> less = CmpLT(x, y);
2530                         return (x & less) | (y & ~less);
2531                 }
2532         }
2533
2534         RValue<Int4> RoundInt(RValue<Float4> cast)   // Float4 to Int4 conversion
2535         {
2536 #if !defined(__i386__) && !defined(__x86_64__)
2537                 return As<Int4>(V(lowerRoundInt(V(cast.value), T(Int4::getType()))));   // Generic lowering
2538 #else
2539                 return x86::cvtps2dq(cast);   // x86 builds go through the cvtps2dq wrapper
2540 #endif
2541         }
2542
2543         RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)   // Delegates to lowerMulHigh on every target
2544         {
2545                 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2546                 return As<Int4>(V(lowerMulHigh(V(x.value), V(y.value), true)));   // 'true' presumably selects the signed variant (the UInt4 overload passes false)
2547         }
2548
2549         RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)   // Delegates to lowerMulHigh on every target
2550         {
2551                 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2552                 return As<UInt4>(V(lowerMulHigh(V(x.value), V(y.value), false)));   // 'false' presumably selects the unsigned variant (the Int4 overload passes true)
2553         }
2554
2555         RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)   // Packs two Int4 into one Short8
2556         {
2557 #if !defined(__i386__) && !defined(__x86_64__)
2558                 return As<Short8>(V(lowerPack(V(x.value), V(y.value), true)));   // Generic lowering, flag set to true
2559 #else
2560                 return x86::packssdw(x, y);   // x86 builds go through the packssdw wrapper
2561 #endif
2562         }
2563
2564         RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)   // Packs two Int4 into one UShort8
2565         {
2566 #if !defined(__i386__) && !defined(__x86_64__)
2567                 return As<UShort8>(V(lowerPack(V(x.value), V(y.value), false)));   // Generic lowering, flag set to false
2568 #else
2569                 return x86::packusdw(x, y);   // x86 builds go through the packusdw wrapper
2570 #endif
2571         }
2572
2573         RValue<Int> SignMask(RValue<Int4> x)   // Reduces an Int4 to a single Int mask
2574         {
2575 #if !defined(__i386__) && !defined(__x86_64__)
2576                 return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));   // Generic lowering
2577 #else
2578                 return x86::movmskps(As<Float4>(x));   // x86: movmskps on the Float4 reinterpretation of x
2579 #endif
2580         }
2581
2582         Type *Int4::getType()   // Reactor type handle for Int4
2583         {
2584                 return T(llvm::VectorType::get(T(Int::getType()), 4));   // LLVM vector of 4 elements of Int's type
2585         }
2586
2587         UInt4::UInt4(RValue<Float4> cast) : XYZW(this)
2588         {
2589                 // Note: createFPToUI is broken, must perform conversion using createFPtoSI
2590                 // Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
2591
2592                 // Smallest positive value representable in UInt, but not in Int
2593                 const unsigned int ustart = 0x80000000u;
2594                 const float ustartf = float(ustart);
2595
2596                 // Check if the value can be represented as an Int
2597                 Int4 uiValue = CmpNLT(cast, Float4(ustartf));
2598                 // If the value is too large, subtract ustart and re-add it after conversion.
2599                 uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
2600                 // Otherwise, just convert normally
2601                           (~uiValue & Int4(cast));
2602                 // If the value is negative, store 0, otherwise store the result of the conversion
2603                 storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
2604         }
2605
2606         RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)   // Left shift; rhs is a plain host-side count
2607         {
2608 #if !defined(__i386__) && !defined(__x86_64__)
2609                 return As<UInt4>(V(lowerVectorShl(V(lhs.value), rhs)));   // Generic lowering
2610 #else
2611                 return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));   // x86: pslld on the Int4 reinterpretation
2612 #endif
2613         }
2614
2615         RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)   // Right shift; rhs is a plain host-side count
2616         {
2617 #if !defined(__i386__) && !defined(__x86_64__)
2618                 return As<UInt4>(V(lowerVectorLShr(V(lhs.value), rhs)));   // Generic logical-shift lowering
2619 #else
2620                 return x86::psrld(lhs, rhs);   // x86 builds go through the psrld wrapper
2621 #endif
2622         }
2623
2624         RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2625         {
2626                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2627                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2628                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
2629                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
2630         }
2631
2632         RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)   // Mask from the ULT integer comparison of x and y
2633         {
2634                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));   // Direct SExt, without the invert workaround used by CmpEQ/CmpLE/CmpNLT
2635         }
2636
2637         RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2638         {
2639                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2640                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2641                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
2642                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
2643         }
2644
2645         RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)   // Mask from the NE integer comparison of x and y
2646         {
2647                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));   // Direct SExt, without the invert workaround used by CmpEQ/CmpLE/CmpNLT
2648         }
2649
2650         RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2651         {
2652                 // FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
2653                 //        Restore the following line when LLVM is updated to a version where this issue is fixed.
2654                 // return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
2655                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
2656         }
2657
2658         RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)   // Mask from the UGT integer comparison of x and y
2659         {
2660                 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));   // Direct SExt, without the invert workaround used by CmpEQ/CmpLE/CmpNLT
2661         }
2662
2663         RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2664         {
2665 #if defined(__i386__) || defined(__x86_64__)
2666                 if(CPUID::supportsSSE4_1())
2667                 {
2668                         return x86::pmaxud(x, y);
2669                 }
2670                 else
2671 #endif
2672                 {
2673                         RValue<UInt4> greater = CmpNLE(x, y);
2674                         return (x & greater) | (y & ~greater);
2675                 }
2676         }
2677
2678         RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2679         {
2680 #if defined(__i386__) || defined(__x86_64__)
2681                 if(CPUID::supportsSSE4_1())
2682                 {
2683                         return x86::pminud(x, y);
2684                 }
2685                 else
2686 #endif
2687                 {
2688                         RValue<UInt4> less = CmpLT(x, y);
2689                         return (x & less) | (y & ~less);
2690                 }
2691         }
2692
2693         Type *UInt4::getType()   // Reactor type handle for UInt4
2694         {
2695                 return T(llvm::VectorType::get(T(UInt::getType()), 4));   // LLVM vector of 4 elements of UInt's type
2696         }
2697
2698         Type *Half::getType()   // Reactor type handle for Half
2699         {
2700                 return T(llvm::IntegerType::get(*::context, 16));   // Represented as a 16-bit LLVM integer
2701         }
2702
2703         RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
2704         {
2705 #if defined(__i386__) || defined(__x86_64__)
2706                 if(exactAtPow2)
2707                 {
2708                         // rcpss uses a piecewise-linear approximation which minimizes the relative error
2709                         // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2710                         return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2711                 }
2712                 return x86::rcpss(x);
2713 #else
2714                 return As<Float>(V(lowerRCP(V(x.value))));
2715 #endif
2716         }
2717
2718         RValue<Float> RcpSqrt_pp(RValue<Float> x)   // Reciprocal square root of a scalar
2719         {
2720 #if !defined(__i386__) && !defined(__x86_64__)
2721                 return As<Float>(V(lowerRSQRT(V(x.value))));   // Generic lowering
2722 #else
2723                 return x86::rsqrtss(x);   // x86 builds go through the rsqrtss wrapper
2724 #endif
2725         }
2726
2727         RValue<Float> Sqrt(RValue<Float> x)   // Square root of a scalar
2728         {
2729 #if !defined(__i386__) && !defined(__x86_64__)
2730                 return As<Float>(V(lowerSQRT(V(x.value))));   // Generic lowering
2731 #else
2732                 return x86::sqrtss(x);   // x86 builds go through the sqrtss wrapper
2733 #endif
2734         }
2735
2736         RValue<Float> Round(RValue<Float> x)
2737         {
2738 #if defined(__i386__) || defined(__x86_64__)
2739                 if(CPUID::supportsSSE4_1())
2740                 {
2741                         return x86::roundss(x, 0);
2742                 }
2743                 else
2744                 {
2745                         return Float4(Round(Float4(x))).x;
2746                 }
2747 #else
2748                 return RValue<Float>(V(lowerRound(V(x.value))));
2749 #endif
2750         }
2751
2752         RValue<Float> Trunc(RValue<Float> x)
2753         {
2754 #if defined(__i386__) || defined(__x86_64__)
2755                 if(CPUID::supportsSSE4_1())
2756                 {
2757                         return x86::roundss(x, 3);
2758                 }
2759                 else
2760                 {
2761                         return Float(Int(x));   // Rounded toward zero
2762                 }
2763 #else
2764                 return RValue<Float>(V(lowerTrunc(V(x.value))));
2765 #endif
2766         }
2767
2768         RValue<Float> Frac(RValue<Float> x)
2769         {
2770 #if defined(__i386__) || defined(__x86_64__)
2771                 if(CPUID::supportsSSE4_1())
2772                 {
2773                         return x - x86::floorss(x);
2774                 }
2775                 else
2776                 {
2777                         return Float4(Frac(Float4(x))).x;
2778                 }
2779 #else
2780                 // x - floor(x) can be 1.0 for very small negative x.
2781                 // Clamp against the value just below 1.0.
2782                 return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
2783 #endif
2784         }
2785
2786         RValue<Float> Floor(RValue<Float> x)
2787         {
2788 #if defined(__i386__) || defined(__x86_64__)
2789                 if(CPUID::supportsSSE4_1())
2790                 {
2791                         return x86::floorss(x);
2792                 }
2793                 else
2794                 {
2795                         return Float4(Floor(Float4(x))).x;
2796                 }
2797 #else
2798                 return RValue<Float>(V(lowerFloor(V(x.value))));
2799 #endif
2800         }
2801
2802         RValue<Float> Ceil(RValue<Float> x)
2803         {
2804 #if defined(__i386__) || defined(__x86_64__)
2805                 if(CPUID::supportsSSE4_1())
2806                 {
2807                         return x86::ceilss(x);
2808                 }
2809                 else
2810 #endif
2811                 {
2812                         return Float4(Ceil(Float4(x))).x;
2813                 }
2814         }
2815
2816         Type *Float::getType()   // Reactor type handle for Float
2817         {
2818                 return T(llvm::Type::getFloatTy(*::context));   // LLVM's float type from the current context
2819         }
2820
2821         Type *Float2::getType()   // Reactor type handle for Float2
2822         {
2823                 return T(Type_v2f32);   // Wraps the Type_v2f32 identifier
2824         }
2825
2826         Float4::Float4(RValue<Float> rhs) : XYZW(this)
2827         {
2828                 Value *vector = loadValue();
2829                 Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
2830
2831                 int swizzle[4] = {0, 0, 0, 0};
2832                 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2833
2834                 storeValue(replicate);
2835         }
2836
2837         RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
2838         {
2839 #if !defined(__i386__) && !defined(__x86_64__)
2840                 return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OGT)));   // Generic lowering with the OGT predicate
2841 #else
2842                 return x86::maxps(x, y);   // x86 builds go through the maxps wrapper
2843 #endif
2844         }
2845
2846         RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
2847         {
2848 #if !defined(__i386__) && !defined(__x86_64__)
2849                 return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OLT)));   // Generic lowering with the OLT predicate
2850 #else
2851                 return x86::minps(x, y);   // x86 builds go through the minps wrapper
2852 #endif
2853         }
2854
2855         RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
2856         {
2857 #if defined(__i386__) || defined(__x86_64__)
2858                 if(exactAtPow2)
2859                 {
2860                         // rcpps uses a piecewise-linear approximation which minimizes the relative error
2861                         // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2862                         return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2863                 }
2864                 return x86::rcpps(x);
2865 #else
2866                 return As<Float4>(V(lowerRCP(V(x.value))));
2867 #endif
2868         }
2869
2870         RValue<Float4> RcpSqrt_pp(RValue<Float4> x)   // Reciprocal square root of a Float4
2871         {
2872 #if !defined(__i386__) && !defined(__x86_64__)
2873                 return As<Float4>(V(lowerRSQRT(V(x.value))));   // Generic lowering
2874 #else
2875                 return x86::rsqrtps(x);   // x86 builds go through the rsqrtps wrapper
2876 #endif
2877         }
2878
2879         RValue<Float4> Sqrt(RValue<Float4> x)   // Square root of a Float4
2880         {
2881 #if !defined(__i386__) && !defined(__x86_64__)
2882                 return As<Float4>(V(lowerSQRT(V(x.value))));   // Generic lowering
2883 #else
2884                 return x86::sqrtps(x);   // x86 builds go through the sqrtps wrapper
2885 #endif
2886         }
2887
2888         RValue<Int> SignMask(RValue<Float4> x)   // Reduces a Float4 to a single Int mask
2889         {
2890 #if !defined(__i386__) && !defined(__x86_64__)
2891                 return As<Int>(V(lowerFPSignMask(V(x.value), T(Int::getType()))));   // Generic lowering
2892 #else
2893                 return x86::movmskps(x);   // x86 builds go through the movmskps wrapper
2894 #endif
2895         }
2896
2897         RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
2898         {
2899                 Value *equal = Nucleus::createFCmpOEQ(x.value, y.value);   // (disabled) return As<Int4>(x86::cmpeqps(x, y));
2900                 return RValue<Int4>(Nucleus::createSExt(equal, Int4::getType()));   // Sign-extend the comparison result to Int4's type
2901         }
2902
2903         RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
2904         {
2905                 Value *less = Nucleus::createFCmpOLT(x.value, y.value);   // (disabled) return As<Int4>(x86::cmpltps(x, y));
2906                 return RValue<Int4>(Nucleus::createSExt(less, Int4::getType()));   // Sign-extend the comparison result to Int4's type
2907         }
2908
2909         RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
2910         {
2911                 Value *lessOrEqual = Nucleus::createFCmpOLE(x.value, y.value);   // (disabled) return As<Int4>(x86::cmpleps(x, y));
2912                 return RValue<Int4>(Nucleus::createSExt(lessOrEqual, Int4::getType()));   // Sign-extend the comparison result to Int4's type
2913         }
2914
2915         RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
2916         {
2917                 Value *notEqual = Nucleus::createFCmpONE(x.value, y.value);   // (disabled) return As<Int4>(x86::cmpneqps(x, y));
2918                 return RValue<Int4>(Nucleus::createSExt(notEqual, Int4::getType()));   // Sign-extend the comparison result to Int4's type
2919         }
2920
2921         RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
2922         {
2923                 Value *greaterOrEqual = Nucleus::createFCmpOGE(x.value, y.value);   // (disabled) return As<Int4>(x86::cmpnltps(x, y));
2924                 return RValue<Int4>(Nucleus::createSExt(greaterOrEqual, Int4::getType()));   // Sign-extend the comparison result to Int4's type
2925         }
2926
2927         RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
2928         {
2929                 Value *greater = Nucleus::createFCmpOGT(x.value, y.value);   // (disabled) return As<Int4>(x86::cmpnleps(x, y));
2930                 return RValue<Int4>(Nucleus::createSExt(greater, Int4::getType()));   // Sign-extend the comparison result to Int4's type
2931         }
2932
2933         RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)   // Int4 mask from the FCmpUEQ comparison of x and y
2934         {
2935                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value, y.value), Int4::getType()));   // UEQ: presumably LLVM's unordered-or-equal predicate
2936         }
2937
2938         RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)   // Int4 mask from the FCmpULT comparison of x and y
2939         {
2940                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value, y.value), Int4::getType()));   // ULT: presumably LLVM's unordered-or-less-than predicate
2941         }
2942
2943         RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)   // Int4 mask from the FCmpULE comparison of x and y
2944         {
2945                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value, y.value), Int4::getType()));   // ULE: presumably LLVM's unordered-or-less-or-equal predicate
2946         }
2947
2948         RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)   // Int4 mask from the FCmpUNE comparison of x and y
2949         {
2950                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value, y.value), Int4::getType()));   // UNE: presumably LLVM's unordered-or-not-equal predicate
2951         }
2952
2953         RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)   // "Not less than" is expressed with the FCmpUGE comparison
2954         {
2955                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value, y.value), Int4::getType()));   // UGE: presumably LLVM's unordered-or-greater-or-equal predicate
2956         }
2957
2958         RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)   // "Not less or equal" is expressed with the FCmpUGT comparison
2959         {
2960                 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value, y.value), Int4::getType()));   // UGT: presumably LLVM's unordered-or-greater-than predicate
2961         }
2962
2963         RValue<Float4> Round(RValue<Float4> x)
2964         {
2965 #if defined(__i386__) || defined(__x86_64__)
2966                 if(CPUID::supportsSSE4_1())
2967                 {
2968                         return x86::roundps(x, 0);
2969                 }
2970                 else
2971                 {
2972                         return Float4(RoundInt(x));
2973                 }
2974 #else
2975                 return RValue<Float4>(V(lowerRound(V(x.value))));
2976 #endif
2977         }
2978
2979         RValue<Float4> Trunc(RValue<Float4> x)
2980         {
2981 #if defined(__i386__) || defined(__x86_64__)
2982                 if(CPUID::supportsSSE4_1())
2983                 {
2984                         return x86::roundps(x, 3);
2985                 }
2986                 else
2987                 {
2988                         return Float4(Int4(x));
2989                 }
2990 #else
2991                 return RValue<Float4>(V(lowerTrunc(V(x.value))));
2992 #endif
2993         }
2994
2995         RValue<Float4> Frac(RValue<Float4> x)
2996         {
2997                 Float4 frc;
2998
2999 #if defined(__i386__) || defined(__x86_64__)
3000                 if(CPUID::supportsSSE4_1())
3001                 {
3002                         frc = x - Floor(x);
3003                 }
3004                 else
3005                 {
3006                         frc = x - Float4(Int4(x));   // Signed fractional part.
3007
3008                         frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));   // Add 1.0 if negative.
3009                 }
3010 #else
3011                 frc = x - Floor(x);
3012 #endif
3013
3014                 // x - floor(x) can be 1.0 for very small negative x.
3015                 // Clamp against the value just below 1.0.
3016                 return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3017         }
3018
3019         RValue<Float4> Floor(RValue<Float4> x)
3020         {
3021 #if defined(__i386__) || defined(__x86_64__)
3022                 if(CPUID::supportsSSE4_1())
3023                 {
3024                         return x86::floorps(x);
3025                 }
3026                 else
3027                 {
3028                         return x - Frac(x);
3029                 }
3030 #else
3031                 return RValue<Float4>(V(lowerFloor(V(x.value))));
3032 #endif
3033         }
3034
3035         RValue<Float4> Ceil(RValue<Float4> x)
3036         {
3037 #if defined(__i386__) || defined(__x86_64__)
3038                 if(CPUID::supportsSSE4_1())
3039                 {
3040                         return x86::ceilps(x);
3041                 }
3042                 else
3043 #endif
3044                 {
3045                         return -Floor(-x);
3046                 }
3047         }
3048
3049         Type *Float4::getType()
3050         {
3051                 return T(llvm::VectorType::get(T(Float::getType()), 4));
3052         }
3053
3054         RValue<Long> Ticks()
3055         {
3056                 llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::readcyclecounter);
3057
3058                 return RValue<Long>(V(::builder->CreateCall(rdtsc)));
3059         }
3060 }
3061
3062 namespace rr
3063 {
3064 #if defined(__i386__) || defined(__x86_64__)
3065         namespace x86
3066         {
3067                 RValue<Int> cvtss2si(RValue<Float> val)
3068                 {
3069                         llvm::Function *cvtss2si = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_cvtss2si);
3070
3071                         Float4 vector;
3072                         vector.x = val;
3073
3074                         return RValue<Int>(V(::builder->CreateCall(cvtss2si, ARGS(V(RValue<Float4>(vector).value)))));
3075                 }
3076
3077                 RValue<Int4> cvtps2dq(RValue<Float4> val)
3078                 {
3079                         llvm::Function *cvtps2dq = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_cvtps2dq);
3080
3081                         return RValue<Int4>(V(::builder->CreateCall(cvtps2dq, ARGS(V(val.value)))));
3082                 }
3083
3084                 RValue<Float> rcpss(RValue<Float> val)
3085                 {
3086                         llvm::Function *rcpss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rcp_ss);
3087
3088                         Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
3089
3090                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rcpss, ARGS(V(vector)))), Float::getType(), 0));
3091                 }
3092
3093                 RValue<Float> sqrtss(RValue<Float> val)
3094                 {
3095 #if REACTOR_LLVM_VERSION < 7
3096                         llvm::Function *sqrtss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_sqrt_ss);
3097                         Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
3098
3099                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(sqrtss, ARGS(V(vector)))), Float::getType(), 0));
3100 #else
3101                         llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::sqrt, {V(val.value)->getType()});
3102                         return RValue<Float>(V(::builder->CreateCall(sqrt, ARGS(V(val.value)))));
3103 #endif
3104                 }
3105
3106                 RValue<Float> rsqrtss(RValue<Float> val)
3107                 {
3108                         llvm::Function *rsqrtss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rsqrt_ss);
3109
3110                         Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::getType()))), val.value, 0);
3111
3112                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rsqrtss, ARGS(V(vector)))), Float::getType(), 0));
3113                 }
3114
3115                 RValue<Float4> rcpps(RValue<Float4> val)
3116                 {
3117                         llvm::Function *rcpps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rcp_ps);
3118
3119                         return RValue<Float4>(V(::builder->CreateCall(rcpps, ARGS(V(val.value)))));
3120                 }
3121
3122                 RValue<Float4> sqrtps(RValue<Float4> val)
3123                 {
3124 #if REACTOR_LLVM_VERSION < 7
3125                         llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_sqrt_ps);
3126 #else
3127                         llvm::Function *sqrtps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::sqrt, {V(val.value)->getType()});
3128 #endif
3129
3130                         return RValue<Float4>(V(::builder->CreateCall(sqrtps, ARGS(V(val.value)))));
3131                 }
3132
3133                 RValue<Float4> rsqrtps(RValue<Float4> val)
3134                 {
3135                         llvm::Function *rsqrtps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_rsqrt_ps);
3136
3137                         return RValue<Float4>(V(::builder->CreateCall(rsqrtps, ARGS(V(val.value)))));
3138                 }
3139
3140                 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3141                 {
3142                         llvm::Function *maxps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_max_ps);
3143
3144                         return RValue<Float4>(V(::builder->CreateCall2(maxps, ARGS(V(x.value), V(y.value)))));
3145                 }
3146
3147                 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3148                 {
3149                         llvm::Function *minps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_min_ps);
3150
3151                         return RValue<Float4>(V(::builder->CreateCall2(minps, ARGS(V(x.value), V(y.value)))));
3152                 }
3153
3154                 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3155                 {
3156                         llvm::Function *roundss = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_round_ss);
3157
3158                         Value *undef = V(llvm::UndefValue::get(T(Float4::getType())));
3159                         Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
3160
3161                         return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(roundss, ARGS(V(undef), V(vector), V(Nucleus::createConstantInt(imm))))), Float::getType(), 0));
3162                 }
3163
3164                 RValue<Float> floorss(RValue<Float> val)
3165                 {
3166                         return roundss(val, 1);
3167                 }
3168
3169                 RValue<Float> ceilss(RValue<Float> val)
3170                 {
3171                         return roundss(val, 2);
3172                 }
3173
3174                 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3175                 {
3176                         llvm::Function *roundps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_round_ps);
3177
3178                         return RValue<Float4>(V(::builder->CreateCall2(roundps, ARGS(V(val.value), V(Nucleus::createConstantInt(imm))))));
3179                 }
3180
3181                 RValue<Float4> floorps(RValue<Float4> val)
3182                 {
3183                         return roundps(val, 1);
3184                 }
3185
3186                 RValue<Float4> ceilps(RValue<Float4> val)
3187                 {
3188                         return roundps(val, 2);
3189                 }
3190
3191                 RValue<Int4> pabsd(RValue<Int4> x)
3192                 {
3193 #if REACTOR_LLVM_VERSION < 7
3194                         llvm::Function *pabsd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_ssse3_pabs_d_128);
3195
3196                         return RValue<Int4>(V(::builder->CreateCall(pabsd, ARGS(V(x.value)))));
3197 #else
3198                         return RValue<Int4>(V(lowerPABS(V(x.value))));
3199 #endif
3200                 }
3201
3202                 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3203                 {
3204                         llvm::Function *paddsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_padds_w);
3205
3206                         return As<Short4>(V(::builder->CreateCall2(paddsw, ARGS(V(x.value), V(y.value)))));
3207                 }
3208
3209                 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3210                 {
3211                         llvm::Function *psubsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubs_w);
3212
3213                         return As<Short4>(V(::builder->CreateCall2(psubsw, ARGS(V(x.value), V(y.value)))));
3214                 }
3215
3216                 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3217                 {
3218                         llvm::Function *paddusw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_paddus_w);
3219
3220                         return As<UShort4>(V(::builder->CreateCall2(paddusw, ARGS(V(x.value), V(y.value)))));
3221                 }
3222
3223                 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3224                 {
3225                         llvm::Function *psubusw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubus_w);
3226
3227                         return As<UShort4>(V(::builder->CreateCall2(psubusw, ARGS(V(x.value), V(y.value)))));
3228                 }
3229
3230                 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3231                 {
3232                         llvm::Function *paddsb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_padds_b);
3233
3234                         return As<SByte8>(V(::builder->CreateCall2(paddsb, ARGS(V(x.value), V(y.value)))));
3235                 }
3236
3237                 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3238                 {
3239                         llvm::Function *psubsb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubs_b);
3240
3241                         return As<SByte8>(V(::builder->CreateCall2(psubsb, ARGS(V(x.value), V(y.value)))));
3242                 }
3243
3244                 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3245                 {
3246                         llvm::Function *paddusb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_paddus_b);
3247
3248                         return As<Byte8>(V(::builder->CreateCall2(paddusb, ARGS(V(x.value), V(y.value)))));
3249                 }
3250
3251                 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3252                 {
3253                         llvm::Function *psubusb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psubus_b);
3254
3255                         return As<Byte8>(V(::builder->CreateCall2(psubusb, ARGS(V(x.value), V(y.value)))));
3256                 }
3257
3258                 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3259                 {
3260 #if REACTOR_LLVM_VERSION < 7
3261                         llvm::Function *pavgw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pavg_w);
3262
3263                         return As<UShort4>(V(::builder->CreateCall2(pavgw, ARGS(V(x.value), V(y.value)))));
3264 #else
3265                         return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
3266 #endif
3267                 }
3268
3269                 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3270                 {
3271 #if REACTOR_LLVM_VERSION < 7
3272                         llvm::Function *pmaxsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmaxs_w);
3273
3274                         return As<Short4>(V(::builder->CreateCall2(pmaxsw, ARGS(V(x.value), V(y.value)))));
3275 #else
3276                         return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
3277 #endif
3278                 }
3279
3280                 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3281                 {
3282 #if REACTOR_LLVM_VERSION < 7
3283                         llvm::Function *pminsw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmins_w);
3284
3285                         return As<Short4>(V(::builder->CreateCall2(pminsw, ARGS(V(x.value), V(y.value)))));
3286 #else
3287                         return As<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
3288 #endif
3289                 }
3290
3291                 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3292                 {
3293 #if REACTOR_LLVM_VERSION < 7
3294                         llvm::Function *pcmpgtw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpgt_w);
3295
3296                         return As<Short4>(V(::builder->CreateCall2(pcmpgtw, ARGS(V(x.value), V(y.value)))));
3297 #else
3298                         return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
3299 #endif
3300                 }
3301
3302                 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3303                 {
3304 #if REACTOR_LLVM_VERSION < 7
3305                         llvm::Function *pcmpeqw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpeq_w);
3306
3307                         return As<Short4>(V(::builder->CreateCall2(pcmpeqw, ARGS(V(x.value), V(y.value)))));
3308 #else
3309                         return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
3310 #endif
3311                 }
3312
3313                 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3314                 {
3315 #if REACTOR_LLVM_VERSION < 7
3316                         llvm::Function *pcmpgtb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpgt_b);
3317
3318                         return As<Byte8>(V(::builder->CreateCall2(pcmpgtb, ARGS(V(x.value), V(y.value)))));
3319 #else
3320                         return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
3321 #endif
3322                 }
3323
3324                 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3325                 {
3326 #if REACTOR_LLVM_VERSION < 7
3327                         llvm::Function *pcmpeqb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pcmpeq_b);
3328
3329                         return As<Byte8>(V(::builder->CreateCall2(pcmpeqb, ARGS(V(x.value), V(y.value)))));
3330 #else
3331                         return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
3332 #endif
3333                 }
3334
3335                 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3336                 {
3337                         llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packssdw_128);
3338
3339                         return As<Short4>(V(::builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
3340                 }
3341
3342                 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3343                 {
3344                         llvm::Function *packssdw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packssdw_128);
3345
3346                         return RValue<Short8>(V(::builder->CreateCall2(packssdw, ARGS(V(x.value), V(y.value)))));
3347                 }
3348
3349                 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3350                 {
3351                         llvm::Function *packsswb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packsswb_128);
3352
3353                         return As<SByte8>(V(::builder->CreateCall2(packsswb, ARGS(V(x.value), V(y.value)))));
3354                 }
3355
3356                 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3357                 {
3358                         llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packuswb_128);
3359
3360                         return As<Byte8>(V(::builder->CreateCall2(packuswb, ARGS(V(x.value), V(y.value)))));
3361                 }
3362
3363                 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3364                 {
3365                         if(CPUID::supportsSSE4_1())
3366                         {
3367                                 llvm::Function *packusdw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_packusdw);
3368
3369                                 return RValue<UShort8>(V(::builder->CreateCall2(packusdw, ARGS(V(x.value), V(y.value)))));
3370                         }
3371                         else
3372                         {
3373                                 RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3374                                 RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3375
3376                                 return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3377                         }
3378                 }
3379
3380                 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3381                 {
3382                         llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_w);
3383
3384                         return As<UShort4>(V(::builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3385                 }
3386
3387                 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3388                 {
3389                         llvm::Function *psrlw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_w);
3390
3391                         return RValue<UShort8>(V(::builder->CreateCall2(psrlw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3392                 }
3393
3394                 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3395                 {
3396                         llvm::Function *psraw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_w);
3397
3398                         return As<Short4>(V(::builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3399                 }
3400
3401                 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3402                 {
3403                         llvm::Function *psraw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_w);
3404
3405                         return RValue<Short8>(V(::builder->CreateCall2(psraw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3406                 }
3407
3408                 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3409                 {
3410                         llvm::Function *psllw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_w);
3411
3412                         return As<Short4>(V(::builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3413                 }
3414
3415                 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3416                 {
3417                         llvm::Function *psllw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_w);
3418
3419                         return RValue<Short8>(V(::builder->CreateCall2(psllw, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3420                 }
3421
3422                 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3423                 {
3424                         llvm::Function *pslld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_d);
3425
3426                         return As<Int2>(V(::builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3427                 }
3428
3429                 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3430                 {
3431                         llvm::Function *pslld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pslli_d);
3432
3433                         return RValue<Int4>(V(::builder->CreateCall2(pslld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3434                 }
3435
3436                 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3437                 {
3438                         llvm::Function *psrad = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_d);
3439
3440                         return As<Int2>(V(::builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3441                 }
3442
3443                 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3444                 {
3445                         llvm::Function *psrad = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrai_d);
3446
3447                         return RValue<Int4>(V(::builder->CreateCall2(psrad, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3448                 }
3449
3450                 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3451                 {
3452                         llvm::Function *psrld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_d);
3453
3454                         return As<UInt2>(V(::builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3455                 }
3456
3457                 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3458                 {
3459                         llvm::Function *psrld = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_psrli_d);
3460
3461                         return RValue<UInt4>(V(::builder->CreateCall2(psrld, ARGS(V(x.value), V(Nucleus::createConstantInt(y))))));
3462                 }
3463
3464                 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3465                 {
3466 #if REACTOR_LLVM_VERSION < 7
3467                         llvm::Function *pmaxsd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmaxsd);
3468
3469                         return RValue<Int4>(V(::builder->CreateCall2(pmaxsd, ARGS(V(x.value), V(y.value)))));
3470 #else
3471                         return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
3472 #endif
3473                 }
3474
3475                 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3476                 {
3477 #if REACTOR_LLVM_VERSION < 7
3478                         llvm::Function *pminsd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pminsd);
3479
3480                         return RValue<Int4>(V(::builder->CreateCall2(pminsd, ARGS(V(x.value), V(y.value)))));
3481 #else
3482                         return RValue<Int4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
3483 #endif
3484                 }
3485
3486                 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3487                 {
3488 #if REACTOR_LLVM_VERSION < 7
3489                         llvm::Function *pmaxud = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmaxud);
3490
3491                         return RValue<UInt4>(V(::builder->CreateCall2(pmaxud, ARGS(V(x.value), V(y.value)))));
3492 #else
3493                         return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_UGT)));
3494 #endif
3495                 }
3496
3497                 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3498                 {
3499 #if REACTOR_LLVM_VERSION < 7
3500                         llvm::Function *pminud = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pminud);
3501
3502                         return RValue<UInt4>(V(::builder->CreateCall2(pminud, ARGS(V(x.value), V(y.value)))));
3503 #else
3504                         return RValue<UInt4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_ULT)));
3505 #endif
3506                 }
3507
3508                 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3509                 {
3510                         llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulh_w);
3511
3512                         return As<Short4>(V(::builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
3513                 }
3514
3515                 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3516                 {
3517                         llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulhu_w);
3518
3519                         return As<UShort4>(V(::builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
3520                 }
3521
3522                 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3523                 {
3524                         llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmadd_wd);
3525
3526                         return As<Int2>(V(::builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
3527                 }
3528
3529                 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3530                 {
3531                         llvm::Function *pmulhw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulh_w);
3532
3533                         return RValue<Short8>(V(::builder->CreateCall2(pmulhw, ARGS(V(x.value), V(y.value)))));
3534                 }
3535
3536                 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3537                 {
3538                         llvm::Function *pmulhuw = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmulhu_w);
3539
3540                         return RValue<UShort8>(V(::builder->CreateCall2(pmulhuw, ARGS(V(x.value), V(y.value)))));
3541                 }
3542
3543                 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
3544                 {
3545                         llvm::Function *pmaddwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmadd_wd);
3546
3547                         return RValue<Int4>(V(::builder->CreateCall2(pmaddwd, ARGS(V(x.value), V(y.value)))));
3548                 }
3549
3550                 RValue<Int> movmskps(RValue<Float4> x)
3551                 {
3552                         llvm::Function *movmskps = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse_movmsk_ps);
3553
3554                         return RValue<Int>(V(::builder->CreateCall(movmskps, ARGS(V(x.value)))));
3555                 }
3556
3557                 RValue<Int> pmovmskb(RValue<Byte8> x)
3558                 {
3559                         llvm::Function *pmovmskb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_pmovmskb_128);
3560
3561                         return RValue<Int>(V(::builder->CreateCall(pmovmskb, ARGS(V(x.value))))) & 0xFF;
3562                 }
3563
3564                 RValue<Int4> pmovzxbd(RValue<Byte16> x)
3565                 {
3566 #if REACTOR_LLVM_VERSION < 7
3567                         llvm::Function *pmovzxbd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovzxbd);
3568
3569                         return RValue<Int4>(V(::builder->CreateCall(pmovzxbd, ARGS(V(x.value)))));
3570 #else
3571                         return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
3572 #endif
3573                 }
3574
3575                 RValue<Int4> pmovsxbd(RValue<SByte16> x)
3576                 {
3577 #if REACTOR_LLVM_VERSION < 7
3578                         llvm::Function *pmovsxbd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovsxbd);
3579
3580                         return RValue<Int4>(V(::builder->CreateCall(pmovsxbd, ARGS(V(x.value)))));
3581 #else
3582                         return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
3583 #endif
3584                 }
3585
3586                 RValue<Int4> pmovzxwd(RValue<UShort8> x)
3587                 {
3588 #if REACTOR_LLVM_VERSION < 7
3589                         llvm::Function *pmovzxwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovzxwd);
3590
3591                         return RValue<Int4>(V(::builder->CreateCall(pmovzxwd, ARGS(V(x.value)))));
3592 #else
3593                         return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), false)));
3594 #endif
3595                 }
3596
3597                 RValue<Int4> pmovsxwd(RValue<Short8> x)
3598                 {
3599 #if REACTOR_LLVM_VERSION < 7
3600                         llvm::Function *pmovsxwd = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse41_pmovsxwd);
3601
3602                         return RValue<Int4>(V(::builder->CreateCall(pmovsxwd, ARGS(V(x.value)))));
3603 #else
3604                         return RValue<Int4>(V(lowerPMOV(V(x.value), T(Int4::getType()), true)));
3605 #endif
3606                 }
3607         }
3608 #endif  // defined(__i386__) || defined(__x86_64__)
3609
3610 #ifdef ENABLE_RR_PRINT
3611         // extractAll returns a vector containing the extracted n scalar value of
3612         // the vector vec.
3613         static std::vector<Value*> extractAll(Value* vec, int n)
3614         {
3615                 std::vector<Value*> elements;
3616                 elements.reserve(n);
3617                 for (int i = 0; i < n; i++)
3618                 {
3619                         auto el = V(::builder->CreateExtractElement(V(vec), i));
3620                         elements.push_back(el);
3621                 }
3622                 return elements;
3623         }
3624
3625         // toDouble returns all the float values in vals extended to doubles.
3626         static std::vector<Value*> toDouble(const std::vector<Value*>& vals)
3627         {
3628                 auto doubleTy = ::llvm::Type::getDoubleTy(*::context);
3629                 std::vector<Value*> elements;
3630                 elements.reserve(vals.size());
3631                 for (auto v : vals)
3632                 {
3633                         elements.push_back(V(::builder->CreateFPExt(V(v), doubleTy)));
3634                 }
3635                 return elements;
3636         }
3637
3638         std::vector<Value*> PrintValue::Ty<Byte4>::val(const RValue<Byte4>& v) { return extractAll(v.value, 4); }
3639         std::vector<Value*> PrintValue::Ty<Int4>::val(const RValue<Int4>& v) { return extractAll(v.value, 4); }
3640         std::vector<Value*> PrintValue::Ty<UInt4>::val(const RValue<UInt4>& v) { return extractAll(v.value, 4); }
3641         std::vector<Value*> PrintValue::Ty<Short4>::val(const RValue<Short4>& v) { return extractAll(v.value, 4); }
3642         std::vector<Value*> PrintValue::Ty<UShort4>::val(const RValue<UShort4>& v) { return extractAll(v.value, 4); }
3643         std::vector<Value*> PrintValue::Ty<Float>::val(const RValue<Float>& v) { return toDouble({v.value}); }
3644         std::vector<Value*> PrintValue::Ty<Float4>::val(const RValue<Float4>& v) { return toDouble(extractAll(v.value, 4)); }
3645
3646         void Printv(const char* function, const char* file, int line, const char* fmt, std::initializer_list<PrintValue> args)
3647         {
3648                 // LLVM types used below.
3649                 auto i32Ty = ::llvm::Type::getInt32Ty(*::context);
3650                 auto intTy = ::llvm::Type::getInt64Ty(*::context); // TODO: Natural int width.
3651                 auto i8PtrTy = ::llvm::Type::getInt8PtrTy(*::context);
3652                 auto funcTy = ::llvm::FunctionType::get(i32Ty, {i8PtrTy}, true);
3653
3654                 auto func = ::module->getOrInsertFunction("printf", funcTy);
3655
3656                 // Build the printf format message string.
3657                 std::string str;
3658                 if (file != nullptr) { str += (line > 0) ? "%s:%d " : "%s "; }
3659                 if (function != nullptr) { str += "%s "; }
3660                 str += fmt;
3661
3662                 // Perform subsitution on all '{n}' bracketed indices in the format
3663                 // message.
3664                 int i = 0;
3665                 for (const PrintValue& arg : args)
3666                 {
3667                         str = replace(str, "{" + std::to_string(i++) + "}", arg.format);
3668                 }
3669
3670                 ::llvm::SmallVector<::llvm::Value*, 8> vals;
3671
3672                 // The format message is always the first argument.
3673                 vals.push_back(::builder->CreateGlobalStringPtr(str));
3674
3675                 // Add optional file, line and function info if provided.
3676                 if (file != nullptr)
3677                 {
3678                         vals.push_back(::builder->CreateGlobalStringPtr(file));
3679                         if (line > 0)
3680                         {
3681                                 vals.push_back(::llvm::ConstantInt::get(intTy, line));
3682                         }
3683                 }
3684                 if (function != nullptr)
3685                 {
3686                         vals.push_back(::builder->CreateGlobalStringPtr(function));
3687                 }
3688
3689                 // Add all format arguments.
3690                 for (const PrintValue& arg : args)
3691                 {
3692                         for (auto val : arg.values)
3693                         {
3694                                 vals.push_back(V(val));
3695                         }
3696                 }
3697
3698                 ::builder->CreateCall(func, vals);
3699         }
3700 #endif // ENABLE_RR_PRINT
3701
3702 }