src/Reactor/SubzeroReactor.cpp

   1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //    http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "Reactor.hpp"
  16
  17 #include "Optimizer.hpp"
  18 #include "ExecutableMemory.hpp"
  19
  20 #include "src/IceTypes.h"
  21 #include "src/IceCfg.h"
  22 #include "src/IceELFStreamer.h"
  23 #include "src/IceGlobalContext.h"
  24 #include "src/IceCfgNode.h"
  25 #include "src/IceELFObjectWriter.h"
  26 #include "src/IceGlobalInits.h"
  27
  28 #include "llvm/Support/FileSystem.h"
  29 #include "llvm/Support/raw_os_ostream.h"
  30 #include "llvm/Support/Compiler.h"
  31
  32 #if __has_feature(memory_sanitizer)
  33 #include <sanitizer/msan_interface.h>
  34 #endif
  35
  36 #if defined(_WIN32)
  37 #ifndef WIN32_LEAN_AND_MEAN
  38 #define WIN32_LEAN_AND_MEAN
  39 #endif // !WIN32_LEAN_AND_MEAN
  40 #ifndef NOMINMAX
  41 #define NOMINMAX
  42 #endif // !NOMINMAX
  43 #include <Windows.h>
  44 #else
  45 #include <sys/mman.h>
  46 #if !defined(MAP_ANONYMOUS)
  47 #define MAP_ANONYMOUS MAP_ANON
  48 #endif
  49 #endif
  50
  51 #include <mutex>
  52 #include <limits>
  53 #include <iostream>
  54 #include <cassert>
  55
  56 namespace
  57 {
  58         Ice::GlobalContext *context = nullptr;
  59         Ice::Cfg *function = nullptr;
  60         Ice::CfgNode *basicBlock = nullptr;
  61         Ice::CfgLocalAllocatorScope *allocator = nullptr;
  62         rr::Routine *routine = nullptr;
  63
  64         std::mutex codegenMutex;
  65
  66         Ice::ELFFileStreamer *elfFile = nullptr;
  67         Ice::Fdstream *out = nullptr;
  68 }
  69
  70 namespace
  71 {
  72         #if !defined(__i386__) && defined(_M_IX86)
  73                 #define __i386__ 1
  74         #endif
  75
  76         #if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
  77                 #define __x86_64__ 1
  78         #endif
  79
  80         class CPUID
  81         {
  82         public:
  83                 const static bool ARM;
  84                 const static bool SSE4_1;
  85
  86         private:
  87                 static void cpuid(int registers[4], int info)
  88                 {
  89                         #if defined(__i386__) || defined(__x86_64__)
  90                                 #if defined(_WIN32)
  91                                         __cpuid(registers, info);
  92                                 #else
  93                                         __asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));
  94                                 #endif
  95                         #else
  96                                 registers[0] = 0;
  97                                 registers[1] = 0;
  98                                 registers[2] = 0;
  99                                 registers[3] = 0;
 100                         #endif
 101                 }
 102
 103                 static bool detectARM()
 104                 {
 105                         #if defined(__arm__) || defined(__aarch64__)
 106                                 return true;
 107                         #elif defined(__i386__) || defined(__x86_64__)
 108                                 return false;
 109                         #elif defined(__mips__)
 110                                 return false;
 111                         #else
 112                                 #error "Unknown architecture"
 113                         #endif
 114                 }
 115
 116                 static bool detectSSE4_1()
 117                 {
 118                         #if defined(__i386__) || defined(__x86_64__)
 119                                 int registers[4];
 120                                 cpuid(registers, 1);
 121                                 return (registers[2] & 0x00080000) != 0;
 122                         #else
 123                                 return false;
 124                         #endif
 125                 }
 126         };
 127
 128         const bool CPUID::ARM = CPUID::detectARM();
 129         const bool CPUID::SSE4_1 = CPUID::detectSSE4_1();
 130         const bool emulateIntrinsics = false;
 131         const bool emulateMismatchedBitCast = CPUID::ARM;
 132 }
 133
 134 namespace rr
 135 {
        // Reactor vector types narrower than the Ice vector types used here.
        // Each one is encoded as the wider Ice type OR'ed with a tag placed
        // above bit EmulatedShift. T(Type*) masks the tag off again to recover
        // the Ice type, and typeSize() maps each tagged value to its size in
        // bytes (4 or 8).
        enum EmulatedType
        {
                EmulatedShift = 16,   // Bit position where the tag starts
                EmulatedV2 = 2 << EmulatedShift,
                EmulatedV4 = 4 << EmulatedShift,
                EmulatedV8 = 8 << EmulatedShift,
                EmulatedBits = EmulatedV2 | EmulatedV4 | EmulatedV8,   // Mask covering every tag bit

                Type_v2i32 = Ice::IceType_v4i32 | EmulatedV2,
                Type_v4i16 = Ice::IceType_v8i16 | EmulatedV4,
                Type_v2i16 = Ice::IceType_v8i16 | EmulatedV2,
                Type_v8i8 =  Ice::IceType_v16i8 | EmulatedV8,
                Type_v4i8 =  Ice::IceType_v16i8 | EmulatedV4,
                Type_v2f32 = Ice::IceType_v4f32 | EmulatedV2,
        };
 151
        // Reactor's opaque handle types, defined as empty subclasses of the
        // corresponding Subzero classes. V() and B() below produce Value* and
        // BasicBlock* by casting Subzero pointers.
        class Value : public Ice::Operand {};
        class SwitchCases : public Ice::InstSwitch {};
        class BasicBlock : public Ice::CfgNode {};
 155
 156         Ice::Type T(Type *t)
 157         {
 158                 static_assert(static_cast<unsigned int>(Ice::IceType_NUM) < static_cast<unsigned int>(EmulatedBits), "Ice::Type overlaps with our emulated types!");
 159                 return (Ice::Type)(reinterpret_cast<std::intptr_t>(t) & ~EmulatedBits);
 160         }
 161
 162         Type *T(Ice::Type t)
 163         {
 164                 return reinterpret_cast<Type*>(t);
 165         }
 166
 167         Type *T(EmulatedType t)
 168         {
 169                 return reinterpret_cast<Type*>(t);
 170         }
 171
 172         Value *V(Ice::Operand *v)
 173         {
 174                 return reinterpret_cast<Value*>(v);
 175         }
 176
 177         BasicBlock *B(Ice::CfgNode *b)
 178         {
 179                 return reinterpret_cast<BasicBlock*>(b);
 180         }
 181
 182         static size_t typeSize(Type *type)
 183         {
 184                 if(reinterpret_cast<std::intptr_t>(type) & EmulatedBits)
 185                 {
 186                         switch(reinterpret_cast<std::intptr_t>(type))
 187                         {
 188                         case Type_v2i32: return 8;
 189                         case Type_v4i16: return 8;
 190                         case Type_v2i16: return 4;
 191                         case Type_v8i8:  return 8;
 192                         case Type_v4i8:  return 4;
 193                         case Type_v2f32: return 8;
 194                         default: assert(false);
 195                         }
 196                 }
 197
 198                 return Ice::typeWidthInBytes(T(type));
 199         }
 200
        // Optimization settings table with room for 10 entries. The first two
        // are InstructionCombining and Disabled; the rest are zero-initialized.
        Optimization optimization[10] = {InstructionCombining, Disabled};
 202
        // ELF structure types matching the pointer size of the build: the
        // 64-bit variants when sizeof(void*) == 8, the 32-bit variants otherwise.
        using ElfHeader = std::conditional<sizeof(void*) == 8, Elf64_Ehdr, Elf32_Ehdr>::type;
        using SectionHeader = std::conditional<sizeof(void*) == 8, Elf64_Shdr, Elf32_Shdr>::type;
 205
 206         inline const SectionHeader *sectionHeader(const ElfHeader *elfHeader)
 207         {
 208                 return reinterpret_cast<const SectionHeader*>((intptr_t)elfHeader + elfHeader->e_shoff);
 209         }
 210
 211         inline const SectionHeader *elfSection(const ElfHeader *elfHeader, int index)
 212         {
 213                 return &sectionHeader(elfHeader)[index];
 214         }
 215
 216         static void *relocateSymbol(const ElfHeader *elfHeader, const Elf32_Rel &relocation, const SectionHeader &relocationTable)
 217         {
 218                 const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
 219
 220                 uint32_t index = relocation.getSymbol();
 221                 int table = relocationTable.sh_link;
 222                 void *symbolValue = nullptr;
 223
 224                 if(index != SHN_UNDEF)
 225                 {
 226                         if(table == SHN_UNDEF) return nullptr;
 227                         const SectionHeader *symbolTable = elfSection(elfHeader, table);
 228
 229                         uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
 230                         if(index >= symtab_entries)
 231                         {
 232                                 assert(index < symtab_entries && "Symbol Index out of range");
 233                                 return nullptr;
 234                         }
 235
 236                         intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
 237                         Elf32_Sym &symbol = ((Elf32_Sym*)symbolAddress)[index];
 238                         uint16_t section = symbol.st_shndx;
 239
 240                         if(section != SHN_UNDEF && section < SHN_LORESERVE)
 241                         {
 242                                 const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
 243                                 symbolValue = reinterpret_cast<void*>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
 244                         }
 245                         else
 246                         {
 247                                 return nullptr;
 248                         }
 249                 }
 250
 251                 intptr_t address = (intptr_t)elfHeader + target->sh_offset;
 252                 unaligned_ptr<int32_t> patchSite = (int32_t*)(address + relocation.r_offset);
 253
 254                 if(CPUID::ARM)
 255                 {
 256                         switch(relocation.getType())
 257                         {
 258                         case R_ARM_NONE:
 259                                 // No relocation
 260                                 break;
 261                         case R_ARM_MOVW_ABS_NC:
 262                                 {
 263                                         uint32_t thumb = 0;   // Calls to Thumb code not supported.
 264                                         uint32_t lo = (uint32_t)(intptr_t)symbolValue | thumb;
 265                                         *patchSite = (*patchSite & 0xFFF0F000) | ((lo & 0xF000) << 4) | (lo & 0x0FFF);
 266                                 }
 267                                 break;
 268                         case R_ARM_MOVT_ABS:
 269                                 {
 270                                         uint32_t hi = (uint32_t)(intptr_t)(symbolValue) >> 16;
 271                                         *patchSite = (*patchSite & 0xFFF0F000) | ((hi & 0xF000) << 4) | (hi & 0x0FFF);
 272                                 }
 273                                 break;
 274                         default:
 275                                 assert(false && "Unsupported relocation type");
 276                                 return nullptr;
 277                         }
 278                 }
 279                 else
 280                 {
 281                         switch(relocation.getType())
 282                         {
 283                         case R_386_NONE:
 284                                 // No relocation
 285                                 break;
 286                         case R_386_32:
 287                                 *patchSite = (int32_t)((intptr_t)symbolValue + *patchSite);
 288                                 break;
 289                 //      case R_386_PC32:
 290                 //              *patchSite = (int32_t)((intptr_t)symbolValue + *patchSite - (intptr_t)patchSite);
 291                 //              break;
 292                         default:
 293                                 assert(false && "Unsupported relocation type");
 294                                 return nullptr;
 295                         }
 296                 }
 297
 298                 return symbolValue;
 299         }
 300
 301         static void *relocateSymbol(const ElfHeader *elfHeader, const Elf64_Rela &relocation, const SectionHeader &relocationTable)
 302         {
 303                 const SectionHeader *target = elfSection(elfHeader, relocationTable.sh_info);
 304
 305                 uint32_t index = relocation.getSymbol();
 306                 int table = relocationTable.sh_link;
 307                 void *symbolValue = nullptr;
 308
 309                 if(index != SHN_UNDEF)
 310                 {
 311                         if(table == SHN_UNDEF) return nullptr;
 312                         const SectionHeader *symbolTable = elfSection(elfHeader, table);
 313
 314                         uint32_t symtab_entries = symbolTable->sh_size / symbolTable->sh_entsize;
 315                         if(index >= symtab_entries)
 316                         {
 317                                 assert(index < symtab_entries && "Symbol Index out of range");
 318                                 return nullptr;
 319                         }
 320
 321                         intptr_t symbolAddress = (intptr_t)elfHeader + symbolTable->sh_offset;
 322                         Elf64_Sym &symbol = ((Elf64_Sym*)symbolAddress)[index];
 323                         uint16_t section = symbol.st_shndx;
 324
 325                         if(section != SHN_UNDEF && section < SHN_LORESERVE)
 326                         {
 327                                 const SectionHeader *target = elfSection(elfHeader, symbol.st_shndx);
 328                                 symbolValue = reinterpret_cast<void*>((intptr_t)elfHeader + symbol.st_value + target->sh_offset);
 329                         }
 330                         else
 331                         {
 332                                 return nullptr;
 333                         }
 334                 }
 335
 336                 intptr_t address = (intptr_t)elfHeader + target->sh_offset;
 337                 unaligned_ptr<int32_t> patchSite32 = (int32_t*)(address + relocation.r_offset);
 338                 unaligned_ptr<int64_t> patchSite64 = (int64_t*)(address + relocation.r_offset);
 339
 340                 switch(relocation.getType())
 341                 {
 342                 case R_X86_64_NONE:
 343                         // No relocation
 344                         break;
 345                 case R_X86_64_64:
 346                         *patchSite64 = (int64_t)((intptr_t)symbolValue + *patchSite64 + relocation.r_addend);
 347                         break;
 348                 case R_X86_64_PC32:
 349                         *patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 - (intptr_t)patchSite32 + relocation.r_addend);
 350                         break;
 351                 case R_X86_64_32S:
 352                         *patchSite32 = (int32_t)((intptr_t)symbolValue + *patchSite32 + relocation.r_addend);
 353                         break;
 354                 default:
 355                         assert(false && "Unsupported relocation type");
 356                         return nullptr;
 357                 }
 358
 359                 return symbolValue;
 360         }
 361
 362         void *loadImage(uint8_t *const elfImage, size_t &codeSize)
 363         {
 364                 ElfHeader *elfHeader = (ElfHeader*)elfImage;
 365
 366                 if(!elfHeader->checkMagic())
 367                 {
 368                         return nullptr;
 369                 }
 370
 371                 // Expect ELF bitness to match platform
 372                 assert(sizeof(void*) == 8 ? elfHeader->getFileClass() == ELFCLASS64 : elfHeader->getFileClass() == ELFCLASS32);
 373                 #if defined(__i386__)
 374                         assert(sizeof(void*) == 4 && elfHeader->e_machine == EM_386);
 375                 #elif defined(__x86_64__)
 376                         assert(sizeof(void*) == 8 && elfHeader->e_machine == EM_X86_64);
 377                 #elif defined(__arm__)
 378                         assert(sizeof(void*) == 4 && elfHeader->e_machine == EM_ARM);
 379                 #elif defined(__aarch64__)
 380                         assert(sizeof(void*) == 8 && elfHeader->e_machine == EM_AARCH64);
 381                 #elif defined(__mips__)
 382                         assert(sizeof(void*) == 4 && elfHeader->e_machine == EM_MIPS);
 383                 #else
 384                         #error "Unsupported platform"
 385                 #endif
 386
 387                 SectionHeader *sectionHeader = (SectionHeader*)(elfImage + elfHeader->e_shoff);
 388                 void *entry = nullptr;
 389
 390                 for(int i = 0; i < elfHeader->e_shnum; i++)
 391                 {
 392                         if(sectionHeader[i].sh_type == SHT_PROGBITS)
 393                         {
 394                                 if(sectionHeader[i].sh_flags & SHF_EXECINSTR)
 395                                 {
 396                                         entry = elfImage + sectionHeader[i].sh_offset;
 397                                         codeSize = sectionHeader[i].sh_size;
 398                                 }
 399                         }
 400                         else if(sectionHeader[i].sh_type == SHT_REL)
 401                         {
 402                                 assert(sizeof(void*) == 4 && "UNIMPLEMENTED");   // Only expected/implemented for 32-bit code
 403
 404                                 for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
 405                                 {
 406                                         const Elf32_Rel &relocation = ((const Elf32_Rel*)(elfImage + sectionHeader[i].sh_offset))[index];
 407                                         relocateSymbol(elfHeader, relocation, sectionHeader[i]);
 408                                 }
 409                         }
 410                         else if(sectionHeader[i].sh_type == SHT_RELA)
 411                         {
 412                                 assert(sizeof(void*) == 8 && "UNIMPLEMENTED");   // Only expected/implemented for 64-bit code
 413
 414                                 for(Elf32_Word index = 0; index < sectionHeader[i].sh_size / sectionHeader[i].sh_entsize; index++)
 415                                 {
 416                                         const Elf64_Rela &relocation = ((const Elf64_Rela*)(elfImage + sectionHeader[i].sh_offset))[index];
 417                                         relocateSymbol(elfHeader, relocation, sectionHeader[i]);
 418                                 }
 419                         }
 420                 }
 421
 422                 return entry;
 423         }
 424
 425         template<typename T>
 426         struct ExecutableAllocator
 427         {
 428                 ExecutableAllocator() {};
 429                 template<class U> ExecutableAllocator(const ExecutableAllocator<U> &other) {};
 430
 431                 using value_type = T;
 432                 using size_type = std::size_t;
 433
 434                 T *allocate(size_type n)
 435                 {
 436                         return (T*)allocateExecutable(sizeof(T) * n);
 437                 }
 438
 439                 void deallocate(T *p, size_type n)
 440                 {
 441                         deallocateExecutable(p, sizeof(T) * n);
 442                 }
 443         };
 444
 445         class ELFMemoryStreamer : public Ice::ELFStreamer, public Routine
 446         {
 447                 ELFMemoryStreamer(const ELFMemoryStreamer &) = delete;
 448                 ELFMemoryStreamer &operator=(const ELFMemoryStreamer &) = delete;
 449
 450         public:
 451                 ELFMemoryStreamer() : Routine(), entry(nullptr)
 452                 {
 453                         position = 0;
 454                         buffer.reserve(0x1000);
 455                 }
 456
 457                 ~ELFMemoryStreamer() override
 458                 {
 459                         #if defined(_WIN32)
 460                                 if(buffer.size() != 0)
 461                                 {
 462                                         DWORD exeProtection;
 463                                         VirtualProtect(&buffer[0], buffer.size(), oldProtection, &exeProtection);
 464                                 }
 465                         #endif
 466                 }
 467
 468                 void write8(uint8_t Value) override
 469                 {
 470                         if(position == (uint64_t)buffer.size())
 471                         {
 472                                 buffer.push_back(Value);
 473                                 position++;
 474                         }
 475                         else if(position < (uint64_t)buffer.size())
 476                         {
 477                                 buffer[position] = Value;
 478                                 position++;
 479                         }
 480                         else assert(false && "UNIMPLEMENTED");
 481                 }
 482
 483                 void writeBytes(llvm::StringRef Bytes) override
 484                 {
 485                         std::size_t oldSize = buffer.size();
 486                         buffer.resize(oldSize + Bytes.size());
 487                         memcpy(&buffer[oldSize], Bytes.begin(), Bytes.size());
 488                         position += Bytes.size();
 489                 }
 490
 491                 uint64_t tell() const override { return position; }
 492
 493                 void seek(uint64_t Off) override { position = Off; }
 494
 495                 const void *getEntry() override
 496                 {
 497                         if(!entry)
 498                         {
 499                                 position = std::numeric_limits<std::size_t>::max();   // Can't stream more data after this
 500
 501                                 size_t codeSize = 0;
 502                                 entry = loadImage(&buffer[0], codeSize);
 503
 504                                 #if defined(_WIN32)
 505                                         VirtualProtect(&buffer[0], buffer.size(), PAGE_EXECUTE_READ, &oldProtection);
 506                                         FlushInstructionCache(GetCurrentProcess(), NULL, 0);
 507                                 #else
 508                                         mprotect(&buffer[0], buffer.size(), PROT_READ | PROT_EXEC);
 509                                         __builtin___clear_cache((char*)entry, (char*)entry + codeSize);
 510                                 #endif
 511                         }
 512
 513                         return entry;
 514                 }
 515
 516         private:
 517                 void *entry;
 518                 std::vector<uint8_t, ExecutableAllocator<uint8_t>> buffer;
 519                 std::size_t position;
 520
 521                 #if defined(_WIN32)
 522                 DWORD oldProtection;
 523                 #endif
 524         };
 525
 526         Nucleus::Nucleus()
 527         {
 528                 ::codegenMutex.lock();   // Reactor is currently not thread safe
 529
 530                 Ice::ClFlags &Flags = Ice::ClFlags::Flags;
 531                 Ice::ClFlags::getParsedClFlags(Flags);
 532
 533                 #if defined(__arm__)
 534                         Flags.setTargetArch(Ice::Target_ARM32);
 535                         Flags.setTargetInstructionSet(Ice::ARM32InstructionSet_HWDivArm);
 536                 #elif defined(__mips__)
 537                         Flags.setTargetArch(Ice::Target_MIPS32);
 538                         Flags.setTargetInstructionSet(Ice::BaseInstructionSet);
 539                 #else   // x86
 540                         Flags.setTargetArch(sizeof(void*) == 8 ? Ice::Target_X8664 : Ice::Target_X8632);
 541                         Flags.setTargetInstructionSet(CPUID::SSE4_1 ? Ice::X86InstructionSet_SSE4_1 : Ice::X86InstructionSet_SSE2);
 542                 #endif
 543                 Flags.setOutFileType(Ice::FT_Elf);
 544                 Flags.setOptLevel(Ice::Opt_2);
 545                 Flags.setApplicationBinaryInterface(Ice::ABI_Platform);
 546                 Flags.setVerbose(false ? Ice::IceV_Most : Ice::IceV_None);
 547                 Flags.setDisableHybridAssembly(true);
 548
 549                 static llvm::raw_os_ostream cout(std::cout);
 550                 static llvm::raw_os_ostream cerr(std::cerr);
 551
 552                 if(false)   // Write out to a file
 553                 {
 554                         std::error_code errorCode;
 555                         ::out = new Ice::Fdstream("out.o", errorCode, llvm::sys::fs::F_None);
 556                         ::elfFile = new Ice::ELFFileStreamer(*out);
 557                         ::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfFile);
 558                 }
 559                 else
 560                 {
 561                         ELFMemoryStreamer *elfMemory = new ELFMemoryStreamer();
 562                         ::context = new Ice::GlobalContext(&cout, &cout, &cerr, elfMemory);
 563                         ::routine = elfMemory;
 564                 }
 565         }
 566
 567         Nucleus::~Nucleus()
 568         {
 569                 delete ::routine;
 570
 571                 delete ::allocator;
 572                 delete ::function;
 573                 delete ::context;
 574
 575                 delete ::elfFile;
 576                 delete ::out;
 577
 578                 ::codegenMutex.unlock();
 579         }
 580
 581         Routine *Nucleus::acquireRoutine(const char *name, bool runOptimizations)
 582         {
 583                 if(basicBlock->getInsts().empty() || basicBlock->getInsts().back().getKind() != Ice::Inst::Ret)
 584                 {
 585                         createRetVoid();
 586                 }
 587
 588                 ::function->setFunctionName(Ice::GlobalString::createWithString(::context, name));
 589
 590                 optimize();
 591
 592                 ::function->translate();
 593                 assert(!::function->hasError());
 594
 595                 auto globals = ::function->getGlobalInits();
 596
 597                 if(globals && !globals->empty())
 598                 {
 599                         ::context->getGlobals()->merge(globals.get());
 600                 }
 601
 602                 ::context->emitFileHeader();
 603                 ::function->emitIAS();
 604                 auto assembler = ::function->releaseAssembler();
 605                 auto objectWriter = ::context->getObjectWriter();
 606                 assembler->alignFunction();
 607                 objectWriter->writeFunctionCode(::function->getFunctionName(), false, assembler.get());
 608                 ::context->lowerGlobals("last");
 609                 ::context->lowerConstants();
 610                 ::context->lowerJumpTables();
 611                 objectWriter->setUndefinedSyms(::context->getConstantExternSyms());
 612                 objectWriter->writeNonUserSections();
 613
 614                 Routine *handoffRoutine = ::routine;
 615                 ::routine = nullptr;
 616
 617                 return handoffRoutine;
 618         }
 619
 620         void Nucleus::optimize()
 621         {
 622                 rr::optimize(::function);
 623         }
 624
 625         Value *Nucleus::allocateStackVariable(Type *t, int arraySize)
 626         {
 627                 Ice::Type type = T(t);
 628                 int typeSize = Ice::typeWidthInBytes(type);
 629                 int totalSize = typeSize * (arraySize ? arraySize : 1);
 630
 631                 auto bytes = Ice::ConstantInteger32::create(::context, type, totalSize);
 632                 auto address = ::function->makeVariable(T(getPointerType(t)));
 633                 auto alloca = Ice::InstAlloca::create(::function, address, bytes, typeSize);
 634                 ::function->getEntryNode()->getInsts().push_front(alloca);
 635
 636                 return V(address);
 637         }
 638
 639         BasicBlock *Nucleus::createBasicBlock()
 640         {
 641                 return B(::function->makeNode());
 642         }
 643
 644         BasicBlock *Nucleus::getInsertBlock()
 645         {
 646                 return B(::basicBlock);
 647         }
 648
 649         void Nucleus::setInsertBlock(BasicBlock *basicBlock)
 650         {
 651         //      assert(::basicBlock->getInsts().back().getTerminatorEdges().size() >= 0 && "Previous basic block must have a terminator");
 652
 653                 Variable::materializeAll();
 654
 655                 ::basicBlock = basicBlock;
 656         }
 657
 658         void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
 659         {
 660                 uint32_t sequenceNumber = 0;
 661                 ::function = Ice::Cfg::create(::context, sequenceNumber).release();
 662                 ::allocator = new Ice::CfgLocalAllocatorScope(::function);
 663
 664                 for(Type *type : Params)
 665                 {
 666                         Ice::Variable *arg = ::function->makeVariable(T(type));
 667                         ::function->addArg(arg);
 668                 }
 669
 670                 Ice::CfgNode *node = ::function->makeNode();
 671                 ::function->setEntryNode(node);
 672                 ::basicBlock = node;
 673         }
 674
 675         Value *Nucleus::getArgument(unsigned int index)
 676         {
 677                 return V(::function->getArgs()[index]);
 678         }
 679
 680         void Nucleus::createRetVoid()
 681         {
 682                 // Code generated after this point is unreachable, so any variables
 683                 // being read can safely return an undefined value. We have to avoid
 684                 // materializing variables after the terminator ret instruction.
 685                 Variable::killUnmaterialized();
 686
 687                 Ice::InstRet *ret = Ice::InstRet::create(::function);
 688                 ::basicBlock->appendInst(ret);
 689         }
 690
 691         void Nucleus::createRet(Value *v)
 692         {
 693                 // Code generated after this point is unreachable, so any variables
 694                 // being read can safely return an undefined value. We have to avoid
 695                 // materializing variables after the terminator ret instruction.
 696                 Variable::killUnmaterialized();
 697
 698                 Ice::InstRet *ret = Ice::InstRet::create(::function, v);
 699                 ::basicBlock->appendInst(ret);
 700         }
 701
 702         void Nucleus::createBr(BasicBlock *dest)
 703         {
 704                 Variable::materializeAll();
 705
 706                 auto br = Ice::InstBr::create(::function, dest);
 707                 ::basicBlock->appendInst(br);
 708         }
 709
 710         void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
 711         {
 712                 Variable::materializeAll();
 713
 714                 auto br = Ice::InstBr::create(::function, cond, ifTrue, ifFalse);
 715                 ::basicBlock->appendInst(br);
 716         }
 717
 718         static bool isCommutative(Ice::InstArithmetic::OpKind op)
 719         {
 720                 switch(op)
 721                 {
 722                 case Ice::InstArithmetic::Add:
 723                 case Ice::InstArithmetic::Fadd:
 724                 case Ice::InstArithmetic::Mul:
 725                 case Ice::InstArithmetic::Fmul:
 726                 case Ice::InstArithmetic::And:
 727                 case Ice::InstArithmetic::Or:
 728                 case Ice::InstArithmetic::Xor:
 729                         return true;
 730                 default:
 731                         return false;
 732                 }
 733         }
 734
 735         static Value *createArithmetic(Ice::InstArithmetic::OpKind op, Value *lhs, Value *rhs)
 736         {
 737                 assert(lhs->getType() == rhs->getType() || llvm::isa<Ice::Constant>(rhs));
 738
 739                 bool swapOperands = llvm::isa<Ice::Constant>(lhs) && isCommutative(op);
 740
 741                 Ice::Variable *result = ::function->makeVariable(lhs->getType());
 742                 Ice::InstArithmetic *arithmetic = Ice::InstArithmetic::create(::function, op, result, swapOperands ? rhs : lhs, swapOperands ? lhs : rhs);
 743                 ::basicBlock->appendInst(arithmetic);
 744
 745                 return V(result);
 746         }
 747
 748         Value *Nucleus::createAdd(Value *lhs, Value *rhs)
 749         {
 750                 return createArithmetic(Ice::InstArithmetic::Add, lhs, rhs);
 751         }
 752
 753         Value *Nucleus::createSub(Value *lhs, Value *rhs)
 754         {
 755                 return createArithmetic(Ice::InstArithmetic::Sub, lhs, rhs);
 756         }
 757
 758         Value *Nucleus::createMul(Value *lhs, Value *rhs)
 759         {
 760                 return createArithmetic(Ice::InstArithmetic::Mul, lhs, rhs);
 761         }
 762
 763         Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
 764         {
 765                 return createArithmetic(Ice::InstArithmetic::Udiv, lhs, rhs);
 766         }
 767
 768         Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
 769         {
 770                 return createArithmetic(Ice::InstArithmetic::Sdiv, lhs, rhs);
 771         }
 772
 773         Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
 774         {
 775                 return createArithmetic(Ice::InstArithmetic::Fadd, lhs, rhs);
 776         }
 777
 778         Value *Nucleus::createFSub(Value *lhs, Value *rhs)
 779         {
 780                 return createArithmetic(Ice::InstArithmetic::Fsub, lhs, rhs);
 781         }
 782
 783         Value *Nucleus::createFMul(Value *lhs, Value *rhs)
 784         {
 785                 return createArithmetic(Ice::InstArithmetic::Fmul, lhs, rhs);
 786         }
 787
 788         Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
 789         {
 790                 return createArithmetic(Ice::InstArithmetic::Fdiv, lhs, rhs);
 791         }
 792
 793         Value *Nucleus::createURem(Value *lhs, Value *rhs)
 794         {
 795                 return createArithmetic(Ice::InstArithmetic::Urem, lhs, rhs);
 796         }
 797
 798         Value *Nucleus::createSRem(Value *lhs, Value *rhs)
 799         {
 800                 return createArithmetic(Ice::InstArithmetic::Srem, lhs, rhs);
 801         }
 802
 803         Value *Nucleus::createFRem(Value *lhs, Value *rhs)
 804         {
 805                 return createArithmetic(Ice::InstArithmetic::Frem, lhs, rhs);
 806         }
 807
 808         Value *Nucleus::createShl(Value *lhs, Value *rhs)
 809         {
 810                 return createArithmetic(Ice::InstArithmetic::Shl, lhs, rhs);
 811         }
 812
 813         Value *Nucleus::createLShr(Value *lhs, Value *rhs)
 814         {
 815                 return createArithmetic(Ice::InstArithmetic::Lshr, lhs, rhs);
 816         }
 817
 818         Value *Nucleus::createAShr(Value *lhs, Value *rhs)
 819         {
 820                 return createArithmetic(Ice::InstArithmetic::Ashr, lhs, rhs);
 821         }
 822
 823         Value *Nucleus::createAnd(Value *lhs, Value *rhs)
 824         {
 825                 return createArithmetic(Ice::InstArithmetic::And, lhs, rhs);
 826         }
 827
 828         Value *Nucleus::createOr(Value *lhs, Value *rhs)
 829         {
 830                 return createArithmetic(Ice::InstArithmetic::Or, lhs, rhs);
 831         }
 832
 833         Value *Nucleus::createXor(Value *lhs, Value *rhs)
 834         {
 835                 return createArithmetic(Ice::InstArithmetic::Xor, lhs, rhs);
 836         }
 837
 838         Value *Nucleus::createNeg(Value *v)
 839         {
 840                 return createSub(createNullValue(T(v->getType())), v);
 841         }
 842
 843         Value *Nucleus::createFNeg(Value *v)
 844         {
 845                 double c[4] = {-0.0, -0.0, -0.0, -0.0};
 846                 Value *negativeZero = Ice::isVectorType(v->getType()) ?
 847                                       createConstantVector(c, T(v->getType())) :
 848                                       V(::context->getConstantFloat(-0.0f));
 849
 850                 return createFSub(negativeZero, v);
 851         }
 852
 853         Value *Nucleus::createNot(Value *v)
 854         {
 855                 if(Ice::isScalarIntegerType(v->getType()))
 856                 {
 857                         return createXor(v, V(::context->getConstantInt(v->getType(), -1)));
 858                 }
 859                 else   // Vector
 860                 {
 861                         int64_t c[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
 862                         return createXor(v, createConstantVector(c, T(v->getType())));
 863                 }
 864         }
 865
 866         Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
 867         {
 868                 assert(!atomic);  // Unimplemented
 869                 assert(memoryOrder == std::memory_order_relaxed);  // Unimplemented
 870
 871                 int valueType = (int)reinterpret_cast<intptr_t>(type);
 872                 Ice::Variable *result = ::function->makeVariable(T(type));
 873
 874                 if((valueType & EmulatedBits) && (align != 0))   // Narrow vector not stored on stack.
 875                 {
 876                         if(emulateIntrinsics)
 877                         {
 878                                 if(typeSize(type) == 4)
 879                                 {
 880                                         auto pointer = RValue<Pointer<Byte>>(ptr);
 881                                         Int x = *Pointer<Int>(pointer);
 882
 883                                         Int4 vector;
 884                                         vector = Insert(vector, x, 0);
 885
 886                                         auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
 887                                         ::basicBlock->appendInst(bitcast);
 888                                 }
 889                                 else if(typeSize(type) == 8)
 890                                 {
 891                                         auto pointer = RValue<Pointer<Byte>>(ptr);
 892                                         Int x = *Pointer<Int>(pointer);
 893                                         Int y = *Pointer<Int>(pointer + 4);
 894
 895                                         Int4 vector;
 896                                         vector = Insert(vector, x, 0);
 897                                         vector = Insert(vector, y, 1);
 898
 899                                         auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, result, vector.loadValue());
 900                                         ::basicBlock->appendInst(bitcast);
 901                                 }
 902                                 else assert(false);
 903                         }
 904                         else
 905                         {
 906                                 const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
 907                                 auto target = ::context->getConstantUndef(Ice::IceType_i32);
 908                                 auto load = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
 909                                 load->addArg(ptr);
 910                                 load->addArg(::context->getConstantInt32(typeSize(type)));
 911                                 ::basicBlock->appendInst(load);
 912                         }
 913                 }
 914                 else
 915                 {
 916                         auto load = Ice::InstLoad::create(::function, result, ptr, align);
 917                         ::basicBlock->appendInst(load);
 918                 }
 919
 920                 return V(result);
 921         }
 922
 923         Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align, bool atomic, std::memory_order memoryOrder)
 924         {
 925                 assert(!atomic);  // Unimplemented
 926                 assert(memoryOrder == std::memory_order_relaxed);  // Unimplemented
 927
 928                 #if __has_feature(memory_sanitizer)
 929                         // Mark all (non-stack) memory writes as initialized by calling __msan_unpoison
 930                         if(align != 0)
 931                         {
 932                                 auto call = Ice::InstCall::create(::function, 2, nullptr, ::context->getConstantInt64(reinterpret_cast<intptr_t>(__msan_unpoison)), false);
 933                                 call->addArg(ptr);
 934                                 call->addArg(::context->getConstantInt64(typeSize(type)));
 935                                 ::basicBlock->appendInst(call);
 936                         }
 937                 #endif
 938
 939                 int valueType = (int)reinterpret_cast<intptr_t>(type);
 940
 941                 if((valueType & EmulatedBits) && (align != 0))   // Narrow vector not stored on stack.
 942                 {
 943                         if(emulateIntrinsics)
 944                         {
 945                                 if(typeSize(type) == 4)
 946                                 {
 947                                         Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
 948                                         auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
 949                                         ::basicBlock->appendInst(bitcast);
 950
 951                                         RValue<Int4> v(V(vector));
 952
 953                                         auto pointer = RValue<Pointer<Byte>>(ptr);
 954                                         Int x = Extract(v, 0);
 955                                         *Pointer<Int>(pointer) = x;
 956                                 }
 957                                 else if(typeSize(type) == 8)
 958                                 {
 959                                         Ice::Variable *vector = ::function->makeVariable(Ice::IceType_v4i32);
 960                                         auto bitcast = Ice::InstCast::create(::function, Ice::InstCast::Bitcast, vector, value);
 961                                         ::basicBlock->appendInst(bitcast);
 962
 963                                         RValue<Int4> v(V(vector));
 964
 965                                         auto pointer = RValue<Pointer<Byte>>(ptr);
 966                                         Int x = Extract(v, 0);
 967                                         *Pointer<Int>(pointer) = x;
 968                                         Int y = Extract(v, 1);
 969                                         *Pointer<Int>(pointer + 4) = y;
 970                                 }
 971                                 else assert(false);
 972                         }
 973                         else
 974                         {
 975                                 const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
 976                                 auto target = ::context->getConstantUndef(Ice::IceType_i32);
 977                                 auto store = Ice::InstIntrinsicCall::create(::function, 3, nullptr, target, intrinsic);
 978                                 store->addArg(value);
 979                                 store->addArg(ptr);
 980                                 store->addArg(::context->getConstantInt32(typeSize(type)));
 981                                 ::basicBlock->appendInst(store);
 982                         }
 983                 }
 984                 else
 985                 {
 986                         assert(value->getType() == T(type));
 987
 988                         auto store = Ice::InstStore::create(::function, value, ptr, align);
 989                         ::basicBlock->appendInst(store);
 990                 }
 991
 992                 return value;
 993         }
 994
 995         Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
 996         {
 997                 assert(index->getType() == Ice::IceType_i32);
 998
 999                 if(auto *constant = llvm::dyn_cast<Ice::ConstantInteger32>(index))
1000                 {
1001                         int32_t offset = constant->getValue() * (int)typeSize(type);
1002
1003                         if(offset == 0)
1004                         {
1005                                 return ptr;
1006                         }
1007
1008                         return createAdd(ptr, createConstantInt(offset));
1009                 }
1010
1011                 if(!Ice::isByteSizedType(T(type)))
1012                 {
1013                         index = createMul(index, createConstantInt((int)typeSize(type)));
1014                 }
1015
1016                 if(sizeof(void*) == 8)
1017                 {
1018                         if(unsignedIndex)
1019                         {
1020                                 index = createZExt(index, T(Ice::IceType_i64));
1021                         }
1022                         else
1023                         {
1024                                 index = createSExt(index, T(Ice::IceType_i64));
1025                         }
1026                 }
1027
1028                 return createAdd(ptr, index);
1029         }
1030
1031         Value *Nucleus::createAtomicAdd(Value *ptr, Value *value)
1032         {
1033                 assert(false && "UNIMPLEMENTED"); return nullptr;
1034         }
1035
1036         static Value *createCast(Ice::InstCast::OpKind op, Value *v, Type *destType)
1037         {
1038                 if(v->getType() == T(destType))
1039                 {
1040                         return v;
1041                 }
1042
1043                 Ice::Variable *result = ::function->makeVariable(T(destType));
1044                 Ice::InstCast *cast = Ice::InstCast::create(::function, op, result, v);
1045                 ::basicBlock->appendInst(cast);
1046
1047                 return V(result);
1048         }
1049
1050         Value *Nucleus::createTrunc(Value *v, Type *destType)
1051         {
1052                 return createCast(Ice::InstCast::Trunc, v, destType);
1053         }
1054
1055         Value *Nucleus::createZExt(Value *v, Type *destType)
1056         {
1057                 return createCast(Ice::InstCast::Zext, v, destType);
1058         }
1059
1060         Value *Nucleus::createSExt(Value *v, Type *destType)
1061         {
1062                 return createCast(Ice::InstCast::Sext, v, destType);
1063         }
1064
1065         Value *Nucleus::createFPToSI(Value *v, Type *destType)
1066         {
1067                 return createCast(Ice::InstCast::Fptosi, v, destType);
1068         }
1069
1070         Value *Nucleus::createSIToFP(Value *v, Type *destType)
1071         {
1072                 return createCast(Ice::InstCast::Sitofp, v, destType);
1073         }
1074
1075         Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1076         {
1077                 return createCast(Ice::InstCast::Fptrunc, v, destType);
1078         }
1079
1080         Value *Nucleus::createFPExt(Value *v, Type *destType)
1081         {
1082                 return createCast(Ice::InstCast::Fpext, v, destType);
1083         }
1084
1085         Value *Nucleus::createBitCast(Value *v, Type *destType)
1086         {
1087                 // Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1088                 // support for casting between scalars and wide vectors. For platforms where this is not supported,
1089                 // emulate them by writing to the stack and reading back as the destination type.
1090                 if(emulateMismatchedBitCast)
1091                 {
1092                         if(!Ice::isVectorType(v->getType()) && Ice::isVectorType(T(destType)))
1093                         {
1094                                 Value *address = allocateStackVariable(destType);
1095                                 createStore(v, address, T(v->getType()));
1096                                 return createLoad(address, destType);
1097                         }
1098                         else if(Ice::isVectorType(v->getType()) && !Ice::isVectorType(T(destType)))
1099                         {
1100                                 Value *address = allocateStackVariable(T(v->getType()));
1101                                 createStore(v, address, T(v->getType()));
1102                                 return createLoad(address, destType);
1103                         }
1104                 }
1105
1106                 return createCast(Ice::InstCast::Bitcast, v, destType);
1107         }
1108
1109         static Value *createIntCompare(Ice::InstIcmp::ICond condition, Value *lhs, Value *rhs)
1110         {
1111                 assert(lhs->getType() == rhs->getType());
1112
1113                 auto result = ::function->makeVariable(Ice::isScalarIntegerType(lhs->getType()) ? Ice::IceType_i1 : lhs->getType());
1114                 auto cmp = Ice::InstIcmp::create(::function, condition, result, lhs, rhs);
1115                 ::basicBlock->appendInst(cmp);
1116
1117                 return V(result);
1118         }
1119
1120         Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1121         {
1122                 return createIntCompare(Ice::InstIcmp::Eq, lhs, rhs);
1123         }
1124
1125         Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1126         {
1127                 return createIntCompare(Ice::InstIcmp::Ne, lhs, rhs);
1128         }
1129
1130         Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1131         {
1132                 return createIntCompare(Ice::InstIcmp::Ugt, lhs, rhs);
1133         }
1134
1135         Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1136         {
1137                 return createIntCompare(Ice::InstIcmp::Uge, lhs, rhs);
1138         }
1139
1140         Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1141         {
1142                 return createIntCompare(Ice::InstIcmp::Ult, lhs, rhs);
1143         }
1144
1145         Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1146         {
1147                 return createIntCompare(Ice::InstIcmp::Ule, lhs, rhs);
1148         }
1149
1150         Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1151         {
1152                 return createIntCompare(Ice::InstIcmp::Sgt, lhs, rhs);
1153         }
1154
1155         Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1156         {
1157                 return createIntCompare(Ice::InstIcmp::Sge, lhs, rhs);
1158         }
1159
1160         Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1161         {
1162                 return createIntCompare(Ice::InstIcmp::Slt, lhs, rhs);
1163         }
1164
1165         Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1166         {
1167                 return createIntCompare(Ice::InstIcmp::Sle, lhs, rhs);
1168         }
1169
1170         static Value *createFloatCompare(Ice::InstFcmp::FCond condition, Value *lhs, Value *rhs)
1171         {
1172                 assert(lhs->getType() == rhs->getType());
1173                 assert(Ice::isScalarFloatingType(lhs->getType()) || lhs->getType() == Ice::IceType_v4f32);
1174
1175                 auto result = ::function->makeVariable(Ice::isScalarFloatingType(lhs->getType()) ? Ice::IceType_i1 : Ice::IceType_v4i32);
1176                 auto cmp = Ice::InstFcmp::create(::function, condition, result, lhs, rhs);
1177                 ::basicBlock->appendInst(cmp);
1178
1179                 return V(result);
1180         }
1181
1182         Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1183         {
1184                 return createFloatCompare(Ice::InstFcmp::Oeq, lhs, rhs);
1185         }
1186
1187         Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1188         {
1189                 return createFloatCompare(Ice::InstFcmp::Ogt, lhs, rhs);
1190         }
1191
1192         Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1193         {
1194                 return createFloatCompare(Ice::InstFcmp::Oge, lhs, rhs);
1195         }
1196
1197         Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1198         {
1199                 return createFloatCompare(Ice::InstFcmp::Olt, lhs, rhs);
1200         }
1201
1202         Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1203         {
1204                 return createFloatCompare(Ice::InstFcmp::Ole, lhs, rhs);
1205         }
1206
1207         Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1208         {
1209                 return createFloatCompare(Ice::InstFcmp::One, lhs, rhs);
1210         }
1211
1212         Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1213         {
1214                 return createFloatCompare(Ice::InstFcmp::Ord, lhs, rhs);
1215         }
1216
1217         Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1218         {
1219                 return createFloatCompare(Ice::InstFcmp::Uno, lhs, rhs);
1220         }
1221
1222         Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1223         {
1224                 return createFloatCompare(Ice::InstFcmp::Ueq, lhs, rhs);
1225         }
1226
1227         Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1228         {
1229                 return createFloatCompare(Ice::InstFcmp::Ugt, lhs, rhs);
1230         }
1231
1232         Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1233         {
1234                 return createFloatCompare(Ice::InstFcmp::Uge, lhs, rhs);
1235         }
1236
1237         Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1238         {
1239                 return createFloatCompare(Ice::InstFcmp::Ult, lhs, rhs);
1240         }
1241
1242         Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1243         {
1244                 return createFloatCompare(Ice::InstFcmp::Ule, lhs, rhs);
1245         }
1246
1247         Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1248         {
1249                 return createFloatCompare(Ice::InstFcmp::Une, lhs, rhs);
1250         }
1251
1252         Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1253         {
1254                 auto result = ::function->makeVariable(T(type));
1255                 auto extract = Ice::InstExtractElement::create(::function, result, vector, ::context->getConstantInt32(index));
1256                 ::basicBlock->appendInst(extract);
1257
1258                 return V(result);
1259         }
1260
1261         Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1262         {
1263                 auto result = ::function->makeVariable(vector->getType());
1264                 auto insert = Ice::InstInsertElement::create(::function, result, vector, element, ::context->getConstantInt32(index));
1265                 ::basicBlock->appendInst(insert);
1266
1267                 return V(result);
1268         }
1269
1270         Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
1271         {
1272                 assert(V1->getType() == V2->getType());
1273
1274                 int size = Ice::typeNumElements(V1->getType());
1275                 auto result = ::function->makeVariable(V1->getType());
1276                 auto shuffle = Ice::InstShuffleVector::create(::function, result, V1, V2);
1277
1278                 for(int i = 0; i < size; i++)
1279                 {
1280                         shuffle->addIndex(llvm::cast<Ice::ConstantInteger32>(::context->getConstantInt32(select[i])));
1281                 }
1282
1283                 ::basicBlock->appendInst(shuffle);
1284
1285                 return V(result);
1286         }
1287
1288         Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
1289         {
1290                 assert(ifTrue->getType() == ifFalse->getType());
1291
1292                 auto result = ::function->makeVariable(ifTrue->getType());
1293                 auto *select = Ice::InstSelect::create(::function, result, C, ifTrue, ifFalse);
1294                 ::basicBlock->appendInst(select);
1295
1296                 return V(result);
1297         }
1298
1299         SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1300         {
1301                 auto switchInst = Ice::InstSwitch::create(::function, numCases, control, defaultBranch);
1302                 ::basicBlock->appendInst(switchInst);
1303
1304                 return reinterpret_cast<SwitchCases*>(switchInst);
1305         }
1306
1307         void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1308         {
1309                 switchCases->addBranch(label, label, branch);
1310         }
1311
1312         void Nucleus::createUnreachable()
1313         {
1314                 Ice::InstUnreachable *unreachable = Ice::InstUnreachable::create(::function);
1315                 ::basicBlock->appendInst(unreachable);
1316         }
1317
1318         Type *Nucleus::getPointerType(Type *ElementType)
1319         {
1320                 if(sizeof(void*) == 8)
1321                 {
1322                         return T(Ice::IceType_i64);
1323                 }
1324                 else
1325                 {
1326                         return T(Ice::IceType_i32);
1327                 }
1328         }
1329
1330         Value *Nucleus::createNullValue(Type *Ty)
1331         {
1332                 if(Ice::isVectorType(T(Ty)))
1333                 {
1334                         assert(Ice::typeNumElements(T(Ty)) <= 16);
1335                         int64_t c[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1336                         return createConstantVector(c, Ty);
1337                 }
1338                 else
1339                 {
1340                         return V(::context->getConstantZero(T(Ty)));
1341                 }
1342         }
1343
1344         Value *Nucleus::createConstantLong(int64_t i)
1345         {
1346                 return V(::context->getConstantInt64(i));
1347         }
1348
1349         Value *Nucleus::createConstantInt(int i)
1350         {
1351                 return V(::context->getConstantInt32(i));
1352         }
1353
1354         Value *Nucleus::createConstantInt(unsigned int i)
1355         {
1356                 return V(::context->getConstantInt32(i));
1357         }
1358
1359         Value *Nucleus::createConstantBool(bool b)
1360         {
1361                 return V(::context->getConstantInt1(b));
1362         }
1363
1364         Value *Nucleus::createConstantByte(signed char i)
1365         {
1366                 return V(::context->getConstantInt8(i));
1367         }
1368
1369         Value *Nucleus::createConstantByte(unsigned char i)
1370         {
1371                 return V(::context->getConstantInt8(i));
1372         }
1373
1374         Value *Nucleus::createConstantShort(short i)
1375         {
1376                 return V(::context->getConstantInt16(i));
1377         }
1378
1379         Value *Nucleus::createConstantShort(unsigned short i)
1380         {
1381                 return V(::context->getConstantInt16(i));
1382         }
1383
1384         Value *Nucleus::createConstantFloat(float x)
1385         {
1386                 return V(::context->getConstantFloat(x));
1387         }
1388
1389         Value *Nucleus::createNullPointer(Type *Ty)
1390         {
1391                 return createNullValue(T(sizeof(void*) == 8 ? Ice::IceType_i64 : Ice::IceType_i32));
1392         }
1393
1394         Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
1395         {
1396                 const int vectorSize = 16;
1397                 assert(Ice::typeWidthInBytes(T(type)) == vectorSize);
1398                 const int alignment = vectorSize;
1399                 auto globalPool = ::function->getGlobalPool();
1400
1401                 const int64_t *i = constants;
1402                 const double *f = reinterpret_cast<const double*>(constants);
1403                 Ice::VariableDeclaration::DataInitializer *dataInitializer = nullptr;
1404
1405                 switch((int)reinterpret_cast<intptr_t>(type))
1406                 {
1407                 case Ice::IceType_v4i32:
1408                 case Ice::IceType_v4i1:
1409                         {
1410                                 const int initializer[4] = {(int)i[0], (int)i[1], (int)i[2], (int)i[3]};
1411                                 static_assert(sizeof(initializer) == vectorSize, "!");
1412                                 dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
1413                         }
1414                         break;
1415                 case Ice::IceType_v4f32:
1416                         {
1417                                 const float initializer[4] = {(float)f[0], (float)f[1], (float)f[2], (float)f[3]};
1418                                 static_assert(sizeof(initializer) == vectorSize, "!");
1419                                 dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
1420                         }
1421                         break;
1422                 case Ice::IceType_v8i16:
1423                 case Ice::IceType_v8i1:
1424                         {
1425                                 const short initializer[8] = {(short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[4], (short)i[5], (short)i[6], (short)i[7]};
1426                                 static_assert(sizeof(initializer) == vectorSize, "!");
1427                                 dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
1428                         }
1429                         break;
1430                 case Ice::IceType_v16i8:
1431                 case Ice::IceType_v16i1:
1432                         {
1433                                 const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[8], (char)i[9], (char)i[10], (char)i[11], (char)i[12], (char)i[13], (char)i[14], (char)i[15]};
1434                                 static_assert(sizeof(initializer) == vectorSize, "!");
1435                                 dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
1436                         }
1437                         break;
1438                 case Type_v2i32:
1439                         {
1440                                 const int initializer[4] = {(int)i[0], (int)i[1], (int)i[0], (int)i[1]};
1441                                 static_assert(sizeof(initializer) == vectorSize, "!");
1442                                 dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
1443                         }
1444                         break;
1445                 case Type_v2f32:
1446                         {
1447                                 const float initializer[4] = {(float)f[0], (float)f[1], (float)f[0], (float)f[1]};
1448                                 static_assert(sizeof(initializer) == vectorSize, "!");
1449                                 dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
1450                         }
1451                         break;
1452                 case Type_v4i16:
1453                         {
1454                                 const short initializer[8] = {(short)i[0], (short)i[1], (short)i[2], (short)i[3], (short)i[0], (short)i[1], (short)i[2], (short)i[3]};
1455                                 static_assert(sizeof(initializer) == vectorSize, "!");
1456                                 dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
1457                         }
1458                         break;
1459                 case Type_v8i8:
1460                         {
1461                                 const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[4], (char)i[5], (char)i[6], (char)i[7]};
1462                                 static_assert(sizeof(initializer) == vectorSize, "!");
1463                                 dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
1464                         }
1465                         break;
1466                 case Type_v4i8:
1467                         {
1468                                 const char initializer[16] = {(char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3], (char)i[0], (char)i[1], (char)i[2], (char)i[3]};
1469                                 static_assert(sizeof(initializer) == vectorSize, "!");
1470                                 dataInitializer = Ice::VariableDeclaration::DataInitializer::create(globalPool, (const char*)initializer, vectorSize);
1471                         }
1472                         break;
1473                 default:
1474                         assert(false && "Unknown constant vector type" && type);
1475                 }
1476
1477                 auto name = Ice::GlobalString::createWithoutString(::context);
1478                 auto *variableDeclaration = Ice::VariableDeclaration::create(globalPool);
1479                 variableDeclaration->setName(name);
1480                 variableDeclaration->setAlignment(alignment);
1481                 variableDeclaration->setIsConstant(true);
1482                 variableDeclaration->addInitializer(dataInitializer);
1483
1484                 ::function->addGlobal(variableDeclaration);
1485
1486                 constexpr int32_t offset = 0;
1487                 Ice::Operand *ptr = ::context->getConstantSym(offset, name);
1488
1489                 Ice::Variable *result = ::function->makeVariable(T(type));
1490                 auto load = Ice::InstLoad::create(::function, result, ptr, alignment);
1491                 ::basicBlock->appendInst(load);
1492
1493                 return V(result);
1494         }
1495
1496         Value *Nucleus::createConstantVector(const double *constants, Type *type)
1497         {
1498                 return createConstantVector((const int64_t*)constants, type);
1499         }
1500
1501         Type *Void::getType()
1502         {
1503                 return T(Ice::IceType_void);
1504         }
1505
1506         Type *Bool::getType()
1507         {
1508                 return T(Ice::IceType_i1);
1509         }
1510
1511         Type *Byte::getType()
1512         {
1513                 return T(Ice::IceType_i8);
1514         }
1515
1516         Type *SByte::getType()
1517         {
1518                 return T(Ice::IceType_i8);
1519         }
1520
1521         Type *Short::getType()
1522         {
1523                 return T(Ice::IceType_i16);
1524         }
1525
1526         Type *UShort::getType()
1527         {
1528                 return T(Ice::IceType_i16);
1529         }
1530
1531         Type *Byte4::getType()
1532         {
1533                 return T(Type_v4i8);
1534         }
1535
1536         Type *SByte4::getType()
1537         {
1538                 return T(Type_v4i8);
1539         }
1540
1541         namespace
1542         {
1543                 RValue<Byte> SaturateUnsigned(RValue<Short> x)
1544                 {
1545                         return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), IfThenElse(Int(x) < 0, Int(0), Int(x))));
1546                 }
1547
1548                 RValue<Byte> Extract(RValue<Byte8> val, int i)
1549                 {
1550                         return RValue<Byte>(Nucleus::createExtractElement(val.value, Byte::getType(), i));
1551                 }
1552
1553                 RValue<Byte8> Insert(RValue<Byte8> val, RValue<Byte> element, int i)
1554                 {
1555                         return RValue<Byte8>(Nucleus::createInsertElement(val.value, element.value, i));
1556                 }
1557         }
1558
1559         RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1560         {
1561                 if(emulateIntrinsics)
1562                 {
1563                         Byte8 result;
1564                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
1565                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
1566                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
1567                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
1568                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
1569                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
1570                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
1571                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
1572
1573                         return result;
1574                 }
1575                 else
1576                 {
1577                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
1578                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
1579                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
1580                         auto paddusb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
1581                         paddusb->addArg(x.value);
1582                         paddusb->addArg(y.value);
1583                         ::basicBlock->appendInst(paddusb);
1584
1585                         return RValue<Byte8>(V(result));
1586                 }
1587         }
1588
1589         RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1590         {
1591                 if(emulateIntrinsics)
1592                 {
1593                         Byte8 result;
1594                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
1595                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
1596                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
1597                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
1598                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
1599                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
1600                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
1601                         result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
1602
1603                         return result;
1604                 }
1605                 else
1606                 {
1607                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
1608                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
1609                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
1610                         auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
1611                         psubusw->addArg(x.value);
1612                         psubusw->addArg(y.value);
1613                         ::basicBlock->appendInst(psubusw);
1614
1615                         return RValue<Byte8>(V(result));
1616                 }
1617         }
1618
1619         RValue<SByte> Extract(RValue<SByte8> val, int i)
1620         {
1621                 return RValue<SByte>(Nucleus::createExtractElement(val.value, SByte::getType(), i));
1622         }
1623
1624         RValue<SByte8> Insert(RValue<SByte8> val, RValue<SByte> element, int i)
1625         {
1626                 return RValue<SByte8>(Nucleus::createInsertElement(val.value, element.value, i));
1627         }
1628
1629         RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
1630         {
1631                 if(emulateIntrinsics)
1632                 {
1633                         SByte8 result;
1634                         result = Insert(result, Extract(lhs, 0) >> SByte(rhs), 0);
1635                         result = Insert(result, Extract(lhs, 1) >> SByte(rhs), 1);
1636                         result = Insert(result, Extract(lhs, 2) >> SByte(rhs), 2);
1637                         result = Insert(result, Extract(lhs, 3) >> SByte(rhs), 3);
1638                         result = Insert(result, Extract(lhs, 4) >> SByte(rhs), 4);
1639                         result = Insert(result, Extract(lhs, 5) >> SByte(rhs), 5);
1640                         result = Insert(result, Extract(lhs, 6) >> SByte(rhs), 6);
1641                         result = Insert(result, Extract(lhs, 7) >> SByte(rhs), 7);
1642
1643                         return result;
1644                 }
1645                 else
1646                 {
1647                         #if defined(__i386__) || defined(__x86_64__)
1648                                 // SSE2 doesn't support byte vector shifts, so shift as shorts and recombine.
1649                                 RValue<Short4> hi = (As<Short4>(lhs) >> rhs) & Short4(0xFF00u);
1650                                 RValue<Short4> lo = As<Short4>(As<UShort4>((As<Short4>(lhs) << 8) >> rhs) >> 8);
1651
1652                                 return As<SByte8>(hi | lo);
1653                         #else
1654                                 return RValue<SByte8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
1655                         #endif
1656                 }
1657         }
1658
1659         RValue<Int> SignMask(RValue<Byte8> x)
1660         {
1661                 if(emulateIntrinsics || CPUID::ARM)
1662                 {
1663                         Byte8 xx = As<Byte8>(As<SByte8>(x) >> 7) & Byte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
1664                         return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
1665                 }
1666                 else
1667                 {
1668                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
1669                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
1670                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
1671                         auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
1672                         movmsk->addArg(x.value);
1673                         ::basicBlock->appendInst(movmsk);
1674
1675                         return RValue<Int>(V(result)) & 0xFF;
1676                 }
1677         }
1678
1679 //      RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1680 //      {
1681 //              return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Ugt, x.value, y.value));
1682 //      }
1683
1684         RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1685         {
1686                 return RValue<Byte8>(Nucleus::createICmpEQ(x.value, y.value));
1687         }
1688
1689         Type *Byte8::getType()
1690         {
1691                 return T(Type_v8i8);
1692         }
1693
1694 //      RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
1695 //      {
1696 //              return RValue<SByte8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
1697 //      }
1698
1699 //      RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
1700 //      {
1701 //              return RValue<SByte8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
1702 //      }
1703
1704         RValue<SByte> SaturateSigned(RValue<Short> x)
1705         {
1706                 return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));
1707         }
1708
1709         RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1710         {
1711                 if(emulateIntrinsics)
1712                 {
1713                         SByte8 result;
1714                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
1715                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
1716                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
1717                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
1718                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
1719                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
1720                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
1721                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
1722
1723                         return result;
1724                 }
1725                 else
1726                 {
1727                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
1728                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
1729                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
1730                         auto paddsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
1731                         paddsb->addArg(x.value);
1732                         paddsb->addArg(y.value);
1733                         ::basicBlock->appendInst(paddsb);
1734
1735                         return RValue<SByte8>(V(result));
1736                 }
1737         }
1738
1739         RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1740         {
1741                 if(emulateIntrinsics)
1742                 {
1743                         SByte8 result;
1744                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
1745                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
1746                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
1747                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
1748                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
1749                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
1750                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
1751                         result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
1752
1753                         return result;
1754                 }
1755                 else
1756                 {
1757                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
1758                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
1759                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
1760                         auto psubsb = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
1761                         psubsb->addArg(x.value);
1762                         psubsb->addArg(y.value);
1763                         ::basicBlock->appendInst(psubsb);
1764
1765                         return RValue<SByte8>(V(result));
1766                 }
1767         }
1768
1769         RValue<Int> SignMask(RValue<SByte8> x)
1770         {
1771                 if(emulateIntrinsics || CPUID::ARM)
1772                 {
1773                         SByte8 xx = (x >> 7) & SByte8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
1774                         return Int(Extract(xx, 0)) | Int(Extract(xx, 1)) | Int(Extract(xx, 2)) | Int(Extract(xx, 3)) | Int(Extract(xx, 4)) | Int(Extract(xx, 5)) | Int(Extract(xx, 6)) | Int(Extract(xx, 7));
1775                 }
1776                 else
1777                 {
1778                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
1779                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
1780                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
1781                         auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
1782                         movmsk->addArg(x.value);
1783                         ::basicBlock->appendInst(movmsk);
1784
1785                         return RValue<Int>(V(result)) & 0xFF;
1786                 }
1787         }
1788
1789         RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1790         {
1791                 return RValue<Byte8>(createIntCompare(Ice::InstIcmp::Sgt, x.value, y.value));
1792         }
1793
1794         RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1795         {
1796                 return RValue<Byte8>(Nucleus::createICmpEQ(x.value, y.value));
1797         }
1798
1799         Type *SByte8::getType()
1800         {
1801                 return T(Type_v8i8);
1802         }
1803
1804         Type *Byte16::getType()
1805         {
1806                 return T(Ice::IceType_v16i8);
1807         }
1808
1809         Type *SByte16::getType()
1810         {
1811                 return T(Ice::IceType_v16i8);
1812         }
1813
1814         Type *Short2::getType()
1815         {
1816                 return T(Type_v2i16);
1817         }
1818
1819         Type *UShort2::getType()
1820         {
1821                 return T(Type_v2i16);
1822         }
1823
1824         Short4::Short4(RValue<Int4> cast)
1825         {
1826                 int select[8] = {0, 2, 4, 6, 0, 2, 4, 6};
1827                 Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());
1828                 Value *packed = Nucleus::createShuffleVector(short8, short8, select);
1829
1830                 Value *int2 = RValue<Int2>(Int2(As<Int4>(packed))).value;
1831                 Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
1832
1833                 storeValue(short4);
1834         }
1835
1836 //      Short4::Short4(RValue<Float> cast)
1837 //      {
1838 //      }
1839
        // Conversion from a Float4 vector is not implemented here: the constructor
        // only trips an assertion and never stores a value. 'cast' is unused.
        Short4::Short4(RValue<Float4> cast)
        {
                assert(false && "UNIMPLEMENTED");
        }
1844
1845         RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
1846         {
1847                 if(emulateIntrinsics)
1848                 {
1849                         Short4 result;
1850                         result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
1851                         result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
1852                         result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
1853                         result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
1854
1855                         return result;
1856                 }
1857                 else
1858                 {
1859                         return RValue<Short4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
1860                 }
1861         }
1862
1863         RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
1864         {
1865                 if(emulateIntrinsics)
1866                 {
1867                         Short4 result;
1868                         result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
1869                         result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
1870                         result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
1871                         result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
1872
1873                         return result;
1874                 }
1875                 else
1876                 {
1877                         return RValue<Short4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
1878                 }
1879         }
1880
1881         RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
1882         {
1883                 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
1884                 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value, y.value);
1885                 ::basicBlock->appendInst(cmp);
1886
1887                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
1888                 auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
1889                 ::basicBlock->appendInst(select);
1890
1891                 return RValue<Short4>(V(result));
1892         }
1893
1894         RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
1895         {
1896                 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
1897                 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value, y.value);
1898                 ::basicBlock->appendInst(cmp);
1899
1900                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
1901                 auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
1902                 ::basicBlock->appendInst(select);
1903
1904                 return RValue<Short4>(V(result));
1905         }
1906
1907         RValue<Short> SaturateSigned(RValue<Int> x)
1908         {
1909                 return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));
1910         }
1911
1912         RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
1913         {
1914                 if(emulateIntrinsics)
1915                 {
1916                         Short4 result;
1917                         result = Insert(result, SaturateSigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
1918                         result = Insert(result, SaturateSigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
1919                         result = Insert(result, SaturateSigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
1920                         result = Insert(result, SaturateSigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
1921
1922                         return result;
1923                 }
1924                 else
1925                 {
1926                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
1927                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
1928                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
1929                         auto paddsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
1930                         paddsw->addArg(x.value);
1931                         paddsw->addArg(y.value);
1932                         ::basicBlock->appendInst(paddsw);
1933
1934                         return RValue<Short4>(V(result));
1935                 }
1936         }
1937
1938         RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
1939         {
1940                 if(emulateIntrinsics)
1941                 {
1942                         Short4 result;
1943                         result = Insert(result, SaturateSigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
1944                         result = Insert(result, SaturateSigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
1945                         result = Insert(result, SaturateSigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
1946                         result = Insert(result, SaturateSigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
1947
1948                         return result;
1949                 }
1950                 else
1951                 {
1952                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
1953                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
1954                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
1955                         auto psubsw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
1956                         psubsw->addArg(x.value);
1957                         psubsw->addArg(y.value);
1958                         ::basicBlock->appendInst(psubsw);
1959
1960                         return RValue<Short4>(V(result));
1961                 }
1962         }
1963
1964         RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
1965         {
1966                 if(emulateIntrinsics)
1967                 {
1968                         Short4 result;
1969                         result = Insert(result, Short((Int(Extract(x, 0)) * Int(Extract(y, 0))) >> 16), 0);
1970                         result = Insert(result, Short((Int(Extract(x, 1)) * Int(Extract(y, 1))) >> 16), 1);
1971                         result = Insert(result, Short((Int(Extract(x, 2)) * Int(Extract(y, 2))) >> 16), 2);
1972                         result = Insert(result, Short((Int(Extract(x, 3)) * Int(Extract(y, 3))) >> 16), 3);
1973
1974                         return result;
1975                 }
1976                 else
1977                 {
1978                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
1979                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
1980                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
1981                         auto pmulhw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
1982                         pmulhw->addArg(x.value);
1983                         pmulhw->addArg(y.value);
1984                         ::basicBlock->appendInst(pmulhw);
1985
1986                         return RValue<Short4>(V(result));
1987                 }
1988         }
1989
1990         RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
1991         {
1992                 if(emulateIntrinsics)
1993                 {
1994                         Int2 result;
1995                         result = Insert(result, Int(Extract(x, 0)) * Int(Extract(y, 0)) + Int(Extract(x, 1)) * Int(Extract(y, 1)), 0);
1996                         result = Insert(result, Int(Extract(x, 2)) * Int(Extract(y, 2)) + Int(Extract(x, 3)) * Int(Extract(y, 3)), 1);
1997
1998                         return result;
1999                 }
2000                 else
2001                 {
2002                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2003                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyAddPairs, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2004                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2005                         auto pmaddwd = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
2006                         pmaddwd->addArg(x.value);
2007                         pmaddwd->addArg(y.value);
2008                         ::basicBlock->appendInst(pmaddwd);
2009
2010                         return As<Int2>(V(result));
2011                 }
2012         }
2013
2014         RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2015         {
2016                 if(emulateIntrinsics)
2017                 {
2018                         SByte8 result;
2019                         result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
2020                         result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
2021                         result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
2022                         result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
2023                         result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
2024                         result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
2025                         result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
2026                         result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
2027
2028                         return result;
2029                 }
2030                 else
2031                 {
2032                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2033                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2034                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2035                         auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
2036                         pack->addArg(x.value);
2037                         pack->addArg(y.value);
2038                         ::basicBlock->appendInst(pack);
2039
2040                         return As<SByte8>(Swizzle(As<Int4>(V(result)), 0x88));
2041                 }
2042         }
2043
2044         RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2045         {
2046                 if(emulateIntrinsics)
2047                 {
2048                         Byte8 result;
2049                         result = Insert(result, SaturateUnsigned(Extract(x, 0)), 0);
2050                         result = Insert(result, SaturateUnsigned(Extract(x, 1)), 1);
2051                         result = Insert(result, SaturateUnsigned(Extract(x, 2)), 2);
2052                         result = Insert(result, SaturateUnsigned(Extract(x, 3)), 3);
2053                         result = Insert(result, SaturateUnsigned(Extract(y, 0)), 4);
2054                         result = Insert(result, SaturateUnsigned(Extract(y, 1)), 5);
2055                         result = Insert(result, SaturateUnsigned(Extract(y, 2)), 6);
2056                         result = Insert(result, SaturateUnsigned(Extract(y, 3)), 7);
2057
2058                         return result;
2059                 }
2060                 else
2061                 {
2062                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
2063                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2064                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2065                         auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
2066                         pack->addArg(x.value);
2067                         pack->addArg(y.value);
2068                         ::basicBlock->appendInst(pack);
2069
2070                         return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x88));
2071                 }
2072         }
2073
2074         RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2075         {
2076                 return RValue<Short4>(createIntCompare(Ice::InstIcmp::Sgt, x.value, y.value));
2077         }
2078
2079         RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2080         {
2081                 return RValue<Short4>(Nucleus::createICmpEQ(x.value, y.value));
2082         }
2083
2084         Type *Short4::getType()
2085         {
2086                 return T(Type_v4i16);
2087         }
2088
2089         UShort4::UShort4(RValue<Float4> cast, bool saturate)
2090         {
2091                 if(saturate)
2092                 {
2093                         if(CPUID::SSE4_1)
2094                         {
2095                                 // x86 produces 0x80000000 on 32-bit integer overflow/underflow.
2096                                 // PackUnsigned takes care of 0x0000 saturation.
2097                                 Int4 int4(Min(cast, Float4(0xFFFF)));
2098                                 *this = As<UShort4>(PackUnsigned(int4, int4));
2099                         }
2100                         else if(CPUID::ARM)
2101                         {
2102                                 // ARM saturates the 32-bit integer result on overflow/undeflow.
2103                                 Int4 int4(cast);
2104                                 *this = As<UShort4>(PackUnsigned(int4, int4));
2105                         }
2106                         else
2107                         {
2108                                 *this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2109                         }
2110                 }
2111                 else
2112                 {
2113                         *this = Short4(Int4(cast));
2114                 }
2115         }
2116
2117         RValue<UShort> Extract(RValue<UShort4> val, int i)
2118         {
2119                 return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
2120         }
2121
2122         RValue<UShort4> Insert(RValue<UShort4> val, RValue<UShort> element, int i)
2123         {
2124                 return RValue<UShort4>(Nucleus::createInsertElement(val.value, element.value, i));
2125         }
2126
2127         RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2128         {
2129                 if(emulateIntrinsics)
2130                 {
2131                         UShort4 result;
2132                         result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
2133                         result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
2134                         result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
2135                         result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
2136
2137                         return result;
2138                 }
2139                 else
2140                 {
2141                         return RValue<UShort4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
2142                 }
2143         }
2144
2145         RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2146         {
2147                 if(emulateIntrinsics)
2148                 {
2149                         UShort4 result;
2150                         result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
2151                         result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
2152                         result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
2153                         result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
2154
2155                         return result;
2156                 }
2157                 else
2158                 {
2159                         return RValue<UShort4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
2160                 }
2161         }
2162
2163         RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2164         {
2165                 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2166                 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value, y.value);
2167                 ::basicBlock->appendInst(cmp);
2168
2169                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2170                 auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
2171                 ::basicBlock->appendInst(select);
2172
2173                 return RValue<UShort4>(V(result));
2174         }
2175
2176         RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2177         {
2178                 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v8i1);
2179                 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value, y.value);
2180                 ::basicBlock->appendInst(cmp);
2181
2182                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2183                 auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
2184                 ::basicBlock->appendInst(select);
2185
2186                 return RValue<UShort4>(V(result));
2187         }
2188
2189         RValue<UShort> SaturateUnsigned(RValue<Int> x)
2190         {
2191                 return UShort(IfThenElse(x > 0xFFFF, Int(0xFFFF), IfThenElse(x < 0, Int(0), x)));
2192         }
2193
2194         RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2195         {
2196                 if(emulateIntrinsics)
2197                 {
2198                         UShort4 result;
2199                         result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
2200                         result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
2201                         result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
2202                         result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
2203
2204                         return result;
2205                 }
2206                 else
2207                 {
2208                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2209                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::AddSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2210                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2211                         auto paddusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
2212                         paddusw->addArg(x.value);
2213                         paddusw->addArg(y.value);
2214                         ::basicBlock->appendInst(paddusw);
2215
2216                         return RValue<UShort4>(V(result));
2217                 }
2218         }
2219
2220         RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2221         {
2222                 if(emulateIntrinsics)
2223                 {
2224                         UShort4 result;
2225                         result = Insert(result, SaturateUnsigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
2226                         result = Insert(result, SaturateUnsigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
2227                         result = Insert(result, SaturateUnsigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
2228                         result = Insert(result, SaturateUnsigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
2229
2230                         return result;
2231                 }
2232                 else
2233                 {
2234                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2235                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SubtractSaturateUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2236                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2237                         auto psubusw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
2238                         psubusw->addArg(x.value);
2239                         psubusw->addArg(y.value);
2240                         ::basicBlock->appendInst(psubusw);
2241
2242                         return RValue<UShort4>(V(result));
2243                 }
2244         }
2245
2246         RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2247         {
2248                 if(emulateIntrinsics)
2249                 {
2250                         UShort4 result;
2251                         result = Insert(result, UShort((UInt(Extract(x, 0)) * UInt(Extract(y, 0))) >> 16), 0);
2252                         result = Insert(result, UShort((UInt(Extract(x, 1)) * UInt(Extract(y, 1))) >> 16), 1);
2253                         result = Insert(result, UShort((UInt(Extract(x, 2)) * UInt(Extract(y, 2))) >> 16), 2);
2254                         result = Insert(result, UShort((UInt(Extract(x, 3)) * UInt(Extract(y, 3))) >> 16), 3);
2255
2256                         return result;
2257                 }
2258                 else
2259                 {
2260                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2261                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::MultiplyHighUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2262                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2263                         auto pmulhuw = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
2264                         pmulhuw->addArg(x.value);
2265                         pmulhuw->addArg(y.value);
2266                         ::basicBlock->appendInst(pmulhuw);
2267
2268                         return RValue<UShort4>(V(result));
2269                 }
2270         }
2271
2272         RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2273         {
2274                 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2275
2276                 // Scalarized implementation.
2277                 Int4 result;
2278                 result = Insert(result, Int((Long(Extract(x, 0)) * Long(Extract(y, 0))) >> Long(Int(32))), 0);
2279                 result = Insert(result, Int((Long(Extract(x, 1)) * Long(Extract(y, 1))) >> Long(Int(32))), 1);
2280                 result = Insert(result, Int((Long(Extract(x, 2)) * Long(Extract(y, 2))) >> Long(Int(32))), 2);
2281                 result = Insert(result, Int((Long(Extract(x, 3)) * Long(Extract(y, 3))) >> Long(Int(32))), 3);
2282
2283                 return result;
2284         }
2285
2286         RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2287         {
2288                 // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2289
2290                 if(false)  // Partial product based implementation.
2291                 {
2292                         auto xh = x >> 16;
2293                         auto yh = y >> 16;
2294                         auto xl = x & UInt4(0x0000FFFF);
2295                         auto yl = y & UInt4(0x0000FFFF);
2296                         auto xlyh = xl * yh;
2297                         auto xhyl = xh * yl;
2298                         auto xlyhh = xlyh >> 16;
2299                         auto xhylh = xhyl >> 16;
2300                         auto xlyhl = xlyh & UInt4(0x0000FFFF);
2301                         auto xhyll = xhyl & UInt4(0x0000FFFF);
2302                         auto xlylh = (xl * yl) >> 16;
2303                         auto oflow = (xlyhl + xhyll + xlylh) >> 16;
2304
2305                         return (xh * yh) + (xlyhh + xhylh) + oflow;
2306                 }
2307
2308                 // Scalarized implementation.
2309                 Int4 result;
2310                 result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 0))) * Long(UInt(Extract(As<Int4>(y), 0)))) >> Long(Int(32))), 0);
2311                 result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 1))) * Long(UInt(Extract(As<Int4>(y), 1)))) >> Long(Int(32))), 1);
2312                 result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 2))) * Long(UInt(Extract(As<Int4>(y), 2)))) >> Long(Int(32))), 2);
2313                 result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 3))) * Long(UInt(Extract(As<Int4>(y), 3)))) >> Long(Int(32))), 3);
2314
2315                 return As<UInt4>(result);
2316         }
2317
2318         RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2319         {
2320                 assert(false && "UNIMPLEMENTED"); return RValue<UShort4>(V(nullptr));
2321         }
2322
2323         Type *UShort4::getType()
2324         {
2325                 return T(Type_v4i16);
2326         }
2327
2328         RValue<Short> Extract(RValue<Short8> val, int i)
2329         {
2330                 return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
2331         }
2332
2333         RValue<Short8> Insert(RValue<Short8> val, RValue<Short> element, int i)
2334         {
2335                 return RValue<Short8>(Nucleus::createInsertElement(val.value, element.value, i));
2336         }
2337
2338         RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2339         {
2340                 if(emulateIntrinsics)
2341                 {
2342                         Short8 result;
2343                         result = Insert(result, Extract(lhs, 0) << Short(rhs), 0);
2344                         result = Insert(result, Extract(lhs, 1) << Short(rhs), 1);
2345                         result = Insert(result, Extract(lhs, 2) << Short(rhs), 2);
2346                         result = Insert(result, Extract(lhs, 3) << Short(rhs), 3);
2347                         result = Insert(result, Extract(lhs, 4) << Short(rhs), 4);
2348                         result = Insert(result, Extract(lhs, 5) << Short(rhs), 5);
2349                         result = Insert(result, Extract(lhs, 6) << Short(rhs), 6);
2350                         result = Insert(result, Extract(lhs, 7) << Short(rhs), 7);
2351
2352                         return result;
2353                 }
2354                 else
2355                 {
2356                         return RValue<Short8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
2357                 }
2358         }
2359
2360         RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2361         {
2362                 if(emulateIntrinsics)
2363                 {
2364                         Short8 result;
2365                         result = Insert(result, Extract(lhs, 0) >> Short(rhs), 0);
2366                         result = Insert(result, Extract(lhs, 1) >> Short(rhs), 1);
2367                         result = Insert(result, Extract(lhs, 2) >> Short(rhs), 2);
2368                         result = Insert(result, Extract(lhs, 3) >> Short(rhs), 3);
2369                         result = Insert(result, Extract(lhs, 4) >> Short(rhs), 4);
2370                         result = Insert(result, Extract(lhs, 5) >> Short(rhs), 5);
2371                         result = Insert(result, Extract(lhs, 6) >> Short(rhs), 6);
2372                         result = Insert(result, Extract(lhs, 7) >> Short(rhs), 7);
2373
2374                         return result;
2375                 }
2376                 else
2377                 {
2378                         return RValue<Short8>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
2379                 }
2380         }
2381
2382         RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2383         {
2384                 assert(false && "UNIMPLEMENTED"); return RValue<Int4>(V(nullptr));
2385         }
2386
2387         RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2388         {
2389                 assert(false && "UNIMPLEMENTED"); return RValue<Short8>(V(nullptr));
2390         }
2391
2392         Type *Short8::getType()
2393         {
2394                 return T(Ice::IceType_v8i16);
2395         }
2396
2397         RValue<UShort> Extract(RValue<UShort8> val, int i)
2398         {
2399                 return RValue<UShort>(Nucleus::createExtractElement(val.value, UShort::getType(), i));
2400         }
2401
2402         RValue<UShort8> Insert(RValue<UShort8> val, RValue<UShort> element, int i)
2403         {
2404                 return RValue<UShort8>(Nucleus::createInsertElement(val.value, element.value, i));
2405         }
2406
2407         RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2408         {
2409                 if(emulateIntrinsics)
2410                 {
2411                         UShort8 result;
2412                         result = Insert(result, Extract(lhs, 0) << UShort(rhs), 0);
2413                         result = Insert(result, Extract(lhs, 1) << UShort(rhs), 1);
2414                         result = Insert(result, Extract(lhs, 2) << UShort(rhs), 2);
2415                         result = Insert(result, Extract(lhs, 3) << UShort(rhs), 3);
2416                         result = Insert(result, Extract(lhs, 4) << UShort(rhs), 4);
2417                         result = Insert(result, Extract(lhs, 5) << UShort(rhs), 5);
2418                         result = Insert(result, Extract(lhs, 6) << UShort(rhs), 6);
2419                         result = Insert(result, Extract(lhs, 7) << UShort(rhs), 7);
2420
2421                         return result;
2422                 }
2423                 else
2424                 {
2425                         return RValue<UShort8>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
2426                 }
2427         }
2428
2429         RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2430         {
2431                 if(emulateIntrinsics)
2432                 {
2433                         UShort8 result;
2434                         result = Insert(result, Extract(lhs, 0) >> UShort(rhs), 0);
2435                         result = Insert(result, Extract(lhs, 1) >> UShort(rhs), 1);
2436                         result = Insert(result, Extract(lhs, 2) >> UShort(rhs), 2);
2437                         result = Insert(result, Extract(lhs, 3) >> UShort(rhs), 3);
2438                         result = Insert(result, Extract(lhs, 4) >> UShort(rhs), 4);
2439                         result = Insert(result, Extract(lhs, 5) >> UShort(rhs), 5);
2440                         result = Insert(result, Extract(lhs, 6) >> UShort(rhs), 6);
2441                         result = Insert(result, Extract(lhs, 7) >> UShort(rhs), 7);
2442
2443                         return result;
2444                 }
2445                 else
2446                 {
2447                         return RValue<UShort8>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
2448                 }
2449         }
2450
2451         RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
2452         {
2453                 assert(false && "UNIMPLEMENTED"); return RValue<UShort8>(V(nullptr));
2454         }
2455
2456         RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2457         {
2458                 assert(false && "UNIMPLEMENTED"); return RValue<UShort8>(V(nullptr));
2459         }
2460
2461         // FIXME: Implement as Shuffle(x, y, Select(i0, ..., i16)) and Shuffle(x, y, SELECT_PACK_REPEAT(element))
2462 //      RValue<UShort8> PackRepeat(RValue<Byte16> x, RValue<Byte16> y, int element)
2463 //      {
2464 //              assert(false && "UNIMPLEMENTED"); return RValue<UShort8>(V(nullptr));
2465 //      }
2466
2467         Type *UShort8::getType()
2468         {
2469                 return T(Ice::IceType_v8i16);
2470         }
2471
2472         RValue<Int> operator++(Int &val, int)   // Post-increment
2473         {
2474                 RValue<Int> res = val;
2475                 val += 1;
2476                 return res;
2477         }
2478
2479         const Int &operator++(Int &val)   // Pre-increment
2480         {
2481                 val += 1;
2482                 return val;
2483         }
2484
2485         RValue<Int> operator--(Int &val, int)   // Post-decrement
2486         {
2487                 RValue<Int> res = val;
2488                 val -= 1;
2489                 return res;
2490         }
2491
2492         const Int &operator--(Int &val)   // Pre-decrement
2493         {
2494                 val -= 1;
2495                 return val;
2496         }
2497
2498         RValue<Int> RoundInt(RValue<Float> cast)
2499         {
2500                 if(emulateIntrinsics || CPUID::ARM)
2501                 {
2502                         // Push the fractional part off the mantissa. Accurate up to +/-2^22.
2503                         return Int((cast + Float(0x00C00000)) - Float(0x00C00000));
2504                 }
2505                 else
2506                 {
2507                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
2508                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2509                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2510                         auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
2511                         nearbyint->addArg(cast.value);
2512                         ::basicBlock->appendInst(nearbyint);
2513
2514                         return RValue<Int>(V(result));
2515                 }
2516         }
2517
2518         Type *Int::getType()
2519         {
2520                 return T(Ice::IceType_i32);
2521         }
2522
2523         Type *Long::getType()
2524         {
2525                 return T(Ice::IceType_i64);
2526         }
2527
2528         UInt::UInt(RValue<Float> cast)
2529         {
2530                 // Smallest positive value representable in UInt, but not in Int
2531                 const unsigned int ustart = 0x80000000u;
2532                 const float ustartf = float(ustart);
2533
2534                 // If the value is negative, store 0, otherwise store the result of the conversion
2535                 storeValue((~(As<Int>(cast) >> 31) &
2536                 // Check if the value can be represented as an Int
2537                         IfThenElse(cast >= ustartf,
2538                 // If the value is too large, subtract ustart and re-add it after conversion.
2539                                 As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
2540                 // Otherwise, just convert normally
2541                                 Int(cast))).value);
2542         }
2543
2544         RValue<UInt> operator++(UInt &val, int)   // Post-increment
2545         {
2546                 RValue<UInt> res = val;
2547                 val += 1;
2548                 return res;
2549         }
2550
2551         const UInt &operator++(UInt &val)   // Pre-increment
2552         {
2553                 val += 1;
2554                 return val;
2555         }
2556
2557         RValue<UInt> operator--(UInt &val, int)   // Post-decrement
2558         {
2559                 RValue<UInt> res = val;
2560                 val -= 1;
2561                 return res;
2562         }
2563
2564         const UInt &operator--(UInt &val)   // Pre-decrement
2565         {
2566                 val -= 1;
2567                 return val;
2568         }
2569
2570 //      RValue<UInt> RoundUInt(RValue<Float> cast)
2571 //      {
2572 //              assert(false && "UNIMPLEMENTED"); return RValue<UInt>(V(nullptr));
2573 //      }
2574
2575         Type *UInt::getType()
2576         {
2577                 return T(Ice::IceType_i32);
2578         }
2579
2580 //      Int2::Int2(RValue<Int> cast)
2581 //      {
2582 //              Value *extend = Nucleus::createZExt(cast.value, Long::getType());
2583 //              Value *vector = Nucleus::createBitCast(extend, Int2::getType());
2584 //
2585 //              Constant *shuffle[2];
2586 //              shuffle[0] = Nucleus::createConstantInt(0);
2587 //              shuffle[1] = Nucleus::createConstantInt(0);
2588 //
2589 //              Value *replicate = Nucleus::createShuffleVector(vector, UndefValue::get(Int2::getType()), Nucleus::createConstantVector(shuffle, 2));
2590 //
2591 //              storeValue(replicate);
2592 //      }
2593
2594         RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2595         {
2596                 if(emulateIntrinsics)
2597                 {
2598                         Int2 result;
2599                         result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
2600                         result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
2601
2602                         return result;
2603                 }
2604                 else
2605                 {
2606                         return RValue<Int2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
2607                 }
2608         }
2609
2610         RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2611         {
2612                 if(emulateIntrinsics)
2613                 {
2614                         Int2 result;
2615                         result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
2616                         result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
2617
2618                         return result;
2619                 }
2620                 else
2621                 {
2622                         return RValue<Int2>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
2623                 }
2624         }
2625
2626         Type *Int2::getType()
2627         {
2628                 return T(Type_v2i32);
2629         }
2630
2631         RValue<UInt> Extract(RValue<UInt2> val, int i)
2632         {
2633                 return RValue<UInt>(Nucleus::createExtractElement(val.value, UInt::getType(), i));
2634         }
2635
2636         RValue<UInt2> Insert(RValue<UInt2> val, RValue<UInt> element, int i)
2637         {
2638                 return RValue<UInt2>(Nucleus::createInsertElement(val.value, element.value, i));
2639         }
2640
2641         RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2642         {
2643                 if(emulateIntrinsics)
2644                 {
2645                         UInt2 result;
2646                         result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
2647                         result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
2648
2649                         return result;
2650                 }
2651                 else
2652                 {
2653                         return RValue<UInt2>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
2654                 }
2655         }
2656
2657         RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2658         {
2659                 if(emulateIntrinsics)
2660                 {
2661                         UInt2 result;
2662                         result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
2663                         result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
2664
2665                         return result;
2666                 }
2667                 else
2668                 {
2669                         return RValue<UInt2>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
2670                 }
2671         }
2672
2673         Type *UInt2::getType()
2674         {
2675                 return T(Type_v2i32);
2676         }
2677
2678         Int4::Int4(RValue<Byte4> cast) : XYZW(this)
2679         {
2680                 Value *x = Nucleus::createBitCast(cast.value, Int::getType());
2681                 Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
2682
2683                 Value *e;
2684                 int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
2685                 Value *b = Nucleus::createBitCast(a, Byte16::getType());
2686                 Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
2687
2688                 int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2689                 Value *d = Nucleus::createBitCast(c, Short8::getType());
2690                 e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
2691
2692                 Value *f = Nucleus::createBitCast(e, Int4::getType());
2693                 storeValue(f);
2694         }
2695
2696         Int4::Int4(RValue<SByte4> cast) : XYZW(this)
2697         {
2698                 Value *x = Nucleus::createBitCast(cast.value, Int::getType());
2699                 Value *a = Nucleus::createInsertElement(loadValue(), x, 0);
2700
2701                 int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
2702                 Value *b = Nucleus::createBitCast(a, Byte16::getType());
2703                 Value *c = Nucleus::createShuffleVector(b, b, swizzle);
2704
2705                 int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
2706                 Value *d = Nucleus::createBitCast(c, Short8::getType());
2707                 Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
2708
2709                 *this = As<Int4>(e) >> 24;
2710         }
2711
2712         Int4::Int4(RValue<Short4> cast) : XYZW(this)
2713         {
2714                 int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
2715                 Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
2716
2717                 *this = As<Int4>(c) >> 16;
2718         }
2719
2720         Int4::Int4(RValue<UShort4> cast) : XYZW(this)
2721         {
2722                 int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2723                 Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2724                 Value *d = Nucleus::createBitCast(c, Int4::getType());
2725                 storeValue(d);
2726         }
2727
2728         Int4::Int4(RValue<Int> rhs) : XYZW(this)
2729         {
2730                 Value *vector = Nucleus::createBitCast(rhs.value, Int4::getType());
2731
2732                 int swizzle[4] = {0, 0, 0, 0};
2733                 Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
2734
2735                 storeValue(replicate);
2736         }
2737
2738         RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2739         {
2740                 if(emulateIntrinsics)
2741                 {
2742                         Int4 result;
2743                         result = Insert(result, Extract(lhs, 0) << Int(rhs), 0);
2744                         result = Insert(result, Extract(lhs, 1) << Int(rhs), 1);
2745                         result = Insert(result, Extract(lhs, 2) << Int(rhs), 2);
2746                         result = Insert(result, Extract(lhs, 3) << Int(rhs), 3);
2747
2748                         return result;
2749                 }
2750                 else
2751                 {
2752                         return RValue<Int4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
2753                 }
2754         }
2755
2756         RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2757         {
2758                 if(emulateIntrinsics)
2759                 {
2760                         Int4 result;
2761                         result = Insert(result, Extract(lhs, 0) >> Int(rhs), 0);
2762                         result = Insert(result, Extract(lhs, 1) >> Int(rhs), 1);
2763                         result = Insert(result, Extract(lhs, 2) >> Int(rhs), 2);
2764                         result = Insert(result, Extract(lhs, 3) >> Int(rhs), 3);
2765
2766                         return result;
2767                 }
2768                 else
2769                 {
2770                         return RValue<Int4>(Nucleus::createAShr(lhs.value, V(::context->getConstantInt32(rhs))));
2771                 }
2772         }
2773
2774         RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2775         {
2776                 return RValue<Int4>(Nucleus::createICmpEQ(x.value, y.value));
2777         }
2778
2779         RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2780         {
2781                 return RValue<Int4>(Nucleus::createICmpSLT(x.value, y.value));
2782         }
2783
2784         RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2785         {
2786                 return RValue<Int4>(Nucleus::createICmpSLE(x.value, y.value));
2787         }
2788
2789         RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2790         {
2791                 return RValue<Int4>(Nucleus::createICmpNE(x.value, y.value));
2792         }
2793
2794         RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2795         {
2796                 return RValue<Int4>(Nucleus::createICmpSGE(x.value, y.value));
2797         }
2798
2799         RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2800         {
2801                 return RValue<Int4>(Nucleus::createICmpSGT(x.value, y.value));
2802         }
2803
2804         RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2805         {
2806                 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
2807                 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sle, condition, x.value, y.value);
2808                 ::basicBlock->appendInst(cmp);
2809
2810                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
2811                 auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
2812                 ::basicBlock->appendInst(select);
2813
2814                 return RValue<Int4>(V(result));
2815         }
2816
2817         RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2818         {
2819                 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
2820                 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Sgt, condition, x.value, y.value);
2821                 ::basicBlock->appendInst(cmp);
2822
2823                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
2824                 auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
2825                 ::basicBlock->appendInst(select);
2826
2827                 return RValue<Int4>(V(result));
2828         }
2829
2830         RValue<Int4> RoundInt(RValue<Float4> cast)
2831         {
2832                 if(emulateIntrinsics || CPUID::ARM)
2833                 {
2834                         // Push the fractional part off the mantissa. Accurate up to +/-2^22.
2835                         return Int4((cast + Float4(0x00C00000)) - Float4(0x00C00000));
2836                 }
2837                 else
2838                 {
2839                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
2840                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2841                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2842                         auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
2843                         nearbyint->addArg(cast.value);
2844                         ::basicBlock->appendInst(nearbyint);
2845
2846                         return RValue<Int4>(V(result));
2847                 }
2848         }
2849
2850         RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2851         {
2852                 if(emulateIntrinsics)
2853                 {
2854                         Short8 result;
2855                         result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
2856                         result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
2857                         result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
2858                         result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
2859                         result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
2860                         result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
2861                         result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
2862                         result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
2863
2864                         return result;
2865                 }
2866                 else
2867                 {
2868                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2869                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackSigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2870                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2871                         auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
2872                         pack->addArg(x.value);
2873                         pack->addArg(y.value);
2874                         ::basicBlock->appendInst(pack);
2875
2876                         return RValue<Short8>(V(result));
2877                 }
2878         }
2879
2880         RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2881         {
2882                 if(emulateIntrinsics || !(CPUID::SSE4_1 || CPUID::ARM))
2883                 {
2884                         RValue<Int4> sx = As<Int4>(x);
2885                         RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);
2886
2887                         RValue<Int4> sy = As<Int4>(y);
2888                         RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);
2889
2890                         return As<UShort8>(PackSigned(bx, by) + Short8(0x8000u));
2891                 }
2892                 else
2893                 {
2894                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
2895                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2896                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2897                         auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
2898                         pack->addArg(x.value);
2899                         pack->addArg(y.value);
2900                         ::basicBlock->appendInst(pack);
2901
2902                         return RValue<UShort8>(V(result));
2903                 }
2904         }
2905
2906         RValue<Int> SignMask(RValue<Int4> x)
2907         {
2908                 if(emulateIntrinsics || CPUID::ARM)
2909                 {
2910                         Int4 xx = (x >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
2911                         return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
2912                 }
2913                 else
2914                 {
2915                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
2916                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
2917                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
2918                         auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
2919                         movmsk->addArg(x.value);
2920                         ::basicBlock->appendInst(movmsk);
2921
2922                         return RValue<Int>(V(result));
2923                 }
2924         }
2925
2926         Type *Int4::getType()
2927         {
2928                 return T(Ice::IceType_v4i32);
2929         }
2930
2931         UInt4::UInt4(RValue<Float4> cast) : XYZW(this)
2932         {
2933                 // Smallest positive value representable in UInt, but not in Int
2934                 const unsigned int ustart = 0x80000000u;
2935                 const float ustartf = float(ustart);
2936
2937                 // Check if the value can be represented as an Int
2938                 Int4 uiValue = CmpNLT(cast, Float4(ustartf));
2939                 // If the value is too large, subtract ustart and re-add it after conversion.
2940                 uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
2941                 // Otherwise, just convert normally
2942                           (~uiValue & Int4(cast));
2943                 // If the value is negative, store 0, otherwise store the result of the conversion
2944                 storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
2945         }
2946
2947         RValue<UInt> Extract(RValue<UInt4> x, int i)
2948         {
2949                 return RValue<UInt>(Nucleus::createExtractElement(x.value, UInt::getType(), i));
2950         }
2951
2952         RValue<UInt4> Insert(RValue<UInt4> x, RValue<UInt> element, int i)
2953         {
2954                 return RValue<UInt4>(Nucleus::createInsertElement(x.value, element.value, i));
2955         }
2956
2957         RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2958         {
2959                 if(emulateIntrinsics)
2960                 {
2961                         UInt4 result;
2962                         result = Insert(result, Extract(lhs, 0) << UInt(rhs), 0);
2963                         result = Insert(result, Extract(lhs, 1) << UInt(rhs), 1);
2964                         result = Insert(result, Extract(lhs, 2) << UInt(rhs), 2);
2965                         result = Insert(result, Extract(lhs, 3) << UInt(rhs), 3);
2966
2967                         return result;
2968                 }
2969                 else
2970                 {
2971                         return RValue<UInt4>(Nucleus::createShl(lhs.value, V(::context->getConstantInt32(rhs))));
2972                 }
2973         }
2974
2975         RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2976         {
2977                 if(emulateIntrinsics)
2978                 {
2979                         UInt4 result;
2980                         result = Insert(result, Extract(lhs, 0) >> UInt(rhs), 0);
2981                         result = Insert(result, Extract(lhs, 1) >> UInt(rhs), 1);
2982                         result = Insert(result, Extract(lhs, 2) >> UInt(rhs), 2);
2983                         result = Insert(result, Extract(lhs, 3) >> UInt(rhs), 3);
2984
2985                         return result;
2986                 }
2987                 else
2988                 {
2989                         return RValue<UInt4>(Nucleus::createLShr(lhs.value, V(::context->getConstantInt32(rhs))));
2990                 }
2991         }
2992
2993         RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2994         {
2995                 return RValue<UInt4>(Nucleus::createICmpEQ(x.value, y.value));
2996         }
2997
2998         RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2999         {
3000                 return RValue<UInt4>(Nucleus::createICmpULT(x.value, y.value));
3001         }
3002
3003         RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
3004         {
3005                 return RValue<UInt4>(Nucleus::createICmpULE(x.value, y.value));
3006         }
3007
3008         RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
3009         {
3010                 return RValue<UInt4>(Nucleus::createICmpNE(x.value, y.value));
3011         }
3012
3013         RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
3014         {
3015                 return RValue<UInt4>(Nucleus::createICmpUGE(x.value, y.value));
3016         }
3017
3018         RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
3019         {
3020                 return RValue<UInt4>(Nucleus::createICmpUGT(x.value, y.value));
3021         }
3022
3023         RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
3024         {
3025                 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3026                 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ule, condition, x.value, y.value);
3027                 ::basicBlock->appendInst(cmp);
3028
3029                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3030                 auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
3031                 ::basicBlock->appendInst(select);
3032
3033                 return RValue<UInt4>(V(result));
3034         }
3035
3036         RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
3037         {
3038                 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3039                 auto cmp = Ice::InstIcmp::create(::function, Ice::InstIcmp::Ugt, condition, x.value, y.value);
3040                 ::basicBlock->appendInst(cmp);
3041
3042                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
3043                 auto select = Ice::InstSelect::create(::function, result, condition, y.value, x.value);
3044                 ::basicBlock->appendInst(select);
3045
3046                 return RValue<UInt4>(V(result));
3047         }
3048
3049         Type *UInt4::getType()
3050         {
3051                 return T(Ice::IceType_v4i32);
3052         }
3053
3054         Type *Half::getType()
3055         {
3056                 return T(Ice::IceType_i16);
3057         }
3058
3059         RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
3060         {
3061                 return 1.0f / x;
3062         }
3063
3064         RValue<Float> RcpSqrt_pp(RValue<Float> x)
3065         {
3066                 return Rcp_pp(Sqrt(x));
3067         }
3068
3069         RValue<Float> Sqrt(RValue<Float> x)
3070         {
3071                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_f32);
3072                 const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
3073                 auto target = ::context->getConstantUndef(Ice::IceType_i32);
3074                 auto sqrt = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
3075                 sqrt->addArg(x.value);
3076                 ::basicBlock->appendInst(sqrt);
3077
3078                 return RValue<Float>(V(result));
3079         }
3080
3081         RValue<Float> Round(RValue<Float> x)
3082         {
3083                 return Float4(Round(Float4(x))).x;
3084         }
3085
3086         RValue<Float> Trunc(RValue<Float> x)
3087         {
3088                 return Float4(Trunc(Float4(x))).x;
3089         }
3090
3091         RValue<Float> Frac(RValue<Float> x)
3092         {
3093                 return Float4(Frac(Float4(x))).x;
3094         }
3095
3096         RValue<Float> Floor(RValue<Float> x)
3097         {
3098                 return Float4(Floor(Float4(x))).x;
3099         }
3100
3101         RValue<Float> Ceil(RValue<Float> x)
3102         {
3103                 return Float4(Ceil(Float4(x))).x;
3104         }
3105
3106         Type *Float::getType()
3107         {
3108                 return T(Ice::IceType_f32);
3109         }
3110
3111         Type *Float2::getType()
3112         {
3113                 return T(Type_v2f32);
3114         }
3115
3116         Float4::Float4(RValue<Float> rhs) : XYZW(this)
3117         {
3118                 Value *vector = Nucleus::createBitCast(rhs.value, Float4::getType());
3119
3120                 int swizzle[4] = {0, 0, 0, 0};
3121                 Value *replicate = Nucleus::createShuffleVector(vector, vector, swizzle);
3122
3123                 storeValue(replicate);
3124         }
3125
3126         RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3127         {
3128                 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3129                 auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Ogt, condition, x.value, y.value);
3130                 ::basicBlock->appendInst(cmp);
3131
3132                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3133                 auto select = Ice::InstSelect::create(::function, result, condition, x.value, y.value);
3134                 ::basicBlock->appendInst(select);
3135
3136                 return RValue<Float4>(V(result));
3137         }
3138
3139         RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3140         {
3141                 Ice::Variable *condition = ::function->makeVariable(Ice::IceType_v4i1);
3142                 auto cmp = Ice::InstFcmp::create(::function, Ice::InstFcmp::Olt, condition, x.value, y.value);
3143                 ::basicBlock->appendInst(cmp);
3144
3145                 Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3146                 auto select = Ice::InstSelect::create(::function, result, condition, x.value, y.value);
3147                 ::basicBlock->appendInst(select);
3148
3149                 return RValue<Float4>(V(result));
3150         }
3151
3152         RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
3153         {
3154                 return Float4(1.0f) / x;
3155         }
3156
3157         RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
3158         {
3159                 return Rcp_pp(Sqrt(x));
3160         }
3161
3162         RValue<Float4> Sqrt(RValue<Float4> x)
3163         {
3164                 if(emulateIntrinsics || CPUID::ARM)
3165                 {
3166                         Float4 result;
3167                         result.x = Sqrt(Float(Float4(x).x));
3168                         result.y = Sqrt(Float(Float4(x).y));
3169                         result.z = Sqrt(Float(Float4(x).z));
3170                         result.w = Sqrt(Float(Float4(x).w));
3171
3172                         return result;
3173                 }
3174                 else
3175                 {
3176                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3177                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Sqrt, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
3178                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
3179                         auto sqrt = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
3180                         sqrt->addArg(x.value);
3181                         ::basicBlock->appendInst(sqrt);
3182
3183                         return RValue<Float4>(V(result));
3184                 }
3185         }
3186
3187         RValue<Int> SignMask(RValue<Float4> x)
3188         {
3189                 if(emulateIntrinsics || CPUID::ARM)
3190                 {
3191                         Int4 xx = (As<Int4>(x) >> 31) & Int4(0x00000001, 0x00000002, 0x00000004, 0x00000008);
3192                         return Extract(xx, 0) | Extract(xx, 1) | Extract(xx, 2) | Extract(xx, 3);
3193                 }
3194                 else
3195                 {
3196                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_i32);
3197                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::SignMask, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
3198                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
3199                         auto movmsk = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
3200                         movmsk->addArg(x.value);
3201                         ::basicBlock->appendInst(movmsk);
3202
3203                         return RValue<Int>(V(result));
3204                 }
3205         }
3206
3207         RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3208         {
3209                 return RValue<Int4>(Nucleus::createFCmpOEQ(x.value, y.value));
3210         }
3211
3212         RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3213         {
3214                 return RValue<Int4>(Nucleus::createFCmpOLT(x.value, y.value));
3215         }
3216
3217         RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3218         {
3219                 return RValue<Int4>(Nucleus::createFCmpOLE(x.value, y.value));
3220         }
3221
3222         RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3223         {
3224                 return RValue<Int4>(Nucleus::createFCmpONE(x.value, y.value));
3225         }
3226
3227         RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3228         {
3229                 return RValue<Int4>(Nucleus::createFCmpOGE(x.value, y.value));
3230         }
3231
3232         RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3233         {
3234                 return RValue<Int4>(Nucleus::createFCmpOGT(x.value, y.value));
3235         }
3236
3237         RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3238         {
3239                 return RValue<Int4>(Nucleus::createFCmpUEQ(x.value, y.value));
3240         }
3241
3242         RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3243         {
3244                 return RValue<Int4>(Nucleus::createFCmpULT(x.value, y.value));
3245         }
3246
3247         RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3248         {
3249                 return RValue<Int4>(Nucleus::createFCmpULE(x.value, y.value));
3250         }
3251
3252         RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3253         {
3254                 return RValue<Int4>(Nucleus::createFCmpUNE(x.value, y.value));
3255         }
3256
3257         RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3258         {
3259                 return RValue<Int4>(Nucleus::createFCmpUGE(x.value, y.value));
3260         }
3261
3262         RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3263         {
3264                 return RValue<Int4>(Nucleus::createFCmpUGT(x.value, y.value));
3265         }
3266
3267         RValue<Float4> Round(RValue<Float4> x)
3268         {
3269                 if(emulateIntrinsics || CPUID::ARM)
3270                 {
3271                         // Push the fractional part off the mantissa. Accurate up to +/-2^22.
3272                         return (x + Float4(0x00C00000)) - Float4(0x00C00000);
3273                 }
3274                 else if(CPUID::SSE4_1)
3275                 {
3276                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3277                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
3278                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
3279                         auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
3280                         round->addArg(x.value);
3281                         round->addArg(::context->getConstantInt32(0));
3282                         ::basicBlock->appendInst(round);
3283
3284                         return RValue<Float4>(V(result));
3285                 }
3286                 else
3287                 {
3288                         return Float4(RoundInt(x));
3289                 }
3290         }
3291
3292         RValue<Float4> Trunc(RValue<Float4> x)
3293         {
3294                 if(CPUID::SSE4_1)
3295                 {
3296                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3297                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
3298                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
3299                         auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
3300                         round->addArg(x.value);
3301                         round->addArg(::context->getConstantInt32(3));
3302                         ::basicBlock->appendInst(round);
3303
3304                         return RValue<Float4>(V(result));
3305                 }
3306                 else
3307                 {
3308                         return Float4(Int4(x));
3309                 }
3310         }
3311
3312         RValue<Float4> Frac(RValue<Float4> x)
3313         {
3314                 Float4 frc;
3315
3316                 if(CPUID::SSE4_1)
3317                 {
3318                         frc = x - Floor(x);
3319                 }
3320                 else
3321                 {
3322                         frc = x - Float4(Int4(x));   // Signed fractional part.
3323
3324                         frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));   // Add 1.0 if negative.
3325                 }
3326
3327                 // x - floor(x) can be 1.0 for very small negative x.
3328                 // Clamp against the value just below 1.0.
3329                 return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3330         }
3331
3332         RValue<Float4> Floor(RValue<Float4> x)
3333         {
3334                 if(CPUID::SSE4_1)
3335                 {
3336                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3337                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
3338                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
3339                         auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
3340                         round->addArg(x.value);
3341                         round->addArg(::context->getConstantInt32(1));
3342                         ::basicBlock->appendInst(round);
3343
3344                         return RValue<Float4>(V(result));
3345                 }
3346                 else
3347                 {
3348                         return x - Frac(x);
3349                 }
3350         }
3351
3352         RValue<Float4> Ceil(RValue<Float4> x)
3353         {
3354                 if(CPUID::SSE4_1)
3355                 {
3356                         Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4f32);
3357                         const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::Round, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
3358                         auto target = ::context->getConstantUndef(Ice::IceType_i32);
3359                         auto round = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
3360                         round->addArg(x.value);
3361                         round->addArg(::context->getConstantInt32(2));
3362                         ::basicBlock->appendInst(round);
3363
3364                         return RValue<Float4>(V(result));
3365                 }
3366                 else
3367                 {
3368                         return -Floor(-x);
3369                 }
3370         }
3371
3372         Type *Float4::getType()
3373         {
3374                 return T(Ice::IceType_v4f32);
3375         }
3376
3377         RValue<Long> Ticks()
3378         {
3379                 assert(false && "UNIMPLEMENTED"); return RValue<Long>(V(nullptr));
3380         }
3381 }