From e3191014f785357d7afdee8288ec31a4fc699a6d Mon Sep 17 00:00:00 2001
From: Logan Chien
Date: Fri, 24 Aug 2018 22:01:50 +0800
Subject: [PATCH] Add the unimplemented non-x86 code generator

This commit fills in the unimplemented non-x86 (generic) code generator.

Change-Id: I189b0ea523ecd9b18c29ad6ed6fa3f798382295b
Reviewed-on: https://swiftshader-review.googlesource.com/20476
Tested-by: Logan Chien
Reviewed-by: Nicolas Capens
---
 src/Reactor/LLVMReactor.cpp | 553 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 519 insertions(+), 34 deletions(-)

diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index ff290c237..ba4aeb9cf 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -119,7 +119,6 @@ namespace
 sw::MutexLock codegenMutex;
 
 #if SWIFTSHADER_LLVM_VERSION >= 7
-#if defined(__i386__) || defined(__x86_64__)
     llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
     {
         llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
@@ -149,6 +148,7 @@ namespace
         return ::builder->CreateSExt(::builder->CreateICmp(pred, x, y), dstTy, "");
     }
 
+#if defined(__i386__) || defined(__x86_64__)
     llvm::Value *lowerPMOV(llvm::Value *op, llvm::Type *dstType, bool sext)
     {
         llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(op->getType());
@@ -171,6 +171,217 @@ namespace
         return ::builder->CreateSelect(cmp, v, neg);
     }
 #endif // defined(__i386__) || defined(__x86_64__)
+
+#if !defined(__i386__) && !defined(__x86_64__)
+    llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
+                               llvm::FCmpInst::Predicate pred)
+    {
+        return ::builder->CreateSelect(::builder->CreateFCmp(pred, x, y), x, y);
+    }
+
+    // Packed add/sub saturation
+    llvm::Value *lowerPSAT(llvm::Intrinsic::ID intrinsic, llvm::Value *x, llvm::Value *y)
+    {
+        llvm::Function *func = llvm::Intrinsic::getDeclaration(
+            ::module, intrinsic, {x->getType(), y->getType()});
+        llvm::Value *ret = ::builder->CreateCall(func, ARGS(x, y));
+        return ::builder->CreateExtractValue(ret, {0});
+    }
+
+    llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
+    {
+        return lowerPSAT(llvm::Intrinsic::uadd_with_overflow, x, y);
+    }
+
+    llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
+    {
+        return lowerPSAT(llvm::Intrinsic::sadd_with_overflow, x, y);
+    }
+
+    llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
+    {
+        return lowerPSAT(llvm::Intrinsic::usub_with_overflow, x, y);
+    }
+
+    llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
+    {
+        return lowerPSAT(llvm::Intrinsic::ssub_with_overflow, x, y);
+    }
+
+    llvm::Value *lowerSQRT(llvm::Value *x)
+    {
+        llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
+            ::module, llvm::Intrinsic::sqrt, {x->getType()});
+        return ::builder->CreateCall(sqrt, ARGS(x));
+    }
+
+    llvm::Value *lowerRCP(llvm::Value *x)
+    {
+        llvm::Type *ty = x->getType();
+        llvm::Constant *one;
+        if (llvm::VectorType *vectorTy = llvm::dyn_cast<llvm::VectorType>(ty))
+        {
+            one = llvm::ConstantVector::getSplat(
+                vectorTy->getNumElements(),
+                llvm::ConstantFP::get(vectorTy->getElementType(), 1));
+        }
+        else
+        {
+            one = llvm::ConstantFP::get(ty, 1);
+        }
+        return ::builder->CreateFDiv(one, x);
+    }
+
+    llvm::Value *lowerRSQRT(llvm::Value *x)
+    {
+        return lowerRCP(lowerSQRT(x));
+    }
+
+    llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
+    {
+        llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+        llvm::Value *y = llvm::ConstantVector::getSplat(
+            ty->getNumElements(),
+            llvm::ConstantInt::get(ty->getElementType(), scalarY));
+        return ::builder->CreateShl(x, y);
+    }
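+
+    // Illustrative note (editorial sketch, not part of the upstream change):
+    // the lowerVector{Shl,AShr,LShr} helpers above and below splat the
+    // immediate shift amount across every lane. For example, with
+    // x : <4 x i32>,
+    //     lowerVectorShl(x, 2)
+    // emits the single instruction
+    //     %r = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>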
+
+    llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
+    {
+        llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+        llvm::Value *y = llvm::ConstantVector::getSplat(
+            ty->getNumElements(),
+            llvm::ConstantInt::get(ty->getElementType(), scalarY));
+        return ::builder->CreateAShr(x, y);
+    }
+
+    llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
+    {
+        llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+        llvm::Value *y = llvm::ConstantVector::getSplat(
+            ty->getNumElements(),
+            llvm::ConstantInt::get(ty->getElementType(), scalarY));
+        return ::builder->CreateLShr(x, y);
+    }
+
+    llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
+    {
+        llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+        llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+        llvm::Value *extX = ::builder->CreateSExt(x, extTy);
+        llvm::Value *extY = ::builder->CreateSExt(y, extTy);
+        llvm::Value *mult = ::builder->CreateMul(extX, extY);
+
+        llvm::Value *undef = llvm::UndefValue::get(extTy);
+
+        llvm::SmallVector<uint32_t, 16> evenIdx;
+        llvm::SmallVector<uint32_t, 16> oddIdx;
+        for (uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
+        {
+            evenIdx.push_back(i);
+            oddIdx.push_back(i + 1);
+        }
+
+        llvm::Value *lhs = ::builder->CreateShuffleVector(mult, undef, evenIdx);
+        llvm::Value *rhs = ::builder->CreateShuffleVector(mult, undef, oddIdx);
+        return ::builder->CreateAdd(lhs, rhs);
+    }
+
+    llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
+    {
+        llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+        llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
+
+        llvm::Value *extX, *extY;
+        if (sext)
+        {
+            extX = ::builder->CreateSExt(x, extTy);
+            extY = ::builder->CreateSExt(y, extTy);
+        }
+        else
+        {
+            extX = ::builder->CreateZExt(x, extTy);
+            extY = ::builder->CreateZExt(y, extTy);
+        }
+
+        llvm::Value *mult = ::builder->CreateMul(extX, extY);
+
+        llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
+        llvm::Value *mulh = ::builder->CreateAShr(mult, intTy->getIntegerBitWidth());
+        return ::builder->CreateTrunc(mulh, ty);
+    }
+
+    llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
+    {
+        llvm::VectorType *srcTy = llvm::cast<llvm::VectorType>(x->getType());
+        llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
+
+        llvm::IntegerType *dstElemTy =
+            llvm::cast<llvm::IntegerType>(dstTy->getElementType());
+
+        uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
+        assert(truncNumBits < 64 && "shift 64 must be handled separately");
+        llvm::Constant *max, *min;
+        if (isSigned)
+        {
+            max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
+            min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
+        }
+        else
+        {
+            max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
+            min = llvm::ConstantInt::get(srcTy, 0, false);
+        }
+
+        x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
+        x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
+        y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
+        y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
+
+        x = ::builder->CreateTrunc(x, dstTy);
+        y = ::builder->CreateTrunc(y, dstTy);
+
+        llvm::SmallVector<uint32_t, 16> index(srcTy->getNumElements() * 2);
+        std::iota(index.begin(), index.end(), 0);
+
+        return ::builder->CreateShuffleVector(x, y, index);
+    }
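+
+    // Illustrative note (editorial sketch, not part of the upstream change):
+    // for a signed pack of two <4 x i32> operands into <8 x i16>, lowerPack
+    // clamps every lane to [-32768, 32767] with the compare/select min/max
+    // helpers, truncates both operands to <4 x i16>, and concatenates them
+    // with a shuffle whose mask is simply <0, 1, 2, 3, 4, 5, 6, 7>.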
+
+    llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
+    {
+        llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+        llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
+        llvm::Value *cmp = ::builder->CreateICmpSLT(x, zero);
+
+        llvm::Value *ret = ::builder->CreateZExt(
+            ::builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
+        for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
+        {
+            llvm::Value *elem = ::builder->CreateZExt(
+                ::builder->CreateExtractElement(cmp, i), retTy);
+            ret = ::builder->CreateOr(ret, ::builder->CreateShl(elem, i));
+        }
+        return ret;
+    }
+
+    llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
+    {
+        llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
+        llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
+        llvm::Value *cmp = ::builder->CreateFCmpULT(x, zero);
+
+        llvm::Value *ret = ::builder->CreateZExt(
+            ::builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
+        for (uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
+        {
+            llvm::Value *elem = ::builder->CreateZExt(
+                ::builder->CreateExtractElement(cmp, i), retTy);
+            ret = ::builder->CreateOr(ret, ::builder->CreateShl(elem, i));
+        }
+        return ret;
+    }
+#endif // !defined(__i386__) && !defined(__x86_64__)
 #endif // SWIFTSHADER_LLVM_VERSION >= 7
 }
 
@@ -517,11 +728,18 @@ namespace sw
 
     #if defined(__x86_64__)
         static const char arch[] = "x86-64";
-    #else
+    #elif defined(__i386__)
        static const char arch[] = "x86";
+    #elif defined(__aarch64__)
+        static const char arch[] = "arm64";
+    #elif defined(__arm__)
+        static const char arch[] = "arm";
+    #else
+    #error "unknown architecture"
    #endif
 
     llvm::SmallVector<std::string, 1> mattrs;
+#if defined(__i386__) || defined(__x86_64__)
     mattrs.push_back(CPUID::supportsMMX()  ? "+mmx"  : "-mmx");
     mattrs.push_back(CPUID::supportsCMOV() ? "+cmov" : "-cmov");
     mattrs.push_back(CPUID::supportsSSE()  ? "+sse"  : "-sse");
@@ -533,6 +751,14 @@
 #else
     mattrs.push_back(CPUID::supportsSSE4_1() ? "+sse4.1" : "-sse4.1");
 #endif
+#elif defined(__arm__)
+#if __ARM_ARCH >= 8
+    mattrs.push_back("+armv8-a");
+#else
+    // armv7-a requires compiler-rt routines; otherwise, the compiled kernel
+    // might fail to link.
+#endif
+#endif
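+    // Editorial sketch (assumption, not in the upstream change): the strings
+    // above use LLVM's -mattr feature syntax. They reach the JIT when the
+    // target machine is selected, along the lines of:
+    //     llvm::EngineBuilder engineBuilder(std::move(module));
+    //     engineBuilder.setMArch(arch);
+    //     engineBuilder.setMAttrs(mattrs);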
 
 #if SWIFTSHADER_LLVM_VERSION < 7
     llvm::JITEmitDebugInfo = false;
@@ -2556,12 +2782,20 @@
 
     RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::paddusb(x, y);
+#else
+        return As<Byte8>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+#endif
     }
 
     RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::psubusb(x, y);
+#else
+        return As<Byte8>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+#endif
     }
 
     RValue<Short4> Unpack(RValue<Byte4> x)
@@ -2590,17 +2824,29 @@
 
     RValue<Int> SignMask(RValue<Byte8> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pmovmskb(x);
+#else
+        return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+#endif
     }
 
 //  RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
 //  {
+//#if defined(__i386__) || defined(__x86_64__)
 //      return x86::pcmpgtb(x, y);   // FIXME: Signedness
+//#else
+//      return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
+//#endif
 //  }
 
     RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pcmpeqb(x, y);
+#else
+        return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
+#endif
     }
 
     Type *Byte8::getType()
@@ -2773,12 +3019,20 @@
 
     RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::paddsb(x, y);
+#else
+        return As<SByte8>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+#endif
     }
 
     RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::psubsb(x, y);
+#else
+        return As<SByte8>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+#endif
     }
 
     RValue<SByte8> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
@@ -2796,17 +3050,29 @@
 
     RValue<Int> SignMask(RValue<SByte8> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pmovmskb(As<Byte8>(x));
+#else
+        return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+#endif
     }
 
     RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pcmpgtb(x, y);
+#else
+        return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Byte8::getType()))));
+#endif
     }
 
     RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
+#else
+        return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Byte8::getType()))));
+#endif
     }
 
     Type *SByte8::getType()
@@ -2912,7 +3178,12 @@
 
     Short4::Short4(RValue<Int4> cast)
     {
         Int4 v4i32 = Int4(cast);
+#if defined(__i386__) || defined(__x86_64__)
         v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
+#else
+        Value *v = v4i32.loadValue();
+        v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
+#endif
         storeValue(As<Short4>(Int2(v4i32)).value);
     }
 
@@ -3049,16 +3320,22 @@
 
     RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
     //  return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
 
         return x86::psllw(lhs, rhs);
+#else
+        return As<Short4>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
     {
-    //  return RValue<Short4>(Nucleus::createAShr(lhs.value, rhs.value));
-
+#if defined(__i386__) || defined(__x86_64__)
         return x86::psraw(lhs, rhs);
+#else
+        return As<Short4>(V(lowerVectorAShr(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<Short4> operator+=(Short4 &lhs, RValue<Short4> rhs)
@@ -3134,45 +3411,75 @@
 
     RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pmaxsw(x, y);
+#else
+        return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SGT)));
+#endif
     }
 
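+    // Editorial sketch (inference, by analogy with lowerPFMINMAX above;
+    // lowerPMINMAX itself is pre-existing code not shown in this diff):
+    // the generic Max/Min lower to a compare-and-select, e.g. for Short4:
+    //     %c = icmp sgt <4 x i16> %x, %y
+    //     %r = select <4 x i1> %c, <4 x i16> %x, <4 x i16> %y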
     RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pminsw(x, y);
+#else
+        return RValue<Short4>(V(lowerPMINMAX(V(x.value), V(y.value), llvm::ICmpInst::ICMP_SLT)));
+#endif
     }
 
     RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::paddsw(x, y);
+#else
+        return As<Short4>(V(lowerPSADDSAT(V(x.value), V(y.value))));
+#endif
     }
 
     RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::psubsw(x, y);
+#else
+        return As<Short4>(V(lowerPSSUBSAT(V(x.value), V(y.value))));
+#endif
     }
 
     RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pmulhw(x, y);
+#else
+        return As<Short4>(V(lowerMulHigh(V(x.value), V(y.value), true)));
+#endif
     }
 
     RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pmaddwd(x, y);
+#else
+        return As<Int2>(V(lowerMulAdd(V(x.value), V(y.value))));
+#endif
     }
 
     RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         auto result = x86::packsswb(x, y);
-
+#else
+        auto result = V(lowerPack(V(x.value), V(y.value), true));
+#endif
         return As<SByte8>(Swizzle(As<Int4>(result), 0x88));
     }
 
     RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         auto result = x86::packuswb(x, y);
-
+#else
+        auto result = V(lowerPack(V(x.value), V(y.value), false));
+#endif
         return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
     }
 
@@ -3219,12 +3526,20 @@
 
     RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pcmpgtw(x, y);
+#else
+        return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value), V(y.value), T(Short4::getType()))));
+#endif
     }
 
     RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pcmpeqw(x, y);
+#else
+        return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value), V(y.value), T(Short4::getType()))));
+#endif
     }
 
     Type *Short4::getType()
@@ -3381,16 +3696,24 @@
 
     RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
     //  return RValue<UShort4>(Nucleus::createShl(lhs.value, rhs.value));
 
         return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
+#else
+        return As<UShort4>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
     //  return RValue<UShort4>(Nucleus::createLShr(lhs.value, rhs.value));
 
         return x86::psrlw(lhs, rhs);
+#else
+        return As<UShort4>(V(lowerVectorLShr(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<UShort4> operator<<=(UShort4 &lhs, unsigned char rhs)
@@ -3420,22 +3743,38 @@
 
     RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::paddusw(x, y);
+#else
+        return As<UShort4>(V(lowerPUADDSAT(V(x.value), V(y.value))));
+#endif
     }
 
     RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::psubusw(x, y);
+#else
+        return As<UShort4>(V(lowerPUSUBSAT(V(x.value), V(y.value))));
+#endif
     }
 
     RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pmulhuw(x, y);
+#else
+        return As<UShort4>(V(lowerMulHigh(V(x.value), V(y.value), false)));
+#endif
    }
 
     RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pavgw(x, y);
+#else
+        return As<UShort4>(V(lowerPAVG(V(x.value), V(y.value))));
+#endif
     }
 
     Type *UShort4::getType()
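+    // Editorial note (not part of the upstream change): lowerMulHigh widens
+    // each lane to twice its width, multiplies, shifts the product right by
+    // the original lane width, and truncates. The UShort4 overload above uses
+    // the unsigned (ZExt) variant, matching pmulhuw's per-lane result:
+    //     hi = (uint16_t)(((uint32_t)x * (uint32_t)y) >> 16)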
@@ -3486,17 +3825,29 @@
 
     RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
     {
-        return x86::psllw(lhs, rhs);   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+        return x86::psllw(lhs, rhs);
+#else
+        return As<Short8>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
     {
-        return x86::psraw(lhs, rhs);   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+        return x86::psraw(lhs, rhs);
+#else
+        return As<Short8>(V(lowerVectorAShr(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
     {
-        return x86::pmaddwd(x, y);   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+        return x86::pmaddwd(x, y);
+#else
+        return As<Int4>(V(lowerMulAdd(V(x.value), V(y.value))));
+#endif
     }
 
     RValue<Short8> Abs(RValue<Short8> x)
@@ -3507,7 +3858,11 @@
 
     RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
     {
-        return x86::pmulhw(x, y);   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+        return x86::pmulhw(x, y);
+#else
+        return As<Short8>(V(lowerMulHigh(V(x.value), V(y.value), true)));
+#endif
     }
 
     Type *Short8::getType()
@@ -3576,12 +3931,20 @@
 
     RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
     {
-        return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+        return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
+#else
+        return As<UShort8>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
+#else
+        return As<UShort8>(V(lowerVectorLShr(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
@@ -3635,7 +3998,11 @@
 
     RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
     {
-        return x86::pmulhuw(x, y);   // FIXME: Fallback required
+#if defined(__i386__) || defined(__x86_64__)
+        return x86::pmulhuw(x, y);
+#else
+        return As<UShort8>(V(lowerMulHigh(V(x.value), V(y.value), false)));
+#endif
     }
 
     Type *UShort8::getType()
@@ -3983,9 +4350,11 @@
 
     RValue<Int> RoundInt(RValue<Float> cast)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::cvtss2si(cast);
-
-    //  return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
+#else
+        return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
+#endif
     }
 
     Type *Int::getType()
@@ -4401,9 +4770,11 @@
 
 //  RValue<UInt> RoundUInt(RValue<Float> cast)
 //  {
+//#if defined(__i386__) || defined(__x86_64__)
 //      return x86::cvtss2si(val);   // FIXME: Unsigned
-//
-//      // return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
+//#else
+//      return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
+//#endif
 //  }
 
     Type *UInt::getType()
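+    // Editorial note (not part of the upstream change): the generic RoundInt
+    // fallback rounds halfway cases away from zero, while x86 cvtss2si rounds
+    // according to MXCSR (round-to-nearest-even by default), so results can
+    // differ at exact .5 inputs: RoundInt(2.5f) is 3 generically but 2 on x86.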
@@ -4523,16 +4894,24 @@
 
     RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
    //   return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
 
         return x86::pslld(lhs, rhs);
+#else
+        return As<Int2>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
    //   return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
 
         return x86::psrad(lhs, rhs);
+#else
+        return As<Int2>(V(lowerVectorAShr(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<Int2> operator+=(Int2 &lhs, RValue<Int2> rhs)
@@ -4716,16 +5095,24 @@
 
     RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
    //   return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
 
         return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
+#else
+        return As<UInt2>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
    //   return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
 
         return x86::psrld(lhs, rhs);
+#else
+        return As<UInt2>(V(lowerVectorLShr(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<UInt2> operator+=(UInt2 &lhs, RValue<UInt2> rhs)
@@ -4804,11 +5191,13 @@ namespace sw
 
     Int4::Int4(RValue<Byte4> cast) : XYZW(this)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             *this = x86::pmovzxbd(As<Byte16>(cast));
         }
         else
+#endif
         {
             int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
             Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
@@ -4824,11 +5213,13 @@
 
     Int4::Int4(RValue<SByte4> cast) : XYZW(this)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             *this = x86::pmovsxbd(As<SByte16>(cast));
         }
         else
+#endif
         {
             int swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
             Value *a = Nucleus::createBitCast(cast.value, Byte16::getType());
@@ -4851,11 +5242,13 @@
 
     Int4::Int4(RValue<Short4> cast) : XYZW(this)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             *this = x86::pmovsxwd(As<Short8>(cast));
         }
         else
+#endif
         {
             int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
             Value *c = Nucleus::createShuffleVector(cast.value, cast.value, swizzle);
@@ -4865,11 +5258,13 @@
 
     Int4::Int4(RValue<UShort4> cast) : XYZW(this)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             *this = x86::pmovzxwd(As<UShort8>(cast));
         }
         else
+#endif
         {
             int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
             Value *c = Nucleus::createShuffleVector(cast.value, Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
@@ -5031,12 +5426,20 @@
 
     RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::pslld(lhs, rhs);
+#else
+        return As<Int4>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::psrad(lhs, rhs);
+#else
+        return As<Int4>(V(lowerVectorAShr(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
@@ -5164,11 +5567,13 @@
 
     RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::pmaxsd(x, y);
         }
         else
+#endif
         {
             RValue<Int4> greater = CmpNLE(x, y);
             return (x & greater) | (y & ~greater);
@@ -5177,11 +5582,13 @@
 
     RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::pminsd(x, y);
         }
         else
+#endif
         {
             RValue<Int4> less = CmpLT(x, y);
             return (x & less) | (y & ~less);
@@ -5190,17 +5597,29 @@
 
     RValue<Int4> RoundInt(RValue<Float4> cast)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::cvtps2dq(cast);
+#else
+        return As<Int4>(V(::builder->CreateFPToSI(V(cast.value), T(Int4::getType()))));
+#endif
     }
 
     RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::packssdw(x, y);
+#else
+        return As<Short8>(V(lowerPack(V(x.value), V(y.value), true)));
+#endif
     }
 
     RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::packusdw(x, y);
+#else
+        return As<UShort8>(V(lowerPack(V(x.value), V(y.value), false)));
+#endif
     }
 
     RValue<Int> Extract(RValue<Int4> x, int i)
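+    // Editorial note (not part of the upstream change): CreateFPToSI truncates
+    // toward zero, whereas x86 cvtps2dq rounds to nearest-even under the
+    // default MXCSR mode, so the generic RoundInt(Float4) differs from the
+    // x86 path for non-integral inputs (e.g. 1.7f gives 1 here, 2 on x86).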
@@ -5215,7 +5634,11 @@
 
     RValue<Int> SignMask(RValue<Int4> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::movmskps(As<Float4>(x));
+#else
+        return As<Int>(V(lowerSignMask(V(x.value), T(Int::getType()))));
+#endif
     }
 
     RValue<Int4> Swizzle(RValue<Int4> x, unsigned char select)
@@ -5384,12 +5807,20 @@
 
     RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
+#else
+        return As<UInt4>(V(lowerVectorShl(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::psrld(lhs, rhs);
+#else
+        return As<UInt4>(V(lowerVectorLShr(V(lhs.value), rhs)));
+#endif
     }
 
     RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
@@ -5508,11 +5939,13 @@
 
     RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::pmaxud(x, y);
         }
         else
+#endif
         {
             RValue<UInt4> greater = CmpNLE(x, y);
             return (x & greater) | (y & ~greater);
@@ -5521,11 +5954,13 @@
 
     RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::pminud(x, y);
         }
         else
+#endif
         {
             RValue<UInt4> less = CmpLT(x, y);
             return (x & less) | (y & ~less);
@@ -5694,35 +6129,46 @@
 
     RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
     {
-        #if defined(__i386__) || defined(__x86_64__)
-            if(exactAtPow2)
-            {
-                // rcpss uses a piecewise-linear approximation which minimizes the relative error
-                // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
-                return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
-            }
-        #endif
-
+#if defined(__i386__) || defined(__x86_64__)
+        if(exactAtPow2)
+        {
+            // rcpss uses a piecewise-linear approximation which minimizes the relative error
+            // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+            return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+        }
         return x86::rcpss(x);
+#else
+        return As<Float>(V(lowerRCP(V(x.value))));
+#endif
     }
 
     RValue<Float> RcpSqrt_pp(RValue<Float> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::rsqrtss(x);
+#else
+        return As<Float>(V(lowerRSQRT(V(x.value))));
+#endif
     }
 
     RValue<Float> Sqrt(RValue<Float> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::sqrtss(x);
+#else
+        return As<Float>(V(lowerSQRT(V(x.value))));
+#endif
     }
 
     RValue<Float> Round(RValue<Float> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::roundss(x, 0);
         }
         else
+#endif
        {
             return Float4(Round(Float4(x))).x;
         }
     }
 
@@ -5730,11 +6176,13 @@
 
     RValue<Float> Trunc(RValue<Float> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::roundss(x, 3);
         }
         else
+#endif
         {
             return Float(Int(x));   // Rounded toward zero
         }
@@ -5742,11 +6190,13 @@
 
     RValue<Float> Frac(RValue<Float> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x - x86::floorss(x);
         }
         else
+#endif
         {
             return Float4(Frac(Float4(x))).x;
         }
@@ -5754,11 +6204,13 @@
 
     RValue<Float> Floor(RValue<Float> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::floorss(x);
         }
         else
+#endif
         {
             return Float4(Floor(Float4(x))).x;
         }
@@ -5766,11 +6218,13 @@
 
     RValue<Float> Ceil(RValue<Float> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::ceilss(x);
         }
         else
+#endif
         {
             return Float4(Ceil(Float4(x))).x;
         }
@@ -6016,36 +6470,53 @@
 
     RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::maxps(x, y);
+#else
+        return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OGT)));
+#endif
     }
 
     RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::minps(x, y);
+#else
+        return As<Float4>(V(lowerPFMINMAX(V(x.value), V(y.value), llvm::FCmpInst::FCMP_OLT)));
+#endif
     }
 
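+    // Editorial note (not part of the upstream change): the OGT/OLT
+    // compare-and-select mirrors the maxps/minps NaN convention: an unordered
+    // compare (either input NaN) makes the select false and returns the second
+    // operand y, just as maxps/minps return their second source operand.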
     RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
     {
-        #if defined(__i386__) || defined(__x86_64__)
-            if(exactAtPow2)
-            {
-                // rcpps uses a piecewise-linear approximation which minimizes the relative error
-                // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
-                return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
-            }
-        #endif
-
+#if defined(__i386__) || defined(__x86_64__)
+        if(exactAtPow2)
+        {
+            // rcpps uses a piecewise-linear approximation which minimizes the relative error
+            // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+            return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+        }
         return x86::rcpps(x);
+#else
+        return As<Float4>(V(lowerRCP(V(x.value))));
+#endif
     }
 
     RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::rsqrtps(x);
+#else
+        return As<Float4>(V(lowerRSQRT(V(x.value))));
+#endif
     }
 
     RValue<Float4> Sqrt(RValue<Float4> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::sqrtps(x);
+#else
+        return As<Float4>(V(lowerSQRT(V(x.value))));
+#endif
     }
 
     RValue<Float4> Insert(RValue<Float4> x, RValue<Float> element, int i)
@@ -6099,7 +6570,11 @@
 
     RValue<Int> SignMask(RValue<Float4> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         return x86::movmskps(x);
+#else
+        return As<Int>(V(lowerFPSignMask(V(x.value), T(Int::getType()))));
+#endif
     }
 
     RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
@@ -6150,11 +6625,13 @@
 
     RValue<Float4> Round(RValue<Float4> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::roundps(x, 0);
         }
         else
+#endif
         {
             return Float4(RoundInt(x));
         }
@@ -6162,11 +6639,13 @@
 
     RValue<Float4> Trunc(RValue<Float4> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::roundps(x, 3);
         }
         else
+#endif
         {
             return Float4(Int4(x));
         }
@@ -6194,11 +6673,13 @@
 
     RValue<Float4> Floor(RValue<Float4> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::floorps(x);
         }
         else
+#endif
         {
             return x - Frac(x);
         }
@@ -6206,11 +6687,13 @@
 
     RValue<Float4> Ceil(RValue<Float4> x)
     {
+#if defined(__i386__) || defined(__x86_64__)
         if(CPUID::supportsSSE4_1())
         {
             return x86::ceilps(x);
         }
         else
+#endif
         {
             return -Floor(-x);
         }
@@ -6311,6 +6794,7 @@
 
 namespace sw
 {
+#if defined(__i386__) || defined(__x86_64__)
     namespace x86
     {
         RValue<Int> cvtss2si(RValue<Float> val)
@@ -6854,4 +7338,5 @@ namespace sw
 #endif
     }
 }
+#endif // defined(__i386__) || defined(__x86_64__)
 }
-- 
2.11.0