#include "Memory.hpp"
#include "MutexLock.hpp"
-#include <xmmintrin.h>
#include <fstream>
+#if defined(__i386__) || defined(__x86_64__)
+#include <xmmintrin.h>
+#endif
+
#if defined(__x86_64__) && defined(_WIN32)
extern "C" void X86CompilationCallback()
{
llvm::Module *module = nullptr;
llvm::Function *function = nullptr;
- sw::BackoffLock codegenMutex;
+ // NOTE(review): lock type switched from sw::BackoffLock to sw::MutexLock —
+ // presumably to block instead of spin/back off while code generation holds
+ // the lock for a long time; confirm against MutexLock.hpp.
+ sw::MutexLock codegenMutex;
}
namespace sw
return value;
}
- Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index)
+ // Emits a GEP: ptr advanced by 'index' elements of 'type'. GEP indices are
+ // interpreted as signed (LLVM LangRef, getelementptr), so on 64-bit targets
+ // an unsigned 32-bit index is zero-extended to 64 bits first, keeping large
+ // unsigned values from being treated as negative offsets.
+ Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
{
+ if(unsignedIndex && sizeof(void*) == 8)
+ {
+ index = createZExt(index, Long::getType());
+ }
+
assert(ptr->getType()->getContainedType(0) == type);
return V(::builder->CreateGEP(ptr, index));
}
return UnpackLow(RValue<Byte8>(byte8), RValue<Byte8>(byte8));
}
+ // Interleaves the bytes of two Byte4 values into a Short4. Each Byte4 is
+ // inserted into lane 0 of a <2 x i32> vector (upper lane left undef),
+ // reinterpreted as Byte8, and the low bytes are interleaved via UnpackLow.
+ RValue<Short4> Unpack(RValue<Byte4> x, RValue<Byte4> y)
+ {
+ Value *xx = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), x.value, 0);
+ Value *yy = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), y.value, 0);
+
+ return UnpackLow(As<Byte8>(xx), As<Byte8>(yy));
+ }
+
RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
{
if(CPUID::supportsMMX2())
if(!saturate || !CPUID::supportsSSE4_1())
{
- *this = Short4(Int4(int4));
+ *this = Short4(int4);
}
else
{
- *this = As<Short4>(Int2(As<Int4>(x86::packusdw(As<UInt4>(int4), As<UInt4>(int4)))));
+ *this = As<Short4>(Int2(As<Int4>(x86::packusdw(int4, int4))));
}
}
}
}
+ // Broadcast constructor: replicates 'c' into all eight 16-bit lanes.
+ Short8::Short8(short c)
+ {
+ int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
+ storeValue(Nucleus::createConstantVector(constantVector, getType()));
+ }
+
Short8::Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
{
int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
return T(VectorType::get(Short::getType(), 8));
}
+ // Broadcast constructor: replicates 'c' into all eight 16-bit lanes.
+ UShort8::UShort8(unsigned short c)
+ {
+ int64_t constantVector[8] = {c, c, c, c, c, c, c, c};
+ storeValue(Nucleus::createConstantVector(constantVector, getType()));
+ }
+
UShort8::UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
{
int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
// Each Short is packed into each Int in the (Short | Short) format.
// Shifting by 16 will retrieve the original Short value.
- // Shitfing an Int will propagate the sign bit, which will work
+ // Shifting an Int will propagate the sign bit, which will work
// for both positive and negative values of a Short.
*this >>= 16;
}
else
{
RValue<Int4> greater = CmpNLE(x, y);
- return x & greater | y & ~greater;
+ return (x & greater) | (y & ~greater);
}
}
else
{
RValue<Int4> less = CmpLT(x, y);
- return x & less | y & ~less;
+ return (x & less) | (y & ~less);
}
}
else
{
RValue<UInt4> greater = CmpNLE(x, y);
- return x & greater | y & ~greater;
+ return (x & greater) | (y & ~greater);
}
}
else
{
RValue<UInt4> less = CmpLT(x, y);
- return x & less | y & ~less;
+ return (x & less) | (y & ~less);
}
}
RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
{
- return x86::packusdw(x, y); // FIXME: Fallback required
+ // packusdw now takes signed Int4 operands and carries its own non-SSE4.1
+ // fallback, so the old FIXME no longer applies here.
+ return x86::packusdw(As<Int4>(x), As<Int4>(y));
}
Type *UInt4::getType()
RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
{
- if(exactAtPow2)
- {
- // rcpss uses a piecewise-linear approximation which minimizes the relative error
- // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
- return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
- }
- else
- {
- return x86::rcpss(x);
- }
+ // The correction factor is computed with host <xmmintrin.h> intrinsics
+ // (_mm_rcp_ss / _mm_cvtss_f32), which only exist when building on an x86
+ // host — hence the #if guard. Non-x86 hosts fall through to the
+ // uncorrected approximation even when exactAtPow2 is requested.
+ #if defined(__i386__) || defined(__x86_64__)
+ if(exactAtPow2)
+ {
+ // rcpss uses a piecewise-linear approximation which minimizes the relative error
+ // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+ return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+ }
+ #endif
+
+ return x86::rcpss(x);
}
RValue<Float> RcpSqrt_pp(RValue<Float> x)
RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
{
- if(exactAtPow2)
- {
- // rcpps uses a piecewise-linear approximation which minimizes the relative error
- // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
- return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
- }
- else
- {
- return x86::rcpps(x);
- }
+ // As with the scalar overload, the correction factor relies on host
+ // <xmmintrin.h> intrinsics and is therefore compiled only on x86 hosts;
+ // other hosts get the uncorrected rcpps approximation.
+ #if defined(__i386__) || defined(__x86_64__)
+ if(exactAtPow2)
+ {
+ // rcpps uses a piecewise-linear approximation which minimizes the relative error
+ // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+ return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+ }
+ #endif
+
+ return x86::rcpps(x);
}
RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
RValue<Float4> Frac(RValue<Float4> x)
{
+ // Per-lane fractional part: x - floor(x), clamped to just below 1.0
+ // because the subtraction can round up to exactly 1.0 for tiny negative x.
+ Float4 frc;
+
if(CPUID::supportsSSE4_1())
{
- return x - x86::floorps(x);
+ frc = x - x86::floorps(x);
}
else
{
- Float4 frc = x - Float4(Int4(x)); // Signed fractional part
+ frc = x - Float4(Int4(x)); // Signed fractional part.
- return frc + As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));
+ frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f))); // Add 1.0 if negative.
}
+
+ // x - floor(x) can be 1.0 for very small negative x.
+ // Clamp against the value just below 1.0.
+ return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
}
RValue<Float4> Floor(RValue<Float4> x)
RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
{
- return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), V(Nucleus::createConstantInt(offset))));
+ // Constant byte offset; the index is signed (unsignedIndex = false) so
+ // negative offsets are preserved.
+ return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), V(Nucleus::createConstantInt(offset)), false));
}
RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
{
- return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value));
+ // Runtime Int offset; treated as signed (unsignedIndex = false).
+ return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, false));
}
RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
{
- return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value));
+ // Runtime UInt offset; unsignedIndex = true so createGEP zero-extends it
+ // on 64-bit targets instead of letting it sign-extend.
+ return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value, true));
}
RValue<Pointer<Byte>> operator+=(Pointer<Byte> &lhs, int offset)
Nucleus::createUnreachable();
}
- bool branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
+ // Emits a conditional branch (cmp ? bodyBB : endBB) and moves the insert
+ // point to bodyBB. Return type changed to void: the old bool result was
+ // unconditionally true. NOTE(review): confirm no caller consumed it.
+ void branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
{
Nucleus::createCondBr(cmp.value, bodyBB, endBB);
Nucleus::setInsertBlock(bodyBB);
-
- return true;
}
RValue<Long> Ticks()
return As<Byte8>(V(::builder->CreateCall2(packuswb, As<MMX>(x).value, As<MMX>(y).value)));
}
- RValue<UShort8> packusdw(RValue<UInt4> x, RValue<UInt4> y)
+ // Packs signed 32-bit lanes into unsigned 16-bit lanes with saturation
+ // (SSE4.1 packusdw semantics); operands are now signed Int4 accordingly.
+ RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
{
if(CPUID::supportsSSE4_1())
{
}
else
{
- // FIXME: Not an exact replacement!
- return As<UShort8>(packssdw(As<Int4>(x - UInt4(0x00008000, 0x00008000, 0x00008000, 0x00008000)), As<Int4>(y - UInt4(0x00008000, 0x00008000, 0x00008000, 0x00008000))) + Short8(0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u));
+ // Exact emulation: x & ~(x >> 31) clamps negative lanes to zero (the
+ // arithmetic shift yields an all-ones mask for negatives), subtracting
+ // 0x8000 biases values into packssdw's signed saturation range, and
+ // adding 0x8000 back (as unsigned) restores the final result.
+ RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
+ RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
+
+ return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
}
}