diff --git a/Source/Core/Common/CommonTypes.h b/Source/Core/Common/CommonTypes.h
index 3eb131c..4b5969b 100644
--- a/Source/Core/Common/CommonTypes.h
+++ b/Source/Core/Common/CommonTypes.h
@@ -16,16 +16,19 @@
#ifdef _WIN32
#include <tchar.h>
+#include <xmmintrin.h>
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
+typedef __m128 u128;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
+typedef __m128 s128;
#else
@@ -35,11 +38,13 @@ typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
+typedef __uint128_t u128;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
+typedef __int128_t s128;
#endif
// For using windows lock code
diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 76e235e..75fe76f 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -204,7 +204,7 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
{
// Oh, RIP addressing.
_offsetOrBaseReg = 5;
- emit->WriteModRM(0, _operandReg&7, 5);
+ emit->WriteModRM(0, _operandReg, _offsetOrBaseReg);
//TODO : add some checks
#ifdef _M_X64
u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes;
@@ -328,7 +328,6 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
}
}
-
// W = operand extended width (1 if 64-bit)
// R = register# upper bit
// X = scale amnt upper bit
@@ -1391,6 +1390,10 @@ void XEmitter::PSRLQ(X64Reg reg, int shift) {
Write8(shift);
}
+void XEmitter::PSRLQ(X64Reg reg, OpArg arg) {
+ WriteSSEOp(64, 0xd3, true, reg, arg);
+}
+
void XEmitter::PSLLW(X64Reg reg, int shift) {
WriteSSEOp(64, 0x71, true, (X64Reg)6, R(reg));
Write8(shift);
@@ -1438,7 +1441,19 @@ void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {
Write8(0x0f);
Write8(0x38);
Write8(0x00);
- arg.WriteRest(this, 0);
+ arg.WriteRest(this);
+}
+
+void XEmitter::PTEST(X64Reg dest, OpArg arg) {
+ if (!cpu_info.bSSE4_1) {
+ PanicAlert("Trying to use PTEST on a system that doesn't support it. Nobody hears your screams.");
+ }
+ Write8(0x66);
+ Write8(0x0f);
+ Write8(0x38);
+ Write8(0x17);
+ arg.operandReg = dest;
+ arg.WriteRest(this);
}
void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);}
@@ -1459,7 +1474,7 @@ void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDD, true, dest
void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF8, true, dest, arg);}
void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF9, true, dest, arg);}
void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFA, true, dest, arg);}
-void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);}
+void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFB, true, dest, arg);}
void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE8, true, dest, arg);}
void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE9, true, dest, arg);}
@@ -1498,6 +1513,8 @@ void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64,
void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);}
void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);}
void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);}
+void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseAND, false, regOp1, regOp2, arg);}
+void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseANDN, false, regOp1, regOp2, arg);}
// Prefixes
@@ -1510,6 +1527,25 @@ void XEmitter::FWAIT()
Write8(0x9B);
}
+// TODO: make this more generic
+void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, OpArg arg)
+{
+ int mf = 0;
+ switch (bits) {
+ case 32: mf = 0; break;
+ case 64: mf = 2; break;
+ default: _assert_msg_(DYNA_REC, 0, "WriteFloatLoadStore: bits is not 32 or 64");
+ }
+ Write8(0xd9 | (mf << 1));
+ // x87 instructions use the reg field of the ModR/M byte as opcode:
+ arg.WriteRest(this, 0, (X64Reg) op);
+}
+
+void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, src);}
+void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, dest);}
+void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, dest);}
+void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); }
+
void XEmitter::RTDSC() { Write8(0x0F); Write8(0x31); }
// helper routines for setting pointers
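
A hedged aside on the new x87 emitter path (not part of the patch): WriteFloatLoadStore picks the opcode byte from the operand width (0xD9 for 32-bit, 0xDD for 64-bit memory operands) and passes the FloatOp value through as the /reg field of ModR/M, which the x87 memory forms use as an opcode extension. The sketch below mirrors that encoding for the simple register-indirect case only; it skips the SIB/disp32 special cases that OpArg::WriteRest handles, and the helper name is illustrative.

// Standalone sketch of the x87 load/store encoding described above.
#include <cstdint>
#include <cstdio>

enum FloatOp { floatLD = 0, floatST = 2, floatSTP = 3 };

// Encode "op <size> ptr [baseReg]" (mod = 00, no SIB/displacement handling).
static void EncodeFloatLoadStore(int bits, FloatOp op, int baseReg, uint8_t out[2])
{
    int mf = (bits == 64) ? 2 : 0;                   // operand-size field of the opcode
    out[0] = 0xD9 | (mf << 1);                       // 0xD9 (m32) or 0xDD (m64)
    out[1] = (0 << 6) | (op << 3) | (baseReg & 7);   // ModR/M: reg field carries the operation
}

int main()
{
    uint8_t bytes[2];
    EncodeFloatLoadStore(64, floatSTP, 0 /* RAX */, bytes);
    printf("FSTP qword ptr [rax] -> %02X %02X\n", (unsigned)bytes[0], (unsigned)bytes[1]);  // DD 18
    return 0;
}
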
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 87e76ef..8a6b0b1 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -98,6 +98,12 @@ enum NormalOp {
nrmXCHG,
};
+enum FloatOp {
+ floatLD = 0,
+ floatST = 2,
+ floatSTP = 3,
+};
+
class XEmitter;
// RIP addressing does not benefit from micro op fusion on Core arch
@@ -116,6 +122,7 @@ struct OpArg
void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
void WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, X64Reg regOp2) const;
void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF, bool warn_64bit_offset = true) const;
+ void WriteFloatModRM(XEmitter *emit, FloatOp op);
void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
// This one is public - must be written to
u64 offset; // use RIP-relative as much as possible - 64-bit immediates are not available.
@@ -245,6 +252,7 @@ private:
void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+ void WriteFloatLoadStore(int bits, FloatOp op, OpArg arg);
void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
protected:
@@ -425,6 +433,28 @@ public:
void REP();
void REPNE();
+ // x87
+ enum x87StatusWordBits {
+ x87_InvalidOperation = 0x1,
+ x87_DenormalizedOperand = 0x2,
+ x87_DivisionByZero = 0x4,
+ x87_Overflow = 0x8,
+ x87_Underflow = 0x10,
+ x87_Precision = 0x20,
+ x87_StackFault = 0x40,
+ x87_ErrorSummary = 0x80,
+ x87_C0 = 0x100,
+ x87_C1 = 0x200,
+ x87_C2 = 0x400,
+ x87_TopOfStack = 0x2000 | 0x1000 | 0x800,
+ x87_C3 = 0x4000,
+ x87_FPUBusy = 0x8000,
+ };
+
+ void FLD(int bits, OpArg src);
+ void FST(int bits, OpArg dest);
+ void FSTP(int bits, OpArg dest);
+ void FNSTSW_AX();
void FWAIT();
// SSE/SSE2: Floating point arithmetic
@@ -447,14 +477,6 @@ public:
// SSE/SSE2: Floating point bitwise (yes)
void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
- void ANDSS(X64Reg regOp, OpArg arg);
- void ANDSD(X64Reg regOp, OpArg arg);
- void ANDNSS(X64Reg regOp, OpArg arg);
- void ANDNSD(X64Reg regOp, OpArg arg);
- void ORSS(X64Reg regOp, OpArg arg);
- void ORSD(X64Reg regOp, OpArg arg);
- void XORSS(X64Reg regOp, OpArg arg);
- void XORSD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
void ADDPS(X64Reg regOp, OpArg arg);
@@ -559,6 +581,7 @@ public:
void PUNPCKLWD(X64Reg dest, const OpArg &arg);
void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
+ void PTEST(X64Reg dest, OpArg arg);
void PAND(X64Reg dest, OpArg arg);
void PANDN(X64Reg dest, OpArg arg);
void PXOR(X64Reg dest, OpArg arg);
@@ -614,6 +637,7 @@ public:
void PSRLW(X64Reg reg, int shift);
void PSRLD(X64Reg reg, int shift);
void PSRLQ(X64Reg reg, int shift);
+ void PSRLQ(X64Reg reg, OpArg arg);
void PSLLW(X64Reg reg, int shift);
void PSLLD(X64Reg reg, int shift);
@@ -628,6 +652,8 @@ public:
void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+ void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+ void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg);
void RTDSC();
diff --git a/Source/Core/Common/x64FPURoundMode.cpp b/Source/Core/Common/x64FPURoundMode.cpp
index 34438d1..e695ca5 100644
--- a/Source/Core/Common/x64FPURoundMode.cpp
+++ b/Source/Core/Common/x64FPURoundMode.cpp
@@ -16,11 +16,11 @@ static const unsigned short FPU_ROUND_MASK = 3 << 10;
#endif
// OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register)
-const u32 EXCEPTION_MASK = 0x1F80;
+static const u32 EXCEPTION_MASK = 0x1F80;
// Denormals-Are-Zero (non-IEEE mode: denormal inputs are set to +/- 0)
-const u32 DAZ = 0x40;
+static const u32 DAZ = 0x40;
// Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
-const u32 FTZ = 0x8000;
+static const u32 FTZ = 0x8000;
namespace FPURoundMode
{
@@ -101,8 +101,7 @@ namespace FPURoundMode
FTZ, // flush-to-zero only
FTZ | DAZ, // flush-to-zero and denormals-are-zero (may not be supported)
};
- // FIXME: proper (?) non-IEEE mode emulation causes issues in lots of games
- if (nonIEEEMode && false)
+ if (nonIEEEMode)
{
csr |= denormalLUT[cpu_info.bFlushToZero];
}
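
For context on the non-IEEE change above: FTZ and DAZ are MXCSR control bits, so enabling the mode amounts to OR-ing them into that register. A minimal sketch outside Dolphin's FPURoundMode wrapper; the helper name and the daz_supported flag (Dolphin derives it from cpu_info.bFlushToZero) are illustrative.

#include <xmmintrin.h>

static const unsigned int DAZ = 0x0040;   // denormals-are-zero: denormal inputs read as +/-0
static const unsigned int FTZ = 0x8000;   // flush-to-zero: denormal results written as +/-0

void EnableNonIEEEMode(bool daz_supported)
{
    unsigned int csr = _mm_getcsr();
    csr |= FTZ;
    if (daz_supported)   // DAZ is not available on all SSE-capable CPUs
        csr |= DAZ;
    _mm_setcsr(csr);
}
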
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
index 9190a18..1fdca8b 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -233,4 +233,39 @@ inline u32 ConvertToSingleFTZ(u64 x)
}
}
+inline u64 ConvertToDouble(u32 _x)
+{
+ // This is a little-endian re-implementation of the algorithm described in
+ // the PowerPC Programming Environments Manual for loading single-precision
+ // floating point numbers.
+ // See page 566 of http://www.freescale.com/files/product/doc/MPCFPE32B.pdf
+
+ u64 x = _x;
+ u64 exp = (x >> 23) & 0xff;
+ u64 frac = x & 0x007fffff;
+
+ if (exp > 0 && exp < 255) // Normal number
+ {
+ u64 y = !(exp >> 7);
+ u64 z = y << 61 | y << 60 | y << 59;
+ return ((x & 0xc0000000) << 32) | z | ((x & 0x3fffffff) << 29);
+ }
+ else if (exp == 0 && frac != 0) // Subnormal number
+ {
+ exp = 1023 - 126;
+ do
+ {
+ frac <<= 1;
+ exp -= 1;
+ } while ((frac & 0x00800000) == 0);
+ return ((x & 0x80000000) << 32) | (exp << 52) | ((frac & 0x007fffff) << 29);
+ }
+ else // QNaN, SNaN or Zero
+ {
+ u64 y = exp >> 7;
+ u64 z = y << 61 | y << 60 | y << 59;
+ return ((x & 0xc0000000) << 32) | z | ((x & 0x3fffffff) << 29);
+ }
+}
+
#endif
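
A hedged usage sketch for ConvertToDouble (not part of the patch): for normal numbers, zeroes, denormals and QNaNs it should produce the same bit pattern as the hardware float-to-double conversion (with FTZ/DAZ disabled); only SNaNs differ, since the hardware conversion sets the quiet bit while this routine carries the payload over unchanged. The include path is illustrative.

#include <cstdint>
#include <cstring>
#include <cstdio>
#include "Interpreter_FPUtils.h"   // ConvertToDouble from the hunk above

int main()
{
    float f = 1.5e-42f;                    // a denormal single
    uint32_t bits32;
    std::memcpy(&bits32, &f, sizeof(bits32));

    double d = f;                          // hardware conversion
    uint64_t hw_bits;
    std::memcpy(&hw_bits, &d, sizeof(hw_bits));

    uint64_t ppc_bits = ConvertToDouble(bits32);
    printf("hw=%016llx ppc=%016llx match=%d\n",
           (unsigned long long)hw_bits, (unsigned long long)ppc_bits,
           hw_bits == ppc_bits);           // expected: match=1
    return 0;
}
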
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp
index 3fb441f..0356bd4 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp
@@ -93,9 +93,9 @@ void Interpreter::lfs(UGeckoInstruction _inst)
u32 uTemp = Memory::Read_U32(Helper_Get_EA(_inst));
if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI))
{
- double value = *(float*)&uTemp;
- rPS0(_inst.FD) = value;
- rPS1(_inst.FD) = value;
+ u64 value = ConvertToDouble(uTemp);
+ riPS0(_inst.FD) = value;
+ riPS1(_inst.FD) = value;
}
}
@@ -105,9 +105,9 @@ void Interpreter::lfsu(UGeckoInstruction _inst)
u32 uTemp = Memory::Read_U32(uAddress);
if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI))
{
- double value = *(float*)&uTemp;
- rPS0(_inst.FD) = value;
- rPS1(_inst.FD) = value;
+ u64 value = ConvertToDouble(uTemp);
+ riPS0(_inst.FD) = value;
+ riPS1(_inst.FD) = value;
m_GPR[_inst.RA] = uAddress;
}
@@ -119,9 +119,9 @@ void Interpreter::lfsux(UGeckoInstruction _inst)
u32 uTemp = Memory::Read_U32(uAddress);
if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI))
{
- double value = *(float*)&uTemp;
- rPS0(_inst.FD) = value;
- rPS1(_inst.FD) = value;
+ u64 value = ConvertToDouble(uTemp);
+ riPS0(_inst.FD) = value;
+ riPS1(_inst.FD) = value;
m_GPR[_inst.RA] = uAddress;
}
}
@@ -131,9 +131,9 @@ void Interpreter::lfsx(UGeckoInstruction _inst)
u32 uTemp = Memory::Read_U32(Helper_Get_EA_X(_inst));
if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI))
{
- double value = *(float*)&uTemp;
- rPS0(_inst.FD) = value;
- rPS1(_inst.FD) = value;
+ u64 value = ConvertToDouble(uTemp);
+ riPS0(_inst.FD) = value;
+ riPS1(_inst.FD) = value;
}
}
@@ -282,9 +282,6 @@ void Interpreter::stfdu(UGeckoInstruction _inst)
void Interpreter::stfs(UGeckoInstruction _inst)
{
- //double value = rPS0(_inst.FS);
- //float fTemp = (float)value;
- //Memory::Write_U32(*(u32*)&fTemp, Helper_Get_EA(_inst));
Memory::Write_U32(ConvertToSingle(riPS0(_inst.FS)), Helper_Get_EA(_inst));
}
diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp
index 1be04a0..c4b063e 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp
@@ -378,7 +378,7 @@ void RegCache::Flush(FlushMode mode)
{
if (locks[i])
{
- PanicAlert("Someone forgot to unlock PPC reg %i.", i);
+ PanicAlert("Someone forgot to unlock PPC reg %i (X64 reg %i).", i, RX(i));
}
if (regs[i].away)
{
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
index bc056e6..0aac678 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@@ -12,6 +12,8 @@
#include "JitAsm.h"
#include "JitRegCache.h"
+namespace {
+
// pshufb todo: MOVQ
const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(bswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
@@ -19,11 +21,10 @@ const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10,
const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0};
const u8 GC_ALIGNED16(bswapShuffle2x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
-namespace {
-
u64 GC_ALIGNED16(temp64);
-u32 GC_ALIGNED16(temp32);
+
}
+
// TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common,
// and pshufb could help a lot.
// Also add hacks for things like lfs/stfs the same reg consecutively, that is, simple memory moves.
@@ -46,11 +47,9 @@ void Jit64::lfs(UGeckoInstruction inst)
MEMCHECK_START
- MOV(32, M(&temp32), R(EAX));
fpr.Lock(d);
fpr.BindToRegister(d, false);
- CVTSS2SD(fpr.RX(d), M(&temp32));
- MOVDDUP(fpr.RX(d), fpr.R(d));
+ ConvertSingleToDouble(fpr.RX(d), EAX, true);
MEMCHECK_END
@@ -235,13 +234,15 @@ void Jit64::stfs(UGeckoInstruction inst)
return;
}
+ fpr.BindToRegister(s, true, false);
+ ConvertDoubleToSingle(XMM0, fpr.RX(s));
+
if (gpr.R(a).IsImm())
{
u32 addr = (u32)(gpr.R(a).offset + offset);
if (Memory::IsRAMAddress(addr))
{
if (cpu_info.bSSSE3) {
- CVTSD2SS(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void *)bswapShuffle1x4));
WriteFloatToConstRamAddress(XMM0, addr);
return;
@@ -250,7 +251,6 @@ void Jit64::stfs(UGeckoInstruction inst)
else if (addr == 0xCC008000)
{
// Float directly to write gather pipe! Fun!
- CVTSD2SS(XMM0, fpr.R(s));
CALL((void*)asm_routines.fifoDirectWriteFloat);
// TODO
js.fifoBytesThisBlock += 4;
@@ -260,7 +260,6 @@ void Jit64::stfs(UGeckoInstruction inst)
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
- fpr.Lock(s);
MOV(32, R(ABI_PARAM2), gpr.R(a));
ADD(32, R(ABI_PARAM2), Imm32(offset));
if (update && offset)
@@ -275,7 +274,6 @@ void Jit64::stfs(UGeckoInstruction inst)
MEMCHECK_END
}
- CVTSD2SS(XMM0, fpr.R(s));
SafeWriteFloatToReg(XMM0, ABI_PARAM2, RegistersInUse());
gpr.UnlockAll();
gpr.UnlockAllX();
@@ -290,11 +288,14 @@ void Jit64::stfsx(UGeckoInstruction inst)
// We can take a shortcut here - it's not likely that a hardware access would use this instruction.
gpr.FlushLockX(ABI_PARAM1);
- fpr.Lock(inst.RS);
MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
if (inst.RA)
ADD(32, R(ABI_PARAM1), gpr.R(inst.RA));
- CVTSD2SS(XMM0, fpr.R(inst.RS));
+
+ int s = inst.RS;
+ fpr.Lock(s);
+ fpr.BindToRegister(s, true, false);
+ ConvertDoubleToSingle(XMM0, fpr.RX(s));
MOVD_xmm(R(EAX), XMM0);
SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse());
@@ -313,21 +314,20 @@ void Jit64::lfsx(UGeckoInstruction inst)
{
ADD(32, R(EAX), gpr.R(inst.RA));
}
+ fpr.Lock(inst.RS);
+ fpr.BindToRegister(inst.RS, false);
+ X64Reg s = fpr.RX(inst.RS);
if (cpu_info.bSSSE3 && !js.memcheck) {
- fpr.Lock(inst.RS);
- fpr.BindToRegister(inst.RS, false, true);
- X64Reg r = fpr.R(inst.RS).GetSimpleReg();
#ifdef _M_IX86
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
- MOVD_xmm(r, MDisp(EAX, (u32)Memory::base));
+ MOVD_xmm(XMM0, MDisp(EAX, (u32)Memory::base));
#else
- MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0));
+ MOVD_xmm(XMM0, MComplex(RBX, EAX, SCALE_1, 0));
#endif
MEMCHECK_START
- PSHUFB(r, M((void *)bswapShuffle1x4));
- CVTSS2SD(r, R(r));
- MOVDDUP(r, R(r));
+ PSHUFB(XMM0, M((void *)bswapShuffle1x4));
+ ConvertSingleToDouble(s, XMM0);
MEMCHECK_END
} else {
@@ -335,11 +335,7 @@ void Jit64::lfsx(UGeckoInstruction inst)
MEMCHECK_START
- MOV(32, M(&temp32), R(EAX));
- CVTSS2SD(XMM0, M(&temp32));
- fpr.Lock(inst.RS);
- fpr.BindToRegister(inst.RS, false, true);
- MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0));
+ ConvertSingleToDouble(s, EAX, true);
MEMCHECK_END
}
diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
index 5c9d207..be87a77 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@@ -1288,10 +1288,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit) {
}
case DupSingleToMReg: {
if (!thisUsed) break;
- X64Reg reg = fregURegWithoutMov(RI, I);
- Jit->CVTSS2SD(reg, fregLocForInst(RI, getOp1(I)));
- Jit->MOVDDUP(reg, R(reg));
- RI.fregs[reg] = I;
+
+ X64Reg input = fregEnsureInReg(RI, getOp1(I));
+ X64Reg output = fregURegWithoutMov(RI, I);
+ Jit->ConvertSingleToDouble(output, input);
+
+ RI.fregs[output] = I;
fregNormalRegClear(RI, I);
break;
}
@@ -1412,9 +1414,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit) {
}
case DoubleToSingle: {
if (!thisUsed) break;
- X64Reg reg = fregURegWithoutMov(RI, I);
- Jit->CVTSD2SS(reg, fregLocForInst(RI, getOp1(I)));
- RI.fregs[reg] = I;
+
+ X64Reg input = fregEnsureInReg(RI, getOp1(I));
+ X64Reg output = fregURegWithoutMov(RI, I);
+ Jit->ConvertDoubleToSingle(output, input);
+
+ RI.fregs[output] = I;
fregNormalRegClear(RI, I);
break;
}
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 49a83e1..a824810 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -416,6 +416,93 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm) {
}
}
+static u32 GC_ALIGNED16(temp32);
+static u64 GC_ALIGNED16(temp64);
+#ifdef _WIN32
+static const u128 GC_ALIGNED16(single_qnan_bit) = _mm_set_epi64x(0, 0x00400000);
+static const u128 GC_ALIGNED16(single_exponent) = _mm_set_epi64x(0, 0x7f800000);
+static const u128 GC_ALIGNED16(double_qnan_bit) = _mm_set_epi64x(0, 0x0008000000000000);
+static const u128 GC_ALIGNED16(double_exponent) = _mm_set_epi64x(0, 0x7ff0000000000000);
+#else
+static const u128 GC_ALIGNED16(single_qnan_bit) = 0x00400000;
+static const u128 GC_ALIGNED16(single_exponent) = 0x7f800000;
+static const u128 GC_ALIGNED16(double_qnan_bit) = 0x0008000000000000;
+static const u128 GC_ALIGNED16(double_exponent) = 0x7ff0000000000000;
+#endif
+
+// Since the following two functions are used in non-arithmetic PPC float instructions,
+// they must convert floats bit-exact and never flush denormals to zero or turn SNaNs into QNaNs.
+// This means we can't use CVTSS2SD/CVTSD2SS :(
+// The x87 FPU doesn't even support flush-to-zero so we can use FLD+FSTP even on denormals.
+// If the number is a NaN, make sure to set the QNaN bit back to its original value.
+
+void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
+{
+ if (src_is_gpr) {
+ MOV(32, M(&temp32), R(src));
+ MOVD_xmm(XMM1, R(src));
+ } else {
+ MOVSS(M(&temp32), src);
+ MOVSS(R(XMM1), src);
+ }
+ FLD(32, M(&temp32));
+ CCFlags cond;
+ if (cpu_info.bSSE4_1) {
+ PTEST(XMM1, M((void *)&single_exponent));
+ cond = CC_NC;
+ } else {
+ FNSTSW_AX();
+ TEST(16, R(AX), Imm16(x87_InvalidOperation));
+ cond = CC_Z;
+ }
+ FSTP(64, M(&temp64));
+ MOVSD(dst, M(&temp64));
+ FixupBranch dont_reset_qnan_bit = J_CC(cond);
+
+ PANDN(XMM1, M((void *)&single_qnan_bit));
+ PSLLQ(XMM1, 29);
+ if (cpu_info.bAVX) {
+ VPANDN(dst, XMM1, R(dst));
+ } else {
+ PANDN(XMM1, R(dst));
+ MOVSD(dst, R(XMM1));
+ }
+
+ SetJumpTarget(dont_reset_qnan_bit);
+ MOVDDUP(dst, R(dst));
+}
+
+void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
+{
+ MOVSD(M(&temp64), src);
+ MOVSD(XMM1, R(src));
+ FLD(64, M(&temp64));
+ CCFlags cond;
+ if (cpu_info.bSSE4_1) {
+ PTEST(XMM1, M((void *)&double_exponent));
+ cond = CC_NC;
+ } else {
+ FNSTSW_AX();
+ TEST(16, R(AX), Imm16(x87_InvalidOperation));
+ cond = CC_Z;
+ }
+ FSTP(32, M(&temp32));
+ MOVSS(XMM0, M(&temp32));
+ FixupBranch dont_reset_qnan_bit = J_CC(cond);
+
+ PANDN(XMM1, M((void *)&double_qnan_bit));
+ PSRLQ(XMM1, 29);
+ if (cpu_info.bAVX) {
+ VPANDN(XMM0, XMM1, R(XMM0));
+ } else {
+ PANDN(XMM1, R(XMM0));
+ MOVSS(XMM0, R(XMM1));
+ }
+
+ SetJumpTarget(dont_reset_qnan_bit);
+ MOVDDUP(dst, R(XMM0));
+}
+
void EmuCodeBlock::JitClearCA()
{
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
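
To make the comment in the hunk above concrete: the reason CVTSS2SD/CVTSD2SS are avoided is that they quieten SNaNs (and honour FTZ/DAZ), so the conversion is not bit-exact. A minimal standalone demonstration, assuming an SSE2 target; it illustrates the problem and is not code from the patch.

#include <emmintrin.h>
#include <cstdint>
#include <cstring>
#include <cstdio>

int main()
{
    uint32_t snan32 = 0x7f800001;          // single-precision SNaN (quiet bit clear)
    float f;
    std::memcpy(&f, &snan32, sizeof(f));

    // CVTSS2SD: converts the SNaN to a QNaN by setting mantissa bit 51.
    __m128d conv = _mm_cvtss_sd(_mm_setzero_pd(), _mm_set_ss(f));
    double d = _mm_cvtsd_f64(conv);
    uint64_t bits64;
    std::memcpy(&bits64, &d, sizeof(bits64));

    // Prints 7ff8000020000000: the payload (1) is shifted up by 29 bits as expected,
    // but the quiet bit is now set. ConvertSingleToDouble's PANDN fixup clears that
    // bit again so the original SNaN bit pattern survives.
    printf("%016llx\n", (unsigned long long)bits64);
    return 0;
}
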
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index a321ddd..229a81f 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -48,6 +48,10 @@ public:
void ForceSinglePrecisionS(Gen::X64Reg xmm);
void ForceSinglePrecisionP(Gen::X64Reg xmm);
+
+ // AX might get trashed
+ void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
+ void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src);
protected:
std::unordered_map<u8 *, u32> registersInUseAtLoc;
};