(view as text)
diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 76e235e..09f9097 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -204,7 +204,7 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 	{
 		// Oh, RIP addressing.
 		_offsetOrBaseReg = 5;
-		emit->WriteModRM(0, _operandReg&7, 5);
+		emit->WriteModRM(0, _operandReg, _offsetOrBaseReg);
 		//TODO : add some checks
 #ifdef _M_X64
 		u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes;
@@ -328,7 +328,6 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 	}
 }
 
-
 // W = operand extended width (1 if 64-bit)
 // R = register# upper bit
 // X = scale amnt upper bit
@@ -1510,6 +1509,24 @@ void XEmitter::FWAIT()
 	Write8(0x9B);
 }
 
+// Emits an x87 load/store of a 32- or 64-bit real: [REX] D9/DD, then ModR/M (+SIB/disp) whose
+// reg field carries 'op' as the opcode extension. 'arg' is expected to be a memory operand.
+// TODO: make this more generic
+void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, OpArg arg)
+{
+	_assert_msg_(DYNA_REC, bits == 32 || bits == 64, "WriteFloatLoadStore: bits is not 32 or 64");
+	const int mf = (bits == 64) ? 2 : 0;  // x87 memory-format field: 0 = 32-bit real, 2 = 64-bit real
+	// A base/index register in R8-R15 needs REX.B/REX.X ahead of the opcode byte.
+	// NOTE(review): assumes opBits = bits = 0 means "no REX.W / no 8-bit special case" - confirm.
+	arg.WriteRex(this, 0, 0, (int)op);
+	Write8(0xd9 | (mf << 1));  // 0xD9 for the 32-bit form, 0xDD for the 64-bit form
+	arg.WriteRest(this, 0, (X64Reg) op);  // x87 instructions use the reg field of the ModR/M byte as opcode
+}
+
+void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, src);}     // x87 load from src; bits must be 32 or 64 (asserted by WriteFloatLoadStore)
+void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, dest);}   // x87 store to dest; bits must be 32 or 64 (asserted by WriteFloatLoadStore)
+void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, dest);} // x87 store-and-pop to dest; bits must be 32 or 64 (asserted by WriteFloatLoadStore)
+
 void XEmitter::RTDSC() { Write8(0x0F); Write8(0x31); }
 
 // helper routines for setting pointers
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 87e76ef..11581d0 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -98,6 +98,13 @@ enum NormalOp {
 	nrmXCHG,
 };
 
+enum FloatOp {  // x87 load/store selector; the enumerator value is written as the ModR/M reg field, so order matters
+	floatLD,      // 0: used by FLD
+	floatUnused,  // 1: not referenced by any emitter in view; keeps floatST/floatSTP at 2 and 3
+	floatST,      // 2: used by FST
+	floatSTP,     // 3: used by FSTP
+};
+
 class XEmitter;
 
 // RIP addressing does not benefit from micro op fusion on Core arch
@@ -116,6 +123,7 @@ struct OpArg
 	void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
 	void WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, X64Reg regOp2) const;
 	void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF, bool warn_64bit_offset = true) const;
+	void WriteFloatModRM(XEmitter *emit, FloatOp op);
 	void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
 	// This one is public - must be written to
 	u64 offset;  // use RIP-relative as much as possible - 64-bit immediates are not available.
@@ -245,6 +253,7 @@ private:
 	void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
 	void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
 	void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+	void WriteFloatLoadStore(int bits, FloatOp op, OpArg arg);
 	void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
 
 protected:
@@ -425,6 +434,10 @@ public:
 	void REP();
 	void REPNE();
 
+	// x87
+	void FLD(int bits, OpArg src);
+	void FST(int bits, OpArg dest);
+	void FSTP(int bits, OpArg dest);
 	void FWAIT();
 
 	// SSE/SSE2: Floating point arithmetic
diff --git a/Source/Core/Common/x64FPURoundMode.cpp b/Source/Core/Common/x64FPURoundMode.cpp
index 34438d1..f46c600 100644
--- a/Source/Core/Common/x64FPURoundMode.cpp
+++ b/Source/Core/Common/x64FPURoundMode.cpp
@@ -101,8 +101,7 @@ namespace FPURoundMode
 			FTZ,       // flush-to-zero only
 			FTZ | DAZ, // flush-to-zero and denormals-are-zero (may not be supported)
 		};
-		// FIXME: proper (?) non-IEEE mode emulation causes issues in lots of games
-		if (nonIEEEMode && false)
+		if (nonIEEEMode)
 		{
 			csr |= denormalLUT[cpu_info.bFlushToZero];
 		}
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
index bc056e6..ef39a9e 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@@ -49,8 +49,9 @@ void Jit64::lfs(UGeckoInstruction inst)
 	MOV(32, M(&temp32), R(EAX));
 	fpr.Lock(d);
 	fpr.BindToRegister(d, false);
-	CVTSS2SD(fpr.RX(d), M(&temp32));
-	MOVDDUP(fpr.RX(d), fpr.R(d));
+	FLD(32, M(&temp32));
+	FSTP(64, M(&temp64));
+	MOVDDUP(fpr.RX(d), M(&temp64));
 
 	MEMCHECK_END
 
@@ -235,13 +236,18 @@ void Jit64::stfs(UGeckoInstruction inst)
 		return;
 	}
 
+	fpr.BindToRegister(s, true, false);
+	MOVSD(M(&temp64), fpr.RX(s));
+	FLD(64, M(&temp64));
+	FSTP(32, M(&temp32));
+	MOVSS(XMM0, M(&temp32));
+
 	if (gpr.R(a).IsImm())
 	{
 		u32 addr = (u32)(gpr.R(a).offset + offset);
 		if (Memory::IsRAMAddress(addr))
 		{
 			if (cpu_info.bSSSE3) {
-				CVTSD2SS(XMM0, fpr.R(s));
 				PSHUFB(XMM0, M((void *)bswapShuffle1x4));
 				WriteFloatToConstRamAddress(XMM0, addr);
 				return;
@@ -250,7 +256,6 @@ void Jit64::stfs(UGeckoInstruction inst)
 		else if (addr == 0xCC008000)
 		{
 			// Float directly to write gather pipe! Fun!
-			CVTSD2SS(XMM0, fpr.R(s));
 			CALL((void*)asm_routines.fifoDirectWriteFloat);
 			// TODO
 			js.fifoBytesThisBlock += 4;
@@ -275,7 +280,6 @@ void Jit64::stfs(UGeckoInstruction inst)
 
 		MEMCHECK_END
 	}
-	CVTSD2SS(XMM0, fpr.R(s));
 	SafeWriteFloatToReg(XMM0, ABI_PARAM2, RegistersInUse());
 	gpr.UnlockAll();
 	gpr.UnlockAllX();
@@ -294,8 +298,11 @@ void Jit64::stfsx(UGeckoInstruction inst)
 	MOV(32, R(ABI_PARAM1), gpr.R(inst.RB));
 	if (inst.RA)
 		ADD(32, R(ABI_PARAM1), gpr.R(inst.RA));
-	CVTSD2SS(XMM0, fpr.R(inst.RS));
-	MOVD_xmm(R(EAX), XMM0);
+	fpr.BindToRegister(inst.RS, true, false);
+	MOVSD(M(&temp64), fpr.RX(inst.RS));
+	FLD(64, M(&temp64));
+	FSTP(32, M(&temp32));
+	MOV(32, R(EAX), M(&temp32));
 	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse());
 
 	gpr.UnlockAllX();
@@ -336,7 +343,9 @@ void Jit64::lfsx(UGeckoInstruction inst)
 		MEMCHECK_START
 
 		MOV(32, M(&temp32), R(EAX));
-		CVTSS2SD(XMM0, M(&temp32));
+	        FLD(32, M(&temp32));
+	        FSTP(64, M(&temp64));
+	        MOVSD(XMM0, M(&temp64));
 		fpr.Lock(inst.RS);
 		fpr.BindToRegister(inst.RS, false, true);
 		MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0));