diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp index 3545435881..b38b645d82 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp @@ -97,8 +97,9 @@ Inter-block dead condition register elimination (Likely significant win Optimize conditions for conditional branches. General dead register elimination. Inter-block inlining. -Track down a few correctness bugs (I think there's something wrong - with my branches, but I haven't been able to figure it out). +Track down issues with new JIT + dual-core mode (I think I'm going to + need help with this one; I'm not very familiar with the + dual-core code.) Specialized slw/srw/sraw; I think there are some tricks that could have a non-trivial effect, and there are significantly shorter implementations for 64-bit involving abusing 64-bit shifts. @@ -502,16 +503,21 @@ struct RegInfo { InstLoc FirstI; std::vector IInfo; InstLoc regs[16]; + InstLoc fregs[16]; unsigned numSpills; + unsigned numFSpills; bool MakeProfile; bool UseProfile; unsigned numProfiledLoads; unsigned exitNumber; RegInfo(Jit64* j, InstLoc f, unsigned insts) : Jit(j), FirstI(f), IInfo(insts) { - for (unsigned i = 0; i < 16; i++) + for (unsigned i = 0; i < 16; i++) { regs[i] = 0; + fregs[i] = 0; + } numSpills = 0; + numFSpills = 0; numProfiledLoads = 0; exitNumber = 0; MakeProfile = UseProfile = false; @@ -533,6 +539,7 @@ static unsigned regReadUse(RegInfo& R, InstLoc I) { static unsigned SlotSet[1000]; static unsigned ProfiledLoads[1000]; +static u8 GC_ALIGNED16(FSlotSet[16*1000]); static OpArg regLocForSlot(RegInfo& RI, unsigned slot) { return M(&SlotSet[slot - 1]); @@ -558,57 +565,86 @@ static void regSpill(RegInfo& RI, X64Reg reg) { RI.regs[reg] = 0; } +static OpArg fregLocForSlot(RegInfo& RI, unsigned slot) { + return M(&FSlotSet[slot*16]); +} + +static unsigned fregCreateSpill(RegInfo& RI, InstLoc I) { + unsigned newSpill = ++RI.numFSpills; + RI.IInfo[I - RI.FirstI] |= newSpill << 16; + return newSpill; +} + +static unsigned fregGetSpill(RegInfo& RI, InstLoc I) { + return RI.IInfo[I - RI.FirstI] >> 16; +} + +static void fregSpill(RegInfo& RI, X64Reg reg) { + if (!RI.fregs[reg]) return; + unsigned slot = fregGetSpill(RI, RI.fregs[reg]); + if (!slot) { + slot = fregCreateSpill(RI, RI.fregs[reg]); + RI.Jit->MOVAPD(fregLocForSlot(RI, slot), reg); + } + RI.fregs[reg] = 0; +} + +// ECX is scratch, so we don't allocate it +static X64Reg RegAllocOrder[] = {EDI, ESI, EBP, EBX, EDX, EAX}; +static unsigned RegAllocSize = sizeof(RegAllocOrder) / sizeof(X64Reg); +static X64Reg FRegAllocOrder[] = {XMM2, XMM3, XMM4, XMM5, XMM6, XMM7}; +static unsigned FRegAllocSize = sizeof(FRegAllocOrder) / sizeof(X64Reg); + static X64Reg regFindFreeReg(RegInfo& RI) { - if (RI.regs[EDI] == 0) return EDI; - if (RI.regs[ESI] == 0) return ESI; - if (RI.regs[EBP] == 0) return EBP; - if (RI.regs[EBX] == 0) return EBX; - if (RI.regs[EDX] == 0) return EDX; - if (RI.regs[EAX] == 0) return EAX; - // ECX is scratch, so we don't allocate it - static X64Reg regs[] = {EDI, ESI, EBP, EBX, EDX, EAX}; + for (unsigned i = 0; i < RegAllocSize; i++) + if (RI.regs[RegAllocOrder[i]] == 0) + return RegAllocOrder[i]; + static unsigned nextReg = 0; - X64Reg reg = regs[nextReg++ % 6]; + X64Reg reg = RegAllocOrder[nextReg++ % RegAllocSize]; regSpill(RI, reg); return reg; } +static X64Reg fregFindFreeReg(RegInfo& RI) { + for (unsigned i = 0; i < FRegAllocSize; i++) + if (RI.fregs[FRegAllocOrder[i]] == 0) + return FRegAllocOrder[i]; + 
// XMM0/1 are scratch, so we don't allocate it + fregSpill(RI, XMM7); + return XMM7; +} + static OpArg regLocForInst(RegInfo& RI, InstLoc I) { - if (RI.regs[EDI] == I) return R(EDI); - if (RI.regs[ESI] == I) return R(ESI); - if (RI.regs[EBP] == I) return R(EBP); - if (RI.regs[EBX] == I) return R(EBX); - if (RI.regs[EDX] == I) return R(EDX); - if (RI.regs[EAX] == I) return R(EAX); - if (RI.regs[ECX] == I) return R(ECX); + for (unsigned i = 0; i < RegAllocSize; i++) + if (RI.regs[RegAllocOrder[i]] == I) + return R(RegAllocOrder[i]); if (regGetSpill(RI, I) == 0) PanicAlert("Retrieving unknown spill slot?!"); return regLocForSlot(RI, regGetSpill(RI, I)); } +static OpArg fregLocForInst(RegInfo& RI, InstLoc I) { + for (unsigned i = 0; i < FRegAllocSize; i++) + if (RI.fregs[FRegAllocOrder[i]] == I) + return R(FRegAllocOrder[i]); + + if (fregGetSpill(RI, I) == 0) + PanicAlert("Retrieving unknown spill slot?!"); + return fregLocForSlot(RI, fregGetSpill(RI, I)); +} + static void regClearInst(RegInfo& RI, InstLoc I) { - if (RI.regs[EDI] == I) { - RI.regs[EDI] = 0; - } - if (RI.regs[ESI] == I) { - RI.regs[ESI] = 0; - } - if (RI.regs[EBP] == I) { - RI.regs[EBP] = 0; - } - if (RI.regs[EBX] == I) { - RI.regs[EBX] = 0; - } - if (RI.regs[EDX] == I) { - RI.regs[EDX] = 0; - } - if (RI.regs[EAX] == I) { - RI.regs[EAX] = 0; - } - if (RI.regs[ECX] == I) { - RI.regs[ECX] = 0; - } + for (unsigned i = 0; i < RegAllocSize; i++) + if (RI.regs[RegAllocOrder[i]] == I) + RI.regs[RegAllocOrder[i]] = 0; +} + +static void fregClearInst(RegInfo& RI, InstLoc I) { + for (unsigned i = 0; i < FRegAllocSize; i++) + if (RI.fregs[FRegAllocOrder[i]] == I) + RI.fregs[FRegAllocOrder[i]] = 0; } static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) { @@ -645,6 +681,20 @@ static X64Reg regBinLHSReg(RegInfo& RI, InstLoc I) { return reg; } +static void regNormalRegClear(RegInfo& RI, InstLoc I) { + if (RI.IInfo[I - RI.FirstI] & 4) + regClearInst(RI, getOp1(I)); + if (RI.IInfo[I - RI.FirstI] & 8) + regClearInst(RI, getOp2(I)); +} + +static void fregNormalRegClear(RegInfo& RI, InstLoc I) { + if (RI.IInfo[I - RI.FirstI] & 4) + fregClearInst(RI, getOp1(I)); + if (RI.IInfo[I - RI.FirstI] & 8) + fregClearInst(RI, getOp2(I)); +} + static void regEmitBinInst(RegInfo& RI, InstLoc I, void (Jit64::*op)(int, const OpArg&, const OpArg&)) { @@ -660,11 +710,11 @@ static void regEmitBinInst(RegInfo& RI, InstLoc I, (RI.Jit->*op)(32, R(reg), regLocForInst(RI, getOp2(I))); } RI.regs[reg] = I; + regNormalRegClear(RI, I); } // Mark and calculation routines for profiled load/store addresses // Could be extended to unprofiled addresses. -// FIXME: Finish/activate! 
static void regMarkMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) { if (isImm(*AI)) { unsigned addr = RI.Build->GetImmValue(AI); @@ -743,7 +793,6 @@ static OpArg regBuildMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, } return MDisp(baseReg, offset); } -// end FIXME static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) { if (RI.UseProfile) { @@ -844,7 +893,6 @@ static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size) { RI.Jit->js.fifoBytesThisBlock += Size >> 3; if (RI.IInfo[I - RI.FirstI] & 4) regClearInst(RI, getOp1(I)); - //regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false); regClearDeadMemAddress(RI, I, getOp2(I), 2); return; } @@ -878,6 +926,7 @@ static void regEmitShiftInst(RegInfo& RI, InstLoc I, RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I))); (RI.Jit->*op)(32, R(reg), R(ECX)); RI.regs[reg] = I; + regNormalRegClear(RI, I); } static void regStoreInstToConstLoc(RegInfo& RI, unsigned width, InstLoc I, @@ -930,7 +979,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { RegInfo RI(Jit, ibuild->getFirstInst(), ibuild->getNumInsts()); RI.Build = ibuild; RI.UseProfile = UseProfile; - RI.MakeProfile = !RI.UseProfile; + RI.MakeProfile = false;//!RI.UseProfile; // Pass to compute liveness ibuild->StartBackPass(); for (unsigned index = RI.IInfo.size() - 1; index != -1U; --index) { @@ -949,12 +998,14 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case LoadCarry: case LoadCTR: case LoadMSR: + case LoadFReg: case BlockEnd: case BlockStart: case InterpreterFallback: case SystemCall: case RFIExit: case InterpreterBranch: + case IdleLoop: // No liveness effects break; case Tramp: @@ -965,6 +1016,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case SExt16: case BSwap32: case BSwap16: + case DupSingleToMReg: + case DoubleToSingle: + case ExpandPackedToMReg: if (thisUsed) regMarkUse(RI, I, getOp1(I), 1); break; @@ -973,6 +1027,10 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case Load32: regMarkMemAddress(RI, I, getOp1(I), 1); break; + case LoadSingle: + case LoadPaired: + regMarkUse(RI, I, getOp1(I), 1); + break; case StoreCR: case StoreCarry: regMarkUse(RI, I, getOp1(I), 1); @@ -981,6 +1039,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case StoreLink: case StoreCTR: case StoreMSR: + case StoreFReg: if (!isImm(*getOp1(I))) regMarkUse(RI, I, getOp1(I), 1); break; @@ -1000,6 +1059,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case ICmpUgt: case ICmpSle: case ICmpSgt: + case FSMul: + case FSAdd: + case InsertDoubleInMReg: if (thisUsed) { regMarkUse(RI, I, getOp1(I), 1); if (!isImm(*getOp2(I))) @@ -1041,6 +1103,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { case InterpreterFallback: { unsigned InstCode = ibuild->GetImmValue(getOp1(I)); unsigned InstLoc = ibuild->GetImmValue(getOp2(I)); + // There really shouldn't be anything live across an + // interpreter call at the moment, but optimizing interpreter + // calls isn't completely out of the question... 
regSpillCallerSaved(RI); Jit->MOV(32, M(&PC), Imm32(InstLoc)); Jit->MOV(32, M(&NPC), Imm32(InstLoc+4)); @@ -1089,6 +1154,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { unsigned ppcreg = *I >> 16; regStoreInstToConstLoc(RI, 32, getOp1(I), &PowerPC::ppcState.gpr[ppcreg]); + regNormalRegClear(RI, I); break; } case StoreCR: { @@ -1096,18 +1162,22 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { unsigned ppcreg = *I >> 16; // CAUTION: uses 8-bit reg! Jit->MOV(8, M(&PowerPC::ppcState.cr_fast[ppcreg]), R(ECX)); + regNormalRegClear(RI, I); break; } case StoreLink: { regStoreInstToConstLoc(RI, 32, getOp1(I), &LR); + regNormalRegClear(RI, I); break; } case StoreCTR: { regStoreInstToConstLoc(RI, 32, getOp1(I), &CTR); + regNormalRegClear(RI, I); break; } case StoreMSR: { regStoreInstToConstLoc(RI, 32, getOp1(I), &MSR); + regNormalRegClear(RI, I); break; } case StoreCarry: { @@ -1118,6 +1188,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { Jit->SetJumpTarget(nocarry); Jit->JitClearCA(); Jit->SetJumpTarget(cont); + regNormalRegClear(RI, I); break; } case Load8: { @@ -1150,6 +1221,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); Jit->MOVSX(32, 8, reg, R(ECX)); RI.regs[reg] = I; + regNormalRegClear(RI, I); break; } case SExt16: { @@ -1157,6 +1229,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { X64Reg reg = regUReg(RI, I); Jit->MOVSX(32, 16, reg, regLocForInst(RI, getOp1(I))); RI.regs[reg] = I; + regNormalRegClear(RI, I); break; } case And: { @@ -1199,6 +1272,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { Jit->IMUL(32, reg, regLocForInst(RI, getOp2(I))); } RI.regs[reg] = I; + regNormalRegClear(RI, I); break; } case Rol: { @@ -1228,6 +1302,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { X64Reg reg = regFindFreeReg(RI); Jit->MOVZX(32, 8, reg, R(ECX)); RI.regs[reg] = I; + regNormalRegClear(RI, I); break; } case ICmpUgt: { @@ -1237,6 +1312,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { X64Reg reg = regFindFreeReg(RI); Jit->MOVZX(32, 8, reg, R(ECX)); RI.regs[reg] = I; + regNormalRegClear(RI, I); break; } case ICmpSle: { @@ -1246,6 +1322,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { X64Reg reg = regFindFreeReg(RI); Jit->MOVZX(32, 8, reg, R(ECX)); RI.regs[reg] = I; + regNormalRegClear(RI, I); break; } case ICmpCRUnsigned: { @@ -1264,6 +1341,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { Jit->SetJumpTarget(continue1); Jit->SetJumpTarget(continue2); RI.regs[reg] = I; + regNormalRegClear(RI, I); break; } case ICmpCRSigned: { @@ -1282,6 +1360,102 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { Jit->SetJumpTarget(continue1); Jit->SetJumpTarget(continue2); RI.regs[reg] = I; + regNormalRegClear(RI, I); + break; + } + case LoadSingle: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); + RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false); + Jit->MOVD_xmm(reg, R(ECX)); + RI.fregs[reg] = I; + regNormalRegClear(RI, I); + break; + } + case LoadPaired: { + if (!thisUsed) break; + regSpill(RI, EAX); + regSpill(RI, EDX); + X64Reg reg = fregFindFreeReg(RI); + unsigned quantreg = *I >> 16; + Jit->MOVZX(32, 16, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + quantreg]) + 2)); + 
Jit->MOVZX(32, 8, EDX, R(AL)); + // FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]! + Jit->SHL(32, R(EDX), Imm8(2)); + Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); + Jit->CALLptr(MDisp(EDX, (u32)asm_routines.pairedLoadQuantized)); + Jit->MOVAPD(reg, R(XMM0)); + RI.fregs[reg] = I; + regNormalRegClear(RI, I); + break; + } + case DupSingleToMReg: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->CVTSS2SD(reg, fregLocForInst(RI, getOp1(I))); + Jit->MOVDDUP(reg, R(reg)); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case InsertDoubleInMReg: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp2(I))); + Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I))); + Jit->MOVSD(reg, R(XMM0)); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case ExpandPackedToMReg: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->CVTPS2PD(reg, fregLocForInst(RI, getOp1(I))); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case LoadFReg: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + unsigned ppcreg = *I >> 8; + Jit->MOVAPD(reg, M(&PowerPC::ppcState.ps[ppcreg])); + RI.fregs[reg] = I; + break; + } + case StoreFReg: { + unsigned ppcreg = *I >> 16; + Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I))); + Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]), XMM0); + fregNormalRegClear(RI, I); + break; + } + case DoubleToSingle: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->CVTSD2SS(reg, fregLocForInst(RI, getOp1(I))); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case FSMul: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + Jit->MULSS(reg, fregLocForInst(RI, getOp2(I))); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); + break; + } + case FSAdd: { + if (!thisUsed) break; + X64Reg reg = fregFindFreeReg(RI); + Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I))); + Jit->ADDSS(reg, fregLocForInst(RI, getOp2(I))); + RI.fregs[reg] = I; + fregNormalRegClear(RI, I); break; } case CInt32: @@ -1328,6 +1502,15 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { } case BranchUncond: { regWriteExit(RI, getOp1(I)); + regNormalRegClear(RI, I); + break; + } + case IdleLoop: { + unsigned IdleParam = ibuild->GetImmValue(getOp1(I)); + unsigned InstLoc = ibuild->GetImmValue(getOp2(I)); + Jit->ABI_CallFunctionC((void *)&PowerPC::OnIdle, IdleParam); + Jit->MOV(32, M(&PowerPC::ppcState.pc), Imm32(InstLoc + 12)); + Jit->JMP(asm_routines.testExceptions, true); break; } case SystemCall: { @@ -1378,26 +1561,16 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { PanicAlert("Unknown JIT instruction; aborting!"); exit(1); } - if (getOpcode(*I) != Tramp && - getOpcode(*I) != BranchCond && - getOpcode(*I) != Load8 && - getOpcode(*I) != Load16 && - getOpcode(*I) != Load32 && - getOpcode(*I) != Store8 && - getOpcode(*I) != Store16 && - getOpcode(*I) != Store32 && - 1) { - if (RI.IInfo[I - RI.FirstI] & 4) - regClearInst(RI, getOp1(I)); - if (RI.IInfo[I - RI.FirstI] & 8) - regClearInst(RI, getOp2(I)); - } } for (unsigned i = 0; i < 8; i++) { if (RI.regs[i]) { PanicAlert("Incomplete cleanup!"); exit(1); } + if (RI.fregs[i]) { + PanicAlert("Incomplete cleanup!"); + exit(1); + } } if (UseProfile && RI.numSpills) @@ -1412,8 +1585,8 @@ void Jit64::WriteCode() { void ProfiledReJit() { u8* x = (u8*)jit.GetCodePtr(); - jit.SetCodePtr(jit.js.normalEntry); + 
jit.SetCodePtr(jit.js.rewriteStart); DoWriteCode(&jit.ibuild, &jit, true); - jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.normalEntry; + jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.rewriteStart; jit.SetCodePtr(x); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h index 74bfaadd0f..4e0f734581 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h @@ -80,6 +80,7 @@ namespace IREmitter { Store16, Store32, BranchCond, +#if 0 // Floating-point // There are three floating-point formats: single, double, // and packed. For any operation where the format of the @@ -141,8 +142,18 @@ namespace IREmitter { ForceToSingle, ForceToDouble, ForceToMReg, - LoadFPReg, - StoreFPReg, +#endif + LoadSingle, + LoadDouble, + LoadPaired, // This handles quantizers itself + DoubleToSingle, + DupSingleToMReg, + InsertDoubleInMReg, + ExpandPackedToMReg, + LoadFReg, + StoreFReg, + FSMul, + FSAdd, // "Trinary" operators // FIXME: Need to change representation! @@ -156,6 +167,7 @@ namespace IREmitter { SystemCall, RFIExit, InterpreterBranch, + IdleLoop, // "Opcode" representing a register too far away to // reference directly; this is a size optimization @@ -365,6 +377,42 @@ namespace IREmitter { InstLoc EmitRFIExit() { return FoldZeroOp(RFIExit, 0); } + InstLoc EmitIdleLoop(InstLoc idleParam, InstLoc pc) { + return FoldBiOp(IdleLoop, idleParam, pc); + } + InstLoc EmitLoadSingle(InstLoc addr) { + return FoldUOp(LoadSingle, addr); + } + InstLoc EmitLoadDouble(InstLoc addr) { + return FoldUOp(LoadDouble, addr); + } + InstLoc EmitLoadPaired(InstLoc addr, unsigned quantReg) { + return FoldUOp(LoadPaired, addr, quantReg); + } + InstLoc EmitLoadFReg(unsigned freg) { + return FoldZeroOp(LoadFReg, freg); + } + InstLoc EmitStoreFReg(InstLoc val, unsigned freg) { + return FoldUOp(StoreFReg, val, freg); + } + InstLoc EmitDupSingleToMReg(InstLoc val) { + return FoldUOp(DupSingleToMReg, val); + } + InstLoc EmitInsertDoubleInMReg(InstLoc val, InstLoc reg) { + return FoldBiOp(InsertDoubleInMReg, val, reg); + } + InstLoc EmitExpandPackedToMReg(InstLoc val) { + return FoldUOp(ExpandPackedToMReg, val); + } + InstLoc EmitFSMul(InstLoc op1, InstLoc op2) { + return FoldBiOp(FSMul, op1, op2); + } + InstLoc EmitFSAdd(InstLoc op1, InstLoc op2) { + return FoldBiOp(FSAdd, op1, op2); + } + InstLoc EmitDoubleToSingle(InstLoc op1) { + return FoldUOp(DoubleToSingle, op1); + } void StartBackPass() { curReadPtr = &InstList[InstList.size()]; } void StartForwardPass() { curReadPtr = &InstList[0]; } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp index bc3ba82d3f..20c0d2308e 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp @@ -420,12 +420,11 @@ namespace CPUCompare SetJumpTarget(skip); const u8 *normalEntry = GetCodePtr(); - js.normalEntry = (u8*)normalEntry; if (ImHereDebug) ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful - if (false && js.fpa.any) + if (js.fpa.any) { //This block uses FPU - needs to add FP exception bailout TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit @@ -445,24 +444,10 @@ namespace CPUCompare SetJumpTarget(b1); } - // Conditionally add profiling code. 
- if (Profiler::g_ProfileBlocks) { - ADD(32, M(&b->runCount), Imm8(1)); -#ifdef _WIN32 - b->ticCounter.QuadPart = 0; - b->ticStart.QuadPart = 0; - b->ticStop.QuadPart = 0; -#else -//TODO -#endif - // get start tic - PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStart); - } + js.rewriteStart = (u8*)GetCodePtr(); - //Start up the register allocators - //They use the information in gpa/fpa to preload commonly used registers. - //gpr.Start(js.gpa); - //fpr.Start(js.fpa); + // Start up IR builder (structure that collects the + // instruction processed by the JIT routines) ibuild.Reset(); js.downcountAmount = js.st.numCycles + PatchEngine::GetSpeedhackCycles(em_address); @@ -519,6 +504,7 @@ namespace CPUCompare break; } + // Perform actual code generation WriteCode(); b->flags = js.block_flags; diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h index 8ddaf59cbc..8f059656c2 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h @@ -95,7 +95,7 @@ private: PPCAnalyst::BlockRegStats gpa; PPCAnalyst::BlockRegStats fpa; PPCAnalyst::CodeOp *op; - u8* normalEntry; + u8* rewriteStart; JitBlock *curBlock; }; diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp index dc54c4a33e..1f4a95910c 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp @@ -23,6 +23,7 @@ #include "../PowerPC.h" #include "../../CoreTiming.h" #include "MemoryUtil.h" +#include "CPUDetect.h" #include "ABI.h" #include "Jit.h" @@ -168,6 +169,176 @@ void AsmRoutineManager::Generate() GenerateCommon(); } +const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15}; + +const float m_quantizeTableS[] = +{ + (1 << 0), (1 << 1), (1 << 2), (1 << 3), + (1 << 4), (1 << 5), (1 << 6), (1 << 7), + (1 << 8), (1 << 9), (1 << 10), (1 << 11), + (1 << 12), (1 << 13), (1 << 14), (1 << 15), + (1 << 16), (1 << 17), (1 << 18), (1 << 19), + (1 << 20), (1 << 21), (1 << 22), (1 << 23), + (1 << 24), (1 << 25), (1 << 26), (1 << 27), + (1 << 28), (1 << 29), (1 << 30), (1 << 31), + 1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29), + 1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25), + 1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21), + 1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17), + 1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13), + 1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9), + 1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5), + 1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1), +}; + +const float m_dequantizeTableS[] = +{ + 1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3), + 1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7), + 1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11), + 1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15), + 1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19), + 1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23), + 1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27), + 1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31), + (1ULL << 32), (1 << 31), (1 << 30), (1 << 29), + (1 << 28), (1 << 27), (1 << 26), (1 << 25), + (1 << 24), (1 << 23), (1 << 22), (1 << 21), + (1 << 20), (1 << 19), (1 << 18), (1 << 17), + (1 << 16), (1 << 15), (1 
<< 14), (1 << 13), + (1 << 12), (1 << 11), (1 << 10), (1 << 9), + (1 << 8), (1 << 7), (1 << 6), (1 << 5), + (1 << 4), (1 << 3), (1 << 2), (1 << 1), +}; + +float psTemp[2]; + +void AsmRoutineManager::GenQuantizedLoads() { + const u8* loadPairedIllegal = AlignCode4(); + UD2(); + const u8* loadPairedFloat = AlignCode4(); + if (cpu_info.bSSSE3) { +#ifdef _M_X64 + MOVQ_xmm(XMM0, MComplex(RBX, RCX, 1, 0)); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base)); +#endif + PSHUFB(XMM0, M((void *)pbswapShuffle2x4)); + } else { +#ifdef _M_X64 + MOV(64, R(RCX), MComplex(RBX, RCX, 1, 0)); + BSWAP(64, RCX); + ROL(64, RCX, Imm8(32)); + MOVQ_xmm(XMM0, R(RCX)); +#else +#if 0 + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base)); + PXOR(XMM1, R(XMM1)); + PSHUFLW(XMM0, R(XMM0), 0xB1); + MOVAPD(XMM1, R(XMM0)); + PSRLW(XMM0, 8); + PSLLW(XMM1, 8); + POR(XMM0, R(XMM1)); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base)); + BSWAP(32, EAX); + MOV(32, M(&psTemp[0]), R(RAX)); + MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4)); + BSWAP(32, EAX); + MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX)); + MOVQ_xmm(XMM0, M(&psTemp[0])); +#endif +#endif + } + RET(); + + const u8* loadPairedU8 = AlignCode4(); +#ifdef _M_X64 + MOVZX(32, 16, ECX, MComplex(RBX, RCX, 1, 0)); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOVZX(32, 16, ECX, MDisp(ECX, (u32)Memory::base)); +#endif + MOVD_xmm(XMM0, R(ECX)); + PXOR(XMM1, R(XMM1)); + PUNPCKLBW(XMM0, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(EAX), Imm8(6)); + MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS)); + PUNPCKLDQ(XMM1, R(XMM1)); + MULPS(XMM0, R(XMM1)); + RET(); + + const u8* loadPairedS8 = AlignCode4(); +#ifdef _M_X64 + MOVZX(32, 16, ECX, MComplex(RBX, RCX, 1, 0)); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOVZX(32, 16, ECX, MDisp(ECX, (u32)Memory::base)); +#endif + MOVD_xmm(XMM0, R(ECX)); + PUNPCKLBW(XMM0, R(XMM0)); + PUNPCKLWD(XMM0, R(XMM0)); + PSRAD(XMM0, 24); + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(EAX), Imm8(6)); + MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS)); + PUNPCKLDQ(XMM1, R(XMM1)); + MULPS(XMM0, R(XMM1)); + RET(); + + const u8* loadPairedU16 = AlignCode4(); +#ifdef _M_X64 + MOV(32, R(ECX), MComplex(RBX, RCX, 1, 0)); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, R(ECX), MDisp(ECX, (u32)Memory::base)); +#endif + BSWAP(32, ECX); + ROL(32, R(ECX), Imm8(16)); + MOVD_xmm(XMM0, R(ECX)); + PXOR(XMM1, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(EAX), Imm8(6)); + MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS)); + PUNPCKLDQ(XMM1, R(XMM1)); + MULPS(XMM0, R(XMM1)); + RET(); + + const u8* loadPairedS16 = AlignCode4(); +#ifdef _M_X64 + MOV(32, R(ECX), MComplex(RBX, RCX, 1, 0)); +#else + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, R(ECX), MDisp(ECX, (u32)Memory::base)); +#endif + BSWAP(32, ECX); + ROL(32, R(ECX), Imm8(16)); + MOVD_xmm(XMM0, R(ECX)); + PUNPCKLWD(XMM0, R(XMM0)); + PSRAD(XMM0, 16); + CVTDQ2PS(XMM0, R(XMM0)); + SHR(32, R(EAX), Imm8(6)); + AND(32, R(EAX), Imm32(0xFC)); + MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS)); + PUNPCKLDQ(XMM1, R(XMM1)); + MULPS(XMM0, R(XMM1)); + RET(); + + pairedLoadQuantized[0] = loadPairedFloat; + pairedLoadQuantized[1] = loadPairedIllegal; + pairedLoadQuantized[2] = loadPairedIllegal; + pairedLoadQuantized[3] = loadPairedIllegal; + pairedLoadQuantized[4] = 
loadPairedU8; + pairedLoadQuantized[5] = loadPairedU16; + pairedLoadQuantized[6] = loadPairedS8; + pairedLoadQuantized[7] = loadPairedS16; +} void AsmRoutineManager::GenFifoWrite(int size) { @@ -257,6 +428,8 @@ void AsmRoutineManager::GenerateCommon() SUB(32, M(&CoreTiming::downcount), Imm8(0)); JMP(dispatcher, true); + GenQuantizedLoads(); + computeRcFp = AlignCode16(); //CMPSD(R(XMM0), M(&zero), // TODO diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h index e5ef03647c..7fd646d0fe 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h @@ -42,6 +42,7 @@ private: void GenFifoWrite(int size); void GenFifoFloatWrite(); void GenFifoXmm64Write(); + void GenQuantizedLoads(); public: void Init() { @@ -80,6 +81,8 @@ public: const u8 *doReJit; + const u8 *pairedLoadQuantized[8]; + bool compareEnabled; }; diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp index 61868d80a6..4185956172 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp @@ -39,6 +39,9 @@ // Zelda and many more games seem to pass the Acid Test. +//#define NORMALBRANCH_START Default(inst); ibuild.EmitInterpreterBranch(); return; +#define NORMALBRANCH_START + using namespace Gen; void Jit64::sc(UGeckoInstruction inst) @@ -53,6 +56,7 @@ using namespace Gen; void Jit64::bx(UGeckoInstruction inst) { + NORMALBRANCH_START if (inst.LK) ibuild.EmitStoreLink(ibuild.EmitIntConst(js.compilerPC + 4)); @@ -67,6 +71,7 @@ using namespace Gen; void Jit64::bcx(UGeckoInstruction inst) { + NORMALBRANCH_START if (inst.LK) ibuild.EmitStoreLink( ibuild.EmitIntConst(js.compilerPC + 4)); @@ -117,6 +122,7 @@ using namespace Gen; void Jit64::bcctrx(UGeckoInstruction inst) { + NORMALBRANCH_START Default(inst); ibuild.EmitInterpreterBranch(); return; @@ -124,6 +130,7 @@ using namespace Gen; void Jit64::bclrx(UGeckoInstruction inst) { + NORMALBRANCH_START if (inst.hex == 0x4e800020) { ibuild.EmitBranchUncond(ibuild.EmitLoadLink()); return; diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp index 64bb657a40..4d2d67be8e 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp @@ -29,141 +29,54 @@ #define INSTRUCTION_START // #define INSTRUCTION_START Default(inst); return; - const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; - const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; - const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0}; - - void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg)) - { - fpr.Lock(d, a, b); - if (d == a) - { - fpr.LoadToX64(d, true); - (this->*op)(fpr.RX(d), fpr.R(b)); - } - else if (d == b && reversible) - { - fpr.LoadToX64(d, true); - (this->*op)(fpr.RX(d), fpr.R(a)); - } - else if (a != d && b != d) - { - // Sources different from d, can use rather quick solution - fpr.LoadToX64(d, !dupe); - MOVSD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), fpr.R(b)); - } - else if (b != d) - { - fpr.LoadToX64(d, !dupe); - MOVSD(XMM0, fpr.R(b)); - MOVSD(fpr.RX(d), fpr.R(a)); - (this->*op)(fpr.RX(d), Gen::R(XMM0)); - } - else // Other combo, must use two temps :( - { - MOVSD(XMM0, fpr.R(a)); - MOVSD(XMM1, 
fpr.R(b)); - fpr.LoadToX64(d, !dupe); - (this->*op)(XMM0, Gen::R(XMM1)); - MOVSD(fpr.RX(d), Gen::R(XMM0)); - } - if (dupe) { - ForceSinglePrecisionS(fpr.RX(d)); - MOVDDUP(fpr.RX(d), fpr.R(d)); - } - fpr.UnlockAll(); - } - void Jit64::fp_arith_s(UGeckoInstruction inst) { - if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; - if (inst.Rc) { + if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 25) { Default(inst); return; } + IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA); + val = ibuild.EmitDoubleToSingle(val); bool dupe = inst.OPCD == 59; switch (inst.SUBOP5) { - case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div - case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub - case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add + case 25: //mul + val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC))); + case 18: //div + case 20: //sub + case 21: //add case 23: //sel - Default(inst); - break; case 24: //res - Default(inst); - break; - case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul default: _assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!"); } + val = ibuild.EmitDupSingleToMReg(val); + ibuild.EmitStoreFReg(val, inst.FD); } void Jit64::fmaddXX(UGeckoInstruction inst) { - if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; - if (inst.Rc) { + if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 29) { Default(inst); return; } bool single_precision = inst.OPCD == 59; - int a = inst.FA; - int b = inst.FB; - int c = inst.FC; - int d = inst.FD; - - fpr.Lock(a, b, c, d); - MOVSD(XMM0, fpr.R(a)); - switch (inst.SUBOP5) - { - case 28: //msub - MULSD(XMM0, fpr.R(c)); - SUBSD(XMM0, fpr.R(b)); - break; - case 29: //madd - MULSD(XMM0, fpr.R(c)); - ADDSD(XMM0, fpr.R(b)); - break; - case 30: //nmsub - MULSD(XMM0, fpr.R(c)); - SUBSD(XMM0, fpr.R(b)); - XORPD(XMM0, M((void*)&psSignBits2)); - break; - case 31: //nmadd - MULSD(XMM0, fpr.R(c)); - ADDSD(XMM0, fpr.R(b)); - XORPD(XMM0, M((void*)&psSignBits2)); - break; - } - fpr.LoadToX64(d, false); - //YES it is necessary to dupe the result :( - //TODO : analysis - does the top reg get used? If so, dupe, if not, don't. 
- if (single_precision) { - ForceSinglePrecisionS(XMM0); - MOVDDUP(fpr.RX(d), R(XMM0)); - } else { - MOVSD(fpr.RX(d), R(XMM0)); - } - fpr.UnlockAll(); + IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA); + val = ibuild.EmitDoubleToSingle(val); + val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC))); + val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB))); + val = ibuild.EmitDupSingleToMReg(val); + ibuild.EmitStoreFReg(val, inst.FD); } void Jit64::fmrx(UGeckoInstruction inst) { - if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; if (inst.Rc) { Default(inst); return; } - int d = inst.FD; - int b = inst.FB; - fpr.LoadToX64(d, true); // we don't want to destroy the high bit - MOVSD(fpr.RX(d), fpr.R(b)); + IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FB); + val = ibuild.EmitInsertDoubleInMReg(val, ibuild.EmitLoadFReg(inst.FD)); + ibuild.EmitStoreFReg(val, inst.FD); } void Jit64::fcmpx(UGeckoInstruction inst) diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStore.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStore.cpp index 31d03a06d0..06f21b354d 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStore.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStore.cpp @@ -71,6 +71,20 @@ void Jit64::lhax(UGeckoInstruction inst) void Jit64::lXz(UGeckoInstruction inst) { INSTRUCTION_START + + if (Core::GetStartupParameter().bSkipIdle && + inst.OPCD == 32 && + (inst.hex & 0xFFFF0000) == 0x800D0000 && + (Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 || + (Core::GetStartupParameter().bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) && + Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8) + { + ibuild.EmitIdleLoop(ibuild.EmitIntConst(PowerPC::ppcState.gpr[inst.RA] + (s32)(s16)inst.SIMM_16), + ibuild.EmitIntConst(js.compilerPC)); + js.compilerPC += 8; + return; + } + IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16); if (inst.RA) addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp index a6161e6575..2b63931d28 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp @@ -57,38 +57,12 @@ u32 GC_ALIGNED16(temp32); void Jit64::lfs(UGeckoInstruction inst) { - if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; - - int d = inst.RD; - int a = inst.RA; - if (!a) - { - Default(inst); - return; - } - s32 offset = (s32)(s16)inst.SIMM_16; - gpr.FlushLockX(ABI_PARAM1); - gpr.Lock(a); - MOV(32, R(ABI_PARAM1), gpr.R(a)); - if (jo.assumeFPLoadFromMem) - { - UnsafeLoadRegToReg(ABI_PARAM1, EAX, 32, offset, false); - } - else - { - SafeLoadRegToEAX(ABI_PARAM1, 32, offset); - } - - MOV(32, M(&temp32), R(EAX)); - fpr.Lock(d); - fpr.LoadToX64(d, false); - CVTSS2SD(fpr.RX(d), M(&temp32)); - MOVDDUP(fpr.RX(d), fpr.R(d)); - gpr.UnlockAll(); - gpr.UnlockAllX(); - fpr.UnlockAll(); + IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), val; + if (inst.RA) + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + val = ibuild.EmitDupSingleToMReg(ibuild.EmitLoadSingle(addr)); + ibuild.EmitStoreFReg(val, inst.RD); + 
return; } @@ -291,32 +265,10 @@ void Jit64::stfsx(UGeckoInstruction inst) void Jit64::lfsx(UGeckoInstruction inst) { - if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; - - fpr.Lock(inst.RS); - fpr.LoadToX64(inst.RS, false, true); - MOV(32, R(EAX), gpr.R(inst.RB)); + IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB), val; if (inst.RA) - ADD(32, R(EAX), gpr.R(inst.RA)); - if (cpu_info.bSSSE3) { - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); -#ifdef _M_IX86 - AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); - MOVD_xmm(r, MDisp(EAX, (u32)Memory::base)); -#else - MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0)); -#endif - PSHUFB(r, M((void *)bswapShuffle1x4)); - CVTSS2SD(r, R(r)); - MOVDDUP(r, R(r)); - } else { - UnsafeLoadRegToReg(EAX, EAX, 32, false); - MOV(32, M(&temp32), R(EAX)); - CVTSS2SD(XMM0, M(&temp32)); - MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0)); - } - fpr.UnlockAll(); + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + val = ibuild.EmitDupSingleToMReg(ibuild.EmitLoadSingle(addr)); + ibuild.EmitStoreFReg(val, inst.RD); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp index d98a0f9ece..d0c2f330c2 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp @@ -40,419 +40,20 @@ #define INSTRUCTION_START // #define INSTRUCTION_START Default(inst); return; -const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15}; -const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - -static double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0}; -static u64 GC_ALIGNED16(temp64); - -// TODO(ector): Improve 64-bit version -static void WriteDual32(u64 value, u32 address) -{ - Memory::Write_U32((u32)(value >> 32), address); - Memory::Write_U32((u32)value, address + 4); -} - -const double GC_ALIGNED16(m_quantizeTableD[]) = -{ - (1 << 0), (1 << 1), (1 << 2), (1 << 3), - (1 << 4), (1 << 5), (1 << 6), (1 << 7), - (1 << 8), (1 << 9), (1 << 10), (1 << 11), - (1 << 12), (1 << 13), (1 << 14), (1 << 15), - (1 << 16), (1 << 17), (1 << 18), (1 << 19), - (1 << 20), (1 << 21), (1 << 22), (1 << 23), - (1 << 24), (1 << 25), (1 << 26), (1 << 27), - (1 << 28), (1 << 29), (1 << 30), (1 << 31), - 1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29), - 1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25), - 1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21), - 1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17), - 1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13), - 1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9), - 1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5), - 1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1), -}; - -const double GC_ALIGNED16(m_dequantizeTableD[]) = -{ - 1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3), - 1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7), - 1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11), - 1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15), - 1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19), - 1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23), - 1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / 
(1 << 26), 1.0 / (1 << 27), - 1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31), - (1ULL << 32), (1 << 31), (1 << 30), (1 << 29), - (1 << 28), (1 << 27), (1 << 26), (1 << 25), - (1 << 24), (1 << 23), (1 << 22), (1 << 21), - (1 << 20), (1 << 19), (1 << 18), (1 << 17), - (1 << 16), (1 << 15), (1 << 14), (1 << 13), - (1 << 12), (1 << 11), (1 << 10), (1 << 9), - (1 << 8), (1 << 7), (1 << 6), (1 << 5), - (1 << 4), (1 << 3), (1 << 2), (1 << 1), -}; - // The big problem is likely instructions that set the quantizers in the same block. // We will have to break block after quantizers are written to. void Jit64::psq_st(UGeckoInstruction inst) { - if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; - js.block_flags |= BLOCK_USE_GQR0 << inst.I; - - if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers) - { - Default(inst); - return; - } - if (!inst.RA) - { - // This really should never happen. Unless we change this to also support stwux - Default(inst); - return; - } - - const UGQR gqr(rSPR(SPR_GQR0 + inst.I)); - const EQuantizeType stType = static_cast(gqr.ST_TYPE); - int stScale = gqr.ST_SCALE; - bool update = inst.OPCD == 61; - - int offset = inst.SIMM_12; - int a = inst.RA; - int s = inst.RS; // Fp numbers - - if (inst.W) { - // PanicAlert("W=1: stType %i stScale %i update %i", (int)stType, (int)stScale, (int)update); - // It's fairly common that games write stuff to the pipe using this. Then, it's pretty much only - // floats so that's what we'll work on. - switch (stType) - { - case QUANTIZE_FLOAT: - { - // This one has quite a bit of optimization potential. - if (gpr.R(a).IsImm()) - { - PanicAlert("Imm: %08x", gpr.R(a).offset); - } - gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2); - gpr.Lock(a); - fpr.Lock(s); - if (update) - gpr.LoadToX64(a, true, true); - MOV(32, R(ABI_PARAM2), gpr.R(a)); - if (offset) - ADD(32, R(ABI_PARAM2), Imm32((u32)offset)); - TEST(32, R(ABI_PARAM2), Imm32(0x0C000000)); - if (update && offset) - MOV(32, gpr.R(a), R(ABI_PARAM2)); - CVTSD2SS(XMM0, fpr.R(s)); - MOVD_xmm(M(&temp64), XMM0); - MOV(32, R(ABI_PARAM1), M(&temp64)); - FixupBranch argh = J_CC(CC_NZ); - BSWAP(32, ABI_PARAM1); -#ifdef _M_X64 - MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1)); -#else - MOV(32, R(EAX), R(ABI_PARAM2)); - AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); - MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1)); -#endif - FixupBranch skip_call = J(); - SetJumpTarget(argh); - ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); - SetJumpTarget(skip_call); - gpr.UnlockAll(); - gpr.UnlockAllX(); - fpr.UnlockAll(); - return; - } - default: - Default(inst); - return; - } - return; - } - - if (stType == QUANTIZE_FLOAT) - { - if (gpr.R(a).IsImm() && !update && cpu_info.bSSSE3) - { - u32 addr = (u32)(gpr.R(a).offset + offset); - if (addr == 0xCC008000) { - // Writing to FIFO. Let's do fast method. 
- CVTPD2PS(XMM0, fpr.R(s)); - PSHUFB(XMM0, M((void*)&pbswapShuffle2x4)); - CALL((void*)asm_routines.fifoDirectWriteXmm64); - js.fifoBytesThisBlock += 8; - return; - } - } - - gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2); - gpr.Lock(a); - fpr.Lock(s); - if (update) - gpr.LoadToX64(a, true, true); - MOV(32, R(ABI_PARAM2), gpr.R(a)); - if (offset) - ADD(32, R(ABI_PARAM2), Imm32((u32)offset)); - TEST(32, R(ABI_PARAM2), Imm32(0x0C000000)); - if (update && offset) - MOV(32, gpr.R(a), R(ABI_PARAM2)); - CVTPD2PS(XMM0, fpr.R(s)); - SHUFPS(XMM0, R(XMM0), 1); - MOVQ_xmm(M(&temp64), XMM0); -#ifdef _M_X64 - MOV(64, R(ABI_PARAM1), M(&temp64)); - FixupBranch argh = J_CC(CC_NZ); - BSWAP(64, ABI_PARAM1); - MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1)); - FixupBranch arg2 = J(); - SetJumpTarget(argh); - CALL(thunks.ProtectFunction((void *)&WriteDual32, 0)); -#else - FixupBranch argh = J_CC(CC_NZ); - MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4)); - BSWAP(32, ABI_PARAM1); - AND(32, R(ABI_PARAM2), Imm32(Memory::MEMVIEW32_MASK)); - MOV(32, MDisp(ABI_PARAM2, (u32)Memory::base), R(ABI_PARAM1)); - MOV(32, R(ABI_PARAM1), M(&temp64)); - BSWAP(32, ABI_PARAM1); - MOV(32, MDisp(ABI_PARAM2, 4+(u32)Memory::base), R(ABI_PARAM1)); - FixupBranch arg2 = J(); - SetJumpTarget(argh); - MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4)); - ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); - MOV(32, R(ABI_PARAM1), M(((char*)&temp64))); - ADD(32, R(ABI_PARAM2), Imm32(4)); - ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); -#endif - SetJumpTarget(arg2); - gpr.UnlockAll(); - gpr.UnlockAllX(); - fpr.UnlockAll(); - } - else if (stType == QUANTIZE_U8) - { - gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2); - gpr.Lock(a); - fpr.Lock(s); - if (update) - gpr.LoadToX64(a, true, update); - MOV(32, R(ABI_PARAM2), gpr.R(a)); - if (offset) - ADD(32, R(ABI_PARAM2), Imm32((u32)offset)); - if (update && offset) - MOV(32, gpr.R(a), R(ABI_PARAM2)); - MOVAPD(XMM0, fpr.R(s)); - MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale])); - MULPD(XMM0, R(XMM1)); - CVTPD2DQ(XMM0, R(XMM0)); - PACKSSDW(XMM0, R(XMM0)); - PACKUSWB(XMM0, R(XMM0)); - MOVD_xmm(M(&temp64), XMM0); - MOV(16, R(ABI_PARAM1), M(&temp64)); -#ifdef _M_X64 - MOV(16, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1)); -#else - MOV(32, R(EAX), R(ABI_PARAM2)); - AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); - MOV(16, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1)); -#endif - if (update) - MOV(32, gpr.R(a), R(ABI_PARAM2)); - gpr.UnlockAll(); - gpr.UnlockAllX(); - fpr.UnlockAll(); - } - else if (stType == QUANTIZE_S16) - { - gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2); - gpr.Lock(a); - fpr.Lock(s); - if (update) - gpr.LoadToX64(a, true, update); - MOV(32, R(ABI_PARAM2), gpr.R(a)); - if (offset) - ADD(32, R(ABI_PARAM2), Imm32((u32)offset)); - if (update) - MOV(32, gpr.R(a), R(ABI_PARAM2)); - MOVAPD(XMM0, fpr.R(s)); - MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale])); - MULPD(XMM0, R(XMM1)); - SHUFPD(XMM0, R(XMM0), 1); - CVTPD2DQ(XMM0, R(XMM0)); - PACKSSDW(XMM0, R(XMM0)); - MOVD_xmm(M(&temp64), XMM0); - MOV(32, R(ABI_PARAM1), M(&temp64)); - BSWAP(32, ABI_PARAM1); -#ifdef _M_X64 - MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1)); -#else - MOV(32, R(EAX), R(ABI_PARAM2)); - AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); - MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1)); -#endif - gpr.UnlockAll(); - gpr.UnlockAllX(); - fpr.UnlockAll(); - } - else { - // Dodger uses this. 
- // mario tennis - //PanicAlert("st %i:%i", stType, inst.W); - Default(inst); - } + Default(inst); return; } void Jit64::psq_l(UGeckoInstruction inst) { - if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff) - {Default(inst); return;} // turn off from debugger - INSTRUCTION_START; - - js.block_flags |= BLOCK_USE_GQR0 << inst.I; - - if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers) - { - Default(inst); - return; - } - - const UGQR gqr(rSPR(SPR_GQR0 + inst.I)); - const EQuantizeType ldType = static_cast(gqr.LD_TYPE); - int ldScale = gqr.LD_SCALE; - bool update = inst.OPCD == 57; - if (!inst.RA || inst.W) - { - // 0 1 during load - //PanicAlert("ld:%i %i", ldType, (int)inst.W); - Default(inst); - return; - } - int offset = inst.SIMM_12; - switch (ldType) { - case QUANTIZE_FLOAT: // We know this is from RAM, so we don't need to check the address. - { -#ifdef _M_X64 - gpr.LoadToX64(inst.RA, true, update); - fpr.LoadToX64(inst.RS, false); - if (cpu_info.bSSSE3) { - X64Reg xd = fpr.R(inst.RS).GetSimpleReg(); - MOVQ_xmm(xd, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); - PSHUFB(xd, M((void *)pbswapShuffle2x4)); - CVTPS2PD(xd, R(xd)); - } else { - MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); - BSWAP(64, RAX); - MOV(64, M(&psTemp[0]), R(RAX)); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); - CVTPS2PD(r, M(&psTemp[0])); - SHUFPD(r, R(r), 1); - } - if (update && offset != 0) - ADD(32, gpr.R(inst.RA), Imm32(offset)); - break; -#else - if (cpu_info.bSSSE3) { - gpr.LoadToX64(inst.RA, true, update); - fpr.LoadToX64(inst.RS, false); - X64Reg xd = fpr.R(inst.RS).GetSimpleReg(); - MOV(32, R(EAX), gpr.R(inst.RA)); - AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); - MOVQ_xmm(xd, MDisp(EAX, (u32)Memory::base + offset)); - PSHUFB(xd, M((void *)pbswapShuffle2x4)); - CVTPS2PD(xd, R(xd)); - } else { - gpr.FlushLockX(ECX); - gpr.LoadToX64(inst.RA, true, update); - // This can probably be optimized somewhat. - LEA(32, ECX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset)); - AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); - MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base)); - BSWAP(32, RAX); - MOV(32, M(&psTemp[0]), R(RAX)); - MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4)); - BSWAP(32, RAX); - MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX)); - fpr.LoadToX64(inst.RS, false, true); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); - CVTPS2PD(r, M(&psTemp[0])); - gpr.UnlockAllX(); - } - if (update && offset != 0) - ADD(32, gpr.R(inst.RA), Imm32(offset)); - break; -#endif - } - case QUANTIZE_U8: - { - gpr.LoadToX64(inst.RA, true, update); -#ifdef _M_X64 - MOVZX(32, 16, EAX, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); -#else - LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset)); - AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); - MOVZX(32, 16, EAX, MDisp(EAX, (u32)Memory::base)); -#endif - MOV(32, M(&temp64), R(EAX)); - MOVD_xmm(XMM0, M(&temp64)); - // SSE4 optimization opportunity here. 
- PXOR(XMM1, R(XMM1)); - PUNPCKLBW(XMM0, R(XMM1)); - PUNPCKLWD(XMM0, R(XMM1)); - CVTDQ2PD(XMM0, R(XMM0)); - fpr.LoadToX64(inst.RS, false, true); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); - MOVDDUP(r, M((void *)&m_dequantizeTableD[ldScale])); - MULPD(r, R(XMM0)); - if (update && offset != 0) - ADD(32, gpr.R(inst.RA), Imm32(offset)); - } - break; - case QUANTIZE_S16: - { - gpr.LoadToX64(inst.RA, true, update); -#ifdef _M_X64 - MOV(32, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); -#else - LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset)); - AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); - MOV(32, R(EAX), MDisp(EAX, (u32)Memory::base)); -#endif - BSWAP(32, EAX); - MOV(32, M(&temp64), R(EAX)); - fpr.LoadToX64(inst.RS, false, true); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); - MOVD_xmm(XMM0, M(&temp64)); - PUNPCKLWD(XMM0, R(XMM0)); // unpack to higher word in each dword.. - PSRAD(XMM0, 16); // then use this signed shift to sign extend. clever eh? :P - CVTDQ2PD(XMM0, R(XMM0)); - MOVDDUP(r, M((void*)&m_dequantizeTableD[ldScale])); - MULPD(r, R(XMM0)); - SHUFPD(r, R(r), 1); - if (update && offset != 0) - ADD(32, gpr.R(inst.RA), Imm32(offset)); - } - break; - - /* - Dynamic quantizer. Todo when we have a test set. - MOVZX(32, 8, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]) + 3)); // it's in the high byte. - AND(32, R(EAX), Imm8(0x3F)); - MOV(32, R(ECX), Imm32((u32)&m_dequantizeTableD)); - MOVDDUP(r, MComplex(RCX, EAX, 8, 0)); - */ - default: - // 4 0 - // 6 0 //power tennis - // 5 0 - // PanicAlert("ld:%i %i", ldType, (int)inst.W); - Default(inst); - return; - } - - //u32 EA = (m_GPR[_inst.RA] + _inst.SIMM_12) : _inst.SIMM_12; + if (inst.W) {Default(inst); return;} + IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_12), val; + if (inst.RA) + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + val = ibuild.EmitLoadPaired(addr, inst.I); + val = ibuild.EmitExpandPackedToMReg(val); + ibuild.EmitStoreFReg(val, inst.RD); }
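
Note on the paired-load path introduced above: the LoadPaired case in IR.cpp loads the upper half of the selected GQR into EAX, extracts the LD_TYPE byte into EDX, puts the guest address in ECX, and calls through asm_routines.pairedLoadQuantized. The stub selected by LD_TYPE (emitted by GenQuantizedLoads in JitAsm.cpp) reads the pair through the address in ECX; the integer variants additionally derive the scale from EAX by shifting it right by 6 and indexing m_dequantizeTableS, and all of them return the result in XMM0. Below is a minimal C++ sketch of the same selection logic, assuming the reserved GQR bits are zero; SelectPairedLoad and scaleOut are illustrative names only, and pairedLoadQuantized is declared here as a plain array although in the patch it is a member of AsmRoutineManager.

    #include <cstdint>
    typedef uint8_t u8;
    typedef uint32_t u32;

    extern const float m_dequantizeTableS[];   // dequantize scales, defined in JitAsm.cpp
    extern const u8 *pairedLoadQuantized[8];   // per-LD_TYPE entry points from GenQuantizedLoads()

    // gqr is the full 32-bit GQR value named by the psq_l instruction's I field.
    static const u8 *SelectPairedLoad(u32 gqr, float *scaleOut)
    {
        u32 top     = gqr >> 16;         // MOVZX 16-bit load at byte offset 2 of the SPR
        u32 ldType  = top & 7;           // GQR[16..18]; the MOVZX of AL also picks up GQR[19..23],
                                         // so those reserved bits must be zero for the 8-entry table
        u32 ldScale = (top >> 8) & 0x3F; // GQR[24..29]; the stubs form ldScale*4 via SHR(EAX, 6)
        *scaleOut = m_dequantizeTableS[ldScale];
        return pairedLoadQuantized[ldType]; // CALLptr(MDisp(EDX, pairedLoadQuantized)), EDX = ldType*4
    }

The SHL(32, R(EDX), Imm8(2)) before the indirect call is only there because the emitter cannot yet encode a [EDX*4+disp32] operand (per the FIXME in LoadPaired); once that ModR/M form is supported, the shift can be folded into the addressing mode.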