[ARM] Optimize psq_l so it no longer needs to push/pop registers. Implement support for single-float loads, which gives a decent speedup to Ikaruga in menus and in-game.

This commit is contained in:
Ryan Houdek 2013-09-08 08:18:34 +00:00
parent e5b5713d70
commit ba0c52b104
2 changed files with 25 additions and 21 deletions

View File

@ -28,26 +28,18 @@ void JitArm::psq_l(UGeckoInstruction inst)
if (js.memcheck) { Default(inst); return; }
if (inst.W) {
// Enable when supporting single loads
Default(inst);
return;
}
LDR(R11, R9, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
//UBFX(R12, R11, 2, 6); // Scale
UBFX(R11, R11, 13, 3); // Type
UBFX(R12, R11, 13, 3); // Type
UBFX(R11, R11, 2, 6); // Scale
MOVI2R(R10, (u32)offset);
if (inst.RA)
ADD(R10, R10, gpr.R(inst.RA));
if (update)
MOV(gpr.R(inst.RA), R10);
if (inst.W)
ADD(R11, R11, 8);
MOVI2R(R14, (u32)asm_routines.pairedLoadQuantized);
ADD(R14, R14, R11);
LDR(R14, R14);
ADD(R14, R14, R12);
LDR(R14, R14, inst.W ? 8 * 4 : 0);
// Values returned in S0, S1
BL(R14); // Jump to the quantizer Load

View File

@ -145,25 +145,37 @@ void JitArmAsmRoutineManager::Generate()
void JitArmAsmRoutineManager::GenerateCommon()
{
// R14 is LR
// R12 is scratch
// R11 is scale
// R10 is the address
Operand2 mask(3, 1); // ~(Memory::MEMVIEW32_MASK)
NEONXEmitter nemit(this);
const u8* loadPairedIllegal = GetCodePtr();
BKPT(0x10);
const u8* loadPairedFloatTwo = GetCodePtr();
PUSH(2, R12, _LR);
// R12, R14 is scratch
// R10 is the address
Operand2 mask(3, 1); // ~(Memory::MEMVIEW32_MASK)
BIC(R10, R10, mask);
MOVI2R(R14, (u32)Memory::base);
ADD(R10, R10, R14);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
NEONXEmitter nemit(this);
nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);
MOV(_PC, _LR);
POP(2, R12, _PC);
const u8* loadPairedFloatOne = GetCodePtr();
BKPT(0x12);
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);
MOVI2F(S1, 1.0f, INVALID_REG); // Temp reg isn't used for 1.0f
MOV(_PC, _LR);
const u8* loadPairedU8Two = GetCodePtr();
BKPT(0x13);
const u8* loadPairedU8One = GetCodePtr();