From dd58a8d65e090519107bea3ea0ef12fbcaaaf83a Mon Sep 17 00:00:00 2001 From: Sintendo Date: Sun, 22 Oct 2023 15:13:46 +0200 Subject: [PATCH 1/2] Jit_LoadStore: Minor dcbx register optimization Instructions referencing registers r8-r15 take an additional byte to encode. `reg_downcount` may be assigned to one of these registers, so it is a small size win to store the downcount value in `RSCRATCH` first. Before: 33 D2 xor edx,edx 44 8B 6D 64 mov r13d,dword ptr [rbp+64h] 45 85 ED test r13d,r13d 7E 30 jle 0000023546B43F6D 44 8B B5 D4 02 00 00 mov r14d,dword ptr [rbp+2D4h] 41 8B C5 mov eax,r13d BF 07 00 00 00 mov edi,7 F7 F7 div eax,edi After: 33 D2 xor edx,edx 8B 45 64 mov eax,dword ptr [rbp+64h] 85 C0 test eax,eax 7E 30 jle 000001AFBBAE359D 44 8B B5 D4 02 00 00 mov r14d,dword ptr [rbp+2D4h] 44 8B E8 mov r13d,eax BF 07 00 00 00 mov edi,7 F7 F7 div eax,edi --- Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index f3a0e209f1..9d385d10c0 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -274,11 +274,11 @@ void Jit64::dcbx(UGeckoInstruction inst) // the upper bits for the DIV instruction in the downcount > 0 case. XOR(32, R(RSCRATCH2), R(RSCRATCH2)); - MOV(32, R(reg_downcount), PPCSTATE(downcount)); - TEST(32, R(reg_downcount), R(reg_downcount)); // if (downcount <= 0) + MOV(32, R(RSCRATCH), PPCSTATE(downcount)); + TEST(32, R(RSCRATCH), R(RSCRATCH)); // if (downcount <= 0) FixupBranch downcount_is_zero_or_negative = J_CC(CC_LE); // only do 1 invalidation; else: MOV(32, R(loop_counter), PPCSTATE_CTR); - MOV(32, R(RSCRATCH), R(reg_downcount)); + MOV(32, R(reg_downcount), R(RSCRATCH)); MOV(32, R(reg_cycle_count), Imm32(cycle_count_per_loop)); DIV(32, R(reg_cycle_count)); // RSCRATCH = downcount / cycle_count LEA(32, RSCRATCH2, MDisp(loop_counter, -1)); // RSCRATCH2 = CTR - 1 From 171f76ae07c7895cd91ca4a105572218601b98a5 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Tue, 24 Oct 2023 00:42:30 +0200 Subject: [PATCH 2/2] Jit_LoadStore: Another minor dcbx optimization The multiplication needs the value from RSCRATCH2, but shouldn't overwrite it as it is still needed later. The original code solved this by copying RSCRATCH2 to another register first. As it turns out, the other register involved in the multiplication can safely be overwritten, so we can swap the operands around and use RSCRATCH2 directly without making a copy. Before: 33 D2 xor edx,edx 8B 45 64 mov eax,dword ptr [rbp+64h] 85 C0 test eax,eax 7E 30 jle 000002D4DF373F6B 44 8B B5 D4 02 00 00 mov r14d,dword ptr [rbp+2D4h] 44 8B E8 mov r13d,eax BF 07 00 00 00 mov edi,7 F7 F7 div eax,edi 41 8D 56 FF lea edx,[r14-1] 3B C2 cmp eax,edx 0F 42 D0 cmovb edx,eax 44 2B F2 sub r14d,edx 44 89 B5 D4 02 00 00 mov dword ptr [rbp+2D4h],r14d 8B C2 mov eax,edx 0F AF C7 imul eax,edi 44 2B E8 sub r13d,eax 44 89 6D 64 mov dword ptr [rbp+64h],r13d 44 8D 72 01 lea r14d,[rdx+1] After: 33 D2 xor edx,edx 8B 45 64 mov eax,dword ptr [rbp+64h] 85 C0 test eax,eax 7E 2E jle 0000021C01013F69 44 8B B5 D4 02 00 00 mov r14d,dword ptr [rbp+2D4h] 44 8B E8 mov r13d,eax BF 07 00 00 00 mov edi,7 F7 F7 div eax,edi 41 8D 56 FF lea edx,[r14-1] 3B C2 cmp eax,edx 0F 42 D0 cmovb edx,eax 44 2B F2 sub r14d,edx 44 89 B5 D4 02 00 00 mov dword ptr [rbp+2D4h],r14d 0F AF FA imul edi,edx 44 2B EF sub r13d,edi 44 89 6D 64 mov dword ptr [rbp+64h],r13d 44 8D 72 01 lea r14d,[rdx+1] --- Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index 9d385d10c0..7a7461713e 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -291,10 +291,9 @@ void Jit64::dcbx(UGeckoInstruction inst) // registers. SUB(32, R(loop_counter), R(RSCRATCH2)); MOV(32, PPCSTATE_CTR, R(loop_counter)); // CTR -= RSCRATCH2 - MOV(32, R(RSCRATCH), R(RSCRATCH2)); - IMUL(32, RSCRATCH, R(reg_cycle_count)); + IMUL(32, reg_cycle_count, R(RSCRATCH2)); // ^ Note that this cannot overflow because it's limited by (downcount/cycle_count). - SUB(32, R(reg_downcount), R(RSCRATCH)); + SUB(32, R(reg_downcount), R(reg_cycle_count)); MOV(32, PPCSTATE(downcount), R(reg_downcount)); // downcount -= (RSCRATCH2 * reg_cycle_count) SetJumpTarget(downcount_is_zero_or_negative);