diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 1e69976bf7..a2dde863cd 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -205,6 +205,7 @@ void JitArm64::FallBackToInterpreter(UGeckoInstruction inst) // we must mark them as no longer discarded gpr.ResetRegisters(js.op->regsOut); fpr.ResetRegisters(js.op->GetFregsOut()); + gpr.ResetCRRegisters(js.op->crOut); if (js.op->opinfo->flags & FL_ENDBLOCK) { @@ -1222,9 +1223,11 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) { gpr.DiscardRegisters(op.gprDiscardable); fpr.DiscardRegisters(op.fprDiscardable); + gpr.DiscardCRRegisters(op.crDiscardable); } gpr.StoreRegisters(~op.gprInUse & (op.regsIn | op.regsOut)); fpr.StoreRegisters(~op.fprInUse & (op.fregsIn | op.GetFregsOut())); + gpr.StoreCRRegisters(~op.crInUse & (op.crIn | op.crOut)); if (opinfo->flags & FL_LOADSTORE) ++js.numLoadStoreInst; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index d4e933aaf3..5da455cb61 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -129,6 +129,8 @@ void Arm64RegCache::DiscardRegister(size_t preg) { OpArg& reg = m_guest_registers[preg]; ARM64Reg host_reg = reg.GetReg(); + if (!IsVector(host_reg)) + host_reg = EncodeRegTo32(host_reg); reg.Discard(); if (host_reg != ARM64Reg::INVALID_REG) @@ -288,6 +290,25 @@ void Arm64GPRCache::FlushCRRegisters(BitSet8 regs, bool maintain_state, ARM64Reg } } +void Arm64GPRCache::DiscardCRRegisters(BitSet8 regs) +{ + for (int i : regs) + DiscardRegister(GUEST_CR_OFFSET + i); +} + +void Arm64GPRCache::ResetCRRegisters(BitSet8 regs) +{ + for (int i : regs) + { + OpArg& reg = m_guest_registers[GUEST_CR_OFFSET + i]; + ARM64Reg host_reg = reg.GetReg(); + + ASSERT_MSG(DYNA_REC, host_reg == ARM64Reg::INVALID_REG, + "Attempted to reset a loaded register (did you mean to flush it?)"); + reg.Flush(); + } +} + void Arm64GPRCache::Flush(FlushMode mode, ARM64Reg tmp_reg) { FlushRegisters(BitSet32(0xFFFFFFFF), mode == FlushMode::MaintainState, tmp_reg); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index fbe1f30134..dad43ce9ef 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -335,6 +335,9 @@ public: FlushCRRegisters(regs, false, tmp_reg); } + void DiscardCRRegisters(BitSet8 regs); + void ResetCRRegisters(BitSet8 regs); + protected: // Get the order of the host registers void GetAllocationOrder() override; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 9cfbbdec6b..d7c4a09a64 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -222,26 +222,13 @@ bool PPCAnalyzer::CanSwapAdjacentOps(const CodeOp& a, const CodeOp& b) const // [1] https://bugs.dolphin-emu.org/issues/5864#note-7 if (a.canCauseException || b.canCauseException) return false; - if (a_flags & FL_ENDBLOCK) + if (a_flags & (FL_ENDBLOCK | FL_TIMER | FL_NO_REORDER | FL_SET_OE)) return false; - if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE)) - return false; - if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc)) + if (b_flags & (FL_ENDBLOCK | FL_TIMER | FL_NO_REORDER | FL_SET_OE)) return false; if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA))) return false; - switch (b.inst.OPCD) - { - case 16: - case 18: - // branches. Do not swap. - case 17: // sc - case 46: // lmw - case 19: // table19 - lots of tricky stuff - return false; - } - // For now, only integer ops are acceptable. if (b_info->type != OpType::Integer) return false; @@ -249,16 +236,22 @@ bool PPCAnalyzer::CanSwapAdjacentOps(const CodeOp& a, const CodeOp& b) const // Check that we have no register collisions. // That is, check that none of b's outputs matches any of a's inputs, // and that none of a's outputs matches any of b's inputs. - // The latter does not apply if a is a cmp, of course, but doesn't hurt to check. + // register collision: b outputs to one of a's inputs if (b.regsOut & a.regsIn) return false; + if (b.crOut & a.crIn) + return false; // register collision: a outputs to one of b's inputs if (a.regsOut & b.regsIn) return false; + if (a.crOut & b.crIn) + return false; // register collision: b outputs to one of a's outputs (overwriting it) if (b.regsOut & a.regsOut) return false; + if (b.crOut & a.crOut) + return false; return true; } @@ -451,12 +444,6 @@ void FindFunctions(const Core::CPUThreadGuard& guard, u32 startAddr, u32 endAddr unniceSize); } -static bool isCmp(const CodeOp& a) -{ - return (a.inst.OPCD == 10 || a.inst.OPCD == 11) || - (a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32)); -} - static bool isCarryOp(const CodeOp& a) { return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && @@ -506,7 +493,7 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r // Reorder integer compares, rlwinm., and carry-affecting ops // (if we add more merged branch instructions, add them here!) if ((type == ReorderType::CROR && isCror(a)) || (type == ReorderType::Carry && isCarryOp(a)) || - (type == ReorderType::CMP && (isCmp(a) || a.outputCR[0]))) + (type == ReorderType::CMP && (type == ReorderType::CMP && a.crOut))) { // once we're next to a carry instruction, don't move away! if (type == ReorderType::Carry && i != start) @@ -544,11 +531,6 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp* code) const { - // Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more - // picky about this, but cror seems to almost solely be used for this purpose in real code. - // Additionally, the other boolean ops seem to almost never be used. - if (HasOption(OPTION_CROR_MERGE)) - ReorderInstructionsCore(instructions, code, true, ReorderType::CROR); // For carry, bubble instructions *towards* each other; one direction often isn't enough // to get pairs like addc/adde next to each other. if (HasOption(OPTION_CARRY_MERGE)) @@ -556,8 +538,16 @@ void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp* code) const ReorderInstructionsCore(instructions, code, false, ReorderType::Carry); ReorderInstructionsCore(instructions, code, true, ReorderType::Carry); } + + // Reorder instructions which write to CR (typically compare instructions) towards branches. if (HasOption(OPTION_BRANCH_MERGE)) ReorderInstructionsCore(instructions, code, false, ReorderType::CMP); + + // Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more + // picky about this, but cror seems to almost solely be used for this purpose in real code. + // Additionally, the other boolean ops seem to almost never be used. + if (HasOption(OPTION_CROR_MERGE)) + ReorderInstructionsCore(instructions, code, true, ReorderType::CROR); } void PPCAnalyzer::SetInstructionStats(CodeBlock* block, CodeOp* code, @@ -570,23 +560,40 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock* block, CodeOp* code, block->m_fpa->any = true; } - code->wantsCR = BitSet8(0); + code->crIn = BitSet8(0); if (opinfo->flags & FL_READ_ALL_CR) - code->wantsCR = BitSet8(0xFF); + { + code->crIn = BitSet8(0xFF); + } else if (opinfo->flags & FL_READ_CRn) - code->wantsCR[code->inst.CRFS] = true; + { + code->crIn[code->inst.CRFS] = true; + } else if (opinfo->flags & FL_READ_CR_BI) - code->wantsCR[code->inst.BI] = true; + { + code->crIn[code->inst.BI] = true; + } + else if (opinfo->type == OpType::CR) + { + code->crIn[code->inst.CRBA >> 2] = true; + code->crIn[code->inst.CRBB >> 2] = true; - code->outputCR = BitSet8(0); + // CR instructions only write to one bit of the destination CR, + // so treat the other three bits of the destination as inputs + code->crIn[code->inst.CRBD >> 2] = true; + } + + code->crOut = BitSet8(0); if (opinfo->flags & FL_SET_ALL_CR) - code->outputCR = BitSet8(0xFF); + code->crOut = BitSet8(0xFF); else if (opinfo->flags & FL_SET_CRn) - code->outputCR[code->inst.CRFD] = true; + code->crOut[code->inst.CRFD] = true; else if ((opinfo->flags & FL_SET_CR0) || ((opinfo->flags & FL_RC_BIT) && code->inst.Rc)) - code->outputCR[0] = true; + code->crOut[0] = true; else if ((opinfo->flags & FL_SET_CR1) || ((opinfo->flags & FL_RC_BIT_F) && code->inst.Rc)) - code->outputCR[1] = true; + code->crOut[1] = true; + else if (opinfo->type == OpType::CR) + code->crOut[code->inst.CRBD >> 2] = true; code->wantsFPRF = (opinfo->flags & FL_READ_FPRF) != 0; code->outputFPRF = (opinfo->flags & FL_SET_FPRF) != 0; @@ -955,9 +962,9 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, // Scan for flag dependencies; assume the next block (or any branch that can leave the block) // wants flags, to be safe. - BitSet8 wantsCR = BitSet8(0xFF); bool wantsFPRF = true; bool wantsCA = true; + BitSet8 crInUse, crDiscardable; BitSet32 gprBlockInputs, gprInUse, fprInUse, gprDiscardable, fprDiscardable, fprInXmm; for (int i = block->m_num_instructions - 1; i >= 0; i--) { @@ -974,27 +981,26 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, const bool hle = !!HLE::TryReplaceFunction(op.address); const bool may_exit_block = hle || op.canEndBlock || op.canCauseException; - const BitSet8 opWantsCR = op.wantsCR; const bool opWantsFPRF = op.wantsFPRF; const bool opWantsCA = op.wantsCA; - op.wantsCR = wantsCR | BitSet8(may_exit_block ? 0xFF : 0); op.wantsFPRF = wantsFPRF || may_exit_block; op.wantsCA = wantsCA || may_exit_block; - wantsCR |= opWantsCR | BitSet8(may_exit_block ? 0xFF : 0); wantsFPRF |= opWantsFPRF || may_exit_block; wantsCA |= opWantsCA || may_exit_block; - wantsCR &= ~op.outputCR | opWantsCR; wantsFPRF &= !op.outputFPRF || opWantsFPRF; wantsCA &= !op.outputCA || opWantsCA; op.gprInUse = gprInUse; op.fprInUse = fprInUse; + op.crInUse = crInUse; op.gprDiscardable = gprDiscardable; op.fprDiscardable = fprDiscardable; + op.crDiscardable = crDiscardable; op.fprInXmm = fprInXmm; gprBlockInputs &= ~op.regsOut; gprBlockInputs |= op.regsIn; gprInUse |= op.regsIn | op.regsOut; fprInUse |= op.fregsIn | op.GetFregsOut(); + crInUse |= op.crIn | op.crOut; if (strncmp(op.opinfo->opname, "stfd", 4)) fprInXmm |= op.fregsIn; @@ -1006,11 +1012,13 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, fprInXmm = BitSet32{}; gprDiscardable = BitSet32{}; fprDiscardable = BitSet32{}; + crDiscardable = BitSet8{}; } else if (op.canEndBlock || op.canCauseException) { gprDiscardable = BitSet32{}; fprDiscardable = BitSet32{}; + crDiscardable = BitSet8{}; } else { @@ -1018,6 +1026,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, gprDiscardable &= ~op.regsIn; fprDiscardable |= op.GetFregsOut(); fprDiscardable &= ~op.fregsIn; + crDiscardable |= op.crOut; + crDiscardable &= ~op.crIn; } } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 1c24120cc9..1bd25ae476 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -32,10 +32,12 @@ struct CodeOp // 16B const GekkoOPInfo* opinfo = nullptr; u32 address = 0; u32 branchTo = 0; // if UINT32_MAX, not a branch - BitSet32 regsOut; BitSet32 regsIn; + BitSet32 regsOut; BitSet32 fregsIn; s8 fregOut = 0; + BitSet8 crIn; + BitSet8 crOut; bool isBranchTarget = false; bool branchUsesCtr = false; bool branchIsIdleLoop = false; @@ -50,6 +52,8 @@ struct CodeOp // 16B bool canCauseException = false; bool skipLRStack = false; bool skip = false; // followed BL-s for example + BitSet8 crInUse; + BitSet8 crDiscardable; // which registers are still needed after this instruction in this block BitSet32 fprInUse; BitSet32 gprInUse; diff --git a/Source/Core/Core/PowerPC/PPCTables.cpp b/Source/Core/Core/PowerPC/PPCTables.cpp index 79ff5444a7..9d70fb662b 100644 --- a/Source/Core/Core/PowerPC/PPCTables.cpp +++ b/Source/Core/Core/PowerPC/PPCTables.cpp @@ -87,8 +87,8 @@ constexpr std::array s_primary_table{{ {38, "stb", OpType::Store, 1, FL_IN_A0 | FL_IN_S | FL_LOADSTORE}, {39, "stbu", OpType::Store, 1, FL_OUT_A | FL_IN_A | FL_IN_S | FL_LOADSTORE}, - {46, "lmw", OpType::System, 11, FL_EVIL | FL_IN_A0 | FL_LOADSTORE}, - {47, "stmw", OpType::System, 11, FL_EVIL | FL_IN_A0 | FL_LOADSTORE}, + {46, "lmw", OpType::System, 11, FL_IN_A0 | FL_LOADSTORE}, + {47, "stmw", OpType::System, 11, FL_IN_A0 | FL_LOADSTORE}, {48, "lfs", OpType::LoadFP, 1, FL_OUT_FLOAT_D | FL_IN_A | FL_USE_FPU | FL_LOADSTORE}, {49, "lfsu", OpType::LoadFP, 1, @@ -224,17 +224,17 @@ constexpr std::array s_table4_3{{ constexpr std::array s_table19{{ {528, "bcctrx", OpType::Branch, 1, FL_ENDBLOCK | FL_READ_CR_BI}, {16, "bclrx", OpType::Branch, 1, FL_ENDBLOCK | FL_READ_CR_BI}, - {257, "crand", OpType::CR, 1, FL_EVIL}, - {129, "crandc", OpType::CR, 1, FL_EVIL}, - {289, "creqv", OpType::CR, 1, FL_EVIL}, - {225, "crnand", OpType::CR, 1, FL_EVIL}, - {33, "crnor", OpType::CR, 1, FL_EVIL}, - {449, "cror", OpType::CR, 1, FL_EVIL}, - {417, "crorc", OpType::CR, 1, FL_EVIL}, - {193, "crxor", OpType::CR, 1, FL_EVIL}, + {257, "crand", OpType::CR, 1, 0}, + {129, "crandc", OpType::CR, 1, 0}, + {289, "creqv", OpType::CR, 1, 0}, + {225, "crnand", OpType::CR, 1, 0}, + {33, "crnor", OpType::CR, 1, 0}, + {449, "cror", OpType::CR, 1, 0}, + {417, "crorc", OpType::CR, 1, 0}, + {193, "crxor", OpType::CR, 1, 0}, - {150, "isync", OpType::InstructionCache, 1, FL_EVIL}, - {0, "mcrf", OpType::System, 1, FL_EVIL | FL_SET_CRn | FL_READ_CRn}, + {150, "isync", OpType::InstructionCache, 1, FL_NO_REORDER}, + {0, "mcrf", OpType::System, 1, FL_SET_CRn | FL_READ_CRn}, {50, "rfi", OpType::System, 2, FL_ENDBLOCK | FL_CHECKEXCEPTIONS | FL_PROGRAMEXCEPTION}, }}; @@ -324,12 +324,12 @@ constexpr std::array s_table31{{ {790, "lhbrx", OpType::Load, 1, FL_OUT_D | FL_IN_A0B | FL_LOADSTORE}, // Conditional load/store (Wii SMP) - {150, "stwcxd", OpType::Store, 1, FL_EVIL | FL_IN_S | FL_IN_A0B | FL_SET_CR0 | FL_LOADSTORE}, - {20, "lwarx", OpType::Load, 1, FL_EVIL | FL_OUT_D | FL_IN_A0B | FL_SET_CR0 | FL_LOADSTORE}, + {150, "stwcxd", OpType::Store, 1, FL_IN_S | FL_IN_A0B | FL_SET_CR0 | FL_LOADSTORE}, + {20, "lwarx", OpType::Load, 1, FL_OUT_D | FL_IN_A0B | FL_SET_CR0 | FL_LOADSTORE}, // load string (Inst these) - {533, "lswx", OpType::Load, 1, FL_EVIL | FL_IN_A0B | FL_OUT_D | FL_LOADSTORE}, - {597, "lswi", OpType::Load, 1, FL_EVIL | FL_IN_A0 | FL_OUT_D | FL_LOADSTORE}, + {533, "lswx", OpType::Load, 1, FL_IN_A0B | FL_OUT_D | FL_LOADSTORE}, + {597, "lswi", OpType::Load, 1, FL_IN_A0 | FL_OUT_D | FL_LOADSTORE}, // store word {151, "stwx", OpType::Store, 1, FL_IN_S | FL_IN_A0B | FL_LOADSTORE}, @@ -347,8 +347,8 @@ constexpr std::array s_table31{{ {662, "stwbrx", OpType::Store, 1, FL_IN_S | FL_IN_A0B | FL_LOADSTORE}, {918, "sthbrx", OpType::Store, 1, FL_IN_S | FL_IN_A0B | FL_LOADSTORE}, - {661, "stswx", OpType::Store, 1, FL_EVIL | FL_IN_A0B | FL_LOADSTORE}, - {725, "stswi", OpType::Store, 1, FL_EVIL | FL_IN_A0 | FL_LOADSTORE}, + {661, "stswx", OpType::Store, 1, FL_IN_A0B | FL_LOADSTORE}, + {725, "stswi", OpType::Store, 1, FL_IN_A0 | FL_LOADSTORE}, // fp load/store {535, "lfsx", OpType::LoadFP, 1, FL_OUT_FLOAT_D | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE}, diff --git a/Source/Core/Core/PowerPC/PPCTables.h b/Source/Core/Core/PowerPC/PPCTables.h index 00d4baaf9f..afb265f840 100644 --- a/Source/Core/Core/PowerPC/PPCTables.h +++ b/Source/Core/Core/PowerPC/PPCTables.h @@ -36,18 +36,17 @@ enum InstructionFlags : u64 FL_OUT_AD = FL_OUT_A | FL_OUT_D, FL_TIMER = (1ull << 15), // Used only for mftb. FL_CHECKEXCEPTIONS = (1ull << 16), // Used with rfi/rfid. - FL_EVIL = - (1ull << 17), // Historically used to refer to instructions that messed up Super Monkey Ball. - FL_USE_FPU = (1ull << 18), // Used to indicate a floating point instruction. - FL_LOADSTORE = (1ull << 19), // Used to indicate a load/store instruction. - FL_SET_FPRF = (1ull << 20), // Sets bits in the FPRF. - FL_READ_FPRF = (1ull << 21), // Reads bits from the FPRF. - FL_SET_OE = (1ull << 22), // Sets the overflow flag. - FL_IN_FLOAT_A = (1ull << 23), // frA is used as an input. - FL_IN_FLOAT_B = (1ull << 24), // frB is used as an input. - FL_IN_FLOAT_C = (1ull << 25), // frC is used as an input. - FL_IN_FLOAT_S = (1ull << 26), // frS is used as an input. - FL_IN_FLOAT_D = (1ull << 27), // frD is used as an input. + FL_NO_REORDER = (1ull << 17), // Instruction should not be reordered by our optimizations. + FL_USE_FPU = (1ull << 18), // Used to indicate a floating point instruction. + FL_LOADSTORE = (1ull << 19), // Used to indicate a load/store instruction. + FL_SET_FPRF = (1ull << 20), // Sets bits in the FPRF. + FL_READ_FPRF = (1ull << 21), // Reads bits from the FPRF. + FL_SET_OE = (1ull << 22), // Sets the overflow flag. + FL_IN_FLOAT_A = (1ull << 23), // frA is used as an input. + FL_IN_FLOAT_B = (1ull << 24), // frB is used as an input. + FL_IN_FLOAT_C = (1ull << 25), // frC is used as an input. + FL_IN_FLOAT_S = (1ull << 26), // frS is used as an input. + FL_IN_FLOAT_D = (1ull << 27), // frD is used as an input. FL_IN_FLOAT_AB = FL_IN_FLOAT_A | FL_IN_FLOAT_B, FL_IN_FLOAT_AC = FL_IN_FLOAT_A | FL_IN_FLOAT_C, FL_IN_FLOAT_ABC = FL_IN_FLOAT_A | FL_IN_FLOAT_B | FL_IN_FLOAT_C,