melonDS/src/ARMJIT_x64/ARMJIT_LoadStore.cpp

#include "ARMJIT_Compiler.h"

#include "../GPU.h"
#include "../Wifi.h"

using namespace Gen;

namespace ARMJIT
{

template <typename T>
int squeezePointer(T* ptr)
{
    int truncated = (int)((u64)ptr);
    assert((T*)((u64)truncated) == ptr);
    return truncated;
}

/*
    According to DeSmuME and my own research, approx. 99% (seriously, that's an empirical number)
    of all memory load and store instructions always access addresses in the same region as
    during the their first execution.

    I tried multiple optimisations, which would benefit from this behaviour
    (having fast paths for the first region, …), though none of them yielded a measureable
    improvement.
*/

/*
    address - ABI_PARAM1 (a.k.a. ECX = RSCRATCH3 on Windows)
    store value - ABI_PARAM2 (a.k.a. RDX = RSCRATCH2 on Windows)
    code cycles - ABI_PARAM3
*/

#define CALC_CYCLES_9(numC, numD, scratch) \
    LEA(32, scratch, MComplex(numD, numC, SCALE_1, -6)); \
    CMP(32, R(numC), R(numD)); \
    CMOVcc(32, numD, R(numC), CC_G); \
    CMP(32, R(numD), R(scratch)); \
    CMOVcc(32, scratch, R(numD), CC_G); \
    ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch));
#define CALC_CYCLES_7_DATA_MAIN_RAM(numC, numD, scratch) \
    if (codeMainRAM) \
    { \
        LEA(32, scratch, MRegSum(numD, numC)); \
        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \
    } \
    else \
    { \
        if (!store) \
            ADD(32, R(numC), Imm8(1)); \
        LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \
        CMP(32, R(numD), R(numC)); \
        CMOVcc(32, numC, R(numD), CC_G); \
        CMP(32, R(numC), R(scratch)); \
        CMOVcc(32, scratch, R(numC), CC_G); \
        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \
    }
#define CALC_CYCLES_7_DATA_NON_MAIN_RAM(numC, numD, scratch) \
    if (codeMainRAM) \
    { \
        if (!store) \
            ADD(32, R(numD), Imm8(1)); \
        LEA(32, scratch, MComplex(numD, numC, SCALE_1, -3)); \
        CMP(32, R(numD), R(numC)); \
        CMOVcc(32, numC, R(numD), CC_G); \
        CMP(32, R(numC), R(scratch)); \
        CMOVcc(32, scratch, R(numC), CC_G); \
        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \
    } \
    else \
    { \
        LEA(32, scratch, MComplex(numD, numC, SCALE_1, store ? 0 : 1)); \
        ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(scratch)); \
    }

void* Compiler::Gen_MemoryRoutine9(bool store, int size)
{
    u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0));
    AlignCode4();
    void* res = GetWritableCodePtr();

    MOV(32, R(RSCRATCH), R(ABI_PARAM1));
    SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase)));
    CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize)));
    FixupBranch insideDTCM = J_CC(CC_B);

    CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize)));
    FixupBranch insideITCM = J_CC(CC_B);

    // cycle counting!
    MOV(32, R(ABI_PARAM4), R(ABI_PARAM1));
    SHR(32, R(ABI_PARAM4), Imm8(12));
    MOVZX(32, 8, ABI_PARAM4, MComplex(RCPU, ABI_PARAM4, SCALE_4, offsetof(ARMv5, MemTimings) + (size == 32 ? 2 : 1)));
    CALC_CYCLES_9(ABI_PARAM3, ABI_PARAM4, RSCRATCH)

    if (store)
    {
        if (size > 8)
            AND(32, R(ABI_PARAM1), Imm32(addressMask));
        switch (size)
        {
        case 32: JMP((u8*)NDS::ARM9Write32, true); break;
        case 16: JMP((u8*)NDS::ARM9Write16, true); break;
        case 8: JMP((u8*)NDS::ARM9Write8, true); break;
        }
    }
    else
    {
        if (size == 32)
        {
            ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8);
            AND(32, R(ABI_PARAM1), Imm32(addressMask));
            // everything's already in the appropriate register
            ABI_CallFunction(NDS::ARM9Read32);
            ABI_PopRegistersAndAdjustStack({ECX}, 8);
            AND(32, R(ECX), Imm8(3));
            SHL(32, R(ECX), Imm8(3));
            ROR_(32, R(RSCRATCH), R(ECX));
            RET();
        }
        else if (size == 16)
        {
            AND(32, R(ABI_PARAM1), Imm32(addressMask));
            JMP((u8*)NDS::ARM9Read16, true);
        }
        else
            JMP((u8*)NDS::ARM9Read8, true);
    }

    SetJumpTarget(insideDTCM);
    ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3));
    AND(32, R(RSCRATCH), Imm32(0x3FFF & addressMask));
    if (store)
        MOV(size, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM2));
    else
    {
        MOVZX(32, size, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)));
        if (size == 32)
        {
            if (ABI_PARAM1 != ECX)
                MOV(32, R(ECX), R(ABI_PARAM1));
            AND(32, R(ECX), Imm8(3));
            SHL(32, R(ECX), Imm8(3));
            ROR_(32, R(RSCRATCH), R(ECX));
        }
    }
    RET();

    SetJumpTarget(insideITCM);
    ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM3));
    MOV(32, R(ABI_PARAM3), R(ABI_PARAM1)); // free up ECX
    AND(32, R(ABI_PARAM3), Imm32(0x7FFF & addressMask));
    if (store)
    {
        MOV(size, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM2));
        XOR(32, R(RSCRATCH), R(RSCRATCH));
        MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(RSCRATCH));
        if (size == 32)
            MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(RSCRATCH));
    }
    else
    {
        MOVZX(32, size, RSCRATCH, MComplex(RCPU, ABI_PARAM3, SCALE_1, offsetof(ARMv5, ITCM)));
        if (size == 32)
        {
            if (ABI_PARAM1 != ECX)
                MOV(32, R(ECX), R(ABI_PARAM1));
            AND(32, R(ECX), Imm8(3));
            SHL(32, R(ECX), Imm8(3));
            ROR_(32, R(RSCRATCH), R(ECX));
        }
    }
    RET();

    static_assert(RSCRATCH == EAX);

    return res;
}

void* Compiler::Gen_MemoryRoutine7(bool store, bool codeMainRAM, int size)
{
    u32 addressMask = ~(size == 32 ? 3 : (size == 16 ? 1 : 0));
    AlignCode4();
    void* res = GetWritableCodePtr();

    MOV(32, R(RSCRATCH), R(ABI_PARAM1));
    SHR(32, R(RSCRATCH), Imm8(15));
    MOVZX(32, 8, ABI_PARAM4, MScaled(RSCRATCH, SCALE_4, (size == 32 ? 2 : 0) + squeezePointer(NDS::ARM7MemTimings)));

    MOV(32, R(RSCRATCH), R(ABI_PARAM1));
    AND(32, R(RSCRATCH), Imm32(0xFF000000));
    CMP(32, R(RSCRATCH), Imm32(0x02000000));
    FixupBranch outsideMainRAM = J_CC(CC_NE);
    CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH)
    MOV(32, R(ABI_PARAM3), R(ABI_PARAM1));
    AND(32, R(ABI_PARAM3), Imm32((MAIN_RAM_SIZE - 1) & addressMask));
    if (store)
    {
        MOV(size, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)), R(ABI_PARAM2));
        XOR(32, R(RSCRATCH), R(RSCRATCH));
        MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM)), R(RSCRATCH));
        if (size == 32)
            MOV(64, MScaled(ABI_PARAM3, SCALE_4, squeezePointer(cache.MainRAM) + 8), R(RSCRATCH));
    }
    else
    {
        MOVZX(32, size, RSCRATCH, MDisp(ABI_PARAM3, squeezePointer(NDS::MainRAM)));
        if (size == 32)
        {
            if (ABI_PARAM1 != ECX)
                MOV(32, R(ECX), R(ABI_PARAM1));
            AND(32, R(ECX), Imm8(3));
            SHL(32, R(ECX), Imm8(3));
            ROR_(32, R(RSCRATCH), R(ECX));
        }
    }
    RET();

    SetJumpTarget(outsideMainRAM);
    CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM3, ABI_PARAM4, RSCRATCH)
    if (store)
    {
        if (size > 8)
            AND(32, R(ABI_PARAM1), Imm32(addressMask));
        switch (size)
        {
        case 32: JMP((u8*)NDS::ARM7Write32, true); break;
        case 16: JMP((u8*)NDS::ARM7Write16, true); break;
        case 8: JMP((u8*)NDS::ARM7Write8, true); break;
        }
    }
    else
    {
        if (size == 32)
        {
            ABI_PushRegistersAndAdjustStack({ABI_PARAM1}, 8);
            AND(32, R(ABI_PARAM1), Imm32(addressMask));
            ABI_CallFunction(NDS::ARM7Read32);
            ABI_PopRegistersAndAdjustStack({ECX}, 8);
            AND(32, R(ECX), Imm8(3));
            SHL(32, R(ECX), Imm8(3));
            ROR_(32, R(RSCRATCH), R(ECX));
            RET();
        }
        else if (size == 16)
        {
            AND(32, R(ABI_PARAM1), Imm32(addressMask));
            JMP((u8*)NDS::ARM7Read16, true);
        }
        else
            JMP((u8*)NDS::ARM7Read8, true);
    }

    return res;
}

#define MEMORY_SEQ_WHILE_COND \
        if (!store) \
            MOV(32, currentElement, R(EAX));\
        if (!preinc) \
            ADD(32, R(ABI_PARAM1), Imm8(4)); \
        \
        SUB(32, R(ABI_PARAM3), Imm8(1)); \
        J_CC(CC_NZ, repeat);

/*
    ABI_PARAM1 address
    ABI_PARAM2 address where registers are stored
    ABI_PARAM3 how many values to read/write
    ABI_PARAM4 code cycles

    Dolphin x64CodeEmitter is my favourite assembler
 */
void* Compiler::Gen_MemoryRoutineSeq9(bool store, bool preinc)
{
    const u8* zero = GetCodePtr();
    ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4));
    RET();

    void* res = (void*)GetWritableCodePtr();

    TEST(32, R(ABI_PARAM3), R(ABI_PARAM3));
    J_CC(CC_Z, zero);

    PUSH(ABI_PARAM3);
    PUSH(ABI_PARAM4); // we need you later

    const u8* repeat = GetCodePtr();

    if (preinc)
        ADD(32, R(ABI_PARAM1), Imm8(4));

    MOV(32, R(RSCRATCH), R(ABI_PARAM1));
    SUB(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMBase)));
    CMP(32, R(RSCRATCH), MDisp(RCPU, offsetof(ARMv5, DTCMSize)));
    FixupBranch insideDTCM = J_CC(CC_B);

    CMP(32, R(ABI_PARAM1), MDisp(RCPU, offsetof(ARMv5, ITCMSize)));
    FixupBranch insideITCM = J_CC(CC_B);

    OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8); // wasting stack space like a gangster

    ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8);
    AND(32, R(ABI_PARAM1), Imm8(~3));
    if (store)
    {
        MOV(32, R(ABI_PARAM2), currentElement);
        CALL((void*)NDS::ARM9Write32);
    }
    else
        CALL((void*)NDS::ARM9Read32);
    ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8);

    MEMORY_SEQ_WHILE_COND
    MOV(32, R(RSCRATCH), R(ABI_PARAM1));
    SHR(32, R(RSCRATCH), Imm8(12));
    MOVZX(32, 8, ABI_PARAM2, MComplex(RCPU, RSCRATCH, SCALE_4, 2 + offsetof(ARMv5, MemTimings)));
    MOVZX(32, 8, RSCRATCH, MComplex(RCPU, RSCRATCH, SCALE_4, 3 + offsetof(ARMv5, MemTimings)));

    FixupBranch finishIt1 = J();

    SetJumpTarget(insideDTCM);
    AND(32, R(RSCRATCH), Imm32(0x3FFF & ~3));
    if (store)
    {
        MOV(32, R(ABI_PARAM4), currentElement);
        MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)), R(ABI_PARAM4));
    }
    else
        MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, DTCM)));

    MEMORY_SEQ_WHILE_COND
    MOV(32, R(RSCRATCH), Imm32(1)); // sequential access time
    MOV(32, R(ABI_PARAM2), Imm32(1)); // non sequential
    FixupBranch finishIt2 = J();

    SetJumpTarget(insideITCM);
    MOV(32, R(RSCRATCH), R(ABI_PARAM1));
    AND(32, R(RSCRATCH), Imm32(0x7FFF & ~3));
    if (store)
    {
        MOV(32, R(ABI_PARAM4), currentElement);
        MOV(32, MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)), R(ABI_PARAM4));
        XOR(32, R(ABI_PARAM4), R(ABI_PARAM4));
        MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM)), R(ABI_PARAM4));
        MOV(64, MScaled(RSCRATCH, SCALE_4, squeezePointer(cache.ARM9_ITCM) + 8), R(ABI_PARAM4));
    }
    else
        MOV(32, R(RSCRATCH), MComplex(RCPU, RSCRATCH, SCALE_1, offsetof(ARMv5, ITCM)));

    MEMORY_SEQ_WHILE_COND
    MOV(32, R(RSCRATCH), Imm32(1));
    MOV(32, R(ABI_PARAM2), Imm32(1));

    SetJumpTarget(finishIt1);
    SetJumpTarget(finishIt2);

    POP(ABI_PARAM4);
    POP(ABI_PARAM3);

    CMP(32, R(ABI_PARAM3), Imm8(1));
    FixupBranch skipSequential = J_CC(CC_E);
    SUB(32, R(ABI_PARAM3), Imm8(1));
    IMUL(32, R(ABI_PARAM3));
    ADD(32, R(ABI_PARAM2), R(RSCRATCH));
    SetJumpTarget(skipSequential);

    CALC_CYCLES_9(ABI_PARAM4, ABI_PARAM2, RSCRATCH)
    RET();

    return res;
}

void* Compiler::Gen_MemoryRoutineSeq7(bool store, bool preinc, bool codeMainRAM)
{
    const u8* zero = GetCodePtr();
    ADD(32, MDisp(RCPU, offsetof(ARM, Cycles)), R(ABI_PARAM4));
    RET();

    void* res = (void*)GetWritableCodePtr();

    TEST(32, R(ABI_PARAM3), R(ABI_PARAM3));
    J_CC(CC_Z, zero);

    PUSH(ABI_PARAM3);
    PUSH(ABI_PARAM4); // we need you later

    const u8* repeat = GetCodePtr();

    if (preinc)
        ADD(32, R(ABI_PARAM1), Imm8(4));

    OpArg currentElement = MComplex(ABI_PARAM2, ABI_PARAM3, SCALE_8, -8);

    ABI_PushRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8);
    AND(32, R(ABI_PARAM1), Imm8(~3));
    if (store)
    {
        MOV(32, R(ABI_PARAM2), currentElement);
        CALL((void*)NDS::ARM7Write32);
    }
    else
        CALL((void*)NDS::ARM7Read32);
    ABI_PopRegistersAndAdjustStack({ABI_PARAM1, ABI_PARAM2, ABI_PARAM3}, 8);

    MEMORY_SEQ_WHILE_COND
    MOV(32, R(RSCRATCH), R(ABI_PARAM1));
    SHR(32, R(RSCRATCH), Imm8(15));
    MOVZX(32, 8, ABI_PARAM2, MScaled(RSCRATCH, SCALE_4, 2 + squeezePointer(NDS::ARM7MemTimings)));
    MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_4, 3 + squeezePointer(NDS::ARM7MemTimings)));

    POP(ABI_PARAM4);
    POP(ABI_PARAM3);

    CMP(32, R(ABI_PARAM3), Imm8(1));
    FixupBranch skipSequential = J_CC(CC_E);
    SUB(32, R(ABI_PARAM3), Imm8(1));
    IMUL(32, R(ABI_PARAM3));
    ADD(32, R(ABI_PARAM2), R(RSCRATCH));
    SetJumpTarget(skipSequential);

    MOV(32, R(RSCRATCH), R(ABI_PARAM1));
    AND(32, R(RSCRATCH), Imm32(0xFF000000));
    CMP(32, R(RSCRATCH), Imm32(0x02000000));
    FixupBranch outsideMainRAM = J_CC(CC_NE);
    CALC_CYCLES_7_DATA_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH)
    RET();

    SetJumpTarget(outsideMainRAM);
    CALC_CYCLES_7_DATA_NON_MAIN_RAM(ABI_PARAM4, ABI_PARAM2, RSCRATCH)
    RET();

    return res;
}

#undef CALC_CYCLES_9
#undef MEMORY_SEQ_WHILE_COND

void Compiler::Comp_MemAccess(OpArg rd, bool signExtend, bool store, int size)
{
    if (store)
        MOV(32, R(ABI_PARAM2), rd);
    u32 cycles = Num
        ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
        : (R15 & 0x2 ? 0 : CurInstr.CodeCycles);
    MOV(32, R(ABI_PARAM3), Imm32(cycles));
    CALL(Num == 0
        ? MemoryFuncs9[size >> 4][store]
        : MemoryFuncs7[size >> 4][store][CodeRegion == 0x02]);

    if (!store)
    {
        if (signExtend)
            MOVSX(32, size, rd.GetSimpleReg(), R(RSCRATCH));
        else
            MOVZX(32, size, rd.GetSimpleReg(), R(RSCRATCH));
    }
}

s32 Compiler::Comp_MemAccessBlock(OpArg rb, BitSet16 regs, bool store, bool preinc, bool decrement, bool usermode)
{
    int regsCount = regs.Count();

    const u8 userModeOffsets[] =
    {
        offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]),
        offsetof(ARM, R[12]), offsetof(ARM, R[13]), offsetof(ARM, R[14]), 0,

        offsetof(ARM, R_FIQ[0]), offsetof(ARM, R_FIQ[1]), offsetof(ARM, R_FIQ[2]), offsetof(ARM, R_FIQ[3]),
        offsetof(ARM, R_FIQ[4]), offsetof(ARM, R_FIQ[5]), offsetof(ARM, R_FIQ[6]), 0,

        offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]),
        offsetof(ARM, R[12]), offsetof(ARM, R_IRQ[13]), offsetof(ARM, R_IRQ[14]), 0,

        offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]),
        offsetof(ARM, R[12]), offsetof(ARM, R_SVC[13]), offsetof(ARM, R_SVC[14]), 0,

        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,

        offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]),
        offsetof(ARM, R[12]), offsetof(ARM, R_ABT[13]), offsetof(ARM, R_ABT[14]), 0,

        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,

        offsetof(ARM, R[8]), offsetof(ARM, R[9]), offsetof(ARM, R[10]), offsetof(ARM, R[11]),
        offsetof(ARM, R[12]), offsetof(ARM, R_UND[13]), offsetof(ARM, R_UND[14]), 0,
    };

    if (decrement)
    {
        MOV_sum(32, ABI_PARAM1, rb, Imm32(-regsCount * 4));
        preinc = !preinc;
    }
    else
        MOV(32, R(ABI_PARAM1), rb);

    MOV(32, R(ABI_PARAM3), Imm32(regsCount));
        u32 cycles = Num
            ? NDS::ARM7MemTimings[CurInstr.CodeCycles][Thumb ? 0 : 2]
            : (R15 & 0x2 ? 0 : CurInstr.CodeCycles);
    MOV(32, R(ABI_PARAM4), Imm32(cycles));
    if (!store)
    {
        SUB(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8));
        MOV(64, R(ABI_PARAM2), R(RSP));

        CALL(Num == 0
            ? MemoryFuncsSeq9[0][preinc]
            : MemoryFuncsSeq7[0][preinc][CodeRegion == 0x02]);

        for (int reg = 15; reg >= 0; reg--)
        {
            if (regs[reg])
            {
                if (usermode && reg >= 8 && reg < 15)
                {
                    MOV(32, R(RSCRATCH2), R(RCPSR));
                    AND(32, R(RSCRATCH2), Imm8(0x1F));
                    // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great!
                    MOVZX(32, 8, RSCRATCH2, MScaled(RSCRATCH2, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8)));
                    POP(RSCRATCH);
                    MOV(32, MRegSum(RCPU, RSCRATCH2), R(RSCRATCH));
                }
                else if (RegCache.Mapping[reg] == INVALID_REG)
                {
                    assert(reg != 15);

                    POP(RSCRATCH);
                    SaveReg(reg, RSCRATCH);
                }
                else
                {
                    if (reg != 15)
                        RegCache.DirtyRegs |= (1 << reg);
                    POP(MapReg(reg).GetSimpleReg());
                }
            }
        }

        if (regs[15])
        {
            if (Num == 1)
                OR(32, MapReg(15), Imm8(1));
            Comp_JumpTo(MapReg(15).GetSimpleReg(), usermode);
        }
    }
    else
    {
        for (int reg : regs)
        {
            if (usermode && reg >= 8 && reg < 15)
            {
                MOV(32, R(RSCRATCH), R(RCPSR));
                AND(32, R(RSCRATCH), Imm8(0x1F));
                // (RSCRATCH2 - 0x11) * 8 + squeezePointer(userModeOffsets) + (reg - 8), algebra is great!
                MOVZX(32, 8, RSCRATCH, MScaled(RSCRATCH, SCALE_8, squeezePointer(userModeOffsets) - 0x11 * 8 + (reg - 8)));
                MOV(32, R(RSCRATCH), MRegSum(RCPU, RSCRATCH));
                PUSH(RSCRATCH);
            }
            else if (RegCache.Mapping[reg] == INVALID_REG)
            {
                LoadReg(reg, RSCRATCH);
                PUSH(RSCRATCH);
            }
            else
                PUSH(MapReg(reg).GetSimpleReg());
        }
        MOV(64, R(ABI_PARAM2), R(RSP));

        CALL(Num == 0
            ? MemoryFuncsSeq9[1][preinc]
            : MemoryFuncsSeq7[1][preinc][CodeRegion == 0x02]);

        ADD(32, R(RSP), regsCount < 16 ? Imm8(regsCount * 8) : Imm32(regsCount * 8));
    }

    return (regsCount * 4) * (decrement ? -1 : 1);
}

OpArg Compiler::A_Comp_GetMemWBOffset()
{
    if (!(CurInstr.Instr & (1 << 25)))
    {
        u32 imm = CurInstr.Instr & 0xFFF;
        return Imm32(imm);
    }
    else
    {
        int op = (CurInstr.Instr >> 5) & 0x3;
        int amount = (CurInstr.Instr >> 7) & 0x1F;
        OpArg rm = MapReg(CurInstr.A_Reg(0));
        bool carryUsed;

        return Comp_RegShiftImm(op, amount, rm, false, carryUsed);
    }
}

void Compiler::A_Comp_MemWB()
{
    OpArg rn = MapReg(CurInstr.A_Reg(16));
    OpArg rd = MapReg(CurInstr.A_Reg(12));
    bool load = CurInstr.Instr & (1 << 20);
    bool byte = CurInstr.Instr & (1 << 22);
    int size = byte ? 8 : 32;

    if (CurInstr.Instr & (1 << 24))
    {
        OpArg offset = A_Comp_GetMemWBOffset();
        if (CurInstr.Instr & (1 << 23))
            MOV_sum(32, ABI_PARAM1, rn, offset);
        else
        {
            MOV(32, R(ABI_PARAM1), rn);
            SUB(32, R(ABI_PARAM1), offset);
        }

        if (CurInstr.Instr & (1 << 21))
            MOV(32, rn, R(ABI_PARAM1));
    }
    else
        MOV(32, R(ABI_PARAM1), rn);

    if (!(CurInstr.Instr & (1 << 24)))
    {
        OpArg offset = A_Comp_GetMemWBOffset();

        if (CurInstr.Instr & (1 << 23))
            ADD(32, rn, offset);
        else
            SUB(32, rn, offset);
    }

    Comp_MemAccess(rd, false, !load, byte ? 8 : 32);
    if (load && CurInstr.A_Reg(12) == 15)
    {
        if (byte)
            printf("!!! LDRB PC %08X\n", R15);
        else
        {
            if (Num == 1)
                AND(32, rd, Imm8(0xFE)); // immediate is sign extended
            Comp_JumpTo(rd.GetSimpleReg());
        }
    }
}

void Compiler::A_Comp_MemHalf()
{
    OpArg rn = MapReg(CurInstr.A_Reg(16));
    OpArg rd = MapReg(CurInstr.A_Reg(12));

    OpArg offset = CurInstr.Instr & (1 << 22)
        ? Imm32(CurInstr.Instr & 0xF | ((CurInstr.Instr >> 4) & 0xF0))
        : MapReg(CurInstr.A_Reg(0));

    int op = (CurInstr.Instr >> 5) & 0x3;
    bool load = CurInstr.Instr & (1 << 20);

    bool signExtend = false;
    int size;
    if (!load)
    {
        size = op == 1 ? 16 : 32;
        load = op == 2;
    }
    else if (load)
    {
        size = op == 2 ? 8 : 16;
        signExtend = op > 1;
    }

    if (size == 32 && Num == 1)
        return; // NOP

    if (CurInstr.Instr & (1 << 24))
    {
        if (CurInstr.Instr & (1 << 23))
            MOV_sum(32, ABI_PARAM1, rn, offset);
        else
        {
            MOV(32, R(ABI_PARAM1), rn);
            SUB(32, R(ABI_PARAM1), offset);
        }

        if (CurInstr.Instr & (1 << 21))
            MOV(32, rn, R(ABI_PARAM1));
    }
    else
        MOV(32, R(ABI_PARAM1), rn);

    if (!(CurInstr.Instr & (1 << 24)))
    {
        if (CurInstr.Instr & (1 << 23))
            ADD(32, rn, offset);
        else
            SUB(32, rn, offset);
    }

    Comp_MemAccess(rd, signExtend, !load, size);

    if (load && CurInstr.A_Reg(12) == 15)
        printf("!!! MemHalf op PC %08X\n", R15);;
}

void Compiler::T_Comp_MemReg()
{
    OpArg rd = MapReg(CurInstr.T_Reg(0));
    OpArg rb = MapReg(CurInstr.T_Reg(3));
    OpArg ro = MapReg(CurInstr.T_Reg(6));

    int op = (CurInstr.Instr >> 10) & 0x3;
    bool load = op & 0x2;
    bool byte = op & 0x1;

    MOV_sum(32, ABI_PARAM1, rb, ro);

    Comp_MemAccess(rd, false, !load, byte ? 8 : 32);
}

void Compiler::A_Comp_LDM_STM()
{
    BitSet16 regs(CurInstr.Instr & 0xFFFF);

    bool load = (CurInstr.Instr >> 20) & 1;
    bool pre = (CurInstr.Instr >> 24) & 1;
    bool add = (CurInstr.Instr >> 23) & 1;
    bool writeback = (CurInstr.Instr >> 21) & 1;
    bool usermode = (CurInstr.Instr >> 22) & 1;

    OpArg rn = MapReg(CurInstr.A_Reg(16));

    s32 offset = Comp_MemAccessBlock(rn, regs, !load, pre, !add, false);

    if (writeback)
        ADD(32, rn, offset >= INT8_MIN && offset < INT8_MAX ? Imm8(offset) : Imm32(offset));
}

void Compiler::T_Comp_MemImm()
{
    OpArg rd = MapReg(CurInstr.T_Reg(0));
    OpArg rb = MapReg(CurInstr.T_Reg(3));

    int op = (CurInstr.Instr >> 11) & 0x3;
    bool load = op & 0x1;
    bool byte = op & 0x2;
    u32 offset = ((CurInstr.Instr >> 6) & 0x1F) * (byte ? 1 : 4);

    LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset));

    Comp_MemAccess(rd, false, !load, byte ? 8 : 32);
}

void Compiler::T_Comp_MemRegHalf()
{
    OpArg rd = MapReg(CurInstr.T_Reg(0));
    OpArg rb = MapReg(CurInstr.T_Reg(3));
    OpArg ro = MapReg(CurInstr.T_Reg(6));

    int op = (CurInstr.Instr >> 10) & 0x3;
    bool load = op != 0;
    int size = op != 1 ? 16 : 8;
    bool signExtend = op & 1;

    MOV_sum(32, ABI_PARAM1, rb, ro);

    Comp_MemAccess(rd, signExtend, !load, size);
}

void Compiler::T_Comp_MemImmHalf()
{
    OpArg rd = MapReg(CurInstr.T_Reg(0));
    OpArg rb = MapReg(CurInstr.T_Reg(3));

    u32 offset = (CurInstr.Instr >> 5) & 0x3E;
    bool load = CurInstr.Instr & (1 << 11);

    LEA(32, ABI_PARAM1, MDisp(rb.GetSimpleReg(), offset));

    Comp_MemAccess(rd, false, !load, 16);
}

void Compiler::T_Comp_LoadPCRel()
{
    OpArg rd = MapReg(CurInstr.T_Reg(8));
    u32 addr = (R15 & ~0x2) + ((CurInstr.Instr & 0xFF) << 2);

    // hopefully this doesn't break
    u32 val; CurCPU->DataRead32(addr, &val);
    MOV(32, rd, Imm32(val));
}

void Compiler::T_Comp_MemSPRel()
{
    u32 offset = (CurInstr.Instr & 0xFF) * 4;
    OpArg rd = MapReg(CurInstr.T_Reg(8));
    bool load = CurInstr.Instr & (1 << 11);

    LEA(32, ABI_PARAM1, MDisp(MapReg(13).GetSimpleReg(), offset));

    Comp_MemAccess(rd, false, !load, 32);
}

void Compiler::T_Comp_PUSH_POP()
{
    bool load = CurInstr.Instr & (1 << 11);
    BitSet16 regs(CurInstr.Instr & 0xFF);
    if (CurInstr.Instr & (1 << 8))
    {
        if (load)
            regs[15] = true;
        else
            regs[14] = true;
    }

    OpArg sp = MapReg(13);

    s32 offset = Comp_MemAccessBlock(sp, regs, !load, !load, !load, false);

    ADD(32, sp, Imm8(offset)); // offset will be always be in range since PUSH accesses 9 regs max
}

void Compiler::T_Comp_LDMIA_STMIA()
{
    BitSet16 regs(CurInstr.Instr & 0xFF);
    OpArg rb = MapReg(CurInstr.T_Reg(8));
    bool load = CurInstr.Instr & (1 << 11);

    s32 offset = Comp_MemAccessBlock(rb, regs, !load, false, false, false);

    if (!load || !regs[CurInstr.T_Reg(8)])
        ADD(32, rb, Imm8(offset));
}

}