// Copyright 2015 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#pragma once

#include <array>
#include <bit>
#include <cstddef>
#include <cstring>
#include <functional>
#include <optional>
#include <type_traits>
#include <utility>

#include "Common/ArmCommon.h"
#include "Common/Assert.h"
#include "Common/BitSet.h"
#include "Common/BitUtils.h"
#include "Common/CodeBlock.h"
#include "Common/Common.h"
#include "Common/CommonTypes.h"
#include "Common/MathUtil.h"
#include "Common/SmallVector.h"

namespace Arm64Gen
{
// X30 serves a dual purpose as a link register
// Encoded as <type><reg number>
// Types:
// 000 - 32bit GPR
// 001 - 64bit GPR
// 010 - VFP single precision
// 100 - VFP double precision
// 110 - VFP quad precision
enum class ARM64Reg
{
  // 32bit registers
  W0 = 0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, W16, W17, W18, W19,
  W20, W21, W22, W23, W24, W25, W26, W27, W28, W29, W30,

  WSP,  // 32bit stack pointer

  // 64bit registers
  X0 = 0x20, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X18, X19,
  X20, X21, X22, X23, X24, X25, X26, X27, X28, X29, X30,

  SP,  // 64bit stack pointer

  // VFP single precision registers
  S0 = 0x40, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19,
  S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31,

  // VFP Double Precision registers
  D0 = 0x80, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15, D16, D17, D18, D19,
  D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31,

  // ASIMD Quad-Word registers
  Q0 = 0xC0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15, Q16, Q17, Q18, Q19,
  Q20, Q21, Q22, Q23, Q24, Q25, Q26, Q27, Q28, Q29, Q30, Q31,

  // For PRFM (prefetch memory) encoding
  // This is encoded in the Rt register
  // Data preload
  PLDL1KEEP = 0, PLDL1STRM, PLDL2KEEP, PLDL2STRM, PLDL3KEEP, PLDL3STRM,
  // Instruction preload
  PLIL1KEEP = 8, PLIL1STRM, PLIL2KEEP, PLIL2STRM, PLIL3KEEP, PLIL3STRM,
  // Prepare for store
  PLTL1KEEP = 16, PLTL1STRM, PLTL2KEEP, PLTL2STRM, PLTL3KEEP, PLTL3STRM,

  WZR = WSP,
  ZR = SP,

  INVALID_REG = -1,
};

constexpr int operator&(const ARM64Reg& reg, const int mask)
{
  return static_cast<int>(reg) & mask;
}
constexpr int operator|(const ARM64Reg& reg, const int mask)
{
  return static_cast<int>(reg) | mask;
}
constexpr ARM64Reg operator+(const ARM64Reg& reg, const int addend)
{
  return static_cast<ARM64Reg>(static_cast<int>(reg) + addend);
}

constexpr bool Is64Bit(ARM64Reg reg)
{
  return (reg & 0x20) != 0;
}
constexpr bool IsSingle(ARM64Reg reg)
{
  return (reg & 0xC0) == 0x40;
}
constexpr bool IsDouble(ARM64Reg reg)
{
  return (reg & 0xC0) == 0x80;
}
constexpr bool IsScalar(ARM64Reg reg)
{
  return IsSingle(reg) || IsDouble(reg);
}
constexpr bool IsQuad(ARM64Reg reg)
{
  return (reg & 0xC0) == 0xC0;
}
constexpr bool IsVector(ARM64Reg reg)
{
  return (reg & 0xC0) != 0;
}
constexpr bool IsGPR(ARM64Reg reg)
{
  return static_cast<int>(reg) < 0x40;
}

constexpr int DecodeReg(ARM64Reg reg)
{
  return reg & 0x1F;
}
constexpr ARM64Reg EncodeRegTo32(ARM64Reg reg)
{
  return static_cast<ARM64Reg>(DecodeReg(reg));
}
constexpr ARM64Reg EncodeRegTo64(ARM64Reg reg)
{
  return static_cast<ARM64Reg>(reg | 0x20);
}
constexpr ARM64Reg EncodeRegToSingle(ARM64Reg reg)
{
  return static_cast<ARM64Reg>(ARM64Reg::S0 | DecodeReg(reg));
}
constexpr ARM64Reg EncodeRegToDouble(ARM64Reg reg)
{
  return static_cast<ARM64Reg>((reg & ~0xC0) | 0x80);
}
constexpr ARM64Reg EncodeRegToQuad(ARM64Reg reg)
{
  return static_cast<ARM64Reg>(reg | 0xC0);
}
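
// Example (illustrative): sanity checks showing how the encoding helpers above
// compose under the <type><reg number> bit layout:
static_assert(EncodeRegTo64(ARM64Reg::W5) == ARM64Reg::X5);
static_assert(EncodeRegTo32(ARM64Reg::X5) == ARM64Reg::W5);
static_assert(DecodeReg(ARM64Reg::D7) == 7);
static_assert(Is64Bit(ARM64Reg::X0) && !Is64Bit(ARM64Reg::W0));
static_assert(IsQuad(ARM64Reg::Q31) && IsGPR(ARM64Reg::SP));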

enum class ShiftType
{
  // Logical Shift Left
  LSL = 0,
  // Logical Shift Right
  LSR = 1,
  // Arithmetic Shift Right
  ASR = 2,
  // Rotate Right
  ROR = 3,
};

enum class ExtendSpecifier
{
  UXTB = 0x0,
  UXTH = 0x1,
  UXTW = 0x2, /* Also LSL on 32bit width */
  UXTX = 0x3, /* Also LSL on 64bit width */
  SXTB = 0x4,
  SXTH = 0x5,
  SXTW = 0x6,
  SXTX = 0x7,
};

enum class IndexType
{
  Unsigned,
  Post,
  Pre,
  Signed,  // used in LDP/STP
};

enum class ShiftAmount
{
  Shift0,
  Shift16,
  Shift32,
  Shift48,
};

enum class RoundingMode
{
  A,  // round to nearest, ties to away
  M,  // round towards -inf
  N,  // round to nearest, ties to even
  P,  // round towards +inf
  Z,  // round towards zero
};

enum class GPRSize
{
  B32,
  B64,
};

struct FixupBranch
{
  enum class Type : u32
  {
    CBZ,
    CBNZ,
    BConditional,
    TBZ,
    TBNZ,
    B,
    BL,
  };

  u8* ptr;
  Type type;
  // Used with B.cond
  CCFlags cond;
  // Used with TBZ/TBNZ
  u8 bit;
  // Used with Test/Compare and Branch
  ARM64Reg reg;
};

enum class PStateField
{
  SPSel = 0,
  DAIFSet,
  DAIFClr,
  NZCV,  // The only system registers accessible from EL0 (user space)
  PMCR_EL0,
  PMCCNTR_EL0,
  FPCR = 0x340,
  FPSR = 0x341,
};

enum class SystemHint
{
  NOP,
  YIELD,
  WFE,
  WFI,
  SEV,
  SEVL,
};

enum class BarrierType
{
  OSHLD = 1,
  OSHST = 2,
  OSH = 3,
  NSHLD = 5,
  NSHST = 6,
  NSH = 7,
  ISHLD = 9,
  ISHST = 10,
  ISH = 11,
  LD = 13,
  ST = 14,
  SY = 15,
};

class ArithOption
{
private:
  enum class WidthSpecifier
  {
    Default,
    Width32Bit,
    Width64Bit,
  };

  enum class TypeSpecifier
  {
    ExtendedReg,
    Immediate,
    ShiftedReg,
  };

  ARM64Reg m_destReg;
  WidthSpecifier m_width;
  ExtendSpecifier m_extend;
  TypeSpecifier m_type;
  ShiftType m_shifttype;
  u32 m_shift;

public:
  ArithOption(ARM64Reg Rd, bool index = false)
  {
    // Indexed registers are a feature of AArch64 load/store instructions that take a
    // register offset: the offset register can be used as an index. When indexing, the
    // offset register is shifted left so that we index at intervals of the access size:
    // 8-bit:  Index does nothing
    // 16-bit: Index LSL 1
    // 32-bit: Index LSL 2
    // 64-bit: Index LSL 3
    if (index)
      m_shift = 4;
    else
      m_shift = 0;

    m_destReg = Rd;
    m_type = TypeSpecifier::ExtendedReg;
    if (Is64Bit(Rd))
    {
      m_width = WidthSpecifier::Width64Bit;
      m_extend = ExtendSpecifier::UXTX;
    }
    else
    {
      m_width = WidthSpecifier::Width32Bit;
      m_extend = ExtendSpecifier::UXTW;
    }
    m_shifttype = ShiftType::LSL;
  }
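
  // Example (illustrative): for a 64-bit load such as LDR(Xt, Xn, ArithOption(Xm, true)),
  // the index register is scaled by the access size, so the address is Xn + (Xm << 3).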

  ArithOption(ARM64Reg Rd, ExtendSpecifier extend_type, u32 shift = 0)
  {
    m_destReg = Rd;
    m_width = Is64Bit(Rd) ? WidthSpecifier::Width64Bit : WidthSpecifier::Width32Bit;
    m_extend = extend_type;
    m_type = TypeSpecifier::ExtendedReg;
    m_shifttype = ShiftType::LSL;
    m_shift = shift;
  }

  ArithOption(ARM64Reg Rd, ShiftType shift_type, u32 shift)
  {
    m_destReg = Rd;
    m_shift = shift;
    m_shifttype = shift_type;
    m_type = TypeSpecifier::ShiftedReg;
    if (Is64Bit(Rd))
    {
      m_width = WidthSpecifier::Width64Bit;
      if (shift == 64)
        m_shift = 0;
      m_extend = ExtendSpecifier::UXTX;
    }
    else
    {
      m_width = WidthSpecifier::Width32Bit;
      if (shift == 32)
        m_shift = 0;
      m_extend = ExtendSpecifier::UXTW;
    }
  }

  ARM64Reg GetReg() const { return m_destReg; }
  u32 GetData() const
  {
    switch (m_type)
    {
    case TypeSpecifier::ExtendedReg:
      return (static_cast<u32>(m_extend) << 13) | (m_shift << 10);
    case TypeSpecifier::ShiftedReg:
      return (static_cast<u32>(m_shifttype) << 22) | (m_shift << 10);
    default:
      DEBUG_ASSERT_MSG(DYNA_REC, false, "Invalid type in GetData");
      break;
    }
    return 0;
  }

  bool IsExtended() const { return m_type == TypeSpecifier::ExtendedReg; }
};

struct LogicalImm
{
  constexpr LogicalImm() {}
  constexpr LogicalImm(u8 r_, u8 s_, bool n_) : r(r_), s(s_), n(n_), valid(true) {}

  constexpr LogicalImm(u64 value, GPRSize size)
  {
    // Logical immediates are encoded using parameters n, imm_s and imm_r using
    // the following table:
    //
    //    N   imms    immr    size        S             R
    //    1  ssssss  rrrrrr    64    UInt(ssssss)  UInt(rrrrrr)
    //    0  0sssss  xrrrrr    32    UInt(sssss)   UInt(rrrrr)
    //    0  10ssss  xxrrrr    16    UInt(ssss)    UInt(rrrr)
    //    0  110sss  xxxrrr     8    UInt(sss)     UInt(rrr)
    //    0  1110ss  xxxxrr     4    UInt(ss)      UInt(rr)
    //    0  11110s  xxxxxr     2    UInt(s)       UInt(r)
    //    (s bits must not be all set)
    //
    // A pattern is constructed of size bits, where the least significant S+1 bits
    // are set. The pattern is rotated right by R, and repeated across a 32 or
    // 64-bit value, depending on destination register width.

    if (size == GPRSize::B32)
    {
      // To handle 32-bit logical immediates, the very easiest thing is to repeat
      // the input value twice to make a 64-bit word. The correct encoding of that
      // as a logical immediate will also be the correct encoding of the 32-bit
      // value.
      value = (value << 32) | (value & 0xFFFFFFFF);
    }

    if (value == 0 || (~value) == 0)
    {
      valid = false;
      return;
    }

    // Normalize value, rotating it such that the LSB is 1:
    // If LSB is already one, we mask away the trailing sequence of ones and
    // pick the next sequence of ones. This ensures we get a complete element
    // that has not been cut-in-half due to rotation across the word boundary.
    const int rotation = std::countr_zero(value & (value + 1));
    const u64 normalized = std::rotr(value, rotation);

    const int element_size = std::countr_zero(normalized & (normalized + 1));
    const int ones = std::countr_one(normalized);

    // Check the value is repeating; also ensures element size is a power of two.
    if (std::rotr(value, element_size) != value)
    {
      valid = false;
      return;
    }

    // Now we're done. We just have to encode the S output in such a way that
    // it gives both the number of set bits and the length of the repeated
    // segment.
    r = static_cast<u8>((element_size - rotation) & (element_size - 1));
    s = static_cast<u8>((((~element_size + 1) << 1) | (ones - 1)) & 0x3f);
    n = Common::ExtractBit<6>(element_size);

    valid = true;
  }

  constexpr operator bool() const { return valid; }

  u8 r = 0;
  u8 s = 0;
  bool n = false;
  bool valid = false;
};
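
// Example (illustrative): 0x0F0F0F0F0F0F0F0F repeats an 8-bit element with four
// contiguous set bits, so per the table above it encodes as N=0, immr=0,
// imms=0b110011; all-zero and all-one values have no encoding.
static_assert(LogicalImm(0x0F0F0F0F0F0F0F0F, GPRSize::B64).valid);
static_assert(LogicalImm(0x0F0F0F0F0F0F0F0F, GPRSize::B64).s == 0x33);
static_assert(!LogicalImm(0, GPRSize::B64).valid);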

class ARM64XEmitter
{
  friend class ARM64FloatEmitter;

private:
  struct RegisterMove
  {
    ARM64Reg dst;
    ARM64Reg src;
  };

  // Pointer to memory where code will be emitted to.
  u8* m_code = nullptr;

  // Pointer past the end of the memory region we're allowed to emit to.
  // Writes that would reach this memory are refused and will set the m_write_failed flag instead.
  u8* m_code_end = nullptr;

  u8* m_lastCacheFlushEnd = nullptr;

  // Set to true when a write request happens that would write past m_code_end.
  // Must be cleared with SetCodePtr() afterwards.
  bool m_write_failed = false;

  void AddImmediate(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool shift, bool negative, bool flags);
  void EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr);
  void EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr);
  void EncodeUnconditionalBranchInst(u32 op, const void* ptr);
  void EncodeUnconditionalBranchInst(u32 opc, u32 op2, u32 op3, u32 op4, ARM64Reg Rn);
  void EncodeExceptionInst(u32 instenc, u32 imm);
  void EncodeSystemInst(u32 op0, u32 op1, u32 CRn, u32 CRm, u32 op2, ARM64Reg Rt);
  void EncodeArithmeticInst(u32 instenc, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm,
                            ArithOption Option);
  void EncodeArithmeticCarryInst(u32 op, bool flags, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EncodeCondCompareImmInst(u32 op, ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
  void EncodeCondCompareRegInst(u32 op, ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
  void EncodeCondSelectInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
  void EncodeData1SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn);
  void EncodeData2SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EncodeData3SrcInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void EncodeLogicalInst(u32 instenc, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void EncodeLoadRegisterInst(u32 bitop, ARM64Reg Rt, u32 imm);
  void EncodeLoadStoreExcInst(u32 instenc, ARM64Reg Rs, ARM64Reg Rt2, ARM64Reg Rn, ARM64Reg Rt);
  void EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
  void EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm, u8 size);
  void EncodeMOVWideInst(u32 op, ARM64Reg Rd, u32 imm, ShiftAmount pos);
  void EncodeBitfieldMOVInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
  void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd);
  void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, LogicalImm imm);
  void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2,
                           ARM64Reg Rn, s32 imm);
  void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
  void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

  [[nodiscard]] FixupBranch WriteFixupBranch();

  // This function solves the "parallel moves" problem common in compilers.
  // The arguments are mutated!
  void ParallelMoves(RegisterMove* begin, RegisterMove* end,
                     std::array<u8, 32>* source_gpr_usages);
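
  // Example (illustrative): the moves {X0 -> X1, X1 -> X0} cannot be emitted as two
  // plain MOVs; the cycle must be broken (e.g. via a scratch register) before either
  // source is clobbered. ParallelMoves orders and breaks such dependencies.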

  template <typename T>
  void MOVI2RImpl(ARM64Reg Rd, T imm);

protected:
  void Write32(u32 value);

public:
  ARM64XEmitter() = default;
  ARM64XEmitter(u8* code, u8* code_end)
      : m_code(code), m_code_end(code_end), m_lastCacheFlushEnd(code)
  {
  }

  virtual ~ARM64XEmitter() {}

  void SetCodePtr(u8* ptr, u8* end, bool write_failed = false);
  void SetCodePtrUnsafe(u8* ptr, u8* end, bool write_failed = false);
  const u8* GetCodePtr() const;
  u8* GetWritableCodePtr();
  const u8* GetCodeEnd() const;
  u8* GetWritableCodeEnd();
  void ReserveCodeSpace(u32 bytes);
  u8* AlignCode16();
  u8* AlignCodePage();
  void FlushIcache();
  void FlushIcacheSection(u8* start, u8* end);

  // Should be checked after a block of code has been generated to see if the code has been
  // successfully written to memory. Do not call the generated code when this returns true!
  bool HasWriteFailed() const { return m_write_failed; }

  // FixupBranch branching
  void SetJumpTarget(FixupBranch const& branch);
  [[nodiscard]] FixupBranch CBZ(ARM64Reg Rt);
  [[nodiscard]] FixupBranch CBNZ(ARM64Reg Rt);
  [[nodiscard]] FixupBranch B(CCFlags cond);
  [[nodiscard]] FixupBranch TBZ(ARM64Reg Rt, u8 bit);
  [[nodiscard]] FixupBranch TBNZ(ARM64Reg Rt, u8 bit);
  [[nodiscard]] FixupBranch B();
  [[nodiscard]] FixupBranch BL();
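
  // Example (illustrative) forward-branch pattern:
  //   FixupBranch skip = CBZ(ARM64Reg::W0);  // target unknown at this point
  //   ...                                    // code executed when W0 != 0
  //   SetJumpTarget(skip);                   // patch the branch to land here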

  // Compare and Branch
  void CBZ(ARM64Reg Rt, const void* ptr);
  void CBNZ(ARM64Reg Rt, const void* ptr);

  // Conditional Branch
  void B(CCFlags cond, const void* ptr);

  // Test and Branch
  void TBZ(ARM64Reg Rt, u8 bits, const void* ptr);
  void TBNZ(ARM64Reg Rt, u8 bits, const void* ptr);

  // Unconditional Branch
  void B(const void* ptr);
  void BL(const void* ptr);

  // Unconditional Branch (register)
  void BR(ARM64Reg Rn);
  void BLR(ARM64Reg Rn);
  void RET(ARM64Reg Rn = ARM64Reg::X30);
  void ERET();
  void DRPS();

  // Exception generation
  void SVC(u32 imm);
  void HVC(u32 imm);
  void SMC(u32 imm);
  void BRK(u32 imm);
  void HLT(u32 imm);
  void DCPS1(u32 imm);
  void DCPS2(u32 imm);
  void DCPS3(u32 imm);

  // System
  void _MSR(PStateField field, u8 imm);
  void _MSR(PStateField field, ARM64Reg Rt);
  void MRS(ARM64Reg Rt, PStateField field);
  void CNTVCT(ARM64Reg Rt);

  void HINT(SystemHint op);
  void NOP() { HINT(SystemHint::NOP); }
  void SEV() { HINT(SystemHint::SEV); }
  void SEVL() { HINT(SystemHint::SEVL); }
  void WFE() { HINT(SystemHint::WFE); }
  void WFI() { HINT(SystemHint::WFI); }
  void YIELD() { HINT(SystemHint::YIELD); }

  void CLREX();
  void DSB(BarrierType type);
  void DMB(BarrierType type);
  void ISB(BarrierType type);

  // Add/Subtract (Extended/Shifted register)
  void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
  void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ADDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
  void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
  void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SUBS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
  void CMN(ARM64Reg Rn, ARM64Reg Rm);
  void CMN(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);
  void CMP(ARM64Reg Rn, ARM64Reg Rm);
  void CMP(ARM64Reg Rn, ARM64Reg Rm, ArithOption Option);

  // Add/Subtract (with carry)
  void ADC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ADCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SBC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SBCS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Conditional Compare (immediate)
  void CCMN(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);
  void CCMP(ARM64Reg Rn, u32 imm, u32 nzcv, CCFlags cond);

  // Conditional Compare (register)
  void CCMN(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);
  void CCMP(ARM64Reg Rn, ARM64Reg Rm, u32 nzcv, CCFlags cond);

  // Conditional Select
  void CSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
  void CSINC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
  void CSINV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);
  void CSNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);

  // Aliases
  void CSET(ARM64Reg Rd, CCFlags cond)
  {
    ARM64Reg zr = Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR;
    CSINC(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
  }
  void CSETM(ARM64Reg Rd, CCFlags cond)
  {
    ARM64Reg zr = Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR;
    CSINV(Rd, zr, zr, (CCFlags)((u32)cond ^ 1));
  }
  void NEG(ARM64Reg Rd, ARM64Reg Rs) { SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); }
  void NEG(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option)
  {
    SUB(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs, Option);
  }
  void NEGS(ARM64Reg Rd, ARM64Reg Rs) { SUBS(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs); }
  void NEGS(ARM64Reg Rd, ARM64Reg Rs, ArithOption Option)
  {
    SUBS(Rd, Is64Bit(Rd) ? ARM64Reg::ZR : ARM64Reg::WZR, Rs, Option);
  }
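
  // Example (illustrative, assuming the CC_EQ flag from Common/ArmCommon.h): after
  // CMP(W0, W1), CSET(W2, CC_EQ) writes 1 to W2 when W0 == W1 and 0 otherwise, by
  // incrementing WZR under the inverted condition.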

  // Data-Processing 1 source
  void RBIT(ARM64Reg Rd, ARM64Reg Rn);
  void REV16(ARM64Reg Rd, ARM64Reg Rn);
  void REV32(ARM64Reg Rd, ARM64Reg Rn);
  void REV64(ARM64Reg Rd, ARM64Reg Rn);
  void CLZ(ARM64Reg Rd, ARM64Reg Rn);
  void CLS(ARM64Reg Rd, ARM64Reg Rn);

  // Data-Processing 2 source
  void UDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void LSLV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void LSRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ASRV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void RORV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32B(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32H(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32W(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32CB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32CH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32CW(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32X(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void CRC32CX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Data-Processing 3 source
  void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void UMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void UMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void UMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void MUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void MNEG(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Logical (shifted register)
  void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
  void TST(ARM64Reg Rn, ARM64Reg Rm) { ANDS(Is64Bit(Rn) ? ARM64Reg::ZR : ARM64Reg::WZR, Rn, Rm); }
  void TST(ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift)
  {
    ANDS(Is64Bit(Rn) ? ARM64Reg::ZR : ARM64Reg::WZR, Rn, Rm, Shift);
  }

  // Wrap the above for saner syntax
  void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    AND(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    BIC(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    ORR(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    ORN(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    EOR(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    EON(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    ANDS(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }
  void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
  {
    BICS(Rd, Rn, Rm, ArithOption(Rd, ShiftType::LSL, 0));
  }

  // Convenience wrappers around ORR. These match the official convenience syntax.
  void MOV(ARM64Reg Rd, ARM64Reg Rm, ArithOption Shift);
  void MOV(ARM64Reg Rd, ARM64Reg Rm);
  void MVN(ARM64Reg Rd, ARM64Reg Rm);

  // Convenience wrappers around UBFM/EXTR.
  void LSR(ARM64Reg Rd, ARM64Reg Rm, int shift);
  void LSL(ARM64Reg Rd, ARM64Reg Rm, int shift);
  void ASR(ARM64Reg Rd, ARM64Reg Rm, int shift);
  void ROR(ARM64Reg Rd, ARM64Reg Rm, int shift);

  // Logical (immediate)
  void AND(ARM64Reg Rd, ARM64Reg Rn, LogicalImm imm);
  void ANDS(ARM64Reg Rd, ARM64Reg Rn, LogicalImm imm);
  void EOR(ARM64Reg Rd, ARM64Reg Rn, LogicalImm imm);
  void ORR(ARM64Reg Rd, ARM64Reg Rn, LogicalImm imm);
  void TST(ARM64Reg Rn, LogicalImm imm);

  // Add/subtract (immediate)
  void ADD(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
  void ADDS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
  void SUB(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
  void SUBS(ARM64Reg Rd, ARM64Reg Rn, u32 imm, bool shift = false);
  void CMP(ARM64Reg Rn, u32 imm, bool shift = false);
  void CMN(ARM64Reg Rn, u32 imm, bool shift = false);

  // Data Processing (Immediate)
  void MOVZ(ARM64Reg Rd, u32 imm, ShiftAmount pos = ShiftAmount::Shift0);
  void MOVN(ARM64Reg Rd, u32 imm, ShiftAmount pos = ShiftAmount::Shift0);
  void MOVK(ARM64Reg Rd, u32 imm, ShiftAmount pos = ShiftAmount::Shift0);

  // Bitfield move
  void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
  void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
  void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
  void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
  void BFXIL(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
  void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);

  // Extract register (ROR with two inputs, if same then faster on A67)
  void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift);

  // Aliases
  void SXTB(ARM64Reg Rd, ARM64Reg Rn);
  void SXTH(ARM64Reg Rd, ARM64Reg Rn);
  void SXTW(ARM64Reg Rd, ARM64Reg Rn);
  void UXTB(ARM64Reg Rd, ARM64Reg Rn);
  void UXTH(ARM64Reg Rd, ARM64Reg Rn);

  void UBFX(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) { UBFM(Rd, Rn, lsb, lsb + width - 1); }
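
  // Example (illustrative): UBFX(W0, W1, 8, 4) extracts bits [11:8] of W1 into the low
  // bits of W0; per the alias above it is encoded as UBFM(W0, W1, 8, 11).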

  // Load Register (Literal)
  void LDR(ARM64Reg Rt, u32 imm);
  void LDRSW(ARM64Reg Rt, u32 imm);
  void PRFM(ARM64Reg Rt, u32 imm);

  // Load/Store Exclusive
  void STXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void STLXRB(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void LDXRB(ARM64Reg Rt, ARM64Reg Rn);
  void LDAXRB(ARM64Reg Rt, ARM64Reg Rn);
  void STLRB(ARM64Reg Rt, ARM64Reg Rn);
  void LDARB(ARM64Reg Rt, ARM64Reg Rn);
  void STXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void STLXRH(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void LDXRH(ARM64Reg Rt, ARM64Reg Rn);
  void LDAXRH(ARM64Reg Rt, ARM64Reg Rn);
  void STLRH(ARM64Reg Rt, ARM64Reg Rn);
  void LDARH(ARM64Reg Rt, ARM64Reg Rn);
  void STXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void STLXR(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rn);
  void STXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
  void STLXP(ARM64Reg Rs, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
  void LDXR(ARM64Reg Rt, ARM64Reg Rn);
  void LDAXR(ARM64Reg Rt, ARM64Reg Rn);
  void LDXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
  void LDAXP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn);
  void STLR(ARM64Reg Rt, ARM64Reg Rn);
  void LDAR(ARM64Reg Rt, ARM64Reg Rn);

  // Load/Store no-allocate pair (offset)
  void STNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);
  void LDNP(ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, u32 imm);

  // Load/Store register (immediate indexed)
  void STRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDRB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDRSB(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDRH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDRSH(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDR(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDRSW(IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

  // Load/Store register (register offset)
  void STRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDRB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDRSB(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void STRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDRH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDRSH(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void STR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDR(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);

  // Load/Store register (unscaled offset)
  void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm);

  // Load/Store pair
  void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
  void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
  void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);

  // Address of label/page PC-relative
  void ADR(ARM64Reg Rd, s32 imm);
  void ADRP(ARM64Reg Rd, s64 imm);

  // Wrapper around ADR/ADRP/MOVZ/MOVN/MOVK
  void MOVI2R(ARM64Reg Rd, u64 imm);
  bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
  template <typename P>
  void MOVP2R(ARM64Reg Rd, P* ptr)
  {
    ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers");
    MOVI2R(Rd, reinterpret_cast<uintptr_t>(ptr));
  }
  // Given an address, stores the page address into a register and returns the page-relative
  // offset
  template <typename P>
  s32 MOVPage2R(ARM64Reg Rd, P* ptr)
  {
    ASSERT_MSG(DYNA_REC, Is64Bit(Rd), "Can't store pointers in 32-bit registers");
    MOVI2R(Rd, reinterpret_cast<uintptr_t>(ptr) & ~0xFFFULL);
    return static_cast<s32>(reinterpret_cast<uintptr_t>(ptr) & 0xFFFULL);
  }
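
  // Example (illustrative): materializing a far pointer and loading a byte through it,
  //   const s32 page_off = MOVPage2R(ARM64Reg::X0, ptr);  // X0 = page base of ptr
  //   LDRB(IndexType::Unsigned, ARM64Reg::W1, ARM64Reg::X0, page_off);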

  // Wrappers around bitwise operations with an immediate. If you're sure an imm can be encoded
  // without a scratch register, preferably construct a LogicalImm directly instead,
  // since that is constexpr and thus can be done at compile time for constant values.
  void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);
  void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);
  void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch)
  {
    ANDSI2R(Is64Bit(Rn) ? ARM64Reg::ZR : ARM64Reg::WZR, Rn, imm, scratch);
  }
  void ORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);
  void EORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);
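
  // Example (illustrative): ANDI2R(W0, W1, imm, W2) emits a single AND when imm is
  // encodable as a logical immediate; otherwise it materializes imm in the W2 scratch
  // register first and emits a register-register AND.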

  // Wrappers around arithmetic operations with an immediate.
  void ADDI2R_internal(ARM64Reg Rd, ARM64Reg Rn, u64 imm, bool negative, bool flags,
                       ARM64Reg scratch);
  void ADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = ARM64Reg::INVALID_REG);
  void ADDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = ARM64Reg::INVALID_REG);
  void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = ARM64Reg::INVALID_REG);
  void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = ARM64Reg::INVALID_REG);
  void CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch = ARM64Reg::INVALID_REG);

  bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
  bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
  bool TryCMPI2R(ARM64Reg Rn, u64 imm);

  bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
  bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
  bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);

  // ABI related
  static constexpr BitSet32 CALLER_SAVED_GPRS = BitSet32(0x4007FFFF);
  static constexpr BitSet32 CALLER_SAVED_FPRS = BitSet32(0xFFFF00FF);
  void ABI_PushRegisters(BitSet32 registers);
  void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));

  // Plain function call
  void QuickCallFunction(ARM64Reg scratchreg, const void* func);
  template <typename T>
  void QuickCallFunction(ARM64Reg scratchreg, T func)
  {
    QuickCallFunction(scratchreg, (const void*)func);
  }

  template <typename FuncRet, typename... FuncArgs, typename... Args>
  void ABI_CallFunction(FuncRet (*func)(FuncArgs...), Args... args)
  {
    static_assert(sizeof...(FuncArgs) == sizeof...(Args), "Wrong number of arguments");
    static_assert(sizeof...(FuncArgs) <= 8, "Passing arguments on the stack is not supported");
    if constexpr (!std::is_void_v<FuncRet>)
      static_assert(sizeof(FuncRet) <= 16, "Large return types are not supported");

    std::array<u8, 32> source_gpr_uses{};

    auto check_argument = [&](auto& arg) {
      using Arg = std::decay_t<decltype(arg)>;
      if constexpr (std::is_same_v<Arg, ARM64Reg>)
      {
        ASSERT(IsGPR(arg));
        source_gpr_uses[DecodeReg(arg)]++;
      }
      else
      {
        // To be more correct, we should be checking FuncArgs here rather than Args, but that's a
        // lot more effort to implement. Let's just do these best-effort checks for now.
        static_assert(!std::is_floating_point_v<Arg>,
                      "Floating-point arguments are not supported");
        static_assert(sizeof(Arg) <= 8, "Arguments bigger than a register are not supported");
      }
    };
    (check_argument(args), ...);

    {
      Common::SmallVector<RegisterMove, 8> pending_moves;

      size_t i = 0;
      auto handle_register_argument = [&](auto& arg) {
        using Arg = std::decay_t<decltype(arg)>;
        if constexpr (std::is_same_v<Arg, ARM64Reg>)
        {
          const ARM64Reg dst_reg =
              (Is64Bit(arg) ? EncodeRegTo64 : EncodeRegTo32)(static_cast<ARM64Reg>(i));
          if (dst_reg == arg)
          {
            // The value is already in the right register.
            source_gpr_uses[DecodeReg(arg)]--;
          }
          else if (source_gpr_uses[i] == 0)
          {
            // The destination register isn't used as the source of another move.
            // We can go ahead and do the move right away.
            MOV(dst_reg, arg);
            source_gpr_uses[DecodeReg(arg)]--;
          }
          else
          {
            // The destination register is used as the source of a move we haven't gotten to yet.
            // Let's record that we need to deal with this move later.
            pending_moves.emplace_back(dst_reg, arg);
          }
        }
        ++i;
      };
      (handle_register_argument(args), ...);

      if (!pending_moves.empty())
      {
        ParallelMoves(pending_moves.data(), pending_moves.data() + pending_moves.size(),
                      &source_gpr_uses);
      }
    }

    {
      size_t i = 0;
      auto handle_immediate_argument = [&](auto& arg) {
        using Arg = std::decay_t<decltype(arg)>;
        if constexpr (!std::is_same_v<Arg, ARM64Reg>)
        {
          const ARM64Reg dst_reg =
              (sizeof(arg) == 8 ? EncodeRegTo64 : EncodeRegTo32)(static_cast<ARM64Reg>(i));
          if constexpr (std::is_pointer_v<Arg>)
            MOVP2R(dst_reg, arg);
          else
            MOVI2R(dst_reg, arg);
        }
        ++i;
      };
      (handle_immediate_argument(args), ...);
    }

    QuickCallFunction(ARM64Reg::X8, func);
  }

  // Utility to generate a call to a std::function object.
  //
  // Unfortunately, calling operator() directly is undefined behavior in C++
  // (this method might be a thunk in the case of multi-inheritance) so we
  // have to go through a trampoline function.
  template <typename T, typename... Args>
  static T CallLambdaTrampoline(const std::function<T(Args...)>* f, Args... args)
  {
    return (*f)(args...);
  }

  template <typename T, typename... Args>
  void ABI_CallLambdaFunction(const std::function<T(Args...)>* f, Args... args)
  {
    auto trampoline = &ARM64XEmitter::CallLambdaTrampoline<T, Args...>;
    ABI_CallFunction(trampoline, f, args...);
  }
};
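
// Example (illustrative): calling a C function with mixed register and immediate
// arguments. Given a hypothetical callee `void SomeHelper(u64 a, u32 b);`,
//   emitter.ABI_CallFunction(&SomeHelper, ARM64Reg::X19, 42);
// moves X19 into X0, materializes 42 in W1, and branches via the X8 scratch register.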

class ARM64FloatEmitter
{
public:
  ARM64FloatEmitter(ARM64XEmitter* emit) : m_emit(emit) {}

  void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

  // Loadstore unscaled
  void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

  // Loadstore single structure
  void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
  void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
  void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
  void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
  void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
  void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
  void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
  void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);

  // Loadstore multiple structure
  void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
  void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn,
           ARM64Reg Rm = ARM64Reg::SP);
  void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
  void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn,
           ARM64Reg Rm = ARM64Reg::SP);

  // Loadstore paired
  void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
  void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);

  // Loadstore register offset
  void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);

  // Scalar - 1 Source
  void FABS(ARM64Reg Rd, ARM64Reg Rn);
  void FNEG(ARM64Reg Rd, ARM64Reg Rn);
  void FSQRT(ARM64Reg Rd, ARM64Reg Rn);
  void FRINTI(ARM64Reg Rd, ARM64Reg Rn);
  void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false);  // Also generalized move between GPR/FP
  void FRECPE(ARM64Reg Rd, ARM64Reg Rn);
  void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn);

  // Scalar - pairwise
  void FADDP(ARM64Reg Rd, ARM64Reg Rn);
  void FMAXP(ARM64Reg Rd, ARM64Reg Rn);
  void FMINP(ARM64Reg Rd, ARM64Reg Rn);
  void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn);
  void FMINNMP(ARM64Reg Rd, ARM64Reg Rn);

  // Scalar - 2 Source
  void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FDIV(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMAX(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMIN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMAXNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMINNM(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FNMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Scalar - 3 Source. Note - the accumulator is last on ARM!
  void FMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void FMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void FNMADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
  void FNMSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra);
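
  // Example (illustrative): FMADD(S0, S1, S2, S3) computes S0 = S3 + S1 * S2 -- the
  // accumulator Ra comes last in the operand list.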

  // Scalar floating point immediate
  void FMOV(ARM64Reg Rd, uint8_t imm8);

  // Vector
  void ADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void BIF(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
  void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void FCVTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FDIV(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FNEG(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FRECPE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FRSQRTE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void NOT(ARM64Reg Rd, ARM64Reg Rn);
  void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); }
  void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void REV64(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
  void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
  void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
  void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);

  // Move
  void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn);
  void INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 index2);
  void UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
  void SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);

  // One source
  void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);

  // Scalar convert float to int, in a lot of variants.
  // Note that the scalar version of this operation has two encodings, one that goes to an integer
  // register and one that outputs to a scalar fp register.
  void FCVTS(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);
  void FCVTU(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round);

  // Scalar convert int to float. No rounding mode specifier necessary.
  void SCVTF(ARM64Reg Rd, ARM64Reg Rn);
  void UCVTF(ARM64Reg Rd, ARM64Reg Rn);

  // Scalar fixed point to float. scale is the number of fractional bits.
  void SCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);
  void UCVTF(ARM64Reg Rd, ARM64Reg Rn, int scale);

  // Float comparison
  void FCMP(ARM64Reg Rn, ARM64Reg Rm);
  void FCMP(ARM64Reg Rn);
  void FCMPE(ARM64Reg Rn, ARM64Reg Rm);
  void FCMPE(ARM64Reg Rn);
  void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FCMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FCMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
  void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Conditional select
  void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);

  // Permute
  void UZP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void TRN1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ZIP1(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);

  // Extract
  void EXT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 index);

  // Scalar shift by immediate
  void SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void URSHR(ARM64Reg Rd, ARM64Reg Rn, u32 shift);

  // Vector shift by immediate
  void SHL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void URSHR(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
  void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
  void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
  void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
  void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);

  // vector x indexed element
  void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
  void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);

  // Modified Immediate
  void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0);
  void ORR(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
  void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);

  void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = ARM64Reg::INVALID_REG,
              bool negate = false);
  void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = ARM64Reg::INVALID_REG);
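
  // Example (illustrative): MOVI2F(S0, 1.0f) can be emitted as a single FMOV because
  // 1.0f fits the 8-bit floating-point immediate encoding; values that don't fit fall
  // back to materializing the bits through the scratch register.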

  // ABI related
  void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = ARM64Reg::INVALID_REG);
  void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = ARM64Reg::INVALID_REG);

private:
  ARM64XEmitter* m_emit;
  inline void Write32(u32 value) { m_emit->Write32(value); }

  // Emitting functions
  void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
                         ARM64Reg Rm);
  void EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
  void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
                                    ARM64Reg Rn);
  void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt,
                                    ARM64Reg Rn, ARM64Reg Rm);
  void Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitConversion2(bool sf, bool S, bool direction, u32 type, u32 rmode, u32 opcode, int scale,
                       ARM64Reg Rd, ARM64Reg Rn);
  void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm);
  void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EmitExtract(u32 imm4, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
  void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8);
  void EmitShiftImm(bool Q, bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitScalarShiftImm(bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn);
  void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn,
                                          ARM64Reg Rm);
  void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
  void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn,
                          ARM64Reg Rm);
  void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
  void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign);
  void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra,
                         int opcode);
  void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2,
                           ARM64Reg Rn, s32 imm);
  void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
  void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);

  void ORR_BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift, u8 op);

  void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
  void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
  void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
  void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
  void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
};
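
// Example (illustrative): the float emitter wraps an existing ARM64XEmitter and shares
// its code buffer,
//   ARM64FloatEmitter fp(&emitter);
//   fp.FADD(ARM64Reg::S0, ARM64Reg::S1, ARM64Reg::S2);  // S0 = S1 + S2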

class ARM64CodeBlock : public Common::CodeBlock<ARM64XEmitter>
{
private:
  void PoisonMemory() override
  {
    // If our memory isn't a multiple of u32 then this won't write the last remaining bytes with
    // anything. Less than optimal, but there would be nothing we could do but throw a runtime
    // warning anyway.
    // AArch64: 0xD4200000 = BRK 0
    constexpr u32 brk_0 = 0xD4200000;

    for (size_t i = 0; i < region_size; i += sizeof(u32))
    {
      std::memcpy(region + i, &brk_0, sizeof(u32));
    }
  }
};
}  // namespace Arm64Gen