diff --git a/Source/Core/DolphinLib.props b/Source/Core/DolphinLib.props
index a7354ee00b..7bb0144137 100644
--- a/Source/Core/DolphinLib.props
+++ b/Source/Core/DolphinLib.props
@@ -634,6 +634,8 @@
+
+
@@ -1238,6 +1240,7 @@
+
diff --git a/Source/Core/VideoCommon/CMakeLists.txt b/Source/Core/VideoCommon/CMakeLists.txt
index a7a52c5e40..4e8bff36fc 100644
--- a/Source/Core/VideoCommon/CMakeLists.txt
+++ b/Source/Core/VideoCommon/CMakeLists.txt
@@ -23,6 +23,9 @@ add_library(videocommon
ConstantManager.h
CPMemory.cpp
CPMemory.h
+ CPUCull.cpp
+ CPUCull.h
+ CPUCullImpl.h
DriverDetails.cpp
DriverDetails.h
Fifo.cpp
diff --git a/Source/Core/VideoCommon/CPUCull.cpp b/Source/Core/VideoCommon/CPUCull.cpp
new file mode 100644
index 0000000000..bb68376b09
--- /dev/null
+++ b/Source/Core/VideoCommon/CPUCull.cpp
@@ -0,0 +1,160 @@
+// Copyright 2022 Dolphin Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "VideoCommon/CPUCull.h"
+
+#include "Common/Assert.h"
+#include "Common/CPUDetect.h"
+#include "Common/MathUtil.h"
+#include "Common/MemoryUtil.h"
+#include "Core/System.h"
+
+#include "VideoCommon/CPMemory.h"
+#include "VideoCommon/VertexManagerBase.h"
+#include "VideoCommon/VertexShaderManager.h"
+#include "VideoCommon/VideoConfig.h"
+#include "VideoCommon/XFMemory.h"
+
+#if defined(_M_X86) || defined(_M_X86_64)
+#define USE_SSE
+#elif defined(_M_ARM_64)
+#define USE_NEON
+#else
+#define NO_SIMD
+#endif
+
+#if defined(USE_SSE)
+#include
+#elif defined(USE_NEON)
+#include
+#endif
+
+#include "VideoCommon/CPUCullImpl.h"
+#ifdef USE_SSE
+#define USE_SSE3
+#include "VideoCommon/CPUCullImpl.h"
+#define USE_SSE41
+#include "VideoCommon/CPUCullImpl.h"
+#define USE_AVX
+#include "VideoCommon/CPUCullImpl.h"
+#define USE_FMA
+#include "VideoCommon/CPUCullImpl.h"
+#endif
+
+#if defined(USE_SSE)
+#if defined(__AVX__) && defined(__FMA__)
+static constexpr int MIN_SSE = 51;
+#elif defined(__AVX__)
+static constexpr int MIN_SSE = 50;
+#elif defined(__SSE4_1__)
+static constexpr int MIN_SSE = 41;
+#elif defined(__SSE3__)
+static constexpr int MIN_SSE = 30;
+#else
+static constexpr int MIN_SSE = 0;
+#endif
+#endif
+
+template
+static CPUCull::TransformFunction GetTransformFunction()
+{
+#if defined(USE_SSE)
+ if (MIN_SSE >= 51 || (cpu_info.bAVX && cpu_info.bFMA))
+ return CPUCull_FMA::TransformVertices;
+ else if (MIN_SSE >= 50 || cpu_info.bAVX)
+ return CPUCull_AVX::TransformVertices;
+ else if (PositionHas3Elems && PerVertexPosMtx && (MIN_SSE >= 41 || cpu_info.bSSE4_1))
+ return CPUCull_SSE41::TransformVertices;
+ else if (PositionHas3Elems && (MIN_SSE >= 30 || cpu_info.bSSE3))
+ return CPUCull_SSE3::TransformVertices;
+ else
+ return CPUCull_SSE::TransformVertices;
+#elif defined(USE_NEON)
+ return CPUCull_NEON::TransformVertices;
+#else
+ return CPUCull_Scalar::TransformVertices;
+#endif
+}
+
+template
+static CPUCull::CullFunction GetCullFunction0()
+{
+#if defined(USE_SSE)
+ // Note: AVX version only actually AVX on compilers that support __attribute__((target))
+ // Sorry, MSVC + Sandy Bridge. (Ivy+ and AMD see very little benefit thanks to mov elimination)
+ if (MIN_SSE >= 50 || cpu_info.bAVX)
+ return CPUCull_AVX::AreAllVerticesCulled;
+ else if (MIN_SSE >= 30 || cpu_info.bSSE3)
+ return CPUCull_SSE3::AreAllVerticesCulled;
+ else
+ return CPUCull_SSE::AreAllVerticesCulled;
+#elif defined(USE_NEON)
+ return CPUCull_NEON::AreAllVerticesCulled;
+#else
+ return CPUCull_Scalar::AreAllVerticesCulled;
+#endif
+}
+
+template
+static Common::EnumMap GetCullFunction1()
+{
+ return {
+ GetCullFunction0(),
+ GetCullFunction0(),
+ GetCullFunction0(),
+ GetCullFunction0(),
+ };
+}
+
+CPUCull::~CPUCull() = default;
+
+void CPUCull::Init()
+{
+ m_transform_table[false][false] = GetTransformFunction();
+ m_transform_table[false][true] = GetTransformFunction();
+ m_transform_table[true][false] = GetTransformFunction();
+ m_transform_table[true][true] = GetTransformFunction();
+ using Prim = OpcodeDecoder::Primitive;
+ m_cull_table[Prim::GX_DRAW_QUADS] = GetCullFunction1();
+ m_cull_table[Prim::GX_DRAW_QUADS_2] = GetCullFunction1();
+ m_cull_table[Prim::GX_DRAW_TRIANGLES] = GetCullFunction1();
+ m_cull_table[Prim::GX_DRAW_TRIANGLE_STRIP] = GetCullFunction1();
+ m_cull_table[Prim::GX_DRAW_TRIANGLE_FAN] = GetCullFunction1();
+}
+
+bool CPUCull::AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive,
+ const u8* src, u32 count)
+{
+ ASSERT_MSG(VIDEO, primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES,
+ "CPUCull should not be called on lines or points");
+ const u32 stride = loader->m_native_vtx_decl.stride;
+ const bool posHas3Elems = loader->m_native_vtx_decl.position.components >= 3;
+ const bool perVertexPosMtx = loader->m_native_vtx_decl.posmtx.enable;
+ if (m_transform_buffer_size < count) [[unlikely]]
+ {
+ u32 new_size = MathUtil::NextPowerOf2(count);
+ m_transform_buffer_size = new_size;
+ m_transform_buffer.reset(static_cast(
+ Common::AllocateAlignedMemory(new_size * sizeof(TransformedVertex), 32)));
+ }
+
+ // transform functions need the projection matrix to tranform to clip space
+ Core::System::GetInstance().GetVertexShaderManager().SetProjectionMatrix();
+
+ static constexpr Common::EnumMap cullmode_invert = {
+ CullMode::None, CullMode::Front, CullMode::Back, CullMode::All};
+
+ CullMode cullmode = bpmem.genMode.cullmode;
+ if (xfmem.viewport.ht > 0) // See videosoftware Clipper.cpp:IsBackface
+ cullmode = cullmode_invert[cullmode];
+ const TransformFunction transform = m_transform_table[posHas3Elems][perVertexPosMtx];
+ transform(m_transform_buffer.get(), src, stride, count);
+ const CullFunction cull = m_cull_table[primitive][cullmode];
+ return cull(m_transform_buffer.get(), count);
+}
+
+template
+void CPUCull::BufferDeleter::operator()(T* ptr)
+{
+ Common::FreeAlignedMemory(ptr);
+}
diff --git a/Source/Core/VideoCommon/CPUCull.h b/Source/Core/VideoCommon/CPUCull.h
new file mode 100644
index 0000000000..40248035e8
--- /dev/null
+++ b/Source/Core/VideoCommon/CPUCull.h
@@ -0,0 +1,38 @@
+// Copyright 2022 Dolphin Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "VideoCommon/BPMemory.h"
+#include "VideoCommon/DataReader.h"
+#include "VideoCommon/OpcodeDecoding.h"
+
+class CPUCull
+{
+public:
+ ~CPUCull();
+ void Init();
+ bool AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive,
+ const u8* src, u32 count);
+
+ struct alignas(16) TransformedVertex
+ {
+ float x, y, z, w;
+ };
+
+ using TransformFunction = void (*)(void*, const void*, u32, int);
+ using CullFunction = bool (*)(const CPUCull::TransformedVertex*, int);
+
+private:
+ template
+ struct BufferDeleter
+ {
+ void operator()(T* ptr);
+ };
+ std::unique_ptr> m_transform_buffer;
+ u32 m_transform_buffer_size = 0;
+ std::array, 2> m_transform_table;
+ Common::EnumMap,
+ OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN>
+ m_cull_table;
+};
diff --git a/Source/Core/VideoCommon/CPUCullImpl.h b/Source/Core/VideoCommon/CPUCullImpl.h
new file mode 100644
index 0000000000..e9a1d545c2
--- /dev/null
+++ b/Source/Core/VideoCommon/CPUCullImpl.h
@@ -0,0 +1,714 @@
+// Copyright 2022 Dolphin Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#if defined(USE_FMA)
+#define VECTOR_NAMESPACE CPUCull_FMA
+#elif defined(USE_AVX)
+#define VECTOR_NAMESPACE CPUCull_AVX
+#elif defined(USE_SSE41)
+#define VECTOR_NAMESPACE CPUCull_SSE41
+#elif defined(USE_SSE3)
+#define VECTOR_NAMESPACE CPUCull_SSE3
+#elif defined(USE_SSE)
+#define VECTOR_NAMESPACE CPUCull_SSE
+#elif defined(USE_NEON)
+#define VECTOR_NAMESPACE CPUCull_NEON
+#elif defined(NO_SIMD)
+#define VECTOR_NAMESPACE CPUCull_Scalar
+#else
+#error This file is meant to be used by CPUCull.cpp only!
+#endif
+
+#if defined(__GNUC__) && defined(USE_FMA) && !(defined(__AVX__) && defined(__FMA__))
+#define ATTR_TARGET __attribute__((target("avx,fma")))
+#elif defined(__GNUC__) && defined(USE_AVX) && !defined(__AVX__)
+#define ATTR_TARGET __attribute__((target("avx")))
+#elif defined(__GNUC__) && defined(USE_SSE41) && !defined(__SSE4_1__)
+#define ATTR_TARGET __attribute__((target("sse4.1")))
+#elif defined(__GNUC__) && defined(USE_SSE3) && !defined(__SSE3__)
+#define ATTR_TARGET __attribute__((target("sse3")))
+#else
+#define ATTR_TARGET
+#endif
+
+namespace VECTOR_NAMESPACE
+{
+#if defined(USE_SSE)
+typedef __m128 Vector;
+#elif defined(USE_NEON)
+typedef float32x4_t Vector;
+#else
+struct alignas(16) Vector
+{
+ float x, y, z, w;
+};
+#endif
+static_assert(sizeof(Vector) == 16);
+
+#ifdef USE_NEON
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector vsetr_f32(float x, float y, float z, float w)
+{
+ float tmp[4] = {x, y, z, w};
+ return vld1q_f32(tmp);
+}
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void vuzp12q_f32(Vector& a, Vector& b)
+{
+ Vector tmp = vuzp2q_f32(a, b);
+ a = vuzp1q_f32(a, b);
+ b = tmp;
+}
+#endif
+#ifdef USE_SSE
+template
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector vector_broadcast(Vector v)
+{
+ return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
+}
+#endif
+#ifdef USE_AVX
+template
+ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 vector_broadcast(__m256 v)
+{
+ return _mm256_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
+}
+#endif
+
+#ifdef USE_AVX
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void TransposeYMM(__m256& o0, __m256& o1, //
+ __m256& o2, __m256& o3)
+{
+ __m256d tmp0 = _mm256_castps_pd(_mm256_unpacklo_ps(o0, o1));
+ __m256d tmp1 = _mm256_castps_pd(_mm256_unpacklo_ps(o2, o3));
+ __m256d tmp2 = _mm256_castps_pd(_mm256_unpackhi_ps(o0, o1));
+ __m256d tmp3 = _mm256_castps_pd(_mm256_unpackhi_ps(o2, o3));
+ o0 = _mm256_castpd_ps(_mm256_unpacklo_pd(tmp0, tmp1));
+ o1 = _mm256_castpd_ps(_mm256_unpackhi_pd(tmp0, tmp1));
+ o2 = _mm256_castpd_ps(_mm256_unpacklo_pd(tmp2, tmp3));
+ o3 = _mm256_castpd_ps(_mm256_unpackhi_pd(tmp2, tmp3));
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposedYMM(const void* source, __m256& o0,
+ __m256& o1, __m256& o2, __m256& o3)
+{
+ const Vector* vsource = static_cast(source);
+ o0 = _mm256_broadcast_ps(&vsource[0]);
+ o1 = _mm256_broadcast_ps(&vsource[1]);
+ o2 = _mm256_broadcast_ps(&vsource[2]);
+ o3 = _mm256_broadcast_ps(&vsource[3]);
+ TransposeYMM(o0, o1, o2, o3);
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadPosYMM(const void* sourcel, const void* sourceh,
+ __m256& o0, __m256& o1, __m256& o2)
+{
+ const Vector* vsourcel = static_cast(sourcel);
+ const Vector* vsourceh = static_cast(sourceh);
+ o0 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[0]), vsourceh[0], 1);
+ o1 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[1]), vsourceh[1], 1);
+ o2 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[2]), vsourceh[2], 1);
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void
+LoadTransposedPosYMM(const void* source, __m256& o0, __m256& o1, __m256& o2, __m256& o3)
+{
+ const Vector* vsource = static_cast(source);
+ o0 = _mm256_broadcast_ps(&vsource[0]);
+ o1 = _mm256_broadcast_ps(&vsource[1]);
+ o2 = _mm256_broadcast_ps(&vsource[2]);
+ o3 = _mm256_setr_ps(0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f);
+ TransposeYMM(o0, o1, o2, o3);
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 ApplyMatrixYMM(__m256 v, __m256 m0, __m256 m1,
+ __m256 m2, __m256 m3)
+{
+ __m256 output = _mm256_mul_ps(vector_broadcast<0>(v), m0);
+#ifdef USE_FMA
+ output = _mm256_fmadd_ps(vector_broadcast<1>(v), m1, output);
+ output = _mm256_fmadd_ps(vector_broadcast<2>(v), m2, output);
+ output = _mm256_fmadd_ps(vector_broadcast<3>(v), m3, output);
+#else
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<1>(v), m1));
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<2>(v), m2));
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<3>(v), m3));
+#endif
+ return output;
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
+TransformVertexNoTransposeYMM(__m256 vertex, __m256 pos0, __m256 pos1, __m256 pos2, //
+ __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
+{
+ __m256 mul0 = _mm256_mul_ps(vertex, pos0);
+ __m256 mul1 = _mm256_mul_ps(vertex, pos1);
+ __m256 mul2 = _mm256_mul_ps(vertex, pos2);
+ __m256 mul3 = _mm256_setr_ps(0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f);
+ __m256 output = _mm256_hadd_ps(_mm256_hadd_ps(mul0, mul1), _mm256_hadd_ps(mul2, mul3));
+ return ApplyMatrixYMM(output, proj0, proj1, proj2, proj3);
+}
+
+template
+ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
+TransformVertexYMM(__m256 vertex, __m256 pos0, __m256 pos1, __m256 pos2, __m256 pos3, //
+ __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
+{
+ __m256 output = pos3; // vertex.w is always 1.0
+#ifdef USE_FMA
+ output = _mm256_fmadd_ps(vector_broadcast<0>(vertex), pos0, output);
+ output = _mm256_fmadd_ps(vector_broadcast<1>(vertex), pos1, output);
+ if constexpr (PositionHas3Elems)
+ output = _mm256_fmadd_ps(vector_broadcast<2>(vertex), pos2, output);
+#else
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<0>(vertex), pos0));
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<1>(vertex), pos1));
+ if constexpr (PositionHas3Elems)
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<2>(vertex), pos2));
+#endif
+ return ApplyMatrixYMM(output, proj0, proj1, proj2, proj3);
+}
+
+template
+ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
+LoadTransform2Vertices(const u8* v0data, const u8* v1data, //
+ __m256 pos0, __m256 pos1, __m256 pos2, __m256 pos3, //
+ __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
+{
+ __m256 v01;
+ if constexpr (PerVertexPosMtx)
+ {
+ // Vertex data layout always starts with posmtx data if available, then position data
+ // Convenient for us, that means offsets are always fixed
+ u32 v0idx = v0data[0] & 0x3f;
+ u32 v1idx = v1data[0] & 0x3f;
+ v0data += sizeof(u32);
+ v1data += sizeof(u32);
+
+ const float* v0fdata = reinterpret_cast(v0data);
+ const float* v1fdata = reinterpret_cast(v1data);
+
+ LoadPosYMM(&xfmem.posMatrices[v0idx * 4], &xfmem.posMatrices[v1idx * 4], pos0, pos1, pos2);
+
+ if constexpr (PositionHas3Elems)
+ {
+ __m256 base = _mm256_set1_ps(1.0f);
+ v01 = _mm256_blend_ps(_mm256_loadu2_m128(v1fdata, v0fdata), base, 0x88);
+ }
+ else
+ {
+ __m256 base = _mm256_unpacklo_ps(_mm256_setzero_ps(), _mm256_set1_ps(1.0f));
+ __m256 v1 = _mm256_castpd_ps(_mm256_broadcast_sd(reinterpret_cast(v1data)));
+ __m128 v0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast(v0data));
+ v01 = _mm256_blend_ps(_mm256_castps128_ps256(v0), v1, 0x30);
+ v01 = _mm256_blend_ps(v01, base, 0xcc);
+ }
+
+ v01 = TransformVertexNoTransposeYMM(v01, pos0, pos1, pos2, proj0, proj1, proj2, proj3);
+ }
+ else
+ {
+ const float* v0fdata = reinterpret_cast(v0data);
+ const float* v1fdata = reinterpret_cast(v1data);
+ if constexpr (PositionHas3Elems)
+ {
+ v01 = _mm256_loadu2_m128(v1fdata, v0fdata);
+ }
+ else
+ {
+ __m256 v1 = _mm256_castpd_ps(_mm256_broadcast_sd(reinterpret_cast(v1data)));
+ __m128 v0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast(v0data));
+ v01 = _mm256_blend_ps(_mm256_castps128_ps256(v0), v1, 0x30);
+ }
+
+#ifdef __clang__
+ // Clang's optimizer is dumb, yay
+ // It sees TransformVertexYMM doing broadcasts and is like
+ // "let's broadcast *before* we combine v0 and v1! Then we can use vbroadcastss!"
+ // Prevent it from "optimizing" here
+ asm("" : "+x"(v01)::);
+#endif
+
+ v01 = TransformVertexYMM(v01, pos0, pos1, pos2, pos3, //
+ proj0, proj1, proj2, proj3);
+ }
+
+ return v01;
+}
+
+#endif
+
+#ifndef USE_AVX
+// Note: Assumes 16-byte aligned source
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposed(const void* source, Vector& o0,
+ Vector& o1, Vector& o2, Vector& o3)
+{
+#if defined(USE_SSE)
+ const Vector* vsource = static_cast(source);
+ o0 = vsource[0];
+ o1 = vsource[1];
+ o2 = vsource[2];
+ o3 = vsource[3];
+ _MM_TRANSPOSE4_PS(o0, o1, o2, o3);
+#elif defined(USE_NEON)
+ float32x4x4_t ld = vld4q_f32(static_cast(source));
+ o0 = ld.val[0];
+ o1 = ld.val[1];
+ o2 = ld.val[2];
+ o3 = ld.val[3];
+#else
+ const Vector* vsource = static_cast(source);
+ // clang-format off
+ o0.x = vsource[0].x; o0.y = vsource[1].x; o0.z = vsource[2].x; o0.w = vsource[3].x;
+ o1.x = vsource[0].y; o1.y = vsource[1].y; o1.z = vsource[2].y; o1.w = vsource[3].y;
+ o2.x = vsource[0].z; o2.y = vsource[1].z; o2.z = vsource[2].z; o2.w = vsource[3].z;
+ o3.x = vsource[0].w; o3.y = vsource[1].w; o3.z = vsource[2].w; o3.w = vsource[3].w;
+ // clang-format on
+#endif
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposedPos(const void* source, Vector& o0,
+ Vector& o1, Vector& o2, Vector& o3)
+{
+ const Vector* vsource = static_cast(source);
+#if defined(USE_SSE)
+ o0 = vsource[0];
+ o1 = vsource[1];
+ o2 = vsource[2];
+ o3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
+ _MM_TRANSPOSE4_PS(o0, o1, o2, o3);
+#elif defined(USE_NEON)
+ float32x4x2_t ld01 = vld2q_f32(static_cast(source));
+ o0 = ld01.val[0];
+ o1 = ld01.val[1];
+ o2 = vsource[2];
+ o3 = vsetr_f32(0.0f, 0.0f, 0.0f, 1.0f);
+ vuzp12q_f32(o2, o3);
+ vuzp12q_f32(o0, o2);
+ vuzp12q_f32(o1, o3);
+#else
+ // clang-format off
+ o0.x = vsource[0].x; o0.y = vsource[1].x; o0.z = vsource[2].x; o0.w = 0.0f;
+ o1.x = vsource[0].y; o1.y = vsource[1].y; o1.z = vsource[2].y; o1.w = 0.0f;
+ o2.x = vsource[0].z; o2.y = vsource[1].z; o2.z = vsource[2].z; o2.w = 0.0f;
+ o3.x = vsource[0].w; o3.y = vsource[1].w; o3.z = vsource[2].w; o3.w = 1.0f;
+ // clang-format on
+#endif
+}
+#endif
+
+#ifndef USE_NEON
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadPos(const void* source, //
+ Vector& o0, Vector& o1, Vector& o2)
+{
+ const Vector* vsource = static_cast(source);
+ o0 = vsource[0];
+ o1 = vsource[1];
+ o2 = vsource[2];
+}
+#endif
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector ApplyMatrix(Vector v, Vector m0, Vector m1,
+ Vector m2, Vector m3)
+{
+#if defined(USE_SSE)
+ Vector output = _mm_mul_ps(vector_broadcast<0>(v), m0);
+#ifdef USE_FMA
+ output = _mm_fmadd_ps(vector_broadcast<1>(v), m1, output);
+ output = _mm_fmadd_ps(vector_broadcast<2>(v), m2, output);
+ output = _mm_fmadd_ps(vector_broadcast<3>(v), m3, output);
+#else
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<1>(v), m1));
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<2>(v), m2));
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<3>(v), m3));
+#endif
+ return output;
+#elif defined(USE_NEON)
+ Vector output = vmulq_laneq_f32(m0, v, 0);
+ output = vfmaq_laneq_f32(output, m1, v, 1);
+ output = vfmaq_laneq_f32(output, m2, v, 2);
+ output = vfmaq_laneq_f32(output, m3, v, 3);
+ return output;
+#else
+ Vector output;
+ output.x = v.x * m0.x + v.y * m1.x + v.z * m2.x + v.w * m3.x;
+ output.y = v.x * m0.y + v.y * m1.y + v.z * m2.y + v.w * m3.y;
+ output.z = v.x * m0.z + v.y * m1.z + v.z * m2.z + v.w * m3.z;
+ output.w = v.x * m0.w + v.y * m1.w + v.z * m2.w + v.w * m3.w;
+ return output;
+#endif
+}
+
+#ifndef USE_NEON
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
+TransformVertexNoTranspose(Vector vertex, Vector pos0, Vector pos1, Vector pos2, //
+ Vector proj0, Vector proj1, Vector proj2, Vector proj3)
+{
+#ifdef USE_SSE
+ Vector mul0 = _mm_mul_ps(vertex, pos0);
+ Vector mul1 = _mm_mul_ps(vertex, pos1);
+ Vector mul2 = _mm_mul_ps(vertex, pos2);
+ Vector mul3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
+#ifdef USE_SSE3
+ Vector output = _mm_hadd_ps(_mm_hadd_ps(mul0, mul1), _mm_hadd_ps(mul2, mul3));
+#else
+ Vector t0 = _mm_add_ps(_mm_unpacklo_ps(mul0, mul2), _mm_unpackhi_ps(mul0, mul2));
+ Vector t1 = _mm_add_ps(_mm_unpacklo_ps(mul1, mul3), _mm_unpackhi_ps(mul1, mul3));
+ Vector output = _mm_add_ps(_mm_unpacklo_ps(t0, t1), _mm_unpackhi_ps(t0, t1));
+#endif
+#else
+ Vector output;
+ output.x = vertex.x * pos0.x + vertex.y * pos0.y + vertex.z * pos0.z + vertex.w * pos0.w;
+ output.y = vertex.x * pos1.x + vertex.y * pos1.y + vertex.z * pos1.z + vertex.w * pos1.w;
+ output.z = vertex.x * pos2.x + vertex.y * pos2.y + vertex.z * pos2.z + vertex.w * pos2.w;
+ output.w = 1.0f;
+#endif
+ output = ApplyMatrix(output, proj0, proj1, proj2, proj3);
+ return output;
+}
+#endif
+
+template
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
+TransformVertex(Vector vertex, Vector pos0, Vector pos1, Vector pos2, Vector pos3, //
+ Vector proj0, Vector proj1, Vector proj2, Vector proj3)
+{
+ Vector output = pos3; // vertex.w is always 1.0
+#if defined(USE_FMA)
+ output = _mm_fmadd_ps(vector_broadcast<0>(vertex), pos0, output);
+ output = _mm_fmadd_ps(vector_broadcast<1>(vertex), pos1, output);
+ if constexpr (PositionHas3Elems)
+ output = _mm_fmadd_ps(vector_broadcast<2>(vertex), pos2, output);
+#elif defined(USE_SSE)
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<0>(vertex), pos0));
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<1>(vertex), pos1));
+ if constexpr (PositionHas3Elems)
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<2>(vertex), pos2));
+#elif defined(USE_NEON)
+ output = vfmaq_laneq_f32(output, pos0, vertex, 0);
+ output = vfmaq_laneq_f32(output, pos1, vertex, 1);
+ if constexpr (PositionHas3Elems)
+ output = vfmaq_laneq_f32(output, pos2, vertex, 2);
+#else
+ output.x += vertex.x * pos0.x + vertex.y * pos1.x;
+ output.y += vertex.x * pos0.y + vertex.y * pos1.y;
+ output.z += vertex.x * pos0.z + vertex.y * pos1.z;
+ if constexpr (PositionHas3Elems)
+ {
+ output.x += vertex.z * pos2.x;
+ output.y += vertex.z * pos2.y;
+ output.z += vertex.z * pos2.z;
+ }
+#endif
+ output = ApplyMatrix(output, proj0, proj1, proj2, proj3);
+ return output;
+}
+
+template
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
+LoadTransformVertex(const u8* data, Vector pos0, Vector pos1, Vector pos2, Vector pos3,
+ Vector proj0, Vector proj1, Vector proj2, Vector proj3)
+{
+ Vector vertex;
+ if constexpr (PerVertexPosMtx)
+ {
+ // Vertex data layout always starts with posmtx data if available, then position data
+ // Convenient for us, that means offsets are always fixed
+ u32 idx = data[0] & 0x3f;
+ data += sizeof(u32);
+
+ const float* fdata = reinterpret_cast(data);
+
+#ifdef USE_NEON
+ LoadTransposedPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
+
+ if constexpr (PositionHas3Elems)
+ {
+ vertex = vld1q_f32(fdata);
+ }
+ else
+ {
+ vertex = vcombine_f32(vld1_f32(fdata), vdup_n_f32(0.0f));
+ }
+
+ vertex = TransformVertex(vertex, pos0, pos1, pos2, pos3, //
+ proj0, proj1, proj2, proj3);
+#else
+ LoadPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2);
+
+ if constexpr (PositionHas3Elems)
+ {
+#if defined(USE_SSE)
+#ifdef USE_SSE41
+ Vector base = _mm_set1_ps(1.0f);
+ vertex = _mm_blend_ps(_mm_loadu_ps(fdata), base, 8);
+#else
+ Vector base = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
+ Vector mask = _mm_castsi128_ps(_mm_setr_epi32(-1, -1, -1, 0));
+ vertex = _mm_or_ps(_mm_and_ps(_mm_loadu_ps(fdata), mask), base);
+#endif
+#else
+ vertex.x = fdata[0];
+ vertex.y = fdata[1];
+ vertex.z = fdata[2];
+ vertex.w = 1.0f;
+#endif
+ }
+ else
+ {
+#if defined(USE_SSE)
+ Vector base = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
+ vertex = _mm_loadl_pi(base, reinterpret_cast(fdata));
+#else
+ vertex.x = fdata[0];
+ vertex.y = fdata[1];
+ vertex.z = 0.0f;
+ vertex.w = 1.0f;
+#endif
+ }
+
+ vertex = TransformVertexNoTranspose(vertex, pos0, pos1, pos2, proj0, proj1, proj2, proj3);
+#endif
+ }
+ else
+ {
+ const float* fdata = reinterpret_cast(data);
+ if constexpr (PositionHas3Elems)
+ {
+#if defined(USE_SSE)
+ vertex = _mm_loadu_ps(fdata);
+#elif defined(USE_NEON)
+ vertex = vld1q_f32(fdata);
+#else
+ vertex.x = fdata[0];
+ vertex.y = fdata[1];
+ vertex.z = fdata[2];
+ vertex.w = 1.0f;
+#endif
+ }
+ else
+ {
+#if defined(USE_SSE)
+ vertex = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast(fdata));
+#elif defined(USE_NEON)
+ vertex = vcombine_f32(vld1_f32(fdata), vdup_n_f32(0.0f));
+#else
+ vertex.x = fdata[0];
+ vertex.y = fdata[1];
+ vertex.z = 0.0f;
+ vertex.w = 1.0f;
+#endif
+ }
+
+ vertex = TransformVertex(vertex, pos0, pos1, pos2, pos3, //
+ proj0, proj1, proj2, proj3);
+ }
+
+ return vertex;
+}
+
+template
+ATTR_TARGET static void TransformVertices(void* output, const void* vertices, u32 stride, int count)
+{
+ const VertexShaderManager& vsmanager = Core::System::GetInstance().GetVertexShaderManager();
+ const u8* cvertices = static_cast(vertices);
+ Vector* voutput = static_cast(output);
+ u32 idx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 0x3f;
+#ifdef USE_AVX
+ __m256 proj0, proj1, proj2, proj3;
+ __m256 pos0, pos1, pos2, pos3;
+ LoadTransposedYMM(vsmanager.constants.projection.data(), proj0, proj1, proj2, proj3);
+ LoadTransposedPosYMM(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
+ for (int i = 1; i < count; i += 2)
+ {
+ const u8* v0data = cvertices;
+ const u8* v1data = cvertices + stride;
+ __m256 v01 = LoadTransform2Vertices(
+ v0data, v1data, pos0, pos1, pos2, pos3, proj0, proj1, proj2, proj3);
+ _mm256_store_ps(reinterpret_cast(voutput), v01);
+ cvertices += stride * 2;
+ voutput += 2;
+ }
+ if (count & 1)
+ {
+ *voutput = LoadTransformVertex(
+ cvertices, //
+ _mm256_castps256_ps128(pos0), _mm256_castps256_ps128(pos1), //
+ _mm256_castps256_ps128(pos2), _mm256_castps256_ps128(pos3), //
+ _mm256_castps256_ps128(proj0), _mm256_castps256_ps128(proj1), //
+ _mm256_castps256_ps128(proj2), _mm256_castps256_ps128(proj3));
+ }
+#else
+ Vector proj0, proj1, proj2, proj3;
+ Vector pos0, pos1, pos2, pos3;
+ LoadTransposed(vsmanager.constants.projection.data(), proj0, proj1, proj2, proj3);
+ LoadTransposedPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
+ for (int i = 0; i < count; i++)
+ {
+ *voutput = LoadTransformVertex(
+ cvertices, pos0, pos1, pos2, pos3, proj0, proj1, proj2, proj3);
+ cvertices += stride;
+ voutput += 1;
+ }
+#endif
+}
+
+template
+ATTR_TARGET DOLPHIN_FORCE_INLINE static bool CullTriangle(const CPUCull::TransformedVertex& a,
+ const CPUCull::TransformedVertex& b,
+ const CPUCull::TransformedVertex& c)
+{
+ if (Mode == CullMode::All)
+ return true;
+
+ Vector va = reinterpret_cast(a);
+ Vector vb = reinterpret_cast(b);
+ Vector vc = reinterpret_cast(c);
+
+ // See videosoftware Clipper.cpp
+
+#if defined(USE_SSE)
+ Vector wxzya = _mm_shuffle_ps(va, va, _MM_SHUFFLE(1, 2, 0, 3));
+ Vector wxzyc = _mm_shuffle_ps(vc, vc, _MM_SHUFFLE(1, 2, 0, 3));
+ Vector ywzxb = _mm_shuffle_ps(vb, vb, _MM_SHUFFLE(0, 2, 3, 1));
+ Vector part0 = _mm_mul_ps(va, wxzyc);
+ Vector part1 = _mm_mul_ps(vc, wxzya);
+ Vector part2 = _mm_mul_ps(_mm_sub_ps(part0, part1), ywzxb);
+#ifdef USE_SSE3
+ Vector part3 = _mm_movehdup_ps(part2);
+#else
+ Vector part3 = vector_broadcast<1>(part2);
+#endif
+ Vector part4 = vector_broadcast<3>(part2);
+ Vector part5 = _mm_add_ss(_mm_add_ss(part2, part3), part4);
+ float normal_z_dir;
+ _mm_store_ss(&normal_z_dir, part5);
+#elif defined(USE_NEON)
+ Vector zero = vdupq_n_f32(0.0f);
+ Vector wx0ya = vextq_f32(va, vzip1q_f32(va, zero), 3);
+ Vector wx0yc = vextq_f32(vc, vzip1q_f32(vc, zero), 3);
+ Vector ywxxb = vuzp2q_f32(vb, vdupq_laneq_f32(vb, 0));
+ Vector part0 = vmulq_f32(va, wx0yc);
+ Vector part1 = vmulq_f32(vc, wx0ya);
+ Vector part2 = vmulq_f32(vsubq_f32(part0, part1), ywxxb);
+ float normal_z_dir = vaddvq_f32(part2);
+#else
+ float normal_z_dir = (c.w * a.x - a.w * c.x) * b.y + //
+ (c.x * a.y - a.x * c.y) * b.w + //
+ (c.y * a.w - a.y * c.w) * b.x;
+#endif
+ bool cull = false;
+ switch (Mode)
+ {
+ case CullMode::None:
+ cull = normal_z_dir == 0;
+ break;
+ case CullMode::Front:
+ cull = normal_z_dir <= 0;
+ break;
+ case CullMode::Back:
+ cull = normal_z_dir >= 0;
+ break;
+ case CullMode::All:
+ cull = true;
+ break;
+ }
+ if (cull)
+ return true;
+
+#if defined(USE_SSE)
+ Vector xyab = _mm_unpacklo_ps(va, vb);
+ Vector zwab = _mm_unpackhi_ps(va, vb);
+ Vector allx = _mm_shuffle_ps(xyab, vc, _MM_SHUFFLE(0, 0, 1, 0));
+ Vector ally = _mm_shuffle_ps(xyab, vc, _MM_SHUFFLE(1, 1, 3, 2));
+ Vector allpw = _mm_shuffle_ps(zwab, vc, _MM_SHUFFLE(3, 3, 3, 2));
+ Vector allnw = _mm_xor_ps(allpw, _mm_set1_ps(-0.0f));
+ __m128i x_gt_pw = _mm_castps_si128(_mm_cmple_ps(allpw, allx));
+ __m128i y_gt_pw = _mm_castps_si128(_mm_cmple_ps(allpw, ally));
+ __m128i x_lt_nw = _mm_castps_si128(_mm_cmplt_ps(allx, allnw));
+ __m128i y_lt_nw = _mm_castps_si128(_mm_cmplt_ps(ally, allnw));
+ __m128i any_out_of_bounds = _mm_packs_epi16(_mm_packs_epi32(x_lt_nw, y_lt_nw), //
+ _mm_packs_epi32(x_gt_pw, y_gt_pw));
+ cull |= 0 != _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_set1_epi32(~0), any_out_of_bounds));
+#elif defined(USE_NEON)
+ float64x2_t xyab = vreinterpretq_f64_f32(vzip1q_f32(va, vb));
+ float64x2_t xycc = vreinterpretq_f64_f32(vzip1q_f32(vc, vc));
+ float32x4_t allx = vreinterpretq_f32_f64(vzip1q_f64(xyab, xycc));
+ float32x4_t ally = vreinterpretq_f32_f64(vzip2q_f64(xyab, xycc));
+ float32x4_t allpw = vextq_f32(vzip2q_f32(va, vb), vdupq_laneq_f32(vc, 3), 2);
+ float32x4_t allnw = vnegq_f32(allpw);
+ uint16x8_t x_gt_pw = vreinterpretq_u16_u32(vcgtq_f32(allx, allpw));
+ uint16x8_t y_gt_pw = vreinterpretq_u16_u32(vcgtq_f32(ally, allpw));
+ uint16x8_t x_lt_nw = vreinterpretq_u16_u32(vcltq_f32(allx, allnw));
+ uint16x8_t y_lt_nw = vreinterpretq_u16_u32(vcltq_f32(ally, allnw));
+ uint8x16_t lt_nw = vreinterpretq_u8_u16(vuzp1q_u16(x_lt_nw, y_lt_nw));
+ uint8x16_t gt_pw = vreinterpretq_u8_u16(vuzp1q_u16(x_gt_pw, y_gt_pw));
+ uint32x4_t any_out_of_bounds = vreinterpretq_u32_u8(vuzp1q_u8(lt_nw, gt_pw));
+ cull |= 0xFFFFFFFF == vmaxvq_u32(any_out_of_bounds);
+#else
+ cull |= a.x < -a.w && b.x < -b.w && c.x < -c.w;
+ cull |= a.y < -a.w && b.y < -b.w && c.y < -c.w;
+ cull |= a.x > a.w && b.x > b.w && c.x > c.w;
+ cull |= a.y > a.w && b.y > b.w && c.y > c.w;
+#endif
+
+ return cull;
+}
+
+template
+ATTR_TARGET static bool AreAllVerticesCulled(const CPUCull::TransformedVertex* transformed,
+ int count)
+{
+ switch (Primitive)
+ {
+ case OpcodeDecoder::Primitive::GX_DRAW_QUADS:
+ case OpcodeDecoder::Primitive::GX_DRAW_QUADS_2:
+ {
+ int i = 3;
+ for (; i < count; i += 4)
+ {
+ if (!CullTriangle(transformed[i - 3], transformed[i - 2], transformed[i - 1]))
+ return false;
+ if (!CullTriangle(transformed[i - 3], transformed[i - 1], transformed[i - 0]))
+ return false;
+ }
+ // three vertices remaining, so render a triangle
+ if (i == count)
+ {
+ if (!CullTriangle(transformed[i - 3], transformed[i - 2], transformed[i - 1]))
+ return false;
+ }
+ break;
+ }
+ case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLES:
+ for (int i = 2; i < count; i += 3)
+ {
+ if (!CullTriangle(transformed[i - 2], transformed[i - 1], transformed[i - 0]))
+ return false;
+ }
+ break;
+ case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_STRIP:
+ {
+ bool wind = false;
+ for (int i = 2; i < count; ++i)
+ {
+ if (!CullTriangle(transformed[i - 2], transformed[i - !wind], transformed[i - wind]))
+ return false;
+ wind = !wind;
+ }
+ break;
+ }
+ case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN:
+ for (int i = 2; i < count; ++i)
+ {
+ if (!CullTriangle(transformed[0], transformed[i - 1], transformed[i]))
+ return false;
+ }
+ break;
+ }
+
+ return true;
+}
+
+} // namespace VECTOR_NAMESPACE
+
+#undef ATTR_TARGET
+#undef VECTOR_NAMESPACE
diff --git a/Source/Core/VideoCommon/ConstantManager.h b/Source/Core/VideoCommon/ConstantManager.h
index 6e60929056..88c25a9823 100644
--- a/Source/Core/VideoCommon/ConstantManager.h
+++ b/Source/Core/VideoCommon/ConstantManager.h
@@ -17,7 +17,7 @@ enum class SrcBlendFactor : u32;
enum class ZTexOp : u32;
enum class LogicOp : u32;
-struct PixelShaderConstants
+struct alignas(16) PixelShaderConstants
{
std::array colors;
std::array kcolors;
@@ -60,7 +60,7 @@ struct PixelShaderConstants
LogicOp logic_op_mode;
};
-struct VertexShaderConstants
+struct alignas(16) VertexShaderConstants
{
u32 components; // .x
u32 xfmem_dualTexInfo; // .y
@@ -109,7 +109,7 @@ enum class VSExpand : u32
Line,
};
-struct GeometryShaderConstants
+struct alignas(16) GeometryShaderConstants
{
float4 stereoparams;
float4 lineptparams;
diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp
index abc59b862c..deda07fd65 100644
--- a/Source/Core/VideoCommon/VertexShaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexShaderManager.cpp
@@ -65,6 +65,97 @@ void VertexShaderManager::Dirty()
dirty = true;
}
+Common::Matrix44 VertexShaderManager::LoadProjectionMatrix()
+{
+ const auto& rawProjection = xfmem.projection.rawProjection;
+
+ switch (xfmem.projection.type)
+ {
+ case ProjectionType::Perspective:
+ {
+ const Common::Vec2 fov_multiplier = g_freelook_camera.IsActive() ?
+ g_freelook_camera.GetFieldOfViewMultiplier() :
+ Common::Vec2{1, 1};
+ m_projection_matrix[0] = rawProjection[0] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
+ m_projection_matrix[1] = 0.0f;
+ m_projection_matrix[2] = rawProjection[1] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
+ m_projection_matrix[3] = 0.0f;
+
+ m_projection_matrix[4] = 0.0f;
+ m_projection_matrix[5] = rawProjection[2] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
+ m_projection_matrix[6] = rawProjection[3] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
+ m_projection_matrix[7] = 0.0f;
+
+ m_projection_matrix[8] = 0.0f;
+ m_projection_matrix[9] = 0.0f;
+ m_projection_matrix[10] = rawProjection[4];
+ m_projection_matrix[11] = rawProjection[5];
+
+ m_projection_matrix[12] = 0.0f;
+ m_projection_matrix[13] = 0.0f;
+
+ m_projection_matrix[14] = -1.0f;
+ m_projection_matrix[15] = 0.0f;
+
+ g_stats.gproj = m_projection_matrix;
+ }
+ break;
+
+ case ProjectionType::Orthographic:
+ {
+ m_projection_matrix[0] = rawProjection[0];
+ m_projection_matrix[1] = 0.0f;
+ m_projection_matrix[2] = 0.0f;
+ m_projection_matrix[3] = rawProjection[1];
+
+ m_projection_matrix[4] = 0.0f;
+ m_projection_matrix[5] = rawProjection[2];
+ m_projection_matrix[6] = 0.0f;
+ m_projection_matrix[7] = rawProjection[3];
+
+ m_projection_matrix[8] = 0.0f;
+ m_projection_matrix[9] = 0.0f;
+ m_projection_matrix[10] = rawProjection[4];
+ m_projection_matrix[11] = rawProjection[5];
+
+ m_projection_matrix[12] = 0.0f;
+ m_projection_matrix[13] = 0.0f;
+
+ m_projection_matrix[14] = 0.0f;
+ m_projection_matrix[15] = 1.0f;
+
+ g_stats.g2proj = m_projection_matrix;
+ g_stats.proj = rawProjection;
+ }
+ break;
+
+ default:
+ ERROR_LOG_FMT(VIDEO, "Unknown projection type: {}", xfmem.projection.type);
+ }
+
+ PRIM_LOG("Projection: {} {} {} {} {} {}", rawProjection[0], rawProjection[1], rawProjection[2],
+ rawProjection[3], rawProjection[4], rawProjection[5]);
+
+ auto corrected_matrix = m_viewport_correction * Common::Matrix44::FromArray(m_projection_matrix);
+
+ if (g_freelook_camera.IsActive() && xfmem.projection.type == ProjectionType::Perspective)
+ corrected_matrix *= g_freelook_camera.GetView();
+
+ g_freelook_camera.GetController()->SetClean();
+
+ return corrected_matrix;
+}
+
+void VertexShaderManager::SetProjectionMatrix()
+{
+ if (m_projection_changed || g_freelook_camera.GetController()->IsDirty())
+ {
+ m_projection_changed = false;
+ auto corrected_matrix = LoadProjectionMatrix();
+ memcpy(constants.projection.data(), corrected_matrix.data.data(), 4 * sizeof(float4));
+ }
+}
+
// Syncs the shader constant buffers with xfmem
// TODO: A cleaner way to control the matrices without making a mess in the parameters field
void VertexShaderManager::SetConstants(const std::vector& textures)
@@ -317,84 +408,7 @@ void VertexShaderManager::SetConstants(const std::vector& textures)
m_projection_changed = false;
m_projection_graphics_mod_change = !projection_actions.empty();
- const auto& rawProjection = xfmem.projection.rawProjection;
-
- switch (xfmem.projection.type)
- {
- case ProjectionType::Perspective:
- {
- const Common::Vec2 fov_multiplier = g_freelook_camera.IsActive() ?
- g_freelook_camera.GetFieldOfViewMultiplier() :
- Common::Vec2{1, 1};
- m_projection_matrix[0] =
- rawProjection[0] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
- m_projection_matrix[1] = 0.0f;
- m_projection_matrix[2] =
- rawProjection[1] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
- m_projection_matrix[3] = 0.0f;
-
- m_projection_matrix[4] = 0.0f;
- m_projection_matrix[5] =
- rawProjection[2] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
- m_projection_matrix[6] =
- rawProjection[3] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
- m_projection_matrix[7] = 0.0f;
-
- m_projection_matrix[8] = 0.0f;
- m_projection_matrix[9] = 0.0f;
- m_projection_matrix[10] = rawProjection[4];
- m_projection_matrix[11] = rawProjection[5];
-
- m_projection_matrix[12] = 0.0f;
- m_projection_matrix[13] = 0.0f;
-
- m_projection_matrix[14] = -1.0f;
- m_projection_matrix[15] = 0.0f;
-
- g_stats.gproj = m_projection_matrix;
- }
- break;
-
- case ProjectionType::Orthographic:
- {
- m_projection_matrix[0] = rawProjection[0];
- m_projection_matrix[1] = 0.0f;
- m_projection_matrix[2] = 0.0f;
- m_projection_matrix[3] = rawProjection[1];
-
- m_projection_matrix[4] = 0.0f;
- m_projection_matrix[5] = rawProjection[2];
- m_projection_matrix[6] = 0.0f;
- m_projection_matrix[7] = rawProjection[3];
-
- m_projection_matrix[8] = 0.0f;
- m_projection_matrix[9] = 0.0f;
- m_projection_matrix[10] = rawProjection[4];
- m_projection_matrix[11] = rawProjection[5];
-
- m_projection_matrix[12] = 0.0f;
- m_projection_matrix[13] = 0.0f;
-
- m_projection_matrix[14] = 0.0f;
- m_projection_matrix[15] = 1.0f;
-
- g_stats.g2proj = m_projection_matrix;
- g_stats.proj = rawProjection;
- }
- break;
-
- default:
- ERROR_LOG_FMT(VIDEO, "Unknown projection type: {}", xfmem.projection.type);
- }
-
- PRIM_LOG("Projection: {} {} {} {} {} {}", rawProjection[0], rawProjection[1], rawProjection[2],
- rawProjection[3], rawProjection[4], rawProjection[5]);
-
- auto corrected_matrix =
- m_viewport_correction * Common::Matrix44::FromArray(m_projection_matrix);
-
- if (g_freelook_camera.IsActive() && xfmem.projection.type == ProjectionType::Perspective)
- corrected_matrix *= g_freelook_camera.GetView();
+ auto corrected_matrix = LoadProjectionMatrix();
GraphicsModActionData::Projection projection{&corrected_matrix};
for (auto action : projection_actions)
@@ -404,8 +418,6 @@ void VertexShaderManager::SetConstants(const std::vector& textures)
memcpy(constants.projection.data(), corrected_matrix.data.data(), 4 * sizeof(float4));
- g_freelook_camera.GetController()->SetClean();
-
dirty = true;
}
diff --git a/Source/Core/VideoCommon/VertexShaderManager.h b/Source/Core/VideoCommon/VertexShaderManager.h
index ce157bd596..9a150980da 100644
--- a/Source/Core/VideoCommon/VertexShaderManager.h
+++ b/Source/Core/VideoCommon/VertexShaderManager.h
@@ -24,6 +24,7 @@ public:
void DoState(PointerWrap& p);
// constant management
+ void SetProjectionMatrix();
void SetConstants(const std::vector& textures);
void InvalidateXFRange(int start, int end);
@@ -64,4 +65,6 @@ private:
std::array m_minmax_lights_changed{};
Common::Matrix44 m_viewport_correction{};
+
+ Common::Matrix44 LoadProjectionMatrix();
};
diff --git a/Source/Core/VideoCommon/XFMemory.h b/Source/Core/VideoCommon/XFMemory.h
index 71d38cb8bd..e0a2696317 100644
--- a/Source/Core/VideoCommon/XFMemory.h
+++ b/Source/Core/VideoCommon/XFMemory.h
@@ -423,7 +423,7 @@ struct Projection
ProjectionType type;
};
-struct XFMemory
+struct alignas(16) XFMemory
{
float posMatrices[256]; // 0x0000 - 0x00ff
u32 unk0[768]; // 0x0100 - 0x03ff