diff --git a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/model/BooleanSetting.java b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/model/BooleanSetting.java
index 96383c1a87..6ebaad86c9 100644
--- a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/model/BooleanSetting.java
+++ b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/model/BooleanSetting.java
@@ -214,6 +214,7 @@ public enum BooleanSetting implements AbstractBooleanSetting
"SaveTextureCacheToState", true),
GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION(Settings.FILE_GFX, Settings.SECTION_GFX_SETTINGS,
"PreferVSForLinePointExpansion", false),
+ GFX_CPU_CULL(Settings.FILE_GFX, Settings.SECTION_GFX_SETTINGS, "CPUCull", false),
GFX_MODS_ENABLE(Settings.FILE_GFX, Settings.SECTION_GFX_SETTINGS, "EnableMods", false),
GFX_ENHANCE_FORCE_FILTERING(Settings.FILE_GFX, Settings.SECTION_GFX_ENHANCEMENTS,
diff --git a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/ui/SettingsFragmentPresenter.java b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/ui/SettingsFragmentPresenter.java
index 93bd5bbf2e..5e8c5bd912 100644
--- a/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/ui/SettingsFragmentPresenter.java
+++ b/Source/Android/app/src/main/java/org/dolphinemu/dolphinemu/features/settings/ui/SettingsFragmentPresenter.java
@@ -938,6 +938,8 @@ public final class SettingsFragmentPresenter
sl.add(new SwitchSetting(mContext, BooleanSetting.GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION,
R.string.prefer_vs_for_point_line_expansion,
R.string.prefer_vs_for_point_line_expansion_description));
+ sl.add(new SwitchSetting(mContext, BooleanSetting.GFX_CPU_CULL, R.string.cpu_cull,
+ R.string.cpu_cull_description));
sl.add(new SwitchSetting(mContext, BooleanSetting.GFX_HACK_EFB_DEFER_INVALIDATION,
R.string.defer_efb_invalidation, R.string.defer_efb_invalidation_description));
sl.add(new InvertedSwitchSetting(mContext, BooleanSetting.GFX_HACK_FAST_TEXTURE_SAMPLING,
diff --git a/Source/Android/app/src/main/res/values/strings.xml b/Source/Android/app/src/main/res/values/strings.xml
index e29acd6227..093e67b7cc 100644
--- a/Source/Android/app/src/main/res/values/strings.xml
+++ b/Source/Android/app/src/main/res/values/strings.xml
@@ -361,6 +361,8 @@
Enables graphics backend multithreading (Vulkan only). May affect performance. If unsure, leave this checked.
Prefer VS for Point/Line Expansion
On backends that support both using the geometry shader and the vertex shader for expanding points and lines, selects the vertex shader for the job. May affect performance.
+    <string name="cpu_cull">Cull Vertices on the CPU</string>
+    <string name="cpu_cull_description">Cull vertices on the CPU to reduce the number of draw calls required. May affect performance. If unsure, leave this unchecked.</string>
Defer EFB Cache Invalidation
Defers invalidation of the EFB access cache until a GPU synchronization command is executed. May improve performance in some games at the cost of stability. If unsure, leave this unchecked.
Manual Texture Sampling
diff --git a/Source/Core/Core/Config/GraphicsSettings.cpp b/Source/Core/Core/Config/GraphicsSettings.cpp
index 816179a565..dcf6e817d8 100644
--- a/Source/Core/Core/Config/GraphicsSettings.cpp
+++ b/Source/Core/Core/Config/GraphicsSettings.cpp
@@ -93,6 +93,7 @@ const Info GFX_SAVE_TEXTURE_CACHE_TO_STATE{
{System::GFX, "Settings", "SaveTextureCacheToState"}, true};
const Info GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION{
{System::GFX, "Settings", "PreferVSForLinePointExpansion"}, false};
+const Info<bool> GFX_CPU_CULL{{System::GFX, "Settings", "CPUCull"}, false};
const Info GFX_MTL_MANUALLY_UPLOAD_BUFFERS{
{System::GFX, "Settings", "ManuallyUploadBuffers"}, TriState::Auto};
diff --git a/Source/Core/Core/Config/GraphicsSettings.h b/Source/Core/Core/Config/GraphicsSettings.h
index f53248995d..ad8d2aeb7f 100644
--- a/Source/Core/Core/Config/GraphicsSettings.h
+++ b/Source/Core/Core/Config/GraphicsSettings.h
@@ -82,6 +82,7 @@ extern const Info GFX_SHADER_COMPILER_THREADS;
extern const Info GFX_SHADER_PRECOMPILER_THREADS;
extern const Info GFX_SAVE_TEXTURE_CACHE_TO_STATE;
extern const Info GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION;
+extern const Info<bool> GFX_CPU_CULL;
extern const Info GFX_MTL_MANUALLY_UPLOAD_BUFFERS;
extern const Info GFX_MTL_USE_PRESENT_DRAWABLE;
diff --git a/Source/Core/DolphinLib.props b/Source/Core/DolphinLib.props
index 7150e8d783..1bb676eeae 100644
--- a/Source/Core/DolphinLib.props
+++ b/Source/Core/DolphinLib.props
@@ -632,6 +632,8 @@
+    <ClInclude Include="VideoCommon\CPUCull.h" />
+    <ClInclude Include="VideoCommon\CPUCullImpl.h" />
@@ -1226,6 +1228,7 @@
+    <ClCompile Include="VideoCommon\CPUCull.cpp" />
diff --git a/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.cpp b/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.cpp
index 44dbe30487..102e93e09b 100644
--- a/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.cpp
+++ b/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.cpp
@@ -159,16 +159,18 @@ void AdvancedWidget::CreateWidgets()
m_prefer_vs_for_point_line_expansion = new GraphicsBool(
// i18n: VS is short for vertex shaders.
tr("Prefer VS for Point/Line Expansion"), Config::GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION);
+ m_cpu_cull = new GraphicsBool(tr("Cull Vertices on the CPU"), Config::GFX_CPU_CULL);
misc_layout->addWidget(m_enable_cropping, 0, 0);
misc_layout->addWidget(m_enable_prog_scan, 0, 1);
misc_layout->addWidget(m_backend_multithreading, 1, 0);
misc_layout->addWidget(m_prefer_vs_for_point_line_expansion, 1, 1);
+ misc_layout->addWidget(m_cpu_cull, 2, 0);
#ifdef _WIN32
m_borderless_fullscreen =
new GraphicsBool(tr("Borderless Fullscreen"), Config::GFX_BORDERLESS_FULLSCREEN);
- misc_layout->addWidget(m_borderless_fullscreen, 2, 0);
+ misc_layout->addWidget(m_borderless_fullscreen, 2, 1);
#endif
// Experimental.
@@ -369,6 +371,10 @@ void AdvancedWidget::AddDescriptions()
"for expanding points and lines, selects the vertex shader for the job. May "
"affect performance."
"
%1");
+  static const char TR_CPU_CULL_DESCRIPTION[] =
+      QT_TR_NOOP("Cull vertices on the CPU to reduce the number of draw calls required. "
+                 "May affect performance and draw statistics.<br><br>"
+                 "If unsure, leave this unchecked.");
static const char TR_DEFER_EFB_ACCESS_INVALIDATION_DESCRIPTION[] = QT_TR_NOOP(
"Defers invalidation of the EFB access cache until a GPU synchronization command "
"is executed. If disabled, the cache will be invalidated with every draw call. "
@@ -441,6 +447,7 @@ void AdvancedWidget::AddDescriptions()
vsexpand_extra = tr(IF_UNSURE_UNCHECKED);
m_prefer_vs_for_point_line_expansion->SetDescription(
tr(TR_PREFER_VS_FOR_POINT_LINE_EXPANSION_DESCRIPTION).arg(vsexpand_extra));
+ m_cpu_cull->SetDescription(tr(TR_CPU_CULL_DESCRIPTION));
#ifdef _WIN32
m_borderless_fullscreen->SetDescription(tr(TR_BORDERLESS_FULLSCREEN_DESCRIPTION));
#endif
diff --git a/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.h b/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.h
index 156cf568d8..cd3f135470 100644
--- a/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.h
+++ b/Source/Core/DolphinQt/Config/Graphics/AdvancedWidget.h
@@ -69,6 +69,7 @@ private:
ToolTipCheckBox* m_enable_prog_scan;
GraphicsBool* m_backend_multithreading;
GraphicsBool* m_prefer_vs_for_point_line_expansion;
+ GraphicsBool* m_cpu_cull;
GraphicsBool* m_borderless_fullscreen;
// Experimental
diff --git a/Source/Core/VideoCommon/CMakeLists.txt b/Source/Core/VideoCommon/CMakeLists.txt
index a7a52c5e40..4e8bff36fc 100644
--- a/Source/Core/VideoCommon/CMakeLists.txt
+++ b/Source/Core/VideoCommon/CMakeLists.txt
@@ -23,6 +23,9 @@ add_library(videocommon
ConstantManager.h
CPMemory.cpp
CPMemory.h
+ CPUCull.cpp
+ CPUCull.h
+ CPUCullImpl.h
DriverDetails.cpp
DriverDetails.h
Fifo.cpp
diff --git a/Source/Core/VideoCommon/CPUCull.cpp b/Source/Core/VideoCommon/CPUCull.cpp
new file mode 100644
index 0000000000..92fdc81cc8
--- /dev/null
+++ b/Source/Core/VideoCommon/CPUCull.cpp
@@ -0,0 +1,174 @@
+// Copyright 2022 Dolphin Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "VideoCommon/CPUCull.h"
+
+#include "Common/Assert.h"
+#include "Common/CPUDetect.h"
+#include "Common/MathUtil.h"
+#include "Common/MemoryUtil.h"
+#include "Core/System.h"
+
+#include "VideoCommon/CPMemory.h"
+#include "VideoCommon/VertexManagerBase.h"
+#include "VideoCommon/VertexShaderManager.h"
+#include "VideoCommon/VideoConfig.h"
+#include "VideoCommon/XFMemory.h"
+
+// We really want things like c.w * a.x - a.w * c.x to stay symmetric, so they cancel to zero on
+// degenerate triangles. Make sure the compiler doesn't optimize in fmas where not requested.
+#ifdef _MSC_VER
+#pragma fp_contract(off)
+#else
+// GCC doesn't support any in-file way to turn off fp contract yet
+// Not ideal, but worst case scenario its cpu cull is worse at detecting degenerate triangles
+// (Most likely to happen on arm, as we don't compile the cull code for x86 fma)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunknown-pragmas"
+#pragma STDC FP_CONTRACT OFF
+#pragma GCC diagnostic pop
+#endif
+
+#if defined(_M_X86) || defined(_M_X86_64)
+#define USE_SSE
+#elif defined(_M_ARM_64)
+#define USE_NEON
+#else
+#define NO_SIMD
+#endif
+
+#if defined(USE_SSE)
+#include <immintrin.h>
+#elif defined(USE_NEON)
+#include <arm_neon.h>
+#endif
+
+#include "VideoCommon/CPUCullImpl.h"
+#ifdef USE_SSE
+#define USE_SSE3
+#include "VideoCommon/CPUCullImpl.h"
+#define USE_SSE41
+#include "VideoCommon/CPUCullImpl.h"
+#define USE_AVX
+#include "VideoCommon/CPUCullImpl.h"
+#define USE_FMA
+#include "VideoCommon/CPUCullImpl.h"
+#endif
+
+#if defined(USE_SSE)
+#if defined(__AVX__) && defined(__FMA__)
+static constexpr int MIN_SSE = 51;
+#elif defined(__AVX__)
+static constexpr int MIN_SSE = 50;
+#elif defined(__SSE4_1__)
+static constexpr int MIN_SSE = 41;
+#elif defined(__SSE3__)
+static constexpr int MIN_SSE = 30;
+#else
+static constexpr int MIN_SSE = 0;
+#endif
+#endif
+
+template <bool PositionHas3Elems, bool PerVertexPosMtx>
+static CPUCull::TransformFunction GetTransformFunction()
+{
+#if defined(USE_SSE)
+  if (MIN_SSE >= 51 || (cpu_info.bAVX && cpu_info.bFMA))
+    return CPUCull_FMA::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
+  else if (MIN_SSE >= 50 || cpu_info.bAVX)
+    return CPUCull_AVX::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
+  else if (PositionHas3Elems && PerVertexPosMtx && (MIN_SSE >= 41 || cpu_info.bSSE4_1))
+    return CPUCull_SSE41::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
+  else if (PositionHas3Elems && (MIN_SSE >= 30 || cpu_info.bSSE3))
+    return CPUCull_SSE3::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
+  else
+    return CPUCull_SSE::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
+#elif defined(USE_NEON)
+  return CPUCull_NEON::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
+#else
+  return CPUCull_Scalar::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
+#endif
+}
+
+template <OpcodeDecoder::Primitive Primitive, CullMode Mode>
+static CPUCull::CullFunction GetCullFunction0()
+{
+#if defined(USE_SSE)
+  // Note: AVX version only actually AVX on compilers that support __attribute__((target))
+  // Sorry, MSVC + Sandy Bridge. (Ivy+ and AMD see very little benefit thanks to mov elimination)
+  if (MIN_SSE >= 50 || cpu_info.bAVX)
+    return CPUCull_AVX::AreAllVerticesCulled<Primitive, Mode>;
+  else if (MIN_SSE >= 30 || cpu_info.bSSE3)
+    return CPUCull_SSE3::AreAllVerticesCulled<Primitive, Mode>;
+  else
+    return CPUCull_SSE::AreAllVerticesCulled<Primitive, Mode>;
+#elif defined(USE_NEON)
+  return CPUCull_NEON::AreAllVerticesCulled<Primitive, Mode>;
+#else
+  return CPUCull_Scalar::AreAllVerticesCulled<Primitive, Mode>;
+#endif
+}
+
+template <OpcodeDecoder::Primitive Primitive>
+static Common::EnumMap<CPUCull::CullFunction, CullMode::All> GetCullFunction1()
+{
+  return {
+      GetCullFunction0<Primitive, CullMode::None>(),
+      GetCullFunction0<Primitive, CullMode::Back>(),
+      GetCullFunction0<Primitive, CullMode::Front>(),
+      GetCullFunction0<Primitive, CullMode::All>(),
+  };
+}
+
+CPUCull::~CPUCull() = default;
+
+void CPUCull::Init()
+{
+  m_transform_table[false][false] = GetTransformFunction<false, false>();
+  m_transform_table[false][true] = GetTransformFunction<false, true>();
+  m_transform_table[true][false] = GetTransformFunction<true, false>();
+  m_transform_table[true][true] = GetTransformFunction<true, true>();
+  using Prim = OpcodeDecoder::Primitive;
+  m_cull_table[Prim::GX_DRAW_QUADS] = GetCullFunction1<Prim::GX_DRAW_QUADS>();
+  m_cull_table[Prim::GX_DRAW_QUADS_2] = GetCullFunction1<Prim::GX_DRAW_QUADS_2>();
+  m_cull_table[Prim::GX_DRAW_TRIANGLES] = GetCullFunction1<Prim::GX_DRAW_TRIANGLES>();
+  m_cull_table[Prim::GX_DRAW_TRIANGLE_STRIP] = GetCullFunction1<Prim::GX_DRAW_TRIANGLE_STRIP>();
+  m_cull_table[Prim::GX_DRAW_TRIANGLE_FAN] = GetCullFunction1<Prim::GX_DRAW_TRIANGLE_FAN>();
+}
+
+bool CPUCull::AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive,
+ const u8* src, u32 count)
+{
+ ASSERT_MSG(VIDEO, primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES,
+ "CPUCull should not be called on lines or points");
+ const u32 stride = loader->m_native_vtx_decl.stride;
+ const bool posHas3Elems = loader->m_native_vtx_decl.position.components >= 3;
+ const bool perVertexPosMtx = loader->m_native_vtx_decl.posmtx.enable;
+ if (m_transform_buffer_size < count) [[unlikely]]
+ {
+ u32 new_size = MathUtil::NextPowerOf2(count);
+ m_transform_buffer_size = new_size;
+    m_transform_buffer.reset(static_cast<TransformedVertex*>(
+        Common::AllocateAlignedMemory(new_size * sizeof(TransformedVertex), 32)));
+ }
+
+  // transform functions need the projection matrix to transform to clip space
+ Core::System::GetInstance().GetVertexShaderManager().SetProjectionMatrix();
+
+ static constexpr Common::EnumMap cullmode_invert = {
+ CullMode::None, CullMode::Front, CullMode::Back, CullMode::All};
+
+ CullMode cullmode = bpmem.genMode.cullmode;
+ if (xfmem.viewport.ht > 0) // See videosoftware Clipper.cpp:IsBackface
+ cullmode = cullmode_invert[cullmode];
+ const TransformFunction transform = m_transform_table[posHas3Elems][perVertexPosMtx];
+ transform(m_transform_buffer.get(), src, stride, count);
+ const CullFunction cull = m_cull_table[primitive][cullmode];
+ return cull(m_transform_buffer.get(), count);
+}
+
+template <typename T>
+void CPUCull::BufferDeleter::operator()(T* ptr)
+{
+ Common::FreeAlignedMemory(ptr);
+}
diff --git a/Source/Core/VideoCommon/CPUCull.h b/Source/Core/VideoCommon/CPUCull.h
new file mode 100644
index 0000000000..40248035e8
--- /dev/null
+++ b/Source/Core/VideoCommon/CPUCull.h
@@ -0,0 +1,38 @@
+// Copyright 2022 Dolphin Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "VideoCommon/BPMemory.h"
+#include "VideoCommon/DataReader.h"
+#include "VideoCommon/OpcodeDecoding.h"
+
+class CPUCull
+{
+public:
+ ~CPUCull();
+ void Init();
+ bool AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive,
+ const u8* src, u32 count);
+
+ struct alignas(16) TransformedVertex
+ {
+ float x, y, z, w;
+ };
+
+ using TransformFunction = void (*)(void*, const void*, u32, int);
+ using CullFunction = bool (*)(const CPUCull::TransformedVertex*, int);
+
+private:
+  template <typename T>
+ struct BufferDeleter
+ {
+ void operator()(T* ptr);
+ };
+  std::unique_ptr<TransformedVertex[], BufferDeleter<TransformedVertex>> m_transform_buffer;
+ u32 m_transform_buffer_size = 0;
+  std::array<std::array<TransformFunction, 2>, 2> m_transform_table;
+  Common::EnumMap<Common::EnumMap<CullFunction, CullMode::All>,
+                  OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN>
+      m_cull_table;
+};
diff --git a/Source/Core/VideoCommon/CPUCullImpl.h b/Source/Core/VideoCommon/CPUCullImpl.h
new file mode 100644
index 0000000000..e9a1d545c2
--- /dev/null
+++ b/Source/Core/VideoCommon/CPUCullImpl.h
@@ -0,0 +1,714 @@
+// Copyright 2022 Dolphin Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#if defined(USE_FMA)
+#define VECTOR_NAMESPACE CPUCull_FMA
+#elif defined(USE_AVX)
+#define VECTOR_NAMESPACE CPUCull_AVX
+#elif defined(USE_SSE41)
+#define VECTOR_NAMESPACE CPUCull_SSE41
+#elif defined(USE_SSE3)
+#define VECTOR_NAMESPACE CPUCull_SSE3
+#elif defined(USE_SSE)
+#define VECTOR_NAMESPACE CPUCull_SSE
+#elif defined(USE_NEON)
+#define VECTOR_NAMESPACE CPUCull_NEON
+#elif defined(NO_SIMD)
+#define VECTOR_NAMESPACE CPUCull_Scalar
+#else
+#error This file is meant to be used by CPUCull.cpp only!
+#endif
+
+#if defined(__GNUC__) && defined(USE_FMA) && !(defined(__AVX__) && defined(__FMA__))
+#define ATTR_TARGET __attribute__((target("avx,fma")))
+#elif defined(__GNUC__) && defined(USE_AVX) && !defined(__AVX__)
+#define ATTR_TARGET __attribute__((target("avx")))
+#elif defined(__GNUC__) && defined(USE_SSE41) && !defined(__SSE4_1__)
+#define ATTR_TARGET __attribute__((target("sse4.1")))
+#elif defined(__GNUC__) && defined(USE_SSE3) && !defined(__SSE3__)
+#define ATTR_TARGET __attribute__((target("sse3")))
+#else
+#define ATTR_TARGET
+#endif
+
+namespace VECTOR_NAMESPACE
+{
+#if defined(USE_SSE)
+typedef __m128 Vector;
+#elif defined(USE_NEON)
+typedef float32x4_t Vector;
+#else
+struct alignas(16) Vector
+{
+ float x, y, z, w;
+};
+#endif
+static_assert(sizeof(Vector) == 16);
+
+#ifdef USE_NEON
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector vsetr_f32(float x, float y, float z, float w)
+{
+ float tmp[4] = {x, y, z, w};
+ return vld1q_f32(tmp);
+}
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void vuzp12q_f32(Vector& a, Vector& b)
+{
+ Vector tmp = vuzp2q_f32(a, b);
+ a = vuzp1q_f32(a, b);
+ b = tmp;
+}
+#endif
+#ifdef USE_SSE
+template <int i>
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector vector_broadcast(Vector v)
+{
+ return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
+}
+#endif
+#ifdef USE_AVX
+template <int i>
+ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 vector_broadcast(__m256 v)
+{
+ return _mm256_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
+}
+#endif
+
+#ifdef USE_AVX
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void TransposeYMM(__m256& o0, __m256& o1, //
+ __m256& o2, __m256& o3)
+{
+ __m256d tmp0 = _mm256_castps_pd(_mm256_unpacklo_ps(o0, o1));
+ __m256d tmp1 = _mm256_castps_pd(_mm256_unpacklo_ps(o2, o3));
+ __m256d tmp2 = _mm256_castps_pd(_mm256_unpackhi_ps(o0, o1));
+ __m256d tmp3 = _mm256_castps_pd(_mm256_unpackhi_ps(o2, o3));
+ o0 = _mm256_castpd_ps(_mm256_unpacklo_pd(tmp0, tmp1));
+ o1 = _mm256_castpd_ps(_mm256_unpackhi_pd(tmp0, tmp1));
+ o2 = _mm256_castpd_ps(_mm256_unpacklo_pd(tmp2, tmp3));
+ o3 = _mm256_castpd_ps(_mm256_unpackhi_pd(tmp2, tmp3));
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposedYMM(const void* source, __m256& o0,
+ __m256& o1, __m256& o2, __m256& o3)
+{
+  const Vector* vsource = static_cast<const Vector*>(source);
+ o0 = _mm256_broadcast_ps(&vsource[0]);
+ o1 = _mm256_broadcast_ps(&vsource[1]);
+ o2 = _mm256_broadcast_ps(&vsource[2]);
+ o3 = _mm256_broadcast_ps(&vsource[3]);
+ TransposeYMM(o0, o1, o2, o3);
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadPosYMM(const void* sourcel, const void* sourceh,
+ __m256& o0, __m256& o1, __m256& o2)
+{
+  const Vector* vsourcel = static_cast<const Vector*>(sourcel);
+  const Vector* vsourceh = static_cast<const Vector*>(sourceh);
+ o0 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[0]), vsourceh[0], 1);
+ o1 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[1]), vsourceh[1], 1);
+ o2 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[2]), vsourceh[2], 1);
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void
+LoadTransposedPosYMM(const void* source, __m256& o0, __m256& o1, __m256& o2, __m256& o3)
+{
+  const Vector* vsource = static_cast<const Vector*>(source);
+ o0 = _mm256_broadcast_ps(&vsource[0]);
+ o1 = _mm256_broadcast_ps(&vsource[1]);
+ o2 = _mm256_broadcast_ps(&vsource[2]);
+ o3 = _mm256_setr_ps(0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f);
+ TransposeYMM(o0, o1, o2, o3);
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 ApplyMatrixYMM(__m256 v, __m256 m0, __m256 m1,
+ __m256 m2, __m256 m3)
+{
+ __m256 output = _mm256_mul_ps(vector_broadcast<0>(v), m0);
+#ifdef USE_FMA
+ output = _mm256_fmadd_ps(vector_broadcast<1>(v), m1, output);
+ output = _mm256_fmadd_ps(vector_broadcast<2>(v), m2, output);
+ output = _mm256_fmadd_ps(vector_broadcast<3>(v), m3, output);
+#else
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<1>(v), m1));
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<2>(v), m2));
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<3>(v), m3));
+#endif
+ return output;
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
+TransformVertexNoTransposeYMM(__m256 vertex, __m256 pos0, __m256 pos1, __m256 pos2, //
+ __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
+{
+ __m256 mul0 = _mm256_mul_ps(vertex, pos0);
+ __m256 mul1 = _mm256_mul_ps(vertex, pos1);
+ __m256 mul2 = _mm256_mul_ps(vertex, pos2);
+ __m256 mul3 = _mm256_setr_ps(0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f);
+ __m256 output = _mm256_hadd_ps(_mm256_hadd_ps(mul0, mul1), _mm256_hadd_ps(mul2, mul3));
+ return ApplyMatrixYMM(output, proj0, proj1, proj2, proj3);
+}
+
+template <bool PositionHas3Elems>
+ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
+TransformVertexYMM(__m256 vertex, __m256 pos0, __m256 pos1, __m256 pos2, __m256 pos3, //
+ __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
+{
+ __m256 output = pos3; // vertex.w is always 1.0
+#ifdef USE_FMA
+ output = _mm256_fmadd_ps(vector_broadcast<0>(vertex), pos0, output);
+ output = _mm256_fmadd_ps(vector_broadcast<1>(vertex), pos1, output);
+ if constexpr (PositionHas3Elems)
+ output = _mm256_fmadd_ps(vector_broadcast<2>(vertex), pos2, output);
+#else
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<0>(vertex), pos0));
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<1>(vertex), pos1));
+ if constexpr (PositionHas3Elems)
+ output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<2>(vertex), pos2));
+#endif
+ return ApplyMatrixYMM(output, proj0, proj1, proj2, proj3);
+}
+
+template <bool PositionHas3Elems, bool PerVertexPosMtx>
+ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
+LoadTransform2Vertices(const u8* v0data, const u8* v1data, //
+ __m256 pos0, __m256 pos1, __m256 pos2, __m256 pos3, //
+ __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
+{
+ __m256 v01;
+ if constexpr (PerVertexPosMtx)
+ {
+ // Vertex data layout always starts with posmtx data if available, then position data
+ // Convenient for us, that means offsets are always fixed
+ u32 v0idx = v0data[0] & 0x3f;
+ u32 v1idx = v1data[0] & 0x3f;
+ v0data += sizeof(u32);
+ v1data += sizeof(u32);
+
+    const float* v0fdata = reinterpret_cast<const float*>(v0data);
+    const float* v1fdata = reinterpret_cast<const float*>(v1data);
+
+ LoadPosYMM(&xfmem.posMatrices[v0idx * 4], &xfmem.posMatrices[v1idx * 4], pos0, pos1, pos2);
+
+ if constexpr (PositionHas3Elems)
+ {
+ __m256 base = _mm256_set1_ps(1.0f);
+ v01 = _mm256_blend_ps(_mm256_loadu2_m128(v1fdata, v0fdata), base, 0x88);
+ }
+ else
+ {
+ __m256 base = _mm256_unpacklo_ps(_mm256_setzero_ps(), _mm256_set1_ps(1.0f));
+      __m256 v1 = _mm256_castpd_ps(_mm256_broadcast_sd(reinterpret_cast<const double*>(v1data)));
+      __m128 v0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(v0data));
+ v01 = _mm256_blend_ps(_mm256_castps128_ps256(v0), v1, 0x30);
+ v01 = _mm256_blend_ps(v01, base, 0xcc);
+ }
+
+ v01 = TransformVertexNoTransposeYMM(v01, pos0, pos1, pos2, proj0, proj1, proj2, proj3);
+ }
+ else
+ {
+    const float* v0fdata = reinterpret_cast<const float*>(v0data);
+    const float* v1fdata = reinterpret_cast<const float*>(v1data);
+ if constexpr (PositionHas3Elems)
+ {
+ v01 = _mm256_loadu2_m128(v1fdata, v0fdata);
+ }
+ else
+ {
+      __m256 v1 = _mm256_castpd_ps(_mm256_broadcast_sd(reinterpret_cast<const double*>(v1data)));
+      __m128 v0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(v0data));
+ v01 = _mm256_blend_ps(_mm256_castps128_ps256(v0), v1, 0x30);
+ }
+
+#ifdef __clang__
+ // Clang's optimizer is dumb, yay
+ // It sees TransformVertexYMM doing broadcasts and is like
+ // "let's broadcast *before* we combine v0 and v1! Then we can use vbroadcastss!"
+ // Prevent it from "optimizing" here
+ asm("" : "+x"(v01)::);
+#endif
+
+    v01 = TransformVertexYMM<PositionHas3Elems>(v01, pos0, pos1, pos2, pos3, //
+ proj0, proj1, proj2, proj3);
+ }
+
+ return v01;
+}
+
+#endif
+
+#ifndef USE_AVX
+// Note: Assumes 16-byte aligned source
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposed(const void* source, Vector& o0,
+ Vector& o1, Vector& o2, Vector& o3)
+{
+#if defined(USE_SSE)
+  const Vector* vsource = static_cast<const Vector*>(source);
+ o0 = vsource[0];
+ o1 = vsource[1];
+ o2 = vsource[2];
+ o3 = vsource[3];
+ _MM_TRANSPOSE4_PS(o0, o1, o2, o3);
+#elif defined(USE_NEON)
+  float32x4x4_t ld = vld4q_f32(static_cast<const float*>(source));
+ o0 = ld.val[0];
+ o1 = ld.val[1];
+ o2 = ld.val[2];
+ o3 = ld.val[3];
+#else
+  const Vector* vsource = static_cast<const Vector*>(source);
+ // clang-format off
+ o0.x = vsource[0].x; o0.y = vsource[1].x; o0.z = vsource[2].x; o0.w = vsource[3].x;
+ o1.x = vsource[0].y; o1.y = vsource[1].y; o1.z = vsource[2].y; o1.w = vsource[3].y;
+ o2.x = vsource[0].z; o2.y = vsource[1].z; o2.z = vsource[2].z; o2.w = vsource[3].z;
+ o3.x = vsource[0].w; o3.y = vsource[1].w; o3.z = vsource[2].w; o3.w = vsource[3].w;
+ // clang-format on
+#endif
+}
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposedPos(const void* source, Vector& o0,
+ Vector& o1, Vector& o2, Vector& o3)
+{
+  const Vector* vsource = static_cast<const Vector*>(source);
+#if defined(USE_SSE)
+ o0 = vsource[0];
+ o1 = vsource[1];
+ o2 = vsource[2];
+ o3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
+ _MM_TRANSPOSE4_PS(o0, o1, o2, o3);
+#elif defined(USE_NEON)
+  float32x4x2_t ld01 = vld2q_f32(static_cast<const float*>(source));
+ o0 = ld01.val[0];
+ o1 = ld01.val[1];
+ o2 = vsource[2];
+ o3 = vsetr_f32(0.0f, 0.0f, 0.0f, 1.0f);
+ vuzp12q_f32(o2, o3);
+ vuzp12q_f32(o0, o2);
+ vuzp12q_f32(o1, o3);
+#else
+ // clang-format off
+ o0.x = vsource[0].x; o0.y = vsource[1].x; o0.z = vsource[2].x; o0.w = 0.0f;
+ o1.x = vsource[0].y; o1.y = vsource[1].y; o1.z = vsource[2].y; o1.w = 0.0f;
+ o2.x = vsource[0].z; o2.y = vsource[1].z; o2.z = vsource[2].z; o2.w = 0.0f;
+ o3.x = vsource[0].w; o3.y = vsource[1].w; o3.z = vsource[2].w; o3.w = 1.0f;
+ // clang-format on
+#endif
+}
+#endif
+
+#ifndef USE_NEON
+ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadPos(const void* source, //
+ Vector& o0, Vector& o1, Vector& o2)
+{
+  const Vector* vsource = static_cast<const Vector*>(source);
+ o0 = vsource[0];
+ o1 = vsource[1];
+ o2 = vsource[2];
+}
+#endif
+
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector ApplyMatrix(Vector v, Vector m0, Vector m1,
+ Vector m2, Vector m3)
+{
+#if defined(USE_SSE)
+ Vector output = _mm_mul_ps(vector_broadcast<0>(v), m0);
+#ifdef USE_FMA
+ output = _mm_fmadd_ps(vector_broadcast<1>(v), m1, output);
+ output = _mm_fmadd_ps(vector_broadcast<2>(v), m2, output);
+ output = _mm_fmadd_ps(vector_broadcast<3>(v), m3, output);
+#else
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<1>(v), m1));
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<2>(v), m2));
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<3>(v), m3));
+#endif
+ return output;
+#elif defined(USE_NEON)
+ Vector output = vmulq_laneq_f32(m0, v, 0);
+ output = vfmaq_laneq_f32(output, m1, v, 1);
+ output = vfmaq_laneq_f32(output, m2, v, 2);
+ output = vfmaq_laneq_f32(output, m3, v, 3);
+ return output;
+#else
+ Vector output;
+ output.x = v.x * m0.x + v.y * m1.x + v.z * m2.x + v.w * m3.x;
+ output.y = v.x * m0.y + v.y * m1.y + v.z * m2.y + v.w * m3.y;
+ output.z = v.x * m0.z + v.y * m1.z + v.z * m2.z + v.w * m3.z;
+ output.w = v.x * m0.w + v.y * m1.w + v.z * m2.w + v.w * m3.w;
+ return output;
+#endif
+}
+
+#ifndef USE_NEON
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
+TransformVertexNoTranspose(Vector vertex, Vector pos0, Vector pos1, Vector pos2, //
+ Vector proj0, Vector proj1, Vector proj2, Vector proj3)
+{
+#ifdef USE_SSE
+ Vector mul0 = _mm_mul_ps(vertex, pos0);
+ Vector mul1 = _mm_mul_ps(vertex, pos1);
+ Vector mul2 = _mm_mul_ps(vertex, pos2);
+ Vector mul3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
+#ifdef USE_SSE3
+ Vector output = _mm_hadd_ps(_mm_hadd_ps(mul0, mul1), _mm_hadd_ps(mul2, mul3));
+#else
+ Vector t0 = _mm_add_ps(_mm_unpacklo_ps(mul0, mul2), _mm_unpackhi_ps(mul0, mul2));
+ Vector t1 = _mm_add_ps(_mm_unpacklo_ps(mul1, mul3), _mm_unpackhi_ps(mul1, mul3));
+ Vector output = _mm_add_ps(_mm_unpacklo_ps(t0, t1), _mm_unpackhi_ps(t0, t1));
+#endif
+#else
+ Vector output;
+ output.x = vertex.x * pos0.x + vertex.y * pos0.y + vertex.z * pos0.z + vertex.w * pos0.w;
+ output.y = vertex.x * pos1.x + vertex.y * pos1.y + vertex.z * pos1.z + vertex.w * pos1.w;
+ output.z = vertex.x * pos2.x + vertex.y * pos2.y + vertex.z * pos2.z + vertex.w * pos2.w;
+ output.w = 1.0f;
+#endif
+ output = ApplyMatrix(output, proj0, proj1, proj2, proj3);
+ return output;
+}
+#endif
+
+template <bool PositionHas3Elems>
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
+TransformVertex(Vector vertex, Vector pos0, Vector pos1, Vector pos2, Vector pos3, //
+ Vector proj0, Vector proj1, Vector proj2, Vector proj3)
+{
+ Vector output = pos3; // vertex.w is always 1.0
+#if defined(USE_FMA)
+ output = _mm_fmadd_ps(vector_broadcast<0>(vertex), pos0, output);
+ output = _mm_fmadd_ps(vector_broadcast<1>(vertex), pos1, output);
+ if constexpr (PositionHas3Elems)
+ output = _mm_fmadd_ps(vector_broadcast<2>(vertex), pos2, output);
+#elif defined(USE_SSE)
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<0>(vertex), pos0));
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<1>(vertex), pos1));
+ if constexpr (PositionHas3Elems)
+ output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<2>(vertex), pos2));
+#elif defined(USE_NEON)
+ output = vfmaq_laneq_f32(output, pos0, vertex, 0);
+ output = vfmaq_laneq_f32(output, pos1, vertex, 1);
+ if constexpr (PositionHas3Elems)
+ output = vfmaq_laneq_f32(output, pos2, vertex, 2);
+#else
+ output.x += vertex.x * pos0.x + vertex.y * pos1.x;
+ output.y += vertex.x * pos0.y + vertex.y * pos1.y;
+ output.z += vertex.x * pos0.z + vertex.y * pos1.z;
+ if constexpr (PositionHas3Elems)
+ {
+ output.x += vertex.z * pos2.x;
+ output.y += vertex.z * pos2.y;
+ output.z += vertex.z * pos2.z;
+ }
+#endif
+ output = ApplyMatrix(output, proj0, proj1, proj2, proj3);
+ return output;
+}
+
+template <bool PositionHas3Elems, bool PerVertexPosMtx>
+ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
+LoadTransformVertex(const u8* data, Vector pos0, Vector pos1, Vector pos2, Vector pos3,
+ Vector proj0, Vector proj1, Vector proj2, Vector proj3)
+{
+ Vector vertex;
+ if constexpr (PerVertexPosMtx)
+ {
+ // Vertex data layout always starts with posmtx data if available, then position data
+ // Convenient for us, that means offsets are always fixed
+ u32 idx = data[0] & 0x3f;
+ data += sizeof(u32);
+
+    const float* fdata = reinterpret_cast<const float*>(data);
+
+#ifdef USE_NEON
+ LoadTransposedPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
+
+ if constexpr (PositionHas3Elems)
+ {
+ vertex = vld1q_f32(fdata);
+ }
+ else
+ {
+ vertex = vcombine_f32(vld1_f32(fdata), vdup_n_f32(0.0f));
+ }
+
+    vertex = TransformVertex<PositionHas3Elems>(vertex, pos0, pos1, pos2, pos3, //
+ proj0, proj1, proj2, proj3);
+#else
+ LoadPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2);
+
+ if constexpr (PositionHas3Elems)
+ {
+#if defined(USE_SSE)
+#ifdef USE_SSE41
+ Vector base = _mm_set1_ps(1.0f);
+ vertex = _mm_blend_ps(_mm_loadu_ps(fdata), base, 8);
+#else
+ Vector base = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
+ Vector mask = _mm_castsi128_ps(_mm_setr_epi32(-1, -1, -1, 0));
+ vertex = _mm_or_ps(_mm_and_ps(_mm_loadu_ps(fdata), mask), base);
+#endif
+#else
+ vertex.x = fdata[0];
+ vertex.y = fdata[1];
+ vertex.z = fdata[2];
+ vertex.w = 1.0f;
+#endif
+ }
+ else
+ {
+#if defined(USE_SSE)
+ Vector base = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
+    vertex = _mm_loadl_pi(base, reinterpret_cast<const __m64*>(fdata));
+#else
+ vertex.x = fdata[0];
+ vertex.y = fdata[1];
+ vertex.z = 0.0f;
+ vertex.w = 1.0f;
+#endif
+ }
+
+ vertex = TransformVertexNoTranspose(vertex, pos0, pos1, pos2, proj0, proj1, proj2, proj3);
+#endif
+ }
+ else
+ {
+    const float* fdata = reinterpret_cast<const float*>(data);
+ if constexpr (PositionHas3Elems)
+ {
+#if defined(USE_SSE)
+ vertex = _mm_loadu_ps(fdata);
+#elif defined(USE_NEON)
+ vertex = vld1q_f32(fdata);
+#else
+ vertex.x = fdata[0];
+ vertex.y = fdata[1];
+ vertex.z = fdata[2];
+ vertex.w = 1.0f;
+#endif
+ }
+ else
+ {
+#if defined(USE_SSE)
+      vertex = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(fdata));
+#elif defined(USE_NEON)
+ vertex = vcombine_f32(vld1_f32(fdata), vdup_n_f32(0.0f));
+#else
+ vertex.x = fdata[0];
+ vertex.y = fdata[1];
+ vertex.z = 0.0f;
+ vertex.w = 1.0f;
+#endif
+ }
+
+    vertex = TransformVertex<PositionHas3Elems>(vertex, pos0, pos1, pos2, pos3, //
+ proj0, proj1, proj2, proj3);
+ }
+
+ return vertex;
+}
+
+template <bool PositionHas3Elems, bool PerVertexPosMtx>
+ATTR_TARGET static void TransformVertices(void* output, const void* vertices, u32 stride, int count)
+{
+ const VertexShaderManager& vsmanager = Core::System::GetInstance().GetVertexShaderManager();
+  const u8* cvertices = static_cast<const u8*>(vertices);
+  Vector* voutput = static_cast<Vector*>(output);
+ u32 idx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 0x3f;
+#ifdef USE_AVX
+ __m256 proj0, proj1, proj2, proj3;
+ __m256 pos0, pos1, pos2, pos3;
+ LoadTransposedYMM(vsmanager.constants.projection.data(), proj0, proj1, proj2, proj3);
+ LoadTransposedPosYMM(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
+ for (int i = 1; i < count; i += 2)
+ {
+ const u8* v0data = cvertices;
+ const u8* v1data = cvertices + stride;
+ __m256 v01 = LoadTransform2Vertices<PositionHas3Elems, PerVertexPosMtx>(
+ v0data, v1data, pos0, pos1, pos2, pos3, proj0, proj1, proj2, proj3);
+ _mm256_store_ps(reinterpret_cast<float*>(voutput), v01);
+ cvertices += stride * 2;
+ voutput += 2;
+ }
+ if (count & 1)
+ {
+ *voutput = LoadTransformVertex<PositionHas3Elems, PerVertexPosMtx>(
+ cvertices, //
+ _mm256_castps256_ps128(pos0), _mm256_castps256_ps128(pos1), //
+ _mm256_castps256_ps128(pos2), _mm256_castps256_ps128(pos3), //
+ _mm256_castps256_ps128(proj0), _mm256_castps256_ps128(proj1), //
+ _mm256_castps256_ps128(proj2), _mm256_castps256_ps128(proj3));
+ }
+#else
+ Vector proj0, proj1, proj2, proj3;
+ Vector pos0, pos1, pos2, pos3;
+ LoadTransposed(vsmanager.constants.projection.data(), proj0, proj1, proj2, proj3);
+ LoadTransposedPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
+ for (int i = 0; i < count; i++)
+ {
+ *voutput = LoadTransformVertex<PositionHas3Elems, PerVertexPosMtx>(
+ cvertices, pos0, pos1, pos2, pos3, proj0, proj1, proj2, proj3);
+ cvertices += stride;
+ voutput += 1;
+ }
+#endif
+}
+
+template <CullMode Mode>
+ATTR_TARGET DOLPHIN_FORCE_INLINE static bool CullTriangle(const CPUCull::TransformedVertex& a,
+ const CPUCull::TransformedVertex& b,
+ const CPUCull::TransformedVertex& c)
+{
+ if (Mode == CullMode::All)
+ return true;
+
+ Vector va = reinterpret_cast<const Vector&>(a);
+ Vector vb = reinterpret_cast<const Vector&>(b);
+ Vector vc = reinterpret_cast<const Vector&>(c);
+
+ // See videosoftware Clipper.cpp
+
+#if defined(USE_SSE)
+ Vector wxzya = _mm_shuffle_ps(va, va, _MM_SHUFFLE(1, 2, 0, 3));
+ Vector wxzyc = _mm_shuffle_ps(vc, vc, _MM_SHUFFLE(1, 2, 0, 3));
+ Vector ywzxb = _mm_shuffle_ps(vb, vb, _MM_SHUFFLE(0, 2, 3, 1));
+ Vector part0 = _mm_mul_ps(va, wxzyc);
+ Vector part1 = _mm_mul_ps(vc, wxzya);
+ Vector part2 = _mm_mul_ps(_mm_sub_ps(part0, part1), ywzxb);
+#ifdef USE_SSE3
+ Vector part3 = _mm_movehdup_ps(part2);
+#else
+ Vector part3 = vector_broadcast<1>(part2);
+#endif
+ Vector part4 = vector_broadcast<3>(part2);
+ Vector part5 = _mm_add_ss(_mm_add_ss(part2, part3), part4);
+ float normal_z_dir;
+ _mm_store_ss(&normal_z_dir, part5);
+#elif defined(USE_NEON)
+ Vector zero = vdupq_n_f32(0.0f);
+ Vector wx0ya = vextq_f32(va, vzip1q_f32(va, zero), 3);
+ Vector wx0yc = vextq_f32(vc, vzip1q_f32(vc, zero), 3);
+ Vector ywxxb = vuzp2q_f32(vb, vdupq_laneq_f32(vb, 0));
+ Vector part0 = vmulq_f32(va, wx0yc);
+ Vector part1 = vmulq_f32(vc, wx0ya);
+ Vector part2 = vmulq_f32(vsubq_f32(part0, part1), ywxxb);
+ float normal_z_dir = vaddvq_f32(part2);
+#else
+ float normal_z_dir = (c.w * a.x - a.w * c.x) * b.y + //
+ (c.x * a.y - a.x * c.y) * b.w + //
+ (c.y * a.w - a.y * c.w) * b.x;
+#endif
+ bool cull = false;
+ switch (Mode)
+ {
+ case CullMode::None:
+ cull = normal_z_dir == 0;
+ break;
+ case CullMode::Front:
+ cull = normal_z_dir <= 0;
+ break;
+ case CullMode::Back:
+ cull = normal_z_dir >= 0;
+ break;
+ case CullMode::All:
+ cull = true;
+ break;
+ }
+ if (cull)
+ return true;
+
+#if defined(USE_SSE)
+ Vector xyab = _mm_unpacklo_ps(va, vb);
+ Vector zwab = _mm_unpackhi_ps(va, vb);
+ Vector allx = _mm_shuffle_ps(xyab, vc, _MM_SHUFFLE(0, 0, 1, 0));
+ Vector ally = _mm_shuffle_ps(xyab, vc, _MM_SHUFFLE(1, 1, 3, 2));
+ Vector allpw = _mm_shuffle_ps(zwab, vc, _MM_SHUFFLE(3, 3, 3, 2));
+ Vector allnw = _mm_xor_ps(allpw, _mm_set1_ps(-0.0f));
+ __m128i x_gt_pw = _mm_castps_si128(_mm_cmple_ps(allpw, allx));
+ __m128i y_gt_pw = _mm_castps_si128(_mm_cmple_ps(allpw, ally));
+ __m128i x_lt_nw = _mm_castps_si128(_mm_cmplt_ps(allx, allnw));
+ __m128i y_lt_nw = _mm_castps_si128(_mm_cmplt_ps(ally, allnw));
+ __m128i any_out_of_bounds = _mm_packs_epi16(_mm_packs_epi32(x_lt_nw, y_lt_nw), //
+ _mm_packs_epi32(x_gt_pw, y_gt_pw));
+ cull |= 0 != _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_set1_epi32(~0), any_out_of_bounds));
+#elif defined(USE_NEON)
+ float64x2_t xyab = vreinterpretq_f64_f32(vzip1q_f32(va, vb));
+ float64x2_t xycc = vreinterpretq_f64_f32(vzip1q_f32(vc, vc));
+ float32x4_t allx = vreinterpretq_f32_f64(vzip1q_f64(xyab, xycc));
+ float32x4_t ally = vreinterpretq_f32_f64(vzip2q_f64(xyab, xycc));
+ float32x4_t allpw = vextq_f32(vzip2q_f32(va, vb), vdupq_laneq_f32(vc, 3), 2);
+ float32x4_t allnw = vnegq_f32(allpw);
+ uint16x8_t x_gt_pw = vreinterpretq_u16_u32(vcgtq_f32(allx, allpw));
+ uint16x8_t y_gt_pw = vreinterpretq_u16_u32(vcgtq_f32(ally, allpw));
+ uint16x8_t x_lt_nw = vreinterpretq_u16_u32(vcltq_f32(allx, allnw));
+ uint16x8_t y_lt_nw = vreinterpretq_u16_u32(vcltq_f32(ally, allnw));
+ uint8x16_t lt_nw = vreinterpretq_u8_u16(vuzp1q_u16(x_lt_nw, y_lt_nw));
+ uint8x16_t gt_pw = vreinterpretq_u8_u16(vuzp1q_u16(x_gt_pw, y_gt_pw));
+ uint32x4_t any_out_of_bounds = vreinterpretq_u32_u8(vuzp1q_u8(lt_nw, gt_pw));
+ cull |= 0xFFFFFFFF == vmaxvq_u32(any_out_of_bounds);
+#else
+ cull |= a.x < -a.w && b.x < -b.w && c.x < -c.w;
+ cull |= a.y < -a.w && b.y < -b.w && c.y < -c.w;
+ cull |= a.x > a.w && b.x > b.w && c.x > c.w;
+ cull |= a.y > a.w && b.y > b.w && c.y > c.w;
+#endif
+
+ return cull;
+}
+
+template <OpcodeDecoder::Primitive Primitive, CullMode Mode>
+ATTR_TARGET static bool AreAllVerticesCulled(const CPUCull::TransformedVertex* transformed,
+ int count)
+{
+ switch (Primitive)
+ {
+ case OpcodeDecoder::Primitive::GX_DRAW_QUADS:
+ case OpcodeDecoder::Primitive::GX_DRAW_QUADS_2:
+ {
+ int i = 3;
+ for (; i < count; i += 4)
+ {
+ if (!CullTriangle<Mode>(transformed[i - 3], transformed[i - 2], transformed[i - 1]))
+ return false;
+ if (!CullTriangle<Mode>(transformed[i - 3], transformed[i - 1], transformed[i - 0]))
+ return false;
+ }
+ // three vertices remaining, so render a triangle
+ if (i == count)
+ {
+ if (!CullTriangle<Mode>(transformed[i - 3], transformed[i - 2], transformed[i - 1]))
+ return false;
+ }
+ break;
+ }
+ case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLES:
+ for (int i = 2; i < count; i += 3)
+ {
+ if (!CullTriangle<Mode>(transformed[i - 2], transformed[i - 1], transformed[i - 0]))
+ return false;
+ }
+ break;
+ case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_STRIP:
+ {
+ bool wind = false;
+ for (int i = 2; i < count; ++i)
+ {
+ if (!CullTriangle<Mode>(transformed[i - 2], transformed[i - !wind], transformed[i - wind]))
+ return false;
+ wind = !wind;
+ }
+ break;
+ }
+ case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN:
+ for (int i = 2; i < count; ++i)
+ {
+ if (!CullTriangle<Mode>(transformed[0], transformed[i - 1], transformed[i]))
+ return false;
+ }
+ break;
+ }
+
+ return true;
+}
+
+} // namespace VECTOR_NAMESPACE
+
+#undef ATTR_TARGET
+#undef VECTOR_NAMESPACE
diff --git a/Source/Core/VideoCommon/ConstantManager.h b/Source/Core/VideoCommon/ConstantManager.h
index 6e60929056..88c25a9823 100644
--- a/Source/Core/VideoCommon/ConstantManager.h
+++ b/Source/Core/VideoCommon/ConstantManager.h
@@ -17,7 +17,7 @@ enum class SrcBlendFactor : u32;
enum class ZTexOp : u32;
enum class LogicOp : u32;
-struct PixelShaderConstants
+struct alignas(16) PixelShaderConstants
{
std::array<int4, 4> colors;
std::array<int4, 4> kcolors;
@@ -60,7 +60,7 @@ struct PixelShaderConstants
LogicOp logic_op_mode;
};
-struct VertexShaderConstants
+struct alignas(16) VertexShaderConstants
{
u32 components; // .x
u32 xfmem_dualTexInfo; // .y
@@ -109,7 +109,7 @@ enum class VSExpand : u32
Line,
};
-struct GeometryShaderConstants
+struct alignas(16) GeometryShaderConstants
{
float4 stereoparams;
float4 lineptparams;
diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp
index 4d0597ba88..9bee673db4 100644
--- a/Source/Core/VideoCommon/VertexLoaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp
@@ -30,6 +30,7 @@
#include "VideoCommon/VertexLoaderBase.h"
#include "VideoCommon/VertexManagerBase.h"
#include "VideoCommon/VertexShaderManager.h"
+#include "VideoCommon/VideoConfig.h"
#include "VideoCommon/XFMemory.h"
namespace VertexLoaderManager
@@ -366,17 +367,33 @@ int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int coun
vertex_shader_manager.SetVertexFormat(loader->m_native_components,
loader->m_native_vertex_format->GetVertexDeclaration());
- // if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads.
- // They still need to go through vertex loading, because we need to calculate a zfreeze refrence
- // slope.
- bool cullall = (bpmem.genMode.cullmode == CullMode::All &&
- primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES);
+ // CPUCull's performance increase comes from encoding fewer GPU commands, not sending less data
+ // Therefore it's only useful to check if culling could remove a flush
+ const bool can_cpu_cull = g_ActiveConfig.bCPUCull &&
+ primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES &&
+ !g_vertex_manager->HasSendableVertices();
- DataReader dst = g_vertex_manager->PrepareForAdditionalData(
- primitive, count, loader->m_native_vtx_decl.stride, cullall);
+ // if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads.
+ // They still need to go through vertex loading, because we need to calculate a zfreeze
+ // reference slope.
+ const bool cullall = (bpmem.genMode.cullmode == CullMode::All &&
+ primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES);
+
+ const int stride = loader->m_native_vtx_decl.stride;
+ DataReader dst = g_vertex_manager->PrepareForAdditionalData(primitive, count, stride,
+ cullall || can_cpu_cull);
count = loader->RunVertices(src, dst.GetPointer(), count);
+ if (can_cpu_cull && !cullall)
+ {
+ if (!g_vertex_manager->AreAllVerticesCulled(loader, primitive, dst.GetPointer(), count))
+ {
+ DataReader new_dst = g_vertex_manager->DisableCullAll(stride);
+ memmove(new_dst.GetPointer(), dst.GetPointer(), count * stride);
+ }
+ }
+
g_vertex_manager->AddIndices(primitive, count);
g_vertex_manager->FlushData(count, loader->m_native_vtx_decl.stride);
diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp
index 85e2b48556..1ccfebad0c 100644
--- a/Source/Core/VideoCommon/VertexManagerBase.cpp
+++ b/Source/Core/VideoCommon/VertexManagerBase.cpp
@@ -104,6 +104,7 @@ VertexManagerBase::~VertexManagerBase() = default;
bool VertexManagerBase::Initialize()
{
m_index_generator.Init();
+ m_cpu_cull.Init();
return true;
}
@@ -117,6 +118,13 @@ void VertexManagerBase::AddIndices(OpcodeDecoder::Primitive primitive, u32 num_v
m_index_generator.AddIndices(primitive, num_vertices);
}
+bool VertexManagerBase::AreAllVerticesCulled(VertexLoaderBase* loader,
+ OpcodeDecoder::Primitive primitive, const u8* src,
+ u32 count)
+{
+ return m_cpu_cull.AreAllVerticesCulled(loader, primitive, src, count);
+}
+
DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive primitive,
u32 count, u32 stride, bool cullall)
{
@@ -187,6 +195,16 @@ DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive
return DataReader(m_cur_buffer_pointer, m_end_buffer_pointer);
}
+DataReader VertexManagerBase::DisableCullAll(u32 stride)
+{
+ if (m_cull_all)
+ {
+ m_cull_all = false;
+ ResetBuffer(stride);
+ }
+ return DataReader(m_cur_buffer_pointer, m_end_buffer_pointer);
+}
+
void VertexManagerBase::FlushData(u32 count, u32 stride)
{
m_cur_buffer_pointer += count * stride;
@@ -548,6 +566,8 @@ void VertexManagerBase::Flush()
// Now the vertices can be flushed to the GPU. Everything following the CommitBuffer() call
// must be careful to not upload any utility vertices, as the binding will be lost otherwise.
const u32 num_indices = m_index_generator.GetIndexLen();
+ if (num_indices == 0)
+ return;
u32 base_vertex, base_index;
CommitBuffer(m_index_generator.GetNumVerts(),
VertexLoaderManager::GetCurrentVertexFormat()->GetVertexStride(), num_indices,
diff --git a/Source/Core/VideoCommon/VertexManagerBase.h b/Source/Core/VideoCommon/VertexManagerBase.h
index ba3777a7fe..3b8180c5d2 100644
--- a/Source/Core/VideoCommon/VertexManagerBase.h
+++ b/Source/Core/VideoCommon/VertexManagerBase.h
@@ -9,6 +9,7 @@
#include "Common/BitSet.h"
#include "Common/CommonTypes.h"
#include "Common/MathUtil.h"
+#include "VideoCommon/CPUCull.h"
#include "VideoCommon/IndexGenerator.h"
#include "VideoCommon/RenderState.h"
#include "VideoCommon/ShaderCache.h"
@@ -100,11 +101,18 @@ public:
PrimitiveType GetCurrentPrimitiveType() const { return m_current_primitive_type; }
void AddIndices(OpcodeDecoder::Primitive primitive, u32 num_vertices);
+ bool AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive,
+ const u8* src, u32 count);
virtual DataReader PrepareForAdditionalData(OpcodeDecoder::Primitive primitive, u32 count,
u32 stride, bool cullall);
+ /// Switch cullall off after a call to PrepareForAdditionalData with cullall true
+ /// Expects that you will add a nonzero number of primitives before the next flush
+ /// Returns whether cullall was changed (false if cullall was already off)
+ DataReader DisableCullAll(u32 stride);
void FlushData(u32 count, u32 stride);
void Flush();
+ bool HasSendableVertices() const { return !m_is_flushed && !m_cull_all; }
void DoState(PointerWrap& p);
@@ -201,6 +209,7 @@ protected:
bool m_cull_all = false;
IndexGenerator m_index_generator;
+ CPUCull m_cpu_cull;
private:
// Minimum number of draws per command buffer when attempting to preempt a readback operation.
diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp
index abc59b862c..deda07fd65 100644
--- a/Source/Core/VideoCommon/VertexShaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexShaderManager.cpp
@@ -65,6 +65,97 @@ void VertexShaderManager::Dirty()
dirty = true;
}
+Common::Matrix44 VertexShaderManager::LoadProjectionMatrix()
+{
+ const auto& rawProjection = xfmem.projection.rawProjection;
+
+ switch (xfmem.projection.type)
+ {
+ case ProjectionType::Perspective:
+ {
+ const Common::Vec2 fov_multiplier = g_freelook_camera.IsActive() ?
+ g_freelook_camera.GetFieldOfViewMultiplier() :
+ Common::Vec2{1, 1};
+ m_projection_matrix[0] = rawProjection[0] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
+ m_projection_matrix[1] = 0.0f;
+ m_projection_matrix[2] = rawProjection[1] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
+ m_projection_matrix[3] = 0.0f;
+
+ m_projection_matrix[4] = 0.0f;
+ m_projection_matrix[5] = rawProjection[2] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
+ m_projection_matrix[6] = rawProjection[3] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
+ m_projection_matrix[7] = 0.0f;
+
+ m_projection_matrix[8] = 0.0f;
+ m_projection_matrix[9] = 0.0f;
+ m_projection_matrix[10] = rawProjection[4];
+ m_projection_matrix[11] = rawProjection[5];
+
+ m_projection_matrix[12] = 0.0f;
+ m_projection_matrix[13] = 0.0f;
+
+ m_projection_matrix[14] = -1.0f;
+ m_projection_matrix[15] = 0.0f;
+
+ g_stats.gproj = m_projection_matrix;
+ }
+ break;
+
+ case ProjectionType::Orthographic:
+ {
+ m_projection_matrix[0] = rawProjection[0];
+ m_projection_matrix[1] = 0.0f;
+ m_projection_matrix[2] = 0.0f;
+ m_projection_matrix[3] = rawProjection[1];
+
+ m_projection_matrix[4] = 0.0f;
+ m_projection_matrix[5] = rawProjection[2];
+ m_projection_matrix[6] = 0.0f;
+ m_projection_matrix[7] = rawProjection[3];
+
+ m_projection_matrix[8] = 0.0f;
+ m_projection_matrix[9] = 0.0f;
+ m_projection_matrix[10] = rawProjection[4];
+ m_projection_matrix[11] = rawProjection[5];
+
+ m_projection_matrix[12] = 0.0f;
+ m_projection_matrix[13] = 0.0f;
+
+ m_projection_matrix[14] = 0.0f;
+ m_projection_matrix[15] = 1.0f;
+
+ g_stats.g2proj = m_projection_matrix;
+ g_stats.proj = rawProjection;
+ }
+ break;
+
+ default:
+ ERROR_LOG_FMT(VIDEO, "Unknown projection type: {}", xfmem.projection.type);
+ }
+
+ PRIM_LOG("Projection: {} {} {} {} {} {}", rawProjection[0], rawProjection[1], rawProjection[2],
+ rawProjection[3], rawProjection[4], rawProjection[5]);
+
+ auto corrected_matrix = m_viewport_correction * Common::Matrix44::FromArray(m_projection_matrix);
+
+ if (g_freelook_camera.IsActive() && xfmem.projection.type == ProjectionType::Perspective)
+ corrected_matrix *= g_freelook_camera.GetView();
+
+ g_freelook_camera.GetController()->SetClean();
+
+ return corrected_matrix;
+}
+
+void VertexShaderManager::SetProjectionMatrix()
+{
+ if (m_projection_changed || g_freelook_camera.GetController()->IsDirty())
+ {
+ m_projection_changed = false;
+ auto corrected_matrix = LoadProjectionMatrix();
+ memcpy(constants.projection.data(), corrected_matrix.data.data(), 4 * sizeof(float4));
+ }
+}
+
// Syncs the shader constant buffers with xfmem
// TODO: A cleaner way to control the matrices without making a mess in the parameters field
void VertexShaderManager::SetConstants(const std::vector<std::string>& textures)
@@ -317,84 +408,7 @@ void VertexShaderManager::SetConstants(const std::vector<std::string>& textures)
m_projection_changed = false;
m_projection_graphics_mod_change = !projection_actions.empty();
- const auto& rawProjection = xfmem.projection.rawProjection;
-
- switch (xfmem.projection.type)
- {
- case ProjectionType::Perspective:
- {
- const Common::Vec2 fov_multiplier = g_freelook_camera.IsActive() ?
- g_freelook_camera.GetFieldOfViewMultiplier() :
- Common::Vec2{1, 1};
- m_projection_matrix[0] =
- rawProjection[0] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
- m_projection_matrix[1] = 0.0f;
- m_projection_matrix[2] =
- rawProjection[1] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
- m_projection_matrix[3] = 0.0f;
-
- m_projection_matrix[4] = 0.0f;
- m_projection_matrix[5] =
- rawProjection[2] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
- m_projection_matrix[6] =
- rawProjection[3] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
- m_projection_matrix[7] = 0.0f;
-
- m_projection_matrix[8] = 0.0f;
- m_projection_matrix[9] = 0.0f;
- m_projection_matrix[10] = rawProjection[4];
- m_projection_matrix[11] = rawProjection[5];
-
- m_projection_matrix[12] = 0.0f;
- m_projection_matrix[13] = 0.0f;
-
- m_projection_matrix[14] = -1.0f;
- m_projection_matrix[15] = 0.0f;
-
- g_stats.gproj = m_projection_matrix;
- }
- break;
-
- case ProjectionType::Orthographic:
- {
- m_projection_matrix[0] = rawProjection[0];
- m_projection_matrix[1] = 0.0f;
- m_projection_matrix[2] = 0.0f;
- m_projection_matrix[3] = rawProjection[1];
-
- m_projection_matrix[4] = 0.0f;
- m_projection_matrix[5] = rawProjection[2];
- m_projection_matrix[6] = 0.0f;
- m_projection_matrix[7] = rawProjection[3];
-
- m_projection_matrix[8] = 0.0f;
- m_projection_matrix[9] = 0.0f;
- m_projection_matrix[10] = rawProjection[4];
- m_projection_matrix[11] = rawProjection[5];
-
- m_projection_matrix[12] = 0.0f;
- m_projection_matrix[13] = 0.0f;
-
- m_projection_matrix[14] = 0.0f;
- m_projection_matrix[15] = 1.0f;
-
- g_stats.g2proj = m_projection_matrix;
- g_stats.proj = rawProjection;
- }
- break;
-
- default:
- ERROR_LOG_FMT(VIDEO, "Unknown projection type: {}", xfmem.projection.type);
- }
-
- PRIM_LOG("Projection: {} {} {} {} {} {}", rawProjection[0], rawProjection[1], rawProjection[2],
- rawProjection[3], rawProjection[4], rawProjection[5]);
-
- auto corrected_matrix =
- m_viewport_correction * Common::Matrix44::FromArray(m_projection_matrix);
-
- if (g_freelook_camera.IsActive() && xfmem.projection.type == ProjectionType::Perspective)
- corrected_matrix *= g_freelook_camera.GetView();
+ auto corrected_matrix = LoadProjectionMatrix();
GraphicsModActionData::Projection projection{&corrected_matrix};
for (auto action : projection_actions)
@@ -404,8 +418,6 @@ void VertexShaderManager::SetConstants(const std::vector<std::string>& textures)
memcpy(constants.projection.data(), corrected_matrix.data.data(), 4 * sizeof(float4));
- g_freelook_camera.GetController()->SetClean();
-
dirty = true;
}
diff --git a/Source/Core/VideoCommon/VertexShaderManager.h b/Source/Core/VideoCommon/VertexShaderManager.h
index ce157bd596..9a150980da 100644
--- a/Source/Core/VideoCommon/VertexShaderManager.h
+++ b/Source/Core/VideoCommon/VertexShaderManager.h
@@ -24,6 +24,7 @@ public:
void DoState(PointerWrap& p);
// constant management
+ void SetProjectionMatrix();
void SetConstants(const std::vector<std::string>& textures);
void InvalidateXFRange(int start, int end);
@@ -64,4 +65,6 @@ private:
std::array<int, 2> m_minmax_lights_changed{};
Common::Matrix44 m_viewport_correction{};
+
+ Common::Matrix44 LoadProjectionMatrix();
};
diff --git a/Source/Core/VideoCommon/VideoConfig.cpp b/Source/Core/VideoCommon/VideoConfig.cpp
index 5314e352ad..eb00555487 100644
--- a/Source/Core/VideoCommon/VideoConfig.cpp
+++ b/Source/Core/VideoCommon/VideoConfig.cpp
@@ -113,6 +113,7 @@ void VideoConfig::Refresh()
iShaderCompilationMode = Config::Get(Config::GFX_SHADER_COMPILATION_MODE);
iShaderCompilerThreads = Config::Get(Config::GFX_SHADER_COMPILER_THREADS);
iShaderPrecompilerThreads = Config::Get(Config::GFX_SHADER_PRECOMPILER_THREADS);
+ bCPUCull = Config::Get(Config::GFX_CPU_CULL);
texture_filtering_mode = Config::Get(Config::GFX_ENHANCE_FORCE_TEXTURE_FILTERING);
iMaxAnisotropy = Config::Get(Config::GFX_ENHANCE_MAX_ANISOTROPY);
diff --git a/Source/Core/VideoCommon/VideoConfig.h b/Source/Core/VideoCommon/VideoConfig.h
index 8140eb27eb..036b5955ba 100644
--- a/Source/Core/VideoCommon/VideoConfig.h
+++ b/Source/Core/VideoCommon/VideoConfig.h
@@ -138,6 +138,7 @@ struct VideoConfig final
bool bPerfQueriesEnable = false;
bool bBBoxEnable = false;
bool bForceProgressive = false;
+ bool bCPUCull = false;
bool bEFBEmulateFormatChanges = false;
bool bSkipEFBCopyToRam = false;
diff --git a/Source/Core/VideoCommon/XFMemory.h b/Source/Core/VideoCommon/XFMemory.h
index 71d38cb8bd..e0a2696317 100644
--- a/Source/Core/VideoCommon/XFMemory.h
+++ b/Source/Core/VideoCommon/XFMemory.h
@@ -423,7 +423,7 @@ struct Projection
ProjectionType type;
};
-struct XFMemory
+struct alignas(16) XFMemory
{
float posMatrices[256]; // 0x0000 - 0x00ff
u32 unk0[768]; // 0x0100 - 0x03ff