Merge pull request #11208 from TellowKrinkle/CPUCull

Cull vertices on the CPU
Pierre Bourdon 2023-01-26 23:15:23 +01:00 committed by GitHub
commit 9c9310bf44
21 changed files with 1102 additions and 92 deletions
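
For reference, the new option is stored in the GFX config under section "Settings", key "CPUCull" (see the Config and VideoConfig changes below). A minimal hand edit to enable it, assuming Dolphin's usual mapping of System::GFX to GFX.ini (editor's sketch, not part of the commit):

[Settings]
CPUCull = True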

View File

@ -214,6 +214,7 @@ public enum BooleanSetting implements AbstractBooleanSetting
"SaveTextureCacheToState", true),
GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION(Settings.FILE_GFX, Settings.SECTION_GFX_SETTINGS,
"PreferVSForLinePointExpansion", false),
GFX_CPU_CULL(Settings.FILE_GFX, Settings.SECTION_GFX_SETTINGS, "CPUCull", false),
GFX_MODS_ENABLE(Settings.FILE_GFX, Settings.SECTION_GFX_SETTINGS, "EnableMods", false),
GFX_ENHANCE_FORCE_FILTERING(Settings.FILE_GFX, Settings.SECTION_GFX_ENHANCEMENTS,

View File

@ -938,6 +938,8 @@ public final class SettingsFragmentPresenter
sl.add(new SwitchSetting(mContext, BooleanSetting.GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION,
R.string.prefer_vs_for_point_line_expansion,
R.string.prefer_vs_for_point_line_expansion_description));
sl.add(new SwitchSetting(mContext, BooleanSetting.GFX_CPU_CULL, R.string.cpu_cull,
R.string.cpu_cull_description));
sl.add(new SwitchSetting(mContext, BooleanSetting.GFX_HACK_EFB_DEFER_INVALIDATION,
R.string.defer_efb_invalidation, R.string.defer_efb_invalidation_description));
sl.add(new InvertedSwitchSetting(mContext, BooleanSetting.GFX_HACK_FAST_TEXTURE_SAMPLING,

View File

@ -361,6 +361,8 @@
<string name="backend_multithreading_description">Enables graphics backend multithreading (Vulkan only). May affect performance. If unsure, leave this checked.</string>
<string name="prefer_vs_for_point_line_expansion">Prefer VS for Point/Line Expansion</string>
<string name="prefer_vs_for_point_line_expansion_description">On backends that support both using the geometry shader and the vertex shader for expanding points and lines, selects the vertex shader for the job. May affect performance.</string>
<string name="cpu_cull">Cull Vertices on the CPU</string>
<string name="cpu_cull_description">Cull vertices on the CPU to reduce the number of draw calls required. May affect performance. If unsure, leave this unchecked.</string>
<string name="defer_efb_invalidation">Defer EFB Cache Invalidation</string>
<string name="defer_efb_invalidation_description">Defers invalidation of the EFB access cache until a GPU synchronization command is executed. May improve performance in some games at the cost of stability. If unsure, leave this unchecked.</string>
<string name="manual_texture_sampling">Manual Texture Sampling</string>

View File

@ -93,6 +93,7 @@ const Info<bool> GFX_SAVE_TEXTURE_CACHE_TO_STATE{
{System::GFX, "Settings", "SaveTextureCacheToState"}, true};
const Info<bool> GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION{
{System::GFX, "Settings", "PreferVSForLinePointExpansion"}, false};
const Info<bool> GFX_CPU_CULL{{System::GFX, "Settings", "CPUCull"}, false};
const Info<TriState> GFX_MTL_MANUALLY_UPLOAD_BUFFERS{
{System::GFX, "Settings", "ManuallyUploadBuffers"}, TriState::Auto};

View File

@ -82,6 +82,7 @@ extern const Info<int> GFX_SHADER_COMPILER_THREADS;
extern const Info<int> GFX_SHADER_PRECOMPILER_THREADS;
extern const Info<bool> GFX_SAVE_TEXTURE_CACHE_TO_STATE;
extern const Info<bool> GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION;
extern const Info<bool> GFX_CPU_CULL;
extern const Info<TriState> GFX_MTL_MANUALLY_UPLOAD_BUFFERS;
extern const Info<bool> GFX_MTL_USE_PRESENT_DRAWABLE;

View File

@ -632,6 +632,8 @@
<ClInclude Include="VideoCommon\CommandProcessor.h" />
<ClInclude Include="VideoCommon\ConstantManager.h" />
<ClInclude Include="VideoCommon\CPMemory.h" />
<ClInclude Include="VideoCommon\CPUCull.h" />
<ClInclude Include="VideoCommon\CPUCullImpl.h" />
<ClInclude Include="VideoCommon\DataReader.h" />
<ClInclude Include="VideoCommon\DriverDetails.h" />
<ClInclude Include="VideoCommon\Fifo.h" />
@ -1226,6 +1228,7 @@
<ClCompile Include="VideoCommon\BPStructs.cpp" />
<ClCompile Include="VideoCommon\CommandProcessor.cpp" />
<ClCompile Include="VideoCommon\CPMemory.cpp" />
<ClCompile Include="VideoCommon\CPUCull.cpp" />
<ClCompile Include="VideoCommon\DriverDetails.cpp" />
<ClCompile Include="VideoCommon\Fifo.cpp" />
<ClCompile Include="VideoCommon\FramebufferManager.cpp" />

View File

@ -159,16 +159,18 @@ void AdvancedWidget::CreateWidgets()
m_prefer_vs_for_point_line_expansion = new GraphicsBool(
// i18n: VS is short for vertex shaders.
tr("Prefer VS for Point/Line Expansion"), Config::GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION);
m_cpu_cull = new GraphicsBool(tr("Cull Vertices on the CPU"), Config::GFX_CPU_CULL);
misc_layout->addWidget(m_enable_cropping, 0, 0);
misc_layout->addWidget(m_enable_prog_scan, 0, 1);
misc_layout->addWidget(m_backend_multithreading, 1, 0);
misc_layout->addWidget(m_prefer_vs_for_point_line_expansion, 1, 1);
misc_layout->addWidget(m_cpu_cull, 2, 0);
#ifdef _WIN32
m_borderless_fullscreen =
new GraphicsBool(tr("Borderless Fullscreen"), Config::GFX_BORDERLESS_FULLSCREEN);
misc_layout->addWidget(m_borderless_fullscreen, 2, 0);
misc_layout->addWidget(m_borderless_fullscreen, 2, 1);
#endif
// Experimental.
@ -369,6 +371,10 @@ void AdvancedWidget::AddDescriptions()
"for expanding points and lines, selects the vertex shader for the job. May "
"affect performance."
"<br><br>%1");
static const char TR_CPU_CULL_DESCRIPTION[] =
QT_TR_NOOP("Cull vertices on the CPU to reduce the number of draw calls required. "
"May affect performance and draw statistics.<br><br>"
"<dolphin_emphasis>If unsure, leave this unchecked.</dolphin_emphasis>");
static const char TR_DEFER_EFB_ACCESS_INVALIDATION_DESCRIPTION[] = QT_TR_NOOP(
"Defers invalidation of the EFB access cache until a GPU synchronization command "
"is executed. If disabled, the cache will be invalidated with every draw call. "
@ -441,6 +447,7 @@ void AdvancedWidget::AddDescriptions()
vsexpand_extra = tr(IF_UNSURE_UNCHECKED);
m_prefer_vs_for_point_line_expansion->SetDescription(
tr(TR_PREFER_VS_FOR_POINT_LINE_EXPANSION_DESCRIPTION).arg(vsexpand_extra));
m_cpu_cull->SetDescription(tr(TR_CPU_CULL_DESCRIPTION));
#ifdef _WIN32
m_borderless_fullscreen->SetDescription(tr(TR_BORDERLESS_FULLSCREEN_DESCRIPTION));
#endif

View File

@ -69,6 +69,7 @@ private:
ToolTipCheckBox* m_enable_prog_scan;
GraphicsBool* m_backend_multithreading;
GraphicsBool* m_prefer_vs_for_point_line_expansion;
GraphicsBool* m_cpu_cull;
GraphicsBool* m_borderless_fullscreen;
// Experimental

View File

@ -23,6 +23,9 @@ add_library(videocommon
ConstantManager.h
CPMemory.cpp
CPMemory.h
CPUCull.cpp
CPUCull.h
CPUCullImpl.h
DriverDetails.cpp
DriverDetails.h
Fifo.cpp

View File

@ -0,0 +1,174 @@
// Copyright 2022 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "VideoCommon/CPUCull.h"
#include "Common/Assert.h"
#include "Common/CPUDetect.h"
#include "Common/MathUtil.h"
#include "Common/MemoryUtil.h"
#include "Core/System.h"
#include "VideoCommon/CPMemory.h"
#include "VideoCommon/VertexManagerBase.h"
#include "VideoCommon/VertexShaderManager.h"
#include "VideoCommon/VideoConfig.h"
#include "VideoCommon/XFMemory.h"
// We really want things like c.w * a.x - a.w * c.x to stay symmetric, so they cancel to zero on
// degenerate triangles. Make sure the compiler doesn't optimize in fmas where not requested.
#ifdef _MSC_VER
#pragma fp_contract(off)
#else
// GCC doesn't support any in-file way to turn off fp contract yet
// Not ideal, but in the worst case the CPU cull is just worse at detecting degenerate triangles
// (Most likely to happen on arm, as we don't compile the cull code for x86 fma)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#pragma STDC FP_CONTRACT OFF
#pragma GCC diagnostic pop
#endif
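// Editor's sketch of what the pragmas above protect: with contraction off, both
// products below are rounded the same way, so for a degenerate triangle (e.g. a == c)
// the difference is exactly zero:
//   float lhs = c.w * a.x;  // rounded product
//   float rhs = a.w * c.x;  // identical value, identical rounding when a == c
//   float d = lhs - rhs;    // exactly 0.0f
// With contraction, the compiler may fuse this into fma(c.w, a.x, -(a.w * c.x)),
// where only one product is rounded, so d can come out as a tiny nonzero value and
// a CullMode::None degenerate-triangle check (normal_z_dir == 0) starts missing it.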
#if defined(_M_X86) || defined(_M_X86_64)
#define USE_SSE
#elif defined(_M_ARM_64)
#define USE_NEON
#else
#define NO_SIMD
#endif
#if defined(USE_SSE)
#include <immintrin.h>
#elif defined(USE_NEON)
#include <arm_neon.h>
#endif
#include "VideoCommon/CPUCullImpl.h"
#ifdef USE_SSE
#define USE_SSE3
#include "VideoCommon/CPUCullImpl.h"
#define USE_SSE41
#include "VideoCommon/CPUCullImpl.h"
#define USE_AVX
#include "VideoCommon/CPUCullImpl.h"
#define USE_FMA
#include "VideoCommon/CPUCullImpl.h"
#endif
#if defined(USE_SSE)
#if defined(__AVX__) && defined(__FMA__)
static constexpr int MIN_SSE = 51;
#elif defined(__AVX__)
static constexpr int MIN_SSE = 50;
#elif defined(__SSE4_1__)
static constexpr int MIN_SSE = 41;
#elif defined(__SSE3__)
static constexpr int MIN_SSE = 30;
#else
static constexpr int MIN_SSE = 0;
#endif
#endif
template <bool PositionHas3Elems, bool PerVertexPosMtx>
static CPUCull::TransformFunction GetTransformFunction()
{
#if defined(USE_SSE)
if (MIN_SSE >= 51 || (cpu_info.bAVX && cpu_info.bFMA))
return CPUCull_FMA::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
else if (MIN_SSE >= 50 || cpu_info.bAVX)
return CPUCull_AVX::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
else if (PositionHas3Elems && PerVertexPosMtx && (MIN_SSE >= 41 || cpu_info.bSSE4_1))
return CPUCull_SSE41::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
else if (PositionHas3Elems && (MIN_SSE >= 30 || cpu_info.bSSE3))
return CPUCull_SSE3::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
else
return CPUCull_SSE::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
#elif defined(USE_NEON)
return CPUCull_NEON::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
#else
return CPUCull_Scalar::TransformVertices<PositionHas3Elems, PerVertexPosMtx>;
#endif
}
template <OpcodeDecoder::Primitive Primitive, CullMode Mode>
static CPUCull::CullFunction GetCullFunction0()
{
#if defined(USE_SSE)
// Note: The AVX version is only actually AVX on compilers that support __attribute__((target))
// Sorry, MSVC + Sandy Bridge. (Ivy+ and AMD see very little benefit thanks to mov elimination)
if (MIN_SSE >= 50 || cpu_info.bAVX)
return CPUCull_AVX::AreAllVerticesCulled<Primitive, Mode>;
else if (MIN_SSE >= 30 || cpu_info.bSSE3)
return CPUCull_SSE3::AreAllVerticesCulled<Primitive, Mode>;
else
return CPUCull_SSE::AreAllVerticesCulled<Primitive, Mode>;
#elif defined(USE_NEON)
return CPUCull_NEON::AreAllVerticesCulled<Primitive, Mode>;
#else
return CPUCull_Scalar::AreAllVerticesCulled<Primitive, Mode>;
#endif
}
template <OpcodeDecoder::Primitive Primitive>
static Common::EnumMap<CPUCull::CullFunction, CullMode::All> GetCullFunction1()
{
return {
GetCullFunction0<Primitive, CullMode::None>(),
GetCullFunction0<Primitive, CullMode::Back>(),
GetCullFunction0<Primitive, CullMode::Front>(),
GetCullFunction0<Primitive, CullMode::All>(),
};
}
CPUCull::~CPUCull() = default;
void CPUCull::Init()
{
m_transform_table[false][false] = GetTransformFunction<false, false>();
m_transform_table[false][true] = GetTransformFunction<false, true>();
m_transform_table[true][false] = GetTransformFunction<true, false>();
m_transform_table[true][true] = GetTransformFunction<true, true>();
using Prim = OpcodeDecoder::Primitive;
m_cull_table[Prim::GX_DRAW_QUADS] = GetCullFunction1<Prim::GX_DRAW_QUADS>();
m_cull_table[Prim::GX_DRAW_QUADS_2] = GetCullFunction1<Prim::GX_DRAW_QUADS>();
m_cull_table[Prim::GX_DRAW_TRIANGLES] = GetCullFunction1<Prim::GX_DRAW_TRIANGLES>();
m_cull_table[Prim::GX_DRAW_TRIANGLE_STRIP] = GetCullFunction1<Prim::GX_DRAW_TRIANGLE_STRIP>();
m_cull_table[Prim::GX_DRAW_TRIANGLE_FAN] = GetCullFunction1<Prim::GX_DRAW_TRIANGLE_FAN>();
}
bool CPUCull::AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive,
const u8* src, u32 count)
{
ASSERT_MSG(VIDEO, primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES,
"CPUCull should not be called on lines or points");
const u32 stride = loader->m_native_vtx_decl.stride;
const bool posHas3Elems = loader->m_native_vtx_decl.position.components >= 3;
const bool perVertexPosMtx = loader->m_native_vtx_decl.posmtx.enable;
if (m_transform_buffer_size < count) [[unlikely]]
{
u32 new_size = MathUtil::NextPowerOf2(count);
m_transform_buffer_size = new_size;
m_transform_buffer.reset(static_cast<TransformedVertex*>(
Common::AllocateAlignedMemory(new_size * sizeof(TransformedVertex), 32)));
}
// The transform functions need the projection matrix to transform to clip space
Core::System::GetInstance().GetVertexShaderManager().SetProjectionMatrix();
static constexpr Common::EnumMap<CullMode, CullMode::All> cullmode_invert = {
CullMode::None, CullMode::Front, CullMode::Back, CullMode::All};
CullMode cullmode = bpmem.genMode.cullmode;
if (xfmem.viewport.ht > 0) // See videosoftware Clipper.cpp:IsBackface
cullmode = cullmode_invert[cullmode];
const TransformFunction transform = m_transform_table[posHas3Elems][perVertexPosMtx];
transform(m_transform_buffer.get(), src, stride, count);
const CullFunction cull = m_cull_table[primitive][cullmode];
return cull(m_transform_buffer.get(), count);
}
template <typename T>
void CPUCull::BufferDeleter<T>::operator()(T* ptr)
{
Common::FreeAlignedMemory(ptr);
}
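// Editor's note (not part of the commit): defining ~CPUCull() out of line in this
// file is what makes the out-of-line deleter above work. The unique_ptr member's
// destructor, and with it BufferDeleter<T>::operator(), is only instantiated in
// this translation unit, where the definition is visible.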

View File

@ -0,0 +1,38 @@
// Copyright 2022 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include "VideoCommon/BPMemory.h"
#include "VideoCommon/DataReader.h"
#include "VideoCommon/OpcodeDecoding.h"
class CPUCull
{
public:
~CPUCull();
void Init();
bool AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive,
const u8* src, u32 count);
struct alignas(16) TransformedVertex
{
float x, y, z, w;
};
using TransformFunction = void (*)(void*, const void*, u32, int);
using CullFunction = bool (*)(const CPUCull::TransformedVertex*, int);
private:
template <typename T>
struct BufferDeleter
{
void operator()(T* ptr);
};
std::unique_ptr<TransformedVertex[], BufferDeleter<TransformedVertex>> m_transform_buffer;
u32 m_transform_buffer_size = 0;
std::array<std::array<TransformFunction, 2>, 2> m_transform_table;
Common::EnumMap<Common::EnumMap<CullFunction, CullMode::All>,
OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN>
m_cull_table;
};
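
A minimal sketch of how this class is driven, mirroring the VertexManagerBase and VertexLoaderManager changes later in this commit (variable names are illustrative):

// Once, at startup:
CPUCull cpu_cull;
cpu_cull.Init();  // selects SIMD transform/cull functions for the host CPU

// Per draw, after vertex loading, for triangle/quad primitives only:
if (cpu_cull.AreAllVerticesCulled(loader, primitive, vertex_data, count))
{
  // Every triangle is culled or fully outside the clip volume;
  // leave the vertex manager in cull-all mode and skip encoding the draw.
}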

View File

@ -0,0 +1,714 @@
// Copyright 2022 Dolphin Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#if defined(USE_FMA)
#define VECTOR_NAMESPACE CPUCull_FMA
#elif defined(USE_AVX)
#define VECTOR_NAMESPACE CPUCull_AVX
#elif defined(USE_SSE41)
#define VECTOR_NAMESPACE CPUCull_SSE41
#elif defined(USE_SSE3)
#define VECTOR_NAMESPACE CPUCull_SSE3
#elif defined(USE_SSE)
#define VECTOR_NAMESPACE CPUCull_SSE
#elif defined(USE_NEON)
#define VECTOR_NAMESPACE CPUCull_NEON
#elif defined(NO_SIMD)
#define VECTOR_NAMESPACE CPUCull_Scalar
#else
#error This file is meant to be used by CPUCull.cpp only!
#endif
#if defined(__GNUC__) && defined(USE_FMA) && !(defined(__AVX__) && defined(__FMA__))
#define ATTR_TARGET __attribute__((target("avx,fma")))
#elif defined(__GNUC__) && defined(USE_AVX) && !defined(__AVX__)
#define ATTR_TARGET __attribute__((target("avx")))
#elif defined(__GNUC__) && defined(USE_SSE41) && !defined(__SSE4_1__)
#define ATTR_TARGET __attribute__((target("sse4.1")))
#elif defined(__GNUC__) && defined(USE_SSE3) && !defined(__SSE3__)
#define ATTR_TARGET __attribute__((target("sse3")))
#else
#define ATTR_TARGET
#endif
namespace VECTOR_NAMESPACE
{
#if defined(USE_SSE)
typedef __m128 Vector;
#elif defined(USE_NEON)
typedef float32x4_t Vector;
#else
struct alignas(16) Vector
{
float x, y, z, w;
};
#endif
static_assert(sizeof(Vector) == 16);
#ifdef USE_NEON
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector vsetr_f32(float x, float y, float z, float w)
{
float tmp[4] = {x, y, z, w};
return vld1q_f32(tmp);
}
ATTR_TARGET DOLPHIN_FORCE_INLINE static void vuzp12q_f32(Vector& a, Vector& b)
{
Vector tmp = vuzp2q_f32(a, b);
a = vuzp1q_f32(a, b);
b = tmp;
}
#endif
#ifdef USE_SSE
template <int i>
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector vector_broadcast(Vector v)
{
return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
}
#endif
#ifdef USE_AVX
template <int i>
ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 vector_broadcast(__m256 v)
{
return _mm256_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
}
#endif
#ifdef USE_AVX
ATTR_TARGET DOLPHIN_FORCE_INLINE static void TransposeYMM(__m256& o0, __m256& o1, //
__m256& o2, __m256& o3)
{
__m256d tmp0 = _mm256_castps_pd(_mm256_unpacklo_ps(o0, o1));
__m256d tmp1 = _mm256_castps_pd(_mm256_unpacklo_ps(o2, o3));
__m256d tmp2 = _mm256_castps_pd(_mm256_unpackhi_ps(o0, o1));
__m256d tmp3 = _mm256_castps_pd(_mm256_unpackhi_ps(o2, o3));
o0 = _mm256_castpd_ps(_mm256_unpacklo_pd(tmp0, tmp1));
o1 = _mm256_castpd_ps(_mm256_unpackhi_pd(tmp0, tmp1));
o2 = _mm256_castpd_ps(_mm256_unpacklo_pd(tmp2, tmp3));
o3 = _mm256_castpd_ps(_mm256_unpackhi_pd(tmp2, tmp3));
}
ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposedYMM(const void* source, __m256& o0,
__m256& o1, __m256& o2, __m256& o3)
{
const Vector* vsource = static_cast<const Vector*>(source);
o0 = _mm256_broadcast_ps(&vsource[0]);
o1 = _mm256_broadcast_ps(&vsource[1]);
o2 = _mm256_broadcast_ps(&vsource[2]);
o3 = _mm256_broadcast_ps(&vsource[3]);
TransposeYMM(o0, o1, o2, o3);
}
ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadPosYMM(const void* sourcel, const void* sourceh,
__m256& o0, __m256& o1, __m256& o2)
{
const Vector* vsourcel = static_cast<const Vector*>(sourcel);
const Vector* vsourceh = static_cast<const Vector*>(sourceh);
o0 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[0]), vsourceh[0], 1);
o1 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[1]), vsourceh[1], 1);
o2 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[2]), vsourceh[2], 1);
}
ATTR_TARGET DOLPHIN_FORCE_INLINE static void
LoadTransposedPosYMM(const void* source, __m256& o0, __m256& o1, __m256& o2, __m256& o3)
{
const Vector* vsource = static_cast<const Vector*>(source);
o0 = _mm256_broadcast_ps(&vsource[0]);
o1 = _mm256_broadcast_ps(&vsource[1]);
o2 = _mm256_broadcast_ps(&vsource[2]);
o3 = _mm256_setr_ps(0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f);
TransposeYMM(o0, o1, o2, o3);
}
ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 ApplyMatrixYMM(__m256 v, __m256 m0, __m256 m1,
__m256 m2, __m256 m3)
{
__m256 output = _mm256_mul_ps(vector_broadcast<0>(v), m0);
#ifdef USE_FMA
output = _mm256_fmadd_ps(vector_broadcast<1>(v), m1, output);
output = _mm256_fmadd_ps(vector_broadcast<2>(v), m2, output);
output = _mm256_fmadd_ps(vector_broadcast<3>(v), m3, output);
#else
output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<1>(v), m1));
output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<2>(v), m2));
output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<3>(v), m3));
#endif
return output;
}
ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
TransformVertexNoTransposeYMM(__m256 vertex, __m256 pos0, __m256 pos1, __m256 pos2, //
__m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
{
__m256 mul0 = _mm256_mul_ps(vertex, pos0);
__m256 mul1 = _mm256_mul_ps(vertex, pos1);
__m256 mul2 = _mm256_mul_ps(vertex, pos2);
__m256 mul3 = _mm256_setr_ps(0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f);
__m256 output = _mm256_hadd_ps(_mm256_hadd_ps(mul0, mul1), _mm256_hadd_ps(mul2, mul3));
return ApplyMatrixYMM(output, proj0, proj1, proj2, proj3);
}
template <bool PositionHas3Elems>
ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
TransformVertexYMM(__m256 vertex, __m256 pos0, __m256 pos1, __m256 pos2, __m256 pos3, //
__m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
{
__m256 output = pos3; // vertex.w is always 1.0
#ifdef USE_FMA
output = _mm256_fmadd_ps(vector_broadcast<0>(vertex), pos0, output);
output = _mm256_fmadd_ps(vector_broadcast<1>(vertex), pos1, output);
if constexpr (PositionHas3Elems)
output = _mm256_fmadd_ps(vector_broadcast<2>(vertex), pos2, output);
#else
output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<0>(vertex), pos0));
output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<1>(vertex), pos1));
if constexpr (PositionHas3Elems)
output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<2>(vertex), pos2));
#endif
return ApplyMatrixYMM(output, proj0, proj1, proj2, proj3);
}
template <bool PositionHas3Elems, bool PerVertexPosMtx>
ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256
LoadTransform2Vertices(const u8* v0data, const u8* v1data, //
__m256 pos0, __m256 pos1, __m256 pos2, __m256 pos3, //
__m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3)
{
__m256 v01;
if constexpr (PerVertexPosMtx)
{
// Vertex data layout always starts with posmtx data if available, then position data
// Convenient for us, that means offsets are always fixed
u32 v0idx = v0data[0] & 0x3f;
u32 v1idx = v1data[0] & 0x3f;
v0data += sizeof(u32);
v1data += sizeof(u32);
const float* v0fdata = reinterpret_cast<const float*>(v0data);
const float* v1fdata = reinterpret_cast<const float*>(v1data);
LoadPosYMM(&xfmem.posMatrices[v0idx * 4], &xfmem.posMatrices[v1idx * 4], pos0, pos1, pos2);
if constexpr (PositionHas3Elems)
{
__m256 base = _mm256_set1_ps(1.0f);
v01 = _mm256_blend_ps(_mm256_loadu2_m128(v1fdata, v0fdata), base, 0x88);
}
else
{
__m256 base = _mm256_unpacklo_ps(_mm256_setzero_ps(), _mm256_set1_ps(1.0f));
__m256 v1 = _mm256_castpd_ps(_mm256_broadcast_sd(reinterpret_cast<const double*>(v1data)));
__m128 v0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(v0data));
v01 = _mm256_blend_ps(_mm256_castps128_ps256(v0), v1, 0x30);
v01 = _mm256_blend_ps(v01, base, 0xcc);
}
v01 = TransformVertexNoTransposeYMM(v01, pos0, pos1, pos2, proj0, proj1, proj2, proj3);
}
else
{
const float* v0fdata = reinterpret_cast<const float*>(v0data);
const float* v1fdata = reinterpret_cast<const float*>(v1data);
if constexpr (PositionHas3Elems)
{
v01 = _mm256_loadu2_m128(v1fdata, v0fdata);
}
else
{
__m256 v1 = _mm256_castpd_ps(_mm256_broadcast_sd(reinterpret_cast<const double*>(v1data)));
__m128 v0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(v0data));
v01 = _mm256_blend_ps(_mm256_castps128_ps256(v0), v1, 0x30);
}
#ifdef __clang__
// Clang's optimizer is dumb, yay
// It sees TransformVertexYMM doing broadcasts and is like
// "let's broadcast *before* we combine v0 and v1! Then we can use vbroadcastss!"
// Prevent it from "optimizing" here
asm("" : "+x"(v01)::);
#endif
v01 = TransformVertexYMM<PositionHas3Elems>(v01, pos0, pos1, pos2, pos3, //
proj0, proj1, proj2, proj3);
}
return v01;
}
#endif
#ifndef USE_AVX
// Note: Assumes 16-byte aligned source
ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposed(const void* source, Vector& o0,
Vector& o1, Vector& o2, Vector& o3)
{
#if defined(USE_SSE)
const Vector* vsource = static_cast<const Vector*>(source);
o0 = vsource[0];
o1 = vsource[1];
o2 = vsource[2];
o3 = vsource[3];
_MM_TRANSPOSE4_PS(o0, o1, o2, o3);
#elif defined(USE_NEON)
float32x4x4_t ld = vld4q_f32(static_cast<const float*>(source));
o0 = ld.val[0];
o1 = ld.val[1];
o2 = ld.val[2];
o3 = ld.val[3];
#else
const Vector* vsource = static_cast<const Vector*>(source);
// clang-format off
o0.x = vsource[0].x; o0.y = vsource[1].x; o0.z = vsource[2].x; o0.w = vsource[3].x;
o1.x = vsource[0].y; o1.y = vsource[1].y; o1.z = vsource[2].y; o1.w = vsource[3].y;
o2.x = vsource[0].z; o2.y = vsource[1].z; o2.z = vsource[2].z; o2.w = vsource[3].z;
o3.x = vsource[0].w; o3.y = vsource[1].w; o3.z = vsource[2].w; o3.w = vsource[3].w;
// clang-format on
#endif
}
ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposedPos(const void* source, Vector& o0,
Vector& o1, Vector& o2, Vector& o3)
{
const Vector* vsource = static_cast<const Vector*>(source);
#if defined(USE_SSE)
o0 = vsource[0];
o1 = vsource[1];
o2 = vsource[2];
o3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
_MM_TRANSPOSE4_PS(o0, o1, o2, o3);
#elif defined(USE_NEON)
float32x4x2_t ld01 = vld2q_f32(static_cast<const float*>(source));
o0 = ld01.val[0];
o1 = ld01.val[1];
o2 = vsource[2];
o3 = vsetr_f32(0.0f, 0.0f, 0.0f, 1.0f);
vuzp12q_f32(o2, o3);
vuzp12q_f32(o0, o2);
vuzp12q_f32(o1, o3);
#else
// clang-format off
o0.x = vsource[0].x; o0.y = vsource[1].x; o0.z = vsource[2].x; o0.w = 0.0f;
o1.x = vsource[0].y; o1.y = vsource[1].y; o1.z = vsource[2].y; o1.w = 0.0f;
o2.x = vsource[0].z; o2.y = vsource[1].z; o2.z = vsource[2].z; o2.w = 0.0f;
o3.x = vsource[0].w; o3.y = vsource[1].w; o3.z = vsource[2].w; o3.w = 1.0f;
// clang-format on
#endif
}
#endif
#ifndef USE_NEON
ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadPos(const void* source, //
Vector& o0, Vector& o1, Vector& o2)
{
const Vector* vsource = static_cast<const Vector*>(source);
o0 = vsource[0];
o1 = vsource[1];
o2 = vsource[2];
}
#endif
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector ApplyMatrix(Vector v, Vector m0, Vector m1,
Vector m2, Vector m3)
{
#if defined(USE_SSE)
Vector output = _mm_mul_ps(vector_broadcast<0>(v), m0);
#ifdef USE_FMA
output = _mm_fmadd_ps(vector_broadcast<1>(v), m1, output);
output = _mm_fmadd_ps(vector_broadcast<2>(v), m2, output);
output = _mm_fmadd_ps(vector_broadcast<3>(v), m3, output);
#else
output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<1>(v), m1));
output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<2>(v), m2));
output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<3>(v), m3));
#endif
return output;
#elif defined(USE_NEON)
Vector output = vmulq_laneq_f32(m0, v, 0);
output = vfmaq_laneq_f32(output, m1, v, 1);
output = vfmaq_laneq_f32(output, m2, v, 2);
output = vfmaq_laneq_f32(output, m3, v, 3);
return output;
#else
Vector output;
output.x = v.x * m0.x + v.y * m1.x + v.z * m2.x + v.w * m3.x;
output.y = v.x * m0.y + v.y * m1.y + v.z * m2.y + v.w * m3.y;
output.z = v.x * m0.z + v.y * m1.z + v.z * m2.z + v.w * m3.z;
output.w = v.x * m0.w + v.y * m1.w + v.z * m2.w + v.w * m3.w;
return output;
#endif
}
#ifndef USE_NEON
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
TransformVertexNoTranspose(Vector vertex, Vector pos0, Vector pos1, Vector pos2, //
Vector proj0, Vector proj1, Vector proj2, Vector proj3)
{
#ifdef USE_SSE
Vector mul0 = _mm_mul_ps(vertex, pos0);
Vector mul1 = _mm_mul_ps(vertex, pos1);
Vector mul2 = _mm_mul_ps(vertex, pos2);
Vector mul3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
#ifdef USE_SSE3
Vector output = _mm_hadd_ps(_mm_hadd_ps(mul0, mul1), _mm_hadd_ps(mul2, mul3));
#else
Vector t0 = _mm_add_ps(_mm_unpacklo_ps(mul0, mul2), _mm_unpackhi_ps(mul0, mul2));
Vector t1 = _mm_add_ps(_mm_unpacklo_ps(mul1, mul3), _mm_unpackhi_ps(mul1, mul3));
Vector output = _mm_add_ps(_mm_unpacklo_ps(t0, t1), _mm_unpackhi_ps(t0, t1));
#endif
#else
Vector output;
output.x = vertex.x * pos0.x + vertex.y * pos0.y + vertex.z * pos0.z + vertex.w * pos0.w;
output.y = vertex.x * pos1.x + vertex.y * pos1.y + vertex.z * pos1.z + vertex.w * pos1.w;
output.z = vertex.x * pos2.x + vertex.y * pos2.y + vertex.z * pos2.z + vertex.w * pos2.w;
output.w = 1.0f;
#endif
output = ApplyMatrix(output, proj0, proj1, proj2, proj3);
return output;
}
#endif
template <bool PositionHas3Elems>
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
TransformVertex(Vector vertex, Vector pos0, Vector pos1, Vector pos2, Vector pos3, //
Vector proj0, Vector proj1, Vector proj2, Vector proj3)
{
Vector output = pos3; // vertex.w is always 1.0
#if defined(USE_FMA)
output = _mm_fmadd_ps(vector_broadcast<0>(vertex), pos0, output);
output = _mm_fmadd_ps(vector_broadcast<1>(vertex), pos1, output);
if constexpr (PositionHas3Elems)
output = _mm_fmadd_ps(vector_broadcast<2>(vertex), pos2, output);
#elif defined(USE_SSE)
output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<0>(vertex), pos0));
output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<1>(vertex), pos1));
if constexpr (PositionHas3Elems)
output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<2>(vertex), pos2));
#elif defined(USE_NEON)
output = vfmaq_laneq_f32(output, pos0, vertex, 0);
output = vfmaq_laneq_f32(output, pos1, vertex, 1);
if constexpr (PositionHas3Elems)
output = vfmaq_laneq_f32(output, pos2, vertex, 2);
#else
output.x += vertex.x * pos0.x + vertex.y * pos1.x;
output.y += vertex.x * pos0.y + vertex.y * pos1.y;
output.z += vertex.x * pos0.z + vertex.y * pos1.z;
if constexpr (PositionHas3Elems)
{
output.x += vertex.z * pos2.x;
output.y += vertex.z * pos2.y;
output.z += vertex.z * pos2.z;
}
#endif
output = ApplyMatrix(output, proj0, proj1, proj2, proj3);
return output;
}
template <bool PositionHas3Elems, bool PerVertexPosMtx>
ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector
LoadTransformVertex(const u8* data, Vector pos0, Vector pos1, Vector pos2, Vector pos3,
Vector proj0, Vector proj1, Vector proj2, Vector proj3)
{
Vector vertex;
if constexpr (PerVertexPosMtx)
{
// Vertex data layout always starts with posmtx data if available, then position data
// Convenient for us, that means offsets are always fixed
u32 idx = data[0] & 0x3f;
data += sizeof(u32);
const float* fdata = reinterpret_cast<const float*>(data);
#ifdef USE_NEON
LoadTransposedPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
if constexpr (PositionHas3Elems)
{
vertex = vld1q_f32(fdata);
}
else
{
vertex = vcombine_f32(vld1_f32(fdata), vdup_n_f32(0.0f));
}
vertex = TransformVertex<PositionHas3Elems>(vertex, pos0, pos1, pos2, pos3, //
proj0, proj1, proj2, proj3);
#else
LoadPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2);
if constexpr (PositionHas3Elems)
{
#if defined(USE_SSE)
#ifdef USE_SSE41
Vector base = _mm_set1_ps(1.0f);
vertex = _mm_blend_ps(_mm_loadu_ps(fdata), base, 8);
#else
Vector base = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
Vector mask = _mm_castsi128_ps(_mm_setr_epi32(-1, -1, -1, 0));
vertex = _mm_or_ps(_mm_and_ps(_mm_loadu_ps(fdata), mask), base);
#endif
#else
vertex.x = fdata[0];
vertex.y = fdata[1];
vertex.z = fdata[2];
vertex.w = 1.0f;
#endif
}
else
{
#if defined(USE_SSE)
Vector base = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
vertex = _mm_loadl_pi(base, reinterpret_cast<const __m64*>(fdata));
#else
vertex.x = fdata[0];
vertex.y = fdata[1];
vertex.z = 0.0f;
vertex.w = 1.0f;
#endif
}
vertex = TransformVertexNoTranspose(vertex, pos0, pos1, pos2, proj0, proj1, proj2, proj3);
#endif
}
else
{
const float* fdata = reinterpret_cast<const float*>(data);
if constexpr (PositionHas3Elems)
{
#if defined(USE_SSE)
vertex = _mm_loadu_ps(fdata);
#elif defined(USE_NEON)
vertex = vld1q_f32(fdata);
#else
vertex.x = fdata[0];
vertex.y = fdata[1];
vertex.z = fdata[2];
vertex.w = 1.0f;
#endif
}
else
{
#if defined(USE_SSE)
vertex = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(fdata));
#elif defined(USE_NEON)
vertex = vcombine_f32(vld1_f32(fdata), vdup_n_f32(0.0f));
#else
vertex.x = fdata[0];
vertex.y = fdata[1];
vertex.z = 0.0f;
vertex.w = 1.0f;
#endif
}
vertex = TransformVertex<PositionHas3Elems>(vertex, pos0, pos1, pos2, pos3, //
proj0, proj1, proj2, proj3);
}
return vertex;
}
template <bool PositionHas3Elems, bool PerVertexPosMtx>
ATTR_TARGET static void TransformVertices(void* output, const void* vertices, u32 stride, int count)
{
const VertexShaderManager& vsmanager = Core::System::GetInstance().GetVertexShaderManager();
const u8* cvertices = static_cast<const u8*>(vertices);
Vector* voutput = static_cast<Vector*>(output);
u32 idx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 0x3f;
#ifdef USE_AVX
__m256 proj0, proj1, proj2, proj3;
__m256 pos0, pos1, pos2, pos3;
LoadTransposedYMM(vsmanager.constants.projection.data(), proj0, proj1, proj2, proj3);
LoadTransposedPosYMM(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
for (int i = 1; i < count; i += 2)
{
const u8* v0data = cvertices;
const u8* v1data = cvertices + stride;
__m256 v01 = LoadTransform2Vertices<PositionHas3Elems, PerVertexPosMtx>(
v0data, v1data, pos0, pos1, pos2, pos3, proj0, proj1, proj2, proj3);
_mm256_store_ps(reinterpret_cast<float*>(voutput), v01);
cvertices += stride * 2;
voutput += 2;
}
if (count & 1)
{
*voutput = LoadTransformVertex<PositionHas3Elems, PerVertexPosMtx>(
cvertices, //
_mm256_castps256_ps128(pos0), _mm256_castps256_ps128(pos1), //
_mm256_castps256_ps128(pos2), _mm256_castps256_ps128(pos3), //
_mm256_castps256_ps128(proj0), _mm256_castps256_ps128(proj1), //
_mm256_castps256_ps128(proj2), _mm256_castps256_ps128(proj3));
}
#else
Vector proj0, proj1, proj2, proj3;
Vector pos0, pos1, pos2, pos3;
LoadTransposed(vsmanager.constants.projection.data(), proj0, proj1, proj2, proj3);
LoadTransposedPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3);
for (int i = 0; i < count; i++)
{
*voutput = LoadTransformVertex<PositionHas3Elems, PerVertexPosMtx>(
cvertices, pos0, pos1, pos2, pos3, proj0, proj1, proj2, proj3);
cvertices += stride;
voutput += 1;
}
#endif
}
template <CullMode Mode>
ATTR_TARGET DOLPHIN_FORCE_INLINE static bool CullTriangle(const CPUCull::TransformedVertex& a,
const CPUCull::TransformedVertex& b,
const CPUCull::TransformedVertex& c)
{
if (Mode == CullMode::All)
return true;
Vector va = reinterpret_cast<const Vector&>(a);
Vector vb = reinterpret_cast<const Vector&>(b);
Vector vc = reinterpret_cast<const Vector&>(c);
// See videosoftware Clipper.cpp
#if defined(USE_SSE)
Vector wxzya = _mm_shuffle_ps(va, va, _MM_SHUFFLE(1, 2, 0, 3));
Vector wxzyc = _mm_shuffle_ps(vc, vc, _MM_SHUFFLE(1, 2, 0, 3));
Vector ywzxb = _mm_shuffle_ps(vb, vb, _MM_SHUFFLE(0, 2, 3, 1));
Vector part0 = _mm_mul_ps(va, wxzyc);
Vector part1 = _mm_mul_ps(vc, wxzya);
Vector part2 = _mm_mul_ps(_mm_sub_ps(part0, part1), ywzxb);
#ifdef USE_SSE3
Vector part3 = _mm_movehdup_ps(part2);
#else
Vector part3 = vector_broadcast<1>(part2);
#endif
Vector part4 = vector_broadcast<3>(part2);
Vector part5 = _mm_add_ss(_mm_add_ss(part2, part3), part4);
float normal_z_dir;
_mm_store_ss(&normal_z_dir, part5);
#elif defined(USE_NEON)
Vector zero = vdupq_n_f32(0.0f);
Vector wx0ya = vextq_f32(va, vzip1q_f32(va, zero), 3);
Vector wx0yc = vextq_f32(vc, vzip1q_f32(vc, zero), 3);
Vector ywxxb = vuzp2q_f32(vb, vdupq_laneq_f32(vb, 0));
Vector part0 = vmulq_f32(va, wx0yc);
Vector part1 = vmulq_f32(vc, wx0ya);
Vector part2 = vmulq_f32(vsubq_f32(part0, part1), ywxxb);
float normal_z_dir = vaddvq_f32(part2);
#else
float normal_z_dir = (c.w * a.x - a.w * c.x) * b.y + //
(c.x * a.y - a.x * c.y) * b.w + //
(c.y * a.w - a.y * c.w) * b.x;
#endif
bool cull = false;
switch (Mode)
{
case CullMode::None:
cull = normal_z_dir == 0;
break;
case CullMode::Front:
cull = normal_z_dir <= 0;
break;
case CullMode::Back:
cull = normal_z_dir >= 0;
break;
case CullMode::All:
cull = true;
break;
}
if (cull)
return true;
#if defined(USE_SSE)
Vector xyab = _mm_unpacklo_ps(va, vb);
Vector zwab = _mm_unpackhi_ps(va, vb);
Vector allx = _mm_shuffle_ps(xyab, vc, _MM_SHUFFLE(0, 0, 1, 0));
Vector ally = _mm_shuffle_ps(xyab, vc, _MM_SHUFFLE(1, 1, 3, 2));
Vector allpw = _mm_shuffle_ps(zwab, vc, _MM_SHUFFLE(3, 3, 3, 2));
Vector allnw = _mm_xor_ps(allpw, _mm_set1_ps(-0.0f));
__m128i x_gt_pw = _mm_castps_si128(_mm_cmple_ps(allpw, allx));
__m128i y_gt_pw = _mm_castps_si128(_mm_cmple_ps(allpw, ally));
__m128i x_lt_nw = _mm_castps_si128(_mm_cmplt_ps(allx, allnw));
__m128i y_lt_nw = _mm_castps_si128(_mm_cmplt_ps(ally, allnw));
__m128i any_out_of_bounds = _mm_packs_epi16(_mm_packs_epi32(x_lt_nw, y_lt_nw), //
_mm_packs_epi32(x_gt_pw, y_gt_pw));
cull |= 0 != _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_set1_epi32(~0), any_out_of_bounds));
#elif defined(USE_NEON)
float64x2_t xyab = vreinterpretq_f64_f32(vzip1q_f32(va, vb));
float64x2_t xycc = vreinterpretq_f64_f32(vzip1q_f32(vc, vc));
float32x4_t allx = vreinterpretq_f32_f64(vzip1q_f64(xyab, xycc));
float32x4_t ally = vreinterpretq_f32_f64(vzip2q_f64(xyab, xycc));
float32x4_t allpw = vextq_f32(vzip2q_f32(va, vb), vdupq_laneq_f32(vc, 3), 2);
float32x4_t allnw = vnegq_f32(allpw);
uint16x8_t x_gt_pw = vreinterpretq_u16_u32(vcgtq_f32(allx, allpw));
uint16x8_t y_gt_pw = vreinterpretq_u16_u32(vcgtq_f32(ally, allpw));
uint16x8_t x_lt_nw = vreinterpretq_u16_u32(vcltq_f32(allx, allnw));
uint16x8_t y_lt_nw = vreinterpretq_u16_u32(vcltq_f32(ally, allnw));
uint8x16_t lt_nw = vreinterpretq_u8_u16(vuzp1q_u16(x_lt_nw, y_lt_nw));
uint8x16_t gt_pw = vreinterpretq_u8_u16(vuzp1q_u16(x_gt_pw, y_gt_pw));
uint32x4_t any_out_of_bounds = vreinterpretq_u32_u8(vuzp1q_u8(lt_nw, gt_pw));
cull |= 0xFFFFFFFF == vmaxvq_u32(any_out_of_bounds);
#else
cull |= a.x < -a.w && b.x < -b.w && c.x < -c.w;
cull |= a.y < -a.w && b.y < -b.w && c.y < -c.w;
cull |= a.x > a.w && b.x > b.w && c.x > c.w;
cull |= a.y > a.w && b.y > b.w && c.y > c.w;
#endif
return cull;
}
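// Editor's note: the scalar fallback above is the middle-row expansion of the
// determinant
//   | a.x  a.y  a.w |
//   | b.x  b.y  b.w |
//   | c.x  c.y  c.w |
// whose sign gives the triangle's winding in clip space and which is exactly zero
// for a degenerate triangle. The SSE and NEON paths compute the same three
// products with shuffles and a horizontal sum.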
template <OpcodeDecoder::Primitive Primitive, CullMode Mode>
ATTR_TARGET static bool AreAllVerticesCulled(const CPUCull::TransformedVertex* transformed,
int count)
{
switch (Primitive)
{
case OpcodeDecoder::Primitive::GX_DRAW_QUADS:
case OpcodeDecoder::Primitive::GX_DRAW_QUADS_2:
{
int i = 3;
for (; i < count; i += 4)
{
if (!CullTriangle<Mode>(transformed[i - 3], transformed[i - 2], transformed[i - 1]))
return false;
if (!CullTriangle<Mode>(transformed[i - 3], transformed[i - 1], transformed[i - 0]))
return false;
}
// three vertices remaining, so render a triangle
if (i == count)
{
if (!CullTriangle<Mode>(transformed[i - 3], transformed[i - 2], transformed[i - 1]))
return false;
}
break;
}
case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLES:
for (int i = 2; i < count; i += 3)
{
if (!CullTriangle<Mode>(transformed[i - 2], transformed[i - 1], transformed[i - 0]))
return false;
}
break;
case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_STRIP:
{
bool wind = false;
for (int i = 2; i < count; ++i)
{
if (!CullTriangle<Mode>(transformed[i - 2], transformed[i - !wind], transformed[i - wind]))
return false;
wind = !wind;
}
break;
}
case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN:
for (int i = 2; i < count; ++i)
{
if (!CullTriangle<Mode>(transformed[0], transformed[i - 1], transformed[i]))
return false;
}
break;
}
return true;
}
} // namespace VECTOR_NAMESPACE
#undef ATTR_TARGET
#undef VECTOR_NAMESPACE

View File

@ -17,7 +17,7 @@ enum class SrcBlendFactor : u32;
enum class ZTexOp : u32;
enum class LogicOp : u32;
struct PixelShaderConstants
struct alignas(16) PixelShaderConstants
{
std::array<int4, 4> colors;
std::array<int4, 4> kcolors;
@ -60,7 +60,7 @@ struct PixelShaderConstants
LogicOp logic_op_mode;
};
struct VertexShaderConstants
struct alignas(16) VertexShaderConstants
{
u32 components; // .x
u32 xfmem_dualTexInfo; // .y
@ -109,7 +109,7 @@ enum class VSExpand : u32
Line,
};
struct GeometryShaderConstants
struct alignas(16) GeometryShaderConstants
{
float4 stereoparams;
float4 lineptparams;
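
These alignas(16) additions pair with the SIMD loads introduced elsewhere in this commit: LoadTransposed in CPUCullImpl.h assumes a 16-byte aligned source, and CPUCull reads constants.projection and xfmem.posMatrices directly. A minimal compile-time check of the guarantee (editor's sketch, assuming these type names):

static_assert(alignof(VertexShaderConstants) >= 16);  // projection rows safe for aligned 128-bit loads
static_assert(alignof(XFMemory) >= 16);               // posMatrices rows likewise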

View File

@ -30,6 +30,7 @@
#include "VideoCommon/VertexLoaderBase.h"
#include "VideoCommon/VertexManagerBase.h"
#include "VideoCommon/VertexShaderManager.h"
#include "VideoCommon/VideoConfig.h"
#include "VideoCommon/XFMemory.h"
namespace VertexLoaderManager
@ -366,17 +367,33 @@ int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int coun
vertex_shader_manager.SetVertexFormat(loader->m_native_components,
loader->m_native_vertex_format->GetVertexDeclaration());
// if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads.
// They still need to go through vertex loading, because we need to calculate a zfreeze refrence
// slope.
bool cullall = (bpmem.genMode.cullmode == CullMode::All &&
primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES);
// CPUCull's performance increase comes from encoding fewer GPU commands, not sending less data
// Therefore it's only useful to check if culling could remove a flush
const bool can_cpu_cull = g_ActiveConfig.bCPUCull &&
primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES &&
!g_vertex_manager->HasSendableVertices();
DataReader dst = g_vertex_manager->PrepareForAdditionalData(
primitive, count, loader->m_native_vtx_decl.stride, cullall);
// if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads.
// They still need to go through vertex loading, because we need to calculate a zfreeze
// reference slope.
const bool cullall = (bpmem.genMode.cullmode == CullMode::All &&
primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES);
const int stride = loader->m_native_vtx_decl.stride;
DataReader dst = g_vertex_manager->PrepareForAdditionalData(primitive, count, stride,
cullall || can_cpu_cull);
count = loader->RunVertices(src, dst.GetPointer(), count);
if (can_cpu_cull && !cullall)
{
if (!g_vertex_manager->AreAllVerticesCulled(loader, primitive, dst.GetPointer(), count))
{
DataReader new_dst = g_vertex_manager->DisableCullAll(stride);
memmove(new_dst.GetPointer(), dst.GetPointer(), count * stride);
}
}
g_vertex_manager->AddIndices(primitive, count);
g_vertex_manager->FlushData(count, loader->m_native_vtx_decl.stride);

View File

@ -104,6 +104,7 @@ VertexManagerBase::~VertexManagerBase() = default;
bool VertexManagerBase::Initialize()
{
m_index_generator.Init();
m_cpu_cull.Init();
return true;
}
@ -117,6 +118,13 @@ void VertexManagerBase::AddIndices(OpcodeDecoder::Primitive primitive, u32 num_v
m_index_generator.AddIndices(primitive, num_vertices);
}
bool VertexManagerBase::AreAllVerticesCulled(VertexLoaderBase* loader,
OpcodeDecoder::Primitive primitive, const u8* src,
u32 count)
{
return m_cpu_cull.AreAllVerticesCulled(loader, primitive, src, count);
}
DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive primitive,
u32 count, u32 stride, bool cullall)
{
@ -187,6 +195,16 @@ DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive
return DataReader(m_cur_buffer_pointer, m_end_buffer_pointer);
}
DataReader VertexManagerBase::DisableCullAll(u32 stride)
{
if (m_cull_all)
{
m_cull_all = false;
ResetBuffer(stride);
}
return DataReader(m_cur_buffer_pointer, m_end_buffer_pointer);
}
void VertexManagerBase::FlushData(u32 count, u32 stride)
{
m_cur_buffer_pointer += count * stride;
@ -548,6 +566,8 @@ void VertexManagerBase::Flush()
// Now the vertices can be flushed to the GPU. Everything following the CommitBuffer() call
// must be careful to not upload any utility vertices, as the binding will be lost otherwise.
const u32 num_indices = m_index_generator.GetIndexLen();
if (num_indices == 0)
return;
u32 base_vertex, base_index;
CommitBuffer(m_index_generator.GetNumVerts(),
VertexLoaderManager::GetCurrentVertexFormat()->GetVertexStride(), num_indices,

View File

@ -9,6 +9,7 @@
#include "Common/BitSet.h"
#include "Common/CommonTypes.h"
#include "Common/MathUtil.h"
#include "VideoCommon/CPUCull.h"
#include "VideoCommon/IndexGenerator.h"
#include "VideoCommon/RenderState.h"
#include "VideoCommon/ShaderCache.h"
@ -100,11 +101,18 @@ public:
PrimitiveType GetCurrentPrimitiveType() const { return m_current_primitive_type; }
void AddIndices(OpcodeDecoder::Primitive primitive, u32 num_vertices);
bool AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive,
const u8* src, u32 count);
virtual DataReader PrepareForAdditionalData(OpcodeDecoder::Primitive primitive, u32 count,
u32 stride, bool cullall);
/// Switch cullall off after a call to PrepareForAdditionalData with cullall true
/// Expects that you will add a nonzero number of primitives before the next flush
/// Returns a DataReader at the (possibly reset) buffer write position
DataReader DisableCullAll(u32 stride);
void FlushData(u32 count, u32 stride);
void Flush();
bool HasSendableVertices() const { return !m_is_flushed && !m_cull_all; }
void DoState(PointerWrap& p);
@ -201,6 +209,7 @@ protected:
bool m_cull_all = false;
IndexGenerator m_index_generator;
CPUCull m_cpu_cull;
private:
// Minimum number of draws per command buffer when attempting to preempt a readback operation.

View File

@ -65,6 +65,97 @@ void VertexShaderManager::Dirty()
dirty = true;
}
Common::Matrix44 VertexShaderManager::LoadProjectionMatrix()
{
const auto& rawProjection = xfmem.projection.rawProjection;
switch (xfmem.projection.type)
{
case ProjectionType::Perspective:
{
const Common::Vec2 fov_multiplier = g_freelook_camera.IsActive() ?
g_freelook_camera.GetFieldOfViewMultiplier() :
Common::Vec2{1, 1};
m_projection_matrix[0] = rawProjection[0] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
m_projection_matrix[1] = 0.0f;
m_projection_matrix[2] = rawProjection[1] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
m_projection_matrix[3] = 0.0f;
m_projection_matrix[4] = 0.0f;
m_projection_matrix[5] = rawProjection[2] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
m_projection_matrix[6] = rawProjection[3] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
m_projection_matrix[7] = 0.0f;
m_projection_matrix[8] = 0.0f;
m_projection_matrix[9] = 0.0f;
m_projection_matrix[10] = rawProjection[4];
m_projection_matrix[11] = rawProjection[5];
m_projection_matrix[12] = 0.0f;
m_projection_matrix[13] = 0.0f;
m_projection_matrix[14] = -1.0f;
m_projection_matrix[15] = 0.0f;
g_stats.gproj = m_projection_matrix;
}
break;
case ProjectionType::Orthographic:
{
m_projection_matrix[0] = rawProjection[0];
m_projection_matrix[1] = 0.0f;
m_projection_matrix[2] = 0.0f;
m_projection_matrix[3] = rawProjection[1];
m_projection_matrix[4] = 0.0f;
m_projection_matrix[5] = rawProjection[2];
m_projection_matrix[6] = 0.0f;
m_projection_matrix[7] = rawProjection[3];
m_projection_matrix[8] = 0.0f;
m_projection_matrix[9] = 0.0f;
m_projection_matrix[10] = rawProjection[4];
m_projection_matrix[11] = rawProjection[5];
m_projection_matrix[12] = 0.0f;
m_projection_matrix[13] = 0.0f;
m_projection_matrix[14] = 0.0f;
m_projection_matrix[15] = 1.0f;
g_stats.g2proj = m_projection_matrix;
g_stats.proj = rawProjection;
}
break;
default:
ERROR_LOG_FMT(VIDEO, "Unknown projection type: {}", xfmem.projection.type);
}
PRIM_LOG("Projection: {} {} {} {} {} {}", rawProjection[0], rawProjection[1], rawProjection[2],
rawProjection[3], rawProjection[4], rawProjection[5]);
auto corrected_matrix = m_viewport_correction * Common::Matrix44::FromArray(m_projection_matrix);
if (g_freelook_camera.IsActive() && xfmem.projection.type == ProjectionType::Perspective)
corrected_matrix *= g_freelook_camera.GetView();
g_freelook_camera.GetController()->SetClean();
return corrected_matrix;
}
void VertexShaderManager::SetProjectionMatrix()
{
if (m_projection_changed || g_freelook_camera.GetController()->IsDirty())
{
m_projection_changed = false;
auto corrected_matrix = LoadProjectionMatrix();
memcpy(constants.projection.data(), corrected_matrix.data.data(), 4 * sizeof(float4));
}
}
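// Editor's note: this new public entry point lets CPUCull::AreAllVerticesCulled()
// refresh constants.projection on demand, outside the usual SetConstants() path
// (see the call in CPUCull.cpp above).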
// Syncs the shader constant buffers with xfmem
// TODO: A cleaner way to control the matrices without making a mess in the parameters field
void VertexShaderManager::SetConstants(const std::vector<std::string>& textures)
@ -317,84 +408,7 @@ void VertexShaderManager::SetConstants(const std::vector<std::string>& textures)
m_projection_changed = false;
m_projection_graphics_mod_change = !projection_actions.empty();
const auto& rawProjection = xfmem.projection.rawProjection;
switch (xfmem.projection.type)
{
case ProjectionType::Perspective:
{
const Common::Vec2 fov_multiplier = g_freelook_camera.IsActive() ?
g_freelook_camera.GetFieldOfViewMultiplier() :
Common::Vec2{1, 1};
m_projection_matrix[0] =
rawProjection[0] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
m_projection_matrix[1] = 0.0f;
m_projection_matrix[2] =
rawProjection[1] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x;
m_projection_matrix[3] = 0.0f;
m_projection_matrix[4] = 0.0f;
m_projection_matrix[5] =
rawProjection[2] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
m_projection_matrix[6] =
rawProjection[3] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y;
m_projection_matrix[7] = 0.0f;
m_projection_matrix[8] = 0.0f;
m_projection_matrix[9] = 0.0f;
m_projection_matrix[10] = rawProjection[4];
m_projection_matrix[11] = rawProjection[5];
m_projection_matrix[12] = 0.0f;
m_projection_matrix[13] = 0.0f;
m_projection_matrix[14] = -1.0f;
m_projection_matrix[15] = 0.0f;
g_stats.gproj = m_projection_matrix;
}
break;
case ProjectionType::Orthographic:
{
m_projection_matrix[0] = rawProjection[0];
m_projection_matrix[1] = 0.0f;
m_projection_matrix[2] = 0.0f;
m_projection_matrix[3] = rawProjection[1];
m_projection_matrix[4] = 0.0f;
m_projection_matrix[5] = rawProjection[2];
m_projection_matrix[6] = 0.0f;
m_projection_matrix[7] = rawProjection[3];
m_projection_matrix[8] = 0.0f;
m_projection_matrix[9] = 0.0f;
m_projection_matrix[10] = rawProjection[4];
m_projection_matrix[11] = rawProjection[5];
m_projection_matrix[12] = 0.0f;
m_projection_matrix[13] = 0.0f;
m_projection_matrix[14] = 0.0f;
m_projection_matrix[15] = 1.0f;
g_stats.g2proj = m_projection_matrix;
g_stats.proj = rawProjection;
}
break;
default:
ERROR_LOG_FMT(VIDEO, "Unknown projection type: {}", xfmem.projection.type);
}
PRIM_LOG("Projection: {} {} {} {} {} {}", rawProjection[0], rawProjection[1], rawProjection[2],
rawProjection[3], rawProjection[4], rawProjection[5]);
auto corrected_matrix =
m_viewport_correction * Common::Matrix44::FromArray(m_projection_matrix);
if (g_freelook_camera.IsActive() && xfmem.projection.type == ProjectionType::Perspective)
corrected_matrix *= g_freelook_camera.GetView();
auto corrected_matrix = LoadProjectionMatrix();
GraphicsModActionData::Projection projection{&corrected_matrix};
for (auto action : projection_actions)
@ -404,8 +418,6 @@ void VertexShaderManager::SetConstants(const std::vector<std::string>& textures)
memcpy(constants.projection.data(), corrected_matrix.data.data(), 4 * sizeof(float4));
g_freelook_camera.GetController()->SetClean();
dirty = true;
}

View File

@ -24,6 +24,7 @@ public:
void DoState(PointerWrap& p);
// constant management
void SetProjectionMatrix();
void SetConstants(const std::vector<std::string>& textures);
void InvalidateXFRange(int start, int end);
@ -64,4 +65,6 @@ private:
std::array<int, 2> m_minmax_lights_changed{};
Common::Matrix44 m_viewport_correction{};
Common::Matrix44 LoadProjectionMatrix();
};

View File

@ -113,6 +113,7 @@ void VideoConfig::Refresh()
iShaderCompilationMode = Config::Get(Config::GFX_SHADER_COMPILATION_MODE);
iShaderCompilerThreads = Config::Get(Config::GFX_SHADER_COMPILER_THREADS);
iShaderPrecompilerThreads = Config::Get(Config::GFX_SHADER_PRECOMPILER_THREADS);
bCPUCull = Config::Get(Config::GFX_CPU_CULL);
texture_filtering_mode = Config::Get(Config::GFX_ENHANCE_FORCE_TEXTURE_FILTERING);
iMaxAnisotropy = Config::Get(Config::GFX_ENHANCE_MAX_ANISOTROPY);

View File

@ -138,6 +138,7 @@ struct VideoConfig final
bool bPerfQueriesEnable = false;
bool bBBoxEnable = false;
bool bForceProgressive = false;
bool bCPUCull = false;
bool bEFBEmulateFormatChanges = false;
bool bSkipEFBCopyToRam = false;

View File

@ -423,7 +423,7 @@ struct Projection
ProjectionType type;
};
struct XFMemory
struct alignas(16) XFMemory
{
float posMatrices[256]; // 0x0000 - 0x00ff
u32 unk0[768]; // 0x0100 - 0x03ff