Merge pull request #13143 from TellowKrinkle/PrimBreaking

VideoCommon: Implement primitive breaking for primitive lists
2025-07-21 05:09:34 -06:00 · 2024-10-30 12:15:11 -04:00
parent b9a99a7294 5ef4fcb005
commit 1ba3a141a2
1 changed file with 43 additions and 16 deletions
--- a/Source/Core/VideoCommon/VertexLoaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp
@@ -376,6 +376,22 @@ static void CheckCPConfiguration(int vtx_attr_group)
  }
 }

+// Reports whether a draw of the given primitive type may be cut into several smaller draws.
+// Only the easy cases are implemented so far: the list primitives (quads, triangles, lines
+// and points). Every other primitive type is reported as unsplittable.
+static bool CanSplit(OpcodeDecoder::Primitive primitive)
+{
+  using Primitive = OpcodeDecoder::Primitive;
+
+  // Both quad opcodes are accepted.
+  const bool is_quad_list =
+      primitive == Primitive::GX_DRAW_QUADS || primitive == Primitive::GX_DRAW_QUADS_2;
+  const bool is_other_list = primitive == Primitive::GX_DRAW_TRIANGLES ||
+                             primitive == Primitive::GX_DRAW_LINES ||
+                             primitive == Primitive::GX_DRAW_POINTS;
+  return is_quad_list || is_other_list;
+}
+
 template <bool IsPreprocess>
 int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int count, const u8* src)
 {
@@ -414,7 +430,7 @@ int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int coun

    // CPUCull's performance increase comes from encoding fewer GPU commands, not sending less data
    // Therefore it's only useful to check if culling could remove a flush
-    const bool can_cpu_cull = g_ActiveConfig.bCPUCull &&
+    bool can_cpu_cull = g_ActiveConfig.bCPUCull &&
                        primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES &&
                        !g_vertex_manager->HasSendableVertices();

@@ -425,24 +441,35 @@ int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int coun
                          primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES);

    const int stride = loader->m_native_vtx_decl.stride;
-    DataReader dst = g_vertex_manager->PrepareForAdditionalData(primitive, count, stride,
+    do
+    {
+      const int max_vertices = 16380;  // Max is 16383, but 16380 is divisible by both 4 and 3
+      const int run = CanSplit(primitive) && count > max_vertices ? max_vertices : count;
+      count -= run;
+      DataReader dst = g_vertex_manager->PrepareForAdditionalData(primitive, run, stride,
                                                                  cullall || can_cpu_cull);

-    count = loader->RunVertices(src, dst.GetPointer(), count);
+      const int num_loaded = loader->RunVertices(src, dst.GetPointer(), run);
+      src += loader->m_vertex_size * max_vertices;

      if (can_cpu_cull && !cullall)
      {
-      if (!g_vertex_manager->AreAllVerticesCulled(loader, primitive, dst.GetPointer(), count))
+        const bool all_culled =
+            g_vertex_manager->AreAllVerticesCulled(loader, primitive, dst.GetPointer(), num_loaded);
+        if (!all_culled)
        {
          DataReader new_dst = g_vertex_manager->DisableCullAll(stride);
-        memmove(new_dst.GetPointer(), dst.GetPointer(), count * stride);
+          memmove(new_dst.GetPointer(), dst.GetPointer(), num_loaded * stride);
+          can_cpu_cull = false;
        }
      }

-    g_vertex_manager->AddIndices(primitive, count);
-    g_vertex_manager->FlushData(count, loader->m_native_vtx_decl.stride);
+      g_vertex_manager->AddIndices(primitive, num_loaded);
+      g_vertex_manager->FlushData(num_loaded, stride);
+
+      ADDSTAT(g_stats.this_frame.num_prims, num_loaded);
+    } while (count);

-    ADDSTAT(g_stats.this_frame.num_prims, count);
    INCSTAT(g_stats.this_frame.num_primitive_joins);
  }
  return size;