From f74dbc794c124e6f1c5d5f60ff1d6fe65acc379f Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Tue, 22 May 2018 12:14:48 +1000
Subject: [PATCH] EFB2RAM: Apply copy filter as a float coefficient after
 sampling

Using 8-bit integer math here led to precision loss for depth copies,
which broke various effects in games, e.g. lens flare in MK:DD.

It's unlikely the console implements this as a floating-point multiply
(fixed-point perhaps), but since we have the float round trip in our
EFB2RAM shaders anyway, it's not going to make things any worse. If we
do rewrite our shaders to use integer math completely, then it might be
worth switching this conversion back to integers.

However, the range of the values (format) should be known, or we should
expand all values out to 24 bits first.
---
 .../VideoBackends/D3D/PSTextureEncoder.cpp    |  4 ++--
 .../Core/VideoBackends/D3D/TextureCache.cpp   |  4 ++--
 .../Core/VideoBackends/OGL/TextureCache.cpp   |  6 ++---
 .../VideoBackends/OGL/TextureConverter.cpp    |  6 ++---
 .../VideoBackends/Vulkan/TextureCache.cpp     |  2 +-
 .../VideoBackends/Vulkan/TextureConverter.cpp |  4 ++--
 Source/Core/VideoCommon/TextureCacheBase.cpp  | 12 ++++++----
 Source/Core/VideoCommon/TextureCacheBase.h    |  2 +-
 .../VideoCommon/TextureConversionShader.cpp   | 23 ++++++++-----------
 .../VideoCommon/TextureConverterShaderGen.cpp | 10 ++++----
 10 files changed, 38 insertions(+), 35 deletions(-)
diff --git a/Source/Core/VideoBackends/D3D/PSTextureEncoder.cpp b/Source/Core/VideoBackends/D3D/PSTextureEncoder.cpp
index 22b5643416..7c6dbd4411 100644
--- a/Source/Core/VideoBackends/D3D/PSTextureEncoder.cpp
+++ b/Source/Core/VideoBackends/D3D/PSTextureEncoder.cpp
@@ -34,7 +34,7 @@ struct EFBEncodeParams
   float gamma_rcp;
   float clamp_top;
   float clamp_bottom;
-  s32 filter_coefficients[3];
+  float filter_coefficients[3];
   u32 padding;
 };
 
@@ -169,4 +169,4 @@ ID3D11PixelShader* PSTextureEncoder::GetEncodingPixelShader(const EFBCopyParams&
   m_encoding_shaders.emplace(params, newShader);
   return newShader;
 }
-}
+}  // namespace DX11
diff --git a/Source/Core/VideoBackends/D3D/TextureCache.cpp b/Source/Core/VideoBackends/D3D/TextureCache.cpp
index e34858d5a4..41ad435817 100644
--- a/Source/Core/VideoBackends/D3D/TextureCache.cpp
+++ b/Source/Core/VideoBackends/D3D/TextureCache.cpp
@@ -276,7 +276,7 @@ void TextureCache::CopyEFBToCacheEntry(TCacheEntry* entry, bool is_depth_copy,
   };
   PixelConstants constants;
   for (size_t i = 0; i < filter_coefficients.size(); i++)
-    constants.filter_coefficients[i] = filter_coefficients[i] / 64.0f;
+    constants.filter_coefficients[i] = filter_coefficients[i];
   constants.gamma_rcp = 1.0f / gamma;
   constants.clamp_top = clamp_top ? src_rect.top / float(EFB_HEIGHT) : 0.0f;
   constants.clamp_bottom = clamp_bottom ? src_rect.bottom / float(EFB_HEIGHT) : 1.0f;
@@ -315,4 +315,4 @@ TextureCache::GetEFBToTexPixelShader(const TextureConversionShaderGen::TCShaderU
   m_efb_to_tex_pixel_shaders.emplace(uid, shader);
   return shader;
 }
-}
+}  // namespace DX11
diff --git a/Source/Core/VideoBackends/OGL/TextureCache.cpp b/Source/Core/VideoBackends/OGL/TextureCache.cpp
index 7f1c084892..7ca73c6d6b 100644
--- a/Source/Core/VideoBackends/OGL/TextureCache.cpp
+++ b/Source/Core/VideoBackends/OGL/TextureCache.cpp
@@ -558,12 +558,12 @@ void TextureCache::CopyEFBToCacheEntry(TCacheEntry* entry, bool is_depth_copy,
   glUniform2f(shader.clamp_tb_uniform,
               clamp_bottom ? (1.0f - src_rect.bottom / static_cast<float>(EFB_HEIGHT)) : 0.0f,
               clamp_top ? (1.0f - src_rect.top / static_cast<float>(EFB_HEIGHT)) : 1.0f);
-  glUniform3f(shader.filter_coefficients_uniform, filter_coefficients[0] / 64.0f,
-              filter_coefficients[1] / 64.0f, filter_coefficients[2] / 64.0f);
+  glUniform3f(shader.filter_coefficients_uniform, filter_coefficients[0], filter_coefficients[1],
+              filter_coefficients[2]);
 
   ProgramShaderCache::BindVertexFormat(nullptr);
   glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
 
   g_renderer->RestoreAPIState();
 }
-}
+}  // namespace OGL
diff --git a/Source/Core/VideoBackends/OGL/TextureConverter.cpp b/Source/Core/VideoBackends/OGL/TextureConverter.cpp
index 615cc9e1c7..699e988196 100644
--- a/Source/Core/VideoBackends/OGL/TextureConverter.cpp
+++ b/Source/Core/VideoBackends/OGL/TextureConverter.cpp
@@ -50,7 +50,7 @@ std::unique_ptr<AbstractStagingTexture> s_encoding_readback_texture;
 
 const int renderBufferWidth = EFB_WIDTH * 4;
 const int renderBufferHeight = 1024;
-}
+}  // namespace
 
 static EncodingProgram& GetOrCreateEncodingShader(const EFBCopyParams& params)
 {
@@ -158,7 +158,7 @@ void EncodeToRamFromTexture(u8* dest_ptr, const EFBCopyParams& params, u32 nativ
   glUniform1f(texconv_shader.y_scale_uniform, y_scale);
   glUniform1f(texconv_shader.gamma_rcp_uniform, 1.0f / gamma);
   glUniform2f(texconv_shader.clamp_tb_uniform, clamp_top, clamp_bottom);
-  glUniform3i(texconv_shader.filter_coefficients_uniform, filter_coefficients[0],
+  glUniform3f(texconv_shader.filter_coefficients_uniform, filter_coefficients[0],
               filter_coefficients[1], filter_coefficients[2]);
 
   const GLuint read_texture = params.depth ?
@@ -171,6 +171,6 @@ void EncodeToRamFromTexture(u8* dest_ptr, const EFBCopyParams& params, u32 nativ
   g_renderer->RestoreAPIState();
 }
 
-}  // namespace
+}  // namespace TextureConverter
 
 }  // namespace OGL
diff --git a/Source/Core/VideoBackends/Vulkan/TextureCache.cpp b/Source/Core/VideoBackends/Vulkan/TextureCache.cpp
index 364569baa3..334b3f9b1a 100644
--- a/Source/Core/VideoBackends/Vulkan/TextureCache.cpp
+++ b/Source/Core/VideoBackends/Vulkan/TextureCache.cpp
@@ -244,7 +244,7 @@ void TextureCache::CopyEFBToCacheEntry(TCacheEntry* entry, bool is_depth_copy,
   };
   PixelUniforms uniforms;
   for (size_t i = 0; i < filter_coefficients.size(); i++)
-    uniforms.filter_coefficients[i] = filter_coefficients[i] / 64.0f;
+    uniforms.filter_coefficients[i] = filter_coefficients[i];
   uniforms.gamma_rcp = 1.0f / gamma;
   uniforms.clamp_top = clamp_top ? src_rect.top / float(EFB_HEIGHT) : 0.0f;
   uniforms.clamp_bottom = clamp_bottom ? src_rect.bottom / float(EFB_HEIGHT) : 1.0f;
diff --git a/Source/Core/VideoBackends/Vulkan/TextureConverter.cpp b/Source/Core/VideoBackends/Vulkan/TextureConverter.cpp
index 496a600f7b..da40b08667 100644
--- a/Source/Core/VideoBackends/Vulkan/TextureConverter.cpp
+++ b/Source/Core/VideoBackends/Vulkan/TextureConverter.cpp
@@ -41,10 +41,10 @@ struct EFBEncodeParams
   float gamma_rcp;
   float clamp_top;
   float clamp_bottom;
-  s32 filter_coefficients[3];
+  float filter_coefficients[3];
   u32 padding;
 };
-}
+}  // namespace
 TextureConverter::TextureConverter()
 {
 }
diff --git a/Source/Core/VideoCommon/TextureCacheBase.cpp b/Source/Core/VideoCommon/TextureCacheBase.cpp
index 48518911bd..c1e112e69a 100644
--- a/Source/Core/VideoCommon/TextureCacheBase.cpp
+++ b/Source/Core/VideoCommon/TextureCacheBase.cpp
@@ -1506,10 +1506,14 @@ TextureCacheBase::CopyFilterCoefficientArray TextureCacheBase::GetRAMCopyFilterC
 {
   // To simplify the backend, we precalculate the three coefficients in common. Coefficients 0, 1
   // are for the row above, 2, 3, 4 are for the current pixel, and 5, 6 are for the row below.
-  return {static_cast<u32>(coefficients[0]) + static_cast<u32>(coefficients[1]),
-          static_cast<u32>(coefficients[2]) + static_cast<u32>(coefficients[3]) +
-              static_cast<u32>(coefficients[4]),
-          static_cast<u32>(coefficients[5]) + static_cast<u32>(coefficients[6])};
+  return {
+      static_cast<float>(static_cast<u32>(coefficients[0]) + static_cast<u32>(coefficients[1])) /
+          64.0f,
+      static_cast<float>(static_cast<u32>(coefficients[2]) + static_cast<u32>(coefficients[3]) +
+                         static_cast<u32>(coefficients[4])) /
+          64.0f,
+      static_cast<float>(static_cast<u32>(coefficients[5]) + static_cast<u32>(coefficients[6])) /
+          64.0f};
 }
 
 TextureCacheBase::CopyFilterCoefficientArray TextureCacheBase::GetVRAMCopyFilterCoefficients(
diff --git a/Source/Core/VideoCommon/TextureCacheBase.h b/Source/Core/VideoCommon/TextureCacheBase.h
index 68477a4a37..c4a17efc88 100644
--- a/Source/Core/VideoCommon/TextureCacheBase.h
+++ b/Source/Core/VideoCommon/TextureCacheBase.h
@@ -109,7 +109,7 @@ private:
 
 public:
   // Reduced version of the full coefficient array, reduced to a single value for each row.
-  using CopyFilterCoefficientArray = std::array<u32, 3>;
+  using CopyFilterCoefficientArray = std::array<float, 3>;
 
   struct TCacheEntry
   {
diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp
index 8a2588908c..e57cbdf603 100644
--- a/Source/Core/VideoCommon/TextureConversionShader.cpp
+++ b/Source/Core/VideoCommon/TextureConversionShader.cpp
@@ -67,7 +67,7 @@ static void WriteHeader(char*& p, APIType ApiType)
     WRITE(p, "uniform float y_scale;\n");
     WRITE(p, "uniform float gamma_rcp;\n");
     WRITE(p, "uniform float2 clamp_tb;\n");
-    WRITE(p, "uniform int3 filter_coefficients;\n");
+    WRITE(p, "uniform float3 filter_coefficients;\n");
     WRITE(p, "#define samp0 samp9\n");
     WRITE(p, "SAMPLER_BINDING(9) uniform sampler2DArray samp0;\n");
     WRITE(p, "FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n");
@@ -79,7 +79,7 @@ static void WriteHeader(char*& p, APIType ApiType)
     WRITE(p, "  float y_scale;\n");
     WRITE(p, "  float gamma_rcp;\n");
     WRITE(p, "  float2 clamp_tb;\n");
-    WRITE(p, "  int3 filter_coefficients;\n");
+    WRITE(p, "  float3 filter_coefficients;\n");
     WRITE(p, "};\n");
     WRITE(p, "SAMPLER_BINDING(0) uniform sampler2DArray samp0;\n");
     WRITE(p, "FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n");
@@ -91,7 +91,7 @@ static void WriteHeader(char*& p, APIType ApiType)
     WRITE(p, "  float y_scale;\n");
     WRITE(p, "  float gamma_rcp;\n");
     WRITE(p, "  float2 clamp_tb;\n");
-    WRITE(p, "  int3 filter_coefficients;\n");
+    WRITE(p, "  float3 filter_coefficients;\n");
     WRITE(p, "};\n");
     WRITE(p, "sampler samp0 : register(s0);\n");
     WRITE(p, "Texture2DArray Tex0 : register(t0);\n");
@@ -191,21 +191,18 @@ static void WriteSampleFunction(char*& p, const EFBCopyParams& params, APIType A
     WRITE(p, "  float4 next_row = ");
     WriteSampleOp(1);
     WRITE(p, ";\n");
-    WRITE(
-        p,
-        "  float3 col = float3(clamp((int3(prev_row.rgb * 255.0) * filter_coefficients[0] +\n"
-        "                             int3(current_row.rgb * 255.0) * filter_coefficients[1] +\n"
-        "                             int3(next_row.rgb * 255.0) * filter_coefficients[2]) >> 6,\n"
-        "                            int3(0, 0, 0), int3(255, 255, 255))) / 255.0;\n");
-    WRITE(p, "  return float4(col, current_row.a);\n");
+    WRITE(p, "  return float4(min(prev_row.rgb * filter_coefficients[0] +\n"
+             "                      current_row.rgb * filter_coefficients[1] +\n"
+             "                      next_row.rgb * filter_coefficients[2], \n"
+             "                    float3(1, 1, 1)), current_row.a);\n");
   }
   else
   {
     WRITE(p, "  float4 current_row = ");
     WriteSampleOp(0);
     WRITE(p, ";\n");
-    WRITE(p, "  return float4(clamp(int3(current_row.rgb * 255.0) * filter_coefficients[1], "
-             "int3(0, 0, 0), int3(255, 255, 255)), current_row.a);\n");
+    WRITE(p, "return float4(min(current_row.rgb * filter_coefficients[1], float3(1, 1, 1)),\n"
+             "              current_row.a);\n");
   }
   WRITE(p, "}\n");
 }
@@ -1422,4 +1419,4 @@ std::string GenerateDecodingShader(TextureFormat format, TLUTFormat palette_form
   return ss.str();
 }
 
-}  // namespace
+}  // namespace TextureConversionShaderTiled
diff --git a/Source/Core/VideoCommon/TextureConverterShaderGen.cpp b/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
index 748afa29b6..5cfb3e7c76 100644
--- a/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
+++ b/Source/Core/VideoCommon/TextureConverterShaderGen.cpp
@@ -97,15 +97,17 @@ ShaderCode GenerateShader(APIType api_type, const UidData* uid_data)
     out.Write("  float4 prev_row = SampleEFB(uv0, -1.0f);\n"
               "  float4 current_row = SampleEFB(uv0, 0.0f);\n"
               "  float4 next_row = SampleEFB(uv0, 1.0f);\n"
-              "  float4 texcol = float4(prev_row.rgb * filter_coefficients[0] +\n"
-              "                         current_row.rgb * filter_coefficients[1] +\n"
-              "                         next_row.rgb * filter_coefficients[2], current_row.a);\n");
+              "  float4 texcol = float4(min(prev_row.rgb * filter_coefficients[0] +\n"
+              "                               current_row.rgb * filter_coefficients[1] +\n"
+              "                               next_row.rgb * filter_coefficients[2], \n"
+              "                             float3(1, 1, 1)), current_row.a);\n");
   }
   else
   {
     out.Write(
         "  float4 current_row = SampleEFB(uv0, 0.0f);\n"
-        "  float4 texcol = float4(current_row.rgb * filter_coefficients[1], current_row.a);\n");
+        "  float4 texcol = float4(min(current_row.rgb * filter_coefficients[1], float3(1, 1, 1)),\n"
+        "                         current_row.a);\n");
   }
 
   if (uid_data->is_depth_copy)