From 60d2d42525be71a2a69c579cf37dda82aab86153 Mon Sep 17 00:00:00 2001
From: nitsuja- <nitsuja-@hotmail.com>
Date: Sun, 19 Dec 2010 21:05:41 +0000
Subject: [PATCH] made texture conversion shaders use fewer instructions

This is mainly so that they fit within the ps_2_0 instruction limits. I kept this change separate from my previous check-in so that it can be tested on its own in case it somehow causes problems.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6619 8ced0084-cf51-0410-be5f-012b33b47a6e
---
 .../Src/TextureConversionShader.cpp           | 88 ++++++++++++-------
 1 file changed, 56 insertions(+), 32 deletions(-)

diff --git a/Source/Core/VideoCommon/Src/TextureConversionShader.cpp b/Source/Core/VideoCommon/Src/TextureConversionShader.cpp
index 09c3059d17..c2aff1ab1e 100644
--- a/Source/Core/VideoCommon/Src/TextureConversionShader.cpp
+++ b/Source/Core/VideoCommon/Src/TextureConversionShader.cpp
@@ -30,6 +30,7 @@
 
 static char text[16384];
 static bool IntensityConstantAdded =  false;
+static int s_incrementSampleXCount = 0;
 
 namespace TextureConversionShader
 {
@@ -204,14 +205,25 @@ void Write32BitSwizzler(char*& p, u32 format, API_TYPE ApiType)
 	}	
 }
 
-void WriteSampleColor(char*& p, const char* colorComp, const char* dest,API_TYPE ApiType)
+void WriteSampleColor(char*& p, const char* colorComp, const char* dest, API_TYPE ApiType)
 {
+	const char* texSampleOpName;
 	if (ApiType == API_D3D9)
-		WRITE(p, "  %s = tex2D(samp0, sampleUv).%s;\n", dest, colorComp);
+		texSampleOpName = "tex2D";
 	else if (ApiType == API_D3D11)
-		WRITE(p, "  %s = tex0.Sample(samp0, sampleUv).%s;\n", dest, colorComp);
+		texSampleOpName = "tex0.Sample";
 	else
-		WRITE(p, "  %s = texRECT(samp0, sampleUv).%s;\n", dest, colorComp);	
+		texSampleOpName = "texRECT";
+
+	// The increment of sampleUv.x is deferred (see WriteIncrementSampleX), so the accumulated offset is applied here.
+	const char* texSampleIncrementUnit;
+	if(ApiType != API_OPENGL)
+		texSampleIncrementUnit = "blkDims.x / blkDims.z";
+	else
+		texSampleIncrementUnit = "blkDims.x";
+
+	WRITE(p, "  %s = %s(samp0, sampleUv + float2(%d * (%s), 0)).%s;\n",
+		dest, texSampleOpName, s_incrementSampleXCount, texSampleIncrementUnit, colorComp);
 }
 
 void WriteColorToIntensity(char*& p, const char* src, const char* dest)
@@ -221,15 +233,27 @@ void WriteColorToIntensity(char*& p, const char* src, const char* dest)
 		WRITE(p, "  float4 IntensityConst = float4(0.257f,0.504f,0.098f,0.0625f);\n");
 		IntensityConstantAdded = true;
 	}
-	WRITE(p, "  %s = dot(IntensityConst.rgb, %s.rgb) + IntensityConst.a;\n", dest, src);
+	WRITE(p, "  %s = dot(IntensityConst.rgb, %s.rgb);\n", dest, src);
+	// Don't add IntensityConst.a here; the encoders add it afterwards to the whole output vector, which is faster and uses fewer instructions due to vectorization.
 }
 
 void WriteIncrementSampleX(char*& p,API_TYPE ApiType)
 {
-	if(ApiType != API_OPENGL)
-		WRITE(p, "  sampleUv.x = sampleUv.x + blkDims.x / blkDims.z;\n");
-	else
-		WRITE(p, "  sampleUv.x = sampleUv.x + blkDims.x;\n");
+	// the shader compiler apparently isn't smart or aggressive enough to recognize that:
+	//    foo1 = lookup(x)
+	//    x = x + increment;
+	//    foo2 = lookup(x)
+	//    x = x + increment;
+	//    foo3 = lookup(x)
+	// can be replaced with this:
+	//    foo1 = lookup(x + 0.0 * increment)
+	//    foo2 = lookup(x + 1.0 * increment)
+	//    foo3 = lookup(x + 2.0 * increment)
+	// which looks like the same operations but uses considerably fewer ALU instruction slots.
+	// thus, instead of using the former method, we only increment a counter internally here,
+	// and we wait until WriteSampleColor to write out the constant multiplier
+	// to achieve the increment as in the latter case.
+	s_incrementSampleXCount++;
 }
 
 void WriteToBitDepth(char*& p, u8 depth, const char* src, const char* dest)
@@ -242,6 +266,7 @@ void WriteEncoderEnd(char* p)
 {
 	WRITE(p, "}\n");
 	IntensityConstantAdded = false;
+	s_incrementSampleXCount = 0;
 }
 
 void WriteI8Encoder(char* p, API_TYPE ApiType)
@@ -264,6 +289,8 @@ void WriteI8Encoder(char* p, API_TYPE ApiType)
 	WriteSampleColor(p, "rgb", "texSample",ApiType);
 	WriteColorToIntensity(p, "texSample", "ocol0.a");
 
+	WRITE(p, "  ocol0.rgba += IntensityConst.aaaa;\n"); // see WriteColorToIntensity
+
 	WriteEncoderEnd(p);
 }
 
@@ -305,6 +332,9 @@ void WriteI4Encoder(char* p, API_TYPE ApiType)
 	WriteSampleColor(p, "rgb", "texSample",ApiType);
 	WriteColorToIntensity(p, "texSample", "color1.a");
 
+	WRITE(p, "  color0.rgba += IntensityConst.aaaa;\n");
+	WRITE(p, "  color1.rgba += IntensityConst.aaaa;\n");
+
 	WriteToBitDepth(p, 4, "color0", "color0");
 	WriteToBitDepth(p, 4, "color1", "color1");
 
@@ -326,6 +356,8 @@ void WriteIA8Encoder(char* p,API_TYPE ApiType)
 	WRITE(p, "  ocol0.r = texSample.a;\n");
 	WriteColorToIntensity(p, "texSample", "ocol0.a");
 
+	WRITE(p, "  ocol0.ga += IntensityConst.aa;\n");
+
 	WriteEncoderEnd(p);
 }
 
@@ -355,6 +387,8 @@ void WriteIA4Encoder(char* p,API_TYPE ApiType)
 	WRITE(p, "  color0.a = texSample.a;\n");
 	WriteColorToIntensity(p, "texSample", "color1.a");
 
+	WRITE(p, "  color1.rgba += IntensityConst.aaaa;\n");
+
 	WriteToBitDepth(p, 4, "color0", "color0");
 	WriteToBitDepth(p, 4, "color1", "color1");
 
@@ -366,32 +400,22 @@ void WriteRGB565Encoder(char* p,API_TYPE ApiType)
 {
 	WriteSwizzler(p, GX_TF_RGB565,ApiType);
 
-	WRITE(p, "  float3 texSample;\n");
-	WRITE(p, "  float gInt;\n");
-	WRITE(p, "  float gUpper;\n");
-	WRITE(p, "  float gLower;\n");
-
-	WriteSampleColor(p, "rgb", "texSample",ApiType);
-	WriteToBitDepth(p, 6, "texSample.g", "gInt");
-	WRITE(p, "  gUpper = floor(gInt / 8.0f);\n");
-	WRITE(p, "  gLower = gInt - gUpper * 8.0f;\n");
-
-	WriteToBitDepth(p, 5, "texSample.r", "ocol0.b");
-	WRITE(p, "  ocol0.b = ocol0.b * 8.0f + gUpper;\n");
-	WriteToBitDepth(p, 5, "texSample.b", "ocol0.g");
-	WRITE(p, "  ocol0.g = ocol0.g + gLower * 32.0f;\n");
-
+	WriteSampleColor(p, "rgb", "float3 texSample0",ApiType);
 	WriteIncrementSampleX(p,ApiType);
+	WriteSampleColor(p, "rgb", "float3 texSample1",ApiType);
 
-	WriteSampleColor(p, "rgb", "texSample",ApiType);
-	WriteToBitDepth(p, 6, "texSample.g", "gInt");
-	WRITE(p, "  gUpper = floor(gInt / 8.0f);\n");
-	WRITE(p, "  gLower = gInt - gUpper * 8.0f;\n");
+	WRITE(p, "  float2 texRs = {texSample0.r, texSample1.r};\n");
+	WRITE(p, "  float2 texGs = {texSample0.g, texSample1.g};\n");
+	WRITE(p, "  float2 texBs = {texSample0.b, texSample1.b};\n");
+  
+	WriteToBitDepth(p, 6, "texGs", "float2 gInt");
+	WRITE(p, "  float2 gUpper = floor(gInt / 8.0f);\n");
+	WRITE(p, "  float2 gLower = gInt - gUpper * 8.0f;\n");
 
-	WriteToBitDepth(p, 5, "texSample.r", "ocol0.r");
-	WRITE(p, "  ocol0.r = ocol0.r * 8.0f + gUpper;\n");
-	WriteToBitDepth(p, 5, "texSample.b", "ocol0.a");
-	WRITE(p, "  ocol0.a = ocol0.a + gLower * 32.0f;\n");
+	WriteToBitDepth(p, 5, "texRs", "ocol0.br");
+	WRITE(p, "  ocol0.br = ocol0.br * 8.0f + gUpper;\n");
+	WriteToBitDepth(p, 5, "texBs", "ocol0.ga");
+	WRITE(p, "  ocol0.ga = ocol0.ga + gLower * 32.0f;\n");
 
 	WRITE(p, "  ocol0 = ocol0 / 255.0f;\n");
 	WriteEncoderEnd(p);