Some optimization on the pixel and vertex shader generators. As tested on Zelda TP, these changes should be able to reduce the number of lines in the compiled pixel shader program by 20% to 30%, for example, from 38 lines to 28 lines after the generated cg/HLSL pixel shader compiled by the cg/HLSL compiler. These could means slightly faster rasterization on slower video cards.

Also fixed shader compilation errors for DX9 plugin by using correct pixel/vertex attribute names that are compatibile with DX9 HLSL shader compiler. Now the generated vertex or pixel shader programs will compile correctly in either OGL or DX9. However, DX9 plugin is still not fixed, even though the shader programs can be compiled now. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@4113 8ced0084-cf51-0410-be5f-012b33b47a6e
2025-07-25 23:29:44 -06:00 · 2009-08-31 04:23:30 +00:00
parent 63786d5438
commit 718c88ff6a
5 changed files with 104 additions and 53 deletions
--- a/Source/Core/VideoCommon/Src/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/Src/PixelShaderGen.cpp
@ -140,8 +140,8 @@ void GetPixelShaderId(PIXELSHADERUID &uid, u32 s_texturemask, u32 dstAlphaEnable
 //   output is given by .outreg
 //   tevtemp is set according to swapmodetables and 

-static void WriteStage(char *&p, int n, u32 texture_mask);
-static void SampleTexture(char *&p, const char *destination, const char *texcoords, const char *texswap, int texmap, u32 texture_mask);
+static void WriteStage(char *&p, int n, u32 texture_mask, bool HLSL);
+static void SampleTexture(char *&p, const char *destination, const char *texcoords, const char *texswap, int texmap, u32 texture_mask, bool HLSL);
 static void WriteAlphaCompare(char *&p, int num, int comp);
 static bool WriteAlphaTest(char *&p, bool HLSL);
 static void WriteFog(char *&p);
@ -405,7 +405,10 @@ const char *GeneratePixelShader(u32 texture_mask, bool dstAlphaEnable, bool HLSL
    // Declare samplers
    if (texture_mask)
 	{
-        WRITE(p, "uniform samplerRECT ");
+		if (HLSL)
+			WRITE(p, "uniform sampler ");
+		else
+			WRITE(p, "uniform samplerRECT ");
        bool bfirst = true;
        for (int i = 0; i < 8; ++i)
            if (texture_mask & (1<<i))
@ -460,15 +463,15 @@ const char *GeneratePixelShader(u32 texture_mask, bool dstAlphaEnable, bool HLSL
 			WRITE(p, "  in float%d uv%d : TEXCOORD%d, \n", i<4?4:3, i, i);
 	}

-	WRITE(p, "  in float4 colors_0 : COLOR0, in float4 colors_1 : COLOR1){\n");
+	WRITE(p, "  in float4 colors_0 : COLOR0,\n in float4 colors_1 : COLOR1){\n");

    char* pmainstart = p;

-    WRITE(p, "float4 c0="I_COLORS"[1],c1="I_COLORS"[2],c2="I_COLORS"[3],prev=float4(0.0f,0.0f,0.0f,0.0f),textemp,rastemp,konsttemp=float4(0.0f,0.0f,0.0f,0.0f);\n"
-            "float3 comp16 = float3(1,255,0), comp24 = float3(1,255,255*255);\n"
-            "float4 alphabump=0;\n"
-            "float3 tevcoord;\n"
-            "float2 wrappedcoord, tempcoord;\n\n");
+    WRITE(p, "  float4 c0="I_COLORS"[1],c1="I_COLORS"[2],c2="I_COLORS"[3],prev=float4(0.0f,0.0f,0.0f,0.0f),textemp,rastemp,konsttemp=float4(0.0f,0.0f,0.0f,0.0f);\n"
+            "  float3 comp16 = float3(1,255,0), comp24 = float3(1,255,255*255);\n"
+            "  float4 alphabump=0;\n"
+            "  float3 tevcoord;\n"
+            "  float2 wrappedcoord, tempcoord;\n\n");

    for (int i = 0; i < numTexgen; ++i) 
 	{
@ -494,7 +497,7 @@ const char *GeneratePixelShader(u32 texture_mask, bool dstAlphaEnable, bool HLSL

            char buffer[32];
            sprintf(buffer, "float3 indtex%d", i);
-            SampleTexture(p, buffer, "tempcoord", "abg", bpmem.tevindref.getTexMap(i), texture_mask);
+            SampleTexture(p, buffer, "tempcoord", "abg", bpmem.tevindref.getTexMap(i), texture_mask,HLSL);
        }
    }

@ -503,7 +506,7 @@ const char *GeneratePixelShader(u32 texture_mask, bool dstAlphaEnable, bool HLSL
        WRITE(p, "float3 uv0 = float3(0.0f,0.0f,0.0f);\n");

    for (int i = 0; i < numStages; i++)
-        WriteStage(p, i, texture_mask); //build the equation for this stage
+        WriteStage(p, i, texture_mask,HLSL); //build the equation for this stage

 	if (numTexgen >= 7)
 		WRITE(p, "float4 clipPos = float4(uv0.w, uv1.w, uv2.w, uv3.w);\n");
@ -547,7 +550,7 @@ const char *GeneratePixelShader(u32 texture_mask, bool dstAlphaEnable, bool HLSL
    return text;
 }

-static void WriteStage(char *&p, int n, u32 texture_mask)
+static void WriteStage(char *&p, int n, u32 texture_mask, bool HLSL)
 {
    char *rasswap = swapModeTable[bpmem.combiners[n].alphaC.rswap];
    char *texswap = swapModeTable[bpmem.combiners[n].alphaC.tswap];
@ -656,7 +659,7 @@ static void WriteStage(char *&p, int n, u32 texture_mask)
                WRITE(p, "tevcoord.xy = float2(0.0f,0.0f);\n");
        }

-        SampleTexture(p, "textemp", "tevcoord", texswap, texmap, texture_mask);
+        SampleTexture(p, "textemp", "tevcoord", texswap, texmap, texture_mask, HLSL);
    }
    else
        WRITE(p, "textemp=float4(1,1,1,1);\n");
@ -672,16 +675,35 @@ static void WriteStage(char *&p, int n, u32 texture_mask)
    if (bCKonst || bAKonst )
        WRITE(p, "konsttemp=float4(%s,%s);\n",tevKSelTableC[kc],tevKSelTableA[ka]);  

-    WRITE(p, "%s= ", tevCOutputTable[cc.dest]); 
+    if (cc.clamp)
+		WRITE(p, "%s= saturate(", tevCOutputTable[cc.dest]);
+	else
+		WRITE(p, "%s= (", tevCOutputTable[cc.dest]);

    // combine the color channel
    if (cc.bias != 3) // if not compare
 	{
        //normal color combiner goes here
-        WRITE(p, "   %s*(%s%s",tevScaleTable[cc.shift],tevCInputTable[cc.d],tevOpTable[cc.op]);
-        WRITE(p, "lerp(%s,%s,%s) %s);\n",
-              tevCInputTable[cc.a], tevCInputTable[cc.b],
-              tevCInputTable[cc.c], tevBiasTable[cc.bias]);
+		if (cc.shift>0)
+			WRITE(p, "   %s*(%s%s",tevScaleTable[cc.shift],tevCInputTable[cc.d],tevOpTable[cc.op]);
+		else
+			WRITE(p, "   (%s%s",tevCInputTable[cc.d],tevOpTable[cc.op]);
+
+		if (cc.a == 15 && cc.b == 15)
+			WRITE(p, "0");
+		else if (cc.a == 15 && cc.c == 15)
+			WRITE(p, "0");
+		else if (cc.b == 15 && cc.c == 15)
+			WRITE(p,"%s",tevCInputTable[cc.a]);
+		else if (cc.a == 15)
+			WRITE(p,"(%s)*(%s)",tevCInputTable[cc.b],tevCInputTable[cc.c]);
+		else if (cc.b == 15)
+			WRITE(p,"(%s)*(1-%s)",tevCInputTable[cc.a],tevCInputTable[cc.c]);
+		else if (cc.c == 15)
+			WRITE(p,"%s",tevCInputTable[cc.a]);
+		else
+			WRITE(p, "lerp(%s,%s,%s)",tevCInputTable[cc.a], tevCInputTable[cc.b],tevCInputTable[cc.c]);
+		WRITE(p, " %s)",tevBiasTable[cc.bias]);
    }
    else 
 	{
@ -690,44 +712,62 @@ static void WriteStage(char *&p, int n, u32 texture_mask)
 		{
        case TEVCMP_R8_GT:
        case TEVCMP_RGB8_GT: // per component compares
-            WRITE(p, "   %s + ((%s.%s > %s.%s) ? %s : float3(0.0f,0.0f,0.0f));\n",
+            WRITE(p, "   %s + ((%s.%s > %s.%s) ? %s : float3(0.0f,0.0f,0.0f))",
                tevCInputTable[cc.d], tevCInputTable2[cc.a], cmp==TEVCMP_R8_GT?"r":"rgb", tevCInputTable2[cc.b], cmp==TEVCMP_R8_GT?"r":"rgb", tevCInputTable[cc.c]);
            break;
        case TEVCMP_R8_EQ:
        case TEVCMP_RGB8_EQ:
-            WRITE(p, "   %s + (abs(%s.r - %s.r)<%f ? %s : float3(0.0f,0.0f,0.0f));\n",
+            WRITE(p, "   %s + (abs(%s.r - %s.r)<%f ? %s : float3(0.0f,0.0f,0.0f))n",
                tevCInputTable[cc.d], tevCInputTable2[cc.a], tevCInputTable2[cc.b], epsilon8bit, tevCInputTable[cc.c]);
            break;
        
        case TEVCMP_GR16_GT: // 16 bit compares: 255*g+r (probably used for ztextures, so make sure in ztextures, g is the most significant byte)
        case TEVCMP_BGR24_GT: // 24 bit compares: 255*255*b+255*g+r
-            WRITE(p, "   %s + (( dot(%s.rgb-%s.rgb, comp%s) > 0) ? %s : float3(0.0f,0.0f,0.0f));\n",
+            WRITE(p, "   %s + (( dot(%s.rgb-%s.rgb, comp%s) > 0) ? %s : float3(0.0f,0.0f,0.0f))",
                tevCInputTable[cc.d], tevCInputTable2[cc.a], tevCInputTable2[cc.b], cmp==TEVCMP_GR16_GT?"16":"24", tevCInputTable[cc.c]);
            break;
        case TEVCMP_GR16_EQ:
        case TEVCMP_BGR24_EQ:
-            WRITE(p, "   %s + (abs(dot(%s.rgb - %s.rgb, comp%s))<%f ? %s : float3(0.0f,0.0f,0.0f));\n",
+            WRITE(p, "   %s + (abs(dot(%s.rgb - %s.rgb, comp%s))<%f ? %s : float3(0.0f,0.0f,0.0f))",
                tevCInputTable[cc.d], tevCInputTable2[cc.a], tevCInputTable2[cc.b], cmp==TEVCMP_GR16_EQ?"16":"24", epsilon8bit, tevCInputTable[cc.c]);
            break;
        default:
-            WRITE(p, "float3(0.0f,0.0f,0.0f);\n");
+            WRITE(p, "float3(0.0f,0.0f,0.0f)");
            break;
        }
    }
-    
-    if (cc.clamp)
-        WRITE(p, "%s = clamp(%s,0.0f,1.0f);\n", tevCOutputTable[cc.dest],tevCOutputTable[cc.dest]);

+	WRITE(p,");\n");
+    
    // combine the alpha channel
-    WRITE(p, "%s= ", tevAOutputTable[ac.dest]);
+    if (ac.clamp)
+	    WRITE(p, "%s= saturate(", tevAOutputTable[ac.dest]);
+	else
+		WRITE(p, "%s= (", tevAOutputTable[ac.dest]);

    if (ac.bias != 3) // if not compare
 	{
        //normal alpha combiner goes here
-        WRITE(p, "   %s*(%s%s",tevScaleTable[ac.shift],tevAInputTable[ac.d],tevOpTable[ac.op]);
-        WRITE(p, "lerp(%s,%s,%s) %s)\n",
-            tevAInputTable[ac.a],tevAInputTable[ac.b],
-            tevAInputTable[ac.c],tevBiasTable[ac.bias]);
+		if (ac.shift>0)
+			WRITE(p, "   %s*(%s%s",tevScaleTable[ac.shift],tevAInputTable[ac.d],tevOpTable[ac.op]);
+		else
+			WRITE(p, "   (%s%s",tevAInputTable[ac.d],tevOpTable[ac.op]);
+
+		if (ac.a == 7 && ac.b == 7)
+			WRITE(p, "0");
+		else if (ac.a == 7 && ac.c == 7)
+			WRITE(p, "0");
+		else if (ac.b == 7 && ac.c == 7)
+			WRITE(p,"%s",tevAInputTable[ac.a]);
+		else if (ac.a == 7)
+			WRITE(p,"(%s)*(%s)",tevAInputTable[ac.b],tevAInputTable[ac.c]);
+		else if (ac.b == 7)
+			WRITE(p,"(%s)*(1-%s)",tevAInputTable[ac.a],tevAInputTable[ac.c]);
+		else if (ac.c == 7)
+			WRITE(p,"%s",tevAInputTable[ac.a]);
+		else
+	        WRITE(p, "lerp(%s,%s,%s)",tevAInputTable[ac.a],tevAInputTable[ac.b],tevAInputTable[ac.c]);
+		WRITE(p, " %s)",tevBiasTable[ac.bias]);
    }
    else 
 	{
@ -737,40 +777,35 @@ static void WriteStage(char *&p, int n, u32 texture_mask)
 		{
        case TEVCMP_R8_GT:
        case TEVCMP_A8_GT:
-            WRITE(p, "   %s + ((%s.%s > %s.%s) ? %s : 0)\n",
+            WRITE(p, "   %s + ((%s.%s > %s.%s) ? %s : 0)",
                tevAInputTable[ac.d],tevAInputTable2[ac.a], cmp==TEVCMP_R8_GT?"r":"a", tevAInputTable2[ac.b], cmp==TEVCMP_R8_GT?"r":"a", tevAInputTable[ac.c]);
            break;
        case TEVCMP_R8_EQ:
        case TEVCMP_A8_EQ:
-            WRITE(p, "   %s + (abs(%s.r - %s.r)<%f ? %s : 0)\n",
+            WRITE(p, "   %s + (abs(%s.r - %s.r)<%f ? %s : 0)",
                tevAInputTable[ac.d],tevAInputTable2[ac.a], tevAInputTable2[ac.b],epsilon8bit,tevAInputTable[ac.c]);
            break;
        
        case TEVCMP_GR16_GT: // 16 bit compares: 255*g+r (probably used for ztextures, so make sure in ztextures, g is the most significant byte)
        case TEVCMP_BGR24_GT: // 24 bit compares: 255*255*b+255*g+r
-            WRITE(p, "   %s + (( dot(%s.rgb-%s.rgb, comp%s) > 0) ? %s : 0)\n",
+            WRITE(p, "   %s + (( dot(%s.rgb-%s.rgb, comp%s) > 0) ? %s : 0)",
                tevAInputTable[ac.d],tevAInputTable2[ac.a], tevAInputTable2[ac.b], cmp==TEVCMP_GR16_GT?"16":"24", tevAInputTable[ac.c]);
            break;
        case TEVCMP_GR16_EQ:
        case TEVCMP_BGR24_EQ:
-            WRITE(p, "   %s + (abs(dot(%s.rgb - %s.rgb, comp%s))<%f ? %s : 0)\n",
+            WRITE(p, "   %s + (abs(dot(%s.rgb - %s.rgb, comp%s))<%f ? %s : 0)",
                tevAInputTable[ac.d],tevAInputTable2[ac.a], tevAInputTable2[ac.b],cmp==TEVCMP_GR16_EQ?"16":"24",epsilon8bit,tevAInputTable[ac.c]);
            break;
        default:
-            WRITE(p, "0)\n");
+            WRITE(p, "0)");
            break;
        }
    }

-    WRITE(p, ";\n");
-
-    if (ac.clamp)
-        WRITE(p, "%s = clamp(%s,0.0f,1.0f);\n", tevAOutputTable[ac.dest],tevAOutputTable[ac.dest]);
-
-    WRITE(p, "\n");
+    WRITE(p, ");\n\n");
 }

-void SampleTexture(char *&p, const char *destination, const char *texcoords, const char *texswap, int texmap, u32 texture_mask)
+void SampleTexture(char *&p, const char *destination, const char *texcoords, const char *texswap, int texmap, u32 texture_mask, bool HLSL)
 {
    if (texture_mask & (1<<texmap)) {
        // non pow 2
@ -792,10 +827,16 @@ void SampleTexture(char *&p, const char *destination, const char *texcoords, con
                 WRITE(p, "tempcoord.y = %s.y;\n", texcoords);
             }

-             WRITE(p, "%s=texRECT(samp%d,tempcoord.xy).%s;\n", destination, texmap, texswap);
+			 if (HLSL)
+				 WRITE(p, "%s=tex2D(samp%d,tempcoord.xy).%s;\n", destination, texmap, texswap);
+			 else
+				WRITE(p, "%s=texRECT(samp%d,tempcoord.xy).%s;\n", destination, texmap, texswap);
         }
         else {
-             WRITE(p, "%s=texRECT(samp%d,%s.xy).%s;\n", destination, texmap, texcoords, texswap);
+			 if (HLSL)
+				 WRITE(p, "%s=tex2D(samp%d,%s.xy).%s;\n", destination, texmap, texcoords, texswap);
+			 else
+				WRITE(p, "%s=texRECT(samp%d,%s.xy).%s;\n", destination, texmap, texcoords, texswap);
         }
    }
    else {
@ -910,7 +951,8 @@ static void WriteFog(char *&p)
            WRITE (p, "  float ze = "I_FOG"[1].x * depth;\n");
        }

-        WRITE (p, "  float fog = clamp(ze - "I_FOG"[1].z, 0.0f, 1.0f);\n");
+        //WRITE (p, "  float fog = clamp(ze - "I_FOG"[1].z, 0.0f, 1.0f);\n");
+		WRITE (p, "  float fog = saturate(ze - "I_FOG"[1].z);\n");
    }

    switch (bpmem.fog.c_proj_fsel.fsel) 
--- a/Source/Core/VideoCommon/Src/VertexShaderGen.cpp
+++ b/Source/Core/VideoCommon/Src/VertexShaderGen.cpp
@ -168,9 +168,15 @@ const char *GenerateVertexShader(u32 components, bool D3D)
    if (components & VB_HAS_NRM0)
        WRITE(p, "  float3 rawnorm0 : NORMAL,\n");
    if (components & VB_HAS_NRM1)
-        WRITE(p, "  float3 rawnorm1 : ATTR%d,\n", SHADER_NORM1_ATTRIB);
+		if (D3D)
+			WRITE(p, "  float3 rawnorm1 : PSIZE,\n");
+		else
+			WRITE(p, "  float3 rawnorm1 : ATTR%d,\n", SHADER_NORM1_ATTRIB);
    if (components & VB_HAS_NRM2)
-        WRITE(p, "  float3 rawnorm2 : ATTR%d,\n", SHADER_NORM2_ATTRIB);
+		if (D3D)
+			WRITE(p, "  float3 rawnorm2 : BLENDINDICES,\n");
+		else
+			WRITE(p, "  float3 rawnorm2 : ATTR%d,\n", SHADER_NORM2_ATTRIB);
    if (components & VB_HAS_COL0)
        WRITE(p, "  float4 color0 : COLOR0,\n");
    if (components & VB_HAS_COL1)
@ -181,7 +187,10 @@ const char *GenerateVertexShader(u32 components, bool D3D)
            WRITE(p, "  float%d tex%d : TEXCOORD%d,\n", hastexmtx ? 3 : 2, i,i);
    }
    if (components & VB_HAS_POSMTXIDX)
-        WRITE(p, "  half posmtx : ATTR%d,\n", SHADER_POSMTX_ATTRIB);
+		if (D3D)
+			WRITE(p, "  half posmtx : BLENDWEIGHT,\n");
+		else
+			WRITE(p, "  half posmtx : ATTR%d,\n", SHADER_POSMTX_ATTRIB);

    WRITE(p, "  float4 rawpos : POSITION) {\n");
    WRITE(p, "VS_OUTPUT o;\n");
--- a/Source/Plugins/Plugin_VideoDX9/Src/NativeVertexFormat.cpp
+++ b/Source/Plugins/Plugin_VideoDX9/Src/NativeVertexFormat.cpp
@ -142,7 +142,7 @@ void D3DVertexFormat::Initialize(const PortableVertexDeclaration &_vtx_decl)

 	if (_vtx_decl.posmtx_offset != -1)
 	{
-		PanicAlert("Posmtx stream not supported correctly. %i", _vtx_decl.posmtx_offset);
+		//PanicAlert("Posmtx stream not supported correctly. %i", _vtx_decl.posmtx_offset);
 		// glVertexAttribPointer(SHADER_POSMTX_ATTRIB, 4, GL_UNSIGNED_BYTE, GL_FALSE, vtx_decl.stride, (void *)vtx_decl.posmtx_offset);
 		elems[elem_idx].Offset = _vtx_decl.posmtx_offset;
 		elems[elem_idx].Usage = D3DDECLUSAGE_BLENDINDICES;
--- a/Source/Plugins/Plugin_VideoDX9/Src/PixelShaderCache.cpp
+++ b/Source/Plugins/Plugin_VideoDX9/Src/PixelShaderCache.cpp
@ -84,7 +84,7 @@ void PixelShaderCache::SetShader()
 		return;
 	}

-	bool HLSL = false;
+	bool HLSL = true;
 	const char *code = GeneratePixelShader(PixelShaderManager::GetTextureMask(), false, HLSL);
 	LPDIRECT3DPIXELSHADER9 shader = HLSL ? D3D::CompilePixelShader(code, (int)strlen(code), false) : CompileCgShader(code);
 	if (shader)
--- a/Source/Plugins/Plugin_VideoDX9/Src/VertexShaderCache.cpp
+++ b/Source/Plugins/Plugin_VideoDX9/Src/VertexShaderCache.cpp
@ -85,8 +85,8 @@ void VertexShaderCache::SetShader(u32 components)
 		return;
 	}

-	bool HLSL = false;
-	const char *code = GenerateVertexShader(components, true);
+	bool HLSL = true;
+	const char *code = GenerateVertexShader(components, HLSL);
 	LPDIRECT3DVERTEXSHADER9 shader = HLSL ? D3D::CompileVertexShader(code, (int)strlen(code), false) : CompileCgShader(code);
 	if (shader)
 	{