diff --git a/Source/Core/VideoCommon/CMakeLists.txt b/Source/Core/VideoCommon/CMakeLists.txt
index 406e726d71..4df4a01550 100644
--- a/Source/Core/VideoCommon/CMakeLists.txt
+++ b/Source/Core/VideoCommon/CMakeLists.txt
@@ -34,6 +34,7 @@ set(SRCS
   Statistics.cpp
   UberShaderCommon.cpp
   UberShaderPixel.cpp
+  UberShaderVertex.cpp
   TextureCacheBase.cpp
   TextureConfig.cpp
   TextureConversionShader.cpp
diff --git a/Source/Core/VideoCommon/ConstantManager.h b/Source/Core/VideoCommon/ConstantManager.h
index 98a23c8f76..0630d015db 100644
--- a/Source/Core/VideoCommon/ConstantManager.h
+++ b/Source/Core/VideoCommon/ConstantManager.h
@@ -44,6 +44,11 @@ struct PixelShaderConstants
 
 struct VertexShaderConstants
 {
+  u32 components;           // .x - vertex component flags tested by the vertex ubershader
+  u32 xfmem_dualTexInfo;    // .y - non-zero enables the post-transform matrix stage
+  u32 xfmem_numColorChans;  // .z - number of lit color channels
+  u32 pad1;                 // .w - pads the three scalars above to a full 16 bytes
+
   float4 posnormalmatrix[6];
   float4 projection[4];
   int4 materials[4];
@@ -60,7 +65,10 @@ struct VertexShaderConstants
   float4 normalmatrices[32];
   float4 posttransformmatrices[64];
   float4 pixelcentercorrection;
-  float4 viewport;
+  float viewport[2];  // .xy
+  float pad2[2];      // .zw
+
+  uint4 xfmem_pack1[8];  // .x - texMtxInfo, .y - postMtxInfo, [0..1].z = color, [0..1].w = alpha
 };
 
 struct GeometryShaderConstants
diff --git a/Source/Core/VideoCommon/ShaderGenCommon.h b/Source/Core/VideoCommon/ShaderGenCommon.h
index fe4b48d36a..3e3ef2eb13 100644
--- a/Source/Core/VideoCommon/ShaderGenCommon.h
+++ b/Source/Core/VideoCommon/ShaderGenCommon.h
@@ -318,7 +318,10 @@ inline const char* GetInterpolationQualifier(bool msaa, bool ssaa,
 #define I_LINEPTPARAMS "clinept"
 #define I_TEXOFFSET "ctexoffset"
 
-static const char s_shader_uniforms[] = "\tfloat4 " I_POSNORMALMATRIX "[6];\n"
+static const char s_shader_uniforms[] = "\tuint components;\n"
+                                        "\tuint xfmem_dualTexInfo;\n"
+                                        "\tuint xfmem_numColorChans;\n"
+                                        "\tfloat4 " I_POSNORMALMATRIX "[6];\n"
                                         "\tfloat4 " I_PROJECTION "[4];\n"
                                         "\tint4 " I_MATERIALS "[4];\n"
                                         "\tLight " I_LIGHTS "[8];\n"
@@ -327,4 +330,9 @@ static const char s_shader_uniforms[] = "\tfloat4 " I_POSNORMALMATRIX "[6];\n"
                                         "\tfloat4 " I_NORMALMATRICES "[32];\n"
                                         "\tfloat4 " I_POSTTRANSFORMMATRICES "[64];\n"
                                         "\tfloat4 " I_PIXELCENTERCORRECTION ";\n"
-                                        "\tfloat2 " I_VIEWPORT_SIZE ";\n";
+                                        "\tfloat2 " I_VIEWPORT_SIZE ";\n"
+                                        "\tuint4 xfmem_pack1[8];\n"
+                                        "\t#define xfmem_texMtxInfo(i) (xfmem_pack1[(i)].x)\n"
+                                        "\t#define xfmem_postMtxInfo(i) (xfmem_pack1[(i)].y)\n"
+                                        "\t#define xfmem_color(i) (xfmem_pack1[(i)].z)\n"
+                                        "\t#define xfmem_alpha(i) (xfmem_pack1[(i)].w)\n";
diff --git a/Source/Core/VideoCommon/UberShaderCommon.cpp b/Source/Core/VideoCommon/UberShaderCommon.cpp
index 2c6ba23c28..c6f5167b3c 100644
--- a/Source/Core/VideoCommon/UberShaderCommon.cpp
+++ b/Source/Core/VideoCommon/UberShaderCommon.cpp
@@ -24,4 +24,4 @@ void WriteUberShaderCommonHeader(ShaderCode& out, APIType api_type,
               "}\n\n");
   }
 }
-}
\ No newline at end of file
+}
diff --git a/Source/Core/VideoCommon/UberShaderCommon.h b/Source/Core/VideoCommon/UberShaderCommon.h
index cca2cc38bf..d6edaf1f73 100644
--- a/Source/Core/VideoCommon/UberShaderCommon.h
+++ b/Source/Core/VideoCommon/UberShaderCommon.h
@@ -20,5 +20,4 @@ std::string BitfieldExtract(const std::string& source, T type)
   return StringFromFormat("bitfieldExtract(%s, %u, %u)", source.c_str(),
                           static_cast<u32>(type.StartBit()), static_cast<u32>(type.NumBits()));
 }
-
 }  // namespace UberShader
diff --git a/Source/Core/VideoCommon/UberShaderVertex.cpp b/Source/Core/VideoCommon/UberShaderVertex.cpp
new file mode 100644
index 0000000000..d8ccdb9935
--- /dev/null
+++ b/Source/Core/VideoCommon/UberShaderVertex.cpp
@@ -0,0 +1,620 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include "VideoCommon/UberShaderVertex.h"
+#include "VideoCommon/DriverDetails.h"
+#include "VideoCommon/NativeVertexFormat.h"
+#include "VideoCommon/UberShaderCommon.h"
+#include "VideoCommon/VertexShaderGen.h"
+#include "VideoCommon/VideoConfig.h"
+#include "VideoCommon/XFMemory.h"
+
+namespace UberShader
+{
+VertexShaderUid GetVertexShaderUid()  // The UID is keyed only on the active texgen count.
+{
+  VertexShaderUid out;
+  vertex_ubershader_uid_data* uid_data = out.GetUidData<vertex_ubershader_uid_data>();
+  std::memset(uid_data, 0, sizeof(*uid_data));
+  uid_data->num_texgens = xfmem.numTexGen.numTexGens;
+  return out;
+}
+
+static void GenVertexShaderLighting(APIType ApiType, ShaderCode& out);
+static void GenVertexShaderTexGens(APIType ApiType, u32 numTexgen, ShaderCode& out);
+// Generates the vertex ubershader source; only the texgen count is baked in via uid_data.
+ShaderCode GenVertexShader(APIType ApiType, const ShaderHostConfig& host_config,
+                           const vertex_ubershader_uid_data* uid_data)
+{
+  const bool msaa = host_config.msaa;
+  const bool ssaa = host_config.ssaa;
+  const bool per_pixel_lighting = host_config.per_pixel_lighting;
+  const bool vertex_rounding = host_config.vertex_rounding;
+  const u32 numTexgen = uid_data->num_texgens;
+  ShaderCode out;
+
+  out.Write("// Vertex UberShader\n\n");
+  WriteUberShaderCommonHeader(out, ApiType, host_config);
+
+  out.Write("%s", s_lighting_struct);
+
+  // uniforms
+  if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
+    out.Write("UBO_BINDING(std140, 2) uniform VSBlock {\n");
+  else
+    out.Write("cbuffer VSBlock {\n");
+  out.Write("%s", s_shader_uniforms);
+  out.Write("};\n");
+
+  out.Write("int4 CalculateLighting(uint index, uint attnfunc, uint diffusefunc, float4 pos, "
+            "float3 _norm0) {\n"
+            " float3 ldir, h, cosAttn, distAttn;\n"
+            " float dist, dist2, attn;\n"
+            "\n"
+            " switch (attnfunc) {\n");
+  out.Write(" case %uu: // LIGNTATTN_NONE\n", LIGHTATTN_NONE);
+  out.Write(" case %uu: // LIGHTATTN_DIR\n", LIGHTATTN_DIR);
+  out.Write(" ldir = normalize(" I_LIGHTS "[index].pos.xyz - pos.xyz);\n"
+            " attn = 1.0;\n"
+            " if (length(ldir) == 0.0)\n"
+            " ldir = _norm0;\n"
+            " break;\n\n");
+  out.Write(" case %uu: // LIGHTATTN_SPEC\n", LIGHTATTN_SPEC);
+  out.Write(" ldir = normalize(" I_LIGHTS "[index].pos.xyz - pos.xyz);\n"
+            " attn = (dot(_norm0, ldir) >= 0.0) ? max(0.0, dot(_norm0, " I_LIGHTS
+            "[index].dir.xyz)) : 0.0;\n"
+            " cosAttn = " I_LIGHTS "[index].cosatt.xyz;\n");
+  out.Write(" if (diffusefunc == %uu) // LIGHTDIF_NONE\n", LIGHTDIF_NONE);
+  out.Write(" distAttn = " I_LIGHTS "[index].distatt.xyz;\n"
+            " else\n"
+            " distAttn = normalize(" I_LIGHTS "[index].distatt.xyz);\n"
+            " attn = max(0.0, dot(cosAttn, float3(1.0, attn, attn*attn))) / dot(distAttn, "
+            "float3(1.0, attn, attn*attn));\n"
+            " break;\n\n");
+  out.Write(" case %uu: // LIGHTATTN_SPOT\n", LIGHTATTN_SPOT);
+  out.Write(" ldir = " I_LIGHTS "[index].pos.xyz - pos.xyz;\n"
+            " dist2 = dot(ldir, ldir);\n"
+            " dist = sqrt(dist2);\n"
+            " ldir = ldir / dist;\n"
+            " attn = max(0.0, dot(ldir, " I_LIGHTS "[index].dir.xyz));\n"
+            " attn = max(0.0, " I_LIGHTS "[index].cosatt.x + " I_LIGHTS
+            "[index].cosatt.y * attn + " I_LIGHTS "[index].cosatt.z * attn * attn) / dot(" I_LIGHTS
+            "[index].distatt.xyz, float3(1.0, dist, dist2));\n"
+            " break;\n\n");
+  out.Write(" default:\n"
+            " attn = 1.0;\n"
+            " ldir = _norm0;\n"
+            " break;\n"
+            " }\n"
+            "\n"
+            " switch (diffusefunc) {\n");
+  out.Write(" case %uu: // LIGHTDIF_NONE\n", LIGHTDIF_NONE);
+  out.Write(" return int4(round(attn * float4(" I_LIGHTS "[index].color)));\n\n");
+  out.Write(" case %uu: // LIGHTDIF_SIGN\n", LIGHTDIF_SIGN);
+  out.Write(" return int4(round(attn * dot(ldir, _norm0) * float4(" I_LIGHTS
+            "[index].color)));\n\n");
+  out.Write(" case %uu: // LIGHTDIF_CLAMP\n", LIGHTDIF_CLAMP);
+  out.Write(" return int4(round(attn * max(0.0, dot(ldir, _norm0)) * float4(" I_LIGHTS
+            "[index].color)));\n\n");
+  out.Write(" default:\n"
+            " return int4(0, 0, 0, 0);\n"
+            " }\n"
+            "}\n\n");
+
+  out.Write("struct VS_OUTPUT {\n");
+  GenerateVSOutputMembers(out, ApiType, numTexgen, false, "");
+  out.Write("};\n\n");
+
+  if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
+  {
+    out.Write("ATTRIBUTE_LOCATION(%d) in float4 rawpos;\n", SHADER_POSITION_ATTRIB);
+    out.Write("ATTRIBUTE_LOCATION(%d) in uint4 posmtx;\n", SHADER_POSMTX_ATTRIB);
+    out.Write("ATTRIBUTE_LOCATION(%d) in float3 rawnorm0;\n", SHADER_NORM0_ATTRIB);
+    out.Write("ATTRIBUTE_LOCATION(%d) in float3 rawnorm1;\n", SHADER_NORM1_ATTRIB);
+    out.Write("ATTRIBUTE_LOCATION(%d) in float3 rawnorm2;\n", SHADER_NORM2_ATTRIB);
+    out.Write("ATTRIBUTE_LOCATION(%d) in float4 color0;\n", SHADER_COLOR0_ATTRIB);
+    out.Write("ATTRIBUTE_LOCATION(%d) in float4 color1;\n", SHADER_COLOR1_ATTRIB);
+    for (int i = 0; i < 8; ++i)
+      out.Write("ATTRIBUTE_LOCATION(%d) in float3 tex%d;\n", SHADER_TEXTURE0_ATTRIB + i, i);
+
+    // We need to always use output blocks for Vulkan, but geometry shaders are also optional.
+    if (host_config.backend_geometry_shaders || ApiType == APIType::Vulkan)
+    {
+      out.Write("VARYING_LOCATION(0) out VertexData {\n");
+      GenerateVSOutputMembers(out, ApiType, numTexgen, per_pixel_lighting,
+                              GetInterpolationQualifier(msaa, ssaa, true, false));
+      out.Write("} vs;\n");
+    }
+    else
+    {
+      // Let's set up attributes
+      for (u32 i = 0; i < numTexgen; ++i)
+        out.Write("%s out float3 uv%u;\n", GetInterpolationQualifier(msaa, ssaa), i);
+
+      out.Write("%s out float4 clipPos;\n", GetInterpolationQualifier(msaa, ssaa));
+      if (per_pixel_lighting)  // NOTE(review): declared but never assigned below - confirm
+      {
+        out.Write("%s out float3 Normal;\n", GetInterpolationQualifier(msaa, ssaa));
+        out.Write("%s out float3 WorldPos;\n", GetInterpolationQualifier(msaa, ssaa));
+      }
+      out.Write("%s out float4 colors_0;\n", GetInterpolationQualifier(msaa, ssaa));
+      out.Write("%s out float4 colors_1;\n", GetInterpolationQualifier(msaa, ssaa));
+    }
+
+    out.Write("void main()\n{\n");
+  }
+  else  // D3D
+  {
+    out.Write("VS_OUTPUT main(\n");
+
+    // inputs
+    out.Write(" float3 rawnorm0 : NORMAL0,\n");
+    out.Write(" float3 rawnorm1 : NORMAL1,\n");
+    out.Write(" float3 rawnorm2 : NORMAL2,\n");
+    out.Write(" float4 color0 : COLOR0,\n");
+    out.Write(" float4 color1 : COLOR1,\n");
+    for (int i = 0; i < 8; ++i)
+      out.Write(" float3 tex%d : TEXCOORD%d,\n", i, i);
+    out.Write(" uint posmtx : BLENDINDICES,\n");
+    out.Write(" float4 rawpos : POSITION) {\n");
+  }
+
+  out.Write("VS_OUTPUT o;\n"
+            "\n");
+
+  // Transforms
+  out.Write("// Position matrix\n"
+            "float4 P0;\n"
+            "float4 P1;\n"
+            "float4 P2;\n"
+            "\n"
+            "// Normal matrix\n"
+            "float3 N0;\n"
+            "float3 N1;\n"
+            "float3 N2;\n"
+            "\n"
+            "if ((components & %uu) != 0u) {// VB_HAS_POSMTXIDX\n",
+            VB_HAS_POSMTXIDX);
+  out.Write(" // Vertex format has a per-vertex matrix\n"
+            " int posidx = int(posmtx.r);\n"
+            " P0 = " I_TRANSFORMMATRICES "[posidx];\n"
+            " P1 = " I_TRANSFORMMATRICES "[posidx+1];\n"
+            " P2 = " I_TRANSFORMMATRICES "[posidx+2];\n"
+            "\n"
+            " int normidx = posidx >= 32 ? (posidx - 32) : posidx;\n"
+            " N0 = " I_NORMALMATRICES "[normidx].xyz;\n"
+            " N1 = " I_NORMALMATRICES "[normidx+1].xyz;\n"
+            " N2 = " I_NORMALMATRICES "[normidx+2].xyz;\n"
+            "} else {\n"
+            " // One shared matrix\n"
+            " P0 = " I_POSNORMALMATRIX "[0];\n"
+            " P1 = " I_POSNORMALMATRIX "[1];\n"
+            " P2 = " I_POSNORMALMATRIX "[2];\n"
+            " N0 = " I_POSNORMALMATRIX "[3].xyz;\n"
+            " N1 = " I_POSNORMALMATRIX "[4].xyz;\n"
+            " N2 = " I_POSNORMALMATRIX "[5].xyz;\n"
+            "}\n"
+            "\n"
+            "float4 pos = float4(dot(P0, rawpos), dot(P1, rawpos), dot(P2, rawpos), 1.0);\n"
+            "o.pos = float4(dot(" I_PROJECTION "[0], pos), dot(" I_PROJECTION
+            "[1], pos), dot(" I_PROJECTION "[2], pos), dot(" I_PROJECTION "[3], pos));\n"
+            "\n"
+            "// Only the first normal gets normalized (TODO: why?)\n"
+            "float3 _norm0 = float3(0.0, 0.0, 0.0);\n"
+            "if ((components & %uu) != 0u) // VB_HAS_NRM0\n",
+            VB_HAS_NRM0);
+  out.Write(
+      " _norm0 = normalize(float3(dot(N0, rawnorm0), dot(N1, rawnorm0), dot(N2, rawnorm0)));\n"
+      "\n"
+      "float3 _norm1 = float3(0.0, 0.0, 0.0);\n"
+      "if ((components & %uu) != 0u) // VB_HAS_NRM1\n",
+      VB_HAS_NRM1);
+  out.Write(" _norm1 = float3(dot(N0, rawnorm1), dot(N1, rawnorm1), dot(N2, rawnorm1));\n"
+            "\n"
+            "float3 _norm2 = float3(0.0, 0.0, 0.0);\n"
+            "if ((components & %uu) != 0u) // VB_HAS_NRM2\n",
+            VB_HAS_NRM2);
+  out.Write(" _norm2 = float3(dot(N0, rawnorm2), dot(N1, rawnorm2), dot(N2, rawnorm2));\n"
+            "\n");
+
+  // Hardware Lighting
+  GenVertexShaderLighting(ApiType, out);
+
+  // Texture Coordinates
+  if (numTexgen > 0)
+    GenVertexShaderTexGens(ApiType, numTexgen, out);
+
+  // clipPos/w needs to be done in pixel shader, not here
+  out.Write("o.clipPos = o.pos;\n");
+
+  // If we can disable the incorrect depth clipping planes using depth clamping, then we can do
+  // our own depth clipping and calculate the depth range before the perspective divide if
+  // necessary.
+  if (host_config.backend_depth_clamp)
+  {
+    // Since we're adjusting z for the depth range before the perspective divide, we have to do our
+    // own clipping. We want to clip so that -w <= z <= 0, which matches the console -1..0 range.
+    // We adjust our depth value for clipping purposes to match the perspective projection in the
+    // software backend, which is a hack to fix Sonic Adventure and Unleashed games.
+    out.Write("float clipDepth = o.pos.z * (1.0 - 1e-7);\n");
+    out.Write("o.clipDist0 = clipDepth + o.pos.w;\n");  // Near: z < -w
+    out.Write("o.clipDist1 = -clipDepth;\n");            // Far: z > 0
+  }
+
+  // Write the true depth value. If the game uses depth textures, then the pixel shader will
+  // override it with the correct values if not then early z culling will improve speed.
+  // There are two different ways to do this, when the depth range is oversized, we process
+  // the depth range in the vertex shader, if not we let the host driver handle it.
+  //
+  // Adjust z for the depth range. We're using an equation which incorporates a depth inversion,
+  // so we can map the console -1..0 range to the 0..1 range used in the depth buffer.
+  // We have to handle the depth range in the vertex shader instead of after the perspective
+  // divide, because some games will use a depth range larger than what is allowed by the
+  // graphics API. These large depth ranges will still be clipped to the 0..1 range, so these
+  // games effectively add a depth bias to the values written to the depth buffer.
+  out.Write("o.pos.z = o.pos.w * " I_PIXELCENTERCORRECTION ".w - "
+            "o.pos.z * " I_PIXELCENTERCORRECTION ".z;\n");
+
+  if (!host_config.backend_clip_control)
+  {
+    // If the graphics API doesn't support a depth range of 0..1, then we need to map z to
+    // the -1..1 range. Unfortunately we have to use a subtraction, which is a lossy floating-point
+    // operation that can introduce a round-trip error.
+    out.Write("o.pos.z = o.pos.z * 2.0 - o.pos.w;\n");
+  }
+
+  // Correct for negative viewports by mirroring all vertices. We need to negate the height here,
+  // since the viewport height is already negated by the render backend.
+  out.Write("o.pos.xy *= sign(" I_PIXELCENTERCORRECTION ".xy * float2(1.0, -1.0));\n");
+
+  // The console GPU places the pixel center at 7/12 in screen space unless
+  // antialiasing is enabled, while D3D and OpenGL place it at 0.5. This results
+  // in some primitives being placed one pixel too far to the bottom-right,
+  // which in turn can be critical if it happens for clear quads.
+  // Hence, we compensate for this pixel center difference so that primitives
+  // get rasterized correctly.
+  out.Write("o.pos.xy = o.pos.xy - o.pos.w * " I_PIXELCENTERCORRECTION ".xy;\n");
+
+  if (vertex_rounding)
+  {
+    // By now our position is in clip space. However, higher resolutions than the Wii outputs
+    // cause an additional pixel offset. Due to a higher pixel density we need to correct this
+    // by converting our clip-space position into the Wii's screen-space.
+    // Acquire the right pixel and then convert it back.
+    out.Write("if (o.pos.w == 1.0f)\n");
+    out.Write("{\n");
+
+    out.Write("\tfloat ss_pixel_x = ((o.pos.x + 1.0f) * (" I_VIEWPORT_SIZE ".x * 0.5f));\n");
+    out.Write("\tfloat ss_pixel_y = ((o.pos.y + 1.0f) * (" I_VIEWPORT_SIZE ".y * 0.5f));\n");
+
+    out.Write("\tss_pixel_x = round(ss_pixel_x);\n");
+    out.Write("\tss_pixel_y = round(ss_pixel_y);\n");
+
+    out.Write("\to.pos.x = ((ss_pixel_x / (" I_VIEWPORT_SIZE ".x * 0.5f)) - 1.0f);\n");
+    out.Write("\to.pos.y = ((ss_pixel_y / (" I_VIEWPORT_SIZE ".y * 0.5f)) - 1.0f);\n");
+    out.Write("}\n");
+  }
+
+  if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
+  {
+    if (host_config.backend_geometry_shaders || ApiType == APIType::Vulkan)
+    {
+      AssignVSOutputMembers(out, "vs", "o", numTexgen, false);
+    }
+    else
+    {
+      // TODO: Pass interface blocks between shader stages even if geometry shaders
+      // are not supported, however that will require at least OpenGL 3.2 support.
+      for (u32 i = 0; i < numTexgen; ++i)
+        out.Write("uv%u.xyz = o.tex%u;\n", i, i);
+      out.Write("clipPos = o.clipPos;\n");
+      out.Write("colors_0 = o.colors_0;\n");
+      out.Write("colors_1 = o.colors_1;\n");
+    }
+
+    if (host_config.backend_depth_clamp)
+    {
+      out.Write("gl_ClipDistance[0] = o.clipDist0;\n");
+      out.Write("gl_ClipDistance[1] = o.clipDist1;\n");
+    }
+
+    // Vulkan NDC space has Y pointing down (right-handed NDC space).
+    if (ApiType == APIType::Vulkan)
+      out.Write("gl_Position = float4(o.pos.x, -o.pos.y, o.pos.z, o.pos.w);\n");
+    else
+      out.Write("gl_Position = o.pos;\n");
+  }
+  else  // D3D
+  {
+    out.Write("return o;\n");
+  }
+  out.Write("}\n");
+
+  return out;
+}
+// Emits the vertex color defaults and the per-channel lighting loop writing o.colors_0/1.
+void GenVertexShaderLighting(APIType ApiType, ShaderCode& out)
+{
+  out.Write("if ((components & %uu) != 0u) // VB_HAS_COL0\n", VB_HAS_COL0);
+  out.Write(" o.colors_0 = color0;\n"
+            "else\n"
+            " o.colors_0 = float4(1.0, 1.0, 1.0, 1.0);\n"
+            "\n");
+  out.Write("if ((components & %uu) != 0u) // VB_HAS_COL1\n", VB_HAS_COL1);
+  out.Write(" o.colors_1 = color1;\n"
+            "else\n"
+            " o.colors_1 = float4(1.0, 1.0, 1.0, 1.0);\n"
+            "\n");
+
+  out.Write("// Lighting\n");
+  out.Write("%sfor (uint chan = 0u; chan < xfmem_numColorChans; chan++) {\n",
+            ApiType == APIType::D3D ? "[loop] " : "");
+  out.Write(" uint colorreg = xfmem_color(chan);\n"
+            " uint alphareg = xfmem_alpha(chan);\n"
+            " int4 mat = " I_MATERIALS "[chan + 2u]; \n"
+            " int4 lacc = int4(255, 255, 255, 255);\n"
+            "\n");
+
+  out.Write(" if (%s != 0u) {\n", BitfieldExtract("colorreg", LitChannel().matsource).c_str());
+  out.Write(" if ((components & (%uu << chan)) != 0u) // VB_HAS_COL0\n", VB_HAS_COL0);
+  out.Write(" mat.xyz = int3(round(((chan == 0u) ? color0.xyz : color1.xyz) * 255.0));\n");
+  out.Write(" else if ((components & %uu) != 0u) // VB_HAS_COLO0\n", VB_HAS_COL0);
+  out.Write(" mat.xyz = int3(round(color0.xyz * 255.0));\n"
+            " else\n"
+            " mat.xyz = int3(255, 255, 255);\n"
+            " }\n"
+            "\n");
+
+  out.Write(" if (%s != 0u) {\n", BitfieldExtract("alphareg", LitChannel().matsource).c_str());
+  out.Write(" if ((components & (%uu << chan)) != 0u) // VB_HAS_COL0\n", VB_HAS_COL0);
+  out.Write(" mat.w = int(round(((chan == 0u) ? color0.w : color1.w) * 255.0));\n");
+  out.Write(" else if ((components & %uu) != 0u) // VB_HAS_COLO0\n", VB_HAS_COL0);
+  out.Write(" mat.w = int(round(color0.w * 255.0));\n"
+            " else\n"
+            " mat.w = 255;\n"
+            " } else {\n"
+            " mat.w = " I_MATERIALS " [chan + 2u].w;\n"
+            " }\n"
+            "\n");
+
+  out.Write(" if (%s != 0u) {\n",
+            BitfieldExtract("colorreg", LitChannel().enablelighting).c_str());
+  out.Write(" if (%s != 0u) {\n", BitfieldExtract("colorreg", LitChannel().ambsource).c_str());
+  out.Write(" if ((components & (%uu << chan)) != 0u) // VB_HAS_COL0\n", VB_HAS_COL0);
+  out.Write(" lacc.xyz = int3(round(((chan == 0u) ? color0.xyz : color1.xyz) * 255.0));\n");
+  out.Write(" else if ((components & %uu) != 0u) // VB_HAS_COLO0\n", VB_HAS_COL0);
+  out.Write(" lacc.xyz = int3(round(color0.xyz * 255.0));\n"
+            " else\n"
+            " lacc.xyz = int3(255, 255, 255);\n"
+            " } else {\n"
+            " lacc.xyz = " I_MATERIALS " [chan].xyz;\n"
+            " }\n"
+            "\n");
+  out.Write(" uint light_mask = %s | (%s << 4u);\n",
+            BitfieldExtract("colorreg", LitChannel().lightMask0_3).c_str(),
+            BitfieldExtract("colorreg", LitChannel().lightMask4_7).c_str());
+  out.Write(" uint attnfunc = %s;\n",
+            BitfieldExtract("colorreg", LitChannel().attnfunc).c_str());
+  out.Write(" uint diffusefunc = %s;\n",
+            BitfieldExtract("colorreg", LitChannel().diffusefunc).c_str());
+  out.Write(" for (uint light_index = 0u; light_index < 8u; light_index++) {\n"
+            " if ((light_mask & (1u << light_index)) != 0u)\n"
+            " lacc.xyz += CalculateLighting(light_index, attnfunc, diffusefunc, pos, "
+            "_norm0).xyz;\n"
+            " }\n"
+            " }\n"
+            "\n");
+
+  out.Write(" if (%s != 0u) {\n",
+            BitfieldExtract("alphareg", LitChannel().enablelighting).c_str());
+  out.Write(" if (%s != 0u) {\n", BitfieldExtract("alphareg", LitChannel().ambsource).c_str());
+  out.Write(" if ((components & (%uu << chan)) != 0u) // VB_HAS_COL0\n", VB_HAS_COL0);
+  out.Write(" lacc.w = int(round(((chan == 0u) ? color0.w : color1.w) * 255.0));\n");
+  out.Write(" else if ((components & %uu) != 0u) // VB_HAS_COLO0\n", VB_HAS_COL0);
+  out.Write(" lacc.w = int(round(color0.w * 255.0));\n"
+            " else\n"
+            " lacc.w = 255;\n"
+            " } else {\n"
+            " lacc.w = " I_MATERIALS " [chan].w;\n"
+            " }\n"
+            "\n");
+  out.Write(" uint light_mask = %s | (%s << 4u);\n",
+            BitfieldExtract("alphareg", LitChannel().lightMask0_3).c_str(),
+            BitfieldExtract("alphareg", LitChannel().lightMask4_7).c_str());
+  out.Write(" uint attnfunc = %s;\n",
+            BitfieldExtract("alphareg", LitChannel().attnfunc).c_str());
+  out.Write(" uint diffusefunc = %s;\n",
+            BitfieldExtract("alphareg", LitChannel().diffusefunc).c_str());
+  out.Write(
+      " for (uint light_index = 0u; light_index < 8u; light_index++) {\n\n"
+      " if ((light_mask & (1u << light_index)) != 0u)\n\n"
+      " lacc.w += CalculateLighting(light_index, attnfunc, diffusefunc, pos, _norm0).w;\n"
+      " }\n"
+      " }\n"
+      "\n");
+
+  out.Write(" lacc = clamp(lacc, 0, 255);\n"
+            "\n"
+            " // Hopefully GPUs that can support dynamic indexing will optimize this.\n"
+            " float4 lit_color = float4((mat * (lacc + (lacc >> 7))) >> 8) / 255.0;\n"
+            " switch (chan) {\n"
+            " case 0u: o.colors_0 = lit_color; break;\n"
+            " case 1u: o.colors_1 = lit_color; break;\n"
+            " }\n"
+            "}\n"
+            "\n");
+
+  out.Write("if (xfmem_numColorChans < 2u && (components & %uu) == 0u)\n", VB_HAS_COL1);
+  out.Write(" o.colors_1 = o.colors_0;\n\n");
+}
+// Emits texture coordinate generation for the numTexgen outputs o.tex0 .. o.tex(numTexgen-1).
+void GenVertexShaderTexGens(APIType ApiType, u32 numTexgen, ShaderCode& out)
+{
+  // The HLSL compiler complains that the output texture coordinates are uninitialized when trying
+  // to dynamically index them.
+  for (u32 i = 0; i < numTexgen; i++)
+    out.Write("o.tex%u = float3(0.0, 0.0, 0.0);\n", i);
+
+  out.Write("// Texture coordinate generation\n");
+  if (numTexgen == 1)
+    out.Write("{ const uint texgen = 0u;\n");
+  else
+    out.Write("%sfor (uint texgen = 0u; texgen < %uu; texgen++) {\n",
+              ApiType == APIType::D3D ? "[loop] " : "", numTexgen);
+
+  out.Write(" // Texcoord transforms\n");
+  out.Write(" float4 coord = float4(0.0, 0.0, 1.0, 1.0);\n"
+            " uint texMtxInfo = xfmem_texMtxInfo(texgen);\n");
+  out.Write(" switch (%s) {\n", BitfieldExtract("texMtxInfo", TexMtxInfo().sourcerow).c_str());
+  out.Write(" case %uu: // XF_SRCGEOM_INROW\n", XF_SRCGEOM_INROW);
+  out.Write(" coord.xyz = rawpos.xyz;\n");
+  out.Write(" break;\n\n");
+  out.Write(" case %uu: // XF_SRCNORMAL_INROW\n", XF_SRCNORMAL_INROW);
+  out.Write(
+      " coord.xyz = ((components & %uu /* VB_HAS_NRM0 */) != 0u) ? rawnorm0.xyz : coord.xyz;",
+      VB_HAS_NRM0);
+  out.Write(" break;\n\n");
+  out.Write(" case %uu: // XF_SRCBINORMAL_T_INROW\n", XF_SRCBINORMAL_T_INROW);
+  out.Write(
+      " coord.xyz = ((components & %uu /* VB_HAS_NRM1 */) != 0u) ? rawnorm1.xyz : coord.xyz;",
+      VB_HAS_NRM1);
+  out.Write(" break;\n\n");
+  out.Write(" case %uu: // XF_SRCBINORMAL_B_INROW\n", XF_SRCBINORMAL_B_INROW);
+  out.Write(
+      " coord.xyz = ((components & %uu /* VB_HAS_NRM2 */) != 0u) ? rawnorm2.xyz : coord.xyz;",
+      VB_HAS_NRM2);
+  out.Write(" break;\n\n");
+  for (u32 i = 0; i < 8; i++)
+  {
+    out.Write(" case %uu: // XF_SRCTEX%u_INROW\n", XF_SRCTEX0_INROW + i, i);
+    out.Write(" coord = ((components & %uu /* VB_HAS_UV%u */) != 0u) ? float4(tex%u.x, tex%u.y, "
+              "1.0, 1.0) : coord;\n",
+              VB_HAS_UV0 << i, i, i, i);
+    out.Write(" break;\n\n");
+  }
+  out.Write(" }\n");
+  out.Write("\n");
+
+  out.Write(" // Input form of AB11 sets z element to 1.0\n");
+  out.Write(" if (%s == %uu) // inputform == XF_TEXINPUT_AB11\n",
+            BitfieldExtract("texMtxInfo", TexMtxInfo().inputform).c_str(), XF_TEXINPUT_AB11);
+  out.Write(" coord.z = 1.0f;\n");
+  out.Write("\n");
+
+  out.Write(" // first transformation\n");
+  out.Write(" uint texgentype = %s;\n",
+            BitfieldExtract("texMtxInfo", TexMtxInfo().texgentype).c_str());
+  out.Write(" float3 output_tex;\n"
+            " switch (texgentype)\n"
+            " {\n");
+  out.Write(" case %uu: // XF_TEXGEN_EMBOSS_MAP\n", XF_TEXGEN_EMBOSS_MAP);
+  out.Write(" {\n");
+  out.Write(" uint light = %s;\n",
+            BitfieldExtract("texMtxInfo", TexMtxInfo().embosslightshift).c_str());
+  out.Write(" uint source = %s;\n",
+            BitfieldExtract("texMtxInfo", TexMtxInfo().embosssourceshift).c_str());
+  out.Write(" switch (source) {\n");
+  for (u32 i = 0; i < numTexgen; i++)
+    out.Write(" case %uu: output_tex.xyz = o.tex%u; break;\n", i, i);
+  out.Write(" default: output_tex.xyz = float3(0.0, 0.0, 0.0); break;\n"
+            " }\n");
+  out.Write(" if ((components & %uu) != 0u) { // VB_HAS_NRM1 | VB_HAS_NRM2\n",
+            VB_HAS_NRM1 | VB_HAS_NRM2);  // Should this be VB_HAS_NRM1 | VB_HAS_NRM2
+  out.Write(" float3 ldir = normalize(" I_LIGHTS "[light].pos.xyz - pos.xyz);\n"
+            " output_tex.xyz += float3(dot(ldir, _norm1), dot(ldir, _norm2), 0.0);\n"
+            " }\n"
+            " }\n"
+            " break;\n\n");
+  out.Write(" case %uu: // XF_TEXGEN_COLOR_STRGBC0\n", XF_TEXGEN_COLOR_STRGBC0);
+  out.Write(" output_tex.xyz = float3(o.colors_0.x, o.colors_0.y, 1.0);\n"
+            " break;\n\n");
+  out.Write(" case %uu: // XF_TEXGEN_COLOR_STRGBC1\n", XF_TEXGEN_COLOR_STRGBC1);
+  out.Write(" output_tex.xyz = float3(o.colors_1.x, o.colors_1.y, 1.0);\n"
+            " break;\n\n");
+  out.Write(" default: // Also XF_TEXGEN_REGULAR\n"
+            " {\n");
+  out.Write(" if ((components & (%uu /* VB_HAS_TEXMTXIDX0 */ << texgen)) != 0u) {\n",
+            VB_HAS_TEXMTXIDX0);
+  out.Write(" // This is messy, due to dynamic indexing of the input texture coordinates.\n"
+            " // Hopefully the compiler will unroll this whole loop anyway and the switch.\n"
+            " int tmp = 0;\n"
+            " switch (texgen) {\n");
+  for (u32 i = 0; i < numTexgen; i++)
+    out.Write(" case %uu: tmp = int(tex%u.z); break;\n", i, i);
+  out.Write(" }\n"
+            "\n");
+  out.Write(" if (%s == %uu) {\n",
+            BitfieldExtract("texMtxInfo", TexMtxInfo().projection).c_str(), XF_TEXPROJ_STQ);
+  out.Write(" output_tex.xyz = float3(dot(coord, " I_TRANSFORMMATRICES "[tmp]),\n"
+            " dot(coord, " I_TRANSFORMMATRICES "[tmp + 1]),\n"
+            " dot(coord, " I_TRANSFORMMATRICES "[tmp + 2]));\n"
+            " } else {\n"
+            " output_tex.xyz = float3(dot(coord, " I_TRANSFORMMATRICES "[tmp]),\n"
+            " dot(coord, " I_TRANSFORMMATRICES "[tmp + 1]),\n"
+            " 1.0);\n"
+            " }\n"
+            " } else {\n");
+  out.Write(" if (%s == %uu) {\n",
+            BitfieldExtract("texMtxInfo", TexMtxInfo().projection).c_str(), XF_TEXPROJ_STQ);
+  out.Write(" output_tex.xyz = float3(dot(coord, " I_TEXMATRICES "[3u * texgen]),\n"
+            " dot(coord, " I_TEXMATRICES "[3u * texgen + 1u]),\n"
+            " dot(coord, " I_TEXMATRICES "[3u * texgen + 2u]));\n"
+            " } else {\n"
+            " output_tex.xyz = float3(dot(coord, " I_TEXMATRICES "[3u * texgen]),\n"
+            " dot(coord, " I_TEXMATRICES "[3u * texgen + 1u]),\n"
+            " 1.0);\n"
+            " }\n"
+            " }\n"
+            " }\n"
+            " break;\n\n"
+            " }\n"
+            "\n");
+
+  out.Write(" if (xfmem_dualTexInfo != 0u) {\n");
+  out.Write(" uint postMtxInfo = xfmem_postMtxInfo(texgen);");
+  out.Write(" uint base_index = %s;\n",
+            BitfieldExtract("postMtxInfo", PostMtxInfo().index).c_str());
+  out.Write(" float4 P0 = " I_POSTTRANSFORMMATRICES "[base_index & 0x3fu];\n"
+            " float4 P1 = " I_POSTTRANSFORMMATRICES "[(base_index + 1u) & 0x3fu];\n"
+            " float4 P2 = " I_POSTTRANSFORMMATRICES "[(base_index + 2u) & 0x3fu];\n"
+            "\n");
+  out.Write(" if (%s != 0u)\n", BitfieldExtract("postMtxInfo", PostMtxInfo().normalize).c_str());
+  out.Write(" output_tex.xyz = normalize(output_tex.xyz);\n"
+            "\n"
+            " // multiply by postmatrix\n"
+            " output_tex.xyz = float3(dot(P0.xyz, output_tex.xyz) + P0.w,\n"
+            " dot(P1.xyz, output_tex.xyz) + P1.w,\n"
+            " dot(P2.xyz, output_tex.xyz) + P2.w);\n"
+            " }\n\n");
+
+  // When q is 0, the GameCube appears to have a special case
+  // This can be seen in devkitPro's neheGX Lesson08 example for Wii
+  // Makes differences in Rogue Squadron 3 (Hoth sky) and The Last Story (shadow culling)
+  out.Write(" if (texgentype == %uu && output_tex.z == 0.0) // XF_TEXGEN_REGULAR\n",
+            XF_TEXGEN_REGULAR);
+  out.Write(
+      " output_tex.xy = clamp(output_tex.xy / 2.0f, float2(-1.0f,-1.0f), float2(1.0f,1.0f));\n"
+      "\n");
+
+  out.Write(" // Hopefully GPUs that can support dynamic indexing will optimize this.\n");
+  out.Write(" switch (texgen) {\n");
+  for (u32 i = 0; i < numTexgen; i++)
+    out.Write(" case %uu: o.tex%u = output_tex; break;\n", i, i);
+  out.Write(" }\n"
+            "}\n");
+}
+// Invokes callback once per possible UID, i.e. once for each texgen count from 0 to 8.
+void EnumerateVertexShaderUids(const std::function<void(const VertexShaderUid&)>& callback)
+{
+  VertexShaderUid uid;
+  std::memset(&uid, 0, sizeof(uid));
+
+  for (u32 texgens = 0; texgens <= 8; texgens++)
+  {
+    auto* vuid = uid.GetUidData<vertex_ubershader_uid_data>();
+    vuid->num_texgens = texgens;
+    callback(uid);
+  }
+}
+}
diff --git a/Source/Core/VideoCommon/UberShaderVertex.h b/Source/Core/VideoCommon/UberShaderVertex.h
new file mode 100644
index 0000000000..daebaa3f77
--- /dev/null
+++ b/Source/Core/VideoCommon/UberShaderVertex.h
@@ -0,0 +1,28 @@
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <functional>
+#include "VideoCommon/PixelShaderGen.h"
+
+namespace UberShader
+{
+#pragma pack(1)
+struct vertex_ubershader_uid_data
+{
+  u32 num_texgens : 4;  // texgen count baked into the generated shader
+
+  u32 NumValues() const { return sizeof(vertex_ubershader_uid_data); }
+};
+#pragma pack()
+
+typedef ShaderUid<vertex_ubershader_uid_data> VertexShaderUid;
+
+VertexShaderUid GetVertexShaderUid();
+
+ShaderCode GenVertexShader(APIType api_type, const ShaderHostConfig& host_config,
+                           const vertex_ubershader_uid_data* uid_data);
+void EnumerateVertexShaderUids(const std::function<void(const VertexShaderUid&)>& callback);
+}
diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp
index cf13b0bcde..01d9a00fd6 100644
--- a/Source/Core/VideoCommon/VertexLoaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp
@@ -277,6 +277,7 @@ int RunVertices(int vtx_attr_group, int primitive, int count, DataReader src, bo
   }
   s_current_vtx_fmt = loader->m_native_vertex_format;
   g_current_components = loader->m_native_components;
+  VertexShaderManager::SetVertexFormat(loader->m_native_components);
 
   // if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads.
   // They still need to go through vertex loading, because we need to calculate a zfreeze refrence
diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp
index c167ce63aa..ad20ab2332 100644
--- a/Source/Core/VideoCommon/VertexShaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexShaderManager.cpp
@@ -30,6 +30,7 @@ alignas(16) static float g_fProjectionMatrix[16];
 
 // track changes
 static bool bTexMatricesChanged[2], bPosNormalMatrixChanged, bProjectionChanged, bViewportChanged;
+static bool bTexMtxInfoChanged, bLightingConfigChanged;  // ubershader constant dirty flags
 static BitSet32 nMaterialsChanged;
 static int nTransformMatricesChanged[2];  // min,max
 static int nNormalMatricesChanged[2];     // min,max
@@ -193,6 +194,8 @@ void VertexShaderManager::Init()
   bPosNormalMatrixChanged = false;
   bProjectionChanged = true;
   bViewportChanged = false;
+  bTexMtxInfoChanged = false;
+  bLightingConfigChanged = false;
 
   std::memset(&xfmem, 0, sizeof(xfmem));
   constants = {};
@@ -561,6 +564,32 @@ void VertexShaderManager::SetConstants()
 
     dirty = true;
   }
+
+  if (bTexMtxInfoChanged)
+  {
+    bTexMtxInfoChanged = false;
+    constants.xfmem_dualTexInfo = xfmem.dualTexTrans.enabled;
+    for (size_t i = 0; i < ArraySize(xfmem.texMtxInfo); i++)
+      constants.xfmem_pack1[i][0] = xfmem.texMtxInfo[i].hex;  // .x, read via xfmem_texMtxInfo(i)
+    for (size_t i = 0; i < ArraySize(xfmem.postMtxInfo); i++)
+      constants.xfmem_pack1[i][1] = xfmem.postMtxInfo[i].hex;  // .y, read via xfmem_postMtxInfo(i)
+
+    dirty = true;
+  }
+
+  if (bLightingConfigChanged)
+  {
+    bLightingConfigChanged = false;
+
+    for (size_t i = 0; i < 2; i++)  // only the first two pack1 entries carry channel config
+    {
+      constants.xfmem_pack1[i][2] = xfmem.color[i].hex;
+      constants.xfmem_pack1[i][3] = xfmem.alpha[i].hex;
+    }
+    constants.xfmem_numColorChans = xfmem.numChan.numColorChans;
+
+    dirty = true;
+  }
 }
 
 void VertexShaderManager::InvalidateXFRange(int start, int end)
@@ -758,6 +787,27 @@ void VertexShaderManager::ResetView()
   bProjectionChanged = true;
 }
 
+void VertexShaderManager::SetVertexFormat(u32 components)
+{
+  if (components != constants.components)  // only mark the buffer dirty on an actual change
+  {
+    constants.components = components;
+    dirty = true;
+  }
+}
+
+void VertexShaderManager::SetTexMatrixInfoChanged(int index)
+{
+  // TODO: Should we track this with more precision, like which indices changed?
+  // The whole vertex constants are probably going to be uploaded regardless.
+  bTexMtxInfoChanged = true;
+}
+
+void VertexShaderManager::SetLightingConfigChanged()
+{
+  bLightingConfigChanged = true;
+}
+
 void VertexShaderManager::TransformToClipSpace(const float* data, float* out, u32 MtxIdx)
 {
   const float* world_matrix = &xfmem.posMatrices[(MtxIdx & 0x3f) * 4];
@@ -800,6 +850,8 @@ void VertexShaderManager::DoState(PointerWrap& p)
   p.Do(bPosNormalMatrixChanged);
   p.Do(bProjectionChanged);
   p.Do(bViewportChanged);
+  p.Do(bTexMtxInfoChanged);  // NOTE(review): savestate layout changes - confirm version bump
+  p.Do(bLightingConfigChanged);
 
   p.Do(constants);
 
diff --git a/Source/Core/VideoCommon/VertexShaderManager.h b/Source/Core/VideoCommon/VertexShaderManager.h
index 86042437c3..b2c707db1f 100644
--- a/Source/Core/VideoCommon/VertexShaderManager.h
+++ b/Source/Core/VideoCommon/VertexShaderManager.h
@@ -36,6 +36,10 @@ public:
   static void RotateView(float x, float y);
   static void ResetView();
 
+  static void SetVertexFormat(u32 components);
+  static void SetTexMatrixInfoChanged(int index);  // index is currently unused
+  static void SetLightingConfigChanged();
+
   // data: 3 floats representing the X, Y and Z vertex model coordinates and the posmatrix index.
// out: 4 floats which will be initialized with the corresponding clip space coordinates // NOTE: g_fProjectionMatrix must be up to date when this is called diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj b/Source/Core/VideoCommon/VideoCommon.vcxproj index 0d1bc70f4e..d8c43256e1 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcxproj +++ b/Source/Core/VideoCommon/VideoCommon.vcxproj @@ -75,6 +75,7 @@ + @@ -137,6 +138,7 @@ + diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters index 43f6a29b84..8e5b9fef31 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters +++ b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters @@ -185,6 +185,9 @@ Shader Generators + + Shader Generators + @@ -350,6 +353,9 @@ Shader Generators + + Shader Generators + diff --git a/Source/Core/VideoCommon/XFStructs.cpp b/Source/Core/VideoCommon/XFStructs.cpp index dbe3f21e02..41a3a3e447 100644 --- a/Source/Core/VideoCommon/XFStructs.cpp +++ b/Source/Core/VideoCommon/XFStructs.cpp @@ -56,6 +56,7 @@ static void XFRegWritten(int transferSize, u32 baseAddress, DataReader src) case XFMEM_SETNUMCHAN: if (xfmem.numChan.numColorChans != (newValue & 3)) g_vertex_manager->Flush(); + VertexShaderManager::SetLightingConfigChanged(); break; case XFMEM_SETCHAN0_AMBCOLOR: // Channel Ambient Color @@ -88,11 +89,13 @@ static void XFRegWritten(int transferSize, u32 baseAddress, DataReader src) case XFMEM_SETCHAN1_ALPHA: if (((u32*)&xfmem)[address] != (newValue & 0x7fff)) g_vertex_manager->Flush(); + VertexShaderManager::SetLightingConfigChanged(); break; case XFMEM_DUALTEX: if (xfmem.dualTexTrans.enabled != (newValue & 1)) g_vertex_manager->Flush(); + VertexShaderManager::SetTexMatrixInfoChanged(-1); break; case XFMEM_SETMATRIXINDA: @@ -146,6 +149,7 @@ static void XFRegWritten(int transferSize, u32 baseAddress, DataReader src) case XFMEM_SETTEXMTXINFO + 6: case XFMEM_SETTEXMTXINFO + 7: g_vertex_manager->Flush(); + 
VertexShaderManager::SetTexMatrixInfoChanged(address - XFMEM_SETTEXMTXINFO); nextAddress = XFMEM_SETTEXMTXINFO + 8; break; @@ -159,6 +163,7 @@ static void XFRegWritten(int transferSize, u32 baseAddress, DataReader src) case XFMEM_SETPOSMTXINFO + 6: case XFMEM_SETPOSMTXINFO + 7: g_vertex_manager->Flush(); + VertexShaderManager::SetTexMatrixInfoChanged(address - XFMEM_SETPOSMTXINFO); nextAddress = XFMEM_SETPOSMTXINFO + 8; break;