diff --git a/Source/Core/VideoBackends/Software/Tev.cpp b/Source/Core/VideoBackends/Software/Tev.cpp
index 65227339e4..be6f14d610 100644
--- a/Source/Core/VideoBackends/Software/Tev.cpp
+++ b/Source/Core/VideoBackends/Software/Tev.cpp
@@ -6,6 +6,7 @@
 
 #include <algorithm>
 #include <cmath>
+#include <cstring>
 
 #include "Common/ChunkFile.h"
 #include "Common/CommonTypes.h"
@@ -485,20 +486,16 @@ void Tev::Indirect(unsigned int stageNum, s32 s, s32 t)
 
   // matrix multiply - results might overflow, but we don't care since we only use the lower 24 bits
   // of the result.
-  const int indmtxid = indirect.mid & 3;
-  if (indmtxid)
+  if (indirect.matrix_index != IndMtxIndex::Off)
   {
-    const IND_MTX& indmtx = bpmem.indmtx[indmtxid - 1];
-    const int scale =
-        ((u32)indmtx.col0.s0 << 0) | ((u32)indmtx.col1.s1 << 2) | ((u32)indmtx.col2.s2 << 4);
+    const IND_MTX& indmtx = bpmem.indmtx[static_cast<u32>(indirect.matrix_index.Value()) - 1];
 
-    int shift;
+    const int shift = 17 - indmtx.GetScale();
 
-    switch (indirect.mid & 12)
+    switch (indirect.matrix_id)
     {
-    case 0:
+    case IndMtxId::Indirect:
       // matrix values are S0.10, output format is S17.7, so divide by 8
-      shift = (17 - scale);
       indtevtrans[0] = (indmtx.col0.ma * indcoord[0] + indmtx.col1.mc * indcoord[1] +
                         indmtx.col2.me * indcoord[2]) >>
                        3;
@@ -506,25 +503,29 @@ void Tev::Indirect(unsigned int stageNum, s32 s, s32 t)
                         indmtx.col2.mf * indcoord[2]) >>
                        3;
       break;
-    case 4:  // s matrix
+    case IndMtxId::S:
       // s is S17.7, matrix elements are divided by 256, output is S17.7, so divide by 256. - TODO:
       // Maybe, since s is actually stored as S24, we should divide by 256*64?
-      shift = (17 - scale);
       indtevtrans[0] = s * indcoord[0] / 256;
       indtevtrans[1] = t * indcoord[0] / 256;
       break;
-    case 8:  // t matrix
-      shift = (17 - scale);
+    case IndMtxId::T:
       indtevtrans[0] = s * indcoord[1] / 256;
       indtevtrans[1] = t * indcoord[1] / 256;
       break;
     default:
+      PanicAlertFmt("Invalid indirect matrix ID {}", indirect.matrix_id);
       return;
     }
 
     indtevtrans[0] = shift >= 0 ? indtevtrans[0] >> shift : indtevtrans[0] << -shift;
     indtevtrans[1] = shift >= 0 ? indtevtrans[1] >> shift : indtevtrans[1] << -shift;
   }
+  else
+  {
+    // If matrix_index is Off (0), matrix_id should be Indirect (0)
+    ASSERT(indirect.matrix_id == IndMtxId::Indirect);
+  }
 
   if (indirect.fb_addprev)
   {
@@ -559,9 +560,16 @@ void Tev::Draw()
     const int stageNum2 = stageNum >> 1;
     const int stageOdd = stageNum & 1;
 
-    const u32 texcoordSel = bpmem.tevindref.getTexCoord(stageNum);
+    u32 texcoordSel = bpmem.tevindref.getTexCoord(stageNum);
     const u32 texmap = bpmem.tevindref.getTexMap(stageNum);
 
+    // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
+    // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
+    // This affects the Mario portrait in Luigi's Mansion, where the developers forgot to set
+    // the number of tex gens to 2 (bug 11462).
+    if (texcoordSel >= bpmem.genMode.numtexgens)
+      texcoordSel = 0;
+
     const TEXSCALE& texscale = bpmem.texscale[stageNum2];
     const s32 scaleS = stageOdd ? texscale.ss1 : texscale.ss0;
     const s32 scaleT = stageOdd ? texscale.ts1 : texscale.ts0;
@@ -592,8 +600,13 @@ void Tev::Draw()
     const TevStageCombiner::ColorCombiner& cc = bpmem.combiners[stageNum].colorC;
     const TevStageCombiner::AlphaCombiner& ac = bpmem.combiners[stageNum].alphaC;
 
-    const int texcoordSel = order.getTexCoord(stageOdd);
-    const int texmap = order.getTexMap(stageOdd);
+    u32 texcoordSel = order.getTexCoord(stageOdd);
+    const u32 texmap = order.getTexMap(stageOdd);
+
+    // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
+    // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
+    if (texcoordSel >= bpmem.genMode.numtexgens)
+      texcoordSel = 0;
 
     Indirect(stageNum, Uv[texcoordSel].s, Uv[texcoordSel].t);
 
@@ -603,8 +616,17 @@ void Tev::Draw()
       // RGBA
       u8 texel[4];
 
-      TextureSampler::Sample(TexCoord.s, TexCoord.t, TextureLod[stageNum], TextureLinear[stageNum],
-                             texmap, texel);
+      if (bpmem.genMode.numtexgens > 0)
+      {
+        TextureSampler::Sample(TexCoord.s, TexCoord.t, TextureLod[stageNum],
+                               TextureLinear[stageNum], texmap, texel);
+      }
+      else
+      {
+        // It seems like the result is always black when no tex coords are enabled, but further
+        // hardware testing is needed.
+        std::memset(texel, 0, 4);
+      }
 
 #if ALLOW_TEV_DUMPS
       if (g_ActiveConfig.bDumpTevTextureFetches)
diff --git a/Source/Core/VideoCommon/BPMemory.h b/Source/Core/VideoCommon/BPMemory.h
index f22ceda2cd..569b1747f3 100644
--- a/Source/Core/VideoCommon/BPMemory.h
+++ b/Source/Core/VideoCommon/BPMemory.h
@@ -300,6 +300,31 @@ struct fmt::formatter<IndTexBias> : EnumFormatter<IndTexBias::STU>
   formatter() : EnumFormatter({"None", "S", "T", "ST", "U", "SU", "TU", "STU"}) {}
 };
 
+enum class IndMtxIndex : u32
+{
+  Off = 0,
+  Matrix0 = 1,
+  Matrix1 = 2,
+  Matrix2 = 3,
+};
+template <>
+struct fmt::formatter<IndMtxIndex> : EnumFormatter<IndMtxIndex::Matrix2>
+{
+  formatter() : EnumFormatter({"Off", "Matrix 0", "Matrix 1", "Matrix 2"}) {}
+};
+
+enum class IndMtxId : u32
+{
+  Indirect = 0,
+  S = 1,
+  T = 2,
+};
+template <>
+struct fmt::formatter<IndMtxId> : EnumFormatter<IndMtxId::T>
+{
+  formatter() : EnumFormatter({"Indirect", "S", "T"}) {}
+};
+
 // Indirect texture bump alpha
 enum class IndTexBumpAlpha : u32
 {
@@ -335,7 +360,7 @@ union IND_MTXA
 {
   BitField<0, 11, s32> ma;
   BitField<11, 11, s32> mb;
-  BitField<22, 2, u32> s0;  // bits 0-1 of scale factor
+  BitField<22, 2, u8, u32> s0;  // bits 0-1 of scale factor
   u32 hex;
 };
 
@@ -343,7 +368,7 @@ union IND_MTXB
 {
   BitField<0, 11, s32> mc;
   BitField<11, 11, s32> md;
-  BitField<22, 2, u32> s1;  // bits 2-3 of scale factor
+  BitField<22, 2, u8, u32> s1;  // bits 2-3 of scale factor
   u32 hex;
 };
 
@@ -351,7 +376,7 @@ union IND_MTXC
 {
   BitField<0, 11, s32> me;
   BitField<11, 11, s32> mf;
-  BitField<22, 2, u32> s2;  // bits 4-5 of scale factor
+  BitField<22, 2, u8, u32> s2;  // bits 4-5 of scale factor
   u32 hex;
 };
 
@@ -360,6 +385,7 @@ struct IND_MTX
   IND_MTXA col0;
   IND_MTXB col1;
   IND_MTXC col2;
+  u8 GetScale() const { return (col0.s0 << 0) | (col1.s1 << 2) | (col2.s2 << 4); }
 };
 
 union IND_IMASK
@@ -475,8 +501,12 @@ union TevStageIndirect
   BitField<4, 1, bool, u32> bias_s;
   BitField<5, 1, bool, u32> bias_t;
   BitField<6, 1, bool, u32> bias_u;
-  BitField<7, 2, IndTexBumpAlpha> bs;     // Indicates which coordinate will become the 'bump alpha'
-  BitField<9, 4, u32> mid;                // Matrix ID to multiply offsets with
+  BitField<7, 2, IndTexBumpAlpha> bs;  // Indicates which coordinate will become the 'bump alpha'
+  // Indicates which indirect matrix is used when matrix_id is Indirect.
+  // Also always indicates which indirect matrix to use for the scale factor, even with S or T.
+  BitField<9, 2, IndMtxIndex> matrix_index;
+  // Should be set to Indirect (0) if matrix_index is Off (0)
+  BitField<11, 2, IndMtxId> matrix_id;
   BitField<13, 3, IndTexWrap> sw;         // Wrapping factor for S of regular coord
   BitField<16, 3, IndTexWrap> tw;         // Wrapping factor for T of regular coord
   BitField<19, 1, bool, u32> lb_utclod;   // Use modified or unmodified texture
@@ -492,9 +522,9 @@ union TevStageIndirect
 
   u32 fullhex;
 
-  // If bs and mid are zero, the result of the stage is independent of
+  // If bs and matrix are zero, the result of the stage is independent of
   // the texture sample data, so we can skip sampling the texture.
-  bool IsActive() const { return bs != IndTexBumpAlpha::Off || mid != 0; }
+  bool IsActive() const { return bs != IndTexBumpAlpha::Off || matrix_index != IndMtxIndex::Off; }
 };
 template <>
 struct fmt::formatter<TevStageIndirect>
@@ -508,13 +538,15 @@ struct fmt::formatter<TevStageIndirect>
                      "Format: {}\n"
                      "Bias: {}\n"
                      "Bump alpha: {}\n"
+                     "Offset matrix index: {}\n"
                      "Offset matrix ID: {}\n"
                      "Regular coord S wrapping factor: {}\n"
                      "Regular coord T wrapping factor: {}\n"
                      "Use modified texture coordinates for LOD computation: {}\n"
                      "Add texture coordinates from previous TEV stage: {}",
-                     tevind.bt, tevind.fmt, tevind.bias, tevind.bs, tevind.mid, tevind.sw,
-                     tevind.tw, tevind.lb_utclod ? "Yes" : "No", tevind.fb_addprev ? "Yes" : "No");
+                     tevind.bt, tevind.fmt, tevind.bias, tevind.bs, tevind.matrix_index,
+                     tevind.matrix_id, tevind.sw, tevind.tw, tevind.lb_utclod ? "Yes" : "No",
+                     tevind.fb_addprev ? "Yes" : "No");
   }
 };
 
@@ -600,13 +632,13 @@ struct fmt::formatter<TEXSCALE>
 union RAS1_IREF
 {
   BitField<0, 3, u32> bi0;  // Indirect tex stage 0 ntexmap
-  BitField<3, 3, u32> bc0;  // Indirect tex stage 0 ntexmap
+  BitField<3, 3, u32> bc0;  // Indirect tex stage 0 ntexcoord
   BitField<6, 3, u32> bi1;
   BitField<9, 3, u32> bc1;
   BitField<12, 3, u32> bi2;
-  BitField<15, 3, u32> bc3;  // Typo?
-  BitField<18, 3, u32> bi4;
-  BitField<21, 3, u32> bc4;
+  BitField<15, 3, u32> bc2;
+  BitField<18, 3, u32> bi3;
+  BitField<21, 3, u32> bc3;
   u32 hex;
 
   u32 getTexCoord(int i) const { return (hex >> (6 * i + 3)) & 7; }
@@ -625,8 +657,8 @@ struct fmt::formatter<RAS1_IREF>
                      "Stage 1 ntexmap: {}\nStage 1 ntexcoord: {}\n"
                      "Stage 2 ntexmap: {}\nStage 2 ntexcoord: {}\n"
                      "Stage 3 ntexmap: {}\nStage 3 ntexcoord: {}",
-                     indref.bi0, indref.bc0, indref.bi1, indref.bc1, indref.bi2, indref.bc3,
-                     indref.bi4, indref.bc4);
+                     indref.bi0, indref.bc0, indref.bi1, indref.bc1, indref.bi2, indref.bc2,
+                     indref.bi3, indref.bc3);
   }
 };
 
@@ -911,6 +943,8 @@ union GenMode
   BitField<7, 1, u32> unused;              // 1 bit unused?
   BitField<8, 1, bool, u32> flat_shading;  // unconfirmed
   BitField<9, 1, bool, u32> multisampling;
+  // This value is 1 less than the actual number (0-15 map to 1-16).
+  // In other words there is always at least 1 tev stage
   BitField<10, 4, u32> numtevstages;
   BitField<14, 2, CullMode> cullmode;
   BitField<16, 3, u32> numindstages;
@@ -937,7 +971,7 @@ struct fmt::formatter<GenMode>
                      "ZFreeze: {}",
                      mode.numtexgens, mode.numcolchans, mode.unused,
                      mode.flat_shading ? "Yes" : "No", mode.multisampling ? "Yes" : "No",
-                     mode.numtevstages, mode.cullmode, mode.numindstages,
+                     mode.numtevstages + 1, mode.cullmode, mode.numindstages,
                      mode.zfreeze ? "Yes" : "No");
   }
 };
@@ -1912,7 +1946,7 @@ struct BPMemory
   GenMode genMode;
   u32 display_copy_filter[4];  // 01-04
   u32 unknown;                 // 05
-  // indirect matrices (set by GXSetIndTexMtx, selected by TevStageIndirect::mid)
+  // indirect matrices (set by GXSetIndTexMtx, selected by TevStageIndirect::matrix_index)
   // abc form a 2x3 offset matrix, there's 3 such matrices
   // the 3 offset matrices can either be indirect type, S-type, or T-type
   // 6bit scale factor s is distributed across IND_MTXA/B/C.
diff --git a/Source/Core/VideoCommon/GXPipelineTypes.h b/Source/Core/VideoCommon/GXPipelineTypes.h
index 1ddd853597..8892d386f5 100644
--- a/Source/Core/VideoCommon/GXPipelineTypes.h
+++ b/Source/Core/VideoCommon/GXPipelineTypes.h
@@ -20,6 +20,7 @@ namespace VideoCommon
 // As pipelines encompass both shader UIDs and render states, changes to either of these should
 // also increment the pipeline UID version. Incrementing the UID version will cause all UID
 // caches to be invalidated.
+// TODO: Remove PixelShaderUid hasindstage on the next UID version bump
 constexpr u32 GX_PIPELINE_UID_VERSION = 2;  // Last changed in PR 9122
 
 struct GXPipelineUid
diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index ab59bfd52a..636bf2be7f 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -220,13 +220,10 @@ PixelShaderUid GetPixelShaderUid()
 
   // indirect texture map lookup
   int nIndirectStagesUsed = 0;
-  if (uid_data->genMode_numindstages > 0)
+  for (unsigned int i = 0; i < numStages; ++i)
   {
-    for (unsigned int i = 0; i < numStages; ++i)
-    {
-      if (bpmem.tevind[i].IsActive() && bpmem.tevind[i].bt < uid_data->genMode_numindstages)
-        nIndirectStagesUsed |= 1 << bpmem.tevind[i].bt;
-    }
+    if (bpmem.tevind[i].IsActive())
+      nIndirectStagesUsed |= 1 << bpmem.tevind[i].bt;
   }
 
   uid_data->nIndirectStagesUsed = nIndirectStagesUsed;
@@ -238,16 +235,14 @@ PixelShaderUid GetPixelShaderUid()
 
   for (unsigned int n = 0; n < numStages; n++)
   {
-    int texcoord = bpmem.tevorders[n / 2].getTexCoord(n & 1);
-    bool bHasTexCoord = (u32)texcoord < bpmem.genMode.numtexgens;
-    // HACK to handle cases where the tex gen is not enabled
-    if (!bHasTexCoord)
-      texcoord = bpmem.genMode.numtexgens;
+    uid_data->stagehash[n].tevorders_texcoord = bpmem.tevorders[n / 2].getTexCoord(n & 1);
 
+    // hasindstage previously was used as a criterion to set tevind to 0, but there are variables in
+    // tevind that are used even if the indirect stage is disabled, so now it is only left in to
+    // avoid breaking existing UIDs (in most cases, games will have 0 in tevind anyways)
+    // TODO: Remove hasindstage on the next UID version bump
     uid_data->stagehash[n].hasindstage = bpmem.tevind[n].bt < bpmem.genMode.numindstages;
-    uid_data->stagehash[n].tevorders_texcoord = texcoord;
-    if (uid_data->stagehash[n].hasindstage)
-      uid_data->stagehash[n].tevind = bpmem.tevind[n].hex;
+    uid_data->stagehash[n].tevind = bpmem.tevind[n].hex;
 
     TevStageCombiner::ColorCombiner& cc = bpmem.combiners[n].colorC;
     TevStageCombiner::AlphaCombiner& ac = bpmem.combiners[n].alphaC;
@@ -361,7 +356,7 @@ void ClearUnusedPixelShaderUidBits(APIType api_type, const ShaderHostConfig& hos
   uid_data->bounding_box &= host_config.bounding_box & host_config.backend_bbox;
 }
 
-void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type, u32 num_texgens,
+void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type,
                                   const ShaderHostConfig& host_config, bool bounding_box)
 {
   // dot product for integer vectors
@@ -546,8 +541,7 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
             uid_data->genMode_numtexgens, uid_data->genMode_numindstages);
 
   // Stuff that is shared between ubershaders and pixelgen.
-  WritePixelShaderCommonHeader(out, api_type, uid_data->genMode_numtexgens, host_config,
-                               uid_data->bounding_box);
+  WritePixelShaderCommonHeader(out, api_type, host_config, uid_data->bounding_box);
 
   if (uid_data->forced_early_z && g_ActiveConfig.backend_info.bSupportsEarlyZ)
   {
@@ -775,9 +769,11 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
       out.Write("col1 = float4(0.0, 0.0, 0.0, 0.0);\n");
   }
 
-  // HACK to handle cases where the tex gen is not enabled
   if (uid_data->genMode_numtexgens == 0)
   {
+    // TODO: This is a hack to ensure that shaders still compile when setting out of bounds tex
+    // coord indices to 0.  Ideally, it shouldn't exist at all, but the exact behavior hasn't been
+    // tested.
     out.Write("\tint2 fixpoint_uv0 = int2(0, 0);\n\n");
   }
   else
@@ -796,24 +792,34 @@ ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& hos
   {
     if ((uid_data->nIndirectStagesUsed & (1U << i)) != 0)
     {
-      const u32 texcoord = uid_data->GetTevindirefCoord(i);
+      u32 texcoord = uid_data->GetTevindirefCoord(i);
       const u32 texmap = uid_data->GetTevindirefMap(i);
 
-      if (texcoord < uid_data->genMode_numtexgens)
-      {
-        out.SetConstantsUsed(C_INDTEXSCALE + i / 2, C_INDTEXSCALE + i / 2);
-        out.Write("\ttempcoord = fixpoint_uv{} >> " I_INDTEXSCALE "[{}].{};\n", texcoord, i / 2,
-                  (i & 1) != 0 ? "zw" : "xy");
-      }
-      else
-      {
-        out.Write("\ttempcoord = int2(0, 0);\n");
-      }
+      // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
+      // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
+      // This affects the Mario portrait in Luigi's Mansion, where the developers forgot to set
+      // the number of tex gens to 2 (bug 11462).
+      if (texcoord >= uid_data->genMode_numtexgens)
+        texcoord = 0;
+
+      out.SetConstantsUsed(C_INDTEXSCALE + i / 2, C_INDTEXSCALE + i / 2);
+      out.Write("\ttempcoord = fixpoint_uv{} >> " I_INDTEXSCALE "[{}].{};\n", texcoord, i / 2,
+                (i & 1) ? "zw" : "xy");
 
       out.Write("\tint3 iindtex{} = ", i);
       SampleTexture(out, "float2(tempcoord)", "abg", texmap, stereo, api_type);
     }
   }
+  for (u32 i = uid_data->genMode_numindstages; i < 4; i++)
+  {
+    // Referencing a stage above the number of ind stages is undefined behavior,
+    // and on console produces a noise pattern (details unknown).
+    // TODO: This behavior is nowhere near that, but it ensures the shader still compiles.
+    if ((uid_data->nIndirectStagesUsed & (1U << i)) != 0)
+    {
+      out.Write("\tint3 iindtex{} = int3(0, 0, 0);  // Undefined behavior on console\n", i);
+    }
+  }
 
   for (u32 i = 0; i < numStages; i++)
   {
@@ -950,17 +956,15 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
   const auto& stage = uid_data->stagehash[n];
   out.Write("\n\t// TEV stage {}\n", n);
 
-  // HACK to handle cases where the tex gen is not enabled
+  // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does not
+  // exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
   u32 texcoord = stage.tevorders_texcoord;
   const bool has_tex_coord = texcoord < uid_data->genMode_numtexgens;
   if (!has_tex_coord)
     texcoord = 0;
 
-  if (stage.hasindstage)
   {
-    TevStageIndirect tevind;
-    tevind.hex = stage.tevind;
-
+    const TevStageIndirect tevind{.hex = stage.tevind};
     out.Write("\t// indirect op\n");
 
     // Perform the indirect op on the incoming regular coordinates
@@ -991,7 +995,7 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
       // TODO: Should we reset alphabump to 0 here?
     }
 
-    if (tevind.mid != 0)
+    if (tevind.matrix_index != IndMtxIndex::Off)
     {
       // format
       static constexpr std::array<const char*, 4> tev_ind_fmt_mask{
@@ -1038,11 +1042,14 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
                   tev_ind_bias_add[u32(tevind.fmt.Value())]);
       }
 
+      // Multiplied by 2 because each matrix has two rows.
+      // Note also that the 4th column of the matrix contains the scale factor.
+      const u32 mtxidx = 2 * (static_cast<u32>(tevind.matrix_index.Value()) - 1);
+
       // multiply by offset matrix and scale - calculations are likely to overflow badly,
       // yet it works out since we only care about the lower 23 bits (+1 sign bit) of the result
-      if (tevind.mid <= 3)
+      if (tevind.matrix_id == IndMtxId::Indirect)
       {
-        const u32 mtxidx = 2 * (tevind.mid - 1);
         out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
 
         out.Write("\tint2 indtevtrans{} = int2(idot(" I_INDTEXMTX
@@ -1064,10 +1071,9 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
           out.Write("\telse indtevtrans{} <<= (-" I_INDTEXMTX "[{}].w);\n", n, mtxidx);
         }
       }
-      else if (tevind.mid <= 7 && has_tex_coord)
-      {  // s matrix
-        ASSERT(tevind.mid >= 5);
-        const u32 mtxidx = 2 * (tevind.mid - 5);
+      else if (tevind.matrix_id == IndMtxId::S)
+      {
+        ASSERT(has_tex_coord);
         out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
 
         out.Write("\tint2 indtevtrans{} = int2(fixpoint_uv{} * iindtevcrd{}.xx) >> 8;\n", n,
@@ -1086,10 +1092,9 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
           out.Write("\telse indtevtrans{} <<= (-" I_INDTEXMTX "[{}].w);\n", n, mtxidx);
         }
       }
-      else if (tevind.mid <= 11 && has_tex_coord)
-      {  // t matrix
-        ASSERT(tevind.mid >= 9);
-        const u32 mtxidx = 2 * (tevind.mid - 9);
+      else if (tevind.matrix_id == IndMtxId::T)
+      {
+        ASSERT(has_tex_coord);
         out.SetConstantsUsed(C_INDTEXMTX + mtxidx, C_INDTEXMTX + mtxidx);
 
         out.Write("\tint2 indtevtrans{} = int2(fixpoint_uv{} * iindtevcrd{}.yy) >> 8;\n", n,
@@ -1112,20 +1117,22 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
       else
       {
         out.Write("\tint2 indtevtrans{} = int2(0, 0);\n", n);
+        ASSERT(false);  // Unknown value for matrix_id
       }
     }
     else
     {
       out.Write("\tint2 indtevtrans{} = int2(0, 0);\n", n);
+      // If matrix_index is Off (0), matrix_id should be Indirect (0)
+      ASSERT(tevind.matrix_id == IndMtxId::Indirect);
     }
 
     // ---------
     // Wrapping
     // ---------
 
-    // TODO: Should the last element be 1 or (1<<7)?
-    static constexpr std::array<const char*, 7> tev_ind_wrap_start{
-        "0", "(256<<7)", "(128<<7)", "(64<<7)", "(32<<7)", "(16<<7)", "1",
+    static constexpr std::array<const char*, 5> tev_ind_wrap_start{
+        "(256<<7)", "(128<<7)", "(64<<7)", "(32<<7)", "(16<<7)",
     };
 
     // wrap S
@@ -1133,14 +1140,14 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
     {
       out.Write("\twrappedcoord.x = fixpoint_uv{}.x;\n", texcoord);
     }
-    else if (tevind.sw == IndTexWrap::ITW_0)
+    else if (tevind.sw >= IndTexWrap::ITW_0)  // 7 (Invalid) appears to behave the same as 6 (ITW_0)
     {
       out.Write("\twrappedcoord.x = 0;\n");
     }
     else
     {
       out.Write("\twrappedcoord.x = fixpoint_uv{}.x & ({} - 1);\n", texcoord,
-                tev_ind_wrap_start[u32(tevind.sw.Value())]);
+                tev_ind_wrap_start[u32(tevind.sw.Value()) - u32(IndTexWrap::ITW_256)]);
     }
 
     // wrap T
@@ -1148,14 +1155,14 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
     {
       out.Write("\twrappedcoord.y = fixpoint_uv{}.y;\n", texcoord);
     }
-    else if (tevind.tw == IndTexWrap::ITW_0)
+    else if (tevind.tw >= IndTexWrap::ITW_0)  // 7 (Invalid) appears to behave the same as 6 (ITW_0)
     {
       out.Write("\twrappedcoord.y = 0;\n");
     }
     else
     {
       out.Write("\twrappedcoord.y = fixpoint_uv{}.y & ({} - 1);\n", texcoord,
-                tev_ind_wrap_start[u32(tevind.tw.Value())]);
+                tev_ind_wrap_start[u32(tevind.tw.Value()) - u32(IndTexWrap::ITW_256)]);
     }
 
     if (tevind.fb_addprev)  // add previous tevcoord
@@ -1191,7 +1198,7 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
     out.Write("\trastemp = {}.{};\n", tev_ras_table[u32(stage.tevorders_colorchan)], rasswap);
   }
 
-  if (stage.tevorders_enable)
+  if (stage.tevorders_enable && uid_data->genMode_numtexgens > 0)
   {
     // Generate swizzle string to represent the texture color channel swapping
     const char texswap[5] = {
@@ -1202,17 +1209,15 @@ static void WriteStage(ShaderCode& out, const pixel_shader_uid_data* uid_data, i
         '\0',
     };
 
-    if (!stage.hasindstage)
-    {
-      // calc tevcord
-      if (has_tex_coord)
-        out.Write("\ttevcoord.xy = fixpoint_uv{};\n", texcoord);
-      else
-        out.Write("\ttevcoord.xy = int2(0, 0);\n");
-    }
     out.Write("\ttextemp = ");
     SampleTexture(out, "float2(tevcoord.xy)", texswap, stage.tevorders_texmap, stereo, api_type);
   }
+  else if (uid_data->genMode_numtexgens == 0)
+  {
+    // It seems like the result is always black when no tex coords are enabled, but further testing
+    // is needed.
+    out.Write("\ttextemp = int4(0, 0, 0, 0);\n");
+  }
   else
   {
     out.Write("\ttextemp = int4(255, 255, 255, 255);\n");
diff --git a/Source/Core/VideoCommon/PixelShaderGen.h b/Source/Core/VideoCommon/PixelShaderGen.h
index a9ffa25498..d75fac8eb7 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.h
+++ b/Source/Core/VideoCommon/PixelShaderGen.h
@@ -66,9 +66,9 @@ struct pixel_shader_uid_data
   u32 tevindref_bi1 : 3;
   u32 tevindref_bc1 : 3;
   u32 tevindref_bi2 : 3;
+  u32 tevindref_bc2 : 3;
+  u32 tevindref_bi3 : 3;
   u32 tevindref_bc3 : 3;
-  u32 tevindref_bi4 : 3;
-  u32 tevindref_bc4 : 3;
 
   void SetTevindrefValues(int index, u32 texcoord, u32 texmap)
   {
@@ -84,55 +84,39 @@ struct pixel_shader_uid_data
     }
     else if (index == 2)
     {
-      tevindref_bc3 = texcoord;
+      tevindref_bc2 = texcoord;
       tevindref_bi2 = texmap;
     }
     else if (index == 3)
     {
-      tevindref_bc4 = texcoord;
-      tevindref_bi4 = texmap;
+      tevindref_bc3 = texcoord;
+      tevindref_bi3 = texmap;
     }
   }
 
   u32 GetTevindirefCoord(int index) const
   {
     if (index == 0)
-    {
       return tevindref_bc0;
-    }
     else if (index == 1)
-    {
       return tevindref_bc1;
-    }
     else if (index == 2)
-    {
-      return tevindref_bc3;
-    }
+      return tevindref_bc2;
     else if (index == 3)
-    {
-      return tevindref_bc4;
-    }
+      return tevindref_bc3;
     return 0;
   }
 
   u32 GetTevindirefMap(int index) const
   {
     if (index == 0)
-    {
       return tevindref_bi0;
-    }
     else if (index == 1)
-    {
       return tevindref_bi1;
-    }
     else if (index == 2)
-    {
       return tevindref_bi2;
-    }
     else if (index == 3)
-    {
-      return tevindref_bi4;
-    }
+      return tevindref_bi3;
     return 0;
   }
 
@@ -149,6 +133,7 @@ struct pixel_shader_uid_data
     u32 pad1 : 6;
 
     // TODO: Clean up the swapXY mess
+    // TODO: remove hasindstage, as it no longer does anything useful
     u32 hasindstage : 1;
     u32 tevind : 21;
     u32 tevksel_swap1a : 2;
@@ -174,7 +159,7 @@ using PixelShaderUid = ShaderUid<pixel_shader_uid_data>;
 
 ShaderCode GeneratePixelShaderCode(APIType api_type, const ShaderHostConfig& host_config,
                                    const pixel_shader_uid_data* uid_data);
-void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type, u32 num_texgens,
+void WritePixelShaderCommonHeader(ShaderCode& out, APIType api_type,
                                   const ShaderHostConfig& host_config, bool bounding_box);
 void ClearUnusedPixelShaderUidBits(APIType api_type, const ShaderHostConfig& host_config,
                                    PixelShaderUid* uid);
diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp
index bae48623c7..0ae1ced1e4 100644
--- a/Source/Core/VideoCommon/PixelShaderManager.cpp
+++ b/Source/Core/VideoCommon/PixelShaderManager.cpp
@@ -148,29 +148,18 @@ void PixelShaderManager::SetConstants()
 
     for (u32 i = 0; i < (bpmem.genMode.numtevstages + 1); ++i)
     {
-      u32 stage = bpmem.tevind[i].bt;
-      if (stage < bpmem.genMode.numindstages)
-      {
-        // We set some extra bits so the ubershader can quickly check if these
-        // features are in use.
-        if (bpmem.tevind[i].IsActive())
-          constants.pack1[stage][3] =
-              bpmem.tevindref.getTexCoord(stage) | bpmem.tevindref.getTexMap(stage) << 8 | 1 << 16;
-        // Note: a tevind of zero just happens to be a passthrough, so no need
-        // to set an extra bit.
-        constants.pack1[i][2] = bpmem.tevind[i].hex;  // TODO: This match shadergen, but videosw
-                                                      // will always wrap.
+      // Note: a tevind of zero just happens to be a passthrough, so no need
+      // to set an extra bit.  Furthermore, wrap and add to previous apply even if there is no
+      // indirect stage.
+      constants.pack1[i][2] = bpmem.tevind[i].hex;
 
-        // The ubershader uses tevind != 0 as a condition whether to calculate texcoords,
-        // even when texture is disabled, instead of the stage < bpmem.genMode.numindstages.
-        // We set an unused bit here to indicate that the stage is active, even if it
-        // is just a pass-through.
-        constants.pack1[i][2] |= 0x80000000;
-      }
-      else
-      {
-        constants.pack1[i][2] = 0;
-      }
+      u32 stage = bpmem.tevind[i].bt;
+
+      // We use an extra bit (1 << 16) to provide a fast way of testing if this feature is in use.
+      // Note also that this is indexed by indirect stage, not by TEV stage.
+      if (bpmem.tevind[i].IsActive() && stage < bpmem.genMode.numindstages)
+        constants.pack1[stage][3] =
+            bpmem.tevindref.getTexCoord(stage) | bpmem.tevindref.getTexMap(stage) << 8 | 1 << 16;
     }
 
     dirty = true;
@@ -336,9 +325,7 @@ void PixelShaderManager::SetIndTexScaleChanged(bool high)
 
 void PixelShaderManager::SetIndMatrixChanged(int matrixidx)
 {
-  int scale = ((u32)bpmem.indmtx[matrixidx].col0.s0 << 0) |
-              ((u32)bpmem.indmtx[matrixidx].col1.s1 << 2) |
-              ((u32)bpmem.indmtx[matrixidx].col2.s2 << 4);
+  const u8 scale = bpmem.indmtx[matrixidx].GetScale();
 
   // xyz - static matrix
   // w - dynamic matrix scale / 128
diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp
index c35ce2821d..642896a3dc 100644
--- a/Source/Core/VideoCommon/UberShaderPixel.cpp
+++ b/Source/Core/VideoCommon/UberShaderPixel.cpp
@@ -64,7 +64,7 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
 
   out.Write("// Pixel UberShader for {} texgens{}{}\n", numTexgen,
             early_depth ? ", early-depth" : "", per_pixel_depth ? ", per-pixel depth" : "");
-  WritePixelShaderCommonHeader(out, ApiType, numTexgen, host_config, bounding_box);
+  WritePixelShaderCommonHeader(out, ApiType, host_config, bounding_box);
   WriteUberShaderCommonHeader(out, ApiType, host_config);
   if (per_pixel_lighting)
     WriteLightingFunction(out);
@@ -148,19 +148,16 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
   }
 
   // Uniform index -> texture coordinates
+  // Quirk: when the tex coord is not less than the number of tex gens (i.e. the tex coord does
+  // not exist), then tex coord 0 is used (though sometimes glitchy effects happen on console).
+  // This affects the Mario portrait in Luigi's Mansion, where the developers forgot to set
+  // the number of tex gens to 2 (bug 11462).
   if (numTexgen > 0)
   {
-    if (ApiType != APIType::D3D)
-    {
-      out.Write("float3 selectTexCoord(uint index) {{\n");
-    }
-    else
-    {
-      out.Write("float3 selectTexCoord(uint index");
-      for (u32 i = 0; i < numTexgen; i++)
-        out.Write(", float3 tex{}", i);
-      out.Write(") {{\n");
-    }
+    out.Write("int2 selectTexCoord(uint index");
+    for (u32 i = 0; i < numTexgen; i++)
+      out.Write(", int2 fixpoint_uv{}", i);
+    out.Write(") {{\n");
 
     if (ApiType == APIType::D3D)
     {
@@ -168,48 +165,51 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
       for (u32 i = 0; i < numTexgen; i++)
       {
         out.Write("  case {}u:\n"
-                  "    return tex{};\n",
+                  "    return fixpoint_uv{};\n",
                   i, i);
       }
       out.Write("  default:\n"
-                "    return float3(0.0, 0.0, 0.0);\n"
+                "    return fixpoint_uv0;\n"
                 "  }}\n");
     }
     else
     {
+      out.Write("  if (index >= {}u) {{\n", numTexgen);
+      out.Write("    return fixpoint_uv0;\n"
+                "  }}\n");
       if (numTexgen > 4)
         out.Write("  if (index < 4u) {{\n");
       if (numTexgen > 2)
         out.Write("    if (index < 2u) {{\n");
       if (numTexgen > 1)
-        out.Write("      return (index == 0u) ? tex0 : tex1;\n");
+        out.Write("      return (index == 0u) ? fixpoint_uv0 : fixpoint_uv1;\n");
       else
-        out.Write("      return (index == 0u) ? tex0 : float3(0.0, 0.0, 0.0);\n");
+        out.Write("      return fixpoint_uv0;\n");
       if (numTexgen > 2)
       {
-        out.Write("    }} else {{\n");  // >= 2
+        out.Write("    }} else {{\n");  // >= 2 < min(4, numTexgen)
         if (numTexgen > 3)
-          out.Write("      return (index == 2u) ? tex2 : tex3;\n");
+          out.Write("      return (index == 2u) ? fixpoint_uv2 : fixpoint_uv3;\n");
         else
-          out.Write("      return (index == 2u) ? tex2 : float3(0.0, 0.0, 0.0);\n");
+          out.Write("      return fixpoint_uv2;\n");
         out.Write("    }}\n");
       }
       if (numTexgen > 4)
       {
-        out.Write("  }} else {{\n");  // >= 4 <= 8
+        out.Write("  }} else {{\n");  // >= 4 < min(8, numTexgen)
         if (numTexgen > 6)
           out.Write("    if (index < 6u) {{\n");
         if (numTexgen > 5)
-          out.Write("      return (index == 4u) ? tex4 : tex5;\n");
+          out.Write("      return (index == 4u) ? fixpoint_uv4 : fixpoint_uv5;\n");
         else
-          out.Write("      return (index == 4u) ? tex4 : float3(0.0, 0.0, 0.0);\n");
+          out.Write("      return fixpoint_uv4;\n");
         if (numTexgen > 6)
         {
-          out.Write("    }} else {{\n");  // >= 6 <= 8
+          out.Write("    }} else {{\n");  // >= 6 < min(8, numTexgen)
           if (numTexgen > 7)
-            out.Write("      return (index == 6u) ? tex6 : tex7;\n");
+            out.Write("      return (index == 6u) ? fixpoint_uv6 : fixpoint_uv7;\n");
           else
-            out.Write("      return (index == 6u) ? tex6 : float3(0.0, 0.0, 0.0);\n");
+            out.Write("      return fixpoint_uv6;\n");
           out.Write("    }}\n");
         }
         out.Write("  }}\n");
@@ -287,15 +287,15 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
   // ======================
   const auto LookupIndirectTexture = [&out, stereo](std::string_view out_var_name,
                                                     std::string_view in_index_name) {
+    // in_index_name is the indirect stage, not the tev stage
+    // bpmem_iref is packed differently from RAS1_IREF
     out.Write("{{\n"
               "  uint iref = bpmem_iref({});\n"
               "  if ( iref != 0u)\n"
               "  {{\n"
               "    uint texcoord = bitfieldExtract(iref, 0, 3);\n"
               "    uint texmap = bitfieldExtract(iref, 8, 3);\n"
-              "    float3 uv = getTexCoord(texcoord);\n"
-              "    int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * " I_TEXDIMS
-              "[texcoord].zw);\n"
+              "    int2 fixedPoint_uv = getTexCoord(texcoord);\n"
               "\n"
               "    if (({} & 1u) == 0u)\n"
               "      fixedPoint_uv = fixedPoint_uv >> " I_INDTEXSCALE "[{} >> 1].xy;\n"
@@ -306,6 +306,10 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
               "[texmap].xy, {})).abg;\n",
               in_index_name, in_index_name, in_index_name, in_index_name, out_var_name,
               stereo ? "float(layer)" : "0.0");
+    // There is always a bit set in bpmem_iref if the data is valid (matrix is not off, and the
+    // indirect texture stage is enabled). If the matrix is off, the result doesn't matter; if the
+    // indirect texture stage is disabled, the result is undefined (and produces a glitchy pattern
+    // on hardware, different from this).
     out.Write("  }}\n"
               "  else\n"
               "  {{\n"
@@ -666,21 +670,14 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
         "\n");
   }
 
-  // Since the texture coodinate variables aren't global, we need to pass
-  // them to the select function in D3D.
+  // Since the fixed-point texture coodinate variables aren't global, we need to pass
+  // them to the select function.  This applies to all backends.
   if (numTexgen > 0)
   {
-    if (ApiType != APIType::D3D)
-    {
-      out.Write("#define getTexCoord(index) selectTexCoord((index))\n\n");
-    }
-    else
-    {
-      out.Write("#define getTexCoord(index) selectTexCoord((index)");
-      for (u32 i = 0; i < numTexgen; i++)
-        out.Write(", tex{}", i);
-      out.Write(")\n\n");
-    }
+    out.Write("#define getTexCoord(index) selectTexCoord((index)");
+    for (u32 i = 0; i < numTexgen; i++)
+      out.Write(", fixpoint_uv{}", i);
+    out.Write(")\n\n");
   }
 
   if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan)
@@ -788,11 +785,18 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
   // Disable texturing when there are no texgens (for now)
   if (numTexgen != 0)
   {
-    out.Write("    uint tex_coord = {};\n",
+    for (u32 i = 0; i < numTexgen; i++)
+    {
+      out.Write("    int2 fixpoint_uv{} = int2(", i);
+      out.Write("(tex{}.z == 0.0 ? tex{}.xy : tex{}.xy / tex{}.z)", i, i, i, i);
+      out.Write(" * " I_TEXDIMS "[{}].zw);\n", i);
+      // TODO: S24 overflows here?
+    }
+
+    out.Write("\n"
+              "    uint tex_coord = {};\n",
               BitfieldExtract<&TwoTevStageOrders::texcoord0>("ss.order"));
-    out.Write("    float3 uv = getTexCoord(tex_coord);\n"
-              "    int2 fixedPoint_uv = int2((uv.z == 0.0 ? uv.xy : (uv.xy / uv.z)) * " I_TEXDIMS
-              "[tex_coord].zw);\n"
+    out.Write("    int2 fixedPoint_uv = getTexCoord(tex_coord);\n"
               "\n"
               "    bool texture_enabled = (ss.order & {}u) != 0u;\n",
               1 << TwoTevStageOrders().enable0.StartBit());
@@ -806,7 +810,10 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
     out.Write("      uint fmt = {};\n", BitfieldExtract<&TevStageIndirect::fmt>("tevind"));
     out.Write("      uint bias = {};\n", BitfieldExtract<&TevStageIndirect::bias>("tevind"));
     out.Write("      uint bt = {};\n", BitfieldExtract<&TevStageIndirect::bt>("tevind"));
-    out.Write("      uint mid = {};\n", BitfieldExtract<&TevStageIndirect::mid>("tevind"));
+    out.Write("      uint matrix_index = {};\n",
+              BitfieldExtract<&TevStageIndirect::matrix_index>("tevind"));
+    out.Write("      uint matrix_id = {};\n",
+              BitfieldExtract<&TevStageIndirect::matrix_id>("tevind"));
     out.Write("\n");
     out.Write("      int3 indcoord;\n");
     LookupIndirectTexture("indcoord", "bt");
@@ -846,12 +853,12 @@ ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config,
               "\n"
               "      // Matrix multiply\n"
               "      int2 indtevtrans = int2(0, 0);\n"
-              "      if ((mid & 3u) != 0u)\n"
+              "      if (matrix_index != 0u)\n"
               "      {{\n"
-              "        uint mtxidx = 2u * ((mid & 3u) - 1u);\n"
+              "        uint mtxidx = 2u * (matrix_index - 1u);\n"
               "        int shift = " I_INDTEXMTX "[mtxidx].w;\n"
               "\n"
-              "        switch (mid >> 2)\n"
+              "        switch (matrix_id)\n"
               "        {{\n"
               "        case 0u: // 3x2 S0.10 matrix\n"
               "          indtevtrans = int2(idot(" I_INDTEXMTX