Improve Performance of Granular Buffering + User Adjustable Buffer Size

2025-07-25 23:29:44 -06:00 · 2025-04-01 18:17:21 -04:00
parent 1b85da9b85
commit 877864c23a
8 changed files with 218 additions and 172 deletions
--- a/Source/Core/AudioCommon/Mixer.cpp
+++ b/Source/Core/AudioCommon/Mixer.cpp
@ -11,6 +11,7 @@
 #include "Common/ChunkFile.h"
 #include "Common/CommonTypes.h"
 #include "Common/Logging/Log.h"
+#include "Common/MathUtil.h"
 #include "Common/Swap.h"
 #include "Core/Config/MainSettings.h"
 #include "Core/Core.h"
@ -60,7 +61,9 @@ void Mixer::DoState(PointerWrap& p)
 // Executed from sound stream thread
 void Mixer::MixerFifo::Mix(s16* samples, std::size_t num_samples)
 {
-  constexpr u32 half = 0x80000000;
+  constexpr u32 INDEX_HALF = 0x80000000;
+  constexpr DT_s FADE_IN_RC = DT_s(0.008);
+  constexpr DT_s FADE_OUT_RC = DT_s(0.064);

  const u64 out_sample_rate = m_mixer->m_output_sample_rate;
  u64 in_sample_rate = FIXED_SAMPLE_RATE_DIVIDEND / m_input_sample_rate_divisor;
@ -69,33 +72,86 @@ void Mixer::MixerFifo::Mix(s16* samples, std::size_t num_samples)
  if (0 < emulation_speed && emulation_speed != 1.0)
    in_sample_rate = static_cast<u64>(std::llround(in_sample_rate * emulation_speed));

-  const u32 index_jump = (in_sample_rate << GRANULE_BUFFER_FRAC_BITS) / (out_sample_rate);
+  const u32 index_jump = (in_sample_rate << GRANULE_FRAC_BITS) / (out_sample_rate);
+
+  // These fade in / out multiplier are tuned to match a constant
+  // fade speed regardless of the input or the output sample rate.
+  const float fade_in_mul = -std::expm1(-DT_s(1.0) / (out_sample_rate * FADE_IN_RC));
+  const float fade_out_mul = -std::expm1(-DT_s(1.0) / (out_sample_rate * FADE_OUT_RC));

  const StereoPair volume{m_LVolume.load() / 256.0f, m_RVolume.load() / 256.0f};

+  // Calculate the ideal length of the granule queue.
+  const std::size_t buffer_size_ms = m_mixer->m_config_audio_buffer_ms;
+  const std::size_t buffer_size_samples = buffer_size_ms * in_sample_rate / 1000;
+
+  // Limit the possible queue sizes to any number between 4 and 64.
+  const std::size_t buffer_size_granules =
+      std::clamp((buffer_size_samples) / (GRANULE_SIZE >> 1), static_cast<std::size_t>(4),
+                 static_cast<std::size_t>(MAX_GRANULE_QUEUE_SIZE));
+
+  m_granule_queue_size.store(buffer_size_granules, std::memory_order_relaxed);
+
  while (num_samples-- > 0)
  {
-    StereoPair sample = Granule::InterpStereoPair(m_front, m_back, m_current_index);
-    sample *= volume;
-
-    sample.l += samples[0] + m_quantization_error.l;
-    samples[0] = ToShort(std::lround(sample.l));
-    m_quantization_error.l = std::clamp(sample.l - samples[0], -1.0f, 1.0f);
-
-    sample.r += samples[1] + m_quantization_error.r;
-    samples[1] = ToShort(std::lround(sample.r));
-    m_quantization_error.r = std::clamp(sample.r - samples[1], -1.0f, 1.0f);
-
-    samples += 2;
-
+    // The indexes for the front and back buffers are offset by 50% of the granule size.
+    // We use the modular nature of 32-bit integers to wrap around the granule size.
    m_current_index += index_jump;
-    if (m_current_index < half)
-    {
-      m_front = m_back;
+    const u32 front_index = m_current_index;
+    const u32 back_index = m_current_index + INDEX_HALF;
+
+    // If either index is less than the index jump, that means we reached
+    // the end of the of the buffer and need to load the next granule.
+    if (front_index < index_jump)
+      Dequeue(&m_front);
+    else if (back_index < index_jump)
      Dequeue(&m_back);

-      m_current_index += half;
-    }
+    // The Granules are pre-windowed, so we can just add them together
+    const std::size_t ft = front_index >> GRANULE_FRAC_BITS;
+    const std::size_t bt = back_index >> GRANULE_FRAC_BITS;
+    const StereoPair s0 = m_front[(ft - 2) & GRANULE_MASK] + m_back[(bt - 2) & GRANULE_MASK];
+    const StereoPair s1 = m_front[(ft - 1) & GRANULE_MASK] + m_back[(bt - 1) & GRANULE_MASK];
+    const StereoPair s2 = m_front[(ft + 0) & GRANULE_MASK] + m_back[(bt + 0) & GRANULE_MASK];
+    const StereoPair s3 = m_front[(ft + 1) & GRANULE_MASK] + m_back[(bt + 1) & GRANULE_MASK];
+    const StereoPair s4 = m_front[(ft + 2) & GRANULE_MASK] + m_back[(bt + 2) & GRANULE_MASK];
+    const StereoPair s5 = m_front[(ft + 3) & GRANULE_MASK] + m_back[(bt + 3) & GRANULE_MASK];
+
+    // Polynomial Interpolators for High-Quality Resampling of
+    // Over Sampled Audio by Olli Niemitalo, October 2001.
+    // Page 43 -- 6-point, 3rd-order Hermite:
+    // https://yehar.com/blog/wp-content/uploads/2009/08/deip.pdf
+    const u32 t_frac = m_current_index & ((1 << GRANULE_FRAC_BITS) - 1);
+    const float t1 = t_frac / static_cast<float>(1 << GRANULE_FRAC_BITS);
+    const float t2 = t1 * t1;
+    const float t3 = t2 * t1;
+
+    StereoPair sample = (s0 * StereoPair{(+0.0f + 1.0f * t1 - 2.0f * t2 + 1.0f * t3) / 12.0f} +
+                         s1 * StereoPair{(+0.0f - 8.0f * t1 + 15.0f * t2 - 7.0f * t3) / 12.0f} +
+                         s2 * StereoPair{(+3.0f + 0.0f * t1 - 7.0f * t2 + 4.0f * t3) / 3.0f} +
+                         s3 * StereoPair{(+0.0f + 2.0f * t1 + 5.0f * t2 - 4.0f * t3) / 3.0f} +
+                         s4 * StereoPair{(+0.0f - 1.0f * t1 - 6.0f * t2 + 7.0f * t3) / 12.0f} +
+                         s5 * StereoPair{(+0.0f + 0.0f * t1 + 1.0f * t2 - 1.0f * t3) / 12.0f});
+
+    // Apply Fade In / Fade Out depending on if we are looping
+    if (m_queue_looping.load(std::memory_order_relaxed))
+      m_fade_volume += fade_out_mul * (0.0f - m_fade_volume);
+    else
+      m_fade_volume += fade_in_mul * (1.0f - m_fade_volume);
+
+    // Apply the fade volume and the regular volume to the sample
+    sample = sample * volume * StereoPair{m_fade_volume};
+
+    // This quantization method prevents accumulated error but does not do noise shaping.
+    sample.l += samples[0] - m_quantization_error.l;
+    samples[0] = MathUtil::SaturatingCast<s16>(std::lround(sample.l));
+    m_quantization_error.l = std::clamp(samples[0] - sample.l, -1.0f, 1.0f);
+
+    sample.r += samples[1] - m_quantization_error.r;
+    samples[1] = MathUtil::SaturatingCast<s16>(std::lround(sample.r));
+    m_quantization_error.r = std::clamp(samples[1] - sample.r, -1.0f, 1.0f);
+
+    samples += 2;
  }
 }

@ -152,13 +208,15 @@ void Mixer::MixerFifo::PushSamples(const s16* samples, std::size_t num_samples)
  {
    const s16 l = m_little_endian ? samples[1] : Common::swap16(samples[1]);
    const s16 r = m_little_endian ? samples[0] : Common::swap16(samples[0]);
-
-    m_buffer[m_buffer_index] = StereoPair(l, r);
-    m_buffer_index = (m_buffer_index + 1) & GRANULE_BUFFER_MASK;
    samples += 2;

-    if (m_buffer_index == 0 || m_buffer_index == m_buffer.size() / 2)
-      Enqueue(Granule(m_buffer, m_buffer_index));
+    m_next_buffer[m_next_buffer_index] = StereoPair(l, r);
+    m_next_buffer_index = (m_next_buffer_index + 1) & GRANULE_MASK;
+
+    // The granules overlap by 50%, so we need to enqueue the
+    // next buffer every time we fill half of the samples.
+    if (m_next_buffer_index == 0 || m_next_buffer_index == m_next_buffer.size() / 2)
+      Enqueue();
  }
 }

@ -347,7 +405,8 @@ void Mixer::StopLogDSPAudio()
 void Mixer::RefreshConfig()
 {
  m_config_emulation_speed = Config::Get(Config::MAIN_EMULATION_SPEED);
-  m_audio_fill_gaps = Config::Get(Config::MAIN_AUDIO_FILL_GAPS);
+  m_config_fill_audio_gaps = Config::Get(Config::MAIN_AUDIO_FILL_GAPS);
+  m_config_audio_buffer_ms = Config::Get(Config::MAIN_AUDIO_BUFFER_SIZE);
 }

 void Mixer::MixerFifo::DoState(PointerWrap& p)
@ -378,78 +437,16 @@ std::pair<s32, s32> Mixer::MixerFifo::GetVolume() const
  return std::make_pair(m_LVolume.load(), m_RVolume.load());
 }

-void Mixer::MixerFifo::Enqueue(const Granule& granule)
-{
-  const std::size_t head = m_queue_head.load(std::memory_order_relaxed);
-
-  std::size_t next_head = (head + 1) % GRANULE_QUEUE_SIZE;
-  if (next_head == m_queue_tail.load(std::memory_order_acquire))
-    next_head = (head + GRANULE_QUEUE_SIZE / 2) % GRANULE_QUEUE_SIZE;
-
-  m_queue[head] = granule;
-  m_queue_head.store(next_head, std::memory_order_release);
-
-  m_queue_looping.store(false, std::memory_order_relaxed);
-}
-
-void Mixer::MixerFifo::Dequeue(Granule* granule)
-{
-  // import numpy as np
-  // import scipy.signal as signal
-  // window = np.cumsum(signal.windows.dpss(32, 10))[::-1]
-  // window /= window.max()
-  // elements = ", ".join([f"{x:.10f}f" for x in window])
-  // print(f'constexpr std::array<StereoPair, {len(window)}> FADE_WINDOW = {{ {elements} }};')
-  constexpr std::array<float, 32> FADE_WINDOW = {
-      1.0000000000f, 0.9999999932f, 0.9999998472f, 0.9999982765f, 0.9999870876f, 0.9999278274f,
-      0.9996794215f, 0.9988227502f, 0.9963278433f, 0.9900772448f, 0.9764215513f, 0.9501402658f,
-      0.9052392639f, 0.8367449916f, 0.7430540364f, 0.6277889467f, 0.5000000000f, 0.3722110533f,
-      0.2569459636f, 0.1632550084f, 0.0947607361f, 0.0498597342f, 0.0235784487f, 0.0099227552f,
-      0.0036721567f, 0.0011772498f, 0.0003205785f, 0.0000721726f, 0.0000129124f, 0.0000017235f,
-      0.0000001528f, 0.0000000068f};
-
-  const std::size_t tail = m_queue_tail.load(std::memory_order_relaxed);
-  std::size_t next_tail = (tail + 1) % GRANULE_QUEUE_SIZE;
-
-  if (next_tail == m_queue_head.load(std::memory_order_acquire))
-  {
-    // Only fill gaps when running to prevent stutter on pause.
-    const bool is_running = Core::GetState(Core::System::GetInstance()) == Core::State::Running;
-    if (m_mixer->m_audio_fill_gaps && is_running)
-    {
-      next_tail = (tail + GRANULE_QUEUE_SIZE / 2) % GRANULE_QUEUE_SIZE;
-      m_queue_looping.store(true, std::memory_order_relaxed);
-    }
-    else
-    {
-      *granule = Granule();
-      return;
-    }
-  }
-
-  if (m_queue_looping.load(std::memory_order_relaxed))
-    m_queue_fade_index = std::min(m_queue_fade_index + 1, FADE_WINDOW.size() - 1);
-  else
-    m_queue_fade_index = 0;
-
-  *granule = m_queue[tail];
-  *granule *= StereoPair(FADE_WINDOW[m_queue_fade_index]);
-
-  m_queue_tail.store(next_tail, std::memory_order_release);
-}
-
-// Implementation of Granule's constructor
-constexpr Mixer::MixerFifo::Granule::Granule(const GranuleBuffer& input,
-                                             const std::size_t start_index)
+void Mixer::MixerFifo::Enqueue()
 {
  // import numpy as np
  // import scipy.signal as signal
  // window = np.convolve(np.ones(128), signal.windows.dpss(128 + 1, 4))
  // window /= (window[:len(window) // 2] + window[len(window) // 2:]).max()
  // elements = ", ".join([f"{x:.10f}f" for x in window])
-  // print(f'constexpr std::array<StereoPair, GRANULE_BUFFER_SIZE> GRANULE_WINDOW = {{ {elements}
+  // print(f'constexpr std::array<StereoPair, GRANULE_SIZE> GRANULE_WINDOW = {{ {elements}
  // }};')
-  constexpr std::array<float, GRANULE_BUFFER_SIZE> GRANULE_WINDOW = {
+  constexpr std::array<StereoPair, GRANULE_SIZE> GRANULE_WINDOW = {
      0.0000016272f, 0.0000050749f, 0.0000113187f, 0.0000216492f, 0.0000377350f, 0.0000616906f,
      0.0000961509f, 0.0001443499f, 0.0002102045f, 0.0002984010f, 0.0004144844f, 0.0005649486f,
      0.0007573262f, 0.0010002765f, 0.0013036694f, 0.0016786636f, 0.0021377783f, 0.0026949534f,
@ -494,45 +491,64 @@ constexpr Mixer::MixerFifo::Granule::Granule(const GranuleBuffer& input,
      0.0002984010f, 0.0002102045f, 0.0001443499f, 0.0000961509f, 0.0000616906f, 0.0000377350f,
      0.0000216492f, 0.0000113187f, 0.0000050749f, 0.0000016272f};

-  const auto input_middle = input.end() - start_index;
-  std::ranges::rotate_copy(input, input_middle, m_buffer.begin());
+  const std::size_t head = m_queue_head.load(std::memory_order_acquire);

-  for (std::size_t i = 0; i < m_buffer.size(); ++i)
-    m_buffer[i] *= StereoPair(GRANULE_WINDOW[i]);
+  // Check if we run out of space in the circular queue. (rare)
+  std::size_t next_head = (head + 1) & GRANULE_QUEUE_MASK;
+  if (next_head == m_queue_tail.load(std::memory_order_acquire))
+  {
+    WARN_LOG_FMT(AUDIO,
+                 "Granule Queue has completely filled and audio samples are being dropped. "
+                 "This should not happen unless the audio backend has stopped requesting audio.");
+    return;
+  }
+
+  // By preconstructing the granule window, we have the best chance of
+  // the compiler optimizing this loop using SIMD instructions.
+  const std::size_t start_index = m_next_buffer_index;
+  for (std::size_t i = 0; i < GRANULE_SIZE; ++i)
+    m_queue[head][i] = m_next_buffer[(i + start_index) & GRANULE_MASK] * GRANULE_WINDOW[i];
+
+  m_queue_head.store(next_head, std::memory_order_release);
+  m_queue_looping.store(false, std::memory_order_relaxed);
 }

-Mixer::MixerFifo::StereoPair Mixer::MixerFifo::Granule::InterpStereoPair(const Granule& prev,
-                                                                         const Granule& next,
-                                                                         const u32 frac)
+void Mixer::MixerFifo::Dequeue(Granule* granule)
 {
-  const std::size_t prev_index = frac >> Mixer::MixerFifo::GRANULE_BUFFER_FRAC_BITS;
-  const std::size_t next_index = prev_index - (GRANULE_BUFFER_SIZE / 2);
+  const std::size_t granule_queue_size = m_granule_queue_size.load(std::memory_order_relaxed);
+  const std::size_t head = m_queue_head.load(std::memory_order_acquire);
+  std::size_t tail = m_queue_tail.load(std::memory_order_acquire);

-  const u32 frac_t = frac & ((1 << GRANULE_BUFFER_FRAC_BITS) - 1);
-  const float t1 = frac_t / static_cast<float>(1 << GRANULE_BUFFER_FRAC_BITS);
-  const float t2 = t1 * t1;
-  const float t3 = t2 * t1;
+  // Checks to see if the queue has gotten too long.
+  if (granule_queue_size < ((head - tail) & GRANULE_QUEUE_MASK))
+  {
+    // Jump the playhead to half the queue size behind the head.
+    const std::size_t gap = (granule_queue_size >> 1) + 1;
+    tail = (head - gap) & GRANULE_QUEUE_MASK;
+  }

-  // The Granules are pre-windowed, so we can just add them together
-  StereoPair s0 = prev.m_buffer[(prev_index - 2) & GRANULE_BUFFER_MASK] +
-                  next.m_buffer[(next_index - 2) & GRANULE_BUFFER_MASK];
-  StereoPair s1 = prev.m_buffer[(prev_index - 1) & GRANULE_BUFFER_MASK] +
-                  next.m_buffer[(next_index - 1) & GRANULE_BUFFER_MASK];
-  StereoPair s2 = prev.m_buffer[(prev_index + 0) & GRANULE_BUFFER_MASK] +
-                  next.m_buffer[(next_index + 0) & GRANULE_BUFFER_MASK];
-  StereoPair s3 = prev.m_buffer[(prev_index + 1) & GRANULE_BUFFER_MASK] +
-                  next.m_buffer[(next_index + 1) & GRANULE_BUFFER_MASK];
-  StereoPair s4 = prev.m_buffer[(prev_index + 2) & GRANULE_BUFFER_MASK] +
-                  next.m_buffer[(next_index + 2) & GRANULE_BUFFER_MASK];
-  StereoPair s5 = prev.m_buffer[(prev_index + 3) & GRANULE_BUFFER_MASK] +
-                  next.m_buffer[(next_index + 3) & GRANULE_BUFFER_MASK];
+  // Checks to see if the queue is empty.
+  std::size_t next_tail = (tail + 1) & GRANULE_QUEUE_MASK;
+  if (next_tail == head)
+  {
+    // Only fill gaps when running to prevent stutter on pause.
+    const bool is_running = Core::GetState(Core::System::GetInstance()) == Core::State::Running;
+    if (m_mixer->m_config_fill_audio_gaps && is_running)
+    {
+      // Jump the playhead to half the queue size behind the head.
+      // This provides smoother audio playback than suddenly stopping.
+      const std::size_t gap = std::max<std::size_t>(2, granule_queue_size >> 1) - 1;
+      next_tail = (head - gap) & GRANULE_QUEUE_MASK;
+      m_queue_looping.store(true, std::memory_order_relaxed);
+    }
+    else
+    {
+      std::fill(granule->begin(), granule->end(), StereoPair{0.0f, 0.0f});
+      m_queue_looping.store(false, std::memory_order_relaxed);
+      return;
+    }
+  }

-  s0 *= StereoPair{(+0.0f + 1.0f * t1 - 2.0f * t2 + 1.0f * t3) / 12.0f};
-  s1 *= StereoPair{(+0.0f - 8.0f * t1 + 15.0f * t2 - 7.0f * t3) / 12.0f};
-  s2 *= StereoPair{(+3.0f + 0.0f * t1 - 7.0f * t2 + 4.0f * t3) / 3.0f};
-  s3 *= StereoPair{(+0.0f + 2.0f * t1 + 5.0f * t2 - 4.0f * t3) / 3.0f};
-  s4 *= StereoPair{(+0.0f - 1.0f * t1 - 6.0f * t2 + 7.0f * t3) / 12.0f};
-  s5 *= StereoPair{(+0.0f + 0.0f * t1 + 1.0f * t2 - 1.0f * t3) / 12.0f};
-
-  return s0 + s1 + s2 + s3 + s4 + s5;
+  *granule = m_queue[tail];
+  m_queue_tail.store(next_tail, std::memory_order_release);
 }