Merge pull request #10692 from Pokechu22/dsp-manual-set40-and-write-backlog

docs/DSP: Add sections on 16-bit and 40-bit modes and on main and extended opcode writing to the same register
2025-07-24 06:39:46 -06:00 · 2022-06-02 20:26:31 -04:00
parent c8ab236e0a f47dfc3dba
commit f7f47d3cd0
7 changed files with 567 additions and 102 deletions
--- a/Source/Core/Core/DSP/Interpreter/DSPIntArithmetic.cpp
+++ b/Source/Core/Core/DSP/Interpreter/DSPIntArithmetic.cpp
@ -45,7 +45,7 @@ void Interpreter::clrl(const UDSPInstruction opc)
 //----

 // ANDCF $acD.m, #I
-// 0000 001r 1100 0000
+// 0000 001d 1100 0000
 // iiii iiii iiii iiii
 // Set logic zero (LZ) flag in status register $sr if result of logic AND of
 // accumulator mid part $acD.m with immediate value I is equal to I.
@ -61,7 +61,7 @@ void Interpreter::andcf(const UDSPInstruction opc)
 }

 // ANDF $acD.m, #I
-// 0000 001r 1010 0000
+// 0000 001d 1010 0000
 // iiii iiii iiii iiii
 // Set logic zero (LZ) flag in status register $sr if result of logical AND
 // operation of accumulator mid part $acD.m with immediate value I is equal
@ -81,7 +81,7 @@ void Interpreter::andf(const UDSPInstruction opc)

 // TST
 // 1011 r001 xxxx xxxx
-// Test accumulator %acR.
+// Test accumulator $acR.
 //
 // flags out: --xx xx00
 void Interpreter::tst(const UDSPInstruction opc)
@ -143,11 +143,12 @@ void Interpreter::cmpaxh(const UDSPInstruction opc)
  ZeroWriteBackLog();
 }

-// CMPI $amD, #I
-// 0000 001r 1000 0000
+// CMPI $acD, #I
+// 0000 001d 1000 0000
 // iiii iiii iiii iiii
-// Compares mid accumulator $acD.hm ($amD) with sign extended immediate value I.
-// Although flags are being set regarding whole accumulator register.
+// Compares accumulator with immediate. Comparison is executed
+// by subtracting the immediate (16-bit sign extended) from mid accumulator
+// $acD.hm and computing flags based on whole accumulator $acD.
 //
 // flags out: x-xx xxxx
 void Interpreter::cmpi(const UDSPInstruction opc)
@ -166,8 +167,8 @@ void Interpreter::cmpi(const UDSPInstruction opc)

 // CMPIS $acD, #I
 // 0000 011d iiii iiii
-// Compares accumulator with short immediate. Comaprison is executed
-// by subtracting short immediate (8bit sign extended) from mid accumulator
+// Compares accumulator with short immediate. Comparison is executed
+// by subtracting the short immediate (8-bit sign extended) from mid accumulator
 // $acD.hm and computing flags based on whole accumulator $acD.
 //
 // flags out: x-xx xxxx
@ -320,7 +321,7 @@ void Interpreter::notc(const UDSPInstruction opc)
 }

 // XORI $acD.m, #I
-// 0000 001r 0010 0000
+// 0000 001d 0010 0000
 // iiii iiii iiii iiii
 // Logic exclusive or (XOR) of accumulator mid part $acD.m with
 // immediate value I.
@ -337,7 +338,7 @@ void Interpreter::xori(const UDSPInstruction opc)
 }

 // ANDI $acD.m, #I
-// 0000 001r 0100 0000
+// 0000 001d 0100 0000
 // iiii iiii iiii iiii
 // Logic AND of accumulator mid part $acD.m with immediate value I.
 //
@ -354,7 +355,7 @@ void Interpreter::andi(const UDSPInstruction opc)
 }

 // ORI $acD.m, #I
-// 0000 001r 0110 0000
+// 0000 001d 0110 0000
 // iiii iiii iiii iiii
 // Logic OR of accumulator mid part $acD.m with immediate value I.
 //
@ -489,8 +490,8 @@ void Interpreter::addaxl(const UDSPInstruction opc)
  UpdateSR64Add(acc, acx, GetLongAcc(dreg));
 }

-// ADDI $amR, #I
-// 0000 001r 0000 0000
+// ADDI $acD, #I
+// 0000 001d 0000 0000
 // iiii iiii iiii iiii
 // Adds immediate (16-bit sign extended) to mid accumulator $acD.hm.
 //
--- a/Source/Core/Core/DSP/Interpreter/DSPInterpreter.cpp
+++ b/Source/Core/Core/DSP/Interpreter/DSPInterpreter.cpp
@ -16,8 +16,23 @@

 namespace DSP::Interpreter
 {
-// Not needed for game ucodes (it slows down interpreter + easier to compare int VS
-// dspjit64 without it)
+// Correctly handle instructions such as `INC'L $ac0 : $ac0.l, @$ar0` (encoded as 0x7660) where both
+// the main opcode and the extension opcode modify the same register. See the "Extended opcodes"
+// section in the manual for more details.  No official uCode writes to the same register twice like
+// this, so we don't emulate it by default (and also don't support it in the recompiler).
+//
+// Dolphin only supports this behavior in the interpreter when PRECISE_BACKLOG is defined.
+// In ExecuteInstruction, if an extended opcode is in use, the extended opcode's behavior is
+// executed first, followed by the main opcode's behavior. The extended opcode does not directly
+// write to registers, but instead records the writes into a backlog (WriteToBackLog). The main
+// opcode calls ZeroWriteBackLog after it is done reading the register values; this directly
+// writes zero to all registers that have pending writes in the backlog. The main opcode then is
+// free to write directly to registers it changes. Afterwards, ApplyWriteBackLog bitwise-ors the
+// value of the register and the value in the backlog; if the main opcode didn't write to the
+// register then ZeroWriteBackLog means that the pending value is being or'd with zero, so it's
+// used without changes. When PRECISE_BACKLOG is not defined, ZeroWriteBackLog does nothing and
+// ApplyWriteBackLog overwrites the register value with the value from the backlog (so writes from
+// extended opcodes "win" over the main opcode).
 //#define PRECISE_BACKLOG

 Interpreter::Interpreter(DSPCore& dsp) : m_dsp_core{dsp}
@ -809,7 +824,7 @@ void Interpreter::ConditionalExtendAccum(int reg)
 void Interpreter::ApplyWriteBackLog()
 {
  // Always make sure to have an extra entry at the end w/ -1 to avoid
-  // infinitive loops
+  // infinite loops
  for (int i = 0; m_write_back_log_idx[i] != -1; i++)
  {
    u16 value = m_write_back_log[i];
@ -823,6 +838,11 @@ void Interpreter::ApplyWriteBackLog()
  }
 }

+// The ext ops are calculated in parallel with the actual op. That means that
+// both the main op and the ext op see the same register state as input. The
+// output is simple as long as the main and ext ops don't change the same
+// register. If they do, the output is the bitwise OR of the results of both the
+// main and ext ops.
 void Interpreter::WriteToBackLog(int i, int idx, u16 value)
 {
  m_write_back_log[i] = value;
@ -840,7 +860,7 @@ void Interpreter::ZeroWriteBackLog()
 {
 #ifdef PRECISE_BACKLOG
  // always make sure to have an extra entry at the end w/ -1 to avoid
-  // infinitive loops
+  // infinite loops
  for (int i = 0; m_write_back_log_idx[i] != -1; i++)
  {
    OpWriteRegister(m_write_back_log_idx[i], 0);
--- a/Source/Core/Core/DSP/Interpreter/DSPInterpreter.h
+++ b/Source/Core/Core/DSP/Interpreter/DSPInterpreter.h
@ -235,11 +235,6 @@ private:

  void ConditionalExtendAccum(int reg);

-  // The ext ops are calculated in parallel with the actual op. That means that
-  // both the main op and the ext op see the same register state as input. The
-  // output is simple as long as the main and ext ops don't change the same
-  // register. If they do the output is the bitwise OR of the result of both the
-  // main and ext ops.
  void WriteToBackLog(int i, int idx, u16 value);
  void ZeroWriteBackLog();
  void ZeroWriteBackLogPreserveAcc(u8 acc);
--- a/Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp
+++ b/Source/Core/Core/DSP/Jit/x64/DSPJitArithmetic.cpp
@ -53,7 +53,7 @@ void DSPEmitter::clrl(const UDSPInstruction opc)
 //----

 // ANDCF $acD.m, #I
-// 0000 001r 1100 0000
+// 0000 001d 1100 0000
 // iiii iiii iiii iiii
 // Set logic zero (LZ) flag in status register $sr if result of logic AND of
 // accumulator mid part $acD.m with immediate value I is equal to I.
@ -88,7 +88,7 @@ void DSPEmitter::andcf(const UDSPInstruction opc)
 }

 // ANDF $acD.m, #I
-// 0000 001r 1010 0000
+// 0000 001d 1010 0000
 // iiii iiii iiii iiii
 // Set logic zero (LZ) flag in status register $sr if result of logical AND
 // operation of accumulator mid part $acD.m with immediate value I is equal
@ -126,7 +126,7 @@ void DSPEmitter::andf(const UDSPInstruction opc)

 // TST
 // 1011 r001 xxxx xxxx
-// Test accumulator %acR.
+// Test accumulator $acR.
 //
 // flags out: --xx xx00
 void DSPEmitter::tst(const UDSPInstruction opc)
@ -220,11 +220,12 @@ void DSPEmitter::cmpaxh(const UDSPInstruction opc)
  }
 }

-// CMPI $amD, #I
-// 0000 001r 1000 0000
+// CMPI $acD, #I
+// 0000 001d 1000 0000
 // iiii iiii iiii iiii
-// Compares mid accumulator $acD.hm ($amD) with sign extended immediate value I.
-// Although flags are being set regarding whole accumulator register.
+// Compares accumulator with immediate. Comparison is executed
+// by subtracting the immediate (16-bit sign extended) from mid accumulator
+// $acD.hm and computing flags based on whole accumulator $acD.
 //
 // flags out: x-xx xxxx
 void DSPEmitter::cmpi(const UDSPInstruction opc)
@ -257,7 +258,7 @@ void DSPEmitter::cmpi(const UDSPInstruction opc)
 // CMPIS $acD, #I
 // 0000 011d iiii iiii
 // Compares accumulator with short immediate. Comparison is executed
-// by subtracting short immediate (8bit sign extended) from mid accumulator
+// by subtracting the short immediate (8-bit sign extended) from mid accumulator
 // $acD.hm and computing flags based on whole accumulator $acD.
 //
 // flags out: x-xx xxxx
@ -472,7 +473,7 @@ void DSPEmitter::notc(const UDSPInstruction opc)
 }

 // XORI $acD.m, #I
-// 0000 001r 0010 0000
+// 0000 001d 0010 0000
 // iiii iiii iiii iiii
 // Logic exclusive or (XOR) of accumulator mid part $acD.m with
 // immediate value I.
@ -498,7 +499,7 @@ void DSPEmitter::xori(const UDSPInstruction opc)
 }

 // ANDI $acD.m, #I
-// 0000 001r 0100 0000
+// 0000 001d 0100 0000
 // iiii iiii iiii iiii
 // Logic AND of accumulator mid part $acD.m with immediate value I.
 //
@ -523,7 +524,7 @@ void DSPEmitter::andi(const UDSPInstruction opc)
 }

 // ORI $acD.m, #I
-// 0000 001r 0110 0000
+// 0000 001d 0110 0000
 // iiii iiii iiii iiii
 // Logic OR of accumulator mid part $acD.m with immediate value I.
 //
@ -706,8 +707,8 @@ void DSPEmitter::addaxl(const UDSPInstruction opc)
  }
 }

-// ADDI $amR, #I
-// 0000 001r 0000 0000
+// ADDI $acD, #I
+// 0000 001d 0000 0000
 // iiii iiii iiii iiii
 // Adds immediate (16-bit sign extended) to mid accumulator $acD.hm.
 //
--- a/Source/DSPSpy/tests/40bit_ins_test.ds
+++ b/Source/DSPSpy/tests/40bit_ins_test.ds
@ -0,0 +1,202 @@
+; This test covers the behavior of 40-bit mode with various instructions.
+incdir  "tests"
+include "dsp_base.inc"
+
+positive_value: EQU #0x1234
+negative_value: EQU #0x9876
+
+negative_imem_value_addr:
+CW negative_value
+
+; DSPSpy doesn't pre-populate DMEM currently, so instead use these addresses to store values.
+positive_dmem_value_addr: EQU #0x100
+negative_dmem_value_addr: EQU #0x101
+readback_dmem_addr: EQU #0x102
+
+test_main:
+	LRI $ar0, #positive_dmem_value_addr
+	LRI $ar1, #negative_dmem_value_addr
+	LRI $ar2, #negative_imem_value_addr
+	LRI $ar3, #readback_dmem_addr
+	LRI $ix0, #0
+	LRI $ix1, #0
+	LRI $ix2, #0
+	LRI $ix3, #0
+
+	LRI $ax0.h, #positive_value
+	LRI $ax1.h, #negative_value
+
+	SR @positive_dmem_value_addr, $ax0.h
+	SR @negative_dmem_value_addr, $ax1.h
+
+	LRI $cr, #(positive_dmem_value_addr / 256)
+
+	SET40
+	; Instructions that perform sign-extension
+	; $acc0 should alternate between being positive and negative here
+	; (though none of these instructions update $sr)
+
+	; [1] ILRR (also ILRRD/ILRRI/ILRRN, not covered)
+	ILRR $ac0.m, @$ar2 ; -
+	CALL send_back
+	; [2] LR
+	LR $ac0.m, @positive_dmem_value_addr ; +
+	CALL send_back
+	; [3] LRI
+	LRI $ac0.m, #negative_value ; -
+	CALL send_back
+	; [4] LRIS
+	LRIS $ac0.m, #42 ; +
+	CALL send_back
+	; [5] LRR (also LRRD/LRRI/LRRN)
+	LRR $ac0.m, @$ar1 ; -
+	CALL send_back
+	; [6] LRS
+	LRS $ac0.m, @(positive_dmem_value_addr & 0xff) ; +
+	CALL send_back
+	; [7] MRR
+	MRR $ac0.m, $ax1.h ; -
+	CALL send_back
+	; [8] 'LN (and 'L, but 'LN lets us set $ix0 to not increment $ar0)
+	NX'LN : $ac0.m, @$ar0 ; +
+	CALL send_back
+
+	; Instructions that experience saturation
+	; $ax1.l should alternate between 0x8000 and 0x7fff.
+	LRI $ac0.m, #0x4231
+	LRI $ac0.h, #0x12 ; positive
+	LRI $ac1.m, #0x2816
+	LRI $ac1.h, #0x99 ; negative
+	; [9] MRR (again)
+	MRR $ax1.l, $ac1.m ; -
+	CALL send_back
+	; [10] SR
+	SR @readback_dmem_addr, $ac0.m
+	LR $ax1.l, @readback_dmem_addr ; +
+	CALL send_back
+	; [11] SRRN (also SRR/SRRD/SRRI)
+	SRRN @$ar3, $ac1.m
+	LR $ax1.l, @readback_dmem_addr ; -
+	CALL send_back
+	; [12] SRS
+	SRS @(readback_dmem_addr & 0xff), $ac0.m
+	LR $ax1.l, @readback_dmem_addr ; +
+	CALL send_back
+	; [13] 'LSNM (also 'LS/'LSM/'LSN) - the $ax0.l read is not relevant
+	NX'LSNM : $ax0.l, $ac1.m
+	LR $ax1.l, @readback_dmem_addr ; -
+	CALL send_back
+	; [14] 'MV
+	NX'MV : $ax1.l, $ac0.m ; +
+	CALL send_back
+	; [15] 'SLNM (also 'SL/'SLM/'SLN) - the $ax0.l read is not relevant
+	; Note that 'SL stores to @$ar0, while 'LS stores to @$ar3
+	LRI $ar0, #readback_dmem_addr
+	NX'SLNM : $ac1.m, $ax0.l
+	LR $ax1.l, @readback_dmem_addr ; -
+	CALL send_back
+	LRI $ar0, #positive_dmem_value_addr
+	; [16] 'SN (also 'S)
+	NX'SN : @$ar3, $ac0.m
+	LR $ax1.l, @readback_dmem_addr ; +
+	CALL send_back
+
+	; Instructions that are not affected
+	; [17] ADDI
+	ADDI $ac0.m, #8
+	CALL send_back
+	; [18] ADDIS
+	ADDIS $ac0.m, #-8
+	CALL send_back
+	; [19] ANDC
+	ANDC $ac1.m, $ac0.m
+	CALL send_back
+	; [20] ANDI
+	ANDI $ac0.m, #0x6666
+	CALL send_back
+	; [21] ANDR
+	ANDR $ac0.m, $ax0.h
+	CALL send_back
+	; [22] ORC
+	ORC $ac0.m, $ac1.m
+	CALL send_back
+	; [23] ORI
+	ORI $ac0.m, #0xfeed
+	CALL send_back
+	; [24] ORR
+	ORR $ac1.m, $ax0.h
+	CALL send_back
+	; [25] NOT
+	NOT $ac1.m
+	CALL send_back
+	; [26] XORC
+	XORC $ac0.m, $ac1.m
+	CALL send_back
+	; [27] XORI
+	XORI $ac0.m, #0x5555
+	CALL send_back
+	; [28] XORR
+	XORR $ac1.m, $ax1.h
+	CALL send_back
+
+	; [29] MOVR always sign extends...
+	MOVR $acc1, $ax0.h
+	CALL send_back
+	; [30] ... even in SET16 mode
+	SET16
+	MOVR $acc1, $ax1.h
+	CALL send_back
+	SET40
+
+	; Shift instructions - do these see saturated $ac1.m?
+	LRI $ac0.m, #positive_value
+	LRI $ac1.m, #2
+	LRI $ac1.h, #1
+	; [31] - for diffs only
+	CALL send_back
+	; [32]
+	LSRNR $acc0, $ac1.m
+	CALL send_back
+	; [33] Shifts $acc0 by $ac1.m (in the other direction)
+	LSRN
+	CALL send_back
+
+	; Does LOOP experience saturation?
+	CLR $acc0
+	LRI $ac1.m, #0x1234
+	LRI $ac1.h, #1
+	; [34] - for diffs only
+	CALL send_back
+	; [35] LOOP
+	LOOP $ac1.m
+	INC $acc0
+	CALL send_back
+	LRI $ac1.h, #0x99
+	; [36] BLOOP
+	BLOOP $ac1.m, bloop_last_ins
+	INCM $ac0.m
+bloop_last_ins:
+	NOP
+	CALL send_back
+
+	; For the sake of clarity, the same LOOP/BLOOP calls in SET16 mode don't have saturation:
+	SET16
+	CLR $acc0
+	LRI $ac1.m, #0x1234
+	LRI $ac1.h, #1
+	; [37] - for diffs only
+	CALL send_back
+	; [38] LOOP
+	LOOP $ac1.m
+	INC $acc0
+	CALL send_back
+	LRI $ac1.h, #0x99
+	; [39] BLOOP
+	BLOOP $ac1.m, bloop2_last_ins
+	INCM $ac0.m
+bloop2_last_ins:
+	NOP
+	CALL send_back
+
+	; We're done, DO NOT DELETE THIS LINE
+	JMP end_of_test
--- a/Source/DSPSpy/tests/40bit_test.ds
+++ b/Source/DSPSpy/tests/40bit_test.ds
@ -0,0 +1,164 @@
+; This test covers the behavior of 40-bit mode for a variety of values.
+; It takes a while to run completely (~5 minutes), but progress is indicated via mail shown at the
+; top of the screen in DSPSpy.  The value will go from 80000000 to 8041ffff.
+incdir  "tests"
+include "dsp_base.inc"
+
+
+
+test_main:
+	LRI $ar0, #0
+	LRI $ar1, #0
+	LRI $ar2, #0
+	LRI $ar3, #0
+	LRI $ix0, #0
+	LRI $ix1, #0
+	LRI $ix2, #0
+	LRI $ix3, #0
+
+	; Test with $ac0.l from 0xfff0 to 0x0010 (0x21 iterations, wrapping through 0)
+	LRI $ac0.l, #0xfff0
+BLOOPI #0x21, first_loop_last_ins
+	CALL test_saturation
+	IAR $ar0
+first_loop_last_ins:
+	INC $acc0
+
+	; Test with $ac0.l from 0x7ff0 to 0x8010 (0x21 iterations, crossing the low word's sign bit)
+	LRI $ac0.l, #0x7ff0
+BLOOPI #0x21, second_loop_last_ins
+	CALL test_saturation
+	IAR $ar0
+second_loop_last_ins:
+	INC $acc0
+
+	; We're done.  Report the test results.
+	; $ix1 should be 0, or else saturation occurred on $ac0.l or $ac0.h.
+	; $ix2 should be 0, or else sign-extension occurred on $ac0.l or $ac0.h.
+	; $ix3 should be 0, or else we incorrectly predicted saturation on $ac0.m.
+	; $ar1/$ar2/$ar3 record the number of times each of these happened.
+	CALL send_back
+
+	; We're done, DO NOT DELETE THIS LINE
+	JMP end_of_test
+
+
+
+test_saturation:
+	; We start with $ac0.h at -0x80 since we can use the overflow flag to check when wrapping around
+	; occurs; starting at 0 and ending when it wraps back to 0 doesn't work since we can't check the
+	; zero flag since $ac0.l may be nonzero ($ac0.l is used as an input to this subroutine)
+	LRI $ac0.m, #0
+	LRI $ac0.h, #-0x80
+
+loop_start:
+	; Compare the value of $ac0.m when in SET16 mode and in SET40 mode
+	SET40
+	; Reading $ac0.m in SET40 mode results in saturation if $ac0.h doesn't match the sign-extension
+	; of $ac0.m. Also, storing to $ac1.m in SET40 mode clears $ac1.l and sets $ac1.h to the
+	; sign-extension of $ac1.m.
+	MRR $ac1.m, $ac0.m
+	SET16
+	; Attempt to compute the saturated value of $ac1.m in $ax1.h,
+	; using what we know of $acc0.
+	TST'MV $acc0 : $ax1.h, $ac0.m
+	JL negative_acc0
+	; $acc0 is nonnegative.
+	JMPx8 check_saturated_ax1h ; If the above s32 bit is not set, we don't need to saturate
+	; If the above s32 bit _is_ set, then saturate $ax1.h.
+	LRI $ax1.h, #0x7fff
+	JMP check_saturated_ax1h
+
+negative_acc0:
+	JMPx8 check_saturated_ax1h ; If the above s32 bit is not set, we don't need to saturate
+	LRI $ax1.h, #0x8000
+	; Fall through to check_saturated_ax1h
+
+check_saturated_ax1h:
+	; $acc1 has the value of $ac0.m in SET40 mode.
+	; And, $ax1.h has what we computed that value should be, and CMPAXH always sign-extends $ax1.h
+	; (and ignores $ax1.l), so we can compare using it directly.
+	CMPAXH $acc1, $ax1.h
+	JZ check_read_low
+	; Our prediction was wrong (shouldn't happen)
+	LRI $ix3, #1
+	IAR $ar3
+	TST $acc0
+	CALL send_back
+	; Fall through to check_read_low
+
+check_read_low:
+	SET40
+	MRR $ac1.m, $ac0.l
+	SET16
+	MRR $ax1.h, $ac0.l
+	CMPAXH $acc1, $ax1.h
+	JZ check_read_high
+	; Reading $ac0.l gave different results in SET40 and SET16 modes (shouldn't happen)
+	LRI $ix1, #1
+	IAR $ar1
+	TST $acc0
+	CALL send_back
+	; Fall through to check_read_high
+
+check_read_high:
+	SET40
+	MRR $ac1.m, $ac0.h
+	SET16
+	MRR $ax1.h, $ac0.h
+	CMPAXH $acc1, $ax1.h
+	JZ check_write_low
+	; Reading $ac0.h gave different results in SET40 and SET16 modes (shouldn't happen)
+	LRI $ix1, #1
+	IAR $ar1
+	TST $acc0
+	CALL send_back
+	; Fall through to check_write_low
+
+check_write_low:
+	MOV $acc1, $acc0
+	SET40
+	MRR $ac1.l, $ac0.l
+	SET16
+	CMP
+	JZ check_write_high
+	; Writing to $ac1.l caused $acc1 to not match $acc0 (shouldn't happen)
+	LRI $ix2, #1
+	IAR $ar2
+	CALL send_back
+	; Fall through to check_write_high
+
+check_write_high:
+	MOV $acc1, $acc0
+	SET40
+	MRR $ac1.h, $ac0.h
+	SET16
+	CMP
+	JZ increment_loop
+	; Writing to $ac1.h caused $acc1 to not match $acc0 (shouldn't happen)
+	LRI $ix2, #1
+	IAR $ar2
+	CALL send_back
+	; Fall through to increment_loop
+
+increment_loop:
+	INCM $ac0.m
+	; If incrementing results in overflowing, then we're done.
+	RETO
+
+	; If ($ac0.m & 0x00ff) != 0, continue the loop without sending mail.
+	ANDF $ac0.m, #0x00ff
+	JLNZ loop_start
+	; Otherwise, send mail to report the progress. (This shows at the top of the screen in DSPSpy,
+	; but otherwise isn't handled in any meaningful way.)
+	MOV $acc1, $acc0
+	LSR $acc1, #-8
+	; Compensate for starting at INT_MIN (0x80'0000'0000) and ending at INT_MAX (0x7f'0000'0000)
+	; instead of going from 0 (0x00'0000'0000) to -1 (0xff'ffff'ffff)
+	XORI $ac1.m, #0x8000
+
+	SR @DMBH, $ar0
+	SR @DMBL, $ac1.m
+	SI @DIRQ, #0x0001
+	; We don't wait for the mail to be read, because we don't care about the response.
+	JMP loop_start