mirror of
https://github.com/melonDS-emu/melonDS.git
synced 2025-08-03 11:38:59 -06:00
more base work for DSP HLE.
add scaling commands.
This commit is contained in:
@ -16,6 +16,8 @@
|
||||
with melonDS. If not, see http://www.gnu.org/licenses/.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "../DSi.h"
|
||||
#include "GraphicsUcode.h"
|
||||
#include "../Platform.h"
|
||||
@ -32,6 +34,8 @@ namespace DSP_HLE
|
||||
|
||||
// Constructs the HLE implementation of the graphics ucode.
// dsi is forwarded to UcodeBase; version is only used for the log message here.
GraphicsUcode::GraphicsUcode(melonDS::DSi& dsi, int version) : UcodeBase(dsi)
|
||||
{
|
||||
// FinishCmd() becomes the handler for Event_DSi_DSPHLE, the event that
// TryStartCmd() schedules once a command has all of its parameters
DSi.RegisterEventFuncs(Event_DSi_DSPHLE, this, {MakeEventThunk(GraphicsUcode, FinishCmd)});
|
||||
|
||||
Log(LogLevel::Info, "DSP_HLE: initializing Graphics SDK ucode version %d\n", version);
|
||||
}
|
||||
|
||||
@ -43,6 +47,10 @@ GraphicsUcode::~GraphicsUcode()
|
||||
void GraphicsUcode::Reset()
|
||||
{
|
||||
UcodeBase::Reset();
|
||||
|
||||
CmdState = 0;
|
||||
CmdIndex = 0;
|
||||
memset(CmdParams, 0, sizeof(CmdParams));
|
||||
}
|
||||
|
||||
void GraphicsUcode::DoSavestate(Savestate *file)
|
||||
@ -55,179 +63,517 @@ void GraphicsUcode::SendData(u8 index, u16 val)
|
||||
{
|
||||
UcodeBase::SendData(index, val);
|
||||
|
||||
// CMD0 is used to send graphics commands
|
||||
// pipe 7 is used to send the command parameters
|
||||
// when the pipe is written, we get notified via CMD2
|
||||
|
||||
if (index == 0)
|
||||
{
|
||||
if (UcodeCmd)
|
||||
{
|
||||
printf("???? there is already a command pending\n");
|
||||
return;
|
||||
}
|
||||
|
||||
// writing to CMD0 initiates a ucode-specific command
|
||||
// parameters are then written to pipe 7
|
||||
UcodeCmd = val;
|
||||
CmdWritten[index] = false;
|
||||
|
||||
RunUcodeCmd();
|
||||
TryStartCmd();
|
||||
}
|
||||
else if (index == 2)
|
||||
{
|
||||
// CMD2 serves to notify that a pipe was written to
|
||||
// value is the pipe index
|
||||
if (val == 7)
|
||||
TryStartCmd();
|
||||
|
||||
CmdWritten[index] = false;
|
||||
|
||||
if (UcodeCmd)
|
||||
RunUcodeCmd();
|
||||
CmdWritten[2] = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void GraphicsUcode::RunUcodeCmd()
|
||||
void GraphicsUcode::TryStartCmd()
|
||||
{
|
||||
if (CmdState == 0)
|
||||
{
|
||||
if (!CmdWritten[0])
|
||||
return;
|
||||
|
||||
CmdState = 1;
|
||||
CmdIndex = CmdReg[0];
|
||||
CmdWritten[0] = false;
|
||||
}
|
||||
|
||||
if (CmdState != 1)
|
||||
return;
|
||||
|
||||
// try to start executing this command
|
||||
// we can run as soon as we have received all the parameters in pipe 7
|
||||
// the command time is a gross estimation of the time it would take on hardware
|
||||
// the point is mostly to convey the idea that these operations aren't free
|
||||
// bicubic scaling is infact quite slow!
|
||||
|
||||
u16* pipe = LoadPipe(7);
|
||||
u32 len = GetPipeLength(pipe);
|
||||
printf("try to run ucode cmd: cmd=%d, len=%d\n", UcodeCmd, len);
|
||||
switch (UcodeCmd)
|
||||
u32 pipelen = GetPipeLength(pipe);
|
||||
|
||||
u32 cmdtime;
|
||||
|
||||
switch (CmdIndex)
|
||||
{
|
||||
case 1: // scaling
|
||||
if (len < 14) return;
|
||||
UcodeCmd_Scaling(pipe);
|
||||
break;
|
||||
{
|
||||
if (pipelen < 14) return;
|
||||
ReadPipe(pipe, CmdParams, 14);
|
||||
|
||||
u32 srcwidth = CmdParams[11];
|
||||
u32 srcheight = CmdParams[12];
|
||||
|
||||
if (CmdParams[4] == 10)
|
||||
{
|
||||
// 1/3 scaling
|
||||
|
||||
// fails if source width/height aren't multiple of 3
|
||||
if ((srcwidth % 3) || (srcheight % 3))
|
||||
{
|
||||
SendReply(1, 0);
|
||||
CmdState = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
//UcodeCmd = 0;
|
||||
}
|
||||
|
||||
void GraphicsUcode::OnUcodeCmdFinish(u32 param)
|
||||
{
|
||||
printf("finish cmd %d, param=%d, %d/%d\n", UcodeCmd, param, CmdWritten[2], ReplyWritten[2]);
|
||||
UcodeCmd = 0;
|
||||
SendReply(1, (u16)param);
|
||||
}
|
||||
|
||||
void GraphicsUcode::UcodeCmd_Scaling(u16* pipe)
|
||||
{
|
||||
u16 params[14];
|
||||
ReadPipe(pipe, params, 14);
|
||||
|
||||
u32 src_addr = (params[1] << 16) | params[0];
|
||||
u32 dst_addr = (params[3] << 16) | params[2];
|
||||
u16 filter = params[4];
|
||||
u16 src_width = params[5];
|
||||
u16 src_height = params[6];
|
||||
u16 width_scale = params[7];
|
||||
u16 height_scale = params[8];
|
||||
u16 rect_xoffset = params[9];
|
||||
u16 rect_yoffset = params[10];
|
||||
u16 rect_width = params[11];
|
||||
u16 rect_height = params[12];
|
||||
|
||||
u32 dst_width = (src_width * width_scale) / 1000;
|
||||
u32 dst_height = (src_height * height_scale) / 1000;
|
||||
|
||||
// TODO those are slightly different for bicubic
|
||||
u32 x_factor = ((rect_width - 2) << 10) / (dst_width - 1);
|
||||
u32 y_factor = ((rect_height - 2) << 10) / (dst_height - 1);
|
||||
|
||||
// bound check
|
||||
// CHECKME
|
||||
//if (dst_width > rect_width) dst_width = rect_width;
|
||||
//if (dst_height > rect_height) dst_height = rect_height;
|
||||
// at 1700 it starts going out of bounds
|
||||
|
||||
src_addr += (((rect_yoffset * src_width) + rect_xoffset) << 1);
|
||||
|
||||
if (filter == 2)
|
||||
{
|
||||
// bilinear
|
||||
|
||||
for (u32 y = 0; y < dst_height; y++)
|
||||
{
|
||||
u32 sy = (y * y_factor) + 0x200;// + 0x3FF;
|
||||
u32 syf = sy & 0x3FF;
|
||||
u32 src_line1 = src_addr + (((sy >> 10) * src_width) << 1);
|
||||
u32 src_line2 = src_line1 + (src_width << 1);
|
||||
|
||||
for (u32 x = 0; x < dst_width; x++)
|
||||
{
|
||||
u32 sx = (x * x_factor) + 0x200;// + 0x3FF;
|
||||
u32 sxf = sx & 0x3FF;
|
||||
|
||||
// TODO caching? see what the ucode does
|
||||
// ucode loads enough lines to fill 32K buffer (16K dsp words)
|
||||
// keeps last scanline from previous buffer
|
||||
// uses 32bit DMA
|
||||
// also starting pos is 0x200 (0.5), 0x600 for bicubic
|
||||
u16 v[4];
|
||||
v[0] = DSi.ARM9Read16(src_line1 + ((sx >> 10) << 1));
|
||||
v[1] = DSi.ARM9Read16(src_line1 + ((sx >> 10) << 1) + 2);
|
||||
v[2] = DSi.ARM9Read16(src_line2 + ((sx >> 10) << 1));
|
||||
v[3] = DSi.ARM9Read16(src_line2 + ((sx >> 10) << 1) + 2);
|
||||
|
||||
u16 r[4], g[4], b[4];
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
r[i] = v[i] & 0x1F;
|
||||
g[i] = (v[i] >> 5) & 0x1F;
|
||||
b[i] = (v[i] >> 10) & 0x1F;
|
||||
}
|
||||
|
||||
u32 f_r, f_g, f_b;
|
||||
u32 t1, t2;
|
||||
|
||||
t1 = (r[0] * (0x400-sxf)) + (r[1] * sxf);
|
||||
t2 = (r[2] * (0x400-sxf)) + (r[3] * sxf);
|
||||
f_r = (t1 * (0x400-syf)) + (t2 * syf);
|
||||
f_r = (f_r >> 20) & 0x1F;
|
||||
|
||||
t1 = (g[0] * (0x400-sxf)) + (g[1] * sxf);
|
||||
t2 = (g[2] * (0x400-sxf)) + (g[3] * sxf);
|
||||
f_g = (t1 * (0x400-syf)) + (t2 * syf);
|
||||
f_g = (f_g >> 15) & 0x3E0;
|
||||
|
||||
t1 = (b[0] * (0x400-sxf)) + (b[1] * sxf);
|
||||
t2 = (b[2] * (0x400-sxf)) + (b[3] * sxf);
|
||||
f_b = (t1 * (0x400-syf)) + (t2 * syf);
|
||||
f_b = (f_b >> 10) & 0x7C00;
|
||||
|
||||
DSi.ARM9Write16(dst_addr, f_r | f_g | f_b | 0x8000);
|
||||
|
||||
dst_addr += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (filter == 3)
|
||||
{
|
||||
// bicubic
|
||||
cmdtime = 30 * srcwidth * srcheight;
|
||||
}
|
||||
else
|
||||
{
|
||||
// nearest neighbor
|
||||
// nearest/bilinear/bicubic scaling
|
||||
|
||||
for (u32 y = 0; y < dst_height; y++)
|
||||
u32 dstwidth = (srcwidth * CmdParams[7]) / 1000;
|
||||
u32 dstheight = (srcheight * CmdParams[8]) / 1000;
|
||||
|
||||
cmdtime = 4 * srcwidth * srcheight;
|
||||
|
||||
switch (CmdParams[4]) // filtering
|
||||
{
|
||||
u32 sy = ((y * y_factor) + 0x3FF) >> 10;
|
||||
u32 src_line = src_addr + ((sy * src_width) << 1);
|
||||
case 2: cmdtime += (58 * dstwidth * dstheight); break;
|
||||
case 3: cmdtime += (605 * dstwidth * dstheight); break;
|
||||
default: cmdtime += (26 * dstwidth * dstheight); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
for (u32 x = 0; x < dst_width; x++)
|
||||
case 2: // yuv2rgb
|
||||
{
|
||||
u32 sx = ((x * x_factor) + 0x3FF) >> 10;
|
||||
if (pipelen < 6) return;
|
||||
ReadPipe(pipe, CmdParams, 6);
|
||||
|
||||
u16 v = DSi.ARM9Read16(src_line + (sx << 1));
|
||||
DSi.ARM9Write16(dst_addr, v);
|
||||
u32 len = (CmdParams[1] << 16) | CmdParams[0];
|
||||
cmdtime = 24 * (len >> 1);
|
||||
}
|
||||
break;
|
||||
|
||||
dst_addr += 2;
|
||||
}
|
||||
}
|
||||
default: // unknown
|
||||
SendReply(1, 0);
|
||||
CmdState = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO the rest of the shit!!
|
||||
CmdState = 2;
|
||||
DSi.ScheduleEvent(Event_DSi_DSPHLE, false, 200 + cmdtime, 0, 0);
|
||||
}
|
||||
|
||||
// TODO add a delay to this
|
||||
// TODO make the delay realistic
|
||||
//SendReply(1, 1);
|
||||
DSi.ScheduleEvent(Event_DSi_DSPHLE, false, 600000, 0, 1);
|
||||
// Event handler for Event_DSi_DSPHLE: performs the work of the command that
// TryStartCmd() accepted, reports completion and looks for a follow-up command.
//
// param is the event parameter; it is not used.
void GraphicsUcode::FinishCmd(u32 param)
{
    // state 2 is set by TryStartCmd() when the command was scheduled;
    // in any other state there is nothing to finish
    if (CmdState != 2)
        return;

    if (CmdIndex == 1)
    {
        // scaling: parameter 4 selects the filter
        const u16 filter = CmdParams[4];

        if (filter == 2)
            CmdScalingBilinear();
        else if (filter == 3)
            CmdScalingBicubic();
        else if (filter == 10)
            CmdScalingOneThird();
        else
            CmdScalingNearest();
    }
    // command 2 (yuv2rgb) has no implementation hooked up yet
    //else if (CmdIndex == 2)
    //    CmdYuvToRgb();

    // signal success, go back to idle, then see whether another command is waiting
    SendReply(1, 1);
    CmdState = 0;
    TryStartCmd();
}
|
||||
|
||||
|
||||
void GraphicsUcode::CmdScalingNearest()
|
||||
{
|
||||
u32 src_addr = (CmdParams[1] << 16) | CmdParams[0];
|
||||
u32 dst_addr = (CmdParams[3] << 16) | CmdParams[2];
|
||||
u16 src_width = CmdParams[5];
|
||||
u16 src_height = CmdParams[6];
|
||||
u16 width_scale = CmdParams[7];
|
||||
u16 height_scale = CmdParams[8];
|
||||
u16 rect_xoffset = CmdParams[9];
|
||||
u16 rect_yoffset = CmdParams[10];
|
||||
u16 rect_width = CmdParams[11];
|
||||
u16 rect_height = CmdParams[12];
|
||||
|
||||
u32 dst_width = (rect_width * width_scale) / 1000;
|
||||
u32 dst_height = (rect_height * height_scale) / 1000;
|
||||
|
||||
// sanity checks
|
||||
if ((dst_width == 0) ||
|
||||
(dst_height == 0) ||
|
||||
(rect_width > 16384) ||
|
||||
(dst_width > 16384))
|
||||
{
|
||||
printf("DSP_HLE: incorrect parameters for nearest scaling\n");
|
||||
return;
|
||||
}
|
||||
|
||||
u32 sx_incr = ((rect_width - 2) << 10) / (dst_width - 1);
|
||||
u32 sy_incr = ((rect_height - 2) << 10) / (dst_height - 1);
|
||||
u32 sx, sy;
|
||||
|
||||
src_addr += (((rect_yoffset * src_width) + rect_xoffset) << 1);
|
||||
sy = 0x3FF;
|
||||
|
||||
u16* src_mem = (u16*)DSi.NWRAMMap_C[2][1];
|
||||
u16* dst_mem = (u16*)DSi.NWRAMMap_C[2][3];
|
||||
|
||||
// load first line
|
||||
ReadARM9Mem(src_mem, src_addr, rect_width << 1);
|
||||
|
||||
for (u32 dy = 0; dy < dst_height; dy++)
|
||||
{
|
||||
sx = 0x3FF;
|
||||
|
||||
for (u32 dx = 0; dx < dst_width; dx++)
|
||||
{
|
||||
u16 val = src_mem[sx >> 10];
|
||||
dst_mem[dx] = val;
|
||||
|
||||
sx += sx_incr;
|
||||
}
|
||||
|
||||
// store scaled line
|
||||
WriteARM9Mem(dst_mem, dst_addr, dst_width << 1);
|
||||
dst_addr += (dst_width << 1);
|
||||
|
||||
u32 synext = sy + sy_incr;
|
||||
if ((synext >> 10) != (sy >> 10))
|
||||
{
|
||||
// load new line if needed
|
||||
ReadARM9Mem(src_mem, src_addr + (((synext>>10) * rect_width) << 1), rect_width << 1);
|
||||
}
|
||||
sy = synext;
|
||||
}
|
||||
}
|
||||
|
||||
void GraphicsUcode::CmdScalingBilinear()
|
||||
{
|
||||
u32 src_addr = (CmdParams[1] << 16) | CmdParams[0];
|
||||
u32 dst_addr = (CmdParams[3] << 16) | CmdParams[2];
|
||||
u16 src_width = CmdParams[5];
|
||||
u16 src_height = CmdParams[6];
|
||||
u16 width_scale = CmdParams[7];
|
||||
u16 height_scale = CmdParams[8];
|
||||
u16 rect_xoffset = CmdParams[9];
|
||||
u16 rect_yoffset = CmdParams[10];
|
||||
u16 rect_width = CmdParams[11];
|
||||
u16 rect_height = CmdParams[12];
|
||||
|
||||
u32 dst_width = (rect_width * width_scale) / 1000;
|
||||
u32 dst_height = (rect_height * height_scale) / 1000;
|
||||
|
||||
// sanity checks
|
||||
if ((dst_width == 0) ||
|
||||
(dst_height == 0) ||
|
||||
(rect_width > 8192) ||
|
||||
(dst_width > 8192))
|
||||
{
|
||||
printf("DSP_HLE: incorrect parameters for bilinear scaling\n");
|
||||
return;
|
||||
}
|
||||
|
||||
u32 sx_incr = ((rect_width - 2) << 10) / (dst_width - 1);
|
||||
u32 sy_incr = ((rect_height - 2) << 10) / (dst_height - 1);
|
||||
u32 sx, sy;
|
||||
|
||||
src_addr += (((rect_yoffset * src_width) + rect_xoffset) << 1);
|
||||
sy = 0x200;
|
||||
|
||||
u16* src_mem = (u16*)DSi.NWRAMMap_C[2][1];
|
||||
u16* dst_mem = (u16*)DSi.NWRAMMap_C[2][3];
|
||||
|
||||
// load first lines
|
||||
// for bilinear scaling, we keep the current line and the next one
|
||||
ReadARM9Mem(src_mem, src_addr, rect_width << 1);
|
||||
ReadARM9Mem(&src_mem[rect_width], src_addr, rect_width << 1);
|
||||
|
||||
for (u32 dy = 0; dy < dst_height; dy++)
|
||||
{
|
||||
sx = 0x200;
|
||||
|
||||
for (u32 dx = 0; dx < dst_width; dx++)
|
||||
{
|
||||
u16 val[4];
|
||||
val[0] = src_mem[sx >> 10];
|
||||
val[1] = src_mem[(sx >> 10) + 1];
|
||||
val[2] = src_mem[rect_width + (sx >> 10)];
|
||||
val[3] = src_mem[rect_width + (sx >> 10) + 1];
|
||||
|
||||
u32 fx0 = sx & 0x3FF;
|
||||
u32 fx1 = 0x400 - fx0;
|
||||
u32 fy0 = sy & 0x3FF;
|
||||
u32 fy1 = 0x400 - fy0;
|
||||
|
||||
u32 vr[4], vg[4], vb[4];
|
||||
u32 fr, fg, fb;
|
||||
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
vr[i] = val[i] & 0x1F;
|
||||
vg[i] = (val[i] >> 5) & 0x1F;
|
||||
vb[i] = (val[i] >> 10) & 0x1F;
|
||||
}
|
||||
|
||||
fr = ((((vr[0] * fx1) + (vr[1] * fx0)) * fy1) +
|
||||
(((vr[2] * fx1) + (vr[3] * fx0)) * fy0)) >> 20;
|
||||
fg = ((((vg[0] * fx1) + (vg[1] * fx0)) * fy1) +
|
||||
(((vg[2] * fx1) + (vg[3] * fx0)) * fy0)) >> 20;
|
||||
fb = ((((vb[0] * fx1) + (vb[1] * fx0)) * fy1) +
|
||||
(((vb[2] * fx1) + (vb[3] * fx0)) * fy0)) >> 20;
|
||||
|
||||
dst_mem[dx] = 0x8000 | (fr & 0x1F) | ((fg & 0x1F) << 5) | ((fb & 0x1F) << 10);
|
||||
|
||||
sx += sx_incr;
|
||||
}
|
||||
|
||||
// store scaled line
|
||||
WriteARM9Mem(dst_mem, dst_addr, dst_width << 1);
|
||||
dst_addr += (dst_width << 1);
|
||||
|
||||
u32 synext = sy + sy_incr;
|
||||
if ((synext >> 10) != (sy >> 10))
|
||||
{
|
||||
// load new lines if needed
|
||||
ReadARM9Mem(src_mem, src_addr + (((synext>>10) * src_width) << 1), rect_width << 1);
|
||||
ReadARM9Mem(&src_mem[rect_width], src_addr + (((synext>>10) * src_width) << 1), rect_width << 1);
|
||||
}
|
||||
sy = synext;
|
||||
}
|
||||
}
|
||||
|
||||
// Computes one bicubic filter weight.
//
// this implements the bicubic convolution algorithm
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
// Nintendo used a=-1.0
//
// x is the distance to the sample with 10 fractional bits (0x400 = 1.0);
// callers pass values from 0 to 0x800 inclusive.
// Returns the weight with 16 fractional bits (0x10000 = 1.0); 0 for x > 2.
s32 GraphicsUcode::CalcBicubicWeight(s32 x)
{
    if (x <= 0x400)
    {
        // x <= 1
        // W(x) = x^3 - 2x^2 + 1

        s32 square = (x * x) >> 2;      // x^2, 18 fractional bits
        s32 cube = (square * x) >> 12;  // x^3, 16 fractional bits
        square = 2 * (square >> 2);     // 2x^2, 16 fractional bits

        return cube - square + 0x10000;
    }
    else if (x <= 0x800)
    {
        // 1 < x <= 2
        // W(x) = -x^3 + 5x^2 - 8x + 4

        // at x = 0x800 the product reaches 2^31, which does not fit in s32,
        // so the multiplication is carried out in 64 bits
        s32 cube = (s32)(((s64)((x * x) >> 2) * x) >> 12);
        s32 square = (5 * x * x) >> 4;
        // 8x with 16 fractional bits; multiplied rather than shifted so that
        // no negative value is ever left-shifted
        s32 linear = 8 * x * 64;

        return -cube + square - linear + 0x40000;
    }

    return 0;
}
|
||||
|
||||
void GraphicsUcode::CmdScalingBicubic()
|
||||
{
|
||||
u32 src_addr = (CmdParams[1] << 16) | CmdParams[0];
|
||||
u32 dst_addr = (CmdParams[3] << 16) | CmdParams[2];
|
||||
u16 src_width = CmdParams[5];
|
||||
u16 src_height = CmdParams[6];
|
||||
u16 width_scale = CmdParams[7];
|
||||
u16 height_scale = CmdParams[8];
|
||||
u16 rect_xoffset = CmdParams[9];
|
||||
u16 rect_yoffset = CmdParams[10];
|
||||
u16 rect_width = CmdParams[11];
|
||||
u16 rect_height = CmdParams[12];
|
||||
|
||||
u32 dst_width = (rect_width * width_scale) / 1000;
|
||||
u32 dst_height = (rect_height * height_scale) / 1000;
|
||||
|
||||
// sanity checks
|
||||
if ((dst_width == 0) ||
|
||||
(dst_height == 0) ||
|
||||
(rect_width > 4096) ||
|
||||
(dst_width > 4096))
|
||||
{
|
||||
printf("DSP_HLE: incorrect parameters for bicubic scaling\n");
|
||||
return;
|
||||
}
|
||||
|
||||
u32 sx_incr = ((rect_width - 4) << 10) / (dst_width - 1);
|
||||
u32 sy_incr = ((rect_height - 4) << 10) / (dst_height - 1);
|
||||
u32 sx, sy;
|
||||
|
||||
src_addr += (((rect_yoffset * src_width) + rect_xoffset) << 1);
|
||||
sy = 0x200;
|
||||
|
||||
u16* src_mem = (u16*)DSi.NWRAMMap_C[2][1];
|
||||
u16* dst_mem = (u16*)DSi.NWRAMMap_C[2][3];
|
||||
|
||||
// load first lines
|
||||
// for bicubic scaling, we keep 4 lines around the current line
|
||||
for (int i = 0; i < 4; i++)
|
||||
ReadARM9Mem(&src_mem[rect_width * i], src_addr + ((src_width * i) << 1), rect_width << 1);
|
||||
|
||||
for (u32 dy = 0; dy < dst_height; dy++)
|
||||
{
|
||||
sx = 0x200;
|
||||
|
||||
for (u32 dx = 0; dx < dst_width; dx++)
|
||||
{
|
||||
u32 fx = sx & 0x3FF;
|
||||
u32 fy = sy & 0x3FF;
|
||||
|
||||
s32 wx[4], wy[4];
|
||||
wx[0] = CalcBicubicWeight(0x400 + fx);
|
||||
wx[1] = CalcBicubicWeight(fx);
|
||||
wx[2] = CalcBicubicWeight(0x400 - fx);
|
||||
wx[3] = CalcBicubicWeight(0x800 - fx);
|
||||
wy[0] = CalcBicubicWeight(0x400 + fy);
|
||||
wy[1] = CalcBicubicWeight(fy);
|
||||
wy[2] = CalcBicubicWeight(0x400 - fy);
|
||||
wy[3] = CalcBicubicWeight(0x800 - fy);
|
||||
//for (int i = 0; i < 4; i++)
|
||||
// printf("weight x%d = %08X y%d = %08X\n", i, wx[i], i, wy[i]);
|
||||
|
||||
s64 tr = 0, tg = 0, tb = 0;
|
||||
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
for (int j = 0; j < 4; j++)
|
||||
{
|
||||
u16 val = src_mem[(rect_width * i) + (sx >> 10) + j];
|
||||
|
||||
s32 vr = val & 0x1F;
|
||||
s32 vg = (val >> 5) & 0x1F;
|
||||
s32 vb = (val >> 10) & 0x1F;
|
||||
|
||||
s32 weight = ((wx[j] >> 1) * (wy[i] >> 1)) >> 6;
|
||||
|
||||
tr += (vr * weight);
|
||||
tg += (vg * weight);
|
||||
tb += (vb * weight);
|
||||
}
|
||||
}
|
||||
|
||||
// round and clamp final colors
|
||||
s32 fr = (s32)((tr + 0x800000L) >> 24);
|
||||
s32 fg = (s32)((tg + 0x800000L) >> 24);
|
||||
s32 fb = (s32)((tb + 0x800000L) >> 24);
|
||||
|
||||
fr = std::clamp(fr, 0, 31);
|
||||
fg = std::clamp(fg, 0, 31);
|
||||
fb = std::clamp(fb, 0, 31);
|
||||
|
||||
dst_mem[dx] = 0x8000 | (fr & 0x1F) | ((fg & 0x1F) << 5) | ((fb & 0x1F) << 10);
|
||||
|
||||
sx += sx_incr;
|
||||
}
|
||||
|
||||
// store scaled line
|
||||
WriteARM9Mem(dst_mem, dst_addr, dst_width << 1);
|
||||
dst_addr += (dst_width << 1);
|
||||
|
||||
u32 synext = sy + sy_incr;
|
||||
if ((synext >> 10) != (sy >> 10))
|
||||
{
|
||||
// load new lines if needed
|
||||
for (int i = 0; i < 4; i++)
|
||||
ReadARM9Mem(&src_mem[rect_width * i], src_addr + ((((synext>>10) + i) * src_width) << 1), rect_width << 1);
|
||||
}
|
||||
sy = synext;
|
||||
}
|
||||
}
|
||||
|
||||
void GraphicsUcode::CmdScalingOneThird()
|
||||
{
|
||||
u32 src_addr = (CmdParams[1] << 16) | CmdParams[0];
|
||||
u32 dst_addr = (CmdParams[3] << 16) | CmdParams[2];
|
||||
u16 src_width = CmdParams[5];
|
||||
u16 src_height = CmdParams[6];
|
||||
u16 rect_xoffset = CmdParams[9];
|
||||
u16 rect_yoffset = CmdParams[10];
|
||||
u16 rect_width = CmdParams[11];
|
||||
u16 rect_height = CmdParams[12];
|
||||
|
||||
// these were already checked prior to running this
|
||||
// they are guaranteed to be multiples of 3
|
||||
u32 dst_width = rect_width / 3;
|
||||
u32 dst_height = rect_height / 3;
|
||||
|
||||
// sanity checks
|
||||
if (rect_width > 16384)
|
||||
{
|
||||
printf("DSP_HLE: incorrect parameters for one-third scaling\n");
|
||||
return;
|
||||
}
|
||||
u32 sx, sy;
|
||||
|
||||
src_addr += (((rect_yoffset * src_width) + rect_xoffset) << 1);
|
||||
sy = 0;
|
||||
|
||||
u16* src_mem = (u16*)DSi.NWRAMMap_C[2][1];
|
||||
u16* dst_mem = (u16*)DSi.NWRAMMap_C[2][3];
|
||||
|
||||
for (u32 dy = 0; dy < dst_height; dy++)
|
||||
{
|
||||
sx = 0;
|
||||
|
||||
// load source lines
|
||||
for (int i = 0; i < 3; i++)
|
||||
ReadARM9Mem(&src_mem[rect_width * i], src_addr + (((sy + i) * src_width) << 1), rect_width << 1);
|
||||
|
||||
// for this scaling method, we take a 3x3 block of source pixels
|
||||
// and average the 8 outer pixels
|
||||
for (u32 dx = 0; dx < dst_width; dx++)
|
||||
{
|
||||
u16 val[8];
|
||||
val[0] = src_mem[sx];
|
||||
val[1] = src_mem[sx + 1];
|
||||
val[2] = src_mem[sx + 2];
|
||||
val[3] = src_mem[rect_width + sx];
|
||||
val[4] = src_mem[rect_width + sx + 2];
|
||||
val[5] = src_mem[(rect_width * 2) + sx];
|
||||
val[6] = src_mem[(rect_width * 2) + sx + 1];
|
||||
val[7] = src_mem[(rect_width * 2) + sx + 2];
|
||||
|
||||
u32 fr = 0, fg = 0, fb = 0;
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
fr += (val[i] & 0x1F);
|
||||
fg += ((val[i] >> 5) & 0x1F);
|
||||
fb += ((val[i] >> 10) & 0x1F);
|
||||
}
|
||||
|
||||
dst_mem[dx] = 0x8000 | (fr >> 3) | ((fg << 2) & 0x3E0) | ((fb << 7) & 0x7C00);
|
||||
|
||||
sx += 3;
|
||||
}
|
||||
|
||||
// store scaled line
|
||||
WriteARM9Mem(dst_mem, dst_addr, dst_width << 1);
|
||||
dst_addr += (dst_width << 1);
|
||||
|
||||
sy += 3;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -45,9 +45,23 @@ public:
|
||||
void SendData(u8 index, u16 val) override;
|
||||
|
||||
protected:
|
||||
void RunUcodeCmd();
|
||||
u8 CmdState;
|
||||
u16 CmdIndex;
|
||||
u16 CmdParams[14];
|
||||
|
||||
/*void RunUcodeCmd();
|
||||
void OnUcodeCmdFinish(u32 param);
|
||||
void UcodeCmd_Scaling(u16* pipe);
|
||||
void UcodeCmd_Scaling(u16* pipe);*/
|
||||
void TryStartCmd();
|
||||
void FinishCmd(u32 param);
|
||||
|
||||
void CmdScalingNearest();
|
||||
void CmdScalingBilinear();
|
||||
s32 CalcBicubicWeight(s32 x);
|
||||
void CmdScalingBicubic();
|
||||
void CmdScalingOneThird();
|
||||
|
||||
void CmdYuvToRgb();
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -32,7 +32,7 @@ namespace DSP_HLE
|
||||
|
||||
UcodeBase::UcodeBase(melonDS::DSi& dsi) : DSi(dsi)
|
||||
{
|
||||
DSi.RegisterEventFuncs(Event_DSi_DSPHLE, this, {MakeEventThunk(UcodeBase, OnUcodeCmdFinish)});
|
||||
//DSi.RegisterEventFuncs(Event_DSi_DSPHLE, this, {MakeEventThunk(UcodeBase, OnUcodeCmdFinish)});
|
||||
}
|
||||
|
||||
UcodeBase::~UcodeBase()
|
||||
@ -56,7 +56,7 @@ void UcodeBase::Reset()
|
||||
SemaphoreOut = 0;
|
||||
SemaphoreMask = 0;
|
||||
|
||||
UcodeCmd = 0;
|
||||
//UcodeCmd = 0;
|
||||
}
|
||||
|
||||
void UcodeBase::DoSavestate(Savestate *file)
|
||||
@ -335,8 +335,8 @@ u32 UcodeBase::GetPipeLength(u16* pipe)
|
||||
|
||||
u32 UcodeBase::ReadPipe(u16* pipe, u16* data, u32 len)
|
||||
{
|
||||
u16* mem = (u16*)DSi.NWRAMMap_C[2][0];
|
||||
u16* pipebuf = &mem[pipe[0]];
|
||||
u16* mem = (u16*)DSi.NWRAMMap_C[2][pipe[0] >> 14];
|
||||
u16* pipebuf = &mem[pipe[0] & 0x3FFF];
|
||||
u16 pipelen = pipe[1] >> 1;
|
||||
u16 rdptr = pipe[2] >> 1;
|
||||
u16 wrptr = pipe[3] >> 1;
|
||||
@ -357,7 +357,58 @@ printf("-> rd=%d\n", rdptr);
|
||||
return rdlen;
|
||||
}
|
||||
|
||||
void UcodeBase::RunUcodeCmd()
|
||||
|
||||
// TODO: those could be accelerated eventually?
|
||||
// by providing a way to read/write blocks of memory in NDS
|
||||
// rather than having to decode the address for every word
|
||||
|
||||
// Copies a block from the ARM9 address space into a local halfword buffer.
//
// mem:  destination buffer
// addr: ARM9 source address; bit 0 is not examined here, so it is expected to
//       be halfword-aligned
// len:  length in bytes; transfers are done in halfwords, so an odd length is
//       rounded up to the next halfword
//
// A leading halfword is read on its own when addr is not word-aligned, the
// bulk is read with 32-bit accesses, and a trailing halfword is read last.
void UcodeBase::ReadARM9Mem(u16* mem, u32 addr, u32 len)
{
    if (len == 0)
        return;

    if (addr & 2)
    {
        *mem = DSi.ARM9Read16(addr);
        mem++;
        addr += 2;
        // never let the remaining length wrap around below zero
        len = (len > 2) ? (len - 2) : 0;
    }

    while (len >= 4)
    {
        // stored as two halfwords, lower address first: mem may not be
        // word-aligned once the leading halfword has been consumed
        u32 val = DSi.ARM9Read32(addr);
        mem[0] = (u16)(val & 0xFFFF);
        mem[1] = (u16)(val >> 16);
        mem += 2;
        addr += 4;
        len -= 4;
    }

    if (len)
        *mem = DSi.ARM9Read16(addr);
}
|
||||
|
||||
void UcodeBase::WriteARM9Mem(const u16* mem, u32 addr, u32 len)
|
||||
{
|
||||
if (addr & 2)
|
||||
{
|
||||
DSi.ARM9Write16(addr, *mem);
|
||||
mem++;
|
||||
addr += 2;
|
||||
len -= 2;
|
||||
}
|
||||
while (len >= 4)
|
||||
{
|
||||
DSi.ARM9Write32(addr, *(u32*)mem);
|
||||
mem += 2;
|
||||
addr += 4;
|
||||
len -= 4;
|
||||
}
|
||||
if (len)
|
||||
{
|
||||
DSi.ARM9Write16(addr, *mem);
|
||||
len -= 2;
|
||||
}
|
||||
}
|
||||
|
||||
/*void UcodeBase::RunUcodeCmd()
|
||||
{
|
||||
u16* pipe = LoadPipe(7);
|
||||
u32 len = GetPipeLength(pipe);
|
||||
@ -372,7 +423,7 @@ void UcodeBase::OnUcodeCmdFinish(u32 param)
|
||||
printf("finish cmd %d, param=%d, %d/%d\n", UcodeCmd, param, CmdWritten[2], ReplyWritten[2]);
|
||||
UcodeCmd = 0;
|
||||
SendReply(1, (u16)param);
|
||||
}
|
||||
}*/
|
||||
|
||||
|
||||
}
|
||||
|
@ -82,7 +82,7 @@ protected:
|
||||
u16 SemaphoreOut; // DSP -> ARM9
|
||||
u16 SemaphoreMask; // DSP -> ARM9
|
||||
|
||||
u16 UcodeCmd;
|
||||
//u16 UcodeCmd;
|
||||
|
||||
void SendReply(u8 index, u16 val);
|
||||
void SetReplyReadCallback(u8 index, fnReplyReadCb callback);
|
||||
@ -93,8 +93,11 @@ protected:
|
||||
u32 GetPipeLength(u16* pipe);
|
||||
u32 ReadPipe(u16* pipe, u16* data, u32 len);
|
||||
|
||||
void RunUcodeCmd();
|
||||
void OnUcodeCmdFinish(u32 param);
|
||||
void ReadARM9Mem(u16* mem, u32 addr, u32 len);
|
||||
void WriteARM9Mem(const u16* mem, u32 addr, u32 len);
|
||||
|
||||
//void RunUcodeCmd();
|
||||
//void OnUcodeCmdFinish(u32 param);
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -146,7 +146,10 @@ void DSi_DSP::StartDSPHLE()
|
||||
}
|
||||
|
||||
if (DSPCore)
|
||||
{
|
||||
DSPCore->Reset();
|
||||
DSPCore->Start();
|
||||
}
|
||||
}
|
||||
|
||||
void DSi_DSP::StopDSP()
|
||||
@ -196,9 +199,7 @@ DSi_DSP::DSi_DSP(melonDS::DSi& dsi) : DSi(dsi)
|
||||
SCFG_RST = false;
|
||||
|
||||
if (!__temp_dsphle)
|
||||
{
|
||||
StartDSPLLE();
|
||||
}
|
||||
|
||||
//PDATAReadFifo = new FIFO<u16>(16);
|
||||
//PDATAWriteFifo = new FIFO<u16>(16);
|
||||
@ -232,7 +233,10 @@ void DSi_DSP::Reset()
|
||||
PDATAReadFifo.Clear();
|
||||
//PDATAWriteFifo->Clear();
|
||||
//TeakraCore->Reset();
|
||||
if (DSPCore) DSPCore->Reset();
|
||||
if (__temp_dsphle)
|
||||
StopDSP();
|
||||
else if (DSPCore)
|
||||
DSPCore->Reset();
|
||||
|
||||
DSi.CancelEvent(Event_DSi_DSP);
|
||||
|
||||
@ -557,6 +561,7 @@ void DSi_DSP::Write8(u32 addr, u8 val)
|
||||
}
|
||||
}
|
||||
bool fazil = false;
|
||||
int state = 0; u32 aacaddr=0; u32 aaclen = 0;
|
||||
void DSi_DSP::Write16(u32 addr, u16 val)
|
||||
{
|
||||
Log(LogLevel::Debug,"DSP WRITE16 %d %08X %08X %08X\n", IsDSPCoreEnabled(), addr, val, DSi.GetPC(0));
|
||||
@ -620,6 +625,34 @@ void DSi_DSP::Write16(u32 addr, u16 val)
|
||||
DSP_CMD[1] = val;printf("DSP: CMD1 = %04X\n", val);
|
||||
if (DSPCore)
|
||||
DSPCore->SendData(1, val);
|
||||
{
|
||||
if (state==0 && val==1)
|
||||
{
|
||||
state = 1;
|
||||
}
|
||||
else if (state > 0)
|
||||
{
|
||||
if (state==1) aaclen = val;
|
||||
if (state==5) aacaddr = val << 16;
|
||||
if (state==6) aacaddr |= val;
|
||||
if (state==10)
|
||||
{
|
||||
printf("AAC FRAME: addr=%08X len=%08X\n", aacaddr, aaclen);
|
||||
|
||||
for (int i = 0; i < aaclen; i+=16)
|
||||
{
|
||||
printf("%08X: ", i);
|
||||
int l = 16;
|
||||
if ((i+l) > aaclen) l = aaclen-i;
|
||||
for (int j = 0; j < l; j++)
|
||||
printf("%02X ", DSi.ARM9Read8(aacaddr+i+j));
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
state++;
|
||||
if (state>10) state = 0;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 0x30: // CMD2
|
||||
DSP_CMD[2] = val;printf("DSP: CMD2 = %04X\n", val);
|
||||
|
Reference in New Issue
Block a user