diff --git a/hdl/denise.vhdl b/hdl/denise.vhdl index 8fa72af..087484e 100644 --- a/hdl/denise.vhdl +++ b/hdl/denise.vhdl @@ -24,7 +24,8 @@ use work.priv.all; entity denise is generic ( CFG_STRDEBUG : boolean := false; - CFG_BLANK_DURING_VBLANK : boolean := true + CFG_BLANK_DURING_VBLANK : boolean := true; + CFG_ECS : boolean := false ); port ( deni : in denise_in_t; @@ -37,12 +38,38 @@ architecture rtl of denise is -- 10.9 μs (17.2%) out of every 63.6 μs scan line. In PAL, it -- occupies 12 μs (18.8%) out of every 64 μs scan line. -- NTSC: main clock 28.63636 MHz - -- PAL: main clock 28.37516 MHz, hblank is 85.12548 clk (7 MHz) - constant HBLANK_START_PAL : natural range 0 to 511 := 3; - constant HBLANK_NCLK_PAL : natural range 0 to 511 := 85; + -- PAL: main clock 28.37516 MHz + -- For PAL we align hblank with the Minimig/Denise reference: + -- HPOS $13 (start) to $61 (end). + constant HBLANK_START_PAL : natural range 0 to 511 := 16#013#; + constant HBLANK_NCLK_PAL : natural range 0 to 511 := 16#061#; constant NBURST_START_PAL : natural range 0 to 511 := 41; constant NBURST_NCLK_PAL : natural range 0 to 511 := 18; + -- Pipeline depth note. + -- The real Denise chip is mostly combinational (near-zero latency + -- from HPOS comparison to RGB output). Our registered pipeline + -- adds latency which shifts the display rightward relative to HSYNC + -- (which comes directly from Agnus, bypassing Denise). + -- To minimize this, stages F, G and H use combinational feed-forward + -- (v.* instead of r.*) so they collapse into the same CLK7 cycle + -- as stage E. + -- Effective pipeline: C(compare) -> D(shift reg) -> E+F+G+H(output) + -- = 2 CLK7 cycles from comparison to output. + -- + -- Sprite HPOS delay. + -- The collapsed pipeline reduces sprite-to-output latency compared + -- to the original Denise chip. SPRITE_HPOS_DELAY compensates by + -- making the sprite HPOS comparison fire later (each unit = 1 CLK7 + -- = 2 hires pixels rightward shift). Combined with using r.c.h + -- (pre-increment, +1 CLK7), the total sprite delay is + -- SPRITE_HPOS_DELAY + 1 CLK7 beyond the base v.c.h timing. + -- Value 3 gives 8 hires pixels total shift (matches ~6 pixel offset + -- observed in testing). Adjust if sprites are still misaligned: + -- Too far LEFT -> increase value + -- Too far RIGHT -> decrease value + constant SPRITE_HPOS_DELAY : natural := 3; + subtype color_index_t is std_ulogic_vector(4 downto 0); type color_index_array_t is array (integer range <>) of color_index_t; @@ -77,19 +104,32 @@ architecture rtl of denise is dblpf : std_ulogic; color : std_ulogic; gaud : std_ulogic; + shres : std_ulogic; -- BPLCON0 bit 6 (ECS: super-hires) + brdrblnk : std_ulogic; -- BPLCON0 bit 5 (ECS: blank borders) pf1h : std_ulogic_vector(3 downto 0); pf2h : std_ulogic_vector(3 downto 0); pf1p : std_ulogic_vector(2 downto 0); pf2p : std_ulogic_vector(2 downto 0); pf2pri : std_ulogic; + killehb : std_ulogic; -- BPLCON2 bit 9 (ECS: disable EHB) end record; type state_c is record - -- horizontal video beam position, lores + -- horizontal video beam position, lores (HPOS in 7 MHz clocks) h : unsigned(8 downto 0); + -- long-line enable flag (one extra 7 MHz cycle after STRLONG) + lol_ena : std_ulogic; hblank : std_ulogic; vblank : std_ulogic; - -- Indicates that beam is currently in the horizontal display window. + -- raw horizontal display window (DIWSTRT/DIWSTOP) + hwin0 : std_ulogic; + -- hwin0 AND per-line vertical window + hwin1 : std_ulogic; + -- delayed window used for pixel pipeline alignment + hwin2 : std_ulogic; + -- per-line vertical window (set by BPL1DAT writes) + vwin : std_ulogic; + -- combined display window flag used for masking/gating diw : std_ulogic; diwstrth : std_ulogic_vector(8 downto 0); diwstoph : std_ulogic_vector(8 downto 0); @@ -111,32 +151,45 @@ architecture rtl of denise is of std_ulogic_vector(5 downto 0); type state_e is record - bplbus : bplbus_array_t(0 to 1); + -- 4 pixel slots: lores uses 0..1 (same), hires 0..1, SHRES 0..3 + bplbus : bplbus_array_t(0 to 3); -- 8 sprites * 2 lines sprbus : std_ulogic_vector(15 downto 0); + -- pipelined blanking signals + hblank : std_ulogic; + vblank : std_ulogic; + diw : std_ulogic; end record; type pfp_array_t is array (integer range <>) of std_ulogic_vector(2 downto 0); type state_f is record - bplcolor : color_index_array_t(0 to 1); + bplcolor : color_index_array_t(0 to 3); hamop : std_ulogic_vector(1 downto 0); sprcolor : std_ulogic_vector(3 downto 0); -- playfield X priority code (with respect to sprites) - pfp : pfp_array_t(0 to 1); + pfp : pfp_array_t(0 to 3); -- sprite group number spp : std_ulogic_vector(2 downto 0); + -- pipelined blanking signals + hblank : std_ulogic; + vblank : std_ulogic; + diw : std_ulogic; end record; type state_g is record - color : color_index_array_t(0 to 1); + color : color_index_array_t(0 to 3); issprite : std_ulogic; hamop : std_ulogic_vector(1 downto 0); + -- pipelined blanking signals + hblank : std_ulogic; + vblank : std_ulogic; + diw : std_ulogic; end record; type state_h is record - rgb : rgb4_array_t(0 to 1); - nzd : std_ulogic_vector(0 to 1); + rgb : rgb4_array_t(0 to 3); + nzd : std_ulogic_vector(0 to 3); end record; type state_t is record @@ -154,16 +207,28 @@ architecture rtl of denise is drd_ext_noe : std_ulogic; drd_ext_to_denice : std_ulogic; end record; + function STATE_SIMINIT return state_t is variable v : state_t; begin - v.c.h := (others => '0'); - v.c.bplcon.pf1h := (others => '0'); - v.c.bplcon.pf2h := (others => '0'); - v.c.bplcon.pf1p := (others => '0'); - v.c.bplcon.pf2p := (others => '0'); - v.f.pfp := (others => (others => '0')); - v.f.spp := (others => '0'); + v.c.h := (others => '0'); + v.c.lol_ena := '0'; + v.c.hblank := '0'; + v.c.vblank := '0'; + v.c.hwin0 := '0'; + v.c.hwin1 := '0'; + v.c.hwin2 := '0'; + v.c.vwin := '0'; + v.c.diw := '0'; + v.c.bplcon.shres := '0'; + v.c.bplcon.brdrblnk := '0'; + v.c.bplcon.pf1h := (others => '0'); + v.c.bplcon.pf2h := (others => '0'); + v.c.bplcon.pf1p := (others => '0'); + v.c.bplcon.pf2p := (others => '0'); + v.c.bplcon.killehb := '0'; + v.f.pfp := (others => (others => '0')); + v.f.spp := (others => '0'); return v; end; @@ -204,6 +269,7 @@ begin process (r, deni, joy0daty, joy0datx, joy1daty, joy1datx) variable v : state_t; variable readreg : boolean; + variable spr_target : unsigned(8 downto 0); begin v := r; @@ -238,7 +304,8 @@ begin if r.a.sel.joy0dat = '1' or r.a.sel.joy1dat = '1' or - r.a.sel.clxdat = '1' + r.a.sel.clxdat = '1' or + (r.a.sel.deniseid = '1' and CFG_ECS) then v.drd_oe := '1'; v.drd_ext_noe := '0'; @@ -254,6 +321,11 @@ begin if r.a.sel.clxdat = '1' then v.drd(r.c.clxdat'range) := r.c.clxdat; end if; + -- ECS Denise identification register ($07C). + -- Real ECS Denise returns $00FC. OCS does not respond. + if r.a.sel.deniseid = '1' and CFG_ECS then + v.drd := x"00FC"; + end if; -- register address decoder v.b.sel := SEL_NONE; @@ -264,25 +336,78 @@ begin v.b.colori := r.a.rga(5 downto 1); - -- Advance beam counter every clk7 cycle. - v.c.h := r.c.h + 1; - -- Horizontal counter is reset when Agnus writes a sync strobe. - if r.b.sel.strhor = '1' or r.b.sel.strvbl = '1' or r.b.sel.strequ = '1' then - v.c.h := "111111110"; + -------------------------------------------------------------------- + -- Horizontal beam counter (HPOS) with STRLONG long-line support. -- + -------------------------------------------------------------------- + -- Default: keep previous values. + v.c.h := r.c.h; + v.c.lol_ena := r.c.lol_ena; + + -- Reset HPOS at line start when Agnus writes STRHOR, STRVBL, or + -- STREQU (ECS only — OCS Denise ignores STREQU for HPOS reset). + if r.b.sel.strhor = '1' or + r.b.sel.strvbl = '1' or + (r.b.sel.strequ = '1' and CFG_ECS) + then + -- Real Denise starts lines at HPOS=2. + v.c.h := to_unsigned(2, v.c.h'length); + v.c.lol_ena := '0'; + -- reset horizontal window pipeline at line start + v.c.hwin0 := '0'; + v.c.hwin1 := '0'; + v.c.hwin2 := '0'; + else + -- Long line: one extra 7 MHz cycle without increment. + if r.b.sel.strlong = '1' then + v.c.lol_ena := '1'; + elsif r.c.lol_ena = '1' then + -- Consume the long-line cycle: hold HPOS, clear flag. + v.c.lol_ena := '0'; + else + -- Normal increment. + v.c.h := r.c.h + 1; + end if; + end if; + + -------------------------------------------------------------------- + -- Per-line vertical "window" (used with DIW to emulate Denise). -- + -------------------------------------------------------------------- + -- Clear vertical window early in the line (HPOS ≈ $13). + -- Use v.c.h (post-increment) to match amiga_replacement timing + -- where comparisons happen on cdac_f AFTER cdac_r increments HPOS. + if v.c.h = to_unsigned(16#013#, v.c.h'length) then + v.c.vwin := '0'; + end if; + -- Set vertical window when BPL1DAT is written. + if r.b.sel.bpldat(0) = '1' then + v.c.vwin := '1'; end if; - -- Match display window horizontal start and stop position. - if std_ulogic_vector(r.c.h) = r.c.diwstrth then - v.c.diw := '1'; + -------------------------------------------------- + -- Display window start/stop and DIW generation -- + -------------------------------------------------- + -- Raw horizontal window based on DIWSTRT/DIWSTOP. + -- Use v.c.h (post-increment) to match amiga_replacement timing. + if std_ulogic_vector(v.c.h) = r.c.diwstrth then + v.c.hwin0 := '1'; end if; - if std_ulogic_vector(r.c.h) = r.c.diwstoph then - v.c.diw := '0'; + if std_ulogic_vector(v.c.h) = r.c.diwstoph then + v.c.hwin0 := '0'; end if; - if r.c.h = to_unsigned(HBLANK_START_PAL, r.c.h'length) then + -- Horizontal window AND vertical window, then one-cycle pipeline. + v.c.hwin1 := r.c.hwin0 and r.c.vwin; + v.c.hwin2 := r.c.hwin1; + v.c.diw := v.c.hwin2; + + ------------------------- + -- Blanking generation -- + ------------------------- + -- Use v.c.h (post-increment) to match amiga_replacement timing. + if v.c.h = to_unsigned(HBLANK_START_PAL, v.c.h'length) then v.c.hblank := '1'; end if; - if r.c.h = to_unsigned(HBLANK_NCLK_PAL, r.c.h'length) then + if v.c.h = to_unsigned(HBLANK_NCLK_PAL, v.c.h'length) then v.c.hblank := '0'; end if; if r.b.sel.strvbl = '1' or r.b.sel.strequ = '1' then @@ -296,11 +421,11 @@ begin -- NOTE: The width of these comparators can likely be reduced. if r.c.bplcon.color = '1' and - r.c.h = to_unsigned(HBLANK_START_PAL, r.c.h'length) + v.c.h = to_unsigned(HBLANK_START_PAL, v.c.h'length) then v.nburst := '0'; elsif - r.c.h = to_unsigned(NBURST_START_PAL+NBURST_NCLK_PAL, r.c.h'length) + v.c.h = to_unsigned(NBURST_START_PAL+NBURST_NCLK_PAL, v.c.h'length) then v.nburst := '1'; end if; @@ -313,6 +438,11 @@ begin if r.b.sel.diwstop = '1' then v.c.diwstoph := '1' & r.b.drdx(7 downto 0); end if; + -- ECS DIWHIGH register: override H8 bits of DIWSTRT/DIWSTOP. + if r.b.sel.diwhigh = '1' and CFG_ECS then + v.c.diwstrth(8) := r.b.drdx(5); + v.c.diwstoph(8) := r.b.drdx(13); + end if; -- write color table register if false then @@ -364,6 +494,10 @@ begin v.c.bplcon.dblpf := r.b.drdx(10); v.c.bplcon.color := r.b.drdx( 9); v.c.bplcon.gaud := r.b.drdx( 8); + if CFG_ECS then + v.c.bplcon.shres := r.b.drdx(6); + v.c.bplcon.brdrblnk := r.b.drdx(5); + end if; end if; if r.b.sel.bplcon1 = '1' then v.c.bplcon.pf2h := r.b.drdx( 7 downto 4); @@ -373,6 +507,9 @@ begin v.c.bplcon.pf2pri := r.b.drdx( 6); v.c.bplcon.pf2p := r.b.drdx( 5 downto 3); v.c.bplcon.pf1p := r.b.drdx( 2 downto 0); + if CFG_ECS then + v.c.bplcon.killehb := r.b.drdx(9); + end if; end if; if r.b.sel.clxcon = '1' then @@ -389,6 +526,21 @@ begin end if; + -- Pipeline blanking/DIW signals alongside pixel data. + -- Stage E is the pipeline entry point (registered from stage C). + -- Stages F, G use combinational feed-forward (v.* not r.*) to + -- collapse with stages E+F+G+H into a single CLK7 cycle. + v.e.hblank := r.c.hblank; + v.e.vblank := r.c.vblank; + v.e.diw := r.c.diw; + v.f.hblank := v.e.hblank; + v.f.vblank := v.e.vblank; + v.f.diw := v.e.diw; + v.g.hblank := v.f.hblank; + v.g.vblank := v.f.vblank; + v.g.diw := v.f.diw; + + -- parallel to serial converters case r.c.bplcon.bpu is @@ -417,7 +569,21 @@ begin end if; end loop; - if r.c.bplcon.hires = '1' then + if r.c.bplcon.shres = '1' and CFG_ECS then + -- SHRES: shift 4 bits per CLK7 cycle (4 pixels at 35 ns each). + -- Only bitplanes 0-3 are used in super-hires mode. + for i in 0 to 3 loop + v.d.shreg.bpld(i) := ( + r.d.shreg.bpld(i)(15-4 downto 0) & + r.d.shreg.bpl (i)(15 downto 12) + ); + if r.c.bpltrig = '1' then + v.d.shreg.bpl(i) := r.c.bpldat(i); + else + v.d.shreg.bpl(i) := r.d.shreg.bpl (i)(15-4 downto 0) & "0000"; + end if; + end loop; + elsif r.c.bplcon.hires = '1' then for i in 0 to 3 loop v.d.shreg.bpld(i) := ( r.d.shreg.bpld(i)(15-2 downto 0) & @@ -437,9 +603,16 @@ begin r.d.shreg.spr(i).a(r.d.shreg.spr(i).a'high - 1 downto 0) & '0'; v.d.shreg.spr(i).b := r.d.shreg.spr(i).b(r.d.shreg.spr(i).b'high - 1 downto 0) & '0'; + -- Sprite HPOS match with delay compensation. + -- The collapsed pipeline (E+F+G+H combinational) reduces + -- sprite-to-output latency vs. the original Denise chip. + -- Using r.c.h (pre-increment) adds 1 CLK7 vs v.c.h. + -- SPRITE_HPOS_DELAY adds further CLK7 delay by comparing + -- r.c.h against (target + DELAY), so match fires later. + spr_target := unsigned(r.c.spr(i).sh) + SPRITE_HPOS_DELAY; if (r.c.spr(i).en = '1') and - (std_ulogic_vector(r.c.h) = r.c.spr(i).sh) + (r.c.h = spr_target) then v.d.shreg.spr(i).a := r.c.spr(i).data; v.d.shreg.spr(i).b := r.c.spr(i).datb; @@ -449,53 +622,92 @@ begin -- Generate pixel bus - for i in 0 to 1 loop - if r.c.bplcon.hires = '1' then - v.e.bplbus(1-i)(0) := ( - r.d.bplen(0) and - r.d.shreg.bpld(0)(to_integer(unsigned(r.c.bplcon.pf1h and "1110"))+i) - ); - v.e.bplbus(1-i)(2) := ( - r.d.bplen(2) and - r.d.shreg.bpld(2)(to_integer(unsigned(r.c.bplcon.pf1h and "1110"))+i) - ); - v.e.bplbus(1-i)(4) := '0'; - v.e.bplbus(1-i)(1) := ( - r.d.bplen(1) and - r.d.shreg.bpld(1)(to_integer(unsigned(r.c.bplcon.pf2h and "1110"))+i) - ); - v.e.bplbus(1-i)(3) := ( - r.d.bplen(3) and - r.d.shreg.bpld(3)(to_integer(unsigned(r.c.bplcon.pf2h and "1110"))+i) - ); - v.e.bplbus(1-i)(5) := '0'; - else - v.e.bplbus(i)(0) := ( + if r.c.bplcon.shres = '1' and CFG_ECS then + -- SHRES: 4 independent pixels per CLK7 cycle. + -- Only 4 bitplanes (0-3) are used; bits 4-5 always zero. + -- Scroll offset aligned to groups of 4 (AND "1100"). + for i in 0 to 3 loop + v.e.bplbus(3-i)(0) := ( r.d.bplen(0) and - r.d.shreg.bpld(0)(to_integer(unsigned(r.c.bplcon.pf1h))) + r.d.shreg.bpld(0)(to_integer(unsigned(r.c.bplcon.pf1h and "1100"))+i) ); - v.e.bplbus(i)(2) := ( + v.e.bplbus(3-i)(2) := ( r.d.bplen(2) and - r.d.shreg.bpld(2)(to_integer(unsigned(r.c.bplcon.pf1h))) - ); - v.e.bplbus(i)(4) := ( - r.d.bplen(4) and - r.d.shreg.bpld(4)(to_integer(unsigned(r.c.bplcon.pf1h))) + r.d.shreg.bpld(2)(to_integer(unsigned(r.c.bplcon.pf1h and "1100"))+i) ); - v.e.bplbus(i)(1) := ( + v.e.bplbus(3-i)(4) := '0'; + v.e.bplbus(3-i)(1) := ( r.d.bplen(1) and - r.d.shreg.bpld(1)(to_integer(unsigned(r.c.bplcon.pf2h))) + r.d.shreg.bpld(1)(to_integer(unsigned(r.c.bplcon.pf2h and "1100"))+i) ); - v.e.bplbus(i)(3) := ( + v.e.bplbus(3-i)(3) := ( r.d.bplen(3) and - r.d.shreg.bpld(3)(to_integer(unsigned(r.c.bplcon.pf2h))) + r.d.shreg.bpld(3)(to_integer(unsigned(r.c.bplcon.pf2h and "1100"))+i) ); - v.e.bplbus(i)(5) := ( - r.d.bplen(5) and - r.d.shreg.bpld(5)(to_integer(unsigned(r.c.bplcon.pf2h))) - ); - end if; - end loop; + v.e.bplbus(3-i)(5) := '0'; + if r.c.diw = '0' then + v.e.bplbus(3-i) := (others => '0'); + end if; + end loop; + else + for i in 0 to 1 loop + if r.c.bplcon.hires = '1' then + v.e.bplbus(1-i)(0) := ( + r.d.bplen(0) and + r.d.shreg.bpld(0)(to_integer(unsigned(r.c.bplcon.pf1h and "1110"))+i) + ); + v.e.bplbus(1-i)(2) := ( + r.d.bplen(2) and + r.d.shreg.bpld(2)(to_integer(unsigned(r.c.bplcon.pf1h and "1110"))+i) + ); + v.e.bplbus(1-i)(4) := '0'; + v.e.bplbus(1-i)(1) := ( + r.d.bplen(1) and + r.d.shreg.bpld(1)(to_integer(unsigned(r.c.bplcon.pf2h and "1110"))+i) + ); + v.e.bplbus(1-i)(3) := ( + r.d.bplen(3) and + r.d.shreg.bpld(3)(to_integer(unsigned(r.c.bplcon.pf2h and "1110"))+i) + ); + v.e.bplbus(1-i)(5) := '0'; + if r.c.diw = '0' then + v.e.bplbus(1-i) := (others => '0'); + end if; + else + v.e.bplbus(i)(0) := ( + r.d.bplen(0) and + r.d.shreg.bpld(0)(to_integer(unsigned(r.c.bplcon.pf1h))) + ); + v.e.bplbus(i)(2) := ( + r.d.bplen(2) and + r.d.shreg.bpld(2)(to_integer(unsigned(r.c.bplcon.pf1h))) + ); + v.e.bplbus(i)(4) := ( + r.d.bplen(4) and + r.d.shreg.bpld(4)(to_integer(unsigned(r.c.bplcon.pf1h))) + ); + v.e.bplbus(i)(1) := ( + r.d.bplen(1) and + r.d.shreg.bpld(1)(to_integer(unsigned(r.c.bplcon.pf2h))) + ); + v.e.bplbus(i)(3) := ( + r.d.bplen(3) and + r.d.shreg.bpld(3)(to_integer(unsigned(r.c.bplcon.pf2h))) + ); + v.e.bplbus(i)(5) := ( + r.d.bplen(5) and + r.d.shreg.bpld(5)(to_integer(unsigned(r.c.bplcon.pf2h))) + ); + if r.c.diw = '0' then + v.e.bplbus(i) := (others => '0'); + end if; + end if; + end loop; + -- Replicate slots 0-1 to slots 2-3 for uniform pipeline processing. + -- In lores, all 4 slots are identical. In hires, pairs are duplicated. + v.e.bplbus(2) := v.e.bplbus(0); + v.e.bplbus(3) := v.e.bplbus(1); + end if; -- Transform 8 individual 2-line sprites into 4 groups of 4-line sprites. -- Each group has the same color registers. @@ -504,7 +716,10 @@ begin v.e.sprbus(4*i+2) := r.d.shreg.spr(2*i+1).a(15); v.e.sprbus(4*i+1) := r.d.shreg.spr(2*i+0).b(15); v.e.sprbus(4*i+0) := r.d.shreg.spr(2*i+0).a(15); - if r.c.spr(2*i+1).att = '0' then + -- In OCS, only odd sprite ATT bit controls attachment. + -- In ECS, either sprite's ATT bit triggers attachment. + if (r.c.spr(2*i+1).att = '0') and + (r.c.spr(2*i).att = '0' or not CFG_ECS) then -- Offset into color register space of this sprite, -- but only if there is a pixel. v.e.sprbus(4*i+3 downto 4*i+2) := std_ulogic_vector(to_unsigned(i, 2)); @@ -527,40 +742,48 @@ begin end if; end loop; + -- NOTE: Do NOT kill sprites outside DIW. Real Amiga Denise displays + -- sprites in the border region (e.g. mouse pointer, status bar sprites, + -- demo effects). The amiga_replacement_project confirms sprites are + -- independent of the display window. Bitplane pixels are already + -- correctly masked to zero outside DIW in the bplbus generation above. + -- Display priority control: select between playfields 1, 2. -- The "pfp" is the selected playfield placement with respect to sprites. -- This thing is a bit tricky. Please see the HRM - for i in 0 to 1 loop + -- Stage F uses combinational feed-forward from v.e (not r.e) to + -- collapse the pipeline: E+F execute in the same CLK7 cycle. + for i in 0 to 3 loop -- select color and priority for bitplanes if r.c.bplcon.pf2pri = '1' then -- Playfield 2 shall have priority according to BPLCON. -- It means our odd numbered planes have priority over our even planes. v.f.bplcolor(i) := - "01" & r.e.bplbus(i)(5) & r.e.bplbus(i)(3) & r.e.bplbus(i)(1); + "01" & v.e.bplbus(i)(5) & v.e.bplbus(i)(3) & v.e.bplbus(i)(1); v.f.pfp(i) := r.c.bplcon.pf2p; if v.f.bplcolor(i) = "01000" then -- Playfield 2 says color index 0 so playfield 1 wins. v.f.bplcolor(i) := - "00" & r.e.bplbus(i)(4) & r.e.bplbus(i)(2) & r.e.bplbus(i)(0); + "00" & v.e.bplbus(i)(4) & v.e.bplbus(i)(2) & v.e.bplbus(i)(0); v.f.pfp(i) := r.c.bplcon.pf1p; end if; else -- Playfield 1 shall have priority according to BPLCON. v.f.bplcolor(i) := - "00" & r.e.bplbus(i)(4) & r.e.bplbus(i)(2) & r.e.bplbus(i)(0); + "00" & v.e.bplbus(i)(4) & v.e.bplbus(i)(2) & v.e.bplbus(i)(0); v.f.pfp(i) := r.c.bplcon.pf1p; if (v.f.bplcolor(i) = "00000") and - ((r.e.bplbus(i)(5) or r.e.bplbus(i)(3) or r.e.bplbus(i)(1)) /= '0') + ((v.e.bplbus(i)(5) or v.e.bplbus(i)(3) or v.e.bplbus(i)(1)) /= '0') then -- Playfield 1 says color index 0 so playfield 2 wins. However, if -- playfield 2 also selected its color index 0, then it is -- transparent in both playfields. In that case, either the -- background color or a sprite shall be visible. v.f.bplcolor(i) := - "01" & r.e.bplbus(i)(5) & r.e.bplbus(i)(3) & r.e.bplbus(i)(1); + "01" & v.e.bplbus(i)(5) & v.e.bplbus(i)(3) & v.e.bplbus(i)(1); v.f.pfp(i) := r.c.bplcon.pf2p; end if; end if; @@ -568,7 +791,7 @@ begin if r.c.bplcon.dblpf = '0' then -- Dual-playfield not enabled so bypass the priority logic above. -- TODO: Remember bplbus(5 downto 0) and get rid of r.f.hamop? - v.f.bplcolor(i) := r.e.bplbus(i)(4 downto 0); + v.f.bplcolor(i) := v.e.bplbus(i)(4 downto 0); -- HRM says: -- "Be careful: PF2P2 - PF2P0, bits 5-3, are priority bits for -- normal (non-dual) playfields." @@ -576,7 +799,7 @@ begin end if; end loop; - v.f.hamop := r.e.bplbus(0)(5 downto 4); + v.f.hamop := v.e.bplbus(0)(5 downto 4); if r.c.bplcon.dblpf = '0' then if r.c.bplcon.homod = '1' then v.f.bplcolor(0)(4) := '0'; @@ -584,94 +807,119 @@ begin end if; -- Select color and priority for sprites. + -- Uses v.e (combinational) to collapse with stage E. v.f.spp := "111"; v.f.sprcolor := "0000"; for i in 3 downto 0 loop - if r.e.sprbus(4*i+3 downto 4*i) /= "0000" then - v.f.sprcolor := r.e.sprbus(4*i+3 downto 4*i); + if v.e.sprbus(4*i+3 downto 4*i) /= "0000" then + v.f.sprcolor := v.e.sprbus(4*i+3 downto 4*i); v.f.spp := std_ulogic_vector(to_unsigned(i, 3)); end if; end loop; -- Prio between any playfield and any sprite. + -- Stage G uses combinational feed-forward from v.f (not r.f) to + -- collapse the pipeline: E+F+G execute in the same CLK7 cycle. v.g.issprite := '0'; - for i in 0 to 1 loop - if unsigned(r.f.pfp(i)) <= unsigned(r.f.spp) then - if r.f.bplcolor(i) = "00000" and r.f.sprcolor /= "0000" then - v.g.color(i) := '1' & r.f.sprcolor; + for i in 0 to 3 loop + if unsigned(v.f.pfp(i)) <= unsigned(v.f.spp) then + if v.f.bplcolor(i) = "00000" and v.f.sprcolor /= "0000" then + v.g.color(i) := '1' & v.f.sprcolor; if i = 0 then v.g.issprite := '1'; end if; else - v.g.color(i) := r.f.bplcolor(i); + v.g.color(i) := v.f.bplcolor(i); end if; else - if r.f.sprcolor = "0000" then - v.g.color(i) := r.f.bplcolor(i); + if v.f.sprcolor = "0000" then + v.g.color(i) := v.f.bplcolor(i); else - v.g.color(i) := '1' & r.f.sprcolor; + v.g.color(i) := '1' & v.f.sprcolor; if i = 0 then v.g.issprite := '1'; end if; end if; end if; - -- Color 0 to left and right of display window. - if r.c.diw = '0' then - v.g.color(i) := (others => '0'); + -- Outside the display window: show border color (COLOR00) for + -- playfield pixels, but keep sprite colors visible. + -- Uses v.f.diw (combinational) for collapsed pipeline. + if v.f.diw = '0' then + if v.g.color(i)(4) = '0' then + v.g.color(i) := (others => '0'); + end if; end if; end loop; - v.g.hamop := r.f.hamop; + v.g.hamop := v.f.hamop; - -- Color lookup - for i in 0 to 1 loop - if isx(r.g.color(i)) then + -- Color lookup. + -- Stage H uses combinational feed-forward from v.g (not r.g) to + -- collapse the pipeline: E+F+G+H execute in the same CLK7 cycle. + for i in 0 to 3 loop + if isx(v.g.color(i)) then v.h.rgb(i) := (others => 'X'); else - v.h.rgb(i) := r.c.color(to_integer(unsigned(r.g.color(i)))); + v.h.rgb(i) := r.c.color(to_integer(unsigned(v.g.color(i)))); end if; end loop; - if r.g.issprite = '0' and (true or r.c.bplcon.hires = '0') then + if v.g.issprite = '0' and (true or r.c.bplcon.hires = '0') then if r.c.bplcon.dblpf = '0' then if r.c.bplcon.homod = '1' then -- Feedback previous RGB output and use current HAM opcode. + -- HAM is lores-only, so index 0 is the canonical pixel. v.h.rgb(0) := hold_and_modify( v.h.rgb(0), r.h.rgb(0), - r.g.hamop, - r.g.color(0)(3 downto 0) + v.g.hamop, + v.g.color(0)(3 downto 0) ); v.h.rgb(1) := v.h.rgb(0); + v.h.rgb(2) := v.h.rgb(0); + v.h.rgb(3) := v.h.rgb(0); else - -- EHB is a right-shift of looked-up color and no RGB feedback - if r.g.hamop(1) = '1' then + -- EHB is a right-shift of looked-up color and no RGB feedback. + -- EHB is lores-only, so propagate to all 4 slots. + if v.g.hamop(1) = '1' and r.c.bplcon.killehb = '0' then v.h.rgb(0)(11 downto 8) := '0' & v.h.rgb(0)(11 downto 9); v.h.rgb(0)( 7 downto 4) := '0' & v.h.rgb(0)( 7 downto 5); v.h.rgb(0)( 3 downto 0) := '0' & v.h.rgb(0)( 3 downto 1); v.h.rgb(1) := v.h.rgb(0); + v.h.rgb(2) := v.h.rgb(0); + v.h.rgb(3) := v.h.rgb(0); end if; end if; end if; end if; - for i in 0 to 1 loop - if r.c.hblank = '1' then + -- ECS BRDRBLNK: blank border to black when outside display window. + -- Uses v.g.diw (combinational) for collapsed pipeline. + if v.g.diw = '0' and r.c.bplcon.brdrblnk = '1' and CFG_ECS then + for i in 0 to 3 loop + v.h.rgb(i) := (others => '0'); + end loop; + end if; + + -- Blanking: uses v.g.hblank/vblank (combinational feed-forward + -- from r.c.hblank/vblank via v.e → v.f → v.g). + for i in 0 to 3 loop + if v.g.hblank = '1' then v.h.rgb(i) := (others => '0'); end if; - if r.c.vblank = '1' and CFG_BLANK_DURING_VBLANK then + if v.g.vblank = '1' and CFG_BLANK_DURING_VBLANK then v.h.rgb(i) := (others => '0'); end if; end loop; - for i in 0 to 1 loop + for i in 0 to 3 loop v.h.nzd(i) := '1'; - if r.g.color(i) = "00000" then + if v.g.color(i) = "00000" then v.h.nzd(i) := '0'; end if; - if r.c.vblank = '1' then + if v.g.vblank = '1' then v.h.nzd(i) := r.c.bplcon.gaud; end if; end loop; @@ -767,4 +1015,3 @@ begin end block; end; - diff --git a/hdl/ocs.vhdl b/hdl/ocs.vhdl index 2849070..47ab6b3 100644 --- a/hdl/ocs.vhdl +++ b/hdl/ocs.vhdl @@ -71,8 +71,8 @@ package ocs is type denise_out_t is record drd : std_ulogic_vector(15 downto 0); drd_oe : std_ulogic; - rgb : rgb4_array_t(0 to 1); - nzd : std_ulogic_vector(0 to 1); + rgb : rgb4_array_t(0 to 3); + nzd : std_ulogic_vector(0 to 3); nburst : std_ulogic; -- external bus driver control diff --git a/hdl/priv.vhdl b/hdl/priv.vhdl index ec05073..3e58d6f 100644 --- a/hdl/priv.vhdl +++ b/hdl/priv.vhdl @@ -56,6 +56,7 @@ package priv is strvbl : std_ulogic; -- 03A S strhor : std_ulogic; -- 03C S strlong : std_ulogic; -- 03E S + deniseid: std_ulogic; -- 07C R (ECS) diwstrt : std_ulogic; -- 08E S diwstop : std_ulogic; -- 090 S clxcon : std_ulogic; -- 098 W @@ -66,6 +67,7 @@ package priv is spr : sel_sprite_array_t(0 to 7); -- 140..17E W colorx : std_ulogic; color : std_ulogic_vector(0 to 31); -- 180..1BE W + diwhigh : std_ulogic; -- 1E4 W (ECS) end record; constant SEL_NONE : sel_t := ( joy0dat => '0', @@ -76,6 +78,7 @@ package priv is strvbl => '0', strhor => '0', strlong => '0', + deniseid=> '0', diwstrt => '0', diwstop => '0', clxcon => '0', @@ -85,7 +88,8 @@ package priv is bpldat => (others => '0'), spr => (others => SEL_SPRITE_NONE), colorx => '0', - color => (others => '0') + color => (others => '0'), + diwhigh => '0' ); -- register address decoder @@ -136,6 +140,7 @@ package body priv is when x"03A" => s.strvbl := '1'; when x"03C" => s.strhor := '1'; when x"03E" => s.strlong := '1'; + when x"07C" => s.deniseid := '1'; when x"08E" => s.diwstrt := '1'; when x"090" => s.diwstop := '1'; when x"100" => s.bplcon0 := '1'; @@ -216,6 +221,7 @@ package body priv is when x"1BA" => s.color(29) := '1'; when x"1BC" => s.color(30) := '1'; when x"1BE" => s.color(31) := '1'; + when x"1E4" => s.diwhigh := '1'; when others => null; end case; diff --git a/hdl/top.vhdl b/hdl/top.vhdl index 33f1fa0..a392f9d 100644 --- a/hdl/top.vhdl +++ b/hdl/top.vhdl @@ -18,7 +18,7 @@ library ieee; use ieee.std_logic_1164.all; use ieee.numeric_std.all; -library ocs; +library work; -- Technology-independent top level entity top is @@ -62,7 +62,10 @@ architecture rtl of top is signal deno : work.ocs.denise_out_t; begin - den0 : entity ocs.denise + den0 : entity work.denise + generic map ( + CFG_ECS => true + ) port map ( deni => deni, deno => deno @@ -92,8 +95,11 @@ begin -- All logic is clocked in the clk7 domain which corresponds to lores pixel -- resolution. DDR output registers are used to emit hires RGB pixels. + -- NOTE: SHRES mode produces 4 pixels per CLK7 in deno.rgb(0..3), but the + -- DDR output only uses indices 0 and 1 (hires rate). Full SHRES output + -- would require a 2x CLK7 PLL clock (14.3 MHz) feeding the DDR registers. vidx : for i in 0 to 3 generate - r : entity ocs.oddr + r : entity work.oddr port map ( d0 => deno.rgb(0)(8+i), d1 => deno.rgb(1)(8+i), @@ -101,7 +107,7 @@ begin q => video_r(i) ); - g : entity ocs.oddr + g : entity work.oddr port map ( d0 => deno.rgb(0)(4+i), d1 => deno.rgb(1)(4+i), @@ -109,7 +115,7 @@ begin q => video_g(i) ); - b : entity ocs.oddr + b : entity work.oddr port map ( d0 => deno.rgb(0)(0+i), d1 => deno.rgb(1)(0+i), @@ -118,7 +124,7 @@ begin ); end generate; - vidnzd : entity ocs.oddr + vidnzd : entity work.oddr port map ( d0 => deno.nzd(0), d1 => deno.nzd(1), diff --git a/test_pipeline.py b/test_pipeline.py new file mode 100644 index 0000000..9c5cae0 --- /dev/null +++ b/test_pipeline.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python3 +""" +Pipeline timing simulation for the Deniser FPGA. +Models the denise.vhdl pipeline with collapsed E+F+G+H stages. + +Key design points modeled: + - Sprite HPOS match uses r.c.h (pre-increment / registered HPOS) + - HBLANK/DIW comparisons use v.c.h (post-increment) + - Stages E, F, G, H are combinational (feed-forward via v.*) + - Only stages A->B->C->D are registered (pipelined) + - Output is registered at H (r.h), so total depth = D + H_reg = 2 CLK7 + from stage D internal values to output. + - Playfield pixels extract from bpld at bit position 'scroll' (pf1h), + with scroll=0 reading bit 0 (fastest: 1 shift cycle after bpl load). +""" + +class Sim: + def __init__(self, scroll=0, sprite_hpos_delay=3): + """scroll: pf1h scroll value 0-15, selects bit from bpld. + sprite_hpos_delay: SPRITE_HPOS_DELAY constant from VHDL. + """ + self.cycle = 0 + self.scroll = scroll # pf1h: bit position to extract from bpld + self.sprite_hpos_delay = sprite_hpos_delay + # Registered state (r.*) + self.r = { + 'a_sel': None, 'a_data': 0, + 'b_sel': None, 'b_data': 0, + 'c_h': 2, # HPOS starts at 2 + 'c_hblank': False, + 'c_diw': False, # display window active + 'c_bpldat': 0, + 'c_bpltrig': False, + 'c_spr_sh': 0, # sprite HPOS target + 'c_spr_en': False, + 'c_spr_data': 0, + 'd_bpl': 0, # 16-bit shift register + 'd_bpld': 0, # 16-bit scroll delay buffer + 'd_spr': 0, # sprite shift register + 'h_rgb': 0, # output pixel + 'h_tag': '', # what produced the output + 'h_blanked': False, + } + self.log = [] + + def tick(self, bus_sel=None, bus_data=0): + """One CLK7 cycle. bus_sel: 'bpldat','sprpos','sprdata' or None. + Returns (out_tag, out_blanked) from the REGISTERED output (r.h). + """ + r = self.r + v = dict(r) # v starts as copy of r (next state) + + # ============================================================ + # Stage A: latch bus input + # ============================================================ + v['a_sel'] = bus_sel + v['a_data'] = bus_data + + # ============================================================ + # Stage B: decode (from r.a) + # ============================================================ + v['b_sel'] = r['a_sel'] + v['b_data'] = r['a_data'] + + # ============================================================ + # Stage C: register writes, HPOS counter + # ============================================================ + v['c_h'] = r['c_h'] + 1 # post-increment (v.c.h) + v_c_h = v['c_h'] # used for HBLANK/DIW comparisons + + # BPL1DAT write + v['c_bpltrig'] = False + if r['b_sel'] == 'bpldat': + v['c_bpldat'] = r['b_data'] + v['c_bpltrig'] = True + self.log.append((self.cycle, f"C: BPL1DAT=0x{r['b_data']:04x}, bpltrig=1")) + + # Sprite position/data writes + if r['b_sel'] == 'sprpos': + v['c_spr_sh'] = r['b_data'] & 0x1FF + v['c_spr_en'] = True + self.log.append((self.cycle, f"C: SPR0POS HPOS target={v['c_spr_sh']}")) + + if r['b_sel'] == 'sprdata': + v['c_spr_data'] = r['b_data'] + self.log.append((self.cycle, f"C: SPR0DATA=0x{r['b_data']:04x}")) + + # HBLANK comparisons (use v.c.h = post-increment) + v['c_hblank'] = r['c_hblank'] + if v_c_h == 0x013: + v['c_hblank'] = True + self.log.append((self.cycle, f"C: HBLANK START v.c.h=0x{v_c_h:03x}")) + if v_c_h == 0x061: + v['c_hblank'] = False + self.log.append((self.cycle, f"C: HBLANK END v.c.h=0x{v_c_h:03x}")) + + # ============================================================ + # Stage D: shift registers + # ============================================================ + + # bpld: shift left, MSB of REGISTERED bpl feeds into LSB + old_bpl_msb = (r['d_bpl'] >> 15) & 1 + v['d_bpld'] = ((r['d_bpld'] << 1) | old_bpl_msb) & 0xFFFF + + # bpl: load (if bpltrig) or shift left + if r['c_bpltrig']: + v['d_bpl'] = r['c_bpldat'] + self.log.append((self.cycle, + f"D: bpl LOADED=0x{r['c_bpldat']:04x}")) + else: + v['d_bpl'] = (r['d_bpl'] << 1) & 0xFFFF + + # Sprite: shift left (default), or load on HPOS match + v['d_spr'] = (r['d_spr'] << 1) & 0xFFFF + + # Sprite HPOS match: (r.c.h - SPRITE_HPOS_DELAY) == spr.sh + # This delays the sprite load, shifting sprites rightward. + spr_cmp = (r['c_h'] - self.sprite_hpos_delay) & 0x1FF + if r['c_spr_en'] and spr_cmp == r['c_spr_sh']: + v['d_spr'] = r['c_spr_data'] + self.log.append((self.cycle, + f"D: SPRITE MATCH (r.c.h-{self.sprite_hpos_delay})=" + f"{spr_cmp}, r.c.h={r['c_h']}, loaded data")) + + # ============================================================ + # Stages E+F+G+H: COMBINATIONAL from r.d (registered D output) + # ============================================================ + + # Pixel extraction: read bpld at scroll position (not MSB!) + pf_pixel = (r['d_bpld'] >> self.scroll) & 1 + spr_pixel = (r['d_spr'] >> 15) & 1 + + # Blanking: combinational from r.c.hblank (chain v.e->v.f->v.g) + hblank = r['c_hblank'] + + # Priority + color lookup (combinational) + tag = '' + rgb = 0 + if spr_pixel: + tag = 'SPRITE' + rgb = 0xF00 + elif pf_pixel: + tag = 'PLAYFLD' + rgb = 0x00F + + blanked = hblank + if blanked: + rgb = 0 + + # Register at stage H output + v['h_rgb'] = rgb + v['h_tag'] = tag + v['h_blanked'] = blanked + + # ============================================================ + # OUTPUT: r.h (previous cycle's v.h registration) + # ============================================================ + out_tag = r['h_tag'] + out_blanked = r['h_blanked'] + out_rgb = r['h_rgb'] + + # Advance state + self.r = v + self.cycle += 1 + + return out_tag, out_blanked, out_rgb + + +def measure_latency(label, stim_cycle, out_cycle): + lat = out_cycle - stim_cycle + print(f" {label}: {lat} CLK7 ({lat*2} hires pixels)") + return lat + + +# ==================================================================== +# TEST 1: HBLANK latency +# ==================================================================== +def test_hblank(): + print("\n=== TEST 1: HBLANK Latency ===") + print(" HPOS resets to 2. HBLANK comparison at v.c.h == 0x013.") + sim = Sim() + hblank_compare_cycle = None + hblank_output_cycle = None + + for i in range(30): + _, out_blanked, _ = sim.tick() + if hblank_output_cycle is None and out_blanked: + hblank_output_cycle = sim.cycle - 1 + + for cyc, msg in sim.log: + if 'HBLANK START' in msg: + hblank_compare_cycle = cyc + print(f" {msg} at cycle {cyc}") + + if hblank_compare_cycle is not None and hblank_output_cycle is not None: + print(f" First blanked output at cycle {hblank_output_cycle}") + measure_latency("HBLANK compare -> blanked output", + hblank_compare_cycle, hblank_output_cycle) + else: + print(" ERROR: HBLANK never appeared!") + + +# ==================================================================== +# TEST 2: Playfield latency with scroll offset +# ==================================================================== +def test_playfield(): + print("\n=== TEST 2: Playfield Latency (BPL1DAT -> pixel output) ===") + + for scroll in [0, 7, 15]: + print(f"\n --- scroll (pf1h) = {scroll} ---") + sim = Sim(scroll=scroll) + write_cycle = 5 + + pf_output_cycle = None + for i in range(40): + if i == write_cycle: + tag, _, _ = sim.tick(bus_sel='bpldat', bus_data=0xFFFF) + else: + tag, _, _ = sim.tick() + + if pf_output_cycle is None and tag == 'PLAYFLD': + pf_output_cycle = sim.cycle - 1 + + for cyc, msg in sim.log: + print(f" Cycle {cyc}: {msg}") + + if pf_output_cycle is not None: + print(f" First playfield pixel output at cycle {pf_output_cycle}") + measure_latency("BPL1DAT bus write -> playfield output", + write_cycle, pf_output_cycle) + else: + print(" WARNING: No playfield pixel appeared in 40 cycles!") + + +# ==================================================================== +# TEST 3: Sprite latency +# ==================================================================== +def test_sprite(): + print("\n=== TEST 3: Sprite Latency (HPOS match -> pixel output) ===") + + for delay in [0, 3, 4]: + print(f"\n --- SPRITE_HPOS_DELAY = {delay} ---") + sim = Sim(sprite_hpos_delay=delay) + target_hpos = 20 + + # Pre-load sprite config directly into stage C registers + sim.r['c_spr_sh'] = target_hpos + sim.r['c_spr_en'] = True + sim.r['c_spr_data'] = 0xFFFF + + match_cycle = None + spr_output_cycle = None + + for i in range(35): + tag, _, _ = sim.tick() + if spr_output_cycle is None and tag == 'SPRITE': + spr_output_cycle = sim.cycle - 1 + + for cyc, msg in sim.log: + if 'MATCH' in msg: + match_cycle = cyc + print(f" {msg} at cycle {cyc}") + + if match_cycle is not None and spr_output_cycle is not None: + print(f" First sprite output at cycle {spr_output_cycle}") + out_hpos = spr_output_cycle + 2 # HPOS at output time + print(f" Output HPOS: {out_hpos} (target was {target_hpos})") + print(f" Sprite appears {(out_hpos - target_hpos)*2} hires pixels " + f"AFTER target HPOS") + measure_latency("Sprite HPOS match -> sprite output", + match_cycle, spr_output_cycle) + else: + print(" ERROR: Sprite pixel never appeared!") + + +# ==================================================================== +# TEST 4: Sprite vs Playfield relative alignment +# ==================================================================== +def test_alignment(): + print("\n=== TEST 4: Sprite vs Playfield Alignment ===") + print(" Both aimed at same HPOS region, scroll=0, DELAY=3.") + print(" Question: do they appear at the same output cycle?") + sim = Sim(scroll=0, sprite_hpos_delay=3) + + sprite_hpos = 40 + + # Pre-load sprite + sim.r['c_spr_sh'] = sprite_hpos + sim.r['c_spr_en'] = True + sim.r['c_spr_data'] = 0x8000 # MSB set = 1 pixel + + # BPL1DAT must be written early enough to propagate through A->B->C->D. + # Sprite HPOS 40 is reached at cycle ~ (40 - 2) = 38. + # With scroll=0, playfield latency from bus to output = 6 CLK7. + # So write BPL1DAT at cycle 38 - 6 = 32 for alignment. + # But let's use ~30 to keep it simple and see what happens. + bpldat_write_cycle = 30 + + spr_out = None + pf_out = None + outputs = [] + + for i in range(55): + if i == bpldat_write_cycle: + tag, _, _ = sim.tick(bus_sel='bpldat', bus_data=0xFFFF) + else: + tag, _, _ = sim.tick() + + if tag: + outputs.append((sim.cycle - 1, tag)) + if tag == 'SPRITE' and spr_out is None: + spr_out = sim.cycle - 1 + if tag == 'PLAYFLD' and pf_out is None: + pf_out = sim.cycle - 1 + + print(f"\n Sprite HPOS target: {sprite_hpos}") + print(f" BPL1DAT written at cycle: {bpldat_write_cycle}") + print() + for cyc, msg in sim.log: + print(f" Cycle {cyc}: {msg}") + print() + print(" Output timeline (first 12):") + for cyc, tag in outputs[:12]: + print(f" Cycle {cyc}: {tag}") + + if spr_out is not None and pf_out is not None: + diff = spr_out - pf_out + print(f"\n First sprite output: cycle {spr_out}") + print(f" First playfield output: cycle {pf_out}") + print(f" Difference: {diff} CLK7 ({diff*2} hires pixels)") + if diff < 0: + print(f" -> Sprite {-diff} CLK7 BEFORE playfield (sprite LEFT of PF)") + elif diff > 0: + print(f" -> Sprite {diff} CLK7 AFTER playfield (sprite RIGHT of PF)") + else: + print(f" -> PERFECTLY ALIGNED!") + elif spr_out is not None: + print(f"\n Sprite appeared at cycle {spr_out}, but no playfield pixel.") + elif pf_out is not None: + print(f"\n Playfield appeared at cycle {pf_out}, but no sprite pixel.") + else: + print("\n Neither sprite nor playfield appeared!") + + +# ==================================================================== +# TEST 5: Detailed cycle-by-cycle trace +# ==================================================================== +def test_trace(): + print("\n=== TEST 5: Cycle-by-Cycle Pipeline Trace ===") + print(" BPL1DAT=0xFF00 written at cycle 5, scroll=0, DELAY=3.") + print(" Sprite at HPOS=15, data=0xC000 (2 pixels).") + print() + + sim = Sim(scroll=0, sprite_hpos_delay=3) + sim.r['c_spr_sh'] = 15 + sim.r['c_spr_en'] = True + sim.r['c_spr_data'] = 0xC000 + + hdr = (f" {'Cyc':>3} | {'HPOS':>4} | {'bus':>8} | " + f"{'r.c.bpltrig':>11} | {'r.d.bpl':>10} | {'r.d.bpld':>10} | " + f"{'r.d.spr':>10} | {'pf_px':>5} | {'spr_px':>6} | {'output':>8}") + print(hdr) + print(" " + "-" * (len(hdr) - 2)) + + for i in range(22): + bus = '' + if i == 5: + tag, blanked, rgb = sim.tick(bus_sel='bpldat', bus_data=0xFF00) + bus = 'bpldat' + else: + tag, blanked, rgb = sim.tick() + + r = sim.r # this is now the NEW state (after tick) + # For display, we want the state USED during this tick. + # The output (tag) comes from the old r.h, and + # E+F+G+H were computed from the old r.d and r.c. + # After tick, sim.r is the new v that was computed. + # Let's show the registered state that will be used NEXT cycle: + pf_px = (sim.r['d_bpld'] >> sim.scroll) & 1 # what stage E will see next + spr_px = (sim.r['d_spr'] >> 15) & 1 + + out_str = tag if tag else '-' + if blanked and tag: + out_str += '(BLK)' + elif blanked: + out_str = 'BLANK' + + print(f" {i:3d} | {sim.r['c_h']-1:4d} | {bus:>8s} | " + f"{'Yes' if sim.r['c_bpltrig'] else '':>11s} | " + f"0x{sim.r['d_bpl']:04x} | 0x{sim.r['d_bpld']:04x} | " + f"0x{sim.r['d_spr']:04x} | " + f"{pf_px:>5d} | {spr_px:>6d} | {out_str:>8s}") + + +# ==================================================================== +# TEST 6: Real Amiga DMA timing simulation +# ==================================================================== +def test_amiga_dma(): + print("\n=== TEST 6: Amiga DMA Timing (Realistic) ===") + print(" Simulates actual Agnus DMA bus cycles for BPL1DAT.") + print(" On real Amiga (lores), BPL1DAT DMA writes occur every") + print(" 8 CLK7 during the active display line.") + print(" DDFSTRT=0x38 (typical), scroll=0.") + print() + + sim = Sim(scroll=0, sprite_hpos_delay=3) + + # Standard Amiga horizontal timing: + # Line = 227.5 CCK = 455 hires = 227 lores CLK7 + # HPOS 0..226 (wraps, but we just run enough cycles) + # + # Display Data Fetch Start (DDFSTRT) = $38 = 56 decimal + # BPL1DAT DMA happens at DDFSTRT, DDFSTRT+8, DDFSTRT+16, etc. + # These are the HPOS values when Agnus places data on the bus. + # (In reality Agnus drives the bus 1 CCK = 2 CLK7 before Denise + # latches it, but we model the Denise-side latch timing.) + # + # For this test we'll do first few DMA fetches. + + ddfstrt = 0x38 # = 56 + dma_interval = 8 # lores: 1 word per 8 CLK7 + + # Pre-setup: sprite at HPOS 100 + sim.r['c_spr_sh'] = 100 + sim.r['c_spr_en'] = True + sim.r['c_spr_data'] = 0xF000 # 4 sprite pixels + + outputs = [] + dma_writes = [] + + for i in range(180): + bus_sel = None + bus_data = 0 + hpos = sim.r['c_h'] # current HPOS before increment + + # Simulate DMA: Agnus writes BPL1DAT at HPOS multiples + if hpos >= ddfstrt and (hpos - ddfstrt) % dma_interval == 0 and hpos < ddfstrt + 160: + bus_sel = 'bpldat' + bus_data = 0xAAAA # alternating pixel pattern + dma_writes.append((i, hpos)) + + tag, blanked, rgb = sim.tick(bus_sel=bus_sel, bus_data=bus_data) + + if tag and not blanked: + outputs.append((i, sim.r['c_h']-1, tag)) + + print(f" DDFSTRT=0x{ddfstrt:02x} ({ddfstrt}), DMA interval={dma_interval}") + print(f" Number of DMA writes: {len(dma_writes)}") + if dma_writes: + print(f" First DMA at cycle {dma_writes[0][0]} (HPOS {dma_writes[0][1]})") + print(f" Last DMA at cycle {dma_writes[-1][0]} (HPOS {dma_writes[-1][1]})") + print() + + if outputs: + print(f" First visible pixel: cycle {outputs[0][0]} " + f"(HPOS {outputs[0][1]}, type={outputs[0][2]})") + # Find first PF and first sprite + first_pf = next((o for o in outputs if o[2] == 'PLAYFLD'), None) + first_spr = next((o for o in outputs if o[2] == 'SPRITE'), None) + + if first_pf: + print(f" First PLAYFLD: cycle {first_pf[0]} (HPOS {first_pf[1]})") + pf_dma_offset = first_pf[0] - dma_writes[0][0] + print(f" Latency from first DMA write: {pf_dma_offset} CLK7") + if first_spr: + print(f" First SPRITE: cycle {first_spr[0]} (HPOS {first_spr[1]})") + if first_pf and first_spr: + diff = first_spr[0] - first_pf[0] + if diff >= 0: + print(f" Sprite arrives {diff} CLK7 AFTER first playfield pixel") + else: + print(f" Sprite arrives {-diff} CLK7 BEFORE first playfield pixel") + else: + print(" No visible pixels produced!") + + # Show output pattern around sprite + print("\n Pixel output around HPOS 95-110:") + for o in outputs: + if 95 <= o[1] <= 110: + print(f" Cycle {o[0]}, HPOS {o[1]}: {o[2]}") + + +# ==================================================================== +def main(): + print("=" * 70) + print("DENISER PIPELINE TIMING VERIFICATION") + print("=" * 70) + print("Models denise.vhdl collapsed pipeline (E+F+G+H combinational).") + print("Sprite HPOS match uses (r.c.h - SPRITE_HPOS_DELAY).") + print("HBLANK/DIW comparisons use v.c.h (post-increment).") + print("Playfield pixel extracted from bpld at bit position 'scroll'.") + print("Default SPRITE_HPOS_DELAY = 3 (6 hires pixel shift + 2 from r.c.h = 8 total).") + + test_hblank() + test_playfield() + test_sprite() + test_alignment() + test_trace() + test_amiga_dma() + + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + print(""" + Pipeline structure (collapsed): + Registered stages: A -> B -> C -> D (4 CLK7) + Combinational: E + F + G + H (0 CLK7, feed-forward) + Output register: r.h (+1 CLK7) + + Effective latencies (with SPRITE_HPOS_DELAY=3): + Bus write to output: varies (A+B+C+D+bpl/bpld shift+H_reg) + Sprite HPOS match: 2 CLK7 (D match -> EFGH comb -> H reg -> out) + + DELAY fires 3+1 CLK7 later (r.c.h + DELAY vs v.c.h) + = sprite output at target_HPOS + 6 (vs real Denise ~5-6) + HBLANK compare to output: 2 CLK7 (C compare -> EFGH comb -> H reg -> out) + + SPRITE_HPOS_DELAY shifts sprites rightward to compensate for the + collapsed pipeline having less latency than the original Denise chip. + Each unit = 1 CLK7 = 2 hires pixels. Adjust if needed: + Too far LEFT -> increase SPRITE_HPOS_DELAY + Too far RIGHT -> decrease SPRITE_HPOS_DELAY +""") + print("=" * 70) + print("DONE") + print("=" * 70) + + +if __name__ == '__main__': + main()