diff --git a/common.vhdl b/common.vhdl index fc6d888..3d02997 100644 --- a/common.vhdl +++ b/common.vhdl @@ -12,17 +12,16 @@ package common is carry: std_ulogic; end record; - type Fetch1ToFetch2Type is record - nia: std_ulogic_vector(63 downto 0); - end record; - - type Fetch2ToIcacheType is record + type Fetch1ToIcacheType is record req: std_ulogic; - addr: std_ulogic_vector(63 downto 0); + stop_mark: std_ulogic; + nia: std_ulogic_vector(63 downto 0); end record; type IcacheToFetch2Type is record - ack: std_ulogic; + valid: std_ulogic; + stop_mark: std_ulogic; + nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); end record; diff --git a/core.vhdl b/core.vhdl index df40d43..ef939e7 100644 --- a/core.vhdl +++ b/core.vhdl @@ -33,11 +33,10 @@ end core; architecture behave of core is -- fetch signals - signal fetch1_to_fetch2: Fetch1ToFetch2Type; signal fetch2_to_decode1: Fetch2ToDecode1Type; -- icache signals - signal fetch2_to_icache : Fetch2ToIcacheType; + signal fetch1_to_icache : Fetch1ToIcacheType; signal icache_to_fetch2 : IcacheToFetch2Type; -- decode signals @@ -74,8 +73,8 @@ architecture behave of core is -- local signals signal fetch1_stall_in : std_ulogic; + signal icache_stall_out : std_ulogic; signal fetch2_stall_in : std_ulogic; - signal fetch2_stall_out : std_ulogic; signal decode1_stall_in : std_ulogic; signal decode2_stall_out : std_ulogic; @@ -107,27 +106,12 @@ begin rst => core_rst, stall_in => fetch1_stall_in, flush_in => flush, - e_in => execute1_to_fetch1, - f_out => fetch1_to_fetch2 - ); - - fetch1_stall_in <= fetch2_stall_out or decode2_stall_out; - - fetch2_0: entity work.fetch2 - port map ( - clk => clk, - rst => core_rst, - stall_in => fetch2_stall_in, - stall_out => fetch2_stall_out, - flush_in => flush, - i_in => icache_to_fetch2, - i_out => fetch2_to_icache, stop_in => dbg_core_stop, - f_in => fetch1_to_fetch2, - f_out => fetch2_to_decode1 + e_in => execute1_to_fetch1, + i_out => fetch1_to_icache ); - 
fetch2_stall_in <= decode2_stall_out; + fetch1_stall_in <= icache_stall_out or decode2_stall_out; icache_0: entity work.icache generic map( @@ -137,13 +121,27 @@ begin port map( clk => clk, rst => icache_rst, - i_in => fetch2_to_icache, + i_in => fetch1_to_icache, i_out => icache_to_fetch2, + flush_in => flush, + stall_out => icache_stall_out, wishbone_out => wishbone_insn_out, wishbone_in => wishbone_insn_in ); - icache_rst <= rst or dbg_icache_rst; + icache_rst <= rst or dbg_icache_rst; + + fetch2_0: entity work.fetch2 + port map ( + clk => clk, + rst => core_rst, + stall_in => fetch2_stall_in, + flush_in => flush, + i_in => icache_to_fetch2, + f_out => fetch2_to_decode1 + ); + + fetch2_stall_in <= decode2_stall_out; decode1_0: entity work.decode1 port map ( @@ -274,7 +272,7 @@ begin icache_rst => dbg_icache_rst, terminate => terminate, core_stopped => dbg_core_is_stopped, - nia => fetch1_to_fetch2.nia, + nia => fetch1_to_icache.nia, terminated_out => terminated_out ); diff --git a/core_debug.vhdl b/core_debug.vhdl index c93c70d..ae4414e 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -91,15 +91,15 @@ begin reg_write: process(clk) begin if rising_edge(clk) then + -- Reset the 1-cycle "do" signals + do_step <= '0'; + do_reset <= '0'; + do_icreset <= '0'; + if (rst) then stopping <= '0'; terminated <= '0'; else - -- Reset the 1-cycle "do" signals - do_step <= '0'; - do_reset <= '0'; - do_icreset <= '0'; - -- Edge detect on dmi_req for 1-shot pulses dmi_req_1 <= dmi_req; if dmi_req = '1' and dmi_req_1 = '0' then diff --git a/fetch1.vhdl b/fetch1.vhdl index e2be900..9cd5445 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -16,51 +16,111 @@ entity fetch1 is -- Control inputs: stall_in : in std_ulogic; flush_in : in std_ulogic; + stop_in : in std_ulogic; -- redirect from execution unit e_in : in Execute1ToFetch1Type; - -- fetch data out - f_out : out Fetch1ToFetch2Type + -- Request to icache + i_out : out Fetch1ToIcacheType ); end entity fetch1; architecture 
behaviour of fetch1 is - signal r, r_next : Fetch1ToFetch2Type; + type stop_state_t is (RUNNING, STOPPED, RESTARTING); + type reg_internal_t is record + stop_state: stop_state_t; + end record; + signal r, r_next : Fetch1ToIcacheType; + signal r_int, r_next_int : reg_internal_t; begin regs : process(clk) begin if rising_edge(clk) then - if rst = '1' or e_in.redirect = '1' or stall_in = '0' then - r <= r_next; + if r /= r_next then + report "fetch1 rst:" & std_ulogic'image(rst) & + " R:" & std_ulogic'image(e_in.redirect) & + " S:" & std_ulogic'image(stall_in) & + " T:" & std_ulogic'image(stop_in) & + " nia:" & to_hstring(r_next.nia) & + " SM:" & std_ulogic'image(r_next.stop_mark); end if; + r <= r_next; + r_int <= r_next_int; end if; end process; comb : process(all) - variable v : Fetch1ToFetch2Type; + variable v : Fetch1ToIcacheType; + variable v_int : reg_internal_t; + variable increment : boolean; begin v := r; + v_int := r_int; if rst = '1' then v.nia := RESET_ADDRESS; + v_int.stop_state := RUNNING; elsif e_in.redirect = '1' then v.nia := e_in.redirect_nia; - else - v.nia := std_logic_vector(unsigned(v.nia) + 4); + elsif stall_in = '0' then + + -- For debug stop/step to work properly we need a little bit of + -- trickery here. If we just stop incrementing and send stop marks + -- when stop_in is set, then we'll increment on the cycle it clears + -- and end up never executing the instruction we were stopped on. + -- + -- Avoiding this, along with the opposite issue when stepping (stop is + -- cleared for only one cycle), is handled by the state machine below + -- + -- By default, increment addresses + increment := true; + case v_int.stop_state is + when RUNNING => + -- If we are running and stop_in is set, then stop incrementing, + -- we are now stopped. + if stop_in = '1' then + increment := false; + v_int.stop_state := STOPPED; + end if; + when STOPPED => + -- When stopped, never increment.
If stop is cleared, go to state + -- "restarting" but still don't increment that cycle. stop_in is + -- now 0 so we'll send the NIA down without a stop mark. + increment := false; + if stop_in = '0' then + v_int.stop_state := RESTARTING; + end if; + when RESTARTING => + -- We have just sent the NIA down, we can start incrementing again. + -- If stop_in is still not set, go back to running normally. + -- If stop_in is set again (that was a one-cycle "step"), go + -- back to "stopped" state which means we'll stop incrementing + -- on the next cycle. This ensures we increment the PC once after + -- sending one instruction without a stop mark. Since stop_in is + -- now set, the new PC will be sent with a stop mark and thus not + -- executed. + if stop_in = '0' then + v_int.stop_state := RUNNING; + else + v_int.stop_state := STOPPED; + end if; + end case; + + if increment then + v.nia := std_logic_vector(unsigned(v.nia) + 4); + end if; end if; + v.req := not rst; + v.stop_mark := stop_in; + r_next <= v; + r_next_int <= v_int; -- Update outputs to the icache - f_out <= r; - - report "fetch1 rst:" & std_ulogic'image(rst) & - " R:" & std_ulogic'image(e_in.redirect) & - " S:" & std_ulogic'image(stall_in) & - " nia_next:" & to_hstring(r_next.nia) & - " nia:" & to_hstring(r.nia); + i_out <= r; end process; diff --git a/fetch2.vhdl b/fetch2.vhdl index 2b34836..f8aee81 100644 --- a/fetch2.vhdl +++ b/fetch2.vhdl @@ -12,55 +12,108 @@ entity fetch2 is rst : in std_ulogic; stall_in : in std_ulogic; - stall_out : out std_ulogic; - flush_in : in std_ulogic; - stop_in : in std_ulogic; + -- Results from icache i_in : in IcacheToFetch2Type; - i_out : out Fetch2ToIcacheType; - - f_in : in Fetch1ToFetch2Type; + -- Output to decode f_out : out Fetch2ToDecode1Type ); end entity fetch2; architecture behaviour of fetch2 is + + -- The icache cannot stall, so we need to stash a cycle + -- of output from it when we stall. 
+ type reg_internal_type is record + stash : IcacheToFetch2Type; + stash_valid : std_ulogic; + stopped : std_ulogic; + end record; + + signal r_int, rin_int : reg_internal_type; signal r, rin : Fetch2ToDecode1Type; + begin regs : process(clk) begin if rising_edge(clk) then + + if (r /= rin) then + report "fetch2 rst:" & std_ulogic'image(rst) & + " S:" & std_ulogic'image(stall_in) & + " F:" & std_ulogic'image(flush_in) & + " T:" & std_ulogic'image(rin.stop_mark) & + " V:" & std_ulogic'image(rin.valid) & + " nia:" & to_hstring(rin.nia); + end if; + -- Output state remains unchanged on stall, unless we are flushing if rst = '1' or flush_in = '1' or stall_in = '0' then r <= rin; end if; + + -- Internal state is updated on every clock + r_int <= rin_int; end if; end process; comb : process(all) - variable v : Fetch2ToDecode1Type; + variable v : Fetch2ToDecode1Type; + variable v_int : reg_internal_type; + variable v_i_in : IcacheToFetch2Type; begin v := r; + v_int := r_int; - -- asynchronous icache lookup - i_out.req <= '1'; - i_out.addr <= f_in.nia; - v.valid := i_in.ack; - v.nia := f_in.nia; - v.insn := i_in.insn; - stall_out <= stop_in or not i_in.ack; + -- If stalling, stash away the current input from the icache + if stall_in = '1' and v_int.stash_valid = '0' then + v_int.stash := i_in; + v_int.stash_valid := '1'; + end if; + + -- If unstalling, source input from the stash and invalidate it, + -- otherwise source normally from the icache. + -- + v_i_in := i_in; + if v_int.stash_valid = '1' and stall_in = '0' then + v_i_in := v_int.stash; + v_int.stash_valid := '0'; + end if; + + v.valid := v_i_in.valid; + v.stop_mark := v_i_in.stop_mark; + v.nia := v_i_in.nia; + v.insn := v_i_in.insn; + + -- Clear stash internal valid bit on flush. We still mark + -- the stash itself as valid since we still want to override + -- whatever comes from icache when unstalling, but we'll + -- override it with something invalid.
+ -- + if flush_in = '1' then + v_int.stash.valid := '0'; + end if; + + -- If we are flushing or the instruction comes with a stop mark + -- we tag it as invalid so it doesn't get decoded and executed + if flush_in = '1' or v.stop_mark = '1' then - if flush_in = '1' or stop_in = '1' then v.valid := '0'; end if; - v.stop_mark := stop_in; + + -- Clear stash on reset + if rst = '1' then + v_int.stash_valid := '0'; + end if; -- Update registers rin <= v; + rin_int <= v_int; -- Update outputs f_out <= r; end process; + end architecture behaviour; diff --git a/icache.vhdl b/icache.vhdl index eddcdea..4ca39c0 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -19,9 +19,12 @@ entity icache is clk : in std_ulogic; rst : in std_ulogic; - i_in : in Fetch2ToIcacheType; + i_in : in Fetch1ToIcacheType; i_out : out IcacheToFetch2Type; + stall_out : out std_ulogic; + flush_in : in std_ulogic; + wishbone_out : out wishbone_master_out; wishbone_in : in wishbone_slave_out ); @@ -59,113 +62,194 @@ architecture rtl of icache is subtype cacheline_tag_type is std_logic_vector(TAG_BITS-1 downto 0); type cacheline_tag_array is array(0 to NUM_LINES-1) of cacheline_tag_type; - signal cachelines : cacheline_array := (others => (others => '0')); - signal tags : cacheline_tag_array := (others => (others => '0')); - signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0) := (others => '0'); - + -- Storage. 
Hopefully "cachelines" is a BRAM, the rest is LUTs + signal cachelines : cacheline_array; + signal tags : cacheline_tag_array; + signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0); attribute ram_style : string; attribute ram_style of cachelines : signal is "block"; - attribute ram_decomp : string; attribute ram_decomp of cachelines : signal is "power"; + -- Cache reload state machine type state_type is (IDLE, WAIT_ACK); type reg_internal_type is record - state : state_type; - w : wishbone_master_out; - store_index : integer range 0 to (NUM_LINES-1); - store_word : integer range 0 to (LINE_SIZE-1); + -- Cache hit state (1 cycle BRAM access) + hit_line : cacheline_type; + hit_nia : std_ulogic_vector(63 downto 0); + hit_smark : std_ulogic; + hit_valid : std_ulogic; + + -- Cache miss state (reload state machine) + state : state_type; + wb : wishbone_master_out; + store_index : integer range 0 to (NUM_LINES-1); + store_mask : std_ulogic_vector(LINE_SIZE_DW-1 downto 0); end record; signal r : reg_internal_type; - signal read_index : integer range 0 to NUM_LINES-1; - signal read_tag : std_ulogic_vector(63-OFFSET_BITS-INDEX_BITS downto 0); - signal read_miss : boolean; + -- Async signals decoding incoming requests + signal req_index : integer range 0 to NUM_LINES-1; + signal req_tag : std_ulogic_vector(TAG_BITS-1 downto 0); + signal req_word : integer range 0 to LINE_SIZE_DW*2-1; + signal req_is_hit : std_ulogic; + -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return integer is begin return to_integer(unsigned(addr((OFFSET_BITS+INDEX_BITS-1) downto OFFSET_BITS))); end; - function get_word(addr: std_ulogic_vector(63 downto 0); data: cacheline_type) return std_ulogic_vector is - variable word : integer; + -- Return the word index in a cache line for an address + function get_word(addr: std_ulogic_vector(63 downto 0)) return integer is + begin + return to_integer(unsigned(addr(OFFSET_BITS-1 downto 
2))); + end; + + -- Read the word at the given word index from a cache line + function read_word(word: integer; data: cacheline_type) return std_ulogic_vector is begin - word := to_integer(unsigned(addr(OFFSET_BITS-1 downto 2))); - return data((word+1)*32-1 downto word*32); + return data((word+1)*32-1 downto word*32); end; + -- Calculate the tag value from the address function get_tag(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is begin return addr(63 downto OFFSET_BITS+INDEX_BITS); end; + begin assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE; assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE; - icache_read : process(all) + icache_comb : process(all) begin - read_index <= get_index(i_in.addr); - read_tag <= get_tag(i_in.addr); - read_miss <= false; - - i_out.ack <= '0'; - i_out.insn <= get_word(i_in.addr, cachelines(read_index)); - - if i_in.req = '1' then - if (tags_valid(read_index) = '1') and (tags(read_index) = read_tag) then - -- report hit asynchronously - i_out.ack <= '1'; - else - read_miss <= true; - end if; - end if; + -- Calculate next index and tag index + req_index <= get_index(i_in.nia); + req_tag <= get_tag(i_in.nia); + req_word <= get_word(i_in.nia); + + -- Test if pending request is a hit + if tags(req_index) = req_tag then + req_is_hit <= tags_valid(req_index); + else + req_is_hit <= '0'; + end if; + + -- Output instruction from current cache line + -- + -- Note: This is a mild violation of our design principle of having pipeline + -- stages output from a clean latch. In this case we output the result + -- of a mux. The alternative would be to output an entire cache line + -- which I prefer not to do just yet.
+ -- + i_out.valid <= r.hit_valid; + i_out.insn <= read_word(get_word(r.hit_nia), r.hit_line); + i_out.nia <= r.hit_nia; + i_out.stop_mark <= r.hit_smark; + + -- This needs to match the latching of a new request in icache_hit + stall_out <= not req_is_hit; + + -- Wishbone requests output (from the cache miss reload machine) + wishbone_out <= r.wb; end process; - wishbone_out <= r.w; + icache_hit : process(clk) + begin + if rising_edge(clk) then + -- Assume we have nothing valid first + r.hit_valid <= '0'; + + -- Are we free to latch a new request ? + -- + -- Note: this test needs to match the equation for generating stall_out + -- + if i_in.req = '1' and req_is_hit = '1' and flush_in = '0' then + -- Read the cache line (BRAM read port) and remember the NIA + r.hit_line <= cachelines(req_index); + r.hit_nia <= i_in.nia; + r.hit_smark <= i_in.stop_mark; + r.hit_valid <= '1'; + + report "cache hit nia:" & to_hstring(i_in.nia) & + " SM:" & std_ulogic'image(i_in.stop_mark) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag); + end if; - icache_write : process(clk) + -- Flush requested ? discard... 
+ if flush_in then + r.hit_valid <= '0'; + end if; + end if; + end process; + + icache_miss : process(clk) + variable store_dword : std_ulogic_vector(OFFSET_BITS-4 downto 0); begin if rising_edge(clk) then if rst = '1' then tags_valid <= (others => '0'); + r.store_mask <= (others => '0'); r.state <= IDLE; - r.w.cyc <= '0'; - r.w.stb <= '0'; - end if; + r.wb.cyc <= '0'; + r.wb.stb <= '0'; - r.w.dat <= (others => '0'); - r.w.sel <= "11111111"; - r.w.we <= '0'; + -- We only ever do reads on wishbone + r.wb.dat <= (others => '0'); + r.wb.sel <= "11111111"; + r.wb.we <= '0'; + end if; + -- State machine case r.state is when IDLE => - if read_miss = true then + -- We need to read a cache line + if i_in.req = '1' and req_is_hit = '0' then + + report "cache miss nia:" & to_hstring(i_in.nia) & + " SM:" & std_ulogic'image(i_in.stop_mark) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag); + r.state <= WAIT_ACK; - r.store_word <= 0; - r.store_index <= read_index; + r.store_mask <= (0 => '1', others => '0'); + r.store_index <= req_index; - tags(read_index) <= read_tag; - tags_valid(read_index) <= '0'; + -- Force misses while reloading that line + tags_valid(req_index) <= '0'; + tags(req_index) <= req_tag; - r.w.adr <= i_in.addr(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0'); - r.w.cyc <= '1'; - r.w.stb <= '1'; + -- Prep for first dword read + r.wb.adr <= i_in.nia(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0'); + r.wb.cyc <= '1'; + r.wb.stb <= '1'; end if; when WAIT_ACK => if wishbone_in.ack = '1' then - cachelines(r.store_index)((r.store_word+1)*64-1 downto ((r.store_word)*64)) <= wishbone_in.dat; - r.store_word <= r.store_word + 1; + -- Store the current dword in the cache line + for i in 0 to LINE_SIZE_DW-1 loop + if r.store_mask(i) = '1' then + cachelines(r.store_index)(63 + i*64 downto i*64) <= wishbone_in.dat; + end if; + end loop; - if r.store_word = (LINE_SIZE_DW-1) then + -- That was the last word ?
We are done + if r.store_mask(LINE_SIZE_DW-1) = '1' then r.state <= IDLE; tags_valid(r.store_index) <= '1'; - r.w.cyc <= '0'; - r.w.stb <= '0'; + r.wb.cyc <= '0'; + r.wb.stb <= '0'; else - r.w.adr(OFFSET_BITS-1 downto 3) <= std_ulogic_vector(to_unsigned(r.store_word+1, OFFSET_BITS-3)); + store_dword := r.wb.adr(OFFSET_BITS-1 downto 3); + store_dword := std_ulogic_vector(unsigned(store_dword) + 1); + r.wb.adr(OFFSET_BITS-1 downto 3) <= store_dword; end if; + -- Advance to next word + r.store_mask <= r.store_mask(LINE_SIZE_DW-2 downto 0) & '0'; end if; end case; end if; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index 4955177..010d0ae 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -12,7 +12,7 @@ architecture behave of icache_tb is signal clk : std_ulogic; signal rst : std_ulogic; - signal i_out : Fetch2ToIcacheType; + signal i_out : Fetch1ToIcacheType; signal i_in : IcacheToFetch2Type; signal wb_bram_in : wishbone_master_out; @@ -30,6 +30,7 @@ begin rst => rst, i_in => i_out, i_out => i_in, + flush_in => '0', wishbone_out => wb_bram_in, wishbone_in => wb_bram_out ); @@ -66,16 +67,16 @@ begin stim: process begin i_out.req <= '0'; - i_out.addr <= (others => '0'); + i_out.nia <= (others => '0'); wait for 4*clk_period; i_out.req <= '1'; - i_out.addr <= x"0000000000000004"; + i_out.nia <= x"0000000000000004"; wait for 30*clk_period; - assert i_in.ack = '1'; + assert i_in.valid = '1'; assert i_in.insn = x"00000001"; i_out.req <= '0'; @@ -84,31 +85,31 @@ begin -- hit i_out.req <= '1'; - i_out.addr <= x"0000000000000008"; - wait for clk_period/2; - assert i_in.ack = '1'; + i_out.nia <= x"0000000000000008"; + wait for clk_period; + assert i_in.valid = '1'; assert i_in.insn = x"00000002"; - wait for clk_period/2; + wait for clk_period; -- another miss i_out.req <= '1'; - i_out.addr <= x"0000000000000040"; + i_out.nia <= x"0000000000000040"; wait for 30*clk_period; - assert i_in.ack = '1'; + assert i_in.valid = '1'; assert i_in.insn = x"00000010"; -- test something that 
aliases i_out.req <= '1'; - i_out.addr <= x"0000000000000100"; - wait for clk_period/2; - assert i_in.ack = '0'; - wait for clk_period/2; + i_out.nia <= x"0000000000000100"; + wait for clk_period; + assert i_in.valid = '0'; + wait for clk_period; wait for 30*clk_period; - assert i_in.ack = '1'; + assert i_in.valid = '1'; assert i_in.insn = x"00000040"; i_out.req <= '0';