From fb01dc8a90d5a99f132830b9ba3a8db29319ef4c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 25 Sep 2019 16:50:24 +1000 Subject: [PATCH 1/7] icache: Reformat icache No code change Signed-off-by: Benjamin Herrenschmidt --- icache.vhdl | 54 ++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/icache.vhdl b/icache.vhdl index 2565219..eddcdea 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -140,33 +140,33 @@ begin r.w.we <= '0'; case r.state is - when IDLE => - if read_miss = true then - r.state <= WAIT_ACK; - r.store_word <= 0; - r.store_index <= read_index; - - tags(read_index) <= read_tag; - tags_valid(read_index) <= '0'; - - r.w.adr <= i_in.addr(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0'); - r.w.cyc <= '1'; - r.w.stb <= '1'; - end if; - when WAIT_ACK => - if wishbone_in.ack = '1' then - cachelines(r.store_index)((r.store_word+1)*64-1 downto ((r.store_word)*64)) <= wishbone_in.dat; - r.store_word <= r.store_word + 1; - - if r.store_word = (LINE_SIZE_DW-1) then - r.state <= IDLE; - tags_valid(r.store_index) <= '1'; - r.w.cyc <= '0'; - r.w.stb <= '0'; - else - r.w.adr(OFFSET_BITS-1 downto 3) <= std_ulogic_vector(to_unsigned(r.store_word+1, OFFSET_BITS-3)); - end if; - end if; + when IDLE => + if read_miss = true then + r.state <= WAIT_ACK; + r.store_word <= 0; + r.store_index <= read_index; + + tags(read_index) <= read_tag; + tags_valid(read_index) <= '0'; + + r.w.adr <= i_in.addr(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0'); + r.w.cyc <= '1'; + r.w.stb <= '1'; + end if; + when WAIT_ACK => + if wishbone_in.ack = '1' then + cachelines(r.store_index)((r.store_word+1)*64-1 downto ((r.store_word)*64)) <= wishbone_in.dat; + r.store_word <= r.store_word + 1; + + if r.store_word = (LINE_SIZE_DW-1) then + r.state <= IDLE; + tags_valid(r.store_index) <= '1'; + r.w.cyc <= '0'; + r.w.stb <= '0'; + else + r.w.adr(OFFSET_BITS-1 downto 3) <= 
std_ulogic_vector(to_unsigned(r.store_word+1, OFFSET_BITS-3)); + end if; + end if; end case; end if; end process; From 3589f92d5ac7921917a3b3bc573e838de760e569 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 1 Oct 2019 14:24:07 +1000 Subject: [PATCH 2/7] fetch1: Simplify a bit There is no need to have two different state records Signed-off-by: Benjamin Herrenschmidt --- fetch1.vhdl | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/fetch1.vhdl b/fetch1.vhdl index 643e8c8..e2be900 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -26,49 +26,41 @@ entity fetch1 is end entity fetch1; architecture behaviour of fetch1 is - type reg_internal_type is record - nia_next : std_ulogic_vector(63 downto 0); - end record; - signal r_int, rin_int : reg_internal_type; - signal r, rin : Fetch1ToFetch2Type; + signal r, r_next : Fetch1ToFetch2Type; begin + regs : process(clk) begin if rising_edge(clk) then - r <= rin; - r_int <= rin_int; + if rst = '1' or e_in.redirect = '1' or stall_in = '0' then + r <= r_next; + end if; end if; end process; comb : process(all) - variable v : Fetch1ToFetch2Type; - variable v_int : reg_internal_type; + variable v : Fetch1ToFetch2Type; begin v := r; - v_int := r_int; - - if stall_in = '0' then - v.nia := r_int.nia_next; - end if; - - if e_in.redirect = '1' then - v.nia := e_in.redirect_nia; - end if; if rst = '1' then - v.nia := RESET_ADDRESS; + v.nia := RESET_ADDRESS; + elsif e_in.redirect = '1' then + v.nia := e_in.redirect_nia; + else + v.nia := std_logic_vector(unsigned(v.nia) + 4); end if; - v_int.nia_next := std_logic_vector(unsigned(v.nia) + 4); - - -- Update registers - rin <= v; - rin_int <= v_int; + r_next <= v; - -- Update outputs + -- Update outputs to the icache f_out <= r; - report "fetch1 R:" & std_ulogic'image(e_in.redirect) & " v.nia:" & to_hstring(v.nia) & " f_out.nia:" & to_hstring(f_out.nia); + report "fetch1 rst:" & std_ulogic'image(rst) & + " R:" & 
std_ulogic'image(e_in.redirect) & + " S:" & std_ulogic'image(stall_in) & + " nia_next:" & to_hstring(r_next.nia) & + " nia:" & to_hstring(r.nia); end process; From d415e5544afe69469bb58445d818d87f7d2b352f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 27 Sep 2019 13:23:56 +1000 Subject: [PATCH 3/7] fetch/icache: Fit icache in BRAM The goal is to have the icache fit in BRAM by latching the output into a register. In order to avoid timing issues, we need to give the BRAM a full cycle on reads, and thus we source the BRAM address directly from fetch1 latched NIA. (Note: This will be problematic if/when we want to hash the address, we'll probably be better off having fetch1 latch a fully hashed address along with the normal one, so the icache can use the former to address the BRAM and pass the latter along) One difficulty is that we cannot really stall the icache without adding more combo logic that would break the "one full cycle" BRAM model. This means that on stalls from decode, by the time we stall fetch1, it has already gone to the next address, which the icache is already latching. We work around this by having a "stash" buffer in fetch2 that will stash away the icache output on a stall, and override the output of the icache with the content of the stash buffer when unstalling. This requires a rewrite of the stop/step debug logic as well. We now do most of the hard work in fetch1 which makes more sense. Note: Vivado is still not inferring a built-in output register for the BRAMs. I don't want to add another cycle... I don't fully understand why it wouldn't be able to treat current_row as such but clearly it won't. At least the timing seems good enough now for 100MHz, possibly more. 
Signed-off-by: Benjamin Herrenschmidt --- common.vhdl | 13 ++-- core.vhdl | 46 ++++++------ core_debug.vhdl | 10 +-- fetch1.vhdl | 90 ++++++++++++++++++---- fetch2.vhdl | 85 +++++++++++++++++---- icache.vhdl | 194 ++++++++++++++++++++++++++++++++++-------------- icache_tb.vhdl | 31 ++++---- 7 files changed, 332 insertions(+), 137 deletions(-) diff --git a/common.vhdl b/common.vhdl index fc6d888..3d02997 100644 --- a/common.vhdl +++ b/common.vhdl @@ -12,17 +12,16 @@ package common is carry: std_ulogic; end record; - type Fetch1ToFetch2Type is record - nia: std_ulogic_vector(63 downto 0); - end record; - - type Fetch2ToIcacheType is record + type Fetch1ToIcacheType is record req: std_ulogic; - addr: std_ulogic_vector(63 downto 0); + stop_mark: std_ulogic; + nia: std_ulogic_vector(63 downto 0); end record; type IcacheToFetch2Type is record - ack: std_ulogic; + valid: std_ulogic; + stop_mark: std_ulogic; + nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); end record; diff --git a/core.vhdl b/core.vhdl index df40d43..ef939e7 100644 --- a/core.vhdl +++ b/core.vhdl @@ -33,11 +33,10 @@ end core; architecture behave of core is -- fetch signals - signal fetch1_to_fetch2: Fetch1ToFetch2Type; signal fetch2_to_decode1: Fetch2ToDecode1Type; -- icache signals - signal fetch2_to_icache : Fetch2ToIcacheType; + signal fetch1_to_icache : Fetch1ToIcacheType; signal icache_to_fetch2 : IcacheToFetch2Type; -- decode signals @@ -74,8 +73,8 @@ architecture behave of core is -- local signals signal fetch1_stall_in : std_ulogic; + signal icache_stall_out : std_ulogic; signal fetch2_stall_in : std_ulogic; - signal fetch2_stall_out : std_ulogic; signal decode1_stall_in : std_ulogic; signal decode2_stall_out : std_ulogic; @@ -107,27 +106,12 @@ begin rst => core_rst, stall_in => fetch1_stall_in, flush_in => flush, - e_in => execute1_to_fetch1, - f_out => fetch1_to_fetch2 - ); - - fetch1_stall_in <= fetch2_stall_out or decode2_stall_out; - - fetch2_0: entity work.fetch2 - 
port map ( - clk => clk, - rst => core_rst, - stall_in => fetch2_stall_in, - stall_out => fetch2_stall_out, - flush_in => flush, - i_in => icache_to_fetch2, - i_out => fetch2_to_icache, stop_in => dbg_core_stop, - f_in => fetch1_to_fetch2, - f_out => fetch2_to_decode1 + e_in => execute1_to_fetch1, + i_out => fetch1_to_icache ); - fetch2_stall_in <= decode2_stall_out; + fetch1_stall_in <= icache_stall_out or decode2_stall_out; icache_0: entity work.icache generic map( @@ -137,13 +121,27 @@ begin port map( clk => clk, rst => icache_rst, - i_in => fetch2_to_icache, + i_in => fetch1_to_icache, i_out => icache_to_fetch2, + flush_in => flush, + stall_out => icache_stall_out, wishbone_out => wishbone_insn_out, wishbone_in => wishbone_insn_in ); - icache_rst <= rst or dbg_icache_rst; + icache_rst <= rst or dbg_icache_rst; + + fetch2_0: entity work.fetch2 + port map ( + clk => clk, + rst => core_rst, + stall_in => fetch2_stall_in, + flush_in => flush, + i_in => icache_to_fetch2, + f_out => fetch2_to_decode1 + ); + + fetch2_stall_in <= decode2_stall_out; decode1_0: entity work.decode1 port map ( @@ -274,7 +272,7 @@ begin icache_rst => dbg_icache_rst, terminate => terminate, core_stopped => dbg_core_is_stopped, - nia => fetch1_to_fetch2.nia, + nia => fetch1_to_icache.nia, terminated_out => terminated_out ); diff --git a/core_debug.vhdl b/core_debug.vhdl index c93c70d..ae4414e 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -91,15 +91,15 @@ begin reg_write: process(clk) begin if rising_edge(clk) then + -- Reset the 1-cycle "do" signals + do_step <= '0'; + do_reset <= '0'; + do_icreset <= '0'; + if (rst) then stopping <= '0'; terminated <= '0'; else - -- Reset the 1-cycle "do" signals - do_step <= '0'; - do_reset <= '0'; - do_icreset <= '0'; - -- Edge detect on dmi_req for 1-shot pulses dmi_req_1 <= dmi_req; if dmi_req = '1' and dmi_req_1 = '0' then diff --git a/fetch1.vhdl b/fetch1.vhdl index e2be900..9cd5445 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -16,51 +16,111 
@@ entity fetch1 is -- Control inputs: stall_in : in std_ulogic; flush_in : in std_ulogic; + stop_in : in std_ulogic; -- redirect from execution unit e_in : in Execute1ToFetch1Type; - -- fetch data out - f_out : out Fetch1ToFetch2Type + -- Request to icache + i_out : out Fetch1ToIcacheType ); end entity fetch1; architecture behaviour of fetch1 is - signal r, r_next : Fetch1ToFetch2Type; + type stop_state_t is (RUNNING, STOPPED, RESTARTING); + type reg_internal_t is record + stop_state: stop_state_t; + end record; + signal r, r_next : Fetch1ToIcacheType; + signal r_int, r_next_int : reg_internal_t; begin regs : process(clk) begin if rising_edge(clk) then - if rst = '1' or e_in.redirect = '1' or stall_in = '0' then - r <= r_next; + if r /= r_next then + report "fetch1 rst:" & std_ulogic'image(rst) & + " R:" & std_ulogic'image(e_in.redirect) & + " S:" & std_ulogic'image(stall_in) & + " T:" & std_ulogic'image(stop_in) & + " nia:" & to_hstring(r_next.nia) & + " SM:" & std_ulogic'image(r_next.stop_mark); end if; + r <= r_next; + r_int <= r_next_int; end if; end process; comb : process(all) - variable v : Fetch1ToFetch2Type; + variable v : Fetch1ToIcacheType; + variable v_int : reg_internal_t; + variable increment : boolean; begin v := r; + v_int := r_int; if rst = '1' then v.nia := RESET_ADDRESS; + v_int.stop_state := RUNNING; elsif e_in.redirect = '1' then v.nia := e_in.redirect_nia; - else - v.nia := std_logic_vector(unsigned(v.nia) + 4); + elsif stall_in = '0' then + + -- For debug stop/step to work properly we need a little bit of + -- trickery here. If we just stop incrementing and send stop marks + -- when stop_in is set, then we'll increment on the cycle it clears + -- and end up never executing the instruction we were stopped on. 
+ -- + -- Avoid this along with the opposite issue when stepping (stop is + -- cleared for only one cycle) is handled by the state machine below + -- + -- By default, increment addresses + increment := true; + case v_int.stop_state is + when RUNNING => + -- If we are running and stop_in is set, then stop incrementing, + -- we are now stopped. + if stop_in = '1' then + increment := false; + v_int.stop_state := STOPPED; + end if; + when STOPPED => + -- When stopped, never increment. If stop is cleared, go to state + -- "restarting" but still don't increment that cycle. stop_in is + -- now 0 so we'll send the NIA down without a stop mark. + increment := false; + if stop_in = '0' then + v_int.stop_state := RESTARTING; + end if; + when RESTARTING => + -- We have just sent the NIA down, we can start incrementing again. + -- If stop_in is still not set, go back to running normally. + -- If stop_in is set again (that was a one-cycle "step"), go + -- back to "stopped" state which means we'll stop incrementing + -- on the next cycle. This ensures we increment the PC once after + -- sending one instruction without a stop mark. Since stop_in is + -- now set, the new PC will be sent with a stop mark and thus not + -- executed. 
+ if stop_in = '0' then + v_int.stop_state := RUNNING; + else + v_int.stop_state := STOPPED; + end if; + end case; + + if increment then + v.nia := std_logic_vector(unsigned(v.nia) + 4); + end if; end if; + v.req := not rst; + v.stop_mark := stop_in; + r_next <= v; + r_next_int <= v_int; -- Update outputs to the icache - f_out <= r; - - report "fetch1 rst:" & std_ulogic'image(rst) & - " R:" & std_ulogic'image(e_in.redirect) & - " S:" & std_ulogic'image(stall_in) & - " nia_next:" & to_hstring(r_next.nia) & - " nia:" & to_hstring(r.nia); + i_out <= r; end process; diff --git a/fetch2.vhdl b/fetch2.vhdl index 2b34836..f8aee81 100644 --- a/fetch2.vhdl +++ b/fetch2.vhdl @@ -12,55 +12,108 @@ entity fetch2 is rst : in std_ulogic; stall_in : in std_ulogic; - stall_out : out std_ulogic; - flush_in : in std_ulogic; - stop_in : in std_ulogic; + -- Results from icache i_in : in IcacheToFetch2Type; - i_out : out Fetch2ToIcacheType; - - f_in : in Fetch1ToFetch2Type; + -- Output to decode f_out : out Fetch2ToDecode1Type ); end entity fetch2; architecture behaviour of fetch2 is + + -- The icache cannot stall, so we need to stash a cycle + -- of output from it when we stall. 
+ type reg_internal_type is record + stash : IcacheToFetch2Type; + stash_valid : std_ulogic; + stopped : std_ulogic; + end record; + + signal r_int, rin_int : reg_internal_type; signal r, rin : Fetch2ToDecode1Type; + begin regs : process(clk) begin if rising_edge(clk) then + + if (r /= rin) then + report "fetch2 rst:" & std_ulogic'image(rst) & + " S:" & std_ulogic'image(stall_in) & + " F:" & std_ulogic'image(flush_in) & + " T:" & std_ulogic'image(rin.stop_mark) & + " V:" & std_ulogic'image(rin.valid) & + " nia:" & to_hstring(rin.nia); + end if; + -- Output state remains unchanged on stall, unless we are flushing if rst = '1' or flush_in = '1' or stall_in = '0' then r <= rin; end if; + + -- Internal state is updated on every clock + r_int <= rin_int; end if; end process; comb : process(all) - variable v : Fetch2ToDecode1Type; + variable v : Fetch2ToDecode1Type; + variable v_int : reg_internal_type; + variable v_i_in : IcacheToFetch2Type; begin v := r; + v_int := r_int; - -- asynchronous icache lookup - i_out.req <= '1'; - i_out.addr <= f_in.nia; - v.valid := i_in.ack; - v.nia := f_in.nia; - v.insn := i_in.insn; - stall_out <= stop_in or not i_in.ack; + -- If stalling, stash away the current input from the icache + if stall_in = '1' and v_int.stash_valid = '0' then + v_int.stash := i_in; + v_int.stash_valid := '1'; + end if; + + -- If unstalling, source input from the stash and invalidate it, + -- otherwise source normally from the icache. + -- + v_i_in := i_in; + if v_int.stash_valid = '1' and stall_in = '0' then + v_i_in := v_int.stash; + v_int.stash_valid := '0'; + end if; + + v.valid := v_i_in.valid; + v.stop_mark := v_i_in.stop_mark; + v.nia := v_i_in.nia; + v.insn := v_i_in.insn; + + -- Clear stash internal valid bit on flush. We still mark + -- the stash itself as valid since we still want to override + -- whatever comes form icache when unstalling, but we'll + -- override it with something invalid. 
+ -- + if flush_in = '1' then + v_int.stash.valid := '0'; + end if; + + -- If we are flushing or the instruction comes with a stop mark + -- we tag it as invalid so it doesn't get decoded and executed + if flush_in = '1' or v.stop_mark = '1' then - if flush_in = '1' or stop_in = '1' then v.valid := '0'; end if; - v.stop_mark := stop_in; + + -- Clear stash on reset + if rst = '1' then + v_int.stash_valid := '0'; + end if; -- Update registers rin <= v; + rin_int <= v_int; -- Update outputs f_out <= r; end process; + end architecture behaviour; diff --git a/icache.vhdl b/icache.vhdl index eddcdea..4ca39c0 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -19,9 +19,12 @@ entity icache is clk : in std_ulogic; rst : in std_ulogic; - i_in : in Fetch2ToIcacheType; + i_in : in Fetch1ToIcacheType; i_out : out IcacheToFetch2Type; + stall_out : out std_ulogic; + flush_in : in std_ulogic; + wishbone_out : out wishbone_master_out; wishbone_in : in wishbone_slave_out ); @@ -59,113 +62,194 @@ architecture rtl of icache is subtype cacheline_tag_type is std_logic_vector(TAG_BITS-1 downto 0); type cacheline_tag_array is array(0 to NUM_LINES-1) of cacheline_tag_type; - signal cachelines : cacheline_array := (others => (others => '0')); - signal tags : cacheline_tag_array := (others => (others => '0')); - signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0) := (others => '0'); - + -- Storage. 
Hopefully "cachelines" is a BRAM, the rest is LUTs + signal cachelines : cacheline_array; + signal tags : cacheline_tag_array; + signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0); attribute ram_style : string; attribute ram_style of cachelines : signal is "block"; - attribute ram_decomp : string; attribute ram_decomp of cachelines : signal is "power"; + -- Cache reload state machine type state_type is (IDLE, WAIT_ACK); type reg_internal_type is record - state : state_type; - w : wishbone_master_out; - store_index : integer range 0 to (NUM_LINES-1); - store_word : integer range 0 to (LINE_SIZE-1); + -- Cache hit state (1 cycle BRAM access) + hit_line : cacheline_type; + hit_nia : std_ulogic_vector(63 downto 0); + hit_smark : std_ulogic; + hit_valid : std_ulogic; + + -- Cache miss state (reload state machine) + state : state_type; + wb : wishbone_master_out; + store_index : integer range 0 to (NUM_LINES-1); + store_mask : std_ulogic_vector(LINE_SIZE_DW-1 downto 0); end record; signal r : reg_internal_type; - signal read_index : integer range 0 to NUM_LINES-1; - signal read_tag : std_ulogic_vector(63-OFFSET_BITS-INDEX_BITS downto 0); - signal read_miss : boolean; + -- Async signals decoding incoming requests + signal req_index : integer range 0 to NUM_LINES-1; + signal req_tag : std_ulogic_vector(TAG_BITS-1 downto 0); + signal req_word : integer range 0 to LINE_SIZE_DW*2-1; + signal req_is_hit : std_ulogic; + -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return integer is begin return to_integer(unsigned(addr((OFFSET_BITS+INDEX_BITS-1) downto OFFSET_BITS))); end; - function get_word(addr: std_ulogic_vector(63 downto 0); data: cacheline_type) return std_ulogic_vector is - variable word : integer; + -- Return the word index in a cache line for an address + function get_word(addr: std_ulogic_vector(63 downto 0)) return integer is + begin + return to_integer(unsigned(addr(OFFSET_BITS-1 downto 
2))); + end; + + -- Read a word in a cache line for an address + function read_word(word: integer; data: cacheline_type) return std_ulogic_vector is begin - word := to_integer(unsigned(addr(OFFSET_BITS-1 downto 2))); - return data((word+1)*32-1 downto word*32); + return data((word+1)*32-1 downto word*32); end; + -- Calculate the tag value from the address function get_tag(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is begin return addr(63 downto OFFSET_BITS+INDEX_BITS); end; + begin assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE; assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE; - icache_read : process(all) + icache_comb : process(all) begin - read_index <= get_index(i_in.addr); - read_tag <= get_tag(i_in.addr); - read_miss <= false; - - i_out.ack <= '0'; - i_out.insn <= get_word(i_in.addr, cachelines(read_index)); - - if i_in.req = '1' then - if (tags_valid(read_index) = '1') and (tags(read_index) = read_tag) then - -- report hit asynchronously - i_out.ack <= '1'; - else - read_miss <= true; - end if; - end if; + -- Calculate next index and tag index + req_index <= get_index(i_in.nia); + req_tag <= get_tag(i_in.nia); + req_word <= get_word(i_in.nia); + + -- Test if pending request is a hit + if tags(req_index) = req_tag then + req_is_hit <= tags_valid(req_index); + else + req_is_hit <= '0'; + end if; + + -- Output instruction from current cache line + -- + -- Note: This is a mild violation of our design principle of having pipeline + -- stages output from a clean latch. In this case we output the result + -- of a mux. The alternative would be output an entire cache line + -- which I prefer not to do just yet. 
+ -- + i_out.valid <= r.hit_valid; + i_out.insn <= read_word(get_word(r.hit_nia), r.hit_line); + i_out.nia <= r.hit_nia; + i_out.stop_mark <= r.hit_smark; + + -- This needs to match the latching of a new request in icache_hit + stall_out <= not req_is_hit; + + -- Wishbone requests output (from the cache miss reload machine) + wishbone_out <= r.wb; end process; - wishbone_out <= r.w; + icache_hit : process(clk) + begin + if rising_edge(clk) then + -- Assume we have nothing valid first + r.hit_valid <= '0'; + + -- Are we free to latch a new request ? + -- + -- Note: this test needs to match the equation for generating stall_out + -- + if i_in.req = '1' and req_is_hit = '1' and flush_in = '0' then + -- Read the cache line (BRAM read port) and remember the NIA + r.hit_line <= cachelines(req_index); + r.hit_nia <= i_in.nia; + r.hit_smark <= i_in.stop_mark; + r.hit_valid <= '1'; + + report "cache hit nia:" & to_hstring(i_in.nia) & + " SM:" & std_ulogic'image(i_in.stop_mark) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag); + end if; - icache_write : process(clk) + -- Flush requested ? discard... 
+ if flush_in then + r.hit_valid <= '0'; + end if; + end if; + end process; + + icache_miss : process(clk) + variable store_dword : std_ulogic_vector(OFFSET_BITS-4 downto 0); begin if rising_edge(clk) then if rst = '1' then tags_valid <= (others => '0'); + r.store_mask <= (others => '0'); r.state <= IDLE; - r.w.cyc <= '0'; - r.w.stb <= '0'; - end if; + r.wb.cyc <= '0'; + r.wb.stb <= '0'; - r.w.dat <= (others => '0'); - r.w.sel <= "11111111"; - r.w.we <= '0'; + -- We only ever do reads on wishbone + r.wb.dat <= (others => '0'); + r.wb.sel <= "11111111"; + r.wb.we <= '0'; + end if; + -- State machine case r.state is when IDLE => - if read_miss = true then + -- We need to read a cache line + if i_in.req = '1' and req_is_hit = '0' then + + report "cache miss nia:" & to_hstring(i_in.nia) & + " SM:" & std_ulogic'image(i_in.stop_mark) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag); + r.state <= WAIT_ACK; - r.store_word <= 0; - r.store_index <= read_index; + r.store_mask <= (0 => '1', others => '0'); + r.store_index <= req_index; - tags(read_index) <= read_tag; - tags_valid(read_index) <= '0'; + -- Force misses while reloading that line + tags_valid(req_index) <= '0'; + tags(req_index) <= req_tag; - r.w.adr <= i_in.addr(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0'); - r.w.cyc <= '1'; - r.w.stb <= '1'; + -- Prep for first dword read + r.wb.adr <= i_in.nia(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0'); + r.wb.cyc <= '1'; + r.wb.stb <= '1'; end if; when WAIT_ACK => if wishbone_in.ack = '1' then - cachelines(r.store_index)((r.store_word+1)*64-1 downto ((r.store_word)*64)) <= wishbone_in.dat; - r.store_word <= r.store_word + 1; + -- Store the current dword in both the cache + for i in 0 to LINE_SIZE_DW-1 loop + if r.store_mask(i) = '1' then + cachelines(r.store_index)(63 + i*64 downto i*64) <= wishbone_in.dat; + end if; + end loop; - if r.store_word = (LINE_SIZE_DW-1) then + -- That was the last word ? 
We are done + if r.store_mask(LINE_SIZE_DW-1) = '1' then r.state <= IDLE; tags_valid(r.store_index) <= '1'; - r.w.cyc <= '0'; - r.w.stb <= '0'; + r.wb.cyc <= '0'; + r.wb.stb <= '0'; else - r.w.adr(OFFSET_BITS-1 downto 3) <= std_ulogic_vector(to_unsigned(r.store_word+1, OFFSET_BITS-3)); + store_dword := r.wb.adr(OFFSET_BITS-1 downto 3); + store_dword := std_ulogic_vector(unsigned(store_dword) + 1); + r.wb.adr(OFFSET_BITS-1 downto 3) <= store_dword; end if; + -- Advance to next word + r.store_mask <= r.store_mask(LINE_SIZE_DW-2 downto 0) & '0'; end if; end case; end if; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index 4955177..010d0ae 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -12,7 +12,7 @@ architecture behave of icache_tb is signal clk : std_ulogic; signal rst : std_ulogic; - signal i_out : Fetch2ToIcacheType; + signal i_out : Fetch1ToIcacheType; signal i_in : IcacheToFetch2Type; signal wb_bram_in : wishbone_master_out; @@ -30,6 +30,7 @@ begin rst => rst, i_in => i_out, i_out => i_in, + flush_in => '0', wishbone_out => wb_bram_in, wishbone_in => wb_bram_out ); @@ -66,16 +67,16 @@ begin stim: process begin i_out.req <= '0'; - i_out.addr <= (others => '0'); + i_out.nia <= (others => '0'); wait for 4*clk_period; i_out.req <= '1'; - i_out.addr <= x"0000000000000004"; + i_out.nia <= x"0000000000000004"; wait for 30*clk_period; - assert i_in.ack = '1'; + assert i_in.valid = '1'; assert i_in.insn = x"00000001"; i_out.req <= '0'; @@ -84,31 +85,31 @@ begin -- hit i_out.req <= '1'; - i_out.addr <= x"0000000000000008"; - wait for clk_period/2; - assert i_in.ack = '1'; + i_out.nia <= x"0000000000000008"; + wait for clk_period; + assert i_in.valid = '1'; assert i_in.insn = x"00000002"; - wait for clk_period/2; + wait for clk_period; -- another miss i_out.req <= '1'; - i_out.addr <= x"0000000000000040"; + i_out.nia <= x"0000000000000040"; wait for 30*clk_period; - assert i_in.ack = '1'; + assert i_in.valid = '1'; assert i_in.insn = x"00000010"; -- test something that 
aliases i_out.req <= '1'; - i_out.addr <= x"0000000000000100"; - wait for clk_period/2; - assert i_in.ack = '0'; - wait for clk_period/2; + i_out.nia <= x"0000000000000100"; + wait for clk_period; + assert i_in.valid = '0'; + wait for clk_period; wait for 30*clk_period; - assert i_in.ack = '1'; + assert i_in.valid = '1'; assert i_in.insn = x"00000040"; i_out.req <= '0'; From d40c1c1a252d2cf45f52ec7f2be75e20e3b0c72b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 30 Sep 2019 18:17:10 +1000 Subject: [PATCH 4/7] icache: Use narrower block RAMs We only ever access the cache memory for at most the wishbone bus width at a time. So having the BRAMs organized as a cache-line-wide port is a waste of resources. Instead, use a wishbone-wide memory and store a line as consecutive rows in the BRAM. This significantly improves BRAM usage in the FPGA as we can now use more rows in the BRAM blocks. It also saves a few LUTs and muxes. Signed-off-by: Benjamin Herrenschmidt --- core.vhdl | 2 +- icache.vhdl | 306 ++++++++++++++++++++++++++++++++----------------- icache_tb.vhdl | 2 +- 3 files changed, 201 insertions(+), 109 deletions(-) diff --git a/core.vhdl b/core.vhdl index ef939e7..6d129d7 100644 --- a/core.vhdl +++ b/core.vhdl @@ -115,7 +115,7 @@ begin icache_0: entity work.icache generic map( - LINE_SIZE_DW => 8, + LINE_SIZE => 64, NUM_LINES => 16 ) port map( diff --git a/icache.vhdl b/icache.vhdl index 4ca39c0..b3fb99c 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -10,8 +10,8 @@ use work.wishbone_types.all; entity icache is generic ( - -- Line size in 64bit doublewords - LINE_SIZE_DW : natural := 8; + -- Line size in bytes + LINE_SIZE : natural := 64; -- Number of lines NUM_LINES : natural := 32 ); @@ -51,85 +51,176 @@ architecture rtl of icache is end if; end function; - constant LINE_SIZE : natural := LINE_SIZE_DW*8; - constant OFFSET_BITS : natural := log2(LINE_SIZE); - constant INDEX_BITS : natural := log2(NUM_LINES); - constant TAG_BITS : natural := 64 - 
OFFSET_BITS - INDEX_BITS; - - subtype cacheline_type is std_logic_vector((LINE_SIZE*8)-1 downto 0); - type cacheline_array is array(0 to NUM_LINES-1) of cacheline_type; - - subtype cacheline_tag_type is std_logic_vector(TAG_BITS-1 downto 0); - type cacheline_tag_array is array(0 to NUM_LINES-1) of cacheline_tag_type; - - -- Storage. Hopefully "cachelines" is a BRAM, the rest is LUTs - signal cachelines : cacheline_array; - signal tags : cacheline_tag_array; - signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0); + -- BRAM organisation: We never access more than wishbone_data_bits at + -- a time so to save resources we make the array only that wide, and + -- use consecutive indices for to make a cache "line" + -- + -- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits) + constant ROW_SIZE : natural := wishbone_data_bits / 8; + -- ROW_PER_LINE is the number of row (wishbone transactions) in a line + constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE; + -- BRAM_ROWS is the number of rows in BRAM needed to represent the full + -- icache + constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE; + -- INSN_PER_ROW is the number of 32bit instructions per BRAM row + constant INSN_PER_ROW : natural := wishbone_data_bits / 32; + -- Bit fields counts in the address + + -- INSN_BITS is the number of bits to select an instruction in a row + constant INSN_BITS : natural := log2(INSN_PER_ROW); + -- ROW_BITS is the number of bits to select a row + constant ROW_BITS : natural := log2(BRAM_ROWS); + -- ROW_LINEBITS is the number of bits to select a row within a line + constant ROW_LINEBITS : natural := log2(ROW_PER_LINE); + -- LINE_OFF_BITS is the number of bits for the offset in a cache line + constant LINE_OFF_BITS : natural := log2(LINE_SIZE); + -- ROW_OFF_BITS is the number of bits for the offset in a row + constant ROW_OFF_BITS : natural := log2(ROW_SIZE); + -- INDEX_BITS is the number if bits to select a cache line + constant INDEX_BITS : natural 
:= log2(NUM_LINES); + -- TAG_BITS is the number of bits of the tag part of the address + constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; + + -- Example of layout for 32 lines of 64 bytes: + -- + -- .. tag |index| line | + -- .. | row | | + -- .. | | | |00| zero (2) + -- .. | | |-| | INSN_BITS (1) + -- .. | |---| | ROW_LINEBITS (3) + -- .. | |--- - --| LINE_OFF_BITS (6) + -- .. | |- --| ROW_OFF_BITS (3) + -- .. |----- ---| | ROW_BITS (8) + -- .. |-----| | INDEX_BITS (5) + -- .. --------| | TAG_BITS (53) + + subtype row_t is integer range 0 to BRAM_ROWS-1; + subtype index_t is integer range 0 to NUM_LINES-1; + + -- The cache data BRAM organized as described above + subtype cache_row_t is std_logic_vector(wishbone_data_bits-1 downto 0); + type cache_array is array(row_t) of cache_row_t; + + -- The cache tags LUTRAM has a row per cache line + subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); + type cache_tags_array is array(index_t) of cache_tag_t; + + -- Storage. 
Hopefully "cache_rows" is a BRAM, the rest is LUTs + signal cache_rows : cache_array; + signal tags : cache_tags_array; + signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0); attribute ram_style : string; - attribute ram_style of cachelines : signal is "block"; + attribute ram_style of cache_rows : signal is "block"; attribute ram_decomp : string; - attribute ram_decomp of cachelines : signal is "power"; + attribute ram_decomp of cache_rows : signal is "power"; -- Cache reload state machine - type state_type is (IDLE, WAIT_ACK); + type state_t is (IDLE, WAIT_ACK); - type reg_internal_type is record + type reg_internal_t is record -- Cache hit state (1 cycle BRAM access) - hit_line : cacheline_type; + hit_row : cache_row_t; hit_nia : std_ulogic_vector(63 downto 0); hit_smark : std_ulogic; hit_valid : std_ulogic; -- Cache miss state (reload state machine) - state : state_type; - wb : wishbone_master_out; - store_index : integer range 0 to (NUM_LINES-1); - store_mask : std_ulogic_vector(LINE_SIZE_DW-1 downto 0); + state : state_t; + wb : wishbone_master_out; + store_index : index_t; end record; - signal r : reg_internal_type; + signal r : reg_internal_t; - -- Async signals decoding incoming requests - signal req_index : integer range 0 to NUM_LINES-1; - signal req_tag : std_ulogic_vector(TAG_BITS-1 downto 0); - signal req_word : integer range 0 to LINE_SIZE_DW*2-1; + -- Async signals on incoming request + signal req_index : index_t; + signal req_row : row_t; + signal req_tag : cache_tag_t; signal req_is_hit : std_ulogic; -- Return the cache line index (tag index) for an address - function get_index(addr: std_ulogic_vector(63 downto 0)) return integer is + function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is + begin + return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS))); + end; + + -- Return the cache row index (data memory) for an address + function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is begin - 
return to_integer(unsigned(addr((OFFSET_BITS+INDEX_BITS-1) downto OFFSET_BITS))); + return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS))); end; - -- Return the word index in a cache line for an address - function get_word(addr: std_ulogic_vector(63 downto 0)) return integer is + -- Returns whether this is the last row of a line + function is_last_row(addr: std_ulogic_vector(63 downto 0)) return boolean is + constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); begin - return to_integer(unsigned(addr(OFFSET_BITS-1 downto 2))); + return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; end; - -- Read a word in a cache line for an address - function read_word(word: integer; data: cacheline_type) return std_ulogic_vector is + -- Return the address of the next row in the current cache line + function next_row_addr(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0); + variable result : std_ulogic_vector(63 downto 0); begin - return data((word+1)*32-1 downto word*32); + -- Is there no simpler way in VHDL to generate that 3 bits adder ? 
+ row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS); + row_idx := std_ulogic_vector(unsigned(row_idx) + 1); + result := addr; + result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx; + return result; end; - -- Calculate the tag value from the address - function get_tag(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + -- Read the instruction word for the given address in the current cache row + function read_insn_word(addr: std_ulogic_vector(63 downto 0); + data: cache_row_t) return std_ulogic_vector is + variable word: integer range 0 to INSN_PER_ROW-1; begin - return addr(63 downto OFFSET_BITS+INDEX_BITS); + word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2))); + return data(31+word*32 downto word*32); + end; + + -- Get the tag value from the address + function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is + begin + return addr(63 downto 64-TAG_BITS); end; begin - assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE; - assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE; + assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE; + assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE; + assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE; + assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2" severity FAILURE; + assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS) + report "geometry bits don't add up" severity FAILURE; + assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) + report "geometry bits don't add up" severity FAILURE; + assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) + report "geometry bits don't add up" severity FAILURE; + assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS) + report "geometry bits don't add up" severity FAILURE; + + debug: process + begin + report "ROW_SIZE = " & natural'image(ROW_SIZE); + report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE); + report "BRAM_ROWS 
= " & natural'image(BRAM_ROWS); + report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW); + report "INSN_BITS = " & natural'image(INSN_BITS); + report "ROW_BITS = " & natural'image(ROW_BITS); + report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS); + report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS); + report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS); + report "INDEX_BITS = " & natural'image(INDEX_BITS); + report "TAG_BITS = " & natural'image(TAG_BITS); + wait; + end process; icache_comb : process(all) begin - -- Calculate next index and tag index + -- Extract line, row and tag from request req_index <= get_index(i_in.nia); + req_row <= get_row(i_in.nia); req_tag <= get_tag(i_in.nia); - req_word <= get_word(i_in.nia); -- Test if pending request is a hit if tags(req_index) = req_tag then @@ -138,19 +229,19 @@ begin req_is_hit <= '0'; end if; - -- Output instruction from current cache line + -- Output instruction from current cache row -- -- Note: This is a mild violation of our design principle of having pipeline -- stages output from a clean latch. In this case we output the result -- of a mux. The alternative would be output an entire cache line -- which I prefer not to do just yet. 
-- + i_out.insn <= read_insn_word(r.hit_nia, r.hit_row); i_out.valid <= r.hit_valid; - i_out.insn <= read_word(get_word(r.hit_nia), r.hit_line); i_out.nia <= r.hit_nia; i_out.stop_mark <= r.hit_smark; - -- This needs to match the latching of a new request in icache_hit + -- This needs to match the latching of a new request in process icache_hit stall_out <= not req_is_hit; -- Wishbone requests output (from the cache miss reload machine) @@ -160,8 +251,19 @@ begin icache_hit : process(clk) begin if rising_edge(clk) then - -- Assume we have nothing valid first - r.hit_valid <= '0'; + -- Debug + if i_in.req = '1' then + report "cache search for " & to_hstring(i_in.nia) & + " index:" & integer'image(req_index) & + " row:" & integer'image(req_row) & + " want_tag:" & to_hstring(req_tag) & " got_tag:" & to_hstring(req_tag) & + " valid:" & std_ulogic'image(tags_valid(req_index)); + if req_is_hit = '1' then + report "is hit !"; + else + report "is miss !"; + end if; + end if; -- Are we free to latch a new request ? -- @@ -169,7 +271,7 @@ begin -- if i_in.req = '1' and req_is_hit = '1' and flush_in = '0' then -- Read the cache line (BRAM read port) and remember the NIA - r.hit_line <= cachelines(req_index); + r.hit_row <= cache_rows(req_row); r.hit_nia <= i_in.nia; r.hit_smark <= i_in.stop_mark; r.hit_valid <= '1'; @@ -178,22 +280,19 @@ begin " SM:" & std_ulogic'image(i_in.stop_mark) & " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag); - end if; - - -- Flush requested ? discard... 
- if flush_in then + else r.hit_valid <= '0'; + -- Send stop marks down regardless of validity + r.hit_smark <= i_in.stop_mark; end if; end if; end process; icache_miss : process(clk) - variable store_dword : std_ulogic_vector(OFFSET_BITS-4 downto 0); begin if rising_edge(clk) then if rst = '1' then tags_valid <= (others => '0'); - r.store_mask <= (others => '0'); r.state <= IDLE; r.wb.cyc <= '0'; r.wb.stb <= '0'; @@ -202,56 +301,49 @@ begin r.wb.dat <= (others => '0'); r.wb.sel <= "11111111"; r.wb.we <= '0'; - end if; - - -- State machine - case r.state is - when IDLE => - -- We need to read a cache line - if i_in.req = '1' and req_is_hit = '0' then - - report "cache miss nia:" & to_hstring(i_in.nia) & - " SM:" & std_ulogic'image(i_in.stop_mark) & - " idx:" & integer'image(req_index) & - " tag:" & to_hstring(req_tag); - - r.state <= WAIT_ACK; - r.store_mask <= (0 => '1', others => '0'); - r.store_index <= req_index; - - -- Force misses while reloading that line - tags_valid(req_index) <= '0'; - tags(req_index) <= req_tag; - - -- Prep for first dword read - r.wb.adr <= i_in.nia(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0'); - r.wb.cyc <= '1'; - r.wb.stb <= '1'; - end if; - when WAIT_ACK => - if wishbone_in.ack = '1' then - -- Store the current dword in both the cache - for i in 0 to LINE_SIZE_DW-1 loop - if r.store_mask(i) = '1' then - cachelines(r.store_index)(63 + i*64 downto i*64) <= wishbone_in.dat; + else + -- State machine + case r.state is + when IDLE => + -- We need to read a cache line + if i_in.req = '1' and req_is_hit = '0' then + report "cache miss nia:" & to_hstring(i_in.nia) & + " SM:" & std_ulogic'image(i_in.stop_mark) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag); + + -- Force misses while reloading that line + tags_valid(req_index) <= '0'; + tags(req_index) <= req_tag; + r.store_index <= req_index; + + -- Prep for first wishbone read. 
We calculate the address off + -- the start of the cache line + r.wb.adr <= i_in.nia(63 downto LINE_OFF_BITS) & + (LINE_OFF_BITS-1 downto 0 => '0'); + r.wb.cyc <= '1'; + r.wb.stb <= '1'; + + r.state <= WAIT_ACK; + end if; + when WAIT_ACK => + if wishbone_in.ack = '1' then + -- Store the current dword in both the cache + cache_rows(get_row(r.wb.adr)) <= wishbone_in.dat; + + -- That was the last word ? We are done + if is_last_row(r.wb.adr) then + tags_valid(r.store_index) <= '1'; + r.wb.cyc <= '0'; + r.wb.stb <= '0'; + r.state <= IDLE; + else + -- Otherwise, calculate the next row address + r.wb.adr <= next_row_addr(r.wb.adr); end if; - end loop; - - -- That was the last word ? We are done - if r.store_mask(LINE_SIZE_DW-1) = '1' then - r.state <= IDLE; - tags_valid(r.store_index) <= '1'; - r.wb.cyc <= '0'; - r.wb.stb <= '0'; - else - store_dword := r.wb.adr(OFFSET_BITS-1 downto 3); - store_dword := std_ulogic_vector(unsigned(store_dword) + 1); - r.wb.adr(OFFSET_BITS-1 downto 3) <= store_dword; end if; - -- Advance to next word - r.store_mask <= r.store_mask(LINE_SIZE_DW-2 downto 0) & '0'; - end if; - end case; - end if; + end case; + end if; + end if; end process; end; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index 010d0ae..7aeb69c 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -22,7 +22,7 @@ architecture behave of icache_tb is begin icache0: entity work.icache generic map( - LINE_SIZE_DW => 8, + LINE_SIZE => 64, NUM_LINES => 4 ) port map( From e1cf44cec858cf3b76b8a71528c6868a04d2c8a9 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 2 Oct 2019 16:17:42 +1000 Subject: [PATCH 5/7] fetch2: Remove blank line Signed-off-by: Benjamin Herrenschmidt --- fetch2.vhdl | 1 - 1 file changed, 1 deletion(-) diff --git a/fetch2.vhdl b/fetch2.vhdl index f8aee81..99f92ee 100644 --- a/fetch2.vhdl +++ b/fetch2.vhdl @@ -99,7 +99,6 @@ begin -- If we are flushing or the instruction comes with a stop mark -- we tag it as invalid so it doesn't get decoded and 
executed if flush_in = '1' or v.stop_mark = '1' then - v.valid := '0'; end if; From 004eb074c904c2aec489cf55610c00b4bf8aeab9 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 2 Oct 2019 19:06:53 +1000 Subject: [PATCH 6/7] plru: Add a simple PLRU module Tested in sim only for now Signed-off-by: Benjamin Herrenschmidt --- Makefile | 5 +++ microwatt.core | 1 + plru.vhdl | 77 ++++++++++++++++++++++++++++++++++ plru_tb.vhdl | 109 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+) create mode 100644 plru.vhdl create mode 100644 plru_tb.vhdl diff --git a/Makefile b/Makefile index 7b28b31..c95c04a 100644 --- a/Makefile +++ b/Makefile @@ -55,6 +55,8 @@ writeback.o: common.o dmi_dtm_tb.o: dmi_dtm_xilinx.o wishbone_debug_master.o dmi_dtm_xilinx.o: wishbone_types.o sim-unisim/unisim_vcomponents.o wishbone_debug_master.o: wishbone_types.o +plru.o: +plru_tb.o: plru.o UNISIM_BITS = sim-unisim/unisim_vcomponents.vhdl sim-unisim/BSCANE2.vhdl sim-unisim/BUFG.vhdl sim-unisim/unisim_vcomponents.o: $(UNISIM_BITS) @@ -75,6 +77,9 @@ fetch_tb: fetch_tb.o icache_tb: icache_tb.o $(GHDL) -e $(GHDLFLAGS) -Wl,simple_ram_behavioural_helpers_c.o $@ +plru_tb: plru_tb.o + $(GHDL) -e $(GHDLFLAGS) $@ + loadstore_tb: loadstore_tb.o $(GHDL) -e $(GHDLFLAGS) $@ diff --git a/microwatt.core b/microwatt.core index 6143f50..94d909e 100644 --- a/microwatt.core +++ b/microwatt.core @@ -29,6 +29,7 @@ filesets: - insn_helpers.vhdl - core.vhdl - icache.vhdl + - plru.vhdl - core_debug.vhdl file_type : vhdlSource-2008 diff --git a/plru.vhdl b/plru.vhdl new file mode 100644 index 0000000..936f85e --- /dev/null +++ b/plru.vhdl @@ -0,0 +1,77 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; +use ieee.math_real.all; + +entity plru is + generic ( + BITS : positive := 2 + ) + ; + port ( + clk : in std_ulogic; + rst : in std_ulogic; + + acc : in std_ulogic_vector(BITS-1 downto 0); + acc_en : in std_ulogic; + lru : out std_ulogic_vector(BITS-1 
downto 0) + ); +end entity plru; + +architecture rtl of plru is + constant count : positive := 2 ** BITS - 1; + subtype node_t is integer range 0 to count; + type tree_t is array(node_t) of std_ulogic; + + signal tree: tree_t; +begin + + -- XXX Check if we can turn that into a little ROM instead that + -- takes the tree bit vector and returns the LRU. See if it's better + -- in terms of FPGA resource usage... + get_lru: process(all) + variable node : node_t; + begin + node := 0; + for i in 0 to BITS-1 loop + report "GET: i:" & integer'image(i) & " node:" & integer'image(node) & " val:" & std_ulogic'image(tree(node)); + lru(BITS-1-i) <= tree(node); + if i /= BITS-1 then + node := node * 2; + if tree(node) = '1' then + node := node + 2; + else + node := node + 1; + end if; + end if; + end loop; + end process; + + update_lru: process(clk) + variable node : node_t; + variable abit : std_ulogic; + begin + if rising_edge(clk) then + if rst = '1' then + tree <= (others => '0'); + elsif acc_en = '1' then + node := 0; + for i in 0 to BITS-1 loop + abit := acc(BITS-1-i); + tree(node) <= not abit; + report "UPD: i:" & integer'image(i) & " node:" & integer'image(node) & " val" & std_ulogic'image(not abit); + if i /= BITS-1 then + node := node * 2; + if abit = '1' then + node := node + 2; + else + node := node + 1; + end if; + end if; + end loop; + end if; + end if; + end process; +end; + + diff --git a/plru_tb.vhdl b/plru_tb.vhdl new file mode 100644 index 0000000..18512e4 --- /dev/null +++ b/plru_tb.vhdl @@ -0,0 +1,109 @@ +library ieee; +use ieee.std_logic_1164.all; + +library work; +use work.common.all; +use work.wishbone_types.all; + +entity plru_tb is +end plru_tb; + +architecture behave of plru_tb is + signal clk : std_ulogic; + signal rst : std_ulogic; + + constant clk_period : time := 10 ns; + + signal acc_en : std_ulogic; + signal acc : std_ulogic_vector(2 downto 0); + signal lru : std_ulogic_vector(2 downto 0); + +begin + plru0: entity work.plru + generic map( + BITS 
=> 3 + ) + port map( + clk => clk, + rst => rst, + + acc => acc, + acc_en => acc_en, + lru => lru + ); + + clk_process: process + begin + clk <= '0'; + wait for clk_period/2; + clk <= '1'; + wait for clk_period/2; + end process; + + rst_process: process + begin + rst <= '1'; + wait for 2*clk_period; + rst <= '0'; + wait; + end process; + + stim: process + begin + wait for 4*clk_period; + + report "accessing 1:"; + acc <= "001"; + acc_en <= '1'; + wait for clk_period; + report "lru:" & to_hstring(lru); + + report "accessing 2:"; + acc <= "010"; + wait for clk_period; + report "lru:" & to_hstring(lru); + + report "accessing 7:"; + acc <= "111"; + wait for clk_period; + report "lru:" & to_hstring(lru); + + report "accessing 4:"; + acc <= "100"; + wait for clk_period; + report "lru:" & to_hstring(lru); + + report "accessing 3:"; + acc <= "011"; + wait for clk_period; + report "lru:" & to_hstring(lru); + + report "accessing 5:"; + acc <= "101"; + wait for clk_period; + report "lru:" & to_hstring(lru); + + report "accessing 3:"; + acc <= "011"; + wait for clk_period; + report "lru:" & to_hstring(lru); + + report "accessing 5:"; + acc <= "101"; + wait for clk_period; + report "lru:" & to_hstring(lru); + + report "accessing 6:"; + acc <= "110"; + wait for clk_period; + report "lru:" & to_hstring(lru); + + report "accessing 0:"; + acc <= "000"; + wait for clk_period; + report "lru:" & to_hstring(lru); + + assert false report "end of test" severity failure; + wait; + end process; +end; From b56b46b7d1a537d9f99608d4957bb5183d512421 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 2 Oct 2019 22:17:31 +1000 Subject: [PATCH 7/7] icache: Set associative icache This adds support for set associativity to the icache. It can still be direct mapped by setting NUM_WAYS to 1. The replacement policy uses a simple tree-PLRU for each set. This is only lightly tested, tests pass but I have to double check that we are using the ways effectively and not creating duplicates. 
Signed-off-by: Benjamin Herrenschmidt --- Makefile | 7 +- cache_ram.vhdl | 46 +++++++++ core.vhdl | 3 +- icache.vhdl | 270 +++++++++++++++++++++++++++++++++++++------------ microwatt.core | 1 + plru.vhdl | 4 +- 6 files changed, 262 insertions(+), 69 deletions(-) create mode 100644 cache_ram.vhdl diff --git a/Makefile b/Makefile index c95c04a..f69dd15 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,10 @@ fetch2.o: common.o wishbone_types.o glibc_random_helpers.o: glibc_random.o: glibc_random_helpers.o helpers.o: -icache.o: common.o wishbone_types.o +cache_ram.o: +plru.o: +plru_tb.o: plru.o +icache.o: common.o wishbone_types.o plru.o cache_ram.o icache_tb.o: common.o wishbone_types.o icache.o simple_ram_behavioural.o insn_helpers.o: loadstore1.o: common.o helpers.o @@ -55,8 +58,6 @@ writeback.o: common.o dmi_dtm_tb.o: dmi_dtm_xilinx.o wishbone_debug_master.o dmi_dtm_xilinx.o: wishbone_types.o sim-unisim/unisim_vcomponents.o wishbone_debug_master.o: wishbone_types.o -plru.o: -plru_tb.o: plru.o UNISIM_BITS = sim-unisim/unisim_vcomponents.vhdl sim-unisim/BSCANE2.vhdl sim-unisim/BUFG.vhdl sim-unisim/unisim_vcomponents.o: $(UNISIM_BITS) diff --git a/cache_ram.vhdl b/cache_ram.vhdl new file mode 100644 index 0000000..e0ffd17 --- /dev/null +++ b/cache_ram.vhdl @@ -0,0 +1,46 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; +use ieee.math_real.all; + +entity cache_ram is + generic( + ROW_BITS : integer := 16; + WIDTH : integer := 64 + ); + + port( + clk : in std_logic; + rd_en : in std_logic; + rd_addr : in std_logic_vector(ROW_BITS - 1 downto 0); + rd_data : out std_logic_vector(WIDTH - 1 downto 0); + wr_en : in std_logic; + wr_addr : in std_logic_vector(ROW_BITS - 1 downto 0); + wr_data : in std_logic_vector(WIDTH - 1 downto 0) + ); + +end cache_ram; + +architecture rtl of cache_ram is + constant SIZE : integer := 2**ROW_BITS; + + type ram_type is array (0 to SIZE - 1) of std_logic_vector(WIDTH - 1 downto 0); + signal ram : ram_type; + attribute 
ram_style : string; + attribute ram_style of ram : signal is "block"; + attribute ram_decomp : string; + attribute ram_decomp of ram : signal is "power"; + +begin + process(clk) + begin + if rising_edge(clk) then + if wr_en = '1' then + ram(to_integer(unsigned(wr_addr))) <= wr_data; + end if; + if rd_en = '1' then + rd_data <= ram(to_integer(unsigned(rd_addr))); + end if; + end if; + end process; +end; diff --git a/core.vhdl b/core.vhdl index 6d129d7..43b338d 100644 --- a/core.vhdl +++ b/core.vhdl @@ -116,7 +116,8 @@ begin icache_0: entity work.icache generic map( LINE_SIZE => 64, - NUM_LINES => 16 + NUM_LINES => 16, + NUM_WAYS => 2 ) port map( clk => clk, diff --git a/icache.vhdl b/icache.vhdl index b3fb99c..89e491e 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -1,3 +1,21 @@ +-- +-- Set associative icache +-- +-- TODO (in no specific order): +-- +-- * Add debug interface to inspect cache content +-- * Add snoop/invalidate path +-- * Add multi-hit error detection +-- * Pipelined bus interface (wb or axi) +-- * Maybe add parity ? There's a few bits free in each BRAM row on Xilinx +-- * Add optimization: service hits on partially loaded lines +-- * Add optimization: (maybe) interrupt reload on flush/redirect +-- * Check if playing with the geometry of the cache tags allows for more +-- efficient use of distributed RAM and less logic/muxes. Currently we +-- write TAG_BITS width which may not match full ram blocks and might +-- cause muxes to be inferred for "partial writes". 
+-- * Check if making the read size of PLRU a ROM helps utilization +-- library ieee; use ieee.std_logic_1164.all; use ieee.numeric_std.all; @@ -11,9 +29,11 @@ use work.wishbone_types.all; entity icache is generic ( -- Line size in bytes - LINE_SIZE : natural := 64; - -- Number of lines - NUM_LINES : natural := 32 + LINE_SIZE : positive := 64; + -- Number of lines in a set + NUM_LINES : positive := 32; + -- Number of ways + NUM_WAYS : positive := 4 ); port ( clk : in std_ulogic; @@ -80,6 +100,8 @@ architecture rtl of icache is constant INDEX_BITS : natural := log2(NUM_LINES); -- TAG_BITS is the number of bits of the tag part of the address constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; + -- WAY_BITS is the number of bits to select a way + constant WAY_BITS : natural := log2(NUM_WAYS); -- Example of layout for 32 lines of 64 bytes: -- @@ -96,30 +118,38 @@ architecture rtl of icache is subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; + subtype way_t is integer range 0 to NUM_WAYS-1; - -- The cache data BRAM organized as described above - subtype cache_row_t is std_logic_vector(wishbone_data_bits-1 downto 0); - type cache_array is array(row_t) of cache_row_t; + -- The cache data BRAM organized as described above for each way + subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0); - -- The cache tags LUTRAM has a row per cache line + -- The cache tags LUTRAM has a row per set. Vivado is a pain and will + -- not handle a clean (commented) definition of the cache tags as a 3d + -- memory. 
For now, work around it by putting all the tags subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); - type cache_tags_array is array(index_t) of cache_tag_t; +-- type cache_tags_set_t is array(way_t) of cache_tag_t; +-- type cache_tags_array_t is array(index_t) of cache_tags_set_t; + constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS; + subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0); + type cache_tags_array_t is array(index_t) of cache_tags_set_t; + + -- The cache valid bits + subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); + type cache_valids_t is array(index_t) of cache_way_valids_t; -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs - signal cache_rows : cache_array; - signal tags : cache_tags_array; - signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0); + signal cache_tags : cache_tags_array_t; + signal cache_valids : cache_valids_t; + attribute ram_style : string; - attribute ram_style of cache_rows : signal is "block"; - attribute ram_decomp : string; - attribute ram_decomp of cache_rows : signal is "power"; + attribute ram_style of cache_tags : signal is "distributed"; -- Cache reload state machine type state_t is (IDLE, WAIT_ACK); type reg_internal_t is record - -- Cache hit state (1 cycle BRAM access) - hit_row : cache_row_t; + -- Cache hit state (Latches for 1 cycle BRAM access) + hit_way : way_t; hit_nia : std_ulogic_vector(63 downto 0); hit_smark : std_ulogic; hit_valid : std_ulogic; @@ -127,16 +157,27 @@ architecture rtl of icache is -- Cache miss state (reload state machine) state : state_t; wb : wishbone_master_out; + store_way : way_t; store_index : index_t; end record; signal r : reg_internal_t; -- Async signals on incoming request - signal req_index : index_t; - signal req_row : row_t; - signal req_tag : cache_tag_t; - signal req_is_hit : std_ulogic; + signal req_index : index_t; + signal req_row : row_t; + signal req_hit_way : way_t; + signal req_tag : cache_tag_t; + 
signal req_is_hit : std_ulogic; + signal req_is_miss : std_ulogic; + + -- Cache RAM interface + type cache_ram_out_t is array(way_t) of cache_row_t; + signal cache_out : cache_ram_out_t; + + -- PLRU output interface + type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_victim : plru_out_t; -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is @@ -185,7 +226,22 @@ architecture rtl of icache is return addr(63 downto 64-TAG_BITS); end; + -- Read a tag from a tag memory row + function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is + begin + return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS); + end; + + -- Write a tag to tag memory row + procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t; + tag: cache_tag_t) is + begin + tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; + end; + begin + + assert LINE_SIZE mod ROW_SIZE = 0; assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE; assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE; assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE; @@ -212,66 +268,134 @@ begin report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS); report "INDEX_BITS = " & natural'image(INDEX_BITS); report "TAG_BITS = " & natural'image(TAG_BITS); + report "WAY_BITS = " & natural'image(WAY_BITS); wait; end process; + -- Generate a cache RAM for each way + rams: for i in 0 to NUM_WAYS-1 generate + signal do_write : std_ulogic; + signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0); + signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); + signal dout : cache_row_t; + begin + way: entity work.cache_ram + generic map ( + ROW_BITS => ROW_BITS, + WIDTH => wishbone_data_bits + ) + port map ( + clk => clk, + rd_en => '1', -- fixme + rd_addr => rd_addr, + rd_data => dout, + wr_en => do_write, + wr_addr => 
wr_addr, + wr_data => wishbone_in.dat + ); + process(all) + begin + do_write <= '0'; + if wishbone_in.ack = '1' and r.store_way = i then + do_write <= '1'; + end if; + cache_out(i) <= dout; + rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); + wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS)); + end process; + end generate; + + -- Generate PLRUs + maybe_plrus: if NUM_WAYS > 1 generate + begin + plrus: for i in 0 to NUM_LINES-1 generate + -- PLRU interface + signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_acc_en : std_ulogic; + signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); + + begin + plru : entity work.plru + generic map ( + BITS => WAY_BITS + ) + port map ( + clk => clk, + rst => rst, + acc => plru_acc, + acc_en => plru_acc_en, + lru => plru_out + ); + + process(req_index, req_is_hit, req_hit_way, req_is_hit, plru_out) + begin + -- PLRU interface + if req_is_hit = '1' and req_index = i then + plru_acc_en <= req_is_hit; + else + plru_acc_en <= '0'; + end if; + plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); + plru_victim(i) <= plru_out; + end process; + end generate; + end generate; + + -- Cache hit detection, output to fetch2 and other misc logic icache_comb : process(all) + variable is_hit : std_ulogic; + variable hit_way : way_t; begin -- Extract line, row and tag from request req_index <= get_index(i_in.nia); req_row <= get_row(i_in.nia); req_tag <= get_tag(i_in.nia); - -- Test if pending request is a hit - if tags(req_index) = req_tag then - req_is_hit <= tags_valid(req_index); - else - req_is_hit <= '0'; - end if; + -- Test if pending request is a hit on any way + hit_way := 0; + is_hit := '0'; + for i in way_t loop + if read_tag(i, cache_tags(req_index)) = req_tag and + cache_valids(req_index)(i) = '1' then + hit_way := i; + is_hit := '1'; + end if; + end loop; + + -- Generate the "hit" and "miss" signals for the synchronous blocks + req_is_hit <= i_in.req and is_hit and 
not flush_in; + req_is_miss <= i_in.req and not is_hit and not flush_in; + req_hit_way <= hit_way; -- Output instruction from current cache row -- -- Note: This is a mild violation of our design principle of having pipeline -- stages output from a clean latch. In this case we output the result - -- of a mux. The alternative would be output an entire cache line - -- which I prefer not to do just yet. + -- of a mux. The alternative would be output an entire row which + -- I prefer not to do just yet as it would force fetch2 to know about + -- some of the cache geometry information. -- - i_out.insn <= read_insn_word(r.hit_nia, r.hit_row); + i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way)); i_out.valid <= r.hit_valid; i_out.nia <= r.hit_nia; i_out.stop_mark <= r.hit_smark; - -- This needs to match the latching of a new request in process icache_hit - stall_out <= not req_is_hit; + -- Stall fetch1 if we have a miss + stall_out <= not is_hit; -- Wishbone requests output (from the cache miss reload machine) wishbone_out <= r.wb; end process; + -- Cache hit synchronous machine icache_hit : process(clk) begin if rising_edge(clk) then - -- Debug - if i_in.req = '1' then - report "cache search for " & to_hstring(i_in.nia) & - " index:" & integer'image(req_index) & - " row:" & integer'image(req_row) & - " want_tag:" & to_hstring(req_tag) & " got_tag:" & to_hstring(req_tag) & - " valid:" & std_ulogic'image(tags_valid(req_index)); - if req_is_hit = '1' then - report "is hit !"; - else - report "is miss !"; - end if; - end if; - - -- Are we free to latch a new request ? 
- -- - -- Note: this test needs to match the equation for generating stall_out + -- On a hit, latch the request for the next cycle, when the BRAM data + -- will be available on the cache_out output of the corresponding way -- - if i_in.req = '1' and req_is_hit = '1' and flush_in = '0' then - -- Read the cache line (BRAM read port) and remember the NIA - r.hit_row <= cache_rows(req_row); + if req_is_hit = '1' then + r.hit_way <= req_hit_way; r.hit_nia <= i_in.nia; r.hit_smark <= i_in.stop_mark; r.hit_valid <= '1'; @@ -279,20 +403,28 @@ begin report "cache hit nia:" & to_hstring(i_in.nia) & " SM:" & std_ulogic'image(i_in.stop_mark) & " idx:" & integer'image(req_index) & - " tag:" & to_hstring(req_tag); + " tag:" & to_hstring(req_tag) & + " way: " & integer'image(req_hit_way); else r.hit_valid <= '0'; + -- Send stop marks down regardless of validity r.hit_smark <= i_in.stop_mark; end if; end if; end process; + -- Cache miss/reload synchronous machine icache_miss : process(clk) + variable way : integer range 0 to NUM_WAYS-1; + variable tagset : cache_tags_set_t; begin if rising_edge(clk) then + -- On reset, clear all valid bits to force misses if rst = '1' then - tags_valid <= (others => '0'); + for i in index_t loop + cache_valids(i) <= (others => '0'); + end loop; r.state <= IDLE; r.wb.cyc <= '0'; r.wb.stb <= '0'; @@ -302,23 +434,38 @@ begin r.wb.sel <= "11111111"; r.wb.we <= '0'; else - -- State machine + -- Main state machine case r.state is when IDLE => -- We need to read a cache line - if i_in.req = '1' and req_is_hit = '0' then + if req_is_miss = '1' then + way := to_integer(unsigned(plru_victim(req_index))); + report "cache miss nia:" & to_hstring(i_in.nia) & " SM:" & std_ulogic'image(i_in.stop_mark) & " idx:" & integer'image(req_index) & + " way:" & integer'image(way) & " tag:" & to_hstring(req_tag); - -- Force misses while reloading that line - tags_valid(req_index) <= '0'; - tags(req_index) <= req_tag; + -- Force misses on that way while reloading that line 
+ cache_valids(req_index)(way) <= '0'; + + -- Store new tag in selected way + for i in 0 to NUM_WAYS-1 loop + if i = way then + tagset := cache_tags(req_index); + write_tag(i, tagset, req_tag); + cache_tags(req_index) <= tagset; + end if; + end loop; + + -- Keep track of our index and way for subsequent stores r.store_index <= req_index; + r.store_way <= way; - -- Prep for first wishbone read. We calculate the address off + -- Prep for first wishbone read. We calculate the address of -- the start of the cache line + -- r.wb.adr <= i_in.nia(63 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); r.wb.cyc <= '1'; @@ -328,12 +475,9 @@ begin end if; when WAIT_ACK => if wishbone_in.ack = '1' then - -- Store the current dword in both the cache - cache_rows(get_row(r.wb.adr)) <= wishbone_in.dat; - -- That was the last word ? We are done if is_last_row(r.wb.adr) then - tags_valid(r.store_index) <= '1'; + cache_valids(r.store_index)(way) <= '1'; r.wb.cyc <= '0'; r.wb.stb <= '0'; r.state <= IDLE; diff --git a/microwatt.core b/microwatt.core index 94d909e..50e9957 100644 --- a/microwatt.core +++ b/microwatt.core @@ -30,6 +30,7 @@ filesets: - core.vhdl - icache.vhdl - plru.vhdl + - cache_ram.vhdl - core_debug.vhdl file_type : vhdlSource-2008 diff --git a/plru.vhdl b/plru.vhdl index 936f85e..6907c2b 100644 --- a/plru.vhdl +++ b/plru.vhdl @@ -34,7 +34,7 @@ begin begin node := 0; for i in 0 to BITS-1 loop - report "GET: i:" & integer'image(i) & " node:" & integer'image(node) & " val:" & std_ulogic'image(tree(node)); +-- report "GET: i:" & integer'image(i) & " node:" & integer'image(node) & " val:" & std_ulogic'image(tree(node)); lru(BITS-1-i) <= tree(node); if i /= BITS-1 then node := node * 2; @@ -59,7 +59,7 @@ begin for i in 0 to BITS-1 loop abit := acc(BITS-1-i); tree(node) <= not abit; - report "UPD: i:" & integer'image(i) & " node:" & integer'image(node) & " val" & std_ulogic'image(not abit); +-- report "UPD: i:" & integer'image(i) & " node:" & integer'image(node) & " 
val" & std_ulogic'image(not abit); if i /= BITS-1 then node := node * 2; if abit = '1' then