From 1a9834c506a8f6a05bdbae16f81e5482a75663b0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 31 Aug 2021 19:47:14 +1000 Subject: [PATCH] dcache: Fix bug with forwarding of stores We have two stages of forwarding to cover the two cycles of latency between when something is written to BRAM and when that new data can be read from BRAM. When the writes to BRAM result from store instructions, the write may write only some bytes of a row (8 bytes) and not others, so we have a mask to enable only the written bytes to be forwarded. However, we only forward written data from either the first stage of forwarding or the second, not both. So if we have two stores in succession that write different bytes of the same row, and then a load from the row, we will only forward the data from the second store, and miss the data from the first store; thus the load will get the wrong value. To fix this, we make the decision on which forward stage to use for each byte individually. This results in a 4-input multiplexer feeding r1.data_out, with its inputs being the BRAM, the wishbone, the current write data, and the 2nd-stage forwarding register. Each byte of the multiplexer is separately controlled. The code for this multiplexer is moved to the dcache_fast_hit process since it is used for cache hits as well as cache misses. This also simplifies the BRAM code by ensuring that we can use the same source for the BRAM address and way selection for writes, whether we are writing store data or cache line refill data from memory. Signed-off-by: Paul Mackerras --- dcache.vhdl | 168 +++++++++++++++++++++++----------------------------- 1 file changed, 74 insertions(+), 94 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 282eba0..bca393a 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -122,7 +122,7 @@ architecture rtl of dcache is type cache_valids_t is array(index_t) of cache_way_valids_t; type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; - -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs + -- Storage. Hopefully implemented in LUTs signal cache_tags : cache_tags_array_t; signal cache_tag_set : cache_tags_set_t; signal cache_valids : cache_valids_t; @@ -386,12 +386,15 @@ architecture rtl of dcache is signal r0_stall : std_ulogic; signal fwd_same_tag : std_ulogic; - signal use_forward : std_ulogic; + signal use_forward_st : std_ulogic; + signal use_forward_rl : std_ulogic; signal use_forward2 : std_ulogic; -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; + signal ram_wr_data : cache_row_t; + signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0); -- PLRU output interface type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); @@ -903,12 +906,13 @@ begin fwd_same_tag <= fwd_match; -- Whether to use forwarded data for a load or not - use_forward <= '0'; + use_forward_st <= '0'; + use_forward_rl <= '0'; if r1.store_row = req_row and rel_match = '1' then - -- Use the forwarding path if last cycle was a write to this row - if (r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1') or - r1.write_bram = '1' then - use_forward <= '1'; + -- Use the forwarding path if this cycle is a write to this row + use_forward_st <= r1.write_bram; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + use_forward_rl <= '1'; end if; end if; use_forward2 <= '0'; @@ -931,9 +935,9 @@ begin -- For a store, consider this a hit even if the row isn't valid -- since it will be by the time we perform the store. -- For a load, check the appropriate row valid bit; but also, - -- if use_forward is 1 then we can consider this a hit. + -- if use_forward_rl is 1 then we can consider this a hit. is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE) or - use_forward; + use_forward_rl; hit_way := replace_way; end if; @@ -1101,6 +1105,13 @@ begin end process; + -- RAM write data and select multiplexers + ram_wr_data <= r1.req.data when r1.write_bram = '1' else + wishbone_in.dat when r1.dcbz = '0' else + (others => '0'); + ram_wr_select <= r1.req.byte_sel when r1.write_bram = '1' else + (others => '1'); + -- -- Generate a cache RAM for each way. This handles the normal -- reads, writes from reloads and the special store-hit update @@ -1134,7 +1145,7 @@ begin rd_data => dout, wr_sel => wr_sel_m, wr_addr => wr_addr, - wr_data => wr_data + wr_data => ram_wr_data ); process(all) begin @@ -1150,37 +1161,13 @@ begin -- For timing, the mux on wr_data/sel/addr is not dependent on anything -- other than the current state. -- - wr_sel_m <= (others => '0'); - - do_write <= '0'; - if r1.write_bram = '1' then - -- Write store data to BRAM. This happens one cycle after the - -- store is in r0. - wr_data <= r1.req.data; - wr_sel <= r1.req.byte_sel; - wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.req.real_addr), ROW_BITS)); - if i = r1.req.hit_way then - do_write <= '1'; - end if; - else - -- Otherwise, we might be doing a reload or a DCBZ - if r1.dcbz = '1' then - wr_data <= (others => '0'); - else - wr_data <= wishbone_in.dat; - end if; - wr_sel <= (others => '1'); - - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then - do_write <= '1'; - end if; - end if; wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); - -- Mask write selects with do_write since BRAM doesn't - -- have a global write-enable - if do_write = '1' then - wr_sel_m <= wr_sel; + wr_sel_m <= (others => '0'); + if i = replace_way and + (r1.write_bram = '1' or + (r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1')) then + wr_sel_m <= ram_wr_select; end if; end process; @@ -1191,20 +1178,60 @@ begin -- It also handles error cases (TLB miss, cache paradox) -- dcache_fast_hit : process(clk) + variable j : integer; + variable sel : std_ulogic_vector(1 downto 0); + variable data_out : std_ulogic_vector(63 downto 0); begin if rising_edge(clk) then if req_op /= OP_NONE then - report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(r0.req.addr) & - " nc:" & std_ulogic'image(r0.req.nc) & - " idx:" & integer'image(req_index) & - " tag:" & to_hstring(req_tag) & - " way: " & integer'image(req_hit_way); - end if; + report "op:" & op_t'image(req_op) & + " addr:" & to_hstring(r0.req.addr) & + " nc:" & std_ulogic'image(r0.req.nc) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag) & + " way: " & integer'image(req_hit_way); + end if; if r0_valid = '1' then r1.mmu_req <= r0.mmu_req; end if; + -- Bypass/forwarding multiplexer for load data. + -- Use the bypass if are reading the row of BRAM that was written 0 or 1 + -- cycles ago, including for the slow_valid = 1 cases (i.e. completing a + -- load miss or a non-cacheable load), which are handled via the r1.full case. + for i in 0 to 7 loop + if r1.full = '1' or use_forward_rl = '1' then + sel := '0' & r1.dcbz; + elsif use_forward_st = '1' and r1.req.byte_sel(i) = '1' then + sel := "01"; + elsif use_forward2 = '1' and r1.forward_sel(i) = '1' then + sel := "10"; + else + sel := "11"; + end if; + j := i * 8; + case sel is + when "00" => + data_out(j + 7 downto j) := wishbone_in.dat(j + 7 downto j); + when "01" => + data_out(j + 7 downto j) := r1.req.data(j + 7 downto j); + when "10" => + data_out(j + 7 downto j) := r1.forward_data(j + 7 downto j); + when others => + data_out(j + 7 downto j) := cache_out(req_hit_way)(j + 7 downto j); + end case; + end loop; + r1.data_out <= data_out; + + r1.forward_data <= ram_wr_data; + r1.forward_tag <= r1.reload_tag; + r1.forward_row <= r1.store_row; + r1.forward_sel <= ram_wr_select; + r1.forward_valid <= r1.write_bram; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + r1.forward_valid <= '1'; + end if; + -- Fast path for load/store hits. Set signals for the writeback controls. r1.hit_way <= req_hit_way; r1.hit_index <= req_index; @@ -1259,56 +1286,8 @@ begin variable stbs_done : boolean; variable req : mem_access_request_t; variable acks : unsigned(2 downto 0); - variable data_out : std_ulogic_vector(63 downto 0); - variable data_fwd : std_ulogic_vector(63 downto 0); - variable fwd_in : std_ulogic_vector(63 downto 0); - variable fwd_sel : std_ulogic_vector(7 downto 0); - variable j : integer; - variable fwd_byte : std_ulogic_vector(7 downto 0); begin if rising_edge(clk) then - fwd_sel := (others => '1'); - if r1.write_bram = '1' then - fwd_in := r1.req.data; - fwd_sel := r1.req.byte_sel; - elsif r1.dcbz = '1' then - fwd_in := (others => '0'); - else - fwd_in := wishbone_in.dat; - end if; - - -- Use the bypass if are reading the row that was written 0 or 1 cycles - -- ago, including for the slow_valid = 1 cases (i.e. completing a load - -- miss or a non-cacheable load), which are handled via the r1.full case. - data_fwd := r1.forward_data; - fwd_byte := (others => '0'); - if r1.full = '1' then - data_fwd := fwd_in; - fwd_byte := (others => '1'); - elsif use_forward = '1' then - data_fwd := fwd_in; - fwd_byte := fwd_sel; - elsif use_forward2 = '1' then - fwd_byte := r1.forward_sel; - end if; - data_out := cache_out(req_hit_way); - for i in 0 to 7 loop - j := i * 8; - if fwd_byte(i) = '1' then - data_out(j + 7 downto j) := data_fwd(j + 7 downto j); - end if; - end loop; - r1.data_out <= data_out; - - r1.forward_data <= fwd_in; - r1.forward_tag <= r1.reload_tag; - r1.forward_row <= r1.store_row; - r1.forward_sel <= fwd_sel; - r1.forward_valid <= r1.write_bram; - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then - r1.forward_valid <= '1'; - end if; - ev.dcache_refill <= '0'; ev.load_miss <= '0'; ev.store_miss <= '0'; @@ -1563,6 +1542,7 @@ begin (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then r1.wb.stb <= '1'; stbs_done := false; + r1.store_way <= req.hit_way; r1.store_row <= get_row(req.real_addr); if req.op = OP_STORE_HIT then r1.write_bram <= '1';