diff --git a/dcache.vhdl b/dcache.vhdl index 282eba0..bca393a 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -122,7 +122,7 @@ architecture rtl of dcache is type cache_valids_t is array(index_t) of cache_way_valids_t; type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; - -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs + -- Storage. Hopefully implemented in LUTs signal cache_tags : cache_tags_array_t; signal cache_tag_set : cache_tags_set_t; signal cache_valids : cache_valids_t; @@ -386,12 +386,15 @@ architecture rtl of dcache is signal r0_stall : std_ulogic; signal fwd_same_tag : std_ulogic; - signal use_forward : std_ulogic; + signal use_forward_st : std_ulogic; + signal use_forward_rl : std_ulogic; signal use_forward2 : std_ulogic; -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; + signal ram_wr_data : cache_row_t; + signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0); -- PLRU output interface type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); @@ -903,12 +906,13 @@ begin fwd_same_tag <= fwd_match; -- Whether to use forwarded data for a load or not - use_forward <= '0'; + use_forward_st <= '0'; + use_forward_rl <= '0'; if r1.store_row = req_row and rel_match = '1' then - -- Use the forwarding path if last cycle was a write to this row - if (r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1') or - r1.write_bram = '1' then - use_forward <= '1'; + -- Use the forwarding path if this cycle is a write to this row + use_forward_st <= r1.write_bram; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + use_forward_rl <= '1'; end if; end if; use_forward2 <= '0'; @@ -931,9 +935,9 @@ begin -- For a store, consider this a hit even if the row isn't valid -- since it will be by the time we perform the store. -- For a load, check the appropriate row valid bit; but also, - -- if use_forward is 1 then we can consider this a hit. 
+ -- if use_forward_rl is 1 then we can consider this a hit. is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE) or - use_forward; + use_forward_rl; hit_way := replace_way; end if; @@ -1101,6 +1105,13 @@ begin end process; + -- RAM write data and select multiplexers + ram_wr_data <= r1.req.data when r1.write_bram = '1' else + wishbone_in.dat when r1.dcbz = '0' else + (others => '0'); + ram_wr_select <= r1.req.byte_sel when r1.write_bram = '1' else + (others => '1'); + -- -- Generate a cache RAM for each way. This handles the normal -- reads, writes from reloads and the special store-hit update @@ -1134,7 +1145,7 @@ begin rd_data => dout, wr_sel => wr_sel_m, wr_addr => wr_addr, - wr_data => wr_data + wr_data => ram_wr_data ); process(all) begin @@ -1150,37 +1161,13 @@ begin -- For timing, the mux on wr_data/sel/addr is not dependent on anything -- other than the current state. -- - wr_sel_m <= (others => '0'); - - do_write <= '0'; - if r1.write_bram = '1' then - -- Write store data to BRAM. This happens one cycle after the - -- store is in r0. 
- wr_data <= r1.req.data; - wr_sel <= r1.req.byte_sel; - wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.req.real_addr), ROW_BITS)); - if i = r1.req.hit_way then - do_write <= '1'; - end if; - else - -- Otherwise, we might be doing a reload or a DCBZ - if r1.dcbz = '1' then - wr_data <= (others => '0'); - else - wr_data <= wishbone_in.dat; - end if; - wr_sel <= (others => '1'); - - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then - do_write <= '1'; - end if; - end if; wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); - -- Mask write selects with do_write since BRAM doesn't - -- have a global write-enable - if do_write = '1' then - wr_sel_m <= wr_sel; + wr_sel_m <= (others => '0'); + if i = replace_way and + (r1.write_bram = '1' or + (r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1')) then + wr_sel_m <= ram_wr_select; end if; end process; @@ -1191,20 +1178,60 @@ begin -- It also handles error cases (TLB miss, cache paradox) -- dcache_fast_hit : process(clk) + variable j : integer; + variable sel : std_ulogic_vector(1 downto 0); + variable data_out : std_ulogic_vector(63 downto 0); begin if rising_edge(clk) then if req_op /= OP_NONE then - report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(r0.req.addr) & - " nc:" & std_ulogic'image(r0.req.nc) & - " idx:" & integer'image(req_index) & - " tag:" & to_hstring(req_tag) & - " way: " & integer'image(req_hit_way); - end if; + report "op:" & op_t'image(req_op) & + " addr:" & to_hstring(r0.req.addr) & + " nc:" & std_ulogic'image(r0.req.nc) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag) & + " way: " & integer'image(req_hit_way); + end if; if r0_valid = '1' then r1.mmu_req <= r0.mmu_req; end if; + -- Bypass/forwarding multiplexer for load data. + -- Use the bypass if we are reading the row of BRAM that was written 0 or 1 + -- cycles ago, including for the slow_valid = 1 cases (i.e. 
completing a + -- load miss or a non-cacheable load), which are handled via the r1.full case. + for i in 0 to 7 loop + if r1.full = '1' or use_forward_rl = '1' then + sel := '0' & r1.dcbz; + elsif use_forward_st = '1' and r1.req.byte_sel(i) = '1' then + sel := "01"; + elsif use_forward2 = '1' and r1.forward_sel(i) = '1' then + sel := "10"; + else + sel := "11"; + end if; + j := i * 8; + case sel is + when "00" => + data_out(j + 7 downto j) := wishbone_in.dat(j + 7 downto j); + when "01" => + data_out(j + 7 downto j) := r1.req.data(j + 7 downto j); + when "10" => + data_out(j + 7 downto j) := r1.forward_data(j + 7 downto j); + when others => + data_out(j + 7 downto j) := cache_out(req_hit_way)(j + 7 downto j); + end case; + end loop; + r1.data_out <= data_out; + + r1.forward_data <= ram_wr_data; + r1.forward_tag <= r1.reload_tag; + r1.forward_row <= r1.store_row; + r1.forward_sel <= ram_wr_select; + r1.forward_valid <= r1.write_bram; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + r1.forward_valid <= '1'; + end if; + -- Fast path for load/store hits. Set signals for the writeback controls. r1.hit_way <= req_hit_way; r1.hit_index <= req_index; @@ -1259,56 +1286,8 @@ begin variable stbs_done : boolean; variable req : mem_access_request_t; variable acks : unsigned(2 downto 0); - variable data_out : std_ulogic_vector(63 downto 0); - variable data_fwd : std_ulogic_vector(63 downto 0); - variable fwd_in : std_ulogic_vector(63 downto 0); - variable fwd_sel : std_ulogic_vector(7 downto 0); - variable j : integer; - variable fwd_byte : std_ulogic_vector(7 downto 0); begin if rising_edge(clk) then - fwd_sel := (others => '1'); - if r1.write_bram = '1' then - fwd_in := r1.req.data; - fwd_sel := r1.req.byte_sel; - elsif r1.dcbz = '1' then - fwd_in := (others => '0'); - else - fwd_in := wishbone_in.dat; - end if; - - -- Use the bypass if are reading the row that was written 0 or 1 cycles - -- ago, including for the slow_valid = 1 cases (i.e. 
completing a load - -- miss or a non-cacheable load), which are handled via the r1.full case. - data_fwd := r1.forward_data; - fwd_byte := (others => '0'); - if r1.full = '1' then - data_fwd := fwd_in; - fwd_byte := (others => '1'); - elsif use_forward = '1' then - data_fwd := fwd_in; - fwd_byte := fwd_sel; - elsif use_forward2 = '1' then - fwd_byte := r1.forward_sel; - end if; - data_out := cache_out(req_hit_way); - for i in 0 to 7 loop - j := i * 8; - if fwd_byte(i) = '1' then - data_out(j + 7 downto j) := data_fwd(j + 7 downto j); - end if; - end loop; - r1.data_out <= data_out; - - r1.forward_data <= fwd_in; - r1.forward_tag <= r1.reload_tag; - r1.forward_row <= r1.store_row; - r1.forward_sel <= fwd_sel; - r1.forward_valid <= r1.write_bram; - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then - r1.forward_valid <= '1'; - end if; - ev.dcache_refill <= '0'; ev.load_miss <= '0'; ev.store_miss <= '0'; @@ -1563,6 +1542,7 @@ begin (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then r1.wb.stb <= '1'; stbs_done := false; + r1.store_way <= req.hit_way; r1.store_row <= get_row(req.real_addr); if req.op = OP_STORE_HIT then r1.write_bram <= '1';