From f812832ad78d422fb5d30b8b6765c236a16b41e2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Aug 2021 20:11:55 +1000 Subject: [PATCH 1/4] dcache: Move way selection and forwarding earlier This moves the way multiplexer for the data from the BRAM, and the multiplexers for forwarding data from earlier stores or refills, before a clock edge rather than after, so that now the data output from the dcache comes from a clean latch. To do this we remove the extra latch on the output of the data BRAM (i.e. ADD_BUF=false) and rearrange the logic. The choice whether to forward or not now depends not on way comparisons but rather on tag comparisons, for the sake of timing. Signed-off-by: Paul Mackerras --- dcache.vhdl | 197 +++++++++++++++++++++++++++------------------------- 1 file changed, 104 insertions(+), 93 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 90771f5..282eba0 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -317,15 +317,13 @@ architecture rtl of dcache is tlb_hit_way : tlb_way_t; tlb_hit_index : tlb_index_t; - -- 2-stage data buffer for data forwarded from writes to reads - forward_data1 : std_ulogic_vector(63 downto 0); - forward_data2 : std_ulogic_vector(63 downto 0); - forward_sel1 : std_ulogic_vector(7 downto 0); - forward_valid1 : std_ulogic; - forward_way1 : way_t; - forward_row1 : row_t; - use_forward1 : std_ulogic; + -- data buffer for data forwarded from writes to reads + forward_data : std_ulogic_vector(63 downto 0); + forward_tag : cache_tag_t; forward_sel : std_ulogic_vector(7 downto 0); + forward_valid : std_ulogic; + forward_row : row_t; + data_out : std_ulogic_vector(63 downto 0); -- Cache miss state (reload state machine) state : state_t; @@ -387,8 +385,9 @@ architecture rtl of dcache is signal r0_valid : std_ulogic; signal r0_stall : std_ulogic; - signal use_forward1_next : std_ulogic; - signal use_forward2_next : std_ulogic; + signal fwd_same_tag : std_ulogic; + signal use_forward : std_ulogic; + signal use_forward2 : 
std_ulogic; -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; @@ -834,6 +833,8 @@ begin variable hit_way_set : hit_way_set_t; variable rel_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable rel_match : std_ulogic; + variable fwd_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable fwd_match : std_ulogic; begin -- Extract line, row and tag from request req_index <= get_index(r0.req.addr); @@ -849,8 +850,10 @@ begin hit_way := 0; is_hit := '0'; rel_match := '0'; + fwd_match := '0'; if r0.req.virt_mode = '1' then rel_matches := (others => '0'); + fwd_matches := (others => '0'); for j in tlb_way_t loop hit_way_set(j) := 0; s_hit := '0'; @@ -870,11 +873,15 @@ begin if s_tag = r1.reload_tag then rel_matches(j) := '1'; end if; + if s_tag = r1.forward_tag then + fwd_matches(j) := '1'; + end if; end loop; if tlb_hit = '1' then is_hit := hit_set(tlb_hit_way); hit_way := hit_way_set(tlb_hit_way); rel_match := rel_matches(tlb_hit_way); + fwd_match := fwd_matches(tlb_hit_way); end if; else s_tag := get_tag(r0.req.addr); @@ -888,39 +895,27 @@ begin if s_tag = r1.reload_tag then rel_match := '1'; end if; + if s_tag = r1.forward_tag then + fwd_match := '1'; + end if; end if; req_same_tag <= rel_match; - - -- See if the request matches the line currently being reloaded - if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and - rel_match = '1' then - -- For a store, consider this a hit even if the row isn't valid - -- since it will be by the time we perform the store. - -- For a load, check the appropriate row valid bit. 
- is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE); - hit_way := replace_way; - end if; + fwd_same_tag <= fwd_match; -- Whether to use forwarded data for a load or not - use_forward1_next <= '0'; - if get_row(r1.req.real_addr) = req_row and r1.req.hit_way = hit_way then - -- Only need to consider r1.write_bram here, since if we are - -- writing refill data here, then we don't have a cache hit this - -- cycle on the line being refilled. (There is the possibility - -- that the load following the load miss that started the refill - -- could be to the old contents of the victim line, since it is a - -- couple of cycles after the refill starts before we see the - -- updated cache tag. In that case we don't use the bypass.) - use_forward1_next <= r1.write_bram; + use_forward <= '0'; + if r1.store_row = req_row and rel_match = '1' then + -- Use the forwarding path if last cycle was a write to this row + if (r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1') or + r1.write_bram = '1' then + use_forward <= '1'; + end if; end if; - use_forward2_next <= '0'; - if r1.forward_row1 = req_row and r1.forward_way1 = hit_way then - use_forward2_next <= r1.forward_valid1; + use_forward2 <= '0'; + if r1.forward_row = req_row and fwd_match = '1' then + use_forward2 <= r1.forward_valid; end if; - -- The way that matched on a hit - req_hit_way <= hit_way; - -- The way to replace on a miss if r1.write_tag = '1' then replace_way <= to_integer(unsigned(plru_victim(r1.store_index))); @@ -928,6 +923,23 @@ begin replace_way <= r1.store_way; end if; + -- See if the request matches the line currently being reloaded + if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and + rel_match = '1' then + -- Ignore is_hit from above, because a load miss writes the new tag + -- but doesn't clear the valid bit on the line before refilling it. + -- For a store, consider this a hit even if the row isn't valid + -- since it will be by the time we perform the store. 
+ -- For a load, check the appropriate row valid bit; but also, + -- if use_forward is 1 then we can consider this a hit. + is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE) or + use_forward; + hit_way := replace_way; + end if; + + -- The way that matched on a hit + req_hit_way <= hit_way; + -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed); @@ -1023,28 +1035,9 @@ begin -- Return data for loads & completion control logic -- writeback_control: process(all) - variable data_out : std_ulogic_vector(63 downto 0); - variable data_fwd : std_ulogic_vector(63 downto 0); - variable j : integer; begin - -- Use the bypass if are reading the row that was written 1 or 2 cycles - -- ago, including for the slow_valid = 1 case (i.e. completing a load - -- miss or a non-cacheable load). - if r1.use_forward1 = '1' then - data_fwd := r1.forward_data1; - else - data_fwd := r1.forward_data2; - end if; - data_out := cache_out(r1.hit_way); - for i in 0 to 7 loop - j := i * 8; - if r1.forward_sel(i) = '1' then - data_out(j + 7 downto j) := data_fwd(j + 7 downto j); - end if; - end loop; - d_out.valid <= r1.ls_valid; - d_out.data <= data_out; + d_out.data <= r1.data_out; d_out.store_done <= not r1.stcx_fail; d_out.error <= r1.ls_error; d_out.cache_paradox <= r1.cache_paradox; @@ -1052,7 +1045,7 @@ begin -- Outputs to MMU m_out.done <= r1.mmu_done; m_out.err <= r1.mmu_error; - m_out.data <= data_out; + m_out.data <= r1.data_out; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -1076,7 +1069,7 @@ begin -- Request came from loadstore1... 
-- Load hit case is the standard path if r1.hit_load_valid = '1' then - report "completing load hit data=" & to_hstring(data_out); + report "completing load hit data=" & to_hstring(r1.data_out); end if; -- error cases complete without stalling @@ -1086,7 +1079,7 @@ begin -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then - report "completing store or load miss data=" & to_hstring(data_out); + report "completing store or load miss data=" & to_hstring(r1.data_out); end if; else @@ -1132,7 +1125,7 @@ begin generic map ( ROW_BITS => ROW_BITS, WIDTH => wishbone_data_bits, - ADD_BUF => true + ADD_BUF => false ) port map ( clk => clk, @@ -1176,13 +1169,13 @@ begin else wr_data <= wishbone_in.dat; end if; - wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); wr_sel <= (others => '1'); if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then do_write <= '1'; end if; end if; + wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); -- Mask write selects with do_write since BRAM doesn't -- have a global write-enable @@ -1263,36 +1256,57 @@ begin -- operates at stage 1. 
-- dcache_slow : process(clk) - variable stbs_done : boolean; + variable stbs_done : boolean; variable req : mem_access_request_t; variable acks : unsigned(2 downto 0); + variable data_out : std_ulogic_vector(63 downto 0); + variable data_fwd : std_ulogic_vector(63 downto 0); + variable fwd_in : std_ulogic_vector(63 downto 0); + variable fwd_sel : std_ulogic_vector(7 downto 0); + variable j : integer; + variable fwd_byte : std_ulogic_vector(7 downto 0); begin if rising_edge(clk) then - r1.use_forward1 <= use_forward1_next; - r1.forward_sel <= (others => '0'); - if use_forward1_next = '1' then - r1.forward_sel <= r1.req.byte_sel; - elsif use_forward2_next = '1' then - r1.forward_sel <= r1.forward_sel1; - end if; - - r1.forward_data2 <= r1.forward_data1; + fwd_sel := (others => '1'); if r1.write_bram = '1' then - r1.forward_data1 <= r1.req.data; - r1.forward_sel1 <= r1.req.byte_sel; - r1.forward_way1 <= r1.req.hit_way; - r1.forward_row1 <= get_row(r1.req.real_addr); - r1.forward_valid1 <= '1'; + fwd_in := r1.req.data; + fwd_sel := r1.req.byte_sel; + elsif r1.dcbz = '1' then + fwd_in := (others => '0'); else - if r1.dcbz = '1' then - r1.forward_data1 <= (others => '0'); - else - r1.forward_data1 <= wishbone_in.dat; + fwd_in := wishbone_in.dat; + end if; + + -- Use the bypass if are reading the row that was written 0 or 1 cycles + -- ago, including for the slow_valid = 1 cases (i.e. completing a load + -- miss or a non-cacheable load), which are handled via the r1.full case. 
+ data_fwd := r1.forward_data; + fwd_byte := (others => '0'); + if r1.full = '1' then + data_fwd := fwd_in; + fwd_byte := (others => '1'); + elsif use_forward = '1' then + data_fwd := fwd_in; + fwd_byte := fwd_sel; + elsif use_forward2 = '1' then + fwd_byte := r1.forward_sel; + end if; + data_out := cache_out(req_hit_way); + for i in 0 to 7 loop + j := i * 8; + if fwd_byte(i) = '1' then + data_out(j + 7 downto j) := data_fwd(j + 7 downto j); end if; - r1.forward_sel1 <= (others => '1'); - r1.forward_way1 <= replace_way; - r1.forward_row1 <= r1.store_row; - r1.forward_valid1 <= '0'; + end loop; + r1.data_out <= data_out; + + r1.forward_data <= fwd_in; + r1.forward_tag <= r1.reload_tag; + r1.forward_row <= r1.store_row; + r1.forward_sel <= fwd_sel; + r1.forward_valid <= r1.write_bram; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + r1.forward_valid <= '1'; end if; ev.dcache_refill <= '0'; @@ -1488,17 +1502,17 @@ begin end if; -- Incoming acks processing - r1.forward_valid1 <= wishbone_in.ack; if wishbone_in.ack = '1' then r1.rows_valid(r1.store_row mod ROW_PER_LINE) <= '1'; -- If this is the data we were looking for, we can -- complete the request next cycle. -- Compare the whole address in case the request in -- r1.req is not the one that started this refill. - if req.valid = '1' and req.same_tag = '1' and - ((r1.dcbz = '1' and req.dcbz = '1') or - (r1.dcbz = '0' and req.op = OP_LOAD_MISS)) and - r1.store_row = get_row(req.real_addr) then + -- (Cases where req comes from r0 are handled as a load + -- hit.) 
+ if r1.full = '1' and r1.req.same_tag = '1' and + ((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op = OP_LOAD_MISS) and + r1.store_row = get_row(r1.req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; if r1.mmu_req = '0' then @@ -1506,8 +1520,6 @@ begin else r1.mmu_done <= '1'; end if; - r1.forward_sel <= (others => '1'); - r1.use_forward1 <= '1'; end if; -- Check for completion @@ -1551,6 +1563,7 @@ begin (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then r1.wb.stb <= '1'; stbs_done := false; + r1.store_row <= get_row(req.real_addr); if req.op = OP_STORE_HIT then r1.write_bram <= '1'; end if; @@ -1592,8 +1605,6 @@ begin else r1.mmu_done <= '1'; end if; - r1.forward_sel <= (others => '1'); - r1.use_forward1 <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; From 1a9834c506a8f6a05bdbae16f81e5482a75663b0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 31 Aug 2021 19:47:14 +1000 Subject: [PATCH 2/4] dcache: Fix bug with forwarding of stores We have two stages of forwarding to cover the two cycles of latency between when something is written to BRAM and when that new data can be read from BRAM. When the writes to BRAM result from store instructions, the write may write only some bytes of a row (8 bytes) and not others, so we have a mask to enable only the written bytes to be forwarded. However, we only forward written data from either the first stage of forwarding or the second, not both. So if we have two stores in succession that write different bytes of the same row, and then a load from the row, we will only forward the data from the second store, and miss the data from the first store; thus the load will get the wrong value. To fix this, we make the decision on which forward stage to use for each byte individually. This results in a 4-input multiplexer feeding r1.data_out, with its inputs being the BRAM, the wishbone, the current write data, and the 2nd-stage forwarding register. Each byte of the multiplexer is separately controlled. 
The code for this multiplexer is moved to the dcache_fast_hit process since it is used for cache hits as well as cache misses. This also simplifies the BRAM code by ensuring that we can use the same source for the BRAM address and way selection for writes, whether we are writing store data or cache line refill data from memory. Signed-off-by: Paul Mackerras --- dcache.vhdl | 168 +++++++++++++++++++++++----------------------------- 1 file changed, 74 insertions(+), 94 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 282eba0..bca393a 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -122,7 +122,7 @@ architecture rtl of dcache is type cache_valids_t is array(index_t) of cache_way_valids_t; type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; - -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs + -- Storage. Hopefully implemented in LUTs signal cache_tags : cache_tags_array_t; signal cache_tag_set : cache_tags_set_t; signal cache_valids : cache_valids_t; @@ -386,12 +386,15 @@ architecture rtl of dcache is signal r0_stall : std_ulogic; signal fwd_same_tag : std_ulogic; - signal use_forward : std_ulogic; + signal use_forward_st : std_ulogic; + signal use_forward_rl : std_ulogic; signal use_forward2 : std_ulogic; -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; + signal ram_wr_data : cache_row_t; + signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0); -- PLRU output interface type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); @@ -903,12 +906,13 @@ begin fwd_same_tag <= fwd_match; -- Whether to use forwarded data for a load or not - use_forward <= '0'; + use_forward_st <= '0'; + use_forward_rl <= '0'; if r1.store_row = req_row and rel_match = '1' then - -- Use the forwarding path if last cycle was a write to this row - if (r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1') or - r1.write_bram = '1' then - use_forward <= '1'; + -- Use 
the forwarding path if this cycle is a write to this row + use_forward_st <= r1.write_bram; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + use_forward_rl <= '1'; end if; end if; use_forward2 <= '0'; @@ -931,9 +935,9 @@ begin -- For a store, consider this a hit even if the row isn't valid -- since it will be by the time we perform the store. -- For a load, check the appropriate row valid bit; but also, - -- if use_forward is 1 then we can consider this a hit. + -- if use_forward_rl is 1 then we can consider this a hit. is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE) or - use_forward; + use_forward_rl; hit_way := replace_way; end if; @@ -1101,6 +1105,13 @@ begin end process; + -- RAM write data and select multiplexers + ram_wr_data <= r1.req.data when r1.write_bram = '1' else + wishbone_in.dat when r1.dcbz = '0' else + (others => '0'); + ram_wr_select <= r1.req.byte_sel when r1.write_bram = '1' else + (others => '1'); + -- -- Generate a cache RAM for each way. This handles the normal -- reads, writes from reloads and the special store-hit update @@ -1134,7 +1145,7 @@ begin rd_data => dout, wr_sel => wr_sel_m, wr_addr => wr_addr, - wr_data => wr_data + wr_data => ram_wr_data ); process(all) begin @@ -1150,37 +1161,13 @@ begin -- For timing, the mux on wr_data/sel/addr is not dependent on anything -- other than the current state. -- - wr_sel_m <= (others => '0'); - - do_write <= '0'; - if r1.write_bram = '1' then - -- Write store data to BRAM. This happens one cycle after the - -- store is in r0. 
- wr_data <= r1.req.data; - wr_sel <= r1.req.byte_sel; - wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.req.real_addr), ROW_BITS)); - if i = r1.req.hit_way then - do_write <= '1'; - end if; - else - -- Otherwise, we might be doing a reload or a DCBZ - if r1.dcbz = '1' then - wr_data <= (others => '0'); - else - wr_data <= wishbone_in.dat; - end if; - wr_sel <= (others => '1'); - - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then - do_write <= '1'; - end if; - end if; wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); - -- Mask write selects with do_write since BRAM doesn't - -- have a global write-enable - if do_write = '1' then - wr_sel_m <= wr_sel; + wr_sel_m <= (others => '0'); + if i = replace_way and + (r1.write_bram = '1' or + (r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1')) then + wr_sel_m <= ram_wr_select; end if; end process; @@ -1191,20 +1178,60 @@ begin -- It also handles error cases (TLB miss, cache paradox) -- dcache_fast_hit : process(clk) + variable j : integer; + variable sel : std_ulogic_vector(1 downto 0); + variable data_out : std_ulogic_vector(63 downto 0); begin if rising_edge(clk) then if req_op /= OP_NONE then - report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(r0.req.addr) & - " nc:" & std_ulogic'image(r0.req.nc) & - " idx:" & integer'image(req_index) & - " tag:" & to_hstring(req_tag) & - " way: " & integer'image(req_hit_way); - end if; + report "op:" & op_t'image(req_op) & + " addr:" & to_hstring(r0.req.addr) & + " nc:" & std_ulogic'image(r0.req.nc) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag) & + " way: " & integer'image(req_hit_way); + end if; if r0_valid = '1' then r1.mmu_req <= r0.mmu_req; end if; + -- Bypass/forwarding multiplexer for load data. + -- Use the bypass if are reading the row of BRAM that was written 0 or 1 + -- cycles ago, including for the slow_valid = 1 cases (i.e. 
completing a + -- load miss or a non-cacheable load), which are handled via the r1.full case. + for i in 0 to 7 loop + if r1.full = '1' or use_forward_rl = '1' then + sel := '0' & r1.dcbz; + elsif use_forward_st = '1' and r1.req.byte_sel(i) = '1' then + sel := "01"; + elsif use_forward2 = '1' and r1.forward_sel(i) = '1' then + sel := "10"; + else + sel := "11"; + end if; + j := i * 8; + case sel is + when "00" => + data_out(j + 7 downto j) := wishbone_in.dat(j + 7 downto j); + when "01" => + data_out(j + 7 downto j) := r1.req.data(j + 7 downto j); + when "10" => + data_out(j + 7 downto j) := r1.forward_data(j + 7 downto j); + when others => + data_out(j + 7 downto j) := cache_out(req_hit_way)(j + 7 downto j); + end case; + end loop; + r1.data_out <= data_out; + + r1.forward_data <= ram_wr_data; + r1.forward_tag <= r1.reload_tag; + r1.forward_row <= r1.store_row; + r1.forward_sel <= ram_wr_select; + r1.forward_valid <= r1.write_bram; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + r1.forward_valid <= '1'; + end if; + -- Fast path for load/store hits. Set signals for the writeback controls. r1.hit_way <= req_hit_way; r1.hit_index <= req_index; @@ -1259,56 +1286,8 @@ begin variable stbs_done : boolean; variable req : mem_access_request_t; variable acks : unsigned(2 downto 0); - variable data_out : std_ulogic_vector(63 downto 0); - variable data_fwd : std_ulogic_vector(63 downto 0); - variable fwd_in : std_ulogic_vector(63 downto 0); - variable fwd_sel : std_ulogic_vector(7 downto 0); - variable j : integer; - variable fwd_byte : std_ulogic_vector(7 downto 0); begin if rising_edge(clk) then - fwd_sel := (others => '1'); - if r1.write_bram = '1' then - fwd_in := r1.req.data; - fwd_sel := r1.req.byte_sel; - elsif r1.dcbz = '1' then - fwd_in := (others => '0'); - else - fwd_in := wishbone_in.dat; - end if; - - -- Use the bypass if are reading the row that was written 0 or 1 cycles - -- ago, including for the slow_valid = 1 cases (i.e. 
completing a load - -- miss or a non-cacheable load), which are handled via the r1.full case. - data_fwd := r1.forward_data; - fwd_byte := (others => '0'); - if r1.full = '1' then - data_fwd := fwd_in; - fwd_byte := (others => '1'); - elsif use_forward = '1' then - data_fwd := fwd_in; - fwd_byte := fwd_sel; - elsif use_forward2 = '1' then - fwd_byte := r1.forward_sel; - end if; - data_out := cache_out(req_hit_way); - for i in 0 to 7 loop - j := i * 8; - if fwd_byte(i) = '1' then - data_out(j + 7 downto j) := data_fwd(j + 7 downto j); - end if; - end loop; - r1.data_out <= data_out; - - r1.forward_data <= fwd_in; - r1.forward_tag <= r1.reload_tag; - r1.forward_row <= r1.store_row; - r1.forward_sel <= fwd_sel; - r1.forward_valid <= r1.write_bram; - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then - r1.forward_valid <= '1'; - end if; - ev.dcache_refill <= '0'; ev.load_miss <= '0'; ev.store_miss <= '0'; @@ -1563,6 +1542,7 @@ begin (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then r1.wb.stb <= '1'; stbs_done := false; + r1.store_way <= req.hit_way; r1.store_row <= get_row(req.real_addr); if req.op = OP_STORE_HIT then r1.write_bram <= '1'; From 0b23a5e760003e836d849f47ac02e46eb8a84909 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Aug 2021 18:24:49 +1000 Subject: [PATCH 3/4] dcache: Simplify data input to improve timing Signed-off-by: Paul Mackerras --- dcache.vhdl | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index bca393a..b9895f6 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -578,6 +578,7 @@ begin r.doall := m_in.doall; r.tlbld := m_in.tlbld; r.mmu_req := '1'; + r.d_valid := '1'; else r.req := d_in; r.req.data := (others => '0'); @@ -585,21 +586,19 @@ begin r.doall := '0'; r.tlbld := '0'; r.mmu_req := '0'; + r.d_valid := '0'; end if; - r.d_valid := '0'; if rst = '1' then r0_full <= '0'; elsif (r1.full = '0' and d_in.hold = '0') or r0_full = '0' then r0 <= r; r0_full <= 
r.req.valid; - end if; - -- Sample data the cycle after a request comes in from loadstore1. - -- If another request has come in already then the data will get - -- put directly into req.data below. - if r0.req.valid = '1' and r.req.valid = '0' and r0.d_valid = '0' and - r0.mmu_req = '0' then + elsif r0.d_valid = '0' then + -- Sample data the cycle after a request comes in from loadstore1. + -- If this request is already moving into r1 then the data will get + -- put directly into req.data in the dcache_slow process below. r0.req.data <= d_in.data; - r0.d_valid <= '1'; + r0.d_valid <= r0.req.valid; end if; end if; end process; From ba349144651bc5b0ebe28aa38ac802a75ecaa0bb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 31 Aug 2021 11:45:57 +1000 Subject: [PATCH 4/4] tests/misc: Add a test for a load that hits two preceding stores This checks that the store forwarding machinery in the dcache correctly combines forwarded stores when they are partial stores (i.e. only writing part of the doubleword, as for a byte store). 
Signed-off-by: Paul Mackerras --- tests/misc/head.S | 23 +++++++++++++++++++++++ tests/misc/misc.c | 8 ++++++++ tests/test_misc.bin | Bin 5740 -> 5804 bytes tests/test_misc.console_out | 1 + 4 files changed, 32 insertions(+) diff --git a/tests/misc/head.S b/tests/misc/head.S index d490a61..9044c51 100644 --- a/tests/misc/head.S +++ b/tests/misc/head.S @@ -162,3 +162,26 @@ test_bdnzl: li %r3,0 9: mtlr %r10 blr + +/* Test that a load that hits stores gets the correct data */ + .global test_loadhitstore +test_loadhitstore: + addi %r5,%r1,-16 + ld %r0,0(%r5) + li %r0,0 + std %r0,0(%r5) + li %r6,0x66 + li %r7,0x77 + .balign 64 + nop + nop + nop + nop + stb %r6,2(%r5) + stb %r7,3(%r5) + ld %r0,0(%r5) + sldi %r6,%r6,16 + sldi %r7,%r7,24 + or %r7,%r6,%r7 + subf %r3,%r0,%r7 + blr diff --git a/tests/misc/misc.c b/tests/misc/misc.c index 73745d9..3cc0300 100644 --- a/tests/misc/misc.c +++ b/tests/misc/misc.c @@ -15,6 +15,7 @@ extern long test_addpcis_2(void); extern long test_mfpvr(void); extern long test_mtpvr(void); extern long test_bdnzl(void); +extern long test_loadhitstore(void); // i < 100 void print_test_number(int i) @@ -66,5 +67,12 @@ int main(void) } else puts(PASS); + print_test_number(6); + if (test_loadhitstore() != 0) { + fail = 1; + puts(FAIL); + } else + puts(PASS); + return fail; } diff --git a/tests/test_misc.bin b/tests/test_misc.bin index 2264686c06a630733a71d56de90ea1a1f3925baf..648ab184ab362a9a3c1ba0c554df4f1ceb96bb3e 100755 GIT binary patch delta 549 zcmaE(vqpDA0%O33{|hY`7+7C0Fff4VA88B+EXo-k0ND%)3=B>{45AU3iQ(uBW`?IT zKzg1uAFFtB@Oedr_3@eji-Z~lh6caQiHre!lLSm$CzK{z2rw}IxcFc3hb}XN2gs0X zzyH-6Fw|RcG5nX`!k%0+kqIIm@S(oOfFaoeBvuO*vtmfFD4bj=V9e+_d98r7AO~ZU z>L;KHpJaqMixp_q+JFDOXTJaM4RR&} zgR)Kqn%cQgwFV4|78SoA*QWs0gPg+vbWQ@$IkixEjsFmFkXR;E%mE^1W&;uog$l+X zDRhL2RUnD!Ld7O*_7w7GWL&X%rm!{>