diff --git a/dcache.vhdl b/dcache.vhdl index 90771f5..b9895f6 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -122,7 +122,7 @@ architecture rtl of dcache is type cache_valids_t is array(index_t) of cache_way_valids_t; type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; - -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs + -- Storage. Hopefully implemented in LUTs signal cache_tags : cache_tags_array_t; signal cache_tag_set : cache_tags_set_t; signal cache_valids : cache_valids_t; @@ -317,15 +317,13 @@ architecture rtl of dcache is tlb_hit_way : tlb_way_t; tlb_hit_index : tlb_index_t; - -- 2-stage data buffer for data forwarded from writes to reads - forward_data1 : std_ulogic_vector(63 downto 0); - forward_data2 : std_ulogic_vector(63 downto 0); - forward_sel1 : std_ulogic_vector(7 downto 0); - forward_valid1 : std_ulogic; - forward_way1 : way_t; - forward_row1 : row_t; - use_forward1 : std_ulogic; + -- data buffer for data forwarded from writes to reads + forward_data : std_ulogic_vector(63 downto 0); + forward_tag : cache_tag_t; forward_sel : std_ulogic_vector(7 downto 0); + forward_valid : std_ulogic; + forward_row : row_t; + data_out : std_ulogic_vector(63 downto 0); -- Cache miss state (reload state machine) state : state_t; @@ -387,12 +385,16 @@ architecture rtl of dcache is signal r0_valid : std_ulogic; signal r0_stall : std_ulogic; - signal use_forward1_next : std_ulogic; - signal use_forward2_next : std_ulogic; + signal fwd_same_tag : std_ulogic; + signal use_forward_st : std_ulogic; + signal use_forward_rl : std_ulogic; + signal use_forward2 : std_ulogic; -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; + signal ram_wr_data : cache_row_t; + signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0); -- PLRU output interface type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); @@ -576,6 +578,7 @@ begin r.doall := m_in.doall; r.tlbld := m_in.tlbld; r.mmu_req := '1'; + r.d_valid := '1'; else r.req := d_in; r.req.data := (others => '0'); @@ -583,21 +586,19 @@ begin r.doall := '0'; r.tlbld := '0'; r.mmu_req := '0'; + r.d_valid := '0'; end if; - r.d_valid := '0'; if rst = '1' then r0_full <= '0'; elsif (r1.full = '0' and d_in.hold = '0') or r0_full = '0' then r0 <= r; r0_full <= r.req.valid; - end if; - -- Sample data the cycle after a request comes in from loadstore1. - -- If another request has come in already then the data will get - -- put directly into req.data below. - if r0.req.valid = '1' and r.req.valid = '0' and r0.d_valid = '0' and - r0.mmu_req = '0' then + elsif r0.d_valid = '0' then + -- Sample data the cycle after a request comes in from loadstore1. + -- If this request is already moving into r1 then the data will get + -- put directly into req.data in the dcache_slow process below. r0.req.data <= d_in.data; - r0.d_valid <= '1'; + r0.d_valid <= r0.req.valid; end if; end if; end process; @@ -834,6 +835,8 @@ begin variable hit_way_set : hit_way_set_t; variable rel_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable rel_match : std_ulogic; + variable fwd_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable fwd_match : std_ulogic; begin -- Extract line, row and tag from request req_index <= get_index(r0.req.addr); @@ -849,8 +852,10 @@ begin hit_way := 0; is_hit := '0'; rel_match := '0'; + fwd_match := '0'; if r0.req.virt_mode = '1' then rel_matches := (others => '0'); + fwd_matches := (others => '0'); for j in tlb_way_t loop hit_way_set(j) := 0; s_hit := '0'; @@ -870,11 +875,15 @@ begin if s_tag = r1.reload_tag then rel_matches(j) := '1'; end if; + if s_tag = r1.forward_tag then + fwd_matches(j) := '1'; + end if; end loop; if tlb_hit = '1' then is_hit := hit_set(tlb_hit_way); hit_way := hit_way_set(tlb_hit_way); rel_match := rel_matches(tlb_hit_way); + fwd_match := fwd_matches(tlb_hit_way); end if; else s_tag := get_tag(r0.req.addr); @@ -888,39 +897,28 @@ begin if s_tag = r1.reload_tag then rel_match := '1'; end if; + if s_tag = r1.forward_tag then + fwd_match := '1'; + end if; end if; req_same_tag <= rel_match; - - -- See if the request matches the line currently being reloaded - if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and - rel_match = '1' then - -- For a store, consider this a hit even if the row isn't valid - -- since it will be by the time we perform the store. - -- For a load, check the appropriate row valid bit. - is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE); - hit_way := replace_way; - end if; + fwd_same_tag <= fwd_match; -- Whether to use forwarded data for a load or not - use_forward1_next <= '0'; - if get_row(r1.req.real_addr) = req_row and r1.req.hit_way = hit_way then - -- Only need to consider r1.write_bram here, since if we are - -- writing refill data here, then we don't have a cache hit this - -- cycle on the line being refilled. (There is the possibility - -- that the load following the load miss that started the refill - -- could be to the old contents of the victim line, since it is a - -- couple of cycles after the refill starts before we see the - -- updated cache tag. In that case we don't use the bypass.) - use_forward1_next <= r1.write_bram; + use_forward_st <= '0'; + use_forward_rl <= '0'; + if r1.store_row = req_row and rel_match = '1' then + -- Use the forwarding path if this cycle is a write to this row + use_forward_st <= r1.write_bram; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + use_forward_rl <= '1'; + end if; end if; - use_forward2_next <= '0'; - if r1.forward_row1 = req_row and r1.forward_way1 = hit_way then - use_forward2_next <= r1.forward_valid1; + use_forward2 <= '0'; + if r1.forward_row = req_row and fwd_match = '1' then + use_forward2 <= r1.forward_valid; end if; - -- The way that matched on a hit - req_hit_way <= hit_way; - -- The way to replace on a miss if r1.write_tag = '1' then replace_way <= to_integer(unsigned(plru_victim(r1.store_index))); @@ -928,6 +926,23 @@ begin replace_way <= r1.store_way; end if; + -- See if the request matches the line currently being reloaded + if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and + rel_match = '1' then + -- Ignore is_hit from above, because a load miss writes the new tag + -- but doesn't clear the valid bit on the line before refilling it. + -- For a store, consider this a hit even if the row isn't valid + -- since it will be by the time we perform the store. + -- For a load, check the appropriate row valid bit; but also, + -- if use_forward_rl is 1 then we can consider this a hit. + is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE) or + use_forward_rl; + hit_way := replace_way; + end if; + + -- The way that matched on a hit + req_hit_way <= hit_way; + -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed); @@ -1023,28 +1038,9 @@ begin -- Return data for loads & completion control logic -- writeback_control: process(all) - variable data_out : std_ulogic_vector(63 downto 0); - variable data_fwd : std_ulogic_vector(63 downto 0); - variable j : integer; begin - -- Use the bypass if are reading the row that was written 1 or 2 cycles - -- ago, including for the slow_valid = 1 case (i.e. completing a load - -- miss or a non-cacheable load). - if r1.use_forward1 = '1' then - data_fwd := r1.forward_data1; - else - data_fwd := r1.forward_data2; - end if; - data_out := cache_out(r1.hit_way); - for i in 0 to 7 loop - j := i * 8; - if r1.forward_sel(i) = '1' then - data_out(j + 7 downto j) := data_fwd(j + 7 downto j); - end if; - end loop; - d_out.valid <= r1.ls_valid; - d_out.data <= data_out; + d_out.data <= r1.data_out; d_out.store_done <= not r1.stcx_fail; d_out.error <= r1.ls_error; d_out.cache_paradox <= r1.cache_paradox; @@ -1052,7 +1048,7 @@ begin -- Outputs to MMU m_out.done <= r1.mmu_done; m_out.err <= r1.mmu_error; - m_out.data <= data_out; + m_out.data <= r1.data_out; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -1076,7 +1072,7 @@ begin -- Request came from loadstore1... -- Load hit case is the standard path if r1.hit_load_valid = '1' then - report "completing load hit data=" & to_hstring(data_out); + report "completing load hit data=" & to_hstring(r1.data_out); end if; -- error cases complete without stalling @@ -1086,7 +1082,7 @@ begin -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then - report "completing store or load miss data=" & to_hstring(data_out); + report "completing store or load miss data=" & to_hstring(r1.data_out); end if; else @@ -1108,6 +1104,13 @@ begin end process; + -- RAM write data and select multiplexers + ram_wr_data <= r1.req.data when r1.write_bram = '1' else + wishbone_in.dat when r1.dcbz = '0' else + (others => '0'); + ram_wr_select <= r1.req.byte_sel when r1.write_bram = '1' else + (others => '1'); + -- -- Generate a cache RAM for each way. This handles the normal -- reads, writes from reloads and the special store-hit update @@ -1132,7 +1135,7 @@ begin generic map ( ROW_BITS => ROW_BITS, WIDTH => wishbone_data_bits, - ADD_BUF => true + ADD_BUF => false ) port map ( clk => clk, @@ -1141,7 +1144,7 @@ begin rd_data => dout, wr_sel => wr_sel_m, wr_addr => wr_addr, - wr_data => wr_data + wr_data => ram_wr_data ); process(all) begin @@ -1157,37 +1160,13 @@ begin -- For timing, the mux on wr_data/sel/addr is not dependent on anything -- other than the current state. -- - wr_sel_m <= (others => '0'); + wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); - do_write <= '0'; - if r1.write_bram = '1' then - -- Write store data to BRAM. This happens one cycle after the - -- store is in r0. - wr_data <= r1.req.data; - wr_sel <= r1.req.byte_sel; - wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.req.real_addr), ROW_BITS)); - if i = r1.req.hit_way then - do_write <= '1'; - end if; - else - -- Otherwise, we might be doing a reload or a DCBZ - if r1.dcbz = '1' then - wr_data <= (others => '0'); - else - wr_data <= wishbone_in.dat; - end if; - wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); - wr_sel <= (others => '1'); - - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then - do_write <= '1'; - end if; - end if; - - -- Mask write selects with do_write since BRAM doesn't - -- have a global write-enable - if do_write = '1' then - wr_sel_m <= wr_sel; + wr_sel_m <= (others => '0'); + if i = replace_way and + (r1.write_bram = '1' or + (r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1')) then + wr_sel_m <= ram_wr_select; end if; end process; @@ -1198,20 +1177,60 @@ begin -- It also handles error cases (TLB miss, cache paradox) -- dcache_fast_hit : process(clk) + variable j : integer; + variable sel : std_ulogic_vector(1 downto 0); + variable data_out : std_ulogic_vector(63 downto 0); begin if rising_edge(clk) then if req_op /= OP_NONE then - report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(r0.req.addr) & - " nc:" & std_ulogic'image(r0.req.nc) & - " idx:" & integer'image(req_index) & - " tag:" & to_hstring(req_tag) & - " way: " & integer'image(req_hit_way); - end if; + report "op:" & op_t'image(req_op) & + " addr:" & to_hstring(r0.req.addr) & + " nc:" & std_ulogic'image(r0.req.nc) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag) & + " way: " & integer'image(req_hit_way); + end if; if r0_valid = '1' then r1.mmu_req <= r0.mmu_req; end if; + -- Bypass/forwarding multiplexer for load data. + -- Use the bypass if are reading the row of BRAM that was written 0 or 1 + -- cycles ago, including for the slow_valid = 1 cases (i.e. completing a + -- load miss or a non-cacheable load), which are handled via the r1.full case. + for i in 0 to 7 loop + if r1.full = '1' or use_forward_rl = '1' then + sel := '0' & r1.dcbz; + elsif use_forward_st = '1' and r1.req.byte_sel(i) = '1' then + sel := "01"; + elsif use_forward2 = '1' and r1.forward_sel(i) = '1' then + sel := "10"; + else + sel := "11"; + end if; + j := i * 8; + case sel is + when "00" => + data_out(j + 7 downto j) := wishbone_in.dat(j + 7 downto j); + when "01" => + data_out(j + 7 downto j) := r1.req.data(j + 7 downto j); + when "10" => + data_out(j + 7 downto j) := r1.forward_data(j + 7 downto j); + when others => + data_out(j + 7 downto j) := cache_out(req_hit_way)(j + 7 downto j); + end case; + end loop; + r1.data_out <= data_out; + + r1.forward_data <= ram_wr_data; + r1.forward_tag <= r1.reload_tag; + r1.forward_row <= r1.store_row; + r1.forward_sel <= ram_wr_select; + r1.forward_valid <= r1.write_bram; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then + r1.forward_valid <= '1'; + end if; + -- Fast path for load/store hits. Set signals for the writeback controls. r1.hit_way <= req_hit_way; r1.hit_index <= req_index; @@ -1263,38 +1282,11 @@ begin -- operates at stage 1. -- dcache_slow : process(clk) - variable stbs_done : boolean; + variable stbs_done : boolean; variable req : mem_access_request_t; variable acks : unsigned(2 downto 0); begin if rising_edge(clk) then - r1.use_forward1 <= use_forward1_next; - r1.forward_sel <= (others => '0'); - if use_forward1_next = '1' then - r1.forward_sel <= r1.req.byte_sel; - elsif use_forward2_next = '1' then - r1.forward_sel <= r1.forward_sel1; - end if; - - r1.forward_data2 <= r1.forward_data1; - if r1.write_bram = '1' then - r1.forward_data1 <= r1.req.data; - r1.forward_sel1 <= r1.req.byte_sel; - r1.forward_way1 <= r1.req.hit_way; - r1.forward_row1 <= get_row(r1.req.real_addr); - r1.forward_valid1 <= '1'; - else - if r1.dcbz = '1' then - r1.forward_data1 <= (others => '0'); - else - r1.forward_data1 <= wishbone_in.dat; - end if; - r1.forward_sel1 <= (others => '1'); - r1.forward_way1 <= replace_way; - r1.forward_row1 <= r1.store_row; - r1.forward_valid1 <= '0'; - end if; - ev.dcache_refill <= '0'; ev.load_miss <= '0'; ev.store_miss <= '0'; @@ -1488,17 +1480,17 @@ begin end if; -- Incoming acks processing - r1.forward_valid1 <= wishbone_in.ack; if wishbone_in.ack = '1' then r1.rows_valid(r1.store_row mod ROW_PER_LINE) <= '1'; -- If this is the data we were looking for, we can -- complete the request next cycle. -- Compare the whole address in case the request in -- r1.req is not the one that started this refill. - if req.valid = '1' and req.same_tag = '1' and - ((r1.dcbz = '1' and req.dcbz = '1') or - (r1.dcbz = '0' and req.op = OP_LOAD_MISS)) and - r1.store_row = get_row(req.real_addr) then + -- (Cases where req comes from r0 are handled as a load + -- hit.) + if r1.full = '1' and r1.req.same_tag = '1' and + ((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op = OP_LOAD_MISS) and + r1.store_row = get_row(r1.req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; if r1.mmu_req = '0' then @@ -1506,8 +1498,6 @@ begin else r1.mmu_done <= '1'; end if; - r1.forward_sel <= (others => '1'); - r1.use_forward1 <= '1'; end if; -- Check for completion @@ -1551,6 +1541,8 @@ begin (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then r1.wb.stb <= '1'; stbs_done := false; + r1.store_way <= req.hit_way; + r1.store_row <= get_row(req.real_addr); if req.op = OP_STORE_HIT then r1.write_bram <= '1'; end if; @@ -1592,8 +1584,6 @@ begin else r1.mmu_done <= '1'; end if; - r1.forward_sel <= (others => '1'); - r1.use_forward1 <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; diff --git a/tests/misc/head.S b/tests/misc/head.S index d490a61..9044c51 100644 --- a/tests/misc/head.S +++ b/tests/misc/head.S @@ -162,3 +162,26 @@ test_bdnzl: li %r3,0 9: mtlr %r10 blr + +/* Test that a load that hits stores gets the correct data */ + .global test_loadhitstore +test_loadhitstore: + addi %r5,%r1,-16 + ld %r0,0(%r5) + li %r0,0 + std %r0,0(%r5) + li %r6,0x66 + li %r7,0x77 + .balign 64 + nop + nop + nop + nop + stb %r6,2(%r5) + stb %r7,3(%r5) + ld %r0,0(%r5) + sldi %r6,%r6,16 + sldi %r7,%r7,24 + or %r7,%r6,%r7 + subf %r3,%r0,%r7 + blr diff --git a/tests/misc/misc.c b/tests/misc/misc.c index 73745d9..3cc0300 100644 --- a/tests/misc/misc.c +++ b/tests/misc/misc.c @@ -15,6 +15,7 @@ extern long test_addpcis_2(void); extern long test_mfpvr(void); extern long test_mtpvr(void); extern long test_bdnzl(void); +extern long test_loadhitstore(void); // i < 100 void print_test_number(int i) @@ -66,5 +67,12 @@ int main(void) } else puts(PASS); + print_test_number(6); + if (test_loadhitstore() != 0) { + fail = 1; + puts(FAIL); + } else + puts(PASS); + return fail; } diff --git a/tests/test_misc.bin b/tests/test_misc.bin index 2264686..648ab18 100755 Binary files a/tests/test_misc.bin and b/tests/test_misc.bin differ diff --git a/tests/test_misc.console_out b/tests/test_misc.console_out index e59c03f..5296101 100644 --- a/tests/test_misc.console_out +++ b/tests/test_misc.console_out @@ -3,3 +3,4 @@ Test 02:PASS Test 03:PASS Test 04:PASS Test 05:PASS +Test 06:PASS