From b181d28df231b21e52d6fc0405cff1086ddd9a81 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 17 May 2021 13:52:19 +1000 Subject: [PATCH] dcache: Cancel reservation on snooped store This restructures the reservation machinery so that the reservation is cleared when a snooped store by another agent is done to reservation address. The reservation address is now a real address rather than an effective address. For store-conditional, it is possible that a snooped store to the reservation address could come in even after we have asserted cyc and stb on the wishbone to do the store, and that should cause the store not to be performed. To achieve this, store-conditional now uses a separate state in the r1 state machine, which is set up so that losing the reservation due to a snooped store cause cyc and stb to be dropped immediately, and the store-conditional fails. For load-reserve, the reservation address is set at the end of cycle 1 and the reservation is made valid when the data is available. For lqarx, the reservation is made valid when the first doubleword of data is available. For the case where a snooped write comes in on cycle 0 of a larx and hits the same cache line, we detect that the index and way of the snooped write are the same as the index and way of the larx; it is done this way because reservation.addr is not set until the real address is available at the end of cycle 1. A hit on the same index and way causes reservation.valid to be set to 0 at the end of cycle 1. For a write in cycle 1, we compare the latched address in cycle 2 with the reservation address and clear reservation.valid at the end of cycle 2 if they match. In other words we compare the reservation address with both the address being written this cycle and the address being written in the previous cycle. Signed-off-by: Paul Mackerras --- common.vhdl | 1 + dcache.vhdl | 228 +++++++++++++++++++++++++++++++----------------- loadstore1.vhdl | 19 ++-- 3 files changed, 159 insertions(+), 89 deletions(-) diff --git a/common.vhdl b/common.vhdl index 3af1d7b..6df5b6b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -606,6 +606,7 @@ package common is nc : std_ulogic; reserve : std_ulogic; atomic_qw : std_ulogic; -- part of a quadword atomic op + atomic_first : std_ulogic; atomic_last : std_ulogic; virt_mode : std_ulogic; priv_mode : std_ulogic; diff --git a/dcache.vhdl b/dcache.vhdl index 807a2dc..68f3b60 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -184,7 +184,6 @@ architecture rtl of dcache is -- Type of operation on a "valid" input type op_t is (OP_NONE, OP_BAD, -- NC cache hit, TLB miss, prot/RC failure - OP_STCX_FAIL, -- conditional store w/o reservation OP_LOAD_HIT, -- Cache hit on load OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable load @@ -195,8 +194,8 @@ architecture rtl of dcache is type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack STORE_WAIT_ACK, -- Store wait ack - NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack - + NC_LOAD_WAIT_ACK, -- Non-cachable load wait ack + DO_STCX); -- Check for stcx. validity -- -- Dcache operations: @@ -290,6 +289,9 @@ architecture rtl of dcache is op : op_t; valid : std_ulogic; dcbz : std_ulogic; + reserve : std_ulogic; + first_dw : std_ulogic; + last_dw : std_ulogic; real_addr : real_addr_t; data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); @@ -365,10 +367,12 @@ architecture rtl of dcache is -- type reservation_t is record valid : std_ulogic; - addr : std_ulogic_vector(63 downto LINE_OFF_BITS); + addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); end record; signal reservation : reservation_t; + signal kill_rsrv : std_ulogic; + signal kill_rsrv2 : std_ulogic; -- Async signals on incoming request signal req_index : index_t; @@ -382,10 +386,6 @@ architecture rtl of dcache is signal early_req_row : row_t; signal early_rd_valid : std_ulogic; - signal cancel_store : std_ulogic; - signal set_rsrv : std_ulogic; - signal clear_rsrv : std_ulogic; - signal r0_valid : std_ulogic; signal r0_stall : std_ulogic; @@ -427,10 +427,13 @@ architecture rtl of dcache is -- TLB PLRU output interface signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal snoop_active : std_ulogic; signal snoop_tag_set : cache_tags_set_t; signal snoop_valid : std_ulogic; - signal snoop_wrtag : cache_tag_t; - signal snoop_index : index_t; + signal snoop_paddr : real_addr_t; + signal snoop_addr : real_addr_t; + signal snoop_hits : cache_way_valids_t; + signal req_snoop_hit : std_ulogic; -- -- Helper functions to decode incoming requests @@ -861,26 +864,45 @@ begin end if; end process; + -- Snoop logic + -- Don't snoop our own cycles + snoop_addr <= addr_to_real(wb_to_addr(snoop_in.adr)); + snoop_active <= snoop_in.cyc and snoop_in.stb and snoop_in.we and + not (r1.wb.cyc and not wishbone_in.stall); + kill_rsrv <= '1' when (snoop_active = '1' and reservation.valid = '1' and + snoop_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) + else '0'; + -- Cache tag RAM second read port, for snooping cache_tag_read_2 : process(clk) - variable addr : real_addr_t; begin if rising_edge(clk) then - -- Don't snoop our own cycles - snoop_valid <= '0'; - if not (r1.wb.cyc = '1' and wishbone_in.stall = '0') then - if (snoop_in.cyc and snoop_in.stb and snoop_in.we) = '1' then - snoop_valid <= '1'; - addr := addr_to_real(wb_to_addr(snoop_in.adr)); - assert not is_X(addr); - snoop_tag_set <= cache_tags(to_integer(get_index(addr))); - snoop_wrtag <= get_tag(addr); - snoop_index <= get_index(addr); - end if; + if is_X(snoop_addr) then + snoop_tag_set <= (others => 'X'); + else + snoop_tag_set <= cache_tags(to_integer(get_index(snoop_addr))); end if; + snoop_paddr <= snoop_addr; + snoop_valid <= snoop_active; end if; end process; + -- Compare the previous cycle's snooped store address to the reservation, + -- to catch the case where a write happens on cycle 1 of a cached larx + kill_rsrv2 <= '1' when (snoop_valid = '1' and reservation.valid = '1' and + snoop_paddr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) + else '0'; + + snoop_tag_match : process(all) + begin + snoop_hits <= (others => '0'); + for i in 0 to NUM_WAYS-1 loop + if snoop_valid = '1' and read_tag(i, snoop_tag_set) = get_tag(snoop_paddr) then + snoop_hits(i) <= '1'; + end if; + end loop; + end process; + -- Cache request parsing and hit detection dcache_request : process(all) variable req_row : row_t; @@ -901,6 +923,8 @@ begin variable rel_match : std_ulogic; variable fwd_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable fwd_match : std_ulogic; + variable snp_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable snoop_match : std_ulogic; begin -- Extract line, row and tag from request rindex := get_index(r0.req.addr); @@ -924,9 +948,11 @@ begin is_hit := '0'; rel_match := '0'; fwd_match := '0'; + snoop_match := '0'; if r0.req.virt_mode = '1' then rel_matches := (others => '0'); fwd_matches := (others => '0'); + snp_matches := (others => '0'); for j in tlb_way_t loop hit_way_set(j) := to_unsigned(0, WAY_BITS); s_hit := '0'; @@ -943,6 +969,9 @@ begin tlb_valid_way(j) = '1' then hit_way_set(j) := to_unsigned(i, WAY_BITS); s_hit := '1'; + if snoop_hits(i) = '1' then + snp_matches(j) := '1'; + end if; end if; end loop; hit_set(j) := s_hit; @@ -959,6 +988,7 @@ begin hit_way := hit_way_set(to_integer(tlb_hit_way)); rel_match := rel_matches(to_integer(tlb_hit_way)); fwd_match := fwd_matches(to_integer(tlb_hit_way)); + snoop_match := snp_matches(to_integer(tlb_hit_way)); end if; else s_tag := get_tag(r0.req.addr); @@ -970,6 +1000,9 @@ begin read_tag(i, cache_tag_set) = s_tag then hit_way := to_unsigned(i, WAY_BITS); is_hit := '1'; + if snoop_hits(i) = '1' then + snoop_match := '1'; + end if; end if; end loop; if go = '1' and not is_X(r1.reload_tag) and s_tag = r1.reload_tag then @@ -982,6 +1015,13 @@ begin req_same_tag <= rel_match; fwd_same_tag <= fwd_match; + -- This is 1 if the snooped write from the previous cycle hits the same + -- cache line that is being accessed in this cycle. + req_snoop_hit <= '0'; + if go = '1' and snoop_match = '1' and get_index(snoop_paddr) = rindex then + req_snoop_hit <= '1'; + end if; + -- Whether to use forwarded data for a load or not use_forward_st <= '0'; use_forward_rl <= '0'; @@ -1060,8 +1100,6 @@ begin if go = '1' then if access_ok = '0' then op := OP_BAD; - elsif cancel_store = '1' then - op := OP_STCX_FAIL; else opsel := r0.req.load & nc & is_hit; case opsel is @@ -1101,45 +1139,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Handle load-with-reservation and store-conditional instructions - reservation_comb: process(all) - begin - cancel_store <= '0'; - set_rsrv <= '0'; - clear_rsrv <= '0'; - if r0_valid = '1' and r0.req.reserve = '1' then - -- XXX generate alignment interrupt if address is not aligned - -- XXX or if r0.req.nc = '1' - if r0.req.load = '1' then - -- load with reservation - set_rsrv <= not r0.req.atomic_qw or r0.req.atomic_last; - else - -- store conditional - clear_rsrv <= not r0.req.atomic_qw or r0.req.atomic_last; - if reservation.valid = '0' or - r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then - cancel_store <= '1'; - end if; - end if; - end if; - end process; - - reservation_reg: process(clk) - begin - if rising_edge(clk) then - if rst = '1' then - reservation.valid <= '0'; - elsif r0_valid = '1' and access_ok = '1' then - if clear_rsrv = '1' then - reservation.valid <= '0'; - elsif set_rsrv = '1' then - reservation.valid <= '1'; - reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); - end if; - end if; - end if; - end process; - -- Return data for loads & completion control logic -- writeback_control: process(all) @@ -1367,12 +1366,6 @@ begin r1.cache_paradox <= '0'; end if; - if req_op = OP_STCX_FAIL then - r1.stcx_fail <= '1'; - else - r1.stcx_fail <= '0'; - end if; - -- Record TLB hit information for updating TLB PLRU r1.tlb_hit <= tlb_hit; r1.tlb_hit_way <= tlb_hit_way; @@ -1423,6 +1416,8 @@ begin r1.acks_pending <= to_unsigned(0, 3); r1.stalled <= '0'; r1.dec_acks <= '0'; + reservation.valid <= '0'; + reservation.addr <= (others => '0'); -- Not useful normally but helps avoiding tons of sim warnings r1.wb.adr <= (others => '0'); @@ -1430,27 +1425,40 @@ begin -- One cycle pulses reset r1.slow_valid <= '0'; r1.write_bram <= '0'; + r1.stcx_fail <= '0'; r1.ls_valid <= '0'; -- complete tlbies and TLB loads in the third cycle r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); - if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then + if req_op = OP_LOAD_HIT then if r0.mmu_req = '0' then r1.ls_valid <= '1'; else r1.mmu_done <= '1'; end if; end if; + -- The kill_rsrv2 term covers the case where the reservation + -- address was set at the beginning of this cycle, and a store + -- to that address happened in the previous cycle. + if kill_rsrv = '1' or kill_rsrv2 = '1' then + reservation.valid <= '0'; + end if; + if req_go = '1' and access_ok = '1' and r0.req.load = '1' and + r0.req.reserve = '1' and r0.req.atomic_first = '1' then + reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); + if req_op = OP_LOAD_HIT then + reservation.valid <= not req_snoop_hit; + end if; + end if; -- Do invalidations from snooped stores to memory if snoop_valid = '1' then - assert not is_X(snoop_tag_set); - assert not is_X(snoop_wrtag); + assert not is_X(snoop_paddr); + assert not is_X(snoop_hits); end if; for i in 0 to NUM_WAYS-1 loop - if snoop_valid = '1' and read_tag(i, snoop_tag_set) = snoop_wrtag then - assert not is_X(snoop_index); - cache_valids(to_integer(snoop_index))(i) <= '0'; + if snoop_hits(i) = '1' then + cache_valids(to_integer(get_index(snoop_paddr)))(i) <= '0'; end if; end loop; @@ -1477,6 +1485,9 @@ begin req.valid := req_go; req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; + req.reserve := r0.req.reserve; + req.first_dw := r0.req.atomic_first; + req.last_dw := r0.req.atomic_last; req.real_addr := ra; -- Force data to 0 for dcbz if r0.req.dcbz = '1' then @@ -1581,7 +1592,11 @@ begin r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - if req.dcbz = '0' then + if req.reserve = '1' then + -- stcx needs to wait until next cycle + -- for the reservation address check + r1.state <= DO_STCX; + elsif req.dcbz = '0' then r1.state <= STORE_WAIT_ACK; r1.full <= '0'; r1.slow_valid <= '1'; @@ -1593,6 +1608,9 @@ begin if req.op = OP_STORE_HIT then r1.write_bram <= '1'; end if; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; else -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading @@ -1600,19 +1618,18 @@ begin if req.op = OP_STORE_MISS then r1.write_tag <= '1'; end if; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; end if; - r1.wb.we <= '1'; - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; if req.op = OP_STORE_MISS then ev.store_miss <= '1'; end if; -- OP_NONE and OP_BAD do nothing - -- OP_BAD & OP_STCX_FAIL were handled above already + -- OP_BAD was handled above already when OP_NONE => when OP_BAD => - when OP_STCX_FAIL => end case; when RELOAD_WAIT_ACK => @@ -1652,6 +1669,13 @@ begin else r1.mmu_done <= '1'; end if; + -- NB: for lqarx, set the reservation on the first + -- dword so that a snooped store between the two + -- dwords will kill the reservation. + if req.reserve = '1' and req.first_dw = '1' then + reservation.valid <= '1'; + reservation.addr <= req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); + end if; end if; -- Check for completion @@ -1736,6 +1760,48 @@ begin r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; + + when DO_STCX => + if reservation.valid = '0' or kill_rsrv = '1' or + r1.req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) /= reservation.addr then + -- Wrong address, didn't have reservation, or lost reservation + -- Abandon the wishbone cycle if started and fail the stcx. + r1.stcx_fail <= '1'; + r1.full <= '0'; + r1.ls_valid <= '1'; + r1.state <= IDLE; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + reservation.valid <= '0'; + elsif r1.wb.cyc = '0' then + -- Right address and have reservation, so start the + -- wishbone cycle + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + else + if wishbone_in.stall = '0' then + -- Store has been accepted, so now we can write the + -- cache data RAM + if r1.req.op = OP_STORE_HIT then + r1.write_bram <= '1'; + end if; + r1.wb.stb <= '0'; + end if; + if wishbone_in.ack = '1' then + r1.state <= IDLE; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + -- For stqcx., kill the reservation on the last dword + if r1.req.last_dw = '1' then + reservation.valid <= '0'; + end if; + end if; + end if; + end case; end if; end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index dcacc75..e69a27e 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -85,6 +85,7 @@ architecture behave of loadstore1 is xerc : xer_common_t; reserve : std_ulogic; atomic_qw : std_ulogic; + atomic_first : std_ulogic; atomic_last : std_ulogic; rc : std_ulogic; nc : std_ulogic; -- non-cacheable access @@ -110,7 +111,7 @@ architecture behave of loadstore1 is elt_length => x"0", byte_reverse => '0', brev_mask => "000", sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', - atomic_qw => '0', atomic_last => '0', + atomic_qw => '0', atomic_first => '0', atomic_last => '0', rc => '0', nc => '0', virt_mode => '0', priv_mode => '0', load_sp => '0', sprsel => "00", ric => "00", is_slbia => '0', align_intr => '0', @@ -478,20 +479,20 @@ begin -- check alignment for larx/stcx misaligned := or (addr_mask and addr(2 downto 0)); + if l_in.repeat = '1' and l_in.update = '0' and addr(3) /= l_in.second then + misaligned := '1'; + end if; v.align_intr := l_in.reserve and misaligned; + v.atomic_first := not misaligned and not l_in.second; + v.atomic_last := not misaligned and (l_in.second or not l_in.repeat); + -- is this a quadword load or store? i.e. lq plq stq pstq lqarx stqcx. if l_in.repeat = '1' and l_in.update = '0' then - -- is the access aligned? - if misaligned = '0' and addr(3) = l_in.second then + if misaligned = '0' then -- Since the access is aligned we have to do it atomically v.atomic_qw := '1'; - v.atomic_last := l_in.second; else - -- lqarx/stqcx have to be aligned - if l_in.reserve = '1' then - v.align_intr := '1'; - end if; -- We require non-prefixed lq in LE mode to be aligned in order -- to avoid the case where RA = RT+1 and the second access faults -- after the first has overwritten RA. @@ -979,6 +980,7 @@ begin d_out.nc <= stage1_req.nc; d_out.reserve <= stage1_req.reserve; d_out.atomic_qw <= stage1_req.atomic_qw; + d_out.atomic_first <= stage1_req.atomic_first; d_out.atomic_last <= stage1_req.atomic_last; d_out.addr <= stage1_req.addr; d_out.byte_sel <= stage1_req.byte_sel; @@ -991,6 +993,7 @@ begin d_out.nc <= r2.req.nc; d_out.reserve <= r2.req.reserve; d_out.atomic_qw <= r2.req.atomic_qw; + d_out.atomic_first <= r2.req.atomic_first; d_out.atomic_last <= r2.req.atomic_last; d_out.addr <= r2.req.addr; d_out.byte_sel <= r2.req.byte_sel;