diff --git a/common.vhdl b/common.vhdl index 3af1d7b..6df5b6b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -606,6 +606,7 @@ package common is nc : std_ulogic; reserve : std_ulogic; atomic_qw : std_ulogic; -- part of a quadword atomic op + atomic_first : std_ulogic; atomic_last : std_ulogic; virt_mode : std_ulogic; priv_mode : std_ulogic; diff --git a/dcache.vhdl b/dcache.vhdl index 807a2dc..68f3b60 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -184,7 +184,6 @@ architecture rtl of dcache is -- Type of operation on a "valid" input type op_t is (OP_NONE, OP_BAD, -- NC cache hit, TLB miss, prot/RC failure - OP_STCX_FAIL, -- conditional store w/o reservation OP_LOAD_HIT, -- Cache hit on load OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable load @@ -195,8 +194,8 @@ architecture rtl of dcache is type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack STORE_WAIT_ACK, -- Store wait ack - NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack - + NC_LOAD_WAIT_ACK, -- Non-cachable load wait ack + DO_STCX); -- Check for stcx. validity -- -- Dcache operations: @@ -290,6 +289,9 @@ architecture rtl of dcache is op : op_t; valid : std_ulogic; dcbz : std_ulogic; + reserve : std_ulogic; + first_dw : std_ulogic; + last_dw : std_ulogic; real_addr : real_addr_t; data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); @@ -365,10 +367,12 @@ architecture rtl of dcache is -- type reservation_t is record valid : std_ulogic; - addr : std_ulogic_vector(63 downto LINE_OFF_BITS); + addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); end record; signal reservation : reservation_t; + signal kill_rsrv : std_ulogic; + signal kill_rsrv2 : std_ulogic; -- Async signals on incoming request signal req_index : index_t; @@ -382,10 +386,6 @@ architecture rtl of dcache is signal early_req_row : row_t; signal early_rd_valid : std_ulogic; - signal cancel_store : std_ulogic; - signal set_rsrv : std_ulogic; - signal clear_rsrv : std_ulogic; - signal r0_valid : std_ulogic; signal r0_stall : std_ulogic; @@ -427,10 +427,13 @@ architecture rtl of dcache is -- TLB PLRU output interface signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal snoop_active : std_ulogic; signal snoop_tag_set : cache_tags_set_t; signal snoop_valid : std_ulogic; - signal snoop_wrtag : cache_tag_t; - signal snoop_index : index_t; + signal snoop_paddr : real_addr_t; + signal snoop_addr : real_addr_t; + signal snoop_hits : cache_way_valids_t; + signal req_snoop_hit : std_ulogic; -- -- Helper functions to decode incoming requests @@ -861,26 +864,45 @@ begin end if; end process; + -- Snoop logic + -- Don't snoop our own cycles + snoop_addr <= addr_to_real(wb_to_addr(snoop_in.adr)); + snoop_active <= snoop_in.cyc and snoop_in.stb and snoop_in.we and + not (r1.wb.cyc and not wishbone_in.stall); + kill_rsrv <= '1' when (snoop_active = '1' and reservation.valid = '1' and + snoop_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) + else '0'; + -- Cache tag RAM second read port, for snooping cache_tag_read_2 : process(clk) - variable addr : real_addr_t; begin if rising_edge(clk) then - -- Don't snoop our own cycles - snoop_valid <= '0'; - if not (r1.wb.cyc = '1' and wishbone_in.stall = '0') then - if (snoop_in.cyc and snoop_in.stb and snoop_in.we) = '1' then - snoop_valid <= '1'; - addr := addr_to_real(wb_to_addr(snoop_in.adr)); - assert not is_X(addr); - snoop_tag_set <= cache_tags(to_integer(get_index(addr))); - snoop_wrtag <= get_tag(addr); - snoop_index <= get_index(addr); - end if; + if is_X(snoop_addr) then + snoop_tag_set <= (others => 'X'); + else + snoop_tag_set <= cache_tags(to_integer(get_index(snoop_addr))); end if; + snoop_paddr <= snoop_addr; + snoop_valid <= snoop_active; end if; end process; + -- Compare the previous cycle's snooped store address to the reservation, + -- to catch the case where a write happens on cycle 1 of a cached larx + kill_rsrv2 <= '1' when (snoop_valid = '1' and reservation.valid = '1' and + snoop_paddr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) + else '0'; + + snoop_tag_match : process(all) + begin + snoop_hits <= (others => '0'); + for i in 0 to NUM_WAYS-1 loop + if snoop_valid = '1' and read_tag(i, snoop_tag_set) = get_tag(snoop_paddr) then + snoop_hits(i) <= '1'; + end if; + end loop; + end process; + -- Cache request parsing and hit detection dcache_request : process(all) variable req_row : row_t; @@ -901,6 +923,8 @@ begin variable rel_match : std_ulogic; variable fwd_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable fwd_match : std_ulogic; + variable snp_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable snoop_match : std_ulogic; begin -- Extract line, row and tag from request rindex := get_index(r0.req.addr); @@ -924,9 +948,11 @@ begin is_hit := '0'; rel_match := '0'; fwd_match := '0'; + snoop_match := '0'; if r0.req.virt_mode = '1' then rel_matches := (others => '0'); fwd_matches := (others => '0'); + snp_matches := (others => '0'); for j in tlb_way_t loop hit_way_set(j) := to_unsigned(0, WAY_BITS); s_hit := '0'; @@ -943,6 +969,9 @@ begin tlb_valid_way(j) = '1' then hit_way_set(j) := to_unsigned(i, WAY_BITS); s_hit := '1'; + if snoop_hits(i) = '1' then + snp_matches(j) := '1'; + end if; end if; end loop; hit_set(j) := s_hit; @@ -959,6 +988,7 @@ begin hit_way := hit_way_set(to_integer(tlb_hit_way)); rel_match := rel_matches(to_integer(tlb_hit_way)); fwd_match := fwd_matches(to_integer(tlb_hit_way)); + snoop_match := snp_matches(to_integer(tlb_hit_way)); end if; else s_tag := get_tag(r0.req.addr); @@ -970,6 +1000,9 @@ begin read_tag(i, cache_tag_set) = s_tag then hit_way := to_unsigned(i, WAY_BITS); is_hit := '1'; + if snoop_hits(i) = '1' then + snoop_match := '1'; + end if; end if; end loop; if go = '1' and not is_X(r1.reload_tag) and s_tag = r1.reload_tag then @@ -982,6 +1015,13 @@ begin req_same_tag <= rel_match; fwd_same_tag <= fwd_match; + -- This is 1 if the snooped write from the previous cycle hits the same + -- cache line that is being accessed in this cycle. + req_snoop_hit <= '0'; + if go = '1' and snoop_match = '1' and get_index(snoop_paddr) = rindex then + req_snoop_hit <= '1'; + end if; + -- Whether to use forwarded data for a load or not use_forward_st <= '0'; use_forward_rl <= '0'; @@ -1060,8 +1100,6 @@ begin if go = '1' then if access_ok = '0' then op := OP_BAD; - elsif cancel_store = '1' then - op := OP_STCX_FAIL; else opsel := r0.req.load & nc & is_hit; case opsel is @@ -1101,45 +1139,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Handle load-with-reservation and store-conditional instructions - reservation_comb: process(all) - begin - cancel_store <= '0'; - set_rsrv <= '0'; - clear_rsrv <= '0'; - if r0_valid = '1' and r0.req.reserve = '1' then - -- XXX generate alignment interrupt if address is not aligned - -- XXX or if r0.req.nc = '1' - if r0.req.load = '1' then - -- load with reservation - set_rsrv <= not r0.req.atomic_qw or r0.req.atomic_last; - else - -- store conditional - clear_rsrv <= not r0.req.atomic_qw or r0.req.atomic_last; - if reservation.valid = '0' or - r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then - cancel_store <= '1'; - end if; - end if; - end if; - end process; - - reservation_reg: process(clk) - begin - if rising_edge(clk) then - if rst = '1' then - reservation.valid <= '0'; - elsif r0_valid = '1' and access_ok = '1' then - if clear_rsrv = '1' then - reservation.valid <= '0'; - elsif set_rsrv = '1' then - reservation.valid <= '1'; - reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); - end if; - end if; - end if; - end process; - -- Return data for loads & completion control logic -- writeback_control: process(all) @@ -1367,12 +1366,6 @@ begin r1.cache_paradox <= '0'; end if; - if req_op = OP_STCX_FAIL then - r1.stcx_fail <= '1'; - else - r1.stcx_fail <= '0'; - end if; - -- Record TLB hit information for updating TLB PLRU r1.tlb_hit <= tlb_hit; r1.tlb_hit_way <= tlb_hit_way; @@ -1423,6 +1416,8 @@ begin r1.acks_pending <= to_unsigned(0, 3); r1.stalled <= '0'; r1.dec_acks <= '0'; + reservation.valid <= '0'; + reservation.addr <= (others => '0'); -- Not useful normally but helps avoiding tons of sim warnings r1.wb.adr <= (others => '0'); @@ -1430,27 +1425,40 @@ begin -- One cycle pulses reset r1.slow_valid <= '0'; r1.write_bram <= '0'; + r1.stcx_fail <= '0'; r1.ls_valid <= '0'; -- complete tlbies and TLB loads in the third cycle r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); - if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then + if req_op = OP_LOAD_HIT then if r0.mmu_req = '0' then r1.ls_valid <= '1'; else r1.mmu_done <= '1'; end if; end if; + -- The kill_rsrv2 term covers the case where the reservation + -- address was set at the beginning of this cycle, and a store + -- to that address happened in the previous cycle. + if kill_rsrv = '1' or kill_rsrv2 = '1' then + reservation.valid <= '0'; + end if; + if req_go = '1' and access_ok = '1' and r0.req.load = '1' and + r0.req.reserve = '1' and r0.req.atomic_first = '1' then + reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); + if req_op = OP_LOAD_HIT then + reservation.valid <= not req_snoop_hit; + end if; + end if; -- Do invalidations from snooped stores to memory if snoop_valid = '1' then - assert not is_X(snoop_tag_set); - assert not is_X(snoop_wrtag); + assert not is_X(snoop_paddr); + assert not is_X(snoop_hits); end if; for i in 0 to NUM_WAYS-1 loop - if snoop_valid = '1' and read_tag(i, snoop_tag_set) = snoop_wrtag then - assert not is_X(snoop_index); - cache_valids(to_integer(snoop_index))(i) <= '0'; + if snoop_hits(i) = '1' then + cache_valids(to_integer(get_index(snoop_paddr)))(i) <= '0'; end if; end loop; @@ -1477,6 +1485,9 @@ begin req.valid := req_go; req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; + req.reserve := r0.req.reserve; + req.first_dw := r0.req.atomic_first; + req.last_dw := r0.req.atomic_last; req.real_addr := ra; -- Force data to 0 for dcbz if r0.req.dcbz = '1' then @@ -1581,7 +1592,11 @@ begin r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - if req.dcbz = '0' then + if req.reserve = '1' then + -- stcx needs to wait until next cycle + -- for the reservation address check + r1.state <= DO_STCX; + elsif req.dcbz = '0' then r1.state <= STORE_WAIT_ACK; r1.full <= '0'; r1.slow_valid <= '1'; @@ -1593,6 +1608,9 @@ begin if req.op = OP_STORE_HIT then r1.write_bram <= '1'; end if; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; else -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading @@ -1600,19 +1618,18 @@ begin if req.op = OP_STORE_MISS then r1.write_tag <= '1'; end if; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; end if; - r1.wb.we <= '1'; - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; if req.op = OP_STORE_MISS then ev.store_miss <= '1'; end if; -- OP_NONE and OP_BAD do nothing - -- OP_BAD & OP_STCX_FAIL were handled above already + -- OP_BAD was handled above already when OP_NONE => when OP_BAD => - when OP_STCX_FAIL => end case; when RELOAD_WAIT_ACK => @@ -1652,6 +1669,13 @@ begin else r1.mmu_done <= '1'; end if; + -- NB: for lqarx, set the reservation on the first + -- dword so that a snooped store between the two + -- dwords will kill the reservation. + if req.reserve = '1' and req.first_dw = '1' then + reservation.valid <= '1'; + reservation.addr <= req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); + end if; end if; -- Check for completion @@ -1736,6 +1760,48 @@ begin r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; + + when DO_STCX => + if reservation.valid = '0' or kill_rsrv = '1' or + r1.req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) /= reservation.addr then + -- Wrong address, didn't have reservation, or lost reservation + -- Abandon the wishbone cycle if started and fail the stcx. + r1.stcx_fail <= '1'; + r1.full <= '0'; + r1.ls_valid <= '1'; + r1.state <= IDLE; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + reservation.valid <= '0'; + elsif r1.wb.cyc = '0' then + -- Right address and have reservation, so start the + -- wishbone cycle + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + else + if wishbone_in.stall = '0' then + -- Store has been accepted, so now we can write the + -- cache data RAM + if r1.req.op = OP_STORE_HIT then + r1.write_bram <= '1'; + end if; + r1.wb.stb <= '0'; + end if; + if wishbone_in.ack = '1' then + r1.state <= IDLE; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + -- For stqcx., kill the reservation on the last dword + if r1.req.last_dw = '1' then + reservation.valid <= '0'; + end if; + end if; + end if; + end case; end if; end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index dcacc75..e69a27e 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -85,6 +85,7 @@ architecture behave of loadstore1 is xerc : xer_common_t; reserve : std_ulogic; atomic_qw : std_ulogic; + atomic_first : std_ulogic; atomic_last : std_ulogic; rc : std_ulogic; nc : std_ulogic; -- non-cacheable access @@ -110,7 +111,7 @@ architecture behave of loadstore1 is elt_length => x"0", byte_reverse => '0', brev_mask => "000", sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', - atomic_qw => '0', atomic_last => '0', + atomic_qw => '0', atomic_first => '0', atomic_last => '0', rc => '0', nc => '0', virt_mode => '0', priv_mode => '0', load_sp => '0', sprsel => "00", ric => "00", is_slbia => '0', align_intr => '0', @@ -478,20 +479,20 @@ begin -- check alignment for larx/stcx misaligned := or (addr_mask and addr(2 downto 0)); + if l_in.repeat = '1' and l_in.update = '0' and addr(3) /= l_in.second then + misaligned := '1'; + end if; v.align_intr := l_in.reserve and misaligned; + v.atomic_first := not misaligned and not l_in.second; + v.atomic_last := not misaligned and (l_in.second or not l_in.repeat); + -- is this a quadword load or store? i.e. lq plq stq pstq lqarx stqcx. if l_in.repeat = '1' and l_in.update = '0' then - -- is the access aligned? - if misaligned = '0' and addr(3) = l_in.second then + if misaligned = '0' then -- Since the access is aligned we have to do it atomically v.atomic_qw := '1'; - v.atomic_last := l_in.second; else - -- lqarx/stqcx have to be aligned - if l_in.reserve = '1' then - v.align_intr := '1'; - end if; -- We require non-prefixed lq in LE mode to be aligned in order -- to avoid the case where RA = RT+1 and the second access faults -- after the first has overwritten RA. @@ -979,6 +980,7 @@ begin d_out.nc <= stage1_req.nc; d_out.reserve <= stage1_req.reserve; d_out.atomic_qw <= stage1_req.atomic_qw; + d_out.atomic_first <= stage1_req.atomic_first; d_out.atomic_last <= stage1_req.atomic_last; d_out.addr <= stage1_req.addr; d_out.byte_sel <= stage1_req.byte_sel; @@ -991,6 +993,7 @@ begin d_out.nc <= r2.req.nc; d_out.reserve <= r2.req.reserve; d_out.atomic_qw <= r2.req.atomic_qw; + d_out.atomic_first <= r2.req.atomic_first; d_out.atomic_last <= r2.req.atomic_last; d_out.addr <= r2.req.addr; d_out.byte_sel <= r2.req.byte_sel;