dcache: Make aligned quadword loads and stores actually be atomic

This implements logic in the dcache to make aligned quadword loads and
stores atomic with respect to other mechanisms that access memory.
Such loads and stores are already marked with the atomic_qw bit in
Loadstore1ToDcacheType.

For quadword loads where the first dword access hits in the cache, we
record the fact of the hit and the cache way used (r1.prev_hit and
r1.prev_way).  The second dword access then assumes a hit on the same
way even if the cache line has been invalidated in the meantime by a
snooped store.  This gives the same effect as if both dwords had been
loaded at the time of the first dword load.  For a lqarx, the
reservation is set at the time of the first dword load, so if there is
such a snooped store, the reservation will be invalid by the time the
lqarx completes.

If the first dword load hits on the cache line that is currently
being refilled, so should the second, unless the refill finishes in
between.  To handle that case, we set r1.prev_hit and r1.prev_way
when the refill completes, so that the second load can use the line
just refilled (but only if the first dword hit the line being
refilled).
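
As an illustrative VHDL sketch of the way-reuse idea (the entity and
signal names below are invented for this example and are not the
actual dcache code; the refill tracking is reduced to a comment):

    library ieee;
    use ieee.std_logic_1164.all;

    -- The first dword of an atomic quadword load records whether it
    -- hit and on which way; the second dword then forces a hit on
    -- that same way, even if the line has been invalidated by a
    -- snoop in between (the lqarx reservation, set on the first
    -- dword, catches that case).
    entity qw_way_reuse is
        generic (WAY_BITS : natural := 2);
        port (
            clk          : in  std_ulogic;
            req_go       : in  std_ulogic;  -- a request is being accepted
            atomic_qw    : in  std_ulogic;  -- part of an atomic quadword access
            atomic_first : in  std_ulogic;  -- this is the first dword
            raw_hit      : in  std_ulogic;  -- hit as computed from the tags
            raw_way      : in  std_ulogic_vector(WAY_BITS - 1 downto 0);
            eff_hit      : out std_ulogic;  -- hit indication actually used
            eff_way      : out std_ulogic_vector(WAY_BITS - 1 downto 0)
        );
    end entity;

    architecture rtl of qw_way_reuse is
        signal prev_hit : std_ulogic := '0';
        signal prev_way : std_ulogic_vector(WAY_BITS - 1 downto 0)
            := (others => '0');
        signal use_prev : std_ulogic;
    begin
        -- Remember the outcome of the most recent access (the real
        -- code also records prev_hit_reload to cover the refill case)
        process(clk)
        begin
            if rising_edge(clk) then
                if req_go = '1' then
                    prev_hit <= raw_hit;
                    prev_way <= raw_way;
                end if;
            end if;
        end process;

        -- Second dword of the pair assumes a hit on the recorded way
        use_prev <= atomic_qw and not atomic_first and prev_hit;
        eff_hit  <= '1' when use_prev = '1' else raw_hit;
        eff_way  <= prev_way when use_prev = '1' else raw_way;
    end architecture;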

For stores, the req.atomic_more flag is set on the first dword store,
and that causes the STORE_WAIT_ACK state to wait for the next request
without dropping cyc, so it is not possible for another wishbone
master to insert an access between the writes of the two dwords to
memory.
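
As a hedged sketch of the bus behaviour this relies on (invented
names, much simplified; the real code counts acks in acks_pending and
shares this path with ordinary stores):

    library ieee;
    use ieee.std_logic_1164.all;
    use ieee.numeric_std.all;

    -- A two-beat wishbone store in which cyc stays asserted between
    -- the two dword writes, so the arbiter cannot grant the bus to
    -- another master in between.  The wait for the second beat
    -- corresponds to the atomic_more condition in STORE_WAIT_ACK.
    entity qw_store_sketch is
        port (
            clk        : in  std_ulogic;
            start      : in  std_ulogic;  -- first dword store presented
            second_req : in  std_ulogic;  -- second dword store presented
            wb_stall   : in  std_ulogic;
            wb_ack     : in  std_ulogic;
            wb_cyc     : out std_ulogic;
            wb_stb     : out std_ulogic
        );
    end entity;

    architecture rtl of qw_store_sketch is
        type state_t is (IDLE, FIRST_BEAT, WAIT_SECOND, SECOND_BEAT, WAIT_ACKS);
        signal state     : state_t := IDLE;
        signal acks_left : unsigned(1 downto 0) := "00";
    begin
        -- cyc is held from the first beat until the last ack;
        -- stb pulses once per beat
        wb_cyc <= '0' when state = IDLE else '1';
        wb_stb <= '1' when state = FIRST_BEAT or state = SECOND_BEAT else '0';

        process(clk)
        begin
            if rising_edge(clk) then
                if wb_ack = '1' then
                    acks_left <= acks_left - 1;
                end if;
                case state is
                    when IDLE =>
                        if start = '1' then
                            acks_left <= "10";  -- one ack per beat
                            state <= FIRST_BEAT;
                        end if;
                    when FIRST_BEAT =>
                        if wb_stall = '0' then
                            -- first store accepted; keep cyc asserted
                            -- while waiting for the second dword
                            state <= WAIT_SECOND;
                        end if;
                    when WAIT_SECOND =>
                        if second_req = '1' then
                            state <= SECOND_BEAT;
                        end if;
                    when SECOND_BEAT =>
                        if wb_stall = '0' then
                            state <= WAIT_ACKS;
                        end if;
                    when WAIT_ACKS =>
                        -- same shape as the STORE_WAIT_ACK exit test:
                        -- acks = 0 or (ack = 1 and acks = 1)
                        if acks_left = 0 or (wb_ack = '1' and acks_left = 1) then
                            state <= IDLE;
                        end if;
                end case;
            end if;
        end process;
    end architecture;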

For store-conditionals, DO_STCX state now transitions to
STORE_WAIT_ACK state once the store has been accepted (stall is
false).  This means that the second store for a stqcx can be handled
in the same way as the second store for a stq.  Once the first store
for a stqcx has succeeded, the second store is done unconditionally.
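
The decision reduces to something like this sketch (invented names;
in the real code it is folded into the DO_STCX and STORE_WAIT_ACK
states rather than being a separate block):

    library ieee;
    use ieee.std_logic_1164.all;

    -- Only the first dword of a stqcx consults the reservation; once
    -- the first store has been accepted on the bus, the second goes
    -- out unconditionally.
    entity stqcx_gate is
        port (
            first_dw     : in  std_ulogic;  -- first dword of the pair
            resv_ok      : in  std_ulogic;  -- reservation valid, address matches
            first_issued : in  std_ulogic;  -- first store already accepted
            do_store     : out std_ulogic;  -- issue this dword's store
            stcx_fail    : out std_ulogic   -- report stqcx. failure
        );
    end entity;

    architecture rtl of stqcx_gate is
    begin
        do_store  <= resv_ok when first_dw = '1' else first_issued;
        -- if the first half fails, the second half fails too, because
        -- the reservation is no longer valid by then
        stcx_fail <= (first_dw and not resv_ok) or
                     (not first_dw and not first_issued);
    end architecture;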

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

@@ -264,6 +264,23 @@ architecture rtl of dcache is
     -- subsequent load requests to the same line can be completed as
     -- soon as the necessary data comes in from memory, without
     -- waiting for the whole line to be read.
+    --
+    -- Aligned loads and stores of a doubleword or less are atomic
+    -- because they are done in a single wishbone operation.
+    -- For quadword atomic loads and stores we rely on the wishbone
+    -- arbiter not interrupting access to a target once it has first
+    -- given access; i.e. once we have the main wishbone, no other
+    -- master gets access until we drop cyc.
+    --
+    -- Note on loads potentially hitting the victim line that is
+    -- currently being replaced: the new tag is available starting
+    -- with the 3rd cycle of RELOAD_WAIT_ACK state. As long as the
+    -- first read on the wishbone takes at least one cycle (i.e. the
+    -- ack doesn't arrive in the same cycle as stb was asserted),
+    -- r1.full will be true at least until that 3rd cycle and so a load
+    -- following a load miss can't hit on the old tag of the victim
+    -- line. As long as ack is not generated combinationally from
+    -- stb, this will be fine.
 
     -- Stage 0 register, basically contains just the latched request
     type reg_stage_0_t is record
@@ -307,12 +324,16 @@ architecture rtl of dcache is
         full : std_ulogic; -- have uncompleted request
         mmu_req : std_ulogic; -- request is from MMU
         req : mem_access_request_t;
+        atomic_more : std_ulogic; -- atomic request isn't finished
 
         -- Cache hit state
         hit_way : way_t;
         hit_load_valid : std_ulogic;
         hit_index : index_t;
         cache_hit : std_ulogic;
+        prev_hit : std_ulogic;
+        prev_way : way_t;
+        prev_hit_reload : std_ulogic;
 
         -- TLB hit state
         tlb_hit : std_ulogic;
@@ -389,6 +410,7 @@ architecture rtl of dcache is
     signal req_same_tag : std_ulogic;
     signal req_go : std_ulogic;
     signal req_nc : std_ulogic;
+    signal req_hit_reload : std_ulogic;
 
     signal early_req_row : row_t;
     signal early_rd_valid : std_ulogic;
@@ -927,6 +949,7 @@ begin
         variable fwd_match : std_ulogic;
         variable snp_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0);
         variable snoop_match : std_ulogic;
+        variable hit_reload : std_ulogic;
     begin
         -- Extract line, row and tag from request
         rindex := get_index(r0.req.addr);
@@ -1071,6 +1094,7 @@ begin
             assert not is_X(rindex);
             assert not is_X(r1.store_index);
         end if;
+        hit_reload := '0';
         if r1.state = RELOAD_WAIT_ACK and rel_match = '1' and
             rindex = r1.store_index then
             -- Ignore is_hit from above, because a load miss writes the new tag
@@ -1085,11 +1109,23 @@ begin
                 r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or
                 use_forward_rl;
             hit_way := replace_way;
+            hit_reload := is_hit;
+        elsif r0.req.load = '1' and r0.req.atomic_qw = '1' and r0.req.atomic_first = '0' and
+            r0.req.nc = '0' and perm_attr.nocache = '0' and r1.prev_hit = '1' then
+            -- For the second half of an atomic quadword load, just use the
+            -- same way as the first half, without considering whether the line
+            -- is valid; it is as if we had read the second dword at the same
+            -- time as the first dword, and the line was valid back then.
+            -- (Cases where the line is currently being reloaded are handled above.)
+            -- NB lq to noncacheable isn't required to be atomic per the ISA.
+            is_hit := '1';
+            hit_way := r1.prev_way;
         end if;
 
         -- The way that matched on a hit
         req_hit_way <= hit_way;
         req_is_hit <= is_hit;
+        req_hit_reload <= hit_reload;
 
         -- work out whether we have permission for this access
         -- NB we don't yet implement AMR, thus no KUAP
@@ -1418,6 +1454,8 @@ begin
                 r1.acks_pending <= to_unsigned(0, 3);
                 r1.stalled <= '0';
                 r1.dec_acks <= '0';
+                r1.prev_hit <= '0';
+                r1.prev_hit_reload <= '0';
                 reservation.valid <= '0';
                 reservation.addr <= (others => '0');
 
@@ -1443,9 +1481,7 @@ begin
                 if req_go = '1' and access_ok = '1' and r0.req.load = '1' and
                     r0.req.reserve = '1' and r0.req.atomic_first = '1' then
                     reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS);
-                    if req_is_hit = '1' then
-                        reservation.valid <= not req_snoop_hit;
-                    end if;
+                    reservation.valid <= req_is_hit and not req_snoop_hit;
                 end if;
 
                 -- Do invalidations from snooped stores to memory
@@ -1488,8 +1524,8 @@ begin
             req.flush := r0.req.flush;
             req.touch := r0.req.touch;
             req.reserve := r0.req.reserve;
-            req.first_dw := r0.req.atomic_first;
-            req.last_dw := r0.req.atomic_last;
+            req.first_dw := not r0.req.atomic_qw or r0.req.atomic_first;
+            req.last_dw := not r0.req.atomic_qw or r0.req.atomic_last;
             req.real_addr := ra;
             -- Force data to 0 for dcbz
             if r0.req.dcbz = '1' then
@@ -1528,6 +1564,11 @@ begin
             if req_op_load_miss = '1' or (r0.req.dcbz = '1' and req_is_hit = '0') then
                 r1.choose_victim <= '1';
             end if;
+            if req_go = '1' then
+                r1.prev_hit <= req_is_hit;
+                r1.prev_way <= req_hit_way;
+                r1.prev_hit_reload <= req_hit_reload;
+            end if;
 
             -- Update count of pending acks
             acks := r1.acks_pending;
@@ -1549,6 +1590,7 @@ begin
                     r1.wb.sel <= req.byte_sel;
                     r1.wb.dat <= req.data;
                     r1.dcbz <= req.dcbz;
+                    r1.atomic_more <= not req.last_dw;
 
                     -- Keep track of our index and way for subsequent stores.
                     r1.store_index <= get_index(req.real_addr);
@@ -1659,7 +1701,7 @@ begin
                         assert not is_X(r1.req.real_addr);
                     end if;
                     if r1.full = '1' and r1.req.same_tag = '1' and
-                        ((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op_lmiss = '1') and
+                        ((r1.dcbz = '1' and r1.req.dcbz = '1') or r1.req.op_lmiss = '1') and
                         r1.store_row = get_row(r1.req.real_addr) then
                         r1.full <= '0';
                         r1.slow_valid <= '1';
@@ -1668,12 +1710,9 @@ begin
                        else
                            r1.mmu_done <= '1';
                        end if;
-                       -- NB: for lqarx, set the reservation on the first
-                       -- dword so that a snooped store between the two
-                       -- dwords will kill the reservation.
-                       if req.reserve = '1' and req.first_dw = '1' then
+                       -- NB: for lqarx, set the reservation on the first dword
+                       if r1.req.reserve = '1' and r1.req.first_dw = '1' then
                            reservation.valid <= '1';
-                           reservation.addr <= req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS);
                        end if;
                    end if;
 
@@ -1690,6 +1729,10 @@ begin
                        cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '1';
 
                        ev.dcache_refill <= not r1.dcbz;
+                       -- Second half of a lq/lqarx can assume a hit on this line now
+                       -- if the first half hit this line.
+                       r1.prev_hit <= r1.prev_hit_reload;
+                       r1.prev_way <= r1.store_way;
                        r1.state <= IDLE;
                    end if;
 
@@ -1703,6 +1746,10 @@ begin
                    if wishbone_in.stall = '0' then
                        -- See if there is another store waiting to be done
                        -- which is in the same real page.
+                       -- This could be either in r1.req or in r0.
+                       -- Ignore store-conditionals, they have to go through
+                       -- DO_STCX state, unless they are the second half of a
+                       -- successful stqcx, which is handled here.
                        if req.valid = '1' then
                            r1.wb.adr(SET_SIZE_BITS - ROW_OFF_BITS - 1 downto 0) <=
                                req.real_addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS);
@@ -1710,28 +1757,33 @@
                            r1.wb.sel <= req.byte_sel;
                        end if;
                        assert not is_X(acks);
-                       if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and
-                           req.op_store = '1' then
-                           r1.wb.stb <= '1';
-                           stbs_done := false;
-                           r1.store_way <= req.hit_way;
-                           r1.store_row <= get_row(req.real_addr);
-                           r1.write_bram <= req.is_hit;
-                           r1.full <= '0';
-                           r1.slow_valid <= '1';
-                           -- Store requests never come from the MMU
-                           r1.ls_valid <= '1';
+                       stbs_done := false;
+                       r1.wb.stb <= '0';
+                       if req.op_store = '1' and req.same_tag = '1' and req.dcbz = '0' and
+                           (req.reserve = '0' or r1.atomic_more = '1') then
+                           if acks < 7 then
+                               r1.wb.stb <= '1';
+                               stbs_done := false;
+                               r1.store_way <= req.hit_way;
+                               r1.store_row <= get_row(req.real_addr);
+                               r1.write_bram <= req.is_hit;
+                               r1.atomic_more <= not req.last_dw;
+                               r1.full <= '0';
+                               r1.slow_valid <= '1';
+                               -- Store requests never come from the MMU
+                               r1.ls_valid <= '1';
+                           end if;
                        else
-                           r1.wb.stb <= '0';
                            stbs_done := true;
+                           if req.valid = '1' then
+                               r1.atomic_more <= '0';
+                           end if;
                        end if;
                    end if;
 
                    -- Got ack ? See if complete.
-                   if wishbone_in.ack = '1' then
+                   if stbs_done and r1.atomic_more = '0' then
                        assert not is_X(acks);
-                       if stbs_done and acks = 1 then
+                       if acks = 0 or (wishbone_in.ack = '1' and acks = 1) then
                            r1.state <= IDLE;
                            r1.wb.cyc <= '0';
                            r1.wb.stb <= '0';
@@ -1770,31 +1822,30 @@
                        r1.wb.cyc <= '0';
                        r1.wb.stb <= '0';
+                       reservation.valid <= '0';
+                       -- If this is the first half of a stqcx., the second half
+                       -- will fail also because the reservation is not valid.
                        r1.state <= IDLE;
                    elsif r1.wb.cyc = '0' then
                        -- Right address and have reservation, so start the
                        -- wishbone cycle
                        r1.wb.we <= '1';
                        r1.wb.cyc <= '1';
                        r1.wb.stb <= '1';
-                   else
-                       if wishbone_in.stall = '0' then
-                           -- Store has been accepted, so now we can write the
-                           -- cache data RAM
-                           r1.write_bram <= req.is_hit;
-                           r1.wb.stb <= '0';
-                       end if;
-                       if wishbone_in.ack = '1' then
-                           r1.state <= IDLE;
-                           r1.wb.cyc <= '0';
-                           r1.wb.stb <= '0';
-                           r1.full <= '0';
-                           r1.slow_valid <= '1';
-                           r1.ls_valid <= '1';
-                           -- For stqcx., kill the reservation on the last dword
-                           if r1.req.last_dw = '1' then
-                               reservation.valid <= '0';
-                           end if;
-                       end if;
+                   elsif r1.wb.stb = '1' and wishbone_in.stall = '0' then
+                       -- Store has been accepted, so now we can write the
+                       -- cache data RAM and complete the request
+                       r1.write_bram <= r1.req.is_hit;
+                       r1.wb.stb <= '0';
+                       r1.full <= '0';
+                       r1.slow_valid <= '1';
+                       r1.ls_valid <= '1';
+                       reservation.valid <= '0';
+                       -- For a stqcx, STORE_WAIT_ACK will issue the second half
+                       -- without checking the reservation, which is what we want
+                       -- given that the first half has gone out.
+                       -- With r1.atomic_more set, STORE_WAIT_ACK won't exit to
+                       -- IDLE state until it sees the second half.
+                       r1.state <= STORE_WAIT_ACK;
                    end if;
 
                when FLUSH_CYCLE =>
