From a1f58679197fee9a8036539be585414ba0ee57df Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 7 Sep 2022 20:21:42 +1000 Subject: [PATCH] dcache: Split PLRU into storage and logic Rather than having update and decode logic for each individual PLRU as well as a register to store the current PLRU state, we now put the PLRU state in a little RAM, which will typically use LUT RAM on FPGAs, and have just a single copy of the logic to calculate the pseudo-LRU way and to update the PLRU state. The PLRU RAM that applies to the data storage (as opposed to the TLB) is read asynchronously in the cycle after the cache tag matching is done. At the end of that cycle the PLRU RAM entry is updated if the access was a cache hit, or a victim way is calculated and stored if the access was a cache miss. It is possible that a cache miss doesn't start being handled until later, in which case the stored victim way is used later when the miss gets handled. Similarly for the TLB PLRU, the RAM is read asynchronously in the cycle after a TLB lookup is done, and either updated at the end of that cycle (for a hit), or a victim is chosen and stored for when the TLB miss is satisfied. 
Signed-off-by: Paul Mackerras --- dcache.vhdl | 187 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 114 insertions(+), 73 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index a29cf6f..75c2ce0 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -317,6 +317,7 @@ architecture rtl of dcache is tlb_hit : std_ulogic; tlb_hit_way : tlb_way_sig_t; tlb_hit_index : tlb_index_sig_t; + tlb_victim : tlb_way_sig_t; -- data buffer for data forwarded from writes to reads forward_data : std_ulogic_vector(63 downto 0); @@ -342,6 +343,8 @@ architecture rtl of dcache is acks_pending : unsigned(2 downto 0); inc_acks : std_ulogic; dec_acks : std_ulogic; + choose_victim : std_ulogic; + victim_way : way_t; -- Signals to complete (possibly with error) ls_valid : std_ulogic; @@ -398,8 +401,7 @@ architecture rtl of dcache is signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0); -- PLRU output interface - type plru_out_t is array(0 to NUM_LINES-1) of std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_victim : plru_out_t; + signal plru_victim : way_t; signal replace_way : way_t; -- Wishbone read/write/cache write formatting signals @@ -423,8 +425,7 @@ architecture rtl of dcache is signal tlb_miss : std_ulogic; -- TLB PLRU output interface - type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); - signal tlb_plru_victim : tlb_plru_out_t; + signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); signal snoop_tag_set : cache_tags_set_t; signal snoop_valid : std_ulogic; @@ -650,39 +651,49 @@ begin end process; -- Generate TLB PLRUs - maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate + maybe_tlb_plrus : if TLB_NUM_WAYS > 1 generate + type tlb_plru_array is array(tlb_index_t) of std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0); + signal tlb_plru_ram : tlb_plru_array; + signal tlb_plru_cur : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0); + signal tlb_plru_upd : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0); + signal tlb_plru_acc : 
std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); begin - tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate - -- TLB PLRU interface - signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); - signal tlb_plru_acc_en : std_ulogic; - signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); - begin - tlb_plru : entity work.plru - generic map ( - BITS => TLB_WAY_BITS - ) - port map ( - clk => clk, - rst => rst, - acc => tlb_plru_acc, - acc_en => tlb_plru_acc_en, - lru => tlb_plru_out - ); - - process(all) - begin - -- PLRU interface - if not is_X(r1.tlb_hit_index) and r1.tlb_hit_index = i then - tlb_plru_acc_en <= r1.tlb_hit; - assert not is_X(r1.tlb_hit_way); - else - tlb_plru_acc_en <= '0'; - end if; - tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way); - tlb_plru_victim(i) <= tlb_plru_out; - end process; - end generate; + tlb_plru : entity work.plrufn + generic map ( + BITS => TLB_WAY_BITS + ) + port map ( + acc => tlb_plru_acc, + tree_in => tlb_plru_cur, + tree_out => tlb_plru_upd, + lru => tlb_plru_out + ); + + process(all) + begin + -- Read PLRU bits from array + if is_X(r1.tlb_hit_index) then + tlb_plru_cur <= (others => 'X'); + else + tlb_plru_cur <= tlb_plru_ram(to_integer(r1.tlb_hit_index)); + end if; + + -- PLRU interface + tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way); + tlb_plru_victim <= tlb_plru_out; + end process; + + -- synchronous writes to TLB PLRU array + process(clk) + begin + if rising_edge(clk) then + if r1.tlb_hit = '1' then + assert not is_X(r1.tlb_hit_index) severity failure; + tlb_plru_ram(to_integer(r1.tlb_hit_index)) <= tlb_plru_upd; + end if; + end if; + end process; end generate; tlb_search : process(all) @@ -753,7 +764,7 @@ begin if tlb_hit = '1' then repl_way := tlb_hit_way; else - repl_way := unsigned(tlb_plru_victim(to_integer(tlb_req_index))); + repl_way := unsigned(r1.tlb_victim); end if; assert not is_X(repl_way); end if; @@ -770,39 +781,49 @@ begin 
end process; -- Generate PLRUs - maybe_plrus: if NUM_WAYS > 1 generate + maybe_plrus : if NUM_WAYS > 1 generate + type plru_array is array(0 to NUM_LINES-1) of std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_ram : plru_array; + signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0); + signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); begin - plrus: for i in 0 to NUM_LINES-1 generate - -- PLRU interface - signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); - signal plru_acc_en : std_ulogic; - signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); - - begin - plru : entity work.plru - generic map ( - BITS => WAY_BITS - ) - port map ( - clk => clk, - rst => rst, - acc => plru_acc, - acc_en => plru_acc_en, - lru => plru_out - ); - - process(all) - begin - -- PLRU interface - if not is_X(r1.hit_index) and r1.hit_index = to_unsigned(i, INDEX_BITS) then - plru_acc_en <= r1.cache_hit; - else - plru_acc_en <= '0'; - end if; - plru_acc <= std_ulogic_vector(r1.hit_way); - plru_victim(i) <= plru_out; - end process; - end generate; + plru : entity work.plrufn + generic map ( + BITS => WAY_BITS + ) + port map ( + acc => plru_acc, + tree_in => plru_cur, + tree_out => plru_upd, + lru => plru_out + ); + + process(all) + begin + -- Read PLRU bits from array + if is_X(r1.hit_index) then + plru_cur <= (others => 'X'); + else + plru_cur <= plru_ram(to_integer(r1.hit_index)); + end if; + + -- PLRU interface + plru_acc <= std_ulogic_vector(r1.hit_way); + plru_victim <= unsigned(plru_out); + end process; + + -- synchronous writes to PLRU array + process(clk) + begin + if rising_edge(clk) then + if r1.cache_hit = '1' then + assert not is_X(r1.hit_index) severity failure; + plru_ram(to_integer(r1.hit_index)) <= plru_upd; + end if; + end if; + end process; end generate; -- Cache tag RAM read port @@ -980,8 +1001,13 @@ begin replace_way <= 
to_unsigned(0, WAY_BITS); if NUM_WAYS > 1 then if r1.write_tag = '1' then - assert not is_X(r1.store_index); - replace_way <= unsigned(plru_victim(to_integer(r1.store_index))); + if r1.choose_victim = '1' then + replace_way <= plru_victim; + else + -- Cache victim way was chosen earlier, + -- in the cycle after the miss was detected. + replace_way <= r1.victim_way; + end if; else replace_way <= r1.store_way; end if; @@ -1305,8 +1331,6 @@ begin end if; -- Fast path for load/store hits. Set signals for the writeback controls. - r1.hit_way <= req_hit_way; - r1.hit_index <= req_index; if req_op = OP_LOAD_HIT then r1.hit_load_valid <= '1'; else @@ -1340,6 +1364,11 @@ begin r1.tlb_hit <= tlb_hit; r1.tlb_hit_way <= tlb_hit_way; r1.tlb_hit_index <= tlb_req_index; + -- determine victim way in the TLB in the cycle after + -- we detect the TLB miss + if r1.ls_error = '1' then + r1.tlb_victim <= unsigned(tlb_plru_victim); + end if; end if; end process; @@ -1364,6 +1393,7 @@ begin ev.load_miss <= '0'; ev.store_miss <= '0'; ev.dtlb_miss <= tlb_miss; + r1.choose_victim <= '0'; -- On reset, clear all valid bits to force misses if rst = '1' then @@ -1460,6 +1490,17 @@ begin end if; end if; + -- Signals for PLRU update and victim selection + r1.hit_way <= req_hit_way; + r1.hit_index <= req_index; + -- Record victim way in the cycle after we see a load or dcbz miss + if r1.choose_victim = '1' then + r1.victim_way <= plru_victim; + end if; + if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then + r1.choose_victim <= '1'; + end if; + -- Main state machine case r1.state is when IDLE =>