dcache: Split PLRU into storage and logic

Rather than having update and decode logic for each individual PLRU
as well as a register to store the current PLRU state, we now put the
PLRU state in a little RAM, which will typically use LUT RAM on FPGAs,
and have just a single copy of the logic to calculate the pseudo-LRU
way and to update the PLRU state.

The PLRU RAM that applies to the data storage (as opposed to the TLB)
is read asynchronously in the cycle after the cache tag matching is
done.  At the end of that cycle the PLRU RAM entry is updated if the
access was a cache hit, or a victim way is calculated and stored if
the access was a cache miss.  It is possible that a cache miss doesn't
start being handled until later, in which case the stored victim way
is used later when the miss gets handled.

Similarly for the TLB PLRU, the RAM is read asynchronously in the
cycle after a TLB lookup is done, and either updated at the end of
that cycle (for a hit), or a victim is chosen and stored for when the
TLB miss is satisfied.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/408/head
Paul Mackerras 2 years ago
parent 86212dc879
commit a1f5867919

@ -317,6 +317,7 @@ architecture rtl of dcache is
tlb_hit : std_ulogic; tlb_hit : std_ulogic;
tlb_hit_way : tlb_way_sig_t; tlb_hit_way : tlb_way_sig_t;
tlb_hit_index : tlb_index_sig_t; tlb_hit_index : tlb_index_sig_t;
tlb_victim : tlb_way_sig_t;


-- data buffer for data forwarded from writes to reads -- data buffer for data forwarded from writes to reads
forward_data : std_ulogic_vector(63 downto 0); forward_data : std_ulogic_vector(63 downto 0);
@ -342,6 +343,8 @@ architecture rtl of dcache is
acks_pending : unsigned(2 downto 0); acks_pending : unsigned(2 downto 0);
inc_acks : std_ulogic; inc_acks : std_ulogic;
dec_acks : std_ulogic; dec_acks : std_ulogic;
choose_victim : std_ulogic;
victim_way : way_t;


-- Signals to complete (possibly with error) -- Signals to complete (possibly with error)
ls_valid : std_ulogic; ls_valid : std_ulogic;
@ -398,8 +401,7 @@ architecture rtl of dcache is
signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0); signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0);


-- PLRU output interface -- PLRU output interface
type plru_out_t is array(0 to NUM_LINES-1) of std_ulogic_vector(WAY_BITS-1 downto 0); signal plru_victim : way_t;
signal plru_victim : plru_out_t;
signal replace_way : way_t; signal replace_way : way_t;


-- Wishbone read/write/cache write formatting signals -- Wishbone read/write/cache write formatting signals
@ -423,8 +425,7 @@ architecture rtl of dcache is
signal tlb_miss : std_ulogic; signal tlb_miss : std_ulogic;


-- TLB PLRU output interface -- TLB PLRU output interface
type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_victim : tlb_plru_out_t;


signal snoop_tag_set : cache_tags_set_t; signal snoop_tag_set : cache_tags_set_t;
signal snoop_valid : std_ulogic; signal snoop_valid : std_ulogic;
@ -651,38 +652,48 @@ begin


-- Generate TLB PLRUs -- Generate TLB PLRUs
maybe_tlb_plrus : if TLB_NUM_WAYS > 1 generate maybe_tlb_plrus : if TLB_NUM_WAYS > 1 generate
begin type tlb_plru_array is array(tlb_index_t) of std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate signal tlb_plru_ram : tlb_plru_array;
-- TLB PLRU interface signal tlb_plru_cur : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_upd : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_acc_en : std_ulogic;
signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
begin begin
tlb_plru : entity work.plru tlb_plru : entity work.plrufn
generic map ( generic map (
BITS => TLB_WAY_BITS BITS => TLB_WAY_BITS
) )
port map ( port map (
clk => clk,
rst => rst,
acc => tlb_plru_acc, acc => tlb_plru_acc,
acc_en => tlb_plru_acc_en, tree_in => tlb_plru_cur,
tree_out => tlb_plru_upd,
lru => tlb_plru_out lru => tlb_plru_out
); );


process(all) process(all)
begin begin
-- PLRU interface -- Read PLRU bits from array
if not is_X(r1.tlb_hit_index) and r1.tlb_hit_index = i then if is_X(r1.tlb_hit_index) then
tlb_plru_acc_en <= r1.tlb_hit; tlb_plru_cur <= (others => 'X');
assert not is_X(r1.tlb_hit_way);
else else
tlb_plru_acc_en <= '0'; tlb_plru_cur <= tlb_plru_ram(to_integer(r1.tlb_hit_index));
end if; end if;

-- PLRU interface
tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way); tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way);
tlb_plru_victim(i) <= tlb_plru_out; tlb_plru_victim <= tlb_plru_out;
end process;

-- synchronous writes to TLB PLRU array
process(clk)
begin
if rising_edge(clk) then
if r1.tlb_hit = '1' then
assert not is_X(r1.tlb_hit_index) severity failure;
tlb_plru_ram(to_integer(r1.tlb_hit_index)) <= tlb_plru_upd;
end if;
end if;
end process; end process;
end generate;
end generate; end generate;


tlb_search : process(all) tlb_search : process(all)
@ -753,7 +764,7 @@ begin
if tlb_hit = '1' then if tlb_hit = '1' then
repl_way := tlb_hit_way; repl_way := tlb_hit_way;
else else
repl_way := unsigned(tlb_plru_victim(to_integer(tlb_req_index))); repl_way := unsigned(r1.tlb_victim);
end if; end if;
assert not is_X(repl_way); assert not is_X(repl_way);
end if; end if;
@ -771,38 +782,48 @@ begin


-- Generate PLRUs -- Generate PLRUs
maybe_plrus : if NUM_WAYS > 1 generate maybe_plrus : if NUM_WAYS > 1 generate
begin type plru_array is array(0 to NUM_LINES-1) of std_ulogic_vector(NUM_WAYS - 2 downto 0);
plrus: for i in 0 to NUM_LINES-1 generate signal plru_ram : plru_array;
-- PLRU interface signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_acc_en : std_ulogic;
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin begin
plru : entity work.plru plru : entity work.plrufn
generic map ( generic map (
BITS => WAY_BITS BITS => WAY_BITS
) )
port map ( port map (
clk => clk,
rst => rst,
acc => plru_acc, acc => plru_acc,
acc_en => plru_acc_en, tree_in => plru_cur,
tree_out => plru_upd,
lru => plru_out lru => plru_out
); );


process(all) process(all)
begin begin
-- PLRU interface -- Read PLRU bits from array
if not is_X(r1.hit_index) and r1.hit_index = to_unsigned(i, INDEX_BITS) then if is_X(r1.hit_index) then
plru_acc_en <= r1.cache_hit; plru_cur <= (others => 'X');
else else
plru_acc_en <= '0'; plru_cur <= plru_ram(to_integer(r1.hit_index));
end if; end if;

-- PLRU interface
plru_acc <= std_ulogic_vector(r1.hit_way); plru_acc <= std_ulogic_vector(r1.hit_way);
plru_victim(i) <= plru_out; plru_victim <= unsigned(plru_out);
end process;

-- synchronous writes to PLRU array
process(clk)
begin
if rising_edge(clk) then
if r1.cache_hit = '1' then
assert not is_X(r1.hit_index) severity failure;
plru_ram(to_integer(r1.hit_index)) <= plru_upd;
end if;
end if;
end process; end process;
end generate;
end generate; end generate;


-- Cache tag RAM read port -- Cache tag RAM read port
@ -980,8 +1001,13 @@ begin
replace_way <= to_unsigned(0, WAY_BITS); replace_way <= to_unsigned(0, WAY_BITS);
if NUM_WAYS > 1 then if NUM_WAYS > 1 then
if r1.write_tag = '1' then if r1.write_tag = '1' then
assert not is_X(r1.store_index); if r1.choose_victim = '1' then
replace_way <= unsigned(plru_victim(to_integer(r1.store_index))); replace_way <= plru_victim;
else
-- Cache victim way was chosen earlier,
-- in the cycle after the miss was detected.
replace_way <= r1.victim_way;
end if;
else else
replace_way <= r1.store_way; replace_way <= r1.store_way;
end if; end if;
@ -1305,8 +1331,6 @@ begin
end if; end if;


-- Fast path for load/store hits. Set signals for the writeback controls. -- Fast path for load/store hits. Set signals for the writeback controls.
r1.hit_way <= req_hit_way;
r1.hit_index <= req_index;
if req_op = OP_LOAD_HIT then if req_op = OP_LOAD_HIT then
r1.hit_load_valid <= '1'; r1.hit_load_valid <= '1';
else else
@ -1340,6 +1364,11 @@ begin
r1.tlb_hit <= tlb_hit; r1.tlb_hit <= tlb_hit;
r1.tlb_hit_way <= tlb_hit_way; r1.tlb_hit_way <= tlb_hit_way;
r1.tlb_hit_index <= tlb_req_index; r1.tlb_hit_index <= tlb_req_index;
-- determine victim way in the TLB in the cycle after
-- we detect the TLB miss
if r1.ls_error = '1' then
r1.tlb_victim <= unsigned(tlb_plru_victim);
end if;


end if; end if;
end process; end process;
@ -1364,6 +1393,7 @@ begin
ev.load_miss <= '0'; ev.load_miss <= '0';
ev.store_miss <= '0'; ev.store_miss <= '0';
ev.dtlb_miss <= tlb_miss; ev.dtlb_miss <= tlb_miss;
r1.choose_victim <= '0';


-- On reset, clear all valid bits to force misses -- On reset, clear all valid bits to force misses
if rst = '1' then if rst = '1' then
@ -1460,6 +1490,17 @@ begin
end if; end if;
end if; end if;


-- Signals for PLRU update and victim selection
r1.hit_way <= req_hit_way;
r1.hit_index <= req_index;
-- Record victim way in the cycle after we see a load or dcbz miss
if r1.choose_victim = '1' then
r1.victim_way <= plru_victim;
end if;
if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then
r1.choose_victim <= '1';
end if;

-- Main state machine -- Main state machine
case r1.state is case r1.state is
when IDLE => when IDLE =>

Loading…
Cancel
Save