icache: Split PLRU into storage and logic

Rather than having update and decode logic for each individual PLRU
as well as a register to store the current PLRU state, we now put the
PLRU state in a little RAM, which will typically use LUT RAM on FPGAs,
and have just a single copy of the logic to calculate the pseudo-LRU
way and to update the PLRU state.  This logic is in the plrufn module
and is just combinatorial logic.  A new module was created for this as
other parts of the system are still using plru.vhdl.

The PLRU RAM in the icache is read asynchronously in the cycle
after the cache tag matching is done.  At the end of that cycle the
PLRU RAM entry is updated if the access was a cache hit, or a victim
way is calculated and stored if the access was a cache miss and
miss handling is starting in this cycle.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/408/head
Paul Mackerras 2 years ago
parent cd2e174113
commit 86212dc879

@ -68,8 +68,8 @@ all: $(all)
$(shell scripts/make_version.sh git.vhdl) $(shell scripts/make_version.sh git.vhdl)


core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl predecode.vhdl \ utils.vhdl plru.vhdl plrufn.vhdl cache_ram.vhdl icache.vhdl \
decode1.vhdl helpers.vhdl insn_helpers.vhdl \ predecode.vhdl decode1.vhdl helpers.vhdl insn_helpers.vhdl \
control.vhdl decode2.vhdl register_file.vhdl \ control.vhdl decode2.vhdl register_file.vhdl \
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \ logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \

@ -12,7 +12,6 @@
-- efficient use of distributed RAM and less logic/muxes. Currently we -- efficient use of distributed RAM and less logic/muxes. Currently we
-- write TAG_BITS width which may not match full ram blocks and might -- write TAG_BITS width which may not match full ram blocks and might
-- cause muxes to be inferred for "partial writes". -- cause muxes to be inferred for "partial writes".
-- * Check if making the read size of PLRU a ROM helps utilization
-- --
library ieee; library ieee;
use ieee.std_logic_1164.all; use ieee.std_logic_1164.all;
@ -236,8 +235,7 @@ architecture rtl of icache is
signal wb_rd_data : std_ulogic_vector(ROW_SIZE_BITS - 1 downto 0); signal wb_rd_data : std_ulogic_vector(ROW_SIZE_BITS - 1 downto 0);


-- PLRU output interface -- PLRU output interface
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); signal plru_victim : way_sig_t;
signal plru_victim : plru_out_t;


-- Memory write snoop signals -- Memory write snoop signals
signal snoop_valid : std_ulogic; signal snoop_valid : std_ulogic;
@ -447,40 +445,48 @@ begin
-- Generate PLRUs -- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate maybe_plrus: if NUM_WAYS > 1 generate
begin type plru_array is array(index_t) of std_ulogic_vector(NUM_WAYS - 2 downto 0);
plrus: for i in 0 to NUM_LINES-1 generate signal plru_ram : plru_array;
-- PLRU interface signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0);
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_acc_en : std_ulogic;
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin begin
plru : entity work.plru plru : entity work.plrufn
generic map ( generic map (
BITS => WAY_BITS BITS => WAY_BITS
) )
port map ( port map (
clk => clk,
rst => rst,
acc => plru_acc, acc => plru_acc,
acc_en => plru_acc_en, tree_in => plru_cur,
tree_out => plru_upd,
lru => plru_out lru => plru_out
); );


process(all) process(all)
begin begin
-- PLRU interface -- Read PLRU bits from array
if is_X(r.hit_nia) then if is_X(r.hit_nia) then
plru_acc_en <= 'X'; plru_cur <= (others => 'X');
elsif get_index(r.hit_nia) = i then
plru_acc_en <= r.hit_valid;
else else
plru_acc_en <= '0'; plru_cur <= plru_ram(to_integer(get_index(r.hit_nia)));
end if; end if;

-- PLRU interface
plru_acc <= std_ulogic_vector(r.hit_way); plru_acc <= std_ulogic_vector(r.hit_way);
plru_victim(i) <= plru_out; plru_victim <= unsigned(plru_out);
end process;

-- synchronous writes to PLRU array
process(clk)
begin
if rising_edge(clk) then
if r.hit_valid = '1' then
assert not is_X(r.hit_nia) severity failure;
plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd;
end if;
end if;
end process; end process;
end generate;
end generate; end generate;


-- TLB hit detection and real address generation -- TLB hit detection and real address generation
@ -791,7 +797,7 @@ begin
replace_way := to_unsigned(0, WAY_BITS); replace_way := to_unsigned(0, WAY_BITS);
if NUM_WAYS > 1 then if NUM_WAYS > 1 then
-- Get victim way from plru -- Get victim way from plru
replace_way := unsigned(plru_victim(to_integer(r.store_index))); replace_way := plru_victim;
end if; end if;
r.store_way <= replace_way; r.store_way <= replace_way;



@ -34,6 +34,7 @@ filesets:
- core.vhdl - core.vhdl
- icache.vhdl - icache.vhdl
- plru.vhdl - plru.vhdl
- plrufn.vhdl
- cache_ram.vhdl - cache_ram.vhdl
- core_debug.vhdl - core_debug.vhdl
- utils.vhdl - utils.vhdl

@ -0,0 +1,72 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use ieee.math_real.all;

-- Purely combinational pseudo-LRU function for a set of 2**BITS ways.
-- The tree state is not stored here: it is supplied on tree_in and the
-- updated state is returned on tree_out.
entity plrufn is
    generic (
        -- Width of a way index; the tree covers 2**BITS ways
        BITS : positive := 2
    );
    port (
        -- Index of the way being accessed
        acc      : in  std_ulogic_vector(BITS-1 downto 0);
        -- Current tree state, one bit per tree node (2**BITS - 1 bits)
        tree_in  : in  std_ulogic_vector(2 ** BITS - 2 downto 0);
        -- tree_in with the nodes on the path to 'acc' rewritten so that
        -- they point away from the accessed way
        tree_out : out std_ulogic_vector(2 ** BITS - 2 downto 0);
        -- Way index obtained by following the bits of tree_in from the root
        lru      : out std_ulogic_vector(BITS-1 downto 0)
    );
end entity plrufn;

architecture rtl of plrufn is
    -- The tree is kept heap-style in a flat vector: node 0 is the root and
    -- node n has children 2n+1 and 2n+2.  A tree over 2**BITS ways has BITS
    -- levels holding 1, 2, 4, ... 2**(BITS-1) nodes, which gives
    -- 2**BITS - 1 nodes in total (3 nodes for BITS = 2, 15 for BITS = 4).
    constant count : positive := 2 ** BITS - 1;
    subtype node_t is integer range 0 to count - 1;

    -- Replace a metavalue (as reported by is_X) with '0' so that the tree
    -- walk always takes a defined branch; other values pass through.
    function settle(b : std_ulogic) return std_ulogic is
    begin
        if is_X(b) then
            return '0';
        end if;
        return b;
    end function;

    -- Index of the child reached from 'parent' when the direction bit is
    -- 'dir': 2n+2 for '1', 2n+1 for anything else.
    function child(parent : node_t; dir : std_ulogic) return node_t is
    begin
        if dir = '1' then
            return 2 * parent + 2;
        end if;
        return 2 * parent + 1;
    end function;
begin

    -- Walk from the root following the stored node bits; the bits seen on
    -- the way, most significant first, form the pseudo-LRU way index.
    get_lru: process(all)
        variable idx : node_t;
        variable sel : std_ulogic;
    begin
        idx := 0;
        for lvl in BITS-1 downto 0 loop
            sel := settle(tree_in(idx));
            lru(lvl) <= sel;
            -- Nodes on the last level have no children, so stop descending
            -- there; this also keeps idx inside node_t.
            if lvl > 0 then
                idx := child(idx, sel);
            end if;
        end loop;
    end process;

    -- Walk from the root following the bits of 'acc', most significant
    -- first, and set every node on that path to the inverse of the
    -- corresponding acc bit.  All other nodes are copied from tree_in.
    update_lru: process(all)
        variable idx : node_t;
        variable sel : std_ulogic;
    begin
        -- Default: pass the tree through; path nodes are overridden below
        tree_out <= tree_in;
        idx := 0;
        for lvl in BITS-1 downto 0 loop
            sel := settle(acc(lvl));
            tree_out(idx) <= not sel;
            -- No descent below the last level (see get_lru)
            if lvl > 0 then
                idx := child(idx, sel);
            end if;
        end loop;
    end process;
end;
Loading…
Cancel
Save