icache: Improve latencies when reloading cache lines

The icache can now detect a hit on a line being refilled from memory,
as we have an array of individual valid bits per row for the line
that is currently being loaded.  This enables the request that
initiated the refill to be satisfied earlier, and also enables
following requests to the same cache line to be satisfied before the
line is completely refilled.  Furthermore, the refill now starts
at the row that is needed.  This should reduce the latency for an
icache miss.

We now get a 'sequential' indication from fetch1, and use that to know
when we can deliver an instruction word using the other half of the
64-bit doubleword that was read last cycle.  This doesn't make much
difference at the moment, but it frees up cycles where we could test
whether the next line is present in the cache so that we could
prefetch it if not.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/208/head
Paul Mackerras 5 years ago
parent 0809bc898b
commit 62b24a8dae

@ -93,6 +93,7 @@ package common is
virt_mode : std_ulogic; virt_mode : std_ulogic;
priv_mode : std_ulogic; priv_mode : std_ulogic;
stop_mark: std_ulogic; stop_mark: std_ulogic;
sequential: std_ulogic;
nia: std_ulogic_vector(63 downto 0); nia: std_ulogic_vector(63 downto 0);
end record; end record;



@ -68,6 +68,7 @@ begin
begin begin
v := r; v := r;
v_int := r_int; v_int := r_int;
v.sequential := '0';


if rst = '1' then if rst = '1' then
if alt_reset_in = '1' then if alt_reset_in = '1' then
@ -128,6 +129,7 @@ begin


if increment then if increment then
v.nia := std_logic_vector(unsigned(v.nia) + 4); v.nia := std_logic_vector(unsigned(v.nia) + 4);
v.sequential := '1';
end if; end if;
end if; end if;



@ -115,6 +115,7 @@ architecture rtl of icache is
subtype row_t is integer range 0 to BRAM_ROWS-1; subtype row_t is integer range 0 to BRAM_ROWS-1;
subtype index_t is integer range 0 to NUM_LINES-1; subtype index_t is integer range 0 to NUM_LINES-1;
subtype way_t is integer range 0 to NUM_WAYS-1; subtype way_t is integer range 0 to NUM_WAYS-1;
subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);


-- The cache data BRAM organized as described above for each way -- The cache data BRAM organized as described above for each way
subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0); subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0);
@ -132,6 +133,7 @@ architecture rtl of icache is
-- The cache valid bits -- The cache valid bits
subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
type cache_valids_t is array(index_t) of cache_way_valids_t; type cache_valids_t is array(index_t) of cache_way_valids_t;
type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;


-- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
signal cache_tags : cache_tags_array_t; signal cache_tags : cache_tags_array_t;
@ -179,6 +181,8 @@ architecture rtl of icache is
store_row : row_t; store_row : row_t;
store_tag : cache_tag_t; store_tag : cache_tag_t;
store_valid : std_ulogic; store_valid : std_ulogic;
end_row_ix : row_in_line_t;
rows_valid : row_per_line_valid_t;


-- TLB miss state -- TLB miss state
fetch_failed : std_ulogic; fetch_failed : std_ulogic;
@ -200,6 +204,7 @@ architecture rtl of icache is
signal ra_valid : std_ulogic; signal ra_valid : std_ulogic;
signal priv_fault : std_ulogic; signal priv_fault : std_ulogic;
signal access_ok : std_ulogic; signal access_ok : std_ulogic;
signal use_previous : std_ulogic;


-- Output data to logger -- Output data to logger
signal log_data : std_ulogic_vector(53 downto 0); signal log_data : std_ulogic_vector(53 downto 0);
@ -225,20 +230,24 @@ architecture rtl of icache is
return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)));
end; end;


-- Return the index of a row within a line
function get_row_of_line(row: row_t) return row_in_line_t is
variable row_v : unsigned(ROW_BITS-1 downto 0);
begin
row_v := to_unsigned(row, ROW_BITS);
return row_v(ROW_LINEBITS-1 downto 0);
end;

-- Returns whether this is the last row of a line -- Returns whether this is the last row of a line
function is_last_row_addr(addr: wishbone_addr_type) return boolean is function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is
constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
begin begin
return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
end; end;


-- Returns whether this is the last row of a line -- Returns whether this is the last row of a line
function is_last_row(row: row_t) return boolean is function is_last_row(row: row_t; last: row_in_line_t) return boolean is
variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
begin begin
row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); return get_row_of_line(row) = last;
return row_v(ROW_LINEBITS-1 downto 0) = ones;
end; end;


-- Return the address of the next row in the current cache line -- Return the address of the next row in the current cache line
@ -367,7 +376,7 @@ begin
); );
process(all) process(all)
begin begin
do_read <= not stall_in; do_read <= not (stall_in or use_previous);
do_write <= '0'; do_write <= '0';
if wishbone_in.ack = '1' and r.store_way = i then if wishbone_in.ack = '1' and r.store_way = i then
do_write <= '1'; do_write <= '1';
@ -472,23 +481,38 @@ begin
variable is_hit : std_ulogic; variable is_hit : std_ulogic;
variable hit_way : way_t; variable hit_way : way_t;
begin begin
-- i_in.sequential means that i_in.nia this cycle is 4 more than
-- last cycle. If we read more than 32 bits at a time, had a cache hit
-- last cycle, and we don't want the first 32-bit chunk, then we can
-- keep the data we read last cycle and just use that.
if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
use_previous <= i_in.sequential and r.hit_valid;
else
use_previous <= '0';
end if;

-- Extract line, row and tag from request -- Extract line, row and tag from request
req_index <= get_index(i_in.nia); req_index <= get_index(i_in.nia);
req_row <= get_row(i_in.nia); req_row <= get_row(i_in.nia);
req_tag <= get_tag(real_addr); req_tag <= get_tag(real_addr);


-- Calculate address of beginning of cache line, will be -- Calculate address of beginning of cache row, will be
-- used for cache miss processing if needed -- used for cache miss processing if needed
-- --
req_laddr <= (63 downto REAL_ADDR_BITS => '0') & req_laddr <= (63 downto REAL_ADDR_BITS => '0') &
real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0'); (ROW_OFF_BITS-1 downto 0 => '0');


-- Test if pending request is a hit on any way -- Test if pending request is a hit on any way
hit_way := 0; hit_way := 0;
is_hit := '0'; is_hit := '0';
for i in way_t loop for i in way_t loop
if i_in.req = '1' and cache_valids(req_index)(i) = '1' then if i_in.req = '1' and
(cache_valids(req_index)(i) = '1' or
(r.state = WAIT_ACK and
req_index = r.store_index and
i = r.store_way and
r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
if read_tag(i, cache_tags(req_index)) = req_tag then if read_tag(i, cache_tags(req_index)) = req_tag then
hit_way := i; hit_way := i;
is_hit := '1'; is_hit := '1';
@ -536,7 +560,8 @@ begin
if rising_edge(clk) then if rising_edge(clk) then
-- keep outputs to fetch2 unchanged on a stall -- keep outputs to fetch2 unchanged on a stall
-- except that flush or reset sets valid to 0 -- except that flush or reset sets valid to 0
if stall_in = '1' then -- If use_previous, keep the same data as last cycle and use the second half
if stall_in = '1' or use_previous = '1' then
if rst = '1' or flush_in = '1' then if rst = '1' or flush_in = '1' then
r.hit_valid <= '0'; r.hit_valid <= '0';
end if; end if;
@ -545,9 +570,6 @@ begin
-- will be available on the cache_out output of the corresponding way -- will be available on the cache_out output of the corresponding way
-- --
r.hit_valid <= req_is_hit; r.hit_valid <= req_is_hit;
-- Send stop marks and NIA down regardless of validity
r.hit_smark <= i_in.stop_mark;
r.hit_nia <= i_in.nia;
if req_is_hit = '1' then if req_is_hit = '1' then
r.hit_way <= req_hit_way; r.hit_way <= req_hit_way;


@ -560,6 +582,11 @@ begin
" RA:" & to_hstring(real_addr); " RA:" & to_hstring(real_addr);
end if; end if;
end if; end if;
if stall_in = '0' then
-- Send stop marks and NIA down regardless of validity
r.hit_smark <= i_in.stop_mark;
r.hit_nia <= i_in.nia;
end if;
end if; end if;
end process; end process;


@ -597,6 +624,11 @@ begin
-- Main state machine -- Main state machine
case r.state is case r.state is
when IDLE => when IDLE =>
-- Reset per-row valid flags, only used in WAIT_ACK
for i in 0 to ROW_PER_LINE - 1 loop
r.rows_valid(i) <= '0';
end loop;

-- We need to read a cache line -- We need to read a cache line
if req_is_miss = '1' then if req_is_miss = '1' then
report "cache miss nia:" & to_hstring(i_in.nia) & report "cache miss nia:" & to_hstring(i_in.nia) &
@ -613,6 +645,7 @@ begin
r.store_row <= get_row(req_laddr); r.store_row <= get_row(req_laddr);
r.store_tag <= req_tag; r.store_tag <= req_tag;
r.store_valid <= '1'; r.store_valid <= '1';
r.end_row_ix <= get_row_of_line(get_row(req_laddr)) - 1;


-- Prep for first wishbone read. We calculate the address of -- Prep for first wishbone read. We calculate the address of
-- the start of the cache line and start the WB cycle. -- the start of the cache line and start the WB cycle.
@ -650,7 +683,7 @@ begin
-- stb and set stbs_done so we can handle an eventual last -- stb and set stbs_done so we can handle an eventual last
-- ack on the same cycle. -- ack on the same cycle.
-- --
if is_last_row_addr(r.wb.adr) then if is_last_row_addr(r.wb.adr, r.end_row_ix) then
r.wb.stb <= '0'; r.wb.stb <= '0';
stbs_done := true; stbs_done := true;
end if; end if;
@ -661,8 +694,9 @@ begin


-- Incoming acks processing -- Incoming acks processing
if wishbone_in.ack = '1' then if wishbone_in.ack = '1' then
r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1';
-- Check for completion -- Check for completion
if stbs_done and is_last_row(r.store_row) then if stbs_done and is_last_row(r.store_row, r.end_row_ix) then
-- Complete wishbone cycle -- Complete wishbone cycle
r.wb.cyc <= '0'; r.wb.cyc <= '0';



Loading…
Cancel
Save