From 62b24a8dae3aa3597f863de1cd8f0a9f0ec2cb6b Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Fri, 29 May 2020 09:38:05 +1000
Subject: [PATCH] icache: Improve latencies when reloading cache lines

The icache can now detect a hit on a line being refilled from memory,
as we have an array of individual valid bits per row for the line that
is currently being loaded. This enables the request that initiated the
refill to be satisfied earlier, and also enables following requests to
the same cache line to be satisfied before the line is completely
refilled. Furthermore, the refill now starts at the row that is needed.
This should reduce the latency for an icache miss.

We now get a 'sequential' indication from fetch1, and use that to know
when we can deliver an instruction word using the other half of the
64-bit doubleword that was read last cycle. This doesn't make much
difference at the moment, but it frees up cycles where we could test
whether the next line is present in the cache so that we could prefetch
it if not.

Signed-off-by: Paul Mackerras --- common.vhdl | 1 + fetch1.vhdl | 2 ++ icache.vhdl | 72 +++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 56 insertions(+), 19 deletions(-) diff --git a/common.vhdl b/common.vhdl index 82b3242..f08ecd1 100644 --- a/common.vhdl +++ b/common.vhdl @@ -93,6 +93,7 @@ package common is virt_mode : std_ulogic; priv_mode : std_ulogic; stop_mark: std_ulogic; + sequential: std_ulogic; nia: std_ulogic_vector(63 downto 0); end record; diff --git a/fetch1.vhdl b/fetch1.vhdl index 758db24..93a2293 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -68,6 +68,7 @@ begin begin v := r; v_int := r_int; + v.sequential := '0'; if rst = '1' then if alt_reset_in = '1' then @@ -128,6 +129,7 @@ begin if increment then v.nia := std_logic_vector(unsigned(v.nia) + 4); + v.sequential := '1'; end if; end if; diff --git a/icache.vhdl b/icache.vhdl index e4f8448..739e047 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -115,6 +115,7 @@ architecture rtl of icache is subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; subtype way_t is integer range 0 to NUM_WAYS-1; + subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0); -- The cache data BRAM organized as described above for each way subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0); @@ -132,6 +133,7 @@ architecture rtl of icache is -- The cache valid bits subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); type cache_valids_t is array(index_t) of cache_way_valids_t; + type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; -- Storage. 
Hopefully "cache_rows" is a BRAM, the rest is LUTs signal cache_tags : cache_tags_array_t; @@ -179,6 +181,8 @@ architecture rtl of icache is store_row : row_t; store_tag : cache_tag_t; store_valid : std_ulogic; + end_row_ix : row_in_line_t; + rows_valid : row_per_line_valid_t; -- TLB miss state fetch_failed : std_ulogic; @@ -200,6 +204,7 @@ architecture rtl of icache is signal ra_valid : std_ulogic; signal priv_fault : std_ulogic; signal access_ok : std_ulogic; + signal use_previous : std_ulogic; -- Output data to logger signal log_data : std_ulogic_vector(53 downto 0); @@ -225,20 +230,24 @@ architecture rtl of icache is return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; + -- Return the index of a row within a line + function get_row_of_line(row: row_t) return row_in_line_t is + variable row_v : unsigned(ROW_BITS-1 downto 0); + begin + row_v := to_unsigned(row, ROW_BITS); + return row_v(ROW_LINEBITS-1 downto 0); + end; + -- Returns whether this is the last row of a line - function is_last_row_addr(addr: wishbone_addr_type) return boolean is - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is begin - return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; + return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last; end; -- Returns whether this is the last row of a line - function is_last_row(row: row_t) return boolean is - variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row(row: row_t; last: row_in_line_t) return boolean is begin - row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); - return row_v(ROW_LINEBITS-1 downto 0) = ones; + return get_row_of_line(row) = last; end; -- Return the address of the next row in the current cache line @@ -367,7 +376,7 @@ begin ); process(all) begin - do_read <= not 
stall_in; + do_read <= not (stall_in or use_previous); do_write <= '0'; if wishbone_in.ack = '1' and r.store_way = i then do_write <= '1'; @@ -472,23 +481,38 @@ begin variable is_hit : std_ulogic; variable hit_way : way_t; begin + -- i_in.sequential means that i_in.nia this cycle is 4 more than + -- last cycle. If we read more than 32 bits at a time, had a cache hit + -- last cycle, and we don't want the first 32-bit chunk, then we can + -- keep the data we read last cycle and just use that. + if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then + use_previous <= i_in.sequential and r.hit_valid; + else + use_previous <= '0'; + end if; + -- Extract line, row and tag from request req_index <= get_index(i_in.nia); req_row <= get_row(i_in.nia); req_tag <= get_tag(real_addr); - -- Calculate address of beginning of cache line, will be + -- Calculate address of beginning of cache row, will be -- used for cache miss processing if needed -- req_laddr <= (63 downto REAL_ADDR_BITS => '0') & - real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & + (ROW_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; for i in way_t loop - if i_in.req = '1' and cache_valids(req_index)(i) = '1' then + if i_in.req = '1' and + (cache_valids(req_index)(i) = '1' or + (r.state = WAIT_ACK and + req_index = r.store_index and + i = r.store_way and + r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then if read_tag(i, cache_tags(req_index)) = req_tag then hit_way := i; is_hit := '1'; @@ -536,7 +560,8 @@ begin if rising_edge(clk) then -- keep outputs to fetch2 unchanged on a stall -- except that flush or reset sets valid to 0 - if stall_in = '1' then + -- If use_previous, keep the same data as last cycle and use the second half + if stall_in = '1' or use_previous = '1' then if rst = '1' or flush_in = '1' then r.hit_valid <= '0'; end if; @@ -545,9 +570,6 @@ 
begin -- will be available on the cache_out output of the corresponding way -- r.hit_valid <= req_is_hit; - -- Send stop marks and NIA down regardless of validity - r.hit_smark <= i_in.stop_mark; - r.hit_nia <= i_in.nia; if req_is_hit = '1' then r.hit_way <= req_hit_way; @@ -559,6 +581,11 @@ begin " way:" & integer'image(req_hit_way) & " RA:" & to_hstring(real_addr); end if; + end if; + if stall_in = '0' then + -- Send stop marks and NIA down regardless of validity + r.hit_smark <= i_in.stop_mark; + r.hit_nia <= i_in.nia; end if; end if; end process; @@ -597,6 +624,11 @@ begin -- Main state machine case r.state is when IDLE => + -- Reset per-row valid flags, only used in WAIT_ACK + for i in 0 to ROW_PER_LINE - 1 loop + r.rows_valid(i) <= '0'; + end loop; + -- We need to read a cache line if req_is_miss = '1' then report "cache miss nia:" & to_hstring(i_in.nia) & @@ -613,6 +645,7 @@ begin r.store_row <= get_row(req_laddr); r.store_tag <= req_tag; r.store_valid <= '1'; + r.end_row_ix <= get_row_of_line(get_row(req_laddr)) - 1; -- Prep for first wishbone read. We calculate the address of -- the start of the cache line and start the WB cycle. @@ -650,7 +683,7 @@ begin -- stb and set stbs_done so we can handle an eventual last -- ack on the same cycle. -- - if is_last_row_addr(r.wb.adr) then + if is_last_row_addr(r.wb.adr, r.end_row_ix) then r.wb.stb <= '0'; stbs_done := true; end if; @@ -661,8 +694,9 @@ begin -- Incoming acks processing if wishbone_in.ack = '1' then + r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1'; -- Check for completion - if stbs_done and is_last_row(r.store_row) then + if stbs_done and is_last_row(r.store_row, r.end_row_ix) then -- Complete wishbone cycle r.wb.cyc <= '0';