From 174378b190fe47f12dc3f9ec90756a8f8df3f05f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 10 Oct 2019 11:25:16 +1100 Subject: [PATCH] dcache: Introduce an extra cycle latency to make timing This makes the BRAMs use an output buffer, introducing an extra cycle latency. Without this, Vivado won't make timing at 100Mhz. We stash all the necessary response data in delayed latches, the extra cycle is NOT a state in the state machine, thus it's fully pipelined and doesn't involve stalling. This introduces an extra non-pipelined cycle for loads with update to avoid collision on the writeback output between the now delayed load data and the register update. We could avoid it by moving the register update in the pipeline bubble created by the extra update state, but it's a bit trickier, so I leave that for a latter optimization. Signed-off-by: Benjamin Herrenschmidt --- cache_ram.vhdl | 23 ++++++- dcache.vhdl | 182 +++++++++++++++++++++++++++++++++---------------- 2 files changed, 145 insertions(+), 60 deletions(-) diff --git a/cache_ram.vhdl b/cache_ram.vhdl index 346c6fe..7a10a1c 100644 --- a/cache_ram.vhdl +++ b/cache_ram.vhdl @@ -7,7 +7,8 @@ entity cache_ram is generic( ROW_BITS : integer := 16; WIDTH : integer := 64; - TRACE : boolean := false + TRACE : boolean := false; + ADD_BUF : boolean := false ); port( @@ -33,6 +34,8 @@ architecture rtl of cache_ram is attribute ram_decomp : string; attribute ram_decomp of ram : signal is "power"; + signal rd_data0 : std_logic_vector(WIDTH - 1 downto 0); + begin process(clk) variable lbit : integer range 0 to WIDTH - 1; @@ -56,7 +59,7 @@ begin end loop; end if; if rd_en = '1' then - rd_data <= ram(to_integer(unsigned(rd_addr))); + rd_data0 <= ram(to_integer(unsigned(rd_addr))); if TRACE then report "read a:" & to_hstring(rd_addr) & " dat:" & to_hstring(ram(to_integer(unsigned(rd_addr)))); @@ -64,4 +67,20 @@ begin end if; end if; end process; + + buf: if ADD_BUF generate + begin + process(clk) + begin + if rising_edge(clk) then + rd_data <= rd_data0; + end if; + end process; + end generate; + + nobuf: if not ADD_BUF generate + begin + rd_data <= rd_data0; + end generate; + end; diff --git a/dcache.vhdl b/dcache.vhdl index f771eae..c0d0469 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -7,6 +7,9 @@ -- * Complete load misses on the cycle when WB data comes instead of -- at the end of line (this requires dealing with requests coming in -- while not idle...) +-- * Load with update could use one less non-pipelined cycle by moving +-- the register update to the pipeline bubble that exists when going +-- back to the IDLE state. -- library ieee; use ieee.std_logic_1164.all; @@ -138,7 +141,8 @@ architecture rtl of dcache is -- Cache state machine type state_t is (IDLE, -- Normal load hit processing - LOAD_UPDATE, -- Load with update address update cycle + LOAD_UPDATE, -- Load with update extra cycle + LOAD_UPDATE2, -- Load with update extra cycle RELOAD_WAIT_ACK, -- Cache reload wait ack STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack @@ -147,8 +151,20 @@ architecture rtl of dcache is req_latch : Loadstore1ToDcacheType; -- Cache hit state (Latches for 1 cycle BRAM access) - hit_way : way_t; - hit_load_valid : std_ulogic; + hit_way : way_t; + hit_load_valid : std_ulogic; + + -- 1-cycle delayed signals to account for the BRAM extra + -- buffer that seems necessary to make timing on load hits + -- + hit_way_delayed : way_t; + hit_load_delayed : std_ulogic; + hit_load_upd_delayed : std_ulogic; + hit_load_reg_delayed : std_ulogic_vector(4 downto 0); + hit_data_shift_delayed : std_ulogic_vector(2 downto 0); + hit_dlength_delayed : std_ulogic_vector(3 downto 0); + hit_sign_ext_delayed : std_ulogic; + hit_byte_rev_delayed : std_ulogic; -- Register update (load/store with update) update_valid : std_ulogic; @@ -382,72 +398,97 @@ begin -- Writeback (loads and reg updates) & completion control logic -- writeback_control: process(all) - variable writeback_format : boolean; begin -- The mux on d_out.write reg defaults to the normal load hit case. d_out.write_enable <= '0'; d_out.valid <= '0'; - d_out.write_reg <= r.req_latch.write_reg; - d_out.write_data <= cache_out(r.hit_way); - d_out.write_len <= r.req_latch.length; - d_out.write_shift <= r.req_latch.addr(2 downto 0); - d_out.sign_extend <= r.req_latch.sign_extend; - d_out.byte_reverse <= r.req_latch.byte_reverse; + d_out.write_reg <= r.hit_load_reg_delayed; + d_out.write_data <= cache_out(r.hit_way_delayed); + d_out.write_len <= r.hit_dlength_delayed; + d_out.write_shift <= r.hit_data_shift_delayed; + d_out.sign_extend <= r.hit_sign_ext_delayed; + d_out.byte_reverse <= r.hit_byte_rev_delayed; d_out.second_word <= '0'; - -- By default writeback doesn't need formatting - writeback_format := false; - -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store -- - if r.hit_load_valid = '1' or r.slow_valid = '1' then - if r.req_latch.load = '1' then - -- If it's a load, enable write back and enable formatting - d_out.write_enable <= '1'; - writeback_format := true; + -- Note: the load hit is delayed by one cycle. However it can still + -- not collide with r.slow_valid (well unless I miscalculated) because + -- slow_valid can only be set on a subsequent request and not on its + -- first cycle (the state machine must have advanced), which makes + -- slow_valid at least 2 cycles from the previous hit_load_valid. + -- - -- If it's a slow load (miss or NC) source it from the buffer - if r.slow_valid = '1' then - d_out.write_data <= r.slow_data; + -- Sanity: Only one of these must be set in any given cycle + assert (r.update_valid and r.hit_load_delayed) /= '1' report + "unexpected hit_load_delayed collision with update_valid" + severity FAILURE; + assert (r.slow_valid and r.hit_load_delayed) /= '1' report + "unexpected hit_load_delayed collision with slow_valid" + severity FAILURE; + assert (r.slow_valid and r.update_valid) /= '1' report + "unexpected update_valid collision with slow_valid" + severity FAILURE; + + -- Delayed load hit case is the standard path + if r.hit_load_delayed = '1' then + d_out.write_enable <= '1'; + + -- If it's not a load with update, complete it now + if r.hit_load_upd_delayed = '0' then + d_out.valid <= '1'; end if; + end if; + + -- Slow ops (load miss, NC, stores) + if r.slow_valid = '1' then + -- If it's a load, enable register writeback and switch + -- mux accordingly + -- + if r.req_latch.load then + d_out.write_reg <= r.req_latch.write_reg; + d_out.write_enable <= '1'; - -- If it's a normal load (not a load with update), we complete - -- now, otherwise we wait for the delayed update. + -- Read data comes from the slow data latch, formatter + -- from the latched request. -- - if r.req_latch.update = '0' then - d_out.valid <= '1'; - end if; - else - -- It's a store, complete always - d_out.valid <= '1'; + d_out.write_data <= r.slow_data; + d_out.write_shift <= r.req_latch.addr(2 downto 0); + d_out.sign_extend <= r.req_latch.sign_extend; + d_out.byte_reverse <= r.req_latch.byte_reverse; + d_out.write_len <= r.req_latch.length; end if; - -- Sanity - assert r.update_valid = '0' report "unexpected update_valid" - severity FAILURE; + -- If it's a store or a non-update load form, complete now + if r.req_latch.load = '0' or r.req_latch.update = '0' then + d_out.valid <= '1'; + end if; end if; -- We have a register update to do. if r.update_valid = '1' then d_out.write_enable <= '1'; d_out.write_reg <= r.req_latch.update_reg; + + -- Change the read data mux to the address that's going into + -- the register and the formatter does nothing. + -- d_out.write_data <= r.req_latch.addr; + d_out.write_shift <= "000"; + d_out.write_len <= "1000"; + d_out.sign_extend <= '0'; + d_out.byte_reverse <= '0'; - -- If it was a load, this completes the operation + -- If it was a load, this completes the operation (load with + -- update case). + -- if r.req_latch.load = '1' then d_out.valid <= '1'; end if; end if; - if not writeback_format then - d_out.write_len <= "1000"; - d_out.write_shift <= "000"; - d_out.sign_extend <= '0'; - d_out.byte_reverse <= '0'; - end if; - end process; -- Misc data & sel signals @@ -465,7 +506,12 @@ begin -- Generate a cache RAM for each way. This handles the normal -- reads, writes from reloads and the special store-hit update - -- path as well + -- path as well. + -- + -- Note: the BRAMs have an extra read buffer, meaning the output + -- is pipelined an extra cycle. This differs from the + -- icache. The writeback logic needs to take that into + -- account by using 1-cycle delayed signals for load hits. -- rams: for i in 0 to NUM_WAYS-1 generate signal do_read : std_ulogic; @@ -479,7 +525,8 @@ begin way: entity work.cache_ram generic map ( ROW_BITS => ROW_BITS, - WIDTH => wishbone_data_bits + WIDTH => wishbone_data_bits, + ADD_BUF => true ) port map ( clk => clk, @@ -493,13 +540,8 @@ begin ); process(all) begin - do_read <= '0'; - do_write <= '0'; - -- Cache hit reads - if req_op = OP_LOAD_HIT and req_hit_way = i then - do_read <= '1'; - end if; + do_read <= '1'; rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); cache_out(i) <= dout; @@ -507,19 +549,30 @@ begin -- -- Defaults to wishbone read responses (cache refill), -- - wr_data <= wishbone_in.dat; - wr_sel <= (others => '1'); - wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS)); + -- For timing, the mux on wr_data/sel/addr is not dependent on anything + -- other than the current state. Only the do_write signal is. + -- + if r.state = IDLE then + -- When IDLE, the only write path is the store-hit update case + wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); + wr_data <= store_data; + wr_sel <= bus_sel; + else + -- Otherwise, we might be doing a reload + wr_data <= wishbone_in.dat; + wr_sel <= (others => '1'); + wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS)); + end if; + + -- The two actual write cases here + do_write <= '0'; if r.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r.store_way = i then do_write <= '1'; end if; - - -- Alternatively, store-hit BRAM update case (exclusive from the above). if req_op = OP_STORE_HIT and req_hit_way = i then - report "store_data:" & to_hstring(store_data); - wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= store_data; - wr_sel <= bus_sel; + assert r.state /= RELOAD_WAIT_ACK report "Store hit while in state:" & + state_t'image(r.state) + severity FAILURE; do_write <= '1'; end if; end process; @@ -531,6 +584,16 @@ begin dcache_fast_hit : process(clk) begin if rising_edge(clk) then + -- 1-cycle delayed signals for load hit response + r.hit_load_delayed <= r.hit_load_valid; + r.hit_way_delayed <= r.hit_way; + r.hit_load_upd_delayed <= r.req_latch.update; + r.hit_load_reg_delayed <= r.req_latch.write_reg; + r.hit_data_shift_delayed <= r.req_latch.addr(2 downto 0); + r.hit_sign_ext_delayed <= r.req_latch.sign_extend; + r.hit_byte_rev_delayed <= r.req_latch.byte_reverse; + r.hit_dlength_delayed <= r.req_latch.length; + -- On-cycle pulse values get reset on every cycle r.hit_load_valid <= '0'; @@ -543,7 +606,7 @@ begin if d_in.valid = '1' then r.req_latch <= d_in; - report "dcache op:" & op_t'image(req_op) & + report "op:" & op_t'image(req_op) & " addr:" & to_hstring(d_in.addr) & " upd:" & std_ulogic'image(d_in.update) & " nc:" & std_ulogic'image(d_in.nc) & @@ -712,6 +775,9 @@ begin end if; when LOAD_UPDATE => + -- We need the extra cycle to complete a load with update + r.state <= LOAD_UPDATE2; + when LOAD_UPDATE2 => -- We need the extra cycle to complete a load with update r.update_valid <= '1'; r.state <= IDLE;