dcache: Introduce an extra cycle latency to make timing

This makes the BRAMs use an output buffer, introducing an extra
cycle latency. Without this, Vivado won't make timing at 100Mhz.

We stash all the necessary response data in delayed latches, the
extra cycle is NOT a state in the state machine, thus it's fully
pipelined and doesn't involve stalling.

This introduces an extra non-pipelined cycle for loads with update
to avoid collision on the writeback output between the now delayed
load data and the register update. We could avoid it by moving
the register update in the pipeline bubble created by the extra
update state, but it's a bit trickier, so I leave that for a latter
optimization.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
jtag-port
Benjamin Herrenschmidt 5 years ago
parent b513f0fb48
commit 174378b190

@ -7,7 +7,8 @@ entity cache_ram is
generic( generic(
ROW_BITS : integer := 16; ROW_BITS : integer := 16;
WIDTH : integer := 64; WIDTH : integer := 64;
TRACE : boolean := false TRACE : boolean := false;
ADD_BUF : boolean := false
); );


port( port(
@ -33,6 +34,8 @@ architecture rtl of cache_ram is
attribute ram_decomp : string; attribute ram_decomp : string;
attribute ram_decomp of ram : signal is "power"; attribute ram_decomp of ram : signal is "power";


signal rd_data0 : std_logic_vector(WIDTH - 1 downto 0);

begin begin
process(clk) process(clk)
variable lbit : integer range 0 to WIDTH - 1; variable lbit : integer range 0 to WIDTH - 1;
@ -56,7 +59,7 @@ begin
end loop; end loop;
end if; end if;
if rd_en = '1' then if rd_en = '1' then
rd_data <= ram(to_integer(unsigned(rd_addr))); rd_data0 <= ram(to_integer(unsigned(rd_addr)));
if TRACE then if TRACE then
report "read a:" & to_hstring(rd_addr) & report "read a:" & to_hstring(rd_addr) &
" dat:" & to_hstring(ram(to_integer(unsigned(rd_addr)))); " dat:" & to_hstring(ram(to_integer(unsigned(rd_addr))));
@ -64,4 +67,20 @@ begin
end if; end if;
end if; end if;
end process; end process;

buf: if ADD_BUF generate
begin
process(clk)
begin
if rising_edge(clk) then
rd_data <= rd_data0;
end if;
end process;
end generate;

nobuf: if not ADD_BUF generate
begin
rd_data <= rd_data0;
end generate;

end; end;

@ -7,6 +7,9 @@
-- * Complete load misses on the cycle when WB data comes instead of -- * Complete load misses on the cycle when WB data comes instead of
-- at the end of line (this requires dealing with requests coming in -- at the end of line (this requires dealing with requests coming in
-- while not idle...) -- while not idle...)
-- * Load with update could use one less non-pipelined cycle by moving
-- the register update to the pipeline bubble that exists when going
-- back to the IDLE state.
-- --
library ieee; library ieee;
use ieee.std_logic_1164.all; use ieee.std_logic_1164.all;
@ -138,7 +141,8 @@ architecture rtl of dcache is
-- Cache state machine -- Cache state machine
type state_t is (IDLE, -- Normal load hit processing type state_t is (IDLE, -- Normal load hit processing
LOAD_UPDATE, -- Load with update address update cycle LOAD_UPDATE, -- Load with update extra cycle
LOAD_UPDATE2, -- Load with update extra cycle
RELOAD_WAIT_ACK, -- Cache reload wait ack RELOAD_WAIT_ACK, -- Cache reload wait ack
STORE_WAIT_ACK, -- Store wait ack STORE_WAIT_ACK, -- Store wait ack
NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack
@ -147,8 +151,20 @@ architecture rtl of dcache is
req_latch : Loadstore1ToDcacheType; req_latch : Loadstore1ToDcacheType;
-- Cache hit state (Latches for 1 cycle BRAM access) -- Cache hit state (Latches for 1 cycle BRAM access)
hit_way : way_t; hit_way : way_t;
hit_load_valid : std_ulogic; hit_load_valid : std_ulogic;

-- 1-cycle delayed signals to account for the BRAM extra
-- buffer that seems necessary to make timing on load hits
--
hit_way_delayed : way_t;
hit_load_delayed : std_ulogic;
hit_load_upd_delayed : std_ulogic;
hit_load_reg_delayed : std_ulogic_vector(4 downto 0);
hit_data_shift_delayed : std_ulogic_vector(2 downto 0);
hit_dlength_delayed : std_ulogic_vector(3 downto 0);
hit_sign_ext_delayed : std_ulogic;
hit_byte_rev_delayed : std_ulogic;


-- Register update (load/store with update) -- Register update (load/store with update)
update_valid : std_ulogic; update_valid : std_ulogic;
@ -382,72 +398,97 @@ begin
-- Writeback (loads and reg updates) & completion control logic -- Writeback (loads and reg updates) & completion control logic
-- --
writeback_control: process(all) writeback_control: process(all)
variable writeback_format : boolean;
begin begin


-- The mux on d_out.write reg defaults to the normal load hit case. -- The mux on d_out.write reg defaults to the normal load hit case.
d_out.write_enable <= '0'; d_out.write_enable <= '0';
d_out.valid <= '0'; d_out.valid <= '0';
d_out.write_reg <= r.req_latch.write_reg; d_out.write_reg <= r.hit_load_reg_delayed;
d_out.write_data <= cache_out(r.hit_way); d_out.write_data <= cache_out(r.hit_way_delayed);
d_out.write_len <= r.req_latch.length; d_out.write_len <= r.hit_dlength_delayed;
d_out.write_shift <= r.req_latch.addr(2 downto 0); d_out.write_shift <= r.hit_data_shift_delayed;
d_out.sign_extend <= r.req_latch.sign_extend; d_out.sign_extend <= r.hit_sign_ext_delayed;
d_out.byte_reverse <= r.req_latch.byte_reverse; d_out.byte_reverse <= r.hit_byte_rev_delayed;
d_out.second_word <= '0'; d_out.second_word <= '0';


-- By default writeback doesn't need formatting
writeback_format := false;

-- We have a valid load or store hit or we just completed a slow -- We have a valid load or store hit or we just completed a slow
-- op such as a load miss, a NC load or a store -- op such as a load miss, a NC load or a store
-- --
if r.hit_load_valid = '1' or r.slow_valid = '1' then -- Note: the load hit is delayed by one cycle. However it can still
if r.req_latch.load = '1' then -- not collide with r.slow_valid (well unless I miscalculated) because
-- If it's a load, enable write back and enable formatting -- slow_valid can only be set on a subsequent request and not on its
d_out.write_enable <= '1'; -- first cycle (the state machine must have advanced), which makes
writeback_format := true; -- slow_valid at least 2 cycles from the previous hit_load_valid.
--


-- If it's a slow load (miss or NC) source it from the buffer -- Sanity: Only one of these must be set in any given cycle
if r.slow_valid = '1' then assert (r.update_valid and r.hit_load_delayed) /= '1' report
d_out.write_data <= r.slow_data; "unexpected hit_load_delayed collision with update_valid"
severity FAILURE;
assert (r.slow_valid and r.hit_load_delayed) /= '1' report
"unexpected hit_load_delayed collision with slow_valid"
severity FAILURE;
assert (r.slow_valid and r.update_valid) /= '1' report
"unexpected update_valid collision with slow_valid"
severity FAILURE;

-- Delayed load hit case is the standard path
if r.hit_load_delayed = '1' then
d_out.write_enable <= '1';

-- If it's not a load with update, complete it now
if r.hit_load_upd_delayed = '0' then
d_out.valid <= '1';
end if; end if;
end if;

-- Slow ops (load miss, NC, stores)
if r.slow_valid = '1' then
-- If it's a load, enable register writeback and switch
-- mux accordingly
--
if r.req_latch.load then
d_out.write_reg <= r.req_latch.write_reg;
d_out.write_enable <= '1';


-- If it's a normal load (not a load with update), we complete -- Read data comes from the slow data latch, formatter
-- now, otherwise we wait for the delayed update. -- from the latched request.
-- --
if r.req_latch.update = '0' then d_out.write_data <= r.slow_data;
d_out.valid <= '1'; d_out.write_shift <= r.req_latch.addr(2 downto 0);
end if; d_out.sign_extend <= r.req_latch.sign_extend;
else d_out.byte_reverse <= r.req_latch.byte_reverse;
-- It's a store, complete always d_out.write_len <= r.req_latch.length;
d_out.valid <= '1';
end if; end if;


-- Sanity -- If it's a store or a non-update load form, complete now
assert r.update_valid = '0' report "unexpected update_valid" if r.req_latch.load = '0' or r.req_latch.update = '0' then
severity FAILURE; d_out.valid <= '1';
end if;
end if; end if;


-- We have a register update to do. -- We have a register update to do.
if r.update_valid = '1' then if r.update_valid = '1' then
d_out.write_enable <= '1'; d_out.write_enable <= '1';
d_out.write_reg <= r.req_latch.update_reg; d_out.write_reg <= r.req_latch.update_reg;

-- Change the read data mux to the address that's going into
-- the register and the formatter does nothing.
--
d_out.write_data <= r.req_latch.addr; d_out.write_data <= r.req_latch.addr;
d_out.write_shift <= "000";
d_out.write_len <= "1000";
d_out.sign_extend <= '0';
d_out.byte_reverse <= '0';


-- If it was a load, this completes the operation -- If it was a load, this completes the operation (load with
-- update case).
--
if r.req_latch.load = '1' then if r.req_latch.load = '1' then
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;
end if; end if;


if not writeback_format then
d_out.write_len <= "1000";
d_out.write_shift <= "000";
d_out.sign_extend <= '0';
d_out.byte_reverse <= '0';
end if;

end process; end process;


-- Misc data & sel signals -- Misc data & sel signals
@ -465,7 +506,12 @@ begin


-- Generate a cache RAM for each way. This handles the normal -- Generate a cache RAM for each way. This handles the normal
-- reads, writes from reloads and the special store-hit update -- reads, writes from reloads and the special store-hit update
-- path as well -- path as well.
--
-- Note: the BRAMs have an extra read buffer, meaning the output
-- is pipelined an extra cycle. This differs from the
-- icache. The writeback logic needs to take that into
-- account by using 1-cycle delayed signals for load hits.
-- --
rams: for i in 0 to NUM_WAYS-1 generate rams: for i in 0 to NUM_WAYS-1 generate
signal do_read : std_ulogic; signal do_read : std_ulogic;
@ -479,7 +525,8 @@ begin
way: entity work.cache_ram way: entity work.cache_ram
generic map ( generic map (
ROW_BITS => ROW_BITS, ROW_BITS => ROW_BITS,
WIDTH => wishbone_data_bits WIDTH => wishbone_data_bits,
ADD_BUF => true
) )
port map ( port map (
clk => clk, clk => clk,
@ -493,13 +540,8 @@ begin
); );
process(all) process(all)
begin begin
do_read <= '0';
do_write <= '0';

-- Cache hit reads -- Cache hit reads
if req_op = OP_LOAD_HIT and req_hit_way = i then do_read <= '1';
do_read <= '1';
end if;
rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
cache_out(i) <= dout; cache_out(i) <= dout;


@ -507,19 +549,30 @@ begin
-- --
-- Defaults to wishbone read responses (cache refill), -- Defaults to wishbone read responses (cache refill),
-- --
wr_data <= wishbone_in.dat; -- For timing, the mux on wr_data/sel/addr is not dependent on anything
wr_sel <= (others => '1'); -- other than the current state. Only the do_write signal is.
wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS)); --
if r.state = IDLE then
-- When IDLE, the only write path is the store-hit update case
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= store_data;
wr_sel <= bus_sel;
else
-- Otherwise, we might be doing a reload
wr_data <= wishbone_in.dat;
wr_sel <= (others => '1');
wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS));
end if;

-- The two actual write cases here
do_write <= '0';
if r.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r.store_way = i then if r.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r.store_way = i then
do_write <= '1'; do_write <= '1';
end if; end if;

-- Alternatively, store-hit BRAM update case (exclusive from the above).
if req_op = OP_STORE_HIT and req_hit_way = i then if req_op = OP_STORE_HIT and req_hit_way = i then
report "store_data:" & to_hstring(store_data); assert r.state /= RELOAD_WAIT_ACK report "Store hit while in state:" &
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); state_t'image(r.state)
wr_data <= store_data; severity FAILURE;
wr_sel <= bus_sel;
do_write <= '1'; do_write <= '1';
end if; end if;
end process; end process;
@ -531,6 +584,16 @@ begin
dcache_fast_hit : process(clk) dcache_fast_hit : process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
-- 1-cycle delayed signals for load hit response
r.hit_load_delayed <= r.hit_load_valid;
r.hit_way_delayed <= r.hit_way;
r.hit_load_upd_delayed <= r.req_latch.update;
r.hit_load_reg_delayed <= r.req_latch.write_reg;
r.hit_data_shift_delayed <= r.req_latch.addr(2 downto 0);
r.hit_sign_ext_delayed <= r.req_latch.sign_extend;
r.hit_byte_rev_delayed <= r.req_latch.byte_reverse;
r.hit_dlength_delayed <= r.req_latch.length;

-- On-cycle pulse values get reset on every cycle -- On-cycle pulse values get reset on every cycle
r.hit_load_valid <= '0'; r.hit_load_valid <= '0';


@ -543,7 +606,7 @@ begin
if d_in.valid = '1' then if d_in.valid = '1' then
r.req_latch <= d_in; r.req_latch <= d_in;


report "dcache op:" & op_t'image(req_op) & report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(d_in.addr) & " addr:" & to_hstring(d_in.addr) &
" upd:" & std_ulogic'image(d_in.update) & " upd:" & std_ulogic'image(d_in.update) &
" nc:" & std_ulogic'image(d_in.nc) & " nc:" & std_ulogic'image(d_in.nc) &
@ -712,6 +775,9 @@ begin
end if; end if;


when LOAD_UPDATE => when LOAD_UPDATE =>
-- We need the extra cycle to complete a load with update
r.state <= LOAD_UPDATE2;
when LOAD_UPDATE2 =>
-- We need the extra cycle to complete a load with update -- We need the extra cycle to complete a load with update
r.update_valid <= '1'; r.update_valid <= '1';
r.state <= IDLE; r.state <= IDLE;

Loading…
Cancel
Save