From 174378b190fe47f12dc3f9ec90756a8f8df3f05f Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 10 Oct 2019 11:25:16 +1100
Subject: [PATCH] dcache: Introduce an extra cycle latency to make timing

This makes the BRAMs use an output buffer, introducing an extra
cycle latency. Without this, Vivado won't make timing at 100Mhz.

We stash all the necessary response data in delayed latches, the
extra cycle is NOT a state in the state machine, thus it's fully
pipelined and doesn't involve stalling.

This introduces an extra non-pipelined cycle for loads with update
to avoid collision on the writeback output between the now delayed
load data and the register update. We could avoid it by moving
the register update in the pipeline bubble created by the extra
update state, but it's a bit trickier, so I leave that for a latter
optimization.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 cache_ram.vhdl |  23 ++++++-
 dcache.vhdl    | 182 +++++++++++++++++++++++++++++++++----------------
 2 files changed, 145 insertions(+), 60 deletions(-)

diff --git a/cache_ram.vhdl b/cache_ram.vhdl
index 346c6fe..7a10a1c 100644
--- a/cache_ram.vhdl
+++ b/cache_ram.vhdl
@@ -7,7 +7,8 @@ entity cache_ram is
     generic(
 	ROW_BITS : integer := 16;
 	WIDTH    : integer := 64;
-	TRACE    : boolean := false
+	TRACE    : boolean := false;
+	ADD_BUF  : boolean := false
 	);
 
     port(
@@ -33,6 +34,8 @@ architecture rtl of cache_ram is
     attribute ram_decomp : string;
     attribute ram_decomp of ram : signal is "power";
 
+    signal rd_data0 : std_logic_vector(WIDTH - 1 downto 0);
+
 begin
     process(clk)
 	variable lbit : integer range 0 to WIDTH - 1;
@@ -56,7 +59,7 @@ begin
 		end loop;
 	    end if;
 	    if rd_en = '1' then
-		rd_data <= ram(to_integer(unsigned(rd_addr)));
+		rd_data0 <= ram(to_integer(unsigned(rd_addr)));
 		if TRACE then
 		    report "read a:" & to_hstring(rd_addr) &
 			" dat:" & to_hstring(ram(to_integer(unsigned(rd_addr))));
@@ -64,4 +67,20 @@ begin
 	    end if;
 	end if;
     end process;
+
+    buf: if ADD_BUF generate
+    begin
+	process(clk)
+	begin
+	    if rising_edge(clk) then
+		rd_data <= rd_data0;
+	    end if;
+	end process;
+    end generate;
+
+    nobuf: if not ADD_BUF generate
+    begin
+	rd_data <= rd_data0;
+    end generate;
+
 end;
diff --git a/dcache.vhdl b/dcache.vhdl
index f771eae..c0d0469 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -7,6 +7,9 @@
 -- * Complete load misses on the cycle when WB data comes instead of
 --   at the end of line (this requires dealing with requests coming in
 --   while not idle...)
+-- * Load with update could use one less non-pipelined cycle by moving
+--   the register update to the pipeline bubble that exists when going
+--   back to the IDLE state.
 --
 library ieee;
 use ieee.std_logic_1164.all;
@@ -138,7 +141,8 @@ architecture rtl of dcache is
 		      
     -- Cache state machine
     type state_t is (IDLE,	       -- Normal load hit processing
-		     LOAD_UPDATE,      -- Load with update address update cycle
+		     LOAD_UPDATE,      -- Load with update extra cycle
+		     LOAD_UPDATE2,     -- Load with update extra cycle
 		     RELOAD_WAIT_ACK,  -- Cache reload wait ack
 		     STORE_WAIT_ACK,   -- Store wait ack
 		     NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack
@@ -147,8 +151,20 @@ architecture rtl of dcache is
 	req_latch : Loadstore1ToDcacheType;
 	
 	-- Cache hit state (Latches for 1 cycle BRAM access)
-	hit_way        : way_t;
-	hit_load_valid : std_ulogic;
+	hit_way          : way_t;
+	hit_load_valid   : std_ulogic;
+
+	-- 1-cycle delayed signals to account for the BRAM extra
+	-- buffer that seems necessary to make timing on load hits
+	--
+	hit_way_delayed        : way_t;
+	hit_load_delayed       : std_ulogic;
+	hit_load_upd_delayed   : std_ulogic;
+	hit_load_reg_delayed   : std_ulogic_vector(4 downto 0);
+	hit_data_shift_delayed : std_ulogic_vector(2 downto 0);
+	hit_dlength_delayed    : std_ulogic_vector(3 downto 0);
+	hit_sign_ext_delayed   : std_ulogic;
+	hit_byte_rev_delayed   : std_ulogic;
 
 	-- Register update (load/store with update)
 	update_valid : std_ulogic;
@@ -382,72 +398,97 @@ begin
     -- Writeback (loads and reg updates) & completion control logic
     --
     writeback_control: process(all)
-	variable writeback_format : boolean;
     begin
 
 	-- The mux on d_out.write reg defaults to the normal load hit case.
 	d_out.write_enable <= '0';
 	d_out.valid <= '0';
-	d_out.write_reg <= r.req_latch.write_reg;
-	d_out.write_data <= cache_out(r.hit_way);
-	d_out.write_len <= r.req_latch.length;
-	d_out.write_shift <= r.req_latch.addr(2 downto 0);
-	d_out.sign_extend <= r.req_latch.sign_extend;
-	d_out.byte_reverse <= r.req_latch.byte_reverse;
+	d_out.write_reg <= r.hit_load_reg_delayed;
+	d_out.write_data <= cache_out(r.hit_way_delayed);
+	d_out.write_len <= r.hit_dlength_delayed;
+	d_out.write_shift <= r.hit_data_shift_delayed;
+	d_out.sign_extend <= r.hit_sign_ext_delayed;
+	d_out.byte_reverse <= r.hit_byte_rev_delayed;
 	d_out.second_word <= '0';
 
-	-- By default writeback doesn't need formatting
-	writeback_format := false;
-
 	-- We have a valid load or store hit or we just completed a slow
 	-- op such as a load miss, a NC load or a store
 	--
-	if r.hit_load_valid = '1' or r.slow_valid = '1' then
-	    if r.req_latch.load = '1' then
-		-- If it's a load, enable write back and enable formatting
-		d_out.write_enable <= '1';
-		writeback_format := true;
+	-- Note: the load hit is delayed by one cycle. However it can still
+	-- not collide with r.slow_valid (well unless I miscalculated) because
+	-- slow_valid can only be set on a subsequent request and not on its
+	-- first cycle (the state machine must have advanced), which makes
+	-- slow_valid at least 2 cycles from the previous hit_load_valid.
+	--
 
-		-- If it's a slow load (miss or NC) source it from the buffer
-		if r.slow_valid = '1' then
-		    d_out.write_data <= r.slow_data;
+	-- Sanity: Only one of these must be set in any given cycle
+	assert (r.update_valid and r.hit_load_delayed) /= '1' report
+	    "unexpected hit_load_delayed collision with update_valid"
+	    severity FAILURE;
+	assert (r.slow_valid and r.hit_load_delayed) /= '1' report
+	    "unexpected hit_load_delayed collision with slow_valid"
+	    severity FAILURE;
+	assert (r.slow_valid and r.update_valid) /= '1' report
+	    "unexpected update_valid collision with slow_valid"
+	    severity FAILURE;
+
+	-- Delayed load hit case is the standard path
+	if r.hit_load_delayed = '1' then
+	    d_out.write_enable <= '1';
+
+	    -- If it's not a load with update, complete it now
+	    if r.hit_load_upd_delayed = '0' then
+		d_out.valid <= '1';
 		end if;
+	end if;
+
+	-- Slow ops (load miss, NC, stores)
+	if r.slow_valid = '1' then
+	    -- If it's a load, enable register writeback and switch
+	    -- mux accordingly
+	    --
+	    if r.req_latch.load then	    
+		d_out.write_reg <= r.req_latch.write_reg;
+		d_out.write_enable <= '1';
 
-		-- If it's a normal load (not a load with update), we complete
-		-- now, otherwise we wait for the delayed update.
+		-- Read data comes from the slow data latch, formatter
+		-- from the latched request.
 		--
-		if r.req_latch.update = '0' then
-		    d_out.valid <= '1';
-		end if;
-	    else
-		-- It's a store, complete always
-		d_out.valid <= '1';
+		d_out.write_data <= r.slow_data;
+		d_out.write_shift <= r.req_latch.addr(2 downto 0);
+		d_out.sign_extend <= r.req_latch.sign_extend;
+		d_out.byte_reverse <= r.req_latch.byte_reverse;
+		d_out.write_len <= r.req_latch.length;
 	    end if;
 
-	    -- Sanity
-	    assert r.update_valid = '0' report "unexpected update_valid"
-		severity FAILURE;
+	    -- If it's a store or a non-update load form, complete now
+	    if r.req_latch.load = '0' or r.req_latch.update = '0' then
+		d_out.valid <= '1';
+	    end if;
 	end if;
 
 	-- We have a register update to do.
 	if r.update_valid = '1' then
 	    d_out.write_enable <= '1';
 	    d_out.write_reg <= r.req_latch.update_reg;
+
+	    -- Change the read data mux to the address that's going into
+	    -- the register and the formatter does nothing.
+	    --
 	    d_out.write_data <= r.req_latch.addr;
+	    d_out.write_shift <= "000";
+	    d_out.write_len <= "1000";
+	    d_out.sign_extend <= '0';
+	    d_out.byte_reverse <= '0';
 
-	    -- If it was a load, this completes the operation
+	    -- If it was a load, this completes the operation (load with
+	    -- update case).
+	    --
 	    if r.req_latch.load = '1' then
 		d_out.valid <= '1';
 	    end if;
 	end if;
 
-	if not writeback_format then
-	    d_out.write_len <= "1000";
-	    d_out.write_shift <= "000";
-	    d_out.sign_extend <= '0';
-	    d_out.byte_reverse <= '0';
-	end if;
-
     end process;
 
     -- Misc data & sel signals
@@ -465,7 +506,12 @@ begin
 
     -- Generate a cache RAM for each way. This handles the normal
     -- reads, writes from reloads and the special store-hit update
-    -- path as well
+    -- path as well.
+    --
+    -- Note: the BRAMs have an extra read buffer, meaning the output
+    --       is pipelined an extra cycle. This differs from the
+    --       icache. The writeback logic needs to take that into
+    --       account by using 1-cycle delayed signals for load hits.
     --
     rams: for i in 0 to NUM_WAYS-1 generate
 	signal do_read  : std_ulogic;
@@ -479,7 +525,8 @@ begin
 	way: entity work.cache_ram
 	    generic map (
 		ROW_BITS => ROW_BITS,
-		WIDTH => wishbone_data_bits
+		WIDTH => wishbone_data_bits,
+		ADD_BUF => true
 		)
 	    port map (
 		clk     => clk,
@@ -493,13 +540,8 @@ begin
 		);
 	process(all)
 	begin
-	    do_read <= '0';
-	    do_write <= '0';
-
 	    -- Cache hit reads
-	    if req_op = OP_LOAD_HIT and req_hit_way = i then
-		do_read <= '1';
-	    end if;
+	    do_read <= '1';
 	    rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
 	    cache_out(i) <= dout;
 
@@ -507,19 +549,30 @@ begin
 	    --
 	    -- Defaults to wishbone read responses (cache refill),
 	    --
-	    wr_data <= wishbone_in.dat;
-	    wr_sel  <= (others => '1');
-	    wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS));
+	    -- For timing, the mux on wr_data/sel/addr is not dependent on anything
+	    -- other than the current state. Only the do_write signal is.
+	    --
+	    if r.state = IDLE then
+		-- When IDLE, the only write path is the store-hit update case
+		wr_addr  <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
+		wr_data  <= store_data;
+		wr_sel   <= bus_sel;
+	    else
+		-- Otherwise, we might be doing a reload
+		wr_data <= wishbone_in.dat;
+		wr_sel  <= (others => '1');
+		wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS));
+	    end if;
+
+	    -- The two actual write cases here
+	    do_write <= '0';
 	    if r.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r.store_way = i then
 		do_write <= '1';
 	    end if;
-
-	    -- Alternatively, store-hit BRAM update case (exclusive from the above).
 	    if req_op = OP_STORE_HIT and req_hit_way = i then
-		report "store_data:" & to_hstring(store_data);
-		wr_addr  <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
-		wr_data  <= store_data;
-		wr_sel   <= bus_sel;
+		assert r.state /= RELOAD_WAIT_ACK report "Store hit while in state:" &
+		    state_t'image(r.state)
+		    severity FAILURE;
 		do_write <= '1';
 	    end if;
 	end process;
@@ -531,6 +584,16 @@ begin
     dcache_fast_hit : process(clk)
     begin
         if rising_edge(clk) then
+	    -- 1-cycle delayed signals for load hit response
+	    r.hit_load_delayed       <= r.hit_load_valid;
+	    r.hit_way_delayed        <= r.hit_way;
+	    r.hit_load_upd_delayed   <= r.req_latch.update;
+	    r.hit_load_reg_delayed   <= r.req_latch.write_reg;
+	    r.hit_data_shift_delayed <= r.req_latch.addr(2 downto 0);
+	    r.hit_sign_ext_delayed   <= r.req_latch.sign_extend;
+	    r.hit_byte_rev_delayed   <= r.req_latch.byte_reverse;
+	    r.hit_dlength_delayed    <= r.req_latch.length;
+
 	    -- On-cycle pulse values get reset on every cycle
 	    r.hit_load_valid <= '0';
 
@@ -543,7 +606,7 @@ begin
 	    if d_in.valid = '1' then
 		r.req_latch <= d_in;
 
-		report "dcache op:" & op_t'image(req_op) &
+		report "op:" & op_t'image(req_op) &
 		    " addr:" & to_hstring(d_in.addr) &
 		    " upd:" & std_ulogic'image(d_in.update) &
 		    " nc:" & std_ulogic'image(d_in.nc) &
@@ -712,6 +775,9 @@ begin
 		    end if;
 
 		when LOAD_UPDATE =>
+		    -- We need the extra cycle to complete a load with update
+		    r.state <= LOAD_UPDATE2;
+		when LOAD_UPDATE2 =>
 		    -- We need the extra cycle to complete a load with update
 		    r.update_valid <= '1';
 		    r.state <= IDLE;