From e598188aca37c42a6a130d54c8d5432d6df8f89a Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 8 Oct 2019 23:26:23 +1100 Subject: [PATCH 01/11] plru: Improve sensitivity list Signed-off-by: Benjamin Herrenschmidt --- plru.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plru.vhdl b/plru.vhdl index 6907c2b..6caa2ef 100644 --- a/plru.vhdl +++ b/plru.vhdl @@ -29,7 +29,7 @@ begin -- XXX Check if we can turn that into a little ROM instead that -- takes the tree bit vector and returns the LRU. See if it's better -- in term of FPGA resouces usage... - get_lru: process(all) + get_lru: process(tree) variable node : node_t; begin node := 0; From a38ae503ff3a88852bcf27cc8b56ff110595081b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 10 Oct 2019 00:38:03 +1100 Subject: [PATCH 02/11] cache_ram: Add write-enables They will be needed by the dcache Signed-off-by: Benjamin Herrenschmidt --- cache_ram.vhdl | 25 +++++++++++++++++++++++-- icache.vhdl | 1 + 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cache_ram.vhdl b/cache_ram.vhdl index e0ffd17..346c6fe 100644 --- a/cache_ram.vhdl +++ b/cache_ram.vhdl @@ -6,7 +6,8 @@ use ieee.math_real.all; entity cache_ram is generic( ROW_BITS : integer := 16; - WIDTH : integer := 64 + WIDTH : integer := 64; + TRACE : boolean := false ); port( @@ -15,6 +16,7 @@ entity cache_ram is rd_addr : in std_logic_vector(ROW_BITS - 1 downto 0); rd_data : out std_logic_vector(WIDTH - 1 downto 0); wr_en : in std_logic; + wr_sel : in std_logic_vector(WIDTH/8 - 1 downto 0); wr_addr : in std_logic_vector(ROW_BITS - 1 downto 0); wr_data : in std_logic_vector(WIDTH - 1 downto 0) ); @@ -33,13 +35,32 @@ architecture rtl of cache_ram is begin process(clk) + variable lbit : integer range 0 to WIDTH - 1; + variable mbit : integer range 0 to WIDTH - 1; + variable widx : integer range 0 to SIZE - 1; begin if rising_edge(clk) then if wr_en = '1' then - ram(to_integer(unsigned(wr_addr))) <= wr_data; + 
if TRACE then + report "write a:" & to_hstring(wr_addr) & + " sel:" & to_hstring(wr_sel) & + " dat:" & to_hstring(wr_data); + end if; + for i in 0 to WIDTH/8-1 loop + lbit := i * 8; + mbit := lbit + 7; + widx := to_integer(unsigned(wr_addr)); + if wr_sel(i) = '1' then + ram(widx)(mbit downto lbit) <= wr_data(mbit downto lbit); + end if; + end loop; end if; if rd_en = '1' then rd_data <= ram(to_integer(unsigned(rd_addr))); + if TRACE then + report "read a:" & to_hstring(rd_addr) & + " dat:" & to_hstring(ram(to_integer(unsigned(rd_addr)))); + end if; end if; end if; end process; diff --git a/icache.vhdl b/icache.vhdl index 89e491e..804b648 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -290,6 +290,7 @@ begin rd_addr => rd_addr, rd_data => dout, wr_en => do_write, + wr_sel => (others => '1'), wr_addr => wr_addr, wr_data => wishbone_in.dat ); From 7b3df7cb05cb343982ceee25e9796d96abb2d71b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 10 Oct 2019 00:40:11 +1100 Subject: [PATCH 03/11] icache: Reduce simulation warnings This might slightly increase the logic in synthesis but avoids us looking at uninitialized tags when not servicing an active request Signed-off-by: Benjamin Herrenschmidt --- icache.vhdl | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/icache.vhdl b/icache.vhdl index 804b648..75695e1 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -274,6 +274,7 @@ begin -- Generate a cache RAM for each way rams: for i in 0 to NUM_WAYS-1 generate + signal do_read : std_ulogic; signal do_write : std_ulogic; signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0); signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); @@ -286,7 +287,7 @@ begin ) port map ( clk => clk, - rd_en => '1', -- fixme + rd_en => do_read, rd_addr => rd_addr, rd_data => dout, wr_en => do_write, @@ -296,6 +297,7 @@ begin ); process(all) begin + do_read <= '1'; do_write <= '0'; if wishbone_in.ack = '1' and r.store_way = i then do_write <= '1'; @@ -356,10 
+358,11 @@ begin hit_way := 0; is_hit := '0'; for i in way_t loop - if read_tag(i, cache_tags(req_index)) = req_tag and - cache_valids(req_index)(i) = '1' then - hit_way := i; - is_hit := '1'; + if i_in.req = '1' and cache_valids(req_index)(i) = '1' then + if read_tag(i, cache_tags(req_index)) = req_tag then + hit_way := i; + is_hit := '1'; + end if; end if; end loop; @@ -434,6 +437,9 @@ begin r.wb.dat <= (others => '0'); r.wb.sel <= "11111111"; r.wb.we <= '0'; + + -- Not useful normally but helps avoiding tons of sim warnings + r.wb.adr <= (others => '0'); else -- Main state machine case r.state is From b513f0fb48638aa0df8fe7f76a388498eecc1b79 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 10 Oct 2019 00:40:46 +1100 Subject: [PATCH 04/11] dcache: Add a dcache This replaces loadstore2 with a dcache The dcache unit is loosely based on the icache one (same basic cache layout), but has some significant logic additions to deal with stores, loads with update, non-cachable accesses and other differences due to operating in the execution part of the pipeline rather than the fetch part. The cache is store-through, though a hit with an existing line will update the line rather than invalidate it. 
Signed-off-by: Benjamin Herrenschmidt --- Makefile | 4 +- common.vhdl | 8 +- core.vhdl | 19 +- dcache.vhdl | 733 ++++++++++++++++++++++++++++++++++++++++++++ loadstore1.vhdl | 20 +- loadstore2.vhdl | 148 --------- microwatt.core | 2 +- wishbone_types.vhdl | 4 +- writeback.vhdl | 2 +- 9 files changed, 772 insertions(+), 168 deletions(-) create mode 100644 dcache.vhdl delete mode 100644 loadstore2.vhdl diff --git a/Makefile b/Makefile index 5525c1e..bc5388b 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ common.o: decode_types.o control.o: gpr_hazard.o cr_hazard.o sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o -core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o loadstore2.o multiply.o writeback.o core_debug.o divider.o +core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o multiply.o writeback.o core_debug.o divider.o core_debug.o: common.o countzero.o: countzero_tb.o: common.o glibc_random.o countzero.o @@ -37,9 +37,9 @@ plru.o: plru_tb.o: plru.o icache.o: common.o wishbone_types.o plru.o cache_ram.o icache_tb.o: common.o wishbone_types.o icache.o simple_ram_behavioural.o +dcache.o: common.o wishbone_types.o plru.o cache_ram.o insn_helpers.o: loadstore1.o: common.o helpers.o -loadstore2.o: common.o helpers.o wishbone_types.o logical.o: decode_types.o multiply_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o multiply.o multiply.o: common.o decode_types.o diff --git a/common.vhdl b/common.vhdl index b353922..3e9da69 100644 --- a/common.vhdl +++ b/common.vhdl @@ -138,9 +138,10 @@ package common is end record; constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0')); - type Loadstore1ToLoadstore2Type is record + type 
Loadstore1ToDcacheType is record valid : std_ulogic; load : std_ulogic; + nc : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); write_reg : std_ulogic_vector(4 downto 0); @@ -151,7 +152,7 @@ package common is update_reg : std_ulogic_vector(4 downto 0); end record; - type Loadstore2ToWritebackType is record + type DcacheToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; write_reg : std_ulogic_vector(4 downto 0); @@ -162,7 +163,7 @@ package common is byte_reverse : std_ulogic; second_word : std_ulogic; end record; - constant Loadstore2ToWritebackInit : Loadstore2ToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', byte_reverse => '0', second_word => '0', others => (others => '0')); + constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', byte_reverse => '0', second_word => '0', others => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; @@ -211,6 +212,7 @@ package common is write_cr_data : std_ulogic_vector(31 downto 0); end record; constant WritebackToCrFileInit : WritebackToCrFileType := (write_cr_enable => '0', others => (others => '0')); + end common; package body common is diff --git a/core.vhdl b/core.vhdl index 5a269a2..e9cd28b 100644 --- a/core.vhdl +++ b/core.vhdl @@ -59,8 +59,8 @@ architecture behave of core is -- load store signals signal decode2_to_loadstore1: Decode2ToLoadstore1Type; - signal loadstore1_to_loadstore2: Loadstore1ToLoadstore2Type; - signal loadstore2_to_writeback: Loadstore2ToWritebackType; + signal loadstore1_to_dcache: Loadstore1ToDcacheType; + signal dcache_to_writeback: DcacheToWritebackType; -- multiply signals signal decode2_to_multiply: Decode2ToMultiplyType; @@ -211,16 +211,17 @@ begin port map ( clk => clk, l_in => decode2_to_loadstore1, - l_out => loadstore1_to_loadstore2 + l_out => loadstore1_to_dcache ); - loadstore2_0: entity work.loadstore2 + dcache_0: entity 
work.dcache port map ( clk => clk, - l_in => loadstore1_to_loadstore2, - w_out => loadstore2_to_writeback, - m_in => wishbone_data_in, - m_out => wishbone_data_out + rst => core_rst, + d_in => loadstore1_to_dcache, + d_out => dcache_to_writeback, + wishbone_in => wishbone_data_in, + wishbone_out => wishbone_data_out ); multiply_0: entity work.multiply @@ -242,7 +243,7 @@ begin port map ( clk => clk, e_in => execute1_to_writeback, - l_in => loadstore2_to_writeback, + l_in => dcache_to_writeback, m_in => multiply_to_writeback, d_in => divider_to_writeback, w_out => writeback_to_register_file, diff --git a/dcache.vhdl b/dcache.vhdl new file mode 100644 index 0000000..f771eae --- /dev/null +++ b/dcache.vhdl @@ -0,0 +1,733 @@ +-- +-- Set associative dcache write-through +-- +-- TODO (in no specific order): +-- +-- * See list in icache.vhdl +-- * Complete load misses on the cycle when WB data comes instead of +-- at the end of line (this requires dealing with requests coming in +-- while not idle...) 
+-- +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.common.all; +use work.helpers.all; +use work.wishbone_types.all; + +entity dcache is + generic ( + -- Line size in bytes + LINE_SIZE : positive := 64; + -- Number of lines in a set + NUM_LINES : positive := 32; + -- Number of ways + NUM_WAYS : positive := 4 + ); + port ( + clk : in std_ulogic; + rst : in std_ulogic; + + d_in : in Loadstore1ToDcacheType; + d_out : out DcacheToWritebackType; + + wishbone_out : out wishbone_master_out; + wishbone_in : in wishbone_slave_out + ); +end entity dcache; + +architecture rtl of dcache is + function log2(i : natural) return integer is + variable tmp : integer := i; + variable ret : integer := 0; + begin + while tmp > 1 loop + ret := ret + 1; + tmp := tmp / 2; + end loop; + return ret; + end function; + + function ispow2(i : integer) return boolean is + begin + if to_integer(to_unsigned(i, 32) and to_unsigned(i - 1, 32)) = 0 then + return true; + else + return false; + end if; + end function; + + -- BRAM organisation: We never access more than wishbone_data_bits at + -- a time so to save resources we make the array only that wide, and + -- use consecutive indices for to make a cache "line" + -- + -- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits) + constant ROW_SIZE : natural := wishbone_data_bits / 8; + -- ROW_PER_LINE is the number of row (wishbone transactions) in a line + constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE; + -- BRAM_ROWS is the number of rows in BRAM needed to represent the full + -- dcache + constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE; + + -- Bit fields counts in the address + + -- ROW_BITS is the number of bits to select a row + constant ROW_BITS : natural := log2(BRAM_ROWS); + -- ROW_LINEBITS is the number of bits to select a row within a line + constant ROW_LINEBITS : natural := log2(ROW_PER_LINE); + -- LINE_OFF_BITS is the number of bits for the offset in a cache 
line + constant LINE_OFF_BITS : natural := log2(LINE_SIZE); + -- ROW_OFF_BITS is the number of bits for the offset in a row + constant ROW_OFF_BITS : natural := log2(ROW_SIZE); + -- INDEX_BITS is the number if bits to select a cache line + constant INDEX_BITS : natural := log2(NUM_LINES); + -- TAG_BITS is the number of bits of the tag part of the address + constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; + -- WAY_BITS is the number of bits to select a way + constant WAY_BITS : natural := log2(NUM_WAYS); + + -- Example of layout for 32 lines of 64 bytes: + -- + -- .. tag |index| line | + -- .. | row | | + -- .. | |---| | ROW_LINEBITS (3) + -- .. | |--- - --| LINE_OFF_BITS (6) + -- .. | |- --| ROW_OFF_BITS (3) + -- .. |----- ---| | ROW_BITS (8) + -- .. |-----| | INDEX_BITS (5) + -- .. --------| | TAG_BITS (53) + + subtype row_t is integer range 0 to BRAM_ROWS-1; + subtype index_t is integer range 0 to NUM_LINES-1; + subtype way_t is integer range 0 to NUM_WAYS-1; + + -- The cache data BRAM organized as described above for each way + subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0); + + -- The cache tags LUTRAM has a row per set. Vivado is a pain and will + -- not handle a clean (commented) definition of the cache tags as a 3d + -- memory. For now, work around it by putting all the tags + subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); +-- type cache_tags_set_t is array(way_t) of cache_tag_t; +-- type cache_tags_array_t is array(index_t) of cache_tags_set_t; + constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS; + subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0); + type cache_tags_array_t is array(index_t) of cache_tags_set_t; + + -- The cache valid bits + subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); + type cache_valids_t is array(index_t) of cache_way_valids_t; + + -- Storage. 
Hopefully "cache_rows" is a BRAM, the rest is LUTs + signal cache_tags : cache_tags_array_t; + signal cache_valids : cache_valids_t; + + attribute ram_style : string; + attribute ram_style of cache_tags : signal is "distributed"; + + -- Type of operation on a "valid" input + type op_t is (OP_NONE, + OP_LOAD_HIT, -- Cache hit on load + OP_LOAD_MISS, -- Load missing cache + OP_LOAD_NC, -- Non-cachable load + OP_BAD, -- BAD: Cache hit on NC load/store + OP_STORE_HIT, -- Store hitting cache + OP_STORE_MISS); -- Store missing cache + + -- Cache state machine + type state_t is (IDLE, -- Normal load hit processing + LOAD_UPDATE, -- Load with update address update cycle + RELOAD_WAIT_ACK, -- Cache reload wait ack + STORE_WAIT_ACK, -- Store wait ack + NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack + + type reg_internal_t is record + req_latch : Loadstore1ToDcacheType; + + -- Cache hit state (Latches for 1 cycle BRAM access) + hit_way : way_t; + hit_load_valid : std_ulogic; + + -- Register update (load/store with update) + update_valid : std_ulogic; + + -- Data buffer for "slow" read ops (load miss and NC loads). 
+ slow_data : std_ulogic_vector(63 downto 0); + slow_valid : std_ulogic; + + -- Cache miss state (reload state machine) + state : state_t; + wb : wishbone_master_out; + store_way : way_t; + store_index : index_t; + end record; + + signal r : reg_internal_t; + + -- Async signals on incoming request + signal req_index : index_t; + signal req_row : row_t; + signal req_hit_way : way_t; + signal req_tag : cache_tag_t; + signal req_op : op_t; + + -- Cache RAM interface + type cache_ram_out_t is array(way_t) of cache_row_t; + signal cache_out : cache_ram_out_t; + + -- PLRU output interface + type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_victim : plru_out_t; + + -- Wishbone read/write/cache write formatting signals + signal bus_sel : wishbone_sel_type; + signal store_data : wishbone_data_type; + + -- Return the cache line index (tag index) for an address + function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is + begin + return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS))); + end; + + -- Return the cache row index (data memory) for an address + function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is + begin + return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS))); + end; + + -- Returns whether this is the last row of a line + function is_last_row(addr: std_ulogic_vector(63 downto 0)) return boolean is + constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + begin + return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; + end; + + -- Return the address of the next row in the current cache line + function next_row_addr(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0); + variable result : std_ulogic_vector(63 downto 0); + begin + -- Is there no simpler way in VHDL to generate that 3 bits adder ? 
+ row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS); + row_idx := std_ulogic_vector(unsigned(row_idx) + 1); + result := addr; + result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx; + return result; + end; + + -- Get the tag value from the address + function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is + begin + return addr(63 downto 64-TAG_BITS); + end; + + -- Read a tag from a tag memory row + function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is + begin + return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS); + end; + + -- Write a tag to tag memory row + procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t; + tag: cache_tag_t) is + begin + tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; + end; + + -- Generate byte enables from sizes + function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is + begin + case length is + when "0001" => + return "00000001"; + when "0010" => + return "00000011"; + when "0100" => + return "00001111"; + when "1000" => + return "11111111"; + when others => + return "00000000"; + end case; + end function length_to_sel; + + -- Calculate shift and byte enables for wishbone + function wishbone_data_shift(address : in std_ulogic_vector(63 downto 0)) return natural is + begin + return to_integer(unsigned(address(2 downto 0))) * 8; + end function wishbone_data_shift; + + function wishbone_data_sel(size : in std_logic_vector(3 downto 0); + address : in std_logic_vector(63 downto 0)) + return std_ulogic_vector is + begin + return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)), + to_integer(unsigned(address(2 downto 0))))); + end function wishbone_data_sel; + +begin + + assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; + assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE; + assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity 
FAILURE; + assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE; + assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS) + report "geometry bits don't add up" severity FAILURE; + assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) + report "geometry bits don't add up" severity FAILURE; + assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) + report "geometry bits don't add up" severity FAILURE; + assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS) + report "geometry bits don't add up" severity FAILURE; + assert (64 = wishbone_data_bits) + report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; + + -- Generate PLRUs + maybe_plrus: if NUM_WAYS > 1 generate + begin + plrus: for i in 0 to NUM_LINES-1 generate + -- PLRU interface + signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); + signal plru_acc_en : std_ulogic; + signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); + + begin + plru : entity work.plru + generic map ( + BITS => WAY_BITS + ) + port map ( + clk => clk, + rst => rst, + acc => plru_acc, + acc_en => plru_acc_en, + lru => plru_out + ); + + process(req_index, req_op, req_hit_way, plru_out) + begin + -- PLRU interface + if (req_op = OP_LOAD_HIT or + req_op = OP_STORE_HIT) and req_index = i then + plru_acc_en <= '1'; + else + plru_acc_en <= '0'; + end if; + plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); + plru_victim(i) <= plru_out; + end process; + end generate; + end generate; + + -- Cache request parsing and hit detection + dcache_request : process(all) + variable is_hit : std_ulogic; + variable hit_way : way_t; + variable op : op_t; + variable tmp : std_ulogic_vector(63 downto 0); + variable data : std_ulogic_vector(63 downto 0); + variable opsel : std_ulogic_vector(3 downto 0); + begin + -- Extract line, row and tag from request + req_index <= get_index(d_in.addr); + req_row <= get_row(d_in.addr); + req_tag <= get_tag(d_in.addr); + + -- Test if pending request is a hit on any way + 
hit_way := 0; + is_hit := '0'; + for i in way_t loop + if d_in.valid = '1' and cache_valids(req_index)(i) = '1' then + if read_tag(i, cache_tags(req_index)) = req_tag then + hit_way := i; + is_hit := '1'; + end if; + end if; + end loop; + + -- The way that matched on a hit + req_hit_way <= hit_way; + + -- Combine the request and cache hit status to decide what + -- operation needs to be done + -- (opsel bits, MSB first: valid, load, nc, is_hit) + opsel := d_in.valid & d_in.load & d_in.nc & is_hit; + case opsel is + when "1101" => op := OP_LOAD_HIT; + when "1100" => op := OP_LOAD_MISS; + when "1110" => op := OP_LOAD_NC; + when "1001" => op := OP_STORE_HIT; + when "1000" => op := OP_STORE_MISS; + when "1010" => op := OP_STORE_MISS; + when "1011" => op := OP_BAD; + when "1111" => op := OP_BAD; + when others => op := OP_NONE; + end case; + + req_op <= op; + + -- XXX GENERATE ERRORS + -- err_nc_collision <= '1' when op = OP_BAD else '0'; + + -- XXX Generate stalls + -- stall_out <= r.state /= IDLE ? + + end process; + + -- Wire up wishbone request latch + wishbone_out <= r.wb; + + -- Writeback (loads and reg updates) & completion control logic + -- + writeback_control: process(all) + variable writeback_format : boolean; + begin + + -- The mux on d_out.write reg defaults to the normal load hit case. 
+ d_out.write_enable <= '0'; + d_out.valid <= '0'; + d_out.write_reg <= r.req_latch.write_reg; + d_out.write_data <= cache_out(r.hit_way); + d_out.write_len <= r.req_latch.length; + d_out.write_shift <= r.req_latch.addr(2 downto 0); + d_out.sign_extend <= r.req_latch.sign_extend; + d_out.byte_reverse <= r.req_latch.byte_reverse; + d_out.second_word <= '0'; + + -- By default writeback doesn't need formatting + writeback_format := false; + + -- We have a valid load or store hit or we just completed a slow + -- op such as a load miss, a NC load or a store + -- + if r.hit_load_valid = '1' or r.slow_valid = '1' then + if r.req_latch.load = '1' then + -- If it's a load, enable write back and enable formatting + d_out.write_enable <= '1'; + writeback_format := true; + + -- If it's a slow load (miss or NC) source it from the buffer + if r.slow_valid = '1' then + d_out.write_data <= r.slow_data; + end if; + + -- If it's a normal load (not a load with update), we complete + -- now, otherwise we wait for the delayed update. + -- + if r.req_latch.update = '0' then + d_out.valid <= '1'; + end if; + else + -- It's a store, complete always + d_out.valid <= '1'; + end if; + + -- Sanity + assert r.update_valid = '0' report "unexpected update_valid" + severity FAILURE; + end if; + + -- We have a register update to do. 
+ if r.update_valid = '1' then + d_out.write_enable <= '1'; + d_out.write_reg <= r.req_latch.update_reg; + d_out.write_data <= r.req_latch.addr; + + -- If it was a load, this completes the operation + if r.req_latch.load = '1' then + d_out.valid <= '1'; + end if; + end if; + + if not writeback_format then + d_out.write_len <= "1000"; + d_out.write_shift <= "000"; + d_out.sign_extend <= '0'; + d_out.byte_reverse <= '0'; + end if; + + end process; + + -- Misc data & sel signals + misc: process(d_in) + begin + -- Wishbone & BRAM write data formatting for stores (most of it already + -- happens in loadstore1, this is the remaining sel generation and shifting) + -- + store_data <= std_logic_vector(shift_left(unsigned(d_in.data), + wishbone_data_shift(d_in.addr))); + + -- Wishbone read and write and BRAM write sel bits generation + bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); + end process; + + -- Generate a cache RAM for each way. This handles the normal + -- reads, writes from reloads and the special store-hit update + -- path as well + -- + rams: for i in 0 to NUM_WAYS-1 generate + signal do_read : std_ulogic; + signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0); + signal do_write : std_ulogic; + signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); + signal wr_data : std_ulogic_vector(wishbone_data_bits-1 downto 0); + signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0); + signal dout : cache_row_t; + begin + way: entity work.cache_ram + generic map ( + ROW_BITS => ROW_BITS, + WIDTH => wishbone_data_bits + ) + port map ( + clk => clk, + rd_en => do_read, + rd_addr => rd_addr, + rd_data => dout, + wr_en => do_write, + wr_sel => wr_sel, + wr_addr => wr_addr, + wr_data => wr_data + ); + process(all) + begin + do_read <= '0'; + do_write <= '0'; + + -- Cache hit reads + if req_op = OP_LOAD_HIT and req_hit_way = i then + do_read <= '1'; + end if; + rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); + cache_out(i) <= dout; + + -- Write mux: + 
-- + -- Defaults to wishbone read responses (cache refill), + -- + wr_data <= wishbone_in.dat; + wr_sel <= (others => '1'); + wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS)); + if r.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r.store_way = i then + do_write <= '1'; + end if; + + -- Alternatively, store-hit BRAM update case (exclusive from the above). + if req_op = OP_STORE_HIT and req_hit_way = i then + report "store_data:" & to_hstring(store_data); + wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); + wr_data <= store_data; + wr_sel <= bus_sel; + do_write <= '1'; + end if; + end process; + end generate; + + -- Cache hit synchronous machine for the easy case. This handles + -- non-update form load hits. + -- + dcache_fast_hit : process(clk) + begin + if rising_edge(clk) then + -- On-cycle pulse values get reset on every cycle + r.hit_load_valid <= '0'; + + -- If we have a request incoming, we have to latch it as d_in.valid + -- is only set for a single cycle. It's up to the control logic to + -- ensure we don't override an uncompleted request (for now we are + -- single issue on load/stores so we are fine, later, we can generate + -- a stall output if necessary). + + if d_in.valid = '1' then + r.req_latch <= d_in; + + report "dcache op:" & op_t'image(req_op) & + " addr:" & to_hstring(d_in.addr) & + " upd:" & std_ulogic'image(d_in.update) & + " nc:" & std_ulogic'image(d_in.nc) & + " reg:" & to_hstring(d_in.write_reg) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag) & + " way: " & integer'image(req_hit_way); + end if; + + -- Fast path for load/store hits. Set signals for the writeback controls. 
+ if req_op = OP_LOAD_HIT then + r.hit_way <= req_hit_way; + r.hit_load_valid <= '1'; + end if; + end if; + end process; + + -- Every other case is handled by this stage machine: + -- + -- * Cache load miss/reload (in conjunction with "rams") + -- * Load hits for update forms + -- * Load hits for non-cachable forms + -- * Stores (the collision case is handled in "rams") + -- + -- All wishbone requests generation is done here + -- + dcache_slow : process(clk) + variable way : integer range 0 to NUM_WAYS-1; + variable tagset : cache_tags_set_t; + begin + if rising_edge(clk) then + -- On reset, clear all valid bits to force misses + if rst = '1' then + for i in index_t loop + cache_valids(i) <= (others => '0'); + end loop; + r.state <= IDLE; + r.slow_valid <= '0'; + r.update_valid <= '0'; + r.wb.cyc <= '0'; + r.wb.stb <= '0'; + + -- Not useful normally but helps avoiding tons of sim warnings + r.wb.adr <= (others => '0'); + else + -- One cycle pulses reset + r.slow_valid <= '0'; + r.update_valid <= '0'; + + -- We cannot currently process a new request when not idle + assert req_op = OP_NONE or r.state = IDLE report "request " & + op_t'image(req_op) & " while in state " & state_t'image(r.state) + severity FAILURE; + + -- Main state machine + case r.state is + when IDLE => + case req_op is + when OP_LOAD_HIT => + -- We have a load with update hit, we need the delayed update cycle + if d_in.update = '1' then + r.state <= LOAD_UPDATE; + end if; + + when OP_LOAD_MISS => + -- Normal load cache miss, start the reload machine + -- + -- First find a victim way from the PLRU + -- + way := to_integer(unsigned(plru_victim(req_index))); + + report "cache miss addr:" & to_hstring(d_in.addr) & + " idx:" & integer'image(req_index) & + " way:" & integer'image(way) & + " tag:" & to_hstring(req_tag); + + -- Force misses on that way while reloading that line + cache_valids(req_index)(way) <= '0'; + + -- Store new tag in selected way + for i in 0 to NUM_WAYS-1 loop + if i = way then + 
tagset := cache_tags(req_index); + write_tag(i, tagset, req_tag); + cache_tags(req_index) <= tagset; + end if; + end loop; + + -- Keep track of our index and way for subsequent stores. + r.store_index <= req_index; + r.store_way <= way; + + -- Prep for first wishbone read. We calculate the address of + -- the start of the cache line + -- + r.wb.adr <= d_in.addr(63 downto LINE_OFF_BITS) & + (LINE_OFF_BITS-1 downto 0 => '0'); + r.wb.sel <= (others => '1'); + r.wb.we <= '0'; + r.wb.cyc <= '1'; + r.wb.stb <= '1'; + r.state <= RELOAD_WAIT_ACK; + + when OP_LOAD_NC => + r.wb.sel <= bus_sel; + r.wb.adr <= d_in.addr(63 downto 3) & "000"; + r.wb.cyc <= '1'; + r.wb.stb <= '1'; + r.wb.we <= '0'; + r.state <= NC_LOAD_WAIT_ACK; + + when OP_STORE_HIT | OP_STORE_MISS => + -- For store-with-update do the register update + if d_in.update = '1' then + r.update_valid <= '1'; + end if; + r.wb.sel <= bus_sel; + r.wb.adr <= d_in.addr(63 downto 3) & "000"; + r.wb.dat <= store_data; + r.wb.cyc <= '1'; + r.wb.stb <= '1'; + r.wb.we <= '1'; + r.state <= STORE_WAIT_ACK; + + -- OP_NONE and OP_BAD do nothing + when OP_NONE => + when OP_BAD => + end case; + + when RELOAD_WAIT_ACK => + if wishbone_in.ack = '1' then + -- Is this the data we were looking for ? Latch it so + -- we can respond later. We don't currently complete the + -- pending miss request immediately, we wait for the + -- whole line to be loaded. The reason is that if we + -- did, we would potentially get new requests in while + -- not idle, which we don't currently know how to deal + -- with. + -- + if r.wb.adr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = + r.req_latch.addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) then + r.slow_data <= wishbone_in.dat; + end if; + + -- That was the last word ? We are done, validate the way latched in r.store_way + if is_last_row(r.wb.adr) then + cache_valids(r.store_index)(r.store_way) <= '1'; + r.wb.cyc <= '0'; + r.wb.stb <= '0'; + + -- Complete the load that missed. For load with update + -- we also need to do the deferred update cycle. 
+ -- + r.slow_valid <= '1'; + if r.req_latch.load = '1' and r.req_latch.update = '1' then + r.state <= LOAD_UPDATE; + report "completing miss with load-update !"; + else + r.state <= IDLE; + report "completing miss !"; + end if; + else + -- Otherwise, calculate the next row address + r.wb.adr <= next_row_addr(r.wb.adr); + end if; + end if; + + when LOAD_UPDATE => + -- We need the extra cycle to complete a load with update + r.update_valid <= '1'; + r.state <= IDLE; + + when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => + if wishbone_in.ack = '1' then + if r.state = NC_LOAD_WAIT_ACK then + r.slow_data <= wishbone_in.dat; + end if; + r.slow_valid <= '1'; + r.wb.cyc <= '0'; + r.wb.stb <= '0'; + r.state <= IDLE; + end if; + end case; + end if; + end if; + end process; +end; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index aa3f274..7fa8a42 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -15,12 +15,12 @@ entity loadstore1 is l_in : in Decode2ToLoadstore1Type; - l_out : out Loadstore1ToLoadstore2Type + l_out : out Loadstore1ToDcacheType ); end loadstore1; architecture behave of loadstore1 is - signal r, rin : Loadstore1ToLoadstore2Type; + signal r, rin : Loadstore1ToDcacheType; signal lsu_sum : std_ulogic_vector(63 downto 0); begin -- Calculate the address in the first cycle @@ -34,7 +34,7 @@ begin end process; loadstore1_1: process(all) - variable v : Loadstore1ToLoadstore2Type; + variable v : Loadstore1ToDcacheType; begin v := r; @@ -48,6 +48,20 @@ begin v.update := l_in.update; v.update_reg := l_in.update_reg; + -- XXX Temporary hack. Mark the op as non-cachable if the address + -- is the form 0xc------- + -- + -- This will have to be replaced by a combination of implementing the + -- proper HV CI load/store instructions and having an MMU to get the I + -- bit otherwise. + if lsu_sum(31 downto 28) = "1100" then + v.nc := '1'; + else + v.nc := '0'; + end if; + + -- XXX Do length_to_sel here ? 
+ -- byte reverse stores in the first cycle if v.load = '0' and l_in.byte_reverse = '1' then v.data := byte_reverse(l_in.data, to_integer(unsigned(l_in.length))); diff --git a/loadstore2.vhdl b/loadstore2.vhdl deleted file mode 100644 index cd7061c..0000000 --- a/loadstore2.vhdl +++ /dev/null @@ -1,148 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -library work; -use work.common.all; -use work.helpers.all; -use work.wishbone_types.all; - --- 2 cycle LSU --- In this cycle we read or write any data and do sign extension and update if required. - -entity loadstore2 is - port ( - clk : in std_ulogic; - - l_in : in Loadstore1ToLoadstore2Type; - w_out : out Loadstore2ToWritebackType; - - m_in : in wishbone_slave_out; - m_out : out wishbone_master_out - ); -end loadstore2; - -architecture behave of loadstore2 is - signal l_saved : Loadstore1ToLoadstore2Type; - signal w_tmp : Loadstore2ToWritebackType; - signal m_tmp : wishbone_master_out; - signal dlength : std_ulogic_vector(3 downto 0); - - type state_t is (IDLE, WAITING_FOR_READ_ACK, WAITING_FOR_WRITE_ACK); - signal state : state_t := IDLE; - - function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is - begin - case length is - when "0001" => - return "00000001"; - when "0010" => - return "00000011"; - when "0100" => - return "00001111"; - when "1000" => - return "11111111"; - when others => - return "00000000"; - end case; - end function length_to_sel; - - function wishbone_data_shift(address : in std_ulogic_vector(63 downto 0)) return natural is - begin - return to_integer(unsigned(address(2 downto 0))) * 8; - end function wishbone_data_shift; - - function wishbone_data_sel(size : in std_logic_vector(3 downto 0); address : in std_logic_vector(63 downto 0)) return std_ulogic_vector is - begin - return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)), to_integer(unsigned(address(2 downto 0))))); - end function wishbone_data_sel; -begin - - 
w_out <= w_tmp; - m_out <= m_tmp; - - loadstore2_0: process(clk) - begin - if rising_edge(clk) then - - w_tmp.valid <= '0'; - w_tmp.write_enable <= '0'; - w_tmp.write_reg <= (others => '0'); - w_tmp.write_len <= "1000"; - w_tmp.write_shift <= "000"; - w_tmp.sign_extend <= '0'; - w_tmp.byte_reverse <= '0'; - w_tmp.second_word <= '0'; - - l_saved <= l_saved; - - case_0: case state is - when IDLE => - if l_in.valid = '1' then - m_tmp <= wishbone_master_out_init; - - m_tmp.sel <= wishbone_data_sel(l_in.length, l_in.addr); - m_tmp.adr <= l_in.addr(63 downto 3) & "000"; - m_tmp.cyc <= '1'; - m_tmp.stb <= '1'; - - l_saved <= l_in; - - if l_in.load = '1' then - m_tmp.we <= '0'; - - -- Load with update instructions write two GPR destinations. - -- We don't want the expense of two write ports, so make it - -- single in the pipeline and write back the update GPR now - -- and the load once we get the data back. We'll have to - -- revisit this when loads can take exceptions. - if l_in.update = '1' then - w_tmp.write_enable <= '1'; - w_tmp.write_reg <= l_in.update_reg; - w_tmp.write_data <= l_in.addr; - end if; - - state <= WAITING_FOR_READ_ACK; - else - m_tmp.we <= '1'; - - m_tmp.dat <= std_logic_vector(shift_left(unsigned(l_in.data), wishbone_data_shift(l_in.addr))); - - assert l_in.sign_extend = '0' report "sign extension doesn't make sense for stores" severity failure; - - state <= WAITING_FOR_WRITE_ACK; - end if; - end if; - - when WAITING_FOR_READ_ACK => - if m_in.ack = '1' then - -- write data to register file - w_tmp.valid <= '1'; - w_tmp.write_enable <= '1'; - w_tmp.write_data <= m_in.dat; - w_tmp.write_reg <= l_saved.write_reg; - w_tmp.write_len <= l_saved.length; - w_tmp.write_shift <= l_saved.addr(2 downto 0); - w_tmp.sign_extend <= l_saved.sign_extend; - w_tmp.byte_reverse <= l_saved.byte_reverse; - - m_tmp <= wishbone_master_out_init; - state <= IDLE; - end if; - - when WAITING_FOR_WRITE_ACK => - if m_in.ack = '1' then - w_tmp.valid <= '1'; - if l_saved.update = 
'1' then - w_tmp.write_enable <= '1'; - w_tmp.write_reg <= l_saved.update_reg; - w_tmp.write_data <= l_saved.addr; - end if; - - m_tmp <= wishbone_master_out_init; - state <= IDLE; - end if; - end case; - end if; - end process; -end; diff --git a/microwatt.core b/microwatt.core index 44dfbbd..5fb9a7a 100644 --- a/microwatt.core +++ b/microwatt.core @@ -25,7 +25,7 @@ filesets: - control.vhdl - execute1.vhdl - loadstore1.vhdl - - loadstore2.vhdl + - dcache.vhdl - multiply.vhdl - divider.vhdl - rotator.vhdl diff --git a/wishbone_types.vhdl b/wishbone_types.vhdl index 3db03aa..9284244 100644 --- a/wishbone_types.vhdl +++ b/wishbone_types.vhdl @@ -4,16 +4,18 @@ use ieee.std_logic_1164.all; package wishbone_types is constant wishbone_addr_bits : integer := 64; constant wishbone_data_bits : integer := 64; + constant wishbone_sel_bits : integer := wishbone_data_bits/8; subtype wishbone_addr_type is std_ulogic_vector(wishbone_addr_bits-1 downto 0); subtype wishbone_data_type is std_ulogic_vector(wishbone_data_bits-1 downto 0); + subtype wishbone_sel_type is std_ulogic_vector(wishbone_sel_bits-1 downto 0); type wishbone_master_out is record adr : wishbone_addr_type; dat : wishbone_data_type; cyc : std_ulogic; stb : std_ulogic; - sel : std_ulogic_vector(7 downto 0); + sel : wishbone_sel_type; we : std_ulogic; end record; constant wishbone_master_out_init : wishbone_master_out := (cyc => '0', stb => '0', we => '0', others => (others => '0')); diff --git a/writeback.vhdl b/writeback.vhdl index ab7b6c7..0d9397c 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -11,7 +11,7 @@ entity writeback is clk : in std_ulogic; e_in : in Execute1ToWritebackType; - l_in : in Loadstore2ToWritebackType; + l_in : in DcacheToWritebackType; m_in : in MultiplyToWritebackType; d_in : in DividerToWritebackType; From 174378b190fe47f12dc3f9ec90756a8f8df3f05f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 10 Oct 2019 11:25:16 +1100 Subject: [PATCH 05/11] dcache: Introduce an extra 
cycle latency to make timing This makes the BRAMs use an output buffer, introducing an extra cycle latency. Without this, Vivado won't make timing at 100MHz. We stash all the necessary response data in delayed latches, the extra cycle is NOT a state in the state machine, thus it's fully pipelined and doesn't involve stalling. This introduces an extra non-pipelined cycle for loads with update to avoid collision on the writeback output between the now delayed load data and the register update. We could avoid it by moving the register update in the pipeline bubble created by the extra update state, but it's a bit trickier, so I leave that for a later optimization. Signed-off-by: Benjamin Herrenschmidt --- cache_ram.vhdl | 23 ++++++- dcache.vhdl | 182 +++++++++++++++++++++++++++++++++---------------- 2 files changed, 145 insertions(+), 60 deletions(-) diff --git a/cache_ram.vhdl b/cache_ram.vhdl index 346c6fe..7a10a1c 100644 --- a/cache_ram.vhdl +++ b/cache_ram.vhdl @@ -7,7 +7,8 @@ entity cache_ram is generic( ROW_BITS : integer := 16; WIDTH : integer := 64; - TRACE : boolean := false + TRACE : boolean := false; + ADD_BUF : boolean := false ); port( @@ -33,6 +34,8 @@ architecture rtl of cache_ram is attribute ram_decomp : string; attribute ram_decomp of ram : signal is "power"; + signal rd_data0 : std_logic_vector(WIDTH - 1 downto 0); + begin process(clk) variable lbit : integer range 0 to WIDTH - 1; @@ -56,7 +59,7 @@ begin end loop; end if; if rd_en = '1' then - rd_data <= ram(to_integer(unsigned(rd_addr))); + rd_data0 <= ram(to_integer(unsigned(rd_addr))); if TRACE then report "read a:" & to_hstring(rd_addr) & " dat:" & to_hstring(ram(to_integer(unsigned(rd_addr)))); @@ -64,4 +67,20 @@ begin end if; end if; end process; + + buf: if ADD_BUF generate + begin + process(clk) + begin + if rising_edge(clk) then + rd_data <= rd_data0; + end if; + end process; + end generate; + + nobuf: if not ADD_BUF generate + begin + rd_data <= rd_data0; + end generate; + end; diff --git 
a/dcache.vhdl b/dcache.vhdl index f771eae..c0d0469 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -7,6 +7,9 @@ -- * Complete load misses on the cycle when WB data comes instead of -- at the end of line (this requires dealing with requests coming in -- while not idle...) +-- * Load with update could use one less non-pipelined cycle by moving +-- the register update to the pipeline bubble that exists when going +-- back to the IDLE state. -- library ieee; use ieee.std_logic_1164.all; @@ -138,7 +141,8 @@ architecture rtl of dcache is -- Cache state machine type state_t is (IDLE, -- Normal load hit processing - LOAD_UPDATE, -- Load with update address update cycle + LOAD_UPDATE, -- Load with update extra cycle + LOAD_UPDATE2, -- Load with update extra cycle RELOAD_WAIT_ACK, -- Cache reload wait ack STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack @@ -147,8 +151,20 @@ architecture rtl of dcache is req_latch : Loadstore1ToDcacheType; -- Cache hit state (Latches for 1 cycle BRAM access) - hit_way : way_t; - hit_load_valid : std_ulogic; + hit_way : way_t; + hit_load_valid : std_ulogic; + + -- 1-cycle delayed signals to account for the BRAM extra + -- buffer that seems necessary to make timing on load hits + -- + hit_way_delayed : way_t; + hit_load_delayed : std_ulogic; + hit_load_upd_delayed : std_ulogic; + hit_load_reg_delayed : std_ulogic_vector(4 downto 0); + hit_data_shift_delayed : std_ulogic_vector(2 downto 0); + hit_dlength_delayed : std_ulogic_vector(3 downto 0); + hit_sign_ext_delayed : std_ulogic; + hit_byte_rev_delayed : std_ulogic; -- Register update (load/store with update) update_valid : std_ulogic; @@ -382,72 +398,97 @@ begin -- Writeback (loads and reg updates) & completion control logic -- writeback_control: process(all) - variable writeback_format : boolean; begin -- The mux on d_out.write reg defaults to the normal load hit case. 
d_out.write_enable <= '0'; d_out.valid <= '0'; - d_out.write_reg <= r.req_latch.write_reg; - d_out.write_data <= cache_out(r.hit_way); - d_out.write_len <= r.req_latch.length; - d_out.write_shift <= r.req_latch.addr(2 downto 0); - d_out.sign_extend <= r.req_latch.sign_extend; - d_out.byte_reverse <= r.req_latch.byte_reverse; + d_out.write_reg <= r.hit_load_reg_delayed; + d_out.write_data <= cache_out(r.hit_way_delayed); + d_out.write_len <= r.hit_dlength_delayed; + d_out.write_shift <= r.hit_data_shift_delayed; + d_out.sign_extend <= r.hit_sign_ext_delayed; + d_out.byte_reverse <= r.hit_byte_rev_delayed; d_out.second_word <= '0'; - -- By default writeback doesn't need formatting - writeback_format := false; - -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store -- - if r.hit_load_valid = '1' or r.slow_valid = '1' then - if r.req_latch.load = '1' then - -- If it's a load, enable write back and enable formatting - d_out.write_enable <= '1'; - writeback_format := true; + -- Note: the load hit is delayed by one cycle. However it can still + -- not collide with r.slow_valid (well unless I miscalculated) because + -- slow_valid can only be set on a subsequent request and not on its + -- first cycle (the state machine must have advanced), which makes + -- slow_valid at least 2 cycles from the previous hit_load_valid. 
+ -- - -- If it's a slow load (miss or NC) source it from the buffer - if r.slow_valid = '1' then - d_out.write_data <= r.slow_data; + -- Sanity: Only one of these must be set in any given cycle + assert (r.update_valid and r.hit_load_delayed) /= '1' report + "unexpected hit_load_delayed collision with update_valid" + severity FAILURE; + assert (r.slow_valid and r.hit_load_delayed) /= '1' report + "unexpected hit_load_delayed collision with slow_valid" + severity FAILURE; + assert (r.slow_valid and r.update_valid) /= '1' report + "unexpected update_valid collision with slow_valid" + severity FAILURE; + + -- Delayed load hit case is the standard path + if r.hit_load_delayed = '1' then + d_out.write_enable <= '1'; + + -- If it's not a load with update, complete it now + if r.hit_load_upd_delayed = '0' then + d_out.valid <= '1'; end if; + end if; + + -- Slow ops (load miss, NC, stores) + if r.slow_valid = '1' then + -- If it's a load, enable register writeback and switch + -- mux accordingly + -- + if r.req_latch.load then + d_out.write_reg <= r.req_latch.write_reg; + d_out.write_enable <= '1'; - -- If it's a normal load (not a load with update), we complete - -- now, otherwise we wait for the delayed update. + -- Read data comes from the slow data latch, formatter + -- from the latched request. -- - if r.req_latch.update = '0' then - d_out.valid <= '1'; - end if; - else - -- It's a store, complete always - d_out.valid <= '1'; + d_out.write_data <= r.slow_data; + d_out.write_shift <= r.req_latch.addr(2 downto 0); + d_out.sign_extend <= r.req_latch.sign_extend; + d_out.byte_reverse <= r.req_latch.byte_reverse; + d_out.write_len <= r.req_latch.length; end if; - -- Sanity - assert r.update_valid = '0' report "unexpected update_valid" - severity FAILURE; + -- If it's a store or a non-update load form, complete now + if r.req_latch.load = '0' or r.req_latch.update = '0' then + d_out.valid <= '1'; + end if; end if; -- We have a register update to do. 
if r.update_valid = '1' then d_out.write_enable <= '1'; d_out.write_reg <= r.req_latch.update_reg; + + -- Change the read data mux to the address that's going into + -- the register and the formatter does nothing. + -- d_out.write_data <= r.req_latch.addr; + d_out.write_shift <= "000"; + d_out.write_len <= "1000"; + d_out.sign_extend <= '0'; + d_out.byte_reverse <= '0'; - -- If it was a load, this completes the operation + -- If it was a load, this completes the operation (load with + -- update case). + -- if r.req_latch.load = '1' then d_out.valid <= '1'; end if; end if; - if not writeback_format then - d_out.write_len <= "1000"; - d_out.write_shift <= "000"; - d_out.sign_extend <= '0'; - d_out.byte_reverse <= '0'; - end if; - end process; -- Misc data & sel signals @@ -465,7 +506,12 @@ begin -- Generate a cache RAM for each way. This handles the normal -- reads, writes from reloads and the special store-hit update - -- path as well + -- path as well. + -- + -- Note: the BRAMs have an extra read buffer, meaning the output + -- is pipelined an extra cycle. This differs from the + -- icache. The writeback logic needs to take that into + -- account by using 1-cycle delayed signals for load hits. 
-- rams: for i in 0 to NUM_WAYS-1 generate signal do_read : std_ulogic; @@ -479,7 +525,8 @@ begin way: entity work.cache_ram generic map ( ROW_BITS => ROW_BITS, - WIDTH => wishbone_data_bits + WIDTH => wishbone_data_bits, + ADD_BUF => true ) port map ( clk => clk, @@ -493,13 +540,8 @@ begin ); process(all) begin - do_read <= '0'; - do_write <= '0'; - -- Cache hit reads - if req_op = OP_LOAD_HIT and req_hit_way = i then - do_read <= '1'; - end if; + do_read <= '1'; rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); cache_out(i) <= dout; @@ -507,19 +549,30 @@ begin -- -- Defaults to wishbone read responses (cache refill), -- - wr_data <= wishbone_in.dat; - wr_sel <= (others => '1'); - wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS)); + -- For timing, the mux on wr_data/sel/addr is not dependent on anything + -- other than the current state. Only the do_write signal is. + -- + if r.state = IDLE then + -- When IDLE, the only write path is the store-hit update case + wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); + wr_data <= store_data; + wr_sel <= bus_sel; + else + -- Otherwise, we might be doing a reload + wr_data <= wishbone_in.dat; + wr_sel <= (others => '1'); + wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS)); + end if; + + -- The two actual write cases here + do_write <= '0'; if r.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r.store_way = i then do_write <= '1'; end if; - - -- Alternatively, store-hit BRAM update case (exclusive from the above). 
if req_op = OP_STORE_HIT and req_hit_way = i then - report "store_data:" & to_hstring(store_data); - wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= store_data; - wr_sel <= bus_sel; + assert r.state /= RELOAD_WAIT_ACK report "Store hit while in state:" & + state_t'image(r.state) + severity FAILURE; do_write <= '1'; end if; end process; @@ -531,6 +584,16 @@ begin dcache_fast_hit : process(clk) begin if rising_edge(clk) then + -- 1-cycle delayed signals for load hit response + r.hit_load_delayed <= r.hit_load_valid; + r.hit_way_delayed <= r.hit_way; + r.hit_load_upd_delayed <= r.req_latch.update; + r.hit_load_reg_delayed <= r.req_latch.write_reg; + r.hit_data_shift_delayed <= r.req_latch.addr(2 downto 0); + r.hit_sign_ext_delayed <= r.req_latch.sign_extend; + r.hit_byte_rev_delayed <= r.req_latch.byte_reverse; + r.hit_dlength_delayed <= r.req_latch.length; + -- On-cycle pulse values get reset on every cycle r.hit_load_valid <= '0'; @@ -543,7 +606,7 @@ begin if d_in.valid = '1' then r.req_latch <= d_in; - report "dcache op:" & op_t'image(req_op) & + report "op:" & op_t'image(req_op) & " addr:" & to_hstring(d_in.addr) & " upd:" & std_ulogic'image(d_in.update) & " nc:" & std_ulogic'image(d_in.nc) & @@ -712,6 +775,9 @@ begin end if; when LOAD_UPDATE => + -- We need the extra cycle to complete a load with update + r.state <= LOAD_UPDATE2; + when LOAD_UPDATE2 => -- We need the extra cycle to complete a load with update r.update_valid <= '1'; r.state <= IDLE; From 265fbf894bcf7e5cb0d140ca840b005b3cf7a1a9 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 15 Oct 2019 16:21:32 +1100 Subject: [PATCH 06/11] icache/dcache: Make both caches 32 lines, 2 ways Adding lines seems to add only little extra as the BRAMs aren't full, 2 ways is our current compromise to limit pressure on small FPGAs. 
We could go to 64 lines for a little more, but timing is becoming a bit too tight for my liking on the tags/LRU path of the icache, so let's leave it at 32 for now. Signed-off-by: Benjamin Herrenschmidt --- core.vhdl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core.vhdl b/core.vhdl index e9cd28b..77af882 100644 --- a/core.vhdl +++ b/core.vhdl @@ -115,7 +115,7 @@ begin icache_0: entity work.icache generic map( LINE_SIZE => 64, - NUM_LINES => 16, + NUM_LINES => 32, NUM_WAYS => 2 ) port map( @@ -215,6 +215,11 @@ begin ); dcache_0: entity work.dcache + generic map( + LINE_SIZE => 64, + NUM_LINES => 32, + NUM_WAYS => 2 + ) port map ( clk => clk, rst => core_rst, From 587a5e3c45bfb3a3bfc1abe56954da8931aaaa85 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 16 Oct 2019 15:10:27 +1100 Subject: [PATCH 07/11] dcache: Cleanup (mostly cosmetic) Clearly separate the 2 stages of load hits, improve naming and comments, clarify the writeback controls etc... Signed-off-by: Benjamin Herrenschmidt --- dcache.vhdl | 316 +++++++++++++++++++++++++++++----------------------- 1 file changed, 174 insertions(+), 142 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index c0d0469..1e2a86c 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -36,6 +36,8 @@ entity dcache is d_in : in Loadstore1ToDcacheType; d_out : out DcacheToWritebackType; + stall_out : out std_ulogic; + wishbone_out : out wishbone_master_out; wishbone_in : in wishbone_slave_out ); @@ -147,31 +149,39 @@ architecture rtl of dcache is STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack - type reg_internal_t is record - req_latch : Loadstore1ToDcacheType; - - -- Cache hit state (Latches for 1 cycle BRAM access) + + -- + -- Dcache operations: + -- + -- In order to make timing, we use the BRAMs with an output buffer, + -- which means that the BRAM output is delayed by an extra cycle. 
+ -- + -- Thus, the dcache has a 2-stage internal pipeline for cache hits + -- with no stalls. + -- + -- All other operations are handled via stalling in the first stage. + -- + -- The second stage can thus complete a hit at the same time as the + -- first stage emits a stall for a complex op. + -- + + -- First stage register, contains state for stage 1 of load hits + -- and for the state machine used by all other operations + -- + type reg_stage_1_t is record + -- Latch the complete request from ls1 + req : Loadstore1ToDcacheType; + + -- Cache hit state hit_way : way_t; hit_load_valid : std_ulogic; - -- 1-cycle delayed signals to account for the BRAM extra - -- buffer that seems necessary to make timing on load hits - -- - hit_way_delayed : way_t; - hit_load_delayed : std_ulogic; - hit_load_upd_delayed : std_ulogic; - hit_load_reg_delayed : std_ulogic_vector(4 downto 0); - hit_data_shift_delayed : std_ulogic_vector(2 downto 0); - hit_dlength_delayed : std_ulogic_vector(3 downto 0); - hit_sign_ext_delayed : std_ulogic; - hit_byte_rev_delayed : std_ulogic; - -- Register update (load/store with update) - update_valid : std_ulogic; + update_valid : std_ulogic; -- Data buffer for "slow" read ops (load miss and NC loads). 
- slow_data : std_ulogic_vector(63 downto 0); - slow_valid : std_ulogic; + slow_data : std_ulogic_vector(63 downto 0); + slow_valid : std_ulogic; -- Cache miss state (reload state machine) state : state_t; @@ -180,7 +190,22 @@ architecture rtl of dcache is store_index : index_t; end record; - signal r : reg_internal_t; + signal r1 : reg_stage_1_t; + + -- Second stage register, only used for load hits + -- + type reg_stage_2_t is record + hit_way : way_t; + hit_load_valid : std_ulogic; + load_is_update : std_ulogic; + load_reg : std_ulogic_vector(4 downto 0); + data_shift : std_ulogic_vector(2 downto 0); + length : std_ulogic_vector(3 downto 0); + sign_extend : std_ulogic; + byte_reverse : std_ulogic; + end record; + + signal r2 : reg_stage_2_t; -- Async signals on incoming request signal req_index : index_t; @@ -201,6 +226,10 @@ architecture rtl of dcache is signal bus_sel : wishbone_sel_type; signal store_data : wishbone_data_type; + -- + -- Helper functions to decode incoming requests + -- + -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is begin @@ -384,16 +413,29 @@ begin req_op <= op; - -- XXX GENERATE ERRORS - -- err_nc_collision <= '1' when op = OP_BAD else '0'; + end process; - -- XXX Generate stalls - -- stall_out <= r.state /= IDLE ? 
+ -- + -- Misc signal assignments + -- - end process; + -- Wire up wishbone request latch out of stage 1 + wishbone_out <= r1.wb; + + -- Wishbone & BRAM write data formatting for stores (most of it already + -- happens in loadstore1, this is the remaining data shifting) + -- + store_data <= std_logic_vector(shift_left(unsigned(d_in.data), + wishbone_data_shift(d_in.addr))); + + -- Wishbone read and write and BRAM write sel bits generation + bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); + + -- TODO: Generate errors + -- err_nc_collision <= '1' when req_op = OP_BAD else '0'; - -- Wire up wishbone request latch - wishbone_out <= r.wb; + -- Generate stalls from stage 1 state machine + stall_out <= '1' when r1.state /= IDLE else '0'; -- Writeback (loads and reg updates) & completion control logic -- @@ -403,12 +445,12 @@ begin -- The mux on d_out.write reg defaults to the normal load hit case. d_out.write_enable <= '0'; d_out.valid <= '0'; - d_out.write_reg <= r.hit_load_reg_delayed; - d_out.write_data <= cache_out(r.hit_way_delayed); - d_out.write_len <= r.hit_dlength_delayed; - d_out.write_shift <= r.hit_data_shift_delayed; - d_out.sign_extend <= r.hit_sign_ext_delayed; - d_out.byte_reverse <= r.hit_byte_rev_delayed; + d_out.write_reg <= r2.load_reg; + d_out.write_data <= cache_out(r2.hit_way); + d_out.write_len <= r2.length; + d_out.write_shift <= r2.data_shift; + d_out.sign_extend <= r2.sign_extend; + d_out.byte_reverse <= r2.byte_reverse; d_out.second_word <= '0'; -- We have a valid load or store hit or we just completed a slow @@ -422,60 +464,60 @@ begin -- -- Sanity: Only one of these must be set in any given cycle - assert (r.update_valid and r.hit_load_delayed) /= '1' report + assert (r1.update_valid and r2.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with update_valid" severity FAILURE; - assert (r.slow_valid and r.hit_load_delayed) /= '1' report + assert (r1.slow_valid and r2.hit_load_valid) /= '1' report "unexpected 
hit_load_delayed collision with slow_valid" severity FAILURE; - assert (r.slow_valid and r.update_valid) /= '1' report + assert (r1.slow_valid and r1.update_valid) /= '1' report "unexpected update_valid collision with slow_valid" severity FAILURE; -- Delayed load hit case is the standard path - if r.hit_load_delayed = '1' then + if r2.hit_load_valid = '1' then d_out.write_enable <= '1'; -- If it's not a load with update, complete it now - if r.hit_load_upd_delayed = '0' then + if r2.load_is_update = '0' then d_out.valid <= '1'; end if; end if; -- Slow ops (load miss, NC, stores) - if r.slow_valid = '1' then + if r1.slow_valid = '1' then -- If it's a load, enable register writeback and switch -- mux accordingly -- - if r.req_latch.load then - d_out.write_reg <= r.req_latch.write_reg; + if r1.req.load then + d_out.write_reg <= r1.req.write_reg; d_out.write_enable <= '1'; -- Read data comes from the slow data latch, formatter -- from the latched request. -- - d_out.write_data <= r.slow_data; - d_out.write_shift <= r.req_latch.addr(2 downto 0); - d_out.sign_extend <= r.req_latch.sign_extend; - d_out.byte_reverse <= r.req_latch.byte_reverse; - d_out.write_len <= r.req_latch.length; + d_out.write_data <= r1.slow_data; + d_out.write_shift <= r1.req.addr(2 downto 0); + d_out.sign_extend <= r1.req.sign_extend; + d_out.byte_reverse <= r1.req.byte_reverse; + d_out.write_len <= r1.req.length; end if; -- If it's a store or a non-update load form, complete now - if r.req_latch.load = '0' or r.req_latch.update = '0' then + if r1.req.load = '0' or r1.req.update = '0' then d_out.valid <= '1'; end if; end if; -- We have a register update to do. - if r.update_valid = '1' then + if r1.update_valid = '1' then d_out.write_enable <= '1'; - d_out.write_reg <= r.req_latch.update_reg; + d_out.write_reg <= r1.req.update_reg; -- Change the read data mux to the address that's going into -- the register and the formatter does nothing. 
-- - d_out.write_data <= r.req_latch.addr; + d_out.write_data <= r1.req.addr; d_out.write_shift <= "000"; d_out.write_len <= "1000"; d_out.sign_extend <= '0'; @@ -484,26 +526,14 @@ begin -- If it was a load, this completes the operation (load with -- update case). -- - if r.req_latch.load = '1' then + if r1.req.load = '1' then d_out.valid <= '1'; end if; end if; end process; - -- Misc data & sel signals - misc: process(d_in) - begin - -- Wishbone & BRAM write data formatting for stores (most of it already - -- happens in loadstore1, this is the remaining sel generation and shifting) - -- - store_data <= std_logic_vector(shift_left(unsigned(d_in.data), - wishbone_data_shift(d_in.addr))); - - -- Wishbone read and write and BRAM write sel bits generation - bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); - end process; - + -- -- Generate a cache RAM for each way. This handles the normal -- reads, writes from reloads and the special store-hit update -- path as well. @@ -552,7 +582,7 @@ begin -- For timing, the mux on wr_data/sel/addr is not dependent on anything -- other than the current state. Only the do_write signal is. 
-- - if r.state = IDLE then + if r1.state = IDLE then -- When IDLE, the only write path is the store-hit update case wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); wr_data <= store_data; @@ -561,41 +591,39 @@ begin -- Otherwise, we might be doing a reload wr_data <= wishbone_in.dat; wr_sel <= (others => '1'); - wr_addr <= std_ulogic_vector(to_unsigned(get_row(r.wb.adr), ROW_BITS)); + wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.wb.adr), ROW_BITS)); end if; -- The two actual write cases here do_write <= '0'; - if r.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r.store_way = i then + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r1.store_way = i then do_write <= '1'; end if; if req_op = OP_STORE_HIT and req_hit_way = i then - assert r.state /= RELOAD_WAIT_ACK report "Store hit while in state:" & - state_t'image(r.state) + assert r1.state /= RELOAD_WAIT_ACK report "Store hit while in state:" & + state_t'image(r1.state) severity FAILURE; do_write <= '1'; end if; end process; end generate; + -- -- Cache hit synchronous machine for the easy case. This handles - -- non-update form load hits. 
+ -- non-update form load hits and stage 1 to stage 2 transfers -- dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- 1-cycle delayed signals for load hit response - r.hit_load_delayed <= r.hit_load_valid; - r.hit_way_delayed <= r.hit_way; - r.hit_load_upd_delayed <= r.req_latch.update; - r.hit_load_reg_delayed <= r.req_latch.write_reg; - r.hit_data_shift_delayed <= r.req_latch.addr(2 downto 0); - r.hit_sign_ext_delayed <= r.req_latch.sign_extend; - r.hit_byte_rev_delayed <= r.req_latch.byte_reverse; - r.hit_dlength_delayed <= r.req_latch.length; - - -- On-cycle pulse values get reset on every cycle - r.hit_load_valid <= '0'; + -- stage 1 -> stage 2 + r2.hit_load_valid <= r1.hit_load_valid; + r2.hit_way <= r1.hit_way; + r2.load_is_update <= r1.req.update; + r2.load_reg <= r1.req.write_reg; + r2.data_shift <= r1.req.addr(2 downto 0); + r2.length <= r1.req.length; + r2.sign_extend <= r1.req.sign_extend; + r2.byte_reverse <= r1.req.byte_reverse; -- If we have a request incoming, we have to latch it as d_in.valid -- is only set for a single cycle. It's up to the control logic to @@ -604,7 +632,7 @@ begin -- a stall output if necessary). if d_in.valid = '1' then - r.req_latch <= d_in; + r1.req <= d_in; report "op:" & op_t'image(req_op) & " addr:" & to_hstring(d_in.addr) & @@ -618,12 +646,15 @@ begin -- Fast path for load/store hits. Set signals for the writeback controls. if req_op = OP_LOAD_HIT then - r.hit_way <= req_hit_way; - r.hit_load_valid <= '1'; + r1.hit_way <= req_hit_way; + r1.hit_load_valid <= '1'; + else + r1.hit_load_valid <= '0'; end if; end if; end process; + -- -- Every other case is handled by this stage machine: -- -- * Cache load miss/reload (in conjunction with "rams") @@ -631,7 +662,8 @@ begin -- * Load hits for non-cachable forms -- * Stores (the collision case is handled in "rams") -- - -- All wishbone requests generation is done here + -- All wishbone requests generation is done here. This machine + -- operates at stage 1. 
-- dcache_slow : process(clk) variable way : integer range 0 to NUM_WAYS-1; @@ -643,32 +675,32 @@ begin for i in index_t loop cache_valids(i) <= (others => '0'); end loop; - r.state <= IDLE; - r.slow_valid <= '0'; - r.update_valid <= '0'; - r.wb.cyc <= '0'; - r.wb.stb <= '0'; + r1.state <= IDLE; + r1.slow_valid <= '0'; + r1.update_valid <= '0'; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; -- Not useful normally but helps avoiding tons of sim warnings - r.wb.adr <= (others => '0'); + r1.wb.adr <= (others => '0'); else -- One cycle pulses reset - r.slow_valid <= '0'; - r.update_valid <= '0'; + r1.slow_valid <= '0'; + r1.update_valid <= '0'; -- We cannot currently process a new request when not idle - assert req_op = OP_NONE or r.state = IDLE report "request " & - op_t'image(req_op) & " while in state " & state_t'image(r.state) + assert req_op = OP_NONE or r1.state = IDLE report "request " & + op_t'image(req_op) & " while in state " & state_t'image(r1.state) severity FAILURE; -- Main state machine - case r.state is + case r1.state is when IDLE => case req_op is when OP_LOAD_HIT => -- We have a load with update hit, we need the delayed update cycle if d_in.update = '1' then - r.state <= LOAD_UPDATE; + r1.state <= LOAD_UPDATE; end if; when OP_LOAD_MISS => @@ -696,40 +728,40 @@ begin end loop; -- Keep track of our index and way for subsequent stores. - r.store_index <= req_index; - r.store_way <= way; + r1.store_index <= req_index; + r1.store_way <= way; -- Prep for first wishbone read. 
We calculate the address of -- the start of the cache line -- - r.wb.adr <= d_in.addr(63 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); - r.wb.sel <= (others => '1'); - r.wb.we <= '0'; - r.wb.cyc <= '1'; - r.wb.stb <= '1'; - r.state <= RELOAD_WAIT_ACK; + r1.wb.adr <= d_in.addr(63 downto LINE_OFF_BITS) & + (LINE_OFF_BITS-1 downto 0 => '0'); + r1.wb.sel <= (others => '1'); + r1.wb.we <= '0'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => - r.wb.sel <= bus_sel; - r.wb.adr <= d_in.addr(63 downto 3) & "000"; - r.wb.cyc <= '1'; - r.wb.stb <= '1'; - r.wb.we <= '0'; - r.state <= NC_LOAD_WAIT_ACK; + r1.wb.sel <= bus_sel; + r1.wb.adr <= d_in.addr(63 downto 3) & "000"; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + r1.wb.we <= '0'; + r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => -- For store-with-update do the register update if d_in.update = '1' then - r.update_valid <= '1'; + r1.update_valid <= '1'; end if; - r.wb.sel <= bus_sel; - r.wb.adr <= d_in.addr(63 downto 3) & "000"; - r.wb.dat <= store_data; - r.wb.cyc <= '1'; - r.wb.stb <= '1'; - r.wb.we <= '1'; - r.state <= STORE_WAIT_ACK; + r1.wb.sel <= bus_sel; + r1.wb.adr <= d_in.addr(63 downto 3) & "000"; + r1.wb.dat <= store_data; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + r1.wb.we <= '1'; + r1.state <= STORE_WAIT_ACK; -- OP_NONE and OP_BAD do nothing when OP_NONE => @@ -746,51 +778,51 @@ begin -- not idle, which we don't currently know how to deal -- with. -- - if r.wb.adr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = - r.req_latch.addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) then - r.slow_data <= wishbone_in.dat; + if r1.wb.adr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = + r1.req.addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) then + r1.slow_data <= wishbone_in.dat; end if; -- That was the last word ? 
We are done - if is_last_row(r.wb.adr) then - cache_valids(r.store_index)(way) <= '1'; - r.wb.cyc <= '0'; - r.wb.stb <= '0'; + if is_last_row(r1.wb.adr) then + cache_valids(r1.store_index)(way) <= '1'; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; -- Complete the load that missed. For load with update -- we also need to do the deferred update cycle. -- - r.slow_valid <= '1'; - if r.req_latch.load = '1' and r.req_latch.update = '1' then - r.state <= LOAD_UPDATE; + r1.slow_valid <= '1'; + if r1.req.load = '1' and r1.req.update = '1' then + r1.state <= LOAD_UPDATE; report "completing miss with load-update !"; else - r.state <= IDLE; + r1.state <= IDLE; report "completing miss !"; end if; else -- Otherwise, calculate the next row address - r.wb.adr <= next_row_addr(r.wb.adr); + r1.wb.adr <= next_row_addr(r1.wb.adr); end if; end if; when LOAD_UPDATE => -- We need the extra cycle to complete a load with update - r.state <= LOAD_UPDATE2; + r1.state <= LOAD_UPDATE2; when LOAD_UPDATE2 => -- We need the extra cycle to complete a load with update - r.update_valid <= '1'; - r.state <= IDLE; + r1.update_valid <= '1'; + r1.state <= IDLE; when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => if wishbone_in.ack = '1' then - if r.state = NC_LOAD_WAIT_ACK then - r.slow_data <= wishbone_in.dat; + if r1.state = NC_LOAD_WAIT_ACK then + r1.slow_data <= wishbone_in.dat; end if; - r.slow_valid <= '1'; - r.wb.cyc <= '0'; - r.wb.stb <= '0'; - r.state <= IDLE; + r1.slow_valid <= '1'; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + r1.state <= IDLE; end if; end case; end if; From 6e0ee0b0db81fbe190653e3903c87c2287d05946 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 17 Oct 2019 16:41:19 +1100 Subject: [PATCH 08/11] icache & dcache: Fix store way variable We used the variable "way" in the wrong state in the cache when updating a line valid bit after the end of the wishbone transactions. We need to use the latched "store_way" instead.
Signed-off-by: Benjamin Herrenschmidt --- dcache.vhdl | 19 +++++++++---------- icache.vhdl | 17 +++++++++-------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 1e2a86c..087e71d 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -221,6 +221,7 @@ architecture rtl of dcache is -- PLRU output interface type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); signal plru_victim : plru_out_t; + signal replace_way : way_t; -- Wishbone read/write/cache write formatting signals signal bus_sel : wishbone_sel_type; @@ -395,6 +396,9 @@ begin -- The way that matched on a hit req_hit_way <= hit_way; + -- The way to replace on a miss + replace_way <= to_integer(unsigned(plru_victim(req_index))); + -- Combine the request and cache his status to decide what -- operation needs to be done -- @@ -666,7 +670,6 @@ begin -- operates at stage 1. -- dcache_slow : process(clk) - variable way : integer range 0 to NUM_WAYS-1; variable tagset : cache_tags_set_t; begin if rising_edge(clk) then @@ -706,21 +709,17 @@ begin when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - -- First find a victim way from the PLRU - -- - way := to_integer(unsigned(plru_victim(req_index))); - report "cache miss addr:" & to_hstring(d_in.addr) & " idx:" & integer'image(req_index) & - " way:" & integer'image(way) & + " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); -- Force misses on that way while reloading that line - cache_valids(req_index)(way) <= '0'; + cache_valids(req_index)(replace_way) <= '0'; -- Store new tag in selected way for i in 0 to NUM_WAYS-1 loop - if i = way then + if i = replace_way then tagset := cache_tags(req_index); write_tag(i, tagset, req_tag); cache_tags(req_index) <= tagset; @@ -729,7 +728,7 @@ begin -- Keep track of our index and way for subsequent stores. r1.store_index <= req_index; - r1.store_way <= way; + r1.store_way <= replace_way; -- Prep for first wishbone read. 
We calculate the address of -- the start of the cache line @@ -785,7 +784,7 @@ begin -- That was the last word ? We are done if is_last_row(r1.wb.adr) then - cache_valids(r1.store_index)(way) <= '1'; + cache_valids(r1.store_index)(r1.store_way) <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; diff --git a/icache.vhdl b/icache.vhdl index 75695e1..95e37af 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -178,6 +178,7 @@ architecture rtl of icache is -- PLRU output interface type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0); signal plru_victim : plru_out_t; + signal replace_way : way_t; -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is @@ -371,6 +372,9 @@ begin req_is_miss <= i_in.req and not is_hit and not flush_in; req_hit_way <= hit_way; + -- The way to replace on a miss + replace_way <= to_integer(unsigned(plru_victim(req_index))); + -- Output instruction from current cache row -- -- Note: This is a mild violation of our design principle of having pipeline @@ -420,7 +424,6 @@ begin -- Cache miss/reload synchronous machine icache_miss : process(clk) - variable way : integer range 0 to NUM_WAYS-1; variable tagset : cache_tags_set_t; begin if rising_edge(clk) then @@ -446,20 +449,18 @@ begin when IDLE => -- We need to read a cache line if req_is_miss = '1' then - way := to_integer(unsigned(plru_victim(req_index))); - report "cache miss nia:" & to_hstring(i_in.nia) & " SM:" & std_ulogic'image(i_in.stop_mark) & " idx:" & integer'image(req_index) & - " way:" & integer'image(way) & + " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); -- Force misses on that way while reloading that line - cache_valids(req_index)(way) <= '0'; + cache_valids(req_index)(replace_way) <= '0'; -- Store new tag in selected way for i in 0 to NUM_WAYS-1 loop - if i = way then + if i = replace_way then tagset := cache_tags(req_index); write_tag(i, tagset, req_tag); 
cache_tags(req_index) <= tagset; @@ -468,7 +469,7 @@ begin -- Keep track of our index and way for subsequent stores r.store_index <= req_index; - r.store_way <= way; + r.store_way <= replace_way; -- Prep for first wishbone read. We calculate the address of -- the start of the cache line @@ -484,7 +485,7 @@ begin if wishbone_in.ack = '1' then -- That was the last word ? We are done if is_last_row(r.wb.adr) then - cache_valids(r.store_index)(way) <= '1'; + cache_valids(r.store_index)(r.store_way) <= '1'; r.wb.cyc <= '0'; r.wb.stb <= '0'; r.state <= IDLE; From a0d95e791e52b1ca7ee7b14e041453fb54e34a6f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 22 Oct 2019 14:49:35 +1100 Subject: [PATCH 09/11] insn: Implement isync instruction The instruction works by redirecting fetch to nia+4 (hopefully using the same adder used to generate LR) and doing a backflush. Along with being single issue, this should guarantee that the next instruction only gets fetched after the pipe's been emptied. 
Signed-off-by: Benjamin Herrenschmidt --- decode1.vhdl | 2 +- execute1.vhdl | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 747411b..afb1315 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -108,7 +108,7 @@ architecture behaviour of decode1 is -- bclr, bcctr, bctar 2#100# => (ALU, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '1'), -- isync - 2#111# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), + 2#111# => (ALU, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), others => illegal_inst ); diff --git a/execute1.vhdl b/execute1.vhdl index de18a37..ec11678 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -118,6 +118,7 @@ begin variable bo, bi : std_ulogic_vector(4 downto 0); variable bf, bfa : std_ulogic_vector(2 downto 0); variable l : std_ulogic; + variable next_nia : std_ulogic_vector(63 downto 0); begin result := (others => '0'); result_with_carry := (others => '0'); @@ -135,6 +136,9 @@ begin terminate_out <= '0'; f_out <= Execute1ToFetch1TypeInit; + -- Next insn adder used in a couple of places + next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4); + -- rotator control signals right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; @@ -345,13 +349,17 @@ begin -- Keep our test cases happy for now, ignore trap instructions report "OP_TDI FIXME"; + when OP_ISYNC => + f_out.redirect <= '1'; + f_out.redirect_nia <= next_nia; + when others => terminate_out <= '1'; report "illegal"; end case; if e_in.lr = '1' then - ctrl_tmp.lr <= std_ulogic_vector(unsigned(e_in.nia) + 4); + ctrl_tmp.lr <= next_nia; end if; end if; From 742b21480e48e91e55e8e098b62d7beaac5a7a09 Mon Sep 17 00:00:00 2001 From: Benjamin 
Herrenschmidt Date: Tue, 22 Oct 2019 14:56:31 +1100 Subject: [PATCH 10/11] insn: Simplistic implementation of icbi We don't yet have a proper snooper for the icache, so for now make icbi just flush the whole icache. Signed-off-by: Benjamin Herrenschmidt --- core.vhdl | 4 +++- decode1.vhdl | 2 +- execute1.vhdl | 5 +++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/core.vhdl b/core.vhdl index 77af882..95dd89b 100644 --- a/core.vhdl +++ b/core.vhdl @@ -76,6 +76,7 @@ architecture behave of core is signal fetch2_stall_in : std_ulogic; signal decode1_stall_in : std_ulogic; signal decode2_stall_out : std_ulogic; + signal ex1_icache_inval: std_ulogic; signal flush: std_ulogic; @@ -129,7 +130,7 @@ begin wishbone_in => wishbone_insn_in ); - icache_rst <= rst or dbg_icache_rst; + icache_rst <= rst or dbg_icache_rst or ex1_icache_inval; fetch2_0: entity work.fetch2 port map ( @@ -204,6 +205,7 @@ begin e_in => decode2_to_execute1, f_out => execute1_to_fetch1, e_out => execute1_to_writeback, + icache_inval => ex1_icache_inval, terminate_out => terminate ); diff --git a/decode1.vhdl b/decode1.vhdl index afb1315..1a7bc0b 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -168,7 +168,7 @@ architecture behaviour of decode1 is 2#1110011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsh 2#1111011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsw -- 2#110111101-# extswsli - 2#1111010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- icbi + 2#1111010110# => (ALU, OP_ICBI, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- icbi 2#0000010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- icbt
2#0000001111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- isel 2#0000101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel diff --git a/execute1.vhdl b/execute1.vhdl index ec11678..f2277eb 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -27,6 +27,7 @@ entity execute1 is e_out : out Execute1ToWritebackType; + icache_inval : out std_ulogic; terminate_out : out std_ulogic ); end entity execute1; @@ -134,6 +135,7 @@ begin ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); terminate_out <= '0'; + icache_inval <= '0'; f_out <= Execute1ToFetch1TypeInit; -- Next insn adder used in a couple of places @@ -353,6 +355,9 @@ begin f_out.redirect <= '1'; f_out.redirect_nia <= next_nia; + when OP_ICBI => + icache_inval <= '1'; + when others => terminate_out <= '1'; report "illegal"; From cb4451498f313b96e55e7916510bfa03e4b94a2e Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 10:31:39 +1100 Subject: [PATCH 11/11] dcache: Add testbench A very simple one for now... 
Signed-off-by: Benjamin Herrenschmidt --- Makefile | 6 ++- dcache_tb.vhdl | 139 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 dcache_tb.vhdl diff --git a/Makefile b/Makefile index bc5388b..3056c53 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ GHDL=ghdl GHDLFLAGS=--std=08 -Psim-unisim CFLAGS=-O2 -Wall -all = core_tb simple_ram_behavioural_tb soc_reset_tb icache_tb multiply_tb dmi_dtm_tb divider_tb \ +all = core_tb simple_ram_behavioural_tb soc_reset_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \ rotator_tb countzero_tb # XXX @@ -38,6 +38,7 @@ plru_tb.o: plru.o icache.o: common.o wishbone_types.o plru.o cache_ram.o icache_tb.o: common.o wishbone_types.o icache.o simple_ram_behavioural.o dcache.o: common.o wishbone_types.o plru.o cache_ram.o +dcache_tb.o: common.o wishbone_types.o dcache.o simple_ram_behavioural.o insn_helpers.o: loadstore1.o: common.o helpers.o logical.o: decode_types.o @@ -81,6 +82,9 @@ fetch_tb: fetch_tb.o icache_tb: icache_tb.o $(GHDL) -e $(GHDLFLAGS) -Wl,simple_ram_behavioural_helpers_c.o $@ +dcache_tb: dcache_tb.o + $(GHDL) -e $(GHDLFLAGS) -Wl,simple_ram_behavioural_helpers_c.o $@ + plru_tb: plru_tb.o $(GHDL) -e $(GHDLFLAGS) $@ diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl new file mode 100644 index 0000000..0edbdb7 --- /dev/null +++ b/dcache_tb.vhdl @@ -0,0 +1,139 @@ +library ieee; +use ieee.std_logic_1164.all; + +library work; +use work.common.all; +use work.wishbone_types.all; + +entity dcache_tb is +end dcache_tb; + +architecture behave of dcache_tb is + signal clk : std_ulogic; + signal rst : std_ulogic; + + signal d_in : Loadstore1ToDcacheType; + signal d_out : DcacheToWritebackType; + + signal wb_bram_in : wishbone_master_out; + signal wb_bram_out : wishbone_slave_out; + + constant clk_period : time := 10 ns; +begin + dcache0: entity work.dcache + generic map( + LINE_SIZE => 64, + NUM_LINES => 4 + ) + port map( + clk => clk, + rst => rst, + d_in => 
d_in, + d_out => d_out, + wishbone_out => wb_bram_in, + wishbone_in => wb_bram_out + ); + + -- BRAM Memory slave + bram0: entity work.mw_soc_memory + generic map( + MEMORY_SIZE => 128, + RAM_INIT_FILE => "icache_test.bin" + ) + port map( + clk => clk, + rst => rst, + wishbone_in => wb_bram_in, + wishbone_out => wb_bram_out + ); + + clk_process: process + begin + clk <= '0'; + wait for clk_period/2; + clk <= '1'; + wait for clk_period/2; + end process; + + rst_process: process + begin + rst <= '1'; + wait for 2*clk_period; + rst <= '0'; + wait; + end process; + + stim: process + begin + -- Clear stuff + d_in.valid <= '0'; + d_in.load <= '0'; + d_in.nc <= '0'; + d_in.addr <= (others => '0'); + d_in.data <= (others => '0'); + d_in.write_reg <= (others => '0'); + d_in.length <= (others => '0'); + d_in.byte_reverse <= '0'; + d_in.sign_extend <= '0'; + d_in.update <= '0'; + d_in.update_reg <= (others => '0'); + + wait for 4*clk_period; + wait until rising_edge(clk); + + -- Cacheable read of address 4 + d_in.load <= '1'; + d_in.nc <= '0'; + d_in.addr <= x"0000000000000004"; + d_in.valid <= '1'; + wait until rising_edge(clk); + d_in.valid <= '0'; + + wait until rising_edge(clk) and d_out.write_enable = '1'; + assert d_out.valid = '1'; + assert d_out.write_data = x"0000000100000000" + report "data @" & to_hstring(d_in.addr) & + "=" & to_hstring(d_out.write_data) & + " expected 0000000100000000" + severity failure; +-- wait for clk_period; + + -- Cacheable read of address 30 + d_in.load <= '1'; + d_in.nc <= '0'; + d_in.addr <= x"0000000000000030"; + d_in.valid <= '1'; + wait until rising_edge(clk); + d_in.valid <= '0'; + + wait until rising_edge(clk) and d_out.write_enable = '1'; + assert d_out.valid = '1'; + assert d_out.write_data = x"0000000D0000000C" + report "data @" & to_hstring(d_in.addr) & + "=" & to_hstring(d_out.write_data) & + " expected 0000000D0000000C" + severity failure; + + -- Non-cacheable read of address 100 + d_in.load <= '1'; + d_in.nc <= '1'; + d_in.addr 
<= x"0000000000000100"; + d_in.valid <= '1'; + wait until rising_edge(clk); + d_in.valid <= '0'; + + wait until rising_edge(clk) and d_out.write_enable = '1'; + assert d_out.valid = '1'; + assert d_out.write_data = x"0000004100000040" + report "data @" & to_hstring(d_in.addr) & + "=" & to_hstring(d_out.write_data) & + " expected 0000004100000040" + severity failure; + + wait for clk_period*4; + + assert false report "end of test" severity failure; + wait; + + end process; +end;