library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use std.textio.all;

library work;
use work.wishbone_types.all;
use work.utils.all;
use work.helpers.all;

-- Wrapper around the generated LiteDRAM core.
--
-- It exposes a data wishbone (wb_in/wb_out) backed by a set-associative
-- cache sitting in front of the LiteDRAM native user port, and a control
-- wishbone (wb_ctrl_in/wb_ctrl_out) that is steered either to the LiteDRAM
-- CSR bus or to a small init memory.
entity litedram_wrapper is
    generic (
        -- LiteDRAM geometry
        DRAM_ABITS      : positive;   -- width of the native port command address
        DRAM_ALINES     : natural;    -- number of ddram_a pins
        DRAM_DLINES     : natural;    -- number of ddram_dq pins
        DRAM_CKLINES    : natural;    -- number of ddram_clk_p/n pairs
        DRAM_PORT_WIDTH : positive;   -- native port data width in bits

        -- Pseudo-ROM payload (handed to the init memory)
        PAYLOAD_SIZE    : natural;
        PAYLOAD_FILE    : string;

        -- L2 cache geometry
        LINE_SIZE       : positive := 128;   -- cache line size in bytes
        NUM_LINES       : positive := 64;    -- lines per way (number of sets)
        NUM_WAYS        : positive := 4;     -- associativity
        STOREQ_DEPTH    : positive := 8;     -- max number of queued stores
        -- When true, hold off load misses until the store queue is empty
        NO_LS_OVERLAP   : boolean  := false;

        -- Debug
        LITEDRAM_TRACE  : boolean  := false;
        TRACE           : boolean  := false
        );
    port(
        -- LiteDRAM generates the system clock and reset from clk_in
        clk_in          : in  std_ulogic;
        rst             : in  std_ulogic;
        system_clk      : out std_ulogic;
        system_reset    : out std_ulogic;
        core_alt_reset  : out std_ulogic;
        pll_locked      : out std_ulogic;

        -- Data wishbone
        wb_in           : in  wishbone_master_out;
        wb_out          : out wishbone_slave_out;

        -- Control wishbone and its two decode selects
        wb_ctrl_in      : in  wb_io_master_out;
        wb_ctrl_out     : out wb_io_slave_out;
        wb_ctrl_is_csr  : in  std_ulogic;
        wb_ctrl_is_init : in  std_ulogic;

        -- Misc
        init_done       : out std_ulogic;
        init_error      : out std_ulogic;

        -- DRAM wires
        ddram_a         : out   std_ulogic_vector(DRAM_ALINES-1 downto 0);
        ddram_ba        : out   std_ulogic_vector(2 downto 0);
        ddram_ras_n     : out   std_ulogic;
        ddram_cas_n     : out   std_ulogic;
        ddram_we_n      : out   std_ulogic;
        ddram_cs_n      : out   std_ulogic;
        ddram_dm        : out   std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_dq        : inout std_ulogic_vector(DRAM_DLINES-1 downto 0);
        ddram_dqs_p     : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_dqs_n     : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_clk_p     : out   std_ulogic_vector(DRAM_CKLINES-1 downto 0);
        ddram_clk_n     : out   std_ulogic_vector(DRAM_CKLINES-1 downto 0);
        ddram_cke       : out   std_ulogic;
        ddram_odt       : out   std_ulogic;
        ddram_reset_n   : out   std_ulogic
        );
end entity litedram_wrapper;

architecture behaviour of litedram_wrapper is

    -- Generated LiteDRAM core: DRAM pins, a 32-bit control wishbone and
    -- one native user port (command / write data / read data channels).
    component litedram_core port (
        clk            : in    std_ulogic;
        rst            : in    std_ulogic;
        pll_locked     : out   std_ulogic;
        ddram_a        : out   std_ulogic_vector(DRAM_ALINES-1 downto 0);
        ddram_ba       : out   std_ulogic_vector(2 downto 0);
        ddram_ras_n    : out   std_ulogic;
        ddram_cas_n    : out   std_ulogic;
        ddram_we_n     : out   std_ulogic;
        ddram_cs_n     : out   std_ulogic;
        ddram_dm       : out   std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_dq       : inout std_ulogic_vector(DRAM_DLINES-1 downto 0);
        ddram_dqs_p    : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_dqs_n    : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_clk_p    : out   std_ulogic_vector(DRAM_CKLINES-1 downto 0);
        ddram_clk_n    : out   std_ulogic_vector(DRAM_CKLINES-1 downto 0);
        ddram_cke      : out   std_ulogic;
        ddram_odt      : out   std_ulogic;
        ddram_reset_n  : out   std_ulogic;
        init_done      : out   std_ulogic;
        init_error     : out   std_ulogic;
        user_clk       : out   std_ulogic;
        user_rst       : out   std_ulogic;
        wb_ctrl_adr    : in    std_ulogic_vector(29 downto 0);
        wb_ctrl_dat_w  : in    std_ulogic_vector(31 downto 0);
        wb_ctrl_dat_r  : out   std_ulogic_vector(31 downto 0);
        wb_ctrl_sel    : in    std_ulogic_vector(3 downto 0);
        wb_ctrl_cyc    : in    std_ulogic;
        wb_ctrl_stb    : in    std_ulogic;
        wb_ctrl_ack    : out   std_ulogic;
        wb_ctrl_we     : in    std_ulogic;
        wb_ctrl_cti    : in    std_ulogic_vector(2 downto 0);
        wb_ctrl_bte    : in    std_ulogic_vector(1 downto 0);
        wb_ctrl_err    : out   std_ulogic;
        user_port_native_0_cmd_valid   : in  std_ulogic;
        user_port_native_0_cmd_ready   : out std_ulogic;
        user_port_native_0_cmd_we      : in  std_ulogic;
        user_port_native_0_cmd_addr    : in  std_ulogic_vector(DRAM_ABITS-1 downto 0);
        user_port_native_0_wdata_valid : in  std_ulogic;
        user_port_native_0_wdata_ready : out std_ulogic;
        user_port_native_0_wdata_we    : in  std_ulogic_vector(DRAM_PORT_WIDTH/8-1 downto 0);
        user_port_native_0_wdata_data  : in  std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0);
        user_port_native_0_rdata_valid : out std_ulogic;
        user_port_native_0_rdata_ready : in  std_ulogic;
        user_port_native_0_rdata_data  : out std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0)
        );
    end component;

    -- Native user port 0: command channel
    signal user_port0_cmd_valid   : std_ulogic;
    signal user_port0_cmd_ready   : std_ulogic;
    signal user_port0_cmd_we      : std_ulogic;
    signal user_port0_cmd_addr    : std_ulogic_vector(DRAM_ABITS-1 downto 0);
    -- Native user port 0: write data channel
    signal user_port0_wdata_valid : std_ulogic;
    signal user_port0_wdata_ready : std_ulogic;
    signal user_port0_wdata_we    : std_ulogic_vector(DRAM_PORT_WIDTH/8-1 downto 0);
    signal user_port0_wdata_data  : std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0);
    -- Native user port 0: read data channel
    signal user_port0_rdata_valid : std_ulogic;
    signal user_port0_rdata_ready : std_ulogic;
    signal user_port0_rdata_data  : std_ulogic_vector(DRAM_PORT_WIDTH-1 downto 0);

    -- Registered copy of the control wishbone, as presented to the CSR bus
    signal wb_ctrl_adr   : std_ulogic_vector(29 downto 0);
    signal wb_ctrl_dat_w : std_ulogic_vector(31 downto 0);
    signal wb_ctrl_dat_r : std_ulogic_vector(31 downto 0);
    signal wb_ctrl_sel   : std_ulogic_vector(3 downto 0);
    signal wb_ctrl_cyc   : std_ulogic := '0';
    signal wb_ctrl_stb   : std_ulogic;
    signal wb_ctrl_ack   : std_ulogic;
    signal wb_ctrl_we    : std_ulogic;

    -- Control wishbone as seen by the init memory
    signal wb_init_in    : wb_io_master_out;
    signal wb_init_out   : wb_io_slave_out;

    -- DRAM port geometry: data width in bits and in byte lanes
    constant DRAM_DBITS : natural := DRAM_PORT_WIDTH;
    constant DRAM_SBITS : natural := DRAM_DBITS / 8;

    -- Wishbone geometry shortcuts: data width and select width
    constant WBL  : positive := wb_in.dat'length;
    constant WBSL : positive := wb_in.sel'length;

    -- Number of wishbone words in one DRAM port word, and the number of
    -- address bits needed to pick one of them
    constant WB_WORD_COUNT : positive := DRAM_DBITS/WBL;
    constant WB_WSEL_BITS  : positive := log2(WB_WORD_COUNT);

    -- Data RAM organisation: each row is one DRAM port word (see
    -- cache_row_t below) and a cache line is made of ROW_PER_LINE
    -- consecutive rows.
    --
    -- ROW_SIZE: width of a row in bytes, i.e. the litedram port width
    constant ROW_SIZE       : natural := DRAM_SBITS;
    -- ROW_PER_LINE: rows (litedram transactions) per cache line
    constant ROW_PER_LINE   : natural := LINE_SIZE / ROW_SIZE;
    -- BRAM_ROWS: rows needed to hold all the lines of one way
    constant BRAM_ROWS      : natural := NUM_LINES * ROW_PER_LINE;

    -- Address bit-field widths
    --
    -- ROW_BITS: bits to select a row in a way
    constant ROW_BITS       : natural := log2(BRAM_ROWS);
    -- ROW_LINEBITS: bits to select a row within a line
    constant ROW_LINEBITS   : natural := log2(ROW_PER_LINE);
    -- LINE_OFF_BITS: bits of byte offset within a cache line
    constant LINE_OFF_BITS  : natural := log2(LINE_SIZE);
    -- ROW_OFF_BITS: bits of byte offset within a row
    constant ROW_OFF_BITS   : natural := log2(ROW_SIZE);
    -- REAL_ADDR_BITS: width of the byte address we track (DRAM command
    -- address plus the byte offset inside a port word)
    constant REAL_ADDR_BITS : positive := DRAM_ABITS + ROW_OFF_BITS;
    -- INDEX_BITS: bits to select a cache line
    constant INDEX_BITS     : natural := log2(NUM_LINES);
    -- SET_SIZE_BITS: log2 of the number of bytes covered by one way
    constant SET_SIZE_BITS  : natural := LINE_OFF_BITS + INDEX_BITS;
    -- TAG_BITS: bits left over for the tag
    constant TAG_BITS       : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
    -- WAY_BITS: bits to select a way
    constant WAY_BITS       : natural := log2(NUM_WAYS);

    -- Index subtypes
    subtype row_t         is integer range 0 to BRAM_ROWS-1;
    subtype index_t       is integer range 0 to NUM_LINES-1;
    subtype way_t         is integer range 0 to NUM_WAYS-1;
    subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);

    -- One row of a way's data RAM
    subtype cache_row_t is std_ulogic_vector(DRAM_DBITS-1 downto 0);

    -- Tag storage has one entry per cache index. Vivado does not cope
    -- with the clean (commented out) 3d-memory definition below, so all
    -- the tags of one index are packed side by side into a single flat
    -- vector of TAG_RAM_WIDTH bits instead.
    subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
    -- type cache_tags_set_t is array(way_t) of cache_tag_t;
    -- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
    constant TAG_RAM_WIDTH : natural := NUM_WAYS * TAG_BITS;
    subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
    type cache_tags_array_t is array(index_t) of cache_tags_set_t;

    -- Valid bits: one bit per way for every cache index
    subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
    type cache_valids_t is array(index_t) of cache_way_valids_t;

    -- "Temporary" per-row valid bits for the line currently being refilled
    type row_per_line_valid_t is array(0 to ROW_PER_LINE-1) of std_ulogic;

    -- Tag and valid storage. The tag array is requested as distributed
    -- RAM through the ram_style attribute.
    signal cache_tags   : cache_tags_array_t;
    signal cache_valids : cache_valids_t;

    attribute ram_style : string;
    attribute ram_style of cache_tags : signal is "distributed";

    --
    -- Store queue signals
    --
    -- Each entry holds a single wishbone data word together with the
    -- wishbone sel bits and the address bits needed to select which
    -- part of the DRAM port to write to.
    --
constant STOREQ_BITS : positive := WBL + WBSL + WB_WSEL_BITS;

    -- Store queue (FIFO) read and write side handshakes
    signal storeq_rd_ready : std_ulogic;
    signal storeq_rd_valid : std_ulogic;
    signal storeq_rd_data  : std_ulogic_vector(STOREQ_BITS-1 downto 0);
    signal storeq_wr_ready : std_ulogic;
    signal storeq_wr_valid : std_ulogic;
    signal storeq_wr_data  : std_ulogic_vector(STOREQ_BITS-1 downto 0);

    --
    -- Cache management signals
    --

    -- Cache state machine
    type state_t is (IDLE,             -- Normal load hit processing
                     REFILL_CLR_TAG,   -- Cache refill clear tag
                     REFILL_WAIT_ACK); -- Cache refill wait ack

    signal state : state_t;

    -- Latched WB request
    signal wb_req   : wishbone_master_out := wishbone_master_out_init;

    -- Stashed WB request (one-deep skid buffer, see request_latch)
    signal wb_stash : wishbone_master_out := wishbone_master_out_init;

    -- Read pipeline (to handle cache RAM latency)
    signal read_ack_0 : std_ulogic := '0';
    signal read_ack_1 : std_ulogic := '0';
    signal read_wsl_0 : std_ulogic_vector(WB_WSEL_BITS-1 downto 0) := (others => '0');
    signal read_wsl_1 : std_ulogic_vector(WB_WSEL_BITS-1 downto 0) := (others => '0');
    signal read_way_0 : way_t;
    signal read_way_1 : way_t;

    -- Store ack pipeline
    signal store_ack_0 : std_ulogic := '0';
    signal store_ack_1 : std_ulogic := '0';

    -- Async signals decoding latched request
    type req_op_t is (OP_NONE,
                      OP_LOAD_HIT,
                      OP_LOAD_MISS,
                      OP_STORE_HIT,
                      OP_STORE_MISS,
                      OP_STORE_DELAYED);

    signal req_index   : index_t;
    signal req_row     : row_t;
    signal req_hit_way : way_t;
    signal req_tag     : cache_tag_t;
    signal req_op      : req_op_t;
    signal req_laddr   : std_ulogic_vector(REAL_ADDR_BITS-1 downto 0);
    signal req_wsl     : std_ulogic_vector(WB_WSEL_BITS-1 downto 0);
    signal req_we      : std_ulogic_vector(DRAM_SBITS-1 downto 0);
    signal req_wdata   : std_ulogic_vector(DRAM_DBITS-1 downto 0);
    signal stall       : std_ulogic;

    -- Line refill command signals and latches
    signal refill_cmd_valid : std_ulogic;
    signal refill_cmd_addr  : std_ulogic_vector(DRAM_ABITS-1 downto 0);
    signal refill_way       : way_t;
    signal refill_index     : index_t;
    signal refill_row       : row_t;
    signal refill_end_row   : row_in_line_t;
    signal refill_rows_vlid : row_per_line_valid_t;

    -- Cache RAM interface: read data of every way
    type cache_ram_out_t is array(way_t) of cache_row_t;
    signal cache_out : cache_ram_out_t;

    -- PLRU output interface: victim way for every cache index
    type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
    signal plru_victim : plru_out_t;

    --
    -- Helper functions to decode incoming requests
    --

    -- Return the DRAM real (byte) address from a wishbone address. The
    -- wishbone address is a word address, so it is shifted up by
    -- wishbone_log2_width and the low bits are zero.
    function get_real_addr(addr: wishbone_addr_type) return std_ulogic_vector is
        variable ra: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0) := (others => '0');
    begin
        ra(REAL_ADDR_BITS - 1 downto wishbone_log2_width) :=
            addr(REAL_ADDR_BITS - wishbone_log2_width - 1 downto 0);
        return ra;
    end;

    -- Return the cache line index (tag index) for an address
    function get_index(addr: wishbone_addr_type) return index_t is
    begin
        return to_integer(unsigned(addr(SET_SIZE_BITS - wishbone_log2_width - 1 downto
                                        LINE_OFF_BITS - wishbone_log2_width)));
    end;

    -- Return the cache row index (data memory) for an address
    function get_row(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto 0)) return row_t is
    begin
        return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)));
    end;

    -- Return the index of a row within a line
    function get_row_of_line(row: row_t) return row_in_line_t is
        variable row_v : unsigned(ROW_BITS-1 downto 0);
    begin
        row_v := to_unsigned(row, ROW_BITS);
        return row_v(ROW_LINEBITS-1 downto 0);
    end;

    -- Returns whether this is the last row of a line. It takes a DRAM address
    function is_last_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS);
                              last: row_in_line_t)
        return boolean is
    begin
        return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
    end;

    -- Returns whether this is the last row of a line
    function is_last_row(row: row_t;
                         last: row_in_line_t) return boolean is
    begin
        return get_row_of_line(row) = last;
    end;

    -- Return the address of the next row in the current cache line. It
    -- takes a DRAM address. Only the row-in-line field is incremented,
    -- so the result wraps around inside the line.
    function next_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS))
        return std_ulogic_vector is
        variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
        variable result  : std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS);
    begin
        -- Is there no simpler way in VHDL to generate that 3 bits adder ?
        row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
        row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
        result := addr;
        result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
        return result;
    end;

    -- Return the next row in the current cache line. We use a dedicated
    -- function in order to limit the size of the generated adder to be
    -- only the bits within a cache line (3 bits with default settings)
    --
    function next_row(row: row_t) return row_t is
        variable row_v   : std_ulogic_vector(ROW_BITS-1 downto 0);
        variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
    begin
        row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
        row_idx := row_v(ROW_LINEBITS-1 downto 0);
        row_v(ROW_LINEBITS-1 downto 0) := std_ulogic_vector(unsigned(row_idx) + 1);
        return to_integer(unsigned(row_v));
    end;

    -- Get the tag value from the address
    function get_tag(addr: wishbone_addr_type) return cache_tag_t is
    begin
        return addr(REAL_ADDR_BITS - wishbone_log2_width - 1 downto
                    SET_SIZE_BITS - wishbone_log2_width);
    end;

    -- Read a tag from a tag memory row
    function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
    begin
        return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
    end;

    -- Write a tag to tag memory row
    procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
                        tag: cache_tag_t) is
    begin
        tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
    end;

begin

    -- Sanity checks
    assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
    assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
    assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
    assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
    assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
        report "geometry bits don't add up" severity FAILURE;
    assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
        report "geometry bits don't add up" severity FAILURE;
    assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
        report "geometry bits don't add up" severity FAILURE;
    assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
        report "geometry bits don't add up" severity FAILURE;

    -- alternate core reset address set when DRAM is not initialized.
    core_alt_reset <= not init_done;

    -- Init code BRAM memory slave
    init_ram_0: entity work.dram_init_mem
        generic map(
            EXTRA_PAYLOAD_FILE => PAYLOAD_FILE,
            EXTRA_PAYLOAD_SIZE => PAYLOAD_SIZE
            )
        port map(
            clk => system_clk,
            wb_in => wb_init_in,
            wb_out => wb_init_out
            );

    --
    -- Control bus wishbone: This muxes the wishbone to the CSRs
    -- and an internal small one to the init BRAM
    --

    -- Init DRAM wishbone IN signals. Only cyc is qualified with the
    -- init select; the other fields are passed through unconditionally.
    wb_init_in.adr <= wb_ctrl_in.adr;
    wb_init_in.dat <= wb_ctrl_in.dat;
    wb_init_in.sel <= wb_ctrl_in.sel;
    wb_init_in.we  <= wb_ctrl_in.we;
    wb_init_in.stb <= wb_ctrl_in.stb;
    wb_init_in.cyc <= wb_ctrl_in.cyc and wb_ctrl_is_init;

    -- DRAM CSR IN signals. Extra latch to help with timing
    csr_latch: process(system_clk)
    begin
        if rising_edge(system_clk) then
            if system_reset = '1' then
                wb_ctrl_cyc <= '0';
                wb_ctrl_stb <= '0';
            else
                -- XXX Maybe only update addr when cyc = '1' to save power ?
                wb_ctrl_adr   <= x"0000" & wb_ctrl_in.adr(13 downto 0);
                wb_ctrl_dat_w <= wb_ctrl_in.dat;
                wb_ctrl_sel   <= wb_ctrl_in.sel;
                wb_ctrl_we    <= wb_ctrl_in.we;
                wb_ctrl_cyc   <= wb_ctrl_in.cyc and wb_ctrl_is_csr;
                wb_ctrl_stb   <= wb_ctrl_in.stb and wb_ctrl_is_csr;

                -- Clear stb on ack otherwise the memory will latch
                -- the write twice which breaks levelling. On the next
                -- cycle we will latch an updated stb that takes the
                -- ack into account.
                if wb_ctrl_ack = '1' then
                    wb_ctrl_stb <= '0';
                end if;
            end if;
        end if;
    end process;

    -- Ctrl bus wishbone OUT signals. XXX Consider adding latch on
    -- CSR response to help timing
    wb_ctrl_out.ack   <= wb_ctrl_ack when wb_ctrl_is_csr = '1'
                         else wb_init_out.ack;
    wb_ctrl_out.dat   <= wb_ctrl_dat_r when wb_ctrl_is_csr = '1'
                         else wb_init_out.dat;
    -- Explicit "= '1'" comparison, matching the two assignments above
    wb_ctrl_out.stall <= wb_init_out.stall when wb_ctrl_is_init = '1'
                         else '0' when wb_ctrl_in.cyc = '0'
                         else not wb_ctrl_ack;

    -- Generate a cache RAM for each way
    rams: for i in 0 to NUM_WAYS-1 generate
        signal do_read  : std_ulogic;
        signal do_write : std_ulogic;
        signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
        signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
        signal wr_data  : std_ulogic_vector(DRAM_DBITS-1 downto 0);
        signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
        signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
        signal dout     : cache_row_t;
    begin
        way: entity work.cache_ram
            generic map (
                ROW_BITS => ROW_BITS,
                WIDTH => DRAM_DBITS,
                ADD_BUF => true
                )
            port map (
                clk     => system_clk,
                rd_en   => do_read,
                rd_addr => rd_addr,
                rd_data => dout,
                wr_sel  => wr_sel_m,
                wr_addr => wr_addr,
                wr_data => wr_data
                );
        process(all)
        begin
            --
            -- Read port: always enabled, addressed by the latched request
            --
            do_read <= '1';
            cache_out(i) <= dout;
            rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));

            --
            -- Write mux: cache refills from DRAM or writes from Wishbone
            --
            if req_op = OP_STORE_HIT and req_hit_way = i then
                -- Write from wishbone
                wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
                wr_data <= req_wdata;
                wr_sel  <= req_we;
            else
                -- Refill from DRAM
                wr_data <= user_port0_rdata_data;
                wr_sel  <= (others => '1');
                wr_addr <= std_ulogic_vector(to_unsigned(refill_row, ROW_BITS));
            end if;

            --
            -- Write enable logic
            --
            do_write <= '0';
            if req_op = OP_STORE_HIT and req_hit_way = i then
                do_write <= '1';
            elsif user_port0_rdata_valid = '1' and refill_way = i then
                do_write <= '1';
            end if;

            -- Mask write selects with do_write since BRAM doesn't always
            -- have a global write-enable (Vivado generates TDP instead
            -- of SDP when using one, thus doubling cache BRAM usage).
            -- The loop index is named "lane" so it does not shadow the
            -- generate index "i" used for the way number below.
            for lane in 0 to ROW_SIZE-1 loop
                wr_sel_m(lane) <= wr_sel(lane) and do_write;
            end loop;

            if TRACE and rising_edge(system_clk) then
                if do_write = '1' then
                    report "cache write way:" & integer'image(i) &
                        " addr:" & to_hstring(wr_addr) &
                        " sel:" & to_hstring(wr_sel_m) &
                        " data:" & to_hstring(wr_data);
                end if;
            end if;
        end process;
    end generate;

    -- Generate PLRUs (one per cache index, only when there is more
    -- than one way to choose from)
    maybe_plrus: if NUM_WAYS > 1 generate
    begin
        plrus: for i in 0 to NUM_LINES-1 generate
            -- PLRU interface
            signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
            signal plru_acc_en : std_ulogic;
            signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
        begin
            plru : entity work.plru
                generic map (
                    BITS => WAY_BITS
                    )
                port map (
                    clk => system_clk,
                    rst => system_reset,
                    acc => plru_acc,
                    acc_en => plru_acc_en,
                    lru => plru_out
                    );

            process(req_index, req_op, req_hit_way, plru_out)
            begin
                -- PLRU interface: record an access on any hit to this index
                if (req_op = OP_LOAD_HIT or
                    req_op = OP_STORE_HIT) and req_index = i then
                    plru_acc_en <= '1';
                else
                    plru_acc_en <= '0';
                end if;
                plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));
                plru_victim(i) <= plru_out;
            end process;
        end generate;
    end generate;

    --
    -- Wishbone request interface:
    --
    -- - Incoming wishbone request latch (to help with timing)
    -- - Read response pipeline (to match BRAM output buffer delay)
    -- - Stall generation
    --
    -- XXX TODO: Properly handle cyc drops before all acks are sent...
    --
    request_latch: process(system_clk)
    begin
        if rising_edge(system_clk) then
            -- Implement a stash buffer. If we are stalled and stash is
            -- free, fill it up. This will generate a WB stall on the
            -- next cycle.
            if stall = '1' and wb_out.stall = '0' and
                wb_in.cyc = '1' and wb_in.stb = '1' then
                wb_stash <= wb_in;
                if TRACE then
                    report "stashed wb req ! addr:" & to_hstring(wb_in.adr & "000") &
                        " we:" & std_ulogic'image(wb_in.we) &
                        " sel:" & to_hstring(wb_in.sel);
                end if;
            end if;

            -- We aren't stalled, see what we can do
            if stall = '0' then
                if wb_stash.cyc = '1' then
                    -- Something in stash ! use it and clear stash
                    wb_req <= wb_stash;
                    wb_stash.cyc <= '0';
                    if TRACE then
                        report "unstashed wb req ! addr:" & to_hstring(wb_stash.adr & "000") &
                            " we:" & std_ulogic'image(wb_stash.we) &
                            " sel:" & to_hstring(wb_stash.sel);
                    end if;
                else
                    -- Grab request from WB. When cyc is low only the
                    -- cyc/stb bits are refreshed; the other fields of
                    -- the latch keep their previous values.
                    if wb_in.cyc = '1' then
                        wb_req <= wb_in;
                    else
                        wb_req.cyc <= wb_in.cyc;
                        wb_req.stb <= wb_in.stb;
                    end if;

                    if TRACE then
                        if wb_in.cyc = '1' and wb_in.stb = '1' then
                            report "latch new wb req ! addr:" & to_hstring(wb_in.adr & "000") &
                                " we:" & std_ulogic'image(wb_in.we) &
                                " sel:" & to_hstring(wb_in.sel);
                        end if;
                    end if;
                end if;
            end if;
        end if;
    end process;

    -- Stall when stash is full
    wb_out.stall <= wb_stash.cyc;

    --
    -- Read response pipeline
    --
    -- Two register stages carrying the ack, the word select and the hit
    -- way of a load hit alongside the cache RAM read.
    --
    read_pipe: process(system_clk)
    begin
        if rising_edge(system_clk) then
            read_ack_0 <= '1' when req_op = OP_LOAD_HIT else '0';
            read_wsl_0 <= req_wsl;
            read_way_0 <= req_hit_way;

            read_ack_1 <= read_ack_0;
            read_wsl_1 <= read_wsl_0;
            read_way_1 <= read_way_0;

            if TRACE then
                if req_op = OP_LOAD_HIT then
                    report "Load hit addr:" & to_hstring(wb_req.adr & "000") &
                        " idx:" & integer'image(req_index) &
                        " tag:" & to_hstring(req_tag) &
                        " way:" & integer'image(req_hit_way);
                elsif req_op = OP_LOAD_MISS then
                    report "Load miss addr:" & to_hstring(wb_req.adr & "000");
                end if;
                if read_ack_0 = '1' then
                    report "read data:" & to_hstring(cache_out(read_way_0));
                end if;
            end if;
        end if;
    end process;

    --
    -- Store acks pipeline
    --
    store_ack_pipe: process(system_clk)
    begin
        if rising_edge(system_clk) then
            store_ack_1 <= store_ack_0;
        end if;
    end process;

    --
    -- Wishbone response generation
    --
    wb_rseponse: process(all)
        variable rdata        : std_ulogic_vector(DRAM_DBITS-1 downto 0);
        variable store_done   : std_ulogic;
        variable accept_store : std_ulogic;
        variable wsel         : natural range 0 to WB_WORD_COUNT-1;
    begin
        -- Can we accept a store ? This is set when the store queue & command
        -- queue are not full.
        --
        -- This does *not* mean that we will accept the store, there are other
        -- reasons to delay them (see OP_STORE_DELAYED).
        --
        -- A store is fully accepted when *both* req_op is not OP_STORE_DELAYED
        -- and accept_store is '1'.
        --
        -- The reason for this split is to avoid a circular dependency inside
        -- LiteDRAM, since cmd_ready from litedram is driven from cmd_valid (*)
        -- we don't want to generate cmd_valid from cmd_ready. So we generate
        -- it instead from all the *other* conditions that make a store valid.
        --
        -- (*) It's my understanding that user_port0_cmd_ready from LiteDRAM is
        -- combinational from user_port0_cmd_valid along with a bunch of other
        -- internal signals. IE. we won't know that LiteDRAM cannot accept a
        -- command until we try to send one.
        --
        accept_store := user_port0_cmd_ready and storeq_wr_ready;

        -- Generate stalls. For stores we stall if we can't accept it.
        -- For loads, we stall if we are going to take a load miss or
        -- are in the middle of a refill and it isn't a partial hit.
        if req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then
            stall <= not accept_store;
        elsif req_op = OP_LOAD_MISS or req_op = OP_STORE_DELAYED then
            stall <= '1';
        else
            stall <= '0';
        end if;

        -- Data out mux
        rdata := cache_out(read_way_1);

        -- Pick the wishbone word selected by the pipelined word select
        wsel := to_integer(unsigned(read_wsl_1));
        wb_out.dat <= rdata((wsel+1)*WBL-1 downto wsel*WBL);

        -- Early-complete stores on wishbone.
        if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
            store_done := accept_store;
        else
            store_done := '0';
        end if;

        -- Pipeline store acks
        store_ack_0 <= store_done;

        -- Generate Wishbone ACKs on read hits and store complete
        --
        -- This can happen on store right behind loads ! This is why
        -- we delay a store when a load ack is in the pipeline in the
        -- request decoder below.
        --
        wb_out.ack <= read_ack_1 or store_ack_1;
        assert read_ack_1 = '0' or store_ack_1 = '0'
            report "Read ack and store ack collision !" severity failure;
    end process;

    --
    -- Cache request decode
    --
    request_decode: process(all)
        variable valid       : boolean;
        variable is_hit      : boolean;
        variable store_delay : boolean;
        variable hit_way     : way_t;
    begin
        -- Extract line, row and tag from request
        req_index <= get_index(wb_req.adr);
        req_row   <= get_row(get_real_addr(wb_req.adr));
        req_tag   <= get_tag(wb_req.adr);

        -- Byte address of the request; its row-aligned part is used as
        -- the first DRAM address of a refill on a load miss
        req_laddr <= get_real_addr(wb_req.adr);

        -- Do we have a valid request in the WB latch ?
        valid := wb_req.cyc = '1' and wb_req.stb = '1';

        -- Store signals: replicate the wishbone data on every word of
        -- the row and enable only the byte lanes of the addressed word.
        -- The zero fill is sized from WBSL rather than hard coded to 8
        -- select bits.
        req_wsl <= wb_req.adr(WB_WSEL_BITS-1 downto 0);
        for i in 0 to WB_WORD_COUNT-1 loop
            if to_integer(unsigned(req_wsl)) = i then
                req_we(WBSL*(i+1)-1 downto WBSL*i) <= wb_req.sel;
            else
                req_we(WBSL*(i+1)-1 downto WBSL*i) <= (WBSL-1 downto 0 => '0');
            end if;
            req_wdata(WBL*(i+1)-1 downto WBL*i) <= wb_req.dat;
        end loop;

        -- Test if pending request is a hit on any way. A way counts as
        -- valid either through its valid bit or, while a refill is
        -- waiting for data, when it is the line being refilled and the
        -- requested row has already arrived.
        hit_way := 0;
        is_hit := false;
        for i in way_t loop
            if valid and
                (cache_valids(req_index)(i) = '1' or
                 (state = REFILL_WAIT_ACK and
                  req_index = refill_index and i = refill_way and
                  refill_rows_vlid(req_row mod ROW_PER_LINE) = '1')) then
                if read_tag(i, cache_tags(req_index)) = req_tag then
                    hit_way := i;
                    is_hit := true;
                end if;
            end if;
        end loop;

        -- We need to delay stores under some circumstances to avoid
        -- collisions with the refill machine.
        --
        -- Corner case !!! The read acks pipeline takes two extra cycles
        -- which means a store ack can collide with a previous load hit
        -- ack. Thus we stall stores if we have a load ack pending.
        --
        if read_ack_0 = '1' or read_ack_1 = '1' then
            -- Clash with pending read acks, delay..
            store_delay := true;
        elsif state /= IDLE then
            -- If the reload machine is active, we cannot accept a store
            -- for now.
            --
            -- We could improve this a bit by allowing stores if we have sent
            -- all the requests down to litedram (we are only waiting for the
            -- responses) *and* either of those conditions is true:
            --
            --  * It's a miss (doesn't require a write to BRAM) and isn't
            --    for the line being reloaded (otherwise we might reload
            --    stale data into the cache).
            --  * It's a hit on a different way than the one being reloaded
            --    in which case there is no conflict for BRAM access.
            --
            -- Otherwise we delay it...
            --
            store_delay := true;
        else
            store_delay := false;
        end if;

        -- Generate the req op. We only allow OP_LOAD_* when in the
        -- IDLE state as our PLRU and ACK generation rely on this,
        -- stores are allowed in IDLE state.
        --
        req_op <= OP_NONE;
        if valid then
            if wb_req.we = '1' then
                if store_delay then
                    req_op <= OP_STORE_DELAYED;
                elsif is_hit then
                    req_op <= OP_STORE_HIT;
                else
                    req_op <= OP_STORE_MISS;
                end if;
            else
                if is_hit then
                    req_op <= OP_LOAD_HIT;
                else
                    req_op <= OP_LOAD_MISS;
                end if;
            end if;
        end if;
        req_hit_way <= hit_way;
    end process;

    --
    -- Store queue
    --
    -- Holds up to STOREQ_DEPTH pending stores
    store_queue: entity work.sync_fifo
        generic map (
            DEPTH => STOREQ_DEPTH,
            WIDTH => STOREQ_BITS
            )
        port map (
            clk      => system_clk,
            reset    => system_reset,
            rd_ready => storeq_rd_ready,
            rd_valid => storeq_rd_valid,
            rd_data  => storeq_rd_data,
            wr_ready => storeq_wr_ready,
            wr_valid => storeq_wr_valid,
            wr_data  => storeq_wr_data
            );

    -- Store queue control: pushes accepted stores into the FIFO and
    -- drives the LiteDRAM write data channel from the FIFO output.
    storeq_control : process(all)
        variable stq_data : wishbone_data_type;
        variable stq_sel  : wishbone_sel_type;
        variable stq_wsl  : std_ulogic_vector(WB_WSEL_BITS-1 downto 0);
    begin
        -- Entry layout, MSB first: data, sel, word select
        storeq_wr_data <= wb_req.dat & wb_req.sel &
                          wb_req.adr(WB_WSEL_BITS-1 downto 0);

        -- Only queue stores if we can also send a command
        if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
            storeq_wr_valid <= user_port0_cmd_ready;
        else
            storeq_wr_valid <= '0';
        end if;

        -- Unpack the entry at the head of the queue and expand it to
        -- the DRAM port width, same scheme as in request_decode. The
        -- zero fill is sized from WBSL rather than hard coded to 8
        -- select bits.
        stq_data := storeq_rd_data(storeq_rd_data'left downto WBSL+WB_WSEL_BITS);
        stq_sel  := storeq_rd_data(WBSL+WB_WSEL_BITS-1 downto WB_WSEL_BITS);
        stq_wsl  := storeq_rd_data(WB_WSEL_BITS-1 downto 0);
        for i in 0 to WB_WORD_COUNT-1 loop
            if to_integer(unsigned(stq_wsl)) = i then
                user_port0_wdata_we(WBSL*(i+1)-1 downto WBSL*i) <= stq_sel;
            else
                user_port0_wdata_we(WBSL*(i+1)-1 downto WBSL*i) <= (WBSL-1 downto 0 => '0');
            end if;
            user_port0_wdata_data(WBL*(i+1)-1 downto WBL*i) <= stq_data;
        end loop;

        -- Note: Current litedram ignores user_port0_wdata_valid. We
        -- must make sure to always have the data available at the
        -- output of the store queue when we send the write command.
        --
        -- Thankfully this is always the case with this design.
        --
        user_port0_wdata_valid <= storeq_rd_valid;
        storeq_rd_ready <= user_port0_wdata_ready;

        if TRACE then
            if rising_edge(system_clk) then
                -- Only report when the decoded op really is a store;
                -- a plain "else" here would print "Store miss" on
                -- every idle or load cycle as well.
                if req_op = OP_STORE_HIT then
                    report "Store hit to:" &
                        to_hstring(wb_req.adr(DRAM_ABITS downto 0) & "000") &
                        " data:" & to_hstring(req_wdata) &
                        " we:" & to_hstring(req_we) &
                        " V:" & std_ulogic'image(user_port0_cmd_ready);
                elsif req_op = OP_STORE_MISS then
                    report "Store miss to:" &
                        to_hstring(wb_req.adr(DRAM_ABITS downto 0) & "000") &
                        " data:" & to_hstring(req_wdata) &
                        " we:" & to_hstring(req_we) &
                        " V:" & std_ulogic'image(user_port0_cmd_ready);
                end if;
                if storeq_wr_valid = '1' and storeq_wr_ready = '1' then
                    report "storeq push " & to_hstring(storeq_wr_data);
                end if;
                if storeq_rd_valid = '1' and storeq_rd_ready = '1' then
                    report "storeq pop " & to_hstring(storeq_rd_data);
                end if;
            end if;
        end if;
    end process;

    -- LiteDRAM command mux
    dram_commands: process(all)
    begin
        if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
            -- For stores, forward signals directly. Only send command if
            -- the FIFO can accept a store.
            user_port0_cmd_addr <=
                wb_req.adr(DRAM_ABITS + ROW_OFF_BITS - wishbone_log2_width - 1 downto
                           ROW_OFF_BITS - wishbone_log2_width);
            user_port0_cmd_we <= '1';
            user_port0_cmd_valid <= storeq_wr_ready;
        else
            -- For loads, we route via a latch controlled by the refill machine
            user_port0_cmd_addr <= refill_cmd_addr;
            user_port0_cmd_valid <= refill_cmd_valid;
            user_port0_cmd_we <= '0';
        end if;

        -- Note: litedram ignores this signal and assumes we are
        -- always ready to accept read data.
        user_port0_rdata_ready <= '1'; -- Always 1
    end process;

    -- LiteDRAM refill machine
    --
    -- This handles the cache line refills. A refill starts at the row
    -- of the missing request and walks the line with wrap-around, so
    -- the last row fetched is the one just before the starting row.
    --
    refill_machine : process(system_clk)
        variable tagset      : cache_tags_set_t;
        variable cmds_done   : boolean;
        variable wait_qdrain : boolean;
    begin
        if rising_edge(system_clk) then
            -- On reset, clear all valid bits to force misses
            if system_reset = '1' then
                for i in index_t loop
                    cache_valids(i) <= (others => '0');
                end loop;
                state <= IDLE;
                refill_cmd_valid <= '0';
            else
                -- Main state machine
                case state is
                when IDLE =>
                    assert refill_cmd_valid = '0'
                        report "refill cmd valid in IDLE state !"
                        severity failure;

                    -- Reset per-row valid flags, only used in WAIT_ACK
                    for i in 0 to ROW_PER_LINE - 1 loop
                        refill_rows_vlid(i) <= '0';
                    end loop;

                    -- If NO_LS_OVERLAP is set, disallow a load miss if the store
                    -- queue still has data in it.
                    wait_qdrain := false;
                    if NO_LS_OVERLAP then
                        wait_qdrain := storeq_rd_valid = '1';
                    end if;

                    -- We need to read a cache line
                    if req_op = OP_LOAD_MISS and not wait_qdrain then
                        -- Grab way to replace
                        refill_way <= to_integer(unsigned(plru_victim(req_index)));

                        -- Keep track of our index and way for subsequent stores
                        refill_index <= req_index;
                        refill_row <= get_row(req_laddr);
                        refill_end_row <= get_row_of_line(get_row(req_laddr)) - 1;

                        -- Prep for first DRAM read
                        --
                        -- XXX TODO: We could start a cycle early here by using
                        -- combo logic to generate the first command in
                        -- "dram_commands". In fact, we could make refill_cmd_addr
                        -- only contain the "counter" bits and wire it with the
                        -- other bits from req_laddr.
                        refill_cmd_addr <= req_laddr(DRAM_ABITS+ROW_OFF_BITS-1 downto ROW_OFF_BITS);
                        refill_cmd_valid <= '1';

                        if TRACE then
                            report "refill addr " & to_hstring(req_laddr);
                        end if;

                        -- Track that we had one request sent
                        state <= REFILL_CLR_TAG;
                    end if;

                when REFILL_CLR_TAG | REFILL_WAIT_ACK =>

                    -- Delayed tag clearing to help timing on PLRU output
                    if state = REFILL_CLR_TAG then
                        -- Force misses on that way while refilling that line.
                        -- Use the latched refill_index, like the tag write
                        -- below and the valid set at the end of the refill.
                        -- The load miss stalls the request latch, so this is
                        -- the same index as req_index at this point.
                        cache_valids(refill_index)(refill_way) <= '0';

                        -- Store new tag in selected way. req_tag still
                        -- belongs to the stalled miss held in wb_req.
                        for i in 0 to NUM_WAYS-1 loop
                            if i = refill_way then
                                tagset := cache_tags(refill_index);
                                write_tag(i, tagset, req_tag);
                                cache_tags(refill_index) <= tagset;
                            end if;
                        end loop;

                        state <= REFILL_WAIT_ACK;
                    end if;

                    -- Commands are all sent if refill_cmd_valid is 0
                    cmds_done := refill_cmd_valid = '0';

                    -- If we are still sending requests, was one accepted ?
                    if user_port0_cmd_ready = '1' and not cmds_done then
                        -- That was the last word ? We are done sending. Clear
                        -- command valid and set cmds_done so we can handle an
                        -- eventual last ack on the same cycle.
                        --
                        if TRACE then
                            report "got refill cmd ack !";
                        end if;
                        if is_last_row_addr(refill_cmd_addr, refill_end_row) then
                            refill_cmd_valid <= '0';
                            cmds_done := true;
                            if TRACE then
                                report "all refill cmds done !";
                            end if;
                        else
                            -- Calculate the next row address
                            refill_cmd_addr <= next_row_addr(refill_cmd_addr);
                            if TRACE then
                                report "refill addr " &
                                    to_hstring(next_row_addr(refill_cmd_addr));
                            end if;
                        end if;
                    end if;

                    -- Incoming read data processing
                    if user_port0_rdata_valid = '1' then
                        if TRACE then
                            report "got refill data ack !";
                        end if;

                        -- Mark partial line valid
                        refill_rows_vlid(refill_row mod ROW_PER_LINE) <= '1';

                        -- Check for completion
                        if cmds_done and is_last_row(refill_row, refill_end_row) then
                            if TRACE then
                                report "all refill data done !";
                            end if;

                            -- Cache line is now valid
                            cache_valids(refill_index)(refill_way) <= '1';

                            -- We are done
                            state <= IDLE;
                        end if;

                        -- Increment store row counter
                        refill_row <= next_row(refill_row);
                    end if;
                end case;
            end if;
        end if;
    end process;

    -- Optionally instantiate the LiteDRAM trace stub
    may_trace: if LITEDRAM_TRACE generate
        component litedram_trace_stub
        end component;
    begin
        litedram_trace: litedram_trace_stub;
    end generate;

    -- The LiteDRAM core itself. It produces the system clock and reset
    -- (user_clk / user_rst) used by everything above.
    litedram: litedram_core
        port map(
            clk => clk_in,
            rst => rst,
            pll_locked => pll_locked,
            ddram_a => ddram_a,
            ddram_ba => ddram_ba,
            ddram_ras_n => ddram_ras_n,
            ddram_cas_n => ddram_cas_n,
            ddram_we_n => ddram_we_n,
            ddram_cs_n => ddram_cs_n,
            ddram_dm => ddram_dm,
            ddram_dq => ddram_dq,
            ddram_dqs_p => ddram_dqs_p,
            ddram_dqs_n => ddram_dqs_n,
            ddram_clk_p => ddram_clk_p,
            ddram_clk_n => ddram_clk_n,
            ddram_cke => ddram_cke,
            ddram_odt => ddram_odt,
            ddram_reset_n => ddram_reset_n,
            init_done => init_done,
            init_error => init_error,
            user_clk => system_clk,
            user_rst => system_reset,
            wb_ctrl_adr => wb_ctrl_adr,
            wb_ctrl_dat_w => wb_ctrl_dat_w,
            wb_ctrl_dat_r => wb_ctrl_dat_r,
            wb_ctrl_sel => wb_ctrl_sel,
            wb_ctrl_cyc => wb_ctrl_cyc,
            wb_ctrl_stb => wb_ctrl_stb,
            wb_ctrl_ack => wb_ctrl_ack,
            wb_ctrl_we => wb_ctrl_we,
            wb_ctrl_cti => "000",
            wb_ctrl_bte => "00",
            wb_ctrl_err => open,
            user_port_native_0_cmd_valid => user_port0_cmd_valid,
            user_port_native_0_cmd_ready => user_port0_cmd_ready,
            user_port_native_0_cmd_we => user_port0_cmd_we,
            user_port_native_0_cmd_addr => user_port0_cmd_addr,
            user_port_native_0_wdata_valid => user_port0_wdata_valid,
            user_port_native_0_wdata_ready => user_port0_wdata_ready,
            user_port_native_0_wdata_we => user_port0_wdata_we,
            user_port_native_0_wdata_data => user_port0_wdata_data,
            user_port_native_0_rdata_valid => user_port0_rdata_valid,
            user_port_native_0_rdata_ready => user_port0_rdata_ready,
            user_port_native_0_rdata_data => user_port0_rdata_data
            );

end architecture behaviour;