diff --git a/dram_tb.vhdl b/dram_tb.vhdl index f368e2f..11863c0 100644 --- a/dram_tb.vhdl +++ b/dram_tb.vhdl @@ -293,6 +293,43 @@ begin check_data(make_pattern(i)); end loop; + report "Pre-fill a line"; + a(11) := '1'; + clr_acks; + wb_write(add_off(a, 0), x"1111111100000000", x"ff"); + wb_write(add_off(a, 8), x"3333333322222222", x"ff"); + wb_write(add_off(a, 16), x"5555555544444444", x"ff"); + wb_write(add_off(a, 24), x"7777777766666666", x"ff"); + wb_write(add_off(a, 32), x"9999999988888888", x"ff"); + wb_write(add_off(a, 40), x"bbbbbbbbaaaaaaaa", x"ff"); + wb_write(add_off(a, 48), x"ddddddddcccccccc", x"ff"); + wb_write(add_off(a, 56), x"ffffffffeeeeeeee", x"ff"); + wb_write(add_off(a, 64), x"1111111100000000", x"ff"); + wb_write(add_off(a, 72), x"3333333322222222", x"ff"); + wb_write(add_off(a, 80), x"5555555544444444", x"ff"); + wb_write(add_off(a, 88), x"7777777766666666", x"ff"); + wb_write(add_off(a, 96), x"9999999988888888", x"ff"); + wb_write(add_off(a,104), x"bbbbbbbbaaaaaaaa", x"ff"); + wb_write(add_off(a,112), x"ddddddddcccccccc", x"ff"); + wb_write(add_off(a,120), x"ffffffffeeeeeeee", x"ff"); + wait_acks(16); + + report "Scattered from middle of line..."; + clr_acks; + wb_read(add_off(a,24)); + wb_read(add_off(a,32)); + wb_read(add_off(a, 0)); + wb_read(add_off(a,16)); + wait_acks(4); + read_data(d); + assert d = x"7777777766666666" report "bad data (24), got " & to_hstring(d) severity failure; + read_data(d); + assert d = x"9999999988888888" report "bad data (32), got " & to_hstring(d) severity failure; + read_data(d); + assert d = x"1111111100000000" report "bad data (0), got " & to_hstring(d) severity failure; + read_data(d); + assert d = x"5555555544444444" report "bad data (16), got " & to_hstring(d) severity failure; + std.env.finish; end process; end architecture; diff --git a/litedram/extras/litedram-wrapper-l2.vhdl b/litedram/extras/litedram-wrapper-l2.vhdl index 61b4867..5c9eb07 100644 --- a/litedram/extras/litedram-wrapper-l2.vhdl +++ 
b/litedram/extras/litedram-wrapper-l2.vhdl @@ -189,6 +189,7 @@ architecture behaviour of litedram_wrapper is subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; subtype way_t is integer range 0 to NUM_WAYS-1; + subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0); -- The cache data BRAM organized as described above for each way subtype cache_row_t is std_ulogic_vector(DRAM_DBITS-1 downto 0); @@ -207,6 +208,9 @@ architecture behaviour of litedram_wrapper is subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); type cache_valids_t is array(index_t) of cache_way_valids_t; + -- "Temporary" valid bits for the rows of the currently refilled line + type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; + -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs signal cache_tags : cache_tags_array_t; signal cache_valids : cache_valids_t; @@ -233,7 +237,7 @@ architecture behaviour of litedram_wrapper is -- Cache management signals -- - -- Cache state machine + -- Cache state machine type state_t is (IDLE, -- Normal load hit processing REFILL_CLR_TAG, -- Cache refill clear tag REFILL_WAIT_ACK); -- Cache refill wait ack @@ -261,7 +265,8 @@ architecture behaviour of litedram_wrapper is OP_LOAD_HIT, OP_LOAD_MISS, OP_STORE_HIT, - OP_STORE_MISS); + OP_STORE_MISS, + OP_STORE_DELAYED); signal req_index : index_t; signal req_row : row_t; @@ -272,7 +277,6 @@ architecture behaviour of litedram_wrapper is signal req_ad3 : std_ulogic; signal req_we : std_ulogic_vector(DRAM_SBITS-1 downto 0); signal req_wdata : std_ulogic_vector(DRAM_DBITS-1 downto 0); - signal accept_store : std_ulogic; signal stall : std_ulogic; -- Line refill command signals and latches @@ -281,6 +285,8 @@ architecture behaviour of litedram_wrapper is signal refill_way : way_t; signal refill_index : index_t; signal refill_row : row_t; + signal refill_end_row : row_in_line_t; + signal refill_rows_vlid : row_per_line_valid_t; 
-- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; @@ -306,21 +312,25 @@ architecture behaviour of litedram_wrapper is return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; + -- Return the index of a row within a line + function get_row_of_line(row: row_t) return row_in_line_t is + variable row_v : unsigned(ROW_BITS-1 downto 0); + begin + row_v := to_unsigned(row, ROW_BITS); + return row_v(ROW_LINEBITS-1 downto 0); + end; -- Returns whether this is the last row of a line. It takes a DRAM address - function is_last_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS)) + function is_last_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS); + last: row_in_line_t) return boolean is - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); begin - return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; + return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last; end; -- Returns whether this is the last row of a line - function is_last_row(row: row_t) return boolean is - variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row(row: row_t; last: row_in_line_t) return boolean is begin - row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); - return row_v(ROW_LINEBITS-1 downto 0) = ones; + return get_row_of_line(row) = last; end; -- Return the address of the next row in the current cache line. 
It takes a @@ -493,7 +503,7 @@ begin -- -- Write mux: cache refills from DRAM or writes from Wishbone -- - if state = IDLE then + if req_op = OP_STORE_HIT and req_hit_way = i then -- Write from wishbone wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); wr_data <= req_wdata; @@ -570,7 +580,7 @@ begin end generate; -- - -- Wishbone interface: + -- Wishbone request interface: -- -- - Incoming wishbone request latch (to help with timing) -- - Read response pipeline (to match BRAM output buffer delay) @@ -632,7 +642,6 @@ begin -- -- Read response pipeline -- - -- XXX Might have to put store acks in there too (see comment in wb_response) read_pipe: process(system_clk) begin if rising_edge(system_clk) then @@ -670,54 +679,46 @@ begin end if; end process; + -- + -- Wishbone response generation + -- + wb_reponse: process(all) - variable rdata : std_ulogic_vector(DRAM_DBITS-1 downto 0); - variable store_done : std_ulogic; + variable rdata : std_ulogic_vector(DRAM_DBITS-1 downto 0); + variable store_done : std_ulogic; + variable accept_store : std_ulogic; begin - -- Can we accept a store ? This is set when IDLE and the store - -- queue & command queue are not full. + -- Can we accept a store ? This is set when the store queue & command + -- queue are not full. -- - -- Note: This is only used to control the WB request latch, stall - -- and store "early complete". We don't want to use this to control - -- cmd_valid to DRAM as this would create a circular dependency inside - -- LiteDRAM as cmd_ready I think is driven from cmd_valid. + -- This does *not* mean that we will accept the store, there are other + -- reasons to delay them (see OP_STORE_DELAYED). -- - -- The state machine that controls the command queue must thus - -- reproduce this logic at least partially. + -- A store is fully accepted when *both* req_op is not OP_STORE_DELAYED + -- and accept_store is '1'. 
-- - Note also that user_port0_cmd_ready from LiteDRAM is combinational - from user_port0_cmd_valid. IE. we won't know that LiteDRAM cannot - accept a command until we try to send one. + -- The reason for this split is to avoid a circular dependency inside + -- LiteDRAM, since cmd_ready from litedram is driven from cmd_valid (*) + -- we don't want to generate cmd_valid from cmd_ready. So we generate + -- it instead from all the *other* conditions that make a store valid. - if state = IDLE then - accept_store <= user_port0_cmd_ready and storeq_wr_ready; - - -- Corner case !!! The read acks pipeline takes two extra cycles - -- which means a store ack can collide with a previous load hit - -- ack. Thus we stall stores if we have a load ack pending. - if read_ack_0 = '1' or read_ack_1 = '1' then - accept_store <= '0'; - end if; + -- (*) It's my understanding that user_port0_cmd_ready from LiteDRAM is + -- combinational from user_port0_cmd_valid along with a bunch of other + -- internal signals. IE. we won't know that LiteDRAM cannot accept a + -- command until we try to send one. + -- + accept_store := user_port0_cmd_ready and storeq_wr_ready; + + -- Generate stalls. For stores we stall if we can't accept it. + -- For loads, we stall if we are going to take a load miss or + -- are in the middle of a refill and it isn't a partial hit. + if req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then + stall <= not accept_store; + elsif req_op = OP_LOAD_MISS or req_op = OP_STORE_DELAYED then + stall <= '1'; else - accept_store <= '0'; + stall <= '0'; end if; - - -- Generate stalls. For loads, we stall if we are going to take a load - -- miss or are in the middle of a refill. For stores, if we can't - -- accept it. 
- case state is - when IDLE => - case req_op is - when OP_LOAD_MISS => - stall <= '1'; - when OP_STORE_MISS | OP_STORE_HIT => - stall <= not accept_store; - when others => - stall <= '0'; - end case; - when others => - stall <= '1'; - end case; -- Data out mux rdata := cache_out(read_way_1); @@ -736,7 +737,8 @@ begin -- Generate Wishbone ACKs on read hits and store complete -- -- This can happen on store right behind loads ! This is why - -- we don't accept a new store right behind a load ack above. + -- we delay a store when a load ack is in the pipeline in the + -- request decoder below. -- wb_out.ack <= read_ack_1 or store_ack_1; assert read_ack_1 = '0' or store_ack_1 = '0' report @@ -748,27 +750,24 @@ begin -- Cache request decode -- request_decode: process(all) - variable valid : std_ulogic; - variable is_hit : std_ulogic; - variable hit_way : way_t; + variable valid : boolean; + variable is_hit : boolean; + variable store_delay : boolean; + variable hit_way : way_t; begin -- Extract line, row and tag from request req_index <= get_index(wb_req.adr); req_row <= get_row(wb_req.adr(REAL_ADDR_BITS-1 downto 0)); req_tag <= get_tag(wb_req.adr); - -- Calculate address of beginning of cache line, will be + -- Calculate address of beginning of cache row, will be -- used for cache miss processing if needed - req_laddr <= wb_req.adr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + req_laddr <= wb_req.adr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & + (ROW_OFF_BITS-1 downto 0 => '0'); -- Do we have a valid request in the WB latch ? 
- if state = IDLE then - valid := wb_req.cyc and wb_req.stb; - else - valid := '0'; - end if; + valid := wb_req.cyc = '1' and wb_req.stb = '1'; -- Store signals req_ad3 <= wb_req.adr(3); @@ -778,30 +777,67 @@ begin -- Test if pending request is a hit on any way hit_way := 0; - is_hit := '0'; + is_hit := false; for i in way_t loop - if valid = '1' and cache_valids(req_index)(i) = '1' then + if valid and + (cache_valids(req_index)(i) = '1' or + (state = REFILL_WAIT_ACK and + req_index = refill_index and i = refill_way and + refill_rows_vlid(req_row mod ROW_PER_LINE) = '1')) then if read_tag(i, cache_tags(req_index)) = req_tag then hit_way := i; - is_hit := '1'; + is_hit := true; end if; end if; end loop; + -- We need to delay stores under some circumstances to avoid + -- collisions with the refill machine. + -- + -- Corner case !!! The read acks pipeline takes two extra cycles + -- which means a store ack can collide with a previous load hit + -- ack. Thus we stall stores if we have a load ack pending. + -- + if read_ack_0 = '1' or read_ack_1 = '1' then + -- Clash with pending read acks, delay.. + store_delay := true; + elsif state /= IDLE then + -- If the reload machine is active, we cannot accept a store + -- for now. + -- + -- We could improve this a bit by allowing stores if we have sent + -- all the requests down to litedram (we are only waiting for the + -- responses) *and* either of those conditions is true: + -- + -- * It's a miss (doesn't require a write to BRAM) and isn't + -- for the line being reloaded (otherwise we might reload + -- stale data into the cache). + -- * It's a hit on a different way than the one being reloaded + -- in which case there is no conflict for BRAM access. + -- + -- Otherwise we delay it... + -- + store_delay := true; + else + store_delay := false; + end if; + -- Generate the req op. We only allow OP_LOAD_* when in the -- IDLE state as our PLRU and ACK generation rely on this, -- stores are allowed in IDLE state. 
-- req_op <= OP_NONE; - if valid = '1' then + if valid then if wb_req.we = '1' then - if is_hit = '1' then + if store_delay then + req_op <= OP_STORE_DELAYED; + elsif is_hit then req_op <= OP_STORE_HIT; else req_op <= OP_STORE_MISS; end if; else - if is_hit = '1' then + if is_hit then req_op <= OP_LOAD_HIT; else req_op <= OP_LOAD_MISS; @@ -837,7 +873,7 @@ begin begin storeq_wr_data <= wb_req.dat & req_we; - -- Only accept store if we can send a command + -- Only queue stores if we can also send a command if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then storeq_wr_valid <= user_port0_cmd_ready; else @@ -858,13 +894,13 @@ begin to_hstring(wb_req.adr(DRAM_ABITS+3 downto 0)) & " data:" & to_hstring(req_wdata) & " we:" & to_hstring(req_we) & - " V:" & std_ulogic'image(accept_store); + " V:" & std_ulogic'image(user_port0_cmd_ready); else report "Store miss to:" & to_hstring(wb_req.adr(DRAM_ABITS+3 downto 0)) & " data:" & to_hstring(req_wdata) & " we:" & to_hstring(req_we) & - " V:" & std_ulogic'image(accept_store); + " V:" & std_ulogic'image(user_port0_cmd_ready); end if; if storeq_wr_valid = '1' and storeq_wr_ready = '1' then report "storeq push " & to_hstring(storeq_wr_data); @@ -879,9 +915,9 @@ begin -- LiteDRAM command mux dram_commands: process(all) begin - if state = IDLE and (req_op = OP_STORE_HIT or req_op = OP_STORE_MISS) then + if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then -- For stores, forward signals directly. Only send command if - -- the FIFO can accept a store + -- the FIFO can accept a store. user_port0_cmd_addr <= wb_req.adr(DRAM_ABITS+3 downto 4); user_port0_cmd_we <= '1'; user_port0_cmd_valid <= storeq_wr_ready; @@ -918,6 +954,11 @@ begin assert refill_cmd_valid = '0' report "refill cmd valid in IDLE state !" 
severity failure; + -- Reset per-row valid flags, only used in WAIT_ACK + for i in 0 to ROW_PER_LINE - 1 loop + refill_rows_vlid(i) <= '0'; + end loop; + -- If NO_LS_OVERLAP is set, disallow a load miss if the store -- queue still has data in it. wait_qdrain := false; @@ -931,8 +972,9 @@ begin refill_way <= to_integer(unsigned(plru_victim(req_index))); -- Keep track of our index and way for subsequent stores - refill_index <= req_index; - refill_row <= get_row(req_laddr); + refill_index <= req_index; + refill_row <= get_row(req_laddr); + refill_end_row <= get_row_of_line(get_row(req_laddr)) - 1; -- Prep for first DRAM read -- @@ -982,7 +1024,7 @@ begin if TRACE then report "got refill cmd ack !"; end if; - if is_last_row_addr(refill_cmd_addr) then + if is_last_row_addr(refill_cmd_addr, refill_end_row) then refill_cmd_valid <= '0'; cmds_done := true; if TRACE then @@ -1003,8 +1045,12 @@ begin if TRACE then report "got refill data ack !"; end if; + + -- Mark partial line valid + refill_rows_vlid(refill_row mod ROW_PER_LINE) <= '1'; + -- Check for completion - if cmds_done and is_last_row(refill_row) then + if cmds_done and is_last_row(refill_row, refill_end_row) then if TRACE then report "all refill data done !"; end if; diff --git a/litedram/extras/wave_tb.gtkw b/litedram/extras/wave_tb.gtkw index fcdf6a9..c675d7b 100644 --- a/litedram/extras/wave_tb.gtkw +++ b/litedram/extras/wave_tb.gtkw @@ -1,17 +1,18 @@ [*] [*] GTKWave Analyzer v3.3.86 (w)1999-2017 BSI -[*] Sun May 31 12:53:52 2020 +[*] Mon Jun 22 06:32:16 2020 [*] [dumpfile] "/home/ANT.AMAZON.COM/benh/hackplace/microwatt/foo.ghw" -[dumpfile_mtime] "Sun May 31 12:50:15 2020" -[dumpfile_size] 1134118 +[dumpfile_mtime] "Mon Jun 22 06:28:35 2020" +[dumpfile_size] 1680014 [savefile] "/home/ANT.AMAZON.COM/benh/hackplace/microwatt/litedram/extras/wave_tb.gtkw" -[timestart] 1312950000 +[timestart] 1920580000 [size] 2509 1371 [pos] -1 -1 -*-24.248457 1386890000 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 
-1 -1 -1 -1 -1 -1 +*-24.248457 1935000000 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 [treeopen] top. [treeopen] top.dram_tb. +[treeopen] top.dram_tb.dram. [sst_width] 301 [signals_width] 433 [sst_expanded] 1 @@ -26,7 +27,6 @@ top.dram_tb.clk @22 #{top.dram_tb.wb_in.dat[63:0]} top.dram_tb.wb_in.dat[63] top.dram_tb.wb_in.dat[62] top.dram_tb.wb_in.dat[61] top.dram_tb.wb_in.dat[60] top.dram_tb.wb_in.dat[59] top.dram_tb.wb_in.dat[58] top.dram_tb.wb_in.dat[57] top.dram_tb.wb_in.dat[56] top.dram_tb.wb_in.dat[55] top.dram_tb.wb_in.dat[54] top.dram_tb.wb_in.dat[53] top.dram_tb.wb_in.dat[52] top.dram_tb.wb_in.dat[51] top.dram_tb.wb_in.dat[50] top.dram_tb.wb_in.dat[49] top.dram_tb.wb_in.dat[48] top.dram_tb.wb_in.dat[47] top.dram_tb.wb_in.dat[46] top.dram_tb.wb_in.dat[45] top.dram_tb.wb_in.dat[44] top.dram_tb.wb_in.dat[43] top.dram_tb.wb_in.dat[42] top.dram_tb.wb_in.dat[41] top.dram_tb.wb_in.dat[40] top.dram_tb.wb_in.dat[39] top.dram_tb.wb_in.dat[38] top.dram_tb.wb_in.dat[37] top.dram_tb.wb_in.dat[36] top.dram_tb.wb_in.dat[35] top.dram_tb.wb_in.dat[34] top.dram_tb.wb_in.dat[33] top.dram_tb.wb_in.dat[32] top.dram_tb.wb_in.dat[31] top.dram_tb.wb_in.dat[30] top.dram_tb.wb_in.dat[29] top.dram_tb.wb_in.dat[28] top.dram_tb.wb_in.dat[27] top.dram_tb.wb_in.dat[26] top.dram_tb.wb_in.dat[25] top.dram_tb.wb_in.dat[24] top.dram_tb.wb_in.dat[23] top.dram_tb.wb_in.dat[22] top.dram_tb.wb_in.dat[21] top.dram_tb.wb_in.dat[20] top.dram_tb.wb_in.dat[19] top.dram_tb.wb_in.dat[18] top.dram_tb.wb_in.dat[17] top.dram_tb.wb_in.dat[16] top.dram_tb.wb_in.dat[15] top.dram_tb.wb_in.dat[14] top.dram_tb.wb_in.dat[13] top.dram_tb.wb_in.dat[12] top.dram_tb.wb_in.dat[11] top.dram_tb.wb_in.dat[10] top.dram_tb.wb_in.dat[9] top.dram_tb.wb_in.dat[8] top.dram_tb.wb_in.dat[7] top.dram_tb.wb_in.dat[6] top.dram_tb.wb_in.dat[5] top.dram_tb.wb_in.dat[4] top.dram_tb.wb_in.dat[3] top.dram_tb.wb_in.dat[2] top.dram_tb.wb_in.dat[1] top.dram_tb.wb_in.dat[0] #{top.dram_tb.wb_in.adr[31:0]} 
top.dram_tb.wb_in.adr[31] top.dram_tb.wb_in.adr[30] top.dram_tb.wb_in.adr[29] top.dram_tb.wb_in.adr[28] top.dram_tb.wb_in.adr[27] top.dram_tb.wb_in.adr[26] top.dram_tb.wb_in.adr[25] top.dram_tb.wb_in.adr[24] top.dram_tb.wb_in.adr[23] top.dram_tb.wb_in.adr[22] top.dram_tb.wb_in.adr[21] top.dram_tb.wb_in.adr[20] top.dram_tb.wb_in.adr[19] top.dram_tb.wb_in.adr[18] top.dram_tb.wb_in.adr[17] top.dram_tb.wb_in.adr[16] top.dram_tb.wb_in.adr[15] top.dram_tb.wb_in.adr[14] top.dram_tb.wb_in.adr[13] top.dram_tb.wb_in.adr[12] top.dram_tb.wb_in.adr[11] top.dram_tb.wb_in.adr[10] top.dram_tb.wb_in.adr[9] top.dram_tb.wb_in.adr[8] top.dram_tb.wb_in.adr[7] top.dram_tb.wb_in.adr[6] top.dram_tb.wb_in.adr[5] top.dram_tb.wb_in.adr[4] top.dram_tb.wb_in.adr[3] top.dram_tb.wb_in.adr[2] top.dram_tb.wb_in.adr[1] top.dram_tb.wb_in.adr[0] -@23 #{top.dram_tb.wb_in.sel[7:0]} top.dram_tb.wb_in.sel[7] top.dram_tb.wb_in.sel[6] top.dram_tb.wb_in.sel[5] top.dram_tb.wb_in.sel[4] top.dram_tb.wb_in.sel[3] top.dram_tb.wb_in.sel[2] top.dram_tb.wb_in.sel[1] top.dram_tb.wb_in.sel[0] @28 top.dram_tb.wb_in.cyc @@ -67,6 +67,9 @@ top.dram_tb.dram.user_port0_cmd_valid top.dram_tb.dram.refill_cmd_valid @420 top.dram_tb.dram.req_index +@421 +top.dram_tb.dram.req_row +@420 top.dram_tb.dram.req_hit_way @28 top.dram_tb.dram.req_ad3