From b5959632332cefbcb12537580710ba706fc79cf5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jun 2020 14:23:50 +1000 Subject: [PATCH] dcache: Reduce latencies and improve timing This implements various improvements to the dcache with the aim of making it go faster. - We can now execute operations that don't need to access main memory (cacheable loads that hit in the cache and TLB operations) as soon as any previous operation has completed, without waiting for the state machine to become idle. - Cache line refills start with the doubleword that is needed to satisfy the load that initiated them. - Cacheable loads that miss return their data and complete as soon as the requested doubleword comes back from memory; they don't wait for the refill to finish. - We now have per-doubleword valid bits for the cache line being refilled, meaning that if a load comes in for a line that is in the process of being refilled, we can return the data and complete it within a couple of cycles of the doubleword coming in from memory. - There is now a bypass path for data being written to the cache RAM so that we can do a store hit followed immediately by a load hit to the same doubleword. This also makes the data from a refill available to load hits one cycle earlier than it would be otherwise. - Stores complete in the cycle where their wishbone operation is initiated, without waiting for the wishbone cycle to complete. - During the wishbone cycle for a store, if another store comes in that is to the same page, and we don't have a stall from the wishbone, we can send out the write for the second store in the same wishbone cycle and without going through the IDLE state first. We limit it to 7 outstanding writes that have not yet been acknowledged. - The cache tag RAM is now read on a clock edge rather than being combinatorial for reading. Its width is rounded up to a multiple of 8 bits per way so that byte enables can be used for writing individual tags. 
- The cache tag RAM is now written a cycle later than previously, in order to ease timing. - Data for a store hit is now written one cycle later than previously. This eases timing since we don't have to get through the tag matching and on to the write enable within a single cycle. The 2-stage bypass path means we can still handle a load hit on either of the two cycles after the store and return the correct data. (A load hit 3 or more cycles later will get the correct data from the BRAM.) - Operations can sit in r0 while there is an uncompleted operation in r1. Once the operation in r1 is completed, the operation in r0 spends one cycle in r0 for TLB/cache tag lookup and then gets put into r1.req. This can happen before r1 gets to the IDLE state. Some operations can then be completed before r1 gets to the IDLE state - a load miss to the cache line being refilled, or a store to the same page as a previous store. Signed-off-by: Paul Mackerras --- dcache.vhdl | 705 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 435 insertions(+), 270 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 7a8c0ba..bc351b0 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -83,6 +83,8 @@ architecture rtl of dcache is constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; -- TAG_BITS is the number of bits of the tag part of the address constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; + -- TAG_WIDTH is the width in bits of each way of the tag RAM + constant TAG_WIDTH : natural := TAG_BITS + 7 - ((TAG_BITS + 7) mod 8); -- WAY_BITS is the number of bits to select a way constant WAY_BITS : natural := log2(NUM_WAYS); @@ -100,6 +102,7 @@ architecture rtl of dcache is subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; subtype way_t is integer range 0 to NUM_WAYS-1; + subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0); -- The cache data BRAM organized as described above for each way subtype 
cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0); @@ -110,17 +113,19 @@ architecture rtl of dcache is subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); -- type cache_tags_set_t is array(way_t) of cache_tag_t; -- type cache_tags_array_t is array(index_t) of cache_tags_set_t; - constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS; + constant TAG_RAM_WIDTH : natural := TAG_WIDTH * NUM_WAYS; subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0); type cache_tags_array_t is array(index_t) of cache_tags_set_t; -- The cache valid bits subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); type cache_valids_t is array(index_t) of cache_way_valids_t; + type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs - signal cache_tags : cache_tags_array_t; - signal cache_valids : cache_valids_t; + signal cache_tags : cache_tags_array_t; + signal cache_tag_set : cache_tags_set_t; + signal cache_valids : cache_valids_t; attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; @@ -177,18 +182,17 @@ architecture rtl of dcache is -- Type of operation on a "valid" input type op_t is (OP_NONE, + OP_BAD, -- NC cache hit, TLB miss, prot/RC failure + OP_STCX_FAIL, -- conditional store w/o reservation OP_LOAD_HIT, -- Cache hit on load OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable load - OP_BAD, -- BAD: Cache hit on NC load/store - OP_TLB_ERR, -- TLB miss or protection/RC failure OP_STORE_HIT, -- Store hitting cache OP_STORE_MISS); -- Store missing cache -- Cache state machine type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack - FINISH_LD_MISS, -- Extra cycle after load miss STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack @@ -218,38 +222,63 @@ architecture rtl of dcache is end record; signal r0 : reg_stage_0_t; - signal 
r0_valid : std_ulogic; - + signal r0_full : std_ulogic; + + type mem_access_request_t is record + op : op_t; + dcbz : std_ulogic; + real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + data : std_ulogic_vector(63 downto 0); + byte_sel : std_ulogic_vector(7 downto 0); + hit_way : way_t; + repl_way : way_t; + end record; + -- First stage register, contains state for stage 1 of load hits -- and for the state machine used by all other operations -- type reg_stage_1_t is record - -- Latch the complete request from ls1 - req : Loadstore1ToDcacheType; - mmu_req : std_ulogic; + -- Info about the request + full : std_ulogic; -- have uncompleted request + mmu_req : std_ulogic; -- request is from MMU + req : mem_access_request_t; -- Cache hit state hit_way : way_t; hit_load_valid : std_ulogic; - -- Data buffer for "slow" read ops (load miss and NC loads). - slow_data : std_ulogic_vector(63 downto 0); - slow_valid : std_ulogic; - - -- Signal to complete a failed stcx. - stcx_fail : std_ulogic; + -- 2-stage data buffer for data forwarded from writes to reads + forward_data1 : std_ulogic_vector(63 downto 0); + forward_data2 : std_ulogic_vector(63 downto 0); + forward_sel1 : std_ulogic_vector(7 downto 0); + forward_valid1 : std_ulogic; + forward_way1 : way_t; + forward_row1 : row_t; + use_forward1 : std_ulogic; + forward_sel : std_ulogic_vector(7 downto 0); -- Cache miss state (reload state machine) state : state_t; + dcbz : std_ulogic; + write_bram : std_ulogic; + write_tag : std_ulogic; + slow_valid : std_ulogic; wb : wishbone_master_out; + reload_tag : cache_tag_t; store_way : way_t; store_row : row_t; store_index : index_t; + end_row_ix : row_in_line_t; + rows_valid : row_per_line_valid_t; + acks_pending : unsigned(2 downto 0); -- Signals to complete with error error_done : std_ulogic; cache_paradox : std_ulogic; + -- Signal to complete a failed stcx. 
+ stcx_fail : std_ulogic; + -- completion signal for tlbie tlbie_done : std_ulogic; end record; @@ -272,7 +301,6 @@ architecture rtl of dcache is signal req_tag : cache_tag_t; signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); - signal req_laddr : std_ulogic_vector(63 downto 0); signal early_req_row : row_t; @@ -280,6 +308,12 @@ architecture rtl of dcache is signal set_rsrv : std_ulogic; signal clear_rsrv : std_ulogic; + signal r0_valid : std_ulogic; + signal r0_stall : std_ulogic; + + signal use_forward1_next : std_ulogic; + signal use_forward2_next : std_ulogic; + -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -305,6 +339,7 @@ architecture rtl of dcache is signal perm_attr : perm_attr_t; signal rc_ok : std_ulogic; signal perm_ok : std_ulogic; + signal access_ok : std_ulogic; -- TLB PLRU output interface type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); @@ -315,31 +350,35 @@ architecture rtl of dcache is -- -- Return the cache line index (tag index) for an address - function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is + function get_index(addr: std_ulogic_vector) return index_t is begin return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))); end; -- Return the cache row index (data memory) for an address - function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is + function get_row(addr: std_ulogic_vector) return row_t is begin return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; + -- Return the index of a row within a line + function get_row_of_line(row: row_t) return row_in_line_t is + variable row_v : unsigned(ROW_BITS-1 downto 0); + begin + row_v := to_unsigned(row, ROW_BITS); + return row_v(ROW_LINEBITS-1 downto 0); + end; + -- Returns whether this is the last row of a line - function is_last_row_addr(addr: wishbone_addr_type) return boolean is - constant 
ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is begin - return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; + return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last; end; -- Returns whether this is the last row of a line - function is_last_row(row: row_t) return boolean is - variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row(row: row_t; last: row_in_line_t) return boolean is begin - row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); - return row_v(ROW_LINEBITS-1 downto 0) = ones; + return get_row_of_line(row) = last; end; -- Return the address of the next row in the current cache line @@ -371,7 +410,7 @@ architecture rtl of dcache is end; -- Get the tag value from the address - function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)) return cache_tag_t is + function get_tag(addr: std_ulogic_vector) return cache_tag_t is begin return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS); end; @@ -379,14 +418,7 @@ architecture rtl of dcache is -- Read a tag from a tag memory row function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is begin - return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS); - end; - - -- Write a tag to tag memory row - procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t; - tag: cache_tag_t) is - begin - tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; + return tagset(way * TAG_WIDTH + TAG_BITS - 1 downto way * TAG_WIDTH); end; -- Read a TLB tag from a TLB tag memory row @@ -439,38 +471,42 @@ begin report "geometry bits don't add up" severity FAILURE; assert (64 = wishbone_data_bits) report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; + assert SET_SIZE_BITS <= TLB_LG_PGSZ report "Set indexed by virtual address" 
severity FAILURE; -- Latch the request in r0.req as long as we're not stalling stage_0 : process(clk) + variable r : reg_stage_0_t; begin if rising_edge(clk) then + assert (d_in.valid and m_in.valid) = '0' report + "request collision loadstore vs MMU"; + if m_in.valid = '1' then + r.req.valid := '1'; + r.req.load := not (m_in.tlbie or m_in.tlbld); + r.req.dcbz := '0'; + r.req.nc := '0'; + r.req.reserve := '0'; + r.req.virt_mode := '0'; + r.req.priv_mode := '1'; + r.req.addr := m_in.addr; + r.req.data := m_in.pte; + r.req.byte_sel := (others => '1'); + r.tlbie := m_in.tlbie; + r.doall := m_in.doall; + r.tlbld := m_in.tlbld; + r.mmu_req := '1'; + else + r.req := d_in; + r.tlbie := '0'; + r.doall := '0'; + r.tlbld := '0'; + r.mmu_req := '0'; + end if; if rst = '1' then - r0.req.valid <= '0'; - elsif stall_out = '0' then - assert (d_in.valid and m_in.valid) = '0' report - "request collision loadstore vs MMU"; - if m_in.valid = '1' then - r0.req.valid <= '1'; - r0.req.load <= not (m_in.tlbie or m_in.tlbld); - r0.req.dcbz <= '0'; - r0.req.nc <= '0'; - r0.req.reserve <= '0'; - r0.req.virt_mode <= '0'; - r0.req.priv_mode <= '1'; - r0.req.addr <= m_in.addr; - r0.req.data <= m_in.pte; - r0.req.byte_sel <= (others => '1'); - r0.tlbie <= m_in.tlbie; - r0.doall <= m_in.doall; - r0.tlbld <= m_in.tlbld; - r0.mmu_req <= '1'; - else - r0.req <= d_in; - r0.tlbie <= '0'; - r0.doall <= '0'; - r0.tlbld <= '0'; - r0.mmu_req <= '0'; - end if; + r0_full <= '0'; + elsif r1.full = '0' or r0_full = '0' then + r0 <= r; + r0_full <= r.req.valid; end if; end if; end process; @@ -478,9 +514,10 @@ begin -- we don't yet handle collisions between loadstore1 requests and MMU requests m_out.stall <= '0'; - -- Hold off the request in r0 when stalling, - -- and cancel it if we get an error in a previous request. 
- r0_valid <= r0.req.valid and not stall_out and not r1.error_done; + -- Hold off the request in r0 when r1 has an uncompleted request + r0_stall <= r0_full and r1.full; + r0_valid <= r0_full and not r1.full; + stall_out <= r0_stall; -- TLB -- Operates in the second cycle on the request latched in r0.req. @@ -490,20 +527,19 @@ begin variable addrbits : std_ulogic_vector(TLB_SET_BITS - 1 downto 0); begin if rising_edge(clk) then - if stall_out = '1' then - -- keep reading the same thing while stalled - index := tlb_req_index; + if m_in.valid = '1' then + addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); else - if m_in.valid = '1' then - addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); - else - addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); - end if; - index := to_integer(unsigned(addrbits)); + addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); + end if; + index := to_integer(unsigned(addrbits)); + -- If we have any op and the previous op isn't finished, + -- then keep the same output for next cycle. 
+ if r0_stall = '0' then + tlb_valid_way <= dtlb_valids(index); + tlb_tag_way <= dtlb_tags(index); + tlb_pte_way <= dtlb_ptes(index); end if; - tlb_valid_way <= dtlb_valids(index); - tlb_tag_way <= dtlb_tags(index); - tlb_pte_way <= dtlb_ptes(index); end if; end process; @@ -569,10 +605,12 @@ begin valid_ra <= tlb_hit or not r0.req.virt_mode; if r0.req.virt_mode = '1' then ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & - r0.req.addr(TLB_LG_PGSZ - 1 downto 0); + r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) & + (ROW_OFF_BITS-1 downto 0 => '0'); perm_attr <= extract_perm_attr(pte); else - ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto 0); + ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & + (ROW_OFF_BITS-1 downto 0 => '0'); perm_attr <= real_mode_perm_attr; end if; end process; @@ -652,35 +690,45 @@ begin end generate; end generate; + -- Cache tag RAM read port + cache_tag_read : process(clk) + variable index : index_t; + begin + if rising_edge(clk) then + if r0_stall = '1' then + index := req_index; + elsif m_in.valid = '1' then + index := get_index(m_in.addr); + else + index := get_index(d_in.addr); + end if; + cache_tag_set <= cache_tags(index); + end if; + end process; + -- Cache request parsing and hit detection dcache_request : process(all) - variable is_hit : std_ulogic; - variable hit_way : way_t; - variable op : op_t; - variable opsel : std_ulogic_vector(2 downto 0); - variable go : std_ulogic; - variable nc : std_ulogic; - variable s_hit : std_ulogic; - variable s_tag : cache_tag_t; - variable s_pte : tlb_pte_t; - variable s_ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); - variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable is_hit : std_ulogic; + variable hit_way : way_t; + variable op : op_t; + variable opsel : std_ulogic_vector(2 downto 0); + variable go : std_ulogic; + variable nc : std_ulogic; + variable s_hit : std_ulogic; + variable s_tag : cache_tag_t; + variable s_pte : tlb_pte_t; + variable s_ra : 
std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable hit_way_set : hit_way_set_t; + variable rel_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable rel_match : std_ulogic; begin -- Extract line, row and tag from request req_index <= get_index(r0.req.addr); req_row <= get_row(r0.req.addr); req_tag <= get_tag(ra); - -- Only do anything if not being stalled by stage 1 - go := r0_valid and not (r0.tlbie or r0.tlbld); - - -- Calculate address of beginning of cache line, will be - -- used for cache miss processing if needed - -- - req_laddr <= (63 downto REAL_ADDR_BITS => '0') & - ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.error_done; -- Test if pending request is a hit on any way -- In order to make timing in virtual mode, when we are using the TLB, @@ -688,7 +736,9 @@ begin -- the TLB, and then decide later which match to use. 
hit_way := 0; is_hit := '0'; + rel_match := '0'; if r0.req.virt_mode = '1' then + rel_matches := (others => '0'); for j in tlb_way_t loop hit_way_set(j) := 0; s_hit := '0'; @@ -698,27 +748,61 @@ begin s_tag := get_tag(s_ra); for i in way_t loop if go = '1' and cache_valids(req_index)(i) = '1' and - read_tag(i, cache_tags(req_index)) = s_tag and + read_tag(i, cache_tag_set) = s_tag and tlb_valid_way(j) = '1' then hit_way_set(j) := i; s_hit := '1'; end if; end loop; hit_set(j) := s_hit; + if s_tag = r1.reload_tag then + rel_matches(j) := '1'; + end if; end loop; if tlb_hit = '1' then is_hit := hit_set(tlb_hit_way); hit_way := hit_way_set(tlb_hit_way); + rel_match := rel_matches(tlb_hit_way); end if; else - s_tag := get_tag(r0.req.addr(REAL_ADDR_BITS - 1 downto 0)); + s_tag := get_tag(r0.req.addr); for i in way_t loop if go = '1' and cache_valids(req_index)(i) = '1' and - read_tag(i, cache_tags(req_index)) = s_tag then + read_tag(i, cache_tag_set) = s_tag then hit_way := i; is_hit := '1'; end if; end loop; + if s_tag = r1.reload_tag then + rel_match := '1'; + end if; + end if; + + -- See if the request matches the line currently being reloaded + if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and + rel_match = '1' then + -- For a store, consider this a hit even if the row isn't valid + -- since it will be by the time we perform the store. + -- For a load, check the appropriate row valid bit. + is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE); + hit_way := r1.store_way; + end if; + + -- Whether to use forwarded data for a load or not + use_forward1_next <= '0'; + if get_row(r1.req.real_addr) = req_row and r1.req.hit_way = hit_way then + -- Only need to consider r1.write_bram here, since if we are + -- writing refill data here, then we don't have a cache hit this + -- cycle on the line being refilled. 
(There is the possibility + -- that the load following the load miss that started the refill + -- could be to the old contents of the victim line, since it is a + -- couple of cycles after the refill starts before we see the + -- updated cache tag. In that case we don't use the bypass.) + use_forward1_next <= r1.write_bram; + end if; + use_forward2_next <= '0'; + if r1.forward_row1 = req_row and r1.forward_way1 = hit_way then + use_forward2_next <= r1.forward_valid1; end if; -- The way that matched on a hit @@ -732,6 +816,7 @@ begin rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed); perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and (perm_attr.wr_perm or (r0.req.load and perm_attr.rd_perm)); + access_ok <= valid_ra and perm_ok and rc_ok; -- Combine the request and cache hit status to decide what -- operation needs to be done @@ -739,7 +824,11 @@ begin nc := r0.req.nc or perm_attr.nocache; op := OP_NONE; if go = '1' then - if valid_ra = '1' and rc_ok = '1' and perm_ok = '1' then + if access_ok = '0' then + op := OP_BAD; + elsif cancel_store = '1' then + op := OP_STCX_FAIL; + else opsel := r0.req.load & nc & is_hit; case opsel is when "101" => op := OP_LOAD_HIT; @@ -752,8 +841,6 @@ begin when "111" => op := OP_BAD; when others => op := OP_NONE; end case; - else - op := OP_TLB_ERR; end if; end if; req_op <= op; @@ -762,7 +849,7 @@ begin -- in the cases where we need to read the cache data BRAM. -- If we're stalling then we need to keep reading the last -- row requested. 
- if stall_out = '0' then + if r0_stall = '0' then if m_in.valid = '1' then early_req_row <= get_row(m_in.addr); else @@ -776,9 +863,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Generate stalls from stage 1 state machine - stall_out <= '1' when r1.state /= IDLE else '0'; - -- Handle load-with-reservation and store-conditional instructions reservation_comb: process(all) begin @@ -805,11 +889,15 @@ begin reservation_reg: process(clk) begin if rising_edge(clk) then - if rst = '1' or clear_rsrv = '1' then + if rst = '1' then reservation.valid <= '0'; - elsif set_rsrv = '1' then - reservation.valid <= '1'; - reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); + elsif r0_valid = '1' and access_ok = '1' then + if clear_rsrv = '1' then + reservation.valid <= '0'; + elsif set_rsrv = '1' then + reservation.valid <= '1'; + reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); + end if; end if; end if; end process; @@ -817,11 +905,28 @@ begin -- Return data for loads & completion control logic -- writeback_control: process(all) + variable data_out : std_ulogic_vector(63 downto 0); + variable data_fwd : std_ulogic_vector(63 downto 0); + variable j : integer; begin + -- Use the bypass if we are reading the row that was written 1 or 2 cycles + -- ago, including for the slow_valid = 1 case (i.e. completing a load + -- miss or a non-cacheable load). + if r1.use_forward1 = '1' then + data_fwd := r1.forward_data1; + else + data_fwd := r1.forward_data2; + end if; + data_out := cache_out(r1.hit_way); + for i in 0 to 7 loop + j := i * 8; + if r1.forward_sel(i) = '1' then + data_out(j + 7 downto j) := data_fwd(j + 7 downto j); + end if; + end loop; - -- The mux on d_out.data defaults to the normal load hit case. 
d_out.valid <= '0'; - d_out.data <= cache_out(r1.hit_way); + d_out.data <= data_out; d_out.store_done <= '0'; d_out.error <= '0'; d_out.cache_paradox <= '0'; @@ -829,7 +934,7 @@ begin -- Outputs to MMU m_out.done <= r1.tlbie_done; m_out.err <= '0'; - m_out.data <= cache_out(r1.hit_way); + m_out.data <= data_out; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -853,7 +958,7 @@ begin -- Request came from loadstore1... -- Load hit case is the standard path if r1.hit_load_valid = '1' then - report "completing load hit"; + report "completing load hit data=" & to_hstring(data_out); d_out.valid <= '1'; end if; @@ -867,16 +972,8 @@ begin -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then - -- If it's a load, enable register writeback and switch - -- mux accordingly - -- - if r1.req.load then - -- Read data comes from the slow data latch - d_out.data <= r1.slow_data; - end if; d_out.store_done <= '1'; - - report "completing store or load miss"; + report "completing store or load miss data=" & to_hstring(data_out); d_out.valid <= '1'; end if; @@ -901,8 +998,6 @@ begin -- Slow ops (i.e. load miss) if r1.slow_valid = '1' then - -- Read data comes from the slow data latch - m_out.data <= r1.slow_data; report "completing MMU load miss, data=" & to_hstring(m_out.data); m_out.done <= '1'; end if; @@ -946,8 +1041,6 @@ begin wr_data => wr_data ); process(all) - variable tmp_adr : std_ulogic_vector(63 downto 0); - variable reloading : boolean; begin -- Cache hit reads do_read <= '1'; @@ -959,43 +1052,40 @@ begin -- Defaults to wishbone read responses (cache refill), -- -- For timing, the mux on wr_data/sel/addr is not dependent on anything - -- other than the current state. Only the do_write signal is. + -- other than the current state. 
-- - if r1.state = IDLE then - -- In IDLE state, the only write path is the store-hit update case - wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= r0.req.data; - wr_sel <= r0.req.byte_sel; + wr_sel_m <= (others => '0'); + + do_write <= '0'; + if r1.write_bram = '1' then + -- Write store data to BRAM. This happens one cycle after the + -- store is in r0. + wr_data <= r1.req.data; + wr_sel <= r1.req.byte_sel; + wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.req.real_addr), ROW_BITS)); + if i = r1.req.hit_way then + do_write <= '1'; + end if; else -- Otherwise, we might be doing a reload or a DCBZ - if r1.req.dcbz = '1' then + if r1.dcbz = '1' then wr_data <= (others => '0'); else wr_data <= wishbone_in.dat; end if; - wr_sel <= (others => '1'); - wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); - end if; + wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); + wr_sel <= (others => '1'); - -- The two actual write cases here - do_write <= '0'; - reloading := r1.state = RELOAD_WAIT_ACK; - if reloading and wishbone_in.ack = '1' and r1.store_way = i then - do_write <= '1'; - end if; - if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' and - r0.req.dcbz = '0' then - assert not reloading report "Store hit while in state:" & - state_t'image(r1.state) - severity FAILURE; - do_write <= '1'; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r1.store_way = i then + do_write <= '1'; + end if; end if; -- Mask write selects with do_write since BRAM doesn't -- have a global write-enable - for i in 0 to ROW_SIZE-1 loop - wr_sel_m(i) <= wr_sel(i) and do_write; - end loop; + if do_write = '1' then + wr_sel_m <= wr_sel; + end if; end process; end generate; @@ -1007,15 +1097,7 @@ begin dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- If we have a request incoming, we have to latch it as r0.req.valid - -- is only set for a single cycle. 
It's up to the control logic to - -- ensure we don't override an uncompleted request (for now we are - -- single issue on load/stores so we are fine, later, we can generate - -- a stall output if necessary). - - if req_op /= OP_NONE and stall_out = '0' then - r1.req <= r0.req; - r1.mmu_req <= r0.mmu_req; + if req_op /= OP_NONE then report "op:" & op_t'image(req_op) & " addr:" & to_hstring(r0.req.addr) & " nc:" & std_ulogic'image(r0.req.nc) & @@ -1023,8 +1105,11 @@ begin " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); end if; + if r0_valid = '1' then + r1.mmu_req <= r0.mmu_req; + end if; - -- Fast path for load/store hits. Set signals for the writeback controls. + -- Fast path for load/store hits. Set signals for the writeback controls. if req_op = OP_LOAD_HIT then r1.hit_way <= req_hit_way; r1.hit_load_valid <= '1'; @@ -1032,27 +1117,29 @@ begin r1.hit_load_valid <= '0'; end if; - if req_op = OP_TLB_ERR then + if req_op = OP_BAD then report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); r1.error_done <= '1'; - r1.cache_paradox <= '0'; - elsif req_op = OP_BAD then - report "Signalling cache paradox"; - r1.error_done <= '1'; - r1.cache_paradox <= '1'; + r1.cache_paradox <= access_ok; else r1.error_done <= '0'; r1.cache_paradox <= '0'; end if; + if req_op = OP_STCX_FAIL then + r1.stcx_fail <= '1'; + else + r1.stcx_fail <= '0'; + end if; + -- complete tlbies and TLB loads in the third cycle r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); end if; end process; -- - -- Every other case is handled by this state machine: + -- Memory accesses are handled by this state machine: -- -- * Cache load miss/reload (in conjunction with "rams") -- * Load hits for non-cachable forms @@ -1062,16 +1149,45 @@ begin -- operates at stage 1. 
-- dcache_slow : process(clk) - variable tagset : cache_tags_set_t; variable stbs_done : boolean; + variable req : mem_access_request_t; + variable acks : unsigned(2 downto 0); begin if rising_edge(clk) then + r1.use_forward1 <= use_forward1_next; + r1.forward_sel <= (others => '0'); + if use_forward1_next = '1' then + r1.forward_sel <= r1.req.byte_sel; + elsif use_forward2_next = '1' then + r1.forward_sel <= r1.forward_sel1; + end if; + + r1.forward_data2 <= r1.forward_data1; + if r1.write_bram = '1' then + r1.forward_data1 <= r1.req.data; + r1.forward_sel1 <= r1.req.byte_sel; + r1.forward_way1 <= r1.req.hit_way; + r1.forward_row1 <= get_row(r1.req.real_addr); + r1.forward_valid1 <= '1'; + else + if r1.dcbz = '1' then + r1.forward_data1 <= (others => '0'); + else + r1.forward_data1 <= wishbone_in.dat; + end if; + r1.forward_sel1 <= (others => '1'); + r1.forward_way1 <= r1.store_way; + r1.forward_row1 <= r1.store_row; + r1.forward_valid1 <= '0'; + end if; + -- On reset, clear all valid bits to force misses if rst = '1' then for i in index_t loop cache_valids(i) <= (others => '0'); end loop; r1.state <= IDLE; + r1.full <= '0'; r1.slow_valid <= '0'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; @@ -1081,44 +1197,77 @@ begin else -- One cycle pulses reset r1.slow_valid <= '0'; - r1.stcx_fail <= '0'; + r1.write_bram <= '0'; + + if r1.write_tag = '1' then + -- Store new tag in selected way + for i in 0 to NUM_WAYS-1 loop + if i = r1.store_way then + cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <= + (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag; + end if; + end loop; + r1.write_tag <= '0'; + end if; + + -- Take request from r1.req if there is one there, + -- else from req_op, ra, etc. 
+ if r1.full = '1' then + req := r1.req; + else + req.op := req_op; + req.dcbz := r0.req.dcbz; + req.real_addr := ra; + req.data := r0.req.data; + req.byte_sel := r0.req.byte_sel; + req.hit_way := req_hit_way; + req.repl_way := replace_way; + + -- Store the incoming request from r0, if it is a slow request + -- Note that r1.full = 1 implies req_op = OP_NONE + if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC or + req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then + r1.req <= req; + r1.full <= '1'; + end if; + end if; -- Main state machine case r1.state is when IDLE => - case req_op is + r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); + r1.dcbz <= '0'; + + -- Keep track of our index and way for subsequent stores. + r1.store_index <= get_index(req.real_addr); + r1.store_row <= get_row(req.real_addr); + r1.end_row_ix <= get_row_of_line(get_row(req.real_addr)) - 1; + r1.reload_tag <= get_tag(req.real_addr); + + if req.op = OP_STORE_HIT then + r1.store_way <= req.hit_way; + else + r1.store_way <= req.repl_way; + end if; + + -- Reset per-row valid bits, ready for handling OP_LOAD_MISS + for i in 0 to ROW_PER_LINE - 1 loop + r1.rows_valid(i) <= '0'; + end loop; + + case req.op is when OP_LOAD_HIT => -- stay in IDLE state when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(r0.req.addr) & - " idx:" & integer'image(req_index) & - " way:" & integer'image(replace_way) & - " tag:" & to_hstring(req_tag); - - -- Force misses on that way while reloading that line - cache_valids(req_index)(replace_way) <= '0'; - - -- Store new tag in selected way - for i in 0 to NUM_WAYS-1 loop - if i = replace_way then - tagset := cache_tags(req_index); - write_tag(i, tagset, req_tag); - cache_tags(req_index) <= tagset; - end if; - end loop; - - -- Keep track of our index and way for subsequent stores. 
- r1.store_index <= req_index; - r1.store_way <= replace_way; - r1.store_row <= get_row(req_laddr); - - -- Prep for first wishbone read. We calculate the address of - -- the start of the cache line and start the WB cycle - -- - r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0); + report "cache miss real addr:" & to_hstring(req.real_addr) & + " idx:" & integer'image(get_index(req.real_addr)) & + " way:" & integer'image(req.repl_way) & + " tag:" & to_hstring(get_tag(req.real_addr)); + + -- Start the wishbone cycle r1.wb.sel <= (others => '1'); r1.wb.we <= '0'; r1.wb.cyc <= '1'; @@ -1126,74 +1275,52 @@ begin -- Track that we had one request sent r1.state <= RELOAD_WAIT_ACK; + r1.write_tag <= '1'; when OP_LOAD_NC => - r1.wb.sel <= r0.req.byte_sel; - r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; + r1.wb.sel <= req.byte_sel; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - if r0.req.dcbz = '0' then - r1.wb.sel <= r0.req.byte_sel; - r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= r0.req.data; - if cancel_store = '0' then - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; - r1.wb.we <= '1'; - r1.state <= STORE_WAIT_ACK; - else - r1.stcx_fail <= '1'; - r1.state <= IDLE; + if req.dcbz = '0' then + r1.wb.sel <= req.byte_sel; + r1.wb.dat <= req.data; + r1.state <= STORE_WAIT_ACK; + r1.acks_pending <= to_unsigned(1, 3); + r1.full <= '0'; + r1.slow_valid <= '1'; + if req.op = OP_STORE_HIT then + r1.write_bram <= '1'; end if; else -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading - r1.store_index <= req_index; - r1.store_row <= get_row(req_laddr); - - if req_op = OP_STORE_HIT then - r1.store_way <= req_hit_way; - else - r1.store_way <= replace_way; - - -- Force misses on the victim way while zeroing - cache_valids(req_index)(replace_way) <= '0'; - - -- Store new tag in selected way - for i in 0 to NUM_WAYS-1 loop - if i = replace_way then - tagset := 
cache_tags(req_index); - write_tag(i, tagset, req_tag); - cache_tags(req_index) <= tagset; - end if; - end loop; - end if; - -- Set up for wishbone writes - r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0); + -- Start the wishbone writes r1.wb.sel <= (others => '1'); - r1.wb.we <= '1'; r1.wb.dat <= (others => '0'); - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; -- Handle the rest like a load miss r1.state <= RELOAD_WAIT_ACK; + r1.write_tag <= '1'; + r1.dcbz <= '1'; end if; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; -- OP_NONE and OP_BAD do nothing - -- OP_BAD was handled above already + -- OP_BAD & OP_STCX_FAIL were handled above already when OP_NONE => when OP_BAD => - when OP_TLB_ERR => + when OP_STCX_FAIL => end case; - when RELOAD_WAIT_ACK => - -- Requests are all sent if stb is 0 + when RELOAD_WAIT_ACK => + -- Requests are all sent if stb is 0 stbs_done := r1.wb.stb = '0'; -- If we are still sending requests, was one accepted ? @@ -1202,7 +1329,7 @@ begin -- stb and set stbs_done so we can handle an eventual last -- ack on the same cycle. -- - if is_last_row_addr(r1.wb.adr) then + if is_last_row_addr(r1.wb.adr, r1.end_row_ix) then r1.wb.stb <= '0'; stbs_done := true; end if; @@ -1212,44 +1339,82 @@ begin end if; -- Incoming acks processing + r1.forward_valid1 <= wishbone_in.ack; if wishbone_in.ack = '1' then - -- Is this the data we were looking for ? Latch it so - -- we can respond later. We don't currently complete the - -- pending miss request immediately, we wait for the - -- whole line to be loaded. The reason is that if we - -- did, we would potentially get new requests in while - -- not idle, which we don't currently know how to deal - -- with. - -- - if r1.store_row = get_row(r1.req.addr) and r1.req.dcbz = '0' then - r1.slow_data <= wishbone_in.dat; + r1.rows_valid(r1.store_row mod ROW_PER_LINE) <= '1'; + -- If this is the data we were looking for, we can + -- complete the request next cycle. 
+ -- Compare the whole address in case the request in + -- r1.req is not the one that started this refill. + if r1.full = '1' and + ((r1.dcbz = '1' and r1.req.dcbz = '1') or + (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS)) and + r1.store_row = get_row(r1.req.real_addr) and + r1.reload_tag = get_tag(r1.req.real_addr) then + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.forward_sel <= (others => '1'); + r1.use_forward1 <= '1'; end if; -- Check for completion - if stbs_done and is_last_row(r1.store_row) then + if stbs_done and is_last_row(r1.store_row, r1.end_row_ix) then -- Complete wishbone cycle r1.wb.cyc <= '0'; -- Cache line is now valid cache_valids(r1.store_index)(r1.store_way) <= '1'; - -- Don't complete and go idle until next cycle, in - -- case the next request is for the last dword of - -- the cache line we just loaded. - r1.state <= FINISH_LD_MISS; + r1.state <= IDLE; end if; -- Increment store row counter r1.store_row <= next_row(r1.store_row); end if; - when FINISH_LD_MISS => - -- Write back the load data that we got - r1.slow_valid <= '1'; - r1.state <= IDLE; - report "completing miss !"; + when STORE_WAIT_ACK => + stbs_done := r1.wb.stb = '0'; + acks := r1.acks_pending; + -- Clear stb when slave accepted request + if wishbone_in.stall = '0' then + -- See if there is another store waiting to be done + -- which is in the same real page. + -- Using r1.req rather than req here limits us to one + -- store every two cycles, but helps timing in that we + -- don't depend on req_op or ra. 
+ if r1.full = '1' and acks < 7 and + (r1.req.op = OP_STORE_MISS or r1.req.op = OP_STORE_HIT) and + (r1.req.real_addr(r1.wb.adr'left downto TLB_LG_PGSZ) = + r1.wb.adr(r1.wb.adr'left downto TLB_LG_PGSZ)) then + r1.wb.adr <= r1.req.real_addr(r1.wb.adr'left downto 0); + r1.wb.dat <= r1.req.data; + r1.wb.sel <= r1.req.byte_sel; + r1.wb.stb <= '1'; + stbs_done := false; + if r1.req.op = OP_STORE_HIT then + r1.write_bram <= '1'; + end if; + r1.full <= '0'; + r1.slow_valid <= '1'; + acks := acks + 1; + else + r1.wb.stb <= '0'; + stbs_done := true; + end if; + end if; + + -- Got ack ? See if complete. + if wishbone_in.ack = '1' then + if stbs_done and acks = 1 then + r1.state <= IDLE; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + end if; + acks := acks - 1; + end if; + r1.acks_pending <= acks; - when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => + when NC_LOAD_WAIT_ACK => -- Clear stb when slave accepted request if wishbone_in.stall = '0' then r1.wb.stb <= '0'; @@ -1257,11 +1422,11 @@ begin -- Got ack ? complete. if wishbone_in.ack = '1' then - if r1.state = NC_LOAD_WAIT_ACK then - r1.slow_data <= wishbone_in.dat; - end if; r1.state <= IDLE; + r1.full <= '0'; r1.slow_valid <= '1'; + r1.forward_sel <= (others => '1'); + r1.use_forward1 <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if;