From 94dd8bc48066e8c4505843a11250209f3bf29226 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 26 Feb 2020 11:55:36 +1100 Subject: [PATCH] dcache: Add support for unaligned loads and stores For an unaligned load or store, we do the first doubleword (dword) of the transfer as normal, but then go to a new NEXT_DWORD state of the state machine to do the cache tag lookup for the second dword of the transfer. From the NEXT_DWORD state we have much the same transitions to other states as from the IDLE state (the transitions for OP_LOAD_HIT are a bit different but almost identical for the other op values). We now do the preparation of the data to be written in loadstore1, that is, byte reversal if necessary and rotation by a number of bytes based on the low 3 bits of the address. We do rotation not shifting so we have the bytes that need to go into the second doubleword in the right place in the low bytes of the data sent to dcache. The rotation and byte reversal are done in a single step with one multiplexer per byte by setting the select inputs for each byte appropriately. This also fixes writeback to not write the register value until it has received both pieces of an unaligned load value. Signed-off-by: Paul Mackerras --- dcache.vhdl | 181 +++++++++++++++++++++++++++++++----------------- loadstore1.vhdl | 20 ++++-- writeback.vhdl | 2 +- 3 files changed, 136 insertions(+), 67 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index ddc4769..5bf477b 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -124,6 +124,7 @@ architecture rtl of dcache is -- Cache state machine type state_t is (IDLE, -- Normal load hit processing + NEXT_DWORD, -- Starting the 2nd xfer of misaligned LOAD_UPDATE, -- Load with update extra cycle LOAD_UPDATE2, -- Load with update extra cycle RELOAD_WAIT_ACK, -- Cache reload wait ack @@ -157,6 +158,12 @@ architecture rtl of dcache is hit_way : way_t; hit_load_valid : std_ulogic; + -- Info for doing the second transfer of a misaligned load/store + two_dwords : std_ulogic; + second_dword : std_ulogic; + next_addr : std_ulogic_vector(63 downto 0); + next_sel : std_ulogic_vector(7 downto 0); + -- Register update (load/store with update) update_valid : std_ulogic; @@ -186,6 +193,8 @@ architecture rtl of dcache is sign_extend : std_ulogic; byte_reverse : std_ulogic; xerc : xer_common_t; + last_dword : std_ulogic; + second_dword : std_ulogic; end record; signal r2 : reg_stage_2_t; @@ -196,7 +205,10 @@ architecture rtl of dcache is signal req_hit_way : way_t; signal req_tag : cache_tag_t; signal req_op : op_t; + signal req_data : std_ulogic_vector(63 downto 0); + signal req_addr : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0); + signal req_sel : std_ulogic_vector(7 downto 0); -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; @@ -208,8 +220,9 @@ architecture rtl of dcache is signal replace_way : way_t; -- Wishbone read/write/cache write formatting signals - signal bus_sel : wishbone_sel_type; - signal store_data : wishbone_data_type; + signal bus_sel : std_ulogic_vector(15 downto 0); + + signal two_dwords : std_ulogic; -- -- Helper functions to decode incoming requests @@ -307,17 +320,17 @@ architecture rtl of dcache is end case; end function length_to_sel; - -- Calculate shift and byte enables for wishbone - function wishbone_data_shift(address : in std_ulogic_vector(63 downto 0)) return natural is - begin - return to_integer(unsigned(address(2 downto 0))) * 8; - end function wishbone_data_shift; - + -- Calculate byte enables for wishbone + -- This returns 16 bits, giving the select signals for two transfers, + -- to account for unaligned loads or stores function wishbone_data_sel(size : in std_logic_vector(3 downto 0); address : in std_logic_vector(63 downto 0)) return std_ulogic_vector is + variable longsel : std_ulogic_vector(15 downto 0); begin - return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)), + longsel := (others => '0'); + longsel(7 downto 0) := length_to_sel(size); + return std_ulogic_vector(shift_left(unsigned(longsel), to_integer(unsigned(address(2 downto 0))))); end function wishbone_data_sel; @@ -383,23 +396,43 @@ begin variable tmp : std_ulogic_vector(63 downto 0); variable data : std_ulogic_vector(63 downto 0); variable opsel : std_ulogic_vector(3 downto 0); + variable go : std_ulogic; + variable is_load : std_ulogic; + variable is_nc : std_ulogic; begin -- Extract line, row and tag from request - req_index <= get_index(d_in.addr); - req_row <= get_row(d_in.addr); - req_tag <= get_tag(d_in.addr); - - -- Calculate address of beginning of cache line, will be - -- used for cache miss processing if needed - -- - req_laddr <= d_in.addr(63 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + if r1.state /= NEXT_DWORD then + req_addr <= d_in.addr; + req_data <= d_in.data; + req_sel <= bus_sel(7 downto 0); + go := d_in.valid; + is_load := d_in.load; + is_nc := d_in.nc; + + else + req_addr <= r1.next_addr; + req_data <= r1.req.data; + req_sel <= r1.next_sel; + go := '1'; + is_load := r1.req.load; + is_nc := r1.req.nc; + end if; + + req_index <= get_index(req_addr); + req_row <= get_row(req_addr); + req_tag <= get_tag(req_addr); + + -- Calculate address of beginning of cache line, will be + -- used for cache miss processing if needed + -- + req_laddr <= req_addr(63 downto LINE_OFF_BITS) & + (LINE_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; for i in way_t loop - if d_in.valid = '1' and cache_valids(req_index)(i) = '1' then + if go = '1' and cache_valids(req_index)(i) = '1' then if read_tag(i, cache_tags(req_index)) = req_tag then hit_way := i; is_hit := '1'; @@ -416,7 +449,7 @@ begin -- Combine the request and cache his status to decide what -- operation needs to be done -- - opsel := d_in.valid & d_in.load & d_in.nc & is_hit; + opsel := go & is_load & is_nc & is_hit; case opsel is when "1101" => op := OP_LOAD_HIT; when "1100" => op := OP_LOAD_MISS; @@ -433,22 +466,15 @@ begin end process; - -- - -- Misc signal assignments - -- - -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Wishbone & BRAM write data formatting for stores (most of it already - -- happens in loadstore1, this is the remaining data shifting) - -- - store_data <= std_logic_vector(shift_left(unsigned(d_in.data), - wishbone_data_shift(d_in.addr))); - -- Wishbone read and write and BRAM write sel bits generation bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); + -- See if the operation crosses two doublewords + two_dwords <= or (bus_sel(15 downto 8)); + -- TODO: Generate errors -- err_nc_collision <= '1' when req_op = OP_BAD else '0'; @@ -469,7 +495,7 @@ begin d_out.write_shift <= r2.data_shift; d_out.sign_extend <= r2.sign_extend; d_out.byte_reverse <= r2.byte_reverse; - d_out.second_word <= '0'; + d_out.second_word <= r2.second_dword; d_out.xerc <= r2.xerc; -- We have a valid load or store hit or we just completed a slow @@ -497,8 +523,10 @@ begin if r2.hit_load_valid = '1' then d_out.write_enable <= '1'; - -- If it's not a load with update, complete it now - if r2.load_is_update = '0' then + -- If there isn't another dword to go and + -- it's not a load with update, complete it now + if r2.last_dword = '1' and r2.load_is_update = '0' then + report "completing load hit"; d_out.valid <= '1'; end if; end if; @@ -521,10 +549,14 @@ begin d_out.byte_reverse <= r1.req.byte_reverse; d_out.write_len <= r1.req.length; d_out.xerc <= r1.req.xerc; + d_out.second_word <= r1.second_dword; end if; -- If it's a store or a non-update load form, complete now - if r1.req.load = '0' or r1.req.update = '0' then + -- unless we need to do another dword transfer + if (r1.req.load = '0' or r1.req.update = '0') and + (r1.two_dwords = '0' or r1.second_dword = '1') then + report "completing store or load miss"; d_out.valid <= '1'; end if; end if; @@ -543,11 +575,13 @@ begin d_out.sign_extend <= '0'; d_out.byte_reverse <= '0'; d_out.xerc <= r1.req.xerc; + d_out.second_word <= '0'; -- If it was a load, this completes the operation (load with -- update case). -- if r1.req.load = '1' then + report "completing after load update"; d_out.valid <= '1'; end if; end if; @@ -605,11 +639,11 @@ begin -- For timing, the mux on wr_data/sel/addr is not dependent on anything -- other than the current state. Only the do_write signal is. -- - if r1.state = IDLE then - -- When IDLE, the only write path is the store-hit update case + if r1.state = IDLE or r1.state = NEXT_DWORD then + -- In these states, the only write path is the store-hit update case wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= store_data; - wr_sel <= bus_sel; + wr_data <= req_data; + wr_sel <= req_sel; else -- Otherwise, we might be doing a reload wr_data <= wishbone_in.dat; @@ -648,6 +682,8 @@ begin r2.length <= r1.req.length; r2.sign_extend <= r1.req.sign_extend; r2.byte_reverse <= r1.req.byte_reverse; + r2.second_dword <= r1.second_dword; + r2.last_dword <= r1.second_dword or not r1.two_dwords; -- If we have a request incoming, we have to latch it as d_in.valid -- is only set for a single cycle. It's up to the control logic to @@ -655,8 +691,12 @@ begin -- single issue on load/stores so we are fine, later, we can generate -- a stall output if necessary). - if req_op /= OP_NONE then + if req_op /= OP_NONE and d_in.valid = '1' then r1.req <= d_in; + r1.second_dword <= '0'; + r1.two_dwords <= two_dwords; + r1.next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000"; + r1.next_sel <= bus_sel(15 downto 8); report "op:" & op_t'image(req_op) & " addr:" & to_hstring(d_in.addr) & @@ -666,6 +706,8 @@ begin " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); + elsif r1.state = NEXT_DWORD then + r1.second_dword <= '1'; end if; -- Fast path for load/store hits. Set signals for the writeback controls. @@ -713,24 +755,36 @@ begin r1.update_valid <= '0'; -- We cannot currently process a new request when not idle - assert req_op = OP_NONE or r1.state = IDLE report "request " & + assert d_in.valid = '0' or r1.state = IDLE report "request " & op_t'image(req_op) & " while in state " & state_t'image(r1.state) severity FAILURE; -- Main state machine case r1.state is - when IDLE => + when IDLE | NEXT_DWORD => case req_op is - when OP_LOAD_HIT => - -- We have a load with update hit, we need the delayed update cycle - if d_in.update = '1' then - r1.state <= LOAD_UPDATE; - end if; + when OP_LOAD_HIT => + if r1.state = IDLE then + -- If the load is misaligned then we will need to start + -- the state machine + if two_dwords = '1' then + r1.state <= NEXT_DWORD; + elsif d_in.update = '1' then + -- We have a load with update hit, we need the delayed update cycle + r1.state <= LOAD_UPDATE; + end if; + else + if r1.req.update = '1' then + r1.state <= LOAD_UPDATE; + else + r1.state <= IDLE; + end if; + end if; when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(d_in.addr) & + report "cache miss addr:" & to_hstring(req_addr) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); @@ -765,8 +819,8 @@ begin r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => - r1.wb.sel <= bus_sel; - r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.sel <= req_sel; + r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; @@ -774,12 +828,10 @@ begin when OP_STORE_HIT | OP_STORE_MISS => -- For store-with-update do the register update - if d_in.update = '1' then - r1.update_valid <= '1'; - end if; - r1.wb.sel <= bus_sel; - r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= store_data; + r1.update_valid <= d_in.valid and d_in.update; + r1.wb.sel <= req_sel; + r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.dat <= req_data; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '1'; @@ -831,11 +883,13 @@ begin -- Cache line is now valid cache_valids(r1.store_index)(r1.store_way) <= '1'; - -- Complete the load that missed. For load with update + -- Write back the load data that we got, and start + -- the second dword if necessary. Otherwise, see if -- we also need to do the deferred update cycle. - -- r1.slow_valid <= '1'; - if r1.req.update = '1' then + if r1.two_dwords and not r1.second_dword then + r1.state <= NEXT_DWORD; + elsif r1.req.update = '1' then r1.state <= LOAD_UPDATE2; report "completing miss with load-update !"; else @@ -864,12 +918,15 @@ begin -- Got ack ? complete. if wishbone_in.ack = '1' then - r1.state <= IDLE; + if r1.two_dwords and not r1.second_dword then + r1.state <= NEXT_DWORD; + elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then + r1.state <= LOAD_UPDATE2; + else + r1.state <= IDLE; + end if; if r1.state = NC_LOAD_WAIT_ACK then r1.slow_data <= wishbone_in.dat; - if r1.req.update = '1' then - r1.state <= LOAD_UPDATE2; - end if; end if; r1.slow_valid <= '1'; r1.wb.cyc <= '0'; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 5b61d4c..9e038e1 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -35,12 +35,15 @@ begin loadstore1_1: process(all) variable v : Loadstore1ToDcacheType; + variable brev_lenm1 : unsigned(2 downto 0); + variable byte_offset : unsigned(2 downto 0); + variable j : integer; + variable k : unsigned(2 downto 0); begin v := r; v.valid := l_in.valid; v.load := l_in.load; - v.data := l_in.data; v.write_reg := l_in.write_reg; v.length := l_in.length; v.byte_reverse := l_in.byte_reverse; @@ -63,9 +66,18 @@ begin -- XXX Do length_to_sel here ? - -- byte reverse stores in the first cycle - if v.load = '0' and l_in.byte_reverse = '1' then - v.data := byte_reverse(l_in.data, to_integer(unsigned(l_in.length))); + -- Do byte reversing and rotating for stores in the first cycle + if v.load = '0' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + brev_lenm1 := "000"; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; + for i in 0 to 7 loop + k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; + j := to_integer(k) * 8; + v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); + end loop; end if; v.addr := lsu_sum; diff --git a/writeback.vhdl b/writeback.vhdl index a730266..b924ee0 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -116,12 +116,12 @@ begin if l_in.byte_reverse = '1' then brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1; end if; - w_out.write_enable <= '1'; second_word <= l_in.second_word; if l_in.valid = '0' and (data_len + byte_offset > 8) then partial_write <= '1'; end if; xe := l_in.xerc; + w_out.write_enable <= not partial_write or second_word; end if; -- shift and byte-reverse data bytes