From b349cc891a52c0453e7c721b98b96025995a4588 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 28 Mar 2020 20:24:13 +1100 Subject: [PATCH] loadstore1: Move logic from dcache to loadstore1 So that the dcache could in future be used by an MMU, this moves logic to do with data formatting, rA updates for update-form instructions, and handling of unaligned loads and stores out of dcache and into loadstore1. For now, dcache connects only to loadstore1, and loadstore1 now has the connection to writeback. Dcache generates a stall signal to loadstore1 which indicates that the request presented in the current cycle was not accepted and should be presented again. However, loadstore1 doesn't currently use it because we know that we can never hit the circumstances where it might be set. For unaligned transfers, loadstore1 generates two requests to dcache back-to-back, and then waits to see two acks back from dcache (cycles where d_in.valid is true). Loadstore1 now has an FSM for tracking how many acks we are expecting from dcache and for doing the rA update cycles when necessary. Handling for reservations and conditional stores is still in dcache. Loadstore1 now generates its own stall signal back to decode2, so we no longer need the logic in execute1 that generated the stall for the first two cycles. 
Signed-off-by: Paul Mackerras --- common.vhdl | 28 ++--- core.vhdl | 19 ++- dcache.vhdl | 325 +++++++++++------------------------------------- dcache_tb.vhdl | 29 ++--- execute1.vhdl | 8 -- loadstore1.vhdl | 276 +++++++++++++++++++++++++++++++++------- writeback.vhdl | 2 +- 7 files changed, 339 insertions(+), 348 deletions(-) diff --git a/common.vhdl b/common.vhdl index 4b879a1..f581ccb 100644 --- a/common.vhdl +++ b/common.vhdl @@ -218,22 +218,20 @@ package common is valid : std_ulogic; load : std_ulogic; nc : std_ulogic; + reserve : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); - write_reg : gpr_index_t; - length : std_ulogic_vector(3 downto 0); - byte_reverse : std_ulogic; - sign_extend : std_ulogic; - update : std_ulogic; - update_reg : gpr_index_t; - xerc : xer_common_t; - reserve : std_ulogic; - rc : std_ulogic; - early_low_addr : std_ulogic_vector(11 downto 0); - early_valid : std_ulogic; + byte_sel : std_ulogic_vector(7 downto 0); + end record; + + type DcacheToLoadstore1Type is record + valid : std_ulogic; + data : std_ulogic_vector(63 downto 0); + store_done : std_ulogic; + error : std_ulogic; end record; - type DcacheToWritebackType is record + type Loadstore1ToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; write_reg : gpr_index_t; @@ -247,9 +245,9 @@ package common is rc : std_ulogic; store_done : std_ulogic; end record; - constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', - byte_reverse => '0', second_word => '0', xerc => xerc_init, - rc => '0', store_done => '0', others => (others => '0')); + constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', + byte_reverse => '0', second_word => '0', xerc => xerc_init, + rc => '0', store_done => '0', others => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; diff --git a/core.vhdl b/core.vhdl 
index 87e73a4..d535f7a 100644 --- a/core.vhdl +++ b/core.vhdl @@ -61,8 +61,11 @@ architecture behave of core is -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; + signal loadstore1_to_writeback: Loadstore1ToWritebackType; + + -- dcache signals signal loadstore1_to_dcache: Loadstore1ToDcacheType; - signal dcache_to_writeback: DcacheToWritebackType; + signal dcache_to_loadstore1: DcacheToLoadstore1Type; -- local signals signal fetch1_stall_in : std_ulogic; @@ -73,6 +76,7 @@ architecture behave of core is signal decode2_stall_out : std_ulogic; signal ex1_icache_inval: std_ulogic; signal ex1_stall_out: std_ulogic; + signal ls1_stall_out: std_ulogic; signal dcache_stall_out: std_ulogic; signal flush: std_ulogic; @@ -196,7 +200,7 @@ begin c_in => cr_file_to_decode2, c_out => decode2_to_cr_file ); - decode2_stall_in <= ex1_stall_out or dcache_stall_out; + decode2_stall_in <= ex1_stall_out or ls1_stall_out; register_file_0: entity work.register_file generic map ( @@ -243,8 +247,13 @@ begin loadstore1_0: entity work.loadstore1 port map ( clk => clk, + rst => core_rst, l_in => execute1_to_loadstore1, - l_out => loadstore1_to_dcache + l_out => loadstore1_to_writeback, + d_out => loadstore1_to_dcache, + d_in => dcache_to_loadstore1, + dc_stall => dcache_stall_out, + stall_out => ls1_stall_out ); dcache_0: entity work.dcache @@ -257,7 +266,7 @@ begin clk => clk, rst => core_rst, d_in => loadstore1_to_dcache, - d_out => dcache_to_writeback, + d_out => dcache_to_loadstore1, stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out @@ -267,7 +276,7 @@ begin port map ( clk => clk, e_in => execute1_to_writeback, - l_in => dcache_to_writeback, + l_in => loadstore1_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, complete_out => complete diff --git a/dcache.vhdl b/dcache.vhdl index bcc7590..7e553bf 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -7,9 +7,6 @@ -- * Complete load misses 
on the cycle when WB data comes instead of -- at the end of line (this requires dealing with requests coming in -- while not idle...) --- * Load with update could use one less non-pipelined cycle by moving --- the register update to the pipeline bubble that exists when going --- back to the IDLE state. -- library ieee; use ieee.std_logic_1164.all; @@ -35,7 +32,7 @@ entity dcache is rst : in std_ulogic; d_in : in Loadstore1ToDcacheType; - d_out : out DcacheToWritebackType; + d_out : out DcacheToLoadstore1Type; stall_out : out std_ulogic; @@ -113,6 +110,8 @@ architecture rtl of dcache is attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; + signal r0 : Loadstore1ToDcacheType; + -- Type of operation on a "valid" input type op_t is (OP_NONE, OP_LOAD_HIT, -- Cache hit on load @@ -124,10 +123,8 @@ architecture rtl of dcache is -- Cache state machine type state_t is (IDLE, -- Normal load hit processing - PRE_NEXT_DWORD, -- Extra state before NEXT_DWORD - NEXT_DWORD, -- Starting the 2nd xfer of misaligned - LOAD_UPDATE, -- Load with update extra cycle RELOAD_WAIT_ACK, -- Cache reload wait ack + FINISH_LD_MISS, -- Extra cycle after load miss STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack @@ -158,15 +155,6 @@ architecture rtl of dcache is hit_way : way_t; hit_load_valid : std_ulogic; - -- Info for doing the second transfer of a misaligned load/store - two_dwords : std_ulogic; - second_dword : std_ulogic; - next_addr : std_ulogic_vector(63 downto 0); - next_sel : std_ulogic_vector(7 downto 0); - - -- Register update (load/store with update) - update_valid : std_ulogic; - -- Data buffer for "slow" read ops (load miss and NC loads). 
slow_data : std_ulogic_vector(63 downto 0); slow_valid : std_ulogic; @@ -200,12 +188,8 @@ architecture rtl of dcache is signal req_tag : cache_tag_t; signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); - signal req_addr : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0); - signal req_sel : std_ulogic_vector(7 downto 0); - signal next_addr : std_ulogic_vector(63 downto 0); - signal early_req_addr : std_ulogic_vector(11 downto 0); signal early_req_row : row_t; signal cancel_store : std_ulogic; @@ -222,10 +206,8 @@ architecture rtl of dcache is signal replace_way : way_t; -- Wishbone read/write/cache write formatting signals - signal bus_sel : std_ulogic_vector(15 downto 0); + signal bus_sel : std_ulogic_vector(7 downto 0); - signal two_dwords : std_ulogic; - -- -- Helper functions to decode incoming requests -- @@ -305,37 +287,6 @@ architecture rtl of dcache is tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; end; - -- Generate byte enables from sizes - function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is - begin - case length is - when "0001" => - return "00000001"; - when "0010" => - return "00000011"; - when "0100" => - return "00001111"; - when "1000" => - return "11111111"; - when others => - return "00000000"; - end case; - end function length_to_sel; - - -- Calculate byte enables for wishbone - -- This returns 16 bits, giving the select signals for two transfers, - -- to account for unaligned loads or stores - function wishbone_data_sel(size : in std_logic_vector(3 downto 0); - address : in std_logic_vector(63 downto 0)) - return std_ulogic_vector is - variable longsel : std_ulogic_vector(15 downto 0); - begin - longsel := (others => '0'); - longsel(7 downto 0) := length_to_sel(size); - return std_ulogic_vector(shift_left(unsigned(longsel), - to_integer(unsigned(address(2 downto 0))))); - end function wishbone_data_sel; - begin assert LINE_SIZE mod ROW_SIZE 
= 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -390,11 +341,17 @@ begin end generate; end generate; - -- Wishbone read and write and BRAM write sel bits generation - bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); - - -- See if the operation crosses two doublewords - two_dwords <= or (bus_sel(15 downto 8)); + -- Latch the request in r0 as long as we're not stalling + stage_0 : process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r0.valid <= '0'; + elsif stall_out = '0' then + r0 <= d_in; + end if; + end if; + end process; -- Cache request parsing and hit detection dcache_request : process(all) @@ -405,40 +362,21 @@ begin variable data : std_ulogic_vector(63 downto 0); variable opsel : std_ulogic_vector(3 downto 0); variable go : std_ulogic; - variable is_load : std_ulogic; - variable is_nc : std_ulogic; begin -- Extract line, row and tag from request - if r1.state /= NEXT_DWORD then - req_addr <= d_in.addr; - req_data <= d_in.data; - req_sel <= bus_sel(7 downto 0); - go := d_in.valid; - is_load := d_in.load; - is_nc := d_in.nc; - - else - req_addr <= r1.next_addr; - req_data <= r1.req.data; - req_sel <= r1.next_sel; - go := '1'; - is_load := r1.req.load; - is_nc := r1.req.nc; - end if; + req_index <= get_index(r0.addr); + req_row <= get_row(r0.addr); + req_tag <= get_tag(r0.addr); - req_index <= get_index(req_addr); - req_row <= get_row(req_addr); - req_tag <= get_tag(req_addr); + -- Only do anything if not being stalled by stage 1 + go := r0.valid and not stall_out; -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed -- - req_laddr <= req_addr(63 downto LINE_OFF_BITS) & + req_laddr <= r0.addr(63 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); - -- Address of next doubleword, used for unaligned accesses - next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000"; - -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; @@ 
-460,7 +398,7 @@ begin -- Combine the request and cache his status to decide what -- operation needs to be done -- - opsel := go & is_load & is_nc & is_hit; + opsel := go & r0.load & r0.nc & is_hit; case opsel is when "1101" => op := OP_LOAD_HIT; when "1100" => op := OP_LOAD_MISS; @@ -475,16 +413,15 @@ begin req_op <= op; - -- Versions of the address and row number that are valid one cycle earlier + -- Version of the row number that is valid one cycle earlier -- in the cases where we need to read the cache data BRAM. - if r1.state = IDLE and op = OP_LOAD_HIT and two_dwords = '1' then - early_req_addr <= next_addr(11 downto 0); - elsif r1.state /= IDLE and r1.two_dwords = '1' and r1.second_dword = '0' then - early_req_addr <= r1.next_addr(11 downto 0); + -- If we're stalling then we need to keep reading the last + -- row requested. + if stall_out = '0' then + early_req_row <= get_row(d_in.addr); else - early_req_addr <= d_in.early_low_addr; + early_req_row <= req_row; end if; - early_req_row <= get_row(x"0000000000000" & early_req_addr); end process; -- Wire up wishbone request latch out of stage 1 @@ -502,17 +439,17 @@ begin cancel_store <= '0'; set_rsrv <= '0'; clear_rsrv <= '0'; - if d_in.valid = '1' and d_in.reserve = '1' then + if stall_out = '0' and r0.valid = '1' and r0.reserve = '1' then -- XXX generate alignment interrupt if address is not aligned - -- XXX or if d_in.nc = '1' - if d_in.load = '1' then + -- XXX or if r0.nc = '1' + if r0.load = '1' then -- load with reservation set_rsrv <= '1'; else -- store conditional clear_rsrv <= '1'; if reservation.valid = '0' or - d_in.addr(63 downto LINE_OFF_BITS) /= reservation.addr then + r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then cancel_store <= '1'; end if; end if; @@ -526,28 +463,19 @@ begin reservation.valid <= '0'; elsif set_rsrv = '1' then reservation.valid <= '1'; - reservation.addr <= d_in.addr(63 downto LINE_OFF_BITS); + reservation.addr <= r0.addr(63 downto LINE_OFF_BITS); end if; end if; end 
process; - -- Writeback (loads and reg updates) & completion control logic + -- Return data for loads & completion control logic -- writeback_control: process(all) begin - -- The mux on d_out.write reg defaults to the normal load hit case. - d_out.write_enable <= '0'; + -- The mux on d_out.data defaults to the normal load hit case. d_out.valid <= '0'; - d_out.write_reg <= r1.req.write_reg; - d_out.write_data <= cache_out(r1.hit_way); - d_out.write_len <= r1.req.length; - d_out.write_shift <= r1.req.addr(2 downto 0); - d_out.sign_extend <= r1.req.sign_extend; - d_out.byte_reverse <= r1.req.byte_reverse; - d_out.second_word <= r1.second_dword; - d_out.xerc <= r1.req.xerc; - d_out.rc <= '0'; -- loads never have rc=1 + d_out.data <= cache_out(r1.hit_way); d_out.store_done <= '0'; -- We have a valid load or store hit or we just completed a slow @@ -561,30 +489,17 @@ begin -- -- Sanity: Only one of these must be set in any given cycle - assert (r1.update_valid and r1.hit_load_valid) /= '1' report - "unexpected hit_load_delayed collision with update_valid" - severity FAILURE; assert (r1.slow_valid and r1.stcx_fail) /= '1' report "unexpected slow_valid collision with stcx_fail" severity FAILURE; assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; - assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report - "unexpected update_valid collision with slow_valid or stcx_fail" - severity FAILURE; -- Load hit case is the standard path if r1.hit_load_valid = '1' then - d_out.write_enable <= '1'; - - -- If there isn't another dword to go and - -- it's not a load with update, complete it now - if (r1.second_dword or not r1.two_dwords) = '1' and - r1.req.update = '0' then - report "completing load hit"; - d_out.valid <= '1'; - end if; + report "completing load hit"; + d_out.valid <= '1'; end if; -- Slow ops (load miss, NC, stores) @@ -593,63 +508,20 @@ begin -- mux 
accordingly -- if r1.req.load then - d_out.write_reg <= r1.req.write_reg; - d_out.write_enable <= '1'; - - -- Read data comes from the slow data latch, formatter - -- from the latched request. - -- - d_out.write_data <= r1.slow_data; - d_out.write_shift <= r1.req.addr(2 downto 0); - d_out.sign_extend <= r1.req.sign_extend; - d_out.byte_reverse <= r1.req.byte_reverse; - d_out.write_len <= r1.req.length; - d_out.xerc <= r1.req.xerc; - d_out.second_word <= r1.second_dword; + -- Read data comes from the slow data latch + d_out.data <= r1.slow_data; end if; - d_out.rc <= r1.req.rc; d_out.store_done <= '1'; - -- If it's a store or a non-update load form, complete now - -- unless we need to do another dword transfer - if (r1.req.load = '0' or r1.req.update = '0') and - (r1.two_dwords = '0' or r1.second_dword = '1') then - report "completing store or load miss"; - d_out.valid <= '1'; - end if; + report "completing store or load miss"; + d_out.valid <= '1'; end if; if r1.stcx_fail = '1' then - d_out.rc <= r1.req.rc; d_out.store_done <= '0'; d_out.valid <= '1'; end if; - -- We have a register update to do. - if r1.update_valid = '1' then - d_out.write_enable <= '1'; - d_out.write_reg <= r1.req.update_reg; - - -- Change the read data mux to the address that's going into - -- the register and the formatter does nothing. - -- - d_out.write_data <= r1.req.addr; - d_out.write_shift <= "000"; - d_out.write_len <= "1000"; - d_out.sign_extend <= '0'; - d_out.byte_reverse <= '0'; - d_out.xerc <= r1.req.xerc; - d_out.second_word <= '0'; - - -- If it was a load, this completes the operation (load with - -- update case). - -- - if r1.req.load = '1' then - report "completing after load update"; - d_out.valid <= '1'; - end if; - end if; - end process; -- @@ -703,11 +575,11 @@ begin -- For timing, the mux on wr_data/sel/addr is not dependent on anything -- other than the current state. Only the do_write signal is. 
-- - if r1.state = IDLE or r1.state = NEXT_DWORD then - -- In these states, the only write path is the store-hit update case + if r1.state = IDLE then + -- In IDLE state, the only write path is the store-hit update case wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= req_data; - wr_sel <= req_sel; + wr_data <= r0.data; + wr_sel <= r0.byte_sel; else -- Otherwise, we might be doing a reload wr_data <= wishbone_in.dat; @@ -731,35 +603,25 @@ begin end generate; -- - -- Cache hit synchronous machine for the easy case. This handles - -- non-update form load hits + -- Cache hit synchronous machine for the easy case. This handles load hits. -- dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- If we have a request incoming, we have to latch it as d_in.valid + -- If we have a request incoming, we have to latch it as r0.valid -- is only set for a single cycle. It's up to the control logic to -- ensure we don't override an uncompleted request (for now we are -- single issue on load/stores so we are fine, later, we can generate -- a stall output if necessary). - if req_op /= OP_NONE and d_in.valid = '1' then - r1.req <= d_in; - r1.second_dword <= '0'; - r1.two_dwords <= two_dwords; - r1.next_addr <= next_addr; - r1.next_sel <= bus_sel(15 downto 8); - + if req_op /= OP_NONE and stall_out = '0' then + r1.req <= r0; report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(d_in.addr) & - " upd:" & std_ulogic'image(d_in.update) & - " nc:" & std_ulogic'image(d_in.nc) & - " reg:" & to_hstring(d_in.write_reg) & + " addr:" & to_hstring(r0.addr) & + " nc:" & std_ulogic'image(r0.nc) & " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); - elsif r1.state = NEXT_DWORD then - r1.second_dword <= '1'; end if; -- Fast path for load/store hits. Set signals for the writeback controls. 
@@ -776,7 +638,6 @@ begin -- Every other case is handled by this state machine: -- -- * Cache load miss/reload (in conjunction with "rams") - -- * Load hits for update forms -- * Load hits for non-cachable forms -- * Stores (the collision case is handled in "rams") -- @@ -795,7 +656,6 @@ begin end loop; r1.state <= IDLE; r1.slow_valid <= '0'; - r1.update_valid <= '0'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; @@ -804,39 +664,19 @@ begin else -- One cycle pulses reset r1.slow_valid <= '0'; - r1.update_valid <= '0'; r1.stcx_fail <= '0'; - -- We cannot currently process a new request when not idle - assert d_in.valid = '0' or r1.state = IDLE report "request " & - op_t'image(req_op) & " while in state " & state_t'image(r1.state) - severity FAILURE; - -- Main state machine case r1.state is - when IDLE | NEXT_DWORD => + when IDLE => case req_op is when OP_LOAD_HIT => - if r1.state = IDLE then - -- If the load is misaligned then we will need to start - -- the state machine - if two_dwords = '1' then - r1.state <= NEXT_DWORD; - elsif d_in.update = '1' then - r1.state <= LOAD_UPDATE; - end if; - else - if r1.req.update = '1' then - r1.state <= LOAD_UPDATE; - else - r1.state <= IDLE; - end if; - end if; + -- stay in IDLE state - when OP_LOAD_MISS => + when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(req_addr) & + report "cache miss addr:" & to_hstring(r0.addr) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); @@ -871,19 +711,17 @@ begin r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => - r1.wb.sel <= req_sel; - r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.sel <= r0.byte_sel; + r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - -- For store-with-update do the register update - r1.update_valid <= d_in.valid and 
d_in.update; - r1.wb.sel <= req_sel; - r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= req_data; + r1.wb.sel <= r0.byte_sel; + r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.dat <= r0.data; if cancel_store = '0' then r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -899,9 +737,6 @@ begin when OP_BAD => end case; - when PRE_NEXT_DWORD => - r1.state <= NEXT_DWORD; - when RELOAD_WAIT_ACK => -- Requests are all sent if stb is 0 stbs_done := r1.wb.stb = '0'; @@ -943,31 +778,23 @@ begin -- Cache line is now valid cache_valids(r1.store_index)(r1.store_way) <= '1'; - -- Write back the load data that we got, and start - -- the second dword if necessary. Otherwise, see if - -- we also need to do the deferred update cycle. - r1.slow_valid <= '1'; - if r1.two_dwords and not r1.second_dword then - r1.state <= PRE_NEXT_DWORD; - elsif r1.req.update = '1' then - r1.state <= LOAD_UPDATE; - report "completing miss with load-update !"; - else - r1.state <= IDLE; - report "completing miss !"; - end if; + -- Don't complete and go idle until next cycle, in + -- case the next request is for the last dword of + -- the cache line we just loaded. + r1.state <= FINISH_LD_MISS; end if; -- Increment store row counter r1.store_row <= next_row(r1.store_row); end if; - when LOAD_UPDATE => - -- We need the extra cycle to complete a load with update - r1.update_valid <= '1'; - r1.state <= IDLE; + when FINISH_LD_MISS => + -- Write back the load data that we got + r1.slow_valid <= '1'; + r1.state <= IDLE; + report "completing miss !"; - when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => + when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => -- Clear stb when slave accepted request if wishbone_in.stall = '0' then r1.wb.stb <= '0'; @@ -975,16 +802,10 @@ begin -- Got ack ? complete. 
if wishbone_in.ack = '1' then - if r1.two_dwords and not r1.second_dword then - r1.state <= NEXT_DWORD; - elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then - r1.state <= LOAD_UPDATE; - else - r1.state <= IDLE; - end if; if r1.state = NC_LOAD_WAIT_ACK then r1.slow_data <= wishbone_in.dat; end if; + r1.state <= IDLE; r1.slow_valid <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index 437fd7d..bd8341a 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -13,7 +13,7 @@ architecture behave of dcache_tb is signal rst : std_ulogic; signal d_in : Loadstore1ToDcacheType; - signal d_out : DcacheToWritebackType; + signal d_out : DcacheToLoadstore1Type; signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -71,12 +71,6 @@ begin d_in.nc <= '0'; d_in.addr <= (others => '0'); d_in.data <= (others => '0'); - d_in.write_reg <= (others => '0'); - d_in.length <= (others => '0'); - d_in.byte_reverse <= '0'; - d_in.sign_extend <= '0'; - d_in.update <= '0'; - d_in.update_reg <= (others => '0'); wait for 4*clk_period; wait until rising_edge(clk); @@ -89,11 +83,10 @@ begin wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000000100000000" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000000100000000" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000000100000000" severity failure; -- wait for clk_period; @@ -106,11 +99,10 @@ begin wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000000D0000000C" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000000D0000000C" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) 
& + "=" & to_hstring(d_out.data) & " expected 0000000D0000000C" severity failure; @@ -121,11 +113,10 @@ begin d_in.valid <= '1'; wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000004100000040" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000004100000040" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000004100000040" severity failure; diff --git a/execute1.vhdl b/execute1.vhdl index b1662b7..b05fd4d 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -42,7 +42,6 @@ architecture behaviour of execute1 is next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; div_in_progress : std_ulogic; - ldst_in_progress : std_ulogic; cntz_in_progress : std_ulogic; slow_op_dest : gpr_index_t; slow_op_rc : std_ulogic; @@ -264,7 +263,6 @@ begin v.mul_in_progress := '0'; v.div_in_progress := '0'; v.cntz_in_progress := '0'; - v.ldst_in_progress := '0'; -- signals to multiply unit x_to_multiply <= Execute1ToMultiplyInit; @@ -662,8 +660,6 @@ begin when OP_LOAD | OP_STORE => -- loadstore/dcache has its own port to writeback v.e.valid := '0'; - stall_out <= '1'; - v.ldst_in_progress := '1'; when others => terminate_out <= '1'; @@ -703,10 +699,6 @@ begin v.e.rc := v.slow_op_rc; v.e.xerc := v.slow_op_xerc; v.e.valid := '1'; - elsif r.ldst_in_progress = '1' then - -- assert stall for 2 cycles on load/store, then - -- the stall output from dcache takes over - stall_out <= '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or (r.div_in_progress = '1' and divider_to_x.valid = '1') then diff --git a/loadstore1.vhdl b/loadstore1.vhdl index a25e617..2ab71ad 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -12,16 +12,85 @@ use work.helpers.all; entity loadstore1 is port ( clk : 
in std_ulogic; + rst : in std_ulogic; l_in : in Execute1ToLoadstore1Type; + l_out : out Loadstore1ToWritebackType; - l_out : out Loadstore1ToDcacheType + d_out : out Loadstore1ToDcacheType; + d_in : in DcacheToLoadstore1Type; + + dc_stall : in std_ulogic; + stall_out : out std_ulogic ); end loadstore1; +-- Note, we don't currently use the stall output from the dcache because +-- we know it can take two requests without stalling when idle, we are +-- its only user, and we know it never stalls when idle. + architecture behave of loadstore1 is - signal r, rin : Loadstore1ToDcacheType; + + -- State machine for unaligned loads/stores + type state_t is (IDLE, -- ready for instruction + SECOND_REQ, -- send 2nd request of unaligned xfer + FIRST_ACK_WAIT, -- waiting for 1st ack from dcache + LAST_ACK_WAIT, -- waiting for last ack from dcache + LD_UPDATE -- writing rA with computed addr on load + ); + + type reg_stage_t is record + -- latch most of the input request + load : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + data : std_ulogic_vector(63 downto 0); + write_reg : gpr_index_t; + length : std_ulogic_vector(3 downto 0); + byte_reverse : std_ulogic; + sign_extend : std_ulogic; + update : std_ulogic; + update_reg : gpr_index_t; + xerc : xer_common_t; + reserve : std_ulogic; + rc : std_ulogic; + nc : std_ulogic; -- non-cacheable access + state : state_t; + second_bytes : std_ulogic_vector(7 downto 0); + end record; + + signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); + + -- Generate byte enables from sizes + function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is + begin + case length is + when "0001" => + return "00000001"; + when "0010" => + return "00000011"; + when "0100" => + return "00001111"; + when "1000" => + return "11111111"; + when others => + return "00000000"; + end case; + end function length_to_sel; + + -- Calculate byte enables + -- This returns 16 bits, giving the select signals 
for two transfers, + -- to account for unaligned loads or stores + function xfer_data_sel(size : in std_logic_vector(3 downto 0); + address : in std_logic_vector(2 downto 0)) + return std_ulogic_vector is + variable longsel : std_ulogic_vector(15 downto 0); + begin + longsel := "00000000" & length_to_sel(size); + return std_ulogic_vector(shift_left(unsigned(longsel), + to_integer(unsigned(address)))); + end function xfer_data_sel; + begin -- Calculate the address in the first cycle lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0'); @@ -29,69 +98,180 @@ begin loadstore1_0: process(clk) begin if rising_edge(clk) then - r <= rin; + if rst = '1' then + r.state <= IDLE; + else + r <= rin; + end if; end if; end process; loadstore1_1: process(all) - variable v : Loadstore1ToDcacheType; + variable v : reg_stage_t; variable brev_lenm1 : unsigned(2 downto 0); variable byte_offset : unsigned(2 downto 0); variable j : integer; variable k : unsigned(2 downto 0); + variable long_sel : std_ulogic_vector(15 downto 0); + variable byte_sel : std_ulogic_vector(7 downto 0); + variable req : std_ulogic; + variable stall : std_ulogic; + variable addr : std_ulogic_vector(63 downto 0); + variable wdata : std_ulogic_vector(63 downto 0); + variable write_enable : std_ulogic; + variable do_update : std_ulogic; + variable second_dword : std_ulogic; + variable done : std_ulogic; begin v := r; + req := '0'; + stall := '0'; + done := '0'; + byte_sel := (others => '0'); + addr := lsu_sum; + + write_enable := '0'; + do_update := '0'; + second_dword := '0'; + + case r.state is + when IDLE => + if l_in.valid = '1' then + v.load := l_in.load; + v.addr := lsu_sum; + v.data := l_in.data; + v.write_reg := l_in.write_reg; + v.length := l_in.length; + v.byte_reverse := l_in.byte_reverse; + v.sign_extend := l_in.sign_extend; + v.update := l_in.update; + v.update_reg := l_in.update_reg; + v.xerc := l_in.xerc; + v.reserve := l_in.reserve; + 
v.rc := l_in.rc; - v.valid := l_in.valid; - v.load := l_in.load; - v.write_reg := l_in.write_reg; - v.length := l_in.length; - v.byte_reverse := l_in.byte_reverse; - v.sign_extend := l_in.sign_extend; - v.update := l_in.update; - v.update_reg := l_in.update_reg; - v.xerc := l_in.xerc; - v.reserve := l_in.reserve; - v.rc := l_in.rc; - - -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- - -- - -- This will have to be replaced by a combination of implementing the - -- proper HV CI load/store instructions and having an MMU to get the I - -- bit otherwise. - if lsu_sum(31 downto 28) = "1100" then - v.nc := '1'; - else - v.nc := '0'; - end if; - - -- XXX Do length_to_sel here ? - - -- Do byte reversing and rotating for stores in the first cycle - if v.load = '0' then - byte_offset := unsigned(lsu_sum(2 downto 0)); - brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + -- XXX Temporary hack. Mark the op as non-cachable if the address + -- is the form 0xc------- + -- + -- This will have to be replaced by a combination of implementing the + -- proper HV CI load/store instructions and having an MMU to get the I + -- bit otherwise. 
+ if lsu_sum(31 downto 28) = "1100" then + v.nc := '1'; + else + v.nc := '0'; + end if; + + -- Do length_to_sel and work out if we are doing 2 dwords + long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); + byte_sel := long_sel(7 downto 0); + v.second_bytes := long_sel(15 downto 8); + + v.addr := lsu_sum; + + -- Do byte reversing and rotating for stores in the first cycle + if v.load = '0' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + brev_lenm1 := "000"; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; + for i in 0 to 7 loop + k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; + j := to_integer(k) * 8; + v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); + end loop; + end if; + + req := '1'; + stall := '1'; + if long_sel(15 downto 8) = "00000000" then + v.state := LAST_ACK_WAIT; + else + v.state := SECOND_REQ; + end if; end if; - for i in 0 to 7 loop - k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; - j := to_integer(k) * 8; - v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); - end loop; - end if; - v.addr := lsu_sum; + when SECOND_REQ => + -- compute (addr + 8) & ~7 for the second doubleword when unaligned + addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + byte_sel := r.second_bytes; + req := '1'; + stall := '1'; + v.state := FIRST_ACK_WAIT; + + when FIRST_ACK_WAIT => + stall := '1'; + if d_in.valid = '1' then + write_enable := r.load; + v.state := LAST_ACK_WAIT; + end if; + + when LAST_ACK_WAIT => + stall := '1'; + second_dword := or (r.second_bytes); + if d_in.valid = '1' then + write_enable := r.load; + if r.load = '1' and r.update = '1' then + -- loads with rA update need an extra cycle + v.state := LD_UPDATE; + else + -- stores write back rA update in this cycle + do_update := r.update; + stall := '0'; + done := '1'; + v.state := IDLE; + end if; + end if; + + when LD_UPDATE => + do_update := '1'; + v.state := IDLE; + done := '1'; + end 
case; -- Update registers rin <= v; - -- Update outputs - l_out <= r; + -- Update outputs to dcache + d_out.valid <= req; + d_out.load <= v.load; + d_out.nc <= v.nc; + d_out.reserve <= v.reserve; + d_out.addr <= addr; + d_out.data <= v.data; + d_out.byte_sel <= byte_sel; + + -- Update outputs to writeback + -- Multiplex either cache data to the destination GPR or + -- the address for the rA update. + l_out.valid <= done; + if do_update = '1' then + l_out.write_enable <= '1'; + l_out.write_reg <= r.update_reg; + l_out.write_data <= r.addr; + l_out.write_len <= x"8"; + l_out.write_shift <= "000"; + l_out.sign_extend <= '0'; + l_out.byte_reverse <= '0'; + l_out.second_word <= '0'; + l_out.rc <= '0'; + l_out.store_done <= '0'; + else + l_out.write_enable <= write_enable; + l_out.write_reg <= r.write_reg; + l_out.write_data <= d_in.data; + l_out.write_len <= r.length; + l_out.write_shift <= r.addr(2 downto 0); + l_out.sign_extend <= r.sign_extend; + l_out.byte_reverse <= r.byte_reverse; + l_out.second_word <= second_dword; + l_out.rc <= r.rc and done; + l_out.store_done <= d_in.store_done; + end if; + l_out.xerc <= r.xerc; + + stall_out <= stall; - -- Asynchronous output of the low-order address bits (latched in dcache) - l_out.early_low_addr <= lsu_sum(11 downto 0); - l_out.early_valid <= l_in.valid; end process; end; diff --git a/writeback.vhdl b/writeback.vhdl index 0151561..d52bb54 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -11,7 +11,7 @@ entity writeback is clk : in std_ulogic; e_in : in Execute1ToWritebackType; - l_in : in DcacheToWritebackType; + l_in : in Loadstore1ToWritebackType; w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType;