diff --git a/common.vhdl b/common.vhdl index 4b879a1..f581ccb 100644 --- a/common.vhdl +++ b/common.vhdl @@ -218,22 +218,20 @@ package common is valid : std_ulogic; load : std_ulogic; nc : std_ulogic; + reserve : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); - write_reg : gpr_index_t; - length : std_ulogic_vector(3 downto 0); - byte_reverse : std_ulogic; - sign_extend : std_ulogic; - update : std_ulogic; - update_reg : gpr_index_t; - xerc : xer_common_t; - reserve : std_ulogic; - rc : std_ulogic; - early_low_addr : std_ulogic_vector(11 downto 0); - early_valid : std_ulogic; + byte_sel : std_ulogic_vector(7 downto 0); + end record; + + type DcacheToLoadstore1Type is record + valid : std_ulogic; + data : std_ulogic_vector(63 downto 0); + store_done : std_ulogic; + error : std_ulogic; end record; - type DcacheToWritebackType is record + type Loadstore1ToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; write_reg : gpr_index_t; @@ -247,9 +245,9 @@ package common is rc : std_ulogic; store_done : std_ulogic; end record; - constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', - byte_reverse => '0', second_word => '0', xerc => xerc_init, - rc => '0', store_done => '0', others => (others => '0')); + constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', + byte_reverse => '0', second_word => '0', xerc => xerc_init, + rc => '0', store_done => '0', others => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; diff --git a/core.vhdl b/core.vhdl index 87e73a4..d535f7a 100644 --- a/core.vhdl +++ b/core.vhdl @@ -61,8 +61,11 @@ architecture behave of core is -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; + signal loadstore1_to_writeback: Loadstore1ToWritebackType; + + -- dcache signals signal loadstore1_to_dcache: Loadstore1ToDcacheType; - signal dcache_to_writeback: DcacheToWritebackType; + signal dcache_to_loadstore1: DcacheToLoadstore1Type; -- local signals signal fetch1_stall_in : std_ulogic; @@ -73,6 +76,7 @@ architecture behave of core is signal decode2_stall_out : std_ulogic; signal ex1_icache_inval: std_ulogic; signal ex1_stall_out: std_ulogic; + signal ls1_stall_out: std_ulogic; signal dcache_stall_out: std_ulogic; signal flush: std_ulogic; @@ -196,7 +200,7 @@ begin c_in => cr_file_to_decode2, c_out => decode2_to_cr_file ); - decode2_stall_in <= ex1_stall_out or dcache_stall_out; + decode2_stall_in <= ex1_stall_out or ls1_stall_out; register_file_0: entity work.register_file generic map ( @@ -243,8 +247,13 @@ begin loadstore1_0: entity work.loadstore1 port map ( clk => clk, + rst => core_rst, l_in => execute1_to_loadstore1, - l_out => loadstore1_to_dcache + l_out => loadstore1_to_writeback, + d_out => loadstore1_to_dcache, + d_in => dcache_to_loadstore1, + dc_stall => dcache_stall_out, + stall_out => ls1_stall_out ); dcache_0: entity work.dcache @@ -257,7 +266,7 @@ begin clk => clk, rst => core_rst, d_in => loadstore1_to_dcache, - d_out => dcache_to_writeback, + d_out => dcache_to_loadstore1, stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out @@ -267,7 +276,7 @@ begin port map ( clk => clk, e_in => execute1_to_writeback, - l_in => dcache_to_writeback, + l_in => loadstore1_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, complete_out => complete diff --git a/dcache.vhdl b/dcache.vhdl index bcc7590..7e553bf 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -7,9 +7,6 @@ -- * Complete load misses on the cycle when WB data comes instead of -- at the end of line (this requires dealing with requests coming in -- while not idle...) --- * Load with update could use one less non-pipelined cycle by moving --- the register update to the pipeline bubble that exists when going --- back to the IDLE state. -- library ieee; use ieee.std_logic_1164.all; @@ -35,7 +32,7 @@ entity dcache is rst : in std_ulogic; d_in : in Loadstore1ToDcacheType; - d_out : out DcacheToWritebackType; + d_out : out DcacheToLoadstore1Type; stall_out : out std_ulogic; @@ -113,6 +110,8 @@ architecture rtl of dcache is attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; + signal r0 : Loadstore1ToDcacheType; + -- Type of operation on a "valid" input type op_t is (OP_NONE, OP_LOAD_HIT, -- Cache hit on load @@ -124,10 +123,8 @@ architecture rtl of dcache is -- Cache state machine type state_t is (IDLE, -- Normal load hit processing - PRE_NEXT_DWORD, -- Extra state before NEXT_DWORD - NEXT_DWORD, -- Starting the 2nd xfer of misaligned - LOAD_UPDATE, -- Load with update extra cycle RELOAD_WAIT_ACK, -- Cache reload wait ack + FINISH_LD_MISS, -- Extra cycle after load miss STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack @@ -158,15 +155,6 @@ architecture rtl of dcache is hit_way : way_t; hit_load_valid : std_ulogic; - -- Info for doing the second transfer of a misaligned load/store - two_dwords : std_ulogic; - second_dword : std_ulogic; - next_addr : std_ulogic_vector(63 downto 0); - next_sel : std_ulogic_vector(7 downto 0); - - -- Register update (load/store with update) - update_valid : std_ulogic; - -- Data buffer for "slow" read ops (load miss and NC loads). slow_data : std_ulogic_vector(63 downto 0); slow_valid : std_ulogic; @@ -200,12 +188,8 @@ architecture rtl of dcache is signal req_tag : cache_tag_t; signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); - signal req_addr : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0); - signal req_sel : std_ulogic_vector(7 downto 0); - signal next_addr : std_ulogic_vector(63 downto 0); - signal early_req_addr : std_ulogic_vector(11 downto 0); signal early_req_row : row_t; signal cancel_store : std_ulogic; @@ -222,10 +206,8 @@ architecture rtl of dcache is signal replace_way : way_t; -- Wishbone read/write/cache write formatting signals - signal bus_sel : std_ulogic_vector(15 downto 0); + signal bus_sel : std_ulogic_vector(7 downto 0); - signal two_dwords : std_ulogic; - -- -- Helper functions to decode incoming requests -- @@ -305,37 +287,6 @@ architecture rtl of dcache is tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; end; - -- Generate byte enables from sizes - function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is - begin - case length is - when "0001" => - return "00000001"; - when "0010" => - return "00000011"; - when "0100" => - return "00001111"; - when "1000" => - return "11111111"; - when others => - return "00000000"; - end case; - end function length_to_sel; - - -- Calculate byte enables for wishbone - -- This returns 16 bits, giving the select signals for two transfers, - -- to account for unaligned loads or stores - function wishbone_data_sel(size : in std_logic_vector(3 downto 0); - address : in std_logic_vector(63 downto 0)) - return std_ulogic_vector is - variable longsel : std_ulogic_vector(15 downto 0); - begin - longsel := (others => '0'); - longsel(7 downto 0) := length_to_sel(size); - return std_ulogic_vector(shift_left(unsigned(longsel), - to_integer(unsigned(address(2 downto 0))))); - end function wishbone_data_sel; - begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -390,11 +341,17 @@ begin end generate; end generate; - -- Wishbone read and write and BRAM write sel bits generation - bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); - - -- See if the operation crosses two doublewords - two_dwords <= or (bus_sel(15 downto 8)); + -- Latch the request in r0 as long as we're not stalling + stage_0 : process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r0.valid <= '0'; + elsif stall_out = '0' then + r0 <= d_in; + end if; + end if; + end process; -- Cache request parsing and hit detection dcache_request : process(all) @@ -405,40 +362,21 @@ begin variable data : std_ulogic_vector(63 downto 0); variable opsel : std_ulogic_vector(3 downto 0); variable go : std_ulogic; - variable is_load : std_ulogic; - variable is_nc : std_ulogic; begin -- Extract line, row and tag from request - if r1.state /= NEXT_DWORD then - req_addr <= d_in.addr; - req_data <= d_in.data; - req_sel <= bus_sel(7 downto 0); - go := d_in.valid; - is_load := d_in.load; - is_nc := d_in.nc; - - else - req_addr <= r1.next_addr; - req_data <= r1.req.data; - req_sel <= r1.next_sel; - go := '1'; - is_load := r1.req.load; - is_nc := r1.req.nc; - end if; + req_index <= get_index(r0.addr); + req_row <= get_row(r0.addr); + req_tag <= get_tag(r0.addr); - req_index <= get_index(req_addr); - req_row <= get_row(req_addr); - req_tag <= get_tag(req_addr); + -- Only do anything if not being stalled by stage 1 + go := r0.valid and not stall_out; -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed -- - req_laddr <= req_addr(63 downto LINE_OFF_BITS) & + req_laddr <= r0.addr(63 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); - -- Address of next doubleword, used for unaligned accesses - next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000"; - -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; @@ -460,7 +398,7 @@ begin -- Combine the request and cache his status to decide what -- operation needs to be done -- - opsel := go & is_load & is_nc & is_hit; + opsel := go & r0.load & r0.nc & is_hit; case opsel is when "1101" => op := OP_LOAD_HIT; when "1100" => op := OP_LOAD_MISS; @@ -475,16 +413,15 @@ begin req_op <= op; - -- Versions of the address and row number that are valid one cycle earlier + -- Version of the row number that is valid one cycle earlier -- in the cases where we need to read the cache data BRAM. - if r1.state = IDLE and op = OP_LOAD_HIT and two_dwords = '1' then - early_req_addr <= next_addr(11 downto 0); - elsif r1.state /= IDLE and r1.two_dwords = '1' and r1.second_dword = '0' then - early_req_addr <= r1.next_addr(11 downto 0); + -- If we're stalling then we need to keep reading the last + -- row requested. + if stall_out = '0' then + early_req_row <= get_row(d_in.addr); else - early_req_addr <= d_in.early_low_addr; + early_req_row <= req_row; end if; - early_req_row <= get_row(x"0000000000000" & early_req_addr); end process; -- Wire up wishbone request latch out of stage 1 @@ -502,17 +439,17 @@ begin cancel_store <= '0'; set_rsrv <= '0'; clear_rsrv <= '0'; - if d_in.valid = '1' and d_in.reserve = '1' then + if stall_out = '0' and r0.valid = '1' and r0.reserve = '1' then -- XXX generate alignment interrupt if address is not aligned - -- XXX or if d_in.nc = '1' - if d_in.load = '1' then + -- XXX or if r0.nc = '1' + if r0.load = '1' then -- load with reservation set_rsrv <= '1'; else -- store conditional clear_rsrv <= '1'; if reservation.valid = '0' or - d_in.addr(63 downto LINE_OFF_BITS) /= reservation.addr then + r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then cancel_store <= '1'; end if; end if; @@ -526,28 +463,19 @@ begin reservation.valid <= '0'; elsif set_rsrv = '1' then reservation.valid <= '1'; - reservation.addr <= d_in.addr(63 downto LINE_OFF_BITS); + reservation.addr <= r0.addr(63 downto LINE_OFF_BITS); end if; end if; end process; - -- Writeback (loads and reg updates) & completion control logic + -- Return data for loads & completion control logic -- writeback_control: process(all) begin - -- The mux on d_out.write reg defaults to the normal load hit case. - d_out.write_enable <= '0'; + -- The mux on d_out.data defaults to the normal load hit case. d_out.valid <= '0'; - d_out.write_reg <= r1.req.write_reg; - d_out.write_data <= cache_out(r1.hit_way); - d_out.write_len <= r1.req.length; - d_out.write_shift <= r1.req.addr(2 downto 0); - d_out.sign_extend <= r1.req.sign_extend; - d_out.byte_reverse <= r1.req.byte_reverse; - d_out.second_word <= r1.second_dword; - d_out.xerc <= r1.req.xerc; - d_out.rc <= '0'; -- loads never have rc=1 + d_out.data <= cache_out(r1.hit_way); d_out.store_done <= '0'; -- We have a valid load or store hit or we just completed a slow @@ -561,30 +489,17 @@ begin -- -- Sanity: Only one of these must be set in any given cycle - assert (r1.update_valid and r1.hit_load_valid) /= '1' report - "unexpected hit_load_delayed collision with update_valid" - severity FAILURE; assert (r1.slow_valid and r1.stcx_fail) /= '1' report "unexpected slow_valid collision with stcx_fail" severity FAILURE; assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; - assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report - "unexpected update_valid collision with slow_valid or stcx_fail" - severity FAILURE; -- Load hit case is the standard path if r1.hit_load_valid = '1' then - d_out.write_enable <= '1'; - - -- If there isn't another dword to go and - -- it's not a load with update, complete it now - if (r1.second_dword or not r1.two_dwords) = '1' and - r1.req.update = '0' then - report "completing load hit"; - d_out.valid <= '1'; - end if; + report "completing load hit"; + d_out.valid <= '1'; end if; -- Slow ops (load miss, NC, stores) @@ -593,63 +508,20 @@ begin -- mux accordingly -- if r1.req.load then - d_out.write_reg <= r1.req.write_reg; - d_out.write_enable <= '1'; - - -- Read data comes from the slow data latch, formatter - -- from the latched request. - -- - d_out.write_data <= r1.slow_data; - d_out.write_shift <= r1.req.addr(2 downto 0); - d_out.sign_extend <= r1.req.sign_extend; - d_out.byte_reverse <= r1.req.byte_reverse; - d_out.write_len <= r1.req.length; - d_out.xerc <= r1.req.xerc; - d_out.second_word <= r1.second_dword; + -- Read data comes from the slow data latch + d_out.data <= r1.slow_data; end if; - d_out.rc <= r1.req.rc; d_out.store_done <= '1'; - -- If it's a store or a non-update load form, complete now - -- unless we need to do another dword transfer - if (r1.req.load = '0' or r1.req.update = '0') and - (r1.two_dwords = '0' or r1.second_dword = '1') then - report "completing store or load miss"; - d_out.valid <= '1'; - end if; + report "completing store or load miss"; + d_out.valid <= '1'; end if; if r1.stcx_fail = '1' then - d_out.rc <= r1.req.rc; d_out.store_done <= '0'; d_out.valid <= '1'; end if; - -- We have a register update to do. - if r1.update_valid = '1' then - d_out.write_enable <= '1'; - d_out.write_reg <= r1.req.update_reg; - - -- Change the read data mux to the address that's going into - -- the register and the formatter does nothing. - -- - d_out.write_data <= r1.req.addr; - d_out.write_shift <= "000"; - d_out.write_len <= "1000"; - d_out.sign_extend <= '0'; - d_out.byte_reverse <= '0'; - d_out.xerc <= r1.req.xerc; - d_out.second_word <= '0'; - - -- If it was a load, this completes the operation (load with - -- update case). - -- - if r1.req.load = '1' then - report "completing after load update"; - d_out.valid <= '1'; - end if; - end if; - end process; -- @@ -703,11 +575,11 @@ begin -- For timing, the mux on wr_data/sel/addr is not dependent on anything -- other than the current state. Only the do_write signal is. -- - if r1.state = IDLE or r1.state = NEXT_DWORD then - -- In these states, the only write path is the store-hit update case + if r1.state = IDLE then + -- In IDLE state, the only write path is the store-hit update case wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= req_data; - wr_sel <= req_sel; + wr_data <= r0.data; + wr_sel <= r0.byte_sel; else -- Otherwise, we might be doing a reload wr_data <= wishbone_in.dat; @@ -731,35 +603,25 @@ begin end generate; -- - -- Cache hit synchronous machine for the easy case. This handles - -- non-update form load hits + -- Cache hit synchronous machine for the easy case. This handles load hits. -- dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- If we have a request incoming, we have to latch it as d_in.valid + -- If we have a request incoming, we have to latch it as r0.valid -- is only set for a single cycle. It's up to the control logic to -- ensure we don't override an uncompleted request (for now we are -- single issue on load/stores so we are fine, later, we can generate -- a stall output if necessary). - if req_op /= OP_NONE and d_in.valid = '1' then - r1.req <= d_in; - r1.second_dword <= '0'; - r1.two_dwords <= two_dwords; - r1.next_addr <= next_addr; - r1.next_sel <= bus_sel(15 downto 8); - + if req_op /= OP_NONE and stall_out = '0' then + r1.req <= r0; report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(d_in.addr) & - " upd:" & std_ulogic'image(d_in.update) & - " nc:" & std_ulogic'image(d_in.nc) & - " reg:" & to_hstring(d_in.write_reg) & + " addr:" & to_hstring(r0.addr) & + " nc:" & std_ulogic'image(r0.nc) & " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); - elsif r1.state = NEXT_DWORD then - r1.second_dword <= '1'; end if; -- Fast path for load/store hits. Set signals for the writeback controls. @@ -776,7 +638,6 @@ begin -- Every other case is handled by this state machine: -- -- * Cache load miss/reload (in conjunction with "rams") - -- * Load hits for update forms -- * Load hits for non-cachable forms -- * Stores (the collision case is handled in "rams") -- @@ -795,7 +656,6 @@ begin end loop; r1.state <= IDLE; r1.slow_valid <= '0'; - r1.update_valid <= '0'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; @@ -804,39 +664,19 @@ begin else -- One cycle pulses reset r1.slow_valid <= '0'; - r1.update_valid <= '0'; r1.stcx_fail <= '0'; - -- We cannot currently process a new request when not idle - assert d_in.valid = '0' or r1.state = IDLE report "request " & - op_t'image(req_op) & " while in state " & state_t'image(r1.state) - severity FAILURE; - -- Main state machine case r1.state is - when IDLE | NEXT_DWORD => + when IDLE => case req_op is when OP_LOAD_HIT => - if r1.state = IDLE then - -- If the load is misaligned then we will need to start - -- the state machine - if two_dwords = '1' then - r1.state <= NEXT_DWORD; - elsif d_in.update = '1' then - r1.state <= LOAD_UPDATE; - end if; - else - if r1.req.update = '1' then - r1.state <= LOAD_UPDATE; - else - r1.state <= IDLE; - end if; - end if; + -- stay in IDLE state - when OP_LOAD_MISS => + when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(req_addr) & + report "cache miss addr:" & to_hstring(r0.addr) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); @@ -871,19 +711,17 @@ begin r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => - r1.wb.sel <= req_sel; - r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.sel <= r0.byte_sel; + r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - -- For store-with-update do the register update - r1.update_valid <= d_in.valid and d_in.update; - r1.wb.sel <= req_sel; - r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= req_data; + r1.wb.sel <= r0.byte_sel; + r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.dat <= r0.data; if cancel_store = '0' then r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -899,9 +737,6 @@ begin when OP_BAD => end case; - when PRE_NEXT_DWORD => - r1.state <= NEXT_DWORD; - when RELOAD_WAIT_ACK => -- Requests are all sent if stb is 0 stbs_done := r1.wb.stb = '0'; @@ -943,31 +778,23 @@ begin -- Cache line is now valid cache_valids(r1.store_index)(r1.store_way) <= '1'; - -- Write back the load data that we got, and start - -- the second dword if necessary. Otherwise, see if - -- we also need to do the deferred update cycle. - r1.slow_valid <= '1'; - if r1.two_dwords and not r1.second_dword then - r1.state <= PRE_NEXT_DWORD; - elsif r1.req.update = '1' then - r1.state <= LOAD_UPDATE; - report "completing miss with load-update !"; - else - r1.state <= IDLE; - report "completing miss !"; - end if; + -- Don't complete and go idle until next cycle, in + -- case the next request is for the last dword of + -- the cache line we just loaded. + r1.state <= FINISH_LD_MISS; end if; -- Increment store row counter r1.store_row <= next_row(r1.store_row); end if; - when LOAD_UPDATE => - -- We need the extra cycle to complete a load with update - r1.update_valid <= '1'; - r1.state <= IDLE; + when FINISH_LD_MISS => + -- Write back the load data that we got + r1.slow_valid <= '1'; + r1.state <= IDLE; + report "completing miss !"; - when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => + when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => -- Clear stb when slave accepted request if wishbone_in.stall = '0' then r1.wb.stb <= '0'; @@ -975,16 +802,10 @@ begin -- Got ack ? complete. if wishbone_in.ack = '1' then - if r1.two_dwords and not r1.second_dword then - r1.state <= NEXT_DWORD; - elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then - r1.state <= LOAD_UPDATE; - else - r1.state <= IDLE; - end if; if r1.state = NC_LOAD_WAIT_ACK then r1.slow_data <= wishbone_in.dat; end if; + r1.state <= IDLE; r1.slow_valid <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index 437fd7d..bd8341a 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -13,7 +13,7 @@ architecture behave of dcache_tb is signal rst : std_ulogic; signal d_in : Loadstore1ToDcacheType; - signal d_out : DcacheToWritebackType; + signal d_out : DcacheToLoadstore1Type; signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -71,12 +71,6 @@ begin d_in.nc <= '0'; d_in.addr <= (others => '0'); d_in.data <= (others => '0'); - d_in.write_reg <= (others => '0'); - d_in.length <= (others => '0'); - d_in.byte_reverse <= '0'; - d_in.sign_extend <= '0'; - d_in.update <= '0'; - d_in.update_reg <= (others => '0'); wait for 4*clk_period; wait until rising_edge(clk); @@ -89,11 +83,10 @@ begin wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000000100000000" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000000100000000" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000000100000000" severity failure; -- wait for clk_period; @@ -106,11 +99,10 @@ begin wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000000D0000000C" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000000D0000000C" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000000D0000000C" severity failure; @@ -121,11 +113,10 @@ begin d_in.valid <= '1'; wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; - assert d_out.valid = '1'; - assert d_out.write_data = x"0000004100000040" + wait until rising_edge(clk) and d_out.valid = '1'; + assert d_out.data = x"0000004100000040" report "data @" & to_hstring(d_in.addr) & - "=" & to_hstring(d_out.write_data) & + "=" & to_hstring(d_out.data) & " expected 0000004100000040" severity failure; diff --git a/execute1.vhdl b/execute1.vhdl index b1662b7..b05fd4d 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -42,7 +42,6 @@ architecture behaviour of execute1 is next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; div_in_progress : std_ulogic; - ldst_in_progress : std_ulogic; cntz_in_progress : std_ulogic; slow_op_dest : gpr_index_t; slow_op_rc : std_ulogic; @@ -264,7 +263,6 @@ begin v.mul_in_progress := '0'; v.div_in_progress := '0'; v.cntz_in_progress := '0'; - v.ldst_in_progress := '0'; -- signals to multiply unit x_to_multiply <= Execute1ToMultiplyInit; @@ -662,8 +660,6 @@ begin when OP_LOAD | OP_STORE => -- loadstore/dcache has its own port to writeback v.e.valid := '0'; - stall_out <= '1'; - v.ldst_in_progress := '1'; when others => terminate_out <= '1'; @@ -703,10 +699,6 @@ begin v.e.rc := v.slow_op_rc; v.e.xerc := v.slow_op_xerc; v.e.valid := '1'; - elsif r.ldst_in_progress = '1' then - -- assert stall for 2 cycles on load/store, then - -- the stall output from dcache takes over - stall_out <= '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or (r.div_in_progress = '1' and divider_to_x.valid = '1') then diff --git a/loadstore1.vhdl b/loadstore1.vhdl index a25e617..2ab71ad 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -12,16 +12,85 @@ use work.helpers.all; entity loadstore1 is port ( clk : in std_ulogic; + rst : in std_ulogic; l_in : in Execute1ToLoadstore1Type; + l_out : out Loadstore1ToWritebackType; - l_out : out Loadstore1ToDcacheType + d_out : out Loadstore1ToDcacheType; + d_in : in DcacheToLoadstore1Type; + + dc_stall : in std_ulogic; + stall_out : out std_ulogic ); end loadstore1; +-- Note, we don't currently use the stall output from the dcache because +-- we know it can take two requests without stalling when idle, we are +-- its only user, and we know it never stalls when idle. + architecture behave of loadstore1 is - signal r, rin : Loadstore1ToDcacheType; + + -- State machine for unaligned loads/stores + type state_t is (IDLE, -- ready for instruction + SECOND_REQ, -- send 2nd request of unaligned xfer + FIRST_ACK_WAIT, -- waiting for 1st ack from dcache + LAST_ACK_WAIT, -- waiting for last ack from dcache + LD_UPDATE -- writing rA with computed addr on load + ); + + type reg_stage_t is record + -- latch most of the input request + load : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + data : std_ulogic_vector(63 downto 0); + write_reg : gpr_index_t; + length : std_ulogic_vector(3 downto 0); + byte_reverse : std_ulogic; + sign_extend : std_ulogic; + update : std_ulogic; + update_reg : gpr_index_t; + xerc : xer_common_t; + reserve : std_ulogic; + rc : std_ulogic; + nc : std_ulogic; -- non-cacheable access + state : state_t; + second_bytes : std_ulogic_vector(7 downto 0); + end record; + + signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); + + -- Generate byte enables from sizes + function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is + begin + case length is + when "0001" => + return "00000001"; + when "0010" => + return "00000011"; + when "0100" => + return "00001111"; + when "1000" => + return "11111111"; + when others => + return "00000000"; + end case; + end function length_to_sel; + + -- Calculate byte enables + -- This returns 16 bits, giving the select signals for two transfers, + -- to account for unaligned loads or stores + function xfer_data_sel(size : in std_logic_vector(3 downto 0); + address : in std_logic_vector(2 downto 0)) + return std_ulogic_vector is + variable longsel : std_ulogic_vector(15 downto 0); + begin + longsel := "00000000" & length_to_sel(size); + return std_ulogic_vector(shift_left(unsigned(longsel), + to_integer(unsigned(address)))); + end function xfer_data_sel; + begin -- Calculate the address in the first cycle lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0'); @@ -29,69 +98,180 @@ begin loadstore1_0: process(clk) begin if rising_edge(clk) then - r <= rin; + if rst = '1' then + r.state <= IDLE; + else + r <= rin; + end if; end if; end process; loadstore1_1: process(all) - variable v : Loadstore1ToDcacheType; + variable v : reg_stage_t; variable brev_lenm1 : unsigned(2 downto 0); variable byte_offset : unsigned(2 downto 0); variable j : integer; variable k : unsigned(2 downto 0); + variable long_sel : std_ulogic_vector(15 downto 0); + variable byte_sel : std_ulogic_vector(7 downto 0); + variable req : std_ulogic; + variable stall : std_ulogic; + variable addr : std_ulogic_vector(63 downto 0); + variable wdata : std_ulogic_vector(63 downto 0); + variable write_enable : std_ulogic; + variable do_update : std_ulogic; + variable second_dword : std_ulogic; + variable done : std_ulogic; begin v := r; + req := '0'; + stall := '0'; + done := '0'; + byte_sel := (others => '0'); + addr := lsu_sum; + + write_enable := '0'; + do_update := '0'; + second_dword := '0'; + + case r.state is + when IDLE => + if l_in.valid = '1' then + v.load := l_in.load; + v.addr := lsu_sum; + v.data := l_in.data; + v.write_reg := l_in.write_reg; + v.length := l_in.length; + v.byte_reverse := l_in.byte_reverse; + v.sign_extend := l_in.sign_extend; + v.update := l_in.update; + v.update_reg := l_in.update_reg; + v.xerc := l_in.xerc; + v.reserve := l_in.reserve; + v.rc := l_in.rc; - v.valid := l_in.valid; - v.load := l_in.load; - v.write_reg := l_in.write_reg; - v.length := l_in.length; - v.byte_reverse := l_in.byte_reverse; - v.sign_extend := l_in.sign_extend; - v.update := l_in.update; - v.update_reg := l_in.update_reg; - v.xerc := l_in.xerc; - v.reserve := l_in.reserve; - v.rc := l_in.rc; - - -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- - -- - -- This will have to be replaced by a combination of implementing the - -- proper HV CI load/store instructions and having an MMU to get the I - -- bit otherwise. - if lsu_sum(31 downto 28) = "1100" then - v.nc := '1'; - else - v.nc := '0'; - end if; - - -- XXX Do length_to_sel here ? - - -- Do byte reversing and rotating for stores in the first cycle - if v.load = '0' then - byte_offset := unsigned(lsu_sum(2 downto 0)); - brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + -- XXX Temporary hack. Mark the op as non-cachable if the address + -- is the form 0xc------- + -- + -- This will have to be replaced by a combination of implementing the + -- proper HV CI load/store instructions and having an MMU to get the I + -- bit otherwise. + if lsu_sum(31 downto 28) = "1100" then + v.nc := '1'; + else + v.nc := '0'; + end if; + + -- Do length_to_sel and work out if we are doing 2 dwords + long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); + byte_sel := long_sel(7 downto 0); + v.second_bytes := long_sel(15 downto 8); + + v.addr := lsu_sum; + + -- Do byte reversing and rotating for stores in the first cycle + if v.load = '0' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + brev_lenm1 := "000"; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; + for i in 0 to 7 loop + k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; + j := to_integer(k) * 8; + v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); + end loop; + end if; + + req := '1'; + stall := '1'; + if long_sel(15 downto 8) = "00000000" then + v.state := LAST_ACK_WAIT; + else + v.state := SECOND_REQ; + end if; end if; - for i in 0 to 7 loop - k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; - j := to_integer(k) * 8; - v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); - end loop; - end if; - v.addr := lsu_sum; + when SECOND_REQ => + -- compute (addr + 8) & ~7 for the second doubleword when unaligned + addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + byte_sel := r.second_bytes; + req := '1'; + stall := '1'; + v.state := FIRST_ACK_WAIT; + + when FIRST_ACK_WAIT => + stall := '1'; + if d_in.valid = '1' then + write_enable := r.load; + v.state := LAST_ACK_WAIT; + end if; + + when LAST_ACK_WAIT => + stall := '1'; + second_dword := or (r.second_bytes); + if d_in.valid = '1' then + write_enable := r.load; + if r.load = '1' and r.update = '1' then + -- loads with rA update need an extra cycle + v.state := LD_UPDATE; + else + -- stores write back rA update in this cycle + do_update := r.update; + stall := '0'; + done := '1'; + v.state := IDLE; + end if; + end if; + + when LD_UPDATE => + do_update := '1'; + v.state := IDLE; + done := '1'; + end case; -- Update registers rin <= v; - -- Update outputs - l_out <= r; + -- Update outputs to dcache + d_out.valid <= req; + d_out.load <= v.load; + d_out.nc <= v.nc; + d_out.reserve <= v.reserve; + d_out.addr <= addr; + d_out.data <= v.data; + d_out.byte_sel <= byte_sel; + + -- Update outputs to writeback + -- Multiplex either cache data to the destination GPR or + -- the address for the rA update. + l_out.valid <= done; + if do_update = '1' then + l_out.write_enable <= '1'; + l_out.write_reg <= r.update_reg; + l_out.write_data <= r.addr; + l_out.write_len <= x"8"; + l_out.write_shift <= "000"; + l_out.sign_extend <= '0'; + l_out.byte_reverse <= '0'; + l_out.second_word <= '0'; + l_out.rc <= '0'; + l_out.store_done <= '0'; + else + l_out.write_enable <= write_enable; + l_out.write_reg <= r.write_reg; + l_out.write_data <= d_in.data; + l_out.write_len <= r.length; + l_out.write_shift <= r.addr(2 downto 0); + l_out.sign_extend <= r.sign_extend; + l_out.byte_reverse <= r.byte_reverse; + l_out.second_word <= second_dword; + l_out.rc <= r.rc and done; + l_out.store_done <= d_in.store_done; + end if; + l_out.xerc <= r.xerc; + + stall_out <= stall; - -- Asynchronous output of the low-order address bits (latched in dcache) - l_out.early_low_addr <= lsu_sum(11 downto 0); - l_out.early_valid <= l_in.valid; end process; end; diff --git a/writeback.vhdl b/writeback.vhdl index 0151561..d52bb54 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -11,7 +11,7 @@ entity writeback is clk : in std_ulogic; e_in : in Execute1ToWritebackType; - l_in : in DcacheToWritebackType; + l_in : in Loadstore1ToWritebackType; w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType;