loadstore1: Move logic from dcache to loadstore1

So that the dcache could in future be used by an MMU, this moves
logic to do with data formatting, rA updates for update-form
instructions, and handling of unaligned loads and stores out of
dcache and into loadstore1.  For now, dcache connects only to
loadstore1, and loadstore1 now has the connection to writeback.

Dcache generates a stall signal to loadstore1 which indicates that
the request presented in the current cycle was not accepted and
should be presented again.  However, loadstore1 doesn't currently
use it because we know that we can never hit the circumstances
where it might be set.

For unaligned transfers, loadstore1 generates two requests to
dcache back-to-back, and then waits to see two acks back from
dcache (cycles where d_in.valid is true).

Loadstore1 now has an FSM for tracking how many acks we are
expecting from dcache and for doing the rA update cycles when
necessary.  Handling for reservations and conditional stores is
still in dcache.

Loadstore1 now generates its own stall signal back to decode2,
so we no longer need the logic in execute1 that generated the stall
for the first two cycles.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/153/head
Paul Mackerras 5 years ago
parent ef9c1efd72
commit b349cc891a

@ -218,22 +218,20 @@ package common is
valid : std_ulogic; valid : std_ulogic;
load : std_ulogic; load : std_ulogic;
nc : std_ulogic; nc : std_ulogic;
reserve : std_ulogic;
addr : std_ulogic_vector(63 downto 0); addr : std_ulogic_vector(63 downto 0);
data : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0);
write_reg : gpr_index_t; byte_sel : std_ulogic_vector(7 downto 0);
length : std_ulogic_vector(3 downto 0); end record;
byte_reverse : std_ulogic;
sign_extend : std_ulogic; type DcacheToLoadstore1Type is record
update : std_ulogic; valid : std_ulogic;
update_reg : gpr_index_t; data : std_ulogic_vector(63 downto 0);
xerc : xer_common_t; store_done : std_ulogic;
reserve : std_ulogic; error : std_ulogic;
rc : std_ulogic;
early_low_addr : std_ulogic_vector(11 downto 0);
early_valid : std_ulogic;
end record; end record;


type DcacheToWritebackType is record type Loadstore1ToWritebackType is record
valid : std_ulogic; valid : std_ulogic;
write_enable: std_ulogic; write_enable: std_ulogic;
write_reg : gpr_index_t; write_reg : gpr_index_t;
@ -247,9 +245,9 @@ package common is
rc : std_ulogic; rc : std_ulogic;
store_done : std_ulogic; store_done : std_ulogic;
end record; end record;
constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0',
byte_reverse => '0', second_word => '0', xerc => xerc_init, byte_reverse => '0', second_word => '0', xerc => xerc_init,
rc => '0', store_done => '0', others => (others => '0')); rc => '0', store_done => '0', others => (others => '0'));


type Execute1ToWritebackType is record type Execute1ToWritebackType is record
valid: std_ulogic; valid: std_ulogic;

@ -61,8 +61,11 @@ architecture behave of core is


-- load store signals -- load store signals
signal execute1_to_loadstore1: Execute1ToLoadstore1Type; signal execute1_to_loadstore1: Execute1ToLoadstore1Type;
signal loadstore1_to_writeback: Loadstore1ToWritebackType;

-- dcache signals
signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal loadstore1_to_dcache: Loadstore1ToDcacheType;
signal dcache_to_writeback: DcacheToWritebackType; signal dcache_to_loadstore1: DcacheToLoadstore1Type;


-- local signals -- local signals
signal fetch1_stall_in : std_ulogic; signal fetch1_stall_in : std_ulogic;
@ -73,6 +76,7 @@ architecture behave of core is
signal decode2_stall_out : std_ulogic; signal decode2_stall_out : std_ulogic;
signal ex1_icache_inval: std_ulogic; signal ex1_icache_inval: std_ulogic;
signal ex1_stall_out: std_ulogic; signal ex1_stall_out: std_ulogic;
signal ls1_stall_out: std_ulogic;
signal dcache_stall_out: std_ulogic; signal dcache_stall_out: std_ulogic;


signal flush: std_ulogic; signal flush: std_ulogic;
@ -196,7 +200,7 @@ begin
c_in => cr_file_to_decode2, c_in => cr_file_to_decode2,
c_out => decode2_to_cr_file c_out => decode2_to_cr_file
); );
decode2_stall_in <= ex1_stall_out or dcache_stall_out; decode2_stall_in <= ex1_stall_out or ls1_stall_out;


register_file_0: entity work.register_file register_file_0: entity work.register_file
generic map ( generic map (
@ -243,8 +247,13 @@ begin
loadstore1_0: entity work.loadstore1 loadstore1_0: entity work.loadstore1
port map ( port map (
clk => clk, clk => clk,
rst => core_rst,
l_in => execute1_to_loadstore1, l_in => execute1_to_loadstore1,
l_out => loadstore1_to_dcache l_out => loadstore1_to_writeback,
d_out => loadstore1_to_dcache,
d_in => dcache_to_loadstore1,
dc_stall => dcache_stall_out,
stall_out => ls1_stall_out
); );


dcache_0: entity work.dcache dcache_0: entity work.dcache
@ -257,7 +266,7 @@ begin
clk => clk, clk => clk,
rst => core_rst, rst => core_rst,
d_in => loadstore1_to_dcache, d_in => loadstore1_to_dcache,
d_out => dcache_to_writeback, d_out => dcache_to_loadstore1,
stall_out => dcache_stall_out, stall_out => dcache_stall_out,
wishbone_in => wishbone_data_in, wishbone_in => wishbone_data_in,
wishbone_out => wishbone_data_out wishbone_out => wishbone_data_out
@ -267,7 +276,7 @@ begin
port map ( port map (
clk => clk, clk => clk,
e_in => execute1_to_writeback, e_in => execute1_to_writeback,
l_in => dcache_to_writeback, l_in => loadstore1_to_writeback,
w_out => writeback_to_register_file, w_out => writeback_to_register_file,
c_out => writeback_to_cr_file, c_out => writeback_to_cr_file,
complete_out => complete complete_out => complete

@ -7,9 +7,6 @@
-- * Complete load misses on the cycle when WB data comes instead of -- * Complete load misses on the cycle when WB data comes instead of
-- at the end of line (this requires dealing with requests coming in -- at the end of line (this requires dealing with requests coming in
-- while not idle...) -- while not idle...)
-- * Load with update could use one less non-pipelined cycle by moving
-- the register update to the pipeline bubble that exists when going
-- back to the IDLE state.
-- --
library ieee; library ieee;
use ieee.std_logic_1164.all; use ieee.std_logic_1164.all;
@ -35,7 +32,7 @@ entity dcache is
rst : in std_ulogic; rst : in std_ulogic;


d_in : in Loadstore1ToDcacheType; d_in : in Loadstore1ToDcacheType;
d_out : out DcacheToWritebackType; d_out : out DcacheToLoadstore1Type;


stall_out : out std_ulogic; stall_out : out std_ulogic;


@ -113,6 +110,8 @@ architecture rtl of dcache is
attribute ram_style : string; attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed"; attribute ram_style of cache_tags : signal is "distributed";


signal r0 : Loadstore1ToDcacheType;

-- Type of operation on a "valid" input -- Type of operation on a "valid" input
type op_t is (OP_NONE, type op_t is (OP_NONE,
OP_LOAD_HIT, -- Cache hit on load OP_LOAD_HIT, -- Cache hit on load
@ -124,10 +123,8 @@ architecture rtl of dcache is
-- Cache state machine -- Cache state machine
type state_t is (IDLE, -- Normal load hit processing type state_t is (IDLE, -- Normal load hit processing
PRE_NEXT_DWORD, -- Extra state before NEXT_DWORD
NEXT_DWORD, -- Starting the 2nd xfer of misaligned
LOAD_UPDATE, -- Load with update extra cycle
RELOAD_WAIT_ACK, -- Cache reload wait ack RELOAD_WAIT_ACK, -- Cache reload wait ack
FINISH_LD_MISS, -- Extra cycle after load miss
STORE_WAIT_ACK, -- Store wait ack STORE_WAIT_ACK, -- Store wait ack
NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack


@ -158,15 +155,6 @@ architecture rtl of dcache is
hit_way : way_t; hit_way : way_t;
hit_load_valid : std_ulogic; hit_load_valid : std_ulogic;


-- Info for doing the second transfer of a misaligned load/store
two_dwords : std_ulogic;
second_dword : std_ulogic;
next_addr : std_ulogic_vector(63 downto 0);
next_sel : std_ulogic_vector(7 downto 0);

-- Register update (load/store with update)
update_valid : std_ulogic;

-- Data buffer for "slow" read ops (load miss and NC loads). -- Data buffer for "slow" read ops (load miss and NC loads).
slow_data : std_ulogic_vector(63 downto 0); slow_data : std_ulogic_vector(63 downto 0);
slow_valid : std_ulogic; slow_valid : std_ulogic;
@ -200,12 +188,8 @@ architecture rtl of dcache is
signal req_tag : cache_tag_t; signal req_tag : cache_tag_t;
signal req_op : op_t; signal req_op : op_t;
signal req_data : std_ulogic_vector(63 downto 0); signal req_data : std_ulogic_vector(63 downto 0);
signal req_addr : std_ulogic_vector(63 downto 0);
signal req_laddr : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0);
signal req_sel : std_ulogic_vector(7 downto 0);
signal next_addr : std_ulogic_vector(63 downto 0);


signal early_req_addr : std_ulogic_vector(11 downto 0);
signal early_req_row : row_t; signal early_req_row : row_t;


signal cancel_store : std_ulogic; signal cancel_store : std_ulogic;
@ -222,9 +206,7 @@ architecture rtl of dcache is
signal replace_way : way_t; signal replace_way : way_t;


-- Wishbone read/write/cache write formatting signals -- Wishbone read/write/cache write formatting signals
signal bus_sel : std_ulogic_vector(15 downto 0); signal bus_sel : std_ulogic_vector(7 downto 0);

signal two_dwords : std_ulogic;


-- --
-- Helper functions to decode incoming requests -- Helper functions to decode incoming requests
@ -305,37 +287,6 @@ architecture rtl of dcache is
tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
end; end;


-- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
case length is
when "0001" =>
return "00000001";
when "0010" =>
return "00000011";
when "0100" =>
return "00001111";
when "1000" =>
return "11111111";
when others =>
return "00000000";
end case;
end function length_to_sel;

-- Calculate byte enables for wishbone
-- This returns 16 bits, giving the select signals for two transfers,
-- to account for unaligned loads or stores
function wishbone_data_sel(size : in std_logic_vector(3 downto 0);
address : in std_logic_vector(63 downto 0))
return std_ulogic_vector is
variable longsel : std_ulogic_vector(15 downto 0);
begin
longsel := (others => '0');
longsel(7 downto 0) := length_to_sel(size);
return std_ulogic_vector(shift_left(unsigned(longsel),
to_integer(unsigned(address(2 downto 0)))));
end function wishbone_data_sel;

begin begin


assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
@ -390,11 +341,17 @@ begin
end generate; end generate;
end generate; end generate;


-- Wishbone read and write and BRAM write sel bits generation -- Latch the request in r0 as long as we're not stalling
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); stage_0 : process(clk)

begin
-- See if the operation crosses two doublewords if rising_edge(clk) then
two_dwords <= or (bus_sel(15 downto 8)); if rst = '1' then
r0.valid <= '0';
elsif stall_out = '0' then
r0 <= d_in;
end if;
end if;
end process;


-- Cache request parsing and hit detection -- Cache request parsing and hit detection
dcache_request : process(all) dcache_request : process(all)
@ -405,40 +362,21 @@ begin
variable data : std_ulogic_vector(63 downto 0); variable data : std_ulogic_vector(63 downto 0);
variable opsel : std_ulogic_vector(3 downto 0); variable opsel : std_ulogic_vector(3 downto 0);
variable go : std_ulogic; variable go : std_ulogic;
variable is_load : std_ulogic;
variable is_nc : std_ulogic;
begin begin
-- Extract line, row and tag from request -- Extract line, row and tag from request
if r1.state /= NEXT_DWORD then req_index <= get_index(r0.addr);
req_addr <= d_in.addr; req_row <= get_row(r0.addr);
req_data <= d_in.data; req_tag <= get_tag(r0.addr);
req_sel <= bus_sel(7 downto 0);
go := d_in.valid;
is_load := d_in.load;
is_nc := d_in.nc;

else
req_addr <= r1.next_addr;
req_data <= r1.req.data;
req_sel <= r1.next_sel;
go := '1';
is_load := r1.req.load;
is_nc := r1.req.nc;
end if;


req_index <= get_index(req_addr); -- Only do anything if not being stalled by stage 1
req_row <= get_row(req_addr); go := r0.valid and not stall_out;
req_tag <= get_tag(req_addr);


-- Calculate address of beginning of cache line, will be -- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed -- used for cache miss processing if needed
-- --
req_laddr <= req_addr(63 downto LINE_OFF_BITS) & req_laddr <= r0.addr(63 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0'); (LINE_OFF_BITS-1 downto 0 => '0');


-- Address of next doubleword, used for unaligned accesses
next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";

-- Test if pending request is a hit on any way -- Test if pending request is a hit on any way
hit_way := 0; hit_way := 0;
is_hit := '0'; is_hit := '0';
@ -460,7 +398,7 @@ begin
-- Combine the request and cache hit status to decide what -- Combine the request and cache hit status to decide what
-- operation needs to be done -- operation needs to be done
-- --
opsel := go & is_load & is_nc & is_hit; opsel := go & r0.load & r0.nc & is_hit;
case opsel is case opsel is
when "1101" => op := OP_LOAD_HIT; when "1101" => op := OP_LOAD_HIT;
when "1100" => op := OP_LOAD_MISS; when "1100" => op := OP_LOAD_MISS;
@ -475,16 +413,15 @@ begin


req_op <= op; req_op <= op;


-- Versions of the address and row number that are valid one cycle earlier -- Version of the row number that is valid one cycle earlier
-- in the cases where we need to read the cache data BRAM. -- in the cases where we need to read the cache data BRAM.
if r1.state = IDLE and op = OP_LOAD_HIT and two_dwords = '1' then -- If we're stalling then we need to keep reading the last
early_req_addr <= next_addr(11 downto 0); -- row requested.
elsif r1.state /= IDLE and r1.two_dwords = '1' and r1.second_dword = '0' then if stall_out = '0' then
early_req_addr <= r1.next_addr(11 downto 0); early_req_row <= get_row(d_in.addr);
else else
early_req_addr <= d_in.early_low_addr; early_req_row <= req_row;
end if; end if;
early_req_row <= get_row(x"0000000000000" & early_req_addr);
end process; end process;


-- Wire up wishbone request latch out of stage 1 -- Wire up wishbone request latch out of stage 1
@ -502,17 +439,17 @@ begin
cancel_store <= '0'; cancel_store <= '0';
set_rsrv <= '0'; set_rsrv <= '0';
clear_rsrv <= '0'; clear_rsrv <= '0';
if d_in.valid = '1' and d_in.reserve = '1' then if stall_out = '0' and r0.valid = '1' and r0.reserve = '1' then
-- XXX generate alignment interrupt if address is not aligned -- XXX generate alignment interrupt if address is not aligned
-- XXX or if d_in.nc = '1' -- XXX or if r0.nc = '1'
if d_in.load = '1' then if r0.load = '1' then
-- load with reservation -- load with reservation
set_rsrv <= '1'; set_rsrv <= '1';
else else
-- store conditional -- store conditional
clear_rsrv <= '1'; clear_rsrv <= '1';
if reservation.valid = '0' or if reservation.valid = '0' or
d_in.addr(63 downto LINE_OFF_BITS) /= reservation.addr then r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
cancel_store <= '1'; cancel_store <= '1';
end if; end if;
end if; end if;
@ -526,28 +463,19 @@ begin
reservation.valid <= '0'; reservation.valid <= '0';
elsif set_rsrv = '1' then elsif set_rsrv = '1' then
reservation.valid <= '1'; reservation.valid <= '1';
reservation.addr <= d_in.addr(63 downto LINE_OFF_BITS); reservation.addr <= r0.addr(63 downto LINE_OFF_BITS);
end if; end if;
end if; end if;
end process; end process;


-- Writeback (loads and reg updates) & completion control logic -- Return data for loads & completion control logic
-- --
writeback_control: process(all) writeback_control: process(all)
begin begin


-- The mux on d_out.write reg defaults to the normal load hit case. -- The mux on d_out.data defaults to the normal load hit case.
d_out.write_enable <= '0';
d_out.valid <= '0'; d_out.valid <= '0';
d_out.write_reg <= r1.req.write_reg; d_out.data <= cache_out(r1.hit_way);
d_out.write_data <= cache_out(r1.hit_way);
d_out.write_len <= r1.req.length;
d_out.write_shift <= r1.req.addr(2 downto 0);
d_out.sign_extend <= r1.req.sign_extend;
d_out.byte_reverse <= r1.req.byte_reverse;
d_out.second_word <= r1.second_dword;
d_out.xerc <= r1.req.xerc;
d_out.rc <= '0'; -- loads never have rc=1
d_out.store_done <= '0'; d_out.store_done <= '0';


-- We have a valid load or store hit or we just completed a slow -- We have a valid load or store hit or we just completed a slow
@ -561,30 +489,17 @@ begin
-- --


-- Sanity: Only one of these must be set in any given cycle -- Sanity: Only one of these must be set in any given cycle
assert (r1.update_valid and r1.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with update_valid"
severity FAILURE;
assert (r1.slow_valid and r1.stcx_fail) /= '1' report assert (r1.slow_valid and r1.stcx_fail) /= '1' report
"unexpected slow_valid collision with stcx_fail" "unexpected slow_valid collision with stcx_fail"
severity FAILURE; severity FAILURE;
assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with slow_valid" "unexpected hit_load_delayed collision with slow_valid"
severity FAILURE; severity FAILURE;
assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report
"unexpected update_valid collision with slow_valid or stcx_fail"
severity FAILURE;


-- Load hit case is the standard path -- Load hit case is the standard path
if r1.hit_load_valid = '1' then if r1.hit_load_valid = '1' then
d_out.write_enable <= '1'; report "completing load hit";

d_out.valid <= '1';
-- If there isn't another dword to go and
-- it's not a load with update, complete it now
if (r1.second_dword or not r1.two_dwords) = '1' and
r1.req.update = '0' then
report "completing load hit";
d_out.valid <= '1';
end if;
end if; end if;


-- Slow ops (load miss, NC, stores) -- Slow ops (load miss, NC, stores)
@ -593,63 +508,20 @@ begin
-- mux accordingly -- mux accordingly
-- --
if r1.req.load then if r1.req.load then
d_out.write_reg <= r1.req.write_reg; -- Read data comes from the slow data latch
d_out.write_enable <= '1'; d_out.data <= r1.slow_data;

-- Read data comes from the slow data latch, formatter
-- from the latched request.
--
d_out.write_data <= r1.slow_data;
d_out.write_shift <= r1.req.addr(2 downto 0);
d_out.sign_extend <= r1.req.sign_extend;
d_out.byte_reverse <= r1.req.byte_reverse;
d_out.write_len <= r1.req.length;
d_out.xerc <= r1.req.xerc;
d_out.second_word <= r1.second_dword;
end if; end if;
d_out.rc <= r1.req.rc;
d_out.store_done <= '1'; d_out.store_done <= '1';


-- If it's a store or a non-update load form, complete now report "completing store or load miss";
-- unless we need to do another dword transfer d_out.valid <= '1';
if (r1.req.load = '0' or r1.req.update = '0') and
(r1.two_dwords = '0' or r1.second_dword = '1') then
report "completing store or load miss";
d_out.valid <= '1';
end if;
end if; end if;


if r1.stcx_fail = '1' then if r1.stcx_fail = '1' then
d_out.rc <= r1.req.rc;
d_out.store_done <= '0'; d_out.store_done <= '0';
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;


-- We have a register update to do.
if r1.update_valid = '1' then
d_out.write_enable <= '1';
d_out.write_reg <= r1.req.update_reg;

-- Change the read data mux to the address that's going into
-- the register and the formatter does nothing.
--
d_out.write_data <= r1.req.addr;
d_out.write_shift <= "000";
d_out.write_len <= "1000";
d_out.sign_extend <= '0';
d_out.byte_reverse <= '0';
d_out.xerc <= r1.req.xerc;
d_out.second_word <= '0';

-- If it was a load, this completes the operation (load with
-- update case).
--
if r1.req.load = '1' then
report "completing after load update";
d_out.valid <= '1';
end if;
end if;

end process; end process;


-- --
@ -703,11 +575,11 @@ begin
-- For timing, the mux on wr_data/sel/addr is not dependent on anything -- For timing, the mux on wr_data/sel/addr is not dependent on anything
-- other than the current state. Only the do_write signal is. -- other than the current state. Only the do_write signal is.
-- --
if r1.state = IDLE or r1.state = NEXT_DWORD then if r1.state = IDLE then
-- In these states, the only write path is the store-hit update case -- In IDLE state, the only write path is the store-hit update case
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= req_data; wr_data <= r0.data;
wr_sel <= req_sel; wr_sel <= r0.byte_sel;
else else
-- Otherwise, we might be doing a reload -- Otherwise, we might be doing a reload
wr_data <= wishbone_in.dat; wr_data <= wishbone_in.dat;
@ -731,35 +603,25 @@ begin
end generate; end generate;


-- --
-- Cache hit synchronous machine for the easy case. This handles -- Cache hit synchronous machine for the easy case. This handles load hits.
-- non-update form load hits
-- --
dcache_fast_hit : process(clk) dcache_fast_hit : process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
-- If we have a request incoming, we have to latch it as d_in.valid -- If we have a request incoming, we have to latch it as r0.valid
-- is only set for a single cycle. It's up to the control logic to -- is only set for a single cycle. It's up to the control logic to
-- ensure we don't override an uncompleted request (for now we are -- ensure we don't override an uncompleted request (for now we are
-- single issue on load/stores so we are fine, later, we can generate -- single issue on load/stores so we are fine, later, we can generate
-- a stall output if necessary). -- a stall output if necessary).


if req_op /= OP_NONE and d_in.valid = '1' then if req_op /= OP_NONE and stall_out = '0' then
r1.req <= d_in; r1.req <= r0;
r1.second_dword <= '0';
r1.two_dwords <= two_dwords;
r1.next_addr <= next_addr;
r1.next_sel <= bus_sel(15 downto 8);

report "op:" & op_t'image(req_op) & report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(d_in.addr) & " addr:" & to_hstring(r0.addr) &
" upd:" & std_ulogic'image(d_in.update) & " nc:" & std_ulogic'image(r0.nc) &
" nc:" & std_ulogic'image(d_in.nc) &
" reg:" & to_hstring(d_in.write_reg) &
" idx:" & integer'image(req_index) & " idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) & " tag:" & to_hstring(req_tag) &
" way: " & integer'image(req_hit_way); " way: " & integer'image(req_hit_way);
elsif r1.state = NEXT_DWORD then
r1.second_dword <= '1';
end if; end if;


-- Fast path for load/store hits. Set signals for the writeback controls. -- Fast path for load/store hits. Set signals for the writeback controls.
@ -776,7 +638,6 @@ begin
-- Every other case is handled by this state machine: -- Every other case is handled by this state machine:
-- --
-- * Cache load miss/reload (in conjunction with "rams") -- * Cache load miss/reload (in conjunction with "rams")
-- * Load hits for update forms
-- * Load hits for non-cachable forms -- * Load hits for non-cachable forms
-- * Stores (the collision case is handled in "rams") -- * Stores (the collision case is handled in "rams")
-- --
@ -795,7 +656,6 @@ begin
end loop; end loop;
r1.state <= IDLE; r1.state <= IDLE;
r1.slow_valid <= '0'; r1.slow_valid <= '0';
r1.update_valid <= '0';
r1.wb.cyc <= '0'; r1.wb.cyc <= '0';
r1.wb.stb <= '0'; r1.wb.stb <= '0';


@ -804,39 +664,19 @@ begin
else else
-- One cycle pulses reset -- One cycle pulses reset
r1.slow_valid <= '0'; r1.slow_valid <= '0';
r1.update_valid <= '0';
r1.stcx_fail <= '0'; r1.stcx_fail <= '0';


-- We cannot currently process a new request when not idle
assert d_in.valid = '0' or r1.state = IDLE report "request " &
op_t'image(req_op) & " while in state " & state_t'image(r1.state)
severity FAILURE;

-- Main state machine -- Main state machine
case r1.state is case r1.state is
when IDLE | NEXT_DWORD => when IDLE =>
case req_op is case req_op is
when OP_LOAD_HIT => when OP_LOAD_HIT =>
if r1.state = IDLE then -- stay in IDLE state
-- If the load is misaligned then we will need to start
-- the state machine
if two_dwords = '1' then
r1.state <= NEXT_DWORD;
elsif d_in.update = '1' then
r1.state <= LOAD_UPDATE;
end if;
else
if r1.req.update = '1' then
r1.state <= LOAD_UPDATE;
else
r1.state <= IDLE;
end if;
end if;


when OP_LOAD_MISS => when OP_LOAD_MISS =>
-- Normal load cache miss, start the reload machine -- Normal load cache miss, start the reload machine
-- --
report "cache miss addr:" & to_hstring(req_addr) & report "cache miss addr:" & to_hstring(r0.addr) &
" idx:" & integer'image(req_index) & " idx:" & integer'image(req_index) &
" way:" & integer'image(replace_way) & " way:" & integer'image(replace_way) &
" tag:" & to_hstring(req_tag); " tag:" & to_hstring(req_tag);
@ -871,19 +711,17 @@ begin
r1.state <= RELOAD_WAIT_ACK; r1.state <= RELOAD_WAIT_ACK;


when OP_LOAD_NC => when OP_LOAD_NC =>
r1.wb.sel <= req_sel; r1.wb.sel <= r0.byte_sel;
r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';
r1.wb.we <= '0'; r1.wb.we <= '0';
r1.state <= NC_LOAD_WAIT_ACK; r1.state <= NC_LOAD_WAIT_ACK;


when OP_STORE_HIT | OP_STORE_MISS => when OP_STORE_HIT | OP_STORE_MISS =>
-- For store-with-update do the register update r1.wb.sel <= r0.byte_sel;
r1.update_valid <= d_in.valid and d_in.update; r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.sel <= req_sel; r1.wb.dat <= r0.data;
r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= req_data;
if cancel_store = '0' then if cancel_store = '0' then
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';
@ -899,9 +737,6 @@ begin
when OP_BAD => when OP_BAD =>
end case; end case;


when PRE_NEXT_DWORD =>
r1.state <= NEXT_DWORD;

when RELOAD_WAIT_ACK => when RELOAD_WAIT_ACK =>
-- Requests are all sent if stb is 0 -- Requests are all sent if stb is 0
stbs_done := r1.wb.stb = '0'; stbs_done := r1.wb.stb = '0';
@ -943,31 +778,23 @@ begin
-- Cache line is now valid -- Cache line is now valid
cache_valids(r1.store_index)(r1.store_way) <= '1'; cache_valids(r1.store_index)(r1.store_way) <= '1';


-- Write back the load data that we got, and start -- Don't complete and go idle until next cycle, in
-- the second dword if necessary. Otherwise, see if -- case the next request is for the last dword of
-- we also need to do the deferred update cycle. -- the cache line we just loaded.
r1.slow_valid <= '1'; r1.state <= FINISH_LD_MISS;
if r1.two_dwords and not r1.second_dword then
r1.state <= PRE_NEXT_DWORD;
elsif r1.req.update = '1' then
r1.state <= LOAD_UPDATE;
report "completing miss with load-update !";
else
r1.state <= IDLE;
report "completing miss !";
end if;
end if; end if;


-- Increment store row counter -- Increment store row counter
r1.store_row <= next_row(r1.store_row); r1.store_row <= next_row(r1.store_row);
end if; end if;


when LOAD_UPDATE => when FINISH_LD_MISS =>
-- We need the extra cycle to complete a load with update -- Write back the load data that we got
r1.update_valid <= '1'; r1.slow_valid <= '1';
r1.state <= IDLE; r1.state <= IDLE;
report "completing miss !";


when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK =>
-- Clear stb when slave accepted request -- Clear stb when slave accepted request
if wishbone_in.stall = '0' then if wishbone_in.stall = '0' then
r1.wb.stb <= '0'; r1.wb.stb <= '0';
@ -975,16 +802,10 @@ begin


-- Got ack ? complete. -- Got ack ? complete.
if wishbone_in.ack = '1' then if wishbone_in.ack = '1' then
if r1.two_dwords and not r1.second_dword then
r1.state <= NEXT_DWORD;
elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then
r1.state <= LOAD_UPDATE;
else
r1.state <= IDLE;
end if;
if r1.state = NC_LOAD_WAIT_ACK then if r1.state = NC_LOAD_WAIT_ACK then
r1.slow_data <= wishbone_in.dat; r1.slow_data <= wishbone_in.dat;
end if; end if;
r1.state <= IDLE;
r1.slow_valid <= '1'; r1.slow_valid <= '1';
r1.wb.cyc <= '0'; r1.wb.cyc <= '0';
r1.wb.stb <= '0'; r1.wb.stb <= '0';

@ -13,7 +13,7 @@ architecture behave of dcache_tb is
signal rst : std_ulogic; signal rst : std_ulogic;


signal d_in : Loadstore1ToDcacheType; signal d_in : Loadstore1ToDcacheType;
signal d_out : DcacheToWritebackType; signal d_out : DcacheToLoadstore1Type;


signal wb_bram_in : wishbone_master_out; signal wb_bram_in : wishbone_master_out;
signal wb_bram_out : wishbone_slave_out; signal wb_bram_out : wishbone_slave_out;
@ -71,12 +71,6 @@ begin
d_in.nc <= '0'; d_in.nc <= '0';
d_in.addr <= (others => '0'); d_in.addr <= (others => '0');
d_in.data <= (others => '0'); d_in.data <= (others => '0');
d_in.write_reg <= (others => '0');
d_in.length <= (others => '0');
d_in.byte_reverse <= '0';
d_in.sign_extend <= '0';
d_in.update <= '0';
d_in.update_reg <= (others => '0');


wait for 4*clk_period; wait for 4*clk_period;
wait until rising_edge(clk); wait until rising_edge(clk);
@ -89,11 +83,10 @@ begin
wait until rising_edge(clk); wait until rising_edge(clk);
d_in.valid <= '0'; d_in.valid <= '0';


wait until rising_edge(clk) and d_out.write_enable = '1'; wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.valid = '1'; assert d_out.data = x"0000000100000000"
assert d_out.write_data = x"0000000100000000"
report "data @" & to_hstring(d_in.addr) & report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) & "=" & to_hstring(d_out.data) &
" expected 0000000100000000" " expected 0000000100000000"
severity failure; severity failure;
-- wait for clk_period; -- wait for clk_period;
@ -106,11 +99,10 @@ begin
wait until rising_edge(clk); wait until rising_edge(clk);
d_in.valid <= '0'; d_in.valid <= '0';


wait until rising_edge(clk) and d_out.write_enable = '1'; wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.valid = '1'; assert d_out.data = x"0000000D0000000C"
assert d_out.write_data = x"0000000D0000000C"
report "data @" & to_hstring(d_in.addr) & report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) & "=" & to_hstring(d_out.data) &
" expected 0000000D0000000C" " expected 0000000D0000000C"
severity failure; severity failure;


@ -121,11 +113,10 @@ begin
d_in.valid <= '1'; d_in.valid <= '1';
wait until rising_edge(clk); wait until rising_edge(clk);
d_in.valid <= '0'; d_in.valid <= '0';
wait until rising_edge(clk) and d_out.write_enable = '1'; wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.valid = '1'; assert d_out.data = x"0000004100000040"
assert d_out.write_data = x"0000004100000040"
report "data @" & to_hstring(d_in.addr) & report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) & "=" & to_hstring(d_out.data) &
" expected 0000004100000040" " expected 0000004100000040"
severity failure; severity failure;



@ -42,7 +42,6 @@ architecture behaviour of execute1 is
next_lr : std_ulogic_vector(63 downto 0); next_lr : std_ulogic_vector(63 downto 0);
mul_in_progress : std_ulogic; mul_in_progress : std_ulogic;
div_in_progress : std_ulogic; div_in_progress : std_ulogic;
ldst_in_progress : std_ulogic;
cntz_in_progress : std_ulogic; cntz_in_progress : std_ulogic;
slow_op_dest : gpr_index_t; slow_op_dest : gpr_index_t;
slow_op_rc : std_ulogic; slow_op_rc : std_ulogic;
@ -264,7 +263,6 @@ begin
v.mul_in_progress := '0'; v.mul_in_progress := '0';
v.div_in_progress := '0'; v.div_in_progress := '0';
v.cntz_in_progress := '0'; v.cntz_in_progress := '0';
v.ldst_in_progress := '0';


-- signals to multiply unit -- signals to multiply unit
x_to_multiply <= Execute1ToMultiplyInit; x_to_multiply <= Execute1ToMultiplyInit;
@ -662,8 +660,6 @@ begin
when OP_LOAD | OP_STORE => when OP_LOAD | OP_STORE =>
-- loadstore/dcache has its own port to writeback -- loadstore/dcache has its own port to writeback
v.e.valid := '0'; v.e.valid := '0';
stall_out <= '1';
v.ldst_in_progress := '1';


when others => when others =>
terminate_out <= '1'; terminate_out <= '1';
@ -703,10 +699,6 @@ begin
v.e.rc := v.slow_op_rc; v.e.rc := v.slow_op_rc;
v.e.xerc := v.slow_op_xerc; v.e.xerc := v.slow_op_xerc;
v.e.valid := '1'; v.e.valid := '1';
elsif r.ldst_in_progress = '1' then
-- assert stall for 2 cycles on load/store, then
-- the stall output from dcache takes over
stall_out <= '1';
elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
(r.div_in_progress = '1' and divider_to_x.valid = '1') then (r.div_in_progress = '1' and divider_to_x.valid = '1') then

@ -12,16 +12,85 @@ use work.helpers.all;
entity loadstore1 is entity loadstore1 is
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
rst : in std_ulogic;


l_in : in Execute1ToLoadstore1Type; l_in : in Execute1ToLoadstore1Type;
l_out : out Loadstore1ToWritebackType;


l_out : out Loadstore1ToDcacheType d_out : out Loadstore1ToDcacheType;
d_in : in DcacheToLoadstore1Type;

dc_stall : in std_ulogic;
stall_out : out std_ulogic
); );
end loadstore1; end loadstore1;


-- Note, we don't currently use the stall output from the dcache because
-- we know it can take two requests without stalling when idle, we are
-- its only user, and we know it never stalls when idle.

architecture behave of loadstore1 is architecture behave of loadstore1 is
signal r, rin : Loadstore1ToDcacheType;
-- State machine for unaligned loads/stores
-- Transitions, as implemented in the loadstore1_1 process:
--   IDLE           -> LAST_ACK_WAIT  valid request with no bytes in a second doubleword
--   IDLE           -> SECOND_REQ     valid request that spills into a second doubleword
--   SECOND_REQ     -> FIRST_ACK_WAIT unconditionally, after issuing the second request
--   FIRST_ACK_WAIT -> LAST_ACK_WAIT  when d_in.valid = '1'
--   LAST_ACK_WAIT  -> LD_UPDATE      when d_in.valid = '1' for a load with update
--   LAST_ACK_WAIT  -> IDLE           when d_in.valid = '1' otherwise
--   LD_UPDATE      -> IDLE           unconditionally
type state_t is (IDLE, -- ready for instruction
SECOND_REQ, -- send 2nd request of unaligned xfer
FIRST_ACK_WAIT, -- waiting for 1st ack from dcache
LAST_ACK_WAIT, -- waiting for last ack from dcache
LD_UPDATE -- writing rA with computed addr on load
);

-- Registered state of the load/store unit: a latched copy of the request
-- currently being processed, plus the FSM state and the byte enables
-- for the second half of an unaligned transfer.
type reg_stage_t is record
-- latch most of the input request
load : std_ulogic; -- from l_in.load; '0' selects the store data path
addr : std_ulogic_vector(63 downto 0); -- effective address (lsu_sum); also the value written back on an rA update
data : std_ulogic_vector(63 downto 0); -- store data; byte-reversed/rotated into position when load = '0'
write_reg : gpr_index_t; -- destination GPR passed to writeback
length : std_ulogic_vector(3 downto 0); -- transfer size; length_to_sel recognises 1, 2, 4 and 8
byte_reverse : std_ulogic; -- passed to writeback with the returned data
sign_extend : std_ulogic; -- passed to writeback with the returned data
update : std_ulogic; -- update-form instruction: update_reg gets addr
update_reg : gpr_index_t; -- GPR written with addr when update = '1'
xerc : xer_common_t; -- passed through to l_out.xerc
reserve : std_ulogic; -- passed through to d_out.reserve
rc : std_ulogic; -- presented on l_out.rc only in the cycle the op completes
nc : std_ulogic; -- non-cacheable access
state : state_t; -- current FSM state; the only field cleared by rst
second_bytes : std_ulogic_vector(7 downto 0); -- byte enables for the 2nd doubleword; all zero when the transfer fits in one
end record;

signal r, rin : reg_stage_t;
signal lsu_sum : std_ulogic_vector(63 downto 0); signal lsu_sum : std_ulogic_vector(63 downto 0);

-- Map a transfer size (1, 2, 4 or 8 bytes) onto a right-justified
-- 8-bit byte-enable mask.  Any other size yields no enables at all.
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
    if length = "0001" then
        -- byte
        return "00000001";
    elsif length = "0010" then
        -- halfword
        return "00000011";
    elsif length = "0100" then
        -- word
        return "00001111";
    elsif length = "1000" then
        -- doubleword
        return "11111111";
    end if;
    -- unsupported size (or metavalue): select nothing
    return "00000000";
end function length_to_sel;

-- Work out the byte enables for a transfer of the given size starting
-- at the given byte offset within a doubleword.
-- The result is 16 bits wide: bits 7..0 are the enables for the first
-- doubleword and bits 15..8 those for the following doubleword, which
-- are non-zero only when the access is unaligned and crosses over.
function xfer_data_sel(size : in std_logic_vector(3 downto 0);
                       address : in std_logic_vector(2 downto 0))
    return std_ulogic_vector is
    variable base_sel : std_ulogic_vector(15 downto 0);
    variable offset   : natural;
    variable shifted  : unsigned(15 downto 0);
begin
    -- enables for an aligned access, widened to two doublewords
    base_sel := "00000000" & length_to_sel(size);
    -- slide them up by the byte offset within the doubleword
    offset   := to_integer(unsigned(address));
    shifted  := shift_left(unsigned(base_sel), offset);
    return std_ulogic_vector(shifted);
end function xfer_data_sel;

begin begin
-- Calculate the address in the first cycle -- Calculate the address in the first cycle
lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0'); lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
@ -29,69 +98,180 @@ begin
loadstore1_0: process(clk) loadstore1_0: process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
r <= rin; if rst = '1' then
r.state <= IDLE;
else
r <= rin;
end if;
end if; end if;
end process; end process;


loadstore1_1: process(all) loadstore1_1: process(all)
variable v : Loadstore1ToDcacheType; variable v : reg_stage_t;
variable brev_lenm1 : unsigned(2 downto 0); variable brev_lenm1 : unsigned(2 downto 0);
variable byte_offset : unsigned(2 downto 0); variable byte_offset : unsigned(2 downto 0);
variable j : integer; variable j : integer;
variable k : unsigned(2 downto 0); variable k : unsigned(2 downto 0);
variable long_sel : std_ulogic_vector(15 downto 0);
variable byte_sel : std_ulogic_vector(7 downto 0);
variable req : std_ulogic;
variable stall : std_ulogic;
variable addr : std_ulogic_vector(63 downto 0);
variable wdata : std_ulogic_vector(63 downto 0);
variable write_enable : std_ulogic;
variable do_update : std_ulogic;
variable second_dword : std_ulogic;
variable done : std_ulogic;
begin begin
v := r; v := r;
req := '0';
stall := '0';
done := '0';
byte_sel := (others => '0');
addr := lsu_sum;

write_enable := '0';
do_update := '0';
second_dword := '0';

case r.state is
when IDLE =>
if l_in.valid = '1' then
v.load := l_in.load;
v.addr := lsu_sum;
v.data := l_in.data;
v.write_reg := l_in.write_reg;
v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse;
v.sign_extend := l_in.sign_extend;
v.update := l_in.update;
v.update_reg := l_in.update_reg;
v.xerc := l_in.xerc;
v.reserve := l_in.reserve;
v.rc := l_in.rc;


v.valid := l_in.valid; -- XXX Temporary hack. Mark the op as non-cachable if the address
v.load := l_in.load; -- is the form 0xc-------
v.write_reg := l_in.write_reg; --
v.length := l_in.length; -- This will have to be replaced by a combination of implementing the
v.byte_reverse := l_in.byte_reverse; -- proper HV CI load/store instructions and having an MMU to get the I
v.sign_extend := l_in.sign_extend; -- bit otherwise.
v.update := l_in.update; if lsu_sum(31 downto 28) = "1100" then
v.update_reg := l_in.update_reg; v.nc := '1';
v.xerc := l_in.xerc; else
v.reserve := l_in.reserve; v.nc := '0';
v.rc := l_in.rc; end if;


-- XXX Temporary hack. Mark the op as non-cachable if the address -- Do length_to_sel and work out if we are doing 2 dwords
-- is the form 0xc------- long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
-- byte_sel := long_sel(7 downto 0);
-- This will have to be replaced by a combination of implementing the v.second_bytes := long_sel(15 downto 8);
-- proper HV CI load/store instructions and having an MMU to get the I
-- bit otherwise. v.addr := lsu_sum;
if lsu_sum(31 downto 28) = "1100" then
v.nc := '1'; -- Do byte reversing and rotating for stores in the first cycle
else if v.load = '0' then
v.nc := '0'; byte_offset := unsigned(lsu_sum(2 downto 0));
end if; brev_lenm1 := "000";

if l_in.byte_reverse = '1' then
-- XXX Do length_to_sel here ? brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;

end if;
-- Do byte reversing and rotating for stores in the first cycle for i in 0 to 7 loop
if v.load = '0' then k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
byte_offset := unsigned(lsu_sum(2 downto 0)); j := to_integer(k) * 8;
brev_lenm1 := "000"; v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
if l_in.byte_reverse = '1' then end loop;
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; end if;

req := '1';
stall := '1';
if long_sel(15 downto 8) = "00000000" then
v.state := LAST_ACK_WAIT;
else
v.state := SECOND_REQ;
end if;
end if; end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;
end if;


v.addr := lsu_sum; when SECOND_REQ =>
-- compute (addr + 8) & ~7 for the second doubleword when unaligned
addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
byte_sel := r.second_bytes;
req := '1';
stall := '1';
v.state := FIRST_ACK_WAIT;

when FIRST_ACK_WAIT =>
stall := '1';
if d_in.valid = '1' then
write_enable := r.load;
v.state := LAST_ACK_WAIT;
end if;

when LAST_ACK_WAIT =>
stall := '1';
second_dword := or (r.second_bytes);
if d_in.valid = '1' then
write_enable := r.load;
if r.load = '1' and r.update = '1' then
-- loads with rA update need an extra cycle
v.state := LD_UPDATE;
else
-- stores write back rA update in this cycle
do_update := r.update;
stall := '0';
done := '1';
v.state := IDLE;
end if;
end if;

when LD_UPDATE =>
do_update := '1';
v.state := IDLE;
done := '1';
end case;


-- Update registers -- Update registers
rin <= v; rin <= v;


-- Update outputs -- Update outputs to dcache
l_out <= r; d_out.valid <= req;
d_out.load <= v.load;
d_out.nc <= v.nc;
d_out.reserve <= v.reserve;
d_out.addr <= addr;
d_out.data <= v.data;
d_out.byte_sel <= byte_sel;

-- Update outputs to writeback
-- Multiplex either cache data to the destination GPR or
-- the address for the rA update.
l_out.valid <= done;
if do_update = '1' then
l_out.write_enable <= '1';
l_out.write_reg <= r.update_reg;
l_out.write_data <= r.addr;
l_out.write_len <= x"8";
l_out.write_shift <= "000";
l_out.sign_extend <= '0';
l_out.byte_reverse <= '0';
l_out.second_word <= '0';
l_out.rc <= '0';
l_out.store_done <= '0';
else
l_out.write_enable <= write_enable;
l_out.write_reg <= r.write_reg;
l_out.write_data <= d_in.data;
l_out.write_len <= r.length;
l_out.write_shift <= r.addr(2 downto 0);
l_out.sign_extend <= r.sign_extend;
l_out.byte_reverse <= r.byte_reverse;
l_out.second_word <= second_dword;
l_out.rc <= r.rc and done;
l_out.store_done <= d_in.store_done;
end if;
l_out.xerc <= r.xerc;

stall_out <= stall;


-- Asynchronous output of the low-order address bits (latched in dcache)
l_out.early_low_addr <= lsu_sum(11 downto 0);
l_out.early_valid <= l_in.valid;
end process; end process;
end; end;

@ -11,7 +11,7 @@ entity writeback is
clk : in std_ulogic; clk : in std_ulogic;


e_in : in Execute1ToWritebackType; e_in : in Execute1ToWritebackType;
l_in : in DcacheToWritebackType; l_in : in Loadstore1ToWritebackType;


w_out : out WritebackToRegisterFileType; w_out : out WritebackToRegisterFileType;
c_out : out WritebackToCrFileType; c_out : out WritebackToCrFileType;

Loading…
Cancel
Save