loadstore1: Move logic from dcache to loadstore1

So that the dcache could in future be used by an MMU, this moves
logic to do with data formatting, rA updates for update-form
instructions, and handling of unaligned loads and stores out of
dcache and into loadstore1.  For now, dcache connects only to
loadstore1, and loadstore1 now has the connection to writeback.

Dcache generates a stall signal to loadstore1 which indicates that
the request presented in the current cycle was not accepted and
should be presented again.  However, loadstore1 doesn't currently
use it because we know that we can never hit the circumstances
where it might be set.

For unaligned transfers, loadstore1 generates two requests to
dcache back-to-back, and then waits to see two acks back from
dcache (cycles where d_in.valid is true).

Loadstore1 now has a FSM for tracking how many acks we are
expecting from dcache and for doing the rA update cycles when
necessary.  Handling for reservations and conditional stores is
still in dcache.

Loadstore1 now generates its own stall signal back to decode2,
so we no longer need the logic in execute1 that generated the stall
for the first two cycles.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/153/head
Paul Mackerras 5 years ago
parent ef9c1efd72
commit b349cc891a

@ -218,22 +218,20 @@ package common is
valid : std_ulogic;
load : std_ulogic;
nc : std_ulogic;
reserve : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
data : std_ulogic_vector(63 downto 0);
write_reg : gpr_index_t;
length : std_ulogic_vector(3 downto 0);
byte_reverse : std_ulogic;
sign_extend : std_ulogic;
update : std_ulogic;
update_reg : gpr_index_t;
xerc : xer_common_t;
reserve : std_ulogic;
rc : std_ulogic;
early_low_addr : std_ulogic_vector(11 downto 0);
early_valid : std_ulogic;
byte_sel : std_ulogic_vector(7 downto 0);
end record;

type DcacheToLoadstore1Type is record
valid : std_ulogic;
data : std_ulogic_vector(63 downto 0);
store_done : std_ulogic;
error : std_ulogic;
end record;

type DcacheToWritebackType is record
type Loadstore1ToWritebackType is record
valid : std_ulogic;
write_enable: std_ulogic;
write_reg : gpr_index_t;
@ -247,9 +245,9 @@ package common is
rc : std_ulogic;
store_done : std_ulogic;
end record;
constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0',
byte_reverse => '0', second_word => '0', xerc => xerc_init,
rc => '0', store_done => '0', others => (others => '0'));
constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0',
byte_reverse => '0', second_word => '0', xerc => xerc_init,
rc => '0', store_done => '0', others => (others => '0'));

type Execute1ToWritebackType is record
valid: std_ulogic;

@ -61,8 +61,11 @@ architecture behave of core is

-- load store signals
signal execute1_to_loadstore1: Execute1ToLoadstore1Type;
signal loadstore1_to_writeback: Loadstore1ToWritebackType;

-- dcache signals
signal loadstore1_to_dcache: Loadstore1ToDcacheType;
signal dcache_to_writeback: DcacheToWritebackType;
signal dcache_to_loadstore1: DcacheToLoadstore1Type;

-- local signals
signal fetch1_stall_in : std_ulogic;
@ -73,6 +76,7 @@ architecture behave of core is
signal decode2_stall_out : std_ulogic;
signal ex1_icache_inval: std_ulogic;
signal ex1_stall_out: std_ulogic;
signal ls1_stall_out: std_ulogic;
signal dcache_stall_out: std_ulogic;

signal flush: std_ulogic;
@ -196,7 +200,7 @@ begin
c_in => cr_file_to_decode2,
c_out => decode2_to_cr_file
);
decode2_stall_in <= ex1_stall_out or dcache_stall_out;
decode2_stall_in <= ex1_stall_out or ls1_stall_out;

register_file_0: entity work.register_file
generic map (
@ -243,8 +247,13 @@ begin
loadstore1_0: entity work.loadstore1
port map (
clk => clk,
rst => core_rst,
l_in => execute1_to_loadstore1,
l_out => loadstore1_to_dcache
l_out => loadstore1_to_writeback,
d_out => loadstore1_to_dcache,
d_in => dcache_to_loadstore1,
dc_stall => dcache_stall_out,
stall_out => ls1_stall_out
);

dcache_0: entity work.dcache
@ -257,7 +266,7 @@ begin
clk => clk,
rst => core_rst,
d_in => loadstore1_to_dcache,
d_out => dcache_to_writeback,
d_out => dcache_to_loadstore1,
stall_out => dcache_stall_out,
wishbone_in => wishbone_data_in,
wishbone_out => wishbone_data_out
@ -267,7 +276,7 @@ begin
port map (
clk => clk,
e_in => execute1_to_writeback,
l_in => dcache_to_writeback,
l_in => loadstore1_to_writeback,
w_out => writeback_to_register_file,
c_out => writeback_to_cr_file,
complete_out => complete

@ -7,9 +7,6 @@
-- * Complete load misses on the cycle when WB data comes instead of
-- at the end of line (this requires dealing with requests coming in
-- while not idle...)
-- * Load with update could use one less non-pipelined cycle by moving
-- the register update to the pipeline bubble that exists when going
-- back to the IDLE state.
--
library ieee;
use ieee.std_logic_1164.all;
@ -35,7 +32,7 @@ entity dcache is
rst : in std_ulogic;

d_in : in Loadstore1ToDcacheType;
d_out : out DcacheToWritebackType;
d_out : out DcacheToLoadstore1Type;

stall_out : out std_ulogic;

@ -113,6 +110,8 @@ architecture rtl of dcache is
attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed";

signal r0 : Loadstore1ToDcacheType;

-- Type of operation on a "valid" input
type op_t is (OP_NONE,
OP_LOAD_HIT, -- Cache hit on load
@ -124,10 +123,8 @@ architecture rtl of dcache is
-- Cache state machine
type state_t is (IDLE, -- Normal load hit processing
PRE_NEXT_DWORD, -- Extra state before NEXT_DWORD
NEXT_DWORD, -- Starting the 2nd xfer of misaligned
LOAD_UPDATE, -- Load with update extra cycle
RELOAD_WAIT_ACK, -- Cache reload wait ack
FINISH_LD_MISS, -- Extra cycle after load miss
STORE_WAIT_ACK, -- Store wait ack
NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack

@ -158,15 +155,6 @@ architecture rtl of dcache is
hit_way : way_t;
hit_load_valid : std_ulogic;

-- Info for doing the second transfer of a misaligned load/store
two_dwords : std_ulogic;
second_dword : std_ulogic;
next_addr : std_ulogic_vector(63 downto 0);
next_sel : std_ulogic_vector(7 downto 0);

-- Register update (load/store with update)
update_valid : std_ulogic;

-- Data buffer for "slow" read ops (load miss and NC loads).
slow_data : std_ulogic_vector(63 downto 0);
slow_valid : std_ulogic;
@ -200,12 +188,8 @@ architecture rtl of dcache is
signal req_tag : cache_tag_t;
signal req_op : op_t;
signal req_data : std_ulogic_vector(63 downto 0);
signal req_addr : std_ulogic_vector(63 downto 0);
signal req_laddr : std_ulogic_vector(63 downto 0);
signal req_sel : std_ulogic_vector(7 downto 0);
signal next_addr : std_ulogic_vector(63 downto 0);

signal early_req_addr : std_ulogic_vector(11 downto 0);
signal early_req_row : row_t;

signal cancel_store : std_ulogic;
@ -222,9 +206,7 @@ architecture rtl of dcache is
signal replace_way : way_t;

-- Wishbone read/write/cache write formatting signals
signal bus_sel : std_ulogic_vector(15 downto 0);

signal two_dwords : std_ulogic;
signal bus_sel : std_ulogic_vector(7 downto 0);

--
-- Helper functions to decode incoming requests
@ -305,37 +287,6 @@ architecture rtl of dcache is
tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
end;

-- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
case length is
when "0001" =>
return "00000001";
when "0010" =>
return "00000011";
when "0100" =>
return "00001111";
when "1000" =>
return "11111111";
when others =>
return "00000000";
end case;
end function length_to_sel;

-- Calculate byte enables for wishbone
-- This returns 16 bits, giving the select signals for two transfers,
-- to account for unaligned loads or stores
function wishbone_data_sel(size : in std_logic_vector(3 downto 0);
address : in std_logic_vector(63 downto 0))
return std_ulogic_vector is
variable longsel : std_ulogic_vector(15 downto 0);
begin
longsel := (others => '0');
longsel(7 downto 0) := length_to_sel(size);
return std_ulogic_vector(shift_left(unsigned(longsel),
to_integer(unsigned(address(2 downto 0)))));
end function wishbone_data_sel;

begin

assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
@ -390,11 +341,17 @@ begin
end generate;
end generate;

-- Wishbone read and write and BRAM write sel bits generation
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);

-- See if the operation crosses two doublewords
two_dwords <= or (bus_sel(15 downto 8));
-- Latch the request in r0 as long as we're not stalling
stage_0 : process(clk)
begin
if rising_edge(clk) then
if rst = '1' then
r0.valid <= '0';
elsif stall_out = '0' then
r0 <= d_in;
end if;
end if;
end process;

-- Cache request parsing and hit detection
dcache_request : process(all)
@ -405,40 +362,21 @@ begin
variable data : std_ulogic_vector(63 downto 0);
variable opsel : std_ulogic_vector(3 downto 0);
variable go : std_ulogic;
variable is_load : std_ulogic;
variable is_nc : std_ulogic;
begin
-- Extract line, row and tag from request
if r1.state /= NEXT_DWORD then
req_addr <= d_in.addr;
req_data <= d_in.data;
req_sel <= bus_sel(7 downto 0);
go := d_in.valid;
is_load := d_in.load;
is_nc := d_in.nc;

else
req_addr <= r1.next_addr;
req_data <= r1.req.data;
req_sel <= r1.next_sel;
go := '1';
is_load := r1.req.load;
is_nc := r1.req.nc;
end if;
req_index <= get_index(r0.addr);
req_row <= get_row(r0.addr);
req_tag <= get_tag(r0.addr);

req_index <= get_index(req_addr);
req_row <= get_row(req_addr);
req_tag <= get_tag(req_addr);
-- Only do anything if not being stalled by stage 1
go := r0.valid and not stall_out;

-- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed
--
req_laddr <= req_addr(63 downto LINE_OFF_BITS) &
req_laddr <= r0.addr(63 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0');

-- Address of next doubleword, used for unaligned accesses
next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";

-- Test if pending request is a hit on any way
hit_way := 0;
is_hit := '0';
@ -460,7 +398,7 @@ begin
-- Combine the request and cache his status to decide what
-- operation needs to be done
--
opsel := go & is_load & is_nc & is_hit;
opsel := go & r0.load & r0.nc & is_hit;
case opsel is
when "1101" => op := OP_LOAD_HIT;
when "1100" => op := OP_LOAD_MISS;
@ -475,16 +413,15 @@ begin

req_op <= op;

-- Versions of the address and row number that are valid one cycle earlier
-- Version of the row number that is valid one cycle earlier
-- in the cases where we need to read the cache data BRAM.
if r1.state = IDLE and op = OP_LOAD_HIT and two_dwords = '1' then
early_req_addr <= next_addr(11 downto 0);
elsif r1.state /= IDLE and r1.two_dwords = '1' and r1.second_dword = '0' then
early_req_addr <= r1.next_addr(11 downto 0);
-- If we're stalling then we need to keep reading the last
-- row requested.
if stall_out = '0' then
early_req_row <= get_row(d_in.addr);
else
early_req_addr <= d_in.early_low_addr;
early_req_row <= req_row;
end if;
early_req_row <= get_row(x"0000000000000" & early_req_addr);
end process;

-- Wire up wishbone request latch out of stage 1
@ -502,17 +439,17 @@ begin
cancel_store <= '0';
set_rsrv <= '0';
clear_rsrv <= '0';
if d_in.valid = '1' and d_in.reserve = '1' then
if stall_out = '0' and r0.valid = '1' and r0.reserve = '1' then
-- XXX generate alignment interrupt if address is not aligned
-- XXX or if d_in.nc = '1'
if d_in.load = '1' then
-- XXX or if r0.nc = '1'
if r0.load = '1' then
-- load with reservation
set_rsrv <= '1';
else
-- store conditional
clear_rsrv <= '1';
if reservation.valid = '0' or
d_in.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
cancel_store <= '1';
end if;
end if;
@ -526,28 +463,19 @@ begin
reservation.valid <= '0';
elsif set_rsrv = '1' then
reservation.valid <= '1';
reservation.addr <= d_in.addr(63 downto LINE_OFF_BITS);
reservation.addr <= r0.addr(63 downto LINE_OFF_BITS);
end if;
end if;
end process;

-- Writeback (loads and reg updates) & completion control logic
-- Return data for loads & completion control logic
--
writeback_control: process(all)
begin

-- The mux on d_out.write reg defaults to the normal load hit case.
d_out.write_enable <= '0';
-- The mux on d_out.data defaults to the normal load hit case.
d_out.valid <= '0';
d_out.write_reg <= r1.req.write_reg;
d_out.write_data <= cache_out(r1.hit_way);
d_out.write_len <= r1.req.length;
d_out.write_shift <= r1.req.addr(2 downto 0);
d_out.sign_extend <= r1.req.sign_extend;
d_out.byte_reverse <= r1.req.byte_reverse;
d_out.second_word <= r1.second_dword;
d_out.xerc <= r1.req.xerc;
d_out.rc <= '0'; -- loads never have rc=1
d_out.data <= cache_out(r1.hit_way);
d_out.store_done <= '0';

-- We have a valid load or store hit or we just completed a slow
@ -561,30 +489,17 @@ begin
--

-- Sanity: Only one of these must be set in any given cycle
assert (r1.update_valid and r1.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with update_valid"
severity FAILURE;
assert (r1.slow_valid and r1.stcx_fail) /= '1' report
"unexpected slow_valid collision with stcx_fail"
severity FAILURE;
assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with slow_valid"
severity FAILURE;
assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report
"unexpected update_valid collision with slow_valid or stcx_fail"
severity FAILURE;

-- Load hit case is the standard path
if r1.hit_load_valid = '1' then
d_out.write_enable <= '1';

-- If there isn't another dword to go and
-- it's not a load with update, complete it now
if (r1.second_dword or not r1.two_dwords) = '1' and
r1.req.update = '0' then
report "completing load hit";
d_out.valid <= '1';
end if;
report "completing load hit";
d_out.valid <= '1';
end if;

-- Slow ops (load miss, NC, stores)
@ -593,63 +508,20 @@ begin
-- mux accordingly
--
if r1.req.load then
d_out.write_reg <= r1.req.write_reg;
d_out.write_enable <= '1';

-- Read data comes from the slow data latch, formatter
-- from the latched request.
--
d_out.write_data <= r1.slow_data;
d_out.write_shift <= r1.req.addr(2 downto 0);
d_out.sign_extend <= r1.req.sign_extend;
d_out.byte_reverse <= r1.req.byte_reverse;
d_out.write_len <= r1.req.length;
d_out.xerc <= r1.req.xerc;
d_out.second_word <= r1.second_dword;
-- Read data comes from the slow data latch
d_out.data <= r1.slow_data;
end if;
d_out.rc <= r1.req.rc;
d_out.store_done <= '1';

-- If it's a store or a non-update load form, complete now
-- unless we need to do another dword transfer
if (r1.req.load = '0' or r1.req.update = '0') and
(r1.two_dwords = '0' or r1.second_dword = '1') then
report "completing store or load miss";
d_out.valid <= '1';
end if;
report "completing store or load miss";
d_out.valid <= '1';
end if;

if r1.stcx_fail = '1' then
d_out.rc <= r1.req.rc;
d_out.store_done <= '0';
d_out.valid <= '1';
end if;

-- We have a register update to do.
if r1.update_valid = '1' then
d_out.write_enable <= '1';
d_out.write_reg <= r1.req.update_reg;

-- Change the read data mux to the address that's going into
-- the register and the formatter does nothing.
--
d_out.write_data <= r1.req.addr;
d_out.write_shift <= "000";
d_out.write_len <= "1000";
d_out.sign_extend <= '0';
d_out.byte_reverse <= '0';
d_out.xerc <= r1.req.xerc;
d_out.second_word <= '0';

-- If it was a load, this completes the operation (load with
-- update case).
--
if r1.req.load = '1' then
report "completing after load update";
d_out.valid <= '1';
end if;
end if;

end process;

--
@ -703,11 +575,11 @@ begin
-- For timing, the mux on wr_data/sel/addr is not dependent on anything
-- other than the current state. Only the do_write signal is.
--
if r1.state = IDLE or r1.state = NEXT_DWORD then
-- In these states, the only write path is the store-hit update case
if r1.state = IDLE then
-- In IDLE state, the only write path is the store-hit update case
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= req_data;
wr_sel <= req_sel;
wr_data <= r0.data;
wr_sel <= r0.byte_sel;
else
-- Otherwise, we might be doing a reload
wr_data <= wishbone_in.dat;
@ -731,35 +603,25 @@ begin
end generate;

--
-- Cache hit synchronous machine for the easy case. This handles
-- non-update form load hits
-- Cache hit synchronous machine for the easy case. This handles load hits.
--
dcache_fast_hit : process(clk)
begin
if rising_edge(clk) then
-- If we have a request incoming, we have to latch it as d_in.valid
-- If we have a request incoming, we have to latch it as r0.valid
-- is only set for a single cycle. It's up to the control logic to
-- ensure we don't override an uncompleted request (for now we are
-- single issue on load/stores so we are fine, later, we can generate
-- a stall output if necessary).

if req_op /= OP_NONE and d_in.valid = '1' then
r1.req <= d_in;
r1.second_dword <= '0';
r1.two_dwords <= two_dwords;
r1.next_addr <= next_addr;
r1.next_sel <= bus_sel(15 downto 8);

if req_op /= OP_NONE and stall_out = '0' then
r1.req <= r0;
report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(d_in.addr) &
" upd:" & std_ulogic'image(d_in.update) &
" nc:" & std_ulogic'image(d_in.nc) &
" reg:" & to_hstring(d_in.write_reg) &
" addr:" & to_hstring(r0.addr) &
" nc:" & std_ulogic'image(r0.nc) &
" idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) &
" way: " & integer'image(req_hit_way);
elsif r1.state = NEXT_DWORD then
r1.second_dword <= '1';
end if;

-- Fast path for load/store hits. Set signals for the writeback controls.
@ -776,7 +638,6 @@ begin
-- Every other case is handled by this state machine:
--
-- * Cache load miss/reload (in conjunction with "rams")
-- * Load hits for update forms
-- * Load hits for non-cachable forms
-- * Stores (the collision case is handled in "rams")
--
@ -795,7 +656,6 @@ begin
end loop;
r1.state <= IDLE;
r1.slow_valid <= '0';
r1.update_valid <= '0';
r1.wb.cyc <= '0';
r1.wb.stb <= '0';

@ -804,39 +664,19 @@ begin
else
-- One cycle pulses reset
r1.slow_valid <= '0';
r1.update_valid <= '0';
r1.stcx_fail <= '0';

-- We cannot currently process a new request when not idle
assert d_in.valid = '0' or r1.state = IDLE report "request " &
op_t'image(req_op) & " while in state " & state_t'image(r1.state)
severity FAILURE;

-- Main state machine
case r1.state is
when IDLE | NEXT_DWORD =>
when IDLE =>
case req_op is
when OP_LOAD_HIT =>
if r1.state = IDLE then
-- If the load is misaligned then we will need to start
-- the state machine
if two_dwords = '1' then
r1.state <= NEXT_DWORD;
elsif d_in.update = '1' then
r1.state <= LOAD_UPDATE;
end if;
else
if r1.req.update = '1' then
r1.state <= LOAD_UPDATE;
else
r1.state <= IDLE;
end if;
end if;
-- stay in IDLE state

when OP_LOAD_MISS =>
when OP_LOAD_MISS =>
-- Normal load cache miss, start the reload machine
--
report "cache miss addr:" & to_hstring(req_addr) &
report "cache miss addr:" & to_hstring(r0.addr) &
" idx:" & integer'image(req_index) &
" way:" & integer'image(replace_way) &
" tag:" & to_hstring(req_tag);
@ -871,19 +711,17 @@ begin
r1.state <= RELOAD_WAIT_ACK;

when OP_LOAD_NC =>
r1.wb.sel <= req_sel;
r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
r1.wb.sel <= r0.byte_sel;
r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.wb.we <= '0';
r1.state <= NC_LOAD_WAIT_ACK;

when OP_STORE_HIT | OP_STORE_MISS =>
-- For store-with-update do the register update
r1.update_valid <= d_in.valid and d_in.update;
r1.wb.sel <= req_sel;
r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= req_data;
r1.wb.sel <= r0.byte_sel;
r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= r0.data;
if cancel_store = '0' then
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
@ -899,9 +737,6 @@ begin
when OP_BAD =>
end case;

when PRE_NEXT_DWORD =>
r1.state <= NEXT_DWORD;

when RELOAD_WAIT_ACK =>
-- Requests are all sent if stb is 0
stbs_done := r1.wb.stb = '0';
@ -943,31 +778,23 @@ begin
-- Cache line is now valid
cache_valids(r1.store_index)(r1.store_way) <= '1';

-- Write back the load data that we got, and start
-- the second dword if necessary. Otherwise, see if
-- we also need to do the deferred update cycle.
r1.slow_valid <= '1';
if r1.two_dwords and not r1.second_dword then
r1.state <= PRE_NEXT_DWORD;
elsif r1.req.update = '1' then
r1.state <= LOAD_UPDATE;
report "completing miss with load-update !";
else
r1.state <= IDLE;
report "completing miss !";
end if;
-- Don't complete and go idle until next cycle, in
-- case the next request is for the last dword of
-- the cache line we just loaded.
r1.state <= FINISH_LD_MISS;
end if;

-- Increment store row counter
r1.store_row <= next_row(r1.store_row);
end if;

when LOAD_UPDATE =>
-- We need the extra cycle to complete a load with update
r1.update_valid <= '1';
r1.state <= IDLE;
when FINISH_LD_MISS =>
-- Write back the load data that we got
r1.slow_valid <= '1';
r1.state <= IDLE;
report "completing miss !";

when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK =>
when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK =>
-- Clear stb when slave accepted request
if wishbone_in.stall = '0' then
r1.wb.stb <= '0';
@ -975,16 +802,10 @@ begin

-- Got ack ? complete.
if wishbone_in.ack = '1' then
if r1.two_dwords and not r1.second_dword then
r1.state <= NEXT_DWORD;
elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then
r1.state <= LOAD_UPDATE;
else
r1.state <= IDLE;
end if;
if r1.state = NC_LOAD_WAIT_ACK then
r1.slow_data <= wishbone_in.dat;
end if;
r1.state <= IDLE;
r1.slow_valid <= '1';
r1.wb.cyc <= '0';
r1.wb.stb <= '0';

@ -13,7 +13,7 @@ architecture behave of dcache_tb is
signal rst : std_ulogic;

signal d_in : Loadstore1ToDcacheType;
signal d_out : DcacheToWritebackType;
signal d_out : DcacheToLoadstore1Type;

signal wb_bram_in : wishbone_master_out;
signal wb_bram_out : wishbone_slave_out;
@ -71,12 +71,6 @@ begin
d_in.nc <= '0';
d_in.addr <= (others => '0');
d_in.data <= (others => '0');
d_in.write_reg <= (others => '0');
d_in.length <= (others => '0');
d_in.byte_reverse <= '0';
d_in.sign_extend <= '0';
d_in.update <= '0';
d_in.update_reg <= (others => '0');

wait for 4*clk_period;
wait until rising_edge(clk);
@ -89,11 +83,10 @@ begin
wait until rising_edge(clk);
d_in.valid <= '0';

wait until rising_edge(clk) and d_out.write_enable = '1';
assert d_out.valid = '1';
assert d_out.write_data = x"0000000100000000"
wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.data = x"0000000100000000"
report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) &
"=" & to_hstring(d_out.data) &
" expected 0000000100000000"
severity failure;
-- wait for clk_period;
@ -106,11 +99,10 @@ begin
wait until rising_edge(clk);
d_in.valid <= '0';

wait until rising_edge(clk) and d_out.write_enable = '1';
assert d_out.valid = '1';
assert d_out.write_data = x"0000000D0000000C"
wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.data = x"0000000D0000000C"
report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) &
"=" & to_hstring(d_out.data) &
" expected 0000000D0000000C"
severity failure;

@ -121,11 +113,10 @@ begin
d_in.valid <= '1';
wait until rising_edge(clk);
d_in.valid <= '0';
wait until rising_edge(clk) and d_out.write_enable = '1';
assert d_out.valid = '1';
assert d_out.write_data = x"0000004100000040"
wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.data = x"0000004100000040"
report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) &
"=" & to_hstring(d_out.data) &
" expected 0000004100000040"
severity failure;


@ -42,7 +42,6 @@ architecture behaviour of execute1 is
next_lr : std_ulogic_vector(63 downto 0);
mul_in_progress : std_ulogic;
div_in_progress : std_ulogic;
ldst_in_progress : std_ulogic;
cntz_in_progress : std_ulogic;
slow_op_dest : gpr_index_t;
slow_op_rc : std_ulogic;
@ -264,7 +263,6 @@ begin
v.mul_in_progress := '0';
v.div_in_progress := '0';
v.cntz_in_progress := '0';
v.ldst_in_progress := '0';

-- signals to multiply unit
x_to_multiply <= Execute1ToMultiplyInit;
@ -662,8 +660,6 @@ begin
when OP_LOAD | OP_STORE =>
-- loadstore/dcache has its own port to writeback
v.e.valid := '0';
stall_out <= '1';
v.ldst_in_progress := '1';

when others =>
terminate_out <= '1';
@ -703,10 +699,6 @@ begin
v.e.rc := v.slow_op_rc;
v.e.xerc := v.slow_op_xerc;
v.e.valid := '1';
elsif r.ldst_in_progress = '1' then
-- assert stall for 2 cycles on load/store, then
-- the stall output from dcache takes over
stall_out <= '1';
elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
(r.div_in_progress = '1' and divider_to_x.valid = '1') then

@ -12,16 +12,85 @@ use work.helpers.all;
entity loadstore1 is
port (
clk : in std_ulogic;
rst : in std_ulogic;

l_in : in Execute1ToLoadstore1Type;
l_out : out Loadstore1ToWritebackType;

l_out : out Loadstore1ToDcacheType
d_out : out Loadstore1ToDcacheType;
d_in : in DcacheToLoadstore1Type;

dc_stall : in std_ulogic;
stall_out : out std_ulogic
);
end loadstore1;

-- Note, we don't currently use the stall output from the dcache because
-- we know it can take two requests without stalling when idle, we are
-- its only user, and we know it never stalls when idle.

architecture behave of loadstore1 is
signal r, rin : Loadstore1ToDcacheType;

-- State machine for unaligned loads/stores
type state_t is (IDLE, -- ready for instruction
SECOND_REQ, -- send 2nd request of unaligned xfer
FIRST_ACK_WAIT, -- waiting for 1st ack from dcache
LAST_ACK_WAIT, -- waiting for last ack from dcache
LD_UPDATE -- writing rA with computed addr on load
);

type reg_stage_t is record
-- latch most of the input request
load : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
data : std_ulogic_vector(63 downto 0);
write_reg : gpr_index_t;
length : std_ulogic_vector(3 downto 0);
byte_reverse : std_ulogic;
sign_extend : std_ulogic;
update : std_ulogic;
update_reg : gpr_index_t;
xerc : xer_common_t;
reserve : std_ulogic;
rc : std_ulogic;
nc : std_ulogic; -- non-cacheable access
state : state_t;
second_bytes : std_ulogic_vector(7 downto 0);
end record;

signal r, rin : reg_stage_t;
signal lsu_sum : std_ulogic_vector(63 downto 0);

-- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
case length is
when "0001" =>
return "00000001";
when "0010" =>
return "00000011";
when "0100" =>
return "00001111";
when "1000" =>
return "11111111";
when others =>
return "00000000";
end case;
end function length_to_sel;

-- Calculate byte enables
-- This returns 16 bits, giving the select signals for two transfers,
-- to account for unaligned loads or stores
function xfer_data_sel(size : in std_logic_vector(3 downto 0);
address : in std_logic_vector(2 downto 0))
return std_ulogic_vector is
variable longsel : std_ulogic_vector(15 downto 0);
begin
longsel := "00000000" & length_to_sel(size);
return std_ulogic_vector(shift_left(unsigned(longsel),
to_integer(unsigned(address))));
end function xfer_data_sel;

begin
-- Calculate the address in the first cycle
lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
@ -29,69 +98,180 @@ begin
loadstore1_0: process(clk)
begin
if rising_edge(clk) then
r <= rin;
if rst = '1' then
r.state <= IDLE;
else
r <= rin;
end if;
end if;
end process;

loadstore1_1: process(all)
variable v : Loadstore1ToDcacheType;
variable v : reg_stage_t;
variable brev_lenm1 : unsigned(2 downto 0);
variable byte_offset : unsigned(2 downto 0);
variable j : integer;
variable k : unsigned(2 downto 0);
variable long_sel : std_ulogic_vector(15 downto 0);
variable byte_sel : std_ulogic_vector(7 downto 0);
variable req : std_ulogic;
variable stall : std_ulogic;
variable addr : std_ulogic_vector(63 downto 0);
variable wdata : std_ulogic_vector(63 downto 0);
variable write_enable : std_ulogic;
variable do_update : std_ulogic;
variable second_dword : std_ulogic;
variable done : std_ulogic;
begin
v := r;
req := '0';
stall := '0';
done := '0';
byte_sel := (others => '0');
addr := lsu_sum;

write_enable := '0';
do_update := '0';
second_dword := '0';

case r.state is
when IDLE =>
if l_in.valid = '1' then
v.load := l_in.load;
v.addr := lsu_sum;
v.data := l_in.data;
v.write_reg := l_in.write_reg;
v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse;
v.sign_extend := l_in.sign_extend;
v.update := l_in.update;
v.update_reg := l_in.update_reg;
v.xerc := l_in.xerc;
v.reserve := l_in.reserve;
v.rc := l_in.rc;

v.valid := l_in.valid;
v.load := l_in.load;
v.write_reg := l_in.write_reg;
v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse;
v.sign_extend := l_in.sign_extend;
v.update := l_in.update;
v.update_reg := l_in.update_reg;
v.xerc := l_in.xerc;
v.reserve := l_in.reserve;
v.rc := l_in.rc;

-- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc-------
--
-- This will have to be replaced by a combination of implementing the
-- proper HV CI load/store instructions and having an MMU to get the I
-- bit otherwise.
if lsu_sum(31 downto 28) = "1100" then
v.nc := '1';
else
v.nc := '0';
end if;

-- XXX Do length_to_sel here ?

-- Do byte reversing and rotating for stores in the first cycle
if v.load = '0' then
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
-- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc-------
--
-- This will have to be replaced by a combination of implementing the
-- proper HV CI load/store instructions and having an MMU to get the I
-- bit otherwise.
if lsu_sum(31 downto 28) = "1100" then
v.nc := '1';
else
v.nc := '0';
end if;

-- Do length_to_sel and work out if we are doing 2 dwords
long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
byte_sel := long_sel(7 downto 0);
v.second_bytes := long_sel(15 downto 8);

v.addr := lsu_sum;

-- Do byte reversing and rotating for stores in the first cycle
if v.load = '0' then
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;
end if;

req := '1';
stall := '1';
if long_sel(15 downto 8) = "00000000" then
v.state := LAST_ACK_WAIT;
else
v.state := SECOND_REQ;
end if;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;
end if;

v.addr := lsu_sum;
when SECOND_REQ =>
-- compute (addr + 8) & ~7 for the second doubleword when unaligned
addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
byte_sel := r.second_bytes;
req := '1';
stall := '1';
v.state := FIRST_ACK_WAIT;

when FIRST_ACK_WAIT =>
stall := '1';
if d_in.valid = '1' then
write_enable := r.load;
v.state := LAST_ACK_WAIT;
end if;

when LAST_ACK_WAIT =>
stall := '1';
second_dword := or (r.second_bytes);
if d_in.valid = '1' then
write_enable := r.load;
if r.load = '1' and r.update = '1' then
-- loads with rA update need an extra cycle
v.state := LD_UPDATE;
else
-- stores write back rA update in this cycle
do_update := r.update;
stall := '0';
done := '1';
v.state := IDLE;
end if;
end if;

when LD_UPDATE =>
do_update := '1';
v.state := IDLE;
done := '1';
end case;

-- Update registers
rin <= v;

-- Update outputs
l_out <= r;
-- Update outputs to dcache
d_out.valid <= req;
d_out.load <= v.load;
d_out.nc <= v.nc;
d_out.reserve <= v.reserve;
d_out.addr <= addr;
d_out.data <= v.data;
d_out.byte_sel <= byte_sel;

-- Update outputs to writeback
-- Multiplex either cache data to the destination GPR or
-- the address for the rA update.
l_out.valid <= done;
if do_update = '1' then
l_out.write_enable <= '1';
l_out.write_reg <= r.update_reg;
l_out.write_data <= r.addr;
l_out.write_len <= x"8";
l_out.write_shift <= "000";
l_out.sign_extend <= '0';
l_out.byte_reverse <= '0';
l_out.second_word <= '0';
l_out.rc <= '0';
l_out.store_done <= '0';
else
l_out.write_enable <= write_enable;
l_out.write_reg <= r.write_reg;
l_out.write_data <= d_in.data;
l_out.write_len <= r.length;
l_out.write_shift <= r.addr(2 downto 0);
l_out.sign_extend <= r.sign_extend;
l_out.byte_reverse <= r.byte_reverse;
l_out.second_word <= second_dword;
l_out.rc <= r.rc and done;
l_out.store_done <= d_in.store_done;
end if;
l_out.xerc <= r.xerc;

stall_out <= stall;

-- Asynchronous output of the low-order address bits (latched in dcache)
l_out.early_low_addr <= lsu_sum(11 downto 0);
l_out.early_valid <= l_in.valid;
end process;
end;

@ -11,7 +11,7 @@ entity writeback is
clk : in std_ulogic;

e_in : in Execute1ToWritebackType;
l_in : in DcacheToWritebackType;
l_in : in Loadstore1ToWritebackType;

w_out : out WritebackToRegisterFileType;
c_out : out WritebackToCrFileType;

Loading…
Cancel
Save