Merge pull request #153 from paulusmack/master

Load/store improvements
pull/155/head
Anton Blanchard 5 years ago committed by GitHub
commit 8bee4ae3cc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -130,12 +130,13 @@ package common is
byte_reverse : std_ulogic;
sign_extend : std_ulogic; -- do we need to sign extend?
update : std_ulogic; -- is this an update instruction?
reserve : std_ulogic; -- set for larx/stcx
end record;
constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
(valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
lr => '0', rc => '0', oe => '0', invert_a => '0',
invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
is_32bit => '0', is_signed => '0', xerc => xerc_init,
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0',
byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0'));

type Execute1ToMultiplyType is record
@ -201,46 +202,47 @@ package common is
data : std_ulogic_vector(63 downto 0); -- data to write, unused for read
write_reg : gpr_index_t;
length : std_ulogic_vector(3 downto 0);
ci : std_ulogic; -- cache-inhibited load/store
byte_reverse : std_ulogic;
sign_extend : std_ulogic; -- do we need to sign extend?
update : std_ulogic; -- is this an update instruction?
update_reg : gpr_index_t; -- if so, the register to update
xerc : xer_common_t;
reserve : std_ulogic; -- set for larx/stcx.
rc : std_ulogic; -- set for stcx.
end record;
constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0',
constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', ci => '0', byte_reverse => '0',
sign_extend => '0', update => '0', xerc => xerc_init,
others => (others => '0'));
reserve => '0', rc => '0', others => (others => '0'));

type Loadstore1ToDcacheType is record
valid : std_ulogic;
load : std_ulogic;
nc : std_ulogic;
reserve : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
data : std_ulogic_vector(63 downto 0);
write_reg : gpr_index_t;
length : std_ulogic_vector(3 downto 0);
byte_reverse : std_ulogic;
sign_extend : std_ulogic;
update : std_ulogic;
update_reg : gpr_index_t;
xerc : xer_common_t;
byte_sel : std_ulogic_vector(7 downto 0);
end record;

type DcacheToLoadstore1Type is record
valid : std_ulogic;
data : std_ulogic_vector(63 downto 0);
store_done : std_ulogic;
error : std_ulogic;
end record;

type DcacheToWritebackType is record
type Loadstore1ToWritebackType is record
valid : std_ulogic;
write_enable: std_ulogic;
write_reg : gpr_index_t;
write_data : std_ulogic_vector(63 downto 0);
write_len : std_ulogic_vector(3 downto 0);
write_shift : std_ulogic_vector(2 downto 0);
sign_extend : std_ulogic;
byte_reverse : std_ulogic;
second_word : std_ulogic;
xerc : xer_common_t;
rc : std_ulogic;
store_done : std_ulogic;
end record;
constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0',
byte_reverse => '0', second_word => '0', xerc => xerc_init,
others => (others => '0'));
constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', xerc => xerc_init,
rc => '0', store_done => '0', others => (others => '0'));

type Execute1ToWritebackType is record
valid: std_ulogic;

@ -61,8 +61,11 @@ architecture behave of core is

-- load store signals
signal execute1_to_loadstore1: Execute1ToLoadstore1Type;
signal loadstore1_to_writeback: Loadstore1ToWritebackType;

-- dcache signals
signal loadstore1_to_dcache: Loadstore1ToDcacheType;
signal dcache_to_writeback: DcacheToWritebackType;
signal dcache_to_loadstore1: DcacheToLoadstore1Type;

-- local signals
signal fetch1_stall_in : std_ulogic;
@ -73,6 +76,8 @@ architecture behave of core is
signal decode2_stall_out : std_ulogic;
signal ex1_icache_inval: std_ulogic;
signal ex1_stall_out: std_ulogic;
signal ls1_stall_out: std_ulogic;
signal dcache_stall_out: std_ulogic;

signal flush: std_ulogic;

@ -195,7 +200,7 @@ begin
c_in => cr_file_to_decode2,
c_out => decode2_to_cr_file
);
decode2_stall_in <= ex1_stall_out;
decode2_stall_in <= ex1_stall_out or ls1_stall_out;

register_file_0: entity work.register_file
generic map (
@ -242,8 +247,13 @@ begin
loadstore1_0: entity work.loadstore1
port map (
clk => clk,
rst => core_rst,
l_in => execute1_to_loadstore1,
l_out => loadstore1_to_dcache
l_out => loadstore1_to_writeback,
d_out => loadstore1_to_dcache,
d_in => dcache_to_loadstore1,
dc_stall => dcache_stall_out,
stall_out => ls1_stall_out
);

dcache_0: entity work.dcache
@ -256,7 +266,8 @@ begin
clk => clk,
rst => core_rst,
d_in => loadstore1_to_dcache,
d_out => dcache_to_writeback,
d_out => dcache_to_loadstore1,
stall_out => dcache_stall_out,
wishbone_in => wishbone_data_in,
wishbone_out => wishbone_data_out
);
@ -265,7 +276,7 @@ begin
port map (
clk => clk,
e_in => execute1_to_writeback,
l_in => dcache_to_writeback,
l_in => loadstore1_to_writeback,
w_out => writeback_to_register_file,
c_out => writeback_to_cr_file,
complete_out => complete

@ -7,9 +7,6 @@
-- * Complete load misses on the cycle when WB data comes instead of
-- at the end of line (this requires dealing with requests coming in
-- while not idle...)
-- * Load with update could use one less non-pipelined cycle by moving
-- the register update to the pipeline bubble that exists when going
-- back to the IDLE state.
--
library ieee;
use ieee.std_logic_1164.all;
@ -35,7 +32,7 @@ entity dcache is
rst : in std_ulogic;

d_in : in Loadstore1ToDcacheType;
d_out : out DcacheToWritebackType;
d_out : out DcacheToLoadstore1Type;

stall_out : out std_ulogic;

@ -113,6 +110,8 @@ architecture rtl of dcache is
attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed";

signal r0 : Loadstore1ToDcacheType;

-- Type of operation on a "valid" input
type op_t is (OP_NONE,
OP_LOAD_HIT, -- Cache hit on load
@ -124,9 +123,8 @@ architecture rtl of dcache is
-- Cache state machine
type state_t is (IDLE, -- Normal load hit processing
LOAD_UPDATE, -- Load with update extra cycle
LOAD_UPDATE2, -- Load with update extra cycle
RELOAD_WAIT_ACK, -- Cache reload wait ack
FINISH_LD_MISS, -- Extra cycle after load miss
STORE_WAIT_ACK, -- Store wait ack
NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack

@ -157,13 +155,13 @@ architecture rtl of dcache is
hit_way : way_t;
hit_load_valid : std_ulogic;

-- Register update (load/store with update)
update_valid : std_ulogic;

-- Data buffer for "slow" read ops (load miss and NC loads).
slow_data : std_ulogic_vector(63 downto 0);
slow_valid : std_ulogic;

-- Signal to complete a failed stcx.
stcx_fail : std_ulogic;

-- Cache miss state (reload state machine)
state : state_t;
wb : wishbone_master_out;
@ -174,21 +172,14 @@ architecture rtl of dcache is

signal r1 : reg_stage_1_t;

-- Second stage register, only used for load hits
-- Reservation information
--
type reg_stage_2_t is record
hit_way : way_t;
hit_load_valid : std_ulogic;
load_is_update : std_ulogic;
load_reg : std_ulogic_vector(4 downto 0);
data_shift : std_ulogic_vector(2 downto 0);
length : std_ulogic_vector(3 downto 0);
sign_extend : std_ulogic;
byte_reverse : std_ulogic;
xerc : xer_common_t;
type reservation_t is record
valid : std_ulogic;
addr : std_ulogic_vector(63 downto LINE_OFF_BITS);
end record;

signal r2 : reg_stage_2_t;
signal reservation : reservation_t;

-- Async signals on incoming request
signal req_index : index_t;
@ -196,8 +187,15 @@ architecture rtl of dcache is
signal req_hit_way : way_t;
signal req_tag : cache_tag_t;
signal req_op : op_t;
signal req_data : std_ulogic_vector(63 downto 0);
signal req_laddr : std_ulogic_vector(63 downto 0);

signal early_req_row : row_t;

signal cancel_store : std_ulogic;
signal set_rsrv : std_ulogic;
signal clear_rsrv : std_ulogic;

-- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out : cache_ram_out_t;
@ -208,8 +206,7 @@ architecture rtl of dcache is
signal replace_way : way_t;

-- Wishbone read/write/cache write formatting signals
signal bus_sel : wishbone_sel_type;
signal store_data : wishbone_data_type;
signal bus_sel : std_ulogic_vector(7 downto 0);

--
-- Helper functions to decode incoming requests
@ -290,37 +287,6 @@ architecture rtl of dcache is
tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
end;

-- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
case length is
when "0001" =>
return "00000001";
when "0010" =>
return "00000011";
when "0100" =>
return "00001111";
when "1000" =>
return "11111111";
when others =>
return "00000000";
end case;
end function length_to_sel;

-- Calculate shift and byte enables for wishbone
function wishbone_data_shift(address : in std_ulogic_vector(63 downto 0)) return natural is
begin
return to_integer(unsigned(address(2 downto 0))) * 8;
end function wishbone_data_shift;

function wishbone_data_sel(size : in std_logic_vector(3 downto 0);
address : in std_logic_vector(63 downto 0))
return std_ulogic_vector is
begin
return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)),
to_integer(unsigned(address(2 downto 0)))));
end function wishbone_data_sel;

begin

assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
@ -375,6 +341,18 @@ begin
end generate;
end generate;

-- Latch the request in r0 as long as we're not stalling
stage_0 : process(clk)
begin
if rising_edge(clk) then
if rst = '1' then
r0.valid <= '0';
elsif stall_out = '0' then
r0 <= d_in;
end if;
end if;
end process;

-- Cache request parsing and hit detection
dcache_request : process(all)
variable is_hit : std_ulogic;
@ -383,23 +361,27 @@ begin
variable tmp : std_ulogic_vector(63 downto 0);
variable data : std_ulogic_vector(63 downto 0);
variable opsel : std_ulogic_vector(3 downto 0);
variable go : std_ulogic;
begin
-- Extract line, row and tag from request
req_index <= get_index(d_in.addr);
req_row <= get_row(d_in.addr);
req_tag <= get_tag(d_in.addr);
req_index <= get_index(r0.addr);
req_row <= get_row(r0.addr);
req_tag <= get_tag(r0.addr);

-- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed
--
req_laddr <= d_in.addr(63 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0');
-- Only do anything if not being stalled by stage 1
go := r0.valid and not stall_out;

-- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed
--
req_laddr <= r0.addr(63 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0');

-- Test if pending request is a hit on any way
hit_way := 0;
is_hit := '0';
for i in way_t loop
if d_in.valid = '1' and cache_valids(req_index)(i) = '1' then
if go = '1' and cache_valids(req_index)(i) = '1' then
if read_tag(i, cache_tags(req_index)) = req_tag then
hit_way := i;
is_hit := '1';
@ -416,7 +398,7 @@ begin
-- Combine the request and cache his status to decide what
-- operation needs to be done
--
opsel := d_in.valid & d_in.load & d_in.nc & is_hit;
opsel := go & r0.load & r0.nc & is_hit;
case opsel is
when "1101" => op := OP_LOAD_HIT;
when "1100" => op := OP_LOAD_MISS;
@ -431,46 +413,70 @@ begin

req_op <= op;

-- Version of the row number that is valid one cycle earlier
-- in the cases where we need to read the cache data BRAM.
-- If we're stalling then we need to keep reading the last
-- row requested.
if stall_out = '0' then
early_req_row <= get_row(d_in.addr);
else
early_req_row <= req_row;
end if;
end process;

--
-- Misc signal assignments
--

-- Wire up wishbone request latch out of stage 1
wishbone_out <= r1.wb;

-- Wishbone & BRAM write data formatting for stores (most of it already
-- happens in loadstore1, this is the remaining data shifting)
--
store_data <= std_logic_vector(shift_left(unsigned(d_in.data),
wishbone_data_shift(d_in.addr)));

-- Wishbone read and write and BRAM write sel bits generation
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);

-- TODO: Generate errors
-- err_nc_collision <= '1' when req_op = OP_BAD else '0';

-- Generate stalls from stage 1 state machine
stall_out <= '1' when r1.state /= IDLE else '0';

-- Writeback (loads and reg updates) & completion control logic
-- Handle load-with-reservation and store-conditional instructions
reservation_comb: process(all)
begin
cancel_store <= '0';
set_rsrv <= '0';
clear_rsrv <= '0';
if stall_out = '0' and r0.valid = '1' and r0.reserve = '1' then
-- XXX generate alignment interrupt if address is not aligned
-- XXX or if r0.nc = '1'
if r0.load = '1' then
-- load with reservation
set_rsrv <= '1';
else
-- store conditional
clear_rsrv <= '1';
if reservation.valid = '0' or
r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
cancel_store <= '1';
end if;
end if;
end if;
end process;

reservation_reg: process(clk)
begin
if rising_edge(clk) then
if rst = '1' or clear_rsrv = '1' then
reservation.valid <= '0';
elsif set_rsrv = '1' then
reservation.valid <= '1';
reservation.addr <= r0.addr(63 downto LINE_OFF_BITS);
end if;
end if;
end process;

-- Return data for loads & completion control logic
--
writeback_control: process(all)
begin

-- The mux on d_out.write reg defaults to the normal load hit case.
d_out.write_enable <= '0';
-- The mux on d_out.data defaults to the normal load hit case.
d_out.valid <= '0';
d_out.write_reg <= r2.load_reg;
d_out.write_data <= cache_out(r2.hit_way);
d_out.write_len <= r2.length;
d_out.write_shift <= r2.data_shift;
d_out.sign_extend <= r2.sign_extend;
d_out.byte_reverse <= r2.byte_reverse;
d_out.second_word <= '0';
d_out.xerc <= r2.xerc;
d_out.data <= cache_out(r1.hit_way);
d_out.store_done <= '0';

-- We have a valid load or store hit or we just completed a slow
-- op such as a load miss, a NC load or a store
@ -483,24 +489,17 @@ begin
--

-- Sanity: Only one of these must be set in any given cycle
assert (r1.update_valid and r2.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with update_valid"
assert (r1.slow_valid and r1.stcx_fail) /= '1' report
"unexpected slow_valid collision with stcx_fail"
severity FAILURE;
assert (r1.slow_valid and r2.hit_load_valid) /= '1' report
assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with slow_valid"
severity FAILURE;
assert (r1.slow_valid and r1.update_valid) /= '1' report
"unexpected update_valid collision with slow_valid"
severity FAILURE;

-- Delayed load hit case is the standard path
if r2.hit_load_valid = '1' then
d_out.write_enable <= '1';

-- If it's not a load with update, complete it now
if r2.load_is_update = '0' then
d_out.valid <= '1';
end if;
-- Load hit case is the standard path
if r1.hit_load_valid = '1' then
report "completing load hit";
d_out.valid <= '1';
end if;

-- Slow ops (load miss, NC, stores)
@ -509,48 +508,19 @@ begin
-- mux accordingly
--
if r1.req.load then
d_out.write_reg <= r1.req.write_reg;
d_out.write_enable <= '1';

-- Read data comes from the slow data latch, formatter
-- from the latched request.
--
d_out.write_data <= r1.slow_data;
d_out.write_shift <= r1.req.addr(2 downto 0);
d_out.sign_extend <= r1.req.sign_extend;
d_out.byte_reverse <= r1.req.byte_reverse;
d_out.write_len <= r1.req.length;
d_out.xerc <= r1.req.xerc;
-- Read data comes from the slow data latch
d_out.data <= r1.slow_data;
end if;
d_out.store_done <= '1';

-- If it's a store or a non-update load form, complete now
if r1.req.load = '0' or r1.req.update = '0' then
d_out.valid <= '1';
end if;
report "completing store or load miss";
d_out.valid <= '1';
end if;

-- We have a register update to do.
if r1.update_valid = '1' then
d_out.write_enable <= '1';
d_out.write_reg <= r1.req.update_reg;

-- Change the read data mux to the address that's going into
-- the register and the formatter does nothing.
--
d_out.write_data <= r1.req.addr;
d_out.write_shift <= "000";
d_out.write_len <= "1000";
d_out.sign_extend <= '0';
d_out.byte_reverse <= '0';
d_out.xerc <= r1.req.xerc;

-- If it was a load, this completes the operation (load with
-- update case).
--
if r1.req.load = '1' then
d_out.valid <= '1';
end if;
end if;
if r1.stcx_fail = '1' then
d_out.store_done <= '0';
d_out.valid <= '1';
end if;

end process;

@ -595,7 +565,7 @@ begin
begin
-- Cache hit reads
do_read <= '1';
rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
rd_addr <= std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS));
cache_out(i) <= dout;

-- Write mux:
@ -606,10 +576,10 @@ begin
-- other than the current state. Only the do_write signal is.
--
if r1.state = IDLE then
-- When IDLE, the only write path is the store-hit update case
-- In IDLE state, the only write path is the store-hit update case
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= store_data;
wr_sel <= bus_sel;
wr_data <= r0.data;
wr_sel <= r0.byte_sel;
else
-- Otherwise, we might be doing a reload
wr_data <= wishbone_in.dat;
@ -623,7 +593,7 @@ begin
if reloading and wishbone_in.ack = '1' and r1.store_way = i then
do_write <= '1';
end if;
if req_op = OP_STORE_HIT and req_hit_way = i then
if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' then
assert not reloading report "Store hit while in state:" &
state_t'image(r1.state)
severity FAILURE;
@ -633,36 +603,22 @@ begin
end generate;

--
-- Cache hit synchronous machine for the easy case. This handles
-- non-update form load hits and stage 1 to stage 2 transfers
-- Cache hit synchronous machine for the easy case. This handles load hits.
--
dcache_fast_hit : process(clk)
begin
if rising_edge(clk) then
-- stage 1 -> stage 2
r2.hit_load_valid <= r1.hit_load_valid;
r2.hit_way <= r1.hit_way;
r2.load_is_update <= r1.req.update;
r2.load_reg <= r1.req.write_reg;
r2.data_shift <= r1.req.addr(2 downto 0);
r2.length <= r1.req.length;
r2.sign_extend <= r1.req.sign_extend;
r2.byte_reverse <= r1.req.byte_reverse;

-- If we have a request incoming, we have to latch it as d_in.valid
-- If we have a request incoming, we have to latch it as r0.valid
-- is only set for a single cycle. It's up to the control logic to
-- ensure we don't override an uncompleted request (for now we are
-- single issue on load/stores so we are fine, later, we can generate
-- a stall output if necessary).

if req_op /= OP_NONE then
r1.req <= d_in;

if req_op /= OP_NONE and stall_out = '0' then
r1.req <= r0;
report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(d_in.addr) &
" upd:" & std_ulogic'image(d_in.update) &
" nc:" & std_ulogic'image(d_in.nc) &
" reg:" & to_hstring(d_in.write_reg) &
" addr:" & to_hstring(r0.addr) &
" nc:" & std_ulogic'image(r0.nc) &
" idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) &
" way: " & integer'image(req_hit_way);
@ -679,10 +635,9 @@ begin
end process;

--
-- Every other case is handled by this stage machine:
-- Every other case is handled by this state machine:
--
-- * Cache load miss/reload (in conjunction with "rams")
-- * Load hits for update forms
-- * Load hits for non-cachable forms
-- * Stores (the collision case is handled in "rams")
--
@ -701,7 +656,6 @@ begin
end loop;
r1.state <= IDLE;
r1.slow_valid <= '0';
r1.update_valid <= '0';
r1.wb.cyc <= '0';
r1.wb.stb <= '0';

@ -710,27 +664,19 @@ begin
else
-- One cycle pulses reset
r1.slow_valid <= '0';
r1.update_valid <= '0';

-- We cannot currently process a new request when not idle
assert req_op = OP_NONE or r1.state = IDLE report "request " &
op_t'image(req_op) & " while in state " & state_t'image(r1.state)
severity FAILURE;
r1.stcx_fail <= '0';

-- Main state machine
case r1.state is
when IDLE =>
when IDLE =>
case req_op is
when OP_LOAD_HIT =>
-- We have a load with update hit, we need the delayed update cycle
if d_in.update = '1' then
r1.state <= LOAD_UPDATE;
end if;
when OP_LOAD_HIT =>
-- stay in IDLE state

when OP_LOAD_MISS =>
when OP_LOAD_MISS =>
-- Normal load cache miss, start the reload machine
--
report "cache miss addr:" & to_hstring(d_in.addr) &
report "cache miss addr:" & to_hstring(r0.addr) &
" idx:" & integer'image(req_index) &
" way:" & integer'image(replace_way) &
" tag:" & to_hstring(req_tag);
@ -765,25 +711,26 @@ begin
r1.state <= RELOAD_WAIT_ACK;

when OP_LOAD_NC =>
r1.wb.sel <= bus_sel;
r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.sel <= r0.byte_sel;
r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.wb.we <= '0';
r1.state <= NC_LOAD_WAIT_ACK;

when OP_STORE_HIT | OP_STORE_MISS =>
-- For store-with-update do the register update
if d_in.update = '1' then
r1.update_valid <= '1';
end if;
r1.wb.sel <= bus_sel;
r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= store_data;
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.wb.we <= '1';
r1.state <= STORE_WAIT_ACK;
r1.wb.sel <= r0.byte_sel;
r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= r0.data;
if cancel_store = '0' then
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.wb.we <= '1';
r1.state <= STORE_WAIT_ACK;
else
r1.stcx_fail <= '1';
r1.state <= IDLE;
end if;

-- OP_NONE and OP_BAD do nothing
when OP_NONE =>
@ -831,32 +778,23 @@ begin
-- Cache line is now valid
cache_valids(r1.store_index)(r1.store_way) <= '1';

-- Complete the load that missed. For load with update
-- we also need to do the deferred update cycle.
--
r1.slow_valid <= '1';
if r1.req.load = '1' and r1.req.update = '1' then
r1.state <= LOAD_UPDATE;
report "completing miss with load-update !";
else
r1.state <= IDLE;
report "completing miss !";
end if;
-- Don't complete and go idle until next cycle, in
-- case the next request is for the last dword of
-- the cache line we just loaded.
r1.state <= FINISH_LD_MISS;
end if;

-- Increment store row counter
r1.store_row <= next_row(r1.store_row);
end if;

when LOAD_UPDATE =>
-- We need the extra cycle to complete a load with update
r1.state <= LOAD_UPDATE2;
when LOAD_UPDATE2 =>
-- We need the extra cycle to complete a load with update
r1.update_valid <= '1';
r1.state <= IDLE;
when FINISH_LD_MISS =>
-- Write back the load data that we got
r1.slow_valid <= '1';
r1.state <= IDLE;
report "completing miss !";

when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK =>
when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK =>
-- Clear stb when slave accepted request
if wishbone_in.stall = '0' then
r1.wb.stb <= '0';
@ -867,12 +805,12 @@ begin
if r1.state = NC_LOAD_WAIT_ACK then
r1.slow_data <= wishbone_in.dat;
end if;
r1.state <= IDLE;
r1.slow_valid <= '1';
r1.wb.cyc <= '0';
r1.wb.stb <= '0';
r1.state <= IDLE;
end if;
end case;
end case;
end if;
end if;
end process;

@ -13,7 +13,7 @@ architecture behave of dcache_tb is
signal rst : std_ulogic;

signal d_in : Loadstore1ToDcacheType;
signal d_out : DcacheToWritebackType;
signal d_out : DcacheToLoadstore1Type;

signal wb_bram_in : wishbone_master_out;
signal wb_bram_out : wishbone_slave_out;
@ -71,12 +71,6 @@ begin
d_in.nc <= '0';
d_in.addr <= (others => '0');
d_in.data <= (others => '0');
d_in.write_reg <= (others => '0');
d_in.length <= (others => '0');
d_in.byte_reverse <= '0';
d_in.sign_extend <= '0';
d_in.update <= '0';
d_in.update_reg <= (others => '0');

wait for 4*clk_period;
wait until rising_edge(clk);
@ -89,11 +83,10 @@ begin
wait until rising_edge(clk);
d_in.valid <= '0';

wait until rising_edge(clk) and d_out.write_enable = '1';
assert d_out.valid = '1';
assert d_out.write_data = x"0000000100000000"
wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.data = x"0000000100000000"
report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) &
"=" & to_hstring(d_out.data) &
" expected 0000000100000000"
severity failure;
-- wait for clk_period;
@ -106,11 +99,10 @@ begin
wait until rising_edge(clk);
d_in.valid <= '0';

wait until rising_edge(clk) and d_out.write_enable = '1';
assert d_out.valid = '1';
assert d_out.write_data = x"0000000D0000000C"
wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.data = x"0000000D0000000C"
report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) &
"=" & to_hstring(d_out.data) &
" expected 0000000D0000000C"
severity failure;

@ -121,11 +113,10 @@ begin
d_in.valid <= '1';
wait until rising_edge(clk);
d_in.valid <= '0';
wait until rising_edge(clk) and d_out.write_enable = '1';
assert d_out.valid = '1';
assert d_out.write_data = x"0000004100000040"
wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.data = x"0000004100000040"
report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) &
"=" & to_hstring(d_out.data) &
" expected 0000004100000040"
severity failure;


@ -46,26 +46,26 @@ architecture behaviour of decode1 is
16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc
11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi
10 => (ALU, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz
35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzu
42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lha
43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lhau
40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhz
41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzu
32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwz
33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzu
34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbz
35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu
42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha
43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau
40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz
41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzu
32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwz
33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzu
7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli
24 => (ALU, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori
25 => (ALU, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris
20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi
21 => (ALU, OP_RLC, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwinm
23 => (ALU, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwnm
38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- stb
39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '1'), -- stbu
44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sth
45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthu
36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stw
37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwu
38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb
39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu
44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth
45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu
36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw
37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwu
8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic
2 => (ALU, OP_TDI, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi
--PPC_TWI 3
@ -215,25 +215,29 @@ architecture behaviour of decode1 is
2#1110101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
2#1111001111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
2#1111101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
2#0000110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lbarx
2#0001110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzux
2#0001010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbzx
2#0001010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- ldarx
2#1000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- ldbrx
2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- ldux
2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- ldx
2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lharx
2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lhaux
2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lhax
2#1100010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhbrx
2#0100110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzux
2#0100010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhzx
2#0000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lwarx
2#0101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lwaux
2#0101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lwax
2#1000010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwbrx
2#0000110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzux
2#0000010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwzx
2#0000110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lbarx
2#1101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzcix
2#0001110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzux
2#0001010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzx
2#0001010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- ldarx
2#1000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldbrx
2#1101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldcix
2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux
2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx
2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx
2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux
2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax
2#1100010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhbrx
2#1100110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzcix
2#0100110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzux
2#0100010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzx
2#0000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lwarx
2#0101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lwaux
2#0101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lwax
2#1000010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwbrx
2#1100010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzcix
2#0000110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzux
2#0000010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzx
-- 2#1000000000# mcrxr
-- 2#1001000000# mcrxrx
2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf
@ -278,21 +282,25 @@ architecture behaviour of decode1 is
2#1100111000# => (ALU, OP_SHR, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- srawi
2#1000011011# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- srd
2#1000011000# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- srw
2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', RC, '0', '1'), -- stbcx
2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '1'), -- stbux
2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- stbx
2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stdbrx
2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- stdcx
2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stdux
2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stdx
2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sthbrx
2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- sthcx
2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthux
2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sthx
2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwbrx
2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- stwcx
2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwux
2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwx
2#1111010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbcix
2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stbcx
2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbux
2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbx
2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdbrx
2#1111110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdcix
2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stdcx
2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux
2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx
2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx
2#1110110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix
2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- sthcx
2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthux
2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthx
2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwbrx
2#1110010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwcix
2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stwcx
2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwux
2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwx
2#0000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subf
2#1000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfo
2#0000001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfc
@ -329,7 +337,7 @@ architecture behaviour of decode1 is

-- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl
-- op in out A out in out len ext pipe
constant attn_instr : decode_rom_t := (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1');
constant attn_instr : decode_rom_t := (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1');
constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
constant sim_cfg_instr : decode_rom_t := (ALU, OP_SIM_CONFIG,NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1');


@ -334,6 +334,7 @@ begin
v.e.byte_reverse := d_in.decode.byte_reverse;
v.e.sign_extend := d_in.decode.sign_extend;
v.e.update := d_in.decode.update;
v.e.reserve := d_in.decode.reserve;

-- issue control
control_valid_in <= d_in.valid;

@ -200,6 +200,7 @@ begin
variable bo, bi : std_ulogic_vector(4 downto 0);
variable bf, bfa : std_ulogic_vector(2 downto 0);
variable cr_op : std_ulogic_vector(9 downto 0);
variable cr_operands : std_ulogic_vector(1 downto 0);
variable bt, ba, bb : std_ulogic_vector(4 downto 0);
variable btnum, banum, bbnum : integer range 0 to 31;
variable crresult : std_ulogic;
@ -532,27 +533,10 @@ begin
btnum := 31 - to_integer(unsigned(bt));
banum := 31 - to_integer(unsigned(ba));
bbnum := 31 - to_integer(unsigned(bb));
case cr_op(8 downto 5) is
when "1001" => -- CREQV
crresult := not(e_in.cr(banum) xor e_in.cr(bbnum));
when "0111" => -- CRNAND
crresult := not(e_in.cr(banum) and e_in.cr(bbnum));
when "0100" => -- CRANDC
crresult := (e_in.cr(banum) and not e_in.cr(bbnum));
when "1000" => -- CRAND
crresult := (e_in.cr(banum) and e_in.cr(bbnum));
when "0001" => -- CRNOR
crresult := not(e_in.cr(banum) or e_in.cr(bbnum));
when "1101" => -- CRORC
crresult := (e_in.cr(banum) or not e_in.cr(bbnum));
when "0110" => -- CRXOR
crresult := (e_in.cr(banum) xor e_in.cr(bbnum));
when "1110" => -- CROR
crresult := (e_in.cr(banum) or e_in.cr(bbnum));
when others =>
crresult := '0';
report "BAD CR?";
end case;
-- Bits 5-8 of cr_op give the truth table of the requested
-- logical operation
cr_operands := e_in.cr(banum) & e_in.cr(bbnum);
crresult := cr_op(5 + to_integer(unsigned(cr_operands)));
v.e.write_cr_mask := num_to_fxm((31-btnum) / 4);
for i in 0 to 31 loop
if i = btnum then
@ -767,6 +751,13 @@ begin
lv.update := e_in.update;
lv.update_reg := gspr_to_gpr(e_in.read_reg1);
lv.xerc := v.e.xerc;
lv.reserve := e_in.reserve;
lv.rc := e_in.rc;
-- decode l*cix and st*cix instructions here
if e_in.insn(31 downto 26) = "011111" and e_in.insn(10 downto 9) = "11" and
e_in.insn(5 downto 1) = "10101" then
lv.ci := '1';
end if;

-- Update registers
rin <= v;

@ -12,16 +12,90 @@ use work.helpers.all;
entity loadstore1 is
port (
clk : in std_ulogic;
rst : in std_ulogic;

l_in : in Execute1ToLoadstore1Type;
l_out : out Loadstore1ToWritebackType;

l_out : out Loadstore1ToDcacheType
d_out : out Loadstore1ToDcacheType;
d_in : in DcacheToLoadstore1Type;

dc_stall : in std_ulogic;
stall_out : out std_ulogic
);
end loadstore1;

-- Note, we don't currently use the stall output from the dcache because
-- we know it can take two requests without stalling when idle, we are
-- its only user, and we know it never stalls when idle.

architecture behave of loadstore1 is
signal r, rin : Loadstore1ToDcacheType;

-- State machine for unaligned loads/stores
type state_t is (IDLE, -- ready for instruction
SECOND_REQ, -- send 2nd request of unaligned xfer
FIRST_ACK_WAIT, -- waiting for 1st ack from dcache
LAST_ACK_WAIT, -- waiting for last ack from dcache
LD_UPDATE -- writing rA with computed addr on load
);

type reg_stage_t is record
-- latch most of the input request
load : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
store_data : std_ulogic_vector(63 downto 0);
load_data : std_ulogic_vector(63 downto 0);
write_reg : gpr_index_t;
length : std_ulogic_vector(3 downto 0);
byte_reverse : std_ulogic;
sign_extend : std_ulogic;
update : std_ulogic;
update_reg : gpr_index_t;
xerc : xer_common_t;
reserve : std_ulogic;
rc : std_ulogic;
nc : std_ulogic; -- non-cacheable access
state : state_t;
second_bytes : std_ulogic_vector(7 downto 0);
end record;

type byte_sel_t is array(0 to 7) of std_ulogic;
subtype byte_trim_t is std_ulogic_vector(1 downto 0);
type trim_ctl_t is array(0 to 7) of byte_trim_t;

signal r, rin : reg_stage_t;
signal lsu_sum : std_ulogic_vector(63 downto 0);

-- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
case length is
when "0001" =>
return "00000001";
when "0010" =>
return "00000011";
when "0100" =>
return "00001111";
when "1000" =>
return "11111111";
when others =>
return "00000000";
end case;
end function length_to_sel;

-- Calculate byte enables
-- This returns 16 bits, giving the select signals for two transfers,
-- to account for unaligned loads or stores
function xfer_data_sel(size : in std_logic_vector(3 downto 0);
address : in std_logic_vector(2 downto 0))
return std_ulogic_vector is
variable longsel : std_ulogic_vector(15 downto 0);
begin
longsel := "00000000" & length_to_sel(size);
return std_ulogic_vector(shift_left(unsigned(longsel),
to_integer(unsigned(address))));
end function xfer_data_sel;

begin
-- Calculate the address in the first cycle
lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
@ -29,51 +103,224 @@ begin
loadstore1_0: process(clk)
begin
if rising_edge(clk) then
r <= rin;
if rst = '1' then
r.state <= IDLE;
else
r <= rin;
end if;
end if;
end process;

loadstore1_1: process(all)
variable v : Loadstore1ToDcacheType;
variable v : reg_stage_t;
variable brev_lenm1 : unsigned(2 downto 0);
variable byte_offset : unsigned(2 downto 0);
variable j : integer;
variable k : unsigned(2 downto 0);
variable kk : unsigned(3 downto 0);
variable long_sel : std_ulogic_vector(15 downto 0);
variable byte_sel : std_ulogic_vector(7 downto 0);
variable req : std_ulogic;
variable stall : std_ulogic;
variable addr : std_ulogic_vector(63 downto 0);
variable wdata : std_ulogic_vector(63 downto 0);
variable write_enable : std_ulogic;
variable do_update : std_ulogic;
variable two_dwords : std_ulogic;
variable done : std_ulogic;
variable data_permuted : std_ulogic_vector(63 downto 0);
variable data_trimmed : std_ulogic_vector(63 downto 0);
variable use_second : byte_sel_t;
variable trim_ctl : trim_ctl_t;
variable negative : std_ulogic;
begin
v := r;
req := '0';
stall := '0';
done := '0';
byte_sel := (others => '0');
addr := lsu_sum;

write_enable := '0';
do_update := '0';
two_dwords := or (r.second_bytes);

-- load data formatting
if r.load = '1' then
byte_offset := unsigned(r.addr(2 downto 0));
brev_lenm1 := "000";
if r.byte_reverse = '1' then
brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
end if;

-- shift and byte-reverse data bytes
for i in 0 to 7 loop
kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
use_second(i) := kk(3);
j := to_integer(kk(2 downto 0)) * 8;
data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
end loop;

-- Work out the sign bit for sign extension.
-- Assumes we are not doing both sign extension and byte reversal,
-- in that for unaligned loads crossing two dwords we end up
-- using a bit from the second dword, whereas for a byte-reversed
-- (i.e. big-endian) load the sign bit would be in the first dword.
negative := (r.length(3) and data_permuted(63)) or
(r.length(2) and data_permuted(31)) or
(r.length(1) and data_permuted(15)) or
(r.length(0) and data_permuted(7));

v.valid := l_in.valid;
v.load := l_in.load;
v.data := l_in.data;
v.write_reg := l_in.write_reg;
v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse;
v.sign_extend := l_in.sign_extend;
v.update := l_in.update;
v.update_reg := l_in.update_reg;
v.xerc := l_in.xerc;

-- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc-------
--
-- This will have to be replaced by a combination of implementing the
-- proper HV CI load/store instructions and having an MMU to get the I
-- bit otherwise.
if lsu_sum(31 downto 28) = "1100" then
v.nc := '1';
else
v.nc := '0';
end if;

-- XXX Do length_to_sel here ?

-- byte reverse stores in the first cycle
if v.load = '0' and l_in.byte_reverse = '1' then
v.data := byte_reverse(l_in.data, to_integer(unsigned(l_in.length)));
-- trim and sign-extend
for i in 0 to 7 loop
if i < to_integer(unsigned(r.length)) then
if two_dwords = '1' then
trim_ctl(i) := '1' & not use_second(i);
else
trim_ctl(i) := not use_second(i) & '0';
end if;
else
trim_ctl(i) := '0' & (negative and r.sign_extend);
end if;
case trim_ctl(i) is
when "11" =>
data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
when "10" =>
data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
when "01" =>
data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
when others =>
data_trimmed(i * 8 + 7 downto i * 8) := x"00";
end case;
end loop;
end if;

v.addr := lsu_sum;
case r.state is
when IDLE =>
if l_in.valid = '1' then
v.load := l_in.load;
v.addr := lsu_sum;
v.write_reg := l_in.write_reg;
v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse;
v.sign_extend := l_in.sign_extend;
v.update := l_in.update;
v.update_reg := l_in.update_reg;
v.xerc := l_in.xerc;
v.reserve := l_in.reserve;
v.rc := l_in.rc;
v.nc := l_in.ci;

-- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc-------
--
-- This will have to be replaced by a combination of implementing the
-- proper HV CI load/store instructions and having an MMU to get the I
-- bit otherwise.
if lsu_sum(31 downto 28) = "1100" then
v.nc := '1';
end if;

-- Do length_to_sel and work out if we are doing 2 dwords
long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
byte_sel := long_sel(7 downto 0);
v.second_bytes := long_sel(15 downto 8);

v.addr := lsu_sum;

-- Do byte reversing and rotating for stores in the first cycle
if v.load = '0' then
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;
end if;

req := '1';
stall := '1';
if long_sel(15 downto 8) = "00000000" then
v.state := LAST_ACK_WAIT;
else
v.state := SECOND_REQ;
end if;
end if;

when SECOND_REQ =>
-- compute (addr + 8) & ~7 for the second doubleword when unaligned
addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
byte_sel := r.second_bytes;
req := '1';
stall := '1';
v.state := FIRST_ACK_WAIT;

when FIRST_ACK_WAIT =>
stall := '1';
if d_in.valid = '1' then
v.state := LAST_ACK_WAIT;
if r.load = '1' then
v.load_data := data_permuted;
end if;
end if;

when LAST_ACK_WAIT =>
stall := '1';
if d_in.valid = '1' then
write_enable := r.load;
if r.load = '1' and r.update = '1' then
-- loads with rA update need an extra cycle
v.state := LD_UPDATE;
else
-- stores write back rA update in this cycle
do_update := r.update;
stall := '0';
done := '1';
v.state := IDLE;
end if;
end if;

when LD_UPDATE =>
do_update := '1';
v.state := IDLE;
done := '1';
end case;

-- Update outputs to dcache
d_out.valid <= req;
d_out.load <= v.load;
d_out.nc <= v.nc;
d_out.reserve <= v.reserve;
d_out.addr <= addr;
d_out.data <= v.store_data;
d_out.byte_sel <= byte_sel;

-- Update outputs to writeback
-- Multiplex either cache data to the destination GPR or
-- the address for the rA update.
l_out.valid <= done;
if do_update = '1' then
l_out.write_enable <= '1';
l_out.write_reg <= r.update_reg;
l_out.write_data <= r.addr;
else
l_out.write_enable <= write_enable;
l_out.write_reg <= r.write_reg;
l_out.write_data <= data_trimmed;
end if;
l_out.xerc <= r.xerc;
l_out.rc <= r.rc and done;
l_out.store_done <= d_in.store_done;

stall_out <= stall;

-- Update registers
rin <= v;

-- Update outputs
l_out <= r;
end process;

end;

@ -11,7 +11,7 @@ entity writeback is
clk : in std_ulogic;

e_in : in Execute1ToWritebackType;
l_in : in DcacheToWritebackType;
l_in : in Loadstore1ToWritebackType;

w_out : out WritebackToRegisterFileType;
c_out : out WritebackToCrFileType;
@ -21,48 +21,15 @@ entity writeback is
end entity writeback;

architecture behaviour of writeback is
subtype byte_index_t is unsigned(2 downto 0);
type permutation_t is array(0 to 7) of byte_index_t;
subtype byte_trim_t is std_ulogic_vector(1 downto 0);
type trim_ctl_t is array(0 to 7) of byte_trim_t;
type byte_sel_t is array(0 to 7) of std_ulogic;

signal data_len : unsigned(3 downto 0);
signal data_in : std_ulogic_vector(63 downto 0);
signal data_permuted : std_ulogic_vector(63 downto 0);
signal data_trimmed : std_ulogic_vector(63 downto 0);
signal data_latched : std_ulogic_vector(63 downto 0);
signal perm : permutation_t;
signal use_second : byte_sel_t;
signal byte_offset : unsigned(2 downto 0);
signal brev_lenm1 : unsigned(2 downto 0);
signal trim_ctl : trim_ctl_t;
signal rc : std_ulogic;
signal partial_write : std_ulogic;
signal sign_extend : std_ulogic;
signal negative : std_ulogic;
signal second_word : std_ulogic;
begin
writeback_0: process(clk)
begin
if rising_edge(clk) then
if partial_write = '1' then
data_latched <= data_permuted;
end if;
end if;
end process;

writeback_1: process(all)
variable x : std_ulogic_vector(0 downto 0);
variable y : std_ulogic_vector(0 downto 0);
variable z : std_ulogic_vector(0 downto 0);
variable w : std_ulogic_vector(0 downto 0);
variable j : integer;
variable k : unsigned(3 downto 0);
variable cf: std_ulogic_vector(3 downto 0);
variable xe: xer_common_t;
variable zero : std_ulogic;
variable sign : std_ulogic;
variable scf : std_ulogic_vector(3 downto 0);
begin
x(0) := e_in.valid;
y(0) := l_in.valid;
@ -84,17 +51,10 @@ begin
complete_out <= '1';
end if;

rc <= '0';
brev_lenm1 <= "000";
partial_write <= '0';
second_word <= '0';
xe := e_in.xerc;
data_in <= (others => '0');

if e_in.write_enable = '1' then
w_out.write_reg <= e_in.write_reg;
w_out.write_data <= e_in.write_data;
w_out.write_enable <= '1';
rc <= e_in.rc;
end if;

if e_in.write_cr_enable = '1' then
@ -108,75 +68,26 @@ begin
c_out.write_xerc_data <= e_in.xerc;
end if;

sign_extend <= l_in.sign_extend;
data_len <= unsigned(l_in.write_len);
byte_offset <= unsigned(l_in.write_shift);
if l_in.write_enable = '1' then
w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
if l_in.byte_reverse = '1' then
brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1;
end if;
w_out.write_data <= l_in.write_data;
w_out.write_enable <= '1';
second_word <= l_in.second_word;
if l_in.valid = '0' and (data_len + byte_offset > 8) then
partial_write <= '1';
end if;
xe := l_in.xerc;
end if;

-- shift and byte-reverse data bytes
for i in 0 to 7 loop
k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
perm(i) <= k(2 downto 0);
use_second(i) <= k(3);
end loop;
for i in 0 to 7 loop
j := to_integer(perm(i)) * 8;
data_permuted(i * 8 + 7 downto i * 8) <= l_in.write_data(j + 7 downto j);
end loop;

-- If the data can arrive split over two cycles, this will be correct
-- provided we don't have both sign extension and byte reversal.
negative <= (data_len(3) and data_permuted(63)) or
(data_len(2) and data_permuted(31)) or
(data_len(1) and data_permuted(15)) or
(data_len(0) and data_permuted(7));

-- trim and sign-extend
for i in 0 to 7 loop
if i < to_integer(data_len) then
if second_word = '1' then
trim_ctl(i) <= '1' & not use_second(i);
else
trim_ctl(i) <= not use_second(i) & '0';
end if;
else
trim_ctl(i) <= '0' & (negative and sign_extend);
end if;
end loop;
for i in 0 to 7 loop
case trim_ctl(i) is
when "11" =>
data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8);
when "10" =>
data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8);
when "01" =>
data_trimmed(i * 8 + 7 downto i * 8) <= x"FF";
when others =>
data_trimmed(i * 8 + 7 downto i * 8) <= x"00";
end case;
end loop;

-- deliver to regfile
if l_in.write_enable = '1' then
w_out.write_data <= data_trimmed;
else
w_out.write_data <= e_in.write_data;
if l_in.rc = '1' then
-- st*cx. instructions
scf(3) := '0';
scf(2) := '0';
scf(1) := l_in.store_done;
scf(0) := l_in.xerc.so;
c_out.write_cr_enable <= '1';
c_out.write_cr_mask <= num_to_fxm(0);
c_out.write_cr_data(31 downto 28) <= scf;
end if;

-- Perform CR0 update for RC forms
-- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data
if rc = '1' then
if e_in.rc = '1' and e_in.write_enable = '1' then
sign := e_in.write_data(63);
zero := not (or e_in.write_data);
c_out.write_cr_enable <= '1';
@ -184,7 +95,7 @@ begin
cf(3) := sign;
cf(2) := not sign and not zero;
cf(1) := zero;
cf(0) := xe.so;
cf(0) := e_in.xerc.so;
c_out.write_cr_data(31 downto 28) <= cf;
end if;
end process;

Loading…
Cancel
Save