Merge pull request #153 from paulusmack/master

Load/store improvements
pull/155/head
Anton Blanchard 5 years ago committed by GitHub
commit 8bee4ae3cc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -130,12 +130,13 @@ package common is
byte_reverse : std_ulogic; byte_reverse : std_ulogic;
sign_extend : std_ulogic; -- do we need to sign extend? sign_extend : std_ulogic; -- do we need to sign extend?
update : std_ulogic; -- is this an update instruction? update : std_ulogic; -- is this an update instruction?
reserve : std_ulogic; -- set for larx/stcx
end record; end record;
constant Decode2ToExecute1Init : Decode2ToExecute1Type := constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
(valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', (valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
lr => '0', rc => '0', oe => '0', invert_a => '0', lr => '0', rc => '0', oe => '0', invert_a => '0',
invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
is_32bit => '0', is_signed => '0', xerc => xerc_init, is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0',
byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0')); byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0'));


type Execute1ToMultiplyType is record type Execute1ToMultiplyType is record
@ -201,46 +202,47 @@ package common is
data : std_ulogic_vector(63 downto 0); -- data to write, unused for read data : std_ulogic_vector(63 downto 0); -- data to write, unused for read
write_reg : gpr_index_t; write_reg : gpr_index_t;
length : std_ulogic_vector(3 downto 0); length : std_ulogic_vector(3 downto 0);
ci : std_ulogic; -- cache-inhibited load/store
byte_reverse : std_ulogic; byte_reverse : std_ulogic;
sign_extend : std_ulogic; -- do we need to sign extend? sign_extend : std_ulogic; -- do we need to sign extend?
update : std_ulogic; -- is this an update instruction? update : std_ulogic; -- is this an update instruction?
update_reg : gpr_index_t; -- if so, the register to update update_reg : gpr_index_t; -- if so, the register to update
xerc : xer_common_t; xerc : xer_common_t;
reserve : std_ulogic; -- set for larx/stcx.
rc : std_ulogic; -- set for stcx.
end record; end record;
constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', ci => '0', byte_reverse => '0',
sign_extend => '0', update => '0', xerc => xerc_init, sign_extend => '0', update => '0', xerc => xerc_init,
others => (others => '0')); reserve => '0', rc => '0', others => (others => '0'));


type Loadstore1ToDcacheType is record type Loadstore1ToDcacheType is record
valid : std_ulogic; valid : std_ulogic;
load : std_ulogic; load : std_ulogic;
nc : std_ulogic; nc : std_ulogic;
reserve : std_ulogic;
addr : std_ulogic_vector(63 downto 0); addr : std_ulogic_vector(63 downto 0);
data : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0);
write_reg : gpr_index_t; byte_sel : std_ulogic_vector(7 downto 0);
length : std_ulogic_vector(3 downto 0); end record;
byte_reverse : std_ulogic;
sign_extend : std_ulogic; type DcacheToLoadstore1Type is record
update : std_ulogic; valid : std_ulogic;
update_reg : gpr_index_t; data : std_ulogic_vector(63 downto 0);
xerc : xer_common_t; store_done : std_ulogic;
error : std_ulogic;
end record; end record;


type DcacheToWritebackType is record type Loadstore1ToWritebackType is record
valid : std_ulogic; valid : std_ulogic;
write_enable: std_ulogic; write_enable: std_ulogic;
write_reg : gpr_index_t; write_reg : gpr_index_t;
write_data : std_ulogic_vector(63 downto 0); write_data : std_ulogic_vector(63 downto 0);
write_len : std_ulogic_vector(3 downto 0);
write_shift : std_ulogic_vector(2 downto 0);
sign_extend : std_ulogic;
byte_reverse : std_ulogic;
second_word : std_ulogic;
xerc : xer_common_t; xerc : xer_common_t;
rc : std_ulogic;
store_done : std_ulogic;
end record; end record;
constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', xerc => xerc_init,
byte_reverse => '0', second_word => '0', xerc => xerc_init, rc => '0', store_done => '0', others => (others => '0'));
others => (others => '0'));


type Execute1ToWritebackType is record type Execute1ToWritebackType is record
valid: std_ulogic; valid: std_ulogic;

@ -61,8 +61,11 @@ architecture behave of core is


-- load store signals -- load store signals
signal execute1_to_loadstore1: Execute1ToLoadstore1Type; signal execute1_to_loadstore1: Execute1ToLoadstore1Type;
signal loadstore1_to_writeback: Loadstore1ToWritebackType;

-- dcache signals
signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal loadstore1_to_dcache: Loadstore1ToDcacheType;
signal dcache_to_writeback: DcacheToWritebackType; signal dcache_to_loadstore1: DcacheToLoadstore1Type;


-- local signals -- local signals
signal fetch1_stall_in : std_ulogic; signal fetch1_stall_in : std_ulogic;
@ -73,6 +76,8 @@ architecture behave of core is
signal decode2_stall_out : std_ulogic; signal decode2_stall_out : std_ulogic;
signal ex1_icache_inval: std_ulogic; signal ex1_icache_inval: std_ulogic;
signal ex1_stall_out: std_ulogic; signal ex1_stall_out: std_ulogic;
signal ls1_stall_out: std_ulogic;
signal dcache_stall_out: std_ulogic;


signal flush: std_ulogic; signal flush: std_ulogic;


@ -195,7 +200,7 @@ begin
c_in => cr_file_to_decode2, c_in => cr_file_to_decode2,
c_out => decode2_to_cr_file c_out => decode2_to_cr_file
); );
decode2_stall_in <= ex1_stall_out; decode2_stall_in <= ex1_stall_out or ls1_stall_out;


register_file_0: entity work.register_file register_file_0: entity work.register_file
generic map ( generic map (
@ -242,8 +247,13 @@ begin
loadstore1_0: entity work.loadstore1 loadstore1_0: entity work.loadstore1
port map ( port map (
clk => clk, clk => clk,
rst => core_rst,
l_in => execute1_to_loadstore1, l_in => execute1_to_loadstore1,
l_out => loadstore1_to_dcache l_out => loadstore1_to_writeback,
d_out => loadstore1_to_dcache,
d_in => dcache_to_loadstore1,
dc_stall => dcache_stall_out,
stall_out => ls1_stall_out
); );


dcache_0: entity work.dcache dcache_0: entity work.dcache
@ -256,7 +266,8 @@ begin
clk => clk, clk => clk,
rst => core_rst, rst => core_rst,
d_in => loadstore1_to_dcache, d_in => loadstore1_to_dcache,
d_out => dcache_to_writeback, d_out => dcache_to_loadstore1,
stall_out => dcache_stall_out,
wishbone_in => wishbone_data_in, wishbone_in => wishbone_data_in,
wishbone_out => wishbone_data_out wishbone_out => wishbone_data_out
); );
@ -265,7 +276,7 @@ begin
port map ( port map (
clk => clk, clk => clk,
e_in => execute1_to_writeback, e_in => execute1_to_writeback,
l_in => dcache_to_writeback, l_in => loadstore1_to_writeback,
w_out => writeback_to_register_file, w_out => writeback_to_register_file,
c_out => writeback_to_cr_file, c_out => writeback_to_cr_file,
complete_out => complete complete_out => complete

@ -7,9 +7,6 @@
-- * Complete load misses on the cycle when WB data comes instead of -- * Complete load misses on the cycle when WB data comes instead of
-- at the end of line (this requires dealing with requests coming in -- at the end of line (this requires dealing with requests coming in
-- while not idle...) -- while not idle...)
-- * Load with update could use one less non-pipelined cycle by moving
-- the register update to the pipeline bubble that exists when going
-- back to the IDLE state.
-- --
library ieee; library ieee;
use ieee.std_logic_1164.all; use ieee.std_logic_1164.all;
@ -35,7 +32,7 @@ entity dcache is
rst : in std_ulogic; rst : in std_ulogic;


d_in : in Loadstore1ToDcacheType; d_in : in Loadstore1ToDcacheType;
d_out : out DcacheToWritebackType; d_out : out DcacheToLoadstore1Type;


stall_out : out std_ulogic; stall_out : out std_ulogic;


@ -113,6 +110,8 @@ architecture rtl of dcache is
attribute ram_style : string; attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed"; attribute ram_style of cache_tags : signal is "distributed";


signal r0 : Loadstore1ToDcacheType;

-- Type of operation on a "valid" input -- Type of operation on a "valid" input
type op_t is (OP_NONE, type op_t is (OP_NONE,
OP_LOAD_HIT, -- Cache hit on load OP_LOAD_HIT, -- Cache hit on load
@ -124,9 +123,8 @@ architecture rtl of dcache is
-- Cache state machine -- Cache state machine
type state_t is (IDLE, -- Normal load hit processing type state_t is (IDLE, -- Normal load hit processing
LOAD_UPDATE, -- Load with update extra cycle
LOAD_UPDATE2, -- Load with update extra cycle
RELOAD_WAIT_ACK, -- Cache reload wait ack RELOAD_WAIT_ACK, -- Cache reload wait ack
FINISH_LD_MISS, -- Extra cycle after load miss
STORE_WAIT_ACK, -- Store wait ack STORE_WAIT_ACK, -- Store wait ack
NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack


@ -157,13 +155,13 @@ architecture rtl of dcache is
hit_way : way_t; hit_way : way_t;
hit_load_valid : std_ulogic; hit_load_valid : std_ulogic;


-- Register update (load/store with update)
update_valid : std_ulogic;

-- Data buffer for "slow" read ops (load miss and NC loads). -- Data buffer for "slow" read ops (load miss and NC loads).
slow_data : std_ulogic_vector(63 downto 0); slow_data : std_ulogic_vector(63 downto 0);
slow_valid : std_ulogic; slow_valid : std_ulogic;


-- Signal to complete a failed stcx.
stcx_fail : std_ulogic;

-- Cache miss state (reload state machine) -- Cache miss state (reload state machine)
state : state_t; state : state_t;
wb : wishbone_master_out; wb : wishbone_master_out;
@ -174,21 +172,14 @@ architecture rtl of dcache is


signal r1 : reg_stage_1_t; signal r1 : reg_stage_1_t;


-- Second stage register, only used for load hits -- Reservation information
-- --
type reg_stage_2_t is record type reservation_t is record
hit_way : way_t; valid : std_ulogic;
hit_load_valid : std_ulogic; addr : std_ulogic_vector(63 downto LINE_OFF_BITS);
load_is_update : std_ulogic;
load_reg : std_ulogic_vector(4 downto 0);
data_shift : std_ulogic_vector(2 downto 0);
length : std_ulogic_vector(3 downto 0);
sign_extend : std_ulogic;
byte_reverse : std_ulogic;
xerc : xer_common_t;
end record; end record;


signal r2 : reg_stage_2_t; signal reservation : reservation_t;


-- Async signals on incoming request -- Async signals on incoming request
signal req_index : index_t; signal req_index : index_t;
@ -196,8 +187,15 @@ architecture rtl of dcache is
signal req_hit_way : way_t; signal req_hit_way : way_t;
signal req_tag : cache_tag_t; signal req_tag : cache_tag_t;
signal req_op : op_t; signal req_op : op_t;
signal req_data : std_ulogic_vector(63 downto 0);
signal req_laddr : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0);


signal early_req_row : row_t;

signal cancel_store : std_ulogic;
signal set_rsrv : std_ulogic;
signal clear_rsrv : std_ulogic;

-- Cache RAM interface -- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t; type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out : cache_ram_out_t; signal cache_out : cache_ram_out_t;
@ -208,8 +206,7 @@ architecture rtl of dcache is
signal replace_way : way_t; signal replace_way : way_t;


-- Wishbone read/write/cache write formatting signals -- Wishbone read/write/cache write formatting signals
signal bus_sel : wishbone_sel_type; signal bus_sel : std_ulogic_vector(7 downto 0);
signal store_data : wishbone_data_type;


-- --
-- Helper functions to decode incoming requests -- Helper functions to decode incoming requests
@ -290,37 +287,6 @@ architecture rtl of dcache is
tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
end; end;


-- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
case length is
when "0001" =>
return "00000001";
when "0010" =>
return "00000011";
when "0100" =>
return "00001111";
when "1000" =>
return "11111111";
when others =>
return "00000000";
end case;
end function length_to_sel;

-- Calculate shift and byte enables for wishbone
function wishbone_data_shift(address : in std_ulogic_vector(63 downto 0)) return natural is
begin
return to_integer(unsigned(address(2 downto 0))) * 8;
end function wishbone_data_shift;

function wishbone_data_sel(size : in std_logic_vector(3 downto 0);
address : in std_logic_vector(63 downto 0))
return std_ulogic_vector is
begin
return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)),
to_integer(unsigned(address(2 downto 0)))));
end function wishbone_data_sel;

begin begin


assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
@ -375,6 +341,18 @@ begin
end generate; end generate;
end generate; end generate;


-- Latch the request in r0 as long as we're not stalling
stage_0 : process(clk)
begin
if rising_edge(clk) then
if rst = '1' then
r0.valid <= '0';
elsif stall_out = '0' then
r0 <= d_in;
end if;
end if;
end process;

-- Cache request parsing and hit detection -- Cache request parsing and hit detection
dcache_request : process(all) dcache_request : process(all)
variable is_hit : std_ulogic; variable is_hit : std_ulogic;
@ -383,23 +361,27 @@ begin
variable tmp : std_ulogic_vector(63 downto 0); variable tmp : std_ulogic_vector(63 downto 0);
variable data : std_ulogic_vector(63 downto 0); variable data : std_ulogic_vector(63 downto 0);
variable opsel : std_ulogic_vector(3 downto 0); variable opsel : std_ulogic_vector(3 downto 0);
variable go : std_ulogic;
begin begin
-- Extract line, row and tag from request -- Extract line, row and tag from request
req_index <= get_index(d_in.addr); req_index <= get_index(r0.addr);
req_row <= get_row(d_in.addr); req_row <= get_row(r0.addr);
req_tag <= get_tag(d_in.addr); req_tag <= get_tag(r0.addr);

-- Only do anything if not being stalled by stage 1
go := r0.valid and not stall_out;


-- Calculate address of beginning of cache line, will be -- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed -- used for cache miss processing if needed
-- --
req_laddr <= d_in.addr(63 downto LINE_OFF_BITS) & req_laddr <= r0.addr(63 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0'); (LINE_OFF_BITS-1 downto 0 => '0');


-- Test if pending request is a hit on any way -- Test if pending request is a hit on any way
hit_way := 0; hit_way := 0;
is_hit := '0'; is_hit := '0';
for i in way_t loop for i in way_t loop
if d_in.valid = '1' and cache_valids(req_index)(i) = '1' then if go = '1' and cache_valids(req_index)(i) = '1' then
if read_tag(i, cache_tags(req_index)) = req_tag then if read_tag(i, cache_tags(req_index)) = req_tag then
hit_way := i; hit_way := i;
is_hit := '1'; is_hit := '1';
@ -416,7 +398,7 @@ begin
-- Combine the request and cache his status to decide what -- Combine the request and cache his status to decide what
-- operation needs to be done -- operation needs to be done
-- --
opsel := d_in.valid & d_in.load & d_in.nc & is_hit; opsel := go & r0.load & r0.nc & is_hit;
case opsel is case opsel is
when "1101" => op := OP_LOAD_HIT; when "1101" => op := OP_LOAD_HIT;
when "1100" => op := OP_LOAD_MISS; when "1100" => op := OP_LOAD_MISS;
@ -431,46 +413,70 @@ begin


req_op <= op; req_op <= op;


-- Version of the row number that is valid one cycle earlier
-- in the cases where we need to read the cache data BRAM.
-- If we're stalling then we need to keep reading the last
-- row requested.
if stall_out = '0' then
early_req_row <= get_row(d_in.addr);
else
early_req_row <= req_row;
end if;
end process; end process;


--
-- Misc signal assignments
--

-- Wire up wishbone request latch out of stage 1 -- Wire up wishbone request latch out of stage 1
wishbone_out <= r1.wb; wishbone_out <= r1.wb;


-- Wishbone & BRAM write data formatting for stores (most of it already
-- happens in loadstore1, this is the remaining data shifting)
--
store_data <= std_logic_vector(shift_left(unsigned(d_in.data),
wishbone_data_shift(d_in.addr)));

-- Wishbone read and write and BRAM write sel bits generation
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);

-- TODO: Generate errors -- TODO: Generate errors
-- err_nc_collision <= '1' when req_op = OP_BAD else '0'; -- err_nc_collision <= '1' when req_op = OP_BAD else '0';


-- Generate stalls from stage 1 state machine -- Generate stalls from stage 1 state machine
stall_out <= '1' when r1.state /= IDLE else '0'; stall_out <= '1' when r1.state /= IDLE else '0';


-- Writeback (loads and reg updates) & completion control logic -- Handle load-with-reservation and store-conditional instructions
reservation_comb: process(all)
begin
cancel_store <= '0';
set_rsrv <= '0';
clear_rsrv <= '0';
if stall_out = '0' and r0.valid = '1' and r0.reserve = '1' then
-- XXX generate alignment interrupt if address is not aligned
-- XXX or if r0.nc = '1'
if r0.load = '1' then
-- load with reservation
set_rsrv <= '1';
else
-- store conditional
clear_rsrv <= '1';
if reservation.valid = '0' or
r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
cancel_store <= '1';
end if;
end if;
end if;
end process;

reservation_reg: process(clk)
begin
if rising_edge(clk) then
if rst = '1' or clear_rsrv = '1' then
reservation.valid <= '0';
elsif set_rsrv = '1' then
reservation.valid <= '1';
reservation.addr <= r0.addr(63 downto LINE_OFF_BITS);
end if;
end if;
end process;

-- Return data for loads & completion control logic
-- --
writeback_control: process(all) writeback_control: process(all)
begin begin


-- The mux on d_out.write reg defaults to the normal load hit case. -- The mux on d_out.data defaults to the normal load hit case.
d_out.write_enable <= '0';
d_out.valid <= '0'; d_out.valid <= '0';
d_out.write_reg <= r2.load_reg; d_out.data <= cache_out(r1.hit_way);
d_out.write_data <= cache_out(r2.hit_way); d_out.store_done <= '0';
d_out.write_len <= r2.length;
d_out.write_shift <= r2.data_shift;
d_out.sign_extend <= r2.sign_extend;
d_out.byte_reverse <= r2.byte_reverse;
d_out.second_word <= '0';
d_out.xerc <= r2.xerc;


-- We have a valid load or store hit or we just completed a slow -- We have a valid load or store hit or we just completed a slow
-- op such as a load miss, a NC load or a store -- op such as a load miss, a NC load or a store
@ -483,25 +489,18 @@ begin
-- --


-- Sanity: Only one of these must be set in any given cycle -- Sanity: Only one of these must be set in any given cycle
assert (r1.update_valid and r2.hit_load_valid) /= '1' report assert (r1.slow_valid and r1.stcx_fail) /= '1' report
"unexpected hit_load_delayed collision with update_valid" "unexpected slow_valid collision with stcx_fail"
severity FAILURE; severity FAILURE;
assert (r1.slow_valid and r2.hit_load_valid) /= '1' report assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) /= '1' report
"unexpected hit_load_delayed collision with slow_valid" "unexpected hit_load_delayed collision with slow_valid"
severity FAILURE; severity FAILURE;
assert (r1.slow_valid and r1.update_valid) /= '1' report
"unexpected update_valid collision with slow_valid"
severity FAILURE;

-- Delayed load hit case is the standard path
if r2.hit_load_valid = '1' then
d_out.write_enable <= '1';


-- If it's not a load with update, complete it now -- Load hit case is the standard path
if r2.load_is_update = '0' then if r1.hit_load_valid = '1' then
report "completing load hit";
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;
end if;


-- Slow ops (load miss, NC, stores) -- Slow ops (load miss, NC, stores)
if r1.slow_valid = '1' then if r1.slow_valid = '1' then
@ -509,48 +508,19 @@ begin
-- mux accordingly -- mux accordingly
-- --
if r1.req.load then if r1.req.load then
d_out.write_reg <= r1.req.write_reg; -- Read data comes from the slow data latch
d_out.write_enable <= '1'; d_out.data <= r1.slow_data;

-- Read data comes from the slow data latch, formatter
-- from the latched request.
--
d_out.write_data <= r1.slow_data;
d_out.write_shift <= r1.req.addr(2 downto 0);
d_out.sign_extend <= r1.req.sign_extend;
d_out.byte_reverse <= r1.req.byte_reverse;
d_out.write_len <= r1.req.length;
d_out.xerc <= r1.req.xerc;
end if; end if;
d_out.store_done <= '1';


-- If it's a store or a non-update load form, complete now report "completing store or load miss";
if r1.req.load = '0' or r1.req.update = '0' then
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;
end if;

-- We have a register update to do.
if r1.update_valid = '1' then
d_out.write_enable <= '1';
d_out.write_reg <= r1.req.update_reg;


-- Change the read data mux to the address that's going into if r1.stcx_fail = '1' then
-- the register and the formatter does nothing. d_out.store_done <= '0';
--
d_out.write_data <= r1.req.addr;
d_out.write_shift <= "000";
d_out.write_len <= "1000";
d_out.sign_extend <= '0';
d_out.byte_reverse <= '0';
d_out.xerc <= r1.req.xerc;

-- If it was a load, this completes the operation (load with
-- update case).
--
if r1.req.load = '1' then
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;
end if;


end process; end process;


@ -595,7 +565,7 @@ begin
begin begin
-- Cache hit reads -- Cache hit reads
do_read <= '1'; do_read <= '1';
rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); rd_addr <= std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS));
cache_out(i) <= dout; cache_out(i) <= dout;


-- Write mux: -- Write mux:
@ -606,10 +576,10 @@ begin
-- other than the current state. Only the do_write signal is. -- other than the current state. Only the do_write signal is.
-- --
if r1.state = IDLE then if r1.state = IDLE then
-- When IDLE, the only write path is the store-hit update case -- In IDLE state, the only write path is the store-hit update case
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= store_data; wr_data <= r0.data;
wr_sel <= bus_sel; wr_sel <= r0.byte_sel;
else else
-- Otherwise, we might be doing a reload -- Otherwise, we might be doing a reload
wr_data <= wishbone_in.dat; wr_data <= wishbone_in.dat;
@ -623,7 +593,7 @@ begin
if reloading and wishbone_in.ack = '1' and r1.store_way = i then if reloading and wishbone_in.ack = '1' and r1.store_way = i then
do_write <= '1'; do_write <= '1';
end if; end if;
if req_op = OP_STORE_HIT and req_hit_way = i then if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' then
assert not reloading report "Store hit while in state:" & assert not reloading report "Store hit while in state:" &
state_t'image(r1.state) state_t'image(r1.state)
severity FAILURE; severity FAILURE;
@ -633,36 +603,22 @@ begin
end generate; end generate;


-- --
-- Cache hit synchronous machine for the easy case. This handles -- Cache hit synchronous machine for the easy case. This handles load hits.
-- non-update form load hits and stage 1 to stage 2 transfers
-- --
dcache_fast_hit : process(clk) dcache_fast_hit : process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
-- stage 1 -> stage 2 -- If we have a request incoming, we have to latch it as r0.valid
r2.hit_load_valid <= r1.hit_load_valid;
r2.hit_way <= r1.hit_way;
r2.load_is_update <= r1.req.update;
r2.load_reg <= r1.req.write_reg;
r2.data_shift <= r1.req.addr(2 downto 0);
r2.length <= r1.req.length;
r2.sign_extend <= r1.req.sign_extend;
r2.byte_reverse <= r1.req.byte_reverse;

-- If we have a request incoming, we have to latch it as d_in.valid
-- is only set for a single cycle. It's up to the control logic to -- is only set for a single cycle. It's up to the control logic to
-- ensure we don't override an uncompleted request (for now we are -- ensure we don't override an uncompleted request (for now we are
-- single issue on load/stores so we are fine, later, we can generate -- single issue on load/stores so we are fine, later, we can generate
-- a stall output if necessary). -- a stall output if necessary).


if req_op /= OP_NONE then if req_op /= OP_NONE and stall_out = '0' then
r1.req <= d_in; r1.req <= r0;

report "op:" & op_t'image(req_op) & report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(d_in.addr) & " addr:" & to_hstring(r0.addr) &
" upd:" & std_ulogic'image(d_in.update) & " nc:" & std_ulogic'image(r0.nc) &
" nc:" & std_ulogic'image(d_in.nc) &
" reg:" & to_hstring(d_in.write_reg) &
" idx:" & integer'image(req_index) & " idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) & " tag:" & to_hstring(req_tag) &
" way: " & integer'image(req_hit_way); " way: " & integer'image(req_hit_way);
@ -679,10 +635,9 @@ begin
end process; end process;


-- --
-- Every other case is handled by this stage machine: -- Every other case is handled by this state machine:
-- --
-- * Cache load miss/reload (in conjunction with "rams") -- * Cache load miss/reload (in conjunction with "rams")
-- * Load hits for update forms
-- * Load hits for non-cachable forms -- * Load hits for non-cachable forms
-- * Stores (the collision case is handled in "rams") -- * Stores (the collision case is handled in "rams")
-- --
@ -701,7 +656,6 @@ begin
end loop; end loop;
r1.state <= IDLE; r1.state <= IDLE;
r1.slow_valid <= '0'; r1.slow_valid <= '0';
r1.update_valid <= '0';
r1.wb.cyc <= '0'; r1.wb.cyc <= '0';
r1.wb.stb <= '0'; r1.wb.stb <= '0';


@ -710,27 +664,19 @@ begin
else else
-- One cycle pulses reset -- One cycle pulses reset
r1.slow_valid <= '0'; r1.slow_valid <= '0';
r1.update_valid <= '0'; r1.stcx_fail <= '0';

-- We cannot currently process a new request when not idle
assert req_op = OP_NONE or r1.state = IDLE report "request " &
op_t'image(req_op) & " while in state " & state_t'image(r1.state)
severity FAILURE;


-- Main state machine -- Main state machine
case r1.state is case r1.state is
when IDLE => when IDLE =>
case req_op is case req_op is
when OP_LOAD_HIT => when OP_LOAD_HIT =>
-- We have a load with update hit, we need the delayed update cycle -- stay in IDLE state
if d_in.update = '1' then
r1.state <= LOAD_UPDATE;
end if;


when OP_LOAD_MISS => when OP_LOAD_MISS =>
-- Normal load cache miss, start the reload machine -- Normal load cache miss, start the reload machine
-- --
report "cache miss addr:" & to_hstring(d_in.addr) & report "cache miss addr:" & to_hstring(r0.addr) &
" idx:" & integer'image(req_index) & " idx:" & integer'image(req_index) &
" way:" & integer'image(replace_way) & " way:" & integer'image(replace_way) &
" tag:" & to_hstring(req_tag); " tag:" & to_hstring(req_tag);
@ -765,25 +711,26 @@ begin
r1.state <= RELOAD_WAIT_ACK; r1.state <= RELOAD_WAIT_ACK;


when OP_LOAD_NC => when OP_LOAD_NC =>
r1.wb.sel <= bus_sel; r1.wb.sel <= r0.byte_sel;
r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000"; r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';
r1.wb.we <= '0'; r1.wb.we <= '0';
r1.state <= NC_LOAD_WAIT_ACK; r1.state <= NC_LOAD_WAIT_ACK;


when OP_STORE_HIT | OP_STORE_MISS => when OP_STORE_HIT | OP_STORE_MISS =>
-- For store-with-update do the register update r1.wb.sel <= r0.byte_sel;
if d_in.update = '1' then r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
r1.update_valid <= '1'; r1.wb.dat <= r0.data;
end if; if cancel_store = '0' then
r1.wb.sel <= bus_sel;
r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= store_data;
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';
r1.wb.we <= '1'; r1.wb.we <= '1';
r1.state <= STORE_WAIT_ACK; r1.state <= STORE_WAIT_ACK;
else
r1.stcx_fail <= '1';
r1.state <= IDLE;
end if;


-- OP_NONE and OP_BAD do nothing -- OP_NONE and OP_BAD do nothing
when OP_NONE => when OP_NONE =>
@ -831,30 +778,21 @@ begin
-- Cache line is now valid -- Cache line is now valid
cache_valids(r1.store_index)(r1.store_way) <= '1'; cache_valids(r1.store_index)(r1.store_way) <= '1';


-- Complete the load that missed. For load with update -- Don't complete and go idle until next cycle, in
-- we also need to do the deferred update cycle. -- case the next request is for the last dword of
-- -- the cache line we just loaded.
r1.slow_valid <= '1'; r1.state <= FINISH_LD_MISS;
if r1.req.load = '1' and r1.req.update = '1' then
r1.state <= LOAD_UPDATE;
report "completing miss with load-update !";
else
r1.state <= IDLE;
report "completing miss !";
end if;
end if; end if;


-- Increment store row counter -- Increment store row counter
r1.store_row <= next_row(r1.store_row); r1.store_row <= next_row(r1.store_row);
end if; end if;


when LOAD_UPDATE => when FINISH_LD_MISS =>
-- We need the extra cycle to complete a load with update -- Write back the load data that we got
r1.state <= LOAD_UPDATE2; r1.slow_valid <= '1';
when LOAD_UPDATE2 =>
-- We need the extra cycle to complete a load with update
r1.update_valid <= '1';
r1.state <= IDLE; r1.state <= IDLE;
report "completing miss !";


when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK =>
-- Clear stb when slave accepted request -- Clear stb when slave accepted request
@ -867,10 +805,10 @@ begin
if r1.state = NC_LOAD_WAIT_ACK then if r1.state = NC_LOAD_WAIT_ACK then
r1.slow_data <= wishbone_in.dat; r1.slow_data <= wishbone_in.dat;
end if; end if;
r1.state <= IDLE;
r1.slow_valid <= '1'; r1.slow_valid <= '1';
r1.wb.cyc <= '0'; r1.wb.cyc <= '0';
r1.wb.stb <= '0'; r1.wb.stb <= '0';
r1.state <= IDLE;
end if; end if;
end case; end case;
end if; end if;

@ -13,7 +13,7 @@ architecture behave of dcache_tb is
signal rst : std_ulogic; signal rst : std_ulogic;


signal d_in : Loadstore1ToDcacheType; signal d_in : Loadstore1ToDcacheType;
signal d_out : DcacheToWritebackType; signal d_out : DcacheToLoadstore1Type;


signal wb_bram_in : wishbone_master_out; signal wb_bram_in : wishbone_master_out;
signal wb_bram_out : wishbone_slave_out; signal wb_bram_out : wishbone_slave_out;
@ -71,12 +71,6 @@ begin
d_in.nc <= '0'; d_in.nc <= '0';
d_in.addr <= (others => '0'); d_in.addr <= (others => '0');
d_in.data <= (others => '0'); d_in.data <= (others => '0');
d_in.write_reg <= (others => '0');
d_in.length <= (others => '0');
d_in.byte_reverse <= '0';
d_in.sign_extend <= '0';
d_in.update <= '0';
d_in.update_reg <= (others => '0');


wait for 4*clk_period; wait for 4*clk_period;
wait until rising_edge(clk); wait until rising_edge(clk);
@ -89,11 +83,10 @@ begin
wait until rising_edge(clk); wait until rising_edge(clk);
d_in.valid <= '0'; d_in.valid <= '0';


wait until rising_edge(clk) and d_out.write_enable = '1'; wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.valid = '1'; assert d_out.data = x"0000000100000000"
assert d_out.write_data = x"0000000100000000"
report "data @" & to_hstring(d_in.addr) & report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) & "=" & to_hstring(d_out.data) &
" expected 0000000100000000" " expected 0000000100000000"
severity failure; severity failure;
-- wait for clk_period; -- wait for clk_period;
@ -106,11 +99,10 @@ begin
wait until rising_edge(clk); wait until rising_edge(clk);
d_in.valid <= '0'; d_in.valid <= '0';


wait until rising_edge(clk) and d_out.write_enable = '1'; wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.valid = '1'; assert d_out.data = x"0000000D0000000C"
assert d_out.write_data = x"0000000D0000000C"
report "data @" & to_hstring(d_in.addr) & report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) & "=" & to_hstring(d_out.data) &
" expected 0000000D0000000C" " expected 0000000D0000000C"
severity failure; severity failure;


@ -121,11 +113,10 @@ begin
d_in.valid <= '1'; d_in.valid <= '1';
wait until rising_edge(clk); wait until rising_edge(clk);
d_in.valid <= '0'; d_in.valid <= '0';
wait until rising_edge(clk) and d_out.write_enable = '1'; wait until rising_edge(clk) and d_out.valid = '1';
assert d_out.valid = '1'; assert d_out.data = x"0000004100000040"
assert d_out.write_data = x"0000004100000040"
report "data @" & to_hstring(d_in.addr) & report "data @" & to_hstring(d_in.addr) &
"=" & to_hstring(d_out.write_data) & "=" & to_hstring(d_out.data) &
" expected 0000004100000040" " expected 0000004100000040"
severity failure; severity failure;



@ -46,26 +46,26 @@ architecture behaviour of decode1 is
16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc 16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc
11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi
10 => (ALU, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli 10 => (ALU, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbz
35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzu 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu
42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lha 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha
43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lhau 43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau
40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhz 40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz
41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzu 41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzu
32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwz 32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwz
33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzu 33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzu
7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli 7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli
24 => (ALU, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori 24 => (ALU, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori
25 => (ALU, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris 25 => (ALU, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris
20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi 20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi
21 => (ALU, OP_RLC, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwinm 21 => (ALU, OP_RLC, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwinm
23 => (ALU, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwnm 23 => (ALU, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwnm
38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- stb 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb
39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '1'), -- stbu 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu
44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sth 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth
45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthu 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu
36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stw 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw
37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwu 37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwu
8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic
2 => (ALU, OP_TDI, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi 2 => (ALU, OP_TDI, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi
--PPC_TWI 3 --PPC_TWI 3
@ -215,25 +215,29 @@ architecture behaviour of decode1 is
2#1110101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel 2#1110101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
2#1111001111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel 2#1111001111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
2#1111101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel 2#1111101111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
2#0000110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lbarx 2#0000110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lbarx
2#0001110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzux 2#1101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzcix
2#0001010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbzx 2#0001110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzux
2#0001010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- ldarx 2#0001010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzx
2#1000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- ldbrx 2#0001010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- ldarx
2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- ldux 2#1000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldbrx
2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- ldx 2#1101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldcix
2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lharx 2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux
2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lhaux 2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx
2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lhax 2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx
2#1100010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhbrx 2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux
2#0100110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzux 2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax
2#0100010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lhzx 2#1100010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhbrx
2#0000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- lwarx 2#1100110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzcix
2#0101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '1'), -- lwaux 2#0100110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzux
2#0101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lwax 2#0100010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzx
2#1000010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwbrx 2#0000010100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lwarx
2#0000110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzux 2#0101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lwaux
2#0000010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwzx 2#0101010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lwax
2#1000010110# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwbrx
2#1100010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzcix
2#0000110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzux
2#0000010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzx
-- 2#1000000000# mcrxr -- 2#1000000000# mcrxr
-- 2#1001000000# mcrxrx -- 2#1001000000# mcrxrx
2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf
@ -278,21 +282,25 @@ architecture behaviour of decode1 is
2#1100111000# => (ALU, OP_SHR, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- srawi 2#1100111000# => (ALU, OP_SHR, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- srawi
2#1000011011# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- srd 2#1000011011# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- srd
2#1000011000# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- srw 2#1000011000# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- srw
2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', RC, '0', '1'), -- stbcx 2#1111010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbcix
2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '1'), -- stbux 2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stbcx
2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- stbx 2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbux
2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stdbrx 2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbx
2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- stdcx 2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdbrx
2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stdux 2#1111110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdcix
2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stdx 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stdcx
2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sthbrx 2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux
2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- sthcx 2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx
2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthux 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx
2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sthx 2#1110110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix
2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwbrx 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- sthcx
2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '1'), -- stwcx 2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthux
2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwux 2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthx
2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwx 2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwbrx
2#1110010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwcix
2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stwcx
2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwux
2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwx
2#0000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subf 2#0000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subf
2#1000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfo 2#1000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfo
2#0000001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfc 2#0000001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfc
@ -329,7 +337,7 @@ architecture behaviour of decode1 is


-- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl
-- op in out A out in out len ext pipe -- op in out A out in out len ext pipe
constant attn_instr : decode_rom_t := (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'); constant attn_instr : decode_rom_t := (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1');
constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
constant sim_cfg_instr : decode_rom_t := (ALU, OP_SIM_CONFIG,NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'); constant sim_cfg_instr : decode_rom_t := (ALU, OP_SIM_CONFIG,NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1');



@ -334,6 +334,7 @@ begin
v.e.byte_reverse := d_in.decode.byte_reverse; v.e.byte_reverse := d_in.decode.byte_reverse;
v.e.sign_extend := d_in.decode.sign_extend; v.e.sign_extend := d_in.decode.sign_extend;
v.e.update := d_in.decode.update; v.e.update := d_in.decode.update;
v.e.reserve := d_in.decode.reserve;


-- issue control -- issue control
control_valid_in <= d_in.valid; control_valid_in <= d_in.valid;

@ -200,6 +200,7 @@ begin
variable bo, bi : std_ulogic_vector(4 downto 0); variable bo, bi : std_ulogic_vector(4 downto 0);
variable bf, bfa : std_ulogic_vector(2 downto 0); variable bf, bfa : std_ulogic_vector(2 downto 0);
variable cr_op : std_ulogic_vector(9 downto 0); variable cr_op : std_ulogic_vector(9 downto 0);
variable cr_operands : std_ulogic_vector(1 downto 0);
variable bt, ba, bb : std_ulogic_vector(4 downto 0); variable bt, ba, bb : std_ulogic_vector(4 downto 0);
variable btnum, banum, bbnum : integer range 0 to 31; variable btnum, banum, bbnum : integer range 0 to 31;
variable crresult : std_ulogic; variable crresult : std_ulogic;
@ -532,27 +533,10 @@ begin
btnum := 31 - to_integer(unsigned(bt)); btnum := 31 - to_integer(unsigned(bt));
banum := 31 - to_integer(unsigned(ba)); banum := 31 - to_integer(unsigned(ba));
bbnum := 31 - to_integer(unsigned(bb)); bbnum := 31 - to_integer(unsigned(bb));
case cr_op(8 downto 5) is -- Bits 5-8 of cr_op give the truth table of the requested
when "1001" => -- CREQV -- logical operation
crresult := not(e_in.cr(banum) xor e_in.cr(bbnum)); cr_operands := e_in.cr(banum) & e_in.cr(bbnum);
when "0111" => -- CRNAND crresult := cr_op(5 + to_integer(unsigned(cr_operands)));
crresult := not(e_in.cr(banum) and e_in.cr(bbnum));
when "0100" => -- CRANDC
crresult := (e_in.cr(banum) and not e_in.cr(bbnum));
when "1000" => -- CRAND
crresult := (e_in.cr(banum) and e_in.cr(bbnum));
when "0001" => -- CRNOR
crresult := not(e_in.cr(banum) or e_in.cr(bbnum));
when "1101" => -- CRORC
crresult := (e_in.cr(banum) or not e_in.cr(bbnum));
when "0110" => -- CRXOR
crresult := (e_in.cr(banum) xor e_in.cr(bbnum));
when "1110" => -- CROR
crresult := (e_in.cr(banum) or e_in.cr(bbnum));
when others =>
crresult := '0';
report "BAD CR?";
end case;
v.e.write_cr_mask := num_to_fxm((31-btnum) / 4); v.e.write_cr_mask := num_to_fxm((31-btnum) / 4);
for i in 0 to 31 loop for i in 0 to 31 loop
if i = btnum then if i = btnum then
@ -767,6 +751,13 @@ begin
lv.update := e_in.update; lv.update := e_in.update;
lv.update_reg := gspr_to_gpr(e_in.read_reg1); lv.update_reg := gspr_to_gpr(e_in.read_reg1);
lv.xerc := v.e.xerc; lv.xerc := v.e.xerc;
lv.reserve := e_in.reserve;
lv.rc := e_in.rc;
-- decode l*cix and st*cix instructions here
if e_in.insn(31 downto 26) = "011111" and e_in.insn(10 downto 9) = "11" and
e_in.insn(5 downto 1) = "10101" then
lv.ci := '1';
end if;


-- Update registers -- Update registers
rin <= v; rin <= v;

@ -12,16 +12,90 @@ use work.helpers.all;
entity loadstore1 is entity loadstore1 is
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
rst : in std_ulogic;


l_in : in Execute1ToLoadstore1Type; l_in : in Execute1ToLoadstore1Type;
l_out : out Loadstore1ToWritebackType;


l_out : out Loadstore1ToDcacheType d_out : out Loadstore1ToDcacheType;
d_in : in DcacheToLoadstore1Type;

dc_stall : in std_ulogic;
stall_out : out std_ulogic
); );
end loadstore1; end loadstore1;


-- Note, we don't currently use the stall output from the dcache because
-- we know it can take two requests without stalling when idle, we are
-- its only user, and we know it never stalls when idle.

architecture behave of loadstore1 is architecture behave of loadstore1 is
signal r, rin : Loadstore1ToDcacheType;
-- State machine for unaligned loads/stores
type state_t is (IDLE, -- ready for instruction
SECOND_REQ, -- send 2nd request of unaligned xfer
FIRST_ACK_WAIT, -- waiting for 1st ack from dcache
LAST_ACK_WAIT, -- waiting for last ack from dcache
LD_UPDATE -- writing rA with computed addr on load
);

type reg_stage_t is record
-- latch most of the input request
load : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
store_data : std_ulogic_vector(63 downto 0);
load_data : std_ulogic_vector(63 downto 0);
write_reg : gpr_index_t;
length : std_ulogic_vector(3 downto 0);
byte_reverse : std_ulogic;
sign_extend : std_ulogic;
update : std_ulogic;
update_reg : gpr_index_t;
xerc : xer_common_t;
reserve : std_ulogic;
rc : std_ulogic;
nc : std_ulogic; -- non-cacheable access
state : state_t;
second_bytes : std_ulogic_vector(7 downto 0);
end record;

type byte_sel_t is array(0 to 7) of std_ulogic;
subtype byte_trim_t is std_ulogic_vector(1 downto 0);
type trim_ctl_t is array(0 to 7) of byte_trim_t;

signal r, rin : reg_stage_t;
signal lsu_sum : std_ulogic_vector(63 downto 0); signal lsu_sum : std_ulogic_vector(63 downto 0);

-- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
case length is
when "0001" =>
return "00000001";
when "0010" =>
return "00000011";
when "0100" =>
return "00001111";
when "1000" =>
return "11111111";
when others =>
return "00000000";
end case;
end function length_to_sel;

-- Calculate byte enables
-- This returns 16 bits, giving the select signals for two transfers,
-- to account for unaligned loads or stores
function xfer_data_sel(size : in std_logic_vector(3 downto 0);
address : in std_logic_vector(2 downto 0))
return std_ulogic_vector is
variable longsel : std_ulogic_vector(15 downto 0);
begin
longsel := "00000000" & length_to_sel(size);
return std_ulogic_vector(shift_left(unsigned(longsel),
to_integer(unsigned(address))));
end function xfer_data_sel;

begin begin
-- Calculate the address in the first cycle -- Calculate the address in the first cycle
lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0'); lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
@ -29,18 +103,103 @@ begin
loadstore1_0: process(clk) loadstore1_0: process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
if rst = '1' then
r.state <= IDLE;
else
r <= rin; r <= rin;
end if; end if;
end if;
end process; end process;


loadstore1_1: process(all) loadstore1_1: process(all)
variable v : Loadstore1ToDcacheType; variable v : reg_stage_t;
variable brev_lenm1 : unsigned(2 downto 0);
variable byte_offset : unsigned(2 downto 0);
variable j : integer;
variable k : unsigned(2 downto 0);
variable kk : unsigned(3 downto 0);
variable long_sel : std_ulogic_vector(15 downto 0);
variable byte_sel : std_ulogic_vector(7 downto 0);
variable req : std_ulogic;
variable stall : std_ulogic;
variable addr : std_ulogic_vector(63 downto 0);
variable wdata : std_ulogic_vector(63 downto 0);
variable write_enable : std_ulogic;
variable do_update : std_ulogic;
variable two_dwords : std_ulogic;
variable done : std_ulogic;
variable data_permuted : std_ulogic_vector(63 downto 0);
variable data_trimmed : std_ulogic_vector(63 downto 0);
variable use_second : byte_sel_t;
variable trim_ctl : trim_ctl_t;
variable negative : std_ulogic;
begin begin
v := r; v := r;
req := '0';
stall := '0';
done := '0';
byte_sel := (others => '0');
addr := lsu_sum;

write_enable := '0';
do_update := '0';
two_dwords := or (r.second_bytes);

-- load data formatting
if r.load = '1' then
byte_offset := unsigned(r.addr(2 downto 0));
brev_lenm1 := "000";
if r.byte_reverse = '1' then
brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
end if;

-- shift and byte-reverse data bytes
for i in 0 to 7 loop
kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
use_second(i) := kk(3);
j := to_integer(kk(2 downto 0)) * 8;
data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
end loop;


v.valid := l_in.valid; -- Work out the sign bit for sign extension.
-- Assumes we are not doing both sign extension and byte reversal,
-- in that for unaligned loads crossing two dwords we end up
-- using a bit from the second dword, whereas for a byte-reversed
-- (i.e. big-endian) load the sign bit would be in the first dword.
negative := (r.length(3) and data_permuted(63)) or
(r.length(2) and data_permuted(31)) or
(r.length(1) and data_permuted(15)) or
(r.length(0) and data_permuted(7));

-- trim and sign-extend
for i in 0 to 7 loop
if i < to_integer(unsigned(r.length)) then
if two_dwords = '1' then
trim_ctl(i) := '1' & not use_second(i);
else
trim_ctl(i) := not use_second(i) & '0';
end if;
else
trim_ctl(i) := '0' & (negative and r.sign_extend);
end if;
case trim_ctl(i) is
when "11" =>
data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
when "10" =>
data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
when "01" =>
data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
when others =>
data_trimmed(i * 8 + 7 downto i * 8) := x"00";
end case;
end loop;
end if;

case r.state is
when IDLE =>
if l_in.valid = '1' then
v.load := l_in.load; v.load := l_in.load;
v.data := l_in.data; v.addr := lsu_sum;
v.write_reg := l_in.write_reg; v.write_reg := l_in.write_reg;
v.length := l_in.length; v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse; v.byte_reverse := l_in.byte_reverse;
@ -48,6 +207,9 @@ begin
v.update := l_in.update; v.update := l_in.update;
v.update_reg := l_in.update_reg; v.update_reg := l_in.update_reg;
v.xerc := l_in.xerc; v.xerc := l_in.xerc;
v.reserve := l_in.reserve;
v.rc := l_in.rc;
v.nc := l_in.ci;


-- XXX Temporary hack. Mark the op as non-cachable if the address -- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc------- -- is the form 0xc-------
@ -57,23 +219,108 @@ begin
-- bit otherwise. -- bit otherwise.
if lsu_sum(31 downto 28) = "1100" then if lsu_sum(31 downto 28) = "1100" then
v.nc := '1'; v.nc := '1';
end if;

-- Do length_to_sel and work out if we are doing 2 dwords
long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
byte_sel := long_sel(7 downto 0);
v.second_bytes := long_sel(15 downto 8);

v.addr := lsu_sum;

-- Do byte reversing and rotating for stores in the first cycle
if v.load = '0' then
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;
end if;

req := '1';
stall := '1';
if long_sel(15 downto 8) = "00000000" then
v.state := LAST_ACK_WAIT;
else else
v.nc := '0'; v.state := SECOND_REQ;
end if;
end if; end if;


-- XXX Do length_to_sel here ? when SECOND_REQ =>
-- compute (addr + 8) & ~7 for the second doubleword when unaligned
addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
byte_sel := r.second_bytes;
req := '1';
stall := '1';
v.state := FIRST_ACK_WAIT;


-- byte reverse stores in the first cycle when FIRST_ACK_WAIT =>
if v.load = '0' and l_in.byte_reverse = '1' then stall := '1';
v.data := byte_reverse(l_in.data, to_integer(unsigned(l_in.length))); if d_in.valid = '1' then
v.state := LAST_ACK_WAIT;
if r.load = '1' then
v.load_data := data_permuted;
end if;
end if; end if;


v.addr := lsu_sum; when LAST_ACK_WAIT =>
stall := '1';
if d_in.valid = '1' then
write_enable := r.load;
if r.load = '1' and r.update = '1' then
-- loads with rA update need an extra cycle
v.state := LD_UPDATE;
else
-- stores write back rA update in this cycle
do_update := r.update;
stall := '0';
done := '1';
v.state := IDLE;
end if;
end if;

when LD_UPDATE =>
do_update := '1';
v.state := IDLE;
done := '1';
end case;

-- Update outputs to dcache
d_out.valid <= req;
d_out.load <= v.load;
d_out.nc <= v.nc;
d_out.reserve <= v.reserve;
d_out.addr <= addr;
d_out.data <= v.store_data;
d_out.byte_sel <= byte_sel;

-- Update outputs to writeback
-- Multiplex either cache data to the destination GPR or
-- the address for the rA update.
l_out.valid <= done;
if do_update = '1' then
l_out.write_enable <= '1';
l_out.write_reg <= r.update_reg;
l_out.write_data <= r.addr;
else
l_out.write_enable <= write_enable;
l_out.write_reg <= r.write_reg;
l_out.write_data <= data_trimmed;
end if;
l_out.xerc <= r.xerc;
l_out.rc <= r.rc and done;
l_out.store_done <= d_in.store_done;

stall_out <= stall;


-- Update registers -- Update registers
rin <= v; rin <= v;


-- Update outputs
l_out <= r;
end process; end process;

end; end;

@ -11,7 +11,7 @@ entity writeback is
clk : in std_ulogic; clk : in std_ulogic;


e_in : in Execute1ToWritebackType; e_in : in Execute1ToWritebackType;
l_in : in DcacheToWritebackType; l_in : in Loadstore1ToWritebackType;


w_out : out WritebackToRegisterFileType; w_out : out WritebackToRegisterFileType;
c_out : out WritebackToCrFileType; c_out : out WritebackToCrFileType;
@ -21,48 +21,15 @@ entity writeback is
end entity writeback; end entity writeback;


architecture behaviour of writeback is architecture behaviour of writeback is
subtype byte_index_t is unsigned(2 downto 0);
type permutation_t is array(0 to 7) of byte_index_t;
subtype byte_trim_t is std_ulogic_vector(1 downto 0);
type trim_ctl_t is array(0 to 7) of byte_trim_t;
type byte_sel_t is array(0 to 7) of std_ulogic;

signal data_len : unsigned(3 downto 0);
signal data_in : std_ulogic_vector(63 downto 0);
signal data_permuted : std_ulogic_vector(63 downto 0);
signal data_trimmed : std_ulogic_vector(63 downto 0);
signal data_latched : std_ulogic_vector(63 downto 0);
signal perm : permutation_t;
signal use_second : byte_sel_t;
signal byte_offset : unsigned(2 downto 0);
signal brev_lenm1 : unsigned(2 downto 0);
signal trim_ctl : trim_ctl_t;
signal rc : std_ulogic;
signal partial_write : std_ulogic;
signal sign_extend : std_ulogic;
signal negative : std_ulogic;
signal second_word : std_ulogic;
begin begin
writeback_0: process(clk)
begin
if rising_edge(clk) then
if partial_write = '1' then
data_latched <= data_permuted;
end if;
end if;
end process;

writeback_1: process(all) writeback_1: process(all)
variable x : std_ulogic_vector(0 downto 0); variable x : std_ulogic_vector(0 downto 0);
variable y : std_ulogic_vector(0 downto 0); variable y : std_ulogic_vector(0 downto 0);
variable z : std_ulogic_vector(0 downto 0);
variable w : std_ulogic_vector(0 downto 0); variable w : std_ulogic_vector(0 downto 0);
variable j : integer;
variable k : unsigned(3 downto 0);
variable cf: std_ulogic_vector(3 downto 0); variable cf: std_ulogic_vector(3 downto 0);
variable xe: xer_common_t;
variable zero : std_ulogic; variable zero : std_ulogic;
variable sign : std_ulogic; variable sign : std_ulogic;
variable scf : std_ulogic_vector(3 downto 0);
begin begin
x(0) := e_in.valid; x(0) := e_in.valid;
y(0) := l_in.valid; y(0) := l_in.valid;
@ -84,17 +51,10 @@ begin
complete_out <= '1'; complete_out <= '1';
end if; end if;


rc <= '0';
brev_lenm1 <= "000";
partial_write <= '0';
second_word <= '0';
xe := e_in.xerc;
data_in <= (others => '0');

if e_in.write_enable = '1' then if e_in.write_enable = '1' then
w_out.write_reg <= e_in.write_reg; w_out.write_reg <= e_in.write_reg;
w_out.write_data <= e_in.write_data;
w_out.write_enable <= '1'; w_out.write_enable <= '1';
rc <= e_in.rc;
end if; end if;


if e_in.write_cr_enable = '1' then if e_in.write_cr_enable = '1' then
@ -108,75 +68,26 @@ begin
c_out.write_xerc_data <= e_in.xerc; c_out.write_xerc_data <= e_in.xerc;
end if; end if;


sign_extend <= l_in.sign_extend;
data_len <= unsigned(l_in.write_len);
byte_offset <= unsigned(l_in.write_shift);
if l_in.write_enable = '1' then if l_in.write_enable = '1' then
w_out.write_reg <= gpr_to_gspr(l_in.write_reg); w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
if l_in.byte_reverse = '1' then w_out.write_data <= l_in.write_data;
brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1;
end if;
w_out.write_enable <= '1'; w_out.write_enable <= '1';
second_word <= l_in.second_word;
if l_in.valid = '0' and (data_len + byte_offset > 8) then
partial_write <= '1';
end if;
xe := l_in.xerc;
end if; end if;


-- shift and byte-reverse data bytes if l_in.rc = '1' then
for i in 0 to 7 loop -- st*cx. instructions
k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); scf(3) := '0';
perm(i) <= k(2 downto 0); scf(2) := '0';
use_second(i) <= k(3); scf(1) := l_in.store_done;
end loop; scf(0) := l_in.xerc.so;
for i in 0 to 7 loop c_out.write_cr_enable <= '1';
j := to_integer(perm(i)) * 8; c_out.write_cr_mask <= num_to_fxm(0);
data_permuted(i * 8 + 7 downto i * 8) <= l_in.write_data(j + 7 downto j); c_out.write_cr_data(31 downto 28) <= scf;
end loop;

-- If the data can arrive split over two cycles, this will be correct
-- provided we don't have both sign extension and byte reversal.
negative <= (data_len(3) and data_permuted(63)) or
(data_len(2) and data_permuted(31)) or
(data_len(1) and data_permuted(15)) or
(data_len(0) and data_permuted(7));

-- trim and sign-extend
for i in 0 to 7 loop
if i < to_integer(data_len) then
if second_word = '1' then
trim_ctl(i) <= '1' & not use_second(i);
else
trim_ctl(i) <= not use_second(i) & '0';
end if;
else
trim_ctl(i) <= '0' & (negative and sign_extend);
end if;
end loop;
for i in 0 to 7 loop
case trim_ctl(i) is
when "11" =>
data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8);
when "10" =>
data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8);
when "01" =>
data_trimmed(i * 8 + 7 downto i * 8) <= x"FF";
when others =>
data_trimmed(i * 8 + 7 downto i * 8) <= x"00";
end case;
end loop;

-- deliver to regfile
if l_in.write_enable = '1' then
w_out.write_data <= data_trimmed;
else
w_out.write_data <= e_in.write_data;
end if; end if;


-- Perform CR0 update for RC forms -- Perform CR0 update for RC forms
-- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data
if rc = '1' then if e_in.rc = '1' and e_in.write_enable = '1' then
sign := e_in.write_data(63); sign := e_in.write_data(63);
zero := not (or e_in.write_data); zero := not (or e_in.write_data);
c_out.write_cr_enable <= '1'; c_out.write_cr_enable <= '1';
@ -184,7 +95,7 @@ begin
cf(3) := sign; cf(3) := sign;
cf(2) := not sign and not zero; cf(2) := not sign and not zero;
cf(1) := zero; cf(1) := zero;
cf(0) := xe.so; cf(0) := e_in.xerc.so;
c_out.write_cr_data(31 downto 28) <= cf; c_out.write_cr_data(31 downto 28) <= cf;
end if; end if;
end process; end process;

Loading…
Cancel
Save