decode2: Rework to make the stall_out signal come from a register

At present the busy/stall signal going to decode1 depends on whether
control thinks it can issue the current instruction, and that depends
on completion and bypass signals coming from execute1 and writeback.

To improve the timing of stall_out, this rearranges decode2 so that
stall_out is asserted when we have a valid instruction that couldn't
be issued in the previous cycle.  This means that decode1 could give
us a new instruction when we haven't issued the previous instruction.

This in turn means that we can only use d_in in the first cycle of
processing an instruction.  After the first cycle, we get register
addresses etc. from dc2 rather than d_in.

Then, to avoid the need to read register operands from register_file
in each cycle until the instruction issues, we bring the bypass path
for data being written to the register file into decode2 explicitly
rather than having it in register_file.

A new process called decode2_addrs does the process of calling
decode_input_reg_* and decode_output_reg and sets up the register file
addresses.  This was split out (and decode_input_reg_* reworked) to
try to reduce the number of passes through the decode2_1 process that
need to be done in simulation.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/379/head
Paul Mackerras 2 years ago
parent c9e838b656
commit 2f45e545ed

@ -288,6 +288,7 @@ package common is
write_reg_enable: std_ulogic;
read_reg1: gspr_index_t;
read_reg2: gspr_index_t;
read_reg3: gspr_index_t;
read_data1: std_ulogic_vector(63 downto 0);
read_data2: std_ulogic_vector(63 downto 0);
read_data3: std_ulogic_vector(63 downto 0);

@ -15,9 +15,7 @@ entity control is

complete_in : in instr_tag_t;
valid_in : in std_ulogic;
repeated : in std_ulogic;
flush_in : in std_ulogic;
busy_in : in std_ulogic;
deferred : in std_ulogic;
sgl_pipe_in : in std_ulogic;
stop_mark_in : in std_ulogic;
@ -43,7 +41,6 @@ entity control is
cr_write_in : in std_ulogic;

valid_out : out std_ulogic;
stall_out : out std_ulogic;
stopped_out : out std_ulogic;

gpr_bypass_a : out std_ulogic_vector(1 downto 0);
@ -157,9 +154,6 @@ begin
tag_a.tag := i;
end if;
end loop;
if tag_match(tag_a, complete_in) then
tag_a.valid := '0';
end if;
tag_b := instr_tag_init;
for i in tag_number_t loop
if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_b_read_in then
@ -167,9 +161,6 @@ begin
tag_b.tag := i;
end if;
end loop;
if tag_match(tag_b, complete_in) then
tag_b.valid := '0';
end if;
tag_c := instr_tag_init;
for i in tag_number_t loop
if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then
@ -177,26 +168,29 @@ begin
tag_c.tag := i;
end if;
end loop;
if tag_match(tag_c, complete_in) then
tag_c.valid := '0';
end if;

byp_a := "00";
if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then
byp_a := "10";
byp_a := "01";
elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then
byp_a := "10";
elsif tag_match(complete_in, tag_a) then
byp_a := "11";
end if;
byp_b := "00";
if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then
byp_b := "10";
byp_b := "01";
elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then
byp_b := "10";
elsif tag_match(complete_in, tag_b) then
byp_b := "11";
end if;
byp_c := "00";
if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then
byp_c := "10";
byp_c := "01";
elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then
byp_c := "10";
elsif tag_match(complete_in, tag_c) then
byp_c := "11";
end if;

@ -204,9 +198,9 @@ begin
gpr_bypass_b <= byp_b;
gpr_bypass_c <= byp_c;

gpr_tag_stall <= (tag_a.valid and not byp_a(1)) or
(tag_b.valid and not byp_b(1)) or
(tag_c.valid and not byp_c(1));
gpr_tag_stall <= (tag_a.valid and not (or (byp_a))) or
(tag_b.valid and not (or (byp_b))) or
(tag_c.valid and not (or (byp_c)));

incr_tag := curr_tag;
instr_tag.tag <= curr_tag;
@ -331,7 +325,6 @@ begin

-- update outputs
valid_out <= valid_tmp;
stall_out <= stall_tmp or deferred;

-- update registers
rin_int <= v_int;

@ -100,6 +100,9 @@ architecture behave of core is
signal fpu_to_execute1: FPUToExecute1Type;
signal fpu_to_writeback: FPUToWritebackType;

-- Writeback signals
signal writeback_bypass: bypass_data_t;

-- local signals
signal fetch1_stall_in : std_ulogic;
signal icache_stall_out : std_ulogic;
@ -302,6 +305,7 @@ begin
execute_cr_bypass => execute1_cr_bypass,
execute2_bypass => execute2_bypass,
execute2_cr_bypass => execute2_cr_bypass,
writeback_bypass => writeback_bypass,
log_out => log_data(119 downto 110)
);
decode2_busy_in <= ex1_busy_out;
@ -463,6 +467,7 @@ begin
w_out => writeback_to_register_file,
c_out => writeback_to_cr_file,
f_out => writeback_to_fetch1,
wb_bypass => writeback_bypass,
events => writeback_events,
interrupt_out => do_interrupt,
complete_out => complete

@ -41,6 +41,7 @@ entity decode2 is
execute_cr_bypass : in cr_bypass_data_t;
execute2_bypass : in bypass_data_t;
execute2_cr_bypass : in cr_bypass_data_t;
writeback_bypass : in bypass_data_t;

log_out : out std_ulogic_vector(9 downto 0)
);
@ -49,8 +50,16 @@ end entity decode2;
architecture behaviour of decode2 is
type reg_type is record
e : Decode2ToExecute1Type;
repeat : std_ulogic;
repeat : repeat_t;
busy : std_ulogic;
sgl_pipe : std_ulogic;
reg_a_valid : std_ulogic;
reg_b_valid : std_ulogic;
reg_c_valid : std_ulogic;
reg_o_valid : std_ulogic;
end record;
constant reg_type_init : reg_type :=
(e => Decode2ToExecute1Init, repeat => NONE, others => '0');

signal dc2, dc2in : reg_type;

@ -61,20 +70,21 @@ architecture behaviour of decode2 is
reg : gspr_index_t;
data : std_ulogic_vector(63 downto 0);
end record;
constant decode_input_reg_init : decode_input_reg_t := ('0', (others => '0'), (others => '0'));

type decode_output_reg_t is record
reg_valid : std_ulogic;
reg : gspr_index_t;
end record;
constant decode_output_reg_init : decode_output_reg_t := ('0', (others => '0'));

function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0);
reg_data : std_ulogic_vector(63 downto 0);
ispr : gspr_index_t;
instr_addr : std_ulogic_vector(63 downto 0))
return decode_input_reg_t is
begin
if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then
return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data);
return ('1', gpr_to_gspr(insn_ra(insn_in)), (others => '0'));
elsif t = SPR then
-- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
-- If it's all 0, we don't treat it as a dependency as slow SPRs
@ -83,27 +93,26 @@ architecture behaviour of decode2 is
assert is_fast_spr(ispr) = '1' or ispr = "0000000"
report "Decode A says SPR but ISPR is invalid:" &
to_hstring(ispr) severity failure;
return (is_fast_spr(ispr), ispr, reg_data);
return (is_fast_spr(ispr), ispr, (others => '0'));
elsif t = CIA then
return ('0', (others => '0'), instr_addr);
elsif HAS_FPU and t = FRA then
return ('1', fpr_to_gspr(insn_fra(insn_in)), reg_data);
return ('1', fpr_to_gspr(insn_fra(insn_in)), (others => '0'));
else
return ('0', (others => '0'), (others => '0'));
end if;
end;

function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0);
reg_data : std_ulogic_vector(63 downto 0);
ispr : gspr_index_t) return decode_input_reg_t is
variable ret : decode_input_reg_t;
begin
case t is
when RB =>
ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data);
ret := ('1', gpr_to_gspr(insn_rb(insn_in)), (others => '0'));
when FRB =>
if HAS_FPU then
ret := ('1', fpr_to_gspr(insn_frb(insn_in)), reg_data);
ret := ('1', fpr_to_gspr(insn_frb(insn_in)), (others => '0'));
else
ret := ('0', (others => '0'), (others => '0'));
end if;
@ -138,7 +147,7 @@ architecture behaviour of decode2 is
assert is_fast_spr(ispr) = '1' or ispr = "0000000"
report "Decode B says SPR but ISPR is invalid:" &
to_hstring(ispr) severity failure;
ret := (is_fast_spr(ispr), ispr, reg_data);
ret := (is_fast_spr(ispr), ispr, (others => '0'));
when NONE =>
ret := ('0', (others => '0'), (others => '0'));
end case;
@ -146,23 +155,23 @@ architecture behaviour of decode2 is
return ret;
end;

function decode_input_reg_c (t : input_reg_c_t; insn_in : std_ulogic_vector(31 downto 0);
reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is
function decode_input_reg_c (t : input_reg_c_t; insn_in : std_ulogic_vector(31 downto 0))
return decode_input_reg_t is
begin
case t is
when RS =>
return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data);
return ('1', gpr_to_gspr(insn_rs(insn_in)), (others => '0'));
when RCR =>
return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data);
return ('1', gpr_to_gspr(insn_rcreg(insn_in)), (others => '0'));
when FRS =>
if HAS_FPU then
return ('1', fpr_to_gspr(insn_frt(insn_in)), reg_data);
return ('1', fpr_to_gspr(insn_frt(insn_in)), (others => '0'));
else
return ('0', (others => '0'), (others => '0'));
end if;
when FRC =>
if HAS_FPU then
return ('1', fpr_to_gspr(insn_frc(insn_in)), reg_data);
return ('1', fpr_to_gspr(insn_frc(insn_in)), (others => '0'));
else
return ('0', (others => '0'), (others => '0'));
end if;
@ -264,10 +273,14 @@ architecture behaviour of decode2 is
others => "000"
);

signal decoded_reg_a : decode_input_reg_t;
signal decoded_reg_b : decode_input_reg_t;
signal decoded_reg_c : decode_input_reg_t;
signal decoded_reg_o : decode_output_reg_t;

-- issue control signals
signal control_valid_in : std_ulogic;
signal control_valid_out : std_ulogic;
signal control_stall_out : std_ulogic;
signal control_sgl_pipe : std_logic;

signal gpr_write_valid : std_ulogic;
@ -302,8 +315,6 @@ begin

complete_in => complete_in,
valid_in => control_valid_in,
repeated => dc2.repeat,
busy_in => busy_in,
deferred => deferred,
flush_in => flush_in,
sgl_pipe_in => control_sgl_pipe,
@ -331,7 +342,6 @@ begin
cr_bypass => cr_bypass,

valid_out => control_valid_out,
stall_out => control_stall_out,
stopped_out => stopped_out,

gpr_bypass_a => gpr_a_bypass,
@ -346,9 +356,12 @@ begin
decode2_0: process(clk)
begin
if rising_edge(clk) then
if rst = '1' or flush_in = '1' or deferred = '0' then
if rst = '1' or flush_in = '1' then
dc2 <= reg_type_init;
elsif deferred = '0' then
if dc2in.e.valid = '1' then
report "execute " & to_hstring(dc2in.e.nia);
report "execute " & to_hstring(dc2in.e.nia) &
" tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid);
end if;
dc2 <= dc2in;
end if;
@ -357,20 +370,44 @@ begin

c_out.read <= d_in.decode.input_cr;

decode2_addrs: process(all)
begin
decoded_reg_a <= decode_input_reg_init;
decoded_reg_b <= decode_input_reg_init;
decoded_reg_c <= decode_input_reg_init;
decoded_reg_o <= decode_output_reg_init;
if d_in.valid = '1' then
decoded_reg_a <= decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, d_in.ispr1, d_in.nia);
decoded_reg_b <= decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, d_in.ispr2);
decoded_reg_c <= decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn);
decoded_reg_o <= decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro);
end if;

r_out.read1_enable <= decoded_reg_a.reg_valid;
r_out.read1_reg <= decoded_reg_a.reg;
r_out.read2_enable <= decoded_reg_b.reg_valid;
r_out.read2_reg <= decoded_reg_b.reg;
r_out.read3_enable <= decoded_reg_c.reg_valid;
r_out.read3_reg <= decoded_reg_c.reg;

end process;

decode2_1: process(all)
variable v : reg_type;
variable decoded_reg_a : decode_input_reg_t;
variable decoded_reg_b : decode_input_reg_t;
variable decoded_reg_c : decode_input_reg_t;
variable decoded_reg_o : decode_output_reg_t;
variable length : std_ulogic_vector(3 downto 0);
variable op : insn_type_t;
variable valid_in : std_ulogic;
begin
v := dc2;

valid_in := d_in.valid or dc2.busy;

if dc2.busy = '0' then
v.e := Decode2ToExecute1Init;

--v.e.input_cr := d_in.decode.input_cr;
v.sgl_pipe := d_in.decode.sgl_pipe;

v.e.input_cr := d_in.decode.input_cr;
v.e.output_cr := d_in.decode.output_cr;

-- Work out whether XER common bits are set
@ -389,11 +426,10 @@ begin
when others =>
end case;

decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1,
d_in.nia);
decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2);
decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data);
decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro);
v.reg_a_valid := decoded_reg_a.reg_valid;
v.reg_b_valid := decoded_reg_b.reg_valid;
v.reg_c_valid := decoded_reg_c.reg_valid;
v.reg_o_valid := decoded_reg_o.reg_valid;

if d_in.decode.lr = '1' then
v.e.lr := insn_lk(d_in.insn);
@ -402,34 +438,16 @@ begin
end if;
op := d_in.decode.insn_type;

v.repeat := d_in.decode.repeat;
if d_in.decode.repeat /= NONE then
v.e.repeat := '1';
v.e.second := dc2.repeat;
case d_in.decode.repeat is
when DUPD =>
-- update-form loads, 2nd instruction writes RA
if dc2.repeat = '1' then
decoded_reg_o.reg := decoded_reg_a.reg;
end if;
when others =>
end case;
elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then
-- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled
v.e.repeat := '1';
v.e.second := dc2.repeat;
-- first one does CTR, second does LR
decoded_reg_o.reg(0) := not dc2.repeat;
end if;

v.e.spr_select := d_in.spr_info;

r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid;
r_out.read1_reg <= decoded_reg_a.reg;
r_out.read2_enable <= decoded_reg_b.reg_valid and d_in.valid;
r_out.read2_reg <= decoded_reg_b.reg;
r_out.read3_enable <= decoded_reg_c.reg_valid and d_in.valid;
r_out.read3_reg <= decoded_reg_c.reg;

case d_in.decode.length is
when is1B =>
length := "0001";
@ -447,9 +465,9 @@ begin
v.e.nia := d_in.nia;
v.e.unit := d_in.decode.unit;
v.e.fac := d_in.decode.facility;
v.e.instr_tag := instr_tag;
v.e.read_reg1 := decoded_reg_a.reg;
v.e.read_reg2 := decoded_reg_b.reg;
v.e.read_reg3 := decoded_reg_c.reg;
v.e.write_reg := decoded_reg_o.reg;
v.e.write_reg_enable := decoded_reg_o.reg_valid;
v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
@ -472,7 +490,7 @@ begin
v.e.result_sel := result_select(op);
v.e.sub_select := subresult_select(op);
if op = OP_BC or op = OP_BCREG then
if d_in.insn(23) = '0' and dc2.repeat = '0' and
if d_in.insn(23) = '0' and
not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then
-- decrement CTR if BO(2) = 0 and not bcctr
v.e.addm1 := '1';
@ -491,71 +509,107 @@ begin
end if;
end if;

elsif dc2.e.valid = '1' then
-- dc2.busy = 1 and dc2.e.valid = 1, thus this must be a repeated instruction.
-- Set up for the second iteration (if deferred = 1 this will all be ignored)
v.e.second := '1';
case dc2.repeat is
when DUPD =>
-- update-form loads, 2nd instruction writes RA
v.e.write_reg := dc2.e.read_reg1;
when NONE =>
-- bcl/bclrl/bctarl that needs to write both CTR and LR
v.e.write_reg(0) := '0'; -- point to LR
v.e.result_sel := "110"; -- select NIA (to go to LR)
when others =>
end case;
end if;

-- issue control
control_valid_in <= valid_in;
control_sgl_pipe <= v.sgl_pipe;

gpr_write_valid <= v.reg_o_valid;
gpr_write <= v.e.write_reg;

gpr_a_read_valid <= v.reg_a_valid;
gpr_a_read <= v.e.read_reg1;

gpr_b_read_valid <= v.reg_b_valid;
gpr_b_read <= v.e.read_reg2;

gpr_c_read_valid <= v.reg_c_valid;
gpr_c_read <= v.e.read_reg3;

cr_write_valid <= v.e.output_cr or v.e.rc;
-- Since ops that write CR only write some of the fields,
-- any op that writes CR effectively also reads it.
cr_read_valid <= cr_write_valid or v.e.input_cr;

-- See if any of the operands can get their value via the bypass path.
if dc2.busy = '0' or gpr_a_bypass /= "00" then
case gpr_a_bypass is
when "10" =>
when "01" =>
v.e.read_data1 := execute_bypass.data;
when "11" =>
when "10" =>
v.e.read_data1 := execute2_bypass.data;
when "11" =>
v.e.read_data1 := writeback_bypass.data;
when others =>
if decoded_reg_a.reg_valid = '1' then
v.e.read_data1 := r_in.read1_data;
else
v.e.read_data1 := decoded_reg_a.data;
end if;
end case;
end if;
if dc2.busy = '0' or gpr_b_bypass /= "00" then
case gpr_b_bypass is
when "10" =>
when "01" =>
v.e.read_data2 := execute_bypass.data;
when "11" =>
when "10" =>
v.e.read_data2 := execute2_bypass.data;
when "11" =>
v.e.read_data2 := writeback_bypass.data;
when others =>
if decoded_reg_b.reg_valid = '1' then
v.e.read_data2 := r_in.read2_data;
else
v.e.read_data2 := decoded_reg_b.data;
end if;
end case;
end if;
if dc2.busy = '0' or gpr_c_bypass /= "00" then
case gpr_c_bypass is
when "10" =>
when "01" =>
v.e.read_data3 := execute_bypass.data;
when "11" =>
when "10" =>
v.e.read_data3 := execute2_bypass.data;
when "11" =>
v.e.read_data3 := writeback_bypass.data;
when others =>
if decoded_reg_c.reg_valid = '1' then
v.e.read_data3 := r_in.read3_data;
else
v.e.read_data3 := decoded_reg_c.data;
end if;
end case;
end if;

v.e.cr := c_in.read_cr_data;
if cr_bypass = "10" then
case cr_bypass is
when "10" =>
v.e.cr := execute_cr_bypass.data;
elsif cr_bypass = "11" then
when "11" =>
v.e.cr := execute2_cr_bypass.data;
end if;

-- issue control
control_valid_in <= d_in.valid;
control_sgl_pipe <= d_in.decode.sgl_pipe;

gpr_write_valid <= v.e.write_reg_enable;
gpr_write <= decoded_reg_o.reg;

gpr_a_read_valid <= decoded_reg_a.reg_valid;
gpr_a_read <= decoded_reg_a.reg;

gpr_b_read_valid <= decoded_reg_b.reg_valid;
gpr_b_read <= decoded_reg_b.reg;

gpr_c_read_valid <= decoded_reg_c.reg_valid;
gpr_c_read <= decoded_reg_c.reg;

cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
-- Since ops that write CR only write some of the fields,
-- any op that writes CR effectively also reads it.
cr_read_valid <= cr_write_valid or d_in.decode.input_cr;
when others =>
v.e.cr := c_in.read_cr_data;
end case;

v.e.valid := control_valid_out;
if control_valid_out = '1' then
v.repeat := v.e.repeat and not dc2.repeat;
end if;

stall_out <= control_stall_out or v.repeat;
v.e.instr_tag := instr_tag;
v.busy := valid_in and (not control_valid_out or (v.e.repeat and not v.e.second));

if rst = '1' or flush_in = '1' then
v.e := Decode2ToExecute1Init;
v.repeat := '0';
end if;
stall_out <= dc2.busy or deferred;

-- Update registers
dc2in <= v;
@ -574,9 +628,9 @@ begin
dc2.e.valid &
stopped_out &
stall_out &
(gpr_a_bypass(1) or gpr_a_bypass(0)) &
(gpr_b_bypass(1) or gpr_b_bypass(0)) &
(gpr_c_bypass(1) or gpr_c_bypass(0));
(gpr_a_bypass(1) xor gpr_a_bypass(0)) &
(gpr_b_bypass(1) xor gpr_b_bypass(0)) &
(gpr_c_bypass(1) xor gpr_c_bypass(0));
end if;
end process;
log_out <= log_data;

@ -100,18 +100,8 @@ begin
d_out.read2_data <= rd_port_b;
d_out.read3_data <= registers(to_integer(unsigned(c_addr)));

-- Forward any written data
if w_in.write_enable = '1' then
if a_addr = w_addr then
d_out.read1_data <= w_in.write_data;
end if;
if b_addr = w_addr then
d_out.read2_data <= w_in.write_data;
end if;
if c_addr = w_addr then
d_out.read3_data <= w_in.write_data;
end if;
end if;
-- Forwarding of written data is now done explicitly with a bypass path
-- from writeback to decode2.
end process register_read_0;

-- Latch read data and ack if dbg read requested and B port not busy

@ -19,6 +19,8 @@ entity writeback is
c_out : out WritebackToCrFileType;
f_out : out WritebackToFetch1Type;

wb_bypass : out bypass_data_t;

-- PMU event bus
events : out WritebackEventType;

@ -215,6 +217,11 @@ begin
f_out <= f;
flush_out <= f_out.redirect;

-- Register write data bypass to decode2
wb_bypass.tag.tag <= complete_out.tag;
wb_bypass.tag.valid <= complete_out.valid and w_out.write_enable;
wb_bypass.data <= w_out.write_data;

rin <= v;
end process;
end;

Loading…
Cancel
Save