Add a second execute stage to the pipeline

This adds a second execute stage to the pipeline, in order to match up
the length of the pipeline through loadstore and dcache with the
length through execute1.  This will ultimately enable us to get rid of
the 1-cycle bubble that we currently have when issuing ALU
instructions after one or more LSU instructions.

Most ALU instructions execute in the first stage, except for
count-zeroes and popcount instructions (which take two cycles and do
some of their work in the second stage) and mfspr/mtspr to "slow" SPRs
(TB, DEC, PVR, LOGA/LOGD, CFAR).  Multiply and divide/mod instructions
take several cycles but the instruction stays in the first stage (ex1)
and ex1.busy is asserted until the operation is complete.

There is currently a bypass from the first stage but not the second
stage.  Performance is down somewhat because of that and because this
doesn't yet eliminate the bubble between LSU and ALU instructions.

The forwarding of XER common bits has been changed somewhat because
now there is another pipeline stage between ex1 and the committed
state in cr_file.  The simplest thing for now is to record the last
value written and use that, unless there has been a flush, in which
case the committed state (obtained via e_in.xerc) is used.

Note that this fixes what was previously a benign bug in control.vhdl,
where it was possible for control to forget an instructions dependency
on a value from a previous instruction (a GPR or the CR) if this
instruction writes the value and the instruction gets to the point
where it could issue but is blocked by the busy signal from execute1.
In that situation, control may incorrectly not indicate that a bypass
should be used.  That didn't matter previously because, for ALU and
FPU instructions, there was only one previous instruction in flight
and once the current instruction could issue, the previous instruction
was completing and the correct value would be obtained from
register_file or cr_file.  For loadstore instructions there could be
two being executed, but because there are no bypass paths, failing to
indicate use of a bypass path is fine.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/379/head
Paul Mackerras 2 years ago
parent 521a5403a9
commit 3510071d9a

@ -356,6 +356,7 @@ package common is

type Execute1ToDividerType is record
valid: std_ulogic;
flush: std_ulogic;
dividend: std_ulogic_vector(63 downto 0);
divisor: std_ulogic_vector(63 downto 0);
is_signed: std_ulogic;
@ -364,9 +365,8 @@ package common is
is_modulus: std_ulogic;
neg_result: std_ulogic;
end record;
constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0',
is_extended => '0', is_modulus => '0',
neg_result => '0', others => (others => '0'));
constant Execute1ToDividerInit: Execute1ToDividerType := (
dividend => 64x"0", divisor => 64x"0", others => '0');

type PMUEventType is record
no_instr_avail : std_ulogic;

@ -104,7 +104,8 @@ begin
tag_regs(i).wr_cr <= '0';
report "tag " & integer'image(i) & " not valid";
end if;
if gpr_write_valid = '1' and tag_regs(i).reg = gpr_write_in then
if instr_tag.valid = '1' and gpr_write_valid = '1' and
tag_regs(i).reg = gpr_write_in then
tag_regs(i).recent <= '0';
if tag_regs(i).recent = '1' and tag_regs(i).wr_gpr = '1' then
report "tag " & integer'image(i) & " not recent";
@ -126,7 +127,7 @@ begin
curr_cr_tag <= 0;
else
curr_tag <= next_tag;
if cr_write_valid = '1' then
if instr_tag.valid = '1' and cr_write_valid = '1' then
curr_cr_tag <= instr_tag.tag;
end if;
end if;

@ -26,6 +26,7 @@ begin
bitcounter_0: entity work.bit_counter
port map (
clk => clk,
stall => '0',
rs => rs,
result => res,
count_right => count_right,

@ -228,7 +228,6 @@ architecture behaviour of decode2 is
OP_SHR => "010",
OP_EXTSWSLI => "010",
OP_MUL_L64 => "011", -- muldiv_result
OP_MFSPR => "101", -- spr_result
OP_B => "110", -- next_nia
OP_BC => "110",
OP_BCREG => "110",

@ -36,7 +36,7 @@ begin
divider_0: process(clk)
begin
if rising_edge(clk) then
if rst = '1' then
if rst = '1' or d_in.flush = '1' then
dend <= (others => '0');
div <= (others => '0');
quot <= (others => '0');

@ -60,65 +60,90 @@ entity execute1 is
end entity execute1;

architecture behaviour of execute1 is
type reg_type is record
type side_effect_type is record
terminate : std_ulogic;
icache_inval : std_ulogic;
write_msr : std_ulogic;
write_xerlow : std_ulogic;
write_dec : std_ulogic;
write_cfar : std_ulogic;
write_loga : std_ulogic;
inc_loga : std_ulogic;
write_pmuspr : std_ulogic;
end record;
constant side_effect_init : side_effect_type := (others => '0');

type actions_type is record
e : Execute1ToWritebackType;
se : side_effect_type;
complete : std_ulogic;
exception : std_ulogic;
trap : std_ulogic;
new_msr : std_ulogic_vector(63 downto 0);
take_branch : std_ulogic;
direct_branch : std_ulogic;
start_mul : std_ulogic;
start_div : std_ulogic;
do_trace : std_ulogic;
fp_intr : std_ulogic;
res2_sel : std_ulogic_vector(1 downto 0);
bypass_valid : std_ulogic;
end record;
constant actions_type_init : actions_type :=
(e => Execute1ToWritebackInit, se => side_effect_init,
new_msr => (others => '0'), res2_sel => "00", others => '0');

type reg_stage1_type is record
e : Execute1ToWritebackType;
se : side_effect_type;
busy: std_ulogic;
terminate: std_ulogic;
intr_pending : std_ulogic;
fp_exception_next : std_ulogic;
trace_next : std_ulogic;
prev_op : insn_type_t;
br_taken : std_ulogic;
oe : std_ulogic;
mul_select : std_ulogic_vector(1 downto 0);
res2_sel : std_ulogic_vector(1 downto 0);
spr_select : spr_id;
pmu_spr_num : std_ulogic_vector(4 downto 0);
mul_in_progress : std_ulogic;
mul_finish : std_ulogic;
div_in_progress : std_ulogic;
cntz_in_progress : std_ulogic;
no_instr_avail : std_ulogic;
instr_dispatch : std_ulogic;
ext_interrupt : std_ulogic;
taken_branch_event : std_ulogic;
br_mispredict : std_ulogic;
log_addr_spr : std_ulogic_vector(31 downto 0);
msr : std_ulogic_vector(63 downto 0);
xerc : xer_common_t;
xerc_valid : std_ulogic;
end record;
constant reg_type_init : reg_type :=
(e => Execute1ToWritebackInit,
busy => '0', terminate => '0', intr_pending => '0',
constant reg_stage1_type_init : reg_stage1_type :=
(e => Execute1ToWritebackInit, se => side_effect_init,
busy => '0',
fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0',
oe => '0', mul_select => "00",
mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
oe => '0', mul_select => "00", res2_sel => "00",
spr_select => spr_id_init, pmu_spr_num => 5x"0",
mul_in_progress => '0', mul_finish => '0', div_in_progress => '0',
no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
taken_branch_event => '0', br_mispredict => '0',
others => (others => '0'));
msr => 64x"0",
xerc => xerc_init, xerc_valid => '0');

type actions_type is record
type reg_stage2_type is record
e : Execute1ToWritebackType;
complete : std_ulogic;
exception : std_ulogic;
trap : std_ulogic;
terminate : std_ulogic;
write_msr : std_ulogic;
new_msr : std_ulogic_vector(63 downto 0);
write_xerlow : std_ulogic;
write_pmuspr : std_ulogic;
write_dec : std_ulogic;
write_loga : std_ulogic;
inc_loga : std_ulogic;
write_cfar : std_ulogic;
take_branch : std_ulogic;
direct_branch : std_ulogic;
start_mul : std_ulogic;
start_div : std_ulogic;
start_cntz : std_ulogic;
do_trace : std_ulogic;
fp_intr : std_ulogic;
icache_inval : std_ulogic;
se : side_effect_type;
ext_interrupt : std_ulogic;
taken_branch_event : std_ulogic;
br_mispredict : std_ulogic;
log_addr_spr : std_ulogic_vector(31 downto 0);
end record;
constant actions_type_init : actions_type :=
(e => Execute1ToWritebackInit, new_msr => (others => '0'), others => '0');
constant reg_stage2_type_init : reg_stage2_type :=
(e => Execute1ToWritebackInit, se => side_effect_init,
log_addr_spr => 32x"0", others => '0');

signal ex1, ex1in : reg_type;
signal ex1, ex1in : reg_stage1_type;
signal ex2, ex2in : reg_stage2_type;
signal actions : actions_type;

signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
@ -142,7 +167,9 @@ architecture behaviour of execute1 is
signal muldiv_result: std_ulogic_vector(63 downto 0);
signal shortmul_result: std_ulogic_vector(63 downto 0);
signal spr_result: std_ulogic_vector(63 downto 0);
signal ex_result: std_ulogic_vector(63 downto 0);
signal next_nia : std_ulogic_vector(63 downto 0);
signal s1_sel : std_ulogic_vector(2 downto 0);

signal carry_32 : std_ulogic;
signal carry_64 : std_ulogic;
@ -372,7 +399,7 @@ begin
end generate;

dbg_ctrl_out <= ctrl;
log_rd_addr <= ex1.log_addr_spr;
log_rd_addr <= ex2.log_addr_spr;

a_in <= e_in.read_data1;
b_in <= e_in.read_data2;
@ -393,15 +420,15 @@ begin
itlb_miss_resolved => ic_events.itlb_miss_resolved,
no_instr_avail => ex1.no_instr_avail,
dispatch => ex1.instr_dispatch,
ext_interrupt => ex1.ext_interrupt,
br_taken_complete => ex1.taken_branch_event,
br_mispredict => ex1.br_mispredict,
ext_interrupt => ex2.ext_interrupt,
br_taken_complete => ex2.taken_branch_event,
br_mispredict => ex2.br_mispredict,
others => '0');
x_to_pmu.nia <= e_in.nia;
x_to_pmu.addr <= (others => '0');
x_to_pmu.addr_v <= '0';
x_to_pmu.spr_num <= e_in.insn(20 downto 16);
x_to_pmu.spr_val <= c_in;
x_to_pmu.spr_num <= ex1.pmu_spr_num;
x_to_pmu.spr_val <= ex1.e.write_data;
x_to_pmu.run <= '1';

-- XER forwarding. To avoid having to track XER hazards, we use
@ -409,35 +436,23 @@ begin
-- (SO, OV[32] and CA[32]) are only modified by instructions that are
-- handled here, we can just forward the result being sent to
-- writeback.
xerc_in <= ex1.e.xerc when (ex1.e.write_xerc_enable and ex1.e.valid) = '1' else e_in.xerc;
xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc;

with e_in.unit select busy_out <=
l_in.busy or ex1.busy or fp_in.busy when LDST,
l_in.busy or ex1.e.valid or ex1.busy or fp_in.busy when LDST,
l_in.busy or l_in.in_progress or ex1.e.valid or ex1.busy or fp_in.busy when FPU,
l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy when others;

valid_in <= e_in.valid and not busy_out and not flush_in;
valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt);

terminate_out <= ex1.terminate;

-- Slow SPR read mux
with e_in.spr_select.sel select spr_result <=
ctrl.tb when SPRSEL_TB,
32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU,
ctrl.dec when SPRSEL_DEC,
32x"0" & PVR_MICROWATT when SPRSEL_PVR,
log_wr_addr & ex1.log_addr_spr when SPRSEL_LOGA,
log_rd_data when SPRSEL_LOGD,
ctrl.cfar when SPRSEL_CFAR,
assemble_xer(xerc_in, ctrl.xer_low) when others;

-- Result mux
with e_in.result_sel select alu_result <=
-- First stage result mux
s1_sel <= e_in.result_sel when ex1.busy = '0' else "100";
with s1_sel select alu_result <=
adder_result when "000",
logical_result when "001",
rotator_result when "010",
shortmul_result when "011",
pmu_to_x.spr_val when "100",
spr_result when "101",
muldiv_result when "100",
next_nia when "110",
misc_result when others;

@ -445,22 +460,31 @@ begin
begin
if rising_edge(clk) then
if rst = '1' then
ex1 <= reg_type_init;
ex1 <= reg_stage1_type_init;
ex2 <= reg_stage2_type_init;
ctrl <= ctrl_t_init;
ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0');
ex1.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0');
else
ex1 <= ex1in;
ex2 <= ex2in;
ctrl <= ctrl_tmp;
if valid_in = '1' then
report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) &
" wr=" & to_hstring(ex1in.e.write_reg) & " we=" & std_ulogic'image(ex1in.e.write_enable) &
" tag=" & integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid);
end if;
-- We mustn't get stalled on a cycle where execute2 is
-- completing an instruction or generating an interrupt
if ex2.e.valid = '1' or ex2.e.interrupt = '1' then
assert (l_in.busy or fp_in.busy) = '0'
severity failure;
end if;
end if;
end if;
end process;

-- Data path for integer instructions
-- Data path for integer instructions (first execute stage)
execute1_dp: process(all)
variable a_inv : std_ulogic_vector(63 downto 0);
variable b_or_m1 : std_ulogic_vector(63 downto 0);
@ -543,6 +567,7 @@ begin
if e_in.insn_type = OP_MOD then
x_to_divider.is_modulus <= '1';
end if;
x_to_divider.flush <= flush_in;

addend := (others => '0');
if e_in.insn(26) = '0' then
@ -638,7 +663,7 @@ begin
misc_result <= darn;
when "100" =>
-- mfmsr
misc_result <= ctrl.msr;
misc_result <= ex1.msr;
when "101" =>
if e_in.insn(20) = '0' then
-- mfcr
@ -792,6 +817,7 @@ begin
variable illegal : std_ulogic;
variable privileged : std_ulogic;
variable slow_op : std_ulogic;
variable owait : std_ulogic;
begin
v := actions_type_init;
v.e.write_data := alu_result;
@ -803,12 +829,11 @@ begin
v.e.write_cr_enable := e_in.output_cr;
v.e.write_xerc_enable := e_in.output_xer;
v.e.xerc := xerc_in;
v.new_msr := ctrl.msr;
v.e.write_xerc_enable := e_in.output_xer;
v.e.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) &
not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF);
v.new_msr := ex1.msr;
v.e.redir_mode := ex1.msr(MSR_IR) & not ex1.msr(MSR_PR) &
not ex1.msr(MSR_LE) & not ex1.msr(MSR_SF);
v.e.intr_vec := 16#700#;
v.e.mode_32bit := not ctrl.msr(MSR_SF);
v.e.mode_32bit := not ex1.msr(MSR_SF);
v.e.instr_tag := e_in.instr_tag;
v.e.last_nia := e_in.nia;
v.e.br_offset := 64x"4";
@ -827,8 +852,9 @@ begin
illegal := '0';
privileged := '0';
slow_op := '0';
owait := '0';

if ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then
if ex1.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then
privileged := '1';
end if;

@ -837,7 +863,7 @@ begin
illegal := '1';
end if;

v.do_trace := ctrl.msr(MSR_SE);
v.do_trace := ex1.msr(MSR_SE);
case_0: case e_in.insn_type is
when OP_ILLEGAL =>
illegal := '1';
@ -858,7 +884,7 @@ begin
-- check bits 1-10 of the instruction to make sure it's attn
-- if not then it is illegal
if e_in.insn(10 downto 1) = "0100000000" then
v.terminate := '1';
v.se.terminate := '1';
if e_in.valid = '1' then
report "ATTN";
end if;
@ -909,10 +935,10 @@ begin
-- should never happen
v.e.redirect := '1';
end if;
if ctrl.msr(MSR_BE) = '1' then
if ex1.msr(MSR_BE) = '1' then
v.do_trace := '1';
end if;
v.write_cfar := '1';
v.se.write_cfar := '1';
when OP_BC =>
-- read_data1 is CTR
-- If this instruction updates both CTR and LR, then it is
@ -938,10 +964,10 @@ begin
v.direct_branch := '1';
v.e.br_last := '1';
v.e.br_taken := v.take_branch;
if ctrl.msr(MSR_BE) = '1' then
if ex1.msr(MSR_BE) = '1' then
v.do_trace := '1';
end if;
v.write_cfar := v.take_branch;
v.se.write_cfar := v.take_branch;
end if;
when OP_BCREG =>
-- read_data1 is CTR, read_data2 is target register (CTR, LR or TAR)
@ -964,10 +990,10 @@ begin
-- Indirect branches are never predicted taken
v.e.redirect := v.take_branch;
v.e.br_taken := v.take_branch;
if ctrl.msr(MSR_BE) = '1' then
if ex1.msr(MSR_BE) = '1' then
v.do_trace := '1';
end if;
v.write_cfar := v.take_branch;
v.se.write_cfar := v.take_branch;
end if;

when OP_RFID =>
@ -983,11 +1009,11 @@ begin
v.new_msr(MSR_IR) := '1';
v.new_msr(MSR_DR) := '1';
end if;
v.write_msr := '1';
v.se.write_msr := '1';
v.e.br_offset := b_in;
v.e.abs_br := '1';
v.e.redirect := '1';
v.write_cfar := '1';
v.se.write_cfar := '1';
if HAS_FPU then
v.fp_intr := fp_in.exception and
(a_in(MSR_FE0) or a_in(MSR_FE1));
@ -995,8 +1021,8 @@ begin
v.do_trace := '0';

when OP_CNTZ | OP_POPCNT =>
v.res2_sel := "01";
slow_op := '1';
v.start_cntz := '1';
when OP_ISEL =>
when OP_CROP =>
when OP_MCRXRX =>
@ -1010,14 +1036,19 @@ begin
end if;
elsif e_in.spr_select.valid = '1' then
if e_in.valid = '1' then
report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
"=" & to_hstring(spr_result);
report "MFSPR to slow SPR " & integer'image(decode_spr_num(e_in.insn));
end if;
slow_op := '1';
if e_in.spr_select.ispmu = '0' then
case e_in.spr_select.sel is
when SPRSEL_LOGD =>
v.se.inc_loga := '1';
when others =>
end case;
v.res2_sel := "10";
else
v.res2_sel := "11";
end if;
case e_in.spr_select.sel is
when SPRSEL_LOGD =>
v.inc_loga := '1';
when others =>
end case;
else
-- mfspr from unimplemented SPRs should be a nop in
-- supervisor mode and a program interrupt for user mode
@ -1025,7 +1056,7 @@ begin
report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
" invalid";
end if;
if ctrl.msr(MSR_PR) = '1' then
if ex1.msr(MSR_PR) = '1' then
illegal := '1';
end if;
end if;
@ -1033,7 +1064,7 @@ begin
when OP_MFCR =>
when OP_MTCRF =>
when OP_MTMSRD =>
v.write_msr := '1';
v.se.write_msr := '1';
if e_in.insn(16) = '1' then
-- just update EE and RI
v.new_msr(MSR_EE) := c_in(MSR_EE);
@ -1062,7 +1093,7 @@ begin
report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
"=" & to_hstring(c_in);
end if;
v.write_pmuspr := e_in.spr_select.ispmu;
v.se.write_pmuspr := e_in.spr_select.ispmu;
if e_in.spr_select.valid = '1' and e_in.spr_select.ispmu = '0' then
case e_in.spr_select.sel is
when SPRSEL_XER =>
@ -1071,17 +1102,17 @@ begin
v.e.xerc.ca := c_in(63-34);
v.e.xerc.ov32 := c_in(63-44);
v.e.xerc.ca32 := c_in(63-45);
v.write_xerlow := '1';
v.se.write_xerlow := '1';
when SPRSEL_DEC =>
v.write_dec := '1';
v.se.write_dec := '1';
when SPRSEL_LOGA =>
v.write_loga := '1';
v.se.write_loga := '1';
when others =>
end case;
elsif is_fast_spr(e_in.write_reg) = '0' then
-- mtspr to unimplemented SPRs should be a nop in
-- supervisor mode and a program interrupt for user mode
if ctrl.msr(MSR_PR) = '1' then
if ex1.msr(MSR_PR) = '1' then
illegal := '1';
end if;
end if;
@ -1095,7 +1126,7 @@ begin
v.e.redirect := '1';

when OP_ICBI =>
v.icache_inval := '1';
v.se.icache_inval := '1';

when OP_MUL_L64 =>
if HAS_SHORT_MULT and e_in.insn(26) = '1' and
@ -1109,15 +1140,18 @@ begin
-- Use standard multiplier
v.start_mul := '1';
slow_op := '1';
owait := '1';
end if;

when OP_MUL_H64 | OP_MUL_H32 =>
v.start_mul := '1';
slow_op := '1';
owait := '1';

when OP_DIV | OP_DIVE | OP_MOD =>
v.start_div := '1';
slow_op := '1';
owait := '1';

when OP_FETCH_FAILED =>
-- Handling an ITLB miss doesn't count as having executed an instruction
@ -1147,7 +1181,7 @@ begin
report "illegal instruction";
end if;

elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then
elsif HAS_FPU and ex1.msr(MSR_FP) = '0' and e_in.fac = FPU then
-- generate a floating-point unavailable interrupt
v.exception := '1';
v.e.intr_vec := 16#800#;
@ -1157,26 +1191,33 @@ begin
end if;

if e_in.unit = ALU then
v.complete := e_in.valid and not v.exception and not slow_op;
v.complete := e_in.valid and not v.exception and not owait;
v.bypass_valid := e_in.valid and not v.exception and not slow_op;
end if;

actions <= v;
end process;

-- First execute stage
execute1_1: process(all)
variable v : reg_type;
variable v : reg_stage1_type;
variable overflow : std_ulogic;
variable lv : Execute1ToLoadstore1Type;
variable irq_valid : std_ulogic;
variable exception : std_ulogic;
variable fv : Execute1ToFPUType;
variable go : std_ulogic;
variable bypass_valid : std_ulogic;
begin
v := ex1;
if ex1.busy = '0' then
if (ex1.busy or l_in.busy or fp_in.busy) = '0' then
v.e := actions.e;
v.e.valid := '0';
v.oe := e_in.oe;
v.spr_select := e_in.spr_select;
v.pmu_spr_num := e_in.insn(20 downto 16);
v.mul_select := e_in.sub_select(1 downto 0);
v.se := side_effect_init;
end if;

lv := Execute1ToLoadstore1Init;
@ -1184,33 +1225,13 @@ begin

x_to_multiply.valid <= '0';
x_to_divider.valid <= '0';
v.mul_in_progress := '0';
v.div_in_progress := '0';
v.cntz_in_progress := '0';
v.mul_finish := '0';
v.ext_interrupt := '0';
v.taken_branch_event := '0';
v.br_mispredict := '0';
v.busy := '0';
bypass_valid := '0';

x_to_pmu.mfspr <= '0';
x_to_pmu.mtspr <= '0';
x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47);
x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51);
x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55);
x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63);
x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM);
x_to_pmu.pr_msr <= ctrl.msr(MSR_PR);

ctrl_tmp <= ctrl;
-- FIXME: run at 512MHz not core freq
ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1);

irq_valid := ctrl.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in);

v.terminate := '0';
icache_inval <= '0';
v.busy := '0';
irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in);

-- Next insn adder used in a couple of places
next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4);
@ -1223,19 +1244,14 @@ begin

do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0';

if ex1.intr_pending = '1' then
v.e.srr1 := ex1.e.srr1;
v.e.intr_vec := ex1.e.intr_vec;
end if;

if valid_in = '1' then
v.prev_op := e_in.insn_type;
end if;

-- Determine if there is any interrupt to be taken
-- before/instead of executing this instruction
exception := ex1.intr_pending or (valid_in and actions.exception);
if valid_in = '1' and e_in.second = '0' and ex1.intr_pending = '0' then
exception := valid_in and actions.exception;
if valid_in = '1' and e_in.second = '0' then
if HAS_FPU and ex1.fp_exception_next = '1' then
-- This is used for FP-type program interrupts that
-- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
@ -1278,54 +1294,37 @@ begin

end if;
end if;
if exception = '1' and l_in.in_progress = '1' then
-- We can't send this interrupt to writeback yet because there are
-- still instructions in loadstore1 that haven't completed.
v.intr_pending := '1';
v.busy := '1';
end if;

v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or ex1.busy or fp_in.busy);
v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or
ex1.busy or fp_in.busy);

go := valid_in and not exception;
v.instr_dispatch := go;

if go = '1' then
v.se := actions.se;
v.e.valid := actions.complete;
bypass_valid := actions.bypass_valid;
v.taken_branch_event := actions.take_branch;
v.br_taken := actions.take_branch;
v.trace_next := actions.do_trace;
v.fp_exception_next := actions.fp_intr;
v.cntz_in_progress := actions.start_cntz;

if actions.write_msr = '1' then
ctrl_tmp.msr <= actions.new_msr;
end if;
if actions.write_xerlow = '1' then
ctrl_tmp.xer_low <= c_in(17 downto 0);
end if;
if actions.write_dec = '1' then
ctrl_tmp.dec <= c_in;
end if;
if actions.write_cfar = '1' then
ctrl_tmp.cfar <= e_in.nia;
end if;
if actions.write_loga = '1' then
v.log_addr_spr := c_in(31 downto 0);
elsif actions.inc_loga = '1' then
v.log_addr_spr := std_ulogic_vector(unsigned(ex1.log_addr_spr) + 1);
end if;
x_to_pmu.mtspr <= actions.write_pmuspr;
icache_inval <= actions.icache_inval;
v.res2_sel := actions.res2_sel;
v.msr := actions.new_msr;
x_to_multiply.valid <= actions.start_mul;
v.mul_in_progress := actions.start_mul;
x_to_divider.valid <= actions.start_div;
v.div_in_progress := actions.start_div;
v.terminate := actions.terminate;
v.br_mispredict := v.e.redirect and actions.direct_branch;
v.busy := actions.start_cntz or actions.start_mul or actions.start_div;
exception := actions.trap;

-- Go busy while division is happening because the
-- divider is not pipelined. Also go busy while a
-- multiply is happening in order to stop following
-- instructions from using the wrong XER value
-- (and for simplicity in the OE=0 case).
v.busy := actions.start_div or actions.start_mul;

-- instruction for other units, i.e. LDST
if e_in.unit = LDST then
lv.valid := '1';
@ -1335,86 +1334,74 @@ begin
end if;
end if;

-- The following cases all occur when ex1.busy = 1 and therefore
-- valid_in = 0. Hence they don't happen in the same cycle as any of
-- the cases above which depend on valid_in = 1.
if ex1.cntz_in_progress = '1' then
-- cnt[lt]z and popcnt* always take two cycles
v.e.valid := '1';
v.e.write_data := countbits_result;
end if;
if ex1.div_in_progress = '1' then
if divider_to_x.valid = '1' then
v.e.write_data := muldiv_result;
overflow := divider_to_x.overflow;
-- We must test oe because the RC update code in writeback
-- will use the xerc value to set CR0:SO so we must not clobber
-- xerc if OE wasn't set.
if ex1.oe = '1' then
v.e.xerc.ov := overflow;
v.e.xerc.ov32 := overflow;
if overflow = '1' then
v.e.xerc.so := '1';
end if;
if ex1.div_in_progress = '1' then
v.div_in_progress := not divider_to_x.valid;
v.busy := not divider_to_x.valid;
if divider_to_x.valid = '1' and ex1.oe = '1' then
v.e.xerc.ov := divider_to_x.overflow;
v.e.xerc.ov32 := divider_to_x.overflow;
if divider_to_x.overflow = '1' then
v.e.xerc.so := '1';
end if;
v.e.valid := '1';
else
v.busy := '1';
v.div_in_progress := '1';
end if;
end if;
v.e.valid := divider_to_x.valid;
v.e.write_data := alu_result;
bypass_valid := v.e.valid;
end if;
if ex1.mul_in_progress = '1' then
if multiply_to_x.valid = '1' then
v.e.write_data := muldiv_result;
if ex1.oe = '1' then
-- have to wait until next cycle for overflow indication
v.mul_finish := '1';
v.busy := '1';
else
v.e.valid := '1';
end if;
else
v.busy := '1';
v.mul_in_progress := '1';
end if;
if ex1.mul_in_progress = '1' then
v.mul_in_progress := not multiply_to_x.valid;
v.mul_finish := multiply_to_x.valid and ex1.oe;
v.e.valid := multiply_to_x.valid and not ex1.oe;
v.busy := not v.e.valid;
v.e.write_data := alu_result;
bypass_valid := v.e.valid;
end if;
if ex1.mul_finish = '1' then
v.mul_finish := '0';
v.e.xerc.ov := multiply_to_x.overflow;
v.e.xerc.ov32 := multiply_to_x.overflow;
if multiply_to_x.overflow = '1' then
v.e.xerc.so := '1';
end if;
v.e.valid := '1';
end if;
end if;

v.e.interrupt := exception and not (l_in.in_progress or l_in.interrupt);
if v.e.interrupt = '1' then
v.intr_pending := '0';
if v.e.write_xerc_enable = '1' and v.e.valid = '1' then
v.xerc := v.e.xerc;
v.xerc_valid := '1';
end if;

if interrupt_in = '1' then
ctrl_tmp.msr(MSR_SF) <= '1';
ctrl_tmp.msr(MSR_EE) <= '0';
ctrl_tmp.msr(MSR_PR) <= '0';
ctrl_tmp.msr(MSR_SE) <= '0';
ctrl_tmp.msr(MSR_BE) <= '0';
ctrl_tmp.msr(MSR_FP) <= '0';
ctrl_tmp.msr(MSR_FE0) <= '0';
ctrl_tmp.msr(MSR_FE1) <= '0';
ctrl_tmp.msr(MSR_IR) <= '0';
ctrl_tmp.msr(MSR_DR) <= '0';
ctrl_tmp.msr(MSR_RI) <= '0';
ctrl_tmp.msr(MSR_LE) <= '1';
if (ex1.busy or l_in.busy or fp_in.busy) = '0' then
v.e.interrupt := exception;
end if;
if v.e.valid = '0' then
v.e.redirect := '0';
v.e.br_last := '0';
end if;
if flush_in = '1' then
v.e.valid := '0';
v.e.interrupt := '0';
v.e.redirect := '0';
v.e.br_last := '0';
v.busy := '0';
v.div_in_progress := '0';
v.mul_in_progress := '0';
v.mul_finish := '0';
v.xerc_valid := '0';
end if;
if flush_in = '1' or interrupt_in = '1' then
v.msr := ctrl_tmp.msr;
end if;
if interrupt_in = '1' then
v.trace_next := '0';
v.fp_exception_next := '0';
v.intr_pending := '0';
end if;

bypass_data.tag.valid <= v.e.write_enable and v.e.valid;
bypass_data.tag.valid <= v.e.write_enable and bypass_valid;
bypass_data.tag.tag <= v.e.instr_tag.tag;
bypass_data.data <= v.e.write_data;
bypass_data.data <= alu_result;

bypass_cr_data.tag.valid <= v.e.write_cr_enable and v.e.valid;
bypass_cr_data.tag.valid <= v.e.write_cr_enable and bypass_valid;
bypass_cr_data.tag.tag <= v.e.instr_tag.tag;
bypass_cr_data.data <= v.e.write_cr_data;

@ -1427,7 +1414,7 @@ begin
lv.data := c_in;
lv.write_reg := e_in.write_reg;
lv.length := e_in.data_len;
lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE);
lv.byte_reverse := e_in.byte_reverse xnor ex1.msr(MSR_LE);
lv.sign_extend := e_in.sign_extend;
lv.update := e_in.update;
lv.xerc := xerc_in;
@ -1439,9 +1426,9 @@ begin
e_in.insn(5 downto 1) = "10101" then
lv.ci := '1';
end if;
lv.virt_mode := ctrl.msr(MSR_DR);
lv.priv_mode := not ctrl.msr(MSR_PR);
lv.mode_32bit := not ctrl.msr(MSR_SF);
lv.virt_mode := ex1.msr(MSR_DR);
lv.priv_mode := not ex1.msr(MSR_PR);
lv.mode_32bit := not ex1.msr(MSR_SF);
lv.is_32bit := e_in.is_32bit;
lv.repeat := e_in.repeat;
lv.second := e_in.second;
@ -1452,7 +1439,7 @@ begin
fv.insn := e_in.insn;
fv.itag := e_in.instr_tag;
fv.single := e_in.is_32bit;
fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1);
fv.fe_mode := ex1.msr(MSR_FE0) & ex1.msr(MSR_FE1);
fv.fra := a_in;
fv.frb := b_in;
fv.frc := c_in;
@ -1465,19 +1452,124 @@ begin

-- update outputs
l_out <= lv;
e_out <= ex1.e;
if ex1.e.valid = '0' then
e_out.write_enable <= '0';
e_out.write_cr_enable <= '0';
e_out.write_xerc_enable <= '0';
e_out.redirect <= '0';
e_out.br_last <= '0';
fp_out <= fv;
irq_valid_log <= irq_valid;
end process;

-- Slow SPR read mux
with ex1.spr_select.sel select spr_result <=
ctrl.tb when SPRSEL_TB,
32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU,
ctrl.dec when SPRSEL_DEC,
32x"0" & PVR_MICROWATT when SPRSEL_PVR,
log_wr_addr & ex2.log_addr_spr when SPRSEL_LOGA,
log_rd_data when SPRSEL_LOGD,
ctrl.cfar when SPRSEL_CFAR,
assemble_xer(ex1.e.xerc, ctrl.xer_low) when others;

-- Second stage result mux
with ex1.res2_sel select ex_result <=
countbits_result when "01",
spr_result when "10",
pmu_to_x.spr_val when "11",
ex1.e.write_data when others;

-- Second execute stage control
execute2_1: process(all)
variable v : reg_stage2_type;
variable overflow : std_ulogic;
variable lv : Execute1ToLoadstore1Type;
variable fv : Execute1ToFPUType;
variable k : integer;
variable go : std_ulogic;
begin
v := ex2;
if (l_in.busy or fp_in.busy) = '0' then
v.e := ex1.e;
v.se := ex1.se;
v.e.write_data := ex_result;
v.ext_interrupt := ex1.ext_interrupt;
v.taken_branch_event := ex1.taken_branch_event;
v.br_mispredict := ex1.br_mispredict;
end if;

ctrl_tmp <= ctrl;
-- FIXME: run at 512MHz not core freq
ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1);

x_to_pmu.mfspr <= '0';
x_to_pmu.mtspr <= '0';
x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47);
x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51);
x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55);
x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63);
x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM);
x_to_pmu.pr_msr <= ctrl.msr(MSR_PR);

if v.e.valid = '0' or flush_in = '1' then
v.e.write_enable := '0';
v.e.write_cr_enable := '0';
v.e.write_xerc_enable := '0';
v.e.redirect := '0';
v.e.br_last := '0';
v.se := side_effect_init;
v.taken_branch_event := '0';
v.br_mispredict := '0';
end if;
if flush_in = '1' then
v.e.valid := '0';
v.e.interrupt := '0';
v.ext_interrupt := '0';
end if;

if (l_in.busy or fp_in.busy) = '0' then
if ex1.se.write_msr = '1' then
ctrl_tmp.msr <= ex1.msr;
end if;
if ex1.se.write_xerlow = '1' then
ctrl_tmp.xer_low <= ex1.e.write_data(17 downto 0);
end if;
if ex1.se.write_dec = '1' then
ctrl_tmp.dec <= ex1.e.write_data;
end if;
if ex1.se.write_cfar = '1' then
ctrl_tmp.cfar <= ex1.e.last_nia;
end if;
if ex1.se.write_loga = '1' then
v.log_addr_spr := ex1.e.write_data(31 downto 0);
elsif ex1.se.inc_loga = '1' then
v.log_addr_spr := std_ulogic_vector(unsigned(ex2.log_addr_spr) + 1);
end if;
x_to_pmu.mtspr <= ex1.se.write_pmuspr;
end if;

if interrupt_in = '1' then
ctrl_tmp.msr(MSR_SF) <= '1';
ctrl_tmp.msr(MSR_EE) <= '0';
ctrl_tmp.msr(MSR_PR) <= '0';
ctrl_tmp.msr(MSR_SE) <= '0';
ctrl_tmp.msr(MSR_BE) <= '0';
ctrl_tmp.msr(MSR_FP) <= '0';
ctrl_tmp.msr(MSR_FE0) <= '0';
ctrl_tmp.msr(MSR_FE1) <= '0';
ctrl_tmp.msr(MSR_IR) <= '0';
ctrl_tmp.msr(MSR_DR) <= '0';
ctrl_tmp.msr(MSR_RI) <= '0';
ctrl_tmp.msr(MSR_LE) <= '1';
end if;

-- Update registers
ex2in <= v;

-- update outputs
e_out <= ex2.e;
e_out.msr <= msr_copy(ctrl.msr);
fp_out <= fv;

exception_log <= exception;
irq_valid_log <= irq_valid;
terminate_out <= ex2.se.terminate;
icache_inval <= ex2.se.icache_inval;

exception_log <= v.e.interrupt;
end process;

e1_log: if LOG_LENGTH > 0 generate
@ -1492,9 +1584,9 @@ begin
irq_valid_log &
interrupt_in &
"000" &
ex1.e.write_enable &
ex1.e.valid &
((ex1.e.redirect and ex1.e.valid) or ex1.e.interrupt) &
ex2.e.write_enable &
ex2.e.valid &
(ex2.e.redirect or ex2.e.interrupt) &
ex1.busy &
flush_in;
end if;

Loading…
Cancel
Save