From 2f45e545ed86795c0f282204a27f97887329051f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 8 Jul 2022 14:07:28 +1000 Subject: [PATCH] decode2: Rework to make the stall_out signal come from a register At present the busy/stall signal going to decode1 depends on whether control thinks it can issue the current instruction, and that depends on completion and bypass signals coming from execute1 and writeback. To improve the timing of stall_out, this rearranges decode2 so that stall_out is asserted when we have a valid instruction that couldn't be issued in the previous cycle. This means that decode1 could give us a new instruction when we haven't issued the previous instruction. This in turn means that we can only use d_in in the first cycle of processing an instruction. After the first cycle, we get register addresses etc. from dc2 rather than d_in. Then, to avoid the need to read register operands from register_file in each cycle until the instruction issues, we bring the bypass path for data being written to the register file into decode2 explicitly rather than having it in register_file. A new process called decode2_addrs calls decode_input_reg_* and decode_output_reg and sets up the register file addresses. This work was split out of the decode2_1 process (and decode_input_reg_* reworked) to try to reduce the number of passes through decode2_1 that need to be done in simulation. 
Signed-off-by: Paul Mackerras --- common.vhdl | 1 + control.vhdl | 31 ++-- core.vhdl | 5 + decode2.vhdl | 430 +++++++++++++++++++++++++-------------------- register_file.vhdl | 14 +- writeback.vhdl | 7 + 6 files changed, 269 insertions(+), 219 deletions(-) diff --git a/common.vhdl b/common.vhdl index ea6a8d8..54a87d2 100644 --- a/common.vhdl +++ b/common.vhdl @@ -288,6 +288,7 @@ package common is write_reg_enable: std_ulogic; read_reg1: gspr_index_t; read_reg2: gspr_index_t; + read_reg3: gspr_index_t; read_data1: std_ulogic_vector(63 downto 0); read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); diff --git a/control.vhdl b/control.vhdl index 17a288b..e6855c2 100644 --- a/control.vhdl +++ b/control.vhdl @@ -15,9 +15,7 @@ entity control is complete_in : in instr_tag_t; valid_in : in std_ulogic; - repeated : in std_ulogic; flush_in : in std_ulogic; - busy_in : in std_ulogic; deferred : in std_ulogic; sgl_pipe_in : in std_ulogic; stop_mark_in : in std_ulogic; @@ -43,7 +41,6 @@ entity control is cr_write_in : in std_ulogic; valid_out : out std_ulogic; - stall_out : out std_ulogic; stopped_out : out std_ulogic; gpr_bypass_a : out std_ulogic_vector(1 downto 0); @@ -157,9 +154,6 @@ begin tag_a.tag := i; end if; end loop; - if tag_match(tag_a, complete_in) then - tag_a.valid := '0'; - end if; tag_b := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_b_read_in then @@ -167,9 +161,6 @@ begin tag_b.tag := i; end if; end loop; - if tag_match(tag_b, complete_in) then - tag_b.valid := '0'; - end if; tag_c := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then @@ -177,26 +168,29 @@ begin tag_c.tag := i; end if; end loop; - if tag_match(tag_c, complete_in) then - tag_c.valid := '0'; - end if; byp_a := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then - byp_a := "10"; + 
byp_a := "01"; elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then + byp_a := "10"; + elsif tag_match(complete_in, tag_a) then byp_a := "11"; end if; byp_b := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then - byp_b := "10"; + byp_b := "01"; elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then + byp_b := "10"; + elsif tag_match(complete_in, tag_b) then byp_b := "11"; end if; byp_c := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then - byp_c := "10"; + byp_c := "01"; elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then + byp_c := "10"; + elsif tag_match(complete_in, tag_c) then byp_c := "11"; end if; @@ -204,9 +198,9 @@ begin gpr_bypass_b <= byp_b; gpr_bypass_c <= byp_c; - gpr_tag_stall <= (tag_a.valid and not byp_a(1)) or - (tag_b.valid and not byp_b(1)) or - (tag_c.valid and not byp_c(1)); + gpr_tag_stall <= (tag_a.valid and not (or (byp_a))) or + (tag_b.valid and not (or (byp_b))) or + (tag_c.valid and not (or (byp_c))); incr_tag := curr_tag; instr_tag.tag <= curr_tag; @@ -331,7 +325,6 @@ begin -- update outputs valid_out <= valid_tmp; - stall_out <= stall_tmp or deferred; -- update registers rin_int <= v_int; diff --git a/core.vhdl b/core.vhdl index 23f7e82..ba8f0cc 100644 --- a/core.vhdl +++ b/core.vhdl @@ -100,6 +100,9 @@ architecture behave of core is signal fpu_to_execute1: FPUToExecute1Type; signal fpu_to_writeback: FPUToWritebackType; + -- Writeback signals + signal writeback_bypass: bypass_data_t; + -- local signals signal fetch1_stall_in : std_ulogic; signal icache_stall_out : std_ulogic; @@ -302,6 +305,7 @@ begin execute_cr_bypass => execute1_cr_bypass, execute2_bypass => execute2_bypass, execute2_cr_bypass => execute2_cr_bypass, + writeback_bypass => writeback_bypass, log_out => log_data(119 downto 110) ); decode2_busy_in <= ex1_busy_out; @@ -463,6 +467,7 @@ begin w_out => writeback_to_register_file, c_out => writeback_to_cr_file, f_out => writeback_to_fetch1, + wb_bypass => writeback_bypass, 
events => writeback_events, interrupt_out => do_interrupt, complete_out => complete diff --git a/decode2.vhdl b/decode2.vhdl index 371c48c..41f3e09 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -41,6 +41,7 @@ entity decode2 is execute_cr_bypass : in cr_bypass_data_t; execute2_bypass : in bypass_data_t; execute2_cr_bypass : in cr_bypass_data_t; + writeback_bypass : in bypass_data_t; log_out : out std_ulogic_vector(9 downto 0) ); @@ -49,8 +50,16 @@ end entity decode2; architecture behaviour of decode2 is type reg_type is record e : Decode2ToExecute1Type; - repeat : std_ulogic; + repeat : repeat_t; + busy : std_ulogic; + sgl_pipe : std_ulogic; + reg_a_valid : std_ulogic; + reg_b_valid : std_ulogic; + reg_c_valid : std_ulogic; + reg_o_valid : std_ulogic; end record; + constant reg_type_init : reg_type := + (e => Decode2ToExecute1Init, repeat => NONE, others => '0'); signal dc2, dc2in : reg_type; @@ -61,20 +70,21 @@ architecture behaviour of decode2 is reg : gspr_index_t; data : std_ulogic_vector(63 downto 0); end record; + constant decode_input_reg_init : decode_input_reg_t := ('0', (others => '0'), (others => '0')); type decode_output_reg_t is record reg_valid : std_ulogic; reg : gspr_index_t; end record; + constant decode_output_reg_init : decode_output_reg_t := ('0', (others => '0')); function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0); ispr : gspr_index_t; instr_addr : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is begin if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then - return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data); + return ('1', gpr_to_gspr(insn_ra(insn_in)), (others => '0')); elsif t = SPR then -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. 
-- If it's all 0, we don't treat it as a dependency as slow SPRs @@ -83,27 +93,26 @@ architecture behaviour of decode2 is assert is_fast_spr(ispr) = '1' or ispr = "0000000" report "Decode A says SPR but ISPR is invalid:" & to_hstring(ispr) severity failure; - return (is_fast_spr(ispr), ispr, reg_data); + return (is_fast_spr(ispr), ispr, (others => '0')); elsif t = CIA then return ('0', (others => '0'), instr_addr); elsif HAS_FPU and t = FRA then - return ('1', fpr_to_gspr(insn_fra(insn_in)), reg_data); + return ('1', fpr_to_gspr(insn_fra(insn_in)), (others => '0')); else return ('0', (others => '0'), (others => '0')); end if; end; function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0); ispr : gspr_index_t) return decode_input_reg_t is variable ret : decode_input_reg_t; begin case t is when RB => - ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data); + ret := ('1', gpr_to_gspr(insn_rb(insn_in)), (others => '0')); when FRB => if HAS_FPU then - ret := ('1', fpr_to_gspr(insn_frb(insn_in)), reg_data); + ret := ('1', fpr_to_gspr(insn_frb(insn_in)), (others => '0')); else ret := ('0', (others => '0'), (others => '0')); end if; @@ -138,7 +147,7 @@ architecture behaviour of decode2 is assert is_fast_spr(ispr) = '1' or ispr = "0000000" report "Decode B says SPR but ISPR is invalid:" & to_hstring(ispr) severity failure; - ret := (is_fast_spr(ispr), ispr, reg_data); + ret := (is_fast_spr(ispr), ispr, (others => '0')); when NONE => ret := ('0', (others => '0'), (others => '0')); end case; @@ -146,23 +155,23 @@ architecture behaviour of decode2 is return ret; end; - function decode_input_reg_c (t : input_reg_c_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is + function decode_input_reg_c (t : input_reg_c_t; insn_in : std_ulogic_vector(31 downto 0)) + return decode_input_reg_t is begin case t is when RS => - return ('1', 
gpr_to_gspr(insn_rs(insn_in)), reg_data); + return ('1', gpr_to_gspr(insn_rs(insn_in)), (others => '0')); when RCR => - return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data); + return ('1', gpr_to_gspr(insn_rcreg(insn_in)), (others => '0')); when FRS => if HAS_FPU then - return ('1', fpr_to_gspr(insn_frt(insn_in)), reg_data); + return ('1', fpr_to_gspr(insn_frt(insn_in)), (others => '0')); else return ('0', (others => '0'), (others => '0')); end if; when FRC => if HAS_FPU then - return ('1', fpr_to_gspr(insn_frc(insn_in)), reg_data); + return ('1', fpr_to_gspr(insn_frc(insn_in)), (others => '0')); else return ('0', (others => '0'), (others => '0')); end if; @@ -264,10 +273,14 @@ architecture behaviour of decode2 is others => "000" ); + signal decoded_reg_a : decode_input_reg_t; + signal decoded_reg_b : decode_input_reg_t; + signal decoded_reg_c : decode_input_reg_t; + signal decoded_reg_o : decode_output_reg_t; + -- issue control signals signal control_valid_in : std_ulogic; signal control_valid_out : std_ulogic; - signal control_stall_out : std_ulogic; signal control_sgl_pipe : std_logic; signal gpr_write_valid : std_ulogic; @@ -302,8 +315,6 @@ begin complete_in => complete_in, valid_in => control_valid_in, - repeated => dc2.repeat, - busy_in => busy_in, deferred => deferred, flush_in => flush_in, sgl_pipe_in => control_sgl_pipe, @@ -331,7 +342,6 @@ begin cr_bypass => cr_bypass, valid_out => control_valid_out, - stall_out => control_stall_out, stopped_out => stopped_out, gpr_bypass_a => gpr_a_bypass, @@ -346,9 +356,12 @@ begin decode2_0: process(clk) begin if rising_edge(clk) then - if rst = '1' or flush_in = '1' or deferred = '0' then + if rst = '1' or flush_in = '1' then + dc2 <= reg_type_init; + elsif deferred = '0' then if dc2in.e.valid = '1' then - report "execute " & to_hstring(dc2in.e.nia); + report "execute " & to_hstring(dc2in.e.nia) & + " tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid); end if; dc2 <= dc2in; 
end if; @@ -357,205 +370,246 @@ begin c_out.read <= d_in.decode.input_cr; + decode2_addrs: process(all) + begin + decoded_reg_a <= decode_input_reg_init; + decoded_reg_b <= decode_input_reg_init; + decoded_reg_c <= decode_input_reg_init; + decoded_reg_o <= decode_output_reg_init; + if d_in.valid = '1' then + decoded_reg_a <= decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, d_in.ispr1, d_in.nia); + decoded_reg_b <= decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, d_in.ispr2); + decoded_reg_c <= decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn); + decoded_reg_o <= decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro); + end if; + + r_out.read1_enable <= decoded_reg_a.reg_valid; + r_out.read1_reg <= decoded_reg_a.reg; + r_out.read2_enable <= decoded_reg_b.reg_valid; + r_out.read2_reg <= decoded_reg_b.reg; + r_out.read3_enable <= decoded_reg_c.reg_valid; + r_out.read3_reg <= decoded_reg_c.reg; + + end process; + decode2_1: process(all) variable v : reg_type; - variable decoded_reg_a : decode_input_reg_t; - variable decoded_reg_b : decode_input_reg_t; - variable decoded_reg_c : decode_input_reg_t; - variable decoded_reg_o : decode_output_reg_t; variable length : std_ulogic_vector(3 downto 0); variable op : insn_type_t; + variable valid_in : std_ulogic; begin v := dc2; - v.e := Decode2ToExecute1Init; - - --v.e.input_cr := d_in.decode.input_cr; - v.e.output_cr := d_in.decode.output_cr; + valid_in := d_in.valid or dc2.busy; - -- Work out whether XER common bits are set - v.e.output_xer := d_in.decode.output_carry; - case d_in.decode.insn_type is - when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE => - -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only - if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then - v.e.oe := '1'; - v.e.output_xer := '1'; - end if; - when OP_MTSPR => - if decode_spr_num(d_in.insn) = SPR_XER then - v.e.output_xer := '1'; - end if; - when others => - end case; + if dc2.busy = '0' then + 
v.e := Decode2ToExecute1Init; - decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1, - d_in.nia); - decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); - decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); - decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro); + v.sgl_pipe := d_in.decode.sgl_pipe; - if d_in.decode.lr = '1' then - v.e.lr := insn_lk(d_in.insn); - -- b and bc have even major opcodes; bcreg is considered absolute - v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26); - end if; - op := d_in.decode.insn_type; + v.e.input_cr := d_in.decode.input_cr; + v.e.output_cr := d_in.decode.output_cr; - if d_in.decode.repeat /= NONE then - v.e.repeat := '1'; - v.e.second := dc2.repeat; - case d_in.decode.repeat is - when DUPD => - -- update-form loads, 2nd instruction writes RA - if dc2.repeat = '1' then - decoded_reg_o.reg := decoded_reg_a.reg; + -- Work out whether XER common bits are set + v.e.output_xer := d_in.decode.output_carry; + case d_in.decode.insn_type is + when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE => + -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only + if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then + v.e.oe := '1'; + v.e.output_xer := '1'; + end if; + when OP_MTSPR => + if decode_spr_num(d_in.insn) = SPR_XER then + v.e.output_xer := '1'; end if; when others => end case; - elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then - -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled - v.e.repeat := '1'; - v.e.second := dc2.repeat; - -- first one does CTR, second does LR - decoded_reg_o.reg(0) := not dc2.repeat; - end if; - v.e.spr_select := d_in.spr_info; + v.reg_a_valid := decoded_reg_a.reg_valid; + v.reg_b_valid := decoded_reg_b.reg_valid; + v.reg_c_valid := decoded_reg_c.reg_valid; + v.reg_o_valid := 
decoded_reg_o.reg_valid; - r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid; - r_out.read1_reg <= decoded_reg_a.reg; - r_out.read2_enable <= decoded_reg_b.reg_valid and d_in.valid; - r_out.read2_reg <= decoded_reg_b.reg; - r_out.read3_enable <= decoded_reg_c.reg_valid and d_in.valid; - r_out.read3_reg <= decoded_reg_c.reg; + if d_in.decode.lr = '1' then + v.e.lr := insn_lk(d_in.insn); + -- b and bc have even major opcodes; bcreg is considered absolute + v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26); + end if; + op := d_in.decode.insn_type; + + v.repeat := d_in.decode.repeat; + if d_in.decode.repeat /= NONE then + v.e.repeat := '1'; + elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then + -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled + v.e.repeat := '1'; + end if; - case d_in.decode.length is - when is1B => - length := "0001"; - when is2B => - length := "0010"; - when is4B => - length := "0100"; - when is8B => - length := "1000"; - when NONE => - length := "0000"; - end case; + v.e.spr_select := d_in.spr_info; + + case d_in.decode.length is + when is1B => + length := "0001"; + when is2B => + length := "0010"; + when is4B => + length := "0100"; + when is8B => + length := "1000"; + when NONE => + length := "0000"; + end case; - -- execute unit - v.e.nia := d_in.nia; - v.e.unit := d_in.decode.unit; - v.e.fac := d_in.decode.facility; - v.e.instr_tag := instr_tag; - v.e.read_reg1 := decoded_reg_a.reg; - v.e.read_reg2 := decoded_reg_b.reg; - v.e.write_reg := decoded_reg_o.reg; - v.e.write_reg_enable := decoded_reg_o.reg_valid; - v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.e.xerc := c_in.read_xerc_data; - v.e.invert_a := d_in.decode.invert_a; - v.e.addm1 := '0'; - v.e.insn_type := op; - v.e.invert_out := d_in.decode.invert_out; - v.e.input_carry := d_in.decode.input_carry; - v.e.output_carry := d_in.decode.output_carry; - v.e.is_32bit := d_in.decode.is_32bit; - v.e.is_signed := d_in.decode.is_signed; - v.e.insn := 
d_in.insn; - v.e.data_len := length; - v.e.byte_reverse := d_in.decode.byte_reverse; - v.e.sign_extend := d_in.decode.sign_extend; - v.e.update := d_in.decode.update; - v.e.reserve := d_in.decode.reserve; - v.e.br_pred := d_in.br_pred; - v.e.result_sel := result_select(op); - v.e.sub_select := subresult_select(op); - if op = OP_BC or op = OP_BCREG then - if d_in.insn(23) = '0' and dc2.repeat = '0' and - not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then - -- decrement CTR if BO(2) = 0 and not bcctr - v.e.addm1 := '1'; - v.e.result_sel := "000"; -- select adder output + -- execute unit + v.e.nia := d_in.nia; + v.e.unit := d_in.decode.unit; + v.e.fac := d_in.decode.facility; + v.e.read_reg1 := decoded_reg_a.reg; + v.e.read_reg2 := decoded_reg_b.reg; + v.e.read_reg3 := decoded_reg_c.reg; + v.e.write_reg := decoded_reg_o.reg; + v.e.write_reg_enable := decoded_reg_o.reg_valid; + v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); + v.e.xerc := c_in.read_xerc_data; + v.e.invert_a := d_in.decode.invert_a; + v.e.addm1 := '0'; + v.e.insn_type := op; + v.e.invert_out := d_in.decode.invert_out; + v.e.input_carry := d_in.decode.input_carry; + v.e.output_carry := d_in.decode.output_carry; + v.e.is_32bit := d_in.decode.is_32bit; + v.e.is_signed := d_in.decode.is_signed; + v.e.insn := d_in.insn; + v.e.data_len := length; + v.e.byte_reverse := d_in.decode.byte_reverse; + v.e.sign_extend := d_in.decode.sign_extend; + v.e.update := d_in.decode.update; + v.e.reserve := d_in.decode.reserve; + v.e.br_pred := d_in.br_pred; + v.e.result_sel := result_select(op); + v.e.sub_select := subresult_select(op); + if op = OP_BC or op = OP_BCREG then + if d_in.insn(23) = '0' and + not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then + -- decrement CTR if BO(2) = 0 and not bcctr + v.e.addm1 := '1'; + v.e.result_sel := "000"; -- select adder output + end if; end if; - end if; - if op = OP_MFSPR then - if is_fast_spr(d_in.ispr1) = '1' then - v.e.result_sel := "000"; -- 
adder_result, effectively a_in - elsif d_in.spr_info.valid = '0' then - -- Privileged mfspr to invalid/unimplemented SPR numbers - -- writes the contents of RT back to RT (i.e. it's a no-op) - v.e.result_sel := "001"; -- logical_result - elsif d_in.spr_info.ispmu = '1' then - v.e.result_sel := "100"; -- pmuspr_result + if op = OP_MFSPR then + if is_fast_spr(d_in.ispr1) = '1' then + v.e.result_sel := "000"; -- adder_result, effectively a_in + elsif d_in.spr_info.valid = '0' then + -- Privileged mfspr to invalid/unimplemented SPR numbers + -- writes the contents of RT back to RT (i.e. it's a no-op) + v.e.result_sel := "001"; -- logical_result + elsif d_in.spr_info.ispmu = '1' then + v.e.result_sel := "100"; -- pmuspr_result + end if; end if; - end if; - -- See if any of the operands can get their value via the bypass path. - case gpr_a_bypass is - when "10" => - v.e.read_data1 := execute_bypass.data; - when "11" => - v.e.read_data1 := execute2_bypass.data; - when others => - v.e.read_data1 := decoded_reg_a.data; - end case; - case gpr_b_bypass is - when "10" => - v.e.read_data2 := execute_bypass.data; - when "11" => - v.e.read_data2 := execute2_bypass.data; - when others => - v.e.read_data2 := decoded_reg_b.data; - end case; - case gpr_c_bypass is - when "10" => - v.e.read_data3 := execute_bypass.data; - when "11" => - v.e.read_data3 := execute2_bypass.data; - when others => - v.e.read_data3 := decoded_reg_c.data; - end case; - - v.e.cr := c_in.read_cr_data; - if cr_bypass = "10" then - v.e.cr := execute_cr_bypass.data; - elsif cr_bypass = "11" then - v.e.cr := execute2_cr_bypass.data; + elsif dc2.e.valid = '1' then + -- dc2.busy = 1 and dc2.e.valid = 1, thus this must be a repeated instruction. 
+ -- Set up for the second iteration (if deferred = 1 this will all be ignored) + v.e.second := '1'; + case dc2.repeat is + when DUPD => + -- update-form loads, 2nd instruction writes RA + v.e.write_reg := dc2.e.read_reg1; + when NONE => + -- bcl/bclrl/bctarl that needs to write both CTR and LR + v.e.write_reg(0) := '0'; -- point to LR + v.e.result_sel := "110"; -- select NIA (to go to LR) + when others => + end case; end if; -- issue control - control_valid_in <= d_in.valid; - control_sgl_pipe <= d_in.decode.sgl_pipe; + control_valid_in <= valid_in; + control_sgl_pipe <= v.sgl_pipe; - gpr_write_valid <= v.e.write_reg_enable; - gpr_write <= decoded_reg_o.reg; + gpr_write_valid <= v.reg_o_valid; + gpr_write <= v.e.write_reg; - gpr_a_read_valid <= decoded_reg_a.reg_valid; - gpr_a_read <= decoded_reg_a.reg; + gpr_a_read_valid <= v.reg_a_valid; + gpr_a_read <= v.e.read_reg1; - gpr_b_read_valid <= decoded_reg_b.reg_valid; - gpr_b_read <= decoded_reg_b.reg; + gpr_b_read_valid <= v.reg_b_valid; + gpr_b_read <= v.e.read_reg2; - gpr_c_read_valid <= decoded_reg_c.reg_valid; - gpr_c_read <= decoded_reg_c.reg; + gpr_c_read_valid <= v.reg_c_valid; + gpr_c_read <= v.e.read_reg3; - cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); + cr_write_valid <= v.e.output_cr or v.e.rc; -- Since ops that write CR only write some of the fields, -- any op that writes CR effectively also reads it. - cr_read_valid <= cr_write_valid or d_in.decode.input_cr; + cr_read_valid <= cr_write_valid or v.e.input_cr; - v.e.valid := control_valid_out; - if control_valid_out = '1' then - v.repeat := v.e.repeat and not dc2.repeat; + -- See if any of the operands can get their value via the bypass path. 
+ if dc2.busy = '0' or gpr_a_bypass /= "00" then + case gpr_a_bypass is + when "01" => + v.e.read_data1 := execute_bypass.data; + when "10" => + v.e.read_data1 := execute2_bypass.data; + when "11" => + v.e.read_data1 := writeback_bypass.data; + when others => + if decoded_reg_a.reg_valid = '1' then + v.e.read_data1 := r_in.read1_data; + else + v.e.read_data1 := decoded_reg_a.data; + end if; + end case; + end if; + if dc2.busy = '0' or gpr_b_bypass /= "00" then + case gpr_b_bypass is + when "01" => + v.e.read_data2 := execute_bypass.data; + when "10" => + v.e.read_data2 := execute2_bypass.data; + when "11" => + v.e.read_data2 := writeback_bypass.data; + when others => + if decoded_reg_b.reg_valid = '1' then + v.e.read_data2 := r_in.read2_data; + else + v.e.read_data2 := decoded_reg_b.data; + end if; + end case; + end if; + if dc2.busy = '0' or gpr_c_bypass /= "00" then + case gpr_c_bypass is + when "01" => + v.e.read_data3 := execute_bypass.data; + when "10" => + v.e.read_data3 := execute2_bypass.data; + when "11" => + v.e.read_data3 := writeback_bypass.data; + when others => + if decoded_reg_c.reg_valid = '1' then + v.e.read_data3 := r_in.read3_data; + else + v.e.read_data3 := decoded_reg_c.data; + end if; + end case; end if; - stall_out <= control_stall_out or v.repeat; + case cr_bypass is + when "10" => + v.e.cr := execute_cr_bypass.data; + when "11" => + v.e.cr := execute2_cr_bypass.data; + when others => + v.e.cr := c_in.read_cr_data; + end case; - if rst = '1' or flush_in = '1' then - v.e := Decode2ToExecute1Init; - v.repeat := '0'; - end if; + v.e.valid := control_valid_out; + v.e.instr_tag := instr_tag; + v.busy := valid_in and (not control_valid_out or (v.e.repeat and not v.e.second)); + + stall_out <= dc2.busy or deferred; -- Update registers dc2in <= v; @@ -574,9 +628,9 @@ begin dc2.e.valid & stopped_out & stall_out & - (gpr_a_bypass(1) or gpr_a_bypass(0)) & - (gpr_b_bypass(1) or gpr_b_bypass(0)) & - (gpr_c_bypass(1) or gpr_c_bypass(0)); + 
(gpr_a_bypass(1) xor gpr_a_bypass(0)) & + (gpr_b_bypass(1) xor gpr_b_bypass(0)) & + (gpr_c_bypass(1) xor gpr_c_bypass(0)); end if; end process; log_out <= log_data; diff --git a/register_file.vhdl b/register_file.vhdl index ab35855..0235dfc 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -100,18 +100,8 @@ begin d_out.read2_data <= rd_port_b; d_out.read3_data <= registers(to_integer(unsigned(c_addr))); - -- Forward any written data - if w_in.write_enable = '1' then - if a_addr = w_addr then - d_out.read1_data <= w_in.write_data; - end if; - if b_addr = w_addr then - d_out.read2_data <= w_in.write_data; - end if; - if c_addr = w_addr then - d_out.read3_data <= w_in.write_data; - end if; - end if; + -- Forwarding of written data is now done explicitly with a bypass path + -- from writeback to decode2. end process register_read_0; -- Latch read data and ack if dbg read requested and B port not busy diff --git a/writeback.vhdl b/writeback.vhdl index db30164..0d6f41d 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -19,6 +19,8 @@ entity writeback is c_out : out WritebackToCrFileType; f_out : out WritebackToFetch1Type; + wb_bypass : out bypass_data_t; + -- PMU event bus events : out WritebackEventType; @@ -215,6 +217,11 @@ begin f_out <= f; flush_out <= f_out.redirect; + -- Register write data bypass to decode2 + wb_bypass.tag.tag <= complete_out.tag; + wb_bypass.tag.valid <= complete_out.valid and w_out.write_enable; + wb_bypass.data <= w_out.write_data; + rin <= v; end process; end;