From 74062195ca9cb74119c81e5978315ac149fe515d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 19 Jun 2020 20:00:16 +1000 Subject: [PATCH] execute1: Do forwarding of the CR result to the next instruction This adds a path to allow the CR result of one instruction to be forwarded to the next instruction, so that sequences such as cmp; bc can avoid having a 1-cycle bubble. Forwarding is not available for dot-form (Rc=1) instructions, since the CR result for them is calculated in writeback. The decode.output_cr field is used to identify those instructions that compute the CR result in execute1. For some reason, the multiply instructions incorrectly had output_cr = 1 in the decode tables. This fixes that. Signed-off-by: Paul Mackerras --- common.vhdl | 3 ++- control.vhdl | 8 ++++++-- cr_hazard.vhdl | 25 +++++++++++++++++++++---- decode1.vhdl | 26 +++++++++++++------------- decode2.vhdl | 14 ++++++++++++-- execute1.vhdl | 27 +++++++++++++++++++-------- 6 files changed, 73 insertions(+), 30 deletions(-) diff --git a/common.vhdl b/common.vhdl index 16d38c5..18378d5 100644 --- a/common.vhdl +++ b/common.vhdl @@ -151,6 +151,7 @@ package common is bypass_data2: std_ulogic; bypass_data3: std_ulogic; cr: std_ulogic_vector(31 downto 0); + bypass_cr : std_ulogic; xerc: xer_common_t; lr: std_ulogic; rc: std_ulogic; @@ -173,7 +174,7 @@ package common is end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', - lr => '0', rc => '0', oe => '0', invert_a => '0', + bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0')); diff --git a/control.vhdl b/control.vhdl index 5e557c4..d04576a 100644 --- a/control.vhdl +++ b/control.vhdl @@ -38,6 +38,7 @@ entity control is cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; + cr_bypassable : in std_ulogic; valid_out : out std_ulogic; stall_out : out std_ulogic; @@ -45,7 +46,8 @@ entity control is gpr_bypass_a : out std_ulogic; gpr_bypass_b : out std_ulogic; - gpr_bypass_c : out std_ulogic + gpr_bypass_c : out std_ulogic; + cr_bypass : out std_ulogic ); end entity control; @@ -161,8 +163,10 @@ begin cr_read_in => cr_read_in, cr_write_in => cr_write_valid, + bypassable => cr_bypassable, - stall_out => cr_stall_out + stall_out => cr_stall_out, + use_bypass => cr_bypass ); control0: process(clk) diff --git a/cr_hazard.vhdl b/cr_hazard.vhdl index 4b79020..a6203a8 100644 --- a/cr_hazard.vhdl +++ b/cr_hazard.vhdl @@ -16,15 +16,18 @@ entity cr_hazard is cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; + bypassable : in std_ulogic; - stall_out : out std_ulogic + stall_out : out std_ulogic; + use_bypass : out std_ulogic ); end entity cr_hazard; architecture behaviour of cr_hazard is type pipeline_entry_type is record - valid : std_ulogic; + valid : std_ulogic; + bypass : std_ulogic; end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0'); + constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0'); type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); @@ -47,7 +50,20 @@ begin if complete_in = '1' then v(1).valid := '0'; end if; - stall_out <= cr_read_in and (v(0).valid or v(1).valid); + + use_bypass <= '0'; + stall_out <= '0'; + if cr_read_in = '1' then + loop_0: for i in 0 to PIPELINE_DEPTH loop + if v(i).valid = '1' then + if r(i).bypass = '1' then + use_bypass <= '1'; + else + stall_out <= '1'; + end if; + end if; + end loop; + end if; -- XXX assumes PIPELINE_DEPTH = 1 if busy_in = '0' then @@ -56,6 +72,7 @@ begin end if; if deferred = '0' and issuing = '1' then v(0).valid := cr_write_in; + v(0).bypass := bypassable; end if; if flush_in = '1' then v(0).valid := '0'; diff --git a/decode1.vhdl b/decode1.vhdl index d215e7e..29b7a05 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -60,7 +60,7 @@ architecture behaviour of decode1 is 41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzu 32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwz 33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzu - 7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli + 7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli 24 => (ALU, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori 25 => (ALU, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris 20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi @@ -262,19 +262,19 @@ architecture behaviour of decode1 is 2#0010010000# => (ALU, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf 2#0010110010# => (ALU, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mtmsrd # ignore top bits and d 2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr - 2#0001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd - 2#0000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu - 2#0001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw - 2#0000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu + 2#0001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd + 2#0000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu + 2#0001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw + 2#0000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu -- next 4 have reserved bit set - 2#1001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd - 2#1000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu - 2#1001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw - 2#1000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu - 2#0011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulld - 2#1011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulldo - 2#0011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullw - 2#1011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullwo + 2#1001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd + 2#1000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu + 2#1001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw + 2#1000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu + 2#0011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulld + 2#1011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulldo + 2#0011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullw + 2#1011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullwo 2#0111011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nand 2#0001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- neg 2#1001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nego diff --git a/decode2.vhdl b/decode2.vhdl index 80687a0..d724874 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -213,7 +213,10 @@ architecture behaviour of decode2 is signal gpr_c_read : gpr_index_t; signal gpr_c_bypass : std_ulogic; - signal cr_write_valid : std_ulogic; + signal cr_write_valid : std_ulogic; + signal cr_bypass : std_ulogic; + signal cr_bypass_avail : std_ulogic; + begin control_0: entity work.control generic map ( @@ -248,7 +251,9 @@ begin gpr_c_read_in => gpr_c_read, cr_read_in => d_in.decode.input_cr, - cr_write_in => cr_write_valid, + cr_write_in => cr_write_valid, + cr_bypass => cr_bypass, + cr_bypassable => cr_bypass_avail, valid_out => control_valid_out, stall_out => stall_out, @@ -342,6 +347,7 @@ begin v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); end if; v.e.cr := c_in.read_cr_data; + v.e.bypass_cr := cr_bypass; v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.invert_out := d_in.decode.invert_out; @@ -388,6 +394,10 @@ begin gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg); cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); + cr_bypass_avail <= '0'; + if EX1_BYPASS then + cr_bypass_avail <= d_in.decode.output_cr; + end if; v.e.valid := control_valid_out; if d_in.decode.unit = NONE then diff --git a/execute1.vhdl b/execute1.vhdl index c68857e..a1cd008 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -74,6 +74,7 @@ architecture behaviour of execute1 is signal r, rin : reg_type; signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); + signal cr_in : std_ulogic_vector(31 downto 0); signal valid_in : std_ulogic; signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); @@ -355,6 +356,16 @@ begin v.e.xerc := e_in.xerc; end if; + -- CR forwarding + cr_in <= e_in.cr; + if EX1_BYPASS and e_in.bypass_cr = '1' and r.e.write_cr_enable = '1' then + for i in 0 to 7 loop + if r.e.write_cr_mask(i) = '1' then + cr_in(i * 4 + 3 downto i * 4) <= r.e.write_cr_data(i * 4 + 3 downto i * 4); + end if; + end loop; + end if; + v.lr_update := '0'; v.mul_in_progress := '0'; v.div_in_progress := '0'; @@ -635,7 +646,7 @@ begin v.e.write_reg := fast_spr_num(SPR_CTR); end if; is_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in); + taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); abs_branch := insn_aa(e_in.insn); when OP_BCREG => -- read_data1 is CTR @@ -648,7 +659,7 @@ begin v.e.write_reg := fast_spr_num(SPR_CTR); end if; is_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in); + taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); abs_branch := '1'; when OP_RFID => @@ -675,7 +686,7 @@ begin v.busy := '1'; when OP_ISEL => crbit := to_integer(unsigned(insn_bc(e_in.insn))); - if e_in.cr(31-crbit) = '1' then + if cr_in(31-crbit) = '1' then result := a_in; else result := b_in; @@ -695,7 +706,7 @@ begin lo := (7-i)*4; hi := lo + 3; if i = scrnum then - newcrf := e_in.cr(hi downto lo); + newcrf := cr_in(hi downto lo); end if; end loop; for i in 0 to 7 loop @@ -713,14 +724,14 @@ begin bbnum := 31 - to_integer(unsigned(bb)); -- Bits 5-8 of cr_op give the truth table of the requested -- logical operation - cr_operands := e_in.cr(banum) & e_in.cr(bbnum); + cr_operands := cr_in(banum) & cr_in(bbnum); crresult := cr_op(5 + to_integer(unsigned(cr_operands))); v.e.write_cr_mask := num_to_fxm((31-btnum) / 4); for i in 0 to 31 loop if i = btnum then v.e.write_cr_data(i) := crresult; else - v.e.write_cr_data(i) := e_in.cr(i); + v.e.write_cr_data(i) := cr_in(i); end if; end loop; end if; @@ -772,7 +783,7 @@ begin when OP_MFCR => if e_in.insn(20) = '0' then -- mfcr - result := x"00000000" & e_in.cr; + result := x"00000000" & cr_in; else -- mfocrf crnum := fxm_to_num(insn_fxm(e_in.insn)); @@ -781,7 +792,7 @@ begin lo := (7-i)*4; hi := lo + 3; if crnum = i then - result(hi downto lo) := e_in.cr(hi downto lo); + result(hi downto lo) := cr_in(hi downto lo); end if; end loop; end if;