diff --git a/common.vhdl b/common.vhdl index 8612389..9c8a942 100644 --- a/common.vhdl +++ b/common.vhdl @@ -109,6 +109,9 @@ package common is read_data1: std_ulogic_vector(63 downto 0); read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); + bypass_data1: std_ulogic; + bypass_data2: std_ulogic; + bypass_data3: std_ulogic; cr: std_ulogic_vector(31 downto 0); xerc: xer_common_t; lr: std_ulogic; @@ -126,7 +129,8 @@ package common is data_len: std_ulogic_vector(3 downto 0); end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := - (valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', oe => '0', invert_a => '0', + (valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', + lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0')); diff --git a/control.vhdl b/control.vhdl index ead3c1f..064ff98 100644 --- a/control.vhdl +++ b/control.vhdl @@ -21,6 +21,7 @@ entity control is gpr_write_valid_in : in std_ulogic; gpr_write_in : in gspr_index_t; + gpr_bypassable : in std_ulogic; gpr_a_read_valid_in : in std_ulogic; gpr_a_read_in : in gspr_index_t; @@ -36,7 +37,11 @@ entity control is valid_out : out std_ulogic; stall_out : out std_ulogic; - stopped_out : out std_ulogic + stopped_out : out std_ulogic; + + gpr_bypass_a : out std_ulogic; + gpr_bypass_b : out std_ulogic; + gpr_bypass_c : out std_ulogic ); end entity control; @@ -71,10 +76,12 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, + bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_a_read_valid_in, gpr_read_in => gpr_a_read_in, - stall_out => stall_a_out + stall_out => stall_a_out, + use_bypass => gpr_bypass_a ); gpr_hazard1: entity work.gpr_hazard @@ -87,10 +94,12 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, + bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_b_read_valid_in, gpr_read_in => gpr_b_read_in, - stall_out => stall_b_out + stall_out => stall_b_out, + use_bypass => gpr_bypass_b ); gpr_c_read_in_fmt <= "0" & gpr_c_read_in; @@ -105,10 +114,12 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, + bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_c_read_valid_in, gpr_read_in => gpr_c_read_in_fmt, - stall_out => stall_c_out + stall_out => stall_c_out, + use_bypass => gpr_bypass_c ); cr_hazard0: entity work.cr_hazard diff --git a/core.vhdl b/core.vhdl index a38cf36..aa86689 100644 --- a/core.vhdl +++ b/core.vhdl @@ -9,7 +9,8 @@ use work.wishbone_types.all; entity core is generic ( SIM : boolean := false; - DISABLE_FLATTEN : boolean := false + DISABLE_FLATTEN : boolean := false; + EX1_BYPASS : boolean := true ); port ( clk : in std_logic; @@ -176,6 +177,9 @@ begin decode1_stall_in <= decode2_stall_out; decode2_0: entity work.decode2 + generic map ( + EX1_BYPASS => EX1_BYPASS + ) port map ( clk => clk, rst => core_rst, @@ -220,6 +224,9 @@ begin ); execute1_0: entity work.execute1 + generic map ( + EX1_BYPASS => EX1_BYPASS + ) port map ( clk => clk, rst => core_rst, diff --git a/decode2.vhdl b/decode2.vhdl index 6cd4574..6e3bd8a 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -9,6 +9,9 @@ use work.helpers.all; use work.insn_helpers.all; entity decode2 is + generic ( + EX1_BYPASS : boolean := true + ); port ( clk : in std_ulogic; rst : in std_ulogic; @@ -184,15 +187,19 @@ architecture behaviour of decode2 is signal gpr_write_valid : std_ulogic; signal gpr_write : gspr_index_t; + signal gpr_bypassable : std_ulogic; signal gpr_a_read_valid : std_ulogic; signal gpr_a_read :gspr_index_t; + signal gpr_a_bypass : std_ulogic; signal gpr_b_read_valid : std_ulogic; signal gpr_b_read : gspr_index_t; + signal gpr_b_bypass : std_ulogic; signal gpr_c_read_valid : std_ulogic; signal gpr_c_read : gpr_index_t; + signal gpr_c_bypass : std_ulogic; signal cr_write_valid : std_ulogic; begin @@ -213,6 +220,7 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write, + gpr_bypassable => gpr_bypassable, gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -228,7 +236,11 @@ begin valid_out => control_valid_out, stall_out => stall_out, - stopped_out => stopped_out + stopped_out => stopped_out, + + gpr_bypass_a => gpr_a_bypass, + gpr_bypass_b => gpr_b_bypass, + gpr_bypass_c => gpr_c_bypass ); decode2_0: process(clk) @@ -295,9 +307,12 @@ begin v.e.insn_type := d_in.decode.insn_type; v.e.read_reg1 := decoded_reg_a.reg; v.e.read_data1 := decoded_reg_a.data; + v.e.bypass_data1 := gpr_a_bypass; v.e.read_reg2 := decoded_reg_b.reg; v.e.read_data2 := decoded_reg_b.data; + v.e.bypass_data2 := gpr_b_bypass; v.e.read_data3 := decoded_reg_c.data; + v.e.bypass_data3 := gpr_c_bypass; v.e.write_reg := decoded_reg_o.reg; v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then @@ -342,6 +357,10 @@ begin gpr_write_valid <= decoded_reg_o.reg_valid; gpr_write <= decoded_reg_o.reg; + gpr_bypassable <= '0'; + if EX1_BYPASS and d_in.decode.unit = ALU then + gpr_bypassable <= '1'; + end if; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; diff --git a/execute1.vhdl b/execute1.vhdl index 5a626f8..d63697c 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -11,6 +11,9 @@ use work.insn_helpers.all; use work.ppc_fx_insns.all; entity execute1 is + generic ( + EX1_BYPASS : boolean := true + ); port ( clk : in std_ulogic; rst : in std_ulogic; @@ -46,6 +49,8 @@ architecture behaviour of execute1 is signal r, rin : reg_type; + signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); + signal ctrl: ctrl_t := (others => (others => '0')); signal ctrl_tmp: ctrl_t := (others => (others => '0')); @@ -109,9 +114,9 @@ begin rotator_0: entity work.rotator port map ( - rs => e_in.read_data3, - ra => e_in.read_data1, - shift => e_in.read_data2(6 downto 0), + rs => c_in, + ra => a_in, + shift => b_in(6 downto 0), insn => e_in.insn, is_32bit => e_in.is_32bit, right_shift => right_shift, @@ -124,8 +129,8 @@ begin logical_0: entity work.logical port map ( - rs => e_in.read_data3, - rb => e_in.read_data2, + rs => c_in, + rb => b_in, op => e_in.insn_type, invert_in => e_in.invert_a, invert_out => e_in.invert_out, @@ -137,7 +142,7 @@ begin countzero_0: entity work.zero_counter port map ( - rs => e_in.read_data3, + rs => c_in, count_right => e_in.insn(10), is_32bit => e_in.is_32bit, result => countzero_result @@ -158,6 +163,10 @@ begin d_out => divider_to_x ); + a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1; + b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; + c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3; + execute1_0: process(clk) begin if rising_edge(clk) then @@ -256,21 +265,21 @@ begin if e_in.is_32bit = '1' then if e_in.is_signed = '1' then - x_to_multiply.data1 <= (others => e_in.read_data1(31)); - x_to_multiply.data1(31 downto 0) <= e_in.read_data1(31 downto 0); - x_to_multiply.data2 <= (others => e_in.read_data2(31)); - x_to_multiply.data2(31 downto 0) <= e_in.read_data2(31 downto 0); + x_to_multiply.data1 <= (others => a_in(31)); + x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0); + x_to_multiply.data2 <= (others => b_in(31)); + x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0); else - x_to_multiply.data1 <= '0' & x"00000000" & e_in.read_data1(31 downto 0); - x_to_multiply.data2 <= '0' & x"00000000" & e_in.read_data2(31 downto 0); + x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0); + x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0); end if; else if e_in.is_signed = '1' then - x_to_multiply.data1 <= e_in.read_data1(63) & e_in.read_data1; - x_to_multiply.data2 <= e_in.read_data2(63) & e_in.read_data2; + x_to_multiply.data1 <= a_in(63) & a_in; + x_to_multiply.data2 <= b_in(63) & b_in; else - x_to_multiply.data1 <= '0' & e_in.read_data1; - x_to_multiply.data2 <= '0' & e_in.read_data2; + x_to_multiply.data1 <= '0' & a_in; + x_to_multiply.data2 <= '0' & b_in; end if; end if; @@ -279,23 +288,23 @@ begin sign2 := '0'; if e_in.is_signed = '1' then if e_in.is_32bit = '1' then - sign1 := e_in.read_data1(31); - sign2 := e_in.read_data2(31); + sign1 := a_in(31); + sign2 := b_in(31); else - sign1 := e_in.read_data1(63); - sign2 := e_in.read_data2(63); + sign1 := a_in(63); + sign2 := b_in(63); end if; end if; -- take absolute values if sign1 = '0' then - abs1 := signed(e_in.read_data1); + abs1 := signed(a_in); else - abs1 := - signed(e_in.read_data1); + abs1 := - signed(a_in); end if; if sign2 = '0' then - abs2 := signed(e_in.read_data2); + abs2 := signed(b_in); else - abs2 := - signed(e_in.read_data2); + abs2 := - signed(b_in); end if; x_to_divider <= Execute1ToDividerInit; @@ -358,14 +367,14 @@ begin -- Do nothing when OP_ADD | OP_CMP => if e_in.invert_a = '0' then - a_inv := e_in.read_data1; + a_inv := a_in; else - a_inv := not e_in.read_data1; + a_inv := not a_in; end if; - result_with_carry := ppc_adde(a_inv, e_in.read_data2, + result_with_carry := ppc_adde(a_inv, b_in, decode_input_carry(e_in.input_carry, v.e.xerc)); result := result_with_carry(63 downto 0); - carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32); + carry_32 := result(32) xor a_inv(32) xor b_in(32); carry_64 := result_with_carry(64); if e_in.insn_type = OP_ADD then if e_in.output_carry = '1' then @@ -373,8 +382,8 @@ begin end if; if e_in.oe = '1' then set_ov(v.e, - calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)), - calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31))); + calc_ov(a_inv(63), b_in(63), carry_64, result_with_carry(63)), + calc_ov(a_inv(31), b_in(31), carry_32, result_with_carry(31))); end if; result_en := '1'; else @@ -385,20 +394,20 @@ begin v.e.write_cr_enable := '1'; crnum := to_integer(unsigned(bf)); v.e.write_cr_mask := num_to_fxm(crnum); - zerolo := not (or (e_in.read_data1(31 downto 0) xor e_in.read_data2(31 downto 0))); - zerohi := not (or (e_in.read_data1(63 downto 32) xor e_in.read_data2(63 downto 32))); + zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0))); + zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32))); if zerolo = '1' and (l = '0' or zerohi = '1') then -- values are equal newcrf := "001" & v.e.xerc.so; else if l = '1' then -- 64-bit comparison - msb_a := e_in.read_data1(63); - msb_b := e_in.read_data2(63); + msb_a := a_in(63); + msb_b := b_in(63); else -- 32-bit comparison - msb_a := e_in.read_data1(31); - msb_b := e_in.read_data2(31); + msb_a := a_in(31); + msb_b := b_in(31); end if; if msb_a /= msb_b then -- Subtraction might overflow, but @@ -424,25 +433,25 @@ begin when OP_B => f_out.redirect <= '1'; if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(b_in)); else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); end if; when OP_BC => -- read_data1 is CTR bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' then - result := std_ulogic_vector(unsigned(e_in.read_data1) - 1); + result := std_ulogic_vector(unsigned(a_in) - 1); result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then f_out.redirect <= '1'; if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(b_in)); else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); end if; end if; when OP_BCREG => @@ -451,40 +460,40 @@ begin bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' and e_in.insn(10) = '0' then - result := std_ulogic_vector(unsigned(e_in.read_data1) - 1); + result := std_ulogic_vector(unsigned(a_in) - 1); result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then f_out.redirect <= '1'; - f_out.redirect_nia <= e_in.read_data2(63 downto 2) & "00"; + f_out.redirect_nia <= b_in(63 downto 2) & "00"; end if; when OP_CMPB => - result := ppc_cmpb(e_in.read_data3, e_in.read_data2); + result := ppc_cmpb(c_in, b_in); result_en := '1'; when OP_CNTZ => result := countzero_result; result_en := '1'; when OP_EXTS => -- note data_len is a 1-hot encoding - negative := (e_in.data_len(0) and e_in.read_data3(7)) or - (e_in.data_len(1) and e_in.read_data3(15)) or - (e_in.data_len(2) and e_in.read_data3(31)); + negative := (e_in.data_len(0) and c_in(7)) or + (e_in.data_len(1) and c_in(15)) or + (e_in.data_len(2) and c_in(31)); result := (others => negative); if e_in.data_len(2) = '1' then - result(31 downto 16) := e_in.read_data3(31 downto 16); + result(31 downto 16) := c_in(31 downto 16); end if; if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then - result(15 downto 8) := e_in.read_data3(15 downto 8); + result(15 downto 8) := c_in(15 downto 8); end if; - result(7 downto 0) := e_in.read_data3(7 downto 0); + result(7 downto 0) := c_in(7 downto 0); result_en := '1'; when OP_ISEL => crbit := to_integer(unsigned(insn_bc(e_in.insn))); if e_in.cr(31-crbit) = '1' then - result := e_in.read_data1; + result := a_in; else - result := e_in.read_data2; + result := b_in; end if; result_en := '1'; when OP_MCRF => @@ -549,7 +558,7 @@ begin end if; when OP_MFSPR => if is_fast_spr(e_in.read_reg1) then - result := e_in.read_data1; + result := a_in; if decode_spr_num(e_in.insn) = SPR_XER then -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer result(63 downto 32) := (others => '0'); @@ -596,19 +605,19 @@ begin crnum := fxm_to_num(insn_fxm(e_in.insn)); v.e.write_cr_mask := num_to_fxm(crnum); end if; - v.e.write_cr_data := e_in.read_data3(31 downto 0); + v.e.write_cr_data := c_in(31 downto 0); when OP_MTSPR => report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & - "=" & to_hstring(e_in.read_data3); + "=" & to_hstring(c_in); if is_fast_spr(e_in.write_reg) then - result := e_in.read_data3; + result := c_in; result_en := '1'; if decode_spr_num(e_in.insn) = SPR_XER then - v.e.xerc.so := e_in.read_data3(63-32); - v.e.xerc.ov := e_in.read_data3(63-33); - v.e.xerc.ca := e_in.read_data3(63-34); - v.e.xerc.ov32 := e_in.read_data3(63-44); - v.e.xerc.ca32 := e_in.read_data3(63-45); + v.e.xerc.so := c_in(63-32); + v.e.xerc.ov := c_in(63-33); + v.e.xerc.ca := c_in(63-34); + v.e.xerc.ov32 := c_in(63-44); + v.e.xerc.ca32 := c_in(63-45); v.e.write_xerc_enable := '1'; end if; else diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl index 705e69d..de4f7d2 100644 --- a/gpr_hazard.vhdl +++ b/gpr_hazard.vhdl @@ -12,18 +12,21 @@ entity gpr_hazard is gpr_write_valid_in : in std_ulogic; gpr_write_in : in std_ulogic_vector(5 downto 0); + bypass_avail : in std_ulogic; gpr_read_valid_in : in std_ulogic; gpr_read_in : in std_ulogic_vector(5 downto 0); - stall_out : out std_ulogic + stall_out : out std_ulogic; + use_bypass : out std_ulogic ); end entity gpr_hazard; architecture behaviour of gpr_hazard is type pipeline_entry_type is record - valid : std_ulogic; - gpr : std_ulogic_vector(5 downto 0); + valid : std_ulogic; + bypass : std_ulogic; + gpr : std_ulogic_vector(5 downto 0); end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0', gpr => (others => '0')); + constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0')); type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type; constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); @@ -33,9 +36,7 @@ begin gpr_hazard0: process(clk) begin if rising_edge(clk) then - if stall_in = '0' then - r <= rin; - end if; + r <= rin; end if; end process; @@ -45,22 +46,49 @@ begin v := r; stall_out <= '0'; - loop_0: for i in 0 to PIPELINE_DEPTH-1 loop - if ((r(i).valid = gpr_read_valid_in) and r(i).gpr = gpr_read_in) then - stall_out <= '1'; + use_bypass <= '0'; + if gpr_read_valid_in = '1' then + if r(0).valid = '1' and r(0).gpr = gpr_read_in then + if r(0).bypass = '1' and stall_in = '0' then + use_bypass <= '1'; + else + stall_out <= '1'; + end if; end if; - end loop; + loop_0: for i in 1 to PIPELINE_DEPTH-1 loop + if r(i).valid = '1' and r(i).gpr = gpr_read_in then + if r(i).bypass = '1' then + use_bypass <= '1'; + else + stall_out <= '1'; + end if; + end if; + end loop; + end if; - v(0).valid := gpr_write_valid_in; - v(0).gpr := gpr_write_in; - loop_1: for i in 0 to PIPELINE_DEPTH-2 loop - -- propagate to next slot - v(i+1) := r(i); - end loop; + if stall_in = '0' then + v(0).valid := gpr_write_valid_in; + v(0).bypass := bypass_avail; + v(0).gpr := gpr_write_in; + loop_1: for i in 1 to PIPELINE_DEPTH-1 loop + -- propagate to next slot + v(i).valid := r(i-1).valid; + v(i).bypass := r(i-1).bypass; + v(i).gpr := r(i-1).gpr; + end loop; - -- asynchronous output - if gpr_read_valid_in = '0' then - stall_out <= '0'; + else + -- stage 0 stalled, so stage 1 becomes empty + loop_1b: for i in 1 to PIPELINE_DEPTH-1 loop + -- propagate to next slot + if i = 1 then + v(i).valid := '0'; + else + v(i).valid := r(i-1).valid; + v(i).bypass := r(i-1).bypass; + v(i).gpr := r(i-1).gpr; + end if; + end loop; end if; -- update registers