-- Patch summary: adds a simple static branch predictor to the decode1 stage
-- and makes execute1 redirect fetch only when a prediction was wrong.
--
-- common.vhdl
--   * Decode1ToDecode2Type and Decode2ToExecute1Type each gain a br_pred bit
--     ("Branch was predicted to be taken"); both Init constants set it to '0'.
--   * New record Decode1ToFetch1Type carries redirect / redirect_nia.
-- core.vhdl
--   * New signals decode1_to_fetch1, decode1_flush and fetch1_flush.
--   * fetch1_flush <= flush or decode1_flush, and it now drives flush_in of
--     both fetch1 and the icache; decode1 itself still receives plain flush.
-- decode1.vhdl
--   * New ports flush_out and f_out; v now starts from Decode1ToDecode2Init
--     instead of from r.
--   * Prediction: major opcode 18 is always predicted taken; major opcode 16
--     is predicted taken when insn(15) is set (backward displacement).
--     bclr, bcctr and bctar are predicted not taken.
--   * Target = nia(63 downto 2), or zero when insn(1) is set, plus the
--     sign-extended word offset, with "00" appended.
--   * The redirect is raised only when f_in.valid is set and neither flush_in
--     nor s.valid is set; flush_out mirrors f.redirect.
-- decode2.vhdl
--   * Forwards br_pred to execute1, and when v.e.lr = '1' reports a write of
--     fast_spr_num(SPR_LR) through update_gpr_write_valid / update_gpr_write_reg.
-- execute1.vhdl
--   * OP_B and OP_BC only record is_branch / taken_branch / abs_branch; a
--     redirect is issued afterwards iff taken_branch /= e_in.br_pred, going
--     to the branch target when taken and to next_nia otherwise.
--   * OP_BCREG still redirects directly when the branch is taken.
--   * With e_in.lr = '1', LR is written in the same cycle when result_en is
--     '0'; otherwise the existing delayed lr_update path is used.
-- fetch1.vhdl
--   * New d_in port; d_in.redirect is tested in an elsif that follows the
--     branch loading e_in.redirect_nia, and it updates v.nia only.
--   * The log line prints e_in.redirect followed by d_in.redirect.
-- ppc_fx_insns.vhdl
--   * ppc_bc_taken now returns std_ulogic (ctr_ok and cond_ok), not integer.
--
-- NOTE(review): the hunks below appear with their line breaks collapsed into
-- spaces; presumably the original newlines must be restored before this can
-- be fed to a patch tool -- confirm against the upstream commit.
diff --git a/common.vhdl b/common.vhdl index 31bd920..52222c3 100644 --- a/common.vhdl +++ b/common.vhdl @@ -113,8 +113,16 @@ package common is ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) decode: decode_rom_t; + br_pred: std_ulogic; -- Branch was predicted to be taken + end record; + constant Decode1ToDecode2Init : Decode1ToDecode2Type := + (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), + ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init, br_pred => '0'); + + type Decode1ToFetch1Type is record + redirect : std_ulogic; + redirect_nia : std_ulogic_vector(63 downto 0); end record; - constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init); type Decode2ToExecute1Type is record valid: std_ulogic; @@ -149,12 +157,13 @@ package common is sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? 
reserve : std_ulogic; -- set for larx/stcx + br_pred : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', - is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', + is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0')); type Execute1ToMultiplyType is record diff --git a/core.vhdl b/core.vhdl index 019660c..092df6d 100644 --- a/core.vhdl +++ b/core.vhdl @@ -48,6 +48,7 @@ architecture behave of core is -- decode signals signal decode1_to_decode2: Decode1ToDecode2Type; + signal decode1_to_fetch1: Decode1ToFetch1Type; signal decode2_to_execute1: Decode2ToExecute1Type; -- register file signals @@ -90,6 +91,8 @@ architecture behave of core is signal dcache_stall_out: std_ulogic; signal flush: std_ulogic; + signal decode1_flush: std_ulogic; + signal fetch1_flush: std_ulogic; signal complete: std_ulogic; signal terminate: std_ulogic; @@ -182,14 +185,16 @@ begin rst => rst_fetch1, alt_reset_in => alt_reset_d, stall_in => fetch1_stall_in, - flush_in => flush, + flush_in => fetch1_flush, stop_in => dbg_core_stop, + d_in => decode1_to_fetch1, e_in => execute1_to_fetch1, i_out => fetch1_to_icache, log_out => log_data(42 downto 0) ); fetch1_stall_in <= icache_stall_out or decode1_busy; + fetch1_flush <= flush or decode1_flush; icache_0: entity work.icache generic map( @@ -204,7 +209,7 @@ begin i_in => fetch1_to_icache, i_out => icache_to_decode1, m_in => mmu_to_icache, - 
flush_in => flush, + flush_in => fetch1_flush, inval_in => dbg_icache_rst or ex1_icache_inval, stall_in => icache_stall_in, stall_out => icache_stall_out, @@ -221,9 +226,11 @@ begin rst => rst_dec1, stall_in => decode1_stall_in, flush_in => flush, + flush_out => decode1_flush, busy_out => decode1_busy, f_in => icache_to_decode1, d_out => decode1_to_decode2, + f_out => decode1_to_fetch1, log_out => log_data(109 downto 97) ); diff --git a/decode1.vhdl b/decode1.vhdl index f72d310..2060e64 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -8,16 +8,18 @@ use work.decode_types.all; entity decode1 is port ( - clk : in std_ulogic; - rst : in std_ulogic; - - stall_in : in std_ulogic; - flush_in : in std_ulogic; - busy_out : out std_ulogic; - - f_in : in IcacheToDecode1Type; - d_out : out Decode1ToDecode2Type; - log_out : out std_ulogic_vector(12 downto 0) + clk : in std_ulogic; + rst : in std_ulogic; + + stall_in : in std_ulogic; + flush_in : in std_ulogic; + busy_out : out std_ulogic; + flush_out : out std_ulogic; + + f_in : in IcacheToDecode1Type; + f_out : out Decode1ToFetch1Type; + d_out : out Decode1ToDecode2Type; + log_out : out std_ulogic_vector(12 downto 0) ); end entity decode1; @@ -385,11 +387,15 @@ begin decode1_1: process(all) variable v : Decode1ToDecode2Type; + variable f : Decode1ToFetch1Type; variable majorop : major_opcode_t; variable op_19_bits: std_ulogic_vector(2 downto 0); variable sprn : spr_num_t; + variable br_nia : std_ulogic_vector(61 downto 0); + variable br_target : std_ulogic_vector(61 downto 0); + variable br_offset : signed(23 downto 0); begin - v := r; + v := Decode1ToDecode2Init; v.valid := f_in.valid; v.nia := f_in.nia; @@ -486,14 +492,36 @@ begin else v.decode := major_decode_rom_array(to_integer(majorop)); + end if; + -- Branch predictor + -- Note bclr, bcctr and bctar are predicted not taken as we have no + -- count cache or link stack. 
+ br_offset := (others => '0'); + if majorop = 18 then + -- Unconditional branches are always taken + v.br_pred := '1'; + br_offset := signed(f_in.insn(25 downto 2)); + elsif majorop = 16 then + -- Predict backward branches as taken, forward as untaken + v.br_pred := f_in.insn(15); + br_offset := resize(signed(f_in.insn(15 downto 2)), 24); + end if; + br_nia := f_in.nia(63 downto 2); + if f_in.insn(1) = '1' then + br_nia := (others => '0'); end if; + br_target := std_ulogic_vector(signed(br_nia) + br_offset); + f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid; + f.redirect_nia := br_target & "00"; -- Update registers rin <= v; -- Update outputs d_out <= r; + f_out <= f; + flush_out <= f.redirect; end process; dec1_log : process(clk) diff --git a/decode2.vhdl b/decode2.vhdl index 6acbca7..80687a0 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -358,6 +358,7 @@ begin v.e.sign_extend := d_in.decode.sign_extend; v.e.update := d_in.decode.update; v.e.reserve := d_in.decode.reserve; + v.e.br_pred := d_in.br_pred; -- issue control control_valid_in <= d_in.valid; @@ -371,6 +372,11 @@ begin end if; update_gpr_write_valid <= d_in.decode.update; update_gpr_write_reg <= decoded_reg_a.reg; + if v.e.lr = '1' then + -- there are no instructions that have both update=1 and lr=1 + update_gpr_write_valid <= '1'; + update_gpr_write_reg <= fast_spr_num(SPR_LR); + end if; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; diff --git a/execute1.vhdl b/execute1.vhdl index edbeaaa..12d3df1 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -305,11 +305,17 @@ begin variable exception_nextpc : std_ulogic; variable trapval : std_ulogic_vector(4 downto 0); variable illegal : std_ulogic; + variable is_branch : std_ulogic; + variable taken_branch : std_ulogic; + variable abs_branch : std_ulogic; begin result := (others => '0'); result_with_carry := (others => '0'); result_en := '0'; newcrf := (others => '0'); + is_branch := '0'; + taken_branch := 
'0'; + abs_branch := '0'; v := r; v.e := Execute1ToWritebackInit; @@ -625,12 +631,9 @@ begin result := logical_result; result_en := '1'; when OP_B => - f_out.redirect <= '1'; - if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= b_in; - else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); - end if; + is_branch := '1'; + taken_branch := '1'; + abs_branch := insn_aa(e_in.insn); when OP_BC => -- read_data1 is CTR bo := insn_bo(e_in.insn); @@ -640,14 +643,9 @@ begin result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then - f_out.redirect <= '1'; - if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= b_in; - else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); - end if; - end if; + is_branch := '1'; + taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in); + abs_branch := insn_aa(e_in.insn); when OP_BCREG => -- read_data1 is CTR -- read_data2 is target register (CTR, LR or TAR) @@ -658,7 +656,7 @@ begin result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, a_in) = '1' then f_out.redirect <= '1'; f_out.redirect_nia <= b_in(63 downto 2) & "00"; end if; @@ -903,23 +901,35 @@ begin v.e.rc := e_in.rc and valid_in; + -- Mispredicted branches cause a redirect + if is_branch = '1' and taken_branch /= e_in.br_pred then + f_out.redirect <= '1'; + if taken_branch = '1' then + if abs_branch = '1' then + f_out.redirect_nia <= b_in; + else + f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); + end if; + else + f_out.redirect_nia <= next_nia; + end if; + end if; + -- Update LR on the next cycle after a branch link - -- - -- WARNING: The LR update isn't tracked by our hazard tracker. This - -- will work (well I hope) because it only happens on branches - -- which will flush all decoded instructions. By the time - -- fetch catches up, we'll have the new LR. 
This will - -- *not* work properly however if we have a branch predictor, - -- in which case the solution would probably be to keep a - -- local cache of the updated LR in execute1 (flushed on - -- exceptions) that is used instead of the value from - -- decode when its content is valid. + -- If we're not writing back anything else, we can write back LR + -- this cycle, otherwise we take an extra cycle. if e_in.lr = '1' then - v.lr_update := '1'; - v.next_lr := next_nia; - v.e.valid := '0'; - report "Delayed LR update to " & to_hstring(next_nia); - v.busy := '1'; + if result_en = '0' then + result_en := '1'; + result := next_nia; + v.e.write_reg := fast_spr_num(SPR_LR); + else + v.lr_update := '1'; + v.next_lr := next_nia; + v.e.valid := '0'; + report "Delayed LR update to " & to_hstring(next_nia); + v.busy := '1'; + end if; end if; elsif valid_in = '1' then diff --git a/fetch1.vhdl b/fetch1.vhdl index 93a2293..0d9c6f7 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -23,6 +23,9 @@ entity fetch1 is -- redirect from execution unit e_in : in Execute1ToFetch1Type; + -- redirect from decode1 + d_in : in Decode1ToFetch1Type; + -- Request to icache i_out : out Fetch1ToIcacheType; @@ -49,7 +52,7 @@ begin report "fetch1 rst:" & std_ulogic'image(rst) & " IR:" & std_ulogic'image(e_in.virt_mode) & " P:" & std_ulogic'image(e_in.priv_mode) & - " R:" & std_ulogic'image(e_in.redirect) & + " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & " nia:" & to_hstring(r_next.nia) & @@ -83,6 +86,8 @@ begin v.nia := e_in.redirect_nia; v.virt_mode := e_in.virt_mode; v.priv_mode := e_in.priv_mode; + elsif d_in.redirect = '1' then + v.nia := d_in.redirect_nia; elsif stall_in = '0' then -- For debug stop/step to work properly we need a little bit of diff --git a/ppc_fx_insns.vhdl b/ppc_fx_insns.vhdl index 0bf011d..5fdf1c7 100644 --- a/ppc_fx_insns.vhdl +++ b/ppc_fx_insns.vhdl @@ -93,7 +93,7 @@ package 
ppc_fx_insns is function ppc_divd (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; function ppc_divwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; - function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer; + function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return std_ulogic; end package ppc_fx_insns; package body ppc_fx_insns is @@ -785,13 +785,12 @@ package body ppc_fx_insns is return std_ulogic_vector(resize(tmp, ra'length)); end; - function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer is + function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return std_ulogic is variable crfield: integer; variable crbit_match: std_ulogic; variable ctr_not_zero: std_ulogic; variable ctr_ok: std_ulogic; variable cond_ok: std_ulogic; - variable ret: integer; begin crfield := to_integer(unsigned(bi)); -- BE bit numbering @@ -800,12 +799,7 @@ package body ppc_fx_insns is ctr_not_zero := '1' when ctr /= x"0000000000000001" else '0'; ctr_ok := bo(4-2) or (ctr_not_zero xor bo(4-3)); cond_ok := bo(4-0) or crbit_match; - if ctr_ok = '1' and cond_ok = '1' then - ret := 1; - else - ret := 0; - end if; - return ret; + return ctr_ok and cond_ok; end; end package body ppc_fx_insns;