core: Implement a simple branch predictor

This implements a simple branch predictor in the decode1 stage.  If it
sees that the instruction is b or bc and the branch is predicted to be
taken, it sends a flush and redirect upstream (to icache and fetch1)
to redirect fetching to the branch target.  The prediction is sent
downstream with the branch instruction, and execute1 now only sends
a flush/redirect upstream if the prediction was wrong.  Unconditional
branches are always predicted to be taken, and conditional branches
are predicted to be taken if and only if the offset is negative.
Branches that take the branch address from a register (bclr, bcctr)
are predicted not taken, as we don't have any way to predict the
branch address.

Since we can now have an mflr being executed immediately after a bl
or bcl, we now track the update to LR in the hazard tracker, using
the second write register field that is used to track RA updates for
update-form loads and stores.

For those branches that update LR but don't write any other result
(i.e. that don't decrement CTR), we now write back LR in the same
cycle as the instruction rather than taking a second cycle for the
LR writeback.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/208/head
Paul Mackerras 4 years ago
parent 09ae2ce58d
commit 6687aae4d6

@ -113,8 +113,16 @@ package common is
ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr
ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR)
decode: decode_rom_t; decode: decode_rom_t;
br_pred: std_ulogic; -- Branch was predicted to be taken
end record;
constant Decode1ToDecode2Init : Decode1ToDecode2Type :=
(valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'),
ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init, br_pred => '0');

type Decode1ToFetch1Type is record
redirect : std_ulogic;
redirect_nia : std_ulogic_vector(63 downto 0);
end record; end record;
constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init);


type Decode2ToExecute1Type is record type Decode2ToExecute1Type is record
valid: std_ulogic; valid: std_ulogic;
@ -149,12 +157,13 @@ package common is
sign_extend : std_ulogic; -- do we need to sign extend? sign_extend : std_ulogic; -- do we need to sign extend?
update : std_ulogic; -- is this an update instruction? update : std_ulogic; -- is this an update instruction?
reserve : std_ulogic; -- set for larx/stcx reserve : std_ulogic; -- set for larx/stcx
br_pred : std_ulogic;
end record; end record;
constant Decode2ToExecute1Init : Decode2ToExecute1Type := constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
(valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', (valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
lr => '0', rc => '0', oe => '0', invert_a => '0', lr => '0', rc => '0', oe => '0', invert_a => '0',
invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0')); byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0'));


type Execute1ToMultiplyType is record type Execute1ToMultiplyType is record

@ -48,6 +48,7 @@ architecture behave of core is


-- decode signals -- decode signals
signal decode1_to_decode2: Decode1ToDecode2Type; signal decode1_to_decode2: Decode1ToDecode2Type;
signal decode1_to_fetch1: Decode1ToFetch1Type;
signal decode2_to_execute1: Decode2ToExecute1Type; signal decode2_to_execute1: Decode2ToExecute1Type;


-- register file signals -- register file signals
@ -90,6 +91,8 @@ architecture behave of core is
signal dcache_stall_out: std_ulogic; signal dcache_stall_out: std_ulogic;


signal flush: std_ulogic; signal flush: std_ulogic;
signal decode1_flush: std_ulogic;
signal fetch1_flush: std_ulogic;


signal complete: std_ulogic; signal complete: std_ulogic;
signal terminate: std_ulogic; signal terminate: std_ulogic;
@ -182,14 +185,16 @@ begin
rst => rst_fetch1, rst => rst_fetch1,
alt_reset_in => alt_reset_d, alt_reset_in => alt_reset_d,
stall_in => fetch1_stall_in, stall_in => fetch1_stall_in,
flush_in => flush, flush_in => fetch1_flush,
stop_in => dbg_core_stop, stop_in => dbg_core_stop,
d_in => decode1_to_fetch1,
e_in => execute1_to_fetch1, e_in => execute1_to_fetch1,
i_out => fetch1_to_icache, i_out => fetch1_to_icache,
log_out => log_data(42 downto 0) log_out => log_data(42 downto 0)
); );


fetch1_stall_in <= icache_stall_out or decode1_busy; fetch1_stall_in <= icache_stall_out or decode1_busy;
fetch1_flush <= flush or decode1_flush;


icache_0: entity work.icache icache_0: entity work.icache
generic map( generic map(
@ -204,7 +209,7 @@ begin
i_in => fetch1_to_icache, i_in => fetch1_to_icache,
i_out => icache_to_decode1, i_out => icache_to_decode1,
m_in => mmu_to_icache, m_in => mmu_to_icache,
flush_in => flush, flush_in => fetch1_flush,
inval_in => dbg_icache_rst or ex1_icache_inval, inval_in => dbg_icache_rst or ex1_icache_inval,
stall_in => icache_stall_in, stall_in => icache_stall_in,
stall_out => icache_stall_out, stall_out => icache_stall_out,
@ -221,9 +226,11 @@ begin
rst => rst_dec1, rst => rst_dec1,
stall_in => decode1_stall_in, stall_in => decode1_stall_in,
flush_in => flush, flush_in => flush,
flush_out => decode1_flush,
busy_out => decode1_busy, busy_out => decode1_busy,
f_in => icache_to_decode1, f_in => icache_to_decode1,
d_out => decode1_to_decode2, d_out => decode1_to_decode2,
f_out => decode1_to_fetch1,
log_out => log_data(109 downto 97) log_out => log_data(109 downto 97)
); );



@ -8,16 +8,18 @@ use work.decode_types.all;


entity decode1 is entity decode1 is
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
rst : in std_ulogic; rst : in std_ulogic;


stall_in : in std_ulogic; stall_in : in std_ulogic;
flush_in : in std_ulogic; flush_in : in std_ulogic;
busy_out : out std_ulogic; busy_out : out std_ulogic;

flush_out : out std_ulogic;
f_in : in IcacheToDecode1Type;
d_out : out Decode1ToDecode2Type; f_in : in IcacheToDecode1Type;
log_out : out std_ulogic_vector(12 downto 0) f_out : out Decode1ToFetch1Type;
d_out : out Decode1ToDecode2Type;
log_out : out std_ulogic_vector(12 downto 0)
); );
end entity decode1; end entity decode1;


@ -385,11 +387,15 @@ begin


decode1_1: process(all) decode1_1: process(all)
variable v : Decode1ToDecode2Type; variable v : Decode1ToDecode2Type;
variable f : Decode1ToFetch1Type;
variable majorop : major_opcode_t; variable majorop : major_opcode_t;
variable op_19_bits: std_ulogic_vector(2 downto 0); variable op_19_bits: std_ulogic_vector(2 downto 0);
variable sprn : spr_num_t; variable sprn : spr_num_t;
variable br_nia : std_ulogic_vector(61 downto 0);
variable br_target : std_ulogic_vector(61 downto 0);
variable br_offset : signed(23 downto 0);
begin begin
v := r; v := Decode1ToDecode2Init;


v.valid := f_in.valid; v.valid := f_in.valid;
v.nia := f_in.nia; v.nia := f_in.nia;
@ -486,14 +492,36 @@ begin


else else
v.decode := major_decode_rom_array(to_integer(majorop)); v.decode := major_decode_rom_array(to_integer(majorop));
end if;


-- Branch predictor
-- Note bclr, bcctr and bctar are predicted not taken as we have no
-- count cache or link stack.
br_offset := (others => '0');
if majorop = 18 then
-- Unconditional branches are always taken
v.br_pred := '1';
br_offset := signed(f_in.insn(25 downto 2));
elsif majorop = 16 then
-- Predict backward branches as taken, forward as untaken
v.br_pred := f_in.insn(15);
br_offset := resize(signed(f_in.insn(15 downto 2)), 24);
end if;
br_nia := f_in.nia(63 downto 2);
if f_in.insn(1) = '1' then
br_nia := (others => '0');
end if; end if;
br_target := std_ulogic_vector(signed(br_nia) + br_offset);
f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid;
f.redirect_nia := br_target & "00";


-- Update registers -- Update registers
rin <= v; rin <= v;


-- Update outputs -- Update outputs
d_out <= r; d_out <= r;
f_out <= f;
flush_out <= f.redirect;
end process; end process;


dec1_log : process(clk) dec1_log : process(clk)

@ -358,6 +358,7 @@ begin
v.e.sign_extend := d_in.decode.sign_extend; v.e.sign_extend := d_in.decode.sign_extend;
v.e.update := d_in.decode.update; v.e.update := d_in.decode.update;
v.e.reserve := d_in.decode.reserve; v.e.reserve := d_in.decode.reserve;
v.e.br_pred := d_in.br_pred;


-- issue control -- issue control
control_valid_in <= d_in.valid; control_valid_in <= d_in.valid;
@ -371,6 +372,11 @@ begin
end if; end if;
update_gpr_write_valid <= d_in.decode.update; update_gpr_write_valid <= d_in.decode.update;
update_gpr_write_reg <= decoded_reg_a.reg; update_gpr_write_reg <= decoded_reg_a.reg;
if v.e.lr = '1' then
-- there are no instructions that have both update=1 and lr=1
update_gpr_write_valid <= '1';
update_gpr_write_reg <= fast_spr_num(SPR_LR);
end if;


gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read_valid <= decoded_reg_a.reg_valid;
gpr_a_read <= decoded_reg_a.reg; gpr_a_read <= decoded_reg_a.reg;

@ -305,11 +305,17 @@ begin
variable exception_nextpc : std_ulogic; variable exception_nextpc : std_ulogic;
variable trapval : std_ulogic_vector(4 downto 0); variable trapval : std_ulogic_vector(4 downto 0);
variable illegal : std_ulogic; variable illegal : std_ulogic;
variable is_branch : std_ulogic;
variable taken_branch : std_ulogic;
variable abs_branch : std_ulogic;
begin begin
result := (others => '0'); result := (others => '0');
result_with_carry := (others => '0'); result_with_carry := (others => '0');
result_en := '0'; result_en := '0';
newcrf := (others => '0'); newcrf := (others => '0');
is_branch := '0';
taken_branch := '0';
abs_branch := '0';


v := r; v := r;
v.e := Execute1ToWritebackInit; v.e := Execute1ToWritebackInit;
@ -625,12 +631,9 @@ begin
result := logical_result; result := logical_result;
result_en := '1'; result_en := '1';
when OP_B => when OP_B =>
f_out.redirect <= '1'; is_branch := '1';
if (insn_aa(e_in.insn)) then taken_branch := '1';
f_out.redirect_nia <= b_in; abs_branch := insn_aa(e_in.insn);
else
f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
end if;
when OP_BC => when OP_BC =>
-- read_data1 is CTR -- read_data1 is CTR
bo := insn_bo(e_in.insn); bo := insn_bo(e_in.insn);
@ -640,14 +643,9 @@ begin
result_en := '1'; result_en := '1';
v.e.write_reg := fast_spr_num(SPR_CTR); v.e.write_reg := fast_spr_num(SPR_CTR);
end if; end if;
if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then is_branch := '1';
f_out.redirect <= '1'; taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in);
if (insn_aa(e_in.insn)) then abs_branch := insn_aa(e_in.insn);
f_out.redirect_nia <= b_in;
else
f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
end if;
end if;
when OP_BCREG => when OP_BCREG =>
-- read_data1 is CTR -- read_data1 is CTR
-- read_data2 is target register (CTR, LR or TAR) -- read_data2 is target register (CTR, LR or TAR)
@ -658,7 +656,7 @@ begin
result_en := '1'; result_en := '1';
v.e.write_reg := fast_spr_num(SPR_CTR); v.e.write_reg := fast_spr_num(SPR_CTR);
end if; end if;
if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then if ppc_bc_taken(bo, bi, e_in.cr, a_in) = '1' then
f_out.redirect <= '1'; f_out.redirect <= '1';
f_out.redirect_nia <= b_in(63 downto 2) & "00"; f_out.redirect_nia <= b_in(63 downto 2) & "00";
end if; end if;
@ -903,23 +901,35 @@ begin


v.e.rc := e_in.rc and valid_in; v.e.rc := e_in.rc and valid_in;


-- Mispredicted branches cause a redirect
if is_branch = '1' and taken_branch /= e_in.br_pred then
f_out.redirect <= '1';
if taken_branch = '1' then
if abs_branch = '1' then
f_out.redirect_nia <= b_in;
else
f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
end if;
else
f_out.redirect_nia <= next_nia;
end if;
end if;

-- Update LR on the next cycle after a branch link -- Update LR on the next cycle after a branch link
-- -- If we're not writing back anything else, we can write back LR
-- WARNING: The LR update isn't tracked by our hazard tracker. This -- this cycle, otherwise we take an extra cycle.
-- will work (well I hope) because it only happens on branches
-- which will flush all decoded instructions. By the time
-- fetch catches up, we'll have the new LR. This will
-- *not* work properly however if we have a branch predictor,
-- in which case the solution would probably be to keep a
-- local cache of the updated LR in execute1 (flushed on
-- exceptions) that is used instead of the value from
-- decode when its content is valid.
if e_in.lr = '1' then if e_in.lr = '1' then
v.lr_update := '1'; if result_en = '0' then
v.next_lr := next_nia; result_en := '1';
v.e.valid := '0'; result := next_nia;
report "Delayed LR update to " & to_hstring(next_nia); v.e.write_reg := fast_spr_num(SPR_LR);
v.busy := '1'; else
v.lr_update := '1';
v.next_lr := next_nia;
v.e.valid := '0';
report "Delayed LR update to " & to_hstring(next_nia);
v.busy := '1';
end if;
end if; end if;


elsif valid_in = '1' then elsif valid_in = '1' then

@ -23,6 +23,9 @@ entity fetch1 is
-- redirect from execution unit -- redirect from execution unit
e_in : in Execute1ToFetch1Type; e_in : in Execute1ToFetch1Type;


-- redirect from decode1
d_in : in Decode1ToFetch1Type;

-- Request to icache -- Request to icache
i_out : out Fetch1ToIcacheType; i_out : out Fetch1ToIcacheType;


@ -49,7 +52,7 @@ begin
report "fetch1 rst:" & std_ulogic'image(rst) & report "fetch1 rst:" & std_ulogic'image(rst) &
" IR:" & std_ulogic'image(e_in.virt_mode) & " IR:" & std_ulogic'image(e_in.virt_mode) &
" P:" & std_ulogic'image(e_in.priv_mode) & " P:" & std_ulogic'image(e_in.priv_mode) &
" R:" & std_ulogic'image(e_in.redirect) & " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) &
" S:" & std_ulogic'image(stall_in) & " S:" & std_ulogic'image(stall_in) &
" T:" & std_ulogic'image(stop_in) & " T:" & std_ulogic'image(stop_in) &
" nia:" & to_hstring(r_next.nia) & " nia:" & to_hstring(r_next.nia) &
@ -83,6 +86,8 @@ begin
v.nia := e_in.redirect_nia; v.nia := e_in.redirect_nia;
v.virt_mode := e_in.virt_mode; v.virt_mode := e_in.virt_mode;
v.priv_mode := e_in.priv_mode; v.priv_mode := e_in.priv_mode;
elsif d_in.redirect = '1' then
v.nia := d_in.redirect_nia;
elsif stall_in = '0' then elsif stall_in = '0' then


-- For debug stop/step to work properly we need a little bit of -- For debug stop/step to work properly we need a little bit of

@ -93,7 +93,7 @@ package ppc_fx_insns is
function ppc_divd (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; function ppc_divd (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
function ppc_divwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; function ppc_divwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;


function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer; function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return std_ulogic;
end package ppc_fx_insns; end package ppc_fx_insns;


package body ppc_fx_insns is package body ppc_fx_insns is
@ -785,13 +785,12 @@ package body ppc_fx_insns is
return std_ulogic_vector(resize(tmp, ra'length)); return std_ulogic_vector(resize(tmp, ra'length));
end; end;


function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer is function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return std_ulogic is
variable crfield: integer; variable crfield: integer;
variable crbit_match: std_ulogic; variable crbit_match: std_ulogic;
variable ctr_not_zero: std_ulogic; variable ctr_not_zero: std_ulogic;
variable ctr_ok: std_ulogic; variable ctr_ok: std_ulogic;
variable cond_ok: std_ulogic; variable cond_ok: std_ulogic;
variable ret: integer;
begin begin
crfield := to_integer(unsigned(bi)); crfield := to_integer(unsigned(bi));
-- BE bit numbering -- BE bit numbering
@ -800,12 +799,7 @@ package body ppc_fx_insns is
ctr_not_zero := '1' when ctr /= x"0000000000000001" else '0'; ctr_not_zero := '1' when ctr /= x"0000000000000001" else '0';
ctr_ok := bo(4-2) or (ctr_not_zero xor bo(4-3)); ctr_ok := bo(4-2) or (ctr_not_zero xor bo(4-3));
cond_ok := bo(4-0) or crbit_match; cond_ok := bo(4-0) or crbit_match;
if ctr_ok = '1' and cond_ok = '1' then return ctr_ok and cond_ok;
ret := 1;
else
ret := 0;
end if;
return ret;
end; end;


end package body ppc_fx_insns; end package body ppc_fx_insns;

Loading…
Cancel
Save