core: Crack branches that update both CTR and LR

This uses the instruction doubling machinery to convert conditional
branch instructions that update both CTR and LR (e.g., bdnzl, bdnzlrl)
into two instructions, of which the first updates CTR and determines
whether the branch is taken, and the second updates LR and does the
redirect if necessary.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/269/head
Paul Mackerras 3 years ago
parent 4c61a71a62
commit a1d7b54f76

@ -176,14 +176,15 @@ package common is
insn: std_ulogic_vector(31 downto 0);
ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr
ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR)
ispro: gspr_index_t; -- (G)SPR written with LR or CTR
decode: decode_rom_t;
br_pred: std_ulogic; -- Branch was predicted to be taken
big_endian: std_ulogic;
end record;
constant Decode1ToDecode2Init : Decode1ToDecode2Type :=
(valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'),
ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init,
br_pred => '0', big_endian => '0');
ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'),
decode => decode_rom_init, br_pred => '0', big_endian => '0');

type Decode1ToFetch1Type is record
redirect : std_ulogic;
@ -210,6 +211,7 @@ package common is
bypass_cr : std_ulogic;
xerc: xer_common_t;
lr: std_ulogic;
br_abs: std_ulogic;
rc: std_ulogic;
oe: std_ulogic;
invert_a: std_ulogic;
@ -236,7 +238,7 @@ package common is
constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
(valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL,
write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0',
bypass_cr => '0', lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0',
invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'),
@ -552,9 +554,9 @@ package body common is
begin
case spr is
when SPR_LR =>
n := 0;
n := 0; -- N.B. decode2 relies on this specific value
when SPR_CTR =>
n:= 1;
n := 1; -- N.B. decode2 relies on this specific value
when SPR_SRR0 =>
n := 2;
when SPR_SRR1 =>

@ -79,7 +79,7 @@ architecture behaviour of decode1 is
28 => (ALU, NONE, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andi.
29 => (ALU, NONE, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andis.
0 => (ALU, NONE, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- attn
18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b
18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b
16 => (ALU, NONE, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc
11 => (ALU, NONE, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmpi
10 => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpli
@ -597,9 +597,10 @@ begin
-- major opcode 31, lots of things
v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1))));

-- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path
-- Work out ispr1/ispro independent of v.decode since they seem to be critical path
sprn := decode_spr_num(f_in.insn);
v.ispr1 := fast_spr_num(sprn);
v.ispro := fast_spr_num(sprn);

if std_match(f_in.insn(10 downto 1), "01-1010011") then
-- mfspr or mtspr
@ -627,6 +628,9 @@ begin
-- CTR may be needed as input to bc
if f_in.insn(23) = '0' then
v.ispr1 := fast_spr_num(SPR_CTR);
v.ispro := fast_spr_num(SPR_CTR);
elsif f_in.insn(0) = '1' then
v.ispro := fast_spr_num(SPR_LR);
end if;
-- Predict backward branches as taken, forward as untaken
v.br_pred := f_in.insn(15);
@ -636,6 +640,9 @@ begin
-- Unconditional branches are always taken
v.br_pred := '1';
br_offset := signed(f_in.insn(25 downto 2));
if f_in.insn(0) = '1' then
v.ispro := fast_spr_num(SPR_LR);
end if;

when 19 =>
vi.override := not decode_op_19_valid(to_integer(unsigned(f_in.insn(5 downto 1) & f_in.insn(10 downto 6))));
@ -648,8 +655,12 @@ begin
-- Branch uses CTR as condition when BO(2) is 0. This is
-- also used to indicate that CTR is modified (they go
-- together).
if f_in.insn(23) = '0' then
-- bcctr doesn't update CTR or use it in the branch condition
if f_in.insn(23) = '0' and (f_in.insn(10) = '0' or f_in.insn(6) = '1') then
v.ispr1 := fast_spr_num(SPR_CTR);
v.ispro := fast_spr_num(SPR_CTR);
elsif f_in.insn(0) = '1' then
v.ispro := fast_spr_num(SPR_LR);
end if;
if f_in.insn(10) = '0' then
v.ispr2 := fast_spr_num(SPR_LR);

@ -249,6 +249,9 @@ architecture behaviour of decode2 is
OP_MOD => "011",
OP_CNTZ => "100", -- countzero_result
OP_MFSPR => "101", -- spr_result
OP_B => "110", -- next_nia
OP_BC => "110",
OP_BCREG => "110",
OP_ADDG6S => "111", -- misc_result
OP_ISEL => "111",
OP_DARN => "111",
@ -284,9 +287,6 @@ architecture behaviour of decode2 is
signal gpr_write : gspr_index_t;
signal gpr_bypassable : std_ulogic;

signal update_gpr_write_valid : std_ulogic;
signal update_gpr_write_reg : gspr_index_t;

signal gpr_a_read_valid : std_ulogic;
signal gpr_a_read :gspr_index_t;
signal gpr_a_bypass : std_ulogic;
@ -325,8 +325,8 @@ begin
gpr_write_in => gpr_write,
gpr_bypassable => gpr_bypassable,

update_gpr_write_valid => update_gpr_write_valid,
update_gpr_write_reg => update_gpr_write_reg,
update_gpr_write_valid => '0',
update_gpr_write_reg => 7x"00",

gpr_a_read_valid_in => gpr_a_read_valid,
gpr_a_read_in => gpr_a_read,
@ -376,6 +376,7 @@ begin
variable decoded_reg_c : decode_input_reg_t;
variable decoded_reg_o : decode_output_reg_t;
variable length : std_ulogic_vector(3 downto 0);
variable op : insn_type_t;
begin
v := r;

@ -391,7 +392,14 @@ begin
d_in.nia);
decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2);
decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data);
decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1);
decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro);

if d_in.decode.lr = '1' then
v.e.lr := insn_lk(d_in.insn);
-- b and bc have even major opcodes; bcreg is considered absolute
v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26);
end if;
op := d_in.decode.insn_type;

if d_in.decode.repeat /= NONE then
v.e.repeat := '1';
@ -414,6 +422,12 @@ begin
end if;
when others =>
end case;
elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then
-- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled
v.e.repeat := '1';
v.e.second := r.repeat;
-- first one does CTR, second does LR
decoded_reg_o.reg(0) := not r.repeat;
end if;

r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid;
@ -440,7 +454,6 @@ begin
v.e.nia := d_in.nia;
v.e.unit := d_in.decode.unit;
v.e.fac := d_in.decode.facility;
v.e.insn_type := d_in.decode.insn_type;
v.e.read_reg1 := decoded_reg_a.reg;
v.e.read_data1 := decoded_reg_a.data;
v.e.bypass_data1 := gpr_a_bypass;
@ -460,23 +473,12 @@ begin
v.e.xerc := c_in.read_xerc_data;
v.e.invert_a := d_in.decode.invert_a;
v.e.addm1 := '0';
if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then
-- add -1 to CTR
v.e.addm1 := '1';
if d_in.insn(23) = '1' or
(d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then
-- don't write decremented CTR if BO(2) = 1 or bcctr
v.e.write_reg_enable := '0';
end if;
end if;
v.e.insn_type := op;
v.e.invert_out := d_in.decode.invert_out;
v.e.input_carry := d_in.decode.input_carry;
v.e.output_carry := d_in.decode.output_carry;
v.e.is_32bit := d_in.decode.is_32bit;
v.e.is_signed := d_in.decode.is_signed;
if d_in.decode.lr = '1' then
v.e.lr := insn_lk(d_in.insn);
end if;
v.e.insn := d_in.insn;
v.e.data_len := length;
v.e.byte_reverse := d_in.decode.byte_reverse;
@ -484,8 +486,16 @@ begin
v.e.update := d_in.decode.update;
v.e.reserve := d_in.decode.reserve;
v.e.br_pred := d_in.br_pred;
v.e.result_sel := result_select(d_in.decode.insn_type);
v.e.sub_select := subresult_select(d_in.decode.insn_type);
v.e.result_sel := result_select(op);
v.e.sub_select := subresult_select(op);
if op = OP_BC or op = OP_BCREG then
if d_in.insn(23) = '0' and r.repeat = '0' and
not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then
-- decrement CTR if BO(2) = 0 and not bcctr
v.e.addm1 := '1';
v.e.result_sel := "000"; -- select adder output
end if;
end if;

-- issue control
control_valid_in <= d_in.valid;
@ -498,9 +508,6 @@ begin
gpr_bypassable <= '1';
end if;

update_gpr_write_valid <= v.e.lr;
update_gpr_write_reg <= fast_spr_num(SPR_LR);

gpr_a_read_valid <= decoded_reg_a.reg_valid;
gpr_a_read <= decoded_reg_a.reg;


@ -59,8 +59,8 @@ architecture behaviour of execute1 is
fp_exception_next : std_ulogic;
trace_next : std_ulogic;
prev_op : insn_type_t;
lr_update : std_ulogic;
next_lr : std_ulogic_vector(63 downto 0);
br_taken : std_ulogic;
mul_in_progress : std_ulogic;
mul_finish : std_ulogic;
div_in_progress : std_ulogic;
@ -79,8 +79,8 @@ architecture behaviour of execute1 is
constant reg_type_init : reg_type :=
(e => Execute1ToWritebackInit,
cur_instr => Decode2ToExecute1Init,
busy => '0', lr_update => '0', terminate => '0',
fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
busy => '0', terminate => '0',
fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0',
mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
next_lr => (others => '0'), last_nia => (others => '0'),
redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0,
@ -108,6 +108,7 @@ architecture behaviour of execute1 is
signal spr_result: std_ulogic_vector(63 downto 0);
signal result_mux_sel: std_ulogic_vector(2 downto 0);
signal sub_mux_sel: std_ulogic_vector(2 downto 0);
signal next_nia : std_ulogic_vector(63 downto 0);
signal current: Decode2ToExecute1Type;

-- multiply signals
@ -301,6 +302,7 @@ begin
muldiv_result when "011",
countzero_result when "100",
spr_result when "101",
next_nia when "110",
misc_result when others;

execute1_0: process(clk)
@ -315,11 +317,9 @@ begin
else
r <= rin;
ctrl <= ctrl_tmp;
assert not (r.lr_update = '1' and valid_in = '1')
report "LR update collision with valid in EX1"
severity failure;
if r.lr_update = '1' then
report "LR update to " & to_hstring(r.next_lr);
if valid_in = '1' then
report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) &
" wr=" & to_hstring(rin.e.write_reg);
end if;
end if;
end if;
@ -350,7 +350,6 @@ begin
variable btnum, banum, bbnum : integer range 0 to 31;
variable crresult : std_ulogic;
variable l : std_ulogic;
variable next_nia : std_ulogic_vector(63 downto 0);
variable carry_32, carry_64 : std_ulogic;
variable sign1, sign2 : std_ulogic;
variable abs1, abs2 : signed(63 downto 0);
@ -421,7 +420,6 @@ begin
end loop;
end if;

v.lr_update := '0';
v.mul_in_progress := '0';
v.div_in_progress := '0';
v.cntz_in_progress := '0';
@ -673,7 +671,7 @@ begin
v.busy := '0';

-- Next insn adder used in a couple of places
next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4);
next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4);

-- rotator control signals
right_shift <= '1' when e_in.insn_type = OP_SHR else '0';
@ -846,35 +844,39 @@ begin
newcrf & newcrf & newcrf & newcrf;
when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS |
OP_BPERM | OP_BCD =>

when OP_B =>
is_branch := '1';
taken_branch := '1';
is_direct_branch := '1';
abs_branch := insn_aa(e_in.insn);
abs_branch := e_in.br_abs;
if ctrl.msr(MSR_BE) = '1' then
do_trace := '1';
end if;
when OP_BC =>
-- read_data1 is CTR
when OP_BC | OP_BCREG =>
-- read_data1 is CTR
-- for OP_BCREG, read_data2 is target register (CTR, LR or TAR)
-- If this instruction updates both CTR and LR, then it is
-- doubled; the first instruction decrements CTR and determines
-- whether the branch is taken, and the second does the
-- redirect and the LR update.
bo := insn_bo(e_in.insn);
bi := insn_bi(e_in.insn);
is_branch := '1';
is_direct_branch := '1';
taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
abs_branch := insn_aa(e_in.insn);
if ctrl.msr(MSR_BE) = '1' then
do_trace := '1';
if e_in.second = '0' then
taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
else
taken_branch := r.br_taken;
end if;
when OP_BCREG =>
-- read_data1 is CTR
-- read_data2 is target register (CTR, LR or TAR)
bo := insn_bo(e_in.insn);
bi := insn_bi(e_in.insn);
is_branch := '1';
taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
abs_branch := '1';
if ctrl.msr(MSR_BE) = '1' then
do_trace := '1';
v.br_taken := taken_branch;
abs_branch := e_in.br_abs;
if e_in.repeat = '0' or e_in.second = '1' then
is_branch := '1';
if e_in.insn_type = OP_BC then
is_direct_branch := '1';
end if;
if ctrl.msr(MSR_BE) = '1' then
do_trace := '1';
end if;
end if;

when OP_RFID =>
@ -1197,11 +1199,6 @@ begin
end if;
v.e.valid := '1';
end if;
-- When doing delayed LR update, keep r.e.write_data unchanged
-- next cycle in case it is needed for a forwarded result (e.g. CTR).
if r.lr_update = '1' then
hold_wr_data := '1';
end if;

-- Generate FP-type program interrupt. fp_in.interrupt will only
-- be set during the execution of a FP instruction.
@ -1274,30 +1271,6 @@ begin
v.e.write_enable := current.write_reg_enable and v.e.valid and not exception;
v.e.rc := current.rc and v.e.valid and not exception;

-- Update LR on the next cycle after a branch link
-- If we're not writing back anything else, we can write back LR
-- this cycle, otherwise we take an extra cycle. We use the
-- exc_write path since next_nia is written through that path
-- in other places.
if v.e.valid = '1' and exception = '0' and current.lr = '1' then
if current.write_reg_enable = '0' then
v.e.exc_write_enable := '1';
v.e.exc_write_data := next_nia;
v.e.exc_write_reg := fast_spr_num(SPR_LR);
else
v.lr_update := '1';
v.e.valid := '0';
report "Delayed LR update to " & to_hstring(next_nia);
v.busy := '1';
end if;
end if;
if r.lr_update = '1' then
v.e.exc_write_enable := '1';
v.e.exc_write_data := r.next_lr;
v.e.exc_write_reg := fast_spr_num(SPR_LR);
v.e.valid := '1';
end if;

-- Defer completion for one cycle when redirecting.
-- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1
if v.redirect = '1' then

Loading…
Cancel
Save