From a1d7b54f76859e7270c48a56536b901f3c05c641 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Nov 2020 22:10:38 +1100 Subject: [PATCH] core: Crack branches that update both CTR and LR This uses the instruction doubling machinery to convert conditional branch instructions that update both CTR and LR (e.g., bdnzl, bdnzlrl) into two instructions, of which the first updates CTR and determines whether the branch is taken, and the second updates LR and does the redirect if necessary. Signed-off-by: Paul Mackerras --- common.vhdl | 12 ++++--- decode1.vhdl | 17 ++++++++-- decode2.vhdl | 55 +++++++++++++++++-------------- execute1.vhdl | 91 ++++++++++++++++++--------------------------------- 4 files changed, 84 insertions(+), 91 deletions(-) diff --git a/common.vhdl b/common.vhdl index 893127f..686e414 100644 --- a/common.vhdl +++ b/common.vhdl @@ -176,14 +176,15 @@ package common is insn: std_ulogic_vector(31 downto 0); ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) + ispro: gspr_index_t; -- (G)SPR written with LR or CTR decode: decode_rom_t; br_pred: std_ulogic; -- Branch was predicted to be taken big_endian: std_ulogic; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), - ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init, - br_pred => '0', big_endian => '0'); + ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'), + decode => decode_rom_init, br_pred => '0', big_endian => '0'); type Decode1ToFetch1Type is record redirect : std_ulogic; @@ -210,6 +211,7 @@ package common is bypass_cr : std_ulogic; xerc: xer_common_t; lr: std_ulogic; + br_abs: std_ulogic; rc: std_ulogic; oe: std_ulogic; invert_a: std_ulogic; @@ -236,7 +238,7 @@ package common is constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', - bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', + bypass_cr => '0', lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), @@ -552,9 +554,9 @@ package body common is begin case spr is when SPR_LR => - n := 0; + n := 0; -- N.B. decode2 relies on this specific value when SPR_CTR => - n:= 1; + n := 1; -- N.B. decode2 relies on this specific value when SPR_SRR0 => n := 2; when SPR_SRR1 => diff --git a/decode1.vhdl b/decode1.vhdl index 0f3410d..f62594b 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -79,7 +79,7 @@ architecture behaviour of decode1 is 28 => (ALU, NONE, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andi. 29 => (ALU, NONE, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andis. 0 => (ALU, NONE, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- attn - 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b + 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b 16 => (ALU, NONE, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc 11 => (ALU, NONE, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmpi 10 => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpli @@ -597,9 +597,10 @@ begin -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); - -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path + -- Work out ispr1/ispro independent of v.decode since they seem to be critical path sprn := decode_spr_num(f_in.insn); v.ispr1 := fast_spr_num(sprn); + v.ispro := fast_spr_num(sprn); if std_match(f_in.insn(10 downto 1), "01-1010011") then -- mfspr or mtspr @@ -627,6 +628,9 @@ begin -- CTR may be needed as input to bc if f_in.insn(23) = '0' then v.ispr1 := fast_spr_num(SPR_CTR); + v.ispro := fast_spr_num(SPR_CTR); + elsif f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); end if; -- Predict backward branches as taken, forward as untaken v.br_pred := f_in.insn(15); @@ -636,6 +640,9 @@ begin -- Unconditional branches are always taken v.br_pred := '1'; br_offset := signed(f_in.insn(25 downto 2)); + if f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); + end if; when 19 => vi.override := not decode_op_19_valid(to_integer(unsigned(f_in.insn(5 downto 1) & f_in.insn(10 downto 6)))); @@ -648,8 +655,12 @@ begin -- Branch uses CTR as condition when BO(2) is 0. This is -- also used to indicate that CTR is modified (they go -- together). - if f_in.insn(23) = '0' then + -- bcctr doesn't update CTR or use it in the branch condition + if f_in.insn(23) = '0' and (f_in.insn(10) = '0' or f_in.insn(6) = '1') then v.ispr1 := fast_spr_num(SPR_CTR); + v.ispro := fast_spr_num(SPR_CTR); + elsif f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); end if; if f_in.insn(10) = '0' then v.ispr2 := fast_spr_num(SPR_LR); diff --git a/decode2.vhdl b/decode2.vhdl index 0336057..274a241 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -249,6 +249,9 @@ architecture behaviour of decode2 is OP_MOD => "011", OP_CNTZ => "100", -- countzero_result OP_MFSPR => "101", -- spr_result + OP_B => "110", -- next_nia + OP_BC => "110", + OP_BCREG => "110", OP_ADDG6S => "111", -- misc_result OP_ISEL => "111", OP_DARN => "111", @@ -284,9 +287,6 @@ architecture behaviour of decode2 is signal gpr_write : gspr_index_t; signal gpr_bypassable : std_ulogic; - signal update_gpr_write_valid : std_ulogic; - signal update_gpr_write_reg : gspr_index_t; - signal gpr_a_read_valid : std_ulogic; signal gpr_a_read :gspr_index_t; signal gpr_a_bypass : std_ulogic; @@ -325,8 +325,8 @@ begin gpr_write_in => gpr_write, gpr_bypassable => gpr_bypassable, - update_gpr_write_valid => update_gpr_write_valid, - update_gpr_write_reg => update_gpr_write_reg, + update_gpr_write_valid => '0', + update_gpr_write_reg => 7x"00", gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -376,6 +376,7 @@ begin variable decoded_reg_c : decode_input_reg_t; variable decoded_reg_o : decode_output_reg_t; variable length : std_ulogic_vector(3 downto 0); + variable op : insn_type_t; begin v := r; @@ -391,7 +392,14 @@ begin d_in.nia); decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); - decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1); + decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro); + + if d_in.decode.lr = '1' then + v.e.lr := insn_lk(d_in.insn); + -- b and bc have even major opcodes; bcreg is considered absolute + v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26); + end if; + op := d_in.decode.insn_type; if d_in.decode.repeat /= NONE then v.e.repeat := '1'; @@ -414,6 +422,12 @@ begin end if; when others => end case; + elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then + -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled + v.e.repeat := '1'; + v.e.second := r.repeat; + -- first one does CTR, second does LR + decoded_reg_o.reg(0) := not r.repeat; end if; r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid; @@ -440,7 +454,6 @@ begin v.e.nia := d_in.nia; v.e.unit := d_in.decode.unit; v.e.fac := d_in.decode.facility; - v.e.insn_type := d_in.decode.insn_type; v.e.read_reg1 := decoded_reg_a.reg; v.e.read_data1 := decoded_reg_a.data; v.e.bypass_data1 := gpr_a_bypass; @@ -460,23 +473,12 @@ begin v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.addm1 := '0'; - if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then - -- add -1 to CTR - v.e.addm1 := '1'; - if d_in.insn(23) = '1' or - (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then - -- don't write decremented CTR if BO(2) = 1 or bcctr - v.e.write_reg_enable := '0'; - end if; - end if; + v.e.insn_type := op; v.e.invert_out := d_in.decode.invert_out; v.e.input_carry := d_in.decode.input_carry; v.e.output_carry := d_in.decode.output_carry; v.e.is_32bit := d_in.decode.is_32bit; v.e.is_signed := d_in.decode.is_signed; - if d_in.decode.lr = '1' then - v.e.lr := insn_lk(d_in.insn); - end if; v.e.insn := d_in.insn; v.e.data_len := length; v.e.byte_reverse := d_in.decode.byte_reverse; @@ -484,8 +486,16 @@ begin v.e.update := d_in.decode.update; v.e.reserve := d_in.decode.reserve; v.e.br_pred := d_in.br_pred; - v.e.result_sel := result_select(d_in.decode.insn_type); - v.e.sub_select := subresult_select(d_in.decode.insn_type); + v.e.result_sel := result_select(op); + v.e.sub_select := subresult_select(op); + if op = OP_BC or op = OP_BCREG then + if d_in.insn(23) = '0' and r.repeat = '0' and + not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then + -- decrement CTR if BO(2) = 0 and not bcctr + v.e.addm1 := '1'; + v.e.result_sel := "000"; -- select adder output + end if; + end if; -- issue control control_valid_in <= d_in.valid; @@ -498,9 +508,6 @@ begin gpr_bypassable <= '1'; end if; - update_gpr_write_valid <= v.e.lr; - update_gpr_write_reg <= fast_spr_num(SPR_LR); - gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; diff --git a/execute1.vhdl b/execute1.vhdl index 559f34f..2690424 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -59,8 +59,8 @@ architecture behaviour of execute1 is fp_exception_next : std_ulogic; trace_next : std_ulogic; prev_op : insn_type_t; - lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); + br_taken : std_ulogic; mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; @@ -79,8 +79,8 @@ architecture behaviour of execute1 is constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, cur_instr => Decode2ToExecute1Init, - busy => '0', lr_update => '0', terminate => '0', - fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, + busy => '0', terminate => '0', + fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', next_lr => (others => '0'), last_nia => (others => '0'), redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0, @@ -108,6 +108,7 @@ architecture behaviour of execute1 is signal spr_result: std_ulogic_vector(63 downto 0); signal result_mux_sel: std_ulogic_vector(2 downto 0); signal sub_mux_sel: std_ulogic_vector(2 downto 0); + signal next_nia : std_ulogic_vector(63 downto 0); signal current: Decode2ToExecute1Type; -- multiply signals @@ -301,6 +302,7 @@ begin muldiv_result when "011", countzero_result when "100", spr_result when "101", + next_nia when "110", misc_result when others; execute1_0: process(clk) @@ -315,11 +317,9 @@ begin else r <= rin; ctrl <= ctrl_tmp; - assert not (r.lr_update = '1' and valid_in = '1') - report "LR update collision with valid in EX1" - severity failure; - if r.lr_update = '1' then - report "LR update to " & to_hstring(r.next_lr); + if valid_in = '1' then + report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & + " wr=" & to_hstring(rin.e.write_reg); end if; end if; end if; @@ -350,7 +350,6 @@ begin variable btnum, banum, bbnum : integer range 0 to 31; variable crresult : std_ulogic; variable l : std_ulogic; - variable next_nia : std_ulogic_vector(63 downto 0); variable carry_32, carry_64 : std_ulogic; variable sign1, sign2 : std_ulogic; variable abs1, abs2 : signed(63 downto 0); @@ -421,7 +420,6 @@ begin end loop; end if; - v.lr_update := '0'; v.mul_in_progress := '0'; v.div_in_progress := '0'; v.cntz_in_progress := '0'; @@ -673,7 +671,7 @@ begin v.busy := '0'; -- Next insn adder used in a couple of places - next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4); + next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); -- rotator control signals right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; @@ -846,35 +844,39 @@ begin newcrf & newcrf & newcrf & newcrf; when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM | OP_BCD => + when OP_B => is_branch := '1'; taken_branch := '1'; is_direct_branch := '1'; - abs_branch := insn_aa(e_in.insn); + abs_branch := e_in.br_abs; if ctrl.msr(MSR_BE) = '1' then do_trace := '1'; end if; - when OP_BC => - -- read_data1 is CTR + when OP_BC | OP_BCREG => + -- read_data1 is CTR + -- for OP_BCREG, read_data2 is target register (CTR, LR or TAR) + -- If this instruction updates both CTR and LR, then it is + -- doubled; the first instruction decrements CTR and determines + -- whether the branch is taken, and the second does the + -- redirect and the LR update. bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - is_branch := '1'; - is_direct_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - abs_branch := insn_aa(e_in.insn); - if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + if e_in.second = '0' then + taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); + else + taken_branch := r.br_taken; end if; - when OP_BCREG => - -- read_data1 is CTR - -- read_data2 is target register (CTR, LR or TAR) - bo := insn_bo(e_in.insn); - bi := insn_bi(e_in.insn); - is_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - abs_branch := '1'; - if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + v.br_taken := taken_branch; + abs_branch := e_in.br_abs; + if e_in.repeat = '0' or e_in.second = '1' then + is_branch := '1'; + if e_in.insn_type = OP_BC then + is_direct_branch := '1'; + end if; + if ctrl.msr(MSR_BE) = '1' then + do_trace := '1'; + end if; end if; when OP_RFID => @@ -1197,11 +1199,6 @@ begin end if; v.e.valid := '1'; end if; - -- When doing delayed LR update, keep r.e.write_data unchanged - -- next cycle in case it is needed for a forwarded result (e.g. CTR). - if r.lr_update = '1' then - hold_wr_data := '1'; - end if; -- Generate FP-type program interrupt. fp_in.interrupt will only -- be set during the execution of a FP instruction. @@ -1274,30 +1271,6 @@ begin v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; v.e.rc := current.rc and v.e.valid and not exception; - -- Update LR on the next cycle after a branch link - -- If we're not writing back anything else, we can write back LR - -- this cycle, otherwise we take an extra cycle. We use the - -- exc_write path since next_nia is written through that path - -- in other places. - if v.e.valid = '1' and exception = '0' and current.lr = '1' then - if current.write_reg_enable = '0' then - v.e.exc_write_enable := '1'; - v.e.exc_write_data := next_nia; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - else - v.lr_update := '1'; - v.e.valid := '0'; - report "Delayed LR update to " & to_hstring(next_nia); - v.busy := '1'; - end if; - end if; - if r.lr_update = '1' then - v.e.exc_write_enable := '1'; - v.e.exc_write_data := r.next_lr; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - v.e.valid := '1'; - end if; - -- Defer completion for one cycle when redirecting. -- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1 if v.redirect = '1' then