diff --git a/common.vhdl b/common.vhdl index 893127f..686e414 100644 --- a/common.vhdl +++ b/common.vhdl @@ -176,14 +176,15 @@ package common is insn: std_ulogic_vector(31 downto 0); ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) + ispro: gspr_index_t; -- (G)SPR written with LR or CTR decode: decode_rom_t; br_pred: std_ulogic; -- Branch was predicted to be taken big_endian: std_ulogic; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), - ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init, - br_pred => '0', big_endian => '0'); + ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'), + decode => decode_rom_init, br_pred => '0', big_endian => '0'); type Decode1ToFetch1Type is record redirect : std_ulogic; @@ -210,6 +211,7 @@ package common is bypass_cr : std_ulogic; xerc: xer_common_t; lr: std_ulogic; + br_abs: std_ulogic; rc: std_ulogic; oe: std_ulogic; invert_a: std_ulogic; @@ -236,7 +238,7 @@ package common is constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', - bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', + bypass_cr => '0', lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), @@ -552,9 +554,9 @@ package body common is begin case spr is when SPR_LR => - n := 0; + n := 0; -- N.B. decode2 relies on this specific value when SPR_CTR => - n:= 1; + n := 1; -- N.B. decode2 relies on this specific value when SPR_SRR0 => n := 2; when SPR_SRR1 => diff --git a/decode1.vhdl b/decode1.vhdl index 0f3410d..f62594b 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -79,7 +79,7 @@ architecture behaviour of decode1 is 28 => (ALU, NONE, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andi. 29 => (ALU, NONE, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andis. 0 => (ALU, NONE, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- attn - 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b + 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b 16 => (ALU, NONE, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc 11 => (ALU, NONE, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmpi 10 => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpli @@ -597,9 +597,10 @@ begin -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); - -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path + -- Work out ispr1/ispro independent of v.decode since they seem to be critical path sprn := decode_spr_num(f_in.insn); v.ispr1 := fast_spr_num(sprn); + v.ispro := fast_spr_num(sprn); if std_match(f_in.insn(10 downto 1), "01-1010011") then -- mfspr or mtspr @@ -627,6 +628,9 @@ begin -- CTR may be needed as input to bc if f_in.insn(23) = '0' then v.ispr1 := fast_spr_num(SPR_CTR); + v.ispro := fast_spr_num(SPR_CTR); + elsif f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); end if; -- Predict backward branches as taken, forward as untaken v.br_pred := f_in.insn(15); @@ -636,6 +640,9 @@ begin -- Unconditional branches are always taken v.br_pred := '1'; br_offset := signed(f_in.insn(25 downto 2)); + if f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); + end if; when 19 => vi.override := not decode_op_19_valid(to_integer(unsigned(f_in.insn(5 downto 1) & f_in.insn(10 downto 6)))); @@ -648,8 +655,12 @@ begin -- Branch uses CTR as condition when BO(2) is 0. This is -- also used to indicate that CTR is modified (they go -- together). - if f_in.insn(23) = '0' then + -- bcctr doesn't update CTR or use it in the branch condition + if f_in.insn(23) = '0' and (f_in.insn(10) = '0' or f_in.insn(6) = '1') then v.ispr1 := fast_spr_num(SPR_CTR); + v.ispro := fast_spr_num(SPR_CTR); + elsif f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); end if; if f_in.insn(10) = '0' then v.ispr2 := fast_spr_num(SPR_LR); diff --git a/decode2.vhdl b/decode2.vhdl index 0336057..274a241 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -249,6 +249,9 @@ architecture behaviour of decode2 is OP_MOD => "011", OP_CNTZ => "100", -- countzero_result OP_MFSPR => "101", -- spr_result + OP_B => "110", -- next_nia + OP_BC => "110", + OP_BCREG => "110", OP_ADDG6S => "111", -- misc_result OP_ISEL => "111", OP_DARN => "111", @@ -284,9 +287,6 @@ architecture behaviour of decode2 is signal gpr_write : gspr_index_t; signal gpr_bypassable : std_ulogic; - signal update_gpr_write_valid : std_ulogic; - signal update_gpr_write_reg : gspr_index_t; - signal gpr_a_read_valid : std_ulogic; signal gpr_a_read :gspr_index_t; signal gpr_a_bypass : std_ulogic; @@ -325,8 +325,8 @@ begin gpr_write_in => gpr_write, gpr_bypassable => gpr_bypassable, - update_gpr_write_valid => update_gpr_write_valid, - update_gpr_write_reg => update_gpr_write_reg, + update_gpr_write_valid => '0', + update_gpr_write_reg => 7x"00", gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -376,6 +376,7 @@ begin variable decoded_reg_c : decode_input_reg_t; variable decoded_reg_o : decode_output_reg_t; variable length : std_ulogic_vector(3 downto 0); + variable op : insn_type_t; begin v := r; @@ -391,7 +392,14 @@ begin d_in.nia); decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); - decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1); + decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro); + + if d_in.decode.lr = '1' then + v.e.lr := insn_lk(d_in.insn); + -- b and bc have even major opcodes; bcreg is considered absolute + v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26); + end if; + op := d_in.decode.insn_type; if d_in.decode.repeat /= NONE then v.e.repeat := '1'; @@ -414,6 +422,12 @@ begin end if; when others => end case; + elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then + -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled + v.e.repeat := '1'; + v.e.second := r.repeat; + -- first one does CTR, second does LR + decoded_reg_o.reg(0) := not r.repeat; end if; r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid; @@ -440,7 +454,6 @@ begin v.e.nia := d_in.nia; v.e.unit := d_in.decode.unit; v.e.fac := d_in.decode.facility; - v.e.insn_type := d_in.decode.insn_type; v.e.read_reg1 := decoded_reg_a.reg; v.e.read_data1 := decoded_reg_a.data; v.e.bypass_data1 := gpr_a_bypass; @@ -460,23 +473,12 @@ begin v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.addm1 := '0'; - if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then - -- add -1 to CTR - v.e.addm1 := '1'; - if d_in.insn(23) = '1' or - (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then - -- don't write decremented CTR if BO(2) = 1 or bcctr - v.e.write_reg_enable := '0'; - end if; - end if; + v.e.insn_type := op; v.e.invert_out := d_in.decode.invert_out; v.e.input_carry := d_in.decode.input_carry; v.e.output_carry := d_in.decode.output_carry; v.e.is_32bit := d_in.decode.is_32bit; v.e.is_signed := d_in.decode.is_signed; - if d_in.decode.lr = '1' then - v.e.lr := insn_lk(d_in.insn); - end if; v.e.insn := d_in.insn; v.e.data_len := length; v.e.byte_reverse := d_in.decode.byte_reverse; @@ -484,8 +486,16 @@ begin v.e.update := d_in.decode.update; v.e.reserve := d_in.decode.reserve; v.e.br_pred := d_in.br_pred; - v.e.result_sel := result_select(d_in.decode.insn_type); - v.e.sub_select := subresult_select(d_in.decode.insn_type); + v.e.result_sel := result_select(op); + v.e.sub_select := subresult_select(op); + if op = OP_BC or op = OP_BCREG then + if d_in.insn(23) = '0' and r.repeat = '0' and + not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then + -- decrement CTR if BO(2) = 0 and not bcctr + v.e.addm1 := '1'; + v.e.result_sel := "000"; -- select adder output + end if; + end if; -- issue control control_valid_in <= d_in.valid; @@ -498,9 +508,6 @@ begin gpr_bypassable <= '1'; end if; - update_gpr_write_valid <= v.e.lr; - update_gpr_write_reg <= fast_spr_num(SPR_LR); - gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; diff --git a/execute1.vhdl b/execute1.vhdl index 559f34f..2690424 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -59,8 +59,8 @@ architecture behaviour of execute1 is fp_exception_next : std_ulogic; trace_next : std_ulogic; prev_op : insn_type_t; - lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); + br_taken : std_ulogic; mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; @@ -79,8 +79,8 @@ architecture behaviour of execute1 is constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, cur_instr => Decode2ToExecute1Init, - busy => '0', lr_update => '0', terminate => '0', - fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, + busy => '0', terminate => '0', + fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', next_lr => (others => '0'), last_nia => (others => '0'), redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0, @@ -108,6 +108,7 @@ architecture behaviour of execute1 is signal spr_result: std_ulogic_vector(63 downto 0); signal result_mux_sel: std_ulogic_vector(2 downto 0); signal sub_mux_sel: std_ulogic_vector(2 downto 0); + signal next_nia : std_ulogic_vector(63 downto 0); signal current: Decode2ToExecute1Type; -- multiply signals @@ -301,6 +302,7 @@ begin muldiv_result when "011", countzero_result when "100", spr_result when "101", + next_nia when "110", misc_result when others; execute1_0: process(clk) @@ -315,11 +317,9 @@ begin else r <= rin; ctrl <= ctrl_tmp; - assert not (r.lr_update = '1' and valid_in = '1') - report "LR update collision with valid in EX1" - severity failure; - if r.lr_update = '1' then - report "LR update to " & to_hstring(r.next_lr); + if valid_in = '1' then + report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & + " wr=" & to_hstring(rin.e.write_reg); end if; end if; end if; @@ -350,7 +350,6 @@ begin variable btnum, banum, bbnum : integer range 0 to 31; variable crresult : std_ulogic; variable l : std_ulogic; - variable next_nia : std_ulogic_vector(63 downto 0); variable carry_32, carry_64 : std_ulogic; variable sign1, sign2 : std_ulogic; variable abs1, abs2 : signed(63 downto 0); @@ -421,7 +420,6 @@ begin end loop; end if; - v.lr_update := '0'; v.mul_in_progress := '0'; v.div_in_progress := '0'; v.cntz_in_progress := '0'; @@ -673,7 +671,7 @@ begin v.busy := '0'; -- Next insn adder used in a couple of places - next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4); + next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); -- rotator control signals right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; @@ -846,35 +844,39 @@ begin newcrf & newcrf & newcrf & newcrf; when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM | OP_BCD => + when OP_B => is_branch := '1'; taken_branch := '1'; is_direct_branch := '1'; - abs_branch := insn_aa(e_in.insn); + abs_branch := e_in.br_abs; if ctrl.msr(MSR_BE) = '1' then do_trace := '1'; end if; - when OP_BC => - -- read_data1 is CTR + when OP_BC | OP_BCREG => + -- read_data1 is CTR + -- for OP_BCREG, read_data2 is target register (CTR, LR or TAR) + -- If this instruction updates both CTR and LR, then it is + -- doubled; the first instruction decrements CTR and determines + -- whether the branch is taken, and the second does the + -- redirect and the LR update. bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - is_branch := '1'; - is_direct_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - abs_branch := insn_aa(e_in.insn); - if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + if e_in.second = '0' then + taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); + else + taken_branch := r.br_taken; end if; - when OP_BCREG => - -- read_data1 is CTR - -- read_data2 is target register (CTR, LR or TAR) - bo := insn_bo(e_in.insn); - bi := insn_bi(e_in.insn); - is_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - abs_branch := '1'; - if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + v.br_taken := taken_branch; + abs_branch := e_in.br_abs; + if e_in.repeat = '0' or e_in.second = '1' then + is_branch := '1'; + if e_in.insn_type = OP_BC then + is_direct_branch := '1'; + end if; + if ctrl.msr(MSR_BE) = '1' then + do_trace := '1'; + end if; end if; when OP_RFID => @@ -1197,11 +1199,6 @@ begin end if; v.e.valid := '1'; end if; - -- When doing delayed LR update, keep r.e.write_data unchanged - -- next cycle in case it is needed for a forwarded result (e.g. CTR). - if r.lr_update = '1' then - hold_wr_data := '1'; - end if; -- Generate FP-type program interrupt. fp_in.interrupt will only -- be set during the execution of a FP instruction. @@ -1274,30 +1271,6 @@ begin v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; v.e.rc := current.rc and v.e.valid and not exception; - -- Update LR on the next cycle after a branch link - -- If we're not writing back anything else, we can write back LR - -- this cycle, otherwise we take an extra cycle. We use the - -- exc_write path since next_nia is written through that path - -- in other places. - if v.e.valid = '1' and exception = '0' and current.lr = '1' then - if current.write_reg_enable = '0' then - v.e.exc_write_enable := '1'; - v.e.exc_write_data := next_nia; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - else - v.lr_update := '1'; - v.e.valid := '0'; - report "Delayed LR update to " & to_hstring(next_nia); - v.busy := '1'; - end if; - end if; - if r.lr_update = '1' then - v.e.exc_write_enable := '1'; - v.e.exc_write_data := r.next_lr; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - v.e.valid := '1'; - end if; - -- Defer completion for one cycle when redirecting. -- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1 if v.redirect = '1' then