From 1c4b5def36c77bee61342593386f8b5110d02805 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 26 Jul 2023 16:33:27 +1000 Subject: [PATCH] Improve timing of redirect_nia going from writeback to fetch1 This gets rid of the adder in writeback that computes redirect_nia. Instead, the main adder in the ALU is used to compute the branch target for relative branches. We now decode b and bc differently depending on the AA field, generating INSN_brel, INSN_babs, INSN_bcrel or INSN_bcabs as appropriate. Each one has a separate entry in the decode table in decode1; the *rel versions use CIA as the A input. The bclr/bcctr/bctar and rfid instructions now select ramspr_result for the main result mux to get the redirect address into ex1.e.write_data. For branches which are predicted taken but not actually taken, we need to redirect to the following instruction. We also need to do that for isync. We do this in the execute2 stage since whether or not to do it depends on the branch result. The next_nia computation is moved to the execute2 stage and comes in via a new leg on the secondary result multiplexer, making next_nia available ultimately in ex2.e.write_data. This also means that the next_nia leg of the primary result multiplexer is gone. Incrementing last_nia by 4 for sc (so that SRR0 points to the following instruction) is also moved to execute2. Writing CIA+4 to LR was previously done through the main result multiplexer. Now it comes in explicitly in the ramspr write logic. Overall this removes the br_offset field, the code that sets and reads abs_br (the abs_br field declaration itself stays in common.vhdl), and the logic to add br_offset and next_nia, and one leg of the primary result multiplexer, at the cost of a few extra control signals between execute1 and execute2 and some multiplexing for the ramspr write side and an extra input on the secondary result multiplexer. 
Signed-off-by: Paul Mackerras --- common.vhdl | 3 +-- decode1.vhdl | 10 +++++---- decode2.vhdl | 5 ++--- decode_types.vhdl | 46 ++++++++++++++++++++------------------ execute1.vhdl | 56 ++++++++++++++++++++++++++--------------------- predecode.vhdl | 34 ++++++++++++++++++++++++++-- writeback.vhdl | 6 +---- 7 files changed, 98 insertions(+), 62 deletions(-) diff --git a/common.vhdl b/common.vhdl index 59c855e..a46eff5 100644 --- a/common.vhdl +++ b/common.vhdl @@ -658,7 +658,6 @@ package common is redirect: std_ulogic; redir_mode: std_ulogic_vector(3 downto 0); last_nia: std_ulogic_vector(63 downto 0); - br_offset: std_ulogic_vector(63 downto 0); br_last: std_ulogic; br_taken: std_ulogic; abs_br: std_ulogic; @@ -672,7 +671,7 @@ package common is write_data => (others => '0'), write_cr_mask => (others => '0'), write_cr_data => (others => '0'), write_reg => (others => '0'), interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000", - last_nia => (others => '0'), br_offset => (others => '0'), + last_nia => (others => '0'), br_last => '0', br_taken => '0', abs_br => '0', srr1 => (others => '0'), msr => (others => '0')); diff --git a/decode1.vhdl b/decode1.vhdl index 4163584..40e8aef 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -94,8 +94,10 @@ architecture behaviour of decode1 is INSN_andi_dot => (ALU, NONE, OP_LOGIC, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), INSN_andis_dot => (ALU, NONE, OP_LOGIC, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), INSN_attn => (ALU, NONE, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), - INSN_b => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), - INSN_bc => (ALU, NONE, OP_BC, NONE, CONST_BD, NONE, NONE, '1', '0', '0', '0', 
ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_brel => (ALU, NONE, OP_B, CIA, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_babs => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_bcrel => (ALU, NONE, OP_BC, CIA, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + INSN_bcabs => (ALU, NONE, OP_BC, NONE, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), INSN_bcctr => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), INSN_bclr => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), INSN_bctar => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), @@ -597,11 +599,11 @@ begin -- count cache or link stack. 
br_offset := (others => '0'); case icode is - when INSN_b => + when INSN_brel | INSN_babs => -- Unconditional branches are always taken v.br_pred := '1'; br_offset := signed(f_in.insn(25 downto 2)); - when INSN_bc => + when INSN_bcrel | INSN_bcabs => -- Predict backward branches as taken, forward as untaken v.br_pred := f_in.insn(15); br_offset := resize(signed(f_in.insn(15 downto 2)), 24); diff --git a/decode2.vhdl b/decode2.vhdl index 1f3e7ff..80dfabd 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -221,9 +221,8 @@ architecture behaviour of decode2 is OP_SHR => "010", OP_EXTSWSLI => "010", OP_MUL_L64 => "011", -- muldiv_result - OP_B => "110", -- next_nia - OP_BC => "110", - OP_BCREG => "110", + OP_BCREG => "101", -- ramspr_result + OP_RFID => "101", OP_ADDG6S => "111", -- misc_result OP_ISEL => "111", OP_DARN => "111", diff --git a/decode_types.vhdl b/decode_types.vhdl index cfa4792..5b21fff 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -47,14 +47,16 @@ package decode_types is INSN_andi_dot, -- 10 INSN_andis_dot, INSN_attn, - INSN_b, - INSN_bc, + INSN_brel, + INSN_babs, + INSN_bcrel, + INSN_bcabs, INSN_bcctr, INSN_bclr, INSN_bctar, - INSN_brh, + INSN_brh, -- 20 INSN_brw, - INSN_brd, -- 20 + INSN_brd, INSN_cbcdtd, INSN_cdtbcd, INSN_cmpi, @@ -62,9 +64,9 @@ package decode_types is INSN_cntlzw, INSN_cntlzd, INSN_cnttzw, - INSN_cnttzd, + INSN_cnttzd, -- 30 INSN_crand, - INSN_crandc, -- 30 + INSN_crandc, INSN_creqv, INSN_crnand, INSN_crnor, @@ -72,9 +74,9 @@ package decode_types is INSN_crorc, INSN_crxor, INSN_darn, - INSN_eieio, + INSN_eieio, -- 40 INSN_extsb, - INSN_extsh, -- 40 + INSN_extsh, INSN_extsw, INSN_extswsli, INSN_isync, @@ -82,9 +84,9 @@ package decode_types is INSN_ld, INSN_ldu, INSN_lhau, - INSN_lwa, + INSN_lwa, -- 50 INSN_lwzu, - INSN_mcrf, -- 50 + INSN_mcrf, INSN_mcrxrx, INSN_mfcr, INSN_mfmsr, @@ -92,9 +94,9 @@ package decode_types is INSN_mtcrf, INSN_mtmsr, INSN_mtmsrd, - INSN_mtspr, + INSN_mtspr, -- 60 INSN_mulli, - INSN_neg, -- 60 + 
INSN_neg, INSN_nop, INSN_ori, INSN_oris, @@ -102,9 +104,9 @@ package decode_types is INSN_popcntw, INSN_popcntd, INSN_prtyw, - INSN_prtyd, + INSN_prtyd, -- 70 INSN_rfid, - INSN_rldic, -- 70 + INSN_rldic, INSN_rldicl, INSN_rldicr, INSN_rldimi, @@ -112,9 +114,9 @@ package decode_types is INSN_rlwinm, INSN_rnop, INSN_sc, - INSN_setb, + INSN_setb, -- 80 INSN_slbia, - INSN_sradi, -- 80 + INSN_sradi, INSN_srawi, INSN_stbu, INSN_std, @@ -122,9 +124,9 @@ package decode_types is INSN_sthu, INSN_stwu, INSN_subfic, - INSN_subfme, + INSN_subfme, -- 90 INSN_subfze, - INSN_sync, -- 90 + INSN_sync, INSN_tdi, INSN_tlbsync, INSN_twi, @@ -132,7 +134,7 @@ package decode_types is INSN_xori, INSN_xoris, -- pad to 104 - INSN_061, INSN_062, INSN_063, INSN_064, INSN_065, INSN_066, INSN_067, + INSN_063, INSN_064, INSN_065, INSN_066, INSN_067, -- Non-prefixed instructions that have a MLS:D prefixed form and -- their corresponding prefixed instructions. @@ -497,8 +499,10 @@ package body decode_types is when INSN_andi_dot => return "011100"; when INSN_andis_dot => return "011101"; when INSN_attn => return "000000"; - when INSN_b => return "010010"; - when INSN_bc => return "010000"; + when INSN_brel => return "010010"; + when INSN_babs => return "010010"; + when INSN_bcrel => return "010000"; + when INSN_bcabs => return "010000"; when INSN_brh => return "011111"; when INSN_brw => return "011111"; when INSN_brd => return "011111"; diff --git a/execute1.vhdl b/execute1.vhdl index 7c1ff8f..dacd66c 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -95,6 +95,7 @@ architecture behaviour of execute1 is exception : std_ulogic; trap : std_ulogic; advance_nia : std_ulogic; + redir_to_next : std_ulogic; new_msr : std_ulogic_vector(63 downto 0); take_branch : std_ulogic; direct_branch : std_ulogic; @@ -124,6 +125,9 @@ architecture behaviour of execute1 is res2_sel : std_ulogic_vector(1 downto 0); spr_select : spr_id; pmu_spr_num : std_ulogic_vector(4 downto 0); + redir_to_next : std_ulogic; + advance_nia 
: std_ulogic; + lr_from_next : std_ulogic; mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; @@ -145,6 +149,7 @@ architecture behaviour of execute1 is prev_prefixed => '0', oe => '0', mul_select => "00", res2_sel => "00", spr_select => spr_id_init, pmu_spr_num => 5x"0", + redir_to_next => '0', advance_nia => '0', lr_from_next => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', @@ -510,6 +515,7 @@ begin variable wr_addr : ramspr_index; variable even_wr_enab, odd_wr_enab : std_ulogic; variable even_wr_data, odd_wr_data : std_ulogic_vector(63 downto 0); + variable ramspr_even_data : std_ulogic_vector(63 downto 0); variable doit : std_ulogic; begin -- Read address mux and async RAM reading @@ -533,11 +539,16 @@ begin else wr_addr := ex1.ramspr_wraddr; end if; + if ex1.lr_from_next = '1' then + ramspr_even_data := next_nia; + else + ramspr_even_data := ex1.e.write_data; + end if; if interrupt_in.intr = '1' then even_wr_data := ex2.e.last_nia; odd_wr_data := intr_srr1(ctrl.msr, interrupt_in.srr1); else - even_wr_data := ex1.e.write_data; + even_wr_data := ramspr_even_data; odd_wr_data := ex1.ramspr_odd_data; end if; ramspr_wr_addr <= wr_addr; @@ -550,7 +561,7 @@ begin -- We assume no instruction executes in the cycle immediately following -- an interrupt, so we don't need to bypass interrupt data if ex1.se.ramspr_write_even = '1' and e_in.ramspr_even_rdaddr = ex1.ramspr_wraddr then - ramspr_even <= ex1.e.write_data; + ramspr_even <= ramspr_even_data; else ramspr_even <= even_rd_data; end if; @@ -593,7 +604,6 @@ begin shortmul_result when "011", muldiv_result when "100", ramspr_result when "101", - next_nia when "110", misc_result when others; execute1_0: process(clk) @@ -1016,7 +1026,6 @@ begin v.e.mode_32bit := not ex1.msr(MSR_SF); v.e.instr_tag := e_in.instr_tag; v.e.last_nia := e_in.nia; - 
v.e.br_offset := 64x"4"; v.se.ramspr_write_even := e_in.ramspr_write_even; v.se.ramspr_write_odd := e_in.ramspr_write_odd; @@ -1114,8 +1123,6 @@ begin v.direct_branch := '1'; v.e.br_last := '1'; v.e.br_taken := '1'; - v.e.br_offset := b_in; - v.e.abs_br := insn_aa(e_in.insn); if e_in.br_pred = '0' then -- should never happen v.e.redirect := '1'; @@ -1129,14 +1136,13 @@ begin bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd); - if v.take_branch = '1' then - v.e.br_offset := b_in; - v.e.abs_br := insn_aa(e_in.insn); - end if; -- Mispredicted branches cause a redirect if v.take_branch /= e_in.br_pred then v.e.redirect := '1'; end if; + if v.take_branch = '0' then + v.redir_to_next := '1'; + end if; v.direct_branch := '1'; v.e.br_last := '1'; v.e.br_taken := v.take_branch; @@ -1150,10 +1156,6 @@ begin bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd); - if v.take_branch = '1' then - v.e.br_offset := ramspr_result; - v.e.abs_br := '1'; - end if; -- Indirect branches are never predicted taken v.e.redirect := v.take_branch; v.e.br_taken := v.take_branch; @@ -1177,8 +1179,6 @@ begin v.new_msr(MSR_DR) := '1'; end if; v.se.write_msr := '1'; - v.e.br_offset := ramspr_result; - v.e.abs_br := '1'; v.e.redirect := '1'; v.se.write_cfar := '1'; if HAS_FPU then @@ -1292,6 +1292,7 @@ begin when OP_ISYNC => v.e.redirect := '1'; + v.redir_to_next := '1'; when OP_ICBI => v.se.icache_inval := '1'; @@ -1406,6 +1407,7 @@ begin v.mul_select := e_in.sub_select(1 downto 0); v.se := side_effect_init; v.ramspr_wraddr := e_in.ramspr_wraddr; + v.lr_from_next := e_in.lr; v.ramspr_odd_data := actions.ramspr_odd_data; end if; @@ -1423,9 +1425,6 @@ begin irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); - -- Next insn adder used in a couple of places - next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); - -- rotator control signals right_shift <= '1' 
when e_in.insn_type = OP_SHR else '0'; rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; @@ -1507,10 +1506,9 @@ begin x_to_divider.valid <= actions.start_div; v.div_in_progress := actions.start_div; v.br_mispredict := v.e.redirect and actions.direct_branch; + v.advance_nia := actions.advance_nia; + v.redir_to_next := actions.redir_to_next; exception := actions.trap; - if actions.advance_nia = '1' then - v.e.last_nia := next_nia; - end if; -- Go busy while division is happening because the -- divider is not pipelined. Also go busy while a @@ -1681,6 +1679,9 @@ begin variable sign, zero : std_ulogic; variable rcnz_hi, rcnz_lo : std_ulogic; begin + -- Next insn adder used in a couple of places + next_nia <= std_ulogic_vector(unsigned(ex1.e.last_nia) + 4); + v := ex2; if stage2_stall = '0' then v.e := ex1.e; @@ -1688,6 +1689,9 @@ begin v.ext_interrupt := ex1.ext_interrupt; v.taken_branch_event := ex1.taken_branch_event; v.br_mispredict := ex1.br_mispredict; + if ex1.advance_nia = '1' then + v.e.last_nia := next_nia; + end if; end if; if ex1.se.mult_32s = '1' and ex1.oe = '1' then @@ -1748,10 +1752,12 @@ begin else sprres := pmu_to_x.spr_val; end if; - if ex1.res2_sel(1) = '0' then - ex_result := rcresult; - else + if ex1.res2_sel(1) = '1' then ex_result := sprres; + elsif ex1.redir_to_next = '1' then + ex_result := next_nia; + else + ex_result := rcresult; end if; cr_res := ex1.e.write_cr_data; diff --git a/predecode.vhdl b/predecode.vhdl index 0ab7427..d3ca015 100644 --- a/predecode.vhdl +++ b/predecode.vhdl @@ -38,8 +38,38 @@ architecture behaviour of predecoder is 2#011100_00000# to 2#011100_11111# => INSN_andi_dot, 2#011101_00000# to 2#011101_11111# => INSN_andis_dot, 2#000000_00000# => INSN_attn, - 2#010010_00000# to 2#010010_11111# => INSN_b, - 2#010000_00000# to 2#010000_11111# => INSN_bc, + 2#010010_00000# to 2#010010_00001# => INSN_brel, + 2#010010_00010# to 2#010010_00011# => INSN_babs, + 2#010010_00100# to 
2#010010_00101# => INSN_brel, + 2#010010_00110# to 2#010010_00111# => INSN_babs, + 2#010010_01000# to 2#010010_01001# => INSN_brel, + 2#010010_01010# to 2#010010_01011# => INSN_babs, + 2#010010_01100# to 2#010010_01101# => INSN_brel, + 2#010010_01110# to 2#010010_01111# => INSN_babs, + 2#010010_10000# to 2#010010_10001# => INSN_brel, + 2#010010_10010# to 2#010010_10011# => INSN_babs, + 2#010010_10100# to 2#010010_10101# => INSN_brel, + 2#010010_10110# to 2#010010_10111# => INSN_babs, + 2#010010_11000# to 2#010010_11001# => INSN_brel, + 2#010010_11010# to 2#010010_11011# => INSN_babs, + 2#010010_11100# to 2#010010_11101# => INSN_brel, + 2#010010_11110# to 2#010010_11111# => INSN_babs, + 2#010000_00000# to 2#010000_00001# => INSN_bcrel, + 2#010000_00010# to 2#010000_00011# => INSN_bcabs, + 2#010000_00100# to 2#010000_00101# => INSN_bcrel, + 2#010000_00110# to 2#010000_00111# => INSN_bcabs, + 2#010000_01000# to 2#010000_01001# => INSN_bcrel, + 2#010000_01010# to 2#010000_01011# => INSN_bcabs, + 2#010000_01100# to 2#010000_01101# => INSN_bcrel, + 2#010000_01110# to 2#010000_01111# => INSN_bcabs, + 2#010000_10000# to 2#010000_10001# => INSN_bcrel, + 2#010000_10010# to 2#010000_10011# => INSN_bcabs, + 2#010000_10100# to 2#010000_10101# => INSN_bcrel, + 2#010000_10110# to 2#010000_10111# => INSN_bcabs, + 2#010000_11000# to 2#010000_11001# => INSN_bcrel, + 2#010000_11010# to 2#010000_11011# => INSN_bcabs, + 2#010000_11100# to 2#010000_11101# => INSN_bcrel, + 2#010000_11110# to 2#010000_11111# => INSN_bcabs, 2#001011_00000# to 2#001011_11111# => INSN_cmpi, 2#001010_00000# to 2#001010_11111# => INSN_cmpli, 2#100010_00000# to 2#100010_11111# => INSN_lbz, diff --git a/writeback.vhdl b/writeback.vhdl index 7fef5c3..2eb9998 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -174,11 +174,7 @@ begin f.big_endian := '0'; f.mode_32bit := '0'; else - if e_in.abs_br = '1' then - f.redirect_nia := e_in.br_offset; - else - f.redirect_nia := std_ulogic_vector(unsigned(e_in.last_nia) + 
unsigned(e_in.br_offset)); - end if; + f.redirect_nia := e_in.write_data; -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 f.virt_mode := e_in.redir_mode(3); f.priv_mode := e_in.redir_mode(2);