From cb1e3f6d705c6b1808e96ef6e5873c18e9d33a36 Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Wed, 16 Dec 2020 19:32:07 +1100
Subject: [PATCH] decode1: Take an extra cycle for predicted branch redirects

This does the addition of NIA plus the branch offset from the
instruction after a clock edge, in order to ease timing, as the path
from the icache RAM through the adder in decode1 to the NIA register
in fetch1 was showing up as a critical path.

This adds one extra cycle of latency when redirecting fetch because
of a predicted-taken branch.

Signed-off-by: Paul Mackerras
---
 decode1.vhdl | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 086083e..2edacd3 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -31,6 +31,7 @@ end entity decode1;
 architecture behaviour of decode1 is
     signal r, rin : Decode1ToDecode2Type;
     signal s : Decode1ToDecode2Type;
+    signal f, fin : Decode1ToFetch1Type;
 
     constant illegal_inst : decode_rom_t :=
         (NONE, NONE, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);
@@ -47,6 +48,14 @@ architecture behaviour of decode1 is
     signal ri, ri_in : reg_internal_t;
     signal si : reg_internal_t;
 
+    type br_predictor_t is record
+        br_nia : std_ulogic_vector(61 downto 0);
+        br_offset : signed(23 downto 0);
+        predict : std_ulogic;
+    end record;
+
+    signal br, br_in : br_predictor_t;
+
     subtype major_opcode_t is unsigned(5 downto 0);
     type major_rom_array_t is array(0 to 63) of decode_rom_t;
     type minor_valid_array_t is array(0 to 1023) of std_ulogic;
@@ -537,6 +546,13 @@ begin
                     ri <= ri_in;
                 end if;
             end if;
+            if rst = '1' then
+                br.br_nia <= (others => '0');
+                br.br_offset <= (others => '0');
+                br.predict <= '0';
+            else
+                br <= br_in;
+            end if;
         end if;
     end process;
     busy_out <= s.valid;
@@ -544,14 +560,13 @@ begin
     decode1_1: process(all)
         variable v : Decode1ToDecode2Type;
         variable vi : reg_internal_t;
-        variable f : Decode1ToFetch1Type;
         variable majorop : major_opcode_t;
         variable minor4op : std_ulogic_vector(10 downto 0);
         variable op_19_bits: std_ulogic_vector(2 downto 0);
         variable sprn : spr_num_t;
-        variable br_nia : std_ulogic_vector(61 downto 0);
         variable br_target : std_ulogic_vector(61 downto 0);
         variable br_offset : signed(23 downto 0);
+        variable bv : br_predictor_t;
     begin
         v := Decode1ToDecode2Init;
         vi := reg_internal_t_init;
@@ -707,17 +722,19 @@ begin
         -- Branch predictor
         -- Note bclr, bcctr and bctar are predicted not taken as we have no
         -- count cache or link stack.
-        br_nia := f_in.nia(63 downto 2);
+        bv.br_nia := f_in.nia(63 downto 2);
         if f_in.insn(1) = '1' then
-            br_nia := (others => '0');
+            bv.br_nia := (others => '0');
         end if;
-        br_target := std_ulogic_vector(signed(br_nia) + br_offset);
-        f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid;
-        f.redirect_nia := br_target & "00";
+        bv.br_offset := br_offset;
+        bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out;
+        -- after a clock edge...
+        br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset);
 
         -- Update registers
         rin <= v;
         ri_in <= vi;
+        br_in <= bv;
 
         -- Update outputs
         d_out <= r;
@@ -729,8 +746,9 @@ begin
         if ri.force_single = '1' then
             d_out.decode.sgl_pipe <= '1';
         end if;
-        f_out <= f;
-        flush_out <= f.redirect;
+        f_out.redirect <= br.predict;
+        f_out.redirect_nia <= br_target & "00";
+        flush_out <= bv.predict or br.predict;
     end process;
 
     d1_log: if LOG_LENGTH > 0 generate