Add an optional branch target cache (BTC) to fetch1, enabled by the HAS_BTC
generic, which is plumbed through core, soc, the FPGA toplevels and
microwatt.core.  execute1 reports each direct branch to fetch1 through the
new br_nia / br_last / br_taken fields, and fetch1 tells decode1 (via icache,
next_predicted) when the following fetch address was already predicted.

Note: fetch1 registers req as "not rst and not stop_in", which is the same
gating the removed combinational assignment to v.req used.

diff --git a/common.vhdl b/common.vhdl
index d085199..7bf8277 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -155,6 +155,7 @@ package common is
         big_endian : std_ulogic;
         stop_mark: std_ulogic;
         sequential: std_ulogic;
+        predicted : std_ulogic;
         nia: std_ulogic_vector(63 downto 0);
     end record;
 
@@ -165,6 +166,7 @@ package common is
         nia: std_ulogic_vector(63 downto 0);
         insn: std_ulogic_vector(31 downto 0);
         big_endian: std_ulogic;
+        next_predicted: std_ulogic;
     end record;
 
     type Decode1ToDecode2Type is record
@@ -308,10 +310,14 @@ package common is
         big_endian: std_ulogic;
         mode_32bit: std_ulogic;
         redirect_nia: std_ulogic_vector(63 downto 0);
+        br_nia : std_ulogic_vector(63 downto 0);
+        br_last : std_ulogic;
+        br_taken : std_ulogic;
     end record;
     constant Execute1ToFetch1Init : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0',
                                                              priv_mode => '0', big_endian => '0',
-                                                             mode_32bit => '0', others => (others => '0'));
+                                                             mode_32bit => '0', br_taken => '0',
+                                                             br_last => '0', others => (others => '0'));
 
     type Execute1ToLoadstore1Type is record
         valid : std_ulogic;
diff --git a/core.vhdl b/core.vhdl
index bc32a8c..3948b86 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -12,6 +12,7 @@ entity core is
         DISABLE_FLATTEN : boolean := false;
         EX1_BYPASS : boolean := true;
         HAS_FPU : boolean := true;
+        HAS_BTC : boolean := true;
         ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
         LOG_LENGTH : natural := 512
         );
@@ -187,7 +188,8 @@ begin
     fetch1_0: entity work.fetch1
         generic map (
             RESET_ADDRESS => (others => '0'),
-            ALT_RESET_ADDRESS => ALT_RESET_ADDRESS
+            ALT_RESET_ADDRESS => ALT_RESET_ADDRESS,
+            HAS_BTC => HAS_BTC
             )
         port map (
             clk => clk,
@@ -195,6 +197,7 @@ begin
             alt_reset_in => alt_reset_d,
             stall_in => fetch1_stall_in,
             flush_in => fetch1_flush,
+            inval_btc => ex1_icache_inval or mmu_to_icache.tlbie,
             stop_in => dbg_core_stop,
             d_in => decode1_to_fetch1,
             e_in => execute1_to_fetch1,
diff --git a/decode1.vhdl b/decode1.vhdl
index 2edacd3..ebe59be 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -727,7 +727,10 @@ begin
             bv.br_nia := (others => '0');
         end if;
         bv.br_offset := br_offset;
-        bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out;
+        if f_in.next_predicted = '1' then
+            v.br_pred := '1';
+        end if;
+        bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted;
 
         -- after a clock edge...
         br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset);
diff --git a/execute1.vhdl b/execute1.vhdl
index 3385455..25b1dc7 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -68,6 +68,8 @@ architecture behaviour of execute1 is
         last_nia : std_ulogic_vector(63 downto 0);
         redirect : std_ulogic;
         abs_br : std_ulogic;
+        taken_br : std_ulogic;
+        br_last : std_ulogic;
         do_intr : std_ulogic;
         vector : integer range 0 to 16#fff#;
         br_offset : std_ulogic_vector(63 downto 0);
@@ -81,7 +83,7 @@ architecture behaviour of execute1 is
          fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
          mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
          next_lr => (others => '0'), last_nia => (others => '0'),
-         redirect => '0', abs_br => '0', do_intr => '0', vector => 0,
+         redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0,
          br_offset => (others => '0'), redir_mode => "0000",
          others => (others => '0'));
 
@@ -365,6 +367,7 @@ begin
         variable trapval : std_ulogic_vector(4 downto 0);
         variable illegal : std_ulogic;
         variable is_branch : std_ulogic;
+        variable is_direct_branch : std_ulogic;
         variable taken_branch : std_ulogic;
         variable abs_branch : std_ulogic;
         variable spr_val : std_ulogic_vector(63 downto 0);
@@ -377,6 +380,7 @@ begin
         sum_with_carry := (others => '0');
         newcrf := (others => '0');
         is_branch := '0';
+        is_direct_branch := '0';
         taken_branch := '0';
         abs_branch := '0';
         hold_wr_data := '0';
@@ -390,6 +394,8 @@ begin
         v.br_offset := (others => '0');
         v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) &
                         not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF);
+        v.taken_br := '0';
+        v.br_last := '0';
 
         lv := Execute1ToLoadstore1Init;
         fv := Execute1ToFPUInit;
@@ -843,6 +849,7 @@ begin
                 when OP_B =>
                     is_branch := '1';
                     taken_branch := '1';
+                    is_direct_branch := '1';
                     abs_branch := insn_aa(e_in.insn);
                     if ctrl.msr(MSR_BE) = '1' then
                         do_trace := '1';
@@ -852,6 +859,7 @@ begin
                     bo := insn_bo(e_in.insn);
                     bi := insn_bi(e_in.insn);
                     is_branch := '1';
+                    is_direct_branch := '1';
                     taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
                     abs_branch := insn_aa(e_in.insn);
                     if ctrl.msr(MSR_BE) = '1' then
@@ -1093,7 +1101,7 @@ begin
                 if taken_branch = '1' then
                     ctrl_tmp.cfar <= e_in.nia;
                 end if;
-                if e_in.br_pred = '0' then
+                if taken_branch = '1' then
                     v.br_offset := b_in;
                     v.abs_br := abs_branch;
                 else
@@ -1102,6 +1110,8 @@ begin
                 if taken_branch /= e_in.br_pred then
                     v.redirect := '1';
                 end if;
+                v.br_last := is_direct_branch;
+                v.taken_br := taken_branch;
             end if;
 
         elsif valid_in = '1' and exception = '0' and illegal = '0' then
@@ -1300,6 +1310,9 @@ begin
 
         -- Outputs to fetch1
         f.redirect := r.redirect;
+        f.br_nia := r.last_nia;
+        f.br_last := r.br_last and not r.do_intr;
+        f.br_taken := r.taken_br;
         if r.do_intr = '1' then
             f.redirect_nia := std_ulogic_vector(to_unsigned(r.vector, 64));
             f.virt_mode := '0';
diff --git a/fetch1.vhdl b/fetch1.vhdl
index 3c9d946..8ca7e57 100644
--- a/fetch1.vhdl
+++ b/fetch1.vhdl
@@ -8,7 +8,8 @@ use work.common.all;
 entity fetch1 is
     generic(
         RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0');
-        ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0')
+        ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0');
+        HAS_BTC : boolean := true
         );
     port(
         clk : in std_ulogic;
@@ -17,6 +18,7 @@ entity fetch1 is
         -- Control inputs:
         stall_in : in std_ulogic;
         flush_in : in std_ulogic;
+        inval_btc : in std_ulogic;
         stop_in : in std_ulogic;
         alt_reset_in : in std_ulogic;
 
@@ -37,10 +39,25 @@ end entity fetch1;
 architecture behaviour of fetch1 is
     type reg_internal_t is record
         mode_32bit: std_ulogic;
+        rd_is_niap4: std_ulogic;
+        predicted: std_ulogic;
+        predicted_nia: std_ulogic_vector(63 downto 0);
     end record;
     signal r, r_next : Fetch1ToIcacheType;
     signal r_int, r_next_int : reg_internal_t;
+    signal advance_nia : std_ulogic;
     signal log_nia : std_ulogic_vector(42 downto 0);
+
+    constant BTC_ADDR_BITS : integer := 10;
+    constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS;
+    constant BTC_TARGET_BITS : integer := 62;
+    constant BTC_SIZE : integer := 2 ** BTC_ADDR_BITS;
+    constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS;
+    type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0);
+
+    signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0');
+    signal btc_rd_valid : std_ulogic := '0';
+
 begin
 
     regs : process(clk)
@@ -56,15 +73,70 @@ begin
                     " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) &
                     " S:" & std_ulogic'image(stall_in) &
                     " T:" & std_ulogic'image(stop_in) &
-                    " nia:" & to_hstring(r_next.nia) &
-                    " SM:" & std_ulogic'image(r_next.stop_mark);
+                    " nia:" & to_hstring(r_next.nia);
             end if;
-            r <= r_next;
-            r_int <= r_next_int;
+            if rst = '1' or e_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then
+                r.virt_mode <= r_next.virt_mode;
+                r.priv_mode <= r_next.priv_mode;
+                r.big_endian <= r_next.big_endian;
+                r_int.mode_32bit <= r_next_int.mode_32bit;
+            end if;
+            if advance_nia = '1' then
+                r.predicted <= r_next.predicted;
+                r.nia <= r_next.nia;
+                r_int.predicted <= r_next_int.predicted;
+                r_int.predicted_nia <= r_next_int.predicted_nia;
+                r_int.rd_is_niap4 <= r_next.sequential;
+            end if;
+            r.sequential <= r_next.sequential and advance_nia;
+            -- always send the up-to-date stop mark and req (req is held low during reset and while stop_in is set)
+            r.stop_mark <= stop_in;
+            r.req <= not rst and not stop_in;
         end if;
     end process;
     log_out <= log_nia;
 
+    btc : if HAS_BTC generate
+        signal btc_memory : btc_mem_type;
+        attribute ram_style : string;
+        attribute ram_style of btc_memory : signal is "block";
+
+        signal btc_valids : std_ulogic_vector(BTC_SIZE - 1 downto 0);
+        attribute ram_style of btc_valids : signal is "distributed";
+
+        signal btc_wr : std_ulogic;
+        signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0);
+        signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0);
+        signal btc_wr_v : std_ulogic;
+    begin
+        btc_wr_data <= e_in.br_nia(63 downto BTC_ADDR_BITS + 2) &
+                       e_in.redirect_nia(63 downto 2);
+        btc_wr_addr <= e_in.br_nia(BTC_ADDR_BITS + 1 downto 2);
+        btc_wr <= e_in.br_last;
+        btc_wr_v <= e_in.br_taken;
+
+        btc_ram : process(clk)
+            variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0);
+        begin
+            if rising_edge(clk) then
+                raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) +
+                         to_unsigned(2, BTC_ADDR_BITS);
+                if advance_nia = '1' then
+                    btc_rd_data <= btc_memory(to_integer(raddr));
+                    btc_rd_valid <= btc_valids(to_integer(raddr));
+                end if;
+                if btc_wr = '1' then
+                    btc_memory(to_integer(unsigned(btc_wr_addr))) <= btc_wr_data;
+                end if;
+                if inval_btc = '1' or rst = '1' then
+                    btc_valids <= (others => '0');
+                elsif btc_wr = '1' then
+                    btc_valids(to_integer(unsigned(btc_wr_addr))) <= btc_wr_v;
+                end if;
+            end if;
+        end process;
+    end generate;
+
     comb : process(all)
         variable v : Fetch1ToIcacheType;
         variable v_int : reg_internal_t;
@@ -72,6 +144,8 @@ begin
         v := r;
         v_int := r_int;
         v.sequential := '0';
+        v.predicted := '0';
+        v_int.predicted := '0';
 
         if rst = '1' then
             if alt_reset_in = '1' then
@@ -83,6 +157,7 @@ begin
             v.priv_mode := '1';
             v.big_endian := '0';
             v_int.mode_32bit := '0';
+            v_int.predicted_nia := (others => '0');
         elsif e_in.redirect = '1' then
             v.nia := e_in.redirect_nia(63 downto 2) & "00";
             if e_in.mode_32bit = '1' then
@@ -97,22 +172,26 @@ begin
             if r_int.mode_32bit = '1' then
                 v.nia(63 downto 32) := (others => '0');
             end if;
-        elsif stall_in = '0' then
-
-            -- If the last NIA value went down with a stop mark, it didn't get
-            -- executed, and hence we shouldn't increment NIA.
-            if r.stop_mark = '0' then
-                if r_int.mode_32bit = '0' then
-                    v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
-                else
-                    v.nia := x"00000000" & std_ulogic_vector(unsigned(r.nia(31 downto 0)) + 4);
-                end if;
-                v.sequential := '1';
-            end if;
-        end if;
+        elsif r_int.predicted = '1' then
+            v.nia := r_int.predicted_nia;
+            v.predicted := '1';
+        else
+            v.sequential := '1';
+            v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
+            if r_int.mode_32bit = '1' then
+                v.nia(63 downto 32) := x"00000000";
+            end if;
+            if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and
+                btc_rd_data(BTC_WIDTH - 1 downto BTC_TARGET_BITS)
+                = v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then
+                v_int.predicted := '1';
+            end if;
+        end if;
+        v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00";
 
-        v.req := not rst and not stop_in;
-        v.stop_mark := stop_in;
+        -- If the last NIA value went down with a stop mark, it didn't get
+        -- executed, and hence we shouldn't increment NIA.
+        advance_nia <= rst or e_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in);
 
         r_next <= v;
         r_next_int <= v_int;
diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl
index 8a3dc7a..68d1e89 100644
--- a/fpga/top-arty.vhdl
+++ b/fpga/top-arty.vhdl
@@ -15,6 +15,7 @@ entity toplevel is
         RESET_LOW : boolean := true;
         CLK_FREQUENCY : positive := 100000000;
         HAS_FPU : boolean := true;
+        HAS_BTC : boolean := true;
         USE_LITEDRAM : boolean := false;
         NO_BRAM : boolean := false;
         DISABLE_FLATTEN_CORE : boolean := false;
@@ -170,6 +171,7 @@ begin
             SIM => false,
             CLK_FREQ => CLK_FREQUENCY,
             HAS_FPU => HAS_FPU,
+            HAS_BTC => HAS_BTC,
             HAS_DRAM => USE_LITEDRAM,
             DRAM_SIZE => 256 * 1024 * 1024,
             DRAM_INIT_SIZE => PAYLOAD_SIZE,
diff --git a/fpga/top-generic.vhdl b/fpga/top-generic.vhdl
index d5219ff..8bff5bb 100644
--- a/fpga/top-generic.vhdl
+++ b/fpga/top-generic.vhdl
@@ -12,6 +12,7 @@ entity toplevel is
         CLK_INPUT : positive := 100000000;
         CLK_FREQUENCY : positive := 100000000;
         HAS_FPU : boolean := true;
+        HAS_BTC : boolean := false;
         LOG_LENGTH : natural := 512;
         DISABLE_FLATTEN_CORE : boolean := false;
         UART_IS_16550 : boolean := true
@@ -71,6 +72,7 @@ begin
             SIM => false,
             CLK_FREQ => CLK_FREQUENCY,
             HAS_FPU => HAS_FPU,
+            HAS_BTC => HAS_BTC,
             LOG_LENGTH => LOG_LENGTH,
             DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
             UART0_IS_16550 => UART_IS_16550
diff --git a/fpga/top-nexys-video.vhdl b/fpga/top-nexys-video.vhdl
index 1942b10..86bdd11 100644
--- a/fpga/top-nexys-video.vhdl
+++ b/fpga/top-nexys-video.vhdl
@@ -15,6 +15,7 @@ entity toplevel is
         RESET_LOW : boolean := true;
         CLK_FREQUENCY : positive := 100000000;
         HAS_FPU : boolean := true;
+        HAS_BTC : boolean := true;
         USE_LITEDRAM : boolean := false;
         NO_BRAM : boolean := false;
         DISABLE_FLATTEN_CORE : boolean := false;
@@ -122,6 +123,7 @@ begin
             SIM => false,
             CLK_FREQ => CLK_FREQUENCY,
             HAS_FPU => HAS_FPU,
+            HAS_BTC => HAS_BTC,
             HAS_DRAM => USE_LITEDRAM,
             DRAM_SIZE => 512 * 1024 * 1024,
             DRAM_INIT_SIZE => PAYLOAD_SIZE,
diff --git a/icache.vhdl b/icache.vhdl
index 37a230d..a658783 100644
--- a/icache.vhdl
+++ b/icache.vhdl
@@ -565,6 +565,7 @@ begin
         i_out.stop_mark <= r.hit_smark;
         i_out.fetch_failed <= r.fetch_failed;
         i_out.big_endian <= r.big_endian;
+        i_out.next_predicted <= i_in.predicted;
 
         -- Stall fetch1 if we have a miss on cache or TLB or a protection fault
         stall_out <= not (is_hit and access_ok);
diff --git a/microwatt.core b/microwatt.core
index 7f2068d..41b6230 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -134,6 +134,7 @@ targets:
       - log_length=2048
       - uart_is_16550
       - has_fpu
+      - has_btc
     tools:
       vivado: {part : xc7a100tcsg324-1}
     toplevel : toplevel
@@ -218,6 +219,7 @@ targets:
       - log_length=2048
       - uart_is_16550
       - has_fpu
+      - has_btc
     tools:
       vivado: {part : xc7a200tsbg484-1}
     toplevel : toplevel
@@ -235,6 +237,7 @@ targets:
       - log_length=2048
       - uart_is_16550
       - has_fpu
+      - has_btc
     generate: [litedram_nexys_video]
     tools:
       vivado: {part : xc7a200tsbg484-1}
@@ -254,6 +257,7 @@ targets:
       - uart_is_16550
       - has_uart1
       - has_fpu=false
+      - has_btc=false
     tools:
       vivado: {part : xc7a35ticsg324-1L}
     toplevel : toplevel
@@ -273,6 +277,7 @@ targets:
       - uart_is_16550
       - has_uart1
       - has_fpu=false
+      - has_btc=false
     generate: [litedram_arty, liteeth_arty]
     tools:
       vivado: {part : xc7a35ticsg324-1L}
@@ -292,6 +297,7 @@ targets:
       - uart_is_16550
       - has_uart1
       - has_fpu
+      - has_btc
     tools:
       vivado: {part : xc7a100ticsg324-1L}
     toplevel : toplevel
@@ -311,6 +317,7 @@ targets:
       - uart_is_16550
       - has_uart1
       - has_fpu
+      - has_btc
     generate: [litedram_arty, liteeth_arty]
     tools:
       vivado: {part : xc7a100ticsg324-1L}
@@ -329,6 +336,7 @@ targets:
       - log_length=512
       - uart_is_16550
       - has_fpu=false
+      - has_btc=false
     tools:
       vivado: {part : xc7a35tcpg236-1}
     toplevel : toplevel
@@ -395,6 +403,12 @@ parameters:
     paramtype : generic
     default : true
 
+  has_btc:
+    datatype : bool
+    description : Include a branch target cache in the core
+    paramtype : generic
+    default : true
+
   disable_flatten_core:
     datatype : bool
     description : Prevent Vivado from flattening the main core components
diff --git a/soc.vhdl b/soc.vhdl
index e4a7895..77f229e 100644
--- a/soc.vhdl
+++ b/soc.vhdl
@@ -53,6 +53,7 @@ entity soc is
         CLK_FREQ : positive;
         SIM : boolean;
         HAS_FPU : boolean := true;
+        HAS_BTC : boolean := true;
         DISABLE_FLATTEN_CORE : boolean := false;
         HAS_DRAM : boolean := false;
         DRAM_SIZE : integer := 0;
@@ -255,6 +256,7 @@ begin
         generic map(
             SIM => SIM,
             HAS_FPU => HAS_FPU,
+            HAS_BTC => HAS_BTC,
             DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
             ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'),
             LOG_LENGTH => LOG_LENGTH