fetch1: Implement a simple branch target cache

This implements a cache in fetch1, where each entry stores the address
of a simple branch instruction (b or bc) and the target of the branch.
When fetching sequentially, if the address being fetched matches the
cache entry, then fetching will be redirected to the branch target.
The cache has 1024 entries and is direct-mapped, i.e. indexed by bits
11..2 of the NIA.

The bus from execute1 now carries information about taken and
not-taken simple branches, which fetch1 uses to update the cache.
The cache entry is updated for both taken and not-taken branches, with
the valid bit being set if the branch was taken and cleared if the
branch was not taken.

If fetching is redirected to the branch target then that goes down the
pipe as a predicted-taken branch, and decode1 does not do any static
branch prediction.  If fetching is not redirected, then the next
instruction goes down the pipe as normal and decode1 does its static
branch prediction.

In order to make timing, the lookup of the cache is pipelined, so on
each cycle the cache entry for the current NIA + 8 is read.  This
means that after a redirect (from decode1 or execute1), only the third
and subsequent sequentially-fetched instructions will be able to be
predicted.

This improves the coremark value on the Arty A7-100 from about 180 to
about 190 (more than 5%).

The BTC is optional.  Builds for the Artix 7 35-T part have it off by
default because the extra ~1420 LUTs it takes mean that the design
doesn't fit on the Arty A7-35 board.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
cache-tlb-parameters-2
Paul Mackerras 3 years ago
parent f7b855dfc3
commit 0fb207be60

@ -155,6 +155,7 @@ package common is
big_endian : std_ulogic;
stop_mark: std_ulogic;
sequential: std_ulogic;
predicted : std_ulogic;
nia: std_ulogic_vector(63 downto 0);
end record;

@ -165,6 +166,7 @@ package common is
nia: std_ulogic_vector(63 downto 0);
insn: std_ulogic_vector(31 downto 0);
big_endian: std_ulogic;
next_predicted: std_ulogic;
end record;

type Decode1ToDecode2Type is record
@ -308,10 +310,14 @@ package common is
big_endian: std_ulogic;
mode_32bit: std_ulogic;
redirect_nia: std_ulogic_vector(63 downto 0);
br_nia : std_ulogic_vector(63 downto 0);
br_last : std_ulogic;
br_taken : std_ulogic;
end record;
constant Execute1ToFetch1Init : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0',
priv_mode => '0', big_endian => '0',
mode_32bit => '0', others => (others => '0'));
mode_32bit => '0', br_taken => '0',
br_last => '0', others => (others => '0'));

type Execute1ToLoadstore1Type is record
valid : std_ulogic;

@ -12,6 +12,7 @@ entity core is
DISABLE_FLATTEN : boolean := false;
EX1_BYPASS : boolean := true;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
LOG_LENGTH : natural := 512
);
@ -187,7 +188,8 @@ begin
fetch1_0: entity work.fetch1
generic map (
RESET_ADDRESS => (others => '0'),
ALT_RESET_ADDRESS => ALT_RESET_ADDRESS
ALT_RESET_ADDRESS => ALT_RESET_ADDRESS,
HAS_BTC => HAS_BTC
)
port map (
clk => clk,
@ -195,6 +197,7 @@ begin
alt_reset_in => alt_reset_d,
stall_in => fetch1_stall_in,
flush_in => fetch1_flush,
inval_btc => ex1_icache_inval or mmu_to_icache.tlbie,
stop_in => dbg_core_stop,
d_in => decode1_to_fetch1,
e_in => execute1_to_fetch1,

@ -727,7 +727,10 @@ begin
bv.br_nia := (others => '0');
end if;
bv.br_offset := br_offset;
bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out;
if f_in.next_predicted = '1' then
v.br_pred := '1';
end if;
bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted;
-- after a clock edge...
br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset);


@ -68,6 +68,8 @@ architecture behaviour of execute1 is
last_nia : std_ulogic_vector(63 downto 0);
redirect : std_ulogic;
abs_br : std_ulogic;
taken_br : std_ulogic;
br_last : std_ulogic;
do_intr : std_ulogic;
vector : integer range 0 to 16#fff#;
br_offset : std_ulogic_vector(63 downto 0);
@ -81,7 +83,7 @@ architecture behaviour of execute1 is
fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
next_lr => (others => '0'), last_nia => (others => '0'),
redirect => '0', abs_br => '0', do_intr => '0', vector => 0,
redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0,
br_offset => (others => '0'), redir_mode => "0000",
others => (others => '0'));

@ -365,6 +367,7 @@ begin
variable trapval : std_ulogic_vector(4 downto 0);
variable illegal : std_ulogic;
variable is_branch : std_ulogic;
variable is_direct_branch : std_ulogic;
variable taken_branch : std_ulogic;
variable abs_branch : std_ulogic;
variable spr_val : std_ulogic_vector(63 downto 0);
@ -377,6 +380,7 @@ begin
sum_with_carry := (others => '0');
newcrf := (others => '0');
is_branch := '0';
is_direct_branch := '0';
taken_branch := '0';
abs_branch := '0';
hold_wr_data := '0';
@ -390,6 +394,8 @@ begin
v.br_offset := (others => '0');
v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) &
not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF);
v.taken_br := '0';
v.br_last := '0';

lv := Execute1ToLoadstore1Init;
fv := Execute1ToFPUInit;
@ -843,6 +849,7 @@ begin
when OP_B =>
is_branch := '1';
taken_branch := '1';
is_direct_branch := '1';
abs_branch := insn_aa(e_in.insn);
if ctrl.msr(MSR_BE) = '1' then
do_trace := '1';
@ -852,6 +859,7 @@ begin
bo := insn_bo(e_in.insn);
bi := insn_bi(e_in.insn);
is_branch := '1';
is_direct_branch := '1';
taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
abs_branch := insn_aa(e_in.insn);
if ctrl.msr(MSR_BE) = '1' then
@ -1093,7 +1101,7 @@ begin
if taken_branch = '1' then
ctrl_tmp.cfar <= e_in.nia;
end if;
if e_in.br_pred = '0' then
if taken_branch = '1' then
v.br_offset := b_in;
v.abs_br := abs_branch;
else
@ -1102,6 +1110,8 @@ begin
if taken_branch /= e_in.br_pred then
v.redirect := '1';
end if;
v.br_last := is_direct_branch;
v.taken_br := taken_branch;
end if;

elsif valid_in = '1' and exception = '0' and illegal = '0' then
@ -1300,6 +1310,9 @@ begin

-- Outputs to fetch1
f.redirect := r.redirect;
f.br_nia := r.last_nia;
f.br_last := r.br_last and not r.do_intr;
f.br_taken := r.taken_br;
if r.do_intr = '1' then
f.redirect_nia := std_ulogic_vector(to_unsigned(r.vector, 64));
f.virt_mode := '0';

@ -8,7 +8,8 @@ use work.common.all;
entity fetch1 is
generic(
RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0');
ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0')
ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0');
HAS_BTC : boolean := true
);
port(
clk : in std_ulogic;
@ -17,6 +18,7 @@ entity fetch1 is
-- Control inputs:
stall_in : in std_ulogic;
flush_in : in std_ulogic;
inval_btc : in std_ulogic;
stop_in : in std_ulogic;
alt_reset_in : in std_ulogic;

@ -37,10 +39,25 @@ end entity fetch1;
architecture behaviour of fetch1 is
type reg_internal_t is record
mode_32bit: std_ulogic;
rd_is_niap4: std_ulogic;
predicted: std_ulogic;
predicted_nia: std_ulogic_vector(63 downto 0);
end record;
signal r, r_next : Fetch1ToIcacheType;
signal r_int, r_next_int : reg_internal_t;
signal advance_nia : std_ulogic;
signal log_nia : std_ulogic_vector(42 downto 0);

constant BTC_ADDR_BITS : integer := 10;
constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS;
constant BTC_TARGET_BITS : integer := 62;
constant BTC_SIZE : integer := 2 ** BTC_ADDR_BITS;
constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS;
type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0);

signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0');
signal btc_rd_valid : std_ulogic := '0';

begin

regs : process(clk)
@ -56,15 +73,70 @@ begin
" R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) &
" S:" & std_ulogic'image(stall_in) &
" T:" & std_ulogic'image(stop_in) &
" nia:" & to_hstring(r_next.nia) &
" SM:" & std_ulogic'image(r_next.stop_mark);
" nia:" & to_hstring(r_next.nia);
end if;
r <= r_next;
r_int <= r_next_int;
if rst = '1' or e_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then
r.virt_mode <= r_next.virt_mode;
r.priv_mode <= r_next.priv_mode;
r.big_endian <= r_next.big_endian;
r_int.mode_32bit <= r_next_int.mode_32bit;
end if;
if advance_nia = '1' then
r.predicted <= r_next.predicted;
r.nia <= r_next.nia;
r_int.predicted <= r_next_int.predicted;
r_int.predicted_nia <= r_next_int.predicted_nia;
r_int.rd_is_niap4 <= r_next.sequential;
end if;
r.sequential <= r_next.sequential and advance_nia;
-- always send the up-to-date stop mark and req
r.stop_mark <= stop_in;
r.req <= not rst;
end if;
end process;
log_out <= log_nia;

btc : if HAS_BTC generate
signal btc_memory : btc_mem_type;
attribute ram_style : string;
attribute ram_style of btc_memory : signal is "block";

signal btc_valids : std_ulogic_vector(BTC_SIZE - 1 downto 0);
attribute ram_style of btc_valids : signal is "distributed";

signal btc_wr : std_ulogic;
signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0);
signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0);
signal btc_wr_v : std_ulogic;
begin
btc_wr_data <= e_in.br_nia(63 downto BTC_ADDR_BITS + 2) &
e_in.redirect_nia(63 downto 2);
btc_wr_addr <= e_in.br_nia(BTC_ADDR_BITS + 1 downto 2);
btc_wr <= e_in.br_last;
btc_wr_v <= e_in.br_taken;

btc_ram : process(clk)
variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0);
begin
if rising_edge(clk) then
raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) +
to_unsigned(2, BTC_ADDR_BITS);
if advance_nia = '1' then
btc_rd_data <= btc_memory(to_integer(raddr));
btc_rd_valid <= btc_valids(to_integer(raddr));
end if;
if btc_wr = '1' then
btc_memory(to_integer(unsigned(btc_wr_addr))) <= btc_wr_data;
end if;
if inval_btc = '1' or rst = '1' then
btc_valids <= (others => '0');
elsif btc_wr = '1' then
btc_valids(to_integer(unsigned(btc_wr_addr))) <= btc_wr_v;
end if;
end if;
end process;
end generate;

comb : process(all)
variable v : Fetch1ToIcacheType;
variable v_int : reg_internal_t;
@ -72,6 +144,8 @@ begin
v := r;
v_int := r_int;
v.sequential := '0';
v.predicted := '0';
v_int.predicted := '0';

if rst = '1' then
if alt_reset_in = '1' then
@ -83,6 +157,7 @@ begin
v.priv_mode := '1';
v.big_endian := '0';
v_int.mode_32bit := '0';
v_int.predicted_nia := (others => '0');
elsif e_in.redirect = '1' then
v.nia := e_in.redirect_nia(63 downto 2) & "00";
if e_in.mode_32bit = '1' then
@ -97,22 +172,26 @@ begin
if r_int.mode_32bit = '1' then
v.nia(63 downto 32) := (others => '0');
end if;
elsif stall_in = '0' then

-- If the last NIA value went down with a stop mark, it didn't get
-- executed, and hence we shouldn't increment NIA.
if r.stop_mark = '0' then
if r_int.mode_32bit = '0' then
v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
else
v.nia := x"00000000" & std_ulogic_vector(unsigned(r.nia(31 downto 0)) + 4);
end if;
v.sequential := '1';
end if;
end if;
elsif r_int.predicted = '1' then
v.nia := r_int.predicted_nia;
v.predicted := '1';
else
v.sequential := '1';
v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
if r_int.mode_32bit = '1' then
v.nia(63 downto 32) := x"00000000";
end if;
if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and
btc_rd_data(BTC_WIDTH - 1 downto BTC_TARGET_BITS)
= v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then
v_int.predicted := '1';
end if;
end if;
v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00";

v.req := not rst and not stop_in;
v.stop_mark := stop_in;
-- If the last NIA value went down with a stop mark, it didn't get
-- executed, and hence we shouldn't increment NIA.
advance_nia <= rst or e_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in);

r_next <= v;
r_next_int <= v_int;

@ -15,6 +15,7 @@ entity toplevel is
RESET_LOW : boolean := true;
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
USE_LITEDRAM : boolean := false;
NO_BRAM : boolean := false;
DISABLE_FLATTEN_CORE : boolean := false;
@ -170,6 +171,7 @@ begin
SIM => false,
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
HAS_DRAM => USE_LITEDRAM,
DRAM_SIZE => 256 * 1024 * 1024,
DRAM_INIT_SIZE => PAYLOAD_SIZE,

@ -12,6 +12,7 @@ entity toplevel is
CLK_INPUT : positive := 100000000;
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := false;
LOG_LENGTH : natural := 512;
DISABLE_FLATTEN_CORE : boolean := false;
UART_IS_16550 : boolean := true
@ -71,6 +72,7 @@ begin
SIM => false,
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
LOG_LENGTH => LOG_LENGTH,
DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
UART0_IS_16550 => UART_IS_16550

@ -15,6 +15,7 @@ entity toplevel is
RESET_LOW : boolean := true;
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
USE_LITEDRAM : boolean := false;
NO_BRAM : boolean := false;
DISABLE_FLATTEN_CORE : boolean := false;
@ -122,6 +123,7 @@ begin
SIM => false,
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
HAS_DRAM => USE_LITEDRAM,
DRAM_SIZE => 512 * 1024 * 1024,
DRAM_INIT_SIZE => PAYLOAD_SIZE,

@ -565,6 +565,7 @@ begin
i_out.stop_mark <= r.hit_smark;
i_out.fetch_failed <= r.fetch_failed;
i_out.big_endian <= r.big_endian;
i_out.next_predicted <= i_in.predicted;

-- Stall fetch1 if we have a miss on cache or TLB or a protection fault
stall_out <= not (is_hit and access_ok);

@ -134,6 +134,7 @@ targets:
- log_length=2048
- uart_is_16550
- has_fpu
- has_btc
tools:
vivado: {part : xc7a100tcsg324-1}
toplevel : toplevel
@ -218,6 +219,7 @@ targets:
- log_length=2048
- uart_is_16550
- has_fpu
- has_btc
tools:
vivado: {part : xc7a200tsbg484-1}
toplevel : toplevel
@ -235,6 +237,7 @@ targets:
- log_length=2048
- uart_is_16550
- has_fpu
- has_btc
generate: [litedram_nexys_video]
tools:
vivado: {part : xc7a200tsbg484-1}
@ -254,6 +257,7 @@ targets:
- uart_is_16550
- has_uart1
- has_fpu=false
- has_btc=false
tools:
vivado: {part : xc7a35ticsg324-1L}
toplevel : toplevel
@ -273,6 +277,7 @@ targets:
- uart_is_16550
- has_uart1
- has_fpu=false
- has_btc=false
generate: [litedram_arty, liteeth_arty]
tools:
vivado: {part : xc7a35ticsg324-1L}
@ -292,6 +297,7 @@ targets:
- uart_is_16550
- has_uart1
- has_fpu
- has_btc
tools:
vivado: {part : xc7a100ticsg324-1L}
toplevel : toplevel
@ -311,6 +317,7 @@ targets:
- uart_is_16550
- has_uart1
- has_fpu
- has_btc
generate: [litedram_arty, liteeth_arty]
tools:
vivado: {part : xc7a100ticsg324-1L}
@ -329,6 +336,7 @@ targets:
- log_length=512
- uart_is_16550
- has_fpu=false
- has_btc=false
tools:
vivado: {part : xc7a35tcpg236-1}
toplevel : toplevel
@ -395,6 +403,12 @@ parameters:
paramtype : generic
default : true

has_btc:
datatype : bool
description : Include a branch target cache in the core
paramtype : generic
default : true

disable_flatten_core:
datatype : bool
description : Prevent Vivado from flattening the main core components

@ -53,6 +53,7 @@ entity soc is
CLK_FREQ : positive;
SIM : boolean;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
DISABLE_FLATTEN_CORE : boolean := false;
HAS_DRAM : boolean := false;
DRAM_SIZE : integer := 0;
@ -255,6 +256,7 @@ begin
generic map(
SIM => SIM,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'),
LOG_LENGTH => LOG_LENGTH

Loading…
Cancel
Save