From b5a7dbb78dff640ee18b6662ea007a946a4ebb09 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 10 May 2020 18:18:03 +1000 Subject: [PATCH] core: Remove fetch2 pipeline stage The fetch2 stage existed primarily to provide a stash buffer for the output of icache when a stall occurred. However, we can get the same effect -- of having the input to decode1 stay unchanged on a stall cycle -- by using the read enable of the BRAMs in icache, and by adding logic to keep the outputs unchanged on a clock cycle when stall_in = 1. This reduces branch and interrupt latency by one cycle. Signed-off-by: Paul Mackerras --- Makefile | 2 +- common.vhdl | 12 +---- core.vhdl | 25 +++------- decode1.vhdl | 3 +- fetch2.vhdl | 123 ------------------------------------------------- icache.vhdl | 52 ++++++++++++--------- icache_tb.vhdl | 3 +- microwatt.core | 1 - 8 files changed, 41 insertions(+), 180 deletions(-) delete mode 100644 fetch2.vhdl diff --git a/Makefile b/Makefile index 692704e..1e4b558 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ all = core_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \ all: $(all) core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ - fetch2.vhdl utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ + utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \ cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ diff --git a/common.vhdl b/common.vhdl index a6b3f95..7236a56 100644 --- a/common.vhdl +++ b/common.vhdl @@ -96,7 +96,7 @@ package common is nia: std_ulogic_vector(63 downto 0); end record; - type IcacheToFetch2Type is record + type IcacheToDecode1Type is record valid: std_ulogic; stop_mark: std_ulogic; fetch_failed: std_ulogic; @@ -104,16 +104,6 @@ package common is insn: std_ulogic_vector(31 downto 0); end record; - type Fetch2ToDecode1Type is record - valid: std_ulogic; - stop_mark 
: std_ulogic; - fetch_failed: std_ulogic; - nia: std_ulogic_vector(63 downto 0); - insn: std_ulogic_vector(31 downto 0); - end record; - constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', fetch_failed => '0', - nia => (others => '0'), insn => (others => '0')); - type Decode1ToDecode2Type is record valid: std_ulogic; stop_mark : std_ulogic; diff --git a/core.vhdl b/core.vhdl index da9853f..5517959 100644 --- a/core.vhdl +++ b/core.vhdl @@ -41,12 +41,9 @@ entity core is end core; architecture behave of core is - -- fetch signals - signal fetch2_to_decode1: Fetch2ToDecode1Type; - -- icache signals signal fetch1_to_icache : Fetch1ToIcacheType; - signal icache_to_fetch2 : IcacheToFetch2Type; + signal icache_to_decode1 : IcacheToDecode1Type; signal mmu_to_icache : MmuToIcacheType; -- decode signals @@ -83,7 +80,7 @@ architecture behave of core is -- local signals signal fetch1_stall_in : std_ulogic; signal icache_stall_out : std_ulogic; - signal fetch2_stall_in : std_ulogic; + signal icache_stall_in : std_ulogic; signal decode1_stall_in : std_ulogic; signal decode2_stall_in : std_ulogic; signal decode2_stall_out : std_ulogic; @@ -145,7 +142,6 @@ architecture behave of core is attribute keep_hierarchy : string; attribute keep_hierarchy of fetch1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of icache_0 : label is keep_h(DISABLE_FLATTEN); - attribute keep_hierarchy of fetch2_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of decode1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of decode2_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN); @@ -206,27 +202,18 @@ begin clk => clk, rst => rst_icache, i_in => fetch1_to_icache, - i_out => icache_to_fetch2, + i_out => icache_to_decode1, m_in => mmu_to_icache, flush_in => flush, inval_in => dbg_icache_rst or ex1_icache_inval, + stall_in => icache_stall_in, stall_out => 
icache_stall_out, wishbone_out => wishbone_insn_out, wishbone_in => wishbone_insn_in, log_out => log_data(96 downto 43) ); - fetch2_0: entity work.fetch2 - port map ( - clk => clk, - rst => rst_fetch2, - stall_in => fetch2_stall_in, - flush_in => flush, - i_in => icache_to_fetch2, - f_out => fetch2_to_decode1 - ); - - fetch2_stall_in <= decode2_stall_out; + icache_stall_in <= decode2_stall_out; decode1_0: entity work.decode1 port map ( @@ -234,7 +221,7 @@ begin rst => rst_dec1, stall_in => decode1_stall_in, flush_in => flush, - f_in => fetch2_to_decode1, + f_in => icache_to_decode1, d_out => decode1_to_decode2, log_out => log_data(109 downto 97) ); diff --git a/decode1.vhdl b/decode1.vhdl index 3e3b41a..214285e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -14,9 +14,8 @@ entity decode1 is stall_in : in std_ulogic; flush_in : in std_ulogic; - f_in : in Fetch2ToDecode1Type; + f_in : in IcacheToDecode1Type; d_out : out Decode1ToDecode2Type; - log_out : out std_ulogic_vector(12 downto 0) ); end entity decode1; diff --git a/fetch2.vhdl b/fetch2.vhdl deleted file mode 100644 index 13ff56e..0000000 --- a/fetch2.vhdl +++ /dev/null @@ -1,123 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -library work; -use work.common.all; -use work.wishbone_types.all; - -entity fetch2 is - port( - clk : in std_ulogic; - rst : in std_ulogic; - - stall_in : in std_ulogic; - flush_in : in std_ulogic; - - -- Results from icache - i_in : in IcacheToFetch2Type; - - -- Output to decode - f_out : out Fetch2ToDecode1Type - ); -end entity fetch2; - -architecture behaviour of fetch2 is - - -- The icache cannot stall, so we need to stash a cycle - -- of output from it when we stall. 
- type reg_internal_type is record - stash : IcacheToFetch2Type; - stash_valid : std_ulogic; - stopped : std_ulogic; - end record; - - signal r_int, rin_int : reg_internal_type; - signal r, rin : Fetch2ToDecode1Type; - -begin - regs : process(clk) - begin - if rising_edge(clk) then - - if (r /= rin) then - report "fetch2 rst:" & std_ulogic'image(rst) & - " S:" & std_ulogic'image(stall_in) & - " F:" & std_ulogic'image(flush_in) & - " T:" & std_ulogic'image(rin.stop_mark) & - " V:" & std_ulogic'image(rin.valid) & - " FF:" & std_ulogic'image(rin.fetch_failed) & - " nia:" & to_hstring(rin.nia); - end if; - - -- Output state remains unchanged on stall, unless we are flushing - if rst = '1' or flush_in = '1' or stall_in = '0' then - r <= rin; - end if; - - -- Internal state is updated on every clock - r_int <= rin_int; - end if; - end process; - - comb : process(all) - variable v : Fetch2ToDecode1Type; - variable v_int : reg_internal_type; - variable v_i_in : IcacheToFetch2Type; - begin - v := r; - v_int := r_int; - - -- If stalling, stash away the current input from the icache - if stall_in = '1' and v_int.stash_valid = '0' then - v_int.stash := i_in; - v_int.stash_valid := '1'; - end if; - - -- If unstalling, source input from the stash and invalidate it, - -- otherwise source normally from the icache. - -- - v_i_in := i_in; - if v_int.stash_valid = '1' and stall_in = '0' then - v_i_in := v_int.stash; - v_int.stash_valid := '0'; - end if; - - v.valid := v_i_in.valid; - v.stop_mark := v_i_in.stop_mark; - v.fetch_failed := v_i_in.fetch_failed; - v.nia := v_i_in.nia; - v.insn := v_i_in.insn; - - -- Clear stash internal valid bit on flush. We still mark - -- the stash itself as valid since we still want to override - -- whatever comes form icache when unstalling, but we'll - -- override it with something invalid. 
- -- if flush_in = '1' then - v_int.stash.valid := '0'; - v_int.stash.fetch_failed := '0'; - end if; - - -- If we are flushing or the instruction comes with a stop mark - -- we tag it as invalid so it doesn't get decoded and executed - if flush_in = '1' or v.stop_mark = '1' then - v.valid := '0'; - v.fetch_failed := '0'; - end if; - - -- Clear stash on reset - if rst = '1' then - v_int.stash_valid := '0'; - v.valid := '0'; - end if; - - -- Update registers - rin <= v; - rin_int <= v_int; - - -- Update outputs - f_out <= r; - end process; - -end architecture behaviour; diff --git a/icache.vhdl b/icache.vhdl index 2107d5a..e4f8448 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -48,10 +48,11 @@ entity icache is rst : in std_ulogic; i_in : in Fetch1ToIcacheType; - i_out : out IcacheToFetch2Type; + i_out : out IcacheToDecode1Type; m_in : in MmuToIcacheType; + stall_in : in std_ulogic; stall_out : out std_ulogic; flush_in : in std_ulogic; inval_in : in std_ulogic; @@ -366,7 +367,7 @@ begin ); process(all) begin - do_read <= '1'; + do_read <= not stall_in; do_write <= '0'; if wishbone_in.ack = '1' and r.store_way = i then do_write <= '1'; @@ -533,25 +534,32 @@ begin icache_hit : process(clk) begin if rising_edge(clk) then - -- On a hit, latch the request for the next cycle, when the BRAM data - -- will be available on the cache_out output of the corresponding way - -- - r.hit_valid <= req_is_hit; - -- Send stop marks and NIA down regardless of validity - r.hit_smark <= i_in.stop_mark; - r.hit_nia <= i_in.nia; - if req_is_hit = '1' then - r.hit_way <= req_hit_way; - r.hit_smark <= i_in.stop_mark; - - report "cache hit nia:" & to_hstring(i_in.nia) & - " IR:" & std_ulogic'image(i_in.virt_mode) & - " SM:" & std_ulogic'image(i_in.stop_mark) & - " idx:" & integer'image(req_index) & - " tag:" & to_hstring(req_tag) & - " way:" & integer'image(req_hit_way) & - " RA:" & to_hstring(real_addr); - end if; + -- keep outputs to decode1 unchanged on a stall + -- except that flush or
reset sets valid to 0 + if stall_in = '1' then + if rst = '1' or flush_in = '1' then + r.hit_valid <= '0'; + end if; + else + -- On a hit, latch the request for the next cycle, when the BRAM data + -- will be available on the cache_out output of the corresponding way + -- + r.hit_valid <= req_is_hit; + -- Send stop marks and NIA down regardless of validity + r.hit_smark <= i_in.stop_mark; + r.hit_nia <= i_in.nia; + if req_is_hit = '1' then + r.hit_way <= req_hit_way; + + report "cache hit nia:" & to_hstring(i_in.nia) & + " IR:" & std_ulogic'image(i_in.virt_mode) & + " SM:" & std_ulogic'image(i_in.stop_mark) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag) & + " way:" & integer'image(req_hit_way) & + " RA:" & to_hstring(real_addr); + end if; + end if; end if; end process; @@ -674,7 +682,7 @@ begin -- TLB miss and protection fault processing if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then r.fetch_failed <= '0'; - elsif i_in.req = '1' and access_ok = '0' then + elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then r.fetch_failed <= '1'; end if; end if; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index 39e28d5..1d179d6 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -13,7 +13,7 @@ architecture behave of icache_tb is signal rst : std_ulogic; signal i_out : Fetch1ToIcacheType; - signal i_in : IcacheToFetch2Type; + signal i_in : IcacheToDecode1Type; signal m_out : MmuToIcacheType; @@ -33,6 +33,7 @@ begin i_in => i_out, i_out => i_in, m_in => m_out, + stall_in => '0', flush_in => '0', inval_in => '0', wishbone_out => wb_bram_in, diff --git a/microwatt.core b/microwatt.core index 87ef39d..876f762 100644 --- a/microwatt.core +++ b/microwatt.core @@ -9,7 +9,6 @@ filesets: - wishbone_types.vhdl - common.vhdl - fetch1.vhdl - - fetch2.vhdl - decode1.vhdl - helpers.vhdl - decode2.vhdl