diff --git a/core.vhdl b/core.vhdl
index 6c637c7..cd2bb07 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -37,6 +37,8 @@ entity core is
         wishbone_data_in : in wishbone_slave_out;
         wishbone_data_out : out wishbone_master_out;

+        wb_snoop_in : in wishbone_master_out;  -- snooped wishbone master signals; wired to snoop_in in the port map below
+
         dmi_addr : in std_ulogic_vector(3 downto 0);
         dmi_din : in std_ulogic_vector(63 downto 0);
         dmi_dout : out std_ulogic_vector(63 downto 0);
@@ -423,6 +425,7 @@ begin
             stall_out => dcache_stall_out,
             wishbone_in => wishbone_data_in,
             wishbone_out => wishbone_data_out,
+            snoop_in => wb_snoop_in,  -- hand the core-level snoop input down to this instance
             log_out => log_data(170 downto 151)
             );

diff --git a/dcache.vhdl b/dcache.vhdl
index bb93148..9916279 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -39,6 +39,8 @@ entity dcache is
         m_in : in MmuToDcacheType;
         m_out : out DcacheToMmuType;

+        snoop_in : in wishbone_master_out := wishbone_master_out_init;  -- bus traffic to snoop; defaults to wishbone_master_out_init
+
         stall_out : out std_ulogic;

         wishbone_out : out wishbone_master_out;
@@ -415,6 +417,11 @@ architecture rtl of dcache is
     type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
     signal tlb_plru_victim : tlb_plru_out_t;

+    signal snoop_tag_set : cache_tags_set_t;  -- tag set read from cache_tags at the snooped index
+    signal snoop_valid : std_ulogic;  -- '1' when a snooped write strobe (not one of our own cycles) was captured
+    signal snoop_wrtag : cache_tag_t;  -- tag portion of the snooped address
+    signal snoop_index : index_t;  -- index portion of the snooped address
+
     --
     -- Helper functions to decode incoming requests
     --
@@ -528,7 +535,8 @@ begin
     assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
     assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
     assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
-    assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
+    assert ispow2(ROW_PER_LINE) and ROW_PER_LINE > 1  -- now also rejects a single-row line
+        report "ROW_PER_LINE not power of 2 greater than 1" severity FAILURE;
     assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
         report "geometry bits don't add up" severity FAILURE;
     assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
@@ -783,6 +791,24 @@ begin
             end if;
     end process;

+    -- Cache 
tag RAM second read port, for snooping
+    cache_tag_read_2 : process(clk)
+        variable addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
+    begin
+        if rising_edge(clk) then
+            addr := (others => '0');  -- bits above snoop_in.adr'left stay zero
+            addr(snoop_in.adr'left downto 0) := snoop_in.adr;
+            snoop_tag_set <= cache_tags(get_index(addr));  -- registered read of the tag set at the snooped index
+            snoop_wrtag <= get_tag(addr);
+            snoop_index <= get_index(addr);
+            -- Don't snoop our own cycles: leave snoop_valid low while our own wishbone cycle is active and not stalled
+            snoop_valid <= '0';
+            if not (r1.wb.cyc = '1' and wishbone_in.stall = '0') then
+                snoop_valid <= snoop_in.cyc and snoop_in.stb and snoop_in.we;  -- only write strobes are of interest
+            end if;
+        end if;
+    end process;
+
     -- Cache request parsing and hit detection
     dcache_request : process(all)
         variable is_hit : std_ulogic;
@@ -1293,6 +1319,13 @@ begin
                 end if;
             end if;

+            -- Do invalidations from snooped stores to memory: clear the valid bit of every way whose tag matches
+            for i in way_t loop
+                if snoop_valid = '1' and read_tag(i, snoop_tag_set) = snoop_wrtag then
+                    cache_valids(snoop_index)(i) <= '0';
+                end if;
+            end loop;
+
             if r1.write_tag = '1' then
                 -- Store new tag in selected way
                 for i in 0 to NUM_WAYS-1 loop
diff --git a/soc.vhdl b/soc.vhdl
index d03f114..38bd5df 100644
--- a/soc.vhdl
+++ b/soc.vhdl
@@ -133,6 +133,7 @@ architecture behaviour of soc is
     -- Wishbone master (output of arbiter):
     signal wb_master_in : wishbone_slave_out;
     signal wb_master_out : wishbone_master_out;
+    signal wb_snoop : wishbone_master_out;  -- copy of wb_master_out with stb gated by stall
 
     -- Main "IO" bus, from main slave decoder to the latch
     signal wb_io_in : wishbone_master_out;
@@ -284,6 +285,7 @@ begin
             wishbone_insn_out => wishbone_icore_out,
             wishbone_data_in => wishbone_dcore_in,
             wishbone_data_out => wishbone_dcore_out,
+            wb_snoop_in => wb_snoop,
             dmi_addr => dmi_addr(3 downto 0),
             dmi_dout => dmi_core_dout,
             dmi_din => dmi_dout,
@@ -313,6 +315,18 @@ begin
             wb_slave_in => wb_master_in
             );

+    -- Snoop bus going to caches.
+    -- Gate stb with stall so the caches don't see the stalled strobes.
+    -- That way if the caches see a strobe when their wishbone is stalled,
+    -- they know it is an access by another master.
+    process(all)
+    begin
+        wb_snoop <= wb_master_out;  -- by default, mirror the arbiter output
+        if wb_master_in.stall = '1' then
+            wb_snoop.stb <= '0';  -- suppress the strobe while the bus reports a stall
+        end if;
+    end process;
+
     -- Top level Wishbone slaves address decoder & mux
     --
     -- From CPU to BRAM, DRAM, IO, selected on top 3 bits and dram_at_0