diff --git a/Makefile b/Makefile index 692704e..1e4b558 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ all = core_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \ all: $(all) core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ - fetch2.vhdl utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ + utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \ cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ diff --git a/common.vhdl b/common.vhdl index a6b3f95..52222c3 100644 --- a/common.vhdl +++ b/common.vhdl @@ -93,10 +93,11 @@ package common is virt_mode : std_ulogic; priv_mode : std_ulogic; stop_mark: std_ulogic; + sequential: std_ulogic; nia: std_ulogic_vector(63 downto 0); end record; - type IcacheToFetch2Type is record + type IcacheToDecode1Type is record valid: std_ulogic; stop_mark: std_ulogic; fetch_failed: std_ulogic; @@ -104,16 +105,6 @@ package common is insn: std_ulogic_vector(31 downto 0); end record; - type Fetch2ToDecode1Type is record - valid: std_ulogic; - stop_mark : std_ulogic; - fetch_failed: std_ulogic; - nia: std_ulogic_vector(63 downto 0); - insn: std_ulogic_vector(31 downto 0); - end record; - constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', fetch_failed => '0', - nia => (others => '0'), insn => (others => '0')); - type Decode1ToDecode2Type is record valid: std_ulogic; stop_mark : std_ulogic; @@ -122,8 +113,16 @@ package common is ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) decode: decode_rom_t; + br_pred: std_ulogic; -- Branch was predicted to be taken + end record; + constant Decode1ToDecode2Init : Decode1ToDecode2Type := + (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), + ispr1 => (others => '0'), ispr2 => (others 
=> '0'), decode => decode_rom_init, br_pred => '0'); + + type Decode1ToFetch1Type is record + redirect : std_ulogic; + redirect_nia : std_ulogic_vector(63 downto 0); end record; - constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init); type Decode2ToExecute1Type is record valid: std_ulogic; @@ -158,23 +157,24 @@ package common is sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? reserve : std_ulogic; -- set for larx/stcx + br_pred : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', - is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', + is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0')); type Execute1ToMultiplyType is record valid: std_ulogic; - insn_type: insn_type_t; - data1: std_ulogic_vector(64 downto 0); - data2: std_ulogic_vector(64 downto 0); + data1: std_ulogic_vector(63 downto 0); + data2: std_ulogic_vector(63 downto 0); is_32bit: std_ulogic; + neg_result: std_ulogic; end record; - constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, - is_32bit => '0', + constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', + is_32bit => '0', neg_result => '0', others => 
(others => '0')); type Execute1ToDividerType is record @@ -253,6 +253,7 @@ package common is others => (others => '0')); type Loadstore1ToExecute1Type is record + busy : std_ulogic; exception : std_ulogic; invalid : std_ulogic; perm_error : std_ulogic; @@ -366,7 +367,7 @@ package common is type MultiplyToExecute1Type is record valid: std_ulogic; - write_reg_data: std_ulogic_vector(63 downto 0); + result: std_ulogic_vector(127 downto 0); overflow : std_ulogic; end record; constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0', diff --git a/control.vhdl b/control.vhdl index 55f5649..5e557c4 100644 --- a/control.vhdl +++ b/control.vhdl @@ -15,7 +15,8 @@ entity control is complete_in : in std_ulogic; valid_in : in std_ulogic; flush_in : in std_ulogic; - stall_in : in std_ulogic; + busy_in : in std_ulogic; + deferred : in std_ulogic; sgl_pipe_in : in std_ulogic; stop_mark_in : in std_ulogic; @@ -23,6 +24,9 @@ entity control is gpr_write_in : in gspr_index_t; gpr_bypassable : in std_ulogic; + update_gpr_write_valid : in std_ulogic; + update_gpr_write_reg : in gspr_index_t; + gpr_a_read_valid_in : in std_ulogic; gpr_a_read_in : in gspr_index_t; @@ -72,7 +76,11 @@ begin ) port map ( clk => clk, - stall_in => stall_in, + busy_in => busy_in, + deferred => deferred, + complete_in => complete_in, + flush_in => flush_in, + issuing => valid_out, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, @@ -80,6 +88,9 @@ begin gpr_read_valid_in => gpr_a_read_valid_in, gpr_read_in => gpr_a_read_in, + ugpr_write_valid => update_gpr_write_valid, + ugpr_write_reg => update_gpr_write_reg, + stall_out => stall_a_out, use_bypass => gpr_bypass_a ); @@ -90,7 +101,11 @@ begin ) port map ( clk => clk, - stall_in => stall_in, + busy_in => busy_in, + deferred => deferred, + complete_in => complete_in, + flush_in => flush_in, + issuing => valid_out, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, @@ -98,6 +113,9 @@ begin 
gpr_read_valid_in => gpr_b_read_valid_in, gpr_read_in => gpr_b_read_in, + ugpr_write_valid => update_gpr_write_valid, + ugpr_write_reg => update_gpr_write_reg, + stall_out => stall_b_out, use_bypass => gpr_bypass_b ); @@ -110,7 +128,11 @@ begin ) port map ( clk => clk, - stall_in => stall_in, + busy_in => busy_in, + deferred => deferred, + complete_in => complete_in, + flush_in => flush_in, + issuing => valid_out, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, @@ -118,6 +140,9 @@ begin gpr_read_valid_in => gpr_c_read_valid_in, gpr_read_in => gpr_c_read_in_fmt, + ugpr_write_valid => update_gpr_write_valid, + ugpr_write_reg => update_gpr_write_reg, + stall_out => stall_c_out, use_bypass => gpr_bypass_c ); @@ -128,7 +153,11 @@ begin ) port map ( clk => clk, - stall_in => stall_in, + busy_in => busy_in, + deferred => deferred, + complete_in => complete_in, + flush_in => flush_in, + issuing => valid_out, cr_read_in => cr_read_in, cr_write_in => cr_write_valid, @@ -139,7 +168,8 @@ begin control0: process(clk) begin if rising_edge(clk) then - assert r_int.outstanding >= 0 and r_int.outstanding <= (PIPELINE_DEPTH+1) report "Outstanding bad " & integer'image(r_int.outstanding) severity failure; + assert rin_int.outstanding >= 0 and rin_int.outstanding <= (PIPELINE_DEPTH+1) + report "Outstanding bad " & integer'image(rin_int.outstanding) severity failure; r_int <= rin_int; end if; end process; @@ -152,17 +182,18 @@ begin v_int := r_int; -- asynchronous - valid_tmp := valid_in and not flush_in and not stall_in; - stall_tmp := stall_in; + valid_tmp := valid_in and not flush_in; + stall_tmp := '0'; - if complete_in = '1' then + if flush_in = '1' then + -- expect to see complete_in next cycle + v_int.outstanding := 1; + elsif complete_in = '1' then v_int.outstanding := r_int.outstanding - 1; end if; if rst = '1' then - v_int.state := IDLE; - v_int.outstanding := 0; - stall_tmp := '0'; + v_int := reg_internal_init; valid_tmp := '0'; end if; @@ -227,7 +258,9 
@@ begin end if; if valid_tmp = '1' then - v_int.outstanding := v_int.outstanding + 1; + if deferred = '0' then + v_int.outstanding := v_int.outstanding + 1; + end if; gpr_write_valid <= gpr_write_valid_in; cr_write_valid <= cr_write_in; else @@ -237,7 +270,7 @@ begin -- update outputs valid_out <= valid_tmp; - stall_out <= stall_tmp; + stall_out <= stall_tmp or deferred; -- update registers rin_int <= v_int; diff --git a/core.vhdl b/core.vhdl index 8ba5b70..4a83d69 100644 --- a/core.vhdl +++ b/core.vhdl @@ -11,7 +11,8 @@ entity core is SIM : boolean := false; DISABLE_FLATTEN : boolean := false; EX1_BYPASS : boolean := true; - ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0') + ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0'); + LOG_LENGTH : natural := 512 ); port ( clk : in std_ulogic; @@ -41,16 +42,14 @@ entity core is end core; architecture behave of core is - -- fetch signals - signal fetch2_to_decode1: Fetch2ToDecode1Type; - -- icache signals signal fetch1_to_icache : Fetch1ToIcacheType; - signal icache_to_fetch2 : IcacheToFetch2Type; + signal icache_to_decode1 : IcacheToDecode1Type; signal mmu_to_icache : MmuToIcacheType; -- decode signals signal decode1_to_decode2: Decode1ToDecode2Type; + signal decode1_to_fetch1: Decode1ToFetch1Type; signal decode2_to_execute1: Decode2ToExecute1Type; -- register file signals @@ -83,16 +82,18 @@ architecture behave of core is -- local signals signal fetch1_stall_in : std_ulogic; signal icache_stall_out : std_ulogic; - signal fetch2_stall_in : std_ulogic; + signal icache_stall_in : std_ulogic; signal decode1_stall_in : std_ulogic; - signal decode2_stall_in : std_ulogic; + signal decode1_busy : std_ulogic; + signal decode2_busy_in : std_ulogic; signal decode2_stall_out : std_ulogic; signal ex1_icache_inval: std_ulogic; - signal ex1_stall_out: std_ulogic; - signal ls1_stall_out: std_ulogic; + signal ex1_busy_out: std_ulogic; signal dcache_stall_out: std_ulogic; signal flush: std_ulogic; 
+ signal decode1_flush: std_ulogic; + signal fetch1_flush: std_ulogic; signal complete: std_ulogic; signal terminate: std_ulogic; @@ -128,6 +129,12 @@ architecture behave of core is -- Debug status signal dbg_core_is_stopped: std_ulogic; + -- Logging signals + signal log_data : std_ulogic_vector(255 downto 0); + signal log_rd_addr : std_ulogic_vector(31 downto 0); + signal log_wr_addr : std_ulogic_vector(31 downto 0); + signal log_rd_data : std_ulogic_vector(63 downto 0); + function keep_h(disable : boolean) return string is begin if disable then @@ -139,7 +146,6 @@ architecture behave of core is attribute keep_hierarchy : string; attribute keep_hierarchy of fetch1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of icache_0 : label is keep_h(DISABLE_FLATTEN); - attribute keep_hierarchy of fetch2_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of decode1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of decode2_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN); @@ -180,45 +186,40 @@ begin rst => rst_fetch1, alt_reset_in => alt_reset_d, stall_in => fetch1_stall_in, - flush_in => flush, + flush_in => fetch1_flush, stop_in => dbg_core_stop, + d_in => decode1_to_fetch1, e_in => execute1_to_fetch1, - i_out => fetch1_to_icache + i_out => fetch1_to_icache, + log_out => log_data(42 downto 0) ); - fetch1_stall_in <= icache_stall_out or decode2_stall_out; + fetch1_stall_in <= icache_stall_out or decode1_busy; + fetch1_flush <= flush or decode1_flush; icache_0: entity work.icache generic map( SIM => SIM, LINE_SIZE => 64, - NUM_LINES => 32, + NUM_LINES => 64, NUM_WAYS => 2 ) port map( clk => clk, rst => rst_icache, i_in => fetch1_to_icache, - i_out => icache_to_fetch2, + i_out => icache_to_decode1, m_in => mmu_to_icache, - flush_in => flush, + flush_in => fetch1_flush, inval_in => dbg_icache_rst or ex1_icache_inval, + stall_in => icache_stall_in, stall_out => 
icache_stall_out, wishbone_out => wishbone_insn_out, - wishbone_in => wishbone_insn_in - ); - - fetch2_0: entity work.fetch2 - port map ( - clk => clk, - rst => rst_fetch2, - stall_in => fetch2_stall_in, - flush_in => flush, - i_in => icache_to_fetch2, - f_out => fetch2_to_decode1 + wishbone_in => wishbone_insn_in, + log_out => log_data(96 downto 43) ); - fetch2_stall_in <= decode2_stall_out; + icache_stall_in <= decode1_busy; decode1_0: entity work.decode1 port map ( @@ -226,8 +227,12 @@ begin rst => rst_dec1, stall_in => decode1_stall_in, flush_in => flush, - f_in => fetch2_to_decode1, - d_out => decode1_to_decode2 + flush_out => decode1_flush, + busy_out => decode1_busy, + f_in => icache_to_decode1, + d_out => decode1_to_decode2, + f_out => decode1_to_fetch1, + log_out => log_data(109 downto 97) ); decode1_stall_in <= decode2_stall_out; @@ -239,7 +244,7 @@ begin port map ( clk => clk, rst => rst_dec2, - stall_in => decode2_stall_in, + busy_in => decode2_busy_in, stall_out => decode2_stall_out, flush_in => flush, complete_in => complete, @@ -249,9 +254,10 @@ begin r_in => register_file_to_decode2, r_out => decode2_to_register_file, c_in => cr_file_to_decode2, - c_out => decode2_to_cr_file + c_out => decode2_to_cr_file, + log_out => log_data(119 downto 110) ); - decode2_stall_in <= ex1_stall_out or ls1_stall_out; + decode2_busy_in <= ex1_busy_out; register_file_0: entity work.register_file generic map ( @@ -267,7 +273,8 @@ begin dbg_gpr_addr => dbg_gpr_addr, dbg_gpr_data => dbg_gpr_data, sim_dump => terminate, - sim_dump_done => sim_cr_dump + sim_dump_done => sim_cr_dump, + log_out => log_data(255 downto 185) ); cr_file_0: entity work.cr_file @@ -279,7 +286,8 @@ begin d_in => decode2_to_cr_file, d_out => cr_file_to_decode2, w_in => writeback_to_cr_file, - sim_dump => sim_cr_dump + sim_dump => sim_cr_dump, + log_out => log_data(184 downto 172) ); execute1_0: entity work.execute1 @@ -290,7 +298,7 @@ begin clk => clk, rst => rst_ex1, flush_out => flush, - stall_out 
=> ex1_stall_out, + busy_out => ex1_busy_out, e_in => decode2_to_execute1, l_in => loadstore1_to_execute1, ext_irq_in => ext_irq, @@ -299,7 +307,11 @@ begin e_out => execute1_to_writeback, icache_inval => ex1_icache_inval, dbg_msr_out => msr, - terminate_out => terminate + terminate_out => terminate, + log_out => log_data(134 downto 120), + log_rd_addr => log_rd_addr, + log_rd_data => log_rd_data, + log_wr_addr => log_wr_addr ); loadstore1_0: entity work.loadstore1 @@ -314,7 +326,7 @@ begin m_out => loadstore1_to_mmu, m_in => mmu_to_loadstore1, dc_stall => dcache_stall_out, - stall_out => ls1_stall_out + log_out => log_data(149 downto 140) ); mmu_0: entity work.mmu @@ -331,7 +343,7 @@ begin dcache_0: entity work.dcache generic map( LINE_SIZE => 64, - NUM_LINES => 32, + NUM_LINES => 64, NUM_WAYS => 2 ) port map ( @@ -343,7 +355,8 @@ begin m_out => dcache_to_mmu, stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, - wishbone_out => wishbone_data_out + wishbone_out => wishbone_data_out, + log_out => log_data(171 downto 152) ); writeback_0: entity work.writeback @@ -356,7 +369,13 @@ begin complete_out => complete ); + log_data(151 downto 150) <= "00"; + log_data(139 downto 135) <= "00000"; + debug_0: entity work.core_debug + generic map ( + LOG_LENGTH => LOG_LENGTH + ) port map ( clk => clk, rst => rst_dbg, @@ -377,6 +396,10 @@ begin dbg_gpr_ack => dbg_gpr_ack, dbg_gpr_addr => dbg_gpr_addr, dbg_gpr_data => dbg_gpr_data, + log_data => log_data, + log_read_addr => log_rd_addr, + log_read_data => log_rd_data, + log_write_addr => log_wr_addr, terminated_out => terminated_out ); diff --git a/core_debug.vhdl b/core_debug.vhdl index c97213b..9efaa7c 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -3,9 +3,14 @@ use ieee.std_logic_1164.all; use ieee.numeric_std.all; library work; +use work.utils.all; use work.common.all; entity core_debug is + generic ( + -- Length of log buffer + LOG_LENGTH : natural := 512 + ); port ( clk : in std_logic; rst : in std_logic; 
@@ -34,6 +39,12 @@ entity core_debug is dbg_gpr_addr : out gspr_index_t; dbg_gpr_data : in std_ulogic_vector(63 downto 0); + -- Core logging data + log_data : in std_ulogic_vector(255 downto 0); + log_read_addr : in std_ulogic_vector(31 downto 0); + log_read_data : out std_ulogic_vector(63 downto 0); + log_write_addr : out std_ulogic_vector(31 downto 0); + -- Misc terminated_out : out std_ulogic ); @@ -77,6 +88,12 @@ architecture behave of core_debug is -- GSPR register data constant DBG_CORE_GSPR_DATA : std_ulogic_vector(3 downto 0) := "0101"; + -- Log buffer address and data registers + constant DBG_CORE_LOG_ADDR : std_ulogic_vector(3 downto 0) := "0110"; + constant DBG_CORE_LOG_DATA : std_ulogic_vector(3 downto 0) := "0111"; + + constant LOG_INDEX_BITS : natural := log2(LOG_LENGTH); + -- Some internal wires signal stat_reg : std_ulogic_vector(63 downto 0); @@ -89,6 +106,12 @@ architecture behave of core_debug is signal do_gspr_rd : std_ulogic; signal gspr_index : gspr_index_t; + signal log_dmi_addr : std_ulogic_vector(31 downto 0) := (others => '0'); + signal log_dmi_data : std_ulogic_vector(63 downto 0) := (others => '0'); + signal do_dmi_log_rd : std_ulogic; + signal dmi_read_log_data : std_ulogic; + signal dmi_read_log_data_1 : std_ulogic; + begin -- Single cycle register accesses on DMI except for GSPR data dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA @@ -108,6 +131,8 @@ begin nia when DBG_CORE_NIA, msr when DBG_CORE_MSR, dbg_gpr_data when DBG_CORE_GSPR_DATA, + log_write_addr & log_dmi_addr when DBG_CORE_LOG_ADDR, + log_dmi_data when DBG_CORE_LOG_DATA, (others => '0') when others; -- DMI writes @@ -118,6 +143,7 @@ begin do_step <= '0'; do_reset <= '0'; do_icreset <= '0'; + do_dmi_log_rd <= '0'; if (rst) then stopping <= '0'; @@ -151,11 +177,26 @@ begin end if; elsif dmi_addr = DBG_CORE_GSPR_INDEX then gspr_index <= dmi_din(gspr_index_t'left downto 0); + elsif dmi_addr = DBG_CORE_LOG_ADDR then + log_dmi_addr <= dmi_din(31 downto 0); + do_dmi_log_rd 
<= '1'; end if; else report("DMI read from " & to_string(dmi_addr)); end if; + + elsif dmi_read_log_data = '0' and dmi_read_log_data_1 = '1' then + -- Increment log_dmi_addr after the end of a read from DBG_CORE_LOG_DATA + log_dmi_addr(LOG_INDEX_BITS + 1 downto 0) <= + std_ulogic_vector(unsigned(log_dmi_addr(LOG_INDEX_BITS+1 downto 0)) + 1); + do_dmi_log_rd <= '1'; end if; + dmi_read_log_data_1 <= dmi_read_log_data; + if dmi_req = '1' and dmi_addr = DBG_CORE_LOG_DATA then + dmi_read_log_data <= '1'; + else + dmi_read_log_data <= '0'; + end if; -- Set core stop on terminate. We'll be stopping some time *after* -- the offending instruction, at least until we can do back flushes @@ -175,5 +216,87 @@ begin core_rst <= do_reset; icache_rst <= do_icreset; terminated_out <= terminated; + + -- Logging RAM + maybe_log: if LOG_LENGTH > 0 generate + subtype log_ptr_t is unsigned(LOG_INDEX_BITS - 1 downto 0); + type log_array_t is array(0 to LOG_LENGTH - 1) of std_ulogic_vector(255 downto 0); + signal log_array : log_array_t; + signal log_rd_ptr : log_ptr_t; + signal log_wr_ptr : log_ptr_t; + signal log_toggle : std_ulogic; + signal log_wr_enable : std_ulogic; + signal log_rd_ptr_latched : log_ptr_t; + signal log_rd : std_ulogic_vector(255 downto 0); + signal log_dmi_reading : std_ulogic; + signal log_dmi_read_done : std_ulogic; + + function select_dword(data : std_ulogic_vector(255 downto 0); + addr : std_ulogic_vector(31 downto 0)) return std_ulogic_vector is + variable firstbit : integer; + begin + firstbit := to_integer(unsigned(addr(1 downto 0))) * 64; + return data(firstbit + 63 downto firstbit); + end; + + attribute ram_style : string; + attribute ram_style of log_array : signal is "block"; + attribute ram_decomp : string; + attribute ram_decomp of log_array : signal is "power"; + + begin + -- Use MSB of read addresses to stop the logging + log_wr_enable <= not (log_read_addr(31) or log_dmi_addr(31)); + + log_ram: process(clk) + begin + if rising_edge(clk) then + if 
log_wr_enable = '1' then + log_array(to_integer(log_wr_ptr)) <= log_data; + end if; + log_rd <= log_array(to_integer(log_rd_ptr_latched)); + end if; + end process; + + + log_buffer: process(clk) + variable b : integer; + variable data : std_ulogic_vector(255 downto 0); + begin + if rising_edge(clk) then + if rst = '1' then + log_wr_ptr <= (others => '0'); + log_toggle <= '0'; + elsif log_wr_enable = '1' then + if log_wr_ptr = to_unsigned(LOG_LENGTH - 1, LOG_INDEX_BITS) then + log_toggle <= not log_toggle; + end if; + log_wr_ptr <= log_wr_ptr + 1; + end if; + if do_dmi_log_rd = '1' then + log_rd_ptr_latched <= unsigned(log_dmi_addr(LOG_INDEX_BITS + 1 downto 2)); + else + log_rd_ptr_latched <= unsigned(log_read_addr(LOG_INDEX_BITS + 1 downto 2)); + end if; + if log_dmi_read_done = '1' then + log_dmi_data <= select_dword(log_rd, log_dmi_addr); + else + log_read_data <= select_dword(log_rd, log_read_addr); + end if; + log_dmi_read_done <= log_dmi_reading; + log_dmi_reading <= do_dmi_log_rd; + end if; + end process; + log_write_addr(LOG_INDEX_BITS - 1 downto 0) <= std_ulogic_vector(log_wr_ptr); + log_write_addr(LOG_INDEX_BITS) <= '1'; + log_write_addr(31 downto LOG_INDEX_BITS + 1) <= (others => '0'); + end generate; + + no_log: if LOG_LENGTH = 0 generate + begin + log_read_data <= (others => '0'); + log_write_addr <= x"00000001"; + end generate; + end behave; diff --git a/cr_file.vhdl b/cr_file.vhdl index a6dd585..37fa76b 100644 --- a/cr_file.vhdl +++ b/cr_file.vhdl @@ -18,7 +18,9 @@ entity cr_file is w_in : in WritebackToCrFileType; -- debug - sim_dump : in std_ulogic + sim_dump : in std_ulogic; + + log_out : out std_ulogic_vector(12 downto 0) ); end entity cr_file; @@ -27,6 +29,7 @@ architecture behaviour of cr_file is signal crs_updated : std_ulogic_vector(31 downto 0); signal xerc : xer_common_t := xerc_init; signal xerc_updated : xer_common_t; + signal log_data : std_ulogic_vector(12 downto 0); begin cr_create_0: process(all) variable hi, lo : integer := 0; @@ 
-88,4 +91,14 @@ begin end process; end generate; + cr_log: process(clk) + begin + if rising_edge(clk) then + log_data <= w_in.write_cr_enable & + w_in.write_cr_data(31 downto 28) & + w_in.write_cr_mask; + end if; + end process; + log_out <= log_data; + end architecture behaviour; diff --git a/cr_hazard.vhdl b/cr_hazard.vhdl index f6c5f3f..4b79020 100644 --- a/cr_hazard.vhdl +++ b/cr_hazard.vhdl @@ -4,11 +4,15 @@ use ieee.numeric_std.all; entity cr_hazard is generic ( - PIPELINE_DEPTH : natural := 2 + PIPELINE_DEPTH : natural := 1 ); port( clk : in std_ulogic; - stall_in : in std_ulogic; + busy_in : in std_ulogic; + deferred : in std_ulogic; + complete_in : in std_ulogic; + flush_in : in std_ulogic; + issuing : in std_ulogic; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; @@ -22,7 +26,7 @@ architecture behaviour of cr_hazard is end record; constant pipeline_entry_init : pipeline_entry_type := (valid => '0'); - type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type; + type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); signal r, rin : pipeline_t := pipeline_t_init; @@ -30,9 +34,7 @@ begin cr_hazard0: process(clk) begin if rising_edge(clk) then - if stall_in = '0' then - r <= rin; - end if; + r <= rin; end if; end process; @@ -41,22 +43,23 @@ begin begin v := r; - stall_out <= '0'; - loop_0: for i in 0 to PIPELINE_DEPTH-1 loop - if (r(i).valid = cr_read_in) then - stall_out <= '1'; - end if; - end loop; - - v(0).valid := cr_write_in; - loop_1: for i in 0 to PIPELINE_DEPTH-2 loop - -- propagate to next slot - v(i+1) := r(i); - end loop; + -- XXX assumes PIPELINE_DEPTH = 1 + if complete_in = '1' then + v(1).valid := '0'; + end if; + stall_out <= cr_read_in and (v(0).valid or v(1).valid); - -- asynchronous output - if cr_read_in = '0' then - stall_out <= '0'; + -- XXX assumes PIPELINE_DEPTH = 1 + if busy_in = '0' then + v(1) := r(0); + v(0).valid := '0'; 
+ end if; + if deferred = '0' and issuing = '1' then + v(0).valid := cr_write_in; + end if; + if flush_in = '1' then + v(0).valid := '0'; + v(1).valid := '0'; end if; -- update registers diff --git a/dcache.vhdl b/dcache.vhdl index 9df5562..9ecb6a9 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -46,7 +46,9 @@ entity dcache is stall_out : out std_ulogic; wishbone_out : out wishbone_master_out; - wishbone_in : in wishbone_slave_out + wishbone_in : in wishbone_slave_out; + + log_out : out std_ulogic_vector(19 downto 0) ); end entity dcache; @@ -81,6 +83,8 @@ architecture rtl of dcache is constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; -- TAG_BITS is the number of bits of the tag part of the address constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; + -- TAG_WIDTH is the width in bits of each way of the tag RAM + constant TAG_WIDTH : natural := TAG_BITS + 7 - ((TAG_BITS + 7) mod 8); -- WAY_BITS is the number of bits to select a way constant WAY_BITS : natural := log2(NUM_WAYS); @@ -98,6 +102,7 @@ architecture rtl of dcache is subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; subtype way_t is integer range 0 to NUM_WAYS-1; + subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0); -- The cache data BRAM organized as described above for each way subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0); @@ -108,17 +113,19 @@ architecture rtl of dcache is subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); -- type cache_tags_set_t is array(way_t) of cache_tag_t; -- type cache_tags_array_t is array(index_t) of cache_tags_set_t; - constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS; + constant TAG_RAM_WIDTH : natural := TAG_WIDTH * NUM_WAYS; subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0); type cache_tags_array_t is array(index_t) of cache_tags_set_t; -- The cache valid bits subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); 
type cache_valids_t is array(index_t) of cache_way_valids_t; + type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs - signal cache_tags : cache_tags_array_t; - signal cache_valids : cache_valids_t; + signal cache_tags : cache_tags_array_t; + signal cache_tag_set : cache_tags_set_t; + signal cache_valids : cache_valids_t; attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; @@ -175,18 +182,17 @@ architecture rtl of dcache is -- Type of operation on a "valid" input type op_t is (OP_NONE, + OP_BAD, -- NC cache hit, TLB miss, prot/RC failure + OP_STCX_FAIL, -- conditional store w/o reservation OP_LOAD_HIT, -- Cache hit on load OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable load - OP_BAD, -- BAD: Cache hit on NC load/store - OP_TLB_ERR, -- TLB miss or protection/RC failure OP_STORE_HIT, -- Store hitting cache OP_STORE_MISS); -- Store missing cache -- Cache state machine type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack - FINISH_LD_MISS, -- Extra cycle after load miss STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack @@ -216,38 +222,64 @@ architecture rtl of dcache is end record; signal r0 : reg_stage_0_t; - signal r0_valid : std_ulogic; - + signal r0_full : std_ulogic; + + type mem_access_request_t is record + op : op_t; + dcbz : std_ulogic; + real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + data : std_ulogic_vector(63 downto 0); + byte_sel : std_ulogic_vector(7 downto 0); + hit_way : way_t; + repl_way : way_t; + same_tag : std_ulogic; + end record; + -- First stage register, contains state for stage 1 of load hits -- and for the state machine used by all other operations -- type reg_stage_1_t is record - -- Latch the complete request from ls1 - req : Loadstore1ToDcacheType; - mmu_req : std_ulogic; + -- Info about the request + full : std_ulogic; 
-- have uncompleted request + mmu_req : std_ulogic; -- request is from MMU + req : mem_access_request_t; -- Cache hit state hit_way : way_t; hit_load_valid : std_ulogic; - -- Data buffer for "slow" read ops (load miss and NC loads). - slow_data : std_ulogic_vector(63 downto 0); - slow_valid : std_ulogic; - - -- Signal to complete a failed stcx. - stcx_fail : std_ulogic; + -- 2-stage data buffer for data forwarded from writes to reads + forward_data1 : std_ulogic_vector(63 downto 0); + forward_data2 : std_ulogic_vector(63 downto 0); + forward_sel1 : std_ulogic_vector(7 downto 0); + forward_valid1 : std_ulogic; + forward_way1 : way_t; + forward_row1 : row_t; + use_forward1 : std_ulogic; + forward_sel : std_ulogic_vector(7 downto 0); -- Cache miss state (reload state machine) state : state_t; + dcbz : std_ulogic; + write_bram : std_ulogic; + write_tag : std_ulogic; + slow_valid : std_ulogic; wb : wishbone_master_out; + reload_tag : cache_tag_t; store_way : way_t; store_row : row_t; store_index : index_t; + end_row_ix : row_in_line_t; + rows_valid : row_per_line_valid_t; + acks_pending : unsigned(2 downto 0); -- Signals to complete with error error_done : std_ulogic; cache_paradox : std_ulogic; + -- Signal to complete a failed stcx. 
+ stcx_fail : std_ulogic; + -- completion signal for tlbie tlbie_done : std_ulogic; end record; @@ -270,7 +302,7 @@ architecture rtl of dcache is signal req_tag : cache_tag_t; signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); - signal req_laddr : std_ulogic_vector(63 downto 0); + signal req_same_tag : std_ulogic; signal early_req_row : row_t; @@ -278,6 +310,12 @@ architecture rtl of dcache is signal set_rsrv : std_ulogic; signal clear_rsrv : std_ulogic; + signal r0_valid : std_ulogic; + signal r0_stall : std_ulogic; + + signal use_forward1_next : std_ulogic; + signal use_forward2_next : std_ulogic; + -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -303,6 +341,7 @@ architecture rtl of dcache is signal perm_attr : perm_attr_t; signal rc_ok : std_ulogic; signal perm_ok : std_ulogic; + signal access_ok : std_ulogic; -- TLB PLRU output interface type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); @@ -313,31 +352,35 @@ architecture rtl of dcache is -- -- Return the cache line index (tag index) for an address - function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is + function get_index(addr: std_ulogic_vector) return index_t is begin return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))); end; -- Return the cache row index (data memory) for an address - function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is + function get_row(addr: std_ulogic_vector) return row_t is begin return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; + -- Return the index of a row within a line + function get_row_of_line(row: row_t) return row_in_line_t is + variable row_v : unsigned(ROW_BITS-1 downto 0); + begin + row_v := to_unsigned(row, ROW_BITS); + return row_v(ROW_LINEBITS-1 downto 0); + end; + -- Returns whether this is the last row of a line - function is_last_row_addr(addr: 
wishbone_addr_type) return boolean is - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is begin - return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; + return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last; end; -- Returns whether this is the last row of a line - function is_last_row(row: row_t) return boolean is - variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row(row: row_t; last: row_in_line_t) return boolean is begin - row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); - return row_v(ROW_LINEBITS-1 downto 0) = ones; + return get_row_of_line(row) = last; end; -- Return the address of the next row in the current cache line @@ -369,7 +412,7 @@ architecture rtl of dcache is end; -- Get the tag value from the address - function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)) return cache_tag_t is + function get_tag(addr: std_ulogic_vector) return cache_tag_t is begin return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS); end; @@ -377,14 +420,7 @@ architecture rtl of dcache is -- Read a tag from a tag memory row function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is begin - return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS); - end; - - -- Write a tag to tag memory row - procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t; - tag: cache_tag_t) is - begin - tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; + return tagset(way * TAG_WIDTH + TAG_BITS - 1 downto way * TAG_WIDTH); end; -- Read a TLB tag from a TLB tag memory row @@ -419,6 +455,8 @@ architecture rtl of dcache is ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; end; + signal log_data : std_ulogic_vector(19 downto 0); + begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of 
ROW_SIZE" severity FAILURE; @@ -435,38 +473,42 @@ begin report "geometry bits don't add up" severity FAILURE; assert (64 = wishbone_data_bits) report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; + assert SET_SIZE_BITS <= TLB_LG_PGSZ report "Set indexed by virtual address" severity FAILURE; -- Latch the request in r0.req as long as we're not stalling stage_0 : process(clk) + variable r : reg_stage_0_t; begin if rising_edge(clk) then + assert (d_in.valid and m_in.valid) = '0' report + "request collision loadstore vs MMU"; + if m_in.valid = '1' then + r.req.valid := '1'; + r.req.load := not (m_in.tlbie or m_in.tlbld); + r.req.dcbz := '0'; + r.req.nc := '0'; + r.req.reserve := '0'; + r.req.virt_mode := '0'; + r.req.priv_mode := '1'; + r.req.addr := m_in.addr; + r.req.data := m_in.pte; + r.req.byte_sel := (others => '1'); + r.tlbie := m_in.tlbie; + r.doall := m_in.doall; + r.tlbld := m_in.tlbld; + r.mmu_req := '1'; + else + r.req := d_in; + r.tlbie := '0'; + r.doall := '0'; + r.tlbld := '0'; + r.mmu_req := '0'; + end if; if rst = '1' then - r0.req.valid <= '0'; - elsif stall_out = '0' then - assert (d_in.valid and m_in.valid) = '0' report - "request collision loadstore vs MMU"; - if m_in.valid = '1' then - r0.req.valid <= '1'; - r0.req.load <= not (m_in.tlbie or m_in.tlbld); - r0.req.dcbz <= '0'; - r0.req.nc <= '0'; - r0.req.reserve <= '0'; - r0.req.virt_mode <= '0'; - r0.req.priv_mode <= '1'; - r0.req.addr <= m_in.addr; - r0.req.data <= m_in.pte; - r0.req.byte_sel <= (others => '1'); - r0.tlbie <= m_in.tlbie; - r0.doall <= m_in.doall; - r0.tlbld <= m_in.tlbld; - r0.mmu_req <= '1'; - else - r0.req <= d_in; - r0.tlbie <= '0'; - r0.doall <= '0'; - r0.tlbld <= '0'; - r0.mmu_req <= '0'; - end if; + r0_full <= '0'; + elsif r1.full = '0' or r0_full = '0' then + r0 <= r; + r0_full <= r.req.valid; end if; end if; end process; @@ -474,9 +516,10 @@ begin -- we don't yet handle collisions between loadstore1 requests and MMU requests m_out.stall <= '0'; 
- -- Hold off the request in r0 when stalling, - -- and cancel it if we get an error in a previous request. - r0_valid <= r0.req.valid and not stall_out and not r1.error_done; + -- Hold off the request in r0 when r1 has an uncompleted request + r0_stall <= r0_full and r1.full; + r0_valid <= r0_full and not r1.full; + stall_out <= r0_stall; -- TLB -- Operates in the second cycle on the request latched in r0.req. @@ -486,20 +529,19 @@ begin variable addrbits : std_ulogic_vector(TLB_SET_BITS - 1 downto 0); begin if rising_edge(clk) then - if stall_out = '1' then - -- keep reading the same thing while stalled - index := tlb_req_index; + if m_in.valid = '1' then + addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); else - if m_in.valid = '1' then - addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); - else - addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); - end if; - index := to_integer(unsigned(addrbits)); + addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); + end if; + index := to_integer(unsigned(addrbits)); + -- If we have any op and the previous op isn't finished, + -- then keep the same output for next cycle. 
+ if r0_stall = '0' then + tlb_valid_way <= dtlb_valids(index); + tlb_tag_way <= dtlb_tags(index); + tlb_pte_way <= dtlb_ptes(index); end if; - tlb_valid_way <= dtlb_valids(index); - tlb_tag_way <= dtlb_tags(index); - tlb_pte_way <= dtlb_ptes(index); end if; end process; @@ -565,10 +607,12 @@ begin valid_ra <= tlb_hit or not r0.req.virt_mode; if r0.req.virt_mode = '1' then ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & - r0.req.addr(TLB_LG_PGSZ - 1 downto 0); + r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) & + (ROW_OFF_BITS-1 downto 0 => '0'); perm_attr <= extract_perm_attr(pte); else - ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto 0); + ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & + (ROW_OFF_BITS-1 downto 0 => '0'); perm_attr <= real_mode_perm_attr; end if; end process; @@ -648,35 +692,45 @@ begin end generate; end generate; + -- Cache tag RAM read port + cache_tag_read : process(clk) + variable index : index_t; + begin + if rising_edge(clk) then + if r0_stall = '1' then + index := req_index; + elsif m_in.valid = '1' then + index := get_index(m_in.addr); + else + index := get_index(d_in.addr); + end if; + cache_tag_set <= cache_tags(index); + end if; + end process; + -- Cache request parsing and hit detection dcache_request : process(all) - variable is_hit : std_ulogic; - variable hit_way : way_t; - variable op : op_t; - variable opsel : std_ulogic_vector(2 downto 0); - variable go : std_ulogic; - variable nc : std_ulogic; - variable s_hit : std_ulogic; - variable s_tag : cache_tag_t; - variable s_pte : tlb_pte_t; - variable s_ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); - variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable is_hit : std_ulogic; + variable hit_way : way_t; + variable op : op_t; + variable opsel : std_ulogic_vector(2 downto 0); + variable go : std_ulogic; + variable nc : std_ulogic; + variable s_hit : std_ulogic; + variable s_tag : cache_tag_t; + variable s_pte : tlb_pte_t; + variable s_ra : 
std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable hit_way_set : hit_way_set_t; + variable rel_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable rel_match : std_ulogic; begin -- Extract line, row and tag from request req_index <= get_index(r0.req.addr); req_row <= get_row(r0.req.addr); req_tag <= get_tag(ra); - -- Only do anything if not being stalled by stage 1 - go := r0_valid and not (r0.tlbie or r0.tlbld); - - -- Calculate address of beginning of cache line, will be - -- used for cache miss processing if needed - -- - req_laddr <= (63 downto REAL_ADDR_BITS => '0') & - ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.error_done; -- Test if pending request is a hit on any way -- In order to make timing in virtual mode, when we are using the TLB, @@ -684,7 +738,9 @@ begin -- the TLB, and then decide later which match to use. 
hit_way := 0; is_hit := '0'; + rel_match := '0'; if r0.req.virt_mode = '1' then + rel_matches := (others => '0'); for j in tlb_way_t loop hit_way_set(j) := 0; s_hit := '0'; @@ -694,27 +750,62 @@ begin s_tag := get_tag(s_ra); for i in way_t loop if go = '1' and cache_valids(req_index)(i) = '1' and - read_tag(i, cache_tags(req_index)) = s_tag and + read_tag(i, cache_tag_set) = s_tag and tlb_valid_way(j) = '1' then hit_way_set(j) := i; s_hit := '1'; end if; end loop; hit_set(j) := s_hit; + if s_tag = r1.reload_tag then + rel_matches(j) := '1'; + end if; end loop; if tlb_hit = '1' then is_hit := hit_set(tlb_hit_way); hit_way := hit_way_set(tlb_hit_way); + rel_match := rel_matches(tlb_hit_way); end if; else - s_tag := get_tag(r0.req.addr(REAL_ADDR_BITS - 1 downto 0)); + s_tag := get_tag(r0.req.addr); for i in way_t loop if go = '1' and cache_valids(req_index)(i) = '1' and - read_tag(i, cache_tags(req_index)) = s_tag then + read_tag(i, cache_tag_set) = s_tag then hit_way := i; is_hit := '1'; end if; end loop; + if s_tag = r1.reload_tag then + rel_match := '1'; + end if; + end if; + req_same_tag <= rel_match; + + -- See if the request matches the line currently being reloaded + if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and + rel_match = '1' then + -- For a store, consider this a hit even if the row isn't valid + -- since it will be by the time we perform the store. + -- For a load, check the appropriate row valid bit. + is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE); + hit_way := r1.store_way; + end if; + + -- Whether to use forwarded data for a load or not + use_forward1_next <= '0'; + if get_row(r1.req.real_addr) = req_row and r1.req.hit_way = hit_way then + -- Only need to consider r1.write_bram here, since if we are + -- writing refill data here, then we don't have a cache hit this + -- cycle on the line being refilled. 
(There is the possibility + -- that the load following the load miss that started the refill + -- could be to the old contents of the victim line, since it is a + -- couple of cycles after the refill starts before we see the + -- updated cache tag. In that case we don't use the bypass.) + use_forward1_next <= r1.write_bram; + end if; + use_forward2_next <= '0'; + if r1.forward_row1 = req_row and r1.forward_way1 = hit_way then + use_forward2_next <= r1.forward_valid1; end if; -- The way that matched on a hit @@ -728,6 +819,7 @@ begin rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed); perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and (perm_attr.wr_perm or (r0.req.load and perm_attr.rd_perm)); + access_ok <= valid_ra and perm_ok and rc_ok; -- Combine the request and cache hit status to decide what -- operation needs to be done @@ -735,7 +827,11 @@ begin nc := r0.req.nc or perm_attr.nocache; op := OP_NONE; if go = '1' then - if valid_ra = '1' and rc_ok = '1' and perm_ok = '1' then + if access_ok = '0' then + op := OP_BAD; + elsif cancel_store = '1' then + op := OP_STCX_FAIL; + else opsel := r0.req.load & nc & is_hit; case opsel is when "101" => op := OP_LOAD_HIT; @@ -748,8 +844,6 @@ begin when "111" => op := OP_BAD; when others => op := OP_NONE; end case; - else - op := OP_TLB_ERR; end if; end if; req_op <= op; @@ -758,7 +852,7 @@ begin -- in the cases where we need to read the cache data BRAM. -- If we're stalling then we need to keep reading the last -- row requested. 
- if stall_out = '0' then + if r0_stall = '0' then if m_in.valid = '1' then early_req_row <= get_row(m_in.addr); else @@ -772,9 +866,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Generate stalls from stage 1 state machine - stall_out <= '1' when r1.state /= IDLE else '0'; - -- Handle load-with-reservation and store-conditional instructions reservation_comb: process(all) begin @@ -801,11 +892,15 @@ begin reservation_reg: process(clk) begin if rising_edge(clk) then - if rst = '1' or clear_rsrv = '1' then + if rst = '1' then reservation.valid <= '0'; - elsif set_rsrv = '1' then - reservation.valid <= '1'; - reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); + elsif r0_valid = '1' and access_ok = '1' then + if clear_rsrv = '1' then + reservation.valid <= '0'; + elsif set_rsrv = '1' then + reservation.valid <= '1'; + reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); + end if; end if; end if; end process; @@ -813,11 +908,28 @@ begin -- Return data for loads & completion control logic -- writeback_control: process(all) + variable data_out : std_ulogic_vector(63 downto 0); + variable data_fwd : std_ulogic_vector(63 downto 0); + variable j : integer; begin + -- Use the bypass if are reading the row that was written 1 or 2 cycles + -- ago, including for the slow_valid = 1 case (i.e. completing a load + -- miss or a non-cacheable load). + if r1.use_forward1 = '1' then + data_fwd := r1.forward_data1; + else + data_fwd := r1.forward_data2; + end if; + data_out := cache_out(r1.hit_way); + for i in 0 to 7 loop + j := i * 8; + if r1.forward_sel(i) = '1' then + data_out(j + 7 downto j) := data_fwd(j + 7 downto j); + end if; + end loop; - -- The mux on d_out.data defaults to the normal load hit case. 
d_out.valid <= '0'; - d_out.data <= cache_out(r1.hit_way); + d_out.data <= data_out; d_out.store_done <= '0'; d_out.error <= '0'; d_out.cache_paradox <= '0'; @@ -825,7 +937,7 @@ begin -- Outputs to MMU m_out.done <= r1.tlbie_done; m_out.err <= '0'; - m_out.data <= cache_out(r1.hit_way); + m_out.data <= data_out; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -849,7 +961,7 @@ begin -- Request came from loadstore1... -- Load hit case is the standard path if r1.hit_load_valid = '1' then - report "completing load hit"; + report "completing load hit data=" & to_hstring(data_out); d_out.valid <= '1'; end if; @@ -863,16 +975,8 @@ begin -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then - -- If it's a load, enable register writeback and switch - -- mux accordingly - -- - if r1.req.load then - -- Read data comes from the slow data latch - d_out.data <= r1.slow_data; - end if; d_out.store_done <= '1'; - - report "completing store or load miss"; + report "completing store or load miss data=" & to_hstring(data_out); d_out.valid <= '1'; end if; @@ -897,8 +1001,6 @@ begin -- Slow ops (i.e. load miss) if r1.slow_valid = '1' then - -- Read data comes from the slow data latch - m_out.data <= r1.slow_data; report "completing MMU load miss, data=" & to_hstring(m_out.data); m_out.done <= '1'; end if; @@ -942,8 +1044,6 @@ begin wr_data => wr_data ); process(all) - variable tmp_adr : std_ulogic_vector(63 downto 0); - variable reloading : boolean; begin -- Cache hit reads do_read <= '1'; @@ -955,43 +1055,40 @@ begin -- Defaults to wishbone read responses (cache refill), -- -- For timing, the mux on wr_data/sel/addr is not dependent on anything - -- other than the current state. Only the do_write signal is. + -- other than the current state. 
-- - if r1.state = IDLE then - -- In IDLE state, the only write path is the store-hit update case - wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= r0.req.data; - wr_sel <= r0.req.byte_sel; + wr_sel_m <= (others => '0'); + + do_write <= '0'; + if r1.write_bram = '1' then + -- Write store data to BRAM. This happens one cycle after the + -- store is in r0. + wr_data <= r1.req.data; + wr_sel <= r1.req.byte_sel; + wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.req.real_addr), ROW_BITS)); + if i = r1.req.hit_way then + do_write <= '1'; + end if; else -- Otherwise, we might be doing a reload or a DCBZ - if r1.req.dcbz = '1' then + if r1.dcbz = '1' then wr_data <= (others => '0'); else wr_data <= wishbone_in.dat; end if; - wr_sel <= (others => '1'); - wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); - end if; + wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); + wr_sel <= (others => '1'); - -- The two actual write cases here - do_write <= '0'; - reloading := r1.state = RELOAD_WAIT_ACK; - if reloading and wishbone_in.ack = '1' and r1.store_way = i then - do_write <= '1'; - end if; - if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' and - r0.req.dcbz = '0' then - assert not reloading report "Store hit while in state:" & - state_t'image(r1.state) - severity FAILURE; - do_write <= '1'; + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r1.store_way = i then + do_write <= '1'; + end if; end if; -- Mask write selects with do_write since BRAM doesn't -- have a global write-enable - for i in 0 to ROW_SIZE-1 loop - wr_sel_m(i) <= wr_sel(i) and do_write; - end loop; + if do_write = '1' then + wr_sel_m <= wr_sel; + end if; end process; end generate; @@ -1003,15 +1100,7 @@ begin dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- If we have a request incoming, we have to latch it as r0.req.valid - -- is only set for a single cycle. 
It's up to the control logic to - -- ensure we don't override an uncompleted request (for now we are - -- single issue on load/stores so we are fine, later, we can generate - -- a stall output if necessary). - - if req_op /= OP_NONE and stall_out = '0' then - r1.req <= r0.req; - r1.mmu_req <= r0.mmu_req; + if req_op /= OP_NONE then report "op:" & op_t'image(req_op) & " addr:" & to_hstring(r0.req.addr) & " nc:" & std_ulogic'image(r0.req.nc) & @@ -1019,8 +1108,11 @@ begin " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); end if; + if r0_valid = '1' then + r1.mmu_req <= r0.mmu_req; + end if; - -- Fast path for load/store hits. Set signals for the writeback controls. + -- Fast path for load/store hits. Set signals for the writeback controls. if req_op = OP_LOAD_HIT then r1.hit_way <= req_hit_way; r1.hit_load_valid <= '1'; @@ -1028,27 +1120,29 @@ begin r1.hit_load_valid <= '0'; end if; - if req_op = OP_TLB_ERR then + if req_op = OP_BAD then report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); r1.error_done <= '1'; - r1.cache_paradox <= '0'; - elsif req_op = OP_BAD then - report "Signalling cache paradox"; - r1.error_done <= '1'; - r1.cache_paradox <= '1'; + r1.cache_paradox <= access_ok; else r1.error_done <= '0'; r1.cache_paradox <= '0'; end if; + if req_op = OP_STCX_FAIL then + r1.stcx_fail <= '1'; + else + r1.stcx_fail <= '0'; + end if; + -- complete tlbies and TLB loads in the third cycle r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); end if; end process; -- - -- Every other case is handled by this state machine: + -- Memory accesses are handled by this state machine: -- -- * Cache load miss/reload (in conjunction with "rams") -- * Load hits for non-cachable forms @@ -1058,16 +1152,45 @@ begin -- operates at stage 1. 
-- dcache_slow : process(clk) - variable tagset : cache_tags_set_t; variable stbs_done : boolean; + variable req : mem_access_request_t; + variable acks : unsigned(2 downto 0); begin if rising_edge(clk) then + r1.use_forward1 <= use_forward1_next; + r1.forward_sel <= (others => '0'); + if use_forward1_next = '1' then + r1.forward_sel <= r1.req.byte_sel; + elsif use_forward2_next = '1' then + r1.forward_sel <= r1.forward_sel1; + end if; + + r1.forward_data2 <= r1.forward_data1; + if r1.write_bram = '1' then + r1.forward_data1 <= r1.req.data; + r1.forward_sel1 <= r1.req.byte_sel; + r1.forward_way1 <= r1.req.hit_way; + r1.forward_row1 <= get_row(r1.req.real_addr); + r1.forward_valid1 <= '1'; + else + if r1.dcbz = '1' then + r1.forward_data1 <= (others => '0'); + else + r1.forward_data1 <= wishbone_in.dat; + end if; + r1.forward_sel1 <= (others => '1'); + r1.forward_way1 <= r1.store_way; + r1.forward_row1 <= r1.store_row; + r1.forward_valid1 <= '0'; + end if; + -- On reset, clear all valid bits to force misses if rst = '1' then for i in index_t loop cache_valids(i) <= (others => '0'); end loop; r1.state <= IDLE; + r1.full <= '0'; r1.slow_valid <= '0'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; @@ -1077,44 +1200,79 @@ begin else -- One cycle pulses reset r1.slow_valid <= '0'; - r1.stcx_fail <= '0'; + r1.write_bram <= '0'; + + if r1.write_tag = '1' then + -- Store new tag in selected way + for i in 0 to NUM_WAYS-1 loop + if i = r1.store_way then + cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <= + (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag; + end if; + end loop; + r1.write_tag <= '0'; + end if; + + -- Take request from r1.req if there is one there, + -- else from req_op, ra, etc. 
+ if r1.full = '1' then + req := r1.req; + else + req.op := req_op; + req.dcbz := r0.req.dcbz; + req.real_addr := ra; + req.data := r0.req.data; + req.byte_sel := r0.req.byte_sel; + req.hit_way := req_hit_way; + req.repl_way := replace_way; + req.same_tag := req_same_tag; + + -- Store the incoming request from r0, if it is a slow request + -- Note that r1.full = 1 implies req_op = OP_NONE + if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC or + req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then + r1.req <= req; + r1.full <= '1'; + end if; + end if; -- Main state machine case r1.state is when IDLE => - case req_op is + r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); + r1.dcbz <= '0'; + + -- Keep track of our index and way for subsequent stores. + r1.store_index <= get_index(req.real_addr); + r1.store_row <= get_row(req.real_addr); + r1.end_row_ix <= get_row_of_line(get_row(req.real_addr)) - 1; + r1.reload_tag <= get_tag(req.real_addr); + r1.req.same_tag <= '1'; + + if req.op = OP_STORE_HIT then + r1.store_way <= req.hit_way; + else + r1.store_way <= req.repl_way; + end if; + + -- Reset per-row valid bits, ready for handling OP_LOAD_MISS + for i in 0 to ROW_PER_LINE - 1 loop + r1.rows_valid(i) <= '0'; + end loop; + + case req.op is when OP_LOAD_HIT => -- stay in IDLE state when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(r0.req.addr) & - " idx:" & integer'image(req_index) & - " way:" & integer'image(replace_way) & - " tag:" & to_hstring(req_tag); - - -- Force misses on that way while reloading that line - cache_valids(req_index)(replace_way) <= '0'; - - -- Store new tag in selected way - for i in 0 to NUM_WAYS-1 loop - if i = replace_way then - tagset := cache_tags(req_index); - write_tag(i, tagset, req_tag); - cache_tags(req_index) <= tagset; - end if; - end loop; - - -- Keep track of our index and way for subsequent stores. 
- r1.store_index <= req_index; - r1.store_way <= replace_way; - r1.store_row <= get_row(req_laddr); - - -- Prep for first wishbone read. We calculate the address of - -- the start of the cache line and start the WB cycle - -- - r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0); + report "cache miss real addr:" & to_hstring(req.real_addr) & + " idx:" & integer'image(get_index(req.real_addr)) & + " way:" & integer'image(req.repl_way) & + " tag:" & to_hstring(get_tag(req.real_addr)); + + -- Start the wishbone cycle r1.wb.sel <= (others => '1'); r1.wb.we <= '0'; r1.wb.cyc <= '1'; @@ -1122,74 +1280,52 @@ begin -- Track that we had one request sent r1.state <= RELOAD_WAIT_ACK; + r1.write_tag <= '1'; when OP_LOAD_NC => - r1.wb.sel <= r0.req.byte_sel; - r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; + r1.wb.sel <= req.byte_sel; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - if r0.req.dcbz = '0' then - r1.wb.sel <= r0.req.byte_sel; - r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= r0.req.data; - if cancel_store = '0' then - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; - r1.wb.we <= '1'; - r1.state <= STORE_WAIT_ACK; - else - r1.stcx_fail <= '1'; - r1.state <= IDLE; + if req.dcbz = '0' then + r1.wb.sel <= req.byte_sel; + r1.wb.dat <= req.data; + r1.state <= STORE_WAIT_ACK; + r1.acks_pending <= to_unsigned(1, 3); + r1.full <= '0'; + r1.slow_valid <= '1'; + if req.op = OP_STORE_HIT then + r1.write_bram <= '1'; end if; else -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading - r1.store_index <= req_index; - r1.store_row <= get_row(req_laddr); - - if req_op = OP_STORE_HIT then - r1.store_way <= req_hit_way; - else - r1.store_way <= replace_way; - - -- Force misses on the victim way while zeroing - cache_valids(req_index)(replace_way) <= '0'; - - -- Store new tag in selected way - for i in 0 to NUM_WAYS-1 loop - if i = replace_way then - tagset := 
cache_tags(req_index); - write_tag(i, tagset, req_tag); - cache_tags(req_index) <= tagset; - end if; - end loop; - end if; - -- Set up for wishbone writes - r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0); + -- Start the wishbone writes r1.wb.sel <= (others => '1'); - r1.wb.we <= '1'; r1.wb.dat <= (others => '0'); - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; -- Handle the rest like a load miss r1.state <= RELOAD_WAIT_ACK; + r1.write_tag <= '1'; + r1.dcbz <= '1'; end if; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; -- OP_NONE and OP_BAD do nothing - -- OP_BAD was handled above already + -- OP_BAD & OP_STCX_FAIL were handled above already when OP_NONE => when OP_BAD => - when OP_TLB_ERR => + when OP_STCX_FAIL => end case; - when RELOAD_WAIT_ACK => - -- Requests are all sent if stb is 0 + when RELOAD_WAIT_ACK => + -- Requests are all sent if stb is 0 stbs_done := r1.wb.stb = '0'; -- If we are still sending requests, was one accepted ? @@ -1198,7 +1334,7 @@ begin -- stb and set stbs_done so we can handle an eventual last -- ack on the same cycle. -- - if is_last_row_addr(r1.wb.adr) then + if is_last_row_addr(r1.wb.adr, r1.end_row_ix) then r1.wb.stb <= '0'; stbs_done := true; end if; @@ -1208,44 +1344,76 @@ begin end if; -- Incoming acks processing + r1.forward_valid1 <= wishbone_in.ack; if wishbone_in.ack = '1' then - -- Is this the data we were looking for ? Latch it so - -- we can respond later. We don't currently complete the - -- pending miss request immediately, we wait for the - -- whole line to be loaded. The reason is that if we - -- did, we would potentially get new requests in while - -- not idle, which we don't currently know how to deal - -- with. - -- - if r1.store_row = get_row(r1.req.addr) and r1.req.dcbz = '0' then - r1.slow_data <= wishbone_in.dat; + r1.rows_valid(r1.store_row mod ROW_PER_LINE) <= '1'; + -- If this is the data we were looking for, we can + -- complete the request next cycle. 
+ -- Compare the whole address in case the request in + -- r1.req is not the one that started this refill. + if r1.full = '1' and r1.req.same_tag = '1' and + ((r1.dcbz = '1' and r1.req.dcbz = '1') or + (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS)) and + r1.store_row = get_row(r1.req.real_addr) then + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.forward_sel <= (others => '1'); + r1.use_forward1 <= '1'; end if; -- Check for completion - if stbs_done and is_last_row(r1.store_row) then + if stbs_done and is_last_row(r1.store_row, r1.end_row_ix) then -- Complete wishbone cycle r1.wb.cyc <= '0'; -- Cache line is now valid cache_valids(r1.store_index)(r1.store_way) <= '1'; - -- Don't complete and go idle until next cycle, in - -- case the next request is for the last dword of - -- the cache line we just loaded. - r1.state <= FINISH_LD_MISS; + r1.state <= IDLE; end if; -- Increment store row counter r1.store_row <= next_row(r1.store_row); end if; - when FINISH_LD_MISS => - -- Write back the load data that we got - r1.slow_valid <= '1'; - r1.state <= IDLE; - report "completing miss !"; + when STORE_WAIT_ACK => + stbs_done := r1.wb.stb = '0'; + acks := r1.acks_pending; + -- Clear stb when slave accepted request + if wishbone_in.stall = '0' then + -- See if there is another store waiting to be done + -- which is in the same real page. + if acks < 7 and req.same_tag = '1' and + (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then + r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); + r1.wb.dat <= req.data; + r1.wb.sel <= req.byte_sel; + r1.wb.stb <= '1'; + stbs_done := false; + if req.op = OP_STORE_HIT then + r1.write_bram <= '1'; + end if; + r1.full <= '0'; + r1.slow_valid <= '1'; + acks := acks + 1; + else + r1.wb.stb <= '0'; + stbs_done := true; + end if; + end if; + + -- Got ack ? See if complete. 
+ if wishbone_in.ack = '1' then + if stbs_done and acks = 1 then + r1.state <= IDLE; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + end if; + acks := acks - 1; + end if; + r1.acks_pending <= acks; - when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => + when NC_LOAD_WAIT_ACK => -- Clear stb when slave accepted request if wishbone_in.stall = '0' then r1.wb.stb <= '0'; @@ -1253,11 +1421,11 @@ begin -- Got ack ? complete. if wishbone_in.ack = '1' then - if r1.state = NC_LOAD_WAIT_ACK then - r1.slow_data <= wishbone_in.dat; - end if; r1.state <= IDLE; + r1.full <= '0'; r1.slow_valid <= '1'; + r1.forward_sel <= (others => '1'); + r1.use_forward1 <= '1'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; @@ -1265,4 +1433,22 @@ begin end if; end if; end process; + + dcache_log: process(clk) + begin + if rising_edge(clk) then + log_data <= r1.wb.adr(5 downto 3) & + wishbone_in.stall & + wishbone_in.ack & + r1.wb.stb & r1.wb.cyc & + d_out.error & + d_out.valid & + std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) & + stall_out & + std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) & + valid_ra & + std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3)); + end if; + end process; + log_out <= log_data; end; diff --git a/decode1.vhdl b/decode1.vhdl index b6da5d7..2060e64 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -8,19 +8,24 @@ use work.decode_types.all; entity decode1 is port ( - clk : in std_ulogic; - rst : in std_ulogic; - - stall_in : in std_ulogic; - flush_in : in std_ulogic; - - f_in : in Fetch2ToDecode1Type; - d_out : out Decode1ToDecode2Type + clk : in std_ulogic; + rst : in std_ulogic; + + stall_in : in std_ulogic; + flush_in : in std_ulogic; + busy_out : out std_ulogic; + flush_out : out std_ulogic; + + f_in : in IcacheToDecode1Type; + f_out : out Decode1ToFetch1Type; + d_out : out Decode1ToDecode2Type; + log_out : out std_ulogic_vector(12 downto 0) ); end entity decode1; architecture behaviour of decode1 is signal r, rin : Decode1ToDecode2Type; + signal s : Decode1ToDecode2Type; 
subtype major_opcode_t is unsigned(5 downto 0); type major_rom_array_t is array(0 to 63) of decode_rom_t; @@ -352,24 +357,45 @@ architecture behaviour of decode1 is constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); + signal log_data : std_ulogic_vector(12 downto 0); + begin decode1_0: process(clk) begin if rising_edge(clk) then - -- Output state remains unchanged on stall, unless we are flushing - if rst = '1' or flush_in = '1' or stall_in = '0' then - r <= rin; + if rst = '1' then + r <= Decode1ToDecode2Init; + s <= Decode1ToDecode2Init; + elsif flush_in = '1' then + r.valid <= '0'; + s.valid <= '0'; + elsif s.valid = '1' then + if stall_in = '0' then + r <= s; + s.valid <= '0'; + end if; + else + s <= rin; + s.valid <= rin.valid and r.valid and stall_in; + if r.valid = '0' or stall_in = '0' then + r <= rin; + end if; end if; end if; end process; + busy_out <= s.valid; decode1_1: process(all) variable v : Decode1ToDecode2Type; + variable f : Decode1ToFetch1Type; variable majorop : major_opcode_t; variable op_19_bits: std_ulogic_vector(2 downto 0); variable sprn : spr_num_t; + variable br_nia : std_ulogic_vector(61 downto 0); + variable br_target : std_ulogic_vector(61 downto 0); + variable br_offset : signed(23 downto 0); begin - v := r; + v := Decode1ToDecode2Init; v.valid := f_in.valid; v.nia := f_in.nia; @@ -395,6 +421,31 @@ begin -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); + -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path + sprn := decode_spr_num(f_in.insn); + v.ispr1 := fast_spr_num(sprn); + + if std_match(f_in.insn(10 downto 1), "01-1010011") then + -- mfspr or mtspr + -- Make 
slow SPRs single issue + if is_fast_spr(v.ispr1) = '0' then + v.decode.sgl_pipe := '1'; + -- send MMU-related SPRs to loadstore1 + case sprn is + when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL => + v.decode.unit := LDST; + when others => + end case; + end if; + end if; + + elsif majorop = "010000" then + -- CTR may be needed as input to bc + v.decode := major_decode_rom_array(to_integer(majorop)); + if f_in.insn(23) = '0' then + v.ispr1 := fast_spr_num(SPR_CTR); + end if; + elsif majorop = "010011" then if decode_op_19_valid(to_integer(unsigned(f_in.insn(10 downto 1)))) = '0' then report "op 19 illegal subcode"; @@ -405,6 +456,27 @@ begin report "op 19 sub " & to_hstring(op_19_bits); end if; + -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path + if f_in.insn(2) = '0' then + -- Could be OP_BCREG: bclr, bcctr, bctar + -- Branch uses CTR as condition when BO(2) is 0. This is + -- also used to indicate that CTR is modified (they go + -- together). + if f_in.insn(23) = '0' then + v.ispr1 := fast_spr_num(SPR_CTR); + end if; + -- TODO: Add TAR + if f_in.insn(10) = '0' then + v.ispr2 := fast_spr_num(SPR_LR); + else + v.ispr2 := fast_spr_num(SPR_CTR); + end if; + else + -- Could be OP_RFID + v.ispr1 := fast_spr_num(SPR_SRR0); + v.ispr2 := fast_spr_num(SPR_SRR1); + end if; + elsif majorop = "011110" then v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1)))); @@ -422,56 +494,45 @@ begin v.decode := major_decode_rom_array(to_integer(majorop)); end if; - -- Set ISPR1/ISPR2 when needed - if v.decode.insn_type = OP_BC or v.decode.insn_type = OP_BCREG then - -- Branch uses CTR as condition when BO(2) is 0. This is - -- also used to indicate that CTR is modified (they go - -- together). 
- -- - if f_in.insn(23) = '0' then - v.ispr1 := fast_spr_num(SPR_CTR); - end if; - - -- Branch source register is an SPR - if v.decode.insn_type = OP_BCREG then - -- TODO: Add TAR - if f_in.insn(10) = '0' then - v.ispr2 := fast_spr_num(SPR_LR); - else - v.ispr2 := fast_spr_num(SPR_CTR); - end if; - end if; - elsif v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then - sprn := decode_spr_num(f_in.insn); - v.ispr1 := fast_spr_num(sprn); - -- Make slow SPRs single issue - if is_fast_spr(v.ispr1) = '0' then - v.decode.sgl_pipe := '1'; - -- send MMU-related SPRs to loadstore1 - case sprn is - when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL => - v.decode.unit := LDST; - when others => - end case; - end if; - elsif v.decode.insn_type = OP_RFID then - report "PPC RFID"; - v.ispr1 := fast_spr_num(SPR_SRR0); - v.ispr2 := fast_spr_num(SPR_SRR1); + -- Branch predictor + -- Note bclr, bcctr and bctar are predicted not taken as we have no + -- count cache or link stack. + br_offset := (others => '0'); + if majorop = 18 then + -- Unconditional branches are always taken + v.br_pred := '1'; + br_offset := signed(f_in.insn(25 downto 2)); + elsif majorop = 16 then + -- Predict backward branches as taken, forward as untaken + v.br_pred := f_in.insn(15); + br_offset := resize(signed(f_in.insn(15 downto 2)), 24); end if; - - if flush_in = '1' then - v.valid := '0'; - end if; - - if rst = '1' then - v := Decode1ToDecode2Init; + br_nia := f_in.nia(63 downto 2); + if f_in.insn(1) = '1' then + br_nia := (others => '0'); end if; + br_target := std_ulogic_vector(signed(br_nia) + br_offset); + f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid; + f.redirect_nia := br_target & "00"; -- Update registers rin <= v; -- Update outputs d_out <= r; + f_out <= f; + flush_out <= f.redirect; end process; + + dec1_log : process(clk) + begin + if rising_edge(clk) then + log_data <= std_ulogic_vector(to_unsigned(insn_type_t'pos(r.decode.insn_type), 6)) & + r.nia(5 downto 
2) & + std_ulogic_vector(to_unsigned(unit_t'pos(r.decode.unit), 2)) & + r.valid; + end if; + end process; + log_out <= log_data; + end architecture behaviour; diff --git a/decode2.vhdl b/decode2.vhdl index f889a23..80687a0 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -17,7 +17,7 @@ entity decode2 is rst : in std_ulogic; complete_in : in std_ulogic; - stall_in : in std_ulogic; + busy_in : in std_ulogic; stall_out : out std_ulogic; stopped_out : out std_ulogic; @@ -32,7 +32,9 @@ entity decode2 is r_out : out Decode2ToRegisterFileType; c_in : in CrFileToDecode2Type; - c_out : out Decode2ToCrFileType + c_out : out Decode2ToCrFileType; + + log_out : out std_ulogic_vector(9 downto 0) ); end entity decode2; @@ -43,6 +45,10 @@ architecture behaviour of decode2 is signal r, rin : reg_type; + signal deferred : std_ulogic; + + signal log_data : std_ulogic_vector(9 downto 0); + type decode_input_reg_t is record reg_valid : std_ulogic; reg : gspr_index_t; @@ -61,8 +67,6 @@ architecture behaviour of decode2 is return decode_input_reg_t is begin if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then - assert is_fast_spr(ispr) = '0' report "Decode A says GPR but ISPR says SPR:" & - to_hstring(ispr) severity failure; return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data); elsif t = SPR then -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. 
@@ -87,8 +91,6 @@ architecture behaviour of decode2 is begin case t is when RB => - assert is_fast_spr(ispr) = '0' report "Decode B says GPR but ISPR says SPR:" & - to_hstring(ispr) severity failure; ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data); when CONST_UI => ret := ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64))); @@ -196,6 +198,9 @@ architecture behaviour of decode2 is signal gpr_write : gspr_index_t; signal gpr_bypassable : std_ulogic; + signal update_gpr_write_valid : std_ulogic; + signal update_gpr_write_reg : gspr_index_t; + signal gpr_a_read_valid : std_ulogic; signal gpr_a_read :gspr_index_t; signal gpr_a_bypass : std_ulogic; @@ -220,7 +225,8 @@ begin complete_in => complete_in, valid_in => control_valid_in, - stall_in => stall_in, + busy_in => busy_in, + deferred => deferred, flush_in => flush_in, sgl_pipe_in => control_sgl_pipe, stop_mark_in => d_in.stop_mark, @@ -229,6 +235,9 @@ begin gpr_write_in => gpr_write, gpr_bypassable => gpr_bypassable, + update_gpr_write_valid => update_gpr_write_valid, + update_gpr_write_reg => update_gpr_write_reg, + gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -250,18 +259,24 @@ begin gpr_bypass_c => gpr_c_bypass ); + deferred <= r.e.valid and busy_in; + decode2_0: process(clk) begin if rising_edge(clk) then - if rin.e.valid = '1' then - report "execute " & to_hstring(rin.e.nia); + if rst = '1' or flush_in = '1' or deferred = '0' then + if rin.e.valid = '1' then + report "execute " & to_hstring(rin.e.nia); + end if; + r <= rin; end if; - r <= rin; end if; end process; - r_out.read1_reg <= gpr_or_spr_to_gspr(insn_ra(d_in.insn), d_in.ispr1); - r_out.read2_reg <= gpr_or_spr_to_gspr(insn_rb(d_in.insn), d_in.ispr2); + r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR + else gpr_to_gspr(insn_ra(d_in.insn)); + r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR + else gpr_to_gspr(insn_rb(d_in.insn)); r_out.read3_reg <= 
insn_rs(d_in.insn); c_out.read <= d_in.decode.input_cr; @@ -343,6 +358,7 @@ begin v.e.sign_extend := d_in.decode.sign_extend; v.e.update := d_in.decode.update; v.e.reserve := d_in.decode.reserve; + v.e.br_pred := d_in.br_pred; -- issue control control_valid_in <= d_in.valid; @@ -354,6 +370,13 @@ begin if EX1_BYPASS and d_in.decode.unit = ALU then gpr_bypassable <= '1'; end if; + update_gpr_write_valid <= d_in.decode.update; + update_gpr_write_reg <= decoded_reg_a.reg; + if v.e.lr = '1' then + -- there are no instructions that have both update=1 and lr=1 + update_gpr_write_valid <= '1'; + update_gpr_write_reg <= fast_spr_num(SPR_LR); + end if; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; @@ -371,7 +394,7 @@ begin v.e.insn_type := OP_ILLEGAL; end if; - if rst = '1' then + if rst = '1' or flush_in = '1' then v.e := Decode2ToExecute1Init; end if; @@ -381,4 +404,19 @@ begin -- Update outputs e_out <= r.e; end process; + + dec2_log : process(clk) + begin + if rising_edge(clk) then + log_data <= r.e.nia(5 downto 2) & + r.e.valid & + stopped_out & + stall_out & + r.e.bypass_data3 & + r.e.bypass_data2 & + r.e.bypass_data1; + end if; + end process; + log_out <= log_data; + end architecture behaviour; diff --git a/execute1.vhdl b/execute1.vhdl index cac8e8a..902af70 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -20,7 +20,7 @@ entity execute1 is -- asynchronous flush_out : out std_ulogic; - stall_out : out std_ulogic; + busy_out : out std_ulogic; e_in : in Decode2ToExecute1Type; l_in : in Loadstore1ToExecute1Type; @@ -36,34 +36,44 @@ entity execute1 is dbg_msr_out : out std_ulogic_vector(63 downto 0); icache_inval : out std_ulogic; - terminate_out : out std_ulogic + terminate_out : out std_ulogic; + + log_out : out std_ulogic_vector(14 downto 0); + log_rd_addr : out std_ulogic_vector(31 downto 0); + log_rd_data : in std_ulogic_vector(63 downto 0); + log_wr_addr : in std_ulogic_vector(31 downto 0) ); end entity execute1; architecture 
behaviour of execute1 is type reg_type is record e : Execute1ToWritebackType; + busy: std_ulogic; + terminate: std_ulogic; lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; div_in_progress : std_ulogic; cntz_in_progress : std_ulogic; + slow_op_insn : insn_type_t; slow_op_dest : gpr_index_t; slow_op_rc : std_ulogic; slow_op_oe : std_ulogic; slow_op_xerc : xer_common_t; - ldst_nia : std_ulogic_vector(63 downto 0); + last_nia : std_ulogic_vector(63 downto 0); + log_addr_spr : std_ulogic_vector(31 downto 0); end record; constant reg_type_init : reg_type := - (e => Execute1ToWritebackInit, lr_update => '0', + (e => Execute1ToWritebackInit, busy => '0', lr_update => '0', terminate => '0', mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0', - slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, - next_lr => (others => '0'), ldst_nia => (others => '0'), others => (others => '0')); + slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, + next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0')); signal r, rin : reg_type; signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); + signal valid_in : std_ulogic; signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); signal ctrl_tmp: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; @@ -72,8 +82,6 @@ architecture behaviour of execute1 is signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); signal countzero_result: std_ulogic_vector(63 downto 0); - signal popcnt_result: std_ulogic_vector(63 downto 0); - signal parity_result: std_ulogic_vector(63 downto 0); -- multiply signals signal x_to_multiply: Execute1ToMultiplyType; @@ -83,6 +91,11 @@ architecture behaviour of execute1 is signal x_to_divider: Execute1ToDividerType; signal divider_to_x: 
DividerToExecute1Type; + -- signals for logging + signal exception_log : std_ulogic; + signal irq_valid_log : std_ulogic; + signal log_data : std_ulogic_vector(14 downto 0); + type privilege_level is (USER, SUPER); type op_privilege_array is array(insn_type_t) of privilege_level; constant op_privilege: op_privilege_array := ( @@ -193,9 +206,7 @@ begin invert_in => e_in.invert_a, invert_out => e_in.invert_out, result => logical_result, - datalen => e_in.data_len, - popcnt => popcnt_result, - parity => parity_result + datalen => e_in.data_len ); countzero_0: entity work.zero_counter @@ -223,11 +234,17 @@ begin ); dbg_msr_out <= ctrl.msr; + log_rd_addr <= r.log_addr_spr; a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1; b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3; + busy_out <= l_in.busy or r.busy; + valid_in <= e_in.valid and not busy_out; + + terminate_out <= r.terminate; + execute1_0: process(clk) begin if rising_edge(clk) then @@ -238,7 +255,7 @@ begin else r <= rin; ctrl <= ctrl_tmp; - assert not (r.lr_update = '1' and e_in.valid = '1') + assert not (r.lr_update = '1' and valid_in = '1') report "LR update collision with valid in EX1" severity failure; if r.lr_update = '1' then @@ -274,7 +291,6 @@ begin variable sign1, sign2 : std_ulogic; variable abs1, abs2 : signed(63 downto 0); variable overflow : std_ulogic; - variable negative : std_ulogic; variable zerohi, zerolo : std_ulogic; variable msb_a, msb_b : std_ulogic; variable a_lt : std_ulogic; @@ -284,11 +300,18 @@ begin variable exception_nextpc : std_ulogic; variable trapval : std_ulogic_vector(4 downto 0); variable illegal : std_ulogic; + variable is_branch : std_ulogic; + variable taken_branch : std_ulogic; + variable abs_branch : std_ulogic; + variable spr_val : std_ulogic_vector(63 downto 0); begin result := (others => '0'); 
result_with_carry := (others => '0'); result_en := '0'; newcrf := (others => '0'); + is_branch := '0'; + taken_branch := '0'; + abs_branch := '0'; v := r; v.e := Execute1ToWritebackInit; @@ -334,32 +357,7 @@ begin v.div_in_progress := '0'; v.cntz_in_progress := '0'; - -- signals to multiply unit - x_to_multiply <= Execute1ToMultiplyInit; - x_to_multiply.insn_type <= e_in.insn_type; - x_to_multiply.is_32bit <= e_in.is_32bit; - - if e_in.is_32bit = '1' then - if e_in.is_signed = '1' then - x_to_multiply.data1 <= (others => a_in(31)); - x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0); - x_to_multiply.data2 <= (others => b_in(31)); - x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0); - else - x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0); - x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0); - end if; - else - if e_in.is_signed = '1' then - x_to_multiply.data1 <= a_in(63) & a_in; - x_to_multiply.data2 <= b_in(63) & b_in; - else - x_to_multiply.data1 <= '0' & a_in; - x_to_multiply.data2 <= '0' & b_in; - end if; - end if; - - -- signals to divide unit + -- signals to multiply and divide units sign1 := '0'; sign2 := '0'; if e_in.is_signed = '1' then @@ -383,15 +381,22 @@ begin abs2 := - signed(b_in); end if; + x_to_multiply <= Execute1ToMultiplyInit; + x_to_multiply.is_32bit <= e_in.is_32bit; + x_to_divider <= Execute1ToDividerInit; x_to_divider.is_signed <= e_in.is_signed; x_to_divider.is_32bit <= e_in.is_32bit; if e_in.insn_type = OP_MOD then x_to_divider.is_modulus <= '1'; end if; + + x_to_multiply.neg_result <= sign1 xor sign2; x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus); if e_in.is_32bit = '0' then -- 64-bit forms + x_to_multiply.data1 <= std_ulogic_vector(abs1); + x_to_multiply.data2 <= std_ulogic_vector(abs2); if e_in.insn_type = OP_DIVE then x_to_divider.is_extended <= '1'; end if; @@ -399,6 +404,8 @@ begin x_to_divider.divisor <= std_ulogic_vector(abs2); else -- 32-bit forms + x_to_multiply.data1 
<= x"00000000" & std_ulogic_vector(abs1(31 downto 0)); + x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0)); x_to_divider.is_extended <= '0'; if e_in.insn_type = OP_DIVE then -- extended forms x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000"; @@ -426,9 +433,9 @@ begin end if; end if; - terminate_out <= '0'; + v.terminate := '0'; icache_inval <= '0'; - stall_out <= '0'; + v.busy := '0'; f_out <= Execute1ToFetch1TypeInit; -- send MSR[IR] and ~MSR[PR] up to fetch1 f_out.virt_mode <= ctrl.msr(MSR_IR); @@ -450,6 +457,9 @@ begin v.e.exc_write_enable := '0'; v.e.exc_write_reg := fast_spr_num(SPR_SRR0); v.e.exc_write_data := e_in.nia; + if valid_in = '1' then + v.last_nia := e_in.nia; + end if; if ctrl.irq_state = WRITE_SRR1 then v.e.exc_write_reg := fast_spr_num(SPR_SRR1); @@ -466,10 +476,10 @@ begin f_out.virt_mode <= '0'; f_out.priv_mode <= '1'; f_out.redirect_nia <= ctrl.irq_nia; - v.e.valid := e_in.valid; + v.e.valid := '1'; report "Writing SRR1: " & to_hstring(ctrl.srr1); - elsif irq_valid = '1' and e_in.valid = '1' then + elsif irq_valid = '1' and valid_in = '1' then -- we need two cycles to write srr0 and 1 -- will need more when we have to write HEIR -- Don't deliver the interrupt until we have a valid instruction @@ -477,7 +487,7 @@ begin exception := '1'; ctrl_tmp.srr1 <= msr_copy(ctrl.msr); - elsif e_in.valid = '1' and ctrl.msr(MSR_PR) = '1' and + elsif valid_in = '1' and ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then -- generate a program interrupt exception := '1'; @@ -487,12 +497,13 @@ begin ctrl_tmp.srr1(63 - 45) <= '1'; report "privileged instruction"; - elsif e_in.valid = '1' and e_in.unit = ALU then + elsif valid_in = '1' and e_in.unit = ALU then report "execute nia " & to_hstring(e_in.nia); v.e.valid := '1'; v.e.write_reg := e_in.write_reg; + v.slow_op_insn := e_in.insn_type; v.slow_op_dest := gspr_to_gpr(e_in.write_reg); v.slow_op_rc := e_in.rc; v.slow_op_oe := e_in.oe; 
@@ -521,7 +532,7 @@ begin -- check bits 1-10 of the instruction to make sure it's attn -- if not then it is illegal if e_in.insn(10 downto 1) = "0100000000" then - terminate_out <= '1'; + v.terminate := '1'; report "ATTN"; else illegal := '1'; @@ -612,16 +623,13 @@ begin end if; end if; end if; - when OP_AND | OP_OR | OP_XOR => + when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS => result := logical_result; result_en := '1'; when OP_B => - f_out.redirect <= '1'; - if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= b_in; - else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); - end if; + is_branch := '1'; + taken_branch := '1'; + abs_branch := insn_aa(e_in.insn); when OP_BC => -- read_data1 is CTR bo := insn_bo(e_in.insn); @@ -631,14 +639,9 @@ begin result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then - f_out.redirect <= '1'; - if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= b_in; - else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); - end if; - end if; + is_branch := '1'; + taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in); + abs_branch := insn_aa(e_in.insn); when OP_BCREG => -- read_data1 is CTR -- read_data2 is target register (CTR, LR or TAR) @@ -649,7 +652,7 @@ begin result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, a_in) = '1' then f_out.redirect <= '1'; f_out.redirect_nia <= b_in(63 downto 2) & "00"; end if; @@ -670,27 +673,10 @@ begin ctrl_tmp.msr(MSR_DR) <= '1'; end if; - when OP_CMPB => - result := ppc_cmpb(c_in, b_in); - result_en := '1'; when OP_CNTZ => v.e.valid := '0'; v.cntz_in_progress := '1'; - stall_out <= '1'; - when OP_EXTS => - -- note data_len is a 1-hot encoding - negative := (e_in.data_len(0) and c_in(7)) or - (e_in.data_len(1) and c_in(15)) or - (e_in.data_len(2) and c_in(31)); - result := 
(others => negative); - if e_in.data_len(2) = '1' then - result(31 downto 16) := c_in(31 downto 16); - end if; - if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then - result(15 downto 8) := c_in(15 downto 8); - end if; - result(7 downto 0) := c_in(7 downto 0); - result_en := '1'; + v.busy := '1'; when OP_ISEL => crbit := to_integer(unsigned(insn_bc(e_in.insn))); if e_in.cr(31-crbit) = '1' then @@ -762,19 +748,25 @@ begin result(63-45) := v.e.xerc.ca32; end if; else + spr_val := c_in; case decode_spr_num(e_in.insn) is when SPR_TB => - result := ctrl.tb; + spr_val := ctrl.tb; when SPR_DEC => - result := ctrl.dec; + spr_val := ctrl.dec; + when 724 => -- LOG_ADDR SPR + spr_val := log_wr_addr & r.log_addr_spr; + when 725 => -- LOG_DATA SPR + spr_val := log_rd_data; + v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1); when others => -- mfspr from unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode - result := c_in; if ctrl.msr(MSR_PR) = '1' then illegal := '1'; end if; end case; + result := spr_val; end if; when OP_MFCR => if e_in.insn(20) = '0' then @@ -840,6 +832,8 @@ begin case decode_spr_num(e_in.insn) is when SPR_DEC => ctrl_tmp.dec <= c_in; + when 724 => -- LOG_ADDR SPR + v.log_addr_spr := c_in(31 downto 0); when others => -- mtspr to unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode @@ -848,12 +842,6 @@ begin end if; end case; end if; - when OP_POPCNT => - result := popcnt_result; - result_en := '1'; - when OP_PRTY => - result := parity_result; - result_en := '1'; when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI => result := rotator_result; if e_in.output_carry = '1' then @@ -871,53 +859,65 @@ begin when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => v.e.valid := '0'; v.mul_in_progress := '1'; - stall_out <= '1'; + v.busy := '1'; x_to_multiply.valid <= '1'; when OP_DIV | OP_DIVE | OP_MOD => v.e.valid := '0'; v.div_in_progress := '1'; - stall_out <= 
'1'; + v.busy := '1'; x_to_divider.valid <= '1'; when others => - terminate_out <= '1'; + v.terminate := '1'; report "illegal"; end case; - v.e.rc := e_in.rc and e_in.valid; + v.e.rc := e_in.rc and valid_in; + + -- Mispredicted branches cause a redirect + if is_branch = '1' and taken_branch /= e_in.br_pred then + f_out.redirect <= '1'; + if taken_branch = '1' then + if abs_branch = '1' then + f_out.redirect_nia <= b_in; + else + f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); + end if; + else + f_out.redirect_nia <= next_nia; + end if; + end if; -- Update LR on the next cycle after a branch link - -- - -- WARNING: The LR update isn't tracked by our hazard tracker. This - -- will work (well I hope) because it only happens on branches - -- which will flush all decoded instructions. By the time - -- fetch catches up, we'll have the new LR. This will - -- *not* work properly however if we have a branch predictor, - -- in which case the solution would probably be to keep a - -- local cache of the updated LR in execute1 (flushed on - -- exceptions) that is used instead of the value from - -- decode when its content is valid. + -- If we're not writing back anything else, we can write back LR + -- this cycle, otherwise we take an extra cycle. We use the + -- exc_write path since next_nia is written through that path + -- in other places. if e_in.lr = '1' then - v.lr_update := '1'; - v.next_lr := next_nia; - v.e.valid := '0'; - report "Delayed LR update to " & to_hstring(next_nia); - stall_out <= '1'; + if result_en = '0' then + v.e.exc_write_enable := '1'; + v.e.exc_write_data := next_nia; + v.e.exc_write_reg := fast_spr_num(SPR_LR); + else + v.lr_update := '1'; + v.next_lr := next_nia; + v.e.valid := '0'; + report "Delayed LR update to " & to_hstring(next_nia); + v.busy := '1'; + end if; end if; - elsif e_in.valid = '1' then + elsif valid_in = '1' then -- instruction for other units, i.e. 
LDST - v.ldst_nia := e_in.nia; - v.e.valid := '0'; if e_in.unit = LDST then lv.valid := '1'; end if; elsif r.lr_update = '1' then - result_en := '1'; - result := r.next_lr; - v.e.write_reg := fast_spr_num(SPR_LR); + v.e.exc_write_enable := '1'; + v.e.exc_write_data := r.next_lr; + v.e.exc_write_reg := fast_spr_num(SPR_LR); v.e.valid := '1'; elsif r.cntz_in_progress = '1' then -- cnt[lt]z always takes two cycles @@ -931,8 +931,18 @@ begin if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or (r.div_in_progress = '1' and divider_to_x.valid = '1') then if r.mul_in_progress = '1' then - result := multiply_to_x.write_reg_data; - overflow := multiply_to_x.overflow; + overflow := '0'; + case r.slow_op_insn is + when OP_MUL_H32 => + result := multiply_to_x.result(63 downto 32) & + multiply_to_x.result(63 downto 32); + when OP_MUL_H64 => + result := multiply_to_x.result(127 downto 64); + when others => + -- i.e. OP_MUL_L64 + result := multiply_to_x.result(63 downto 0); + overflow := multiply_to_x.overflow; + end case; else result := divider_to_x.write_reg_data; overflow := divider_to_x.overflow; @@ -952,7 +962,7 @@ begin end if; v.e.valid := '1'; else - stall_out <= '1'; + v.busy := '1'; v.mul_in_progress := r.mul_in_progress; v.div_in_progress := r.div_in_progress; end if; @@ -973,7 +983,8 @@ begin v.e.exc_write_data := next_nia; end if; ctrl_tmp.irq_state <= WRITE_SRR1; - v.e.valid := '1'; + v.busy := '1'; + v.e.valid := '0'; end if; v.e.write_data := result; @@ -1002,10 +1013,9 @@ begin end if; v.e.exc_write_enable := '1'; v.e.exc_write_reg := fast_spr_num(SPR_SRR0); - v.e.exc_write_data := r.ldst_nia; - report "ldst exception writing srr0=" & to_hstring(r.ldst_nia); + v.e.exc_write_data := r.last_nia; + report "ldst exception writing srr0=" & to_hstring(r.last_nia); ctrl_tmp.irq_state <= WRITE_SRR1; - v.e.valid := '1'; -- complete the original load or store end if; -- Outputs to loadstore1 (async) @@ -1040,5 +1050,26 @@ begin l_out <= lv; e_out <= r.e; flush_out 
<= f_out.redirect; + + exception_log <= exception; + irq_valid_log <= irq_valid; + end process; + + ex1_log : process(clk) + begin + if rising_edge(clk) then + log_data <= ctrl.msr(MSR_EE) & ctrl.msr(MSR_PR) & + ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & + exception_log & + irq_valid_log & + std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) & + "000" & + r.e.write_enable & + r.e.valid & + f_out.redirect & + r.busy & + flush_out; + end if; end process; + log_out <= log_data; end architecture behaviour; diff --git a/fetch1.vhdl b/fetch1.vhdl index cb1d1df..0d9c6f7 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -23,8 +23,14 @@ entity fetch1 is -- redirect from execution unit e_in : in Execute1ToFetch1Type; + -- redirect from decode1 + d_in : in Decode1ToFetch1Type; + -- Request to icache - i_out : out Fetch1ToIcacheType + i_out : out Fetch1ToIcacheType; + + -- outputs to logger + log_out : out std_ulogic_vector(42 downto 0) ); end entity fetch1; @@ -35,16 +41,18 @@ architecture behaviour of fetch1 is end record; signal r, r_next : Fetch1ToIcacheType; signal r_int, r_next_int : reg_internal_t; + signal log_nia : std_ulogic_vector(42 downto 0); begin regs : process(clk) begin if rising_edge(clk) then + log_nia <= r.nia(63) & r.nia(43 downto 2); if r /= r_next then report "fetch1 rst:" & std_ulogic'image(rst) & " IR:" & std_ulogic'image(e_in.virt_mode) & " P:" & std_ulogic'image(e_in.priv_mode) & - " R:" & std_ulogic'image(e_in.redirect) & + " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & " nia:" & to_hstring(r_next.nia) & @@ -54,6 +62,7 @@ begin r_int <= r_next_int; end if; end process; + log_out <= log_nia; comb : process(all) variable v : Fetch1ToIcacheType; @@ -62,6 +71,7 @@ begin begin v := r; v_int := r_int; + v.sequential := '0'; if rst = '1' then if alt_reset_in = '1' then @@ -76,6 +86,8 @@ begin v.nia := e_in.redirect_nia; v.virt_mode := 
e_in.virt_mode; v.priv_mode := e_in.priv_mode; + elsif d_in.redirect = '1' then + v.nia := d_in.redirect_nia; elsif stall_in = '0' then -- For debug stop/step to work properly we need a little bit of @@ -122,6 +134,7 @@ begin if increment then v.nia := std_logic_vector(unsigned(v.nia) + 4); + v.sequential := '1'; end if; end if; diff --git a/fetch2.vhdl b/fetch2.vhdl deleted file mode 100644 index 13ff56e..0000000 --- a/fetch2.vhdl +++ /dev/null @@ -1,123 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -library work; -use work.common.all; -use work.wishbone_types.all; - -entity fetch2 is - port( - clk : in std_ulogic; - rst : in std_ulogic; - - stall_in : in std_ulogic; - flush_in : in std_ulogic; - - -- Results from icache - i_in : in IcacheToFetch2Type; - - -- Output to decode - f_out : out Fetch2ToDecode1Type - ); -end entity fetch2; - -architecture behaviour of fetch2 is - - -- The icache cannot stall, so we need to stash a cycle - -- of output from it when we stall. 
- type reg_internal_type is record - stash : IcacheToFetch2Type; - stash_valid : std_ulogic; - stopped : std_ulogic; - end record; - - signal r_int, rin_int : reg_internal_type; - signal r, rin : Fetch2ToDecode1Type; - -begin - regs : process(clk) - begin - if rising_edge(clk) then - - if (r /= rin) then - report "fetch2 rst:" & std_ulogic'image(rst) & - " S:" & std_ulogic'image(stall_in) & - " F:" & std_ulogic'image(flush_in) & - " T:" & std_ulogic'image(rin.stop_mark) & - " V:" & std_ulogic'image(rin.valid) & - " FF:" & std_ulogic'image(rin.fetch_failed) & - " nia:" & to_hstring(rin.nia); - end if; - - -- Output state remains unchanged on stall, unless we are flushing - if rst = '1' or flush_in = '1' or stall_in = '0' then - r <= rin; - end if; - - -- Internal state is updated on every clock - r_int <= rin_int; - end if; - end process; - - comb : process(all) - variable v : Fetch2ToDecode1Type; - variable v_int : reg_internal_type; - variable v_i_in : IcacheToFetch2Type; - begin - v := r; - v_int := r_int; - - -- If stalling, stash away the current input from the icache - if stall_in = '1' and v_int.stash_valid = '0' then - v_int.stash := i_in; - v_int.stash_valid := '1'; - end if; - - -- If unstalling, source input from the stash and invalidate it, - -- otherwise source normally from the icache. - -- - v_i_in := i_in; - if v_int.stash_valid = '1' and stall_in = '0' then - v_i_in := v_int.stash; - v_int.stash_valid := '0'; - end if; - - v.valid := v_i_in.valid; - v.stop_mark := v_i_in.stop_mark; - v.fetch_failed := v_i_in.fetch_failed; - v.nia := v_i_in.nia; - v.insn := v_i_in.insn; - - -- Clear stash internal valid bit on flush. We still mark - -- the stash itself as valid since we still want to override - -- whatever comes form icache when unstalling, but we'll - -- override it with something invalid. 
- -- - if flush_in = '1' then - v_int.stash.valid := '0'; - v_int.stash.fetch_failed := '0'; - end if; - - -- If we are flushing or the instruction comes with a stop mark - -- we tag it as invalid so it doesn't get decoded and executed - if flush_in = '1' or v.stop_mark = '1' then - v.valid := '0'; - v.fetch_failed := '0'; - end if; - - -- Clear stash on reset - if rst = '1' then - v_int.stash_valid := '0'; - v.valid := '0'; - end if; - - -- Update registers - rin <= v; - rin_int <= v_int; - - -- Update outputs - f_out <= r; - end process; - -end architecture behaviour; diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index d22212a..c8d560a 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -20,7 +20,8 @@ entity toplevel is SCLK_STARTUPE2 : boolean := false; SPI_FLASH_OFFSET : integer := 4194304; SPI_FLASH_DEF_CKDV : natural := 1; - SPI_FLASH_DEF_QUAD : boolean := true + SPI_FLASH_DEF_QUAD : boolean := true; + LOG_LENGTH : natural := 512 ); port( ext_clk : in std_ulogic; @@ -140,7 +141,8 @@ begin SPI_FLASH_DLINES => 4, SPI_FLASH_OFFSET => SPI_FLASH_OFFSET, SPI_FLASH_DEF_CKDV => SPI_FLASH_DEF_CKDV, - SPI_FLASH_DEF_QUAD => SPI_FLASH_DEF_QUAD + SPI_FLASH_DEF_QUAD => SPI_FLASH_DEF_QUAD, + LOG_LENGTH => LOG_LENGTH ) port map ( -- System signals diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl index de4f7d2..0fa66c5 100644 --- a/gpr_hazard.vhdl +++ b/gpr_hazard.vhdl @@ -4,11 +4,15 @@ use ieee.numeric_std.all; entity gpr_hazard is generic ( - PIPELINE_DEPTH : natural := 2 + PIPELINE_DEPTH : natural := 1 ); port( clk : in std_ulogic; - stall_in : in std_ulogic; + busy_in : in std_ulogic; + deferred : in std_ulogic; + complete_in : in std_ulogic; + flush_in : in std_ulogic; + issuing : in std_ulogic; gpr_write_valid_in : in std_ulogic; gpr_write_in : in std_ulogic_vector(5 downto 0); @@ -16,6 +20,9 @@ entity gpr_hazard is gpr_read_valid_in : in std_ulogic; gpr_read_in : in std_ulogic_vector(5 downto 0); + ugpr_write_valid : in std_ulogic; + ugpr_write_reg : in 
std_ulogic_vector(5 downto 0); + stall_out : out std_ulogic; use_bypass : out std_ulogic ); @@ -25,10 +32,13 @@ architecture behaviour of gpr_hazard is valid : std_ulogic; bypass : std_ulogic; gpr : std_ulogic_vector(5 downto 0); + ugpr_valid : std_ulogic; + ugpr : std_ulogic_vector(5 downto 0); end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0')); + constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'), + ugpr_valid => '0', ugpr => (others => '0')); - type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type; + type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); signal r, rin : pipeline_t := pipeline_t_init; @@ -45,50 +55,46 @@ begin begin v := r; + if complete_in = '1' then + v(PIPELINE_DEPTH).valid := '0'; + v(PIPELINE_DEPTH).ugpr_valid := '0'; + end if; + stall_out <= '0'; use_bypass <= '0'; if gpr_read_valid_in = '1' then - if r(0).valid = '1' and r(0).gpr = gpr_read_in then - if r(0).bypass = '1' and stall_in = '0' then - use_bypass <= '1'; - else - stall_out <= '1'; - end if; - end if; - loop_0: for i in 1 to PIPELINE_DEPTH-1 loop - if r(i).valid = '1' and r(i).gpr = gpr_read_in then + loop_0: for i in 0 to PIPELINE_DEPTH loop + if v(i).valid = '1' and r(i).gpr = gpr_read_in then if r(i).bypass = '1' then use_bypass <= '1'; else stall_out <= '1'; end if; end if; + if v(i).ugpr_valid = '1' and r(i).ugpr = gpr_read_in then + stall_out <= '1'; + end if; end loop; end if; - if stall_in = '0' then + -- XXX assumes PIPELINE_DEPTH = 1 + if busy_in = '0' then + v(1) := v(0); + v(0).valid := '0'; + v(0).ugpr_valid := '0'; + end if; + if deferred = '0' and issuing = '1' then v(0).valid := gpr_write_valid_in; v(0).bypass := bypass_avail; v(0).gpr := gpr_write_in; - loop_1: for i in 1 to PIPELINE_DEPTH-1 loop - -- propagate to next slot - 
v(i).valid := r(i-1).valid; - v(i).bypass := r(i-1).bypass; - v(i).gpr := r(i-1).gpr; - end loop; - - else - -- stage 0 stalled, so stage 1 becomes empty - loop_1b: for i in 1 to PIPELINE_DEPTH-1 loop - -- propagate to next slot - if i = 1 then - v(i).valid := '0'; - else - v(i).valid := r(i-1).valid; - v(i).bypass := r(i-1).bypass; - v(i).gpr := r(i-1).gpr; - end if; - end loop; + v(0).ugpr_valid := ugpr_write_valid; + v(0).ugpr := ugpr_write_reg; + end if; + if flush_in = '1' then + v(0).valid := '0'; + v(0).ugpr_valid := '0'; + v(1).valid := '0'; + v(1).ugpr_valid := '0'; end if; -- update registers diff --git a/icache.vhdl b/icache.vhdl index 27f8c6a..739e047 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -48,16 +48,19 @@ entity icache is rst : in std_ulogic; i_in : in Fetch1ToIcacheType; - i_out : out IcacheToFetch2Type; + i_out : out IcacheToDecode1Type; m_in : in MmuToIcacheType; + stall_in : in std_ulogic; stall_out : out std_ulogic; flush_in : in std_ulogic; inval_in : in std_ulogic; wishbone_out : out wishbone_master_out; - wishbone_in : in wishbone_slave_out + wishbone_in : in wishbone_slave_out; + + log_out : out std_ulogic_vector(53 downto 0) ); end entity icache; @@ -112,6 +115,7 @@ architecture rtl of icache is subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; subtype way_t is integer range 0 to NUM_WAYS-1; + subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0); -- The cache data BRAM organized as described above for each way subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0); @@ -129,6 +133,7 @@ architecture rtl of icache is -- The cache valid bits subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); type cache_valids_t is array(index_t) of cache_way_valids_t; + type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; -- Storage. 
Hopefully "cache_rows" is a BRAM, the rest is LUTs signal cache_tags : cache_tags_array_t; @@ -176,6 +181,8 @@ architecture rtl of icache is store_row : row_t; store_tag : cache_tag_t; store_valid : std_ulogic; + end_row_ix : row_in_line_t; + rows_valid : row_per_line_valid_t; -- TLB miss state fetch_failed : std_ulogic; @@ -197,6 +204,10 @@ architecture rtl of icache is signal ra_valid : std_ulogic; signal priv_fault : std_ulogic; signal access_ok : std_ulogic; + signal use_previous : std_ulogic; + + -- Output data to logger + signal log_data : std_ulogic_vector(53 downto 0); -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; @@ -219,20 +230,24 @@ architecture rtl of icache is return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; + -- Return the index of a row within a line + function get_row_of_line(row: row_t) return row_in_line_t is + variable row_v : unsigned(ROW_BITS-1 downto 0); + begin + row_v := to_unsigned(row, ROW_BITS); + return row_v(ROW_LINEBITS-1 downto 0); + end; + -- Returns whether this is the last row of a line - function is_last_row_addr(addr: wishbone_addr_type) return boolean is - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is begin - return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; + return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last; end; -- Returns whether this is the last row of a line - function is_last_row(row: row_t) return boolean is - variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); - constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + function is_last_row(row: row_t; last: row_in_line_t) return boolean is begin - row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); - return row_v(ROW_LINEBITS-1 downto 0) = ones; + return get_row_of_line(row) = last; end; -- Return the address of the next row in the 
current cache line @@ -361,7 +376,7 @@ begin ); process(all) begin - do_read <= '1'; + do_read <= not (stall_in or use_previous); do_write <= '0'; if wishbone_in.ack = '1' and r.store_way = i then do_write <= '1'; @@ -466,23 +481,38 @@ begin variable is_hit : std_ulogic; variable hit_way : way_t; begin + -- i_in.sequential means that i_in.nia this cycle is 4 more than + -- last cycle. If we read more than 32 bits at a time, had a cache hit + -- last cycle, and we don't want the first 32-bit chunk, then we can + -- keep the data we read last cycle and just use that. + if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then + use_previous <= i_in.sequential and r.hit_valid; + else + use_previous <= '0'; + end if; + -- Extract line, row and tag from request req_index <= get_index(i_in.nia); req_row <= get_row(i_in.nia); req_tag <= get_tag(real_addr); - -- Calculate address of beginning of cache line, will be + -- Calculate address of beginning of cache row, will be -- used for cache miss processing if needed -- req_laddr <= (63 downto REAL_ADDR_BITS => '0') & - real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & + (ROW_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; for i in way_t loop - if i_in.req = '1' and cache_valids(req_index)(i) = '1' then + if i_in.req = '1' and + (cache_valids(req_index)(i) = '1' or + (r.state = WAIT_ACK and + req_index = r.store_index and + i = r.store_way and + r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then if read_tag(i, cache_tags(req_index)) = req_tag then hit_way := i; is_hit := '1'; @@ -528,25 +558,35 @@ begin icache_hit : process(clk) begin if rising_edge(clk) then - -- On a hit, latch the request for the next cycle, when the BRAM data - -- will be available on the cache_out output of the corresponding way - -- - r.hit_valid <= req_is_hit; - -- Send stop marks and NIA down regardless 
of validity - r.hit_smark <= i_in.stop_mark; - r.hit_nia <= i_in.nia; - if req_is_hit = '1' then - r.hit_way <= req_hit_way; - r.hit_smark <= i_in.stop_mark; - - report "cache hit nia:" & to_hstring(i_in.nia) & - " IR:" & std_ulogic'image(i_in.virt_mode) & - " SM:" & std_ulogic'image(i_in.stop_mark) & - " idx:" & integer'image(req_index) & - " tag:" & to_hstring(req_tag) & - " way:" & integer'image(req_hit_way) & - " RA:" & to_hstring(real_addr); + -- keep outputs to fetch2 unchanged on a stall + -- except that flush or reset sets valid to 0 + -- If use_previous, keep the same data as last cycle and use the second half + if stall_in = '1' or use_previous = '1' then + if rst = '1' or flush_in = '1' then + r.hit_valid <= '0'; + end if; + else + -- On a hit, latch the request for the next cycle, when the BRAM data + -- will be available on the cache_out output of the corresponding way + -- + r.hit_valid <= req_is_hit; + if req_is_hit = '1' then + r.hit_way <= req_hit_way; + + report "cache hit nia:" & to_hstring(i_in.nia) & + " IR:" & std_ulogic'image(i_in.virt_mode) & + " SM:" & std_ulogic'image(i_in.stop_mark) & + " idx:" & integer'image(req_index) & + " tag:" & to_hstring(req_tag) & + " way:" & integer'image(req_hit_way) & + " RA:" & to_hstring(real_addr); + end if; end if; + if stall_in = '0' then + -- Send stop marks and NIA down regardless of validity + r.hit_smark <= i_in.stop_mark; + r.hit_nia <= i_in.nia; + end if; end if; end process; @@ -584,6 +624,11 @@ begin -- Main state machine case r.state is when IDLE => + -- Reset per-row valid flags, only used in WAIT_ACK + for i in 0 to ROW_PER_LINE - 1 loop + r.rows_valid(i) <= '0'; + end loop; + -- We need to read a cache line if req_is_miss = '1' then report "cache miss nia:" & to_hstring(i_in.nia) & @@ -600,6 +645,7 @@ begin r.store_row <= get_row(req_laddr); r.store_tag <= req_tag; r.store_valid <= '1'; + r.end_row_ix <= get_row_of_line(get_row(req_laddr)) - 1; -- Prep for first wishbone read. 
We calculate the address of -- the start of the cache line and start the WB cycle. @@ -637,7 +683,7 @@ begin -- stb and set stbs_done so we can handle an eventual last -- ack on the same cycle. -- - if is_last_row_addr(r.wb.adr) then + if is_last_row_addr(r.wb.adr, r.end_row_ix) then r.wb.stb <= '0'; stbs_done := true; end if; @@ -648,8 +694,9 @@ begin -- Incoming acks processing if wishbone_in.ack = '1' then + r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1'; -- Check for completion - if stbs_done and is_last_row(r.store_row) then + if stbs_done and is_last_row(r.store_row, r.end_row_ix) then -- Complete wishbone cycle r.wb.cyc <= '0'; @@ -669,9 +716,41 @@ begin -- TLB miss and protection fault processing if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then r.fetch_failed <= '0'; - elsif i_in.req = '1' and access_ok = '0' then + elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then r.fetch_failed <= '1'; end if; end if; end process; + + data_log: process(clk) + variable lway: way_t; + variable wstate: std_ulogic; + begin + if rising_edge(clk) then + if req_is_hit then + lway := req_hit_way; + else + lway := replace_way; + end if; + wstate := '0'; + if r.state /= IDLE then + wstate := '1'; + end if; + log_data <= i_out.valid & + i_out.insn & + wishbone_in.ack & + r.wb.adr(5 downto 3) & + r.wb.stb & r.wb.cyc & + wishbone_in.stall & + stall_out & + r.fetch_failed & + r.hit_nia(5 downto 2) & + wstate & + std_ulogic_vector(to_unsigned(lway, 3)) & + req_is_hit & req_is_miss & + access_ok & + ra_valid; + end if; + end process; + log_out <= log_data; end; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index 39e28d5..1d179d6 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -13,7 +13,7 @@ architecture behave of icache_tb is signal rst : std_ulogic; signal i_out : Fetch1ToIcacheType; - signal i_in : IcacheToFetch2Type; + signal i_in : IcacheToDecode1Type; signal m_out : MmuToIcacheType; @@ -33,6 +33,7 @@ begin i_in => i_out, i_out => i_in, m_in => m_out, + 
stall_in => '0', flush_in => '0', inval_in => '0', wishbone_out => wb_bram_in, diff --git a/loadstore1.vhdl b/loadstore1.vhdl index e71ad74..cf00987 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -25,7 +25,8 @@ entity loadstore1 is m_in : in MmuToLoadstore1Type; dc_stall : in std_ulogic; - stall_out : out std_ulogic + + log_out : out std_ulogic_vector(9 downto 0) ); end loadstore1; @@ -41,7 +42,8 @@ architecture behave of loadstore1 is ACK_WAIT, -- waiting for ack from dcache LD_UPDATE, -- writing rA with computed addr on load MMU_LOOKUP, -- waiting for MMU to look up translation - TLBIE_WAIT -- waiting for MMU to finish doing a tlbie + TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie + SPR_CMPLT -- complete a mf/tspr operation ); type reg_stage_t is record @@ -49,6 +51,7 @@ architecture behave of loadstore1 is load : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; + mfspr : std_ulogic; addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); load_data : std_ulogic_vector(63 downto 0); @@ -71,6 +74,7 @@ architecture behave of loadstore1 is dar : std_ulogic_vector(63 downto 0); dsisr : std_ulogic_vector(31 downto 0); instr_fault : std_ulogic; + sprval : std_ulogic_vector(63 downto 0); end record; type byte_sel_t is array(0 to 7) of std_ulogic; @@ -80,6 +84,8 @@ architecture behave of loadstore1 is signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); + signal log_data : std_ulogic_vector(9 downto 0); + -- Generate byte enables from sizes function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is begin @@ -135,7 +141,7 @@ begin variable long_sel : std_ulogic_vector(15 downto 0); variable byte_sel : std_ulogic_vector(7 downto 0); variable req : std_ulogic; - variable stall : std_ulogic; + variable busy : std_ulogic; variable addr : std_ulogic_vector(63 downto 0); variable wdata : std_ulogic_vector(63 downto 0); variable write_enable : std_ulogic; @@ -147,9 +153,7 @@ 
begin variable use_second : byte_sel_t; variable trim_ctl : trim_ctl_t; variable negative : std_ulogic; - variable mfspr : std_ulogic; variable sprn : std_ulogic_vector(9 downto 0); - variable sprval : std_ulogic_vector(63 downto 0); variable exception : std_ulogic; variable next_addr : std_ulogic_vector(63 downto 0); variable mmureq : std_ulogic; @@ -159,16 +163,12 @@ begin begin v := r; req := '0'; - stall := '0'; - done := '0'; byte_sel := (others => '0'); addr := lsu_sum; - mfspr := '0'; + v.mfspr := '0'; mmu_mtspr := '0'; itlb_fault := '0'; sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); - sprval := (others => '0'); -- avoid inferred latches - exception := '0'; dsisr := (others => '0'); mmureq := '0'; @@ -227,130 +227,18 @@ begin -- compute (addr + 8) & ~7 for the second doubleword when unaligned next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + done := '0'; + exception := '0'; case r.state is when IDLE => - if l_in.valid = '1' then - v.addr := lsu_sum; - v.load := '0'; - v.dcbz := '0'; - v.tlbie := '0'; - v.instr_fault := '0'; - v.dwords_done := '0'; - case l_in.op is - when OP_STORE => - req := '1'; - when OP_LOAD => - req := '1'; - v.load := '1'; - when OP_DCBZ => - req := '1'; - v.dcbz := '1'; - when OP_TLBIE => - mmureq := '1'; - stall := '1'; - v.tlbie := '1'; - v.state := TLBIE_WAIT; - when OP_MFSPR => - done := '1'; - mfspr := '1'; - -- partial decode on SPR number should be adequate given - -- the restricted set that get sent down this path - if sprn(9) = '0' and sprn(5) = '0' then - if sprn(0) = '0' then - sprval := x"00000000" & r.dsisr; - else - sprval := r.dar; - end if; - else - -- reading one of the SPRs in the MMU - sprval := m_in.sprval; - end if; - when OP_MTSPR => - if sprn(9) = '0' and sprn(5) = '0' then - if sprn(0) = '0' then - v.dsisr := l_in.data(31 downto 0); - else - v.dar := l_in.data; - end if; - done := '1'; - else - -- writing one of the SPRs in the MMU - mmu_mtspr := '1'; - stall 
:= '1'; - v.state := TLBIE_WAIT; - end if; - when OP_FETCH_FAILED => - -- send it to the MMU to do the radix walk - addr := l_in.nia; - v.addr := l_in.nia; - v.instr_fault := '1'; - mmureq := '1'; - stall := '1'; - v.state := MMU_LOOKUP; - when others => - assert false report "unknown op sent to loadstore1"; - end case; - - v.write_reg := l_in.write_reg; - v.length := l_in.length; - v.byte_reverse := l_in.byte_reverse; - v.sign_extend := l_in.sign_extend; - v.update := l_in.update; - v.update_reg := l_in.update_reg; - v.xerc := l_in.xerc; - v.reserve := l_in.reserve; - v.rc := l_in.rc; - v.nc := l_in.ci; - v.virt_mode := l_in.virt_mode; - v.priv_mode := l_in.priv_mode; - - -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- for a real-mode access. - -- - -- This will have to be replaced by a combination of implementing the - -- proper HV CI load/store instructions and having an MMU to get the I - -- bit otherwise. - if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then - v.nc := '1'; - end if; - - -- Do length_to_sel and work out if we are doing 2 dwords - long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); - byte_sel := long_sel(7 downto 0); - v.first_bytes := byte_sel; - v.second_bytes := long_sel(15 downto 8); - - -- Do byte reversing and rotating for stores in the first cycle - byte_offset := unsigned(lsu_sum(2 downto 0)); - brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; - end if; - for i in 0 to 7 loop - k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; - j := to_integer(k) * 8; - v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); - end loop; - - if req = '1' then - stall := '1'; - if long_sel(15 downto 8) = "00000000" then - v.state := ACK_WAIT; - else - v.state := SECOND_REQ; - end if; - end if; - end if; when SECOND_REQ => addr := next_addr; byte_sel := r.second_bytes; req := '1'; - stall := '1'; v.state := 
ACK_WAIT; when ACK_WAIT => - stall := '1'; if d_in.valid = '1' then if d_in.error = '1' then -- dcache will discard the second request if it @@ -388,7 +276,6 @@ begin else -- stores write back rA update in this cycle do_update := r.update; - stall := '0'; done := '1'; v.state := IDLE; end if; @@ -397,7 +284,6 @@ begin end if; when MMU_LOOKUP => - stall := '1'; if r.dwords_done = '1' then addr := next_addr; byte_sel := r.second_bytes; @@ -418,7 +304,6 @@ begin end if; else -- nothing to do, the icache retries automatically - stall := '0'; done := '1'; v.state := IDLE; end if; @@ -434,10 +319,8 @@ begin end if; when TLBIE_WAIT => - stall := '1'; if m_in.done = '1' then -- tlbie is finished - stall := '0'; done := '1'; v.state := IDLE; end if; @@ -447,8 +330,123 @@ begin v.state := IDLE; done := '1'; + when SPR_CMPLT => + done := '1'; + v.state := IDLE; + end case; + busy := '1'; + if r.state = IDLE or done = '1' then + busy := '0'; + end if; + + -- Note that l_in.valid is gated with busy inside execute1 + if l_in.valid = '1' then + v.addr := lsu_sum; + v.load := '0'; + v.dcbz := '0'; + v.tlbie := '0'; + v.instr_fault := '0'; + v.dwords_done := '0'; + v.write_reg := l_in.write_reg; + v.length := l_in.length; + v.byte_reverse := l_in.byte_reverse; + v.sign_extend := l_in.sign_extend; + v.update := l_in.update; + v.update_reg := l_in.update_reg; + v.xerc := l_in.xerc; + v.reserve := l_in.reserve; + v.rc := l_in.rc; + v.nc := l_in.ci; + v.virt_mode := l_in.virt_mode; + v.priv_mode := l_in.priv_mode; + + -- XXX Temporary hack. Mark the op as non-cachable if the address + -- is the form 0xc------- for a real-mode access. 
+ if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then + v.nc := '1'; + end if; + + -- Do length_to_sel and work out if we are doing 2 dwords + long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); + byte_sel := long_sel(7 downto 0); + v.first_bytes := byte_sel; + v.second_bytes := long_sel(15 downto 8); + + -- Do byte reversing and rotating for stores in the first cycle + byte_offset := unsigned(lsu_sum(2 downto 0)); + brev_lenm1 := "000"; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; + for i in 0 to 7 loop + k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; + j := to_integer(k) * 8; + v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); + end loop; + + case l_in.op is + when OP_STORE => + req := '1'; + when OP_LOAD => + req := '1'; + v.load := '1'; + when OP_DCBZ => + req := '1'; + v.dcbz := '1'; + when OP_TLBIE => + mmureq := '1'; + v.tlbie := '1'; + v.state := TLBIE_WAIT; + when OP_MFSPR => + v.mfspr := '1'; + -- partial decode on SPR number should be adequate given + -- the restricted set that get sent down this path + if sprn(9) = '0' and sprn(5) = '0' then + if sprn(0) = '0' then + v.sprval := x"00000000" & r.dsisr; + else + v.sprval := r.dar; + end if; + else + -- reading one of the SPRs in the MMU + v.sprval := m_in.sprval; + end if; + v.state := SPR_CMPLT; + when OP_MTSPR => + if sprn(9) = '0' and sprn(5) = '0' then + if sprn(0) = '0' then + v.dsisr := l_in.data(31 downto 0); + else + v.dar := l_in.data; + end if; + v.state := SPR_CMPLT; + else + -- writing one of the SPRs in the MMU + mmu_mtspr := '1'; + v.state := TLBIE_WAIT; + end if; + when OP_FETCH_FAILED => + -- send it to the MMU to do the radix walk + addr := l_in.nia; + v.addr := l_in.nia; + v.instr_fault := '1'; + mmureq := '1'; + v.state := MMU_LOOKUP; + when others => + assert false report "unknown op sent to loadstore1"; + end case; + + if req = '1' then + if long_sel(15 downto 8) = "00000000" then + 
v.state := ACK_WAIT; + else + v.state := SECOND_REQ; + end if; + end if; + end if; + -- Update outputs to dcache d_out.valid <= req; d_out.load <= v.load; @@ -477,10 +475,10 @@ begin -- Multiplex either cache data to the destination GPR or -- the address for the rA update. l_out.valid <= done; - if mfspr = '1' then + if r.mfspr = '1' then l_out.write_enable <= '1'; - l_out.write_reg <= l_in.write_reg; - l_out.write_data <= sprval; + l_out.write_reg <= r.write_reg; + l_out.write_data <= r.sprval; elsif do_update = '1' then l_out.write_enable <= '1'; l_out.write_reg <= r.update_reg; @@ -495,6 +493,7 @@ begin l_out.store_done <= d_in.store_done; -- update exception info back to execute1 + e_out.busy <= busy; e_out.exception <= exception; e_out.instr_fault <= r.instr_fault; e_out.invalid <= m_in.invalid; @@ -509,11 +508,23 @@ begin end if; end if; - stall_out <= stall; - -- Update registers rin <= v; end process; + ls1_log: process(clk) + begin + if rising_edge(clk) then + log_data <= e_out.busy & + e_out.exception & + l_out.valid & + m_out.valid & + d_out.valid & + m_in.done & + r.dwords_done & + std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3)); + end if; + end process; + log_out <= log_data; end; diff --git a/logical.vhdl b/logical.vhdl index 4dfc13d..5e6abfa 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -4,6 +4,7 @@ use ieee.numeric_std.all; library work; use work.decode_types.all; +use work.ppc_fx_insns.all; entity logical is port ( @@ -13,9 +14,7 @@ entity logical is invert_in : in std_ulogic; invert_out : in std_ulogic; result : out std_ulogic_vector(63 downto 0); - datalen : in std_logic_vector(3 downto 0); - popcnt : out std_ulogic_vector(63 downto 0); - parity : out std_ulogic_vector(63 downto 0) + datalen : in std_logic_vector(3 downto 0) ); end entity logical; @@ -34,30 +33,14 @@ architecture behaviour of logical is type sixbit2 is array(0 to 1) of sixbit; signal pc32 : sixbit2; signal par0, par1 : std_ulogic; + signal popcnt : std_ulogic_vector(63 
downto 0); + signal parity : std_ulogic_vector(63 downto 0); begin logical_0: process(all) variable rb_adj, tmp : std_ulogic_vector(63 downto 0); + variable negative : std_ulogic; begin - rb_adj := rb; - if invert_in = '1' then - rb_adj := not rb; - end if; - - case op is - when OP_AND => - tmp := rs and rb_adj; - when OP_OR => - tmp := rs or rb_adj; - when others => - tmp := rs xor rb_adj; - end case; - - result <= tmp; - if invert_out = '1' then - result <= not tmp; - end if; - -- population counts for i in 0 to 31 loop pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1)); @@ -98,5 +81,44 @@ begin parity(32) <= par1; end if; + rb_adj := rb; + if invert_in = '1' then + rb_adj := not rb; + end if; + + case op is + when OP_AND => + tmp := rs and rb_adj; + when OP_OR => + tmp := rs or rb_adj; + when OP_XOR => + tmp := rs xor rb_adj; + when OP_POPCNT => + tmp := popcnt; + when OP_PRTY => + tmp := parity; + when OP_CMPB => + tmp := ppc_cmpb(rs, rb); + when others => + -- EXTS + -- note datalen is a 1-hot encoding + negative := (datalen(0) and rs(7)) or + (datalen(1) and rs(15)) or + (datalen(2) and rs(31)); + tmp := (others => negative); + if datalen(2) = '1' then + tmp(31 downto 16) := rs(31 downto 16); + end if; + if datalen(2) = '1' or datalen(1) = '1' then + tmp(15 downto 8) := rs(15 downto 8); + end if; + tmp(7 downto 0) := rs(7 downto 0); + end case; + + if invert_out = '1' then + tmp := not tmp; + end if; + result <= tmp; + end process; end behaviour; diff --git a/microwatt.core b/microwatt.core index 87ef39d..83d7762 100644 --- a/microwatt.core +++ b/microwatt.core @@ -9,7 +9,6 @@ filesets: - wishbone_types.vhdl - common.vhdl - fetch1.vhdl - - fetch2.vhdl - decode1.vhdl - helpers.vhdl - decode2.vhdl @@ -27,7 +26,6 @@ filesets: - loadstore1.vhdl - mmu.vhdl - dcache.vhdl - - multiply.vhdl - divider.vhdl - rotator.vhdl - writeback.vhdl @@ -63,6 +61,10 @@ filesets: - fpga/firmware.hex : {copyto : firmware.hex, file_type 
: user} file_type : vhdlSource-2008 + xilinx_specific: + files: + - xilinx-mult.vhdl : {file_type : vhdlSource-2008} + debug_xilinx: files: - dmi_dtm_xilinx.vhdl : {file_type : vhdlSource-2008} @@ -101,20 +103,21 @@ filesets: targets: nexys_a7: default_tool: vivado - filesets: [core, nexys_a7, soc, fpga, debug_xilinx] + filesets: [core, nexys_a7, soc, fpga, debug_xilinx, xilinx_specific] parameters : - memory_size - ram_init_file - clk_input - clk_frequency - disable_flatten_core + - log_length=2048 tools: vivado: {part : xc7a100tcsg324-1} toplevel : toplevel nexys_video-nodram: default_tool: vivado - filesets: [core, nexys_video, soc, fpga, debug_xilinx] + filesets: [core, nexys_video, soc, fpga, debug_xilinx, xilinx_specific] parameters : - memory_size - ram_init_file @@ -122,13 +125,14 @@ targets: - clk_frequency - disable_flatten_core - spi_flash_offset=10485760 + - log_length=2048 tools: vivado: {part : xc7a200tsbg484-1} toplevel : toplevel nexys_video: default_tool: vivado - filesets: [core, nexys_video, soc, fpga, debug_xilinx, litedram] + filesets: [core, nexys_video, soc, fpga, debug_xilinx, litedram, xilinx_specific] parameters: - memory_size - ram_init_file @@ -136,6 +140,7 @@ targets: - disable_flatten_core - no_bram - spi_flash_offset=10485760 + - log_length=2048 generate: [dram_nexys_video] tools: vivado: {part : xc7a200tsbg484-1} @@ -143,7 +148,7 @@ targets: arty_a7-35-nodram: default_tool: vivado - filesets: [core, arty_a7, soc, fpga, debug_xilinx] + filesets: [core, arty_a7, soc, fpga, debug_xilinx, xilinx_specific] parameters : - memory_size - ram_init_file @@ -151,13 +156,14 @@ targets: - clk_frequency - disable_flatten_core - spi_flash_offset=3145728 + - log_length=512 tools: vivado: {part : xc7a35ticsg324-1L} toplevel : toplevel arty_a7-35: default_tool: vivado - filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram] + filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram, xilinx_specific] parameters : - memory_size - ram_init_file 
@@ -165,6 +171,7 @@ targets: - disable_flatten_core - no_bram - spi_flash_offset=3145728 + - log_length=512 generate: [dram_arty] tools: vivado: {part : xc7a35ticsg324-1L} @@ -172,7 +179,7 @@ targets: arty_a7-100-nodram: default_tool: vivado - filesets: [core, arty_a7, soc, fpga, debug_xilinx] + filesets: [core, arty_a7, soc, fpga, debug_xilinx, xilinx_specific] parameters : - memory_size - ram_init_file @@ -180,13 +187,14 @@ targets: - clk_frequency - disable_flatten_core - spi_flash_offset=4194304 + - log_length=2048 tools: vivado: {part : xc7a100ticsg324-1L} toplevel : toplevel arty_a7-100: default_tool: vivado - filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram] + filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram, xilinx_specific] parameters: - memory_size - ram_init_file @@ -194,6 +202,7 @@ targets: - disable_flatten_core - no_bram - spi_flash_offset=4194304 + - log_length=2048 generate: [dram_arty] tools: vivado: {part : xc7a100ticsg324-1L} @@ -201,7 +210,7 @@ targets: cmod_a7-35: default_tool: vivado - filesets: [core, cmod_a7-35, soc, fpga, debug_xilinx] + filesets: [core, cmod_a7-35, soc, fpga, debug_xilinx, xilinx_specific] parameters : - memory_size - ram_init_file @@ -209,12 +218,13 @@ targets: - clk_input=12000000 - clk_frequency - disable_flatten_core + - log_length=512 tools: vivado: {part : xc7a35tcpg236-1} toplevel : toplevel synth: - filesets: [core, soc] + filesets: [core, soc, xilinx_specific] tools: vivado: {pnr : none} toplevel: core @@ -279,3 +289,8 @@ parameters: datatype : int description : Offset (in bytes) in the SPI flash of the code payload to run paramtype : generic + + log_length: + datatype : int + description : Length of the core log buffer in entries (32 bytes each) + paramtype : generic diff --git a/mmu.vhdl b/mmu.vhdl index 0eefbab..fc2dd7a 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -27,6 +27,7 @@ end mmu; architecture behave of mmu is type state_t is (IDLE, + DO_TLBIE, TLB_WAIT, PROC_TBL_READ, PROC_TBL_WAIT, @@ 
-44,6 +45,7 @@ architecture behave of mmu is store : std_ulogic; priv : std_ulogic; addr : std_ulogic_vector(63 downto 0); + inval_all : std_ulogic; -- config SPRs prtbl : std_ulogic_vector(63 downto 0); pid : std_ulogic_vector(31 downto 0); @@ -178,7 +180,6 @@ begin variable tlb_load : std_ulogic; variable itlb_load : std_ulogic; variable tlbie_req : std_ulogic; - variable inval_all : std_ulogic; variable prtbl_rd : std_ulogic; variable pt_valid : std_ulogic; variable effpid : std_ulogic_vector(31 downto 0); @@ -207,7 +208,7 @@ begin tlb_load := '0'; itlb_load := '0'; tlbie_req := '0'; - inval_all := '0'; + v.inval_all := '0'; prtbl_rd := '0'; -- Radix tree data structures in memory are big-endian, @@ -240,19 +241,17 @@ begin v.store := not (l_in.load or l_in.iside); v.priv := l_in.priv; if l_in.tlbie = '1' then - dcreq := '1'; - tlbie_req := '1'; -- Invalidate all iTLB/dTLB entries for tlbie with -- RB[IS] != 0 or RB[AP] != 0, or for slbia - inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or - l_in.addr(7) or l_in.addr(6) or l_in.addr(5); + v.inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or + l_in.addr(7) or l_in.addr(6) or l_in.addr(5); -- The RIC field of the tlbie instruction comes across on the -- sprn bus as bits 2--3. RIC=2 flushes process table caches. 
if l_in.sprn(3) = '1' then v.pt0_valid := '0'; v.pt3_valid := '0'; end if; - v.state := TLB_WAIT; + v.state := DO_TLBIE; else v.valid := '1'; if pt_valid = '0' then @@ -281,12 +280,15 @@ begin v.pt3_valid := '0'; end if; v.pt0_valid := '0'; - dcreq := '1'; - tlbie_req := '1'; - inval_all := '1'; - v.state := TLB_WAIT; + v.inval_all := '1'; + v.state := DO_TLBIE; end if; + when DO_TLBIE => + dcreq := '1'; + tlbie_req := '1'; + v.state := TLB_WAIT; + when TLB_WAIT => if d_in.done = '1' then done := '1'; @@ -436,8 +438,8 @@ begin -- drive outputs if tlbie_req = '1' then - addr := l_in.addr; - tlb_data := l_in.rs; + addr := r.addr; + tlb_data := (others => '0'); elsif tlb_load = '1' then addr := r.addr(63 downto 12) & x"000"; tlb_data := pte; @@ -458,14 +460,14 @@ begin d_out.valid <= dcreq; d_out.tlbie <= tlbie_req; - d_out.doall <= inval_all; + d_out.doall <= r.inval_all; d_out.tlbld <= tlb_load; d_out.addr <= addr; d_out.pte <= tlb_data; i_out.tlbld <= itlb_load; i_out.tlbie <= tlbie_req; - i_out.doall <= inval_all; + i_out.doall <= r.inval_all; i_out.addr <= addr; i_out.pte <= tlb_data; diff --git a/multiply.vhdl b/multiply.vhdl index 959c114..7a4c81b 100644 --- a/multiply.vhdl +++ b/multiply.vhdl @@ -4,11 +4,10 @@ use ieee.numeric_std.all; library work; use work.common.all; -use work.decode_types.all; entity multiply is generic ( - PIPELINE_DEPTH : natural := 16 + PIPELINE_DEPTH : natural := 4 ); port ( clk : in std_logic; @@ -19,17 +18,16 @@ entity multiply is end entity multiply; architecture behaviour of multiply is - signal m: Execute1ToMultiplyType; + signal m: Execute1ToMultiplyType := Execute1ToMultiplyInit; type multiply_pipeline_stage is record valid : std_ulogic; - insn_type : insn_type_t; - data : signed(129 downto 0); + data : unsigned(127 downto 0); is_32bit : std_ulogic; + neg_res : std_ulogic; end record; constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0', - insn_type => OP_ILLEGAL, - is_32bit => '0', + is_32bit => '0', 
neg_res => '0', data => (others => '0')); type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage; @@ -51,50 +49,35 @@ begin multiply_1: process(all) variable v : reg_type; - variable d : std_ulogic_vector(129 downto 0); + variable d : std_ulogic_vector(127 downto 0); variable d2 : std_ulogic_vector(63 downto 0); variable ov : std_ulogic; begin - v := r; - - m_out <= MultiplyToExecute1Init; - v.multiply_pipeline(0).valid := m.valid; - v.multiply_pipeline(0).insn_type := m.insn_type; - v.multiply_pipeline(0).data := signed(m.data1) * signed(m.data2); + v.multiply_pipeline(0).data := unsigned(m.data1) * unsigned(m.data2); v.multiply_pipeline(0).is_32bit := m.is_32bit; + v.multiply_pipeline(0).neg_res := m.neg_result; loop_0: for i in 1 to PIPELINE_DEPTH-1 loop v.multiply_pipeline(i) := r.multiply_pipeline(i-1); end loop; - d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data); - ov := '0'; + if v.multiply_pipeline(PIPELINE_DEPTH-1).neg_res = '0' then + d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data); + else + d := std_ulogic_vector(- signed(v.multiply_pipeline(PIPELINE_DEPTH-1).data)); + end if; - -- TODO: Handle overflows - case_0: case v.multiply_pipeline(PIPELINE_DEPTH-1).insn_type is - when OP_MUL_L64 => - d2 := d(63 downto 0); - if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then - ov := (or d(63 downto 31)) and not (and d(63 downto 31)); - else - ov := (or d(127 downto 63)) and not (and d(127 downto 63)); - end if; - when OP_MUL_H32 => - d2 := d(63 downto 32) & d(63 downto 32); - when OP_MUL_H64 => - d2 := d(127 downto 64); - when others => - --report "Illegal insn type in multiplier"; - d2 := (others => '0'); - end case; + ov := '0'; + if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then + ov := (or d(63 downto 31)) and not (and d(63 downto 31)); + else + ov := (or d(127 downto 63)) and not (and d(127 downto 63)); + end if; - m_out.write_reg_data <= d2; + m_out.result <= d; 
m_out.overflow <= ov; - - if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then - m_out.valid <= '1'; - end if; + m_out.valid <= v.multiply_pipeline(PIPELINE_DEPTH-1).valid; rin <= v; end process; diff --git a/multiply_tb.vhdl b/multiply_tb.vhdl index ee80de0..87f029d 100644 --- a/multiply_tb.vhdl +++ b/multiply_tb.vhdl @@ -17,8 +17,18 @@ architecture behave of multiply_tb is constant pipeline_depth : integer := 4; - signal m1 : Execute1ToMultiplyType; + signal m1 : Execute1ToMultiplyType := Execute1ToMultiplyInit; signal m2 : MultiplyToExecute1Type; + + function absval(x: std_ulogic_vector) return std_ulogic_vector is + begin + if x(x'left) = '1' then + return std_ulogic_vector(- signed(x)); + else + return x; + end if; + end; + begin multiply_0: entity work.multiply generic map (PIPELINE_DEPTH => pipeline_depth) @@ -39,9 +49,8 @@ begin wait for clk_period; m1.valid <= '1'; - m1.insn_type <= OP_MUL_L64; - m1.data1 <= '0' & x"0000000000001000"; - m1.data2 <= '0' & x"0000000000001111"; + m1.data1 <= x"0000000000001000"; + m1.data2 <= x"0000000000001111"; wait for clk_period; assert m2.valid = '0'; @@ -56,7 +65,7 @@ begin wait for clk_period; assert m2.valid = '1'; - assert m2.write_reg_data = x"0000000001111000"; + assert m2.result = x"00000000000000000000000001111000"; wait for clk_period; assert m2.valid = '0'; @@ -70,7 +79,7 @@ begin wait for clk_period * (pipeline_depth-1); assert m2.valid = '1'; - assert m2.write_reg_data = x"0000000001111000"; + assert m2.result = x"00000000000000000000000001111000"; -- test mulld mulld_loop : for i in 0 to 1000 loop @@ -79,10 +88,10 @@ begin behave_rt := ppc_mulld(ra, rb); - m1.data1 <= '0' & ra; - m1.data2 <= '0' & rb; + m1.data1 <= absval(ra); + m1.data2 <= absval(rb); + m1.neg_result <= ra(63) xor rb(63); m1.valid <= '1'; - m1.insn_type <= OP_MUL_L64; wait for clk_period; @@ -92,8 +101,8 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulld expected " & 
to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 0)) + report "bad mulld expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(63 downto 0)); end loop; -- test mulhdu @@ -103,10 +112,10 @@ begin behave_rt := ppc_mulhdu(ra, rb); - m1.data1 <= '0' & ra; - m1.data2 <= '0' & rb; + m1.data1 <= ra; + m1.data2 <= rb; + m1.neg_result <= '0'; m1.valid <= '1'; - m1.insn_type <= OP_MUL_H64; wait for clk_period; @@ -116,8 +125,8 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulhdu expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(127 downto 64)) + report "bad mulhdu expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(127 downto 64)); end loop; -- test mulhd @@ -127,10 +136,10 @@ begin behave_rt := ppc_mulhd(ra, rb); - m1.data1 <= ra(63) & ra; - m1.data2 <= rb(63) & rb; + m1.data1 <= absval(ra); + m1.data2 <= absval(rb); + m1.neg_result <= ra(63) xor rb(63); m1.valid <= '1'; - m1.insn_type <= OP_MUL_H64; wait for clk_period; @@ -140,8 +149,8 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulhd expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(127 downto 64)) + report "bad mulhd expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(127 downto 64)); end loop; -- test mullw @@ -151,12 +160,12 @@ begin behave_rt := ppc_mullw(ra, rb); - m1.data1 <= (others => ra(31)); - m1.data1(31 downto 0) <= ra(31 downto 0); - m1.data2 <= (others => rb(31)); - m1.data2(31 downto 0) <= rb(31 downto 0); + m1.data1 <= (others => '0'); + m1.data1(31 downto 0) <= absval(ra(31 downto 0)); + m1.data2 <= (others => '0'); + m1.data2(31 downto 0) <= absval(rb(31 downto 0)); + m1.neg_result <= ra(31) 
xor rb(31); m1.valid <= '1'; - m1.insn_type <= OP_MUL_L64; wait for clk_period; @@ -166,8 +175,8 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mullw expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 0)) + report "bad mullw expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(63 downto 0)); end loop; -- test mulhw @@ -177,12 +186,12 @@ begin behave_rt := ppc_mulhw(ra, rb); - m1.data1 <= (others => ra(31)); - m1.data1(31 downto 0) <= ra(31 downto 0); - m1.data2 <= (others => rb(31)); - m1.data2(31 downto 0) <= rb(31 downto 0); + m1.data1 <= (others => '0'); + m1.data1(31 downto 0) <= absval(ra(31 downto 0)); + m1.data2 <= (others => '0'); + m1.data2(31 downto 0) <= absval(rb(31 downto 0)); + m1.neg_result <= ra(31) xor rb(31); m1.valid <= '1'; - m1.insn_type <= OP_MUL_H32; wait for clk_period; @@ -192,8 +201,9 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulhw expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32)) + report "bad mulhw expected " & to_hstring(behave_rt) & " got " & + to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32)); end loop; -- test mulhwu @@ -207,8 +217,8 @@ begin m1.data1(31 downto 0) <= ra(31 downto 0); m1.data2 <= (others => '0'); m1.data2(31 downto 0) <= rb(31 downto 0); + m1.neg_result <= '0'; m1.valid <= '1'; - m1.insn_type <= OP_MUL_H32; wait for clk_period; @@ -218,8 +228,9 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulhwu expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32)) + report "bad mulhwu 
expected " & to_hstring(behave_rt) & " got " & + to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32)); end loop; -- test mulli @@ -229,11 +240,11 @@ begin behave_rt := ppc_mulli(ra, si); - m1.data1 <= ra(63) & ra; - m1.data2 <= (others => si(15)); - m1.data2(15 downto 0) <= si; + m1.data1 <= absval(ra); + m1.data2 <= (others => '0'); + m1.data2(15 downto 0) <= absval(si); + m1.neg_result <= ra(63) xor si(15); m1.valid <= '1'; - m1.insn_type <= OP_MUL_L64; wait for clk_period; @@ -243,8 +254,8 @@ begin assert m2.valid = '1'; - assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data) - report "bad mulli expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data); + assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 0)) + report "bad mulli expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(63 downto 0)); end loop; std.env.finish; diff --git a/ppc_fx_insns.vhdl b/ppc_fx_insns.vhdl index 0bf011d..5fdf1c7 100644 --- a/ppc_fx_insns.vhdl +++ b/ppc_fx_insns.vhdl @@ -93,7 +93,7 @@ package ppc_fx_insns is function ppc_divd (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; function ppc_divwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; - function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer; + function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return std_ulogic; end package ppc_fx_insns; package body ppc_fx_insns is @@ -785,13 +785,12 @@ package body ppc_fx_insns is return std_ulogic_vector(resize(tmp, ra'length)); end; - function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer is + function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) 
return std_ulogic is variable crfield: integer; variable crbit_match: std_ulogic; variable ctr_not_zero: std_ulogic; variable ctr_ok: std_ulogic; variable cond_ok: std_ulogic; - variable ret: integer; begin crfield := to_integer(unsigned(bi)); -- BE bit numbering @@ -800,12 +799,7 @@ package body ppc_fx_insns is ctr_not_zero := '1' when ctr /= x"0000000000000001" else '0'; ctr_ok := bo(4-2) or (ctr_not_zero xor bo(4-3)); cond_ok := bo(4-0) or crbit_match; - if ctr_ok = '1' and cond_ok = '1' then - ret := 1; - else - ret := 0; - end if; - return ret; + return ctr_ok and cond_ok; end; end package body ppc_fx_insns; diff --git a/register_file.vhdl b/register_file.vhdl index 2cffeea..260255e 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -24,7 +24,9 @@ entity register_file is -- debug sim_dump : in std_ulogic; - sim_dump_done : out std_ulogic + sim_dump_done : out std_ulogic; + + log_out : out std_ulogic_vector(70 downto 0) ); end entity register_file; @@ -34,18 +36,19 @@ architecture behaviour of register_file is signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); signal dbg_ack : std_ulogic; + signal log_data : std_ulogic_vector(70 downto 0); begin -- synchronous writes register_write_0: process(clk) begin if rising_edge(clk) then if w_in.write_enable = '1' then - assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; if w_in.write_reg(5) = '0' then report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); else report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); end if; + assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data; end if; end if; @@ -131,4 +134,13 @@ begin sim_dump_done <= '0'; end generate; + reg_log: process(clk) + begin + if rising_edge(clk) then + log_data <= w_in.write_data & + w_in.write_enable & + 
w_in.write_reg; + end if; + end process; + log_out <= log_data; end architecture behaviour; diff --git a/scripts/fmt_log/Makefile b/scripts/fmt_log/Makefile new file mode 100644 index 0000000..04d1e9a --- /dev/null +++ b/scripts/fmt_log/Makefile @@ -0,0 +1,12 @@ +CFLAGS = -O2 -g -Wall -std=c99 + +all: fmt_log + +fmt_log: fmt_log.c + $(CC) -o $@ $^ $(CFLAGS) + +clean: + rm -f fmt_log +distclean: + rm -f *~ + diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c new file mode 100644 index 0000000..c8fb501 --- /dev/null +++ b/scripts/fmt_log/fmt_log.c @@ -0,0 +1,235 @@ +#include +#include +#include + +typedef unsigned long long u64; + +struct log_entry { + u64 nia_lo: 42; + u64 nia_hi: 1; + u64 ic_ra_valid: 1; + u64 ic_access_ok: 1; + u64 ic_is_miss: 1; + u64 ic_is_hit: 1; + u64 ic_way: 3; + u64 ic_state: 1; + u64 ic_part_nia: 4; + u64 ic_fetch_failed: 1; + u64 ic_stall_out: 1; + u64 ic_wb_stall: 1; + u64 ic_wb_cyc: 1; + u64 ic_wb_stb: 1; + u64 ic_wb_adr: 3; + u64 ic_wb_ack: 1; + + u64 ic_insn: 32; + u64 ic_valid: 1; + u64 d1_valid: 1; + u64 d1_unit: 2; + u64 d1_part_nia: 4; + u64 d1_insn_type: 6; + u64 d2_bypass_a: 1; + u64 d2_bypass_b: 1; + u64 d2_bypass_c: 1; + u64 d2_stall_out: 1; + u64 d2_stopped_out: 1; + u64 d2_valid: 1; + u64 d2_part_nia: 4; + u64 e1_flush_out: 1; + u64 e1_stall_out: 1; + u64 e1_redirect: 1; + u64 e1_valid: 1; + u64 e1_write_enable: 1; + u64 e1_unused: 3; + + u64 e1_irq_state: 1; + u64 e1_irq: 1; + u64 e1_exception: 1; + u64 e1_msr_dr: 1; + u64 e1_msr_ir: 1; + u64 e1_msr_pr: 1; + u64 e1_msr_ee: 1; + u64 pad1: 5; + u64 ls_state: 3; + u64 ls_dw_done: 1; + u64 ls_min_done: 1; + u64 ls_do_valid: 1; + u64 ls_mo_valid: 1; + u64 ls_lo_valid: 1; + u64 ls_eo_except: 1; + u64 ls_stall_out: 1; + u64 pad2: 2; + u64 dc_state: 3; + u64 dc_ra_valid: 1; + u64 dc_tlb_way: 3; + u64 dc_stall_out: 1; + u64 dc_op: 3; + u64 dc_do_valid: 1; + u64 dc_do_error: 1; + u64 dc_wb_cyc: 1; + u64 dc_wb_stb: 1; + u64 dc_wb_ack: 1; + u64 dc_wb_stall: 1; + u64 
dc_wb_adr: 3; + u64 cr_wr_mask: 8; + u64 cr_wr_data: 4; + u64 cr_wr_enable: 1; + u64 reg_wr_reg: 6; + u64 reg_wr_enable: 1; + + u64 reg_wr_data; +}; + +#define FLAG(i, y) (log.i? y: ' ') +#define FLGA(i, y, z) (log.i? y: z) +#define PNIA(f) (full_nia[log.f] & 0xff) + +const char *units[4] = { "--", "al", "ls", "?3" }; +const char *ops[64] = +{ + "illegal", "nop ", "add ", "and ", "attn ", "b ", "bc ", "bcreg ", + "bperm ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", "darn ", + "dcbf ", "dcbst ", "dcbt ", "dcbtst ", "dcbz ", "div ", "dive ", "exts ", + "extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "maddhd ", + "maddhdu", "maddld ", "mcrxr ", "mcrxrx ", "mfcr ", "mfmsr ", "mfspr ", "mod ", + "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ", + "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", + "shr ", "sync ", "tlbie ", "trap ", "xor ", "ffail ", "?62 ", "?63 " +}; + +const char *spr_names[13] = +{ + "lr ", "ctr", "sr0", "sr1", "hr0", "hr1", "sg0", "sg1", + "sg2", "sg3", "hg0", "hg1", "xer" +}; +/* Read fixed-size log_entry records from the named file (or stdin) and print one formatted row per record, then a completion/CPI summary. */ +int main(int ac, char **av) +{ + struct log_entry log; + u64 full_nia[16]; + long int lineno = 1; + FILE *f; + const char *filename; + int i; + long int ncompl = 0; + + if (ac != 1 && ac != 2) { + fprintf(stderr, "Usage: %s [filename]\n", av[0]); + exit(1); + } + f = stdin; + if (ac == 2) { + filename = av[1]; + f = fopen(filename, "rb"); + if (f == NULL) { + perror(filename); + exit(1); + } + } + + for (i = 0; i < 16; ++i) /* seed every slot: the 4-bit *_part_nia fields index 0..15 via PNIA() */ + full_nia[i] = i << 2; + + while (fread(&log, sizeof(log), 1, f) == 1) { + full_nia[log.nia_lo & 0xf] = (log.nia_hi? 
0xc000000000000000: 0) | + (log.nia_lo << 2); + if (lineno % 20 == 1) { + printf(" fetch1 NIA icache decode1 decode2 execute1 loadstore dcache CR GSPR\n"); + printf(" ---------------- TAHW S -WB-- pN --insn-- pN un op pN byp FR IIE MSR WC SD MM CE SRTO DE -WB-- c ms reg val\n"); + printf(" LdMy t csnSa IA IA it IA abc le srx EPID em tw rd mx tAwp vr csnSa 0 k\n"); + } + printf("%4ld %c0000%.11llx %c ", lineno, + (log.nia_hi? 'c': '0'), + (unsigned long long)log.nia_lo << 2, + FLAG(ic_stall_out, '|')); + printf("%c%c%c%d %c %c%c%d%c%c %.2llx ", + FLGA(ic_ra_valid, ' ', 'T'), + FLGA(ic_access_ok, ' ', 'X'), + FLGA(ic_is_hit, 'H', FLGA(ic_is_miss, 'M', ' ')), + log.ic_way, + FLAG(ic_state, 'W'), + FLAG(ic_wb_cyc, 'c'), + FLAG(ic_wb_stb, 's'), + log.ic_wb_adr, + FLAG(ic_wb_stall, 'S'), + FLAG(ic_wb_ack, 'a'), + PNIA(ic_part_nia)); + if (log.ic_valid) + printf("%.8x", log.ic_insn); + else if (log.ic_fetch_failed) + printf("!!!!!!!!"); + else + printf("--------"); + printf(" %c%c %.2llx ", + FLAG(ic_valid, '>'), + FLAG(d2_stall_out, '|'), + PNIA(d1_part_nia)); + if (log.d1_valid) + printf("%s %s", + units[log.d1_unit], + ops[log.d1_insn_type]); + else + printf("-- -------"); + printf(" %c%c ", + FLAG(d1_valid, '>'), + FLAG(d2_stall_out, '|')); + printf("%.2llx %c%c%c %c%c ", + PNIA(d2_part_nia), + FLAG(d2_bypass_a, 'a'), + FLAG(d2_bypass_b, 'b'), + FLAG(d2_bypass_c, 'c'), + FLAG(d2_valid, '>'), + FLAG(e1_stall_out, '|')); + printf("%c%c %c%c%c %c%c%c%c %c%c ", + FLAG(e1_flush_out, 'F'), + FLAG(e1_redirect, 'R'), + FLAG(e1_irq_state, 'w'), + FLAG(e1_irq, 'I'), + FLAG(e1_exception, 'X'), + FLAG(e1_msr_ee, 'E'), + FLGA(e1_msr_pr, 'u', 's'), + FLAG(e1_msr_ir, 'I'), + FLAG(e1_msr_dr, 'D'), + FLAG(e1_write_enable, 'W'), + FLAG(e1_valid, 'C')); + printf("%c %d%d %c%c %c%c %c ", + FLAG(ls_stall_out, '|'), + log.ls_state, + log.ls_dw_done, + FLAG(ls_mo_valid, 'M'), + FLAG(ls_min_done, 'm'), + FLAG(ls_lo_valid, 'C'), + FLAG(ls_eo_except, 'X'), + FLAG(ls_do_valid, '>')); + 
printf("%d%c%d%d %c%c %c%c%d%c%c ", + log.dc_state, + FLAG(dc_ra_valid, 'R'), + log.dc_tlb_way, + log.dc_op, + FLAG(dc_do_valid, 'V'), + FLAG(dc_do_error, 'E'), + FLAG(dc_wb_cyc, 'c'), + FLAG(dc_wb_stb, 's'), + log.dc_wb_adr, + FLAG(dc_wb_stall, 'S'), + FLAG(dc_wb_ack, 'a')); + if (log.cr_wr_enable) + printf("%x>%.2x ", log.cr_wr_data, log.cr_wr_mask); + else + printf(" "); + if (log.reg_wr_enable) { + if (log.reg_wr_reg < 32 || log.reg_wr_reg > 44) + printf("r%02d", log.reg_wr_reg); + else + printf("%s", spr_names[log.reg_wr_reg - 32]); + printf("=%.16llx", log.reg_wr_data); + } + printf("\n"); + ++lineno; + if (log.ls_lo_valid || log.e1_valid) + ++ncompl; + } + printf("%ld instructions completed, %.2f CPI\n", ncompl, + (double)(lineno - 1) / ncompl); + exit(0); +} diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index c58073b..28e43b4 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -42,6 +42,9 @@ #define DBG_CORE_GSPR_INDEX 0x14 #define DBG_CORE_GSPR_DATA 0x15 +#define DBG_LOG_ADDR 0x16 +#define DBG_LOG_DATA 0x17 + static bool debug; struct backend { @@ -507,8 +510,10 @@ static void load(const char *filename, uint64_t addr) // if (rc < 8) XXX fixup endian ? 
check(dmi_write(DBG_WB_DATA, data), "writing WB_DATA"); count += 8; - if (!(count % 1024)) - printf("%x...\n", count); + if (!(count % 1024)) { + printf("%x...\r", count); + fflush(stdout); + } } close(fd); printf("%x done.\n", count); @@ -535,8 +540,10 @@ static void save(const char *filename, uint64_t addr, uint64_t size) break; } count += 8; - if (!(count % 1024)) - printf("%x...\n", count); + if (!(count % 1024)) { + printf("%x...\r", count); + fflush(stdout); + } if (count >= size) break; } @@ -544,6 +551,73 @@ static void save(const char *filename, uint64_t addr, uint64_t size) printf("%x done.\n", count); } +#define LOG_STOP 0x80000000ull + +static void log_start(void) +{ + check(dmi_write(DBG_LOG_ADDR, 0), "writing LOG_ADDR"); +} + +static void log_stop(void) +{ + uint64_t lsize, laddr, waddr; + + check(dmi_write(DBG_LOG_ADDR, LOG_STOP), "writing LOG_ADDR"); + check(dmi_read(DBG_LOG_ADDR, &laddr), "reading LOG_ADDR"); + waddr = laddr >> 32; + for (lsize = 1; lsize; lsize <<= 1) + if ((waddr >> 1) < lsize) + break; + waddr &= ~lsize; + printf("Log size = %" PRIu64 " entries, ", lsize); + printf("write ptr = %" PRIx64 "\n", waddr); +} + +static void log_dump(const char *filename) +{ + FILE *f; + uint64_t lsize, laddr, waddr; + uint64_t orig_laddr; + uint64_t i, ldata; + + f = fopen(filename, "wb"); /* binary mode: the dump is raw 64-bit words written with fwrite */ + if (f == NULL) { + fprintf(stderr, "Failed to create '%s': %s\n", filename, + strerror(errno)); + exit(1); + } + + check(dmi_read(DBG_LOG_ADDR, &orig_laddr), "reading LOG_ADDR"); + if (!(orig_laddr & LOG_STOP)) + check(dmi_write(DBG_LOG_ADDR, LOG_STOP), "writing LOG_ADDR"); + + waddr = orig_laddr >> 32; + for (lsize = 1; lsize; lsize <<= 1) + if ((waddr >> 1) < lsize) + break; + waddr &= ~lsize; + printf("Log size = %" PRIu64 " entries\n", lsize); + + laddr = LOG_STOP | (waddr << 2); + check(dmi_write(DBG_LOG_ADDR, laddr), "writing LOG_ADDR"); + + for (i = 0; i < lsize * 4; ++i) { + check(dmi_read(DBG_LOG_DATA, &ldata), "reading LOG_DATA"); + if (fwrite(&ldata, 
sizeof(ldata), 1, f) != 1) { + fprintf(stderr, "Write error on %s\n", filename); + exit(1); + } + if (!(i % 128)) { + printf("%" PRIu64 "...\r", i * 8); + fflush(stdout); + } + } + fclose(f); + printf("%" PRIu64 " done\n", lsize * 32); + + check(dmi_write(DBG_LOG_ADDR, orig_laddr), "writing LOG_ADDR"); +} + static void usage(const char *cmd) { fprintf(stderr, "Usage: %s -b \n", cmd); @@ -568,6 +642,12 @@ static void usage(const char *cmd) fprintf(stderr, " gpr [count]\n"); fprintf(stderr, " status\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " Core logging:\n"); + fprintf(stderr, " lstart start logging\n"); + fprintf(stderr, " lstop stop logging\n"); + fprintf(stderr, " ldump dump log to file\n"); + fprintf(stderr, "\n"); fprintf(stderr, " JTAG:\n"); fprintf(stderr, " dmiread \n"); @@ -706,6 +786,17 @@ int main(int argc, char *argv[]) if (((i+1) < argc) && isdigit(argv[i+1][0])) count = strtoul(argv[++i], NULL, 10); gpr_read(reg, count); + } else if (strcmp(argv[i], "lstart") == 0) { + log_start(); + } else if (strcmp(argv[i], "lstop") == 0) { + log_stop(); + } else if (strcmp(argv[i], "ldump") == 0) { + const char *filename; + + if ((i+1) >= argc) + usage(argv[0]); + filename = argv[++i]; + log_dump(filename); } else { fprintf(stderr, "Unknown command %s\n", argv[i]); exit(1); diff --git a/soc.vhdl b/soc.vhdl index ceaf9a9..6cf9a7f 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -51,7 +51,8 @@ entity soc is SPI_FLASH_DLINES : positive := 1; SPI_FLASH_OFFSET : integer := 0; SPI_FLASH_DEF_CKDV : natural := 2; - SPI_FLASH_DEF_QUAD : boolean := false + SPI_FLASH_DEF_QUAD : boolean := false; + LOG_LENGTH : natural := 512 ); port( rst : in std_ulogic; @@ -198,7 +199,8 @@ begin generic map( SIM => SIM, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE, - ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1') + ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'), + LOG_LENGTH => LOG_LENGTH ) port map( clk => system_clk, diff --git a/tests/test_xics.bin b/tests/test_xics.bin index 
6dd993c..327f98f 100755 Binary files a/tests/test_xics.bin and b/tests/test_xics.bin differ diff --git a/tests/xics/xics.c b/tests/xics/xics.c index 2ff4c54..a2db3a5 100644 --- a/tests/xics/xics.c +++ b/tests/xics/xics.c @@ -9,6 +9,14 @@ #undef DEBUG //#define DEBUG 1 +void delay(void) +{ + static volatile int i; + + for (i = 0; i < 10; ++i) + ; +} + void print_number(unsigned int i) // only for i = 0-999 { unsigned int j, k, m; @@ -148,14 +156,17 @@ int xics_test_0(void) xics_write8(XICS_MFRR, 0x05); // cause 0x500 interrupt // still masked, so shouldn't happen yet + delay(); assert(isrs_run == 0); // unmask IPI only xics_write8(XICS_XIRR, 0x40); + delay(); assert(isrs_run == ISR_IPI); // unmask UART xics_write8(XICS_XIRR, 0xc0); + delay(); assert(isrs_run == (ISR_IPI | ISR_UART)); // cleanup @@ -174,12 +185,14 @@ int xics_test_1(void) xics_write8(XICS_XIRR, 0xff); // allow all interrupts // should be none pending + delay(); assert(isrs_run == 0); // trigger both potato_uart_irq_en(); // cause 0x500 interrupt xics_write8(XICS_MFRR, 0x05); // cause 0x500 interrupt + delay(); assert(isrs_run == (ISR_IPI | ISR_UART)); // cleanup @@ -208,9 +221,11 @@ int xics_test_2(void) // trigger an IPI xics_write8(XICS_MFRR, 0x05); // cause 0x500 interrupt + delay(); assert(isrs_run == 0); mtmsrd(0x9000000000008003); // EE on + delay(); assert(isrs_run == ISR_IPI); // cleanup diff --git a/writeback.vhdl b/writeback.vhdl index 60afebb..d02a0b1 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -22,27 +22,33 @@ end entity writeback; architecture behaviour of writeback is begin - writeback_1: process(all) + writeback_0: process(clk) variable x : std_ulogic_vector(0 downto 0); variable y : std_ulogic_vector(0 downto 0); variable w : std_ulogic_vector(0 downto 0); + begin + if rising_edge(clk) then + -- Do consistency checks only on the clock edge + x(0) := e_in.valid; + y(0) := l_in.valid; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; + + x(0) := 
e_in.write_enable or e_in.exc_write_enable; + y(0) := l_in.write_enable; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; + + w(0) := e_in.write_cr_enable; + x(0) := (e_in.write_enable and e_in.rc); + assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure; + end if; + end process; + + writeback_1: process(all) variable cf: std_ulogic_vector(3 downto 0); variable zero : std_ulogic; variable sign : std_ulogic; variable scf : std_ulogic_vector(3 downto 0); begin - x(0) := e_in.valid; - y(0) := l_in.valid; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; - - x(0) := e_in.write_enable or e_in.exc_write_enable; - y(0) := l_in.write_enable; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; - - w(0) := e_in.write_cr_enable; - x(0) := (e_in.write_enable and e_in.rc); - assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure; - w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; diff --git a/xilinx-mult.vhdl b/xilinx-mult.vhdl new file mode 100644 index 0000000..46366d6 --- /dev/null +++ b/xilinx-mult.vhdl @@ -0,0 +1,985 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.common.all; + +library unisim; +use unisim.vcomponents.all; + +entity multiply is + port ( + clk : in std_logic; + + m_in : in Execute1ToMultiplyType; + m_out : out MultiplyToExecute1Type + ); +end entity multiply; + +architecture behaviour of multiply is + signal m00_p, m01_p, m02_p, m03_p : std_ulogic_vector(47 downto 0); + signal m00_pc : std_ulogic_vector(47 downto 0); + signal m10_p, m11_p, m12_p, m13_p : std_ulogic_vector(47 downto 0); + signal m11_pc, m12_pc, m13_pc : std_ulogic_vector(47 downto 0); + signal m20_p, m21_p, m22_p, m23_p : std_ulogic_vector(47 downto 0); + signal s0_pc, s1_pc : std_ulogic_vector(47 downto 0); + signal product_lo : std_ulogic_vector(31 downto 0); + 
signal product : std_ulogic_vector(127 downto 0); + signal addend : std_ulogic_vector(127 downto 0); + signal s0_carry, p0_carry : std_ulogic_vector(3 downto 0); + signal p0_mask : std_ulogic_vector(47 downto 0); + signal p0_pat, p0_patb : std_ulogic; + signal p1_pat, p1_patb : std_ulogic; + + signal req_32bit, r32_1 : std_ulogic; + signal req_neg, rneg_1 : std_ulogic; + signal valid_1 : std_ulogic; + +begin + addend <= (others => m_in.neg_result); + + m00: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000" & m_in.data1(22 downto 0), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(16 downto 0), + BCIN => (others => '0'), + C => "00000000000000" & addend(33 downto 0), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m00_p, + PCIN => (others => '0'), + PCOUT => m00_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m01: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000" & m_in.data1(22 downto 0), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(33 downto 17), + BCIN => (others => '0'), + C => (others => '0'), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + 
CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "1010101", + P => m01_p, + PCIN => m00_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m02: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000" & m_in.data1(22 downto 0), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(50 downto 34), + BCIN => (others => '0'), + C => x"0000000" & "000" & addend(50 downto 34), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m02_p, + PCIN => (others => '0'), + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m03: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000" & m_in.data1(22 downto 0), + ACIN => (others => '0'), + ALUMODE => "0000", + B => "00000" & m_in.data2(63 downto 51), + BCIN => (others => '0'), + C => x"000000" & '0' & addend(73 downto 51), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + 
CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '1', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m03_p, + PCIN => (others => '0'), + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m10: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000000000" & m_in.data1(39 downto 23), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(16 downto 0), + BCIN => (others => '0'), + C => x"000" & "00" & m01_p(39 downto 6), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '0', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m10_p, + PCIN => (others => '0'), + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m11: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000000000" & m_in.data1(39 downto 23), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(33 downto 17), + BCIN => (others => '0'), + C => x"000" & "00" & m02_p(39 downto 6), + CARRYCASCIN => 
'0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '0', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m11_p, + PCIN => (others => '0'), + PCOUT => m11_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m12: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000000000" & m_in.data1(39 downto 23), + ACIN => (others => '0'), + ALUMODE => "0000", + B => '0' & m_in.data2(50 downto 34), + BCIN => (others => '0'), + C => x"0000" & '0' & m03_p(36 downto 6), + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '0', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '1', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => m12_p, + PCIN => (others => '0'), + PCOUT => m12_pc, + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m13: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + INMODEREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => "0000000000000" & m_in.data1(39 downto 23), + ACIN => (others => '0'), + ALUMODE => "0000", + B => "00000" & m_in.data2(63 downto 
51),
+            BCIN => (others => '0'),
+            C => x"0000000" & "000" & addend(90 downto 74),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '1',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '1',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0110101",
+            P => m13_p,
+            PCIN => (others => '0'),
+            PCOUT => m13_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m20: DSP48E1  -- data1(63:40) x data2(16:0), accumulated onto m11's cascade output
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            INMODEREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "000000" & m_in.data1(63 downto 40),  -- 24-bit slice zero-extended to the 30-bit A port
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & m_in.data2(16 downto 0),  -- 17-bit slice zero-extended to the 18-bit B port
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '1',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '1',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0010101",  -- per UG479: Z = PCIN, X/Y = multiplier, i.e. P = PCIN + A*B
+            P => m20_p,
+            PCIN => m11_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m21: DSP48E1  -- data1(63:40) x data2(33:17), accumulated onto m12's cascade output
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            INMODEREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "000000" & m_in.data1(63 downto 40),
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & m_in.data2(33 downto 17),
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '1',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '1',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0010101",  -- same mode as m20: P = PCIN + A*B
+            P => m21_p,
+            PCIN => m12_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m22: DSP48E1  -- data1(63:40) x data2(50:34), accumulated onto m13's cascade output
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            INMODEREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "000000" & m_in.data1(63 downto 40),
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & m_in.data2(50 downto 34),
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '1',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '1',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0010101",  -- same mode as m20: P = PCIN + A*B
+            P => m22_p,
+            PCIN => m13_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m23: DSP48E1  -- data1(63:40) x data2(63:51) plus the top slice of the addend
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            INMODEREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "000000" & m_in.data1(63 downto 40),
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => "00000" & m_in.data2(63 downto 51),  -- only 13 bits remain in the top slice of data2
+            BCIN => (others => '0'),
+            C => x"00" & "000" & addend(127 downto 91),  -- 37 addend bits zero-extended to the 48-bit C port
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '1',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '1',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0110101",  -- per UG479: Z = C, X/Y = multiplier, i.e. P = C + A*B
+            P => m23_p,
+            PCIN => (others => '0'),
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    s0: DSP48E1  -- adder only (USE_MULT "none"); first stage of the sum that p0 turns into product(79:32)
+        generic map (
+            ACASCREG => 1,
+            ALUMODEREG => 0,
+            AREG => 1,
+            BCASCREG => 1,
+            BREG => 1,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 1,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_MULT => "none"
+            )
+        port map (
+            A => m22_p(5 downto 0) & x"0000" & m10_p(34 downto 27),  -- A:B (48 bits) = m22_p(5:0), 16 zeros, m10_p(34:9)
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => m10_p(26 downto 9),
+            BCIN => (others => '0'),
+            C => m20_p(39 downto 0) & m02_p(5 downto 0) & "00",
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CARRYOUT => s0_carry,  -- bit 3 is fed to s1's CARRYIN
+            CEA1 => '0',
+            CEA2 => '1',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '1',
+            CEC => '1',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0001111",  -- per UG479: Z = 0, Y = C, X = A:B, i.e. P = A:B + C
+            PCIN => (others => '0'),
+            PCOUT => s0_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    s1: DSP48E1  -- adder only; first stage of the sum that p1 turns into product(127:80)
+        generic map (
+            ACASCREG => 1,
+            ALUMODEREG => 0,
+            AREG => 1,
+            BCASCREG => 1,
+            BREG => 1,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 1,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_MULT => "none"
+            )
+        port map (
+            A => x"000" & m22_p(41 downto 24),  -- A:B (48 bits) = 12 zeros, m22_p(41:6)
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => m22_p(23 downto 6),
+            BCIN => (others => '0'),
+            C => m23_p(36 downto 0) & x"00" & "0" & m20_p(41 downto 40),
+            CARRYCASCIN => '0',
+            CARRYIN => s0_carry(3),  -- carry chained in from s0
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '1',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '1',
+            CEC => '1',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0001111",  -- same mode as s0: P = A:B + C (+ CARRYIN)
+            PCIN => (others => '0'),
+            PCOUT => s1_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    -- mask is 0 for 32-bit ops, 0x00007fffffff (bits 30..0 set) for 64-bit
+    p0_mask(47 downto 31) <= (others => '0');
+    p0_mask(30 downto 0) <= (others => not r32_1);
+
+    p0: DSP48E1  -- final adder for product(79:32); its pattern detector feeds the overflow check
+        generic map (
+            ACASCREG => 1,
+            ALUMODEREG => 1,
+            AREG => 1,
+            BCASCREG => 1,
+            BREG => 1,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 1,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0,
+            SEL_MASK => "C",  -- pattern-detect mask comes from the C port (p0_mask)
+            USE_MULT => "none",
+            USE_PATTERN_DETECT => "PATDET"
+            )
+        port map (
+            A => m21_p(22 downto 0) & m03_p(5 downto 0) & '0',
+            ACIN => (others => '0'),
+            ALUMODE => "00" & rneg_1 & '0',  -- "0010" when rneg_1 = '1'; per UG479 that outputs the inverted sum
+            B => (others => '0'),
+            BCIN => (others => '0'),
+            C => p0_mask,  -- used as the pattern-detect mask only; OPMODE below does not add C
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CARRYOUT => p0_carry,  -- bit 3 is fed to p1's CARRYIN
+            CEA1 => '0',
+            CEA2 => '1',
+            CEAD => '0',
+            CEALUMODE => '1',
+            CEB1 => '0',
+            CEB2 => '1',
+            CEC => '1',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0010011",  -- per UG479: Z = PCIN, Y = 0, X = A:B, i.e. PCIN + A:B
+            P => product(79 downto 32),
+            PATTERNDETECT => p0_pat,
+            PATTERNBDETECT => p0_patb,
+            PCIN => s0_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    p1: DSP48E1  -- final adder for product(127:80); pattern detector compares all 48 bits
+        generic map (
+            ACASCREG => 1,
+            ALUMODEREG => 1,
+            AREG => 1,
+            BCASCREG => 1,
+            BREG => 1,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MASK => x"000000000000",  -- no bits masked out of the pattern compare
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_MULT => "none",
+            USE_PATTERN_DETECT => "PATDET"
+            )
+        port map (
+            A => x"0000000" & '0' & m21_p(41),  -- A:B low 19 bits = m21_p(41:23), upper bits zero
+            ACIN => (others => '0'),
+            ALUMODE => "00" & rneg_1 & '0',  -- same negate control as p0
+            B => m21_p(40 downto 23),
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => p0_carry(3),  -- carry chained in from p0
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '1',
+            CEAD => '0',
+            CEALUMODE => '1',
+            CEB1 => '0',
+            CEB2 => '1',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0010011",  -- same mode as p0: PCIN + A:B
+            P => product(127 downto 80),
+            PATTERNDETECT => p1_pat,
+            PATTERNBDETECT => p1_patb,
+            PCIN => s1_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    product(31 downto 0) <= product_lo xor (31 downto 0 => req_neg);  -- low word is bit-inverted when req_neg = '1'
+
+    mult_out: process(all)  -- combinational: drives m_out.result and m_out.overflow
+        variable ov : std_ulogic;
+    begin
+        -- set overflow if the high bits are neither all zeroes nor all ones
+        if req_32bit = '0' then
+            ov := not ((p1_pat and p0_pat) or (p1_patb and p0_patb));  -- both detectors must agree
+        else
+            ov := not ((p1_pat and p0_pat and not product(31)) or  -- 32-bit: upper bits must also match product(31)
+                       (p1_patb and p0_patb and product(31)));
+        end if;
+
+        m_out.result <= product;
+        m_out.overflow <= ov;
+    end process;
+
+    process(clk)  -- pipeline registers for the low product word and the control flags
+    begin
+        if rising_edge(clk) then
+            product_lo <= m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0);  -- 9 + 6 + 17 = 32 bits
+            m_out.valid <= valid_1;  -- m_in.valid reaches m_out.valid two clock edges later
+            valid_1 <= m_in.valid;
+            req_32bit <= r32_1;  -- is_32bit delayed two stages; r32_1 (one stage) drives p0_mask
+            r32_1 <= m_in.is_32bit;
+            req_neg <= rneg_1;  -- neg_result delayed two stages; rneg_1 (one stage) drives ALUMODE of p0/p1
+            rneg_1 <= m_in.neg_result;
+        end if;
+    end process;
+
+end architecture behaviour;