From 9789d258fb1f0edbf0cc7e45eac1b2e7625633cb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 25 Sep 2019 20:27:08 +1000 Subject: [PATCH 1/4] loadstore2: Do data formatting after a register stage This moves the data formatting for read data to after a register, instead of before, in order to improve timing. The data formatting is now effectively combinational logic on the input side of the writeback stage. Signed-off-by: Paul Mackerras --- loadstore2.vhdl | 85 ++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/loadstore2.vhdl b/loadstore2.vhdl index f4b977a..17ef7e1 100644 --- a/loadstore2.vhdl +++ b/loadstore2.vhdl @@ -26,6 +26,10 @@ architecture behave of loadstore2 is signal l_saved : Loadstore1ToLoadstore2Type; signal w_tmp : Loadstore2ToWritebackType; signal m_tmp : wishbone_master_out; + signal read_data : std_ulogic_vector(63 downto 0); + signal read_data_shift : std_ulogic_vector(2 downto 0); + signal sign_extend_byte_reverse: std_ulogic_vector(1 downto 0); + signal dlength : std_ulogic_vector(3 downto 0); type state_t is (IDLE, WAITING_FOR_READ_ACK, WAITING_FOR_WRITE_ACK); signal state : state_t := IDLE; @@ -56,21 +60,53 @@ architecture behave of loadstore2 is return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)), to_integer(unsigned(address(2 downto 0))))); end function wishbone_data_sel; begin + + loadstore2_1: process(all) + variable tmp : std_ulogic_vector(63 downto 0); + variable data : std_ulogic_vector(63 downto 0); + begin + tmp := std_logic_vector(shift_right(unsigned(read_data), to_integer(unsigned(read_data_shift)) * 8)); + data := (others => '0'); + case to_integer(unsigned(dlength)) is + when 0 => + when 1 => + data(7 downto 0) := tmp(7 downto 0); + when 2 => + data(15 downto 0) := tmp(15 downto 0); + when 4 => + data(31 downto 0) := tmp(31 downto 0); + when 8 => + data(63 downto 0) := tmp(63 downto 0); + when others => + assert false report "invalid length" severity failure; + data(63 downto 0) := tmp(63 downto 0); + end case; + + case sign_extend_byte_reverse is + when "10" => + w_tmp.write_data <= sign_extend(data, to_integer(unsigned(l_saved.length))); + when "01" => + w_tmp.write_data <= byte_reverse(data, to_integer(unsigned(l_saved.length))); + when others => + w_tmp.write_data <= data; + end case; + end process; + w_out <= w_tmp; m_out <= m_tmp; loadstore2_0: process(clk) - variable tmp : std_ulogic_vector(63 downto 0); - variable data : std_ulogic_vector(63 downto 0); - variable sign_extend_byte_reverse : std_ulogic_vector(1 downto 0); begin if rising_edge(clk) then - tmp := (others => '0'); - data := (others => '0'); - w_tmp <= Loadstore2ToWritebackInit; + w_tmp.valid <= '0'; + w_tmp.write_enable <= '0'; + w_tmp.write_reg <= (others => '0'); l_saved <= l_saved; + read_data_shift <= "000"; + sign_extend_byte_reverse <= "00"; + dlength <= "1000"; case_0: case state is when IDLE => @@ -95,15 +131,14 @@ begin if l_in.update = '1' then w_tmp.write_enable <= '1'; w_tmp.write_reg <= l_in.update_reg; - w_tmp.write_data <= l_in.addr; + read_data <= l_in.addr; end if; state <= WAITING_FOR_READ_ACK; else m_tmp.we <= '1'; - data := l_in.data; - m_tmp.dat <= std_logic_vector(shift_left(unsigned(data), wishbone_data_shift(l_in.addr))); + m_tmp.dat <= std_logic_vector(shift_left(unsigned(l_in.data), wishbone_data_shift(l_in.addr))); assert l_in.sign_extend = '0' report "sign extension doesn't make sense for stores" severity failure; @@ -113,32 +148,10 @@ begin when WAITING_FOR_READ_ACK => if m_in.ack = '1' then - tmp := std_logic_vector(shift_right(unsigned(m_in.dat), wishbone_data_shift(l_saved.addr))); - case to_integer(unsigned(l_saved.length)) is - when 0 => - when 1 => - data(7 downto 0) := tmp(7 downto 0); - when 2 => - data(15 downto 0) := tmp(15 downto 0); - when 4 => - data(31 downto 0) := tmp(31 downto 0); - when 8 => - data(63 downto 0) := tmp(63 downto 0); - when others => - assert false report "invalid length" severity failure; - end case; - - sign_extend_byte_reverse := l_saved.sign_extend & l_saved.byte_reverse; - - case sign_extend_byte_reverse is - when "10" => - data := sign_extend(data, to_integer(unsigned(l_saved.length))); - when "01" => - data := byte_reverse(data, to_integer(unsigned(l_saved.length))); - when others => - end case; - - w_tmp.write_data <= data; + read_data <= m_in.dat; + read_data_shift <= l_saved.addr(2 downto 0); + dlength <= l_saved.length; + sign_extend_byte_reverse <= l_saved.sign_extend & l_saved.byte_reverse; -- write data to register file w_tmp.valid <= '1'; @@ -155,7 +168,7 @@ begin if l_saved.update = '1' then w_tmp.write_enable <= '1'; w_tmp.write_reg <= l_saved.update_reg; - w_tmp.write_data <= l_saved.addr; + read_data <= l_saved.addr; end if; m_tmp <= wishbone_master_out_init; From 48e6e719d302b0484c608a51788a02fca0c2ce43 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 26 Sep 2019 10:53:55 +1000 Subject: [PATCH 2/4] Multiply needs to be 16 stages to fix all timing issues This seems dependent on the FPGA type/size, so we should probably make it a toplevel generic, but for now this helps on the Arty A7-35 Signed-off-by: Benjamin Herrenschmidt --- multiply.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multiply.vhdl b/multiply.vhdl index 6f80660..71aceca 100644 --- a/multiply.vhdl +++ b/multiply.vhdl @@ -10,7 +10,7 @@ use work.crhelpers.all; entity multiply is generic ( - PIPELINE_DEPTH : natural := 2 + PIPELINE_DEPTH : natural := 16 ); port ( clk : in std_logic; From 5f281099571d5b0cc11804441651d89abe2d1763 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 26 Sep 2019 11:09:46 +1000 Subject: [PATCH 3/4] Don't reset JTAG request register asynchronously There's no point and it causes Vivado to spew a pile of warnings Signed-off-by: Benjamin Herrenschmidt --- dmi_dtm_xilinx.vhdl | 1 - 1 file changed, 1 deletion(-) diff --git a/dmi_dtm_xilinx.vhdl b/dmi_dtm_xilinx.vhdl index bab7ce8..6e47c56 100644 --- a/dmi_dtm_xilinx.vhdl +++ b/dmi_dtm_xilinx.vhdl @@ -222,7 +222,6 @@ begin begin if jtag_reset = '1' then shiftr <= (others => '0'); - request <= (others => '0'); jtag_req <= '0'; elsif rising_edge(jtag_clk) then From 9961d70dfbcae24dc75b1f5a963ac7372b534af4 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 24 Sep 2019 14:57:34 +1000 Subject: [PATCH 4/4] Improve PLL/MMCM clocks configuration We can now pass both the input clock and target clock frequency via generics. Add support for both 50Mhz and 100Mhz target freqs for both cases. Signed-off-by: Benjamin Herrenschmidt --- fpga/clk_gen_mcmm.vhd | 74 ++++++++++++++++++------ fpga/clk_gen_plle2.vhd | 128 ++++++++++++++++++++++++----------------- fpga/toplevel.vhdl | 8 ++- microwatt.core | 42 ++++++++++++-- 4 files changed, 176 insertions(+), 76 deletions(-) diff --git a/fpga/clk_gen_mcmm.vhd b/fpga/clk_gen_mcmm.vhd index 206b02a..08db930 100644 --- a/fpga/clk_gen_mcmm.vhd +++ b/fpga/clk_gen_mcmm.vhd @@ -6,7 +6,9 @@ use UNISIM.vcomponents.all; entity clock_generator is generic ( - clk_period_hz : positive := 12000000); + CLK_INPUT_HZ : positive := 12000000; + CLK_OUTPUT_HZ : positive := 50000000 + ); port ( ext_clk : in std_logic; pll_rst_in : in std_logic; @@ -22,28 +24,66 @@ architecture rtl of clock_generator is clkfbout_mult : real range 2.0 to 64.0; clkout_divide : real range 1.0 to 128.0; divclk_divide : integer range 1 to 106; + force_rst : std_ulogic; end record; function gen_pll_settings ( - constant freq_hz : positive) + constant input_hz : positive; + constant output_hz : positive) return pll_settings_t is + + constant bad_settings : pll_settings_t := + (clkin_period => 0.0, + clkfbout_mult => 2.0, + clkout_divide => 1.0, + divclk_divide => 1, + force_rst => '1'); begin - if freq_hz = 100000000 then - return (clkin_period => 10.0, - clkfbout_mult => 16.0, - clkout_divide => 32.0, - divclk_divide => 1); - elsif freq_hz = 12000000 then - return (clkin_period => 83.33, - clkfbout_mult => 50.0, - clkout_divide => 12.0, - divclk_divide => 1); - else - report "Unsupported input frequency" severity failure; - end if; + case input_hz is + when 100000000 => + case output_hz is + when 100000000 => + return (clkin_period => 10.0, + clkfbout_mult => 16.0, + clkout_divide => 16.0, + divclk_divide => 1, + force_rst => '0'); + when 50000000 => + return (clkin_period => 10.0, + clkfbout_mult => 16.0, + clkout_divide => 32.0, + divclk_divide => 1, + force_rst => '0'); + when others => + report "Unsupported output frequency" severity failure; + return bad_settings; + end case; + when 12000000 => + case output_hz is + when 100000000 => + return (clkin_period => 83.33, + clkfbout_mult => 50.0, + clkout_divide => 6.0, + divclk_divide => 1, + force_rst => '0'); + when 50000000 => + return (clkin_period => 83.33, + clkfbout_mult => 50.0, + clkout_divide => 12.0, + divclk_divide => 1, + force_rst => '0'); + when others => + report "Unsupported output frequency" severity failure; + return bad_settings; + end case; + when others => + report "Unsupported input frequency" severity failure; + return bad_settings; + end case; end function gen_pll_settings; - constant pll_settings : pll_settings_t := gen_pll_settings(clk_period_hz); + constant pll_settings : pll_settings_t := gen_pll_settings(clk_input_hz, + clk_output_hz); begin pll : MMCME2_BASE generic map ( @@ -71,6 +111,6 @@ begin CLKFBIN => clkfb, CLKIN1 => ext_clk, PWRDWN => '0', - RST => pll_rst_in + RST => pll_rst_in or pll_settings.force_rst ); end architecture rtl; diff --git a/fpga/clk_gen_plle2.vhd b/fpga/clk_gen_plle2.vhd index f82cb53..750840b 100644 --- a/fpga/clk_gen_plle2.vhd +++ b/fpga/clk_gen_plle2.vhd @@ -5,67 +5,89 @@ Library UNISIM; use UNISIM.vcomponents.all; entity clock_generator is - generic ( - clk_period_hz : positive := 100000000); - port ( - ext_clk : in std_logic; - pll_rst_in : in std_logic; - pll_clk_out : out std_logic; - pll_locked_out : out std_logic); + generic ( + CLK_INPUT_HZ : positive := 100000000; + CLK_OUTPUT_HZ : positive := 100000000 + ); + port ( + ext_clk : in std_logic; + pll_rst_in : in std_logic; + pll_clk_out : out std_logic; + pll_locked_out : out std_logic); end entity clock_generator; architecture rtl of clock_generator is + signal clkfb : std_ulogic; - signal clkfb : std_ulogic; + type pll_settings_t is record + clkin_period : real range 0.000 to 52.631; + clkfbout_mult : integer range 2 to 64; + clkout_divide : integer range 1 to 128; + divclk_divide : integer range 1 to 56; + force_rst : std_ulogic; + end record; - type pll_settings_t is record - clkin_period : real range 0.000 to 52.631; - clkfbout_mult : integer range 2 to 64; - clkout_divide : integer range 1 to 128; - divclk_divide : integer range 1 to 56; - end record; + function gen_pll_settings ( + constant input_hz : positive; + constant output_hz : positive) + return pll_settings_t is - function gen_pll_settings ( - constant freq_hz : positive) - return pll_settings_t is - begin - if freq_hz = 100000000 then - return (clkin_period => 10.0, - clkfbout_mult => 16, - clkout_divide => 32, - divclk_divide => 1); - else - report "Unsupported input frequency" severity failure; --- return (clkin_period => 0.0, --- clkfbout_mult => 0, --- clkout_divide => 0, --- divclk_divide => 0); - end if; - end function gen_pll_settings; + constant bad_settings : pll_settings_t := + (clkin_period => 0.0, + clkfbout_mult => 2, + clkout_divide => 1, + divclk_divide => 1, + force_rst => '1'); + begin + case input_hz is + when 100000000 => + case output_hz is + when 100000000 => + return (clkin_period => 10.0, + clkfbout_mult => 16, + clkout_divide => 16, + divclk_divide => 1, + force_rst => '0'); + when 50000000 => + return (clkin_period => 10.0, + clkfbout_mult => 16, + clkout_divide => 32, + divclk_divide => 1, + force_rst => '0'); + when others => + report "Unsupported output frequency" severity failure; + return bad_settings; + end case; + when others => + report "Unsupported input frequency" severity failure; + return bad_settings; + end case; + end function gen_pll_settings; - constant pll_settings : pll_settings_t := gen_pll_settings(clk_period_hz); + constant pll_settings : pll_settings_t := gen_pll_settings(clk_input_hz, + clk_output_hz); begin - pll : PLLE2_BASE - generic map ( - BANDWIDTH => "OPTIMIZED", - CLKFBOUT_MULT => pll_settings.clkfbout_mult, - CLKIN1_PERIOD => pll_settings.clkin_period, - CLKOUT0_DIVIDE => pll_settings.clkout_divide, - DIVCLK_DIVIDE => pll_settings.divclk_divide, - STARTUP_WAIT => "FALSE") - port map ( - CLKOUT0 => pll_clk_out, - CLKOUT1 => open, - CLKOUT2 => open, - CLKOUT3 => open, - CLKOUT4 => open, - CLKOUT5 => open, - CLKFBOUT => clkfb, - LOCKED => pll_locked_out, - CLKIN1 => ext_clk, - PWRDWN => '0', - RST => pll_rst_in, - CLKFBIN => clkfb); + pll : PLLE2_BASE + generic map ( + BANDWIDTH => "OPTIMIZED", + CLKFBOUT_MULT => pll_settings.clkfbout_mult, + CLKIN1_PERIOD => pll_settings.clkin_period, + CLKOUT0_DIVIDE => pll_settings.clkout_divide, + DIVCLK_DIVIDE => pll_settings.divclk_divide, + STARTUP_WAIT => "FALSE") + port map ( + CLKOUT0 => pll_clk_out, + CLKOUT1 => open, + CLKOUT2 => open, + CLKOUT3 => open, + CLKOUT4 => open, + CLKOUT5 => open, + CLKFBOUT => clkfb, + LOCKED => pll_locked_out, + CLKIN1 => ext_clk, + PWRDWN => '0', + RST => pll_rst_in or pll_settings.force_rst, + CLKFBIN => clkfb); end architecture rtl; diff --git a/fpga/toplevel.vhdl b/fpga/toplevel.vhdl index c6ed4ae..d73c802 100644 --- a/fpga/toplevel.vhdl +++ b/fpga/toplevel.vhdl @@ -5,7 +5,9 @@ entity toplevel is generic ( MEMORY_SIZE : positive := 524288; RAM_INIT_FILE : string := "firmware.hex"; - RESET_LOW : boolean := true + RESET_LOW : boolean := true; + CLK_INPUT : positive := 100000000; + CLK_FREQUENCY : positive := 100000000 ); port( ext_clk : in std_ulogic; @@ -43,6 +45,10 @@ begin ); clkgen: entity work.clock_generator + generic map( + CLK_INPUT_HZ => CLK_INPUT, + CLK_OUTPUT_HZ => CLK_FREQUENCY + ) port map( ext_clk => ext_clk, pll_rst_in => pll_rst, diff --git a/microwatt.core b/microwatt.core index b32148a..94ff0c1 100644 --- a/microwatt.core +++ b/microwatt.core @@ -81,7 +81,11 @@ targets: nexys_a7: default_tool: vivado filesets: [core, nexys_a7, soc, fpga, debug_xilinx] - parameters : [memory_size, ram_init_file] + parameters : + - memory_size + - ram_init_file + - clk_input + - clk_frequency tools: vivado: {part : xc7a100tcsg324-1} toplevel : toplevel @@ -89,7 +93,11 @@ targets: nexys_video: default_tool: vivado filesets: [core, nexys_video, soc, fpga, debug_xilinx] - parameters : [memory_size, ram_init_file] + parameters : + - memory_size + - ram_init_file + - clk_input + - clk_frequency tools: vivado: {part : xc7a200tsbg484-1} toplevel : toplevel @@ -97,7 +105,11 @@ targets: arty_a7-35: default_tool: vivado filesets: [core, arty_a7, soc, fpga, debug_xilinx] - parameters : [memory_size, ram_init_file] + parameters : + - memory_size + - ram_init_file + - clk_input + - clk_frequency tools: vivado: {part : xc7a35ticsg324-1L} toplevel : toplevel @@ -105,7 +117,11 @@ targets: arty_a7-100: default_tool: vivado filesets: [core, arty_a7, soc, fpga, debug_xilinx] - parameters : [memory_size, ram_init_file] + parameters : + - memory_size + - ram_init_file + - clk_input + - clk_frequency tools: vivado: {part : xc7a100ticsg324-1L} toplevel : toplevel @@ -113,7 +129,11 @@ targets: cmod_a7-35: default_tool: vivado filesets: [core, cmod_a7-35, soc, fpga, debug_xilinx] - parameters : [memory_size, ram_init_file, reset_low=false] + parameters : + - memory_size + - ram_init_file + - reset_low=false + - clk_input=12000000 tools: vivado: {part : xc7a35tcpg236-1} toplevel : toplevel @@ -139,3 +159,15 @@ parameters: datatype : bool description : External reset button polarity paramtype : generic + + clk_input: + datatype : int + description : Clock input frequency in HZ (for top-generic based boards) + paramtype : generic + default : 100000000 + + clk_frequency: + datatype : int + description : Generated system clock frequency in HZ (for top-generic based boards) + paramtype : generic + default : 50000000