From 734e4c4a529152bfd6d8a1f2e93c520e34755191 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 11 Sep 2021 17:10:20 +1000 Subject: [PATCH] core: Add a short multiplier This adds an optional 16 bit x 16 bit signed multiplier and uses it for multiply instructions that return the low 64 bits of the product (mull[dw][o] and mulli, but not maddld) when the operands are both in the range -2^15 .. 2^15 - 1. The "short" 16-bit multiplier produces its result combinatorially, so a multiply that uses it executes in one cycle. This improves the coremark result by about 4%, since coremark does quite a lot of multiplies and they almost all have operands that fit into 16 bits. The presence of the short multiplier is controlled by a generic at the execute1, SOC, core and top levels. For now, it defaults to off for all platforms, and can be enabled using the --has_short_mult flag to fusesoc. Signed-off-by: Paul Mackerras --- core.vhdl | 2 + execute1.vhdl | 55 +++++++++++++++++++++++--- fpga/top-arty.vhdl | 2 + fpga/top-generic.vhdl | 2 + fpga/top-nexys-video.vhdl | 2 + microwatt.core | 12 ++++++ multiply.vhdl | 19 +++++++++ soc.vhdl | 2 + xilinx-mult.vhdl | 81 +++++++++++++++++++++++++++++++++++++++ 9 files changed, 172 insertions(+), 5 deletions(-) diff --git a/core.vhdl b/core.vhdl index 32bfe88..cf730c5 100644 --- a/core.vhdl +++ b/core.vhdl @@ -13,6 +13,7 @@ entity core is EX1_BYPASS : boolean := true; HAS_FPU : boolean := true; HAS_BTC : boolean := true; + HAS_SHORT_MULT : boolean := false; ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0'); LOG_LENGTH : natural := 512; ICACHE_NUM_LINES : natural := 64; @@ -340,6 +341,7 @@ begin generic map ( EX1_BYPASS => EX1_BYPASS, HAS_FPU => HAS_FPU, + HAS_SHORT_MULT => HAS_SHORT_MULT, LOG_LENGTH => LOG_LENGTH ) port map ( diff --git a/execute1.vhdl b/execute1.vhdl index ef89bf7..7b90181 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -14,6 +14,7 @@ entity execute1 is generic ( EX1_BYPASS : boolean := true; HAS_FPU : boolean := true; + HAS_SHORT_MULT : boolean := false; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -95,6 +96,7 @@ architecture behaviour of execute1 is signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); signal cr_in : std_ulogic_vector(31 downto 0); signal xerc_in : xer_common_t; + signal mshort_p : std_ulogic_vector(31 downto 0) := (others => '0'); signal valid_in : std_ulogic; signal ctrl: ctrl_t := (others => (others => '0')); @@ -230,6 +232,24 @@ architecture behaviour of execute1 is return msr_out; end; + -- Work out whether a signed value fits into n bits, + -- that is, see if it is in the range -2^(n-1) .. 2^(n-1) - 1 + function fits_in_n_bits(val: std_ulogic_vector; n: integer) return boolean is + variable x, xp1: std_ulogic_vector(val'left downto val'right); + begin + x := val; + if val(val'left) = '0' then + x := not val; + end if; + xp1 := bit_reverse(std_ulogic_vector(unsigned(bit_reverse(x)) + 1)); + x := x and not xp1; + -- For positive inputs, x has ones at the positions + -- to the left of the leftmost 1 bit in val. + -- For negative inputs, x has ones to the left of + -- the leftmost 0 bit in val. + return x(n - 1) = '1'; + end; + -- Tell vivado to keep the hierarchy for the random module so that the -- net names in the xdc file match. attribute keep_hierarchy : string; @@ -304,6 +324,17 @@ begin p_out => pmu_to_x ); + short_mult_0: if HAS_SHORT_MULT generate + begin + short_mult: entity work.short_multiply + port map ( + clk => clk, + a_in => a_in(15 downto 0), + b_in => b_in(15 downto 0), + m_out => mshort_p + ); + end generate; + dbg_msr_out <= ctrl.msr; log_rd_addr <= r.log_addr_spr; @@ -509,7 +540,11 @@ begin case current.sub_select(1 downto 0) is when "00" => - muldiv_result <= multiply_to_x.result(63 downto 0); + if HAS_SHORT_MULT and r.mul_in_progress = '0' then + muldiv_result <= std_ulogic_vector(resize(signed(mshort_p), 64)); + else + muldiv_result <= multiply_to_x.result(63 downto 0); + end if; when "01" => muldiv_result <= multiply_to_x.result(127 downto 64); when "10" => @@ -1121,10 +1156,20 @@ begin icache_inval <= '1'; when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => - v.e.valid := '0'; - v.mul_in_progress := '1'; - v.busy := '1'; - x_to_multiply.valid <= '1'; + if HAS_SHORT_MULT and e_in.insn_type = OP_MUL_L64 and e_in.insn(26) = '1' and + fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then + -- Operands fit into 16 bits, so use short multiplier + if e_in.oe = '1' then + -- Note 16x16 multiply can't overflow, even for mullwo + set_ov(v.e, '0', '0'); + end if; + else + -- Use standard multiplier + v.e.valid := '0'; + v.mul_in_progress := '1'; + v.busy := '1'; + x_to_multiply.valid <= '1'; + end if; when OP_DIV | OP_DIVE | OP_MOD => v.e.valid := '0'; diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index 6112c0a..533cad2 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -16,6 +16,7 @@ entity toplevel is CLK_FREQUENCY : positive := 100000000; HAS_FPU : boolean := true; HAS_BTC : boolean := true; + HAS_SHORT_MULT : boolean := false; USE_LITEDRAM : boolean := false; NO_BRAM : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; @@ -194,6 +195,7 @@ begin CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, HAS_BTC => HAS_BTC, + HAS_SHORT_MULT => HAS_SHORT_MULT, HAS_DRAM => USE_LITEDRAM, DRAM_SIZE => 256 * 1024 * 1024, DRAM_INIT_SIZE => PAYLOAD_SIZE, diff --git a/fpga/top-generic.vhdl b/fpga/top-generic.vhdl index c75e465..da42bb5 100644 --- a/fpga/top-generic.vhdl +++ b/fpga/top-generic.vhdl @@ -13,6 +13,7 @@ entity toplevel is CLK_FREQUENCY : positive := 100000000; HAS_FPU : boolean := true; HAS_BTC : boolean := false; + HAS_SHORT_MULT: boolean := false; ICACHE_NUM_LINES : natural := 64; LOG_LENGTH : natural := 512; DISABLE_FLATTEN_CORE : boolean := false; @@ -74,6 +75,7 @@ begin CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, HAS_BTC => HAS_BTC, + HAS_SHORT_MULT => HAS_SHORT_MULT, ICACHE_NUM_LINES => ICACHE_NUM_LINES, LOG_LENGTH => LOG_LENGTH, DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE, diff --git a/fpga/top-nexys-video.vhdl b/fpga/top-nexys-video.vhdl index 1cf1df2..f5a5c71 100644 --- a/fpga/top-nexys-video.vhdl +++ b/fpga/top-nexys-video.vhdl @@ -16,6 +16,7 @@ entity toplevel is CLK_FREQUENCY : positive := 100000000; HAS_FPU : boolean := true; HAS_BTC : boolean := true; + HAS_SHORT_MULT: boolean := false; USE_LITEDRAM : boolean := false; NO_BRAM : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; @@ -170,6 +171,7 @@ begin CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, HAS_BTC => HAS_BTC, + HAS_SHORT_MULT=> HAS_SHORT_MULT, HAS_DRAM => USE_LITEDRAM, DRAM_SIZE => 512 * 1024 * 1024, DRAM_INIT_SIZE => PAYLOAD_SIZE, diff --git a/microwatt.core b/microwatt.core index 21468de..8443911 100644 --- a/microwatt.core +++ b/microwatt.core @@ -138,6 +138,7 @@ targets: - uart_is_16550 - has_fpu - has_btc + - has_short_mult tools: vivado: {part : xc7a100tcsg324-1} toplevel : toplevel @@ -243,6 +244,7 @@ targets: - uart_is_16550 - has_fpu - has_btc + - has_short_mult generate: [litedram_nexys_video, liteeth_nexys_video, litesdcard_nexys_video] tools: vivado: {part : xc7a200tsbg484-1} @@ -263,6 +265,7 @@ targets: - has_uart1 - has_fpu=false - has_btc=false + - has_short_mult - use_litesdcard tools: vivado: {part : xc7a35ticsg324-1L} @@ -285,6 +288,7 @@ targets: - has_uart1 - has_fpu=false - has_btc=false + - has_short_mult generate: [litedram_arty, liteeth_arty, litesdcard_arty] tools: vivado: {part : xc7a35ticsg324-1L} @@ -305,6 +309,7 @@ targets: - has_uart1 - has_fpu - has_btc + - has_short_mult - use_litesdcard tools: vivado: {part : xc7a100ticsg324-1L} @@ -327,6 +332,7 @@ targets: - has_uart1 - has_fpu - has_btc + - has_short_mult generate: [litedram_arty, liteeth_arty, litesdcard_arty] tools: vivado: {part : xc7a100ticsg324-1L} @@ -430,6 +436,12 @@ parameters: paramtype : generic default : true + has_short_mult: + datatype : bool + description : Include a 16 bit x 16 bit single-cycle multiplier in the core + paramtype : generic + default : false + disable_flatten_core: datatype : bool description : Prevent Vivado from flattening the main core components diff --git a/multiply.vhdl b/multiply.vhdl index a7ca7ac..c09fc22 100644 --- a/multiply.vhdl +++ b/multiply.vhdl @@ -86,3 +86,22 @@ begin rin <= v; end process; end architecture behaviour; + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +entity short_multiply is + port ( + clk : in std_ulogic; + + a_in : in std_ulogic_vector(15 downto 0); + b_in : in std_ulogic_vector(15 downto 0); + m_out : out std_ulogic_vector(31 downto 0) + ); +end entity short_multiply; + +architecture behaviour of short_multiply is +begin + m_out <= std_ulogic_vector(signed(a_in) * signed(b_in)); +end architecture behaviour; diff --git a/soc.vhdl b/soc.vhdl index 3cbba7a..ec94b96 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -59,6 +59,7 @@ entity soc is SIM : boolean; HAS_FPU : boolean := true; HAS_BTC : boolean := true; + HAS_SHORT_MULT : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; HAS_DRAM : boolean := false; DRAM_SIZE : integer := 0; @@ -325,6 +326,7 @@ begin SIM => SIM, HAS_FPU => HAS_FPU, HAS_BTC => HAS_BTC, + HAS_SHORT_MULT => HAS_SHORT_MULT, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE, ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'), LOG_LENGTH => LOG_LENGTH, diff --git a/xilinx-mult.vhdl b/xilinx-mult.vhdl index 6a0d508..97259b4 100644 --- a/xilinx-mult.vhdl +++ b/xilinx-mult.vhdl @@ -992,3 +992,84 @@ begin end process; end architecture behaviour; + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library unisim; +use unisim.vcomponents.all; + +entity short_multiply is + port ( + clk : in std_logic; + + a_in : in std_ulogic_vector(15 downto 0); + b_in : in std_ulogic_vector(15 downto 0); + m_out : out std_ulogic_vector(31 downto 0) + ); +end entity short_multiply; + +architecture behaviour of short_multiply is + signal mshort_p : std_ulogic_vector(47 downto 0); +begin + mshort: DSP48E1 + generic map ( + ACASCREG => 0, + ALUMODEREG => 0, + AREG => 0, + BCASCREG => 0, + BREG => 0, + CARRYINREG => 0, + CARRYINSELREG => 0, + CREG => 0, + INMODEREG => 0, + MREG => 0, + OPMODEREG => 0, + PREG => 0 + ) + port map ( + A => std_ulogic_vector(resize(signed(a_in(15 downto 0)), 30)), + ACIN => (others => '0'), + ALUMODE => "0000", + B => std_ulogic_vector(resize(signed(b_in(15 downto 0)), 18)), + BCIN => (others => '0'), + C => 48x"0", + CARRYCASCIN => '0', + CARRYIN => '0', + CARRYINSEL => "000", + CEA1 => '0', + CEA2 => '0', + CEAD => '0', + CEALUMODE => '0', + CEB1 => '0', + CEB2 => '0', + CEC => '0', + CECARRYIN => '0', + CECTRL => '0', + CED => '0', + CEINMODE => '0', + CEM => '0', + CEP => '0', + CLK => clk, + D => (others => '0'), + INMODE => "00000", + MULTSIGNIN => '0', + OPMODE => "0110101", + P => mshort_p, + PCIN => (others => '0'), + RSTA => '0', + RSTALLCARRYIN => '0', + RSTALUMODE => '0', + RSTB => '0', + RSTC => '0', + RSTCTRL => '0', + RSTD => '0', + RSTINMODE => '0', + RSTM => '0', + RSTP => '0' + ); + + m_out <= mshort_p(31 downto 0); + +end architecture behaviour;