core: Add a short multiplier

This adds an optional 16 bit x 16 bit signed multiplier and uses it for multiply instructions that return the low 64 bits of the product (mull[dw][o] and mulli, but not maddld) when the operands are both in the range -2^15 .. 2^15 - 1. The "short" 16-bit multiplier produces its result combinatorially, so a multiply that uses it executes in one cycle. This improves the coremark result by about 4%, since coremark does quite a lot of multiplies and they almost all have operands that fit into 16 bits.

The presence of the short multiplier is controlled by a generic at the execute1, SOC, core and top levels. For now, it defaults to off for all platforms, and can be enabled using the --has_short_mult flag to fusesoc.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago · 734e4c4a52
parent 2224b28c2c
commit 734e4c4a52
9 changed files with 172 additions and 5 deletions
--- a/core.vhdl
+++ b/core.vhdl
@ -13,6 +13,7 @@ entity core is
        EX1_BYPASS : boolean := true;
        HAS_FPU : boolean := true;
        HAS_BTC : boolean := true;
        HAS_SHORT_MULT : boolean := false;
 	ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
        LOG_LENGTH : natural := 512;
        ICACHE_NUM_LINES : natural := 64;
@ -340,6 +341,7 @@ begin
        generic map (
            EX1_BYPASS => EX1_BYPASS,
            HAS_FPU => HAS_FPU,
            HAS_SHORT_MULT => HAS_SHORT_MULT,
            LOG_LENGTH => LOG_LENGTH
            )
        port map (
--- a/execute1.vhdl
+++ b/execute1.vhdl
@ -14,6 +14,7 @@ entity execute1 is
    generic (
        EX1_BYPASS : boolean := true;
        HAS_FPU : boolean := true;
        HAS_SHORT_MULT : boolean := false;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
@ -95,6 +96,7 @@ architecture behaviour of execute1 is
    signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
    signal cr_in : std_ulogic_vector(31 downto 0);
    signal xerc_in : xer_common_t;
    signal mshort_p : std_ulogic_vector(31 downto 0) := (others => '0');
    signal valid_in : std_ulogic;
    signal ctrl: ctrl_t := (others => (others => '0'));
@ -230,6 +232,24 @@ architecture behaviour of execute1 is
 	return msr_out;
    end;
    -- Report whether val, read as a two's-complement signed number, can be
    -- represented in n bits, i.e. whether -2^(n-1) <= val <= 2^(n-1) - 1.
    --
    --   val : the value to test (any descending-range vector)
    --   n   : the candidate width in bits; bit n-1 of val must be in range
    --
    -- Returns true when bit n-1 of val and every bit to its left are
    -- copies of the sign bit.
    function fits_in_n_bits(val: std_ulogic_vector; n: integer) return boolean is
        variable lead, bumped, sign_run: std_ulogic_vector(val'left downto val'right);
    begin
        -- Normalise so that the run of redundant sign bits at the top of
        -- the vector is a run of ones, whatever the sign of val.
        if val(val'left) = '0' then
            lead := not val;
        else
            lead := val;
        end if;

        -- Add 1 at the left-hand end: reverse, increment, reverse back
        -- (assuming bit_reverse, defined elsewhere, mirrors the bit order).
        -- The carry clears the leading run of ones and sets the first
        -- zero after it; bits further right are unchanged.
        bumped := bit_reverse(std_ulogic_vector(unsigned(bit_reverse(lead)) + 1));

        -- Keep only the bits that the increment cleared.  For a
        -- non-negative val these are the positions to the left of its
        -- leftmost 1 bit; for a negative val, the positions to the left
        -- of its leftmost 0 bit.
        sign_run := lead and not bumped;

        return sign_run(n - 1) = '1';
    end;
    -- Tell vivado to keep the hierarchy for the random module so that the
    -- net names in the xdc file match.
    attribute keep_hierarchy : string;
@ -304,6 +324,17 @@ begin
            p_out => pmu_to_x
            );
    -- Optional single-cycle multiplier, elaborated only when the
    -- HAS_SHORT_MULT generic is true.  It is fed the low 16 bits of the
    -- A and B operands and drives the 32-bit signal mshort_p.
    -- When the generic is false this generate produces no driver for
    -- mshort_p, which is declared with an all-zero default.
    short_mult_0: if HAS_SHORT_MULT generate
    begin
        short_mult: entity work.short_multiply
        port map (
            clk => clk,
            a_in => a_in(15 downto 0),
            b_in => b_in(15 downto 0),
            m_out => mshort_p
            );
    end generate;
    dbg_msr_out <= ctrl.msr;
    log_rd_addr <= r.log_addr_spr;
@ -509,7 +540,11 @@ begin
        case current.sub_select(1 downto 0) is
            when "00" =>
-                muldiv_result <= multiply_to_x.result(63 downto 0);
+                if HAS_SHORT_MULT and r.mul_in_progress = '0' then
                    muldiv_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
                else
                    muldiv_result <= multiply_to_x.result(63 downto 0);
                end if;
            when "01" =>
                muldiv_result <= multiply_to_x.result(127 downto 64);
            when "10" =>
@ -1121,10 +1156,20 @@ begin
 		icache_inval <= '1';
 	    when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 =>
-		v.e.valid := '0';
+                if HAS_SHORT_MULT and e_in.insn_type = OP_MUL_L64 and e_in.insn(26) = '1' and
-		v.mul_in_progress := '1';
+                    fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then
-		v.busy := '1';
+                    -- Operands fit into 16 bits, so use short multiplier
-		x_to_multiply.valid <= '1';
+                    if e_in.oe = '1' then
                        -- Note 16x16 multiply can't overflow, even for mullwo
                        set_ov(v.e, '0', '0');
                    end if;
                else
                    -- Use standard multiplier
                    v.e.valid := '0';
                    v.mul_in_progress := '1';
                    v.busy := '1';
                    x_to_multiply.valid <= '1';
                end if;
 	    when OP_DIV | OP_DIVE | OP_MOD =>
 		v.e.valid := '0';
--- a/fpga/top-arty.vhdl
+++ b/fpga/top-arty.vhdl
@ -16,6 +16,7 @@ entity toplevel is
        CLK_FREQUENCY      : positive := 100000000;
        HAS_FPU            : boolean  := true;
        HAS_BTC            : boolean  := true;
        HAS_SHORT_MULT     : boolean  := false;
        USE_LITEDRAM       : boolean  := false;
        NO_BRAM            : boolean  := false;
        DISABLE_FLATTEN_CORE : boolean := false;
@ -194,6 +195,7 @@ begin
            CLK_FREQ           => CLK_FREQUENCY,
            HAS_FPU            => HAS_FPU,
            HAS_BTC            => HAS_BTC,
            HAS_SHORT_MULT     => HAS_SHORT_MULT,
            HAS_DRAM           => USE_LITEDRAM,
            DRAM_SIZE          => 256 * 1024 * 1024,
            DRAM_INIT_SIZE     => PAYLOAD_SIZE,
--- a/fpga/top-generic.vhdl
+++ b/fpga/top-generic.vhdl
@ -13,6 +13,7 @@ entity toplevel is
 	CLK_FREQUENCY : positive := 100000000;
        HAS_FPU       : boolean  := true;
        HAS_BTC       : boolean  := false;
        HAS_SHORT_MULT: boolean  := false;
        ICACHE_NUM_LINES : natural := 64;
        LOG_LENGTH    : natural := 512;
 	DISABLE_FLATTEN_CORE : boolean := false;
@ -74,6 +75,7 @@ begin
 	    CLK_FREQ      => CLK_FREQUENCY,
            HAS_FPU       => HAS_FPU,
            HAS_BTC       => HAS_BTC,
            HAS_SHORT_MULT => HAS_SHORT_MULT,
 	    ICACHE_NUM_LINES => ICACHE_NUM_LINES,
            LOG_LENGTH    => LOG_LENGTH,
 	    DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
--- a/fpga/top-nexys-video.vhdl
+++ b/fpga/top-nexys-video.vhdl
@ -16,6 +16,7 @@ entity toplevel is
 	CLK_FREQUENCY : positive := 100000000;
        HAS_FPU       : boolean  := true;
        HAS_BTC       : boolean  := true;
        HAS_SHORT_MULT: boolean  := false;
 	USE_LITEDRAM  : boolean  := false;
 	NO_BRAM       : boolean  := false;
 	DISABLE_FLATTEN_CORE : boolean := false;
@ -170,6 +171,7 @@ begin
 	    CLK_FREQ      => CLK_FREQUENCY,
            HAS_FPU       => HAS_FPU,
            HAS_BTC       => HAS_BTC,
            HAS_SHORT_MULT=> HAS_SHORT_MULT,
 	    HAS_DRAM      => USE_LITEDRAM,
 	    DRAM_SIZE     => 512 * 1024 * 1024,
            DRAM_INIT_SIZE => PAYLOAD_SIZE,
--- a/microwatt.core
+++ b/microwatt.core
@ -138,6 +138,7 @@ targets:
      - uart_is_16550
      - has_fpu
      - has_btc
      - has_short_mult
    tools:
      vivado: {part : xc7a100tcsg324-1}
    toplevel : toplevel
@ -243,6 +244,7 @@ targets:
      - uart_is_16550
      - has_fpu
      - has_btc
      - has_short_mult
    generate: [litedram_nexys_video, liteeth_nexys_video, litesdcard_nexys_video]
    tools:
      vivado: {part : xc7a200tsbg484-1}
@ -263,6 +265,7 @@ targets:
      - has_uart1
      - has_fpu=false
      - has_btc=false
      - has_short_mult
      - use_litesdcard
    tools:
      vivado: {part : xc7a35ticsg324-1L}
@ -285,6 +288,7 @@ targets:
      - has_uart1
      - has_fpu=false
      - has_btc=false
      - has_short_mult
    generate: [litedram_arty, liteeth_arty, litesdcard_arty]
    tools:
      vivado: {part : xc7a35ticsg324-1L}
@ -305,6 +309,7 @@ targets:
      - has_uart1
      - has_fpu
      - has_btc
      - has_short_mult
      - use_litesdcard
    tools:
      vivado: {part : xc7a100ticsg324-1L}
@ -327,6 +332,7 @@ targets:
      - has_uart1
      - has_fpu
      - has_btc
      - has_short_mult
    generate: [litedram_arty, liteeth_arty, litesdcard_arty]
    tools:
      vivado: {part : xc7a100ticsg324-1L}
@ -430,6 +436,12 @@ parameters:
    paramtype   : generic
    default     : true
  has_short_mult:
    datatype    : bool
    description : Include a 16 bit x 16 bit single-cycle multiplier in the core
    paramtype   : generic
    default     : false
  disable_flatten_core:
    datatype    : bool
    description : Prevent Vivado from flattening the main core components
--- a/multiply.vhdl
+++ b/multiply.vhdl
@ -86,3 +86,22 @@ begin
        rin <= v;
    end process;
 end architecture behaviour;
 library ieee;
 use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 -- Behavioural 16 x 16 -> 32 bit signed multiplier.
 --
 --   clk   : present in the interface but not referenced by this
 --           architecture
 --   a_in  : first operand, interpreted as a 16-bit signed value
 --   b_in  : second operand, interpreted as a 16-bit signed value
 --   m_out : 32-bit signed product of a_in and b_in
 entity short_multiply is
    port (
        clk   : in std_ulogic;
        a_in  : in std_ulogic_vector(15 downto 0);
        b_in  : in std_ulogic_vector(15 downto 0);
        m_out : out std_ulogic_vector(31 downto 0)
        );
 end entity short_multiply;

 architecture behaviour of short_multiply is
 begin
    -- Purely combinatorial: the product is recomputed whenever either
    -- operand changes; there is no clocked state.
    short_mul: process(a_in, b_in)
        variable product : signed(31 downto 0);
    begin
        product := signed(a_in) * signed(b_in);
        m_out <= std_ulogic_vector(product);
    end process;
 end architecture behaviour;
--- a/soc.vhdl
+++ b/soc.vhdl
@ -59,6 +59,7 @@ entity soc is
 	SIM                : boolean;
        HAS_FPU            : boolean := true;
        HAS_BTC            : boolean := true;
        HAS_SHORT_MULT     : boolean := false;
 	DISABLE_FLATTEN_CORE : boolean := false;
 	HAS_DRAM           : boolean  := false;
 	DRAM_SIZE          : integer := 0;
@ -325,6 +326,7 @@ begin
 	    SIM => SIM,
            HAS_FPU => HAS_FPU,
            HAS_BTC => HAS_BTC,
            HAS_SHORT_MULT => HAS_SHORT_MULT,
 	    DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
 	    ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'),
            LOG_LENGTH => LOG_LENGTH,
--- a/xilinx-mult.vhdl
+++ b/xilinx-mult.vhdl
@ -992,3 +992,84 @@ begin
    end process;
 end architecture behaviour;
 library ieee;
 use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 library unisim;
 use unisim.vcomponents.all;
 -- Xilinx-specific 16 x 16 -> 32 bit signed multiplier built from a single
 -- DSP48E1 primitive.  It has the same port names and widths as the
 -- behavioural short_multiply in multiply.vhdl; note that clk is declared
 -- std_logic here and std_ulogic there.
 --
 --   clk   : wired to the DSP48E1 CLK pin
 --   a_in  : first operand, sign-extended to the 30-bit A input
 --   b_in  : second operand, sign-extended to the 18-bit B input
 --   m_out : low 32 bits of the 48-bit P output
 entity short_multiply is
    port (
        clk   : in std_logic;
        a_in  : in std_ulogic_vector(15 downto 0);
        b_in  : in std_ulogic_vector(15 downto 0);
        m_out : out std_ulogic_vector(31 downto 0)
        );
 end entity short_multiply;
 architecture behaviour of short_multiply is
    -- Full-width P output of the DSP slice; only bits 31..0 are exported.
    signal mshort_p : std_ulogic_vector(47 downto 0);
 begin
    mshort: DSP48E1
        generic map (
            -- Every *REG generic is set to 0.
            -- NOTE(review): per the DSP48E1 documentation this removes all
            -- internal pipeline registers, which is what makes the result
            -- combinatorial - confirm against UG479.
            ACASCREG => 0,
            ALUMODEREG => 0,
            AREG => 0,
            BCASCREG => 0,
            BREG => 0,
            CARRYINREG => 0,
            CARRYINSELREG => 0,
            CREG => 0,
            INMODEREG => 0,
            MREG => 0,
            OPMODEREG => 0,
            PREG => 0
            )
        port map (
            -- Operands are sign-extended from 16 bits to the port widths.
            A => std_ulogic_vector(resize(signed(a_in(15 downto 0)), 30)),
            ACIN => (others => '0'),
            ALUMODE => "0000",
            B => std_ulogic_vector(resize(signed(b_in(15 downto 0)), 18)),
            BCIN => (others => '0'),
            -- C is tied to zero.
            C => 48x"0",
            CARRYCASCIN => '0',
            CARRYIN => '0',
            CARRYINSEL => "000",
            -- All clock enables are tied low.
            CEA1 => '0',
            CEA2 => '0',
            CEAD => '0',
            CEALUMODE => '0',
            CEB1 => '0',
            CEB2 => '0',
            CEC => '0',
            CECARRYIN => '0',
            CECTRL => '0',
            CED => '0',
            CEINMODE => '0',
            CEM => '0',
            CEP => '0',
            CLK => clk,
            D => (others => '0'),
            INMODE => "00000",
            MULTSIGNIN => '0',
            -- NOTE(review): per UG479, OPMODE "0110101" with ALUMODE "0000"
            -- selects P = (A * B) + C; with C = 0 that is just the
            -- product - confirm against the user guide.
            OPMODE => "0110101",
            P => mshort_p,
            PCIN => (others => '0'),
            -- All resets are tied low.
            RSTA => '0',
            RSTALLCARRYIN => '0',
            RSTALUMODE => '0',
            RSTB => '0',
            RSTC => '0',
            RSTCTRL => '0',
            RSTD => '0',
            RSTINMODE => '0',
            RSTM => '0',
            RSTP => '0'
            );
    -- Export only the low 32 bits of the 48-bit P output.
    m_out <= mshort_p(31 downto 0);
 end architecture behaviour;