diff --git a/Makefile b/Makefile
index ebb1b79..794cbc1 100644
--- a/Makefile
+++ b/Makefile
@@ -60,9 +60,9 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
 	decode1.vhdl helpers.vhdl insn_helpers.vhdl \
 	control.vhdl decode2.vhdl register_file.vhdl \
 	cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
-	logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
-	loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
-	core.vhdl fpu.vhdl pmu.vhdl
+	logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \
+	execute1.vhdl loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl \
+	core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl
 
 soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
 	wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \
diff --git a/execute1.vhdl b/execute1.vhdl
index 92da2ee..948bdd6 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -85,6 +85,7 @@ architecture behaviour of execute1 is
         write_pmuspr : std_ulogic;
         ramspr_write_even : std_ulogic;
         ramspr_write_odd : std_ulogic;
+        mult_32s : std_ulogic;  -- '1' when the op is handled by the 32-bit multiplier (mult_32s_0)
     end record;
     constant side_effect_init : side_effect_type := (others => '0');
 
@@ -203,6 +204,8 @@ architecture behaviour of execute1 is
     -- multiply signals
     signal x_to_multiply: MultiplyInputType;
     signal multiply_to_x: MultiplyOutputType;
+    signal x_to_mult_32s: MultiplyInputType;  -- operands/valid driven into mult_32s_0
+    signal mult_32s_to_x: MultiplyOutputType;  -- result/overflow coming back from mult_32s_0
 
     -- divider signals
     signal x_to_divider: Execute1ToDividerType;
@@ -411,6 +414,14 @@ begin
             m_out => multiply_to_x
             );
 
+    mult_32s_0: entity work.multiply_32s  -- dedicated multiplier for the 32-bit multiply forms
+        port map (
+            clk => clk,
+            stall => stage2_stall,  -- multiplier register holds while stage 2 is stalled
+            m_in => x_to_mult_32s,
+            m_out => mult_32s_to_x
+            );
+
     divider_0: if not HAS_FPU generate
         div_0: entity work.divider
             port map (
@@ -730,14 +741,14 @@ begin
             addend := not addend;
         end if;
 
+        x_to_multiply.data1 <= std_ulogic_vector(abs1);  -- now the same for 32- and 64-bit forms
+        x_to_multiply.data2 <= std_ulogic_vector(abs2);  -- (no zero-extension of the low words here any more)
         x_to_multiply.is_32bit <= e_in.is_32bit;
         x_to_multiply.not_result <= sign1 xor sign2;
         x_to_multiply.addend <= addend;
         x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
         if e_in.is_32bit = '0' then
             -- 64-bit forms
-            x_to_multiply.data1 <= std_ulogic_vector(abs1);
-            x_to_multiply.data2 <= std_ulogic_vector(abs2);
             if e_in.insn_type = OP_DIVE then
                 x_to_divider.is_extended <= '1';
             end if;
@@ -745,8 +756,6 @@ begin
             x_to_divider.divisor <= std_ulogic_vector(abs2);
         else
             -- 32-bit forms
-            x_to_multiply.data1 <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
-            x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
             x_to_divider.is_extended <= '0';
             if e_in.insn_type = OP_DIVE then -- extended forms
                 x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
@@ -756,6 +765,14 @@ begin
             x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
         end if;
 
+        -- signals to 32-bit multiplier
+        x_to_mult_32s.data1 <= 31x"0" & (a_in(31) and e_in.is_signed) & a_in(31 downto 0);  -- bit 32 = a_in(31) if signed, else '0'
+        x_to_mult_32s.data2 <= 31x"0" & (b_in(31) and e_in.is_signed) & b_in(31 downto 0);  -- bit 32 = b_in(31) if signed, else '0'
+        -- The following are unused, but set here to avoid X states
+        x_to_mult_32s.is_32bit <= '1';
+        x_to_mult_32s.not_result <= '0';
+        x_to_mult_32s.addend <= (others => '0');
+
         shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
         case ex1.mul_select is
             when "00" =>
@@ -1271,7 +1288,11 @@ begin
                 v.se.icache_inval := '1';
 
             when OP_MUL_L64 =>
-                if HAS_SHORT_MULT and e_in.reg_valid3 = '0' and
+                if e_in.is_32bit = '1' then  -- 32-bit form: route to mult_32s_0 instead of the 64-bit multiplier
+                    v.se.mult_32s := '1';
+                    v.res2_sel := "00";  -- res2_sel(0) = '0': stage 2 rcresult takes result(63 downto 0)
+                    slow_op := '1';
+                elsif HAS_SHORT_MULT and e_in.reg_valid3 = '0' and
                     fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then
                     -- Operands fit into 16 bits, so use short multiplier
                     if e_in.oe = '1' then
@@ -1285,11 +1306,16 @@ begin
                     owait := '1';
                 end if;
 
-            when OP_MUL_H64 | OP_MUL_H32 =>
+            when OP_MUL_H64 =>
                 v.start_mul := '1';
                 slow_op := '1';
                 owait := '1';
 
+            when OP_MUL_H32 =>  -- split out of the OP_MUL_H64 branch: uses mult_32s_0, no start_mul/owait
+                v.se.mult_32s := '1';
+                v.res2_sel := "01";  -- res2_sel(0) = '1': stage 2 rcresult takes result(63 downto 32) in both words
+                slow_op := '1';
+
             when OP_DIV | OP_DIVE | OP_MOD =>
                 if not HAS_FPU then
                     v.start_div := '1';
@@ -1370,6 +1396,7 @@ begin
         fv := Execute1ToFPUInit;
 
         x_to_multiply.valid <= '0';
+        x_to_mult_32s.valid <= '0';  -- default; overridden below from actions.se.mult_32s
         x_to_divider.valid <= '0';
         v.ext_interrupt := '0';
         v.taken_branch_event := '0';
@@ -1456,6 +1483,7 @@ begin
             v.res2_sel := actions.res2_sel;
             v.msr := actions.new_msr;
             x_to_multiply.valid <= actions.start_mul;
+            x_to_mult_32s.valid <= actions.se.mult_32s;  -- start the 32-bit multiply for this op
             v.mul_in_progress := actions.start_mul;
             x_to_divider.valid <= actions.start_div;
             v.div_in_progress := actions.start_div;
@@ -1624,11 +1652,6 @@ begin
     -- Second execute stage control
     execute2_1: process(all)
         variable v : reg_stage2_type;
-        variable overflow : std_ulogic;
-        variable lv : Execute1ToLoadstore1Type;
-        variable fv : Execute1ToFPUType;
-        variable k : integer;
-        variable go : std_ulogic;
         variable bypass_valid : std_ulogic;
         variable rcresult : std_ulogic_vector(63 downto 0);
         variable sprres : std_ulogic_vector(63 downto 0);
@@ -1647,6 +1670,14 @@ begin
             v.br_mispredict := ex1.br_mispredict;
         end if;
 
+        if ex1.se.mult_32s = '1' and ex1.oe = '1' then  -- OE form of a 32-bit multiply: update XER overflow bits
+            v.e.xerc.ov := mult_32s_to_x.overflow;
+            v.e.xerc.ov32 := mult_32s_to_x.overflow;
+            if mult_32s_to_x.overflow = '1' then
+                v.e.xerc.so := '1';  -- only ever set here, never cleared by this block
+            end if;
+        end if;
+
         ctrl_tmp <= ctrl;
         -- FIXME: run at 512MHz not core freq
         ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
@@ -1667,24 +1698,34 @@ begin
             v.e.write_xerc_enable := '0';
             v.e.redirect := '0';
             v.e.br_last := '0';
-            v.se := side_effect_init;
             v.taken_branch_event := '0';
             v.br_mispredict := '0';
         end if;
         if flush_in = '1' then
             v.e.valid := '0';
             v.e.interrupt := '0';
+            v.se := side_effect_init;  -- side-effect flags are now dropped in the flush branch
             v.ext_interrupt := '0';
         end if;
 
         -- This is split like this because mfspr doesn't have an Rc bit,
         -- and we don't want the zero-detect logic to be after the
         -- SPR mux for timing reasons.
-        if ex1.res2_sel(0) = '0' then
+        if ex1.se.mult_32s = '1' then  -- 32-bit multiply result takes priority in the rcresult mux
+            if ex1.res2_sel(0) = '0' then
+                rcresult := mult_32s_to_x.result(63 downto 0);  -- whole 64-bit product
+            else
+                rcresult := mult_32s_to_x.result(63 downto 32) &
+                            mult_32s_to_x.result(63 downto 32);  -- high word replicated in both halves
+            end if;
+        elsif ex1.res2_sel(0) = '0' then
             rcresult := ex1.e.write_data;
-            sprres := spr_result;
         else
             rcresult := countbits_result;
+        end if;
+        if ex1.res2_sel(0) = '0' then  -- sprres selection is now separate and does not depend on mult_32s
+            sprres := spr_result;
+        else
             sprres := pmu_to_x.spr_val;
         end if;
         if ex1.res2_sel(1) = '0' then
@@ -1708,7 +1749,7 @@ begin
             cr_res(31) := sign;
             cr_res(30) := not (sign or zero);
             cr_res(29) := zero;
-            cr_res(28) := ex1.e.xerc.so;
+            cr_res(28) := v.e.xerc.so;  -- reads v, presumably so an SO update made earlier in this process is reflected
             cr_mask(7) := '1';
         end if;
 
diff --git a/microwatt.core b/microwatt.core
index 4c8695e..b817901 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -66,6 +66,7 @@ filesets:
   xilinx_specific:
     files:
       - xilinx-mult.vhdl : {file_type : vhdlSource-2008}
+      - xilinx-mult-32s.vhdl : {file_type : vhdlSource-2008}
       - fpga/fpga-random.vhdl : {file_type : vhdlSource-2008}
       - fpga/fpga-random.xdc : {file_type : xdc}
 
diff --git a/multiply-32s.vhdl b/multiply-32s.vhdl
new file mode 100644
index 0000000..0639dbf
--- /dev/null
+++ b/multiply-32s.vhdl
@@ -0,0 +1,55 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+-- Signed 33b x 33b multiplier giving 64-bit product, with no addend,
+-- with fixed 1-cycle latency.
+
+entity multiply_32s is
+    port (
+        clk : in std_logic;
+        stall : in std_ulogic;  -- when '1' the internal register is not updated
+
+        m_in : in MultiplyInputType;  -- only valid, data1(32 downto 0) and data2(32 downto 0) are read
+        m_out : out MultiplyOutputType  -- result(63 downto 0) = product, result(127 downto 64) = 0
+        );
+end entity multiply_32s;
+
+architecture behaviour of multiply_32s is
+    type reg_type is record
+        valid : std_ulogic;
+        data : signed(65 downto 0);  -- 33b x 33b signed product is 66 bits wide
+    end record;
+    constant reg_type_init : reg_type := (valid => '0', data => (others => '0'));
+
+    signal r, rin : reg_type := reg_type_init;  -- r: registered state, rin: next state
+begin
+    multiply_0: process(clk)  -- the single pipeline register
+    begin
+        if rising_edge(clk) and stall = '0' then  -- hold r while stalled
+            r <= rin;
+        end if;
+    end process;
+
+    multiply_1: process(all)  -- combinational: multiply inputs, derive outputs from r
+        variable v : reg_type;
+        variable d : std_ulogic_vector(63 downto 0);
+        variable ov : std_ulogic;
+    begin
+        v.valid := m_in.valid;
+        v.data := signed(m_in.data1(32 downto 0)) * signed(m_in.data2(32 downto 0));  -- low 33 bits of each operand, as signed
+
+        d := std_ulogic_vector(r.data(63 downto 0));  -- low 64 bits of the registered product
+
+        ov := (or d(63 downto 31)) and not (and d(63 downto 31));  -- '1' unless bits 63..31 are all equal (fits in signed 32 bits)
+
+        m_out.result <= 64x"0" & d;  -- zero-fill the upper half of the 128-bit result
+        m_out.overflow <= ov;
+        m_out.valid <= r.valid;  -- m_in.valid delayed through the register
+
+        rin <= v;
+    end process;
+end architecture behaviour;
diff --git a/xilinx-mult-32s.vhdl b/xilinx-mult-32s.vhdl
new file mode 100644
index 0000000..fde19ae
--- /dev/null
+++ b/xilinx-mult-32s.vhdl
@@ -0,0 +1,293 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+library unisim;
+use unisim.vcomponents.all;
+
+-- Signed 33b x 33b multiplier giving 64-bit product, with no addend.
+
+entity multiply_32s is
+    port (
+        clk : in std_logic;
+        stall : in std_ulogic;  -- gates clocken and the output registers below
+
+        m_in : in MultiplyInputType;  -- only valid, data1(32 downto 0) and data2(32 downto 0) are read
+        m_out : out MultiplyOutputType
+        );
+end entity multiply_32s;
+
+architecture behaviour of multiply_32s is
+    signal clocken : std_ulogic;
+    signal data1 : std_ulogic_vector(52 downto 0);  -- split below as 23 low bits + 30 high bits
+    signal data2 : std_ulogic_vector(34 downto 0);  -- split below as 17 low bits + 18 high bits
+    signal m00_p, m01_p : std_ulogic_vector(47 downto 0);
+    signal m00_pc : std_ulogic_vector(47 downto 0);  -- cascade m00 -> m01
+    signal m10_p, m11_p : std_ulogic_vector(47 downto 0);
+    signal m10_pc : std_ulogic_vector(47 downto 0);  -- cascade m10 -> m11
+    signal p0_pat, p0_patb : std_ulogic;  -- pattern-detect outputs of m10
+    signal p1_pat, p1_patb : std_ulogic;  -- pattern-detect outputs of m11
+    signal product_lo : std_ulogic_vector(22 downto 0);  -- registered low 23 bits of the product
+
+begin
+    -- sign extend
+    data1 <= std_ulogic_vector(resize(signed(m_in.data1(32 downto 0)), 53));
+    data2 <= std_ulogic_vector(resize(signed(m_in.data2(32 downto 0)), 35));
+
+    clocken <= m_in.valid and not stall;  -- drives CEC/CEM on m10 and CEM on m11
+
+    m00: DSP48E1  -- data1 low part x data2 low part; every *REG generic is 0
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "0000000" & data1(22 downto 0),  -- low 23 bits, zero-padded to the 30-bit A port
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & data2(16 downto 0),  -- low 17 bits with a '0' on top
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0110101",  -- NOTE(review): per the DSP48E1 guide this should be P = A*B + C -- confirm
+            P => m00_p,  -- bits 16..0 feed product_lo
+            PCIN => (others => '0'),
+            PCOUT => m00_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m01: DSP48E1  -- data1 low part x data2 high part, plus m00's cascade output; every *REG generic is 0
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "0000000" & data1(22 downto 0),
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => data2(34 downto 17),  -- high 18 bits, including the sign extension
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "1010101",  -- NOTE(review): per the DSP48E1 guide this should be P = A*B + (PCIN >> 17) -- confirm
+            P => m01_p,  -- bits 5..0 feed product_lo, bits 38..6 feed m10's C input
+            PCIN => m00_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m10: DSP48E1  -- data1 high part x data2 low part, plus the carried-in part of m01_p; MREG is not overridden here
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 1,  -- C input is registered (clock-enabled by CEC => clocken)
+            INMODEREG => 0,
+            MASK => x"fffffffe00ff",  -- zero mask bits are 16..8; NOTE(review): presumably the P bits the detector compares -- confirm
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_PATTERN_DETECT => "PATDET"
+            )
+        port map (
+            A => data1(52 downto 23),  -- high 30 bits, including the sign extension
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & data2(16 downto 0),
+            BCIN => (others => '0'),
+            C => std_ulogic_vector(resize(signed(m01_p(38 downto 6)), 48)),  -- m01_p above its low 6 bits, sign-extended
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => clocken,
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => clocken,
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0110101",  -- same OPMODE as m00
+            P => m10_p,  -- bits 16..0 become result(39 downto 23)
+            PATTERNDETECT => p0_pat,
+            PATTERNBDETECT => p0_patb,
+            PCIN => (others => '0'),
+            PCOUT => m10_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m11: DSP48E1  -- data1 high part x data2 high part, plus m10's cascade output; MREG is not overridden here
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MASK => x"fffffc000000",  -- zero mask bits are 25..0; NOTE(review): presumably the P bits the detector compares -- confirm
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_PATTERN_DETECT => "PATDET"
+            )
+        port map (
+            A => data1(52 downto 23),
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => data2(34 downto 17),
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => clocken,
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "1010101",  -- same OPMODE as m01
+            P => m11_p,  -- bits 23..0 become result(63 downto 40)
+            PATTERNDETECT => p1_pat,
+            PATTERNBDETECT => p1_patb,
+            PCIN => m10_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m_out.result(127 downto 64) <= (others => '0');  -- upper half of the result field is always zero
+    m_out.result(63 downto 40) <= m11_p(23 downto 0);
+    m_out.result(39 downto 23) <= m10_p(16 downto 0);
+    m_out.result(22 downto 0) <= product_lo;
+
+    m_out.overflow <= not ((p0_pat and p1_pat) or (p0_patb and p1_patb));  -- clear only if both detectors report PATTERNDETECT, or both report PATTERNBDETECT
+
+    process(clk)  -- registers for valid and the low product bits; held while stall = '1'
+    begin
+        if rising_edge(clk) and stall = '0' then
+            m_out.valid <= m_in.valid;
+            product_lo <= m01_p(5 downto 0) & m00_p(16 downto 0);  -- 6 + 17 = 23 low product bits
+        end if;
+    end process;
+
+end architecture behaviour;