diff --git a/Makefile b/Makefile
index ebb1b79..794cbc1 100644
--- a/Makefile
+++ b/Makefile
@@ -60,9 +60,9 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
 	decode1.vhdl helpers.vhdl insn_helpers.vhdl \
 	control.vhdl decode2.vhdl register_file.vhdl \
 	cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
-	logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
-	loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
-	core.vhdl fpu.vhdl pmu.vhdl
+	logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \
+	execute1.vhdl loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl \
+	core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl
 
 soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
 	wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \
diff --git a/execute1.vhdl b/execute1.vhdl
index 92da2ee..948bdd6 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -85,6 +85,7 @@ architecture behaviour of execute1 is
         write_pmuspr : std_ulogic;
         ramspr_write_even : std_ulogic;
         ramspr_write_odd : std_ulogic;
+        mult_32s : std_ulogic;
     end record;
     constant side_effect_init : side_effect_type := (others => '0');
 
@@ -203,6 +204,8 @@ architecture behaviour of execute1 is
     -- multiply signals
     signal x_to_multiply: MultiplyInputType;
     signal multiply_to_x: MultiplyOutputType;
+    signal x_to_mult_32s: MultiplyInputType;
+    signal mult_32s_to_x: MultiplyOutputType;
 
     -- divider signals
     signal x_to_divider: Execute1ToDividerType;
@@ -411,6 +414,14 @@ begin
             m_out => multiply_to_x
             );
 
+    mult_32s_0: entity work.multiply_32s
+        port map (
+            clk => clk,
+            stall => stage2_stall,
+            m_in => x_to_mult_32s,
+            m_out => mult_32s_to_x
+            );
+
     divider_0: if not HAS_FPU generate
         div_0: entity work.divider
             port map (
@@ -730,14 +741,14 @@ begin
             addend := not addend;
         end if;
 
+        x_to_multiply.data1 <= std_ulogic_vector(abs1);
+        x_to_multiply.data2 <= std_ulogic_vector(abs2);
 	x_to_multiply.is_32bit <= e_in.is_32bit;
         x_to_multiply.not_result <= sign1 xor sign2;
         x_to_multiply.addend <= addend;
         x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
         if e_in.is_32bit = '0' then
             -- 64-bit forms
-            x_to_multiply.data1 <= std_ulogic_vector(abs1);
-            x_to_multiply.data2 <= std_ulogic_vector(abs2);
             if e_in.insn_type = OP_DIVE then
                 x_to_divider.is_extended <= '1';
             end if;
@@ -745,8 +756,6 @@ begin
             x_to_divider.divisor <= std_ulogic_vector(abs2);
         else
             -- 32-bit forms
-            x_to_multiply.data1 <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
-            x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
             x_to_divider.is_extended <= '0';
             if e_in.insn_type = OP_DIVE then   -- extended forms
                 x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
@@ -756,6 +765,14 @@ begin
             x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
         end if;
 
+        -- signals to 32-bit multiplier
+        x_to_mult_32s.data1 <= 31x"0" & (a_in(31) and e_in.is_signed) & a_in(31 downto 0);
+        x_to_mult_32s.data2 <= 31x"0" & (b_in(31) and e_in.is_signed) & b_in(31 downto 0);
+        -- The following are unused, but set here to avoid X states
+        x_to_mult_32s.is_32bit <= '1';
+        x_to_mult_32s.not_result <= '0';
+        x_to_mult_32s.addend <= (others => '0');
+
         shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
         case ex1.mul_select is
             when "00" =>
@@ -1271,7 +1288,11 @@ begin
 		v.se.icache_inval := '1';
 
 	    when OP_MUL_L64 =>
-                if HAS_SHORT_MULT and e_in.reg_valid3 = '0' and
+                if e_in.is_32bit = '1' then
+                    v.se.mult_32s := '1';
+                    v.res2_sel := "00";
+                    slow_op := '1';
+                elsif HAS_SHORT_MULT and e_in.reg_valid3 = '0' and
                     fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then
                     -- Operands fit into 16 bits, so use short multiplier
                     if e_in.oe = '1' then
@@ -1285,11 +1306,16 @@ begin
                     owait := '1';
                 end if;
 
-	    when OP_MUL_H64 | OP_MUL_H32 =>
+	    when OP_MUL_H64 =>
                 v.start_mul := '1';
                 slow_op := '1';
                 owait := '1';
 
+            when OP_MUL_H32 =>
+                v.se.mult_32s := '1';
+                v.res2_sel := "01";
+                slow_op := '1';
+
 	    when OP_DIV | OP_DIVE | OP_MOD =>
                 if not HAS_FPU then
                     v.start_div := '1';
@@ -1370,6 +1396,7 @@ begin
         fv := Execute1ToFPUInit;
 
         x_to_multiply.valid <= '0';
+        x_to_mult_32s.valid <= '0';
         x_to_divider.valid <= '0';
         v.ext_interrupt := '0';
         v.taken_branch_event := '0';
@@ -1456,6 +1483,7 @@ begin
             v.res2_sel := actions.res2_sel;
             v.msr := actions.new_msr;
             x_to_multiply.valid <= actions.start_mul;
+            x_to_mult_32s.valid <= actions.se.mult_32s;
             v.mul_in_progress := actions.start_mul;
             x_to_divider.valid <= actions.start_div;
             v.div_in_progress := actions.start_div;
@@ -1624,11 +1652,6 @@ begin
     -- Second execute stage control
     execute2_1: process(all)
 	variable v : reg_stage2_type;
-	variable overflow : std_ulogic;
-        variable lv : Execute1ToLoadstore1Type;
-        variable fv : Execute1ToFPUType;
-        variable k : integer;
-        variable go : std_ulogic;
         variable bypass_valid : std_ulogic;
         variable rcresult : std_ulogic_vector(63 downto 0);
         variable sprres : std_ulogic_vector(63 downto 0);
@@ -1647,6 +1670,14 @@ begin
             v.br_mispredict := ex1.br_mispredict;
         end if;
 
+        if ex1.se.mult_32s = '1' and ex1.oe = '1' then
+            v.e.xerc.ov := mult_32s_to_x.overflow;
+            v.e.xerc.ov32 := mult_32s_to_x.overflow;
+            if mult_32s_to_x.overflow = '1' then
+                v.e.xerc.so := '1';
+            end if;
+        end if;
+
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
 	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
@@ -1667,24 +1698,34 @@ begin
             v.e.write_xerc_enable := '0';
             v.e.redirect := '0';
             v.e.br_last := '0';
-            v.se := side_effect_init;
             v.taken_branch_event := '0';
             v.br_mispredict := '0';
         end if;
         if flush_in = '1' then
             v.e.valid := '0';
             v.e.interrupt := '0';
+            v.se := side_effect_init;
             v.ext_interrupt := '0';
         end if;
 
         -- This is split like this because mfspr doesn't have an Rc bit,
         -- and we don't want the zero-detect logic to be after the
         -- SPR mux for timing reasons.
-        if ex1.res2_sel(0) = '0' then
+        if ex1.se.mult_32s = '1' then
+            if ex1.res2_sel(0) = '0' then
+                rcresult := mult_32s_to_x.result(63 downto 0);
+            else
+                rcresult := mult_32s_to_x.result(63 downto 32) &
+                            mult_32s_to_x.result(63 downto 32);
+            end if;
+        elsif ex1.res2_sel(0) = '0' then
             rcresult := ex1.e.write_data;
-            sprres := spr_result;
         else
             rcresult := countbits_result;
+        end if;
+        if ex1.res2_sel(0) = '0' then
+            sprres := spr_result;
+        else
             sprres := pmu_to_x.spr_val;
         end if;
         if ex1.res2_sel(1) = '0' then
@@ -1708,7 +1749,7 @@ begin
             cr_res(31) := sign;
             cr_res(30) := not (sign or zero);
             cr_res(29) := zero;
-            cr_res(28) := ex1.e.xerc.so;
+            cr_res(28) := v.e.xerc.so;
             cr_mask(7) := '1';
         end if;
 
diff --git a/microwatt.core b/microwatt.core
index 4c8695e..b817901 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -66,6 +66,7 @@ filesets:
   xilinx_specific:
     files:
       - xilinx-mult.vhdl : {file_type : vhdlSource-2008}
+      - xilinx-mult-32s.vhdl : {file_type : vhdlSource-2008}
       - fpga/fpga-random.vhdl : {file_type : vhdlSource-2008}
       - fpga/fpga-random.xdc : {file_type : xdc}
 
diff --git a/multiply-32s.vhdl b/multiply-32s.vhdl
new file mode 100644
index 0000000..0639dbf
--- /dev/null
+++ b/multiply-32s.vhdl
@@ -0,0 +1,55 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+-- Signed 33b x 33b multiplier giving 64-bit product, with no addend,
+-- with fixed 1-cycle latency.
+
+entity multiply_32s is  -- generic (non-vendor) model of the 33b x 33b signed multiplier
+    port (
+        clk   : in std_logic;
+        stall : in std_ulogic;            -- '1' holds the product register (no update on clk)
+
+        m_in  : in MultiplyInputType;     -- uses valid and bits 32..0 of data1 / data2
+        m_out : out MultiplyOutputType    -- drives result, overflow and valid
+        );
+end entity multiply_32s;
+
+architecture behaviour of multiply_32s is
+    -- Pipeline register: the full 66-bit signed product plus its valid flag.
+    type reg_type is record
+        valid     : std_ulogic;
+        data      : signed(65 downto 0);
+    end record;
+    constant reg_type_init : reg_type := (valid => '0', data => (others => '0'));
+    signal r, rin : reg_type := reg_type_init;
+begin
+    -- Next-state value: product of the two 33-bit signed operands.
+    rin.valid <= m_in.valid;
+    rin.data <= signed(m_in.data1(32 downto 0)) * signed(m_in.data2(32 downto 0));
+
+    -- Register stage; keeps its contents while stall is asserted.
+    multiply_0: process(clk)
+    begin
+        if rising_edge(clk) then
+            if stall = '0' then
+                r <= rin;
+            end if;
+        end if;
+    end process;
+
+    -- Drive the outputs from the registered product.
+    multiply_1: process(all)
+        variable hi : std_ulogic_vector(63 downto 31);
+    begin
+        hi := std_ulogic_vector(r.data(63 downto 31));
+
+        m_out.valid <= r.valid;
+        m_out.result <= 64x"0" & std_ulogic_vector(r.data(63 downto 0));
+        -- Overflow when bits 63..31 are neither all zeros nor all ones.
+        m_out.overflow <= (or hi) and not (and hi);
+    end process;
+end architecture behaviour;
diff --git a/xilinx-mult-32s.vhdl b/xilinx-mult-32s.vhdl
new file mode 100644
index 0000000..fde19ae
--- /dev/null
+++ b/xilinx-mult-32s.vhdl
@@ -0,0 +1,293 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+library unisim;
+use unisim.vcomponents.all;
+
+-- Signed 33b x 33b multiplier giving 64-bit product, with no addend.
+
+entity multiply_32s is  -- Xilinx DSP48E1 implementation of the 33b x 33b signed multiplier
+    port (
+        clk   : in std_logic;
+        stall : in std_ulogic;            -- '1' blocks the DSP clock enables and the output register
+
+        m_in  : in MultiplyInputType;     -- uses valid and bits 32..0 of data1 / data2
+        m_out : out MultiplyOutputType    -- drives result, overflow and valid
+        );
+end entity multiply_32s;
+
+architecture behaviour of multiply_32s is
+    signal clocken : std_ulogic;                            -- clock enable for the DSP M/C registers
+    signal data1 : std_ulogic_vector(52 downto 0);          -- operand 1, sign-extended from 33 bits
+    signal data2 : std_ulogic_vector(34 downto 0);          -- operand 2, sign-extended from 33 bits
+    signal m00_p, m01_p : std_ulogic_vector(47 downto 0);   -- P outputs of the two low-half DSPs
+    signal m00_pc : std_ulogic_vector(47 downto 0);         -- cascade m00 -> m01
+    signal m10_p, m11_p : std_ulogic_vector(47 downto 0);   -- P outputs of the two high-half DSPs
+    signal m10_pc : std_ulogic_vector(47 downto 0);         -- cascade m10 -> m11
+    signal p0_pat, p0_patb : std_ulogic;                    -- pattern-detect outputs of m10
+    signal p1_pat, p1_patb : std_ulogic;                    -- pattern-detect outputs of m11
+    signal product_lo : std_ulogic_vector(22 downto 0);     -- registered product bits 22..0
+
+begin
+    -- Sign-extend so data1 splits into 23 + 30 bits (A port) and data2 into 17 + 18 bits (B port)
+    data1 <= std_ulogic_vector(resize(signed(m_in.data1(32 downto 0)), 53));
+    data2 <= std_ulogic_vector(resize(signed(m_in.data2(32 downto 0)), 35));
+
+    clocken <= m_in.valid and not stall;  -- DSP registers only capture valid, unstalled inputs
+
+    m00: DSP48E1  -- data1(22..0) x data2(16..0), all register generics 0; weight 2^0
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "0000000" & data1(22 downto 0),  -- low 23 bits, zero-padded (treated as unsigned)
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & data2(16 downto 0),  -- low 17 bits, zero-padded (treated as unsigned)
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0110101",  -- per UG479: X,Y = multiplier output, Z = C (tied to 0 here)
+            P => m00_p,
+            PCIN => (others => '0'),
+            PCOUT => m00_pc,  -- feeds m01's PCIN
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m01: DSP48E1  -- data1(22..0) x data2(34..17) plus m00 shifted down; weight 2^17
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "0000000" & data1(22 downto 0),  -- low 23 bits, zero-padded (treated as unsigned)
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => data2(34 downto 17),  -- upper 18 bits of the sign-extended operand (signed)
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "1010101",  -- per UG479: X,Y = multiplier output, Z = PCIN shifted right 17
+            P => m01_p,
+            PCIN => m00_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m10: DSP48E1  -- data1(52..23) x data2(16..0) plus m01 shifted down by 6; weight 2^23
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 1,  -- C is registered here (enabled by CEC); MREG is not set, so primitive default applies
+            INMODEREG => 0,
+            MASK => x"fffffffe00ff",  -- only bits 16..8 are 0; per UG479 a 1 excludes the bit from pattern detect
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_PATTERN_DETECT => "PATDET"
+            )
+        port map (
+            A => data1(52 downto 23),  -- upper 30 bits of the sign-extended operand
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & data2(16 downto 0),  -- low 17 bits, zero-padded (treated as unsigned)
+            BCIN => (others => '0'),
+            C => std_ulogic_vector(resize(signed(m01_p(38 downto 6)), 48)),  -- m01 sum >> 6, sign-extended
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => clocken,  -- C register captures together with the M register
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => clocken,
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0110101",  -- per UG479: X,Y = multiplier output, Z = C
+            P => m10_p,
+            PATTERNDETECT => p0_pat,
+            PATTERNBDETECT => p0_patb,
+            PCIN => (others => '0'),
+            PCOUT => m10_pc,  -- feeds m11's PCIN
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m11: DSP48E1  -- data1(52..23) x data2(34..17) plus m10 shifted down; weight 2^40
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,  -- MREG is not set for this instance either, so primitive default applies
+            INMODEREG => 0,
+            MASK => x"fffffc000000",  -- only bits 25..0 are 0; per UG479 a 1 excludes the bit from pattern detect
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_PATTERN_DETECT => "PATDET"
+            )
+        port map (
+            A => data1(52 downto 23),  -- upper 30 bits of the sign-extended operand
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => data2(34 downto 17),  -- upper 18 bits of the sign-extended operand (signed)
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => clocken,
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "1010101",  -- per UG479: X,Y = multiplier output, Z = PCIN shifted right 17
+            P => m11_p,
+            PATTERNDETECT => p1_pat,
+            PATTERNBDETECT => p1_patb,
+            PCIN => m10_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m_out.result(127 downto 64) <= (others => '0');    -- only a 64-bit product is produced
+    m_out.result(63 downto 40) <= m11_p(23 downto 0);  -- weight-2^40 partial sum
+    m_out.result(39 downto 23) <= m10_p(16 downto 0);  -- weight-2^23 partial sum
+    m_out.result(22 downto 0)  <= product_lo;          -- registered low bits from m01/m00
+
+    m_out.overflow <= not ((p0_pat and p1_pat) or (p0_patb and p1_patb));  -- NOTE(review): assumes default all-zero PATTERN, i.e. clear only if unmasked bits are all 0s or all 1s - confirm
+
+    process(clk)  -- output register for valid and the low product bits; held while stall = '1'
+    begin
+        if rising_edge(clk) and stall = '0' then
+            m_out.valid <= m_in.valid;
+            product_lo <= m01_p(5 downto 0) & m00_p(16 downto 0);  -- product bits 22..17 and 16..0
+        end if;
+    end process;
+
+end architecture behaviour;