diff --git a/Makefile b/Makefile
index ce08c33..2780db0 100644
--- a/Makefile
+++ b/Makefile
@@ -31,7 +31,7 @@ common.o: decode_types.o
 control.o: gpr_hazard.o cr_hazard.o common.o
 sim_jtag.o: sim_jtag_socket.o
 core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o
-core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o multiply.o writeback.o core_debug.o divider.o
+core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o
 core_debug.o: common.o
 countzero.o:
 countzero_tb.o: common.o glibc_random.o countzero.o
@@ -40,7 +40,7 @@ crhelpers.o: common.o
 decode1.o: common.o decode_types.o
 decode2.o: decode_types.o common.o helpers.o insn_helpers.o control.o
 decode_types.o:
-execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o
+execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o divider.o
 fetch1.o: common.o
 fetch2.o: common.o wishbone_types.o
 glibc_random_helpers.o:
diff --git a/common.vhdl b/common.vhdl
index a27f4f2..ffddb0b 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -109,6 +109,9 @@ package common is
 	read_data1: std_ulogic_vector(63 downto 0);
 	read_data2: std_ulogic_vector(63 downto 0);
 	read_data3: std_ulogic_vector(63 downto 0);
+        bypass_data1: std_ulogic;
+        bypass_data2: std_ulogic;
+        bypass_data3: std_ulogic;
 	cr: std_ulogic_vector(31 downto 0);
 	xerc: xer_common_t;
 	lr: std_ulogic;
@@ -124,44 +127,41 @@ package common is
 	is_signed: std_ulogic;
 	insn: std_ulogic_vector(31 downto 0);
 	data_len: std_ulogic_vector(3 downto 0);
+	byte_reverse : std_ulogic;
+	sign_extend : std_ulogic;			-- do we need to sign extend?
+	update : std_ulogic;				-- is this an update instruction?
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
-	(valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', oe => '0', invert_a => '0',
+	(valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
+         lr => '0', rc => '0', oe => '0', invert_a => '0',
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
-	 is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0'));
+	 is_32bit => '0', is_signed => '0', xerc => xerc_init,
+         byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0'));
 
-    type Decode2ToMultiplyType is record
+    type Execute1ToMultiplyType is record -- renamed from Decode2ToMultiplyType; write_reg, rc, oe and xerc are no longer carried
 	valid: std_ulogic;
 	insn_type: insn_type_t;
-	write_reg: gpr_index_t;
 	data1: std_ulogic_vector(64 downto 0);
 	data2: std_ulogic_vector(64 downto 0);
-	rc: std_ulogic;
-	oe: std_ulogic;
 	is_32bit: std_ulogic;
-	xerc: xer_common_t;
     end record;
-    constant Decode2ToMultiplyInit : Decode2ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0',
-							       oe => '0', is_32bit => '0', xerc => xerc_init,
-							       others => (others => '0'));
+    constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, -- idle value: not valid, OP_ILLEGAL
+								 is_32bit => '0',
+								 others => (others => '0')); -- remaining vector fields (data1, data2) are all zeros
 
-    type Decode2ToDividerType is record
+    type Execute1ToDividerType is record -- renamed from Decode2ToDividerType; write_reg, rc, oe and xerc are no longer carried
 	valid: std_ulogic;
-	write_reg: gpr_index_t;
 	dividend: std_ulogic_vector(63 downto 0);
 	divisor: std_ulogic_vector(63 downto 0);
 	is_signed: std_ulogic;
 	is_32bit: std_ulogic;
 	is_extended: std_ulogic;
 	is_modulus: std_ulogic;
-	rc: std_ulogic;
-	oe: std_ulogic;
-	xerc: xer_common_t;
+        neg_result: std_ulogic; -- new field; its use is not visible here (presumably asks for a negated result - TODO confirm)
     end record;
-    constant Decode2ToDividerInit: Decode2ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0',
-							    is_extended => '0', is_modulus => '0',
-							    rc => '0', oe => '0', xerc => xerc_init,
-							    others => (others => '0'));
+    constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', -- idle value: every flag cleared
+                                                              is_extended => '0', is_modulus => '0',
+                                                              neg_result => '0', others => (others => '0')); -- dividend and divisor are all zeros
 
     type Decode2ToRegisterFileType is record
 	read1_enable : std_ulogic;
@@ -193,7 +193,7 @@ package common is
     end record;
     constant Execute1ToFetch1TypeInit : Execute1ToFetch1Type := (redirect => '0', others => (others => '0'));
 
-    type Decode2ToLoadstore1Type is record
+    type Execute1ToLoadstore1Type is record
 	valid : std_ulogic;
 	load : std_ulogic;				-- is this a load or store
 	addr1 : std_ulogic_vector(63 downto 0);
@@ -207,9 +207,9 @@ package common is
 	update_reg : gpr_index_t;                      	-- if so, the register to update
 	xerc : xer_common_t;
     end record;
-    constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0',
-								   sign_extend => '0', update => '0', xerc => xerc_init,
-								   others => (others => '0'));
+    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0',
+                                                                     sign_extend => '0', update => '0', xerc => xerc_init,
+                                                                     others => (others => '0'));
 
     type Loadstore1ToDcacheType is record
 	valid : std_ulogic;
@@ -248,48 +248,32 @@ package common is
 	write_enable : std_ulogic;
 	write_reg: gspr_index_t;
 	write_data: std_ulogic_vector(63 downto 0);
-	write_len : std_ulogic_vector(3 downto 0);
 	write_cr_enable : std_ulogic;
 	write_cr_mask : std_ulogic_vector(7 downto 0);
 	write_cr_data : std_ulogic_vector(31 downto 0);
 	write_xerc_enable : std_ulogic;
 	xerc : xer_common_t;
-	sign_extend: std_ulogic;
     end record;
     constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', write_enable => '0',
-								   write_cr_enable => '0', sign_extend => '0',
+								   write_cr_enable => '0',
 								   write_xerc_enable => '0', xerc => xerc_init,
 								   others => (others => '0'));
 
-    type MultiplyToWritebackType is record
+    type MultiplyToExecute1Type is record -- renamed from MultiplyToWritebackType; register number, rc and XER fields are dropped
 	valid: std_ulogic;
-
-	write_reg_enable : std_ulogic;
-	write_reg_nr: gpr_index_t;
 	write_reg_data: std_ulogic_vector(63 downto 0);
-	write_xerc_enable : std_ulogic;
-	xerc : xer_common_t;
-	rc: std_ulogic;
+        overflow : std_ulogic; -- single overflow flag in place of the removed xerc / write_xerc_enable pair
     end record;
-    constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0',
-								   rc => '0', write_xerc_enable => '0',
-								   xerc => xerc_init,
-								   others => (others => '0'));
+    constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0', -- idle value: not valid, no overflow
+								 others => (others => '0')); -- write_reg_data is all zeros
 
-    type DividerToWritebackType is record
+    type DividerToExecute1Type is record -- renamed from DividerToWritebackType; register number, rc and XER fields are dropped
 	valid: std_ulogic;
-
-	write_reg_enable : std_ulogic;
-	write_reg_nr: gpr_index_t;
 	write_reg_data: std_ulogic_vector(63 downto 0);
-	write_xerc_enable : std_ulogic;
-	xerc : xer_common_t;
-	rc: std_ulogic;
+        overflow : std_ulogic; -- single overflow flag in place of the removed xerc / write_xerc_enable pair
     end record;
-    constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0',
-								 rc => '0', write_xerc_enable => '0',
-								 xerc => xerc_init,
-								 others => (others => '0'));
+    constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', overflow => '0', -- idle value: not valid, no overflow
+                                                               others => (others => '0')); -- write_reg_data is all zeros
 
     type WritebackToRegisterFileType is record
 	write_reg : gspr_index_t;
diff --git a/control.vhdl b/control.vhdl
index ead3c1f..064ff98 100644
--- a/control.vhdl
+++ b/control.vhdl
@@ -21,6 +21,7 @@ entity control is
 
         gpr_write_valid_in  : in std_ulogic;
         gpr_write_in        : in gspr_index_t;
+        gpr_bypassable      : in std_ulogic;
 
         gpr_a_read_valid_in : in std_ulogic;
         gpr_a_read_in       : in gspr_index_t;
@@ -36,7 +37,11 @@ entity control is
 
         valid_out           : out std_ulogic;
         stall_out           : out std_ulogic;
-        stopped_out         : out std_ulogic
+        stopped_out         : out std_ulogic;
+
+        gpr_bypass_a        : out std_ulogic;
+        gpr_bypass_b        : out std_ulogic;
+        gpr_bypass_c        : out std_ulogic
         );
 end entity control;
 
@@ -71,10 +76,12 @@ begin
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
+            bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_a_read_valid_in,
             gpr_read_in        => gpr_a_read_in,
 
-            stall_out          => stall_a_out
+            stall_out          => stall_a_out,
+            use_bypass         => gpr_bypass_a
             );
 
     gpr_hazard1: entity work.gpr_hazard
@@ -87,10 +94,12 @@ begin
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
+            bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_b_read_valid_in,
             gpr_read_in        => gpr_b_read_in,
 
-            stall_out          => stall_b_out
+            stall_out          => stall_b_out,
+            use_bypass         => gpr_bypass_b
             );
 
     gpr_c_read_in_fmt <= "0" & gpr_c_read_in;
@@ -105,10 +114,12 @@ begin
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
+            bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_c_read_valid_in,
             gpr_read_in        => gpr_c_read_in_fmt,
 
-            stall_out          => stall_c_out
+            stall_out          => stall_c_out,
+            use_bypass         => gpr_bypass_c
             );
 
     cr_hazard0: entity work.cr_hazard
diff --git a/core.vhdl b/core.vhdl
index eb0b526..bc0b16f 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -9,7 +9,8 @@ use work.wishbone_types.all;
 entity core is
     generic (
         SIM : boolean := false;
-	DISABLE_FLATTEN : boolean := false
+	DISABLE_FLATTEN : boolean := false;
+        EX1_BYPASS : boolean := true
         );
     port (
         clk          : in std_logic;
@@ -59,18 +60,10 @@ architecture behave of core is
     signal execute1_to_fetch1: Execute1ToFetch1Type;
 
     -- load store signals
-    signal decode2_to_loadstore1: Decode2ToLoadstore1Type;
+    signal execute1_to_loadstore1: Execute1ToLoadstore1Type;
     signal loadstore1_to_dcache: Loadstore1ToDcacheType;
     signal dcache_to_writeback: DcacheToWritebackType;
 
-    -- multiply signals
-    signal decode2_to_multiply: Decode2ToMultiplyType;
-    signal multiply_to_writeback: MultiplyToWritebackType;
-
-    -- divider signals
-    signal decode2_to_divider: Decode2ToDividerType;
-    signal divider_to_writeback: DividerToWritebackType;
-
     -- local signals
     signal fetch1_stall_in : std_ulogic;
     signal icache_stall_out : std_ulogic;
@@ -115,8 +108,6 @@ architecture behave of core is
     attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN);
-    attribute keep_hierarchy of multiply_0 : label is keep_h(DISABLE_FLATTEN);
-    attribute keep_hierarchy of divider_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN);
@@ -186,6 +177,9 @@ begin
     decode1_stall_in <= decode2_stall_out;
 
     decode2_0: entity work.decode2
+        generic map (
+            EX1_BYPASS => EX1_BYPASS
+            )
         port map (
             clk => clk,
             rst => core_rst,
@@ -196,9 +190,6 @@ begin
 	    stopped_out => dbg_core_is_stopped,
             d_in => decode1_to_decode2,
             e_out => decode2_to_execute1,
-            l_out => decode2_to_loadstore1,
-            m_out => decode2_to_multiply,
-            d_out => decode2_to_divider,
             r_in => register_file_to_decode2,
             r_out => decode2_to_register_file,
             c_in => cr_file_to_decode2,
@@ -232,11 +223,16 @@ begin
             );
 
     execute1_0: entity work.execute1
+        generic map (
+            EX1_BYPASS => EX1_BYPASS
+            )
         port map (
             clk => clk,
+            rst => core_rst,
             flush_out => flush,
 	    stall_out => ex1_stall_out,
             e_in => decode2_to_execute1,
+            l_out => execute1_to_loadstore1,
             f_out => execute1_to_fetch1,
             e_out => execute1_to_writeback,
 	    icache_inval => ex1_icache_inval,
@@ -246,7 +242,7 @@ begin
     loadstore1_0: entity work.loadstore1
         port map (
             clk => clk,
-            l_in => decode2_to_loadstore1,
+            l_in => execute1_to_loadstore1,
             l_out => loadstore1_to_dcache
             );
 
@@ -265,28 +261,11 @@ begin
             wishbone_out => wishbone_data_out
             );
 
-    multiply_0: entity work.multiply
-        port map (
-            clk => clk,
-            m_in => decode2_to_multiply,
-            m_out => multiply_to_writeback
-            );
-
-    divider_0: entity work.divider
-        port map (
-            clk => clk,
-            rst => core_rst,
-            d_in => decode2_to_divider,
-            d_out => divider_to_writeback
-            );
-
     writeback_0: entity work.writeback
         port map (
             clk => clk,
             e_in => execute1_to_writeback,
             l_in => dcache_to_writeback,
-            m_in => multiply_to_writeback,
-            d_in => divider_to_writeback,
             w_out => writeback_to_register_file,
             c_out => writeback_to_cr_file,
             complete_out => complete
diff --git a/countzero.vhdl b/countzero.vhdl
index d3960f0..50e6ead 100644
--- a/countzero.vhdl
+++ b/countzero.vhdl
@@ -6,6 +6,7 @@ library work;
 
 entity zero_counter is
     port (
+        clk         : in std_logic;
 	rs          : in std_ulogic_vector(63 downto 0);
 	count_right : in std_ulogic;
 	is_32bit    : in std_ulogic;
@@ -14,10 +15,14 @@ entity zero_counter is
 end entity zero_counter;
 
 architecture behaviour of zero_counter is
-    signal y, z     : std_ulogic_vector(3 downto 0);
-    signal v16      : std_ulogic_vector(15 downto 0);
-    signal v4       : std_ulogic_vector(3 downto 0);
-    signal sel      : std_ulogic_vector(5 downto 0);
+    type intermediate_result is record -- first-stage values that are registered in r and consumed one clock later
+        v16: std_ulogic_vector(15 downto 0); -- the 16-bit group of rs selected by sel_hi
+        sel_hi: std_ulogic_vector(1 downto 0); -- index of that group; becomes sel(5 downto 4) in the second stage
+        is_32bit: std_ulogic; -- copy of the is_32bit input taken with the first-stage result
+        count_right: std_ulogic; -- copy of the count_right input taken with the first-stage result
+    end record;
+
+    signal r, r_in  : intermediate_result; -- r_in: value computed this cycle; r: copy of r_in taken at the rising clk edge
 
     -- Return the index of the leftmost or rightmost 1 in a set of 4 bits.
     -- Assumes v is not "0000"; if it is, return (right ? "11" : "00").
@@ -47,65 +52,83 @@ architecture behaviour of zero_counter is
     end;
 
 begin
-    zerocounter0: process(all)
+    zerocounter_0: process(clk) -- stage register: copies r_in into r on every rising edge of clk
+    begin
+	if rising_edge(clk) then
+            r <= r_in; -- no reset branch here, so r holds its default value until the first rising edge
+        end if;
+    end process;
+
+    zerocounter_1: process(all)
+        variable v: intermediate_result;
+        variable y, z: std_ulogic_vector(3 downto 0);
+        variable sel: std_ulogic_vector(5 downto 0);
+        variable v4: std_ulogic_vector(3 downto 0);
+
     begin
 	-- Test 4 groups of 16 bits each.
 	-- The top 2 groups are considered to be zero in 32-bit mode.
-	z(0) <= or (rs(15 downto 0));
-	z(1) <= or (rs(31 downto 16));
-	z(2) <= or (rs(47 downto 32));
-	z(3) <= or (rs(63 downto 48));
+	z(0) := or (rs(15 downto 0));
+	z(1) := or (rs(31 downto 16));
+	z(2) := or (rs(47 downto 32));
+	z(3) := or (rs(63 downto 48));
         if is_32bit = '0' then
-            sel(5 downto 4) <= encoder(z, count_right);
+            v.sel_hi := encoder(z, count_right);
         else
-            sel(5) <= '0';
+            v.sel_hi(1) := '0';
             if count_right = '0' then
-                sel(4) <= z(1);
+                v.sel_hi(0) := z(1);
             else
-                sel(4) <= not z(0);
+                v.sel_hi(0) := not z(0);
             end if;
         end if;
 
 	-- Select the leftmost/rightmost non-zero group of 16 bits
-	case sel(5 downto 4) is
+	case v.sel_hi is
 	    when "00" =>
-		v16 <= rs(15 downto 0);
+		v.v16 := rs(15 downto 0);
 	    when "01" =>
-		v16 <= rs(31 downto 16);
+		v.v16 := rs(31 downto 16);
 	    when "10" =>
-		v16 <= rs(47 downto 32);
+		v.v16 := rs(47 downto 32);
 	    when others =>
-		v16 <= rs(63 downto 48);
+		v.v16 := rs(63 downto 48);
 	end case;
 
+        -- Latch this and do the rest in the next cycle, for the sake of timing
+        v.is_32bit := is_32bit;
+        v.count_right := count_right;
+        r_in <= v;
+        sel(5 downto 4) := r.sel_hi;
+
 	-- Test 4 groups of 4 bits
-	y(0) <= or (v16(3 downto 0));
-	y(1) <= or (v16(7 downto 4));
-	y(2) <= or (v16(11 downto 8));
-	y(3) <= or (v16(15 downto 12));
-	sel(3 downto 2) <= encoder(y, count_right);
+	y(0) := or (r.v16(3 downto 0));
+	y(1) := or (r.v16(7 downto 4));
+	y(2) := or (r.v16(11 downto 8));
+	y(3) := or (r.v16(15 downto 12));
+	sel(3 downto 2) := encoder(y, r.count_right);
 
 	-- Select the leftmost/rightmost non-zero group of 4 bits
 	case sel(3 downto 2) is
 	    when "00" =>
-		v4 <= v16(3 downto 0);
+		v4 := r.v16(3 downto 0);
 	    when "01" =>
-		v4 <= v16(7 downto 4);
+		v4 := r.v16(7 downto 4);
 	    when "10" =>
-		v4 <= v16(11 downto 8);
+		v4 := r.v16(11 downto 8);
 	    when others =>
-		v4 <= v16(15 downto 12);
+		v4 := r.v16(15 downto 12);
 	end case;
 
-	sel(1 downto 0) <= encoder(v4, count_right);
+	sel(1 downto 0) := encoder(v4, r.count_right);
 
 	-- sel is now the index of the leftmost/rightmost 1 bit in rs
 	if v4 = "0000" then
 	    -- operand is zero, return 32 for 32-bit, else 64
-	    result <= x"00000000000000" & '0' & not is_32bit & is_32bit & "00000";
-	elsif count_right = '0' then
+	    result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000";
+	elsif r.count_right = '0' then
 	    -- return (63 - sel), trimmed to 5 bits in 32-bit mode
-	    result <= x"00000000000000" & "00" & (not sel(5) and not is_32bit) & not sel(4 downto 0);
+	    result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0);
 	else
 	    result <= x"00000000000000" & "00" & sel;
 	end if;
diff --git a/countzero_tb.vhdl b/countzero_tb.vhdl
index 91de334..21529de 100644
--- a/countzero_tb.vhdl
+++ b/countzero_tb.vhdl
@@ -15,16 +15,26 @@ architecture behave of countzero_tb is
     signal is_32bit, count_right: std_ulogic := '0';
     signal result: std_ulogic_vector(63 downto 0);
     signal randno: std_ulogic_vector(63 downto 0);
+    signal clk: std_ulogic;
 
 begin
     zerocounter_0: entity work.zero_counter
 	port map (
+            clk => clk,
 	    rs => rs,
 	    result => result,
 	    count_right => count_right,
 	    is_32bit => is_32bit
 	);
 
+    clk_process: process -- free-running clock that drives the clk port of zerocounter_0
+    begin
+        clk <= '0';
+        wait for clk_period/2; -- clk_period is declared outside the visible lines (presumably a time constant - TODO confirm)
+        clk <= '1';
+        wait for clk_period/2;
+    end process; -- no final wait, so the process restarts: low and high phases each last clk_period/2
+
     stim_process: process
         variable r: std_ulogic_vector(63 downto 0);
     begin
diff --git a/decode1.vhdl b/decode1.vhdl
index 51a2643..f1b5ad4 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -44,8 +44,8 @@ architecture behaviour of decode1 is
 		29 =>       (ALU,    OP_AND,       NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- andis.
 		18 =>       (ALU,    OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b
 		16 =>       (ALU,    OP_BC,        SPR,        CONST_BD,    NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc
-		11 =>       (ALU,    OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpi
-		10 =>       (ALU,    OP_CMPL,      RA,         CONST_UI,    NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
+		11 =>       (ALU,    OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi
+		10 =>       (ALU,    OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
 		34 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz
 		35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzu
 		42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lha
@@ -54,7 +54,7 @@ architecture behaviour of decode1 is
 		41 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzu
 		32 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwz
                 33 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzu
-		 7 =>       (MUL,    OP_MUL_L64,   RA,         CONST_SI,    NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '1'), -- mulli
+		 7 =>       (ALU,    OP_MUL_L64,   RA,         CONST_SI,    NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli
 		24 =>       (ALU,    OP_OR,        NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori
 		25 =>       (ALU,    OP_OR,        NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris
 		20 =>       (ALU,    OP_RLC,       RA,         CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- rlwimi
@@ -66,7 +66,7 @@ architecture behaviour of decode1 is
 		45 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthu
 		36 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stw
 		37 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwu
-		 8 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- subfic
+		 8 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic
 		 2 =>       (ALU,    OP_TDI,       RA,         CONST_SI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi
 		--PPC_TWI 3
 		26 =>       (ALU,    OP_XOR,       NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xori
@@ -145,10 +145,10 @@ architecture behaviour of decode1 is
 		2#0000011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- and
 		2#0000111100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- andc
 		-- 2#0011111100# bperm
-		2#0000000000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmp
+		2#0000000000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp
 		2#0111111100#  =>       (ALU,    OP_CMPB,      NONE,       RB,          RS,   RA,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb
 		-- 2#0011100000# cmpeqb
-		2#0000100000#  =>       (ALU,    OP_CMPL,      RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl
+		2#0000100000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl
 		-- 2#0011000000# cmprb
 		2#0000111010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- cntlzd
 		2#0000011010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- cntlzw
@@ -160,22 +160,22 @@ architecture behaviour of decode1 is
 		2#0100010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt
 		2#0011110110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst
 		-- 2#1111110110# dcbz
-		2#0110001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeu
-		2#1110001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeuo
-		2#0110001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweu
-		2#1110001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweuo
-		2#0110101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divde
-		2#1110101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeo
-		2#0110101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwe
-		2#1110101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweo
-		2#0111001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdu
-		2#1111001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divduo
-		2#0111001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwu
-		2#1111001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwuo
-		2#0111101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divd
-		2#1111101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdo
-		2#0111101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divw
-		2#1111101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwo
+		2#0110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeu
+		2#1110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeuo
+		2#0110001011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divweu
+		2#1110001011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divweuo
+		2#0110101001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divde
+		2#1110101001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divdeo
+		2#0110101011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divwe
+		2#1110101011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divweo
+		2#0111001001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdu
+		2#1111001001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divduo
+		2#0111001011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divwu
+		2#1111001011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divwuo
+		2#0111101001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divd
+		2#1111101001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divdo
+		2#0111101011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divw
+		2#1111101011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divwo
 		2#0100011100#  =>       (ALU,    OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- eqv
 		2#1110111010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsb
 		2#1110011010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsh
@@ -238,36 +238,36 @@ architecture behaviour of decode1 is
 		-- 2#1001000000# mcrxrx
 		2#0000010011#  =>       (ALU,    OP_MFCR,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf
 		2#0101010011#  =>       (ALU,    OP_MFSPR,     SPR,        NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr
-		2#0100001001#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modud
-		2#0100001011#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- moduw
-		2#1100001001#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsd
-		2#1100001011#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw
+		2#0100001001#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- modud
+		2#0100001011#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- moduw
+		2#1100001001#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- modsd
+		2#1100001011#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0'), -- modsw
 		2#0010010000#  =>       (ALU,    OP_MTCRF,     NONE,       NONE,        RS,   NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf
 		2#0111010011#  =>       (ALU,    OP_MTSPR,     NONE,       NONE,        RS,   SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr
-		2#0001001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulhd
-		2#0000001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- mulhdu
-		2#0001001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mulhw
-		2#0000001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '1'), -- mulhwu
+		2#0001001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulhd
+		2#0000001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- mulhdu
+		2#0001001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mulhw
+		2#0000001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- mulhwu
                 -- next 4 have reserved bit set
-		2#1001001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulhd
-		2#1000001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- mulhdu
-		2#1001001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mulhw
-		2#1000001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '1'), -- mulhwu
-		2#0011101001#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulld
-		2#1011101001#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulldo
-		2#0011101011#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mullw
-		2#1011101011#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mullwo
+		2#1001001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulhd
+		2#1000001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- mulhdu
+		2#1001001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mulhw
+		2#1000001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- mulhwu
+		2#0011101001#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulld
+		2#1011101001#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulldo
+		2#0011101011#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mullw
+		2#1011101011#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mullwo
 		2#0111011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nand
 		2#0001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- neg
 		2#1001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nego
 		2#0001111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nor
 		2#0110111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- or
 		2#0110011100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- orc
-		2#0001111010#  =>       (ALU,    OP_POPCNTB,   NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb
-		2#0111111010#  =>       (ALU,    OP_POPCNTD,   NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd
-		2#0101111010#  =>       (ALU,    OP_POPCNTW,   NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw
-		2#0010111010#  =>       (ALU,    OP_PRTYD,     NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd
-		2#0010011010#  =>       (ALU,    OP_PRTYW,     NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw
+		2#0001111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb
+		2#0111111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd
+		2#0101111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw
+		2#0010111010#  =>       (ALU,    OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd
+		2#0010011010#  =>       (ALU,    OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw
 		-- 2#0010000000# setb
 		2#0000011011#  =>       (ALU,    OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- sld
 		2#0000011000#  =>       (ALU,    OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- slw
diff --git a/decode2.vhdl b/decode2.vhdl
index f6f7101..582fa5b 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -9,6 +9,9 @@ use work.helpers.all;
 use work.insn_helpers.all;
 
 entity decode2 is
+        generic (
+                EX1_BYPASS : boolean := true   -- when true, ALU-unit instructions drive gpr_bypassable to '1'
+        );
 	port (
 		clk   : in std_ulogic;
 		rst   : in std_ulogic;
@@ -24,9 +27,6 @@ entity decode2 is
 		d_in  : in Decode1ToDecode2Type;
 
 		e_out : out Decode2ToExecute1Type;
-		m_out : out Decode2ToMultiplyType;
-                d_out : out Decode2ToDividerType;
-		l_out : out Decode2ToLoadstore1Type;
 
 		r_in  : in RegisterFileToDecode2Type;
 		r_out : out Decode2ToRegisterFileType;
@@ -39,9 +39,6 @@ end entity decode2;
 architecture behaviour of decode2 is
 	type reg_type is record
 		e : Decode2ToExecute1Type;
-		m : Decode2ToMultiplyType;
-                d : Decode2ToDividerType;
-		l : Decode2ToLoadstore1Type;
 	end record;
 
 	signal r, rin : reg_type;
@@ -188,15 +185,19 @@ architecture behaviour of decode2 is
 
 	signal gpr_write_valid : std_ulogic;
 	signal gpr_write : gspr_index_t;
+        signal gpr_bypassable  : std_ulogic;
 
 	signal gpr_a_read_valid : std_ulogic;
 	signal gpr_a_read :gspr_index_t;
+        signal gpr_a_bypass : std_ulogic;
 
 	signal gpr_b_read_valid : std_ulogic;
 	signal gpr_b_read : gspr_index_t;
+        signal gpr_b_bypass : std_ulogic;
 
 	signal gpr_c_read_valid : std_ulogic;
 	signal gpr_c_read : gpr_index_t;
+        signal gpr_c_bypass : std_ulogic;
 
 	signal cr_write_valid : std_ulogic;
 begin
@@ -217,6 +218,7 @@ begin
 
 		gpr_write_valid_in => gpr_write_valid,
 		gpr_write_in       => gpr_write,
+                gpr_bypassable     => gpr_bypassable,
 
 		gpr_a_read_valid_in  => gpr_a_read_valid,
 		gpr_a_read_in        => gpr_a_read,
@@ -232,13 +234,17 @@ begin
 
 		valid_out   => control_valid_out,
 		stall_out   => stall_out,
-		stopped_out => stopped_out
+		stopped_out => stopped_out,
+
+                gpr_bypass_a => gpr_a_bypass,
+                gpr_bypass_b => gpr_b_bypass,
+                gpr_bypass_c => gpr_c_bypass
 	);
 
 	decode2_0: process(clk)
 	begin
 		if rising_edge(clk) then
-			if rin.e.valid = '1' or rin.l.valid = '1' or rin.m.valid = '1' or rin.d.valid = '1' then
+			if rin.e.valid = '1' then
 				report "execute " & to_hstring(rin.e.nia);
 			end if;
 			r <= rin;
@@ -259,21 +265,16 @@ begin
 		variable decoded_reg_b : decode_input_reg_t;
 		variable decoded_reg_c : decode_input_reg_t;
 		variable decoded_reg_o : decode_output_reg_t;
-                variable signed_division: std_ulogic;
                 variable length : std_ulogic_vector(3 downto 0);
 	begin
 		v := r;
 
 		v.e := Decode2ToExecute1Init;
-		v.l := Decode2ToLoadStore1Init;
-		v.m := Decode2ToMultiplyInit;
-                v.d := Decode2ToDividerInit;
 
 		mul_a := (others => '0');
 		mul_b := (others => '0');
 
 		--v.e.input_cr := d_in.decode.input_cr;
-		--v.m.input_cr := d_in.decode.input_cr;
 		--v.e.output_cr := d_in.decode.output_cr;
     
 		decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1);
@@ -303,12 +304,17 @@ begin
 		v.e.insn_type := d_in.decode.insn_type;
 		v.e.read_reg1 := decoded_reg_a.reg;
 		v.e.read_data1 := decoded_reg_a.data;
+                v.e.bypass_data1 := gpr_a_bypass;
 		v.e.read_reg2 := decoded_reg_b.reg;
 		v.e.read_data2 := decoded_reg_b.data;
+                v.e.bypass_data2 := gpr_b_bypass;
                 v.e.read_data3 := decoded_reg_c.data;
+                v.e.bypass_data3 := gpr_c_bypass;
 		v.e.write_reg := decoded_reg_o.reg;
 		v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
-		v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
+                if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then   -- mulh* have no OE form (that bit is reserved): oe keeps its Decode2ToExecute1Init value
+                        v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
+                end if;
 		v.e.cr := c_in.read_cr_data;
 		v.e.xerc := c_in.read_xerc_data;
                 v.e.invert_a := d_in.decode.invert_a;
@@ -322,102 +328,9 @@ begin
 		end if;
                 v.e.insn := d_in.insn;
                 v.e.data_len := length;
-
-		-- multiply unit
-		v.m.insn_type := d_in.decode.insn_type;
-		mul_a := decoded_reg_a.data;
-		mul_b := decoded_reg_b.data;
-		v.m.write_reg := gspr_to_gpr(decoded_reg_o.reg);
-		v.m.rc := decode_rc(d_in.decode.rc, d_in.insn);
-		v.m.xerc := c_in.read_xerc_data;
-		if v.m.insn_type = OP_MUL_L64 then
-		  v.m.oe := decode_oe(d_in.decode.rc, d_in.insn);
-		end if;
-		v.m.is_32bit := d_in.decode.is_32bit;
-
-		if d_in.decode.is_32bit = '1' then
-			if d_in.decode.is_signed = '1' then
-				v.m.data1 := (others => mul_a(31));
-				v.m.data1(31 downto 0) := mul_a(31 downto 0);
-				v.m.data2 := (others => mul_b(31));
-				v.m.data2(31 downto 0) := mul_b(31 downto 0);
-			else
-				v.m.data1 := '0' & x"00000000" & mul_a(31 downto 0);
-				v.m.data2 := '0' & x"00000000" & mul_b(31 downto 0);
-			end if;
-		else
-			if d_in.decode.is_signed = '1' then
-				v.m.data1 := mul_a(63) & mul_a;
-				v.m.data2 := mul_b(63) & mul_b;
-			else
-				v.m.data1 := '0' & mul_a;
-				v.m.data2 := '0' & mul_b;
-			end if;
-		end if;
-
-                -- divide unit
-                -- PPC divide and modulus instruction words have these bits in
-                -- the bottom 11 bits: o1dns 010t1 r
-                -- where o = OE for div instrs, signedness for mod instrs
-                --       d = 1 for div*, 0 for mod*
-                --       n = 1 for normal, 0 for extended (dividend << 32/64)
-                --       s = 1 for signed, 0 for unsigned (for div*)
-                --       t = 1 for 32-bit, 0 for 64-bit
-                --       r = RC bit (record condition code)
-		v.d.write_reg := gspr_to_gpr(decoded_reg_o.reg);
-                v.d.is_modulus := not d_in.insn(8);
-                v.d.is_32bit := d_in.insn(2);
-                if d_in.insn(8) = '1' then
-                        signed_division := d_in.insn(6);
-                else
-                        signed_division := d_in.insn(10);
-                end if;
-                v.d.is_signed := signed_division;
-                if d_in.insn(2) = '0' then
-                        -- 64-bit forms
-                        if d_in.insn(8) = '1' and d_in.insn(7) = '0' then
-                                v.d.is_extended := '1';
-                        end if;
-                        v.d.dividend := decoded_reg_a.data;
-                        v.d.divisor := decoded_reg_b.data;
-                else
-                        -- 32-bit forms
-                        if d_in.insn(8) = '1' and d_in.insn(7) = '0' then   -- extended forms
-                                v.d.dividend := decoded_reg_a.data(31 downto 0) & x"00000000";
-                        elsif signed_division = '1' and decoded_reg_a.data(31) = '1' then
-                                -- sign extend to 64 bits
-                                v.d.dividend := x"ffffffff" & decoded_reg_a.data(31 downto 0);
-                        else
-                                v.d.dividend := x"00000000" & decoded_reg_a.data(31 downto 0);
-                        end if;
-                        if signed_division = '1' and decoded_reg_b.data(31) = '1' then
-                                v.d.divisor := x"ffffffff" & decoded_reg_b.data(31 downto 0);
-                        else
-                                v.d.divisor := x"00000000" & decoded_reg_b.data(31 downto 0);
-                        end if;
-                end if;
-                v.d.rc := decode_rc(d_in.decode.rc, d_in.insn);
-		v.d.xerc := c_in.read_xerc_data;
-		v.d.oe := decode_oe(d_in.decode.rc, d_in.insn);
-
-		-- load/store unit
-		v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg);
-		v.l.addr1 := decoded_reg_a.data;
-		v.l.addr2 := decoded_reg_b.data;
-		v.l.data := decoded_reg_c.data;
-		v.l.write_reg := gspr_to_gpr(decoded_reg_o.reg);
-
-		if d_in.decode.insn_type = OP_LOAD then
-			v.l.load := '1';
-		else
-			v.l.load := '0';
-		end if;
-
-                v.l.length := length;
-		v.l.byte_reverse := d_in.decode.byte_reverse;
-		v.l.sign_extend := d_in.decode.sign_extend;
-		v.l.update := d_in.decode.update;
-		v.l.xerc := c_in.read_xerc_data;
+		v.e.byte_reverse := d_in.decode.byte_reverse;
+		v.e.sign_extend := d_in.decode.sign_extend;
+		v.e.update := d_in.decode.update;
 
 		-- issue control
 		control_valid_in <= d_in.valid;
@@ -425,6 +338,10 @@ begin
 
 		gpr_write_valid <= decoded_reg_o.reg_valid;
 		gpr_write <= decoded_reg_o.reg;
+                gpr_bypassable <= '0';   -- default: result is not marked bypassable
+                if EX1_BYPASS and d_in.decode.unit = ALU then   -- only ALU-unit instructions, and only if the generic enables it
+                        gpr_bypassable <= '1';
+                end if;
 
 		gpr_a_read_valid <= decoded_reg_a.reg_valid;
 		gpr_a_read <= decoded_reg_a.reg;
@@ -437,29 +354,13 @@ begin
 
                 cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
 
-		v.e.valid := '0';
-		v.m.valid := '0';
-                v.d.valid := '0';
-		v.l.valid := '0';
-		case d_in.decode.unit is
-		when ALU =>
-			v.e.valid := control_valid_out;
-		when LDST =>
-			v.l.valid := control_valid_out;
-		when MUL =>
-			v.m.valid := control_valid_out;
-                when DIV =>
-                        v.d.valid := control_valid_out;
-		when NONE =>
-			v.e.valid := control_valid_out;
+		v.e.valid := control_valid_out;   -- e_out is the only issue port, so valid no longer depends on the unit
+		if d_in.decode.unit = NONE then   -- unit NONE is handed on as OP_ILLEGAL
 			v.e.insn_type := OP_ILLEGAL;
-		end case;
+		end if;
 
 		if rst = '1' then
 			v.e := Decode2ToExecute1Init;
-			v.l := Decode2ToLoadStore1Init;
-			v.m := Decode2ToMultiplyInit;
-                        v.d := Decode2ToDividerInit;
 		end if;
 
 		-- Update registers
@@ -467,8 +368,5 @@ begin
 
 		-- Update outputs
 		e_out <= r.e;
-		l_out <= r.l;
-		m_out <= r.m;
-                d_out <= r.d;
 	end process;
 end architecture behaviour;
diff --git a/decode_types.vhdl b/decode_types.vhdl
index e847fcf..21d8b68 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -4,18 +4,18 @@ use ieee.std_logic_1164.all;
 package decode_types is
     type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD,
 			 OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG,
-			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB,
+			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
 			 OP_CNTZ, OP_CRAND,
 			 OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC,
 			 OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
-			 OP_DCBZ, OP_DIV, OP_EXTS,
+			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS,
 			 OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
 			 OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, OP_MCRF,
 			 OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFSPR, OP_MOD,
 			 OP_MTCRF, OP_MTSPR, OP_MUL_L64,
 			 OP_MUL_H64, OP_MUL_H32, OP_OR,
-			 OP_POPCNTB, OP_POPCNTD, OP_POPCNTW, OP_PRTYD,
-			 OP_PRTYW, OP_RLC, OP_RLCL, OP_RLCR, OP_SETB,
+			 OP_POPCNT, OP_PRTY,
+			 OP_RLC, OP_RLCL, OP_RLCR, OP_SETB,
 			 OP_SHL, OP_SHR,
 			 OP_SYNC, OP_TD, OP_TDI, OP_TW,
 			 OP_TWI, OP_XOR, OP_SIM_CONFIG
@@ -46,7 +46,7 @@ package decode_types is
 
     constant TOO_OFFSET : integer := 0;
 
-    type unit_t is (NONE, ALU, LDST, MUL, DIV);
+    type unit_t is (NONE, ALU, LDST);   -- multiply/divide/modulus instructions are decoded with unit ALU
     type length_t is (NONE, is1B, is2B, is4B, is8B);
 
     type decode_rom_t is record
diff --git a/divider.vhdl b/divider.vhdl
index affab85..aef65a4 100644
--- a/divider.vhdl
+++ b/divider.vhdl
@@ -10,8 +10,8 @@ entity divider is
     port (
         clk   : in std_logic;
         rst   : in std_logic;
-        d_in  : in Decode2ToDividerType;
-        d_out : out DividerToWritebackType
+        d_in  : in Execute1ToDividerType;   -- request; fields read here: valid, dividend, divisor, neg_result, is_extended, is_32bit, is_signed, is_modulus
+        d_out : out DividerToExecute1Type   -- response; fields driven here: valid, write_reg_data, overflow
         );
 end entity divider;
 
@@ -23,20 +23,15 @@ architecture behaviour of divider is
     signal sresult    : std_ulogic_vector(64 downto 0);
     signal oresult    : std_ulogic_vector(63 downto 0);
     signal running    : std_ulogic;
-    signal signcheck  : std_ulogic;
     signal count      : unsigned(6 downto 0);
     signal neg_result : std_ulogic;
     signal is_modulus : std_ulogic;
     signal is_32bit   : std_ulogic;
     signal extended   : std_ulogic;
     signal is_signed  : std_ulogic;
-    signal rc         : std_ulogic;
-    signal write_reg  : std_ulogic_vector(4 downto 0);
     signal overflow   : std_ulogic;
     signal ovf32      : std_ulogic;
     signal did_ovf    : std_ulogic;
-    signal oe         : std_ulogic;
-    signal xerc       : xer_common_t;
 begin
     divider_0: process(clk)
     begin
@@ -48,40 +43,22 @@ begin
                 running <= '0';
                 count <= "0000000";
             elsif d_in.valid = '1' then
-                if d_in.is_extended = '1' and not (d_in.is_signed = '1' and d_in.dividend(63) = '1') then
+                if d_in.is_extended = '1'  then   -- extended form: dividend is loaded above 64 zero bits of dend
                     dend <= '0' & d_in.dividend & x"0000000000000000";
                 else
                     dend <= '0' & x"0000000000000000" & d_in.dividend;
                 end if;
                 div <= unsigned(d_in.divisor);
                 quot <= (others => '0');
-                write_reg <= d_in.write_reg;
-                neg_result <= '0';
+                neg_result <= d_in.neg_result;   -- result sign is latched from the request, not derived from operand sign bits
                 is_modulus <= d_in.is_modulus;
                 extended <= d_in.is_extended;
                 is_32bit <= d_in.is_32bit;
                 is_signed <= d_in.is_signed;
-                rc <= d_in.rc;
-                oe <= d_in.oe;
-		xerc <= d_in.xerc;
                 count <= "1111111";
                 running <= '1';
                 overflow <= '0';
                 ovf32 <= '0';
-                signcheck <= d_in.is_signed and (d_in.dividend(63) or d_in.divisor(63));
-            elsif signcheck = '1' then
-                signcheck <= '0';
-                neg_result <= dend(63) xor (div(63) and not is_modulus);
-                if dend(63) = '1' then
-                    if extended = '1' then
-                        dend <= '0' & std_ulogic_vector(- signed(dend(63 downto 0))) & x"0000000000000000";
-                    else
-                        dend <= '0' & x"0000000000000000" & std_ulogic_vector(- signed(dend(63 downto 0)));
-                    end if;
-                end if;
-                if div(63) = '1' then
-                    div <= unsigned(- signed(div));
-                end if;
             elsif running = '1' then
                 if count = "0111111" then
                     running <= '0';
@@ -113,9 +90,6 @@ begin
 
     divider_1: process(all)
     begin
-        d_out.write_reg_nr <= write_reg;
-        d_out.rc <= rc;
-
         if is_modulus = '1' then
             result <= dend(128 downto 65);
         else
@@ -151,23 +125,9 @@ begin
         if rising_edge(clk) then
 	    d_out.valid <= '0';
             d_out.write_reg_data <= oresult;
-	    d_out.write_reg_enable <= '0';
-	    d_out.write_xerc_enable <= '0';
-	    d_out.xerc <= xerc;
+	    d_out.overflow <= did_ovf;   -- overflow is reported as a plain flag; no XER fields are produced by this unit
             if count = "1000000" then
                 d_out.valid <= '1';
-                d_out.write_reg_enable <= '1';
-		d_out.write_xerc_enable <= oe;
-
-		-- We must test oe because the RC update code in writeback
-		-- will use the xerc value to set CR0:SO so we must not clobber
-		-- xerc if OE wasn't set.
-		--
-		if oe = '1' then
-		    d_out.xerc.ov <= did_ovf;
-		    d_out.xerc.ov32 <= did_ovf;
-		    d_out.xerc.so <= xerc.so or did_ovf;
-		end if;
             end if;
         end if;
     end process;
diff --git a/divider_tb.vhdl b/divider_tb.vhdl
index 5f809bb..95156a3 100644
--- a/divider_tb.vhdl
+++ b/divider_tb.vhdl
@@ -16,8 +16,8 @@ architecture behave of divider_tb is
     signal rst              : std_ulogic;
     constant clk_period     : time := 10 ns;
 
-    signal d1               : Decode2ToDividerType;
-    signal d2               : DividerToWritebackType;
+    signal d1               : Execute1ToDividerType;
+    signal d2               : DividerToExecute1Type;
 begin
     divider_0: entity work.divider
         port map (clk => clk, rst => rst, d_in => d1, d_out => d2);
@@ -43,14 +43,13 @@ begin
         rst <= '0';
 
         d1.valid <= '1';
-        d1.write_reg <= "10001";
         d1.dividend <= x"0000000010001000";
         d1.divisor  <= x"0000000000001111";
         d1.is_signed <= '0';
         d1.is_32bit <= '0';
         d1.is_extended <= '0';
         d1.is_modulus <= '0';
-        d1.rc <= '0';
+        d1.neg_result <= '0';
 
         wait for clk_period;
         assert d2.valid = '0';
@@ -65,16 +64,12 @@ begin
         end loop;
 
         assert d2.valid = '1';
-        assert d2.write_reg_enable = '1';
-        assert d2.write_reg_nr = "10001";
         assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
-        assert d2.rc = '0';
 
         wait for clk_period;
         assert d2.valid = '0' report "valid";
 
         d1.valid <= '1';
-        d1.rc <= '1';
 
         wait for clk_period;
         assert d2.valid = '0' report "valid";
@@ -89,10 +84,7 @@ begin
         end loop;
 
         assert d2.valid = '1';
-        assert d2.write_reg_enable = '1';
-        assert d2.write_reg_nr = "10001";
         assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
-        assert d2.rc = '1';
 
         wait for clk_period;
         assert d2.valid = '0';
@@ -105,9 +97,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));   -- feed the magnitude: a negative operand is negated first
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63) xor rb(63);   -- quotient is negative when the operand signs differ
                     d1.valid <= '1';
 
                     wait for clk_period;
@@ -142,6 +135,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.valid <= '1';
 
                     wait for clk_period;
@@ -173,9 +167,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63) xor rb(63);
                     d1.is_extended <= '1';
                     d1.valid <= '1';
 
@@ -216,6 +211,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.is_extended <= '1';
                     d1.valid <= '1';
 
@@ -250,9 +246,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63) xor rb(63);
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.valid <= '1';
@@ -289,6 +286,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.valid <= '1';
@@ -322,9 +320,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 32)) & x"00000000";
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63) xor rb(63);
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.valid <= '1';
@@ -365,6 +364,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.valid <= '1';
@@ -398,9 +398,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));   -- feed the magnitude: a negative operand is negated first
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63);   -- modulus case: requested result sign follows the dividend only
                     d1.is_extended <= '0';
                     d1.is_32bit <= '0';
                     d1.is_modulus <= '1';
@@ -438,6 +439,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.is_extended <= '0';
                     d1.is_32bit <= '0';
                     d1.is_modulus <= '1';
@@ -472,9 +474,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63);
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.is_modulus <= '1';
@@ -517,6 +520,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.is_modulus <= '1';
diff --git a/execute1.vhdl b/execute1.vhdl
index 4714ec5..ae13c72 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -11,8 +11,12 @@ use work.insn_helpers.all;
 use work.ppc_fx_insns.all;
 
 entity execute1 is
+    generic (
+        EX1_BYPASS : boolean := true
+        );
     port (
 	clk   : in std_ulogic;
+        rst   : in std_ulogic;
 
 	-- asynchronous
 	flush_out : out std_ulogic;
@@ -21,6 +25,7 @@ entity execute1 is
 	e_in  : in Decode2ToExecute1Type;
 
 	-- asynchronous
+        l_out : out Execute1ToLoadstore1Type;
 	f_out : out Execute1ToFetch1Type;
 
 	e_out : out Execute1ToWritebackType;
@@ -35,10 +40,19 @@ architecture behaviour of execute1 is
 	e : Execute1ToWritebackType;
 	lr_update : std_ulogic;
 	next_lr : std_ulogic_vector(63 downto 0);
+	mul_in_progress : std_ulogic;
+        div_in_progress : std_ulogic;
+        cntz_in_progress : std_ulogic;
+	slow_op_dest : gpr_index_t;
+	slow_op_rc : std_ulogic;
+	slow_op_oe : std_ulogic;
+	slow_op_xerc : xer_common_t;
     end record;
 
     signal r, rin : reg_type;
 
+    signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
+
     signal ctrl: ctrl_t := (others => (others => '0'));
     signal ctrl_tmp: ctrl_t := (others => (others => '0'));
 
@@ -47,6 +61,16 @@ architecture behaviour of execute1 is
     signal rotator_carry: std_ulogic;
     signal logical_result: std_ulogic_vector(63 downto 0);
     signal countzero_result: std_ulogic_vector(63 downto 0);
+    signal popcnt_result: std_ulogic_vector(63 downto 0);
+    signal parity_result: std_ulogic_vector(63 downto 0);
+
+    -- multiply signals
+    signal x_to_multiply: Execute1ToMultiplyType;
+    signal multiply_to_x: MultiplyToExecute1Type;
+
+    -- divider signals
+    signal x_to_divider: Execute1ToDividerType;
+    signal divider_to_x: DividerToExecute1Type;
 
     procedure set_carry(e: inout Execute1ToWritebackType;
 			carry32 : in std_ulogic;
@@ -92,9 +116,9 @@ begin
 
     rotator_0: entity work.rotator
 	port map (
-	    rs => e_in.read_data3,
-	    ra => e_in.read_data1,
-	    shift => e_in.read_data2(6 downto 0),
+	    rs => c_in,
+	    ra => a_in,
+	    shift => b_in(6 downto 0),
 	    insn => e_in.insn,
 	    is_32bit => e_in.is_32bit,
 	    right_shift => right_shift,
@@ -107,22 +131,45 @@ begin
 
     logical_0: entity work.logical
 	port map (
-	    rs => e_in.read_data3,
-	    rb => e_in.read_data2,
+	    rs => c_in,
+	    rb => b_in,
 	    op => e_in.insn_type,
 	    invert_in => e_in.invert_a,
 	    invert_out => e_in.invert_out,
-	    result => logical_result
+	    result => logical_result,
+            datalen => e_in.data_len,
+            popcnt => popcnt_result,
+            parity => parity_result
 	    );
 
     countzero_0: entity work.zero_counter
 	port map (
-	    rs => e_in.read_data3,
+            clk => clk,
+	    rs => c_in,
 	    count_right => e_in.insn(10),
 	    is_32bit => e_in.is_32bit,
 	    result => countzero_result
 	    );
 
+    multiply_0: entity work.multiply
+        port map (
+            clk => clk,
+            m_in => x_to_multiply,
+            m_out => multiply_to_x
+            );
+
+    divider_0: entity work.divider
+        port map (
+            clk => clk,
+            rst => rst,
+            d_in => x_to_divider,
+            d_out => divider_to_x
+            );
+
+    a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1;
+    b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2;
+    c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3;
+
     execute1_0: process(clk)
     begin
 	if rising_edge(clk) then
@@ -159,6 +206,14 @@ begin
 	variable l : std_ulogic;
 	variable next_nia : std_ulogic_vector(63 downto 0);
         variable carry_32, carry_64 : std_ulogic;
+        variable sign1, sign2 : std_ulogic;
+        variable abs1, abs2 : signed(63 downto 0);
+	variable overflow : std_ulogic;
+	variable negative : std_ulogic;
+        variable zerohi, zerolo : std_ulogic;
+        variable msb_a, msb_b : std_ulogic;
+        variable a_lt : std_ulogic;
+        variable lv : Execute1ToLoadstore1Type;
     begin
 	result := (others => '0');
 	result_with_carry := (others => '0');
@@ -204,6 +259,83 @@ begin
 	end if;
 
 	v.lr_update := '0';
+	v.mul_in_progress := '0';
+        v.div_in_progress := '0';
+        v.cntz_in_progress := '0';
+
+	-- signals to multiply unit
+	x_to_multiply <= Execute1ToMultiplyInit;
+	x_to_multiply.insn_type <= e_in.insn_type;
+	x_to_multiply.is_32bit <= e_in.is_32bit;
+
+	if e_in.is_32bit = '1' then
+	    if e_in.is_signed = '1' then
+		x_to_multiply.data1 <= (others => a_in(31));
+		x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0);
+		x_to_multiply.data2 <= (others => b_in(31));
+		x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0);
+	    else
+		x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0);
+		x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0);
+	    end if;
+	else
+	    if e_in.is_signed = '1' then
+		x_to_multiply.data1 <= a_in(63) & a_in;
+		x_to_multiply.data2 <= b_in(63) & b_in;
+	    else
+		x_to_multiply.data1 <= '0' & a_in;
+		x_to_multiply.data2 <= '0' & b_in;
+	    end if;
+	end if;
+
+        -- signals to divide unit
+        sign1 := '0';
+        sign2 := '0';
+        if e_in.is_signed = '1' then
+            if e_in.is_32bit = '1' then
+                sign1 := a_in(31);
+                sign2 := b_in(31);
+            else
+                sign1 := a_in(63);
+                sign2 := b_in(63);
+            end if;
+        end if;
+        -- take absolute values
+        if sign1 = '0' then
+            abs1 := signed(a_in);
+        else
+            abs1 := - signed(a_in);
+        end if;
+        if sign2 = '0' then
+            abs2 := signed(b_in);
+        else
+            abs2 := - signed(b_in);
+        end if;
+
+        x_to_divider <= Execute1ToDividerInit;
+        x_to_divider.is_signed <= e_in.is_signed;
+	x_to_divider.is_32bit <= e_in.is_32bit;
+        if e_in.insn_type = OP_MOD then
+            x_to_divider.is_modulus <= '1';
+        end if;
+        x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
+        if e_in.is_32bit = '0' then
+            -- 64-bit forms
+            if e_in.insn_type = OP_DIVE then
+                x_to_divider.is_extended <= '1';
+            end if;
+            x_to_divider.dividend <= std_ulogic_vector(abs1);
+            x_to_divider.divisor <= std_ulogic_vector(abs2);
+        else
+            -- 32-bit forms
+            x_to_divider.is_extended <= '0';
+            if e_in.insn_type = OP_DIVE then   -- extended forms
+                x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
+            else
+                x_to_divider.dividend <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
+            end if;
+            x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
+        end if;
 
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
@@ -226,8 +358,10 @@ begin
 
 	    v.e.valid := '1';
 	    v.e.write_reg := e_in.write_reg;
-	    v.e.write_len := x"8";
-	    v.e.sign_extend := '0';
+	    v.slow_op_dest := gspr_to_gpr(e_in.write_reg);
+	    v.slow_op_rc := e_in.rc;
+	    v.slow_op_oe := e_in.oe;
+	    v.slow_op_xerc := v.e.xerc;
 
 	    case_0: case e_in.insn_type is
 
@@ -236,51 +370,93 @@ begin
 		report "illegal";
 	    when OP_NOP =>
 		-- Do nothing
-	    when OP_ADD =>
+	    when OP_ADD | OP_CMP =>
 		if e_in.invert_a = '0' then
-		    a_inv := e_in.read_data1;
+		    a_inv := a_in;
 		else
-		    a_inv := not e_in.read_data1;
+		    a_inv := not a_in;
 		end if;
-		result_with_carry := ppc_adde(a_inv, e_in.read_data2,
+		result_with_carry := ppc_adde(a_inv, b_in,
 					      decode_input_carry(e_in.input_carry, v.e.xerc));
 		result := result_with_carry(63 downto 0);
-                carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32);
+                carry_32 := result(32) xor a_inv(32) xor b_in(32);
                 carry_64 := result_with_carry(64);
-		if e_in.output_carry = '1' then
-		    set_carry(v.e, carry_32, carry_64);
-		end if;
-		if e_in.oe = '1' then
-		    set_ov(v.e,
-			   calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)),
-			   calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31)));
-		end if;
-		result_en := '1';
+                if e_in.insn_type = OP_ADD then
+                    if e_in.output_carry = '1' then
+                        set_carry(v.e, carry_32, carry_64);
+                    end if;
+                    if e_in.oe = '1' then
+                        set_ov(v.e,
+                               calc_ov(a_inv(63), b_in(63), carry_64, result_with_carry(63)),
+                               calc_ov(a_inv(31), b_in(31), carry_32, result_with_carry(31)));
+                    end if;
+                    result_en := '1';
+                else
+                    -- CMP and CMPL instructions
+                    -- Note, we have done RB - RA, not RA - RB
+                    bf := insn_bf(e_in.insn);
+                    l := insn_l(e_in.insn);
+                    v.e.write_cr_enable := '1';
+                    crnum := to_integer(unsigned(bf));
+                    v.e.write_cr_mask := num_to_fxm(crnum);
+                    zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0)));
+                    zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32)));
+                    if zerolo = '1' and (l = '0' or zerohi = '1') then
+                        -- values are equal
+                        newcrf := "001" & v.e.xerc.so;
+                    else
+                        if l = '1' then
+                            -- 64-bit comparison
+                            msb_a := a_in(63);
+                            msb_b := b_in(63);
+                        else
+                            -- 32-bit comparison
+                            msb_a := a_in(31);
+                            msb_b := b_in(31);
+                        end if;
+                        if msb_a /= msb_b then
+                            -- Subtraction might overflow, but
+                            -- comparison is clear from MSB difference.
+                            -- For signed, the operand with MSB = 0 is greater; for unsigned, the one with MSB = 1
+                            a_lt := msb_a xnor e_in.is_signed;
+                        else
+                            -- Subtraction cannot overflow since MSBs are equal.
+                            -- carry = 1 indicates RA is smaller (signed or unsigned)
+                            a_lt := (not l and carry_32) or (l and carry_64);
+                        end if;
+                        newcrf := a_lt & not a_lt & '0' & v.e.xerc.so;
+                    end if;
+                    for i in 0 to 7 loop
+                        lo := i*4;
+                        hi := lo + 3;
+                        v.e.write_cr_data(hi downto lo) := newcrf;
+                    end loop;
+                end if;
 	    when OP_AND | OP_OR | OP_XOR =>
 		result := logical_result;
 		result_en := '1';
 	    when OP_B =>
 		f_out.redirect <= '1';
 		if (insn_aa(e_in.insn)) then
-		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2));
+		    f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
 		else
-		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2));
+		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
 		end if;
 	    when OP_BC =>
 		-- read_data1 is CTR
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
 		if bo(4-2) = '0' then
-		    result := std_ulogic_vector(unsigned(e_in.read_data1) - 1);
+		    result := std_ulogic_vector(unsigned(a_in) - 1);
 		    result_en := '1';
 		    v.e.write_reg := fast_spr_num(SPR_CTR);
 		end if;
-		if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then
+		if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
 		    f_out.redirect <= '1';
 		    if (insn_aa(e_in.insn)) then
-			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2));
+			f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
 		    else
-			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2));
+			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
 		    end if;
 		end if;
 	    when OP_BCREG =>
@@ -289,53 +465,41 @@ begin
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
 		if bo(4-2) = '0' and e_in.insn(10) = '0' then
-		    result := std_ulogic_vector(unsigned(e_in.read_data1) - 1);
+		    result := std_ulogic_vector(unsigned(a_in) - 1);
 		    result_en := '1';
 		    v.e.write_reg := fast_spr_num(SPR_CTR);
 		end if;
-		if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then
+		if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
 		    f_out.redirect <= '1';
-		    f_out.redirect_nia <= e_in.read_data2(63 downto 2) & "00";
+		    f_out.redirect_nia <= b_in(63 downto 2) & "00";
 		end if;
 	    when OP_CMPB =>
-		result := ppc_cmpb(e_in.read_data3, e_in.read_data2);
+		result := ppc_cmpb(c_in, b_in);
 		result_en := '1';
-	    when OP_CMP =>
-		bf := insn_bf(e_in.insn);
-		l := insn_l(e_in.insn);
-		v.e.write_cr_enable := '1';
-		crnum := to_integer(unsigned(bf));
-		v.e.write_cr_mask := num_to_fxm(crnum);
-		for i in 0 to 7 loop
-		    lo := i*4;
-		    hi := lo + 3;
-		    v.e.write_cr_data(hi downto lo) := ppc_cmp(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so);
-		end loop;
-	    when OP_CMPL =>
-		bf := insn_bf(e_in.insn);
-		l := insn_l(e_in.insn);
-		v.e.write_cr_enable := '1';
-		crnum := to_integer(unsigned(bf));
-		v.e.write_cr_mask := num_to_fxm(crnum);
-		for i in 0 to 7 loop
-		    lo := i*4;
-		    hi := lo + 3;
-		    v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so);
-		end loop;
-	    when OP_CNTZ =>
-		result := countzero_result;
-		result_en := '1';
-	    when OP_EXTS =>
-		v.e.write_len := e_in.data_len;
-		v.e.sign_extend := '1';
-		result := e_in.read_data3;
+            when OP_CNTZ =>
+                v.e.valid := '0';
+                v.cntz_in_progress := '1';
+                stall_out <= '1';
+            when OP_EXTS =>
+                -- note data_len is a 1-hot encoding
+		negative := (e_in.data_len(0) and c_in(7)) or
+			    (e_in.data_len(1) and c_in(15)) or
+			    (e_in.data_len(2) and c_in(31));
+		result := (others => negative);
+		if e_in.data_len(2) = '1' then
+		    result(31 downto 16) := c_in(31 downto 16);
+		end if;
+		if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then
+		    result(15 downto 8) := c_in(15 downto 8);
+		end if;
+		result(7 downto 0) := c_in(7 downto 0);
 		result_en := '1';
 	    when OP_ISEL =>
 		crbit := to_integer(unsigned(insn_bc(e_in.insn)));
 		if e_in.cr(31-crbit) = '1' then
-		    result := e_in.read_data1;
+		    result := a_in;
 		else
-		    result := e_in.read_data2;
+		    result := b_in;
 		end if;
 		result_en := '1';
 	    when OP_MCRF =>
@@ -400,7 +564,7 @@ begin
 		end if;
 	    when OP_MFSPR =>
 		if is_fast_spr(e_in.read_reg1) then
-		    result := e_in.read_data1;
+		    result := a_in;
 		    if decode_spr_num(e_in.insn) = SPR_XER then
 			-- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer
 			result(63 downto 32) := (others => '0');
@@ -447,19 +611,19 @@ begin
 		    crnum := fxm_to_num(insn_fxm(e_in.insn));
 		    v.e.write_cr_mask := num_to_fxm(crnum);
 		end if;
-		v.e.write_cr_data := e_in.read_data3(31 downto 0);
+		v.e.write_cr_data := c_in(31 downto 0);
 	    when OP_MTSPR =>
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
-		    "=" & to_hstring(e_in.read_data3);
+		    "=" & to_hstring(c_in);
 		if is_fast_spr(e_in.write_reg) then
-		    result := e_in.read_data3;
+		    result := c_in;
 		    result_en := '1';
 		    if decode_spr_num(e_in.insn) = SPR_XER then
-			v.e.xerc.so := e_in.read_data3(63-32);
-			v.e.xerc.ov := e_in.read_data3(63-33);
-			v.e.xerc.ca := e_in.read_data3(63-34);
-			v.e.xerc.ov32 := e_in.read_data3(63-44);
-			v.e.xerc.ca32 := e_in.read_data3(63-45);
+			v.e.xerc.so := c_in(63-32);
+			v.e.xerc.ov := c_in(63-33);
+			v.e.xerc.ca := c_in(63-34);
+			v.e.xerc.ov32 := c_in(63-44);
+			v.e.xerc.ca32 := c_in(63-45);
 			v.e.write_xerc_enable := '1';
 		    end if;
 		else
@@ -468,20 +632,11 @@ begin
 --		    when others =>
 --		    end case;
 		end if;
-	    when OP_POPCNTB =>
-		result := ppc_popcntb(e_in.read_data3);
+	    when OP_POPCNT =>
+		result := popcnt_result;
 		result_en := '1';
-	    when OP_POPCNTW =>
-		result := ppc_popcntw(e_in.read_data3);
-		result_en := '1';
-	    when OP_POPCNTD =>
-		result := ppc_popcntd(e_in.read_data3);
-		result_en := '1';
-	    when OP_PRTYD =>
-		result := ppc_prtyd(e_in.read_data3);
-		result_en := '1';
-	    when OP_PRTYW =>
-		result := ppc_prtyw(e_in.read_data3);
+	    when OP_PRTY =>
+		result := parity_result;
 		result_en := '1';
 	    when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR =>
 		result := rotator_result;
@@ -506,11 +661,29 @@ begin
 	    when OP_ICBI =>
 		icache_inval <= '1';
 
-	    when others =>
+	    when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 =>
+		v.e.valid := '0';
+		v.mul_in_progress := '1';
+		stall_out <= '1';
+		x_to_multiply.valid <= '1';
+
+	    when OP_DIV | OP_DIVE | OP_MOD =>
+		v.e.valid := '0';
+		v.div_in_progress := '1';
+		stall_out <= '1';
+		x_to_divider.valid <= '1';
+
+            when OP_LOAD | OP_STORE =>
+                -- loadstore/dcache has its own port to writeback
+                v.e.valid := '0';
+
+            when others =>
 		terminate_out <= '1';
 		report "illegal";
 	    end case;
 
+	    v.e.rc := e_in.rc and e_in.valid;
+
 	    -- Update LR on the next cycle after a branch link
 	    --
 	    -- WARNING: The LR update isn't tracked by our hazard tracker. This
@@ -533,20 +706,74 @@ begin
 	    result_en := '1';
 	    result := r.next_lr;
 	    v.e.write_reg := fast_spr_num(SPR_LR);
-	    v.e.write_len := x"8";
-	    v.e.sign_extend := '0';
 	    v.e.valid := '1';
+        elsif r.cntz_in_progress = '1' then
+            -- cnt[lt]z always takes two cycles
+            result := countzero_result;
+            result_en := '1';
+            v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
+            v.e.rc := v.slow_op_rc;
+            v.e.xerc := v.slow_op_xerc;
+            v.e.valid := '1';
+	elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
+	    if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
+	       (r.div_in_progress = '1' and divider_to_x.valid = '1') then
+		if r.mul_in_progress = '1' then
+		    result := multiply_to_x.write_reg_data;
+		    overflow := multiply_to_x.overflow;
+		else
+		    result := divider_to_x.write_reg_data;
+		    overflow := divider_to_x.overflow;
+		end if;
+		result_en := '1';
+		v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
+		v.e.rc := v.slow_op_rc;
+		v.e.xerc := v.slow_op_xerc;
+		v.e.write_xerc_enable := v.slow_op_oe;
+		-- We must test oe because the RC update code in writeback
+		-- will use the xerc value to set CR0:SO so we must not clobber
+		-- xerc if OE wasn't set.
+		if v.slow_op_oe = '1' then
+		    v.e.xerc.ov := overflow;
+		    v.e.xerc.ov32 := overflow;
+		    v.e.xerc.so := v.slow_op_xerc.so or overflow;
+		end if;
+		v.e.valid := '1';
+	    else
+		stall_out <= '1';
+		v.mul_in_progress := r.mul_in_progress;
+		v.div_in_progress := r.div_in_progress;
+	    end if;
 	end if;
 
 	v.e.write_data := result;
 	v.e.write_enable := result_en;
-	v.e.rc := e_in.rc and e_in.valid;
+
+        -- Outputs to loadstore1 (async)
+        lv := Execute1ToLoadstore1Init;
+        if e_in.valid = '1' and (e_in.insn_type = OP_LOAD or e_in.insn_type = OP_STORE) then
+            lv.valid := '1';
+        end if;
+        if e_in.insn_type = OP_LOAD then
+            lv.load := '1';
+        end if;
+        lv.addr1 := a_in;
+        lv.addr2 := b_in;
+        lv.data := c_in;
+        lv.write_reg := gspr_to_gpr(e_in.write_reg);
+        lv.length := e_in.data_len;
+        lv.byte_reverse := e_in.byte_reverse;
+        lv.sign_extend := e_in.sign_extend;
+        lv.update := e_in.update;
+        lv.update_reg := gspr_to_gpr(e_in.read_reg1);
+        lv.xerc := v.e.xerc;
 
 	-- Update registers
 	rin <= v;
 
 	-- update outputs
 	--f_out <= r.f;
+        l_out <= lv;
 	e_out <= r.e;
 	flush_out <= f_out.redirect;
     end process;
diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl
index 705e69d..de4f7d2 100644
--- a/gpr_hazard.vhdl
+++ b/gpr_hazard.vhdl
@@ -12,18 +12,21 @@ entity gpr_hazard is
 
         gpr_write_valid_in : in std_ulogic;
         gpr_write_in       : in std_ulogic_vector(5 downto 0);
+        bypass_avail       : in std_ulogic;
         gpr_read_valid_in  : in std_ulogic;
         gpr_read_in        : in std_ulogic_vector(5 downto 0);
 
-        stall_out          : out std_ulogic
+        stall_out          : out std_ulogic;
+        use_bypass         : out std_ulogic
         );
 end entity gpr_hazard;
 architecture behaviour of gpr_hazard is
     type pipeline_entry_type is record
-        valid : std_ulogic;
-        gpr   : std_ulogic_vector(5 downto 0);
+        valid  : std_ulogic;
+        bypass : std_ulogic;
+        gpr    : std_ulogic_vector(5 downto 0);
     end record;
-    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', gpr => (others => '0'));
+    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'));
 
     type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type;
     constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init);
@@ -33,9 +36,7 @@ begin
     gpr_hazard0: process(clk)
     begin
         if rising_edge(clk) then
-	    if stall_in = '0' then
-		r <= rin;
-	    end if;
+            r <= rin;
         end if;
     end process;
 
@@ -45,22 +46,49 @@ begin
         v := r;
 
         stall_out <= '0';
-        loop_0: for i in 0 to PIPELINE_DEPTH-1 loop
-            if ((r(i).valid = gpr_read_valid_in) and r(i).gpr = gpr_read_in) then
-                stall_out <= '1';
+        use_bypass <= '0';
+        if gpr_read_valid_in = '1' then
+            if r(0).valid = '1' and r(0).gpr = gpr_read_in then
+                if r(0).bypass = '1' and stall_in = '0' then
+                    use_bypass <= '1';
+                else
+                    stall_out <= '1';
+                end if;
             end if;
-        end loop;
+            loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
+                if r(i).valid = '1' and r(i).gpr = gpr_read_in then
+                    if r(i).bypass = '1' then
+                        use_bypass <= '1';
+                    else
+                        stall_out <= '1';
+                    end if;
+                end if;
+            end loop;
+        end if;
 
-        v(0).valid := gpr_write_valid_in;
-        v(0).gpr   := gpr_write_in;
-        loop_1: for i in 0 to PIPELINE_DEPTH-2 loop
-            -- propagate to next slot
-            v(i+1) := r(i);
-        end loop;
+        if stall_in = '0' then
+            v(0).valid  := gpr_write_valid_in;
+            v(0).bypass := bypass_avail;
+            v(0).gpr    := gpr_write_in;
+            loop_1: for i in 1 to PIPELINE_DEPTH-1 loop
+                -- propagate to next slot
+                v(i).valid  := r(i-1).valid;
+                v(i).bypass := r(i-1).bypass;
+                v(i).gpr    := r(i-1).gpr;
+            end loop;
 
-        -- asynchronous output
-        if gpr_read_valid_in = '0' then
-            stall_out <= '0';
+        else
+            -- stage 0 stalled, so stage 1 becomes empty
+            loop_1b: for i in 1 to PIPELINE_DEPTH-1 loop
+                -- slot 1 gets a bubble; later slots take the previous slot's entry
+                if i = 1 then
+                    v(i).valid := '0';
+                else
+                    v(i).valid  := r(i-1).valid;
+                    v(i).bypass := r(i-1).bypass;
+                    v(i).gpr    := r(i-1).gpr;
+                end if;
+            end loop;
         end if;
 
         -- update registers
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 1c16c46..5b61d4c 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -13,7 +13,7 @@ entity loadstore1 is
     port (
         clk   : in std_ulogic;
 
-        l_in  : in Decode2ToLoadstore1Type;
+        l_in  : in Execute1ToLoadstore1Type;
 
         l_out : out Loadstore1ToDcacheType
         );
diff --git a/logical.vhdl b/logical.vhdl
index b92b98d..4dfc13d 100644
--- a/logical.vhdl
+++ b/logical.vhdl
@@ -12,11 +12,29 @@ entity logical is
         op         : in insn_type_t;
         invert_in  : in std_ulogic;
         invert_out : in std_ulogic;
-        result     : out std_ulogic_vector(63 downto 0)
+        result     : out std_ulogic_vector(63 downto 0);
+        datalen    : in std_logic_vector(3 downto 0);
+        popcnt     : out std_ulogic_vector(63 downto 0);
+        parity     : out std_ulogic_vector(63 downto 0)
         );
 end entity logical;
 
 architecture behaviour of logical is
+
+    subtype twobit is unsigned(1 downto 0);
+    type twobit32 is array(0 to 31) of twobit;
+    signal pc2      : twobit32;
+    subtype threebit is unsigned(2 downto 0);
+    type threebit16 is array(0 to 15) of threebit;
+    signal pc4      : threebit16;
+    subtype fourbit is unsigned(3 downto 0);
+    type fourbit8 is array(0 to 7) of fourbit;
+    signal pc8      : fourbit8;
+    subtype sixbit is unsigned(5 downto 0);
+    type sixbit2 is array(0 to 1) of sixbit;
+    signal pc32     : sixbit2;
+    signal par0, par1 : std_ulogic;
+
 begin
     logical_0: process(all)
         variable rb_adj, tmp : std_ulogic_vector(63 downto 0);
@@ -40,5 +58,45 @@ begin
             result <= not tmp;
         end if;
 
+        -- population counts
+        for i in 0 to 31 loop
+            pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1));
+        end loop;
+        for i in 0 to 15 loop
+            pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1));
+        end loop;
+        for i in 0 to 7 loop
+            pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1));
+        end loop;
+        for i in 0 to 1 loop
+            pc32(i) <= ("00" & pc8(i * 4)) + ("00" & pc8(i * 4 + 1)) +
+                       ("00" & pc8(i * 4 + 2)) + ("00" & pc8(i * 4 + 3));
+        end loop;
+        popcnt <= (others => '0');
+        if datalen(3 downto 2) = "00" then
+            -- popcntb
+            for i in 0 to 7 loop
+                popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8(i));
+            end loop;
+        elsif datalen(3) = '0' then
+            -- popcntw
+            for i in 0 to 1 loop
+                popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i));
+            end loop;
+        else
+            popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1)));
+        end if;
+
+        -- parity calculations
+        par0 <= rs(0) xor rs(8) xor rs(16) xor rs(24);
+        par1 <= rs(32) xor rs(40) xor rs(48) xor rs(56);
+        parity <= (others => '0');
+        if datalen(3) = '1' then
+            parity(0) <= par0 xor par1;
+        else
+            parity(0) <= par0;
+            parity(32) <= par1;
+        end if;
+
     end process;
 end behaviour;
diff --git a/multiply.vhdl b/multiply.vhdl
index 23339b5..959c114 100644
--- a/multiply.vhdl
+++ b/multiply.vhdl
@@ -13,31 +13,24 @@ entity multiply is
     port (
         clk   : in std_logic;
 
-        m_in  : in Decode2ToMultiplyType;
-        m_out : out MultiplyToWritebackType
+        m_in  : in Execute1ToMultiplyType;
+        m_out : out MultiplyToExecute1Type
         );
 end entity multiply;
 
 architecture behaviour of multiply is
-    signal m: Decode2ToMultiplyType;
+    signal m: Execute1ToMultiplyType;
 
     type multiply_pipeline_stage is record
         valid     : std_ulogic;
         insn_type  : insn_type_t;
         data      : signed(129 downto 0);
-        write_reg : std_ulogic_vector(4 downto 0);
-        rc        : std_ulogic;
-	oe        : std_ulogic;
 	is_32bit  : std_ulogic;
-	xerc      : xer_common_t;
     end record;
     constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
 								     insn_type => OP_ILLEGAL,
-								     rc => '0', oe => '0',
 								     is_32bit => '0',
-								     xerc => xerc_init,
-								     data => (others => '0'),
-								     others => (others => '0'));
+								     data => (others => '0'));
 
     type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
     constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit);
@@ -64,16 +57,12 @@ begin
     begin
         v := r;
 
-        m_out <= MultiplyToWritebackInit;
+        m_out <= MultiplyToExecute1Init;
 
         v.multiply_pipeline(0).valid := m.valid;
         v.multiply_pipeline(0).insn_type := m.insn_type;
         v.multiply_pipeline(0).data := signed(m.data1) * signed(m.data2);
-        v.multiply_pipeline(0).write_reg := m.write_reg;
-        v.multiply_pipeline(0).rc := m.rc;
-        v.multiply_pipeline(0).oe := m.oe;
         v.multiply_pipeline(0).is_32bit := m.is_32bit;
-        v.multiply_pipeline(0).xerc := m.xerc;
 
         loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
             v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
@@ -101,25 +90,10 @@ begin
         end case;
 
         m_out.write_reg_data <= d2;
-        m_out.write_reg_nr <= v.multiply_pipeline(PIPELINE_DEPTH-1).write_reg;
-	m_out.xerc <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc;
+        m_out.overflow <= ov;
 
-	-- Generate OV/OV32/SO when OE=1
         if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then
             m_out.valid <= '1';
-            m_out.write_reg_enable <= '1';
-            m_out.rc <= v.multiply_pipeline(PIPELINE_DEPTH-1).rc;
-            m_out.write_xerc_enable <= v.multiply_pipeline(PIPELINE_DEPTH-1).oe;
-
-	    -- We must test oe because the RC update code in writeback
-	    -- will use the xerc value to set CR0:SO so we must not clobber
-	    -- xerc if OE wasn't set.
-	    --
-	    if v.multiply_pipeline(PIPELINE_DEPTH-1).oe = '1' then
-		m_out.xerc.ov <= ov;
-		m_out.xerc.ov32 <= ov;
-		m_out.xerc.so <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc.so or ov;
-	    end if;
         end if;
 
         rin <= v;
diff --git a/multiply_tb.vhdl b/multiply_tb.vhdl
index 48f83ab..8f1d795 100644
--- a/multiply_tb.vhdl
+++ b/multiply_tb.vhdl
@@ -17,8 +17,8 @@ architecture behave of multiply_tb is
 
     constant pipeline_depth : integer := 4;
 
-    signal m1               : Decode2ToMultiplyType;
-    signal m2               : MultiplyToWritebackType;
+    signal m1               : Execute1ToMultiplyType;
+    signal m2               : MultiplyToExecute1Type;
 begin
     multiply_0: entity work.multiply
         generic map (PIPELINE_DEPTH => pipeline_depth)
@@ -40,10 +40,8 @@ begin
 
         m1.valid <= '1';
         m1.insn_type <= OP_MUL_L64;
-        m1.write_reg <= "10001";
         m1.data1 <= '0' & x"0000000000001000";
         m1.data2 <= '0' & x"0000000000001111";
-        m1.rc <= '0';
 
         wait for clk_period;
         assert m2.valid = '0';
@@ -58,16 +56,12 @@ begin
 
         wait for clk_period;
         assert m2.valid = '1';
-        assert m2.write_reg_enable = '1';
-        assert m2.write_reg_nr = "10001";
         assert m2.write_reg_data = x"0000000001111000";
-        assert m2.rc = '0';
 
         wait for clk_period;
         assert m2.valid = '0';
 
         m1.valid <= '1';
-        m1.rc <= '1';
 
         wait for clk_period;
         assert m2.valid = '0';
@@ -76,10 +70,7 @@ begin
 
         wait for clk_period * (pipeline_depth-1);
         assert m2.valid = '1';
-        assert m2.write_reg_enable = '1';
-        assert m2.write_reg_nr = "10001";
         assert m2.write_reg_data = x"0000000001111000";
-        assert m2.rc = '1';
 
         -- test mulld
         mulld_loop : for i in 0 to 1000 loop
diff --git a/writeback.vhdl b/writeback.vhdl
index 8582166..e53f46b 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -12,8 +12,6 @@ entity writeback is
 
         e_in         : in Execute1ToWritebackType;
         l_in         : in DcacheToWritebackType;
-        m_in         : in MultiplyToWritebackType;
-        d_in         : in DividerToWritebackType;
 
         w_out        : out WritebackToRegisterFileType;
         c_out        : out WritebackToCrFileType;
@@ -44,7 +42,6 @@ architecture behaviour of writeback is
     signal sign_extend : std_ulogic;
     signal negative : std_ulogic;
     signal second_word : std_ulogic;
-    signal zero : std_ulogic;
 begin
     writeback_0: process(clk)
     begin
@@ -64,44 +61,32 @@ begin
         variable k : unsigned(3 downto 0);
 	variable cf: std_ulogic_vector(3 downto 0);
 	variable xe: xer_common_t;
+        variable zero : std_ulogic;
+        variable sign : std_ulogic;
     begin
         x := "" & e_in.valid;
         y := "" & l_in.valid;
-        z := "" & m_in.valid;
-        w := "" & d_in.valid;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
 
         x := "" & e_in.write_enable;
         y := "" & l_in.write_enable;
-        z := "" & m_in.write_reg_enable;
-        w := "" & d_in.write_reg_enable;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
 
         w := "" & e_in.write_cr_enable;
         x := "" & (e_in.write_enable and e_in.rc);
-        y := "" & (m_in.valid and m_in.rc);
-        z := "" & (d_in.valid and d_in.rc);
-        assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
-
-        x := "" & e_in.write_xerc_enable;
-        y := "" & m_in.write_xerc_enable;
-        z := "" & D_in.write_xerc_enable;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
+        assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure;
 
         w_out <= WritebackToRegisterFileInit;
         c_out <= WritebackToCrFileInit;
 
         complete_out <= '0';
-        if e_in.valid = '1' or l_in.valid = '1' or m_in.valid = '1' or d_in.valid = '1' then
+        if e_in.valid = '1' or l_in.valid = '1' then
             complete_out <= '1';
         end if;
 
         rc <= '0';
         brev_lenm1 <= "000";
-        byte_offset <= "000";
-        data_len <= x"8";
         partial_write <= '0';
-        sign_extend <= '0';
         second_word <= '0';
 	xe := e_in.xerc;
 	data_in <= (others => '0');
@@ -109,9 +94,6 @@ begin
         if e_in.write_enable = '1' then
             w_out.write_reg <= e_in.write_reg;
             w_out.write_enable <= '1';
-	    data_in <= e_in.write_data;
-            data_len <= unsigned(e_in.write_len);
-            sign_extend <= e_in.sign_extend;
             rc <= e_in.rc;
         end if;
 
@@ -126,12 +108,11 @@ begin
             c_out.write_xerc_data <= e_in.xerc;
 	end if;
 
+        sign_extend <= l_in.sign_extend;
+        data_len <= unsigned(l_in.write_len);
+        byte_offset <= unsigned(l_in.write_shift);
 	if l_in.write_enable = '1' then
             w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
-            data_in <= l_in.write_data;
-            data_len <= unsigned(l_in.write_len);
-            byte_offset <= unsigned(l_in.write_shift);
-            sign_extend <= l_in.sign_extend;
             if l_in.byte_reverse = '1' then
                 brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1;
             end if;
@@ -143,32 +124,6 @@ begin
 	    xe := l_in.xerc;
         end if;
 
-        if m_in.write_reg_enable = '1' then
-            w_out.write_enable <= '1';
-            w_out.write_reg <= gpr_to_gspr(m_in.write_reg_nr);
-            data_in <= m_in.write_reg_data;
-            rc <= m_in.rc;
-	    xe := m_in.xerc;
-        end if;
-
-	if m_in.write_xerc_enable = '1' then
-            c_out.write_xerc_enable <= '1';
-            c_out.write_xerc_data <= m_in.xerc;
-	end if;
-
-        if d_in.write_reg_enable = '1' then
-            w_out.write_enable <= '1';
-            w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr);
-            data_in <= d_in.write_reg_data;
-            rc <= d_in.rc;
-	    xe := d_in.xerc;
-        end if;
-
-	if d_in.write_xerc_enable = '1' then
-            c_out.write_xerc_enable <= '1';
-            c_out.write_xerc_data <= d_in.xerc;
-	end if;
-
         -- shift and byte-reverse data bytes
         for i in 0 to 7 loop
             k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
@@ -177,7 +132,7 @@ begin
         end loop;
         for i in 0 to 7 loop
             j := to_integer(perm(i)) * 8;
-            data_permuted(i * 8 + 7 downto i * 8) <= data_in(j + 7 downto j);
+            data_permuted(i * 8 + 7 downto i * 8) <= l_in.write_data(j + 7 downto j);
         end loop;
 
         -- If the data can arrive split over two cycles, this will be correct
@@ -199,16 +154,12 @@ begin
                 trim_ctl(i) <= '0' & (negative and sign_extend);
             end if;
         end loop;
-	zero <= not negative;
         for i in 0 to 7 loop
             case trim_ctl(i) is
                 when "11" =>
                     data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8);
                 when "10" =>
                     data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8);
-		    if or data_permuted(i * 8 + 7 downto i * 8) /= '0' then
-			zero <= '0';
-		    end if;
                 when "01" =>
                     data_trimmed(i * 8 + 7 downto i * 8) <= x"FF";
                 when others =>
@@ -217,14 +168,21 @@ begin
         end loop;
 
         -- deliver to regfile
-        w_out.write_data <= data_trimmed;
+        if l_in.write_enable = '1' then
+            w_out.write_data <= data_trimmed;
+        else
+            w_out.write_data <= e_in.write_data;
+        end if;
 
         -- Perform CR0 update for RC forms
+        -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data
         if rc = '1' then
+            sign := e_in.write_data(63);
+            zero := not (or e_in.write_data);
             c_out.write_cr_enable <= '1';
             c_out.write_cr_mask <= num_to_fxm(0);
-	    cf(3) := negative;
-	    cf(2) := not negative and not zero;
+	    cf(3) := sign;
+	    cf(2) := not sign and not zero;
 	    cf(1) := zero;
 	    cf(0) := xe.so;
 	    c_out.write_cr_data(31 downto 28) <= cf;