Merge pull request #134 from paulusmack/master

Add bypass from execute1 output to input
6 years ago · f77b31a552
parent d876484229 2661b9b985
commit f77b31a552
18 changed files with 688 additions and 583 deletions
--- a/4
+++ b/4
@ -31,7 +31,7 @@ common.o: decode_types.o
 control.o: gpr_hazard.o cr_hazard.o common.o
 sim_jtag.o: sim_jtag_socket.o
 core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o
-core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o multiply.o writeback.o core_debug.o divider.o
+core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o
 core_debug.o: common.o
 countzero.o:
 countzero_tb.o: common.o glibc_random.o countzero.o
@ -40,7 +40,7 @@ crhelpers.o: common.o
 decode1.o: common.o decode_types.o
 decode2.o: decode_types.o common.o helpers.o insn_helpers.o control.o
 decode_types.o:
-execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o
+execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o divider.o
 fetch1.o: common.o
 fetch2.o: common.o wishbone_types.o
 glibc_random_helpers.o:
--- a/common.vhdl
+++ b/common.vhdl
@ -109,6 +109,9 @@ package common is
 	read_data1: std_ulogic_vector(63 downto 0);
 	read_data2: std_ulogic_vector(63 downto 0);
 	read_data3: std_ulogic_vector(63 downto 0);
        bypass_data1: std_ulogic;
        bypass_data2: std_ulogic;
        bypass_data3: std_ulogic;
 	cr: std_ulogic_vector(31 downto 0);
 	xerc: xer_common_t;
 	lr: std_ulogic;
@ -124,44 +127,41 @@ package common is
 	is_signed: std_ulogic;
 	insn: std_ulogic_vector(31 downto 0);
 	data_len: std_ulogic_vector(3 downto 0);
 	byte_reverse : std_ulogic;
 	sign_extend : std_ulogic;			-- do we need to sign extend?
 	update : std_ulogic;				-- is this an update instruction?
    end record;
    constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
-	(valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', oe => '0', invert_a => '0',
+	(valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
         lr => '0', rc => '0', oe => '0', invert_a => '0',
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
-	 is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0'));
+	 is_32bit => '0', is_signed => '0', xerc => xerc_init,
         byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0'));
-    type Decode2ToMultiplyType is record
+    type Execute1ToMultiplyType is record
 	valid: std_ulogic;
 	insn_type: insn_type_t;
 	write_reg: gpr_index_t;
 	data1: std_ulogic_vector(64 downto 0);
 	data2: std_ulogic_vector(64 downto 0);
 	rc: std_ulogic;
 	oe: std_ulogic;
 	is_32bit: std_ulogic;
 	xerc: xer_common_t;
    end record;
-    constant Decode2ToMultiplyInit : Decode2ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0',
+    constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL,
-							       oe => '0', is_32bit => '0', xerc => xerc_init,
+								 is_32bit => '0',
 								 others => (others => '0'));
-    type Decode2ToDividerType is record
+    type Execute1ToDividerType is record
 	valid: std_ulogic;
 	write_reg: gpr_index_t;
 	dividend: std_ulogic_vector(63 downto 0);
 	divisor: std_ulogic_vector(63 downto 0);
 	is_signed: std_ulogic;
 	is_32bit: std_ulogic;
 	is_extended: std_ulogic;
 	is_modulus: std_ulogic;
-	rc: std_ulogic;
+        neg_result: std_ulogic;
 	oe: std_ulogic;
 	xerc: xer_common_t;
    end record;
-    constant Decode2ToDividerInit: Decode2ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0',
+    constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0',
                                                              is_extended => '0', is_modulus => '0',
-							    rc => '0', oe => '0', xerc => xerc_init,
+                                                              neg_result => '0', others => (others => '0'));
 							    others => (others => '0'));
    type Decode2ToRegisterFileType is record
 	read1_enable : std_ulogic;
@ -193,7 +193,7 @@ package common is
    end record;
    constant Execute1ToFetch1TypeInit : Execute1ToFetch1Type := (redirect => '0', others => (others => '0'));
-    type Decode2ToLoadstore1Type is record
+    type Execute1ToLoadstore1Type is record
 	valid : std_ulogic;
 	load : std_ulogic;				-- is this a load or store
 	addr1 : std_ulogic_vector(63 downto 0);
@ -207,7 +207,7 @@ package common is
 	update_reg : gpr_index_t;                      	-- if so, the register to update
 	xerc : xer_common_t;
    end record;
-    constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0',
+    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0',
                                                                     sign_extend => '0', update => '0', xerc => xerc_init,
                                                                     others => (others => '0'));
@ -248,47 +248,31 @@ package common is
 	write_enable : std_ulogic;
 	write_reg: gspr_index_t;
 	write_data: std_ulogic_vector(63 downto 0);
 	write_len : std_ulogic_vector(3 downto 0);
 	write_cr_enable : std_ulogic;
 	write_cr_mask : std_ulogic_vector(7 downto 0);
 	write_cr_data : std_ulogic_vector(31 downto 0);
 	write_xerc_enable : std_ulogic;
 	xerc : xer_common_t;
 	sign_extend: std_ulogic;
    end record;
    constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', write_enable => '0',
-								   write_cr_enable => '0', sign_extend => '0',
+								   write_cr_enable => '0',
 								   write_xerc_enable => '0', xerc => xerc_init,
 								   others => (others => '0'));
-    type MultiplyToWritebackType is record
+    type MultiplyToExecute1Type is record
 	valid: std_ulogic;
 	write_reg_enable : std_ulogic;
 	write_reg_nr: gpr_index_t;
 	write_reg_data: std_ulogic_vector(63 downto 0);
-	write_xerc_enable : std_ulogic;
+        overflow : std_ulogic;
 	xerc : xer_common_t;
 	rc: std_ulogic;
    end record;
-    constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0',
+    constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0',
 								   rc => '0', write_xerc_enable => '0',
 								   xerc => xerc_init,
 								 others => (others => '0'));
-    type DividerToWritebackType is record
+    type DividerToExecute1Type is record
 	valid: std_ulogic;
 	write_reg_enable : std_ulogic;
 	write_reg_nr: gpr_index_t;
 	write_reg_data: std_ulogic_vector(63 downto 0);
-	write_xerc_enable : std_ulogic;
+        overflow : std_ulogic;
 	xerc : xer_common_t;
 	rc: std_ulogic;
    end record;
-    constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0',
+    constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', overflow => '0',
 								 rc => '0', write_xerc_enable => '0',
 								 xerc => xerc_init,
                                                               others => (others => '0'));
    type WritebackToRegisterFileType is record
--- a/control.vhdl
+++ b/control.vhdl
@ -21,6 +21,7 @@ entity control is
        gpr_write_valid_in  : in std_ulogic;
        gpr_write_in        : in gspr_index_t;
        gpr_bypassable      : in std_ulogic;
        gpr_a_read_valid_in : in std_ulogic;
        gpr_a_read_in       : in gspr_index_t;
@ -36,7 +37,11 @@ entity control is
        valid_out           : out std_ulogic;
        stall_out           : out std_ulogic;
-        stopped_out         : out std_ulogic
+        stopped_out         : out std_ulogic;
        gpr_bypass_a        : out std_ulogic;
        gpr_bypass_b        : out std_ulogic;
        gpr_bypass_c        : out std_ulogic
        );
 end entity control;
@ -71,10 +76,12 @@ begin
            gpr_write_valid_in => gpr_write_valid,
            gpr_write_in       => gpr_write_in,
            bypass_avail       => gpr_bypassable,
            gpr_read_valid_in  => gpr_a_read_valid_in,
            gpr_read_in        => gpr_a_read_in,
-            stall_out          => stall_a_out
+            stall_out          => stall_a_out,
            use_bypass         => gpr_bypass_a
            );
    gpr_hazard1: entity work.gpr_hazard
@ -87,10 +94,12 @@ begin
            gpr_write_valid_in => gpr_write_valid,
            gpr_write_in       => gpr_write_in,
            bypass_avail       => gpr_bypassable,
            gpr_read_valid_in  => gpr_b_read_valid_in,
            gpr_read_in        => gpr_b_read_in,
-            stall_out          => stall_b_out
+            stall_out          => stall_b_out,
            use_bypass         => gpr_bypass_b
            );
    gpr_c_read_in_fmt <= "0" & gpr_c_read_in;
@ -105,10 +114,12 @@ begin
            gpr_write_valid_in => gpr_write_valid,
            gpr_write_in       => gpr_write_in,
            bypass_avail       => gpr_bypassable,
            gpr_read_valid_in  => gpr_c_read_valid_in,
            gpr_read_in        => gpr_c_read_in_fmt,
-            stall_out          => stall_c_out
+            stall_out          => stall_c_out,
            use_bypass         => gpr_bypass_c
            );
    cr_hazard0: entity work.cr_hazard
--- a/core.vhdl
+++ b/core.vhdl
@ -9,7 +9,8 @@ use work.wishbone_types.all;
 entity core is
    generic (
        SIM : boolean := false;
-	DISABLE_FLATTEN : boolean := false
+	DISABLE_FLATTEN : boolean := false;
        EX1_BYPASS : boolean := true
        );
    port (
        clk          : in std_logic;
@ -59,18 +60,10 @@ architecture behave of core is
    signal execute1_to_fetch1: Execute1ToFetch1Type;
    -- load store signals
-    signal decode2_to_loadstore1: Decode2ToLoadstore1Type;
+    signal execute1_to_loadstore1: Execute1ToLoadstore1Type;
    signal loadstore1_to_dcache: Loadstore1ToDcacheType;
    signal dcache_to_writeback: DcacheToWritebackType;
    -- multiply signals
    signal decode2_to_multiply: Decode2ToMultiplyType;
    signal multiply_to_writeback: MultiplyToWritebackType;
    -- divider signals
    signal decode2_to_divider: Decode2ToDividerType;
    signal divider_to_writeback: DividerToWritebackType;
    -- local signals
    signal fetch1_stall_in : std_ulogic;
    signal icache_stall_out : std_ulogic;
@ -115,8 +108,6 @@ architecture behave of core is
    attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN);
    attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN);
    attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN);
    attribute keep_hierarchy of multiply_0 : label is keep_h(DISABLE_FLATTEN);
    attribute keep_hierarchy of divider_0 : label is keep_h(DISABLE_FLATTEN);
    attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN);
    attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN);
    attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN);
@ -186,6 +177,9 @@ begin
    decode1_stall_in <= decode2_stall_out;
    decode2_0: entity work.decode2
        generic map (
            EX1_BYPASS => EX1_BYPASS
            )
        port map (
            clk => clk,
            rst => core_rst,
@ -196,9 +190,6 @@ begin
 	    stopped_out => dbg_core_is_stopped,
            d_in => decode1_to_decode2,
            e_out => decode2_to_execute1,
            l_out => decode2_to_loadstore1,
            m_out => decode2_to_multiply,
            d_out => decode2_to_divider,
            r_in => register_file_to_decode2,
            r_out => decode2_to_register_file,
            c_in => cr_file_to_decode2,
@ -232,11 +223,16 @@ begin
            );
    execute1_0: entity work.execute1
        generic map (
            EX1_BYPASS => EX1_BYPASS
            )
        port map (
            clk => clk,
            rst => core_rst,
            flush_out => flush,
 	    stall_out => ex1_stall_out,
            e_in => decode2_to_execute1,
            l_out => execute1_to_loadstore1,
            f_out => execute1_to_fetch1,
            e_out => execute1_to_writeback,
 	    icache_inval => ex1_icache_inval,
@ -246,7 +242,7 @@ begin
    loadstore1_0: entity work.loadstore1
        port map (
            clk => clk,
-            l_in => decode2_to_loadstore1,
+            l_in => execute1_to_loadstore1,
            l_out => loadstore1_to_dcache
            );
@ -265,28 +261,11 @@ begin
            wishbone_out => wishbone_data_out
            );
    multiply_0: entity work.multiply
        port map (
            clk => clk,
            m_in => decode2_to_multiply,
            m_out => multiply_to_writeback
            );
    divider_0: entity work.divider
        port map (
            clk => clk,
            rst => core_rst,
            d_in => decode2_to_divider,
            d_out => divider_to_writeback
            );
    writeback_0: entity work.writeback
        port map (
            clk => clk,
            e_in => execute1_to_writeback,
            l_in => dcache_to_writeback,
            m_in => multiply_to_writeback,
            d_in => divider_to_writeback,
            w_out => writeback_to_register_file,
            c_out => writeback_to_cr_file,
            complete_out => complete
--- a/countzero.vhdl
+++ b/countzero.vhdl
@ -6,6 +6,7 @@ library work;
 entity zero_counter is
    port (
        clk         : in std_logic;
 	rs          : in std_ulogic_vector(63 downto 0);
 	count_right : in std_ulogic;
 	is_32bit    : in std_ulogic;
@ -14,10 +15,14 @@ entity zero_counter is
 end entity zero_counter;
 architecture behaviour of zero_counter is
-    signal y, z     : std_ulogic_vector(3 downto 0);
+    type intermediate_result is record
-    signal v16      : std_ulogic_vector(15 downto 0);
+        v16: std_ulogic_vector(15 downto 0);
-    signal v4       : std_ulogic_vector(3 downto 0);
+        sel_hi: std_ulogic_vector(1 downto 0);
-    signal sel      : std_ulogic_vector(5 downto 0);
+        is_32bit: std_ulogic;
        count_right: std_ulogic;
    end record;
    signal r, r_in  : intermediate_result;
    -- Return the index of the leftmost or rightmost 1 in a set of 4 bits.
    -- Assumes v is not "0000"; if it is, return (right ? "11" : "00").
@ -47,65 +52,83 @@ architecture behaviour of zero_counter is
    end;
 begin
-    zerocounter0: process(all)
+    zerocounter_0: process(clk)
    begin
        -- Register stage: on every rising edge of clk, copy the
        -- combinationally computed r_in into r.
 	if rising_edge(clk) then
            r <= r_in;
        end if;
    end process;
    zerocounter_1: process(all)
        variable v: intermediate_result;
        variable y, z: std_ulogic_vector(3 downto 0);
        variable sel: std_ulogic_vector(5 downto 0);
        variable v4: std_ulogic_vector(3 downto 0);
    begin
 	-- Test 4 groups of 16 bits each.
 	-- The top 2 groups are considered to be zero in 32-bit mode.
-	z(0) <= or (rs(15 downto 0));
+	z(0) := or (rs(15 downto 0));
-	z(1) <= or (rs(31 downto 16));
+	z(1) := or (rs(31 downto 16));
-	z(2) <= or (rs(47 downto 32));
+	z(2) := or (rs(47 downto 32));
-	z(3) <= or (rs(63 downto 48));
+	z(3) := or (rs(63 downto 48));
        if is_32bit = '0' then
-            sel(5 downto 4) <= encoder(z, count_right);
+            v.sel_hi := encoder(z, count_right);
        else
-            sel(5) <= '0';
+            v.sel_hi(1) := '0';
            if count_right = '0' then
-                sel(4) <= z(1);
+                v.sel_hi(0) := z(1);
            else
-                sel(4) <= not z(0);
+                v.sel_hi(0) := not z(0);
            end if;
        end if;
 	-- Select the leftmost/rightmost non-zero group of 16 bits
-	case sel(5 downto 4) is
+	case v.sel_hi is
 	    when "00" =>
-		v16 <= rs(15 downto 0);
+		v.v16 := rs(15 downto 0);
 	    when "01" =>
-		v16 <= rs(31 downto 16);
+		v.v16 := rs(31 downto 16);
 	    when "10" =>
-		v16 <= rs(47 downto 32);
+		v.v16 := rs(47 downto 32);
 	    when others =>
-		v16 <= rs(63 downto 48);
+		v.v16 := rs(63 downto 48);
 	end case;
        -- Latch this and do the rest in the next cycle, for the sake of timing
        v.is_32bit := is_32bit;
        v.count_right := count_right;
        r_in <= v;
        sel(5 downto 4) := r.sel_hi;
 	-- Test 4 groups of 4 bits
-	y(0) <= or (v16(3 downto 0));
+	y(0) := or (r.v16(3 downto 0));
-	y(1) <= or (v16(7 downto 4));
+	y(1) := or (r.v16(7 downto 4));
-	y(2) <= or (v16(11 downto 8));
+	y(2) := or (r.v16(11 downto 8));
-	y(3) <= or (v16(15 downto 12));
+	y(3) := or (r.v16(15 downto 12));
-	sel(3 downto 2) <= encoder(y, count_right);
+	sel(3 downto 2) := encoder(y, r.count_right);
 	-- Select the leftmost/rightmost non-zero group of 4 bits
 	case sel(3 downto 2) is
 	    when "00" =>
-		v4 <= v16(3 downto 0);
+		v4 := r.v16(3 downto 0);
 	    when "01" =>
-		v4 <= v16(7 downto 4);
+		v4 := r.v16(7 downto 4);
 	    when "10" =>
-		v4 <= v16(11 downto 8);
+		v4 := r.v16(11 downto 8);
 	    when others =>
-		v4 <= v16(15 downto 12);
+		v4 := r.v16(15 downto 12);
 	end case;
-	sel(1 downto 0) <= encoder(v4, count_right);
+	sel(1 downto 0) := encoder(v4, r.count_right);
 	-- sel is now the index of the leftmost/rightmost 1 bit in rs
 	if v4 = "0000" then
 	    -- operand is zero, return 32 for 32-bit, else 64
-	    result <= x"00000000000000" & '0' & not is_32bit & is_32bit & "00000";
+	    result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000";
-	elsif count_right = '0' then
+	elsif r.count_right = '0' then
 	    -- return (63 - sel), trimmed to 5 bits in 32-bit mode
-	    result <= x"00000000000000" & "00" & (not sel(5) and not is_32bit) & not sel(4 downto 0);
+	    result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0);
 	else
 	    result <= x"00000000000000" & "00" & sel;
 	end if;
--- a/countzero_tb.vhdl
+++ b/countzero_tb.vhdl
@ -15,16 +15,26 @@ architecture behave of countzero_tb is
    signal is_32bit, count_right: std_ulogic := '0';
    signal result: std_ulogic_vector(63 downto 0);
    signal randno: std_ulogic_vector(63 downto 0);
    signal clk: std_ulogic;
 begin
    -- Instantiate the zero_counter entity, connecting each of its ports
    -- (clk, rs, result, count_right, is_32bit) to the testbench signal of
    -- the same name.
    zerocounter_0: entity work.zero_counter
 	port map (
            clk => clk,
 	    rs => rs,
 	    result => result,
 	    count_right => count_right,
 	    is_32bit => is_32bit
 	);
    -- Clock generator for the testbench: drives clk low for clk_period/2 and
    -- then high for clk_period/2. The process has no sensitivity list, so
    -- after the final wait it restarts from the top and repeats indefinitely.
    clk_process: process
    begin
        clk <= '0';
        wait for clk_period/2;
        clk <= '1';
        wait for clk_period/2;
    end process;
    stim_process: process
        variable r: std_ulogic_vector(63 downto 0);
    begin
--- a/decode1.vhdl
+++ b/decode1.vhdl
@ -44,8 +44,8 @@ architecture behaviour of decode1 is
 		29 =>       (ALU,    OP_AND,       NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- andis.
 		18 =>       (ALU,    OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b
 		16 =>       (ALU,    OP_BC,        SPR,        CONST_BD,    NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc
-		11 =>       (ALU,    OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpi
+		11 =>       (ALU,    OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi
-		10 =>       (ALU,    OP_CMPL,      RA,         CONST_UI,    NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
+		10 =>       (ALU,    OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
 		34 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz
 		35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzu
 		42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lha
@ -54,7 +54,7 @@ architecture behaviour of decode1 is
 		41 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzu
 		32 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwz
                33 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzu
-		 7 =>       (MUL,    OP_MUL_L64,   RA,         CONST_SI,    NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '1'), -- mulli
+		 7 =>       (ALU,    OP_MUL_L64,   RA,         CONST_SI,    NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli
 		24 =>       (ALU,    OP_OR,        NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori
 		25 =>       (ALU,    OP_OR,        NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris
 		20 =>       (ALU,    OP_RLC,       RA,         CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- rlwimi
@ -66,7 +66,7 @@ architecture behaviour of decode1 is
 		45 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthu
 		36 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stw
 		37 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwu
-		 8 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- subfic
+		 8 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic
 		 2 =>       (ALU,    OP_TDI,       RA,         CONST_SI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi
 		--PPC_TWI 3
 		26 =>       (ALU,    OP_XOR,       NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xori
@ -145,10 +145,10 @@ architecture behaviour of decode1 is
 		2#0000011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- and
 		2#0000111100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- andc
 		-- 2#0011111100# bperm
-		2#0000000000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmp
+		2#0000000000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp
 		2#0111111100#  =>       (ALU,    OP_CMPB,      NONE,       RB,          RS,   RA,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb
 		-- 2#0011100000# cmpeqb
-		2#0000100000#  =>       (ALU,    OP_CMPL,      RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl
+		2#0000100000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl
 		-- 2#0011000000# cmprb
 		2#0000111010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- cntlzd
 		2#0000011010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- cntlzw
@ -160,22 +160,22 @@ architecture behaviour of decode1 is
 		2#0100010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt
 		2#0011110110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst
 		-- 2#1111110110# dcbz
-		2#0110001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeu
+		2#0110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeu
-		2#1110001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeuo
+		2#1110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeuo
-		2#0110001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweu
+		2#0110001011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divweu
-		2#1110001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweuo
+		2#1110001011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divweuo
-		2#0110101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divde
+		2#0110101001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divde
-		2#1110101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeo
+		2#1110101001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divdeo
-		2#0110101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwe
+		2#0110101011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divwe
-		2#1110101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweo
+		2#1110101011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divweo
-		2#0111001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdu
+		2#0111001001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdu
-		2#1111001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divduo
+		2#1111001001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divduo
-		2#0111001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwu
+		2#0111001011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divwu
-		2#1111001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwuo
+		2#1111001011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divwuo
-		2#0111101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divd
+		2#0111101001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divd
-		2#1111101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdo
+		2#1111101001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divdo
-		2#0111101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divw
+		2#0111101011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divw
-		2#1111101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwo
+		2#1111101011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divwo
 		2#0100011100#  =>       (ALU,    OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- eqv
 		2#1110111010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsb
 		2#1110011010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsh
@ -238,36 +238,36 @@ architecture behaviour of decode1 is
 		-- 2#1001000000# mcrxrx
 		2#0000010011#  =>       (ALU,    OP_MFCR,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf
 		2#0101010011#  =>       (ALU,    OP_MFSPR,     SPR,        NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr
-		2#0100001001#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modud
+		2#0100001001#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- modud
-		2#0100001011#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- moduw
+		2#0100001011#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- moduw
-		2#1100001001#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsd
+		2#1100001001#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- modsd
-		2#1100001011#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw
+		2#1100001011#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0'), -- modsw
 		2#0010010000#  =>       (ALU,    OP_MTCRF,     NONE,       NONE,        RS,   NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf
 		2#0111010011#  =>       (ALU,    OP_MTSPR,     NONE,       NONE,        RS,   SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr
-		2#0001001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulhd
+		2#0001001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulhd
-		2#0000001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- mulhdu
+		2#0000001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- mulhdu
-		2#0001001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mulhw
+		2#0001001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mulhw
-		2#0000001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '1'), -- mulhwu
+		2#0000001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- mulhwu
                -- next 4 have reserved bit set
-		2#1001001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulhd
+		2#1001001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulhd
-		2#1000001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- mulhdu
+		2#1000001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- mulhdu
-		2#1001001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mulhw
+		2#1001001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mulhw
-		2#1000001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '1'), -- mulhwu
+		2#1000001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- mulhwu
-		2#0011101001#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulld
+		2#0011101001#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulld
-		2#1011101001#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulldo
+		2#1011101001#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulldo
-		2#0011101011#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mullw
+		2#0011101011#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mullw
-		2#1011101011#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mullwo
+		2#1011101011#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mullwo
 		2#0111011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nand
 		2#0001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- neg
 		2#1001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nego
 		2#0001111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nor
 		2#0110111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- or
 		2#0110011100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- orc
-		2#0001111010#  =>       (ALU,    OP_POPCNTB,   NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb
+		2#0001111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb
-		2#0111111010#  =>       (ALU,    OP_POPCNTD,   NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd
+		2#0111111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd
-		2#0101111010#  =>       (ALU,    OP_POPCNTW,   NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw
+		2#0101111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw
-		2#0010111010#  =>       (ALU,    OP_PRTYD,     NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd
+		2#0010111010#  =>       (ALU,    OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd
-		2#0010011010#  =>       (ALU,    OP_PRTYW,     NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw
+		2#0010011010#  =>       (ALU,    OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw
 		-- 2#0010000000# setb
 		2#0000011011#  =>       (ALU,    OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- sld
 		2#0000011000#  =>       (ALU,    OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- slw
--- a/decode2.vhdl
+++ b/decode2.vhdl
@ -9,6 +9,9 @@ use work.helpers.all;
 use work.insn_helpers.all;
 entity decode2 is
        generic (
                EX1_BYPASS : boolean := true
        );
 	port (
 		clk   : in std_ulogic;
 		rst   : in std_ulogic;
@ -24,9 +27,6 @@ entity decode2 is
 		d_in  : in Decode1ToDecode2Type;
 		e_out : out Decode2ToExecute1Type;
 		m_out : out Decode2ToMultiplyType;
                d_out : out Decode2ToDividerType;
 		l_out : out Decode2ToLoadstore1Type;
 		r_in  : in RegisterFileToDecode2Type;
 		r_out : out Decode2ToRegisterFileType;
@ -39,9 +39,6 @@ end entity decode2;
 architecture behaviour of decode2 is
 	type reg_type is record
 		e : Decode2ToExecute1Type;
 		m : Decode2ToMultiplyType;
                d : Decode2ToDividerType;
 		l : Decode2ToLoadstore1Type;
 	end record;
 	signal r, rin : reg_type;
@ -188,15 +185,19 @@ architecture behaviour of decode2 is
 	signal gpr_write_valid : std_ulogic;
 	signal gpr_write : gspr_index_t;
        signal gpr_bypassable  : std_ulogic;
 	signal gpr_a_read_valid : std_ulogic;
 	signal gpr_a_read :gspr_index_t;
        signal gpr_a_bypass : std_ulogic;
 	signal gpr_b_read_valid : std_ulogic;
 	signal gpr_b_read : gspr_index_t;
        signal gpr_b_bypass : std_ulogic;
 	signal gpr_c_read_valid : std_ulogic;
 	signal gpr_c_read : gpr_index_t;
        signal gpr_c_bypass : std_ulogic;
 	signal cr_write_valid : std_ulogic;
 begin
@ -217,6 +218,7 @@ begin
 		gpr_write_valid_in => gpr_write_valid,
 		gpr_write_in       => gpr_write,
                gpr_bypassable     => gpr_bypassable,
 		gpr_a_read_valid_in  => gpr_a_read_valid,
 		gpr_a_read_in        => gpr_a_read,
@ -232,13 +234,17 @@ begin
 		valid_out   => control_valid_out,
 		stall_out   => stall_out,
-		stopped_out => stopped_out
+		stopped_out => stopped_out,
                gpr_bypass_a => gpr_a_bypass,
                gpr_bypass_b => gpr_b_bypass,
                gpr_bypass_c => gpr_c_bypass
 	);
 	decode2_0: process(clk)
 	begin
 		if rising_edge(clk) then
-			if rin.e.valid = '1' or rin.l.valid = '1' or rin.m.valid = '1' or rin.d.valid = '1' then
+			if rin.e.valid = '1' then
 				report "execute " & to_hstring(rin.e.nia);
 			end if;
 			r <= rin;
@ -259,21 +265,16 @@ begin
 		variable decoded_reg_b : decode_input_reg_t;
 		variable decoded_reg_c : decode_input_reg_t;
 		variable decoded_reg_o : decode_output_reg_t;
                variable signed_division: std_ulogic;
                variable length : std_ulogic_vector(3 downto 0);
 	begin
 		v := r;
 		v.e := Decode2ToExecute1Init;
 		v.l := Decode2ToLoadStore1Init;
 		v.m := Decode2ToMultiplyInit;
                v.d := Decode2ToDividerInit;
 		mul_a := (others => '0');
 		mul_b := (others => '0');
 		--v.e.input_cr := d_in.decode.input_cr;
 		--v.m.input_cr := d_in.decode.input_cr;
 		--v.e.output_cr := d_in.decode.output_cr;
 		decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1);
@ -303,12 +304,17 @@ begin
 		v.e.insn_type := d_in.decode.insn_type;
 		v.e.read_reg1 := decoded_reg_a.reg;
 		v.e.read_data1 := decoded_reg_a.data;
                v.e.bypass_data1 := gpr_a_bypass;
 		v.e.read_reg2 := decoded_reg_b.reg;
 		v.e.read_data2 := decoded_reg_b.data;
                v.e.bypass_data2 := gpr_b_bypass;
                v.e.read_data3 := decoded_reg_c.data;
                v.e.bypass_data3 := gpr_c_bypass;
 		v.e.write_reg := decoded_reg_o.reg;
 		v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
                if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then
                        v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
                end if;
 		v.e.cr := c_in.read_cr_data;
 		v.e.xerc := c_in.read_xerc_data;
                v.e.invert_a := d_in.decode.invert_a;
@ -322,102 +328,9 @@ begin
 		end if;
                v.e.insn := d_in.insn;
                v.e.data_len := length;
-
+		v.e.byte_reverse := d_in.decode.byte_reverse;
-		-- multiply unit
+		v.e.sign_extend := d_in.decode.sign_extend;
-		v.m.insn_type := d_in.decode.insn_type;
+		v.e.update := d_in.decode.update;
 		mul_a := decoded_reg_a.data;
 		mul_b := decoded_reg_b.data;
 		v.m.write_reg := gspr_to_gpr(decoded_reg_o.reg);
 		v.m.rc := decode_rc(d_in.decode.rc, d_in.insn);
 		v.m.xerc := c_in.read_xerc_data;
 		if v.m.insn_type = OP_MUL_L64 then
 		  v.m.oe := decode_oe(d_in.decode.rc, d_in.insn);
 		end if;
 		v.m.is_32bit := d_in.decode.is_32bit;
 		if d_in.decode.is_32bit = '1' then
 			if d_in.decode.is_signed = '1' then
 				v.m.data1 := (others => mul_a(31));
 				v.m.data1(31 downto 0) := mul_a(31 downto 0);
 				v.m.data2 := (others => mul_b(31));
 				v.m.data2(31 downto 0) := mul_b(31 downto 0);
 			else
 				v.m.data1 := '0' & x"00000000" & mul_a(31 downto 0);
 				v.m.data2 := '0' & x"00000000" & mul_b(31 downto 0);
 			end if;
 		else
 			if d_in.decode.is_signed = '1' then
 				v.m.data1 := mul_a(63) & mul_a;
 				v.m.data2 := mul_b(63) & mul_b;
 			else
 				v.m.data1 := '0' & mul_a;
 				v.m.data2 := '0' & mul_b;
 			end if;
 		end if;
                -- divide unit
                -- PPC divide and modulus instruction words have these bits in
                -- the bottom 11 bits: o1dns 010t1 r
                -- where o = OE for div instrs, signedness for mod instrs
                --       d = 1 for div*, 0 for mod*
                --       n = 1 for normal, 0 for extended (dividend << 32/64)
                --       s = 1 for signed, 0 for unsigned (for div*)
                --       t = 1 for 32-bit, 0 for 64-bit
                --       r = RC bit (record condition code)
 		v.d.write_reg := gspr_to_gpr(decoded_reg_o.reg);
                v.d.is_modulus := not d_in.insn(8);
                v.d.is_32bit := d_in.insn(2);
                if d_in.insn(8) = '1' then
                        signed_division := d_in.insn(6);
                else
                        signed_division := d_in.insn(10);
                end if;
                v.d.is_signed := signed_division;
                if d_in.insn(2) = '0' then
                        -- 64-bit forms
                        if d_in.insn(8) = '1' and d_in.insn(7) = '0' then
                                v.d.is_extended := '1';
                        end if;
                        v.d.dividend := decoded_reg_a.data;
                        v.d.divisor := decoded_reg_b.data;
                else
                        -- 32-bit forms
                        if d_in.insn(8) = '1' and d_in.insn(7) = '0' then   -- extended forms
                                v.d.dividend := decoded_reg_a.data(31 downto 0) & x"00000000";
                        elsif signed_division = '1' and decoded_reg_a.data(31) = '1' then
                                -- sign extend to 64 bits
                                v.d.dividend := x"ffffffff" & decoded_reg_a.data(31 downto 0);
                        else
                                v.d.dividend := x"00000000" & decoded_reg_a.data(31 downto 0);
                        end if;
                        if signed_division = '1' and decoded_reg_b.data(31) = '1' then
                                v.d.divisor := x"ffffffff" & decoded_reg_b.data(31 downto 0);
                        else
                                v.d.divisor := x"00000000" & decoded_reg_b.data(31 downto 0);
                        end if;
                end if;
                v.d.rc := decode_rc(d_in.decode.rc, d_in.insn);
 		v.d.xerc := c_in.read_xerc_data;
 		v.d.oe := decode_oe(d_in.decode.rc, d_in.insn);
 		-- load/store unit
 		v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg);
 		v.l.addr1 := decoded_reg_a.data;
 		v.l.addr2 := decoded_reg_b.data;
 		v.l.data := decoded_reg_c.data;
 		v.l.write_reg := gspr_to_gpr(decoded_reg_o.reg);
 		if d_in.decode.insn_type = OP_LOAD then
 			v.l.load := '1';
 		else
 			v.l.load := '0';
 		end if;
                v.l.length := length;
 		v.l.byte_reverse := d_in.decode.byte_reverse;
 		v.l.sign_extend := d_in.decode.sign_extend;
 		v.l.update := d_in.decode.update;
 		v.l.xerc := c_in.read_xerc_data;
 		-- issue control
 		control_valid_in <= d_in.valid;
@ -425,6 +338,10 @@ begin
 		gpr_write_valid <= decoded_reg_o.reg_valid;
 		gpr_write <= decoded_reg_o.reg;
                gpr_bypassable <= '0';
                if EX1_BYPASS and d_in.decode.unit = ALU then
                        gpr_bypassable <= '1';
                end if;
 		gpr_a_read_valid <= decoded_reg_a.reg_valid;
 		gpr_a_read <= decoded_reg_a.reg;
@ -437,29 +354,13 @@ begin
                cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
 		v.e.valid := '0';
 		v.m.valid := '0';
                v.d.valid := '0';
 		v.l.valid := '0';
 		case d_in.decode.unit is
 		when ALU =>
 			v.e.valid := control_valid_out;
 		when LDST =>
 			v.l.valid := control_valid_out;
 		when MUL =>
 			v.m.valid := control_valid_out;
                when DIV =>
                        v.d.valid := control_valid_out;
 		when NONE =>
 		v.e.valid := control_valid_out;
 		if d_in.decode.unit = NONE then
 			v.e.insn_type := OP_ILLEGAL;
-		end case;
+		end if;
 		if rst = '1' then
 			v.e := Decode2ToExecute1Init;
 			v.l := Decode2ToLoadStore1Init;
 			v.m := Decode2ToMultiplyInit;
                        v.d := Decode2ToDividerInit;
 		end if;
 		-- Update registers
@ -467,8 +368,5 @@ begin
 		-- Update outputs
 		e_out <= r.e;
 		l_out <= r.l;
 		m_out <= r.m;
                d_out <= r.d;
 	end process;
 end architecture behaviour;
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@ -4,18 +4,18 @@ use ieee.std_logic_1164.all;
 package decode_types is
    type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD,
 			 OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG,
-			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB,
+			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
 			 OP_CNTZ, OP_CRAND,
 			 OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC,
 			 OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
-			 OP_DCBZ, OP_DIV, OP_EXTS,
+			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS,
 			 OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
 			 OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, OP_MCRF,
 			 OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFSPR, OP_MOD,
 			 OP_MTCRF, OP_MTSPR, OP_MUL_L64,
 			 OP_MUL_H64, OP_MUL_H32, OP_OR,
-			 OP_POPCNTB, OP_POPCNTD, OP_POPCNTW, OP_PRTYD,
+			 OP_POPCNT, OP_PRTY,
-			 OP_PRTYW, OP_RLC, OP_RLCL, OP_RLCR, OP_SETB,
+			 OP_RLC, OP_RLCL, OP_RLCR, OP_SETB,
 			 OP_SHL, OP_SHR,
 			 OP_SYNC, OP_TD, OP_TDI, OP_TW,
 			 OP_TWI, OP_XOR, OP_SIM_CONFIG
@ -46,7 +46,7 @@ package decode_types is
    constant TOO_OFFSET : integer := 0;
-    type unit_t is (NONE, ALU, LDST, MUL, DIV);
+    type unit_t is (NONE, ALU, LDST);
    type length_t is (NONE, is1B, is2B, is4B, is8B);
    type decode_rom_t is record
--- a/divider.vhdl
+++ b/divider.vhdl
@ -10,8 +10,8 @@ entity divider is
    port (
        clk   : in std_logic;
        rst   : in std_logic;
-        d_in  : in Decode2ToDividerType;
+        d_in  : in Execute1ToDividerType;
-        d_out : out DividerToWritebackType
+        d_out : out DividerToExecute1Type
        );
 end entity divider;
@ -23,20 +23,15 @@ architecture behaviour of divider is
    signal sresult    : std_ulogic_vector(64 downto 0);
    signal oresult    : std_ulogic_vector(63 downto 0);
    signal running    : std_ulogic;
    signal signcheck  : std_ulogic;
    signal count      : unsigned(6 downto 0);
    signal neg_result : std_ulogic;
    signal is_modulus : std_ulogic;
    signal is_32bit   : std_ulogic;
    signal extended   : std_ulogic;
    signal is_signed  : std_ulogic;
    signal rc         : std_ulogic;
    signal write_reg  : std_ulogic_vector(4 downto 0);
    signal overflow   : std_ulogic;
    signal ovf32      : std_ulogic;
    signal did_ovf    : std_ulogic;
    signal oe         : std_ulogic;
    signal xerc       : xer_common_t;
 begin
    divider_0: process(clk)
    begin
@ -48,40 +43,22 @@ begin
                running <= '0';
                count <= "0000000";
            elsif d_in.valid = '1' then
-                if d_in.is_extended = '1' and not (d_in.is_signed = '1' and d_in.dividend(63) = '1') then
+                if d_in.is_extended = '1'  then
                    dend <= '0' & d_in.dividend & x"0000000000000000";
                else
                    dend <= '0' & x"0000000000000000" & d_in.dividend;
                end if;
                div <= unsigned(d_in.divisor);
                quot <= (others => '0');
-                write_reg <= d_in.write_reg;
+                neg_result <= d_in.neg_result;
                neg_result <= '0';
                is_modulus <= d_in.is_modulus;
                extended <= d_in.is_extended;
                is_32bit <= d_in.is_32bit;
                is_signed <= d_in.is_signed;
                rc <= d_in.rc;
                oe <= d_in.oe;
 		xerc <= d_in.xerc;
                count <= "1111111";
                running <= '1';
                overflow <= '0';
                ovf32 <= '0';
                signcheck <= d_in.is_signed and (d_in.dividend(63) or d_in.divisor(63));
            elsif signcheck = '1' then
                signcheck <= '0';
                neg_result <= dend(63) xor (div(63) and not is_modulus);
                if dend(63) = '1' then
                    if extended = '1' then
                        dend <= '0' & std_ulogic_vector(- signed(dend(63 downto 0))) & x"0000000000000000";
                    else
                        dend <= '0' & x"0000000000000000" & std_ulogic_vector(- signed(dend(63 downto 0)));
                    end if;
                end if;
                if div(63) = '1' then
                    div <= unsigned(- signed(div));
                end if;
            elsif running = '1' then
                if count = "0111111" then
                    running <= '0';
@ -113,9 +90,6 @@ begin
    divider_1: process(all)
    begin
        d_out.write_reg_nr <= write_reg;
        d_out.rc <= rc;
        if is_modulus = '1' then
            result <= dend(128 downto 65);
        else
@ -151,23 +125,9 @@ begin
        if rising_edge(clk) then
 	    d_out.valid <= '0';
            d_out.write_reg_data <= oresult;
-	    d_out.write_reg_enable <= '0';
+	    d_out.overflow <= did_ovf;
 	    d_out.write_xerc_enable <= '0';
 	    d_out.xerc <= xerc;
            if count = "1000000" then
                d_out.valid <= '1';
                d_out.write_reg_enable <= '1';
 		d_out.write_xerc_enable <= oe;
 		-- We must test oe because the RC update code in writeback
 		-- will use the xerc value to set CR0:SO so we must not clobber
 		-- xerc if OE wasn't set.
 		--
 		if oe = '1' then
 		    d_out.xerc.ov <= did_ovf;
 		    d_out.xerc.ov32 <= did_ovf;
 		    d_out.xerc.so <= xerc.so or did_ovf;
 		end if;
            end if;
        end if;
    end process;
--- a/divider_tb.vhdl
+++ b/divider_tb.vhdl
@ -16,8 +16,8 @@ architecture behave of divider_tb is
    signal rst              : std_ulogic;
    constant clk_period     : time := 10 ns;
-    signal d1               : Decode2ToDividerType;
+    signal d1               : Execute1ToDividerType;
-    signal d2               : DividerToWritebackType;
+    signal d2               : DividerToExecute1Type;
 begin
    divider_0: entity work.divider
        port map (clk => clk, rst => rst, d_in => d1, d_out => d2);
@ -43,14 +43,13 @@ begin
        rst <= '0';
        d1.valid <= '1';
        d1.write_reg <= "10001";
        d1.dividend <= x"0000000010001000";
        d1.divisor  <= x"0000000000001111";
        d1.is_signed <= '0';
        d1.is_32bit <= '0';
        d1.is_extended <= '0';
        d1.is_modulus <= '0';
-        d1.rc <= '0';
+        d1.neg_result <= '0';
        wait for clk_period;
        assert d2.valid = '0';
@ -65,16 +64,12 @@ begin
        end loop;
        assert d2.valid = '1';
        assert d2.write_reg_enable = '1';
        assert d2.write_reg_nr = "10001";
        assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
        assert d2.rc = '0';
        wait for clk_period;
        assert d2.valid = '0' report "valid";
        d1.valid <= '1';
        d1.rc <= '1';
        wait for clk_period;
        assert d2.valid = '0' report "valid";
@ -89,10 +84,7 @@ begin
        end loop;
        assert d2.valid = '1';
        assert d2.write_reg_enable = '1';
        assert d2.write_reg_nr = "10001";
        assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
        assert d2.rc = '1';
        wait for clk_period;
        assert d2.valid = '0';
@ -105,9 +97,10 @@ begin
                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-                    d1.dividend <= ra;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb;
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                    d1.is_signed <= '1';
                    d1.neg_result <= ra(63) xor rb(63);
                    d1.valid <= '1';
                    wait for clk_period;
@ -142,6 +135,7 @@ begin
                    d1.dividend <= ra;
                    d1.divisor <= rb;
                    d1.is_signed <= '0';
                    d1.neg_result <= '0';
                    d1.valid <= '1';
                    wait for clk_period;
@ -173,9 +167,10 @@ begin
                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-                    d1.dividend <= ra;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb;
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                    d1.is_signed <= '1';
                    d1.neg_result <= ra(63) xor rb(63);
                    d1.is_extended <= '1';
                    d1.valid <= '1';
@ -216,6 +211,7 @@ begin
                    d1.dividend <= ra;
                    d1.divisor <= rb;
                    d1.is_signed <= '0';
                    d1.neg_result <= '0';
                    d1.is_extended <= '1';
                    d1.valid <= '1';
@ -250,9 +246,10 @@ begin
                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-                    d1.dividend <= ra;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb;
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                    d1.is_signed <= '1';
                    d1.neg_result <= ra(63) xor rb(63);
                    d1.is_extended <= '0';
                    d1.is_32bit <= '1';
                    d1.valid <= '1';
@ -289,6 +286,7 @@ begin
                    d1.dividend <= ra;
                    d1.divisor <= rb;
                    d1.is_signed <= '0';
                    d1.neg_result <= '0';
                    d1.is_extended <= '0';
                    d1.is_32bit <= '1';
                    d1.valid <= '1';
@ -322,9 +320,10 @@ begin
                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 32)) & x"00000000";
                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-                    d1.dividend <= ra;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb;
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                    d1.is_signed <= '1';
                    d1.neg_result <= ra(63) xor rb(63);
                    d1.is_extended <= '0';
                    d1.is_32bit <= '1';
                    d1.valid <= '1';
@ -365,6 +364,7 @@ begin
                    d1.dividend <= ra;
                    d1.divisor <= rb;
                    d1.is_signed <= '0';
                    d1.neg_result <= '0';
                    d1.is_extended <= '0';
                    d1.is_32bit <= '1';
                    d1.valid <= '1';
@ -398,9 +398,10 @@ begin
                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-                    d1.dividend <= ra;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb;
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                    d1.is_signed <= '1';
                    d1.neg_result <= ra(63);
                    d1.is_extended <= '0';
                    d1.is_32bit <= '0';
                    d1.is_modulus <= '1';
@ -438,6 +439,7 @@ begin
                    d1.dividend <= ra;
                    d1.divisor <= rb;
                    d1.is_signed <= '0';
                    d1.neg_result <= '0';
                    d1.is_extended <= '0';
                    d1.is_32bit <= '0';
                    d1.is_modulus <= '1';
@ -472,9 +474,10 @@ begin
                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-                    d1.dividend <= ra;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb;
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                    d1.is_signed <= '1';
                    d1.neg_result <= ra(63);
                    d1.is_extended <= '0';
                    d1.is_32bit <= '1';
                    d1.is_modulus <= '1';
@ -517,6 +520,7 @@ begin
                    d1.dividend <= ra;
                    d1.divisor <= rb;
                    d1.is_signed <= '0';
                    d1.neg_result <= '0';
                    d1.is_extended <= '0';
                    d1.is_32bit <= '1';
                    d1.is_modulus <= '1';
--- a/execute1.vhdl
+++ b/execute1.vhdl
@ -11,8 +11,12 @@ use work.insn_helpers.all;
 use work.ppc_fx_insns.all;
 entity execute1 is
    generic (
        EX1_BYPASS : boolean := true
        );
    port (
 	clk   : in std_ulogic;
        rst   : in std_ulogic;
 	-- asynchronous
 	flush_out : out std_ulogic;
@ -21,6 +25,7 @@ entity execute1 is
 	e_in  : in Decode2ToExecute1Type;
 	-- asynchronous
        l_out : out Execute1ToLoadstore1Type;
 	f_out : out Execute1ToFetch1Type;
 	e_out : out Execute1ToWritebackType;
@ -35,10 +40,19 @@ architecture behaviour of execute1 is
 	e : Execute1ToWritebackType;
 	lr_update : std_ulogic;
 	next_lr : std_ulogic_vector(63 downto 0);
 	mul_in_progress : std_ulogic;
        div_in_progress : std_ulogic;
        cntz_in_progress : std_ulogic;
 	slow_op_dest : gpr_index_t;
 	slow_op_rc : std_ulogic;
 	slow_op_oe : std_ulogic;
 	slow_op_xerc : xer_common_t;
    end record;
    signal r, rin : reg_type;
    signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
    signal ctrl: ctrl_t := (others => (others => '0'));
    signal ctrl_tmp: ctrl_t := (others => (others => '0'));
@ -47,6 +61,16 @@ architecture behaviour of execute1 is
    signal rotator_carry: std_ulogic;
    signal logical_result: std_ulogic_vector(63 downto 0);
    signal countzero_result: std_ulogic_vector(63 downto 0);
    signal popcnt_result: std_ulogic_vector(63 downto 0);
    signal parity_result: std_ulogic_vector(63 downto 0);
    -- multiply signals
    signal x_to_multiply: Execute1ToMultiplyType;
    signal multiply_to_x: MultiplyToExecute1Type;
    -- divider signals
    signal x_to_divider: Execute1ToDividerType;
    signal divider_to_x: DividerToExecute1Type;
    procedure set_carry(e: inout Execute1ToWritebackType;
 			carry32 : in std_ulogic;
@ -92,9 +116,9 @@ begin
    rotator_0: entity work.rotator
 	port map (
-	    rs => e_in.read_data3,
+	    rs => c_in,
-	    ra => e_in.read_data1,
+	    ra => a_in,
-	    shift => e_in.read_data2(6 downto 0),
+	    shift => b_in(6 downto 0),
 	    insn => e_in.insn,
 	    is_32bit => e_in.is_32bit,
 	    right_shift => right_shift,
@ -107,22 +131,45 @@ begin
    logical_0: entity work.logical
 	port map (
-	    rs => e_in.read_data3,
+	    rs => c_in,
-	    rb => e_in.read_data2,
+	    rb => b_in,
 	    op => e_in.insn_type,
 	    invert_in => e_in.invert_a,
 	    invert_out => e_in.invert_out,
-	    result => logical_result
+	    result => logical_result,
            datalen => e_in.data_len,
            popcnt => popcnt_result,
            parity => parity_result
 	    );
    countzero_0: entity work.zero_counter
 	port map (
-	    rs => e_in.read_data3,
+            clk => clk,
 	    rs => c_in,
 	    count_right => e_in.insn(10),
 	    is_32bit => e_in.is_32bit,
 	    result => countzero_result
 	    );
    -- Multiplier unit instantiated inside execute1.  It is clocked by clk,
    -- receives its request on x_to_multiply (assigned in the combinational
    -- process of this architecture) and returns its response on
    -- multiply_to_x, which that same process reads while mul_in_progress
    -- is set.
    multiply_0: entity work.multiply
        port map (
            clk => clk,
            m_in => x_to_multiply,
            m_out => multiply_to_x
            );
    -- Divider unit instantiated inside execute1.  Unlike the multiplier it
    -- also takes the rst input.  Its request comes from x_to_divider
    -- (assigned in the combinational process of this architecture) and its
    -- response is returned on divider_to_x, which that same process reads
    -- while div_in_progress is set.
    divider_0: entity work.divider
        port map (
            clk => clk,
            rst => rst,
            d_in => x_to_divider,
            d_out => divider_to_x
            );
    -- Operand selection with result forwarding.  When the EX1_BYPASS generic
    -- is true and the matching e_in.bypass_dataN flag is '1', the operand is
    -- taken from r.e.write_data (the same r.e record that is driven onto
    -- e_out) instead of the read_dataN value supplied on e_in.  When
    -- EX1_BYPASS is false the read_dataN values are always used.
    -- a_in/b_in/c_in correspond to read_data1/read_data2/read_data3.
    a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1;
    b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2;
    c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3;
    execute1_0: process(clk)
    begin
 	if rising_edge(clk) then
@ -159,6 +206,14 @@ begin
 	variable l : std_ulogic;
 	variable next_nia : std_ulogic_vector(63 downto 0);
        variable carry_32, carry_64 : std_ulogic;
        variable sign1, sign2 : std_ulogic;
        variable abs1, abs2 : signed(63 downto 0);
 	variable overflow : std_ulogic;
 	variable negative : std_ulogic;
        variable zerohi, zerolo : std_ulogic;
        variable msb_a, msb_b : std_ulogic;
        variable a_lt : std_ulogic;
        variable lv : Execute1ToLoadstore1Type;
    begin
 	result := (others => '0');
 	result_with_carry := (others => '0');
@ -204,6 +259,83 @@ begin
 	end if;
 	v.lr_update := '0';
 	v.mul_in_progress := '0';
        v.div_in_progress := '0';
        v.cntz_in_progress := '0';
 	-- signals to multiply unit
 	x_to_multiply <= Execute1ToMultiplyInit;
 	x_to_multiply.insn_type <= e_in.insn_type;
 	x_to_multiply.is_32bit <= e_in.is_32bit;
 	if e_in.is_32bit = '1' then
 	    if e_in.is_signed = '1' then
 		x_to_multiply.data1 <= (others => a_in(31));
 		x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0);
 		x_to_multiply.data2 <= (others => b_in(31));
 		x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0);
 	    else
 		x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0);
 		x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0);
 	    end if;
 	else
 	    if e_in.is_signed = '1' then
 		x_to_multiply.data1 <= a_in(63) & a_in;
 		x_to_multiply.data2 <= b_in(63) & b_in;
 	    else
 		x_to_multiply.data1 <= '0' & a_in;
 		x_to_multiply.data2 <= '0' & b_in;
 	    end if;
 	end if;
        -- signals to divide unit
        sign1 := '0';
        sign2 := '0';
        if e_in.is_signed = '1' then
            if e_in.is_32bit = '1' then
                sign1 := a_in(31);
                sign2 := b_in(31);
            else
                sign1 := a_in(63);
                sign2 := b_in(63);
            end if;
        end if;
        -- take absolute values
        if sign1 = '0' then
            abs1 := signed(a_in);
        else
            abs1 := - signed(a_in);
        end if;
        if sign2 = '0' then
            abs2 := signed(b_in);
        else
            abs2 := - signed(b_in);
        end if;
        x_to_divider <= Execute1ToDividerInit;
        x_to_divider.is_signed <= e_in.is_signed;
 	x_to_divider.is_32bit <= e_in.is_32bit;
        if e_in.insn_type = OP_MOD then
            x_to_divider.is_modulus <= '1';
        end if;
        x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
        if e_in.is_32bit = '0' then
            -- 64-bit forms
            if e_in.insn_type = OP_DIVE then
                x_to_divider.is_extended <= '1';
            end if;
            x_to_divider.dividend <= std_ulogic_vector(abs1);
            x_to_divider.divisor <= std_ulogic_vector(abs2);
        else
            -- 32-bit forms
            x_to_divider.is_extended <= '0';
            if e_in.insn_type = OP_DIVE then   -- extended forms
                x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
            else
                x_to_divider.dividend <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
            end if;
            x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
        end if;
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
@ -226,8 +358,10 @@ begin
 	    v.e.valid := '1';
 	    v.e.write_reg := e_in.write_reg;
-	    v.e.write_len := x"8";
+	    v.slow_op_dest := gspr_to_gpr(e_in.write_reg);
-	    v.e.sign_extend := '0';
+	    v.slow_op_rc := e_in.rc;
 	    v.slow_op_oe := e_in.oe;
 	    v.slow_op_xerc := v.e.xerc;
 	    case_0: case e_in.insn_type is
@ -236,51 +370,93 @@ begin
 		report "illegal";
 	    when OP_NOP =>
 		-- Do nothing
-	    when OP_ADD =>
+	    when OP_ADD | OP_CMP =>
 		if e_in.invert_a = '0' then
-		    a_inv := e_in.read_data1;
+		    a_inv := a_in;
 		else
-		    a_inv := not e_in.read_data1;
+		    a_inv := not a_in;
 		end if;
-		result_with_carry := ppc_adde(a_inv, e_in.read_data2,
+		result_with_carry := ppc_adde(a_inv, b_in,
 					      decode_input_carry(e_in.input_carry, v.e.xerc));
 		result := result_with_carry(63 downto 0);
-                carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32);
+                carry_32 := result(32) xor a_inv(32) xor b_in(32);
                carry_64 := result_with_carry(64);
                if e_in.insn_type = OP_ADD then
                    if e_in.output_carry = '1' then
                        set_carry(v.e, carry_32, carry_64);
                    end if;
                    if e_in.oe = '1' then
                        set_ov(v.e,
-			   calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)),
+                               calc_ov(a_inv(63), b_in(63), carry_64, result_with_carry(63)),
-			   calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31)));
+                               calc_ov(a_inv(31), b_in(31), carry_32, result_with_carry(31)));
                    end if;
                    result_en := '1';
                else
                    -- CMP and CMPL instructions
                    -- Note, we have done RB - RA, not RA - RB
                    bf := insn_bf(e_in.insn);
                    l := insn_l(e_in.insn);
                    v.e.write_cr_enable := '1';
                    crnum := to_integer(unsigned(bf));
                    v.e.write_cr_mask := num_to_fxm(crnum);
                    zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0)));
                    zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32)));
                    if zerolo = '1' and (l = '0' or zerohi = '1') then
                        -- values are equal
                        newcrf := "001" & v.e.xerc.so;
                    else
                        if l = '1' then
                            -- 64-bit comparison
                            msb_a := a_in(63);
                            msb_b := b_in(63);
                        else
                            -- 32-bit comparison
                            msb_a := a_in(31);
                            msb_b := b_in(31);
                        end if;
                        if msb_a /= msb_b then
                            -- Subtraction might overflow, but
                            -- comparison is clear from MSB difference.
                            -- for signed, 0 is greater; for unsigned, 1 is greater
                            a_lt := msb_a xnor e_in.is_signed;
                        else
                            -- Subtraction cannot overflow since MSBs are equal.
                            -- carry = 1 indicates RA is smaller (signed or unsigned)
                            a_lt := (not l and carry_32) or (l and carry_64);
                        end if;
                        newcrf := a_lt & not a_lt & '0' & v.e.xerc.so;
                    end if;
                    for i in 0 to 7 loop
                        lo := i*4;
                        hi := lo + 3;
                        v.e.write_cr_data(hi downto lo) := newcrf;
                    end loop;
                end if;
 	    when OP_AND | OP_OR | OP_XOR =>
 		result := logical_result;
 		result_en := '1';
 	    when OP_B =>
 		f_out.redirect <= '1';
 		if (insn_aa(e_in.insn)) then
-		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2));
+		    f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
 		else
-		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2));
+		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
 		end if;
 	    when OP_BC =>
 		-- read_data1 is CTR
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
 		if bo(4-2) = '0' then
-		    result := std_ulogic_vector(unsigned(e_in.read_data1) - 1);
+		    result := std_ulogic_vector(unsigned(a_in) - 1);
 		    result_en := '1';
 		    v.e.write_reg := fast_spr_num(SPR_CTR);
 		end if;
-		if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then
+		if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
 		    f_out.redirect <= '1';
 		    if (insn_aa(e_in.insn)) then
-			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2));
+			f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
 		    else
-			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2));
+			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
 		    end if;
 		end if;
 	    when OP_BCREG =>
@ -289,53 +465,41 @@ begin
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
 		if bo(4-2) = '0' and e_in.insn(10) = '0' then
-		    result := std_ulogic_vector(unsigned(e_in.read_data1) - 1);
+		    result := std_ulogic_vector(unsigned(a_in) - 1);
 		    result_en := '1';
 		    v.e.write_reg := fast_spr_num(SPR_CTR);
 		end if;
-		if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then
+		if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
 		    f_out.redirect <= '1';
-		    f_out.redirect_nia <= e_in.read_data2(63 downto 2) & "00";
+		    f_out.redirect_nia <= b_in(63 downto 2) & "00";
 		end if;
 	    when OP_CMPB =>
-		result := ppc_cmpb(e_in.read_data3, e_in.read_data2);
+		result := ppc_cmpb(c_in, b_in);
 		result_en := '1';
 	    when OP_CMP =>
 		bf := insn_bf(e_in.insn);
 		l := insn_l(e_in.insn);
 		v.e.write_cr_enable := '1';
 		crnum := to_integer(unsigned(bf));
 		v.e.write_cr_mask := num_to_fxm(crnum);
 		for i in 0 to 7 loop
 		    lo := i*4;
 		    hi := lo + 3;
 		    v.e.write_cr_data(hi downto lo) := ppc_cmp(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so);
 		end loop;
 	    when OP_CMPL =>
 		bf := insn_bf(e_in.insn);
 		l := insn_l(e_in.insn);
 		v.e.write_cr_enable := '1';
 		crnum := to_integer(unsigned(bf));
 		v.e.write_cr_mask := num_to_fxm(crnum);
 		for i in 0 to 7 loop
 		    lo := i*4;
 		    hi := lo + 3;
 		    v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so);
 		end loop;
            when OP_CNTZ =>
-		result := countzero_result;
+                v.e.valid := '0';
-		result_en := '1';
+                v.cntz_in_progress := '1';
                stall_out <= '1';
            when OP_EXTS =>
-		v.e.write_len := e_in.data_len;
+                -- note data_len is a 1-hot encoding
-		v.e.sign_extend := '1';
+		negative := (e_in.data_len(0) and c_in(7)) or
-		result := e_in.read_data3;
+			    (e_in.data_len(1) and c_in(15)) or
 			    (e_in.data_len(2) and c_in(31));
 		result := (others => negative);
 		if e_in.data_len(2) = '1' then
 		    result(31 downto 16) := c_in(31 downto 16);
 		end if;
 		if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then
 		    result(15 downto 8) := c_in(15 downto 8);
 		end if;
 		result(7 downto 0) := c_in(7 downto 0);
 		result_en := '1';
 	    when OP_ISEL =>
 		crbit := to_integer(unsigned(insn_bc(e_in.insn)));
 		if e_in.cr(31-crbit) = '1' then
-		    result := e_in.read_data1;
+		    result := a_in;
 		else
-		    result := e_in.read_data2;
+		    result := b_in;
 		end if;
 		result_en := '1';
 	    when OP_MCRF =>
@ -400,7 +564,7 @@ begin
 		end if;
 	    when OP_MFSPR =>
 		if is_fast_spr(e_in.read_reg1) then
-		    result := e_in.read_data1;
+		    result := a_in;
 		    if decode_spr_num(e_in.insn) = SPR_XER then
 			-- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer
 			result(63 downto 32) := (others => '0');
@ -447,19 +611,19 @@ begin
 		    crnum := fxm_to_num(insn_fxm(e_in.insn));
 		    v.e.write_cr_mask := num_to_fxm(crnum);
 		end if;
-		v.e.write_cr_data := e_in.read_data3(31 downto 0);
+		v.e.write_cr_data := c_in(31 downto 0);
 	    when OP_MTSPR =>
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
-		    "=" & to_hstring(e_in.read_data3);
+		    "=" & to_hstring(c_in);
 		if is_fast_spr(e_in.write_reg) then
-		    result := e_in.read_data3;
+		    result := c_in;
 		    result_en := '1';
 		    if decode_spr_num(e_in.insn) = SPR_XER then
-			v.e.xerc.so := e_in.read_data3(63-32);
+			v.e.xerc.so := c_in(63-32);
-			v.e.xerc.ov := e_in.read_data3(63-33);
+			v.e.xerc.ov := c_in(63-33);
-			v.e.xerc.ca := e_in.read_data3(63-34);
+			v.e.xerc.ca := c_in(63-34);
-			v.e.xerc.ov32 := e_in.read_data3(63-44);
+			v.e.xerc.ov32 := c_in(63-44);
-			v.e.xerc.ca32 := e_in.read_data3(63-45);
+			v.e.xerc.ca32 := c_in(63-45);
 			v.e.write_xerc_enable := '1';
 		    end if;
 		else
@ -468,20 +632,11 @@ begin
 --		    when others =>
 --		    end case;
 		end if;
-	    when OP_POPCNTB =>
+	    when OP_POPCNT =>
-		result := ppc_popcntb(e_in.read_data3);
+		result := popcnt_result;
 		result_en := '1';
 	    when OP_POPCNTW =>
 		result := ppc_popcntw(e_in.read_data3);
 		result_en := '1';
 	    when OP_POPCNTD =>
 		result := ppc_popcntd(e_in.read_data3);
 		result_en := '1';
 	    when OP_PRTYD =>
 		result := ppc_prtyd(e_in.read_data3);
 		result_en := '1';
-	    when OP_PRTYW =>
+	    when OP_PRTY =>
-		result := ppc_prtyw(e_in.read_data3);
+		result := parity_result;
 		result_en := '1';
 	    when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR =>
 		result := rotator_result;
@ -506,11 +661,29 @@ begin
 	    when OP_ICBI =>
 		icache_inval <= '1';
 	    when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 =>
 		v.e.valid := '0';
 		v.mul_in_progress := '1';
 		stall_out <= '1';
 		x_to_multiply.valid <= '1';
 	    when OP_DIV | OP_DIVE | OP_MOD =>
 		v.e.valid := '0';
 		v.div_in_progress := '1';
 		stall_out <= '1';
 		x_to_divider.valid <= '1';
            when OP_LOAD | OP_STORE =>
                -- loadstore/dcache has its own port to writeback
                v.e.valid := '0';
            when others =>
 		terminate_out <= '1';
 		report "illegal";
 	    end case;
 	    v.e.rc := e_in.rc and e_in.valid;
 	    -- Update LR on the next cycle after a branch link
 	    --
 	    -- WARNING: The LR update isn't tracked by our hazard tracker. This
@ -533,20 +706,74 @@ begin
 	    result_en := '1';
 	    result := r.next_lr;
 	    v.e.write_reg := fast_spr_num(SPR_LR);
 	    v.e.write_len := x"8";
 	    v.e.sign_extend := '0';
 	    v.e.valid := '1';
        elsif r.cntz_in_progress = '1' then
            -- cnt[lt]z always takes two cycles
            result := countzero_result;
            result_en := '1';
            v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
            v.e.rc := v.slow_op_rc;
            v.e.xerc := v.slow_op_xerc;
            v.e.valid := '1';
 	elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
 	    if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
 	       (r.div_in_progress = '1' and divider_to_x.valid = '1') then
 		if r.mul_in_progress = '1' then
 		    result := multiply_to_x.write_reg_data;
 		    overflow := multiply_to_x.overflow;
 		else
 		    result := divider_to_x.write_reg_data;
 		    overflow := divider_to_x.overflow;
 		end if;
 		result_en := '1';
 		v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
 		v.e.rc := v.slow_op_rc;
 		v.e.xerc := v.slow_op_xerc;
 		v.e.write_xerc_enable := v.slow_op_oe;
 		-- We must test oe because the RC update code in writeback
 		-- will use the xerc value to set CR0:SO so we must not clobber
 		-- xerc if OE wasn't set.
 		if v.slow_op_oe = '1' then
 		    v.e.xerc.ov := overflow;
 		    v.e.xerc.ov32 := overflow;
 		    v.e.xerc.so := v.slow_op_xerc.so or overflow;
 		end if;
 		v.e.valid := '1';
 	    else
 		stall_out <= '1';
 		v.mul_in_progress := r.mul_in_progress;
 		v.div_in_progress := r.div_in_progress;
 	    end if;
 	end if;
 	v.e.write_data := result;
 	v.e.write_enable := result_en;
-	v.e.rc := e_in.rc and e_in.valid;
+
        -- Outputs to loadstore1 (async)
        lv := Execute1ToLoadstore1Init;
        if e_in.valid = '1' and (e_in.insn_type = OP_LOAD or e_in.insn_type = OP_STORE) then
            lv.valid := '1';
        end if;
        if e_in.insn_type = OP_LOAD then
            lv.load := '1';
        end if;
        lv.addr1 := a_in;
        lv.addr2 := b_in;
        lv.data := c_in;
        lv.write_reg := gspr_to_gpr(e_in.write_reg);
        lv.length := e_in.data_len;
        lv.byte_reverse := e_in.byte_reverse;
        lv.sign_extend := e_in.sign_extend;
        lv.update := e_in.update;
        lv.update_reg := gspr_to_gpr(e_in.read_reg1);
        lv.xerc := v.e.xerc;
 	-- Update registers
 	rin <= v;
 	-- update outputs
 	--f_out <= r.f;
        l_out <= lv;
 	e_out <= r.e;
 	flush_out <= f_out.redirect;
    end process;
--- a/gpr_hazard.vhdl
+++ b/gpr_hazard.vhdl
@ -12,18 +12,21 @@ entity gpr_hazard is
        gpr_write_valid_in : in std_ulogic;
        gpr_write_in       : in std_ulogic_vector(5 downto 0);
        bypass_avail       : in std_ulogic;
        gpr_read_valid_in  : in std_ulogic;
        gpr_read_in        : in std_ulogic_vector(5 downto 0);
-        stall_out          : out std_ulogic
+        stall_out          : out std_ulogic;
        use_bypass         : out std_ulogic
        );
 end entity gpr_hazard;
 architecture behaviour of gpr_hazard is
    type pipeline_entry_type is record
        valid  : std_ulogic;
        bypass : std_ulogic;
        gpr    : std_ulogic_vector(5 downto 0);
    end record;
-    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', gpr => (others => '0'));
+    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'));
    type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type;
    constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init);
@ -33,10 +36,8 @@ begin
    gpr_hazard0: process(clk)
    begin
        if rising_edge(clk) then
 	    if stall_in = '0' then
            r <= rin;
        end if;
        end if;
    end process;
    gpr_hazard1: process(all)
@ -45,22 +46,49 @@ begin
        v := r;
        stall_out <= '0';
-        loop_0: for i in 0 to PIPELINE_DEPTH-1 loop
+        use_bypass <= '0';
-            if ((r(i).valid = gpr_read_valid_in) and r(i).gpr = gpr_read_in) then
+        if gpr_read_valid_in = '1' then
            if r(0).valid = '1' and r(0).gpr = gpr_read_in then
                if r(0).bypass = '1' and stall_in = '0' then
                    use_bypass <= '1';
                else
                    stall_out <= '1';
                end if;
            end if;
            loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
                if r(i).valid = '1' and r(i).gpr = gpr_read_in then
                    if r(i).bypass = '1' then
                        use_bypass <= '1';
                    else
                        stall_out <= '1';
                    end if;
                end if;
            end loop;
        end if;
        if stall_in = '0' then
            v(0).valid  := gpr_write_valid_in;
            v(0).bypass := bypass_avail;
            v(0).gpr    := gpr_write_in;
-        loop_1: for i in 0 to PIPELINE_DEPTH-2 loop
+            loop_1: for i in 1 to PIPELINE_DEPTH-1 loop
                -- propagate to next slot
-            v(i+1) := r(i);
+                v(i).valid  := r(i-1).valid;
                v(i).bypass := r(i-1).bypass;
                v(i).gpr    := r(i-1).gpr;
            end loop;
-        -- asynchronous output
+        else
-        if gpr_read_valid_in = '0' then
+            -- stage 0 stalled, so stage 1 becomes empty
-            stall_out <= '0';
+            loop_1b: for i in 1 to PIPELINE_DEPTH-1 loop
                -- propagate to next slot
                if i = 1 then
                    v(i).valid := '0';
                else
                    v(i).valid  := r(i-1).valid;
                    v(i).bypass := r(i-1).bypass;
                    v(i).gpr    := r(i-1).gpr;
                end if;
            end loop;
        end if;
        -- update registers
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@ -13,7 +13,7 @@ entity loadstore1 is
    port (
        clk   : in std_ulogic;
-        l_in  : in Decode2ToLoadstore1Type;
+        l_in  : in Execute1ToLoadstore1Type;
        l_out : out Loadstore1ToDcacheType
        );
--- a/logical.vhdl
+++ b/logical.vhdl
@ -12,11 +12,29 @@ entity logical is
        op         : in insn_type_t;
        invert_in  : in std_ulogic;
        invert_out : in std_ulogic;
-        result     : out std_ulogic_vector(63 downto 0)
+        result     : out std_ulogic_vector(63 downto 0);
        datalen    : in std_logic_vector(3 downto 0);
        popcnt     : out std_ulogic_vector(63 downto 0);
        parity     : out std_ulogic_vector(63 downto 0)
        );
 end entity logical;
 architecture behaviour of logical is
    subtype twobit is unsigned(1 downto 0);
    type twobit32 is array(0 to 31) of twobit;
    signal pc2      : twobit32;
    subtype threebit is unsigned(2 downto 0);
    type threebit16 is array(0 to 15) of threebit;
    signal pc4      : threebit16;
    subtype fourbit is unsigned(3 downto 0);
    type fourbit8 is array(0 to 7) of fourbit;
    signal pc8      : fourbit8;
    subtype sixbit is unsigned(5 downto 0);
    type sixbit2 is array(0 to 1) of sixbit;
    signal pc32     : sixbit2;
    signal par0, par1 : std_ulogic;
 begin
    logical_0: process(all)
        variable rb_adj, tmp : std_ulogic_vector(63 downto 0);
@ -40,5 +58,45 @@ begin
            result <= not tmp;
        end if;
        -- population counts
        -- Built as an adder tree over rs; each level widens the partial sum
        -- by one bit by prefixing a '0' before adding.
        -- pc2(i): number of set bits in the bit pair rs(2i+1 downto 2i).
        for i in 0 to 31 loop
            pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1));
        end loop;
        -- pc4(i): sum of two adjacent pair counts, i.e. the count for 4 bits.
        for i in 0 to 15 loop
            pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1));
        end loop;
        -- pc8(i): sum of two adjacent 4-bit counts, i.e. the count for byte i.
        for i in 0 to 7 loop
            pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1));
        end loop;
        -- pc32(i): sum of the four byte counts of 32-bit word i.
        for i in 0 to 1 loop
            pc32(i) <= ("00" & pc8(i * 4)) + ("00" & pc8(i * 4 + 1)) +
                       ("00" & pc8(i * 4 + 2)) + ("00" & pc8(i * 4 + 3));
        end loop;
        -- Default every result bit to zero; the branch below fills in only
        -- the count field(s) selected by datalen.
        popcnt <= (others => '0');
        if datalen(3 downto 2) = "00" then
            -- popcntb
            -- each byte of the result holds that byte's count in its low 4 bits
            for i in 0 to 7 loop
                popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8(i));
            end loop;
        elsif datalen(3) = '0' then
            -- popcntw
            -- each 32-bit word of the result holds that word's count in its low 6 bits
            for i in 0 to 1 loop
                popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i));
            end loop;
        else
            -- datalen(3) = '1': single count over both words in the low 7 bits
            -- (presumably the popcntd form -- confirm against the decoder)
            popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1)));
        end if;

        -- parity calculations
        -- par0/par1: XOR of bit 0 of each byte in the low / high 32-bit word.
        par0 <= rs(0) xor rs(8) xor rs(16) xor rs(24);
        par1 <= rs(32) xor rs(40) xor rs(48) xor rs(56);
        parity <= (others => '0');
        if datalen(3) = '1' then
            -- one combined result in bit 0
            parity(0) <= par0 xor par1;
        else
            -- separate per-word results in bit 0 and bit 32
            parity(0) <= par0;
            parity(32) <= par1;
        end if;
    end process;
 end behaviour;
--- a/multiply.vhdl
+++ b/multiply.vhdl
@ -13,31 +13,24 @@ entity multiply is
    port (
        clk   : in std_logic;
-        m_in  : in Decode2ToMultiplyType;
+        m_in  : in Execute1ToMultiplyType;
-        m_out : out MultiplyToWritebackType
+        m_out : out MultiplyToExecute1Type
        );
 end entity multiply;
 architecture behaviour of multiply is
-    signal m: Decode2ToMultiplyType;
+    signal m: Execute1ToMultiplyType;
    type multiply_pipeline_stage is record
        valid     : std_ulogic;
        insn_type  : insn_type_t;
        data      : signed(129 downto 0);
        write_reg : std_ulogic_vector(4 downto 0);
        rc        : std_ulogic;
 	oe        : std_ulogic;
 	is_32bit  : std_ulogic;
 	xerc      : xer_common_t;
    end record;
    constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
 								     insn_type => OP_ILLEGAL,
 								     rc => '0', oe => '0',
 								     is_32bit => '0',
-								     xerc => xerc_init,
+								     data => (others => '0'));
 								     data => (others => '0'),
 								     others => (others => '0'));
    type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
    constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit);
@ -64,16 +57,12 @@ begin
    begin
        v := r;
-        m_out <= MultiplyToWritebackInit;
+        m_out <= MultiplyToExecute1Init;
        v.multiply_pipeline(0).valid := m.valid;
        v.multiply_pipeline(0).insn_type := m.insn_type;
        v.multiply_pipeline(0).data := signed(m.data1) * signed(m.data2);
        v.multiply_pipeline(0).write_reg := m.write_reg;
        v.multiply_pipeline(0).rc := m.rc;
        v.multiply_pipeline(0).oe := m.oe;
        v.multiply_pipeline(0).is_32bit := m.is_32bit;
        v.multiply_pipeline(0).xerc := m.xerc;
        loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
            v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
@ -101,25 +90,10 @@ begin
        end case;
        m_out.write_reg_data <= d2;
-        m_out.write_reg_nr <= v.multiply_pipeline(PIPELINE_DEPTH-1).write_reg;
+        m_out.overflow <= ov;
 	m_out.xerc <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc;
 	-- Generate OV/OV32/SO when OE=1
        if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then
            m_out.valid <= '1';
            m_out.write_reg_enable <= '1';
            m_out.rc <= v.multiply_pipeline(PIPELINE_DEPTH-1).rc;
            m_out.write_xerc_enable <= v.multiply_pipeline(PIPELINE_DEPTH-1).oe;
 	    -- We must test oe because the RC update code in writeback
 	    -- will use the xerc value to set CR0:SO so we must not clobber
 	    -- xerc if OE wasn't set.
 	    --
 	    if v.multiply_pipeline(PIPELINE_DEPTH-1).oe = '1' then
 		m_out.xerc.ov <= ov;
 		m_out.xerc.ov32 <= ov;
 		m_out.xerc.so <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc.so or ov;
 	    end if;
        end if;
        rin <= v;
--- a/multiply_tb.vhdl
+++ b/multiply_tb.vhdl
@ -17,8 +17,8 @@ architecture behave of multiply_tb is
    constant pipeline_depth : integer := 4;
-    signal m1               : Decode2ToMultiplyType;
+    signal m1               : Execute1ToMultiplyType;
-    signal m2               : MultiplyToWritebackType;
+    signal m2               : MultiplyToExecute1Type;
 begin
    multiply_0: entity work.multiply
        generic map (PIPELINE_DEPTH => pipeline_depth)
@ -40,10 +40,8 @@ begin
        m1.valid <= '1';
        m1.insn_type <= OP_MUL_L64;
        m1.write_reg <= "10001";
        m1.data1 <= '0' & x"0000000000001000";
        m1.data2 <= '0' & x"0000000000001111";
        m1.rc <= '0';
        wait for clk_period;
        assert m2.valid = '0';
@ -58,16 +56,12 @@ begin
        wait for clk_period;
        assert m2.valid = '1';
        assert m2.write_reg_enable = '1';
        assert m2.write_reg_nr = "10001";
        assert m2.write_reg_data = x"0000000001111000";
        assert m2.rc = '0';
        wait for clk_period;
        assert m2.valid = '0';
        m1.valid <= '1';
        m1.rc <= '1';
        wait for clk_period;
        assert m2.valid = '0';
@ -76,10 +70,7 @@ begin
        wait for clk_period * (pipeline_depth-1);
        assert m2.valid = '1';
        assert m2.write_reg_enable = '1';
        assert m2.write_reg_nr = "10001";
        assert m2.write_reg_data = x"0000000001111000";
        assert m2.rc = '1';
        -- test mulld
        mulld_loop : for i in 0 to 1000 loop
--- a/writeback.vhdl
+++ b/writeback.vhdl
@ -12,8 +12,6 @@ entity writeback is
        e_in         : in Execute1ToWritebackType;
        l_in         : in DcacheToWritebackType;
        m_in         : in MultiplyToWritebackType;
        d_in         : in DividerToWritebackType;
        w_out        : out WritebackToRegisterFileType;
        c_out        : out WritebackToCrFileType;
@ -44,7 +42,6 @@ architecture behaviour of writeback is
    signal sign_extend : std_ulogic;
    signal negative : std_ulogic;
    signal second_word : std_ulogic;
    signal zero : std_ulogic;
 begin
    writeback_0: process(clk)
    begin
@ -64,44 +61,32 @@ begin
        variable k : unsigned(3 downto 0);
 	variable cf: std_ulogic_vector(3 downto 0);
 	variable xe: xer_common_t;
        variable zero : std_ulogic;
        variable sign : std_ulogic;
    begin
        x := "" & e_in.valid;
        y := "" & l_in.valid;
-        z := "" & m_in.valid;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
        w := "" & d_in.valid;
        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure;
        x := "" & e_in.write_enable;
        y := "" & l_in.write_enable;
-        z := "" & m_in.write_reg_enable;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
        w := "" & d_in.write_reg_enable;
        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure;
        w := "" & e_in.write_cr_enable;
        x := "" & (e_in.write_enable and e_in.rc);
-        y := "" & (m_in.valid and m_in.rc);
+        assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure;
        z := "" & (d_in.valid and d_in.rc);
        assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
        x := "" & e_in.write_xerc_enable;
        y := "" & m_in.write_xerc_enable;
        z := "" & D_in.write_xerc_enable;
        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
        w_out <= WritebackToRegisterFileInit;
        c_out <= WritebackToCrFileInit;
        complete_out <= '0';
-        if e_in.valid = '1' or l_in.valid = '1' or m_in.valid = '1' or d_in.valid = '1' then
+        if e_in.valid = '1' or l_in.valid = '1' then
            complete_out <= '1';
        end if;
        rc <= '0';
        brev_lenm1 <= "000";
        byte_offset <= "000";
        data_len <= x"8";
        partial_write <= '0';
        sign_extend <= '0';
        second_word <= '0';
 	xe := e_in.xerc;
 	data_in <= (others => '0');
@ -109,9 +94,6 @@ begin
        if e_in.write_enable = '1' then
            w_out.write_reg <= e_in.write_reg;
            w_out.write_enable <= '1';
 	    data_in <= e_in.write_data;
            data_len <= unsigned(e_in.write_len);
            sign_extend <= e_in.sign_extend;
            rc <= e_in.rc;
        end if;
@ -126,12 +108,11 @@ begin
            c_out.write_xerc_data <= e_in.xerc;
 	end if;
-	if l_in.write_enable = '1' then
+        sign_extend <= l_in.sign_extend;
            w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
            data_in <= l_in.write_data;
        data_len <= unsigned(l_in.write_len);
        byte_offset <= unsigned(l_in.write_shift);
-            sign_extend <= l_in.sign_extend;
+	if l_in.write_enable = '1' then
            w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
            if l_in.byte_reverse = '1' then
                brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1;
            end if;
@ -143,32 +124,6 @@ begin
 	    xe := l_in.xerc;
        end if;
        if m_in.write_reg_enable = '1' then
            w_out.write_enable <= '1';
            w_out.write_reg <= gpr_to_gspr(m_in.write_reg_nr);
            data_in <= m_in.write_reg_data;
            rc <= m_in.rc;
 	    xe := m_in.xerc;
        end if;
 	if m_in.write_xerc_enable = '1' then
            c_out.write_xerc_enable <= '1';
            c_out.write_xerc_data <= m_in.xerc;
 	end if;
        if d_in.write_reg_enable = '1' then
            w_out.write_enable <= '1';
            w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr);
            data_in <= d_in.write_reg_data;
            rc <= d_in.rc;
 	    xe := d_in.xerc;
        end if;
 	if d_in.write_xerc_enable = '1' then
            c_out.write_xerc_enable <= '1';
            c_out.write_xerc_data <= d_in.xerc;
 	end if;
        -- shift and byte-reverse data bytes
        for i in 0 to 7 loop
            k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
@ -177,7 +132,7 @@ begin
        end loop;
        for i in 0 to 7 loop
            j := to_integer(perm(i)) * 8;
-            data_permuted(i * 8 + 7 downto i * 8) <= data_in(j + 7 downto j);
+            data_permuted(i * 8 + 7 downto i * 8) <= l_in.write_data(j + 7 downto j);
        end loop;
        -- If the data can arrive split over two cycles, this will be correct
@ -199,16 +154,12 @@ begin
                trim_ctl(i) <= '0' & (negative and sign_extend);
            end if;
        end loop;
 	zero <= not negative;
        for i in 0 to 7 loop
            case trim_ctl(i) is
                when "11" =>
                    data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8);
                when "10" =>
                    data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8);
 		    if or data_permuted(i * 8 + 7 downto i * 8) /= '0' then
 			zero <= '0';
 		    end if;
                when "01" =>
                    data_trimmed(i * 8 + 7 downto i * 8) <= x"FF";
                when others =>
@ -217,14 +168,21 @@ begin
        end loop;
        -- deliver to regfile
        if l_in.write_enable = '1' then
            w_out.write_data <= data_trimmed;
        else
            w_out.write_data <= e_in.write_data;
        end if;
        -- Perform CR0 update for RC forms
        -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data
        if rc = '1' then
            sign := e_in.write_data(63);
            zero := not (or e_in.write_data);
            c_out.write_cr_enable <= '1';
            c_out.write_cr_mask <= num_to_fxm(0);
-	    cf(3) := negative;
+	    cf(3) := sign;
-	    cf(2) := not negative and not zero;
+	    cf(2) := not sign and not zero;
 	    cf(1) := zero;
 	    cf(0) := xe.so;
 	    c_out.write_cr_data(31 downto 28) <= cf;