From 2167186b5fae691b2a165cc5bfaaa79fe4713733 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 10 Dec 2019 20:52:21 +1100
Subject: [PATCH 01/10] Make multiplier hang off the side of execute1

With this, the multiplier isn't a separate pipe that decode2 issues
instructions to, but rather is a unit that execute1 sends operands
to and which sends the result back to execute1, which then sends it
to writeback.  Execute1 now sends a stall signal when it gets a
multiply instruction until it gets a valid signal back from the
multiplier.

This all means that we no longer need to mark the multiply
instructions as single-issue.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 Makefile          |  4 +--
 common.vhdl       | 19 +++++++------
 core.vhdl         | 14 ----------
 decode1.vhdl      | 26 +++++++++---------
 decode2.vhdl      | 43 +-----------------------------
 decode_types.vhdl |  2 +-
 execute1.vhdl     | 68 ++++++++++++++++++++++++++++++++++++++++++++++-
 multiply.vhdl     |  9 +++----
 multiply_tb.vhdl  |  6 ++---
 writeback.vhdl    | 32 +++++-----------------
 10 files changed, 106 insertions(+), 117 deletions(-)

diff --git a/Makefile b/Makefile
index e2398c0..720e8d5 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,7 @@ common.o: decode_types.o
 control.o: gpr_hazard.o cr_hazard.o common.o
 sim_jtag.o: sim_jtag_socket.o
 core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o
-core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o multiply.o writeback.o core_debug.o divider.o
+core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o divider.o
 core_debug.o: common.o
 countzero.o:
 countzero_tb.o: common.o glibc_random.o countzero.o
@@ -26,7 +26,7 @@ crhelpers.o: common.o
 decode1.o: common.o decode_types.o
 decode2.o: decode_types.o common.o helpers.o insn_helpers.o control.o
 decode_types.o:
-execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o
+execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o
 fetch1.o: common.o
 fetch2.o: common.o wishbone_types.o
 glibc_random_helpers.o:
diff --git a/common.vhdl b/common.vhdl
index a27f4f2..9c18230 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -130,7 +130,7 @@ package common is
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
 	 is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0'));
 
-    type Decode2ToMultiplyType is record
+    type Execute1ToMultiplyType is record
 	valid: std_ulogic;
 	insn_type: insn_type_t;
 	write_reg: gpr_index_t;
@@ -141,9 +141,9 @@ package common is
 	is_32bit: std_ulogic;
 	xerc: xer_common_t;
     end record;
-    constant Decode2ToMultiplyInit : Decode2ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0',
-							       oe => '0', is_32bit => '0', xerc => xerc_init,
-							       others => (others => '0'));
+    constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0',
+								 oe => '0', is_32bit => '0', xerc => xerc_init,
+								 others => (others => '0'));
 
     type Decode2ToDividerType is record
 	valid: std_ulogic;
@@ -261,20 +261,19 @@ package common is
 								   write_xerc_enable => '0', xerc => xerc_init,
 								   others => (others => '0'));
 
-    type MultiplyToWritebackType is record
+    type MultiplyToExecute1Type is record
 	valid: std_ulogic;
 
-	write_reg_enable : std_ulogic;
 	write_reg_nr: gpr_index_t;
 	write_reg_data: std_ulogic_vector(63 downto 0);
 	write_xerc_enable : std_ulogic;
 	xerc : xer_common_t;
 	rc: std_ulogic;
     end record;
-    constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0',
-								   rc => '0', write_xerc_enable => '0',
-								   xerc => xerc_init,
-								   others => (others => '0'));
+    constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0',
+								 rc => '0', write_xerc_enable => '0',
+								 xerc => xerc_init,
+								 others => (others => '0'));
 
     type DividerToWritebackType is record
 	valid: std_ulogic;
diff --git a/core.vhdl b/core.vhdl
index eb0b526..71c10b3 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -63,10 +63,6 @@ architecture behave of core is
     signal loadstore1_to_dcache: Loadstore1ToDcacheType;
     signal dcache_to_writeback: DcacheToWritebackType;
 
-    -- multiply signals
-    signal decode2_to_multiply: Decode2ToMultiplyType;
-    signal multiply_to_writeback: MultiplyToWritebackType;
-
     -- divider signals
     signal decode2_to_divider: Decode2ToDividerType;
     signal divider_to_writeback: DividerToWritebackType;
@@ -115,7 +111,6 @@ architecture behave of core is
     attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN);
-    attribute keep_hierarchy of multiply_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of divider_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN);
@@ -197,7 +192,6 @@ begin
             d_in => decode1_to_decode2,
             e_out => decode2_to_execute1,
             l_out => decode2_to_loadstore1,
-            m_out => decode2_to_multiply,
             d_out => decode2_to_divider,
             r_in => register_file_to_decode2,
             r_out => decode2_to_register_file,
@@ -265,13 +259,6 @@ begin
             wishbone_out => wishbone_data_out
             );
 
-    multiply_0: entity work.multiply
-        port map (
-            clk => clk,
-            m_in => decode2_to_multiply,
-            m_out => multiply_to_writeback
-            );
-
     divider_0: entity work.divider
         port map (
             clk => clk,
@@ -285,7 +272,6 @@ begin
             clk => clk,
             e_in => execute1_to_writeback,
             l_in => dcache_to_writeback,
-            m_in => multiply_to_writeback,
             d_in => divider_to_writeback,
             w_out => writeback_to_register_file,
             c_out => writeback_to_cr_file,
diff --git a/decode1.vhdl b/decode1.vhdl
index 51a2643..4e1d063 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -54,7 +54,7 @@ architecture behaviour of decode1 is
 		41 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzu
 		32 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwz
                 33 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzu
-		 7 =>       (MUL,    OP_MUL_L64,   RA,         CONST_SI,    NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '1'), -- mulli
+		 7 =>       (ALU,    OP_MUL_L64,   RA,         CONST_SI,    NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli
 		24 =>       (ALU,    OP_OR,        NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori
 		25 =>       (ALU,    OP_OR,        NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris
 		20 =>       (ALU,    OP_RLC,       RA,         CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- rlwimi
@@ -244,19 +244,19 @@ architecture behaviour of decode1 is
 		2#1100001011#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw
 		2#0010010000#  =>       (ALU,    OP_MTCRF,     NONE,       NONE,        RS,   NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf
 		2#0111010011#  =>       (ALU,    OP_MTSPR,     NONE,       NONE,        RS,   SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr
-		2#0001001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulhd
-		2#0000001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- mulhdu
-		2#0001001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mulhw
-		2#0000001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '1'), -- mulhwu
+		2#0001001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulhd
+		2#0000001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- mulhdu
+		2#0001001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mulhw
+		2#0000001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- mulhwu
                 -- next 4 have reserved bit set
-		2#1001001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulhd
-		2#1000001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- mulhdu
-		2#1001001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mulhw
-		2#1000001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '1'), -- mulhwu
-		2#0011101001#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulld
-		2#1011101001#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulldo
-		2#0011101011#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mullw
-		2#1011101011#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mullwo
+		2#1001001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulhd
+		2#1000001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- mulhdu
+		2#1001001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mulhw
+		2#1000001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- mulhwu
+		2#0011101001#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulld
+		2#1011101001#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulldo
+		2#0011101011#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mullw
+		2#1011101011#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mullwo
 		2#0111011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nand
 		2#0001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- neg
 		2#1001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nego
diff --git a/decode2.vhdl b/decode2.vhdl
index f6f7101..2da5c41 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -24,7 +24,6 @@ entity decode2 is
 		d_in  : in Decode1ToDecode2Type;
 
 		e_out : out Decode2ToExecute1Type;
-		m_out : out Decode2ToMultiplyType;
                 d_out : out Decode2ToDividerType;
 		l_out : out Decode2ToLoadstore1Type;
 
@@ -39,7 +38,6 @@ end entity decode2;
 architecture behaviour of decode2 is
 	type reg_type is record
 		e : Decode2ToExecute1Type;
-		m : Decode2ToMultiplyType;
                 d : Decode2ToDividerType;
 		l : Decode2ToLoadstore1Type;
 	end record;
@@ -238,7 +236,7 @@ begin
 	decode2_0: process(clk)
 	begin
 		if rising_edge(clk) then
-			if rin.e.valid = '1' or rin.l.valid = '1' or rin.m.valid = '1' or rin.d.valid = '1' then
+			if rin.e.valid = '1' or rin.l.valid = '1' or rin.d.valid = '1' then
 				report "execute " & to_hstring(rin.e.nia);
 			end if;
 			r <= rin;
@@ -266,14 +264,12 @@ begin
 
 		v.e := Decode2ToExecute1Init;
 		v.l := Decode2ToLoadStore1Init;
-		v.m := Decode2ToMultiplyInit;
                 v.d := Decode2ToDividerInit;
 
 		mul_a := (others => '0');
 		mul_b := (others => '0');
 
 		--v.e.input_cr := d_in.decode.input_cr;
-		--v.m.input_cr := d_in.decode.input_cr;
 		--v.e.output_cr := d_in.decode.output_cr;
     
 		decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1);
@@ -323,38 +319,6 @@ begin
                 v.e.insn := d_in.insn;
                 v.e.data_len := length;
 
-		-- multiply unit
-		v.m.insn_type := d_in.decode.insn_type;
-		mul_a := decoded_reg_a.data;
-		mul_b := decoded_reg_b.data;
-		v.m.write_reg := gspr_to_gpr(decoded_reg_o.reg);
-		v.m.rc := decode_rc(d_in.decode.rc, d_in.insn);
-		v.m.xerc := c_in.read_xerc_data;
-		if v.m.insn_type = OP_MUL_L64 then
-		  v.m.oe := decode_oe(d_in.decode.rc, d_in.insn);
-		end if;
-		v.m.is_32bit := d_in.decode.is_32bit;
-
-		if d_in.decode.is_32bit = '1' then
-			if d_in.decode.is_signed = '1' then
-				v.m.data1 := (others => mul_a(31));
-				v.m.data1(31 downto 0) := mul_a(31 downto 0);
-				v.m.data2 := (others => mul_b(31));
-				v.m.data2(31 downto 0) := mul_b(31 downto 0);
-			else
-				v.m.data1 := '0' & x"00000000" & mul_a(31 downto 0);
-				v.m.data2 := '0' & x"00000000" & mul_b(31 downto 0);
-			end if;
-		else
-			if d_in.decode.is_signed = '1' then
-				v.m.data1 := mul_a(63) & mul_a;
-				v.m.data2 := mul_b(63) & mul_b;
-			else
-				v.m.data1 := '0' & mul_a;
-				v.m.data2 := '0' & mul_b;
-			end if;
-		end if;
-
                 -- divide unit
                 -- PPC divide and modulus instruction words have these bits in
                 -- the bottom 11 bits: o1dns 010t1 r
@@ -438,7 +402,6 @@ begin
                 cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
 
 		v.e.valid := '0';
-		v.m.valid := '0';
                 v.d.valid := '0';
 		v.l.valid := '0';
 		case d_in.decode.unit is
@@ -446,8 +409,6 @@ begin
 			v.e.valid := control_valid_out;
 		when LDST =>
 			v.l.valid := control_valid_out;
-		when MUL =>
-			v.m.valid := control_valid_out;
                 when DIV =>
                         v.d.valid := control_valid_out;
 		when NONE =>
@@ -458,7 +419,6 @@ begin
 		if rst = '1' then
 			v.e := Decode2ToExecute1Init;
 			v.l := Decode2ToLoadStore1Init;
-			v.m := Decode2ToMultiplyInit;
                         v.d := Decode2ToDividerInit;
 		end if;
 
@@ -468,7 +428,6 @@ begin
 		-- Update outputs
 		e_out <= r.e;
 		l_out <= r.l;
-		m_out <= r.m;
                 d_out <= r.d;
 	end process;
 end architecture behaviour;
diff --git a/decode_types.vhdl b/decode_types.vhdl
index e847fcf..9860406 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -46,7 +46,7 @@ package decode_types is
 
     constant TOO_OFFSET : integer := 0;
 
-    type unit_t is (NONE, ALU, LDST, MUL, DIV);
+    type unit_t is (NONE, ALU, LDST, DIV);
     type length_t is (NONE, is1B, is2B, is4B, is8B);
 
     type decode_rom_t is record
diff --git a/execute1.vhdl b/execute1.vhdl
index 4714ec5..710044f 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -35,6 +35,7 @@ architecture behaviour of execute1 is
 	e : Execute1ToWritebackType;
 	lr_update : std_ulogic;
 	next_lr : std_ulogic_vector(63 downto 0);
+	mul_in_progress : std_ulogic;
     end record;
 
     signal r, rin : reg_type;
@@ -48,6 +49,10 @@ architecture behaviour of execute1 is
     signal logical_result: std_ulogic_vector(63 downto 0);
     signal countzero_result: std_ulogic_vector(63 downto 0);
 
+    -- multiply signals
+    signal x_to_multiply: Execute1ToMultiplyType;
+    signal multiply_to_x: MultiplyToExecute1Type;
+
     procedure set_carry(e: inout Execute1ToWritebackType;
 			carry32 : in std_ulogic;
 			carry : in std_ulogic) is
@@ -123,6 +128,13 @@ begin
 	    result => countzero_result
 	    );
 
+    multiply_0: entity work.multiply
+        port map (
+            clk => clk,
+            m_in => x_to_multiply,
+            m_out => multiply_to_x
+            );
+
     execute1_0: process(clk)
     begin
 	if rising_edge(clk) then
@@ -204,6 +216,38 @@ begin
 	end if;
 
 	v.lr_update := '0';
+	v.mul_in_progress := '0';
+
+	-- signals to multiply unit
+	x_to_multiply <= Execute1ToMultiplyInit;
+	x_to_multiply.insn_type <= e_in.insn_type;
+	x_to_multiply.write_reg <= gspr_to_gpr(e_in.write_reg);
+	x_to_multiply.rc <= e_in.rc;
+	x_to_multiply.xerc <= v.e.xerc;
+	if e_in.insn_type = OP_MUL_L64 then
+	    x_to_multiply.oe <= e_in.oe;
+	end if;
+	x_to_multiply.is_32bit <= e_in.is_32bit;
+
+	if e_in.is_32bit = '1' then
+	    if e_in.is_signed = '1' then
+		x_to_multiply.data1 <= (others => e_in.read_data1(31));
+		x_to_multiply.data1(31 downto 0) <= e_in.read_data1(31 downto 0);
+		x_to_multiply.data2 <= (others => e_in.read_data2(31));
+		x_to_multiply.data2(31 downto 0) <= e_in.read_data2(31 downto 0);
+	    else
+		x_to_multiply.data1 <= '0' & x"00000000" & e_in.read_data1(31 downto 0);
+		x_to_multiply.data2 <= '0' & x"00000000" & e_in.read_data2(31 downto 0);
+	    end if;
+	else
+	    if e_in.is_signed = '1' then
+		x_to_multiply.data1 <= e_in.read_data1(63) & e_in.read_data1;
+		x_to_multiply.data2 <= e_in.read_data2(63) & e_in.read_data2;
+	    else
+		x_to_multiply.data1 <= '0' & e_in.read_data1;
+		x_to_multiply.data2 <= '0' & e_in.read_data2;
+	    end if;
+	end if;
 
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
@@ -506,11 +550,19 @@ begin
 	    when OP_ICBI =>
 		icache_inval <= '1';
 
+		when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 =>
+		v.e.valid := '0';
+		v.mul_in_progress := '1';
+		stall_out <= '1';
+		x_to_multiply.valid <= '1';
+
 	    when others =>
 		terminate_out <= '1';
 		report "illegal";
 	    end case;
 
+	    v.e.rc := e_in.rc and e_in.valid;
+
 	    -- Update LR on the next cycle after a branch link
 	    --
 	    -- WARNING: The LR update isn't tracked by our hazard tracker. This
@@ -536,11 +588,25 @@ begin
 	    v.e.write_len := x"8";
 	    v.e.sign_extend := '0';
 	    v.e.valid := '1';
+	elsif r.mul_in_progress = '1' then
+	    if multiply_to_x.valid = '1' then
+		v.e.write_reg := gpr_to_gspr(multiply_to_x.write_reg_nr);
+		result := multiply_to_x.write_reg_data;
+		result_en := '1';
+		v.e.rc := multiply_to_x.rc;
+		v.e.xerc := multiply_to_x.xerc;
+		v.e.write_xerc_enable := multiply_to_x.write_xerc_enable;
+		v.e.valid := '1';
+		v.e.write_len := x"8";
+		v.e.sign_extend := '0';
+	    else
+		stall_out <= '1';
+		v.mul_in_progress := '1';
+	    end if;
 	end if;
 
 	v.e.write_data := result;
 	v.e.write_enable := result_en;
-	v.e.rc := e_in.rc and e_in.valid;
 
 	-- Update registers
 	rin <= v;
diff --git a/multiply.vhdl b/multiply.vhdl
index 23339b5..714b844 100644
--- a/multiply.vhdl
+++ b/multiply.vhdl
@@ -13,13 +13,13 @@ entity multiply is
     port (
         clk   : in std_logic;
 
-        m_in  : in Decode2ToMultiplyType;
-        m_out : out MultiplyToWritebackType
+        m_in  : in Execute1ToMultiplyType;
+        m_out : out MultiplyToExecute1Type
         );
 end entity multiply;
 
 architecture behaviour of multiply is
-    signal m: Decode2ToMultiplyType;
+    signal m: Execute1ToMultiplyType;
 
     type multiply_pipeline_stage is record
         valid     : std_ulogic;
@@ -64,7 +64,7 @@ begin
     begin
         v := r;
 
-        m_out <= MultiplyToWritebackInit;
+        m_out <= MultiplyToExecute1Init;
 
         v.multiply_pipeline(0).valid := m.valid;
         v.multiply_pipeline(0).insn_type := m.insn_type;
@@ -107,7 +107,6 @@ begin
 	-- Generate OV/OV32/SO when OE=1
         if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then
             m_out.valid <= '1';
-            m_out.write_reg_enable <= '1';
             m_out.rc <= v.multiply_pipeline(PIPELINE_DEPTH-1).rc;
             m_out.write_xerc_enable <= v.multiply_pipeline(PIPELINE_DEPTH-1).oe;
 
diff --git a/multiply_tb.vhdl b/multiply_tb.vhdl
index 48f83ab..a76d739 100644
--- a/multiply_tb.vhdl
+++ b/multiply_tb.vhdl
@@ -17,8 +17,8 @@ architecture behave of multiply_tb is
 
     constant pipeline_depth : integer := 4;
 
-    signal m1               : Decode2ToMultiplyType;
-    signal m2               : MultiplyToWritebackType;
+    signal m1               : Execute1ToMultiplyType;
+    signal m2               : MultiplyToExecute1Type;
 begin
     multiply_0: entity work.multiply
         generic map (PIPELINE_DEPTH => pipeline_depth)
@@ -58,7 +58,6 @@ begin
 
         wait for clk_period;
         assert m2.valid = '1';
-        assert m2.write_reg_enable = '1';
         assert m2.write_reg_nr = "10001";
         assert m2.write_reg_data = x"0000000001111000";
         assert m2.rc = '0';
@@ -76,7 +75,6 @@ begin
 
         wait for clk_period * (pipeline_depth-1);
         assert m2.valid = '1';
-        assert m2.write_reg_enable = '1';
         assert m2.write_reg_nr = "10001";
         assert m2.write_reg_data = x"0000000001111000";
         assert m2.rc = '1';
diff --git a/writeback.vhdl b/writeback.vhdl
index 8582166..1323f71 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -12,7 +12,6 @@ entity writeback is
 
         e_in         : in Execute1ToWritebackType;
         l_in         : in DcacheToWritebackType;
-        m_in         : in MultiplyToWritebackType;
         d_in         : in DividerToWritebackType;
 
         w_out        : out WritebackToRegisterFileType;
@@ -67,32 +66,28 @@ begin
     begin
         x := "" & e_in.valid;
         y := "" & l_in.valid;
-        z := "" & m_in.valid;
-        w := "" & d_in.valid;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure;
+        z := "" & d_in.valid;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
 
         x := "" & e_in.write_enable;
         y := "" & l_in.write_enable;
-        z := "" & m_in.write_reg_enable;
-        w := "" & d_in.write_reg_enable;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure;
+        z := "" & d_in.write_reg_enable;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
 
         w := "" & e_in.write_cr_enable;
         x := "" & (e_in.write_enable and e_in.rc);
-        y := "" & (m_in.valid and m_in.rc);
         z := "" & (d_in.valid and d_in.rc);
-        assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
+        assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(z))) <= 1 severity failure;
 
         x := "" & e_in.write_xerc_enable;
-        y := "" & m_in.write_xerc_enable;
         z := "" & D_in.write_xerc_enable;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(z))) <= 1 severity failure;
 
         w_out <= WritebackToRegisterFileInit;
         c_out <= WritebackToCrFileInit;
 
         complete_out <= '0';
-        if e_in.valid = '1' or l_in.valid = '1' or m_in.valid = '1' or d_in.valid = '1' then
+        if e_in.valid = '1' or l_in.valid = '1' or d_in.valid = '1' then
             complete_out <= '1';
         end if;
 
@@ -143,19 +138,6 @@ begin
 	    xe := l_in.xerc;
         end if;
 
-        if m_in.write_reg_enable = '1' then
-            w_out.write_enable <= '1';
-            w_out.write_reg <= gpr_to_gspr(m_in.write_reg_nr);
-            data_in <= m_in.write_reg_data;
-            rc <= m_in.rc;
-	    xe := m_in.xerc;
-        end if;
-
-	if m_in.write_xerc_enable = '1' then
-            c_out.write_xerc_enable <= '1';
-            c_out.write_xerc_data <= m_in.xerc;
-	end if;
-
         if d_in.write_reg_enable = '1' then
             w_out.write_enable <= '1';
             w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr);

From 39d18d27388ee97ef598e8ee5ce73d30db257b0a Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 12 Dec 2019 08:47:42 +1100
Subject: [PATCH 02/10] Make divider hang off the side of execute1

With this, the divider is a unit that execute1 sends operands to and
which sends its results back to execute1, which then sends them to
writeback.  Execute1 now sends a stall signal when it gets a divide
or modulus instruction until it gets a valid signal back from the
divider.  Divide and modulus instructions are no longer marked as
single-issue.

The data formatting step that used to be done in decode2 for div
and mod instructions is now done in execute1.  We also do the
absolute value operation in that same cycle instead of taking an
extra cycle inside the divider for signed operations with a
negative operand.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 Makefile          |  4 +-
 common.vhdl       | 22 +++++------
 core.vhdl         | 16 +-------
 decode1.vhdl      | 40 ++++++++++----------
 decode2.vhdl      | 56 +---------------------------
 decode_types.vhdl |  4 +-
 divider.vhdl      | 25 ++-----------
 divider_tb.vhdl   | 43 +++++++++++++--------
 execute1.vhdl     | 95 ++++++++++++++++++++++++++++++++++++++++++++++-
 writeback.vhdl    | 29 ++-------------
 10 files changed, 165 insertions(+), 169 deletions(-)

diff --git a/Makefile b/Makefile
index 720e8d5..939f48e 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,7 @@ common.o: decode_types.o
 control.o: gpr_hazard.o cr_hazard.o common.o
 sim_jtag.o: sim_jtag_socket.o
 core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o
-core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o divider.o
+core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o
 core_debug.o: common.o
 countzero.o:
 countzero_tb.o: common.o glibc_random.o countzero.o
@@ -26,7 +26,7 @@ crhelpers.o: common.o
 decode1.o: common.o decode_types.o
 decode2.o: decode_types.o common.o helpers.o insn_helpers.o control.o
 decode_types.o:
-execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o
+execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o divider.o
 fetch1.o: common.o
 fetch2.o: common.o wishbone_types.o
 glibc_random_helpers.o:
diff --git a/common.vhdl b/common.vhdl
index 9c18230..1d0bbac 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -145,7 +145,7 @@ package common is
 								 oe => '0', is_32bit => '0', xerc => xerc_init,
 								 others => (others => '0'));
 
-    type Decode2ToDividerType is record
+    type Execute1ToDividerType is record
 	valid: std_ulogic;
 	write_reg: gpr_index_t;
 	dividend: std_ulogic_vector(63 downto 0);
@@ -154,14 +154,15 @@ package common is
 	is_32bit: std_ulogic;
 	is_extended: std_ulogic;
 	is_modulus: std_ulogic;
+        neg_result: std_ulogic;
 	rc: std_ulogic;
 	oe: std_ulogic;
 	xerc: xer_common_t;
     end record;
-    constant Decode2ToDividerInit: Decode2ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0',
-							    is_extended => '0', is_modulus => '0',
-							    rc => '0', oe => '0', xerc => xerc_init,
-							    others => (others => '0'));
+    constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0',
+                                                              is_extended => '0', is_modulus => '0',
+                                                              rc => '0', oe => '0', xerc => xerc_init,
+                                                              neg_result => '0', others => (others => '0'));
 
     type Decode2ToRegisterFileType is record
 	read1_enable : std_ulogic;
@@ -275,20 +276,19 @@ package common is
 								 xerc => xerc_init,
 								 others => (others => '0'));
 
-    type DividerToWritebackType is record
+    type DividerToExecute1Type is record
 	valid: std_ulogic;
 
-	write_reg_enable : std_ulogic;
 	write_reg_nr: gpr_index_t;
 	write_reg_data: std_ulogic_vector(63 downto 0);
 	write_xerc_enable : std_ulogic;
 	xerc : xer_common_t;
 	rc: std_ulogic;
     end record;
-    constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0',
-								 rc => '0', write_xerc_enable => '0',
-								 xerc => xerc_init,
-								 others => (others => '0'));
+    constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0',
+                                                               rc => '0', write_xerc_enable => '0',
+                                                               xerc => xerc_init,
+                                                               others => (others => '0'));
 
     type WritebackToRegisterFileType is record
 	write_reg : gspr_index_t;
diff --git a/core.vhdl b/core.vhdl
index 71c10b3..a38cf36 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -63,10 +63,6 @@ architecture behave of core is
     signal loadstore1_to_dcache: Loadstore1ToDcacheType;
     signal dcache_to_writeback: DcacheToWritebackType;
 
-    -- divider signals
-    signal decode2_to_divider: Decode2ToDividerType;
-    signal divider_to_writeback: DividerToWritebackType;
-
     -- local signals
     signal fetch1_stall_in : std_ulogic;
     signal icache_stall_out : std_ulogic;
@@ -111,7 +107,6 @@ architecture behave of core is
     attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN);
-    attribute keep_hierarchy of divider_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN);
     attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN);
@@ -192,7 +187,6 @@ begin
             d_in => decode1_to_decode2,
             e_out => decode2_to_execute1,
             l_out => decode2_to_loadstore1,
-            d_out => decode2_to_divider,
             r_in => register_file_to_decode2,
             r_out => decode2_to_register_file,
             c_in => cr_file_to_decode2,
@@ -228,6 +222,7 @@ begin
     execute1_0: entity work.execute1
         port map (
             clk => clk,
+            rst => core_rst,
             flush_out => flush,
 	    stall_out => ex1_stall_out,
             e_in => decode2_to_execute1,
@@ -259,20 +254,11 @@ begin
             wishbone_out => wishbone_data_out
             );
 
-    divider_0: entity work.divider
-        port map (
-            clk => clk,
-            rst => core_rst,
-            d_in => decode2_to_divider,
-            d_out => divider_to_writeback
-            );
-
     writeback_0: entity work.writeback
         port map (
             clk => clk,
             e_in => execute1_to_writeback,
             l_in => dcache_to_writeback,
-            d_in => divider_to_writeback,
             w_out => writeback_to_register_file,
             c_out => writeback_to_cr_file,
             complete_out => complete
diff --git a/decode1.vhdl b/decode1.vhdl
index 4e1d063..6ac3f01 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -160,22 +160,22 @@ architecture behaviour of decode1 is
 		2#0100010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt
 		2#0011110110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst
 		-- 2#1111110110# dcbz
-		2#0110001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeu
-		2#1110001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeuo
-		2#0110001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweu
-		2#1110001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweuo
-		2#0110101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divde
-		2#1110101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeo
-		2#0110101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwe
-		2#1110101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweo
-		2#0111001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdu
-		2#1111001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divduo
-		2#0111001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwu
-		2#1111001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwuo
-		2#0111101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divd
-		2#1111101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdo
-		2#0111101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divw
-		2#1111101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwo
+		2#0110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeu
+		2#1110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeuo
+		2#0110001011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divweu
+		2#1110001011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divweuo
+		2#0110101001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divde
+		2#1110101001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divdeo
+		2#0110101011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divwe
+		2#1110101011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divweo
+		2#0111001001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdu
+		2#1111001001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divduo
+		2#0111001011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divwu
+		2#1111001011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divwuo
+		2#0111101001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divd
+		2#1111101001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divdo
+		2#0111101011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divw
+		2#1111101011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divwo
 		2#0100011100#  =>       (ALU,    OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- eqv
 		2#1110111010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsb
 		2#1110011010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsh
@@ -238,10 +238,10 @@ architecture behaviour of decode1 is
 		-- 2#1001000000# mcrxrx
 		2#0000010011#  =>       (ALU,    OP_MFCR,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf
 		2#0101010011#  =>       (ALU,    OP_MFSPR,     SPR,        NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr
-		2#0100001001#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modud
-		2#0100001011#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- moduw
-		2#1100001001#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsd
-		2#1100001011#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw
+		2#0100001001#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- modud
+		2#0100001011#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- moduw
+		2#1100001001#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- modsd
+		2#1100001011#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0'), -- modsw
 		2#0010010000#  =>       (ALU,    OP_MTCRF,     NONE,       NONE,        RS,   NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf
 		2#0111010011#  =>       (ALU,    OP_MTSPR,     NONE,       NONE,        RS,   SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr
 		2#0001001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulhd
diff --git a/decode2.vhdl b/decode2.vhdl
index 2da5c41..a95dae3 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -24,7 +24,6 @@ entity decode2 is
 		d_in  : in Decode1ToDecode2Type;
 
 		e_out : out Decode2ToExecute1Type;
-                d_out : out Decode2ToDividerType;
 		l_out : out Decode2ToLoadstore1Type;
 
 		r_in  : in RegisterFileToDecode2Type;
@@ -38,7 +37,6 @@ end entity decode2;
 architecture behaviour of decode2 is
 	type reg_type is record
 		e : Decode2ToExecute1Type;
-                d : Decode2ToDividerType;
 		l : Decode2ToLoadstore1Type;
 	end record;
 
@@ -236,7 +234,7 @@ begin
 	decode2_0: process(clk)
 	begin
 		if rising_edge(clk) then
-			if rin.e.valid = '1' or rin.l.valid = '1' or rin.d.valid = '1' then
+			if rin.e.valid = '1' or rin.l.valid = '1' then
 				report "execute " & to_hstring(rin.e.nia);
 			end if;
 			r <= rin;
@@ -257,14 +255,12 @@ begin
 		variable decoded_reg_b : decode_input_reg_t;
 		variable decoded_reg_c : decode_input_reg_t;
 		variable decoded_reg_o : decode_output_reg_t;
-                variable signed_division: std_ulogic;
                 variable length : std_ulogic_vector(3 downto 0);
 	begin
 		v := r;
 
 		v.e := Decode2ToExecute1Init;
 		v.l := Decode2ToLoadStore1Init;
-                v.d := Decode2ToDividerInit;
 
 		mul_a := (others => '0');
 		mul_b := (others => '0');
@@ -319,51 +315,6 @@ begin
                 v.e.insn := d_in.insn;
                 v.e.data_len := length;
 
-                -- divide unit
-                -- PPC divide and modulus instruction words have these bits in
-                -- the bottom 11 bits: o1dns 010t1 r
-                -- where o = OE for div instrs, signedness for mod instrs
-                --       d = 1 for div*, 0 for mod*
-                --       n = 1 for normal, 0 for extended (dividend << 32/64)
-                --       s = 1 for signed, 0 for unsigned (for div*)
-                --       t = 1 for 32-bit, 0 for 64-bit
-                --       r = RC bit (record condition code)
-		v.d.write_reg := gspr_to_gpr(decoded_reg_o.reg);
-                v.d.is_modulus := not d_in.insn(8);
-                v.d.is_32bit := d_in.insn(2);
-                if d_in.insn(8) = '1' then
-                        signed_division := d_in.insn(6);
-                else
-                        signed_division := d_in.insn(10);
-                end if;
-                v.d.is_signed := signed_division;
-                if d_in.insn(2) = '0' then
-                        -- 64-bit forms
-                        if d_in.insn(8) = '1' and d_in.insn(7) = '0' then
-                                v.d.is_extended := '1';
-                        end if;
-                        v.d.dividend := decoded_reg_a.data;
-                        v.d.divisor := decoded_reg_b.data;
-                else
-                        -- 32-bit forms
-                        if d_in.insn(8) = '1' and d_in.insn(7) = '0' then   -- extended forms
-                                v.d.dividend := decoded_reg_a.data(31 downto 0) & x"00000000";
-                        elsif signed_division = '1' and decoded_reg_a.data(31) = '1' then
-                                -- sign extend to 64 bits
-                                v.d.dividend := x"ffffffff" & decoded_reg_a.data(31 downto 0);
-                        else
-                                v.d.dividend := x"00000000" & decoded_reg_a.data(31 downto 0);
-                        end if;
-                        if signed_division = '1' and decoded_reg_b.data(31) = '1' then
-                                v.d.divisor := x"ffffffff" & decoded_reg_b.data(31 downto 0);
-                        else
-                                v.d.divisor := x"00000000" & decoded_reg_b.data(31 downto 0);
-                        end if;
-                end if;
-                v.d.rc := decode_rc(d_in.decode.rc, d_in.insn);
-		v.d.xerc := c_in.read_xerc_data;
-		v.d.oe := decode_oe(d_in.decode.rc, d_in.insn);
-
 		-- load/store unit
 		v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg);
 		v.l.addr1 := decoded_reg_a.data;
@@ -402,15 +353,12 @@ begin
                 cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
 
 		v.e.valid := '0';
-                v.d.valid := '0';
 		v.l.valid := '0';
 		case d_in.decode.unit is
 		when ALU =>
 			v.e.valid := control_valid_out;
 		when LDST =>
 			v.l.valid := control_valid_out;
-                when DIV =>
-                        v.d.valid := control_valid_out;
 		when NONE =>
 			v.e.valid := control_valid_out;
 			v.e.insn_type := OP_ILLEGAL;
@@ -419,7 +367,6 @@ begin
 		if rst = '1' then
 			v.e := Decode2ToExecute1Init;
 			v.l := Decode2ToLoadStore1Init;
-                        v.d := Decode2ToDividerInit;
 		end if;
 
 		-- Update registers
@@ -428,6 +375,5 @@ begin
 		-- Update outputs
 		e_out <= r.e;
 		l_out <= r.l;
-                d_out <= r.d;
 	end process;
 end architecture behaviour;
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 9860406..fdc1e6e 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -8,7 +8,7 @@ package decode_types is
 			 OP_CNTZ, OP_CRAND,
 			 OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC,
 			 OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
-			 OP_DCBZ, OP_DIV, OP_EXTS,
+			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS,
 			 OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
 			 OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, OP_MCRF,
 			 OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFSPR, OP_MOD,
@@ -46,7 +46,7 @@ package decode_types is
 
     constant TOO_OFFSET : integer := 0;
 
-    type unit_t is (NONE, ALU, LDST, DIV);
+    type unit_t is (NONE, ALU, LDST);
     type length_t is (NONE, is1B, is2B, is4B, is8B);
 
     type decode_rom_t is record
diff --git a/divider.vhdl b/divider.vhdl
index affab85..33d2a0d 100644
--- a/divider.vhdl
+++ b/divider.vhdl
@@ -10,8 +10,8 @@ entity divider is
     port (
         clk   : in std_logic;
         rst   : in std_logic;
-        d_in  : in Decode2ToDividerType;
-        d_out : out DividerToWritebackType
+        d_in  : in Execute1ToDividerType;
+        d_out : out DividerToExecute1Type
         );
 end entity divider;
 
@@ -23,7 +23,6 @@ architecture behaviour of divider is
     signal sresult    : std_ulogic_vector(64 downto 0);
     signal oresult    : std_ulogic_vector(63 downto 0);
     signal running    : std_ulogic;
-    signal signcheck  : std_ulogic;
     signal count      : unsigned(6 downto 0);
     signal neg_result : std_ulogic;
     signal is_modulus : std_ulogic;
@@ -48,7 +47,7 @@ begin
                 running <= '0';
                 count <= "0000000";
             elsif d_in.valid = '1' then
-                if d_in.is_extended = '1' and not (d_in.is_signed = '1' and d_in.dividend(63) = '1') then
+                if d_in.is_extended = '1'  then
                     dend <= '0' & d_in.dividend & x"0000000000000000";
                 else
                     dend <= '0' & x"0000000000000000" & d_in.dividend;
@@ -56,7 +55,7 @@ begin
                 div <= unsigned(d_in.divisor);
                 quot <= (others => '0');
                 write_reg <= d_in.write_reg;
-                neg_result <= '0';
+                neg_result <= d_in.neg_result;
                 is_modulus <= d_in.is_modulus;
                 extended <= d_in.is_extended;
                 is_32bit <= d_in.is_32bit;
@@ -68,20 +67,6 @@ begin
                 running <= '1';
                 overflow <= '0';
                 ovf32 <= '0';
-                signcheck <= d_in.is_signed and (d_in.dividend(63) or d_in.divisor(63));
-            elsif signcheck = '1' then
-                signcheck <= '0';
-                neg_result <= dend(63) xor (div(63) and not is_modulus);
-                if dend(63) = '1' then
-                    if extended = '1' then
-                        dend <= '0' & std_ulogic_vector(- signed(dend(63 downto 0))) & x"0000000000000000";
-                    else
-                        dend <= '0' & x"0000000000000000" & std_ulogic_vector(- signed(dend(63 downto 0)));
-                    end if;
-                end if;
-                if div(63) = '1' then
-                    div <= unsigned(- signed(div));
-                end if;
             elsif running = '1' then
                 if count = "0111111" then
                     running <= '0';
@@ -151,12 +136,10 @@ begin
         if rising_edge(clk) then
 	    d_out.valid <= '0';
             d_out.write_reg_data <= oresult;
-	    d_out.write_reg_enable <= '0';
 	    d_out.write_xerc_enable <= '0';
 	    d_out.xerc <= xerc;
             if count = "1000000" then
                 d_out.valid <= '1';
-                d_out.write_reg_enable <= '1';
 		d_out.write_xerc_enable <= oe;
 
 		-- We must test oe because the RC update code in writeback
diff --git a/divider_tb.vhdl b/divider_tb.vhdl
index 5f809bb..8151315 100644
--- a/divider_tb.vhdl
+++ b/divider_tb.vhdl
@@ -16,8 +16,8 @@ architecture behave of divider_tb is
     signal rst              : std_ulogic;
     constant clk_period     : time := 10 ns;
 
-    signal d1               : Decode2ToDividerType;
-    signal d2               : DividerToWritebackType;
+    signal d1               : Execute1ToDividerType;
+    signal d2               : DividerToExecute1Type;
 begin
     divider_0: entity work.divider
         port map (clk => clk, rst => rst, d_in => d1, d_out => d2);
@@ -50,6 +50,7 @@ begin
         d1.is_32bit <= '0';
         d1.is_extended <= '0';
         d1.is_modulus <= '0';
+        d1.neg_result <= '0';
         d1.rc <= '0';
 
         wait for clk_period;
@@ -65,7 +66,6 @@ begin
         end loop;
 
         assert d2.valid = '1';
-        assert d2.write_reg_enable = '1';
         assert d2.write_reg_nr = "10001";
         assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
         assert d2.rc = '0';
@@ -89,7 +89,6 @@ begin
         end loop;
 
         assert d2.valid = '1';
-        assert d2.write_reg_enable = '1';
         assert d2.write_reg_nr = "10001";
         assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
         assert d2.rc = '1';
@@ -105,9 +104,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63) xor rb(63);
                     d1.valid <= '1';
 
                     wait for clk_period;
@@ -142,6 +142,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.valid <= '1';
 
                     wait for clk_period;
@@ -173,9 +174,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63) xor rb(63);
                     d1.is_extended <= '1';
                     d1.valid <= '1';
 
@@ -216,6 +218,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.is_extended <= '1';
                     d1.valid <= '1';
 
@@ -250,9 +253,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63) xor rb(63);
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.valid <= '1';
@@ -289,6 +293,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.valid <= '1';
@@ -322,9 +327,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 32)) & x"00000000";
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63) xor rb(63);
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.valid <= '1';
@@ -365,6 +371,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.valid <= '1';
@@ -398,9 +405,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63);
                     d1.is_extended <= '0';
                     d1.is_32bit <= '0';
                     d1.is_modulus <= '1';
@@ -438,6 +446,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.is_extended <= '0';
                     d1.is_32bit <= '0';
                     d1.is_modulus <= '1';
@@ -472,9 +481,10 @@ begin
                     ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
                     rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
 
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
+                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
                     d1.is_signed <= '1';
+                    d1.neg_result <= ra(63);
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.is_modulus <= '1';
@@ -517,6 +527,7 @@ begin
                     d1.dividend <= ra;
                     d1.divisor <= rb;
                     d1.is_signed <= '0';
+                    d1.neg_result <= '0';
                     d1.is_extended <= '0';
                     d1.is_32bit <= '1';
                     d1.is_modulus <= '1';
diff --git a/execute1.vhdl b/execute1.vhdl
index 710044f..7bcffdc 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -13,6 +13,7 @@ use work.ppc_fx_insns.all;
 entity execute1 is
     port (
 	clk   : in std_ulogic;
+        rst   : in std_ulogic;
 
 	-- asynchronous
 	flush_out : out std_ulogic;
@@ -36,6 +37,7 @@ architecture behaviour of execute1 is
 	lr_update : std_ulogic;
 	next_lr : std_ulogic_vector(63 downto 0);
 	mul_in_progress : std_ulogic;
+        div_in_progress : std_ulogic;
     end record;
 
     signal r, rin : reg_type;
@@ -53,6 +55,10 @@ architecture behaviour of execute1 is
     signal x_to_multiply: Execute1ToMultiplyType;
     signal multiply_to_x: MultiplyToExecute1Type;
 
+    -- divider signals
+    signal x_to_divider: Execute1ToDividerType;
+    signal divider_to_x: DividerToExecute1Type;
+
     procedure set_carry(e: inout Execute1ToWritebackType;
 			carry32 : in std_ulogic;
 			carry : in std_ulogic) is
@@ -135,6 +141,14 @@ begin
             m_out => multiply_to_x
             );
 
+    divider_0: entity work.divider
+        port map (
+            clk => clk,
+            rst => rst,
+            d_in => x_to_divider,
+            d_out => divider_to_x
+            );
+
     execute1_0: process(clk)
     begin
 	if rising_edge(clk) then
@@ -171,6 +185,8 @@ begin
 	variable l : std_ulogic;
 	variable next_nia : std_ulogic_vector(63 downto 0);
         variable carry_32, carry_64 : std_ulogic;
+        variable sign1, sign2 : std_ulogic;
+        variable abs1, abs2 : signed(63 downto 0);
     begin
 	result := (others => '0');
 	result_with_carry := (others => '0');
@@ -217,6 +233,7 @@ begin
 
 	v.lr_update := '0';
 	v.mul_in_progress := '0';
+        v.div_in_progress := '0';
 
 	-- signals to multiply unit
 	x_to_multiply <= Execute1ToMultiplyInit;
@@ -249,6 +266,69 @@ begin
 	    end if;
 	end if;
 
+        -- signals to divide unit
+        -- The divider is handed operand magnitudes plus a neg_result flag,
+        -- so the sign handling for signed forms is done here.
+        sign1 := '0';
+        sign2 := '0';
+        if e_in.is_signed = '1' then
+            if e_in.is_32bit = '1' then
+                sign1 := e_in.read_data1(31);
+                sign2 := e_in.read_data2(31);
+            else
+                sign1 := e_in.read_data1(63);
+                sign2 := e_in.read_data2(63);
+            end if;
+        end if;
+        -- take absolute values
+        if sign1 = '0' then
+            abs1 := signed(e_in.read_data1);
+        else
+            abs1 := - signed(e_in.read_data1);
+        end if;
+        if sign2 = '0' then
+            abs2 := signed(e_in.read_data2);
+        else
+            abs2 := - signed(e_in.read_data2);
+        end if;
+
+        x_to_divider <= Execute1ToDividerInit;
+	x_to_divider.write_reg <= gspr_to_gpr(e_in.write_reg);
+        x_to_divider.is_signed <= e_in.is_signed;
+	x_to_divider.is_32bit <= e_in.is_32bit;
+        -- A quotient is negated when the operand signs differ; a remainder
+        -- takes the sign of the dividend.  Decide this from the opcode rather
+        -- than by reading x_to_divider.is_modulus back: a signal assigned in
+        -- this process still returns its previous value when read here.
+        if e_in.insn_type = OP_MOD then
+            x_to_divider.is_modulus <= '1';
+            x_to_divider.neg_result <= sign1;
+        else
+            x_to_divider.neg_result <= sign1 xor sign2;
+        end if;
+	x_to_divider.rc <= e_in.rc;
+	x_to_divider.oe <= e_in.oe;
+	x_to_divider.xerc <= v.e.xerc;
+        if e_in.is_32bit = '0' then
+            -- 64-bit forms
+            if e_in.insn_type = OP_DIVE then
+                x_to_divider.is_extended <= '1';
+            end if;
+            x_to_divider.dividend <= std_ulogic_vector(abs1);
+            x_to_divider.divisor <= std_ulogic_vector(abs2);
+        else
+            -- 32-bit forms
+            -- The 32-bit extended dividend is shifted left by 32 bits right
+            -- here, so is_extended is left clear for the divider.
+            x_to_divider.is_extended <= '0';
+            if e_in.insn_type = OP_DIVE then   -- extended forms
+                x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
+            else
+                x_to_divider.dividend <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
+            end if;
+            x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
+        end if;
+
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
 	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
@@ -550,13 +620,19 @@ begin
 	    when OP_ICBI =>
 		icache_inval <= '1';
 
-		when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 =>
+	    when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 =>
 		v.e.valid := '0';
 		v.mul_in_progress := '1';
 		stall_out <= '1';
 		x_to_multiply.valid <= '1';
 
-	    when others =>
+	    when OP_DIV | OP_DIVE | OP_MOD =>
+		v.e.valid := '0';
+		v.div_in_progress := '1';
+		stall_out <= '1';
+		x_to_divider.valid <= '1';
+
+            when others =>
 		terminate_out <= '1';
 		report "illegal";
 	    end case;
@@ -603,6 +679,21 @@ begin
 		stall_out <= '1';
 		v.mul_in_progress := '1';
 	    end if;
+        elsif r.div_in_progress = '1' then
+            if divider_to_x.valid = '1' then
+                v.e.write_reg := gpr_to_gspr(divider_to_x.write_reg_nr);
+                result := divider_to_x.write_reg_data;
+                result_en := '1';
+                v.e.rc := divider_to_x.rc;
+                v.e.xerc := divider_to_x.xerc;
+                v.e.write_xerc_enable := divider_to_x.write_xerc_enable;
+                v.e.valid := '1';
+                v.e.write_len := x"8";
+		v.e.sign_extend := '0';
+	    else
+		stall_out <= '1';
+		v.div_in_progress := '1';
+	    end if;
 	end if;
 
 	v.e.write_data := result;
diff --git a/writeback.vhdl b/writeback.vhdl
index 1323f71..08efe91 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -12,7 +12,6 @@ entity writeback is
 
         e_in         : in Execute1ToWritebackType;
         l_in         : in DcacheToWritebackType;
-        d_in         : in DividerToWritebackType;
 
         w_out        : out WritebackToRegisterFileType;
         c_out        : out WritebackToCrFileType;
@@ -66,28 +65,21 @@ begin
     begin
         x := "" & e_in.valid;
         y := "" & l_in.valid;
-        z := "" & d_in.valid;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
 
         x := "" & e_in.write_enable;
         y := "" & l_in.write_enable;
-        z := "" & d_in.write_reg_enable;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
 
         w := "" & e_in.write_cr_enable;
         x := "" & (e_in.write_enable and e_in.rc);
-        z := "" & (d_in.valid and d_in.rc);
-        assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(z))) <= 1 severity failure;
-
-        x := "" & e_in.write_xerc_enable;
-        z := "" & D_in.write_xerc_enable;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(z))) <= 1 severity failure;
+        assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure;
 
         w_out <= WritebackToRegisterFileInit;
         c_out <= WritebackToCrFileInit;
 
         complete_out <= '0';
-        if e_in.valid = '1' or l_in.valid = '1' or d_in.valid = '1' then
+        if e_in.valid = '1' or l_in.valid = '1' then
             complete_out <= '1';
         end if;
 
@@ -138,19 +130,6 @@ begin
 	    xe := l_in.xerc;
         end if;
 
-        if d_in.write_reg_enable = '1' then
-            w_out.write_enable <= '1';
-            w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr);
-            data_in <= d_in.write_reg_data;
-            rc <= d_in.rc;
-	    xe := d_in.xerc;
-        end if;
-
-	if d_in.write_xerc_enable = '1' then
-            c_out.write_xerc_enable <= '1';
-            c_out.write_xerc_data <= d_in.xerc;
-	end if;
-
         -- shift and byte-reverse data bytes
         for i in 0 to 7 loop
             k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);

From c9a2076dd3c2e26b3d9ddef72cc6e471c503b7d2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 12 Dec 2019 11:21:25 +1100
Subject: [PATCH 03/10] execute1: Remember dest GPR, RC, OE, XER for slow
 operations

For multiply and divide operations, execute1 now records the
destination GPR number, RC and OE from the instruction, and the
XER value.  This means that the multiply and divide units don't
need to record those values and then send them back to execute1.
This makes the interface to those units a bit simpler.  They
simply report an overflow signal along with the result value, and
execute1 takes care of updating XER (OV, OV32 and SO) when OE is set.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl      | 33 +++++-------------------
 decode2.vhdl     |  4 ++-
 divider.vhdl     | 25 +-----------------
 divider_tb.vhdl  |  7 -----
 execute1.vhdl    | 66 ++++++++++++++++++++++++------------------------
 multiply.vhdl    | 29 ++-------------------
 multiply_tb.vhdl |  7 -----
 7 files changed, 45 insertions(+), 126 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 1d0bbac..639f0f7 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -133,21 +133,16 @@ package common is
     type Execute1ToMultiplyType is record
 	valid: std_ulogic;
 	insn_type: insn_type_t;
-	write_reg: gpr_index_t;
 	data1: std_ulogic_vector(64 downto 0);
 	data2: std_ulogic_vector(64 downto 0);
-	rc: std_ulogic;
-	oe: std_ulogic;
 	is_32bit: std_ulogic;
-	xerc: xer_common_t;
     end record;
-    constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0',
-								 oe => '0', is_32bit => '0', xerc => xerc_init,
+    constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL,
+								 is_32bit => '0',
 								 others => (others => '0'));
 
     type Execute1ToDividerType is record
 	valid: std_ulogic;
-	write_reg: gpr_index_t;
 	dividend: std_ulogic_vector(63 downto 0);
 	divisor: std_ulogic_vector(63 downto 0);
 	is_signed: std_ulogic;
@@ -155,13 +150,9 @@ package common is
 	is_extended: std_ulogic;
 	is_modulus: std_ulogic;
         neg_result: std_ulogic;
-	rc: std_ulogic;
-	oe: std_ulogic;
-	xerc: xer_common_t;
     end record;
     constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0',
                                                               is_extended => '0', is_modulus => '0',
-                                                              rc => '0', oe => '0', xerc => xerc_init,
                                                               neg_result => '0', others => (others => '0'));
 
     type Decode2ToRegisterFileType is record
@@ -264,30 +255,18 @@ package common is
 
     type MultiplyToExecute1Type is record
 	valid: std_ulogic;
-
-	write_reg_nr: gpr_index_t;
 	write_reg_data: std_ulogic_vector(63 downto 0);
-	write_xerc_enable : std_ulogic;
-	xerc : xer_common_t;
-	rc: std_ulogic;
+        overflow : std_ulogic;
     end record;
-    constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0',
-								 rc => '0', write_xerc_enable => '0',
-								 xerc => xerc_init,
+    constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0',
 								 others => (others => '0'));
 
     type DividerToExecute1Type is record
 	valid: std_ulogic;
-
-	write_reg_nr: gpr_index_t;
 	write_reg_data: std_ulogic_vector(63 downto 0);
-	write_xerc_enable : std_ulogic;
-	xerc : xer_common_t;
-	rc: std_ulogic;
+        overflow : std_ulogic;
     end record;
-    constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0',
-                                                               rc => '0', write_xerc_enable => '0',
-                                                               xerc => xerc_init,
+    constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', overflow => '0',
                                                                others => (others => '0'));
 
     type WritebackToRegisterFileType is record
diff --git a/decode2.vhdl b/decode2.vhdl
index a95dae3..6cd4574 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -300,7 +300,9 @@ begin
                 v.e.read_data3 := decoded_reg_c.data;
 		v.e.write_reg := decoded_reg_o.reg;
 		v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
-		v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
+                if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then
+                        v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
+                end if;
 		v.e.cr := c_in.read_cr_data;
 		v.e.xerc := c_in.read_xerc_data;
                 v.e.invert_a := d_in.decode.invert_a;
diff --git a/divider.vhdl b/divider.vhdl
index 33d2a0d..aef65a4 100644
--- a/divider.vhdl
+++ b/divider.vhdl
@@ -29,13 +29,9 @@ architecture behaviour of divider is
     signal is_32bit   : std_ulogic;
     signal extended   : std_ulogic;
     signal is_signed  : std_ulogic;
-    signal rc         : std_ulogic;
-    signal write_reg  : std_ulogic_vector(4 downto 0);
     signal overflow   : std_ulogic;
     signal ovf32      : std_ulogic;
     signal did_ovf    : std_ulogic;
-    signal oe         : std_ulogic;
-    signal xerc       : xer_common_t;
 begin
     divider_0: process(clk)
     begin
@@ -54,15 +50,11 @@ begin
                 end if;
                 div <= unsigned(d_in.divisor);
                 quot <= (others => '0');
-                write_reg <= d_in.write_reg;
                 neg_result <= d_in.neg_result;
                 is_modulus <= d_in.is_modulus;
                 extended <= d_in.is_extended;
                 is_32bit <= d_in.is_32bit;
                 is_signed <= d_in.is_signed;
-                rc <= d_in.rc;
-                oe <= d_in.oe;
-		xerc <= d_in.xerc;
                 count <= "1111111";
                 running <= '1';
                 overflow <= '0';
@@ -98,9 +90,6 @@ begin
 
     divider_1: process(all)
     begin
-        d_out.write_reg_nr <= write_reg;
-        d_out.rc <= rc;
-
         if is_modulus = '1' then
             result <= dend(128 downto 65);
         else
@@ -136,21 +125,9 @@ begin
         if rising_edge(clk) then
 	    d_out.valid <= '0';
             d_out.write_reg_data <= oresult;
-	    d_out.write_xerc_enable <= '0';
-	    d_out.xerc <= xerc;
+	    d_out.overflow <= did_ovf;
             if count = "1000000" then
                 d_out.valid <= '1';
-		d_out.write_xerc_enable <= oe;
-
-		-- We must test oe because the RC update code in writeback
-		-- will use the xerc value to set CR0:SO so we must not clobber
-		-- xerc if OE wasn't set.
-		--
-		if oe = '1' then
-		    d_out.xerc.ov <= did_ovf;
-		    d_out.xerc.ov32 <= did_ovf;
-		    d_out.xerc.so <= xerc.so or did_ovf;
-		end if;
             end if;
         end if;
     end process;
diff --git a/divider_tb.vhdl b/divider_tb.vhdl
index 8151315..95156a3 100644
--- a/divider_tb.vhdl
+++ b/divider_tb.vhdl
@@ -43,7 +43,6 @@ begin
         rst <= '0';
 
         d1.valid <= '1';
-        d1.write_reg <= "10001";
         d1.dividend <= x"0000000010001000";
         d1.divisor  <= x"0000000000001111";
         d1.is_signed <= '0';
@@ -51,7 +50,6 @@ begin
         d1.is_extended <= '0';
         d1.is_modulus <= '0';
         d1.neg_result <= '0';
-        d1.rc <= '0';
 
         wait for clk_period;
         assert d2.valid = '0';
@@ -66,15 +64,12 @@ begin
         end loop;
 
         assert d2.valid = '1';
-        assert d2.write_reg_nr = "10001";
         assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
-        assert d2.rc = '0';
 
         wait for clk_period;
         assert d2.valid = '0' report "valid";
 
         d1.valid <= '1';
-        d1.rc <= '1';
 
         wait for clk_period;
         assert d2.valid = '0' report "valid";
@@ -89,9 +84,7 @@ begin
         end loop;
 
         assert d2.valid = '1';
-        assert d2.write_reg_nr = "10001";
         assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
-        assert d2.rc = '1';
 
         wait for clk_period;
         assert d2.valid = '0';
diff --git a/execute1.vhdl b/execute1.vhdl
index 7bcffdc..94845d8 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -38,6 +38,10 @@ architecture behaviour of execute1 is
 	next_lr : std_ulogic_vector(63 downto 0);
 	mul_in_progress : std_ulogic;
         div_in_progress : std_ulogic;
+	slow_op_dest : gpr_index_t;
+	slow_op_rc : std_ulogic;
+	slow_op_oe : std_ulogic;
+	slow_op_xerc : xer_common_t;
     end record;
 
     signal r, rin : reg_type;
@@ -187,6 +191,7 @@ begin
         variable carry_32, carry_64 : std_ulogic;
         variable sign1, sign2 : std_ulogic;
         variable abs1, abs2 : signed(63 downto 0);
+	variable overflow : std_ulogic;
     begin
 	result := (others => '0');
 	result_with_carry := (others => '0');
@@ -238,12 +243,6 @@ begin
 	-- signals to multiply unit
 	x_to_multiply <= Execute1ToMultiplyInit;
 	x_to_multiply.insn_type <= e_in.insn_type;
-	x_to_multiply.write_reg <= gspr_to_gpr(e_in.write_reg);
-	x_to_multiply.rc <= e_in.rc;
-	x_to_multiply.xerc <= v.e.xerc;
-	if e_in.insn_type = OP_MUL_L64 then
-	    x_to_multiply.oe <= e_in.oe;
-	end if;
 	x_to_multiply.is_32bit <= e_in.is_32bit;
 
 	if e_in.is_32bit = '1' then
@@ -291,16 +290,12 @@ begin
         end if;
 
         x_to_divider <= Execute1ToDividerInit;
-	x_to_divider.write_reg <= gspr_to_gpr(e_in.write_reg);
         x_to_divider.is_signed <= e_in.is_signed;
 	x_to_divider.is_32bit <= e_in.is_32bit;
         if e_in.insn_type = OP_MOD then
             x_to_divider.is_modulus <= '1';
         end if;
         x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
-	x_to_divider.rc <= e_in.rc;
-	x_to_divider.oe <= e_in.oe;
-	x_to_divider.xerc <= v.e.xerc;
         if e_in.is_32bit = '0' then
             -- 64-bit forms
             if e_in.insn_type = OP_DIVE then
@@ -342,6 +337,10 @@ begin
 	    v.e.write_reg := e_in.write_reg;
 	    v.e.write_len := x"8";
 	    v.e.sign_extend := '0';
+	    v.slow_op_dest := gspr_to_gpr(e_in.write_reg);
+	    v.slow_op_rc := e_in.rc;
+	    v.slow_op_oe := e_in.oe;
+	    v.slow_op_xerc := v.e.xerc;
 
 	    case_0: case e_in.insn_type is
 
@@ -664,35 +663,36 @@ begin
 	    v.e.write_len := x"8";
 	    v.e.sign_extend := '0';
 	    v.e.valid := '1';
-	elsif r.mul_in_progress = '1' then
-	    if multiply_to_x.valid = '1' then
-		v.e.write_reg := gpr_to_gspr(multiply_to_x.write_reg_nr);
-		result := multiply_to_x.write_reg_data;
+	elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
+	    if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
+	       (r.div_in_progress = '1' and divider_to_x.valid = '1') then
+		if r.mul_in_progress = '1' then
+		    result := multiply_to_x.write_reg_data;
+		    overflow := multiply_to_x.overflow;
+		else
+		    result := divider_to_x.write_reg_data;
+		    overflow := divider_to_x.overflow;
+		end if;
 		result_en := '1';
-		v.e.rc := multiply_to_x.rc;
-		v.e.xerc := multiply_to_x.xerc;
-		v.e.write_xerc_enable := multiply_to_x.write_xerc_enable;
+		v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
+		v.e.rc := v.slow_op_rc;
+		v.e.xerc := v.slow_op_xerc;
+		v.e.write_xerc_enable := v.slow_op_oe;
+		-- We must test oe because the RC update code in writeback
+		-- will use the xerc value to set CR0:SO so we must not clobber
+		-- xerc if OE wasn't set.
+		if v.slow_op_oe = '1' then
+		    v.e.xerc.ov := overflow;
+		    v.e.xerc.ov32 := overflow;
+		    v.e.xerc.so := v.slow_op_xerc.so or overflow;
+		end if;
 		v.e.valid := '1';
 		v.e.write_len := x"8";
 		v.e.sign_extend := '0';
 	    else
 		stall_out <= '1';
-		v.mul_in_progress := '1';
-	    end if;
-        elsif r.div_in_progress = '1' then
-            if divider_to_x.valid = '1' then
-                v.e.write_reg := gpr_to_gspr(divider_to_x.write_reg_nr);
-                result := divider_to_x.write_reg_data;
-                result_en := '1';
-                v.e.rc := divider_to_x.rc;
-                v.e.xerc := divider_to_x.xerc;
-                v.e.write_xerc_enable := divider_to_x.write_xerc_enable;
-                v.e.valid := '1';
-                v.e.write_len := x"8";
-		v.e.sign_extend := '0';
-	    else
-		stall_out <= '1';
-		v.div_in_progress := '1';
+		v.mul_in_progress := r.mul_in_progress;
+		v.div_in_progress := r.div_in_progress;
 	    end if;
 	end if;
 
diff --git a/multiply.vhdl b/multiply.vhdl
index 714b844..959c114 100644
--- a/multiply.vhdl
+++ b/multiply.vhdl
@@ -25,19 +25,12 @@ architecture behaviour of multiply is
         valid     : std_ulogic;
         insn_type  : insn_type_t;
         data      : signed(129 downto 0);
-        write_reg : std_ulogic_vector(4 downto 0);
-        rc        : std_ulogic;
-	oe        : std_ulogic;
 	is_32bit  : std_ulogic;
-	xerc      : xer_common_t;
     end record;
     constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
 								     insn_type => OP_ILLEGAL,
-								     rc => '0', oe => '0',
 								     is_32bit => '0',
-								     xerc => xerc_init,
-								     data => (others => '0'),
-								     others => (others => '0'));
+								     data => (others => '0'));
 
     type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
     constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit);
@@ -69,11 +62,7 @@ begin
         v.multiply_pipeline(0).valid := m.valid;
         v.multiply_pipeline(0).insn_type := m.insn_type;
         v.multiply_pipeline(0).data := signed(m.data1) * signed(m.data2);
-        v.multiply_pipeline(0).write_reg := m.write_reg;
-        v.multiply_pipeline(0).rc := m.rc;
-        v.multiply_pipeline(0).oe := m.oe;
         v.multiply_pipeline(0).is_32bit := m.is_32bit;
-        v.multiply_pipeline(0).xerc := m.xerc;
 
         loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
             v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
@@ -101,24 +90,10 @@ begin
         end case;
 
         m_out.write_reg_data <= d2;
-        m_out.write_reg_nr <= v.multiply_pipeline(PIPELINE_DEPTH-1).write_reg;
-	m_out.xerc <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc;
+        m_out.overflow <= ov;
 
-	-- Generate OV/OV32/SO when OE=1
         if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then
             m_out.valid <= '1';
-            m_out.rc <= v.multiply_pipeline(PIPELINE_DEPTH-1).rc;
-            m_out.write_xerc_enable <= v.multiply_pipeline(PIPELINE_DEPTH-1).oe;
-
-	    -- We must test oe because the RC update code in writeback
-	    -- will use the xerc value to set CR0:SO so we must not clobber
-	    -- xerc if OE wasn't set.
-	    --
-	    if v.multiply_pipeline(PIPELINE_DEPTH-1).oe = '1' then
-		m_out.xerc.ov <= ov;
-		m_out.xerc.ov32 <= ov;
-		m_out.xerc.so <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc.so or ov;
-	    end if;
         end if;
 
         rin <= v;
diff --git a/multiply_tb.vhdl b/multiply_tb.vhdl
index a76d739..8f1d795 100644
--- a/multiply_tb.vhdl
+++ b/multiply_tb.vhdl
@@ -40,10 +40,8 @@ begin
 
         m1.valid <= '1';
         m1.insn_type <= OP_MUL_L64;
-        m1.write_reg <= "10001";
         m1.data1 <= '0' & x"0000000000001000";
         m1.data2 <= '0' & x"0000000000001111";
-        m1.rc <= '0';
 
         wait for clk_period;
         assert m2.valid = '0';
@@ -58,15 +56,12 @@ begin
 
         wait for clk_period;
         assert m2.valid = '1';
-        assert m2.write_reg_nr = "10001";
         assert m2.write_reg_data = x"0000000001111000";
-        assert m2.rc = '0';
 
         wait for clk_period;
         assert m2.valid = '0';
 
         m1.valid <= '1';
-        m1.rc <= '1';
 
         wait for clk_period;
         assert m2.valid = '0';
@@ -75,9 +70,7 @@ begin
 
         wait for clk_period * (pipeline_depth-1);
         assert m2.valid = '1';
-        assert m2.write_reg_nr = "10001";
         assert m2.write_reg_data = x"0000000001111000";
-        assert m2.rc = '1';
 
         -- test mulld
         mulld_loop : for i in 0 to 1000 loop

From d956846667ef558e51705c0d22152aa912629454 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 12 Dec 2019 15:25:45 +1100
Subject: [PATCH 04/10] execute1: Move EXTS* instruction back into execute1

This moves the sign extension done by the extsb, extsh and extsw
instructions back into execute1.  As a result, results coming from
execute1 no longer need any data formatting in writeback, so the
data formatter inputs in writeback now come directly from the
loadstore unit output.  The condition code updates for RC=1 form
instructions are now done on the value from execute1 rather than on
the output of the data formatter, which should help timing.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl    |  4 +---
 execute1.vhdl  | 24 ++++++++++++++----------
 writeback.vhdl | 35 ++++++++++++++++-------------------
 3 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 639f0f7..8612389 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -240,16 +240,14 @@ package common is
 	write_enable : std_ulogic;
 	write_reg: gspr_index_t;
 	write_data: std_ulogic_vector(63 downto 0);
-	write_len : std_ulogic_vector(3 downto 0);
 	write_cr_enable : std_ulogic;
 	write_cr_mask : std_ulogic_vector(7 downto 0);
 	write_cr_data : std_ulogic_vector(31 downto 0);
 	write_xerc_enable : std_ulogic;
 	xerc : xer_common_t;
-	sign_extend: std_ulogic;
     end record;
     constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', write_enable => '0',
-								   write_cr_enable => '0', sign_extend => '0',
+								   write_cr_enable => '0',
 								   write_xerc_enable => '0', xerc => xerc_init,
 								   others => (others => '0'));
 
diff --git a/execute1.vhdl b/execute1.vhdl
index 94845d8..1991009 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -192,6 +192,7 @@ begin
         variable sign1, sign2 : std_ulogic;
         variable abs1, abs2 : signed(63 downto 0);
 	variable overflow : std_ulogic;
+	variable negative : std_ulogic;
     begin
 	result := (others => '0');
 	result_with_carry := (others => '0');
@@ -335,8 +336,6 @@ begin
 
 	    v.e.valid := '1';
 	    v.e.write_reg := e_in.write_reg;
-	    v.e.write_len := x"8";
-	    v.e.sign_extend := '0';
 	    v.slow_op_dest := gspr_to_gpr(e_in.write_reg);
 	    v.slow_op_rc := e_in.rc;
 	    v.slow_op_oe := e_in.oe;
@@ -438,10 +437,19 @@ begin
 	    when OP_CNTZ =>
 		result := countzero_result;
 		result_en := '1';
-	    when OP_EXTS =>
-		v.e.write_len := e_in.data_len;
-		v.e.sign_extend := '1';
-		result := e_in.read_data3;
+            when OP_EXTS =>
+                -- note data_len is a 1-hot encoding
+		negative := (e_in.data_len(0) and e_in.read_data3(7)) or
+			    (e_in.data_len(1) and e_in.read_data3(15)) or
+			    (e_in.data_len(2) and e_in.read_data3(31));
+		result := (others => negative);
+		if e_in.data_len(2) = '1' then
+		    result(31 downto 16) := e_in.read_data3(31 downto 16);
+		end if;
+		if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then
+		    result(15 downto 8) := e_in.read_data3(15 downto 8);
+		end if;
+		result(7 downto 0) := e_in.read_data3(7 downto 0);
 		result_en := '1';
 	    when OP_ISEL =>
 		crbit := to_integer(unsigned(insn_bc(e_in.insn)));
@@ -660,8 +668,6 @@ begin
 	    result_en := '1';
 	    result := r.next_lr;
 	    v.e.write_reg := fast_spr_num(SPR_LR);
-	    v.e.write_len := x"8";
-	    v.e.sign_extend := '0';
 	    v.e.valid := '1';
 	elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
 	    if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
@@ -687,8 +693,6 @@ begin
 		    v.e.xerc.so := v.slow_op_xerc.so or overflow;
 		end if;
 		v.e.valid := '1';
-		v.e.write_len := x"8";
-		v.e.sign_extend := '0';
 	    else
 		stall_out <= '1';
 		v.mul_in_progress := r.mul_in_progress;
diff --git a/writeback.vhdl b/writeback.vhdl
index 08efe91..e53f46b 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -42,7 +42,6 @@ architecture behaviour of writeback is
     signal sign_extend : std_ulogic;
     signal negative : std_ulogic;
     signal second_word : std_ulogic;
-    signal zero : std_ulogic;
 begin
     writeback_0: process(clk)
     begin
@@ -62,6 +61,8 @@ begin
         variable k : unsigned(3 downto 0);
 	variable cf: std_ulogic_vector(3 downto 0);
 	variable xe: xer_common_t;
+        variable zero : std_ulogic;
+        variable sign : std_ulogic;
     begin
         x := "" & e_in.valid;
         y := "" & l_in.valid;
@@ -85,10 +86,7 @@ begin
 
         rc <= '0';
         brev_lenm1 <= "000";
-        byte_offset <= "000";
-        data_len <= x"8";
         partial_write <= '0';
-        sign_extend <= '0';
         second_word <= '0';
 	xe := e_in.xerc;
 	data_in <= (others => '0');
@@ -96,9 +94,6 @@ begin
         if e_in.write_enable = '1' then
             w_out.write_reg <= e_in.write_reg;
             w_out.write_enable <= '1';
-	    data_in <= e_in.write_data;
-            data_len <= unsigned(e_in.write_len);
-            sign_extend <= e_in.sign_extend;
             rc <= e_in.rc;
         end if;
 
@@ -113,12 +108,11 @@ begin
             c_out.write_xerc_data <= e_in.xerc;
 	end if;
 
+        sign_extend <= l_in.sign_extend;
+        data_len <= unsigned(l_in.write_len);
+        byte_offset <= unsigned(l_in.write_shift);
 	if l_in.write_enable = '1' then
             w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
-            data_in <= l_in.write_data;
-            data_len <= unsigned(l_in.write_len);
-            byte_offset <= unsigned(l_in.write_shift);
-            sign_extend <= l_in.sign_extend;
             if l_in.byte_reverse = '1' then
                 brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1;
             end if;
@@ -138,7 +132,7 @@ begin
         end loop;
         for i in 0 to 7 loop
             j := to_integer(perm(i)) * 8;
-            data_permuted(i * 8 + 7 downto i * 8) <= data_in(j + 7 downto j);
+            data_permuted(i * 8 + 7 downto i * 8) <= l_in.write_data(j + 7 downto j);
         end loop;
 
         -- If the data can arrive split over two cycles, this will be correct
@@ -160,16 +154,12 @@ begin
                 trim_ctl(i) <= '0' & (negative and sign_extend);
             end if;
         end loop;
-	zero <= not negative;
         for i in 0 to 7 loop
             case trim_ctl(i) is
                 when "11" =>
                     data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8);
                 when "10" =>
                     data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8);
-		    if or data_permuted(i * 8 + 7 downto i * 8) /= '0' then
-			zero <= '0';
-		    end if;
                 when "01" =>
                     data_trimmed(i * 8 + 7 downto i * 8) <= x"FF";
                 when others =>
@@ -178,14 +168,21 @@ begin
         end loop;
 
         -- deliver to regfile
-        w_out.write_data <= data_trimmed;
+        if l_in.write_enable = '1' then
+            w_out.write_data <= data_trimmed;
+        else
+            w_out.write_data <= e_in.write_data;
+        end if;
 
         -- Perform CR0 update for RC forms
+        -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data
         if rc = '1' then
+            sign := e_in.write_data(63);
+            zero := not (or e_in.write_data);
             c_out.write_cr_enable <= '1';
             c_out.write_cr_mask <= num_to_fxm(0);
-	    cf(3) := negative;
-	    cf(2) := not negative and not zero;
+	    cf(3) := sign;
+	    cf(2) := not sign and not zero;
 	    cf(1) := zero;
 	    cf(0) := xe.so;
 	    c_out.write_cr_data(31 downto 28) <= cf;

From d2ca625b3b9c98de607b2a56f8428c70ab343891 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 13 Dec 2019 15:48:54 +1100
Subject: [PATCH 05/10] execute: Do comparisons using the main adder

This handles OP_CMP like a subtraction; the main adder computes
~RA + RB + 1, and the condition codes are computed from the results.
A direct comparison of the two input operands is used to calculate the
EQ bit of the condition result.  The LT and GT bits are computed from
the MSB of the subtraction result, the carry out from the subtraction,
and the MSBs of the operands.  For a 32-bit comparison, the 32-bit
carry and bit 31 of the result and input operands are used; for a
64-bit comparison, the 64-bit carry and bit 63 of the operands and
result are used.

It turns out to be more convenient to use the 'signed' field of
the decode table to distinguish signed from unsigned comparisons,
rather than the insn_type.  Therefore this uses OP_CMP for both
cmp and cmpl, which also has the benefit of reducing the number
of values in insn_type_t.

Doing this saves over 200 slice LUTs on the Arty A7-100 and improves
timing slightly as well.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl      |  8 ++---
 decode_types.vhdl |  2 +-
 execute1.vhdl     | 87 ++++++++++++++++++++++++++++++-----------------
 3 files changed, 60 insertions(+), 37 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 6ac3f01..0e42d1b 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -44,8 +44,8 @@ architecture behaviour of decode1 is
 		29 =>       (ALU,    OP_AND,       NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- andis.
 		18 =>       (ALU,    OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b
 		16 =>       (ALU,    OP_BC,        SPR,        CONST_BD,    NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc
-		11 =>       (ALU,    OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpi
-		10 =>       (ALU,    OP_CMPL,      RA,         CONST_UI,    NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
+		11 =>       (ALU,    OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi
+		10 =>       (ALU,    OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
 		34 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz
 		35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzu
 		42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lha
@@ -145,10 +145,10 @@ architecture behaviour of decode1 is
 		2#0000011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- and
 		2#0000111100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- andc
 		-- 2#0011111100# bperm
-		2#0000000000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmp
+		2#0000000000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp
 		2#0111111100#  =>       (ALU,    OP_CMPB,      NONE,       RB,          RS,   RA,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb
 		-- 2#0011100000# cmpeqb
-		2#0000100000#  =>       (ALU,    OP_CMPL,      RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl
+		2#0000100000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl
 		-- 2#0011000000# cmprb
 		2#0000111010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- cntlzd
 		2#0000011010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- cntlzw
diff --git a/decode_types.vhdl b/decode_types.vhdl
index fdc1e6e..82039bd 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -4,7 +4,7 @@ use ieee.std_logic_1164.all;
 package decode_types is
     type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD,
 			 OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG,
-			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB,
+			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
 			 OP_CNTZ, OP_CRAND,
 			 OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC,
 			 OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
diff --git a/execute1.vhdl b/execute1.vhdl
index 1991009..6889a6a 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -193,6 +193,9 @@ begin
         variable abs1, abs2 : signed(63 downto 0);
 	variable overflow : std_ulogic;
 	variable negative : std_ulogic;
+        variable zerohi, zerolo : std_ulogic;
+        variable msb_a, msb_b : std_ulogic;
+        variable a_lt : std_ulogic;
     begin
 	result := (others => '0');
 	result_with_carry := (others => '0');
@@ -348,7 +351,7 @@ begin
 		report "illegal";
 	    when OP_NOP =>
 		-- Do nothing
-	    when OP_ADD =>
+	    when OP_ADD | OP_CMP =>
 		if e_in.invert_a = '0' then
 		    a_inv := e_in.read_data1;
 		else
@@ -359,15 +362,57 @@ begin
 		result := result_with_carry(63 downto 0);
                 carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32);
                 carry_64 := result_with_carry(64);
-		if e_in.output_carry = '1' then
-		    set_carry(v.e, carry_32, carry_64);
-		end if;
-		if e_in.oe = '1' then
-		    set_ov(v.e,
-			   calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)),
-			   calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31)));
-		end if;
-		result_en := '1';
+                if e_in.insn_type = OP_ADD then
+                    if e_in.output_carry = '1' then
+                        set_carry(v.e, carry_32, carry_64);
+                    end if;
+                    if e_in.oe = '1' then
+                        set_ov(v.e,
+                               calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)),
+                               calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31)));
+                    end if;
+                    result_en := '1';
+                else
+                    -- CMP and CMPL instructions
+                    -- Note, we have done RB - RA, not RA - RB
+                    bf := insn_bf(e_in.insn);
+                    l := insn_l(e_in.insn);
+                    v.e.write_cr_enable := '1';
+                    crnum := to_integer(unsigned(bf));
+                    v.e.write_cr_mask := num_to_fxm(crnum);
+                    zerolo := not (or (e_in.read_data1(31 downto 0) xor e_in.read_data2(31 downto 0)));
+                    zerohi := not (or (e_in.read_data1(63 downto 32) xor e_in.read_data2(63 downto 32)));
+                    if zerolo = '1' and (l = '0' or zerohi = '1') then
+                        -- values are equal
+                        newcrf := "001" & v.e.xerc.so;
+                    else
+                        if l = '1' then
+                            -- 64-bit comparison
+                            msb_a := e_in.read_data1(63);
+                            msb_b := e_in.read_data2(63);
+                        else
+                            -- 32-bit comparison
+                            msb_a := e_in.read_data1(31);
+                            msb_b := e_in.read_data2(31);
+                        end if;
+                        if msb_a /= msb_b then
+                            -- Subtraction might overflow, but
+                            -- comparison is clear from MSB difference.
+                            -- for signed, 0 is greater; for unsigned, 1 is greater
+                            a_lt := msb_a xnor e_in.is_signed;
+                        else
+                            -- Subtraction cannot overflow since MSBs are equal.
+                            -- carry = 1 indicates RA is smaller (signed or unsigned)
+                            a_lt := (not l and carry_32) or (l and carry_64);
+                        end if;
+                        newcrf := a_lt & not a_lt & '0' & v.e.xerc.so;
+                    end if;
+                    for i in 0 to 7 loop
+                        lo := i*4;
+                        hi := lo + 3;
+                        v.e.write_cr_data(hi downto lo) := newcrf;
+                    end loop;
+                end if;
 	    when OP_AND | OP_OR | OP_XOR =>
 		result := logical_result;
 		result_en := '1';
@@ -412,28 +457,6 @@ begin
 	    when OP_CMPB =>
 		result := ppc_cmpb(e_in.read_data3, e_in.read_data2);
 		result_en := '1';
-	    when OP_CMP =>
-		bf := insn_bf(e_in.insn);
-		l := insn_l(e_in.insn);
-		v.e.write_cr_enable := '1';
-		crnum := to_integer(unsigned(bf));
-		v.e.write_cr_mask := num_to_fxm(crnum);
-		for i in 0 to 7 loop
-		    lo := i*4;
-		    hi := lo + 3;
-		    v.e.write_cr_data(hi downto lo) := ppc_cmp(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so);
-		end loop;
-	    when OP_CMPL =>
-		bf := insn_bf(e_in.insn);
-		l := insn_l(e_in.insn);
-		v.e.write_cr_enable := '1';
-		crnum := to_integer(unsigned(bf));
-		v.e.write_cr_mask := num_to_fxm(crnum);
-		for i in 0 to 7 loop
-		    lo := i*4;
-		    hi := lo + 3;
-		    v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so);
-		end loop;
 	    when OP_CNTZ =>
 		result := countzero_result;
 		result_en := '1';

From 0c714f1be680ed36373be0ee9c15d30a7cc263b6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 13 Jan 2020 18:13:09 +1100
Subject: [PATCH 06/10] execute: Move popcnt and prty instructions into the
 logical unit

This implements logic in the logical entity to calculate the results
of the popcnt* and prty* instructions.  We now have one insn_type_t
value for the 3 popcnt variants and one for the two prty variants,
using the length field of the decode_rom_t to distinguish between
them.  The implementations in logical.vhdl use recursive
algorithms rather than the simple functions in ppc_fx_insns.vhdl.

This gives a saving of about 140 slice LUTs on the A7-100 and
improves timing slightly.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl      | 10 ++++----
 decode_types.vhdl |  4 ++--
 execute1.vhdl     | 24 ++++++++-----------
 logical.vhdl      | 60 ++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 76 insertions(+), 22 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 0e42d1b..d2dbd96 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -263,11 +263,11 @@ architecture behaviour of decode1 is
 		2#0001111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nor
 		2#0110111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- or
 		2#0110011100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- orc
-		2#0001111010#  =>       (ALU,    OP_POPCNTB,   NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb
-		2#0111111010#  =>       (ALU,    OP_POPCNTD,   NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd
-		2#0101111010#  =>       (ALU,    OP_POPCNTW,   NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw
-		2#0010111010#  =>       (ALU,    OP_PRTYD,     NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd
-		2#0010011010#  =>       (ALU,    OP_PRTYW,     NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw
+		2#0001111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb
+		2#0111111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd
+		2#0101111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw
+		2#0010111010#  =>       (ALU,    OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd
+		2#0010011010#  =>       (ALU,    OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw
 		-- 2#0010000000# setb
 		2#0000011011#  =>       (ALU,    OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- sld
 		2#0000011000#  =>       (ALU,    OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- slw
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 82039bd..21d8b68 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -14,8 +14,8 @@ package decode_types is
 			 OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFSPR, OP_MOD,
 			 OP_MTCRF, OP_MTSPR, OP_MUL_L64,
 			 OP_MUL_H64, OP_MUL_H32, OP_OR,
-			 OP_POPCNTB, OP_POPCNTD, OP_POPCNTW, OP_PRTYD,
-			 OP_PRTYW, OP_RLC, OP_RLCL, OP_RLCR, OP_SETB,
+			 OP_POPCNT, OP_PRTY,
+			 OP_RLC, OP_RLCL, OP_RLCR, OP_SETB,
 			 OP_SHL, OP_SHR,
 			 OP_SYNC, OP_TD, OP_TDI, OP_TW,
 			 OP_TWI, OP_XOR, OP_SIM_CONFIG
diff --git a/execute1.vhdl b/execute1.vhdl
index 6889a6a..5a626f8 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -54,6 +54,8 @@ architecture behaviour of execute1 is
     signal rotator_carry: std_ulogic;
     signal logical_result: std_ulogic_vector(63 downto 0);
     signal countzero_result: std_ulogic_vector(63 downto 0);
+    signal popcnt_result: std_ulogic_vector(63 downto 0);
+    signal parity_result: std_ulogic_vector(63 downto 0);
 
     -- multiply signals
     signal x_to_multiply: Execute1ToMultiplyType;
@@ -127,7 +129,10 @@ begin
 	    op => e_in.insn_type,
 	    invert_in => e_in.invert_a,
 	    invert_out => e_in.invert_out,
-	    result => logical_result
+	    result => logical_result,
+            datalen => e_in.data_len,
+            popcnt => popcnt_result,
+            parity => parity_result
 	    );
 
     countzero_0: entity work.zero_counter
@@ -612,20 +617,11 @@ begin
 --		    when others =>
 --		    end case;
 		end if;
-	    when OP_POPCNTB =>
-		result := ppc_popcntb(e_in.read_data3);
+	    when OP_POPCNT =>
+		result := popcnt_result;
 		result_en := '1';
-	    when OP_POPCNTW =>
-		result := ppc_popcntw(e_in.read_data3);
-		result_en := '1';
-	    when OP_POPCNTD =>
-		result := ppc_popcntd(e_in.read_data3);
-		result_en := '1';
-	    when OP_PRTYD =>
-		result := ppc_prtyd(e_in.read_data3);
-		result_en := '1';
-	    when OP_PRTYW =>
-		result := ppc_prtyw(e_in.read_data3);
+	    when OP_PRTY =>
+		result := parity_result;
 		result_en := '1';
 	    when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR =>
 		result := rotator_result;
diff --git a/logical.vhdl b/logical.vhdl
index b92b98d..4dfc13d 100644
--- a/logical.vhdl
+++ b/logical.vhdl
@@ -12,11 +12,29 @@ entity logical is
         op         : in insn_type_t;
         invert_in  : in std_ulogic;
         invert_out : in std_ulogic;
-        result     : out std_ulogic_vector(63 downto 0)
+        result     : out std_ulogic_vector(63 downto 0);
+        datalen    : in std_logic_vector(3 downto 0);
+        popcnt     : out std_ulogic_vector(63 downto 0);
+        parity     : out std_ulogic_vector(63 downto 0)
         );
 end entity logical;
 
 architecture behaviour of logical is
+
+    subtype twobit is unsigned(1 downto 0);
+    type twobit32 is array(0 to 31) of twobit;
+    signal pc2      : twobit32;
+    subtype threebit is unsigned(2 downto 0);
+    type threebit16 is array(0 to 15) of threebit;
+    signal pc4      : threebit16;
+    subtype fourbit is unsigned(3 downto 0);
+    type fourbit8 is array(0 to 7) of fourbit;
+    signal pc8      : fourbit8;
+    subtype sixbit is unsigned(5 downto 0);
+    type sixbit2 is array(0 to 1) of sixbit;
+    signal pc32     : sixbit2;
+    signal par0, par1 : std_ulogic;
+
 begin
     logical_0: process(all)
         variable rb_adj, tmp : std_ulogic_vector(63 downto 0);
@@ -40,5 +58,45 @@ begin
             result <= not tmp;
         end if;
 
+        -- population counts
+        for i in 0 to 31 loop
+            pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1));
+        end loop;
+        for i in 0 to 15 loop
+            pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1));
+        end loop;
+        for i in 0 to 7 loop
+            pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1));
+        end loop;
+        for i in 0 to 1 loop
+            pc32(i) <= ("00" & pc8(i * 4)) + ("00" & pc8(i * 4 + 1)) +
+                       ("00" & pc8(i * 4 + 2)) + ("00" & pc8(i * 4 + 3));
+        end loop;
+        popcnt <= (others => '0');
+        if datalen(3 downto 2) = "00" then
+            -- popcntb
+            for i in 0 to 7 loop
+                popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8(i));
+            end loop;
+        elsif datalen(3) = '0' then
+            -- popcntw
+            for i in 0 to 1 loop
+                popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i));
+            end loop;
+        else
+            popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1)));
+        end if;
+
+        -- parity calculations
+        par0 <= rs(0) xor rs(8) xor rs(16) xor rs(24);
+        par1 <= rs(32) xor rs(40) xor rs(48) xor rs(56);
+        parity <= (others => '0');
+        if datalen(3) = '1' then
+            parity(0) <= par0 xor par1;
+        else
+            parity(0) <= par0;
+            parity(32) <= par1;
+        end if;
+
     end process;
 end behaviour;

From b14d9820116ebe8c39179c8b6c5565d340bdb72c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 13 Jan 2020 13:23:42 +1100
Subject: [PATCH 07/10] execute: Implement bypass from output of execute1 to
 input

This enables back-to-back execution of integer instructions where
the first instruction writes a GPR and the second reads the same
GPR.  This is done with a set of multiplexers at the start of
execute1 which enable any of the three input operands to be taken
from the output of execute1 (i.e. r.e.write_data) rather than the
input from decode2 (i.e. e_in.read_data[123]).

This also requires changes to the hazard detection and handling.
Decode2 generates a signal indicating that the GPR being written
is available for bypass, which is true for instructions that are
executed in execute1 (rather than loadstore1/dcache).  The
gpr_hazard module stores this "bypassable" bit, and if the same
GPR needs to be read by a subsequent instruction, it outputs a
"use_bypass" signal rather than generating a stall.  The
use_bypass signal is then latched at the output of decode2 and
passed down to execute1 to control the input multiplexer.

At the moment there is no bypass on the inputs to loadstore1, but that
is OK because all load and store instructions are marked as
single-issue.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl     |   6 ++-
 control.vhdl    |  19 +++++--
 core.vhdl       |   9 +++-
 decode2.vhdl    |  21 +++++++-
 execute1.vhdl   | 135 ++++++++++++++++++++++++++----------------------
 gpr_hazard.vhdl |  68 +++++++++++++++++-------
 6 files changed, 168 insertions(+), 90 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 8612389..9c8a942 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -109,6 +109,9 @@ package common is
 	read_data1: std_ulogic_vector(63 downto 0);
 	read_data2: std_ulogic_vector(63 downto 0);
 	read_data3: std_ulogic_vector(63 downto 0);
+        bypass_data1: std_ulogic;
+        bypass_data2: std_ulogic;
+        bypass_data3: std_ulogic;
 	cr: std_ulogic_vector(31 downto 0);
 	xerc: xer_common_t;
 	lr: std_ulogic;
@@ -126,7 +129,8 @@ package common is
 	data_len: std_ulogic_vector(3 downto 0);
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
-	(valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', oe => '0', invert_a => '0',
+	(valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
+         lr => '0', rc => '0', oe => '0', invert_a => '0',
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
 	 is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0'));
 
diff --git a/control.vhdl b/control.vhdl
index ead3c1f..064ff98 100644
--- a/control.vhdl
+++ b/control.vhdl
@@ -21,6 +21,7 @@ entity control is
 
         gpr_write_valid_in  : in std_ulogic;
         gpr_write_in        : in gspr_index_t;
+        gpr_bypassable      : in std_ulogic;
 
         gpr_a_read_valid_in : in std_ulogic;
         gpr_a_read_in       : in gspr_index_t;
@@ -36,7 +37,11 @@ entity control is
 
         valid_out           : out std_ulogic;
         stall_out           : out std_ulogic;
-        stopped_out         : out std_ulogic
+        stopped_out         : out std_ulogic;
+
+        gpr_bypass_a        : out std_ulogic;
+        gpr_bypass_b        : out std_ulogic;
+        gpr_bypass_c        : out std_ulogic
         );
 end entity control;
 
@@ -71,10 +76,12 @@ begin
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
+            bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_a_read_valid_in,
             gpr_read_in        => gpr_a_read_in,
 
-            stall_out          => stall_a_out
+            stall_out          => stall_a_out,
+            use_bypass         => gpr_bypass_a
             );
 
     gpr_hazard1: entity work.gpr_hazard
@@ -87,10 +94,12 @@ begin
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
+            bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_b_read_valid_in,
             gpr_read_in        => gpr_b_read_in,
 
-            stall_out          => stall_b_out
+            stall_out          => stall_b_out,
+            use_bypass         => gpr_bypass_b
             );
 
     gpr_c_read_in_fmt <= "0" & gpr_c_read_in;
@@ -105,10 +114,12 @@ begin
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
+            bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_c_read_valid_in,
             gpr_read_in        => gpr_c_read_in_fmt,
 
-            stall_out          => stall_c_out
+            stall_out          => stall_c_out,
+            use_bypass         => gpr_bypass_c
             );
 
     cr_hazard0: entity work.cr_hazard
diff --git a/core.vhdl b/core.vhdl
index a38cf36..aa86689 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -9,7 +9,8 @@ use work.wishbone_types.all;
 entity core is
     generic (
         SIM : boolean := false;
-	DISABLE_FLATTEN : boolean := false
+	DISABLE_FLATTEN : boolean := false;
+        EX1_BYPASS : boolean := true
         );
     port (
         clk          : in std_logic;
@@ -176,6 +177,9 @@ begin
     decode1_stall_in <= decode2_stall_out;
 
     decode2_0: entity work.decode2
+        generic map (
+            EX1_BYPASS => EX1_BYPASS
+            )
         port map (
             clk => clk,
             rst => core_rst,
@@ -220,6 +224,9 @@ begin
             );
 
     execute1_0: entity work.execute1
+        generic map (
+            EX1_BYPASS => EX1_BYPASS
+            )
         port map (
             clk => clk,
             rst => core_rst,
diff --git a/decode2.vhdl b/decode2.vhdl
index 6cd4574..6e3bd8a 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -9,6 +9,9 @@ use work.helpers.all;
 use work.insn_helpers.all;
 
 entity decode2 is
+        generic (
+                EX1_BYPASS : boolean := true
+        );
 	port (
 		clk   : in std_ulogic;
 		rst   : in std_ulogic;
@@ -184,15 +187,19 @@ architecture behaviour of decode2 is
 
 	signal gpr_write_valid : std_ulogic;
 	signal gpr_write : gspr_index_t;
+        signal gpr_bypassable  : std_ulogic;
 
 	signal gpr_a_read_valid : std_ulogic;
 	signal gpr_a_read :gspr_index_t;
+        signal gpr_a_bypass : std_ulogic;
 
 	signal gpr_b_read_valid : std_ulogic;
 	signal gpr_b_read : gspr_index_t;
+        signal gpr_b_bypass : std_ulogic;
 
 	signal gpr_c_read_valid : std_ulogic;
 	signal gpr_c_read : gpr_index_t;
+        signal gpr_c_bypass : std_ulogic;
 
 	signal cr_write_valid : std_ulogic;
 begin
@@ -213,6 +220,7 @@ begin
 
 		gpr_write_valid_in => gpr_write_valid,
 		gpr_write_in       => gpr_write,
+                gpr_bypassable     => gpr_bypassable,
 
 		gpr_a_read_valid_in  => gpr_a_read_valid,
 		gpr_a_read_in        => gpr_a_read,
@@ -228,7 +236,11 @@ begin
 
 		valid_out   => control_valid_out,
 		stall_out   => stall_out,
-		stopped_out => stopped_out
+		stopped_out => stopped_out,
+
+                gpr_bypass_a => gpr_a_bypass,
+                gpr_bypass_b => gpr_b_bypass,
+                gpr_bypass_c => gpr_c_bypass
 	);
 
 	decode2_0: process(clk)
@@ -295,9 +307,12 @@ begin
 		v.e.insn_type := d_in.decode.insn_type;
 		v.e.read_reg1 := decoded_reg_a.reg;
 		v.e.read_data1 := decoded_reg_a.data;
+                v.e.bypass_data1 := gpr_a_bypass;
 		v.e.read_reg2 := decoded_reg_b.reg;
 		v.e.read_data2 := decoded_reg_b.data;
+                v.e.bypass_data2 := gpr_b_bypass;
                 v.e.read_data3 := decoded_reg_c.data;
+                v.e.bypass_data3 := gpr_c_bypass;
 		v.e.write_reg := decoded_reg_o.reg;
 		v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
                 if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then
@@ -342,6 +357,10 @@ begin
 
 		gpr_write_valid <= decoded_reg_o.reg_valid;
 		gpr_write <= decoded_reg_o.reg;
+                gpr_bypassable <= '0';
+                if EX1_BYPASS and d_in.decode.unit = ALU then
+                        gpr_bypassable <= '1';
+                end if;
 
 		gpr_a_read_valid <= decoded_reg_a.reg_valid;
 		gpr_a_read <= decoded_reg_a.reg;
diff --git a/execute1.vhdl b/execute1.vhdl
index 5a626f8..d63697c 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -11,6 +11,9 @@ use work.insn_helpers.all;
 use work.ppc_fx_insns.all;
 
 entity execute1 is
+    generic (
+        EX1_BYPASS : boolean := true
+        );
     port (
 	clk   : in std_ulogic;
         rst   : in std_ulogic;
@@ -46,6 +49,8 @@ architecture behaviour of execute1 is
 
     signal r, rin : reg_type;
 
+    signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
+
     signal ctrl: ctrl_t := (others => (others => '0'));
     signal ctrl_tmp: ctrl_t := (others => (others => '0'));
 
@@ -109,9 +114,9 @@ begin
 
     rotator_0: entity work.rotator
 	port map (
-	    rs => e_in.read_data3,
-	    ra => e_in.read_data1,
-	    shift => e_in.read_data2(6 downto 0),
+	    rs => c_in,
+	    ra => a_in,
+	    shift => b_in(6 downto 0),
 	    insn => e_in.insn,
 	    is_32bit => e_in.is_32bit,
 	    right_shift => right_shift,
@@ -124,8 +129,8 @@ begin
 
     logical_0: entity work.logical
 	port map (
-	    rs => e_in.read_data3,
-	    rb => e_in.read_data2,
+	    rs => c_in,
+	    rb => b_in,
 	    op => e_in.insn_type,
 	    invert_in => e_in.invert_a,
 	    invert_out => e_in.invert_out,
@@ -137,7 +142,7 @@ begin
 
     countzero_0: entity work.zero_counter
 	port map (
-	    rs => e_in.read_data3,
+	    rs => c_in,
 	    count_right => e_in.insn(10),
 	    is_32bit => e_in.is_32bit,
 	    result => countzero_result
@@ -158,6 +163,10 @@ begin
             d_out => divider_to_x
             );
 
+    a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1;
+    b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2;
+    c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3;
+
     execute1_0: process(clk)
     begin
 	if rising_edge(clk) then
@@ -256,21 +265,21 @@ begin
 
 	if e_in.is_32bit = '1' then
 	    if e_in.is_signed = '1' then
-		x_to_multiply.data1 <= (others => e_in.read_data1(31));
-		x_to_multiply.data1(31 downto 0) <= e_in.read_data1(31 downto 0);
-		x_to_multiply.data2 <= (others => e_in.read_data2(31));
-		x_to_multiply.data2(31 downto 0) <= e_in.read_data2(31 downto 0);
+		x_to_multiply.data1 <= (others => a_in(31));
+		x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0);
+		x_to_multiply.data2 <= (others => b_in(31));
+		x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0);
 	    else
-		x_to_multiply.data1 <= '0' & x"00000000" & e_in.read_data1(31 downto 0);
-		x_to_multiply.data2 <= '0' & x"00000000" & e_in.read_data2(31 downto 0);
+		x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0);
+		x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0);
 	    end if;
 	else
 	    if e_in.is_signed = '1' then
-		x_to_multiply.data1 <= e_in.read_data1(63) & e_in.read_data1;
-		x_to_multiply.data2 <= e_in.read_data2(63) & e_in.read_data2;
+		x_to_multiply.data1 <= a_in(63) & a_in;
+		x_to_multiply.data2 <= b_in(63) & b_in;
 	    else
-		x_to_multiply.data1 <= '0' & e_in.read_data1;
-		x_to_multiply.data2 <= '0' & e_in.read_data2;
+		x_to_multiply.data1 <= '0' & a_in;
+		x_to_multiply.data2 <= '0' & b_in;
 	    end if;
 	end if;
 
@@ -279,23 +288,23 @@ begin
         sign2 := '0';
         if e_in.is_signed = '1' then
             if e_in.is_32bit = '1' then
-                sign1 := e_in.read_data1(31);
-                sign2 := e_in.read_data2(31);
+                sign1 := a_in(31);
+                sign2 := b_in(31);
             else
-                sign1 := e_in.read_data1(63);
-                sign2 := e_in.read_data2(63);
+                sign1 := a_in(63);
+                sign2 := b_in(63);
             end if;
         end if;
         -- take absolute values
         if sign1 = '0' then
-            abs1 := signed(e_in.read_data1);
+            abs1 := signed(a_in);
         else
-            abs1 := - signed(e_in.read_data1);
+            abs1 := - signed(a_in);
         end if;
         if sign2 = '0' then
-            abs2 := signed(e_in.read_data2);
+            abs2 := signed(b_in);
         else
-            abs2 := - signed(e_in.read_data2);
+            abs2 := - signed(b_in);
         end if;
 
         x_to_divider <= Execute1ToDividerInit;
@@ -358,14 +367,14 @@ begin
 		-- Do nothing
 	    when OP_ADD | OP_CMP =>
 		if e_in.invert_a = '0' then
-		    a_inv := e_in.read_data1;
+		    a_inv := a_in;
 		else
-		    a_inv := not e_in.read_data1;
+		    a_inv := not a_in;
 		end if;
-		result_with_carry := ppc_adde(a_inv, e_in.read_data2,
+		result_with_carry := ppc_adde(a_inv, b_in,
 					      decode_input_carry(e_in.input_carry, v.e.xerc));
 		result := result_with_carry(63 downto 0);
-                carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32);
+                carry_32 := result(32) xor a_inv(32) xor b_in(32);
                 carry_64 := result_with_carry(64);
                 if e_in.insn_type = OP_ADD then
                     if e_in.output_carry = '1' then
@@ -373,8 +382,8 @@ begin
                     end if;
                     if e_in.oe = '1' then
                         set_ov(v.e,
-                               calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)),
-                               calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31)));
+                               calc_ov(a_inv(63), b_in(63), carry_64, result_with_carry(63)),
+                               calc_ov(a_inv(31), b_in(31), carry_32, result_with_carry(31)));
                     end if;
                     result_en := '1';
                 else
@@ -385,20 +394,20 @@ begin
                     v.e.write_cr_enable := '1';
                     crnum := to_integer(unsigned(bf));
                     v.e.write_cr_mask := num_to_fxm(crnum);
-                    zerolo := not (or (e_in.read_data1(31 downto 0) xor e_in.read_data2(31 downto 0)));
-                    zerohi := not (or (e_in.read_data1(63 downto 32) xor e_in.read_data2(63 downto 32)));
+                    zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0)));
+                    zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32)));
                     if zerolo = '1' and (l = '0' or zerohi = '1') then
                         -- values are equal
                         newcrf := "001" & v.e.xerc.so;
                     else
                         if l = '1' then
                             -- 64-bit comparison
-                            msb_a := e_in.read_data1(63);
-                            msb_b := e_in.read_data2(63);
+                            msb_a := a_in(63);
+                            msb_b := b_in(63);
                         else
                             -- 32-bit comparison
-                            msb_a := e_in.read_data1(31);
-                            msb_b := e_in.read_data2(31);
+                            msb_a := a_in(31);
+                            msb_b := b_in(31);
                         end if;
                         if msb_a /= msb_b then
                             -- Subtraction might overflow, but
@@ -424,25 +433,25 @@ begin
 	    when OP_B =>
 		f_out.redirect <= '1';
 		if (insn_aa(e_in.insn)) then
-		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2));
+		    f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
 		else
-		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2));
+		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
 		end if;
 	    when OP_BC =>
 		-- read_data1 is CTR
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
 		if bo(4-2) = '0' then
-		    result := std_ulogic_vector(unsigned(e_in.read_data1) - 1);
+		    result := std_ulogic_vector(unsigned(a_in) - 1);
 		    result_en := '1';
 		    v.e.write_reg := fast_spr_num(SPR_CTR);
 		end if;
-		if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then
+		if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
 		    f_out.redirect <= '1';
 		    if (insn_aa(e_in.insn)) then
-			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2));
+			f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
 		    else
-			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2));
+			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
 		    end if;
 		end if;
 	    when OP_BCREG =>
@@ -451,40 +460,40 @@ begin
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
 		if bo(4-2) = '0' and e_in.insn(10) = '0' then
-		    result := std_ulogic_vector(unsigned(e_in.read_data1) - 1);
+		    result := std_ulogic_vector(unsigned(a_in) - 1);
 		    result_en := '1';
 		    v.e.write_reg := fast_spr_num(SPR_CTR);
 		end if;
-		if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then
+		if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
 		    f_out.redirect <= '1';
-		    f_out.redirect_nia <= e_in.read_data2(63 downto 2) & "00";
+		    f_out.redirect_nia <= b_in(63 downto 2) & "00";
 		end if;
 	    when OP_CMPB =>
-		result := ppc_cmpb(e_in.read_data3, e_in.read_data2);
+		result := ppc_cmpb(c_in, b_in);
 		result_en := '1';
 	    when OP_CNTZ =>
 		result := countzero_result;
 		result_en := '1';
             when OP_EXTS =>
                 -- note data_len is a 1-hot encoding
-		negative := (e_in.data_len(0) and e_in.read_data3(7)) or
-			    (e_in.data_len(1) and e_in.read_data3(15)) or
-			    (e_in.data_len(2) and e_in.read_data3(31));
+		negative := (e_in.data_len(0) and c_in(7)) or
+			    (e_in.data_len(1) and c_in(15)) or
+			    (e_in.data_len(2) and c_in(31));
 		result := (others => negative);
 		if e_in.data_len(2) = '1' then
-		    result(31 downto 16) := e_in.read_data3(31 downto 16);
+		    result(31 downto 16) := c_in(31 downto 16);
 		end if;
 		if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then
-		    result(15 downto 8) := e_in.read_data3(15 downto 8);
+		    result(15 downto 8) := c_in(15 downto 8);
 		end if;
-		result(7 downto 0) := e_in.read_data3(7 downto 0);
+		result(7 downto 0) := c_in(7 downto 0);
 		result_en := '1';
 	    when OP_ISEL =>
 		crbit := to_integer(unsigned(insn_bc(e_in.insn)));
 		if e_in.cr(31-crbit) = '1' then
-		    result := e_in.read_data1;
+		    result := a_in;
 		else
-		    result := e_in.read_data2;
+		    result := b_in;
 		end if;
 		result_en := '1';
 	    when OP_MCRF =>
@@ -549,7 +558,7 @@ begin
 		end if;
 	    when OP_MFSPR =>
 		if is_fast_spr(e_in.read_reg1) then
-		    result := e_in.read_data1;
+		    result := a_in;
 		    if decode_spr_num(e_in.insn) = SPR_XER then
 			-- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer
 			result(63 downto 32) := (others => '0');
@@ -596,19 +605,19 @@ begin
 		    crnum := fxm_to_num(insn_fxm(e_in.insn));
 		    v.e.write_cr_mask := num_to_fxm(crnum);
 		end if;
-		v.e.write_cr_data := e_in.read_data3(31 downto 0);
+		v.e.write_cr_data := c_in(31 downto 0);
 	    when OP_MTSPR =>
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
-		    "=" & to_hstring(e_in.read_data3);
+		    "=" & to_hstring(c_in);
 		if is_fast_spr(e_in.write_reg) then
-		    result := e_in.read_data3;
+		    result := c_in;
 		    result_en := '1';
 		    if decode_spr_num(e_in.insn) = SPR_XER then
-			v.e.xerc.so := e_in.read_data3(63-32);
-			v.e.xerc.ov := e_in.read_data3(63-33);
-			v.e.xerc.ca := e_in.read_data3(63-34);
-			v.e.xerc.ov32 := e_in.read_data3(63-44);
-			v.e.xerc.ca32 := e_in.read_data3(63-45);
+			v.e.xerc.so := c_in(63-32);
+			v.e.xerc.ov := c_in(63-33);
+			v.e.xerc.ca := c_in(63-34);
+			v.e.xerc.ov32 := c_in(63-44);
+			v.e.xerc.ca32 := c_in(63-45);
 			v.e.write_xerc_enable := '1';
 		    end if;
 		else
diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl
index 705e69d..de4f7d2 100644
--- a/gpr_hazard.vhdl
+++ b/gpr_hazard.vhdl
@@ -12,18 +12,21 @@ entity gpr_hazard is
 
         gpr_write_valid_in : in std_ulogic;
         gpr_write_in       : in std_ulogic_vector(5 downto 0);
+        bypass_avail       : in std_ulogic;
         gpr_read_valid_in  : in std_ulogic;
         gpr_read_in        : in std_ulogic_vector(5 downto 0);
 
-        stall_out          : out std_ulogic
+        stall_out          : out std_ulogic;
+        use_bypass         : out std_ulogic
         );
 end entity gpr_hazard;
 architecture behaviour of gpr_hazard is
     type pipeline_entry_type is record
-        valid : std_ulogic;
-        gpr   : std_ulogic_vector(5 downto 0);
+        valid  : std_ulogic;
+        bypass : std_ulogic;
+        gpr    : std_ulogic_vector(5 downto 0);
     end record;
-    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', gpr => (others => '0'));
+    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'));
 
     type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type;
     constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init);
@@ -33,9 +36,7 @@ begin
     gpr_hazard0: process(clk)
     begin
         if rising_edge(clk) then
-	    if stall_in = '0' then
-		r <= rin;
-	    end if;
+            r <= rin;
         end if;
     end process;
 
@@ -45,22 +46,49 @@ begin
         v := r;
 
         stall_out <= '0';
-        loop_0: for i in 0 to PIPELINE_DEPTH-1 loop
-            if ((r(i).valid = gpr_read_valid_in) and r(i).gpr = gpr_read_in) then
-                stall_out <= '1';
+        use_bypass <= '0';
+        if gpr_read_valid_in = '1' then
+            if r(0).valid = '1' and r(0).gpr = gpr_read_in then
+                if r(0).bypass = '1' and stall_in = '0' then
+                    use_bypass <= '1';
+                else
+                    stall_out <= '1';
+                end if;
             end if;
-        end loop;
+            loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
+                if r(i).valid = '1' and r(i).gpr = gpr_read_in then
+                    if r(i).bypass = '1' then
+                        use_bypass <= '1';
+                    else
+                        stall_out <= '1';
+                    end if;
+                end if;
+            end loop;
+        end if;
 
-        v(0).valid := gpr_write_valid_in;
-        v(0).gpr   := gpr_write_in;
-        loop_1: for i in 0 to PIPELINE_DEPTH-2 loop
-            -- propagate to next slot
-            v(i+1) := r(i);
-        end loop;
+        if stall_in = '0' then
+            v(0).valid  := gpr_write_valid_in;
+            v(0).bypass := bypass_avail;
+            v(0).gpr    := gpr_write_in;
+            loop_1: for i in 1 to PIPELINE_DEPTH-1 loop
+                -- propagate to next slot
+                v(i).valid  := r(i-1).valid;
+                v(i).bypass := r(i-1).bypass;
+                v(i).gpr    := r(i-1).gpr;
+            end loop;
 
-        -- asynchronous output
-        if gpr_read_valid_in = '0' then
-            stall_out <= '0';
+        else
+            -- stage 0 stalled, so stage 1 becomes empty
+            loop_1b: for i in 1 to PIPELINE_DEPTH-1 loop
+                -- propagate to next slot
+                if i = 1 then
+                    v(i).valid := '0';
+                else
+                    v(i).valid  := r(i-1).valid;
+                    v(i).bypass := r(i-1).bypass;
+                    v(i).gpr    := r(i-1).gpr;
+                end if;
+            end loop;
         end if;
 
         -- update registers

From 5422007f83bff7550e8d3064e9c086fa668eb4d9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 14 Jan 2020 10:28:45 +1100
Subject: [PATCH 08/10] Plumb loadstore1 input from execute1 not decode2

This allows us to use the bypass at the input of execute1 for the
address and data operands for loadstore1.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl     | 14 +++++++++-----
 core.vhdl       |  6 +++---
 decode2.vhdl    | 42 +++++++-----------------------------------
 execute1.vhdl   | 26 ++++++++++++++++++++++++++
 loadstore1.vhdl |  2 +-
 5 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 9c8a942..ffddb0b 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -127,12 +127,16 @@ package common is
 	is_signed: std_ulogic;
 	insn: std_ulogic_vector(31 downto 0);
 	data_len: std_ulogic_vector(3 downto 0);
+	byte_reverse : std_ulogic;
+	sign_extend : std_ulogic;			-- do we need to sign extend?
+	update : std_ulogic;				-- is this an update instruction?
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
 	(valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
          lr => '0', rc => '0', oe => '0', invert_a => '0',
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
-	 is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0'));
+	 is_32bit => '0', is_signed => '0', xerc => xerc_init,
+         byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0'));
 
     type Execute1ToMultiplyType is record
 	valid: std_ulogic;
@@ -189,7 +193,7 @@ package common is
     end record;
     constant Execute1ToFetch1TypeInit : Execute1ToFetch1Type := (redirect => '0', others => (others => '0'));
 
-    type Decode2ToLoadstore1Type is record
+    type Execute1ToLoadstore1Type is record
 	valid : std_ulogic;
 	load : std_ulogic;				-- is this a load or store
 	addr1 : std_ulogic_vector(63 downto 0);
@@ -203,9 +207,9 @@ package common is
 	update_reg : gpr_index_t;                      	-- if so, the register to update
 	xerc : xer_common_t;
     end record;
-    constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0',
-								   sign_extend => '0', update => '0', xerc => xerc_init,
-								   others => (others => '0'));
+    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0',
+                                                                     sign_extend => '0', update => '0', xerc => xerc_init,
+                                                                     others => (others => '0'));
 
     type Loadstore1ToDcacheType is record
 	valid : std_ulogic;
diff --git a/core.vhdl b/core.vhdl
index aa86689..bc0b16f 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -60,7 +60,7 @@ architecture behave of core is
     signal execute1_to_fetch1: Execute1ToFetch1Type;
 
     -- load store signals
-    signal decode2_to_loadstore1: Decode2ToLoadstore1Type;
+    signal execute1_to_loadstore1: Execute1ToLoadstore1Type;
     signal loadstore1_to_dcache: Loadstore1ToDcacheType;
     signal dcache_to_writeback: DcacheToWritebackType;
 
@@ -190,7 +190,6 @@ begin
 	    stopped_out => dbg_core_is_stopped,
             d_in => decode1_to_decode2,
             e_out => decode2_to_execute1,
-            l_out => decode2_to_loadstore1,
             r_in => register_file_to_decode2,
             r_out => decode2_to_register_file,
             c_in => cr_file_to_decode2,
@@ -233,6 +232,7 @@ begin
             flush_out => flush,
 	    stall_out => ex1_stall_out,
             e_in => decode2_to_execute1,
+            l_out => execute1_to_loadstore1,
             f_out => execute1_to_fetch1,
             e_out => execute1_to_writeback,
 	    icache_inval => ex1_icache_inval,
@@ -242,7 +242,7 @@ begin
     loadstore1_0: entity work.loadstore1
         port map (
             clk => clk,
-            l_in => decode2_to_loadstore1,
+            l_in => execute1_to_loadstore1,
             l_out => loadstore1_to_dcache
             );
 
diff --git a/decode2.vhdl b/decode2.vhdl
index 6e3bd8a..582fa5b 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -27,7 +27,6 @@ entity decode2 is
 		d_in  : in Decode1ToDecode2Type;
 
 		e_out : out Decode2ToExecute1Type;
-		l_out : out Decode2ToLoadstore1Type;
 
 		r_in  : in RegisterFileToDecode2Type;
 		r_out : out Decode2ToRegisterFileType;
@@ -40,7 +39,6 @@ end entity decode2;
 architecture behaviour of decode2 is
 	type reg_type is record
 		e : Decode2ToExecute1Type;
-		l : Decode2ToLoadstore1Type;
 	end record;
 
 	signal r, rin : reg_type;
@@ -246,7 +244,7 @@ begin
 	decode2_0: process(clk)
 	begin
 		if rising_edge(clk) then
-			if rin.e.valid = '1' or rin.l.valid = '1' then
+			if rin.e.valid = '1' then
 				report "execute " & to_hstring(rin.e.nia);
 			end if;
 			r <= rin;
@@ -272,7 +270,6 @@ begin
 		v := r;
 
 		v.e := Decode2ToExecute1Init;
-		v.l := Decode2ToLoadStore1Init;
 
 		mul_a := (others => '0');
 		mul_b := (others => '0');
@@ -331,25 +328,9 @@ begin
 		end if;
                 v.e.insn := d_in.insn;
                 v.e.data_len := length;
-
-		-- load/store unit
-		v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg);
-		v.l.addr1 := decoded_reg_a.data;
-		v.l.addr2 := decoded_reg_b.data;
-		v.l.data := decoded_reg_c.data;
-		v.l.write_reg := gspr_to_gpr(decoded_reg_o.reg);
-
-		if d_in.decode.insn_type = OP_LOAD then
-			v.l.load := '1';
-		else
-			v.l.load := '0';
-		end if;
-
-                v.l.length := length;
-		v.l.byte_reverse := d_in.decode.byte_reverse;
-		v.l.sign_extend := d_in.decode.sign_extend;
-		v.l.update := d_in.decode.update;
-		v.l.xerc := c_in.read_xerc_data;
+		v.e.byte_reverse := d_in.decode.byte_reverse;
+		v.e.sign_extend := d_in.decode.sign_extend;
+		v.e.update := d_in.decode.update;
 
 		-- issue control
 		control_valid_in <= d_in.valid;
@@ -373,21 +354,13 @@ begin
 
                 cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
 
-		v.e.valid := '0';
-		v.l.valid := '0';
-		case d_in.decode.unit is
-		when ALU =>
-			v.e.valid := control_valid_out;
-		when LDST =>
-			v.l.valid := control_valid_out;
-		when NONE =>
-			v.e.valid := control_valid_out;
+		v.e.valid := control_valid_out;
+		if d_in.decode.unit = NONE then
 			v.e.insn_type := OP_ILLEGAL;
-		end case;
+		end if;
 
 		if rst = '1' then
 			v.e := Decode2ToExecute1Init;
-			v.l := Decode2ToLoadStore1Init;
 		end if;
 
 		-- Update registers
@@ -395,6 +368,5 @@ begin
 
 		-- Update outputs
 		e_out <= r.e;
-		l_out <= r.l;
 	end process;
 end architecture behaviour;
diff --git a/execute1.vhdl b/execute1.vhdl
index d63697c..e49494f 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -25,6 +25,7 @@ entity execute1 is
 	e_in  : in Decode2ToExecute1Type;
 
 	-- asynchronous
+        l_out : out Execute1ToLoadstore1Type;
 	f_out : out Execute1ToFetch1Type;
 
 	e_out : out Execute1ToWritebackType;
@@ -210,6 +211,7 @@ begin
         variable zerohi, zerolo : std_ulogic;
         variable msb_a, msb_b : std_ulogic;
         variable a_lt : std_ulogic;
+        variable lv : Execute1ToLoadstore1Type;
     begin
 	result := (others => '0');
 	result_with_carry := (others => '0');
@@ -667,6 +669,10 @@ begin
 		stall_out <= '1';
 		x_to_divider.valid <= '1';
 
+            when OP_LOAD | OP_STORE =>
+                -- loadstore/dcache has its own port to writeback
+                v.e.valid := '0';
+
             when others =>
 		terminate_out <= '1';
 		report "illegal";
@@ -731,11 +737,31 @@ begin
 	v.e.write_data := result;
 	v.e.write_enable := result_en;
 
+        -- Outputs to loadstore1 (async)
+        lv := Execute1ToLoadstore1Init;
+        if e_in.valid = '1' and (e_in.insn_type = OP_LOAD or e_in.insn_type = OP_STORE) then
+            lv.valid := '1';
+        end if;
+        if e_in.insn_type = OP_LOAD then
+            lv.load := '1';
+        end if;
+        lv.addr1 := a_in;
+        lv.addr2 := b_in;
+        lv.data := c_in;
+        lv.write_reg := gspr_to_gpr(e_in.write_reg);
+        lv.length := e_in.data_len;
+        lv.byte_reverse := e_in.byte_reverse;
+        lv.sign_extend := e_in.sign_extend;
+        lv.update := e_in.update;
+        lv.update_reg := gspr_to_gpr(e_in.read_reg1);
+        lv.xerc := v.e.xerc;
+
 	-- Update registers
 	rin <= v;
 
 	-- update outputs
 	--f_out <= r.f;
+        l_out <= lv;
 	e_out <= r.e;
 	flush_out <= f_out.redirect;
     end process;
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 1c16c46..5b61d4c 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -13,7 +13,7 @@ entity loadstore1 is
     port (
         clk   : in std_ulogic;
 
-        l_in  : in Decode2ToLoadstore1Type;
+        l_in  : in Execute1ToLoadstore1Type;
 
         l_out : out Loadstore1ToDcacheType
         );

From e08ca4ab8eba7bec404f82396e41d3b5c616b94d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 14 Jan 2020 21:55:33 +1100
Subject: [PATCH 09/10] countzero: Add a register to help make timing

This adds a register in the middle of the countzero computation,
so that we now have two cycles to count leading or trailing zeroes
instead of just one.  Execute1 now outputs a one-cycle stall signal
when it encounters a cntlz* or cnttz* instruction.  With this,
the countzero path no longer fails timing on the Artix-7 at 100MHz.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 countzero.vhdl    | 85 ++++++++++++++++++++++++++++++-----------------
 countzero_tb.vhdl | 10 ++++++
 execute1.vhdl     | 18 ++++++++--
 3 files changed, 79 insertions(+), 34 deletions(-)

diff --git a/countzero.vhdl b/countzero.vhdl
index d3960f0..50e6ead 100644
--- a/countzero.vhdl
+++ b/countzero.vhdl
@@ -6,6 +6,7 @@ library work;
 
 entity zero_counter is
     port (
+        clk         : in std_logic;
 	rs          : in std_ulogic_vector(63 downto 0);
 	count_right : in std_ulogic;
 	is_32bit    : in std_ulogic;
@@ -14,10 +15,14 @@ entity zero_counter is
 end entity zero_counter;
 
 architecture behaviour of zero_counter is
-    signal y, z     : std_ulogic_vector(3 downto 0);
-    signal v16      : std_ulogic_vector(15 downto 0);
-    signal v4       : std_ulogic_vector(3 downto 0);
-    signal sel      : std_ulogic_vector(5 downto 0);
+    type intermediate_result is record
+        v16: std_ulogic_vector(15 downto 0);
+        sel_hi: std_ulogic_vector(1 downto 0);
+        is_32bit: std_ulogic;
+        count_right: std_ulogic;
+    end record;
+
+    signal r, r_in  : intermediate_result;
 
     -- Return the index of the leftmost or rightmost 1 in a set of 4 bits.
     -- Assumes v is not "0000"; if it is, return (right ? "11" : "00").
@@ -47,65 +52,83 @@ architecture behaviour of zero_counter is
     end;
 
 begin
-    zerocounter0: process(all)
+    zerocounter_0: process(clk)
+    begin
+	if rising_edge(clk) then
+            r <= r_in;
+        end if;
+    end process;
+
+    zerocounter_1: process(all)
+        variable v: intermediate_result;
+        variable y, z: std_ulogic_vector(3 downto 0);
+        variable sel: std_ulogic_vector(5 downto 0);
+        variable v4: std_ulogic_vector(3 downto 0);
+
     begin
 	-- Test 4 groups of 16 bits each.
 	-- The top 2 groups are considered to be zero in 32-bit mode.
-	z(0) <= or (rs(15 downto 0));
-	z(1) <= or (rs(31 downto 16));
-	z(2) <= or (rs(47 downto 32));
-	z(3) <= or (rs(63 downto 48));
+	z(0) := or (rs(15 downto 0));
+	z(1) := or (rs(31 downto 16));
+	z(2) := or (rs(47 downto 32));
+	z(3) := or (rs(63 downto 48));
         if is_32bit = '0' then
-            sel(5 downto 4) <= encoder(z, count_right);
+            v.sel_hi := encoder(z, count_right);
         else
-            sel(5) <= '0';
+            v.sel_hi(1) := '0';
             if count_right = '0' then
-                sel(4) <= z(1);
+                v.sel_hi(0) := z(1);
             else
-                sel(4) <= not z(0);
+                v.sel_hi(0) := not z(0);
             end if;
         end if;
 
 	-- Select the leftmost/rightmost non-zero group of 16 bits
-	case sel(5 downto 4) is
+	case v.sel_hi is
 	    when "00" =>
-		v16 <= rs(15 downto 0);
+		v.v16 := rs(15 downto 0);
 	    when "01" =>
-		v16 <= rs(31 downto 16);
+		v.v16 := rs(31 downto 16);
 	    when "10" =>
-		v16 <= rs(47 downto 32);
+		v.v16 := rs(47 downto 32);
 	    when others =>
-		v16 <= rs(63 downto 48);
+		v.v16 := rs(63 downto 48);
 	end case;
 
+        -- Latch this and do the rest in the next cycle, for the sake of timing
+        v.is_32bit := is_32bit;
+        v.count_right := count_right;
+        r_in <= v;
+        sel(5 downto 4) := r.sel_hi;
+
 	-- Test 4 groups of 4 bits
-	y(0) <= or (v16(3 downto 0));
-	y(1) <= or (v16(7 downto 4));
-	y(2) <= or (v16(11 downto 8));
-	y(3) <= or (v16(15 downto 12));
-	sel(3 downto 2) <= encoder(y, count_right);
+	y(0) := or (r.v16(3 downto 0));
+	y(1) := or (r.v16(7 downto 4));
+	y(2) := or (r.v16(11 downto 8));
+	y(3) := or (r.v16(15 downto 12));
+	sel(3 downto 2) := encoder(y, r.count_right);
 
 	-- Select the leftmost/rightmost non-zero group of 4 bits
 	case sel(3 downto 2) is
 	    when "00" =>
-		v4 <= v16(3 downto 0);
+		v4 := r.v16(3 downto 0);
 	    when "01" =>
-		v4 <= v16(7 downto 4);
+		v4 := r.v16(7 downto 4);
 	    when "10" =>
-		v4 <= v16(11 downto 8);
+		v4 := r.v16(11 downto 8);
 	    when others =>
-		v4 <= v16(15 downto 12);
+		v4 := r.v16(15 downto 12);
 	end case;
 
-	sel(1 downto 0) <= encoder(v4, count_right);
+	sel(1 downto 0) := encoder(v4, r.count_right);
 
 	-- sel is now the index of the leftmost/rightmost 1 bit in rs
 	if v4 = "0000" then
 	    -- operand is zero, return 32 for 32-bit, else 64
-	    result <= x"00000000000000" & '0' & not is_32bit & is_32bit & "00000";
-	elsif count_right = '0' then
+	    result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000";
+	elsif r.count_right = '0' then
 	    -- return (63 - sel), trimmed to 5 bits in 32-bit mode
-	    result <= x"00000000000000" & "00" & (not sel(5) and not is_32bit) & not sel(4 downto 0);
+	    result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0);
 	else
 	    result <= x"00000000000000" & "00" & sel;
 	end if;
diff --git a/countzero_tb.vhdl b/countzero_tb.vhdl
index 91de334..21529de 100644
--- a/countzero_tb.vhdl
+++ b/countzero_tb.vhdl
@@ -15,16 +15,26 @@ architecture behave of countzero_tb is
     signal is_32bit, count_right: std_ulogic := '0';
     signal result: std_ulogic_vector(63 downto 0);
     signal randno: std_ulogic_vector(63 downto 0);
+    signal clk: std_ulogic;
 
 begin
     zerocounter_0: entity work.zero_counter
 	port map (
+            clk => clk,
 	    rs => rs,
 	    result => result,
 	    count_right => count_right,
 	    is_32bit => is_32bit
 	);
 
+    clk_process: process
+    begin
+        clk <= '0';
+        wait for clk_period/2;
+        clk <= '1';
+        wait for clk_period/2;
+    end process;
+
     stim_process: process
         variable r: std_ulogic_vector(63 downto 0);
     begin
diff --git a/execute1.vhdl b/execute1.vhdl
index e49494f..ae13c72 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -42,6 +42,7 @@ architecture behaviour of execute1 is
 	next_lr : std_ulogic_vector(63 downto 0);
 	mul_in_progress : std_ulogic;
         div_in_progress : std_ulogic;
+        cntz_in_progress : std_ulogic;
 	slow_op_dest : gpr_index_t;
 	slow_op_rc : std_ulogic;
 	slow_op_oe : std_ulogic;
@@ -143,6 +144,7 @@ begin
 
     countzero_0: entity work.zero_counter
 	port map (
+            clk => clk,
 	    rs => c_in,
 	    count_right => e_in.insn(10),
 	    is_32bit => e_in.is_32bit,
@@ -259,6 +261,7 @@ begin
 	v.lr_update := '0';
 	v.mul_in_progress := '0';
         v.div_in_progress := '0';
+        v.cntz_in_progress := '0';
 
 	-- signals to multiply unit
 	x_to_multiply <= Execute1ToMultiplyInit;
@@ -473,9 +476,10 @@ begin
 	    when OP_CMPB =>
 		result := ppc_cmpb(c_in, b_in);
 		result_en := '1';
-	    when OP_CNTZ =>
-		result := countzero_result;
-		result_en := '1';
+            when OP_CNTZ =>
+                v.e.valid := '0';
+                v.cntz_in_progress := '1';
+                stall_out <= '1';
             when OP_EXTS =>
                 -- note data_len is a 1-hot encoding
 		negative := (e_in.data_len(0) and c_in(7)) or
@@ -703,6 +707,14 @@ begin
 	    result := r.next_lr;
 	    v.e.write_reg := fast_spr_num(SPR_LR);
 	    v.e.valid := '1';
+        elsif r.cntz_in_progress = '1' then
+            -- cnt[lt]z always takes two cycles
+            result := countzero_result;
+            result_en := '1';
+            v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
+            v.e.rc := v.slow_op_rc;
+            v.e.xerc := v.slow_op_xerc;
+            v.e.valid := '1';
 	elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
 	    if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
 	       (r.div_in_progress = '1' and divider_to_x.valid = '1') then

From 2661b9b985698a4ecd2854befa7c83f4e0c7b02e Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 14 Jan 2020 23:20:42 +1100
Subject: [PATCH 10/10] decode1: Mark subfic as pipelined

This seems just to have been missed in commit f291efa26690 ("decode1:
Mark ALU ops using carry as pipelined").

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index d2dbd96..f1b5ad4 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -66,7 +66,7 @@ architecture behaviour of decode1 is
 		45 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthu
 		36 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stw
 		37 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwu
-		 8 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- subfic
+		 8 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic
 		 2 =>       (ALU,    OP_TDI,       RA,         CONST_SI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi
 		--PPC_TWI 3
 		26 =>       (ALU,    OP_XOR,       NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xori