From e08ca4ab8eba7bec404f82396e41d3b5c616b94d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 14 Jan 2020 21:55:33 +1100
Subject: [PATCH] countzero: Add a register to help make timing

This adds a register in the middle of the countzero computation,
so that we now have two cycles to count leading or trailing zeroes
instead of just one.  Execute1 now outputs a one-cycle stall signal
when it encounters a cntlz* or cnttz* instruction.  With this,
the countzero path no longer fails timing on the Artix-7 at 100MHz.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 countzero.vhdl    | 85 ++++++++++++++++++++++++++++++-----------------
 countzero_tb.vhdl | 10 ++++++
 execute1.vhdl     | 18 ++++++++--
 3 files changed, 79 insertions(+), 34 deletions(-)

diff --git a/countzero.vhdl b/countzero.vhdl
index d3960f0..50e6ead 100644
--- a/countzero.vhdl
+++ b/countzero.vhdl
@@ -6,6 +6,7 @@ library work;
 
 entity zero_counter is
     port (
+        clk         : in std_ulogic;  -- clock; the first-stage result is registered on its rising edge
 	rs          : in std_ulogic_vector(63 downto 0);
 	count_right : in std_ulogic;
 	is_32bit    : in std_ulogic;
@@ -14,10 +15,14 @@ entity zero_counter is
 end entity zero_counter;
 
 architecture behaviour of zero_counter is
-    signal y, z     : std_ulogic_vector(3 downto 0);
-    signal v16      : std_ulogic_vector(15 downto 0);
-    signal v4       : std_ulogic_vector(3 downto 0);
-    signal sel      : std_ulogic_vector(5 downto 0);
+    type intermediate_result is record  -- first-stage state carried across the clock edge
+        v16: std_ulogic_vector(15 downto 0);    -- the 16-bit group of rs chosen by sel_hi
+        sel_hi: std_ulogic_vector(1 downto 0);  -- index of that group; becomes sel(5 downto 4)
+        is_32bit: std_ulogic;                   -- copy of the is_32bit input
+        count_right: std_ulogic;                -- copy of the count_right input
+    end record;
+
+    signal r, r_in  : intermediate_result;  -- r_in: value computed this cycle; r: r_in as captured at the last rising clk edge
 
     -- Return the index of the leftmost or rightmost 1 in a set of 4 bits.
     -- Assumes v is not "0000"; if it is, return (right ? "11" : "00").
@@ -47,65 +52,83 @@ architecture behaviour of zero_counter is
     end;
 
 begin
-    zerocounter0: process(all)
+    zerocounter_0: process(clk)  -- register between the first and second halves of the count
+    begin
+	if rising_edge(clk) then
+            r <= r_in;  -- capture the first-stage result driven by zerocounter_1
+        end if;
+    end process;
+
+    zerocounter_1: process(all)
+        variable v: intermediate_result;
+        variable y, z: std_ulogic_vector(3 downto 0);
+        variable sel: std_ulogic_vector(5 downto 0);
+        variable v4: std_ulogic_vector(3 downto 0);
+
     begin
 	-- Test 4 groups of 16 bits each.
 	-- The top 2 groups are considered to be zero in 32-bit mode.
-	z(0) <= or (rs(15 downto 0));
-	z(1) <= or (rs(31 downto 16));
-	z(2) <= or (rs(47 downto 32));
-	z(3) <= or (rs(63 downto 48));
+	z(0) := or (rs(15 downto 0));
+	z(1) := or (rs(31 downto 16));
+	z(2) := or (rs(47 downto 32));
+	z(3) := or (rs(63 downto 48));
         if is_32bit = '0' then
-            sel(5 downto 4) <= encoder(z, count_right);
+            v.sel_hi := encoder(z, count_right);
         else
-            sel(5) <= '0';
+            v.sel_hi(1) := '0';
             if count_right = '0' then
-                sel(4) <= z(1);
+                v.sel_hi(0) := z(1);
             else
-                sel(4) <= not z(0);
+                v.sel_hi(0) := not z(0);
             end if;
         end if;
 
 	-- Select the leftmost/rightmost non-zero group of 16 bits
-	case sel(5 downto 4) is
+	case v.sel_hi is
 	    when "00" =>
-		v16 <= rs(15 downto 0);
+		v.v16 := rs(15 downto 0);
 	    when "01" =>
-		v16 <= rs(31 downto 16);
+		v.v16 := rs(31 downto 16);
 	    when "10" =>
-		v16 <= rs(47 downto 32);
+		v.v16 := rs(47 downto 32);
 	    when others =>
-		v16 <= rs(63 downto 48);
+		v.v16 := rs(63 downto 48);
 	end case;
 
+        -- Register this and do the rest in the next cycle, for the sake of timing
+        v.is_32bit := is_32bit;
+        v.count_right := count_right;
+        r_in <= v;
+        sel(5 downto 4) := r.sel_hi;
+
 	-- Test 4 groups of 4 bits
-	y(0) <= or (v16(3 downto 0));
-	y(1) <= or (v16(7 downto 4));
-	y(2) <= or (v16(11 downto 8));
-	y(3) <= or (v16(15 downto 12));
-	sel(3 downto 2) <= encoder(y, count_right);
+	y(0) := or (r.v16(3 downto 0));
+	y(1) := or (r.v16(7 downto 4));
+	y(2) := or (r.v16(11 downto 8));
+	y(3) := or (r.v16(15 downto 12));
+	sel(3 downto 2) := encoder(y, r.count_right);
 
 	-- Select the leftmost/rightmost non-zero group of 4 bits
 	case sel(3 downto 2) is
 	    when "00" =>
-		v4 <= v16(3 downto 0);
+		v4 := r.v16(3 downto 0);
 	    when "01" =>
-		v4 <= v16(7 downto 4);
+		v4 := r.v16(7 downto 4);
 	    when "10" =>
-		v4 <= v16(11 downto 8);
+		v4 := r.v16(11 downto 8);
 	    when others =>
-		v4 <= v16(15 downto 12);
+		v4 := r.v16(15 downto 12);
 	end case;
 
-	sel(1 downto 0) <= encoder(v4, count_right);
+	sel(1 downto 0) := encoder(v4, r.count_right);
 
 	-- sel is now the index of the leftmost/rightmost 1 bit in rs
 	if v4 = "0000" then
 	    -- operand is zero, return 32 for 32-bit, else 64
-	    result <= x"00000000000000" & '0' & not is_32bit & is_32bit & "00000";
-	elsif count_right = '0' then
+	    result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000";
+	elsif r.count_right = '0' then
 	    -- return (63 - sel), trimmed to 5 bits in 32-bit mode
-	    result <= x"00000000000000" & "00" & (not sel(5) and not is_32bit) & not sel(4 downto 0);
+	    result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0);
 	else
 	    result <= x"00000000000000" & "00" & sel;
 	end if;
diff --git a/countzero_tb.vhdl b/countzero_tb.vhdl
index 91de334..21529de 100644
--- a/countzero_tb.vhdl
+++ b/countzero_tb.vhdl
@@ -15,16 +15,26 @@ architecture behave of countzero_tb is
     signal is_32bit, count_right: std_ulogic := '0';
     signal result: std_ulogic_vector(63 downto 0);
     signal randno: std_ulogic_vector(63 downto 0);
+    signal clk: std_ulogic;
 
 begin
     zerocounter_0: entity work.zero_counter
 	port map (
+            clk => clk,
 	    rs => rs,
 	    result => result,
 	    count_right => count_right,
 	    is_32bit => is_32bit
 	);
 
+    clk_process: process  -- free-running clock driving the zero_counter clk port
+    begin
+        clk <= '0';
+        wait for clk_period/2;  -- low for half a period
+        clk <= '1';
+        wait for clk_period/2;  -- high for half a period, then the process restarts
+    end process;
+
     stim_process: process
         variable r: std_ulogic_vector(63 downto 0);
     begin
diff --git a/execute1.vhdl b/execute1.vhdl
index e49494f..ae13c72 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -42,6 +42,7 @@ architecture behaviour of execute1 is
 	next_lr : std_ulogic_vector(63 downto 0);
 	mul_in_progress : std_ulogic;
         div_in_progress : std_ulogic;
+        cntz_in_progress : std_ulogic;
 	slow_op_dest : gpr_index_t;
 	slow_op_rc : std_ulogic;
 	slow_op_oe : std_ulogic;
@@ -143,6 +144,7 @@ begin
 
     countzero_0: entity work.zero_counter
 	port map (
+            clk => clk,
 	    rs => c_in,
 	    count_right => e_in.insn(10),
 	    is_32bit => e_in.is_32bit,
@@ -259,6 +261,7 @@ begin
 	v.lr_update := '0';
 	v.mul_in_progress := '0';
         v.div_in_progress := '0';
+        v.cntz_in_progress := '0';
 
 	-- signals to multiply unit
 	x_to_multiply <= Execute1ToMultiplyInit;
@@ -473,9 +476,10 @@ begin
 	    when OP_CMPB =>
 		result := ppc_cmpb(c_in, b_in);
 		result_en := '1';
-	    when OP_CNTZ =>
-		result := countzero_result;
-		result_en := '1';
+            when OP_CNTZ =>
+                v.e.valid := '0';           -- no result is produced in this cycle
+                v.cntz_in_progress := '1';  -- the result is delivered by the branch that tests r.cntz_in_progress
+                stall_out <= '1';           -- stall while zero_counter completes its second stage
             when OP_EXTS =>
                 -- note data_len is a 1-hot encoding
 		negative := (e_in.data_len(0) and c_in(7)) or
@@ -703,6 +707,14 @@ begin
 	    result := r.next_lr;
 	    v.e.write_reg := fast_spr_num(SPR_LR);
 	    v.e.valid := '1';
+        elsif r.cntz_in_progress = '1' then
+            -- cnt[lt]z always takes two cycles
+            result := countzero_result;  -- output of the zero_counter instance
+            result_en := '1';
+            v.e.write_reg := gpr_to_gspr(v.slow_op_dest);  -- NOTE(review): presumably saved when the instruction was issued; confirm where slow_op_* are set
+            v.e.rc := v.slow_op_rc;
+            v.e.xerc := v.slow_op_xerc;
+            v.e.valid := '1';  -- complete the instruction now
 	elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
 	    if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
 	       (r.div_in_progress = '1' and divider_to_x.valid = '1') then