From 10869888833847d6b899dc7c23c002f3fce85f71 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 21 Feb 2022 09:58:07 +1100
Subject: [PATCH] countzero: Use alternative algorithm for higher bits

This implements an alternative count-leading-zeroes algorithm which
uses less LUTs to generate the higher-order bits (2..5) of the
result.

By doing (v | -v) rather than (v & -v), we get a value which has ones
from the MSB down to the rightmost 1 bit in v and then zeroes down to
the LSB.  This means that we can generate the MSB of the result (the
index of the rightmost 1 bit in v) just by looking at bits 63 and 31
of (v | -v), assuming that v is 64 bits.  Bit 4 of the result requires
looking at bits 63, 47, 31 and 15.  In contrast, each bit of the
result using (v & -v), which has a single 1, requires ORing together
32 bits.

It turns out that the minimum LUT usage comes from using (v & -v) to
generate bits 0 and 1 of the result, and using (v | -v) to generate
bits 2 to 5.  This saves almost 60 6-input LUTs on the Artix-7.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 countbits.vhdl | 39 ++++++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/countbits.vhdl b/countbits.vhdl
index 134540f..ab5be2a 100644
--- a/countbits.vhdl
+++ b/countbits.vhdl
@@ -20,10 +20,11 @@ end entity bit_counter;
 architecture behaviour of bit_counter is
     -- signals for count-leading/trailing-zeroes
     signal inp : std_ulogic_vector(63 downto 0);
+    signal inp_r : std_ulogic_vector(63 downto 0);
     signal sum : std_ulogic_vector(64 downto 0);
-    signal msb_r : std_ulogic;
+    signal sum_r : std_ulogic_vector(64 downto 0);
     signal onehot : std_ulogic_vector(63 downto 0);
-    signal onehot_r : std_ulogic_vector(63 downto 0);
+    signal edge : std_ulogic_vector(63 downto 0);
     signal bitnum : std_ulogic_vector(5 downto 0);
     signal cntz : std_ulogic_vector(63 downto 0);
 
@@ -45,16 +46,36 @@ architecture behaviour of bit_counter is
     signal pc32     : sixbit2;
     signal popcnt   : std_ulogic_vector(63 downto 0);
 
+    function edgelocation(v: std_ulogic_vector; nbits: natural) return std_ulogic_vector is
+        variable p: std_ulogic_vector(nbits - 1 downto 0);
+        variable stride: natural;
+        variable b: std_ulogic;
+        variable k: natural;
+    begin
+        stride := 2;
+        for i in 0 to nbits - 1 loop
+            b := '0';
+            for j in 0 to (2**nbits / stride) - 1 loop
+                k := j * stride;
+                b := b or (v(k + stride - 1) and not v(k + (stride/2) - 1));
+            end loop;
+            p(i) := b;
+            stride := stride * 2;
+        end loop;
+        return p;
+    end function;
+
 begin
     countzero_r: process(clk)
     begin
         if rising_edge(clk) then
-            msb_r <= sum(64);
-            onehot_r <= onehot;
+            inp_r <= inp;
+            sum_r <= sum;
         end if;
     end process;
 
     countzero: process(all)
+        variable bitnum_e, bitnum_o : std_ulogic_vector(5 downto 0);
     begin
         if is_32bit = '0' then
             if count_right = '0' then
@@ -72,12 +93,16 @@ begin
         end if;
 
         sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);
-        onehot <= sum(63 downto 0) and inp;
 
         -- The following occurs after a clock edge
-        bitnum <= bit_number(onehot_r);
+        edge <= sum_r(63 downto 0) or inp_r;
+        bitnum_e := edgelocation(edge, 6);
+        onehot <= sum_r(63 downto 0) and inp_r;
+        bitnum_o := bit_number(onehot);
+        bitnum(5 downto 2) <= bitnum_e(5 downto 2);
+        bitnum(1 downto 0) <= bitnum_o(1 downto 0);
 
-        cntz <= 57x"0" & msb_r & bitnum;
+        cntz <= 57x"0" & sum_r(64) & bitnum;
     end process;
 
     popcnt_r: process(clk)