forked from cores/microwatt
core: Make popcnt* take two cycles
This moves the calculation of the result for popcnt* into the countbits unit (renamed from countzero) so that we can take two cycles to get the result. The motivation for this is that the popcnt* calculation was showing up as a critical path.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
fpu-constant
parent
6ff3b2499c
commit
2491aa7fc5
@@ -0,0 +1,130 @@
|
|||||||
|
library ieee;
|
||||||
|
use ieee.std_logic_1164.all;
|
||||||
|
use ieee.numeric_std.all;
|
||||||
|
|
||||||
|
library work;
|
||||||
|
use work.helpers.all;
|
||||||
|
|
||||||
|
-- Bit-counting unit: produces either a leading/trailing zero count or a
-- population count of rs, selected by do_popcnt.
--
-- The implementation places registers clocked by clk between its input logic
-- and `result`, so `result` corresponds to the inputs sampled at the previous
-- rising edge of clk.
entity bit_counter is
    port (
        -- Clock for the internal pipeline registers.
        clk     : in  std_logic;
        -- Source operand.
        rs      : in  std_ulogic_vector(63 downto 0);
        -- Single-bit controls, in port order:
        --   count_right : when '0' the operand is bit-reversed before the
        --                 zero count; otherwise it is used as-is
        --   do_popcnt   : when not '0', result carries the popcnt value
        --                 instead of the zero count
        --   is_32bit    : when not '0', the zero count only looks at
        --                 rs(31 downto 0)
        count_right, do_popcnt, is_32bit : in std_ulogic;
        -- popcnt granularity: bits 3:2 = "00" gives per-byte counts,
        -- "01" gives per-word counts, anything else a 64-bit count.
        datalen : in  std_ulogic_vector(3 downto 0);
        -- Zero count or popcnt value, zero-extended to 64 bits.
        result  : out std_ulogic_vector(63 downto 0)
    );
end bit_counter;
|
||||||
|
|
||||||
|
-- Two-stage implementation of bit_counter.
--
-- Stage 1 (combinational, evaluated before the clock edge):
--   * zero count: isolate the lowest set bit of the (optionally reversed)
--                 operand as a one-hot vector
--   * popcnt:     reduce rs to eight per-byte population counts
-- Stage 2 (combinational, evaluated after the clock edge):
--   * zero count: turn the registered one-hot vector into a bit number
--   * popcnt:     combine the registered byte counts per byte, per word or
--                 over the whole doubleword
architecture behaviour of bit_counter is
    -- count-leading/trailing-zeroes datapath
    signal inp      : std_ulogic_vector(63 downto 0);
    signal sum      : std_ulogic_vector(64 downto 0);
    signal onehot   : std_ulogic_vector(63 downto 0);
    signal onehot_r : std_ulogic_vector(63 downto 0);
    signal msb_r    : std_ulogic;
    signal bitnum   : std_ulogic_vector(5 downto 0);
    signal cntz     : std_ulogic_vector(63 downto 0);

    -- popcnt datapath: element types for each level of the adder tree
    subtype twobit   is unsigned(1 downto 0);
    subtype threebit is unsigned(2 downto 0);
    subtype fourbit  is unsigned(3 downto 0);
    subtype sixbit   is unsigned(5 downto 0);
    type twobit32   is array(0 to 31) of twobit;
    type threebit16 is array(0 to 15) of threebit;
    type fourbit8   is array(0 to 7) of fourbit;
    type sixbit2    is array(0 to 1) of sixbit;

    signal pc2    : twobit32;      -- counts per bit pair
    signal pc4    : threebit16;    -- counts per nibble
    signal pc8    : fourbit8;      -- counts per byte
    signal pc8_r  : fourbit8;      -- registered copy of pc8
    signal pc32   : sixbit2;       -- counts per 32-bit word
    signal dlen_r : std_ulogic_vector(3 downto 0);
    signal pcnt_r : std_ulogic;
    signal popcnt : std_ulogic_vector(63 downto 0);

begin

    -- Pipeline registers separating stage 1 from stage 2.
    stage_regs: process(clk)
    begin
        if rising_edge(clk) then
            -- zero-count path
            msb_r    <= sum(64);
            onehot_r <= onehot;
            -- popcnt path
            pc8_r    <= pc8;
            dlen_r   <= datalen;
            pcnt_r   <= do_popcnt;
        end if;
    end process;

    ------------------------------------------------------------------
    -- Stage 1: zero count
    ------------------------------------------------------------------

    -- Choose the vector whose lowest set bit is to be located.  When
    -- count_right is '0' the operand is bit-reversed first.  In 32-bit mode
    -- the upper half is forced to all ones and only rs(31 downto 0) is used.
    operand_select: process(all)
    begin
        if count_right = '0' then
            if is_32bit = '0' then
                inp <= bit_reverse(rs);
            else
                inp(63 downto 32) <= x"FFFFFFFF";
                inp(31 downto 0)  <= bit_reverse(rs(31 downto 0));
            end if;
        else
            if is_32bit = '0' then
                inp <= rs;
            else
                inp(63 downto 32) <= x"FFFFFFFF";
                inp(31 downto 0)  <= rs(31 downto 0);
            end if;
        end if;
    end process;

    -- (not inp) + 1 is the two's complement of inp; ANDing it with inp leaves
    -- only the lowest set bit.  The carry out, sum(64), is set only when inp
    -- is all zeroes.
    sum    <= std_ulogic_vector(unsigned('0' & not inp) + 1);
    onehot <= sum(63 downto 0) and inp;

    ------------------------------------------------------------------
    -- Stage 2: zero count (after the clock edge)
    ------------------------------------------------------------------

    -- bit_number comes from work.helpers; presumably it returns the index of
    -- the single set bit in its argument -- confirm against helpers.
    bitnum <= bit_number(onehot_r);
    cntz   <= 57x"0" & msb_r & bitnum;

    ------------------------------------------------------------------
    -- Stage 1: popcnt adder tree down to per-byte counts
    ------------------------------------------------------------------

    pc2_gen: for i in 0 to 31 generate
        pc2(i) <= resize(unsigned(rs(2 * i downto 2 * i)), 2) +
                  resize(unsigned(rs(2 * i + 1 downto 2 * i + 1)), 2);
    end generate;

    pc4_gen: for i in 0 to 15 generate
        pc4(i) <= resize(pc2(2 * i), 3) + resize(pc2(2 * i + 1), 3);
    end generate;

    pc8_gen: for i in 0 to 7 generate
        pc8(i) <= resize(pc4(2 * i), 4) + resize(pc4(2 * i + 1), 4);
    end generate;

    ------------------------------------------------------------------
    -- Stage 2: popcnt (after the clock edge)
    ------------------------------------------------------------------

    -- Sum four registered byte counts into each 32-bit word count.
    pc32_gen: for i in 0 to 1 generate
        pc32(i) <= resize(pc8_r(4 * i), 6) + resize(pc8_r(4 * i + 1), 6) +
                   resize(pc8_r(4 * i + 2), 6) + resize(pc8_r(4 * i + 3), 6);
    end generate;

    -- Place the counts in the result according to the registered datalen.
    popcnt_select: process(all)
        variable merged : std_ulogic_vector(63 downto 0);
    begin
        merged := (others => '0');
        if dlen_r(3 downto 2) = "00" then
            -- popcntb: one count in the low four bits of every byte
            for b in 0 to 7 loop
                merged(8 * b + 3 downto 8 * b) := std_ulogic_vector(pc8_r(b));
            end loop;
        elsif dlen_r(3) = '0' then
            -- popcntw: one count in the low six bits of every word
            for w in 0 to 1 loop
                merged(32 * w + 5 downto 32 * w) := std_ulogic_vector(pc32(w));
            end loop;
        else
            -- 64-bit count: sum of the two word counts
            merged(6 downto 0) :=
                std_ulogic_vector(resize(pc32(0), 7) + resize(pc32(1), 7));
        end if;
        popcnt <= merged;
    end process;

    -- The registered do_popcnt picks which datapath drives the output.
    result <= popcnt when pcnt_r /= '0' else cntz;

end behaviour;
|
@@ -1,60 +0,0 @@
|
|||||||
library ieee;
|
|
||||||
use ieee.std_logic_1164.all;
|
|
||||||
use ieee.numeric_std.all;
|
|
||||||
|
|
||||||
library work;
|
|
||||||
use work.helpers.all;
|
|
||||||
|
|
||||||
-- Zero-counting unit: produces a leading/trailing zero count of rs.
--
-- The implementation places registers clocked by clk between its input logic
-- and `result`, so `result` corresponds to the inputs sampled at the previous
-- rising edge of clk.
entity zero_counter is
    port (
        -- Clock for the internal pipeline registers.
        clk    : in  std_logic;
        -- Source operand.
        rs     : in  std_ulogic_vector(63 downto 0);
        -- Single-bit controls, in port order:
        --   count_right : when '0' the operand is bit-reversed before the
        --                 count; otherwise it is used as-is
        --   is_32bit    : when not '0', only rs(31 downto 0) is examined
        count_right, is_32bit : in std_ulogic;
        -- Zero count, zero-extended to 64 bits.
        result : out std_ulogic_vector(63 downto 0)
    );
end zero_counter;
|
|
||||||
|
|
||||||
-- Two-stage implementation of zero_counter.
--
-- Stage 1 (before the clock edge) isolates the lowest set bit of the
-- (optionally reversed) operand as a one-hot vector.  Stage 2 (after the
-- clock edge) turns the registered one-hot vector into a bit number.
architecture behaviour of zero_counter is
    signal inp      : std_ulogic_vector(63 downto 0);
    signal sum      : std_ulogic_vector(64 downto 0);
    signal onehot   : std_ulogic_vector(63 downto 0);
    signal onehot_r : std_ulogic_vector(63 downto 0);
    signal msb_r    : std_ulogic;
    signal bitnum   : std_ulogic_vector(5 downto 0);

begin

    -- Pipeline registers separating stage 1 from stage 2.
    stage_regs: process(clk)
    begin
        if rising_edge(clk) then
            msb_r    <= sum(64);
            onehot_r <= onehot;
        end if;
    end process;

    -- Choose the vector whose lowest set bit is to be located.  When
    -- count_right is '0' the operand is bit-reversed first.  In 32-bit mode
    -- the upper half is forced to all ones and only rs(31 downto 0) is used.
    operand_select: process(all)
    begin
        if count_right = '0' then
            if is_32bit = '0' then
                inp <= bit_reverse(rs);
            else
                inp(63 downto 32) <= x"FFFFFFFF";
                inp(31 downto 0)  <= bit_reverse(rs(31 downto 0));
            end if;
        else
            if is_32bit = '0' then
                inp <= rs;
            else
                inp(63 downto 32) <= x"FFFFFFFF";
                inp(31 downto 0)  <= rs(31 downto 0);
            end if;
        end if;
    end process;

    -- (not inp) + 1 is the two's complement of inp; ANDing it with inp leaves
    -- only the lowest set bit.  The carry out, sum(64), is set only when inp
    -- is all zeroes.
    sum    <= std_ulogic_vector(unsigned('0' & not inp) + 1);
    onehot <= sum(63 downto 0) and inp;

    -- The following occurs after a clock edge.
    -- bit_number comes from work.helpers; presumably it returns the index of
    -- the single set bit in its argument -- confirm against helpers.
    bitnum <= bit_number(onehot_r);
    result <= x"00000000000000" & "0" & msb_r & bitnum;

end behaviour;
|
|
Loading…
Reference in New Issue