From e08ca4ab8eba7bec404f82396e41d3b5c616b94d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 14 Jan 2020 21:55:33 +1100 Subject: [PATCH] countzero: Add a register to help make timing This adds a register in the middle of the countzero computation, so that we now have two cycles to count leading or trailing zeroes instead of just one. Execute1 now outputs a one-cycle stall signal when it encounters a cntlz* or cnttz* instruction. With this, the countzero path no longer fails timing on the Artix-7 at 100MHz. Signed-off-by: Paul Mackerras --- countzero.vhdl | 85 ++++++++++++++++++++++++++++++----------------- countzero_tb.vhdl | 10 ++++++ execute1.vhdl | 18 ++++++++-- 3 files changed, 79 insertions(+), 34 deletions(-) diff --git a/countzero.vhdl b/countzero.vhdl index d3960f0..50e6ead 100644 --- a/countzero.vhdl +++ b/countzero.vhdl @@ -6,6 +6,7 @@ library work; entity zero_counter is port ( + clk : in std_logic; rs : in std_ulogic_vector(63 downto 0); count_right : in std_ulogic; is_32bit : in std_ulogic; @@ -14,10 +15,14 @@ entity zero_counter is end entity zero_counter; architecture behaviour of zero_counter is - signal y, z : std_ulogic_vector(3 downto 0); - signal v16 : std_ulogic_vector(15 downto 0); - signal v4 : std_ulogic_vector(3 downto 0); - signal sel : std_ulogic_vector(5 downto 0); + type intermediate_result is record + v16: std_ulogic_vector(15 downto 0); + sel_hi: std_ulogic_vector(1 downto 0); + is_32bit: std_ulogic; + count_right: std_ulogic; + end record; + + signal r, r_in : intermediate_result; -- Return the index of the leftmost or rightmost 1 in a set of 4 bits. -- Assumes v is not "0000"; if it is, return (right ? "11" : "00"). @@ -47,65 +52,83 @@ architecture behaviour of zero_counter is end; begin - zerocounter0: process(all) + zerocounter_0: process(clk) + begin + if rising_edge(clk) then + r <= r_in; + end if; + end process; + + zerocounter_1: process(all) + variable v: intermediate_result; + variable y, z: std_ulogic_vector(3 downto 0); + variable sel: std_ulogic_vector(5 downto 0); + variable v4: std_ulogic_vector(3 downto 0); + begin -- Test 4 groups of 16 bits each. -- The top 2 groups are considered to be zero in 32-bit mode. - z(0) <= or (rs(15 downto 0)); - z(1) <= or (rs(31 downto 16)); - z(2) <= or (rs(47 downto 32)); - z(3) <= or (rs(63 downto 48)); + z(0) := or (rs(15 downto 0)); + z(1) := or (rs(31 downto 16)); + z(2) := or (rs(47 downto 32)); + z(3) := or (rs(63 downto 48)); if is_32bit = '0' then - sel(5 downto 4) <= encoder(z, count_right); + v.sel_hi := encoder(z, count_right); else - sel(5) <= '0'; + v.sel_hi(1) := '0'; if count_right = '0' then - sel(4) <= z(1); + v.sel_hi(0) := z(1); else - sel(4) <= not z(0); + v.sel_hi(0) := not z(0); end if; end if; -- Select the leftmost/rightmost non-zero group of 16 bits - case sel(5 downto 4) is + case v.sel_hi is when "00" => - v16 <= rs(15 downto 0); + v.v16 := rs(15 downto 0); when "01" => - v16 <= rs(31 downto 16); + v.v16 := rs(31 downto 16); when "10" => - v16 <= rs(47 downto 32); + v.v16 := rs(47 downto 32); when others => - v16 <= rs(63 downto 48); + v.v16 := rs(63 downto 48); end case; + -- Latch this and do the rest in the next cycle, for the sake of timing + v.is_32bit := is_32bit; + v.count_right := count_right; + r_in <= v; + sel(5 downto 4) := r.sel_hi; + -- Test 4 groups of 4 bits - y(0) <= or (v16(3 downto 0)); - y(1) <= or (v16(7 downto 4)); - y(2) <= or (v16(11 downto 8)); - y(3) <= or (v16(15 downto 12)); - sel(3 downto 2) <= encoder(y, count_right); + y(0) := or (r.v16(3 downto 0)); + y(1) := or (r.v16(7 downto 4)); + y(2) := or (r.v16(11 downto 8)); + y(3) := or (r.v16(15 downto 12)); + sel(3 downto 2) := encoder(y, r.count_right); -- Select the leftmost/rightmost non-zero group of 4 bits case sel(3 downto 2) is when "00" => - v4 <= v16(3 downto 0); + v4 := r.v16(3 downto 0); when "01" => - v4 <= v16(7 downto 4); + v4 := r.v16(7 downto 4); when "10" => - v4 <= v16(11 downto 8); + v4 := r.v16(11 downto 8); when others => - v4 <= v16(15 downto 12); + v4 := r.v16(15 downto 12); end case; - sel(1 downto 0) <= encoder(v4, count_right); + sel(1 downto 0) := encoder(v4, r.count_right); -- sel is now the index of the leftmost/rightmost 1 bit in rs if v4 = "0000" then -- operand is zero, return 32 for 32-bit, else 64 - result <= x"00000000000000" & '0' & not is_32bit & is_32bit & "00000"; - elsif count_right = '0' then + result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000"; + elsif r.count_right = '0' then -- return (63 - sel), trimmed to 5 bits in 32-bit mode - result <= x"00000000000000" & "00" & (not sel(5) and not is_32bit) & not sel(4 downto 0); + result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0); else result <= x"00000000000000" & "00" & sel; end if; diff --git a/countzero_tb.vhdl b/countzero_tb.vhdl index 91de334..21529de 100644 --- a/countzero_tb.vhdl +++ b/countzero_tb.vhdl @@ -15,16 +15,26 @@ architecture behave of countzero_tb is signal is_32bit, count_right: std_ulogic := '0'; signal result: std_ulogic_vector(63 downto 0); signal randno: std_ulogic_vector(63 downto 0); + signal clk: std_ulogic; begin zerocounter_0: entity work.zero_counter port map ( + clk => clk, rs => rs, result => result, count_right => count_right, is_32bit => is_32bit ); + clk_process: process + begin + clk <= '0'; + wait for clk_period/2; + clk <= '1'; + wait for clk_period/2; + end process; + stim_process: process variable r: std_ulogic_vector(63 downto 0); begin diff --git a/execute1.vhdl b/execute1.vhdl index e49494f..ae13c72 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -42,6 +42,7 @@ architecture behaviour of execute1 is next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; div_in_progress : std_ulogic; + cntz_in_progress : std_ulogic; slow_op_dest : gpr_index_t; slow_op_rc : std_ulogic; slow_op_oe : std_ulogic; @@ -143,6 +144,7 @@ begin countzero_0: entity work.zero_counter port map ( + clk => clk, rs => c_in, count_right => e_in.insn(10), is_32bit => e_in.is_32bit, @@ -259,6 +261,7 @@ begin v.lr_update := '0'; v.mul_in_progress := '0'; v.div_in_progress := '0'; + v.cntz_in_progress := '0'; -- signals to multiply unit x_to_multiply <= Execute1ToMultiplyInit; @@ -473,9 +476,10 @@ begin when OP_CMPB => result := ppc_cmpb(c_in, b_in); result_en := '1'; - when OP_CNTZ => - result := countzero_result; - result_en := '1'; + when OP_CNTZ => + v.e.valid := '0'; + v.cntz_in_progress := '1'; + stall_out <= '1'; when OP_EXTS => -- note data_len is a 1-hot encoding negative := (e_in.data_len(0) and c_in(7)) or @@ -703,6 +707,14 @@ begin result := r.next_lr; v.e.write_reg := fast_spr_num(SPR_LR); v.e.valid := '1'; + elsif r.cntz_in_progress = '1' then + -- cnt[lt]z always takes two cycles + result := countzero_result; + result_en := '1'; + v.e.write_reg := gpr_to_gspr(v.slow_op_dest); + v.e.rc := v.slow_op_rc; + v.e.xerc := v.slow_op_xerc; + v.e.valid := '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or (r.div_in_progress = '1' and divider_to_x.valid = '1') then