execute: Consolidate count-leading/trailing-zeroes implementations

This adds combinatorial logic that does 32-bit and 64-bit count
leading and trailing zeroes in one unit, and consolidates the
four instructions under a single OP_CNTZ opcode.

This saves 84 slice LUTs on the Arty A7-100.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
jtag-port
Paul Mackerras 5 years ago
parent b8fb721b81
commit 24a4a796ce

@ -18,12 +18,13 @@ sim_jtag.o: sim_jtag_socket.o
core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o
core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o execute2.o loadstore1.o loadstore2.o multiply.o writeback.o core_debug.o divider.o core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o execute2.o loadstore1.o loadstore2.o multiply.o writeback.o core_debug.o divider.o
core_debug.o: common.o core_debug.o: common.o
countzero.o:
cr_file.o: common.o cr_file.o: common.o
crhelpers.o: common.o crhelpers.o: common.o
decode1.o: common.o decode_types.o decode1.o: common.o decode_types.o
decode2.o: decode_types.o common.o helpers.o insn_helpers.o decode2.o: decode_types.o common.o helpers.o insn_helpers.o
decode_types.o: decode_types.o:
execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o
execute2.o: common.o crhelpers.o ppc_fx_insns.o execute2.o: common.o crhelpers.o ppc_fx_insns.o
fetch1.o: common.o fetch1.o: common.o
fetch2.o: common.o wishbone_types.o fetch2.o: common.o wishbone_types.o

@ -0,0 +1,103 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;

entity zero_counter is
port (
rs : in std_ulogic_vector(63 downto 0);
count_right : in std_ulogic;
is_32bit : in std_ulogic;
result : out std_ulogic_vector(63 downto 0)
);
end entity zero_counter;

architecture behaviour of zero_counter is
signal l32, r32 : std_ulogic;
signal v32 : std_ulogic_vector(31 downto 0);
signal v16 : std_ulogic_vector(15 downto 0);
signal v8 : std_ulogic_vector(7 downto 0);
signal v4 : std_ulogic_vector(3 downto 0);
signal sel : std_ulogic_vector(5 downto 0);
begin
zerocounter0: process(all)
begin
l32 <= or (rs(63 downto 32));
r32 <= or (rs(31 downto 0));
if (l32 = '0' or is_32bit = '1') and r32 = '0' then
-- operand is zero, return 32 for 32-bit, else 64
result <= x"00000000000000" & '0' & not is_32bit & is_32bit & "00000";
else

if count_right = '0' then
sel(5) <= l32 and (not is_32bit);
else
sel(5) <= (not r32) and (not is_32bit);
end if;
if sel(5) = '1' then
v32 <= rs(63 downto 32);
else
v32 <= rs(31 downto 0);
end if;

if count_right = '0' then
sel(4) <= or (v32(31 downto 16));
else
sel(4) <= not (or (v32(15 downto 0)));
end if;
if sel(4) = '1' then
v16 <= v32(31 downto 16);
else
v16 <= v32(15 downto 0);
end if;

if count_right = '0' then
sel(3) <= or (v16(15 downto 8));
else
sel(3) <= not (or (v16(7 downto 0)));
end if;
if sel(3) = '1' then
v8 <= v16(15 downto 8);
else
v8 <= v16(7 downto 0);
end if;

if count_right = '0' then
sel(2) <= or (v8(7 downto 4));
else
sel(2) <= not (or (v8(3 downto 0)));
end if;
if sel(2) = '1' then
v4 <= v8(7 downto 4);
else
v4 <= v8(3 downto 0);
end if;

if count_right = '0' then
if v4(3) = '1' then
sel(1 downto 0) <= "11";
elsif v4(2) = '1' then
sel(1 downto 0) <= "10";
elsif v4(1) = '1' then
sel(1 downto 0) <= "01";
else
sel(1 downto 0) <= "00";
end if;
result <= x"00000000000000" & "00" & (not sel(5) and not is_32bit) & not sel(4 downto 0);
else
if v4(0) = '1' then
sel(1 downto 0) <= "00";
elsif v4(1) = '1' then
sel(1 downto 0) <= "01";
elsif v4(2) = '1' then
sel(1 downto 0) <= "10";
else
sel(1 downto 0) <= "11";
end if;
result <= x"00000000000000" & "00" & sel;
end if;
end if;

end process;
end behaviour;

@ -145,10 +145,10 @@ architecture behaviour of decode1 is
-- 2#0011100000# cmpeqb -- 2#0011100000# cmpeqb
2#0000100000# => (ALU, OP_CMPL, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- cmpl 2#0000100000# => (ALU, OP_CMPL, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- cmpl
-- 2#0011000000# cmprb -- 2#0011000000# cmprb
2#0000111010# => (ALU, OP_CNTLZD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cntlzd 2#0000111010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cntlzd
2#0000011010# => (ALU, OP_CNTLZW, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cntlzw 2#0000011010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '1'), -- cntlzw
2#1000111010# => (ALU, OP_CNTTZD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cnttzd 2#1000111010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cnttzd
2#1000011010# => (ALU, OP_CNTTZW, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cnttzw 2#1000011010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '1'), -- cnttzw
-- 2#1011110011# darn -- 2#1011110011# darn
2#0001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbf 2#0001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbf
2#0000110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst 2#0000110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst

@ -5,7 +5,7 @@ package decode_types is
type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD, type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD,
OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG, OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG,
OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB, OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB,
OP_CNTLZD, OP_CNTLZW, OP_CNTTZD, OP_CNTTZW, OP_CRAND, OP_CNTZ, OP_CRAND,
OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC, OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC,
OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
OP_DCBZ, OP_DIV, OP_EXTSB, OP_EXTSH, OP_EXTSW, OP_DCBZ, OP_DIV, OP_EXTSB, OP_EXTSH, OP_EXTSW,

@ -46,6 +46,7 @@ architecture behaviour of execute1 is
signal rotator_result: std_ulogic_vector(63 downto 0); signal rotator_result: std_ulogic_vector(63 downto 0);
signal rotator_carry: std_ulogic; signal rotator_carry: std_ulogic;
signal logical_result: std_ulogic_vector(63 downto 0); signal logical_result: std_ulogic_vector(63 downto 0);
signal countzero_result: std_ulogic_vector(63 downto 0);


function decode_input_carry (carry_sel : carry_in_t; ca_in : std_ulogic) return std_ulogic is function decode_input_carry (carry_sel : carry_in_t; ca_in : std_ulogic) return std_ulogic is
begin begin
@ -85,6 +86,14 @@ begin
result => logical_result result => logical_result
); );


countzero_0: entity work.zero_counter
port map (
rs => e_in.read_data3,
count_right => e_in.insn(10),
is_32bit => e_in.is_32bit,
result => countzero_result
);

execute1_0: process(clk) execute1_0: process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
@ -217,17 +226,8 @@ begin
hi := lo + 3; hi := lo + 3;
v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2); v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2);
end loop; end loop;
when OP_CNTLZW => when OP_CNTZ =>
result := ppc_cntlzw(e_in.read_data3); result := countzero_result;
result_en := 1;
when OP_CNTTZW =>
result := ppc_cnttzw(e_in.read_data3);
result_en := 1;
when OP_CNTLZD =>
result := ppc_cntlzd(e_in.read_data3);
result_en := 1;
when OP_CNTTZD =>
result := ppc_cnttzd(e_in.read_data3);
result_en := 1; result_en := 1;
when OP_EXTSB => when OP_EXTSB =>
result := ppc_extsb(e_in.read_data3); result := ppc_extsb(e_in.read_data3);

@ -19,6 +19,7 @@ filesets:
- ppc_fx_insns.vhdl - ppc_fx_insns.vhdl
- sim_console.vhdl - sim_console.vhdl
- logical.vhdl - logical.vhdl
- countzero.vhdl
- execute1.vhdl - execute1.vhdl
- execute2.vhdl - execute2.vhdl
- loadstore1.vhdl - loadstore1.vhdl

Loading…
Cancel
Save