Merge pull request #324 from paulusmack/master

Performance and timing improvements
pull/327/head
Michael Neuling 3 years ago committed by GitHub
commit 2224b28c2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -196,6 +196,7 @@ package common is
stop_mark: std_ulogic; stop_mark: std_ulogic;
sequential: std_ulogic; sequential: std_ulogic;
predicted : std_ulogic; predicted : std_ulogic;
pred_ntaken : std_ulogic;
nia: std_ulogic_vector(63 downto 0); nia: std_ulogic_vector(63 downto 0);
end record; end record;


@@ -207,6 +208,7 @@ package common is
insn: std_ulogic_vector(31 downto 0); insn: std_ulogic_vector(31 downto 0);
big_endian: std_ulogic; big_endian: std_ulogic;
next_predicted: std_ulogic; next_predicted: std_ulogic;
next_pred_ntaken: std_ulogic;
end record; end record;


type IcacheEventType is record type IcacheEventType is record

@@ -740,6 +740,8 @@ begin
bv.br_offset := br_offset; bv.br_offset := br_offset;
if f_in.next_predicted = '1' then if f_in.next_predicted = '1' then
v.br_pred := '1'; v.br_pred := '1';
elsif f_in.next_pred_ntaken = '1' then
v.br_pred := '0';
end if; end if;
bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted; bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted;
-- after a clock edge... -- after a clock edge...

@@ -40,7 +40,8 @@ architecture behaviour of fetch1 is
type reg_internal_t is record type reg_internal_t is record
mode_32bit: std_ulogic; mode_32bit: std_ulogic;
rd_is_niap4: std_ulogic; rd_is_niap4: std_ulogic;
predicted: std_ulogic; predicted_taken: std_ulogic;
pred_not_taken: std_ulogic;
predicted_nia: std_ulogic_vector(63 downto 0); predicted_nia: std_ulogic_vector(63 downto 0);
end record; end record;
signal r, r_next : Fetch1ToIcacheType; signal r, r_next : Fetch1ToIcacheType;
@@ -52,7 +53,7 @@ architecture behaviour of fetch1 is
constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS; constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS;
constant BTC_TARGET_BITS : integer := 62; constant BTC_TARGET_BITS : integer := 62;
constant BTC_SIZE : integer := 2 ** BTC_ADDR_BITS; constant BTC_SIZE : integer := 2 ** BTC_ADDR_BITS;
constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS; constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS + 1;
type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0); type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0);


signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0'); signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0');
@@ -83,8 +84,10 @@ begin
end if; end if;
if advance_nia = '1' then if advance_nia = '1' then
r.predicted <= r_next.predicted; r.predicted <= r_next.predicted;
r.pred_ntaken <= r_next.pred_ntaken;
r.nia <= r_next.nia; r.nia <= r_next.nia;
r_int.predicted <= r_next_int.predicted; r_int.predicted_taken <= r_next_int.predicted_taken;
r_int.pred_not_taken <= r_next_int.pred_not_taken;
r_int.predicted_nia <= r_next_int.predicted_nia; r_int.predicted_nia <= r_next_int.predicted_nia;
r_int.rd_is_niap4 <= r_next.sequential; r_int.rd_is_niap4 <= r_next.sequential;
end if; end if;
@@ -107,13 +110,12 @@ begin
signal btc_wr : std_ulogic; signal btc_wr : std_ulogic;
signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0); signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0);
signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0); signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0);
signal btc_wr_v : std_ulogic;
begin begin
btc_wr_data <= w_in.br_nia(63 downto BTC_ADDR_BITS + 2) & btc_wr_data <= w_in.br_taken &
w_in.br_nia(63 downto BTC_ADDR_BITS + 2) &
w_in.redirect_nia(63 downto 2); w_in.redirect_nia(63 downto 2);
btc_wr_addr <= w_in.br_nia(BTC_ADDR_BITS + 1 downto 2); btc_wr_addr <= w_in.br_nia(BTC_ADDR_BITS + 1 downto 2);
btc_wr <= w_in.br_last; btc_wr <= w_in.br_last;
btc_wr_v <= w_in.br_taken;


btc_ram : process(clk) btc_ram : process(clk)
variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0); variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0);
@@ -131,7 +133,7 @@ begin
if inval_btc = '1' or rst = '1' then if inval_btc = '1' or rst = '1' then
btc_valids <= (others => '0'); btc_valids <= (others => '0');
elsif btc_wr = '1' then elsif btc_wr = '1' then
btc_valids(to_integer(unsigned(btc_wr_addr))) <= btc_wr_v; btc_valids(to_integer(unsigned(btc_wr_addr))) <= '1';
end if; end if;
end if; end if;
end process; end process;
@@ -145,7 +147,9 @@ begin
v_int := r_int; v_int := r_int;
v.sequential := '0'; v.sequential := '0';
v.predicted := '0'; v.predicted := '0';
v_int.predicted := '0'; v.pred_ntaken := '0';
v_int.predicted_taken := '0';
v_int.pred_not_taken := '0';


if rst = '1' then if rst = '1' then
if alt_reset_in = '1' then if alt_reset_in = '1' then
@@ -172,19 +176,21 @@ begin
if r_int.mode_32bit = '1' then if r_int.mode_32bit = '1' then
v.nia(63 downto 32) := (others => '0'); v.nia(63 downto 32) := (others => '0');
end if; end if;
elsif r_int.predicted = '1' then elsif r_int.predicted_taken = '1' then
v.nia := r_int.predicted_nia; v.nia := r_int.predicted_nia;
v.predicted := '1'; v.predicted := '1';
else else
v.sequential := '1'; v.sequential := '1';
v.pred_ntaken := r_int.pred_not_taken;
v.nia := std_ulogic_vector(unsigned(r.nia) + 4); v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
if r_int.mode_32bit = '1' then if r_int.mode_32bit = '1' then
v.nia(63 downto 32) := x"00000000"; v.nia(63 downto 32) := x"00000000";
end if; end if;
if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and
btc_rd_data(BTC_WIDTH - 1 downto BTC_TARGET_BITS) btc_rd_data(BTC_WIDTH - 2 downto BTC_TARGET_BITS)
= v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then = v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then
v_int.predicted := '1'; v_int.predicted_taken := btc_rd_data(BTC_WIDTH - 1);
v_int.pred_not_taken := not btc_rd_data(BTC_WIDTH - 1);
end if; end if;
end if; end if;
v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00"; v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00";

@@ -577,6 +577,7 @@ begin
i_out.fetch_failed <= r.fetch_failed; i_out.fetch_failed <= r.fetch_failed;
i_out.big_endian <= r.big_endian; i_out.big_endian <= r.big_endian;
i_out.next_predicted <= i_in.predicted; i_out.next_predicted <= i_in.predicted;
i_out.next_pred_ntaken <= i_in.pred_ntaken;


-- Stall fetch1 if we have a miss on cache or TLB or a protection fault -- Stall fetch1 if we have a miss on cache or TLB or a protection fault
stall_out <= not (is_hit and access_ok); stall_out <= not (is_hit and access_ok);

@@ -24,7 +24,6 @@ architecture behaviour of multiply is
signal m11_pc, m12_pc, m13_pc : std_ulogic_vector(47 downto 0); signal m11_pc, m12_pc, m13_pc : std_ulogic_vector(47 downto 0);
signal m20_p, m21_p, m22_p, m23_p : std_ulogic_vector(47 downto 0); signal m20_p, m21_p, m22_p, m23_p : std_ulogic_vector(47 downto 0);
signal s0_pc, s1_pc : std_ulogic_vector(47 downto 0); signal s0_pc, s1_pc : std_ulogic_vector(47 downto 0);
signal product_lo : std_ulogic_vector(31 downto 0);
signal product : std_ulogic_vector(127 downto 0); signal product : std_ulogic_vector(127 downto 0);
signal addend : std_ulogic_vector(127 downto 0); signal addend : std_ulogic_vector(127 downto 0);
signal s0_carry, p0_carry : std_ulogic_vector(3 downto 0); signal s0_carry, p0_carry : std_ulogic_vector(3 downto 0);
@@ -33,7 +32,7 @@ architecture behaviour of multiply is
signal p1_pat, p1_patb : std_ulogic; signal p1_pat, p1_patb : std_ulogic;


signal req_32bit, r32_1 : std_ulogic; signal req_32bit, r32_1 : std_ulogic;
signal req_not, rnot_1 : std_ulogic; signal rnot_1 : std_ulogic;
signal valid_1 : std_ulogic; signal valid_1 : std_ulogic;
signal overflow, ovf_in : std_ulogic; signal overflow, ovf_in : std_ulogic;


@@ -49,9 +48,11 @@ begin
BREG => 0, BREG => 0,
CARRYINREG => 0, CARRYINREG => 0,
CARRYINSELREG => 0, CARRYINSELREG => 0,
CREG => 0,
INMODEREG => 0, INMODEREG => 0,
MREG => 0,
OPMODEREG => 0, OPMODEREG => 0,
PREG => 0 PREG => 1
) )
port map ( port map (
A => "0000000" & m_in.data1(22 downto 0), A => "0000000" & m_in.data1(22 downto 0),
@@ -69,13 +70,13 @@ begin
CEALUMODE => '0', CEALUMODE => '0',
CEB1 => '0', CEB1 => '0',
CEB2 => '0', CEB2 => '0',
CEC => '1', CEC => '0',
CECARRYIN => '0', CECARRYIN => '0',
CECTRL => '0', CECTRL => '0',
CED => '0', CED => '0',
CEINMODE => '0', CEINMODE => '0',
CEM => m_in.valid, CEM => '0',
CEP => '0', CEP => m_in.valid,
CLK => clk, CLK => clk,
D => (others => '0'), D => (others => '0'),
INMODE => "00000", INMODE => "00000",
@@ -160,9 +161,11 @@ begin
BREG => 0, BREG => 0,
CARRYINREG => 0, CARRYINREG => 0,
CARRYINSELREG => 0, CARRYINSELREG => 0,
CREG => 0,
INMODEREG => 0, INMODEREG => 0,
MREG => 0,
OPMODEREG => 0, OPMODEREG => 0,
PREG => 0 PREG => 1
) )
port map ( port map (
A => "0000000" & m_in.data1(22 downto 0), A => "0000000" & m_in.data1(22 downto 0),
@@ -180,13 +183,13 @@ begin
CEALUMODE => '0', CEALUMODE => '0',
CEB1 => '0', CEB1 => '0',
CEB2 => '0', CEB2 => '0',
CEC => '1', CEC => '0',
CECARRYIN => '0', CECARRYIN => '0',
CECTRL => '0', CECTRL => '0',
CED => '0', CED => '0',
CEINMODE => '0', CEINMODE => '0',
CEM => m_in.valid, CEM => '0',
CEP => '0', CEP => m_in.valid,
CLK => clk, CLK => clk,
D => (others => '0'), D => (others => '0'),
INMODE => "00000", INMODE => "00000",
@@ -215,9 +218,11 @@ begin
BREG => 0, BREG => 0,
CARRYINREG => 0, CARRYINREG => 0,
CARRYINSELREG => 0, CARRYINSELREG => 0,
CREG => 0,
INMODEREG => 0, INMODEREG => 0,
MREG => 0,
OPMODEREG => 0, OPMODEREG => 0,
PREG => 0 PREG => 1
) )
port map ( port map (
A => "0000000" & m_in.data1(22 downto 0), A => "0000000" & m_in.data1(22 downto 0),
@@ -235,13 +240,13 @@ begin
CEALUMODE => '0', CEALUMODE => '0',
CEB1 => '0', CEB1 => '0',
CEB2 => '0', CEB2 => '0',
CEC => '1', CEC => '0',
CECARRYIN => '0', CECARRYIN => '0',
CECTRL => '0', CECTRL => '0',
CED => '0', CED => '0',
CEINMODE => '0', CEINMODE => '0',
CEM => m_in.valid, CEM => '0',
CEP => '0', CEP => m_in.valid,
CLK => clk, CLK => clk,
D => (others => '0'), D => (others => '0'),
INMODE => "00000", INMODE => "00000",
@@ -709,18 +714,18 @@ begin


s0: DSP48E1 s0: DSP48E1
generic map ( generic map (
ACASCREG => 1, ACASCREG => 0,
ALUMODEREG => 0, ALUMODEREG => 0,
AREG => 1, AREG => 0,
BCASCREG => 1, BCASCREG => 0,
BREG => 1, BREG => 0,
CARRYINREG => 0, CARRYINREG => 0,
CARRYINSELREG => 0, CARRYINSELREG => 0,
CREG => 1, CREG => 0,
INMODEREG => 0, INMODEREG => 0,
MREG => 0, MREG => 0,
OPMODEREG => 0, OPMODEREG => 0,
PREG => 0, PREG => 1,
USE_MULT => "none" USE_MULT => "none"
) )
port map ( port map (
@@ -735,18 +740,18 @@ begin
CARRYINSEL => "000", CARRYINSEL => "000",
CARRYOUT => s0_carry, CARRYOUT => s0_carry,
CEA1 => '0', CEA1 => '0',
CEA2 => valid_1, CEA2 => '0',
CEAD => '0', CEAD => '0',
CEALUMODE => '0', CEALUMODE => '0',
CEB1 => '0', CEB1 => '0',
CEB2 => valid_1, CEB2 => '0',
CEC => valid_1, CEC => '0',
CECARRYIN => '0', CECARRYIN => '0',
CECTRL => '0', CECTRL => '0',
CED => '0', CED => '0',
CEINMODE => '0', CEINMODE => '0',
CEM => '0', CEM => '0',
CEP => '0', CEP => valid_1,
CLK => clk, CLK => clk,
D => (others => '0'), D => (others => '0'),
INMODE => "00000", INMODE => "00000",
@@ -953,8 +958,6 @@ begin
RSTP => '0' RSTP => '0'
); );


product(31 downto 0) <= product_lo xor (31 downto 0 => req_not);

mult_out: process(all) mult_out: process(all)
variable ov : std_ulogic; variable ov : std_ulogic;
begin begin
@@ -974,12 +977,15 @@ begin
process(clk) process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
product_lo <= m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0); if rnot_1 = '0' then
product(31 downto 0) <= m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0);
else
product(31 downto 0) <= not (m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0));
end if;
m_out.valid <= valid_1; m_out.valid <= valid_1;
valid_1 <= m_in.valid; valid_1 <= m_in.valid;
req_32bit <= r32_1; req_32bit <= r32_1;
r32_1 <= m_in.is_32bit; r32_1 <= m_in.is_32bit;
req_not <= rnot_1;
rnot_1 <= m_in.not_result; rnot_1 <= m_in.not_result;
overflow <= ovf_in; overflow <= ovf_in;
end if; end if;

Loading…
Cancel
Save