diff --git a/common.vhdl b/common.vhdl index a6562e0..2d010ab 100644 --- a/common.vhdl +++ b/common.vhdl @@ -196,6 +196,7 @@ package common is stop_mark: std_ulogic; sequential: std_ulogic; predicted : std_ulogic; + pred_ntaken : std_ulogic; nia: std_ulogic_vector(63 downto 0); end record; @@ -207,6 +208,7 @@ package common is insn: std_ulogic_vector(31 downto 0); big_endian: std_ulogic; next_predicted: std_ulogic; + next_pred_ntaken: std_ulogic; end record; type IcacheEventType is record diff --git a/decode1.vhdl b/decode1.vhdl index fafdb85..0774250 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -740,6 +740,8 @@ begin bv.br_offset := br_offset; if f_in.next_predicted = '1' then v.br_pred := '1'; + elsif f_in.next_pred_ntaken = '1' then + v.br_pred := '0'; end if; bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted; -- after a clock edge... diff --git a/fetch1.vhdl b/fetch1.vhdl index 788a76d..ac08eba 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -40,7 +40,8 @@ architecture behaviour of fetch1 is type reg_internal_t is record mode_32bit: std_ulogic; rd_is_niap4: std_ulogic; - predicted: std_ulogic; + predicted_taken: std_ulogic; + pred_not_taken: std_ulogic; predicted_nia: std_ulogic_vector(63 downto 0); end record; signal r, r_next : Fetch1ToIcacheType; @@ -52,7 +53,7 @@ architecture behaviour of fetch1 is constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS; constant BTC_TARGET_BITS : integer := 62; constant BTC_SIZE : integer := 2 ** BTC_ADDR_BITS; - constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS; + constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS + 1; type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0); signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0'); @@ -83,8 +84,10 @@ begin end if; if advance_nia = '1' then r.predicted <= r_next.predicted; + r.pred_ntaken <= r_next.pred_ntaken; r.nia <= r_next.nia; - r_int.predicted <= r_next_int.predicted; + r_int.predicted_taken <= r_next_int.predicted_taken; + r_int.pred_not_taken <= r_next_int.pred_not_taken; r_int.predicted_nia <= r_next_int.predicted_nia; r_int.rd_is_niap4 <= r_next.sequential; end if; @@ -107,13 +110,12 @@ begin signal btc_wr : std_ulogic; signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0); signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0); - signal btc_wr_v : std_ulogic; begin - btc_wr_data <= w_in.br_nia(63 downto BTC_ADDR_BITS + 2) & + btc_wr_data <= w_in.br_taken & + w_in.br_nia(63 downto BTC_ADDR_BITS + 2) & w_in.redirect_nia(63 downto 2); btc_wr_addr <= w_in.br_nia(BTC_ADDR_BITS + 1 downto 2); btc_wr <= w_in.br_last; - btc_wr_v <= w_in.br_taken; btc_ram : process(clk) variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0); @@ -131,7 +133,7 @@ begin if inval_btc = '1' or rst = '1' then btc_valids <= (others => '0'); elsif btc_wr = '1' then - btc_valids(to_integer(unsigned(btc_wr_addr))) <= btc_wr_v; + btc_valids(to_integer(unsigned(btc_wr_addr))) <= '1'; end if; end if; end process; @@ -145,7 +147,9 @@ begin v_int := r_int; v.sequential := '0'; v.predicted := '0'; - v_int.predicted := '0'; + v.pred_ntaken := '0'; + v_int.predicted_taken := '0'; + v_int.pred_not_taken := '0'; if rst = '1' then if alt_reset_in = '1' then @@ -172,19 +176,21 @@ begin if r_int.mode_32bit = '1' then v.nia(63 downto 32) := (others => '0'); end if; - elsif r_int.predicted = '1' then + elsif r_int.predicted_taken = '1' then v.nia := r_int.predicted_nia; v.predicted := '1'; else v.sequential := '1'; + v.pred_ntaken := r_int.pred_not_taken; v.nia := std_ulogic_vector(unsigned(r.nia) + 4); if r_int.mode_32bit = '1' then v.nia(63 downto 32) := x"00000000"; end if; if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and - btc_rd_data(BTC_WIDTH - 1 downto BTC_TARGET_BITS) + btc_rd_data(BTC_WIDTH - 2 downto BTC_TARGET_BITS) = v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then - v_int.predicted := '1'; + v_int.predicted_taken := btc_rd_data(BTC_WIDTH - 1); + v_int.pred_not_taken := not btc_rd_data(BTC_WIDTH - 1); end if; end if; v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00"; diff --git a/icache.vhdl b/icache.vhdl index 57d3437..30dbd28 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -577,6 +577,7 @@ begin i_out.fetch_failed <= r.fetch_failed; i_out.big_endian <= r.big_endian; i_out.next_predicted <= i_in.predicted; + i_out.next_pred_ntaken <= i_in.pred_ntaken; -- Stall fetch1 if we have a miss on cache or TLB or a protection fault stall_out <= not (is_hit and access_ok); diff --git a/xilinx-mult.vhdl b/xilinx-mult.vhdl index 22d73c7..6a0d508 100644 --- a/xilinx-mult.vhdl +++ b/xilinx-mult.vhdl @@ -24,7 +24,6 @@ architecture behaviour of multiply is signal m11_pc, m12_pc, m13_pc : std_ulogic_vector(47 downto 0); signal m20_p, m21_p, m22_p, m23_p : std_ulogic_vector(47 downto 0); signal s0_pc, s1_pc : std_ulogic_vector(47 downto 0); - signal product_lo : std_ulogic_vector(31 downto 0); signal product : std_ulogic_vector(127 downto 0); signal addend : std_ulogic_vector(127 downto 0); signal s0_carry, p0_carry : std_ulogic_vector(3 downto 0); @@ -33,7 +32,7 @@ architecture behaviour of multiply is signal p1_pat, p1_patb : std_ulogic; signal req_32bit, r32_1 : std_ulogic; - signal req_not, rnot_1 : std_ulogic; + signal rnot_1 : std_ulogic; signal valid_1 : std_ulogic; signal overflow, ovf_in : std_ulogic; @@ -49,9 +48,11 @@ begin BREG => 0, CARRYINREG => 0, CARRYINSELREG => 0, + CREG => 0, INMODEREG => 0, + MREG => 0, OPMODEREG => 0, - PREG => 0 + PREG => 1 ) port map ( A => "0000000" & m_in.data1(22 downto 0), @@ -69,13 +70,13 @@ begin CEALUMODE => '0', CEB1 => '0', CEB2 => '0', - CEC => '1', + CEC => '0', CECARRYIN => '0', CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => m_in.valid, - CEP => '0', + CEM => '0', + CEP => m_in.valid, CLK => clk, D => (others => '0'), INMODE => "00000", @@ -160,9 +161,11 @@ begin BREG => 0, CARRYINREG => 0, CARRYINSELREG => 0, + CREG => 0, INMODEREG => 0, + MREG => 0, OPMODEREG => 0, - PREG => 0 + PREG => 1 ) port map ( A => "0000000" & m_in.data1(22 downto 0), @@ -180,13 +183,13 @@ begin CEALUMODE => '0', CEB1 => '0', CEB2 => '0', - CEC => '1', + CEC => '0', CECARRYIN => '0', CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => m_in.valid, - CEP => '0', + CEM => '0', + CEP => m_in.valid, CLK => clk, D => (others => '0'), INMODE => "00000", @@ -215,9 +218,11 @@ begin BREG => 0, CARRYINREG => 0, CARRYINSELREG => 0, + CREG => 0, INMODEREG => 0, + MREG => 0, OPMODEREG => 0, - PREG => 0 + PREG => 1 ) port map ( A => "0000000" & m_in.data1(22 downto 0), @@ -235,13 +240,13 @@ begin CEALUMODE => '0', CEB1 => '0', CEB2 => '0', - CEC => '1', + CEC => '0', CECARRYIN => '0', CECTRL => '0', CED => '0', CEINMODE => '0', - CEM => m_in.valid, - CEP => '0', + CEM => '0', + CEP => m_in.valid, CLK => clk, D => (others => '0'), INMODE => "00000", @@ -709,18 +714,18 @@ begin s0: DSP48E1 generic map ( - ACASCREG => 1, + ACASCREG => 0, ALUMODEREG => 0, - AREG => 1, - BCASCREG => 1, - BREG => 1, + AREG => 0, + BCASCREG => 0, + BREG => 0, CARRYINREG => 0, CARRYINSELREG => 0, - CREG => 1, + CREG => 0, INMODEREG => 0, MREG => 0, OPMODEREG => 0, - PREG => 0, + PREG => 1, USE_MULT => "none" ) port map ( @@ -735,18 +740,18 @@ begin CARRYINSEL => "000", CARRYOUT => s0_carry, CEA1 => '0', - CEA2 => valid_1, + CEA2 => '0', CEAD => '0', CEALUMODE => '0', CEB1 => '0', - CEB2 => valid_1, - CEC => valid_1, + CEB2 => '0', + CEC => '0', CECARRYIN => '0', CECTRL => '0', CED => '0', CEINMODE => '0', CEM => '0', - CEP => '0', + CEP => valid_1, CLK => clk, D => (others => '0'), INMODE => "00000", @@ -953,8 +958,6 @@ begin RSTP => '0' ); - product(31 downto 0) <= product_lo xor (31 downto 0 => req_not); - mult_out: process(all) variable ov : std_ulogic; begin @@ -974,12 +977,15 @@ begin process(clk) begin if rising_edge(clk) then - product_lo <= m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0); + if rnot_1 = '0' then + product(31 downto 0) <= m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0); + else + product(31 downto 0) <= not (m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0)); + end if; m_out.valid <= valid_1; valid_1 <= m_in.valid; req_32bit <= r32_1; r32_1 <= m_in.is_32bit; - req_not <= rnot_1; rnot_1 <= m_in.not_result; overflow <= ovf_in; end if;