diff --git a/Makefile b/Makefile index ce08c33..2780db0 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ common.o: decode_types.o control.o: gpr_hazard.o cr_hazard.o common.o sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o -core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o multiply.o writeback.o core_debug.o divider.o +core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o core_debug.o: common.o countzero.o: countzero_tb.o: common.o glibc_random.o countzero.o @@ -40,7 +40,7 @@ crhelpers.o: common.o decode1.o: common.o decode_types.o decode2.o: decode_types.o common.o helpers.o insn_helpers.o control.o decode_types.o: -execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o +execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o divider.o fetch1.o: common.o fetch2.o: common.o wishbone_types.o glibc_random_helpers.o: diff --git a/common.vhdl b/common.vhdl index a27f4f2..ffddb0b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -109,6 +109,9 @@ package common is read_data1: std_ulogic_vector(63 downto 0); read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); + bypass_data1: std_ulogic; + bypass_data2: std_ulogic; + bypass_data3: std_ulogic; cr: std_ulogic_vector(31 downto 0); xerc: xer_common_t; lr: std_ulogic; @@ -124,44 +127,41 @@ package common is is_signed: std_ulogic; insn: std_ulogic_vector(31 downto 0); data_len: std_ulogic_vector(3 downto 0); + byte_reverse : std_ulogic; + sign_extend : std_ulogic; -- do we need to sign extend? + update : std_ulogic; -- is this an update instruction? end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := - (valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', oe => '0', invert_a => '0', + (valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', + lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', - is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0')); + is_32bit => '0', is_signed => '0', xerc => xerc_init, + byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0')); - type Decode2ToMultiplyType is record + type Execute1ToMultiplyType is record valid: std_ulogic; insn_type: insn_type_t; - write_reg: gpr_index_t; data1: std_ulogic_vector(64 downto 0); data2: std_ulogic_vector(64 downto 0); - rc: std_ulogic; - oe: std_ulogic; is_32bit: std_ulogic; - xerc: xer_common_t; end record; - constant Decode2ToMultiplyInit : Decode2ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0', - oe => '0', is_32bit => '0', xerc => xerc_init, - others => (others => '0')); + constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, + is_32bit => '0', + others => (others => '0')); - type Decode2ToDividerType is record + type Execute1ToDividerType is record valid: std_ulogic; - write_reg: gpr_index_t; dividend: std_ulogic_vector(63 downto 0); divisor: std_ulogic_vector(63 downto 0); is_signed: std_ulogic; is_32bit: std_ulogic; is_extended: std_ulogic; is_modulus: std_ulogic; - rc: std_ulogic; - oe: std_ulogic; - xerc: xer_common_t; + neg_result: std_ulogic; end record; - constant Decode2ToDividerInit: Decode2ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', - is_extended => '0', is_modulus => '0', - rc => '0', oe => '0', xerc => xerc_init, - others => (others => '0')); + constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', + is_extended => '0', is_modulus => '0', + neg_result => '0', others => (others => '0')); type Decode2ToRegisterFileType is record read1_enable : std_ulogic; @@ -193,7 +193,7 @@ package common is end record; constant Execute1ToFetch1TypeInit : Execute1ToFetch1Type := (redirect => '0', others => (others => '0')); - type Decode2ToLoadstore1Type is record + type Execute1ToLoadstore1Type is record valid : std_ulogic; load : std_ulogic; -- is this a load or store addr1 : std_ulogic_vector(63 downto 0); @@ -207,9 +207,9 @@ package common is update_reg : gpr_index_t; -- if so, the register to update xerc : xer_common_t; end record; - constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', - sign_extend => '0', update => '0', xerc => xerc_init, - others => (others => '0')); + constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', + sign_extend => '0', update => '0', xerc => xerc_init, + others => (others => '0')); type Loadstore1ToDcacheType is record valid : std_ulogic; @@ -248,48 +248,32 @@ package common is write_enable : std_ulogic; write_reg: gspr_index_t; write_data: std_ulogic_vector(63 downto 0); - write_len : std_ulogic_vector(3 downto 0); write_cr_enable : std_ulogic; write_cr_mask : std_ulogic_vector(7 downto 0); write_cr_data : std_ulogic_vector(31 downto 0); write_xerc_enable : std_ulogic; xerc : xer_common_t; - sign_extend: std_ulogic; end record; constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', write_enable => '0', - write_cr_enable => '0', sign_extend => '0', + write_cr_enable => '0', write_xerc_enable => '0', xerc => xerc_init, others => (others => '0')); - type MultiplyToWritebackType is record + type MultiplyToExecute1Type is record valid: std_ulogic; - - write_reg_enable : std_ulogic; - write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); - write_xerc_enable : std_ulogic; - xerc : xer_common_t; - rc: std_ulogic; + overflow : std_ulogic; end record; - constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0', - rc => '0', write_xerc_enable => '0', - xerc => xerc_init, - others => (others => '0')); + constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0', + others => (others => '0')); - type DividerToWritebackType is record + type DividerToExecute1Type is record valid: std_ulogic; - - write_reg_enable : std_ulogic; - write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); - write_xerc_enable : std_ulogic; - xerc : xer_common_t; - rc: std_ulogic; + overflow : std_ulogic; end record; - constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0', - rc => '0', write_xerc_enable => '0', - xerc => xerc_init, - others => (others => '0')); + constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', overflow => '0', + others => (others => '0')); type WritebackToRegisterFileType is record write_reg : gspr_index_t; diff --git a/control.vhdl b/control.vhdl index ead3c1f..064ff98 100644 --- a/control.vhdl +++ b/control.vhdl @@ -21,6 +21,7 @@ entity control is gpr_write_valid_in : in std_ulogic; gpr_write_in : in gspr_index_t; + gpr_bypassable : in std_ulogic; gpr_a_read_valid_in : in std_ulogic; gpr_a_read_in : in gspr_index_t; @@ -36,7 +37,11 @@ entity control is valid_out : out std_ulogic; stall_out : out std_ulogic; - stopped_out : out std_ulogic + stopped_out : out std_ulogic; + + gpr_bypass_a : out std_ulogic; + gpr_bypass_b : out std_ulogic; + gpr_bypass_c : out std_ulogic ); end entity control; @@ -71,10 +76,12 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, + bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_a_read_valid_in, gpr_read_in => gpr_a_read_in, - stall_out => stall_a_out + stall_out => stall_a_out, + use_bypass => gpr_bypass_a ); gpr_hazard1: entity work.gpr_hazard @@ -87,10 +94,12 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, + bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_b_read_valid_in, gpr_read_in => gpr_b_read_in, - stall_out => stall_b_out + stall_out => stall_b_out, + use_bypass => gpr_bypass_b ); gpr_c_read_in_fmt <= "0" & gpr_c_read_in; @@ -105,10 +114,12 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, + bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_c_read_valid_in, gpr_read_in => gpr_c_read_in_fmt, - stall_out => stall_c_out + stall_out => stall_c_out, + use_bypass => gpr_bypass_c ); cr_hazard0: entity work.cr_hazard diff --git a/core.vhdl b/core.vhdl index eb0b526..bc0b16f 100644 --- a/core.vhdl +++ b/core.vhdl @@ -9,7 +9,8 @@ use work.wishbone_types.all; entity core is generic ( SIM : boolean := false; - DISABLE_FLATTEN : boolean := false + DISABLE_FLATTEN : boolean := false; + EX1_BYPASS : boolean := true ); port ( clk : in std_logic; @@ -59,18 +60,10 @@ architecture behave of core is signal execute1_to_fetch1: Execute1ToFetch1Type; -- load store signals - signal decode2_to_loadstore1: Decode2ToLoadstore1Type; + signal execute1_to_loadstore1: Execute1ToLoadstore1Type; signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal dcache_to_writeback: DcacheToWritebackType; - -- multiply signals - signal decode2_to_multiply: Decode2ToMultiplyType; - signal multiply_to_writeback: MultiplyToWritebackType; - - -- divider signals - signal decode2_to_divider: Decode2ToDividerType; - signal divider_to_writeback: DividerToWritebackType; - -- local signals signal fetch1_stall_in : std_ulogic; signal icache_stall_out : std_ulogic; @@ -115,8 +108,6 @@ architecture behave of core is attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN); - attribute keep_hierarchy of multiply_0 : label is keep_h(DISABLE_FLATTEN); - attribute keep_hierarchy of divider_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN); @@ -186,6 +177,9 @@ begin decode1_stall_in <= decode2_stall_out; decode2_0: entity work.decode2 + generic map ( + EX1_BYPASS => EX1_BYPASS + ) port map ( clk => clk, rst => core_rst, @@ -196,9 +190,6 @@ begin stopped_out => dbg_core_is_stopped, d_in => decode1_to_decode2, e_out => decode2_to_execute1, - l_out => decode2_to_loadstore1, - m_out => decode2_to_multiply, - d_out => decode2_to_divider, r_in => register_file_to_decode2, r_out => decode2_to_register_file, c_in => cr_file_to_decode2, @@ -232,11 +223,16 @@ begin ); execute1_0: entity work.execute1 + generic map ( + EX1_BYPASS => EX1_BYPASS + ) port map ( clk => clk, + rst => core_rst, flush_out => flush, stall_out => ex1_stall_out, e_in => decode2_to_execute1, + l_out => execute1_to_loadstore1, f_out => execute1_to_fetch1, e_out => execute1_to_writeback, icache_inval => ex1_icache_inval, @@ -246,7 +242,7 @@ begin loadstore1_0: entity work.loadstore1 port map ( clk => clk, - l_in => decode2_to_loadstore1, + l_in => execute1_to_loadstore1, l_out => loadstore1_to_dcache ); @@ -265,28 +261,11 @@ begin wishbone_out => wishbone_data_out ); - multiply_0: entity work.multiply - port map ( - clk => clk, - m_in => decode2_to_multiply, - m_out => multiply_to_writeback - ); - - divider_0: entity work.divider - port map ( - clk => clk, - rst => core_rst, - d_in => decode2_to_divider, - d_out => divider_to_writeback - ); - writeback_0: entity work.writeback port map ( clk => clk, e_in => execute1_to_writeback, l_in => dcache_to_writeback, - m_in => multiply_to_writeback, - d_in => divider_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, complete_out => complete diff --git a/countzero.vhdl b/countzero.vhdl index d3960f0..50e6ead 100644 --- a/countzero.vhdl +++ b/countzero.vhdl @@ -6,6 +6,7 @@ library work; entity zero_counter is port ( + clk : in std_logic; rs : in std_ulogic_vector(63 downto 0); count_right : in std_ulogic; is_32bit : in std_ulogic; @@ -14,10 +15,14 @@ entity zero_counter is end entity zero_counter; architecture behaviour of zero_counter is - signal y, z : std_ulogic_vector(3 downto 0); - signal v16 : std_ulogic_vector(15 downto 0); - signal v4 : std_ulogic_vector(3 downto 0); - signal sel : std_ulogic_vector(5 downto 0); + type intermediate_result is record + v16: std_ulogic_vector(15 downto 0); + sel_hi: std_ulogic_vector(1 downto 0); + is_32bit: std_ulogic; + count_right: std_ulogic; + end record; + + signal r, r_in : intermediate_result; -- Return the index of the leftmost or rightmost 1 in a set of 4 bits. -- Assumes v is not "0000"; if it is, return (right ? "11" : "00"). @@ -47,65 +52,83 @@ architecture behaviour of zero_counter is end; begin - zerocounter0: process(all) + zerocounter_0: process(clk) + begin + if rising_edge(clk) then + r <= r_in; + end if; + end process; + + zerocounter_1: process(all) + variable v: intermediate_result; + variable y, z: std_ulogic_vector(3 downto 0); + variable sel: std_ulogic_vector(5 downto 0); + variable v4: std_ulogic_vector(3 downto 0); + begin -- Test 4 groups of 16 bits each. -- The top 2 groups are considered to be zero in 32-bit mode. - z(0) <= or (rs(15 downto 0)); - z(1) <= or (rs(31 downto 16)); - z(2) <= or (rs(47 downto 32)); - z(3) <= or (rs(63 downto 48)); + z(0) := or (rs(15 downto 0)); + z(1) := or (rs(31 downto 16)); + z(2) := or (rs(47 downto 32)); + z(3) := or (rs(63 downto 48)); if is_32bit = '0' then - sel(5 downto 4) <= encoder(z, count_right); + v.sel_hi := encoder(z, count_right); else - sel(5) <= '0'; + v.sel_hi(1) := '0'; if count_right = '0' then - sel(4) <= z(1); + v.sel_hi(0) := z(1); else - sel(4) <= not z(0); + v.sel_hi(0) := not z(0); end if; end if; -- Select the leftmost/rightmost non-zero group of 16 bits - case sel(5 downto 4) is + case v.sel_hi is when "00" => - v16 <= rs(15 downto 0); + v.v16 := rs(15 downto 0); when "01" => - v16 <= rs(31 downto 16); + v.v16 := rs(31 downto 16); when "10" => - v16 <= rs(47 downto 32); + v.v16 := rs(47 downto 32); when others => - v16 <= rs(63 downto 48); + v.v16 := rs(63 downto 48); end case; + -- Latch this and do the rest in the next cycle, for the sake of timing + v.is_32bit := is_32bit; + v.count_right := count_right; + r_in <= v; + sel(5 downto 4) := r.sel_hi; + -- Test 4 groups of 4 bits - y(0) <= or (v16(3 downto 0)); - y(1) <= or (v16(7 downto 4)); - y(2) <= or (v16(11 downto 8)); - y(3) <= or (v16(15 downto 12)); - sel(3 downto 2) <= encoder(y, count_right); + y(0) := or (r.v16(3 downto 0)); + y(1) := or (r.v16(7 downto 4)); + y(2) := or (r.v16(11 downto 8)); + y(3) := or (r.v16(15 downto 12)); + sel(3 downto 2) := encoder(y, r.count_right); -- Select the leftmost/rightmost non-zero group of 4 bits case sel(3 downto 2) is when "00" => - v4 <= v16(3 downto 0); + v4 := r.v16(3 downto 0); when "01" => - v4 <= v16(7 downto 4); + v4 := r.v16(7 downto 4); when "10" => - v4 <= v16(11 downto 8); + v4 := r.v16(11 downto 8); when others => - v4 <= v16(15 downto 12); + v4 := r.v16(15 downto 12); end case; - sel(1 downto 0) <= encoder(v4, count_right); + sel(1 downto 0) := encoder(v4, r.count_right); -- sel is now the index of the leftmost/rightmost 1 bit in rs if v4 = "0000" then -- operand is zero, return 32 for 32-bit, else 64 - result <= x"00000000000000" & '0' & not is_32bit & is_32bit & "00000"; - elsif count_right = '0' then + result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000"; + elsif r.count_right = '0' then -- return (63 - sel), trimmed to 5 bits in 32-bit mode - result <= x"00000000000000" & "00" & (not sel(5) and not is_32bit) & not sel(4 downto 0); + result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0); else result <= x"00000000000000" & "00" & sel; end if; diff --git a/countzero_tb.vhdl b/countzero_tb.vhdl index 91de334..21529de 100644 --- a/countzero_tb.vhdl +++ b/countzero_tb.vhdl @@ -15,16 +15,26 @@ architecture behave of countzero_tb is signal is_32bit, count_right: std_ulogic := '0'; signal result: std_ulogic_vector(63 downto 0); signal randno: std_ulogic_vector(63 downto 0); + signal clk: std_ulogic; begin zerocounter_0: entity work.zero_counter port map ( + clk => clk, rs => rs, result => result, count_right => count_right, is_32bit => is_32bit ); + clk_process: process + begin + clk <= '0'; + wait for clk_period/2; + clk <= '1'; + wait for clk_period/2; + end process; + stim_process: process variable r: std_ulogic_vector(63 downto 0); begin diff --git a/decode1.vhdl b/decode1.vhdl index 51a2643..f1b5ad4 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -44,8 +44,8 @@ architecture behaviour of decode1 is 29 => (ALU, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andis. 18 => (ALU, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b 16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc - 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpi - 10 => (ALU, OP_CMPL, RA, CONST_UI, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli + 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi + 10 => (ALU, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzu 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lha @@ -54,7 +54,7 @@ architecture behaviour of decode1 is 41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzu 32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwz 33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzu - 7 => (MUL, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '1'), -- mulli + 7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli 24 => (ALU, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori 25 => (ALU, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris 20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi @@ -66,7 +66,7 @@ architecture behaviour of decode1 is 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthu 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stw 37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwu - 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- subfic + 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic 2 => (ALU, OP_TDI, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi --PPC_TWI 3 26 => (ALU, OP_XOR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xori @@ -145,10 +145,10 @@ architecture behaviour of decode1 is 2#0000011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- and 2#0000111100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- andc -- 2#0011111100# bperm - 2#0000000000# => (ALU, OP_CMP, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmp + 2#0000000000# => (ALU, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp 2#0111111100# => (ALU, OP_CMPB, NONE, RB, RS, RA, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb -- 2#0011100000# cmpeqb - 2#0000100000# => (ALU, OP_CMPL, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl + 2#0000100000# => (ALU, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl -- 2#0011000000# cmprb 2#0000111010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- cntlzd 2#0000011010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- cntlzw @@ -160,22 +160,22 @@ architecture behaviour of decode1 is 2#0100010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt 2#0011110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst -- 2#1111110110# dcbz - 2#0110001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeu - 2#1110001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeuo - 2#0110001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweu - 2#1110001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweuo - 2#0110101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divde - 2#1110101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeo - 2#0110101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwe - 2#1110101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweo - 2#0111001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdu - 2#1111001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divduo - 2#0111001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwu - 2#1111001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwuo - 2#0111101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divd - 2#1111101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdo - 2#0111101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divw - 2#1111101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwo + 2#0110001001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdeu + 2#1110001001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdeuo + 2#0110001011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divweu + 2#1110001011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divweuo + 2#0110101001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divde + 2#1110101001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divdeo + 2#0110101011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divwe + 2#1110101011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divweo + 2#0111001001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdu + 2#1111001001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divduo + 2#0111001011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divwu + 2#1111001011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divwuo + 2#0111101001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divd + 2#1111101001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divdo + 2#0111101011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divw + 2#1111101011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divwo 2#0100011100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- eqv 2#1110111010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsb 2#1110011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsh @@ -238,36 +238,36 @@ architecture behaviour of decode1 is -- 2#1001000000# mcrxrx 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf 2#0101010011# => (ALU, OP_MFSPR, SPR, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr - 2#0100001001# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modud - 2#0100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- moduw - 2#1100001001# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsd - 2#1100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw + 2#0100001001# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- modud + 2#0100001011# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- moduw + 2#1100001001# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- modsd + 2#1100001011# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0'), -- modsw 2#0010010000# => (ALU, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf 2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr - 2#0001001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulhd - 2#0000001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- mulhdu - 2#0001001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mulhw - 2#0000001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '1'), -- mulhwu + 2#0001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd + 2#0000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu + 2#0001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw + 2#0000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu -- next 4 have reserved bit set - 2#1001001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulhd - 2#1000001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- mulhdu - 2#1001001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mulhw - 2#1000001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '1'), -- mulhwu - 2#0011101001# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulld - 2#1011101001# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulldo - 2#0011101011# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mullw - 2#1011101011# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mullwo + 2#1001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd + 2#1000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu + 2#1001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw + 2#1000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu + 2#0011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulld + 2#1011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulldo + 2#0011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullw + 2#1011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullwo 2#0111011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nand 2#0001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- neg 2#1001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nego 2#0001111100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nor 2#0110111100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- or 2#0110011100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- orc - 2#0001111010# => (ALU, OP_POPCNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb - 2#0111111010# => (ALU, OP_POPCNTD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd - 2#0101111010# => (ALU, OP_POPCNTW, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw - 2#0010111010# => (ALU, OP_PRTYD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd - 2#0010011010# => (ALU, OP_PRTYW, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw + 2#0001111010# => (ALU, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb + 2#0111111010# => (ALU, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd + 2#0101111010# => (ALU, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw + 2#0010111010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd + 2#0010011010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw -- 2#0010000000# setb 2#0000011011# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- sld 2#0000011000# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- slw diff --git a/decode2.vhdl b/decode2.vhdl index f6f7101..582fa5b 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -9,6 +9,9 @@ use work.helpers.all; use work.insn_helpers.all; entity decode2 is + generic ( + EX1_BYPASS : boolean := true + ); port ( clk : in std_ulogic; rst : in std_ulogic; @@ -24,9 +27,6 @@ entity decode2 is d_in : in Decode1ToDecode2Type; e_out : out Decode2ToExecute1Type; - m_out : out Decode2ToMultiplyType; - d_out : out Decode2ToDividerType; - l_out : out Decode2ToLoadstore1Type; r_in : in RegisterFileToDecode2Type; r_out : out Decode2ToRegisterFileType; @@ -39,9 +39,6 @@ end entity decode2; architecture behaviour of decode2 is type reg_type is record e : Decode2ToExecute1Type; - m : Decode2ToMultiplyType; - d : Decode2ToDividerType; - l : Decode2ToLoadstore1Type; end record; signal r, rin : reg_type; @@ -188,15 +185,19 @@ architecture behaviour of decode2 is signal gpr_write_valid : std_ulogic; signal gpr_write : gspr_index_t; + signal gpr_bypassable : std_ulogic; signal gpr_a_read_valid : std_ulogic; signal gpr_a_read :gspr_index_t; + signal gpr_a_bypass : std_ulogic; signal gpr_b_read_valid : std_ulogic; signal gpr_b_read : gspr_index_t; + signal gpr_b_bypass : std_ulogic; signal gpr_c_read_valid : std_ulogic; signal gpr_c_read : gpr_index_t; + signal gpr_c_bypass : std_ulogic; signal cr_write_valid : std_ulogic; begin @@ -217,6 +218,7 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write, + gpr_bypassable => gpr_bypassable, gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -232,13 +234,17 @@ begin valid_out => control_valid_out, stall_out => stall_out, - stopped_out => stopped_out + stopped_out => stopped_out, + + gpr_bypass_a => gpr_a_bypass, + gpr_bypass_b => gpr_b_bypass, + gpr_bypass_c => gpr_c_bypass ); decode2_0: process(clk) begin if rising_edge(clk) then - if rin.e.valid = '1' or rin.l.valid = '1' or rin.m.valid = '1' or rin.d.valid = '1' then + if rin.e.valid = '1' then report "execute " & to_hstring(rin.e.nia); end if; r <= rin; @@ -259,21 +265,16 @@ begin variable decoded_reg_b : decode_input_reg_t; variable decoded_reg_c : decode_input_reg_t; variable decoded_reg_o : decode_output_reg_t; - variable signed_division: std_ulogic; variable length : std_ulogic_vector(3 downto 0); begin v := r; v.e := Decode2ToExecute1Init; - v.l := Decode2ToLoadStore1Init; - v.m := Decode2ToMultiplyInit; - v.d := Decode2ToDividerInit; mul_a := (others => '0'); mul_b := (others => '0'); --v.e.input_cr := d_in.decode.input_cr; - --v.m.input_cr := d_in.decode.input_cr; --v.e.output_cr := d_in.decode.output_cr; decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1); @@ -303,12 +304,17 @@ begin v.e.insn_type := d_in.decode.insn_type; v.e.read_reg1 := decoded_reg_a.reg; v.e.read_data1 := decoded_reg_a.data; + v.e.bypass_data1 := gpr_a_bypass; v.e.read_reg2 := decoded_reg_b.reg; v.e.read_data2 := decoded_reg_b.data; + v.e.bypass_data2 := gpr_b_bypass; v.e.read_data3 := decoded_reg_c.data; + v.e.bypass_data3 := gpr_c_bypass; v.e.write_reg := decoded_reg_o.reg; v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); + if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then + v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); + end if; v.e.cr := c_in.read_cr_data; v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; @@ -322,102 +328,9 @@ begin end if; v.e.insn := d_in.insn; v.e.data_len := length; - - -- multiply unit - v.m.insn_type := d_in.decode.insn_type; - mul_a := decoded_reg_a.data; - mul_b := decoded_reg_b.data; - v.m.write_reg := gspr_to_gpr(decoded_reg_o.reg); - v.m.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.m.xerc := c_in.read_xerc_data; - if v.m.insn_type = OP_MUL_L64 then - v.m.oe := decode_oe(d_in.decode.rc, d_in.insn); - end if; - v.m.is_32bit := d_in.decode.is_32bit; - - if d_in.decode.is_32bit = '1' then - if d_in.decode.is_signed = '1' then - v.m.data1 := (others => mul_a(31)); - v.m.data1(31 downto 0) := mul_a(31 downto 0); - v.m.data2 := (others => mul_b(31)); - v.m.data2(31 downto 0) := mul_b(31 downto 0); - else - v.m.data1 := '0' & x"00000000" & mul_a(31 downto 0); - v.m.data2 := '0' & x"00000000" & mul_b(31 downto 0); - end if; - else - if d_in.decode.is_signed = '1' then - v.m.data1 := mul_a(63) & mul_a; - v.m.data2 := mul_b(63) & mul_b; - else - v.m.data1 := '0' & mul_a; - v.m.data2 := '0' & mul_b; - end if; - end if; - - -- divide unit - -- PPC divide and modulus instruction words have these bits in - -- the bottom 11 bits: o1dns 010t1 r - -- where o = OE for div instrs, signedness for mod instrs - -- d = 1 for div*, 0 for mod* - -- n = 1 for normal, 0 for extended (dividend << 32/64) - -- s = 1 for signed, 0 for unsigned (for div*) - -- t = 1 for 32-bit, 0 for 64-bit - -- r = RC bit (record condition code) - v.d.write_reg := gspr_to_gpr(decoded_reg_o.reg); - v.d.is_modulus := not d_in.insn(8); - v.d.is_32bit := d_in.insn(2); - if d_in.insn(8) = '1' then - signed_division := d_in.insn(6); - else - signed_division := d_in.insn(10); - end if; - v.d.is_signed := signed_division; - if d_in.insn(2) = '0' then - -- 64-bit forms - if d_in.insn(8) = '1' and d_in.insn(7) = '0' then - v.d.is_extended := '1'; - end if; - v.d.dividend := decoded_reg_a.data; - v.d.divisor := decoded_reg_b.data; - else - -- 32-bit forms - if d_in.insn(8) = '1' and d_in.insn(7) = '0' then -- extended forms - v.d.dividend := decoded_reg_a.data(31 downto 0) & x"00000000"; - elsif signed_division = '1' and decoded_reg_a.data(31) = '1' then - -- sign extend to 64 bits - v.d.dividend := x"ffffffff" & decoded_reg_a.data(31 downto 0); - else - v.d.dividend := x"00000000" & decoded_reg_a.data(31 downto 0); - end if; - if signed_division = '1' and decoded_reg_b.data(31) = '1' then - v.d.divisor := x"ffffffff" & decoded_reg_b.data(31 downto 0); - else - v.d.divisor := x"00000000" & decoded_reg_b.data(31 downto 0); - end if; - end if; - v.d.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.d.xerc := c_in.read_xerc_data; - v.d.oe := decode_oe(d_in.decode.rc, d_in.insn); - - -- load/store unit - v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg); - v.l.addr1 := decoded_reg_a.data; - v.l.addr2 := decoded_reg_b.data; - v.l.data := decoded_reg_c.data; - v.l.write_reg := gspr_to_gpr(decoded_reg_o.reg); - - if d_in.decode.insn_type = OP_LOAD then - v.l.load := '1'; - else - v.l.load := '0'; - end if; - - v.l.length := length; - v.l.byte_reverse := d_in.decode.byte_reverse; - v.l.sign_extend := d_in.decode.sign_extend; - v.l.update := d_in.decode.update; - v.l.xerc := c_in.read_xerc_data; + v.e.byte_reverse := d_in.decode.byte_reverse; + v.e.sign_extend := d_in.decode.sign_extend; + v.e.update := d_in.decode.update; -- issue control control_valid_in <= d_in.valid; @@ -425,6 +338,10 @@ begin gpr_write_valid <= decoded_reg_o.reg_valid; gpr_write <= decoded_reg_o.reg; + gpr_bypassable <= '0'; + if EX1_BYPASS and d_in.decode.unit = ALU then + gpr_bypassable <= '1'; + end if; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; @@ -437,29 +354,13 @@ begin cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); - v.e.valid := '0'; - v.m.valid := '0'; - v.d.valid := '0'; - v.l.valid := '0'; - case d_in.decode.unit is - when ALU => - v.e.valid := control_valid_out; - when LDST => - v.l.valid := control_valid_out; - when MUL => - v.m.valid := control_valid_out; - when DIV => - v.d.valid := control_valid_out; - when NONE => - v.e.valid := control_valid_out; + v.e.valid := control_valid_out; + if d_in.decode.unit = NONE then v.e.insn_type := OP_ILLEGAL; - end case; + end if; if rst = '1' then v.e := Decode2ToExecute1Init; - v.l := Decode2ToLoadStore1Init; - v.m := Decode2ToMultiplyInit; - v.d := Decode2ToDividerInit; end if; -- Update registers @@ -467,8 +368,5 @@ begin -- Update outputs e_out <= r.e; - l_out <= r.l; - m_out <= r.m; - d_out <= r.d; end process; end architecture behaviour; diff --git a/decode_types.vhdl b/decode_types.vhdl index e847fcf..21d8b68 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -4,18 +4,18 @@ use ieee.std_logic_1164.all; package decode_types is type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD, OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG, - OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB, + OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB, OP_CNTZ, OP_CRAND, OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC, OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, - OP_DCBZ, OP_DIV, OP_EXTS, + OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, OP_MCRF, OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFSPR, OP_MOD, OP_MTCRF, OP_MTSPR, OP_MUL_L64, OP_MUL_H64, OP_MUL_H32, OP_OR, - OP_POPCNTB, OP_POPCNTD, OP_POPCNTW, OP_PRTYD, - OP_PRTYW, OP_RLC, OP_RLCL, OP_RLCR, OP_SETB, + OP_POPCNT, OP_PRTY, + OP_RLC, OP_RLCL, OP_RLCR, OP_SETB, OP_SHL, OP_SHR, OP_SYNC, OP_TD, OP_TDI, OP_TW, OP_TWI, OP_XOR, OP_SIM_CONFIG @@ -46,7 +46,7 @@ package decode_types is constant TOO_OFFSET : integer := 0; - type unit_t is (NONE, ALU, LDST, MUL, DIV); + type unit_t is (NONE, ALU, LDST); type length_t is (NONE, is1B, is2B, is4B, is8B); type decode_rom_t is record diff --git a/divider.vhdl b/divider.vhdl index affab85..aef65a4 100644 --- a/divider.vhdl +++ b/divider.vhdl @@ -10,8 +10,8 @@ entity divider is port ( clk : in std_logic; rst : in std_logic; - d_in : in Decode2ToDividerType; - d_out : out DividerToWritebackType + d_in : in Execute1ToDividerType; + d_out : out DividerToExecute1Type ); end entity divider; @@ -23,20 +23,15 @@ architecture behaviour of divider is signal sresult : std_ulogic_vector(64 downto 0); signal oresult : std_ulogic_vector(63 downto 0); signal running : std_ulogic; - signal signcheck : std_ulogic; signal count : unsigned(6 downto 0); signal neg_result : std_ulogic; signal is_modulus : std_ulogic; signal is_32bit : std_ulogic; signal extended : std_ulogic; signal is_signed : std_ulogic; - signal rc : std_ulogic; - signal write_reg : std_ulogic_vector(4 downto 0); signal overflow : std_ulogic; signal ovf32 : std_ulogic; signal did_ovf : std_ulogic; - signal oe : std_ulogic; - signal xerc : xer_common_t; begin divider_0: process(clk) begin @@ -48,40 +43,22 @@ begin running <= '0'; count <= "0000000"; elsif d_in.valid = '1' then - if d_in.is_extended = '1' and not (d_in.is_signed = '1' and d_in.dividend(63) = '1') then + if d_in.is_extended = '1' then dend <= '0' & d_in.dividend & x"0000000000000000"; else dend <= '0' & x"0000000000000000" & d_in.dividend; end if; div <= unsigned(d_in.divisor); quot <= (others => '0'); - write_reg <= d_in.write_reg; - neg_result <= '0'; + neg_result <= d_in.neg_result; is_modulus <= d_in.is_modulus; extended <= d_in.is_extended; is_32bit <= d_in.is_32bit; is_signed <= d_in.is_signed; - rc <= d_in.rc; - oe <= d_in.oe; - xerc <= d_in.xerc; count <= "1111111"; running <= '1'; overflow <= '0'; ovf32 <= '0'; - signcheck <= d_in.is_signed and (d_in.dividend(63) or d_in.divisor(63)); - elsif signcheck = '1' then - signcheck <= '0'; - neg_result <= dend(63) xor (div(63) and not is_modulus); - if dend(63) = '1' then - if extended = '1' then - dend <= '0' & std_ulogic_vector(- signed(dend(63 downto 0))) & x"0000000000000000"; - else - dend <= '0' & x"0000000000000000" & std_ulogic_vector(- signed(dend(63 downto 0))); - end if; - end if; - if div(63) = '1' then - div <= unsigned(- signed(div)); - end if; elsif running = '1' then if count = "0111111" then running <= '0'; @@ -113,9 +90,6 @@ begin divider_1: process(all) begin - d_out.write_reg_nr <= write_reg; - d_out.rc <= rc; - if is_modulus = '1' then result <= dend(128 downto 65); else @@ -151,23 +125,9 @@ begin if rising_edge(clk) then d_out.valid <= '0'; d_out.write_reg_data <= oresult; - d_out.write_reg_enable <= '0'; - d_out.write_xerc_enable <= '0'; - d_out.xerc <= xerc; + d_out.overflow <= did_ovf; if count = "1000000" then d_out.valid <= '1'; - d_out.write_reg_enable <= '1'; - d_out.write_xerc_enable <= oe; - - -- We must test oe because the RC update code in writeback - -- will use the xerc value to set CR0:SO so we must not clobber - -- xerc if OE wasn't set. - -- - if oe = '1' then - d_out.xerc.ov <= did_ovf; - d_out.xerc.ov32 <= did_ovf; - d_out.xerc.so <= xerc.so or did_ovf; - end if; end if; end if; end process; diff --git a/divider_tb.vhdl b/divider_tb.vhdl index 5f809bb..95156a3 100644 --- a/divider_tb.vhdl +++ b/divider_tb.vhdl @@ -16,8 +16,8 @@ architecture behave of divider_tb is signal rst : std_ulogic; constant clk_period : time := 10 ns; - signal d1 : Decode2ToDividerType; - signal d2 : DividerToWritebackType; + signal d1 : Execute1ToDividerType; + signal d2 : DividerToExecute1Type; begin divider_0: entity work.divider port map (clk => clk, rst => rst, d_in => d1, d_out => d2); @@ -43,14 +43,13 @@ begin rst <= '0'; d1.valid <= '1'; - d1.write_reg <= "10001"; d1.dividend <= x"0000000010001000"; d1.divisor <= x"0000000000001111"; d1.is_signed <= '0'; d1.is_32bit <= '0'; d1.is_extended <= '0'; d1.is_modulus <= '0'; - d1.rc <= '0'; + d1.neg_result <= '0'; wait for clk_period; assert d2.valid = '0'; @@ -65,16 +64,12 @@ begin end loop; assert d2.valid = '1'; - assert d2.write_reg_enable = '1'; - assert d2.write_reg_nr = "10001"; assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data); - assert d2.rc = '0'; wait for clk_period; assert d2.valid = '0' report "valid"; d1.valid <= '1'; - d1.rc <= '1'; wait for clk_period; assert d2.valid = '0' report "valid"; @@ -89,10 +84,7 @@ begin end loop; assert d2.valid = '1'; - assert d2.write_reg_enable = '1'; - assert d2.write_reg_nr = "10001"; assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data); - assert d2.rc = '1'; wait for clk_period; assert d2.valid = '0'; @@ -105,9 +97,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.valid <= '1'; wait for clk_period; @@ -142,6 +135,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.valid <= '1'; wait for clk_period; @@ -173,9 +167,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.is_extended <= '1'; d1.valid <= '1'; @@ -216,6 +211,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '1'; d1.valid <= '1'; @@ -250,9 +246,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -289,6 +286,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -322,9 +320,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 32)) & x"00000000"; rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -365,6 +364,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -398,9 +398,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63); d1.is_extended <= '0'; d1.is_32bit <= '0'; d1.is_modulus <= '1'; @@ -438,6 +439,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '0'; d1.is_modulus <= '1'; @@ -472,9 +474,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63); d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.is_modulus <= '1'; @@ -517,6 +520,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.is_modulus <= '1'; diff --git a/execute1.vhdl b/execute1.vhdl index 4714ec5..ae13c72 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -11,8 +11,12 @@ use work.insn_helpers.all; use work.ppc_fx_insns.all; entity execute1 is + generic ( + EX1_BYPASS : boolean := true + ); port ( clk : in std_ulogic; + rst : in std_ulogic; -- asynchronous flush_out : out std_ulogic; @@ -21,6 +25,7 @@ entity execute1 is e_in : in Decode2ToExecute1Type; -- asynchronous + l_out : out Execute1ToLoadstore1Type; f_out : out Execute1ToFetch1Type; e_out : out Execute1ToWritebackType; @@ -35,10 +40,19 @@ architecture behaviour of execute1 is e : Execute1ToWritebackType; lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); + mul_in_progress : std_ulogic; + div_in_progress : std_ulogic; + cntz_in_progress : std_ulogic; + slow_op_dest : gpr_index_t; + slow_op_rc : std_ulogic; + slow_op_oe : std_ulogic; + slow_op_xerc : xer_common_t; end record; signal r, rin : reg_type; + signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); + signal ctrl: ctrl_t := (others => (others => '0')); signal ctrl_tmp: ctrl_t := (others => (others => '0')); @@ -47,6 +61,16 @@ architecture behaviour of execute1 is signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); signal countzero_result: std_ulogic_vector(63 downto 0); + signal popcnt_result: std_ulogic_vector(63 downto 0); + signal parity_result: std_ulogic_vector(63 downto 0); + + -- multiply signals + signal x_to_multiply: Execute1ToMultiplyType; + signal multiply_to_x: MultiplyToExecute1Type; + + -- divider signals + signal x_to_divider: Execute1ToDividerType; + signal divider_to_x: DividerToExecute1Type; procedure set_carry(e: inout Execute1ToWritebackType; carry32 : in std_ulogic; @@ -92,9 +116,9 @@ begin rotator_0: entity work.rotator port map ( - rs => e_in.read_data3, - ra => e_in.read_data1, - shift => e_in.read_data2(6 downto 0), + rs => c_in, + ra => a_in, + shift => b_in(6 downto 0), insn => e_in.insn, is_32bit => e_in.is_32bit, right_shift => right_shift, @@ -107,22 +131,45 @@ begin logical_0: entity work.logical port map ( - rs => e_in.read_data3, - rb => e_in.read_data2, + rs => c_in, + rb => b_in, op => e_in.insn_type, invert_in => e_in.invert_a, invert_out => e_in.invert_out, - result => logical_result + result => logical_result, + datalen => e_in.data_len, + popcnt => popcnt_result, + parity => parity_result ); countzero_0: entity work.zero_counter port map ( - rs => e_in.read_data3, + clk => clk, + rs => c_in, count_right => e_in.insn(10), is_32bit => e_in.is_32bit, result => countzero_result ); + multiply_0: entity work.multiply + port map ( + clk => clk, + m_in => x_to_multiply, + m_out => multiply_to_x + ); + + divider_0: entity work.divider + port map ( + clk => clk, + rst => rst, + d_in => x_to_divider, + d_out => divider_to_x + ); + + a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1; + b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; + c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3; + execute1_0: process(clk) begin if rising_edge(clk) then @@ -159,6 +206,14 @@ begin variable l : std_ulogic; variable next_nia : std_ulogic_vector(63 downto 0); variable carry_32, carry_64 : std_ulogic; + variable sign1, sign2 : std_ulogic; + variable abs1, abs2 : signed(63 downto 0); + variable overflow : std_ulogic; + variable negative : std_ulogic; + variable zerohi, zerolo : std_ulogic; + variable msb_a, msb_b : std_ulogic; + variable a_lt : std_ulogic; + variable lv : Execute1ToLoadstore1Type; begin result := (others => '0'); result_with_carry := (others => '0'); @@ -204,6 +259,83 @@ begin end if; v.lr_update := '0'; + v.mul_in_progress := '0'; + v.div_in_progress := '0'; + v.cntz_in_progress := '0'; + + -- signals to multiply unit + x_to_multiply <= Execute1ToMultiplyInit; + x_to_multiply.insn_type <= e_in.insn_type; + x_to_multiply.is_32bit <= e_in.is_32bit; + + if e_in.is_32bit = '1' then + if e_in.is_signed = '1' then + x_to_multiply.data1 <= (others => a_in(31)); + x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0); + x_to_multiply.data2 <= (others => b_in(31)); + x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0); + else + x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0); + x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0); + end if; + else + if e_in.is_signed = '1' then + x_to_multiply.data1 <= a_in(63) & a_in; + x_to_multiply.data2 <= b_in(63) & b_in; + else + x_to_multiply.data1 <= '0' & a_in; + x_to_multiply.data2 <= '0' & b_in; + end if; + end if; + + -- signals to divide unit + sign1 := '0'; + sign2 := '0'; + if e_in.is_signed = '1' then + if e_in.is_32bit = '1' then + sign1 := a_in(31); + sign2 := b_in(31); + else + sign1 := a_in(63); + sign2 := b_in(63); + end if; + end if; + -- take absolute values + if sign1 = '0' then + abs1 := signed(a_in); + else + abs1 := - signed(a_in); + end if; + if sign2 = '0' then + abs2 := signed(b_in); + else + abs2 := - signed(b_in); + end if; + + x_to_divider <= Execute1ToDividerInit; + x_to_divider.is_signed <= e_in.is_signed; + x_to_divider.is_32bit <= e_in.is_32bit; + if e_in.insn_type = OP_MOD then + x_to_divider.is_modulus <= '1'; + end if; + x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus); + if e_in.is_32bit = '0' then + -- 64-bit forms + if e_in.insn_type = OP_DIVE then + x_to_divider.is_extended <= '1'; + end if; + x_to_divider.dividend <= std_ulogic_vector(abs1); + x_to_divider.divisor <= std_ulogic_vector(abs2); + else + -- 32-bit forms + x_to_divider.is_extended <= '0'; + if e_in.insn_type = OP_DIVE then -- extended forms + x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000"; + else + x_to_divider.dividend <= x"00000000" & std_ulogic_vector(abs1(31 downto 0)); + end if; + x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0)); + end if; ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq @@ -226,8 +358,10 @@ begin v.e.valid := '1'; v.e.write_reg := e_in.write_reg; - v.e.write_len := x"8"; - v.e.sign_extend := '0'; + v.slow_op_dest := gspr_to_gpr(e_in.write_reg); + v.slow_op_rc := e_in.rc; + v.slow_op_oe := e_in.oe; + v.slow_op_xerc := v.e.xerc; case_0: case e_in.insn_type is @@ -236,51 +370,93 @@ begin report "illegal"; when OP_NOP => -- Do nothing - when OP_ADD => + when OP_ADD | OP_CMP => if e_in.invert_a = '0' then - a_inv := e_in.read_data1; + a_inv := a_in; else - a_inv := not e_in.read_data1; + a_inv := not a_in; end if; - result_with_carry := ppc_adde(a_inv, e_in.read_data2, + result_with_carry := ppc_adde(a_inv, b_in, decode_input_carry(e_in.input_carry, v.e.xerc)); result := result_with_carry(63 downto 0); - carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32); + carry_32 := result(32) xor a_inv(32) xor b_in(32); carry_64 := result_with_carry(64); - if e_in.output_carry = '1' then - set_carry(v.e, carry_32, carry_64); - end if; - if e_in.oe = '1' then - set_ov(v.e, - calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)), - calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31))); - end if; - result_en := '1'; + if e_in.insn_type = OP_ADD then + if e_in.output_carry = '1' then + set_carry(v.e, carry_32, carry_64); + end if; + if e_in.oe = '1' then + set_ov(v.e, + calc_ov(a_inv(63), b_in(63), carry_64, result_with_carry(63)), + calc_ov(a_inv(31), b_in(31), carry_32, result_with_carry(31))); + end if; + result_en := '1'; + else + -- CMP and CMPL instructions + -- Note, we have done RB - RA, not RA - RB + bf := insn_bf(e_in.insn); + l := insn_l(e_in.insn); + v.e.write_cr_enable := '1'; + crnum := to_integer(unsigned(bf)); + v.e.write_cr_mask := num_to_fxm(crnum); + zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0))); + zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32))); + if zerolo = '1' and (l = '0' or zerohi = '1') then + -- values are equal + newcrf := "001" & v.e.xerc.so; + else + if l = '1' then + -- 64-bit comparison + msb_a := a_in(63); + msb_b := b_in(63); + else + -- 32-bit comparison + msb_a := a_in(31); + msb_b := b_in(31); + end if; + if msb_a /= msb_b then + -- Subtraction might overflow, but + -- comparison is clear from MSB difference. + -- for signed, 0 is greater; for unsigned, 1 is greater + a_lt := msb_a xnor e_in.is_signed; + else + -- Subtraction cannot overflow since MSBs are equal. + -- carry = 1 indicates RA is smaller (signed or unsigned) + a_lt := (not l and carry_32) or (l and carry_64); + end if; + newcrf := a_lt & not a_lt & '0' & v.e.xerc.so; + end if; + for i in 0 to 7 loop + lo := i*4; + hi := lo + 3; + v.e.write_cr_data(hi downto lo) := newcrf; + end loop; + end if; when OP_AND | OP_OR | OP_XOR => result := logical_result; result_en := '1'; when OP_B => f_out.redirect <= '1'; if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(b_in)); else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); end if; when OP_BC => -- read_data1 is CTR bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' then - result := std_ulogic_vector(unsigned(e_in.read_data1) - 1); + result := std_ulogic_vector(unsigned(a_in) - 1); result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then f_out.redirect <= '1'; if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(b_in)); else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); end if; end if; when OP_BCREG => @@ -289,53 +465,41 @@ begin bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' and e_in.insn(10) = '0' then - result := std_ulogic_vector(unsigned(e_in.read_data1) - 1); + result := std_ulogic_vector(unsigned(a_in) - 1); result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then f_out.redirect <= '1'; - f_out.redirect_nia <= e_in.read_data2(63 downto 2) & "00"; + f_out.redirect_nia <= b_in(63 downto 2) & "00"; end if; when OP_CMPB => - result := ppc_cmpb(e_in.read_data3, e_in.read_data2); + result := ppc_cmpb(c_in, b_in); result_en := '1'; - when OP_CMP => - bf := insn_bf(e_in.insn); - l := insn_l(e_in.insn); - v.e.write_cr_enable := '1'; - crnum := to_integer(unsigned(bf)); - v.e.write_cr_mask := num_to_fxm(crnum); - for i in 0 to 7 loop - lo := i*4; - hi := lo + 3; - v.e.write_cr_data(hi downto lo) := ppc_cmp(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so); - end loop; - when OP_CMPL => - bf := insn_bf(e_in.insn); - l := insn_l(e_in.insn); - v.e.write_cr_enable := '1'; - crnum := to_integer(unsigned(bf)); - v.e.write_cr_mask := num_to_fxm(crnum); - for i in 0 to 7 loop - lo := i*4; - hi := lo + 3; - v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so); - end loop; - when OP_CNTZ => - result := countzero_result; - result_en := '1'; - when OP_EXTS => - v.e.write_len := e_in.data_len; - v.e.sign_extend := '1'; - result := e_in.read_data3; + when OP_CNTZ => + v.e.valid := '0'; + v.cntz_in_progress := '1'; + stall_out <= '1'; + when OP_EXTS => + -- note data_len is a 1-hot encoding + negative := (e_in.data_len(0) and c_in(7)) or + (e_in.data_len(1) and c_in(15)) or + (e_in.data_len(2) and c_in(31)); + result := (others => negative); + if e_in.data_len(2) = '1' then + result(31 downto 16) := c_in(31 downto 16); + end if; + if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then + result(15 downto 8) := c_in(15 downto 8); + end if; + result(7 downto 0) := c_in(7 downto 0); result_en := '1'; when OP_ISEL => crbit := to_integer(unsigned(insn_bc(e_in.insn))); if e_in.cr(31-crbit) = '1' then - result := e_in.read_data1; + result := a_in; else - result := e_in.read_data2; + result := b_in; end if; result_en := '1'; when OP_MCRF => @@ -400,7 +564,7 @@ begin end if; when OP_MFSPR => if is_fast_spr(e_in.read_reg1) then - result := e_in.read_data1; + result := a_in; if decode_spr_num(e_in.insn) = SPR_XER then -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer result(63 downto 32) := (others => '0'); @@ -447,19 +611,19 @@ begin crnum := fxm_to_num(insn_fxm(e_in.insn)); v.e.write_cr_mask := num_to_fxm(crnum); end if; - v.e.write_cr_data := e_in.read_data3(31 downto 0); + v.e.write_cr_data := c_in(31 downto 0); when OP_MTSPR => report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & - "=" & to_hstring(e_in.read_data3); + "=" & to_hstring(c_in); if is_fast_spr(e_in.write_reg) then - result := e_in.read_data3; + result := c_in; result_en := '1'; if decode_spr_num(e_in.insn) = SPR_XER then - v.e.xerc.so := e_in.read_data3(63-32); - v.e.xerc.ov := e_in.read_data3(63-33); - v.e.xerc.ca := e_in.read_data3(63-34); - v.e.xerc.ov32 := e_in.read_data3(63-44); - v.e.xerc.ca32 := e_in.read_data3(63-45); + v.e.xerc.so := c_in(63-32); + v.e.xerc.ov := c_in(63-33); + v.e.xerc.ca := c_in(63-34); + v.e.xerc.ov32 := c_in(63-44); + v.e.xerc.ca32 := c_in(63-45); v.e.write_xerc_enable := '1'; end if; else @@ -468,20 +632,11 @@ begin -- when others => -- end case; end if; - when OP_POPCNTB => - result := ppc_popcntb(e_in.read_data3); + when OP_POPCNT => + result := popcnt_result; result_en := '1'; - when OP_POPCNTW => - result := ppc_popcntw(e_in.read_data3); - result_en := '1'; - when OP_POPCNTD => - result := ppc_popcntd(e_in.read_data3); - result_en := '1'; - when OP_PRTYD => - result := ppc_prtyd(e_in.read_data3); - result_en := '1'; - when OP_PRTYW => - result := ppc_prtyw(e_in.read_data3); + when OP_PRTY => + result := parity_result; result_en := '1'; when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR => result := rotator_result; @@ -506,11 +661,29 @@ begin when OP_ICBI => icache_inval <= '1'; - when others => + when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => + v.e.valid := '0'; + v.mul_in_progress := '1'; + stall_out <= '1'; + x_to_multiply.valid <= '1'; + + when OP_DIV | OP_DIVE | OP_MOD => + v.e.valid := '0'; + v.div_in_progress := '1'; + stall_out <= '1'; + x_to_divider.valid <= '1'; + + when OP_LOAD | OP_STORE => + -- loadstore/dcache has its own port to writeback + v.e.valid := '0'; + + when others => terminate_out <= '1'; report "illegal"; end case; + v.e.rc := e_in.rc and e_in.valid; + -- Update LR on the next cycle after a branch link -- -- WARNING: The LR update isn't tracked by our hazard tracker. This @@ -533,20 +706,74 @@ begin result_en := '1'; result := r.next_lr; v.e.write_reg := fast_spr_num(SPR_LR); - v.e.write_len := x"8"; - v.e.sign_extend := '0'; v.e.valid := '1'; + elsif r.cntz_in_progress = '1' then + -- cnt[lt]z always takes two cycles + result := countzero_result; + result_en := '1'; + v.e.write_reg := gpr_to_gspr(v.slow_op_dest); + v.e.rc := v.slow_op_rc; + v.e.xerc := v.slow_op_xerc; + v.e.valid := '1'; + elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then + if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or + (r.div_in_progress = '1' and divider_to_x.valid = '1') then + if r.mul_in_progress = '1' then + result := multiply_to_x.write_reg_data; + overflow := multiply_to_x.overflow; + else + result := divider_to_x.write_reg_data; + overflow := divider_to_x.overflow; + end if; + result_en := '1'; + v.e.write_reg := gpr_to_gspr(v.slow_op_dest); + v.e.rc := v.slow_op_rc; + v.e.xerc := v.slow_op_xerc; + v.e.write_xerc_enable := v.slow_op_oe; + -- We must test oe because the RC update code in writeback + -- will use the xerc value to set CR0:SO so we must not clobber + -- xerc if OE wasn't set. + if v.slow_op_oe = '1' then + v.e.xerc.ov := overflow; + v.e.xerc.ov32 := overflow; + v.e.xerc.so := v.slow_op_xerc.so or overflow; + end if; + v.e.valid := '1'; + else + stall_out <= '1'; + v.mul_in_progress := r.mul_in_progress; + v.div_in_progress := r.div_in_progress; + end if; end if; v.e.write_data := result; v.e.write_enable := result_en; - v.e.rc := e_in.rc and e_in.valid; + + -- Outputs to loadstore1 (async) + lv := Execute1ToLoadstore1Init; + if e_in.valid = '1' and (e_in.insn_type = OP_LOAD or e_in.insn_type = OP_STORE) then + lv.valid := '1'; + end if; + if e_in.insn_type = OP_LOAD then + lv.load := '1'; + end if; + lv.addr1 := a_in; + lv.addr2 := b_in; + lv.data := c_in; + lv.write_reg := gspr_to_gpr(e_in.write_reg); + lv.length := e_in.data_len; + lv.byte_reverse := e_in.byte_reverse; + lv.sign_extend := e_in.sign_extend; + lv.update := e_in.update; + lv.update_reg := gspr_to_gpr(e_in.read_reg1); + lv.xerc := v.e.xerc; -- Update registers rin <= v; -- update outputs --f_out <= r.f; + l_out <= lv; e_out <= r.e; flush_out <= f_out.redirect; end process; diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl index 705e69d..de4f7d2 100644 --- a/gpr_hazard.vhdl +++ b/gpr_hazard.vhdl @@ -12,18 +12,21 @@ entity gpr_hazard is gpr_write_valid_in : in std_ulogic; gpr_write_in : in std_ulogic_vector(5 downto 0); + bypass_avail : in std_ulogic; gpr_read_valid_in : in std_ulogic; gpr_read_in : in std_ulogic_vector(5 downto 0); - stall_out : out std_ulogic + stall_out : out std_ulogic; + use_bypass : out std_ulogic ); end entity gpr_hazard; architecture behaviour of gpr_hazard is type pipeline_entry_type is record - valid : std_ulogic; - gpr : std_ulogic_vector(5 downto 0); + valid : std_ulogic; + bypass : std_ulogic; + gpr : std_ulogic_vector(5 downto 0); end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0', gpr => (others => '0')); + constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0')); type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type; constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); @@ -33,9 +36,7 @@ begin gpr_hazard0: process(clk) begin if rising_edge(clk) then - if stall_in = '0' then - r <= rin; - end if; + r <= rin; end if; end process; @@ -45,22 +46,49 @@ begin v := r; stall_out <= '0'; - loop_0: for i in 0 to PIPELINE_DEPTH-1 loop - if ((r(i).valid = gpr_read_valid_in) and r(i).gpr = gpr_read_in) then - stall_out <= '1'; + use_bypass <= '0'; + if gpr_read_valid_in = '1' then + if r(0).valid = '1' and r(0).gpr = gpr_read_in then + if r(0).bypass = '1' and stall_in = '0' then + use_bypass <= '1'; + else + stall_out <= '1'; + end if; end if; - end loop; + loop_0: for i in 1 to PIPELINE_DEPTH-1 loop + if r(i).valid = '1' and r(i).gpr = gpr_read_in then + if r(i).bypass = '1' then + use_bypass <= '1'; + else + stall_out <= '1'; + end if; + end if; + end loop; + end if; - v(0).valid := gpr_write_valid_in; - v(0).gpr := gpr_write_in; - loop_1: for i in 0 to PIPELINE_DEPTH-2 loop - -- propagate to next slot - v(i+1) := r(i); - end loop; + if stall_in = '0' then + v(0).valid := gpr_write_valid_in; + v(0).bypass := bypass_avail; + v(0).gpr := gpr_write_in; + loop_1: for i in 1 to PIPELINE_DEPTH-1 loop + -- propagate to next slot + v(i).valid := r(i-1).valid; + v(i).bypass := r(i-1).bypass; + v(i).gpr := r(i-1).gpr; + end loop; - -- asynchronous output - if gpr_read_valid_in = '0' then - stall_out <= '0'; + else + -- stage 0 stalled, so stage 1 becomes empty + loop_1b: for i in 1 to PIPELINE_DEPTH-1 loop + -- propagate to next slot + if i = 1 then + v(i).valid := '0'; + else + v(i).valid := r(i-1).valid; + v(i).bypass := r(i-1).bypass; + v(i).gpr := r(i-1).gpr; + end if; + end loop; end if; -- update registers diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 1c16c46..5b61d4c 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -13,7 +13,7 @@ entity loadstore1 is port ( clk : in std_ulogic; - l_in : in Decode2ToLoadstore1Type; + l_in : in Execute1ToLoadstore1Type; l_out : out Loadstore1ToDcacheType ); diff --git a/logical.vhdl b/logical.vhdl index b92b98d..4dfc13d 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -12,11 +12,29 @@ entity logical is op : in insn_type_t; invert_in : in std_ulogic; invert_out : in std_ulogic; - result : out std_ulogic_vector(63 downto 0) + result : out std_ulogic_vector(63 downto 0); + datalen : in std_logic_vector(3 downto 0); + popcnt : out std_ulogic_vector(63 downto 0); + parity : out std_ulogic_vector(63 downto 0) ); end entity logical; architecture behaviour of logical is + + subtype twobit is unsigned(1 downto 0); + type twobit32 is array(0 to 31) of twobit; + signal pc2 : twobit32; + subtype threebit is unsigned(2 downto 0); + type threebit16 is array(0 to 15) of threebit; + signal pc4 : threebit16; + subtype fourbit is unsigned(3 downto 0); + type fourbit8 is array(0 to 7) of fourbit; + signal pc8 : fourbit8; + subtype sixbit is unsigned(5 downto 0); + type sixbit2 is array(0 to 1) of sixbit; + signal pc32 : sixbit2; + signal par0, par1 : std_ulogic; + begin logical_0: process(all) variable rb_adj, tmp : std_ulogic_vector(63 downto 0); @@ -40,5 +58,45 @@ begin result <= not tmp; end if; + -- population counts + for i in 0 to 31 loop + pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1)); + end loop; + for i in 0 to 15 loop + pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1)); + end loop; + for i in 0 to 7 loop + pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1)); + end loop; + for i in 0 to 1 loop + pc32(i) <= ("00" & pc8(i * 4)) + ("00" & pc8(i * 4 + 1)) + + ("00" & pc8(i * 4 + 2)) + ("00" & pc8(i * 4 + 3)); + end loop; + popcnt <= (others => '0'); + if datalen(3 downto 2) = "00" then + -- popcntb + for i in 0 to 7 loop + popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8(i)); + end loop; + elsif datalen(3) = '0' then + -- popcntw + for i in 0 to 1 loop + popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i)); + end loop; + else + popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1))); + end if; + + -- parity calculations + par0 <= rs(0) xor rs(8) xor rs(16) xor rs(24); + par1 <= rs(32) xor rs(40) xor rs(48) xor rs(56); + parity <= (others => '0'); + if datalen(3) = '1' then + parity(0) <= par0 xor par1; + else + parity(0) <= par0; + parity(32) <= par1; + end if; + end process; end behaviour; diff --git a/multiply.vhdl b/multiply.vhdl index 23339b5..959c114 100644 --- a/multiply.vhdl +++ b/multiply.vhdl @@ -13,31 +13,24 @@ entity multiply is port ( clk : in std_logic; - m_in : in Decode2ToMultiplyType; - m_out : out MultiplyToWritebackType + m_in : in Execute1ToMultiplyType; + m_out : out MultiplyToExecute1Type ); end entity multiply; architecture behaviour of multiply is - signal m: Decode2ToMultiplyType; + signal m: Execute1ToMultiplyType; type multiply_pipeline_stage is record valid : std_ulogic; insn_type : insn_type_t; data : signed(129 downto 0); - write_reg : std_ulogic_vector(4 downto 0); - rc : std_ulogic; - oe : std_ulogic; is_32bit : std_ulogic; - xerc : xer_common_t; end record; constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0', insn_type => OP_ILLEGAL, - rc => '0', oe => '0', is_32bit => '0', - xerc => xerc_init, - data => (others => '0'), - others => (others => '0')); + data => (others => '0')); type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage; constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit); @@ -64,16 +57,12 @@ begin begin v := r; - m_out <= MultiplyToWritebackInit; + m_out <= MultiplyToExecute1Init; v.multiply_pipeline(0).valid := m.valid; v.multiply_pipeline(0).insn_type := m.insn_type; v.multiply_pipeline(0).data := signed(m.data1) * signed(m.data2); - v.multiply_pipeline(0).write_reg := m.write_reg; - v.multiply_pipeline(0).rc := m.rc; - v.multiply_pipeline(0).oe := m.oe; v.multiply_pipeline(0).is_32bit := m.is_32bit; - v.multiply_pipeline(0).xerc := m.xerc; loop_0: for i in 1 to PIPELINE_DEPTH-1 loop v.multiply_pipeline(i) := r.multiply_pipeline(i-1); @@ -101,25 +90,10 @@ begin end case; m_out.write_reg_data <= d2; - m_out.write_reg_nr <= v.multiply_pipeline(PIPELINE_DEPTH-1).write_reg; - m_out.xerc <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc; + m_out.overflow <= ov; - -- Generate OV/OV32/SO when OE=1 if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then m_out.valid <= '1'; - m_out.write_reg_enable <= '1'; - m_out.rc <= v.multiply_pipeline(PIPELINE_DEPTH-1).rc; - m_out.write_xerc_enable <= v.multiply_pipeline(PIPELINE_DEPTH-1).oe; - - -- We must test oe because the RC update code in writeback - -- will use the xerc value to set CR0:SO so we must not clobber - -- xerc if OE wasn't set. - -- - if v.multiply_pipeline(PIPELINE_DEPTH-1).oe = '1' then - m_out.xerc.ov <= ov; - m_out.xerc.ov32 <= ov; - m_out.xerc.so <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc.so or ov; - end if; end if; rin <= v; diff --git a/multiply_tb.vhdl b/multiply_tb.vhdl index 48f83ab..8f1d795 100644 --- a/multiply_tb.vhdl +++ b/multiply_tb.vhdl @@ -17,8 +17,8 @@ architecture behave of multiply_tb is constant pipeline_depth : integer := 4; - signal m1 : Decode2ToMultiplyType; - signal m2 : MultiplyToWritebackType; + signal m1 : Execute1ToMultiplyType; + signal m2 : MultiplyToExecute1Type; begin multiply_0: entity work.multiply generic map (PIPELINE_DEPTH => pipeline_depth) @@ -40,10 +40,8 @@ begin m1.valid <= '1'; m1.insn_type <= OP_MUL_L64; - m1.write_reg <= "10001"; m1.data1 <= '0' & x"0000000000001000"; m1.data2 <= '0' & x"0000000000001111"; - m1.rc <= '0'; wait for clk_period; assert m2.valid = '0'; @@ -58,16 +56,12 @@ begin wait for clk_period; assert m2.valid = '1'; - assert m2.write_reg_enable = '1'; - assert m2.write_reg_nr = "10001"; assert m2.write_reg_data = x"0000000001111000"; - assert m2.rc = '0'; wait for clk_period; assert m2.valid = '0'; m1.valid <= '1'; - m1.rc <= '1'; wait for clk_period; assert m2.valid = '0'; @@ -76,10 +70,7 @@ begin wait for clk_period * (pipeline_depth-1); assert m2.valid = '1'; - assert m2.write_reg_enable = '1'; - assert m2.write_reg_nr = "10001"; assert m2.write_reg_data = x"0000000001111000"; - assert m2.rc = '1'; -- test mulld mulld_loop : for i in 0 to 1000 loop diff --git a/writeback.vhdl b/writeback.vhdl index 8582166..e53f46b 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -12,8 +12,6 @@ entity writeback is e_in : in Execute1ToWritebackType; l_in : in DcacheToWritebackType; - m_in : in MultiplyToWritebackType; - d_in : in DividerToWritebackType; w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType; @@ -44,7 +42,6 @@ architecture behaviour of writeback is signal sign_extend : std_ulogic; signal negative : std_ulogic; signal second_word : std_ulogic; - signal zero : std_ulogic; begin writeback_0: process(clk) begin @@ -64,44 +61,32 @@ begin variable k : unsigned(3 downto 0); variable cf: std_ulogic_vector(3 downto 0); variable xe: xer_common_t; + variable zero : std_ulogic; + variable sign : std_ulogic; begin x := "" & e_in.valid; y := "" & l_in.valid; - z := "" & m_in.valid; - w := "" & d_in.valid; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; x := "" & e_in.write_enable; y := "" & l_in.write_enable; - z := "" & m_in.write_reg_enable; - w := "" & d_in.write_reg_enable; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; w := "" & e_in.write_cr_enable; x := "" & (e_in.write_enable and e_in.rc); - y := "" & (m_in.valid and m_in.rc); - z := "" & (d_in.valid and d_in.rc); - assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; - - x := "" & e_in.write_xerc_enable; - y := "" & m_in.write_xerc_enable; - z := "" & D_in.write_xerc_enable; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; + assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure; w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; complete_out <= '0'; - if e_in.valid = '1' or l_in.valid = '1' or m_in.valid = '1' or d_in.valid = '1' then + if e_in.valid = '1' or l_in.valid = '1' then complete_out <= '1'; end if; rc <= '0'; brev_lenm1 <= "000"; - byte_offset <= "000"; - data_len <= x"8"; partial_write <= '0'; - sign_extend <= '0'; second_word <= '0'; xe := e_in.xerc; data_in <= (others => '0'); @@ -109,9 +94,6 @@ begin if e_in.write_enable = '1' then w_out.write_reg <= e_in.write_reg; w_out.write_enable <= '1'; - data_in <= e_in.write_data; - data_len <= unsigned(e_in.write_len); - sign_extend <= e_in.sign_extend; rc <= e_in.rc; end if; @@ -126,12 +108,11 @@ begin c_out.write_xerc_data <= e_in.xerc; end if; + sign_extend <= l_in.sign_extend; + data_len <= unsigned(l_in.write_len); + byte_offset <= unsigned(l_in.write_shift); if l_in.write_enable = '1' then w_out.write_reg <= gpr_to_gspr(l_in.write_reg); - data_in <= l_in.write_data; - data_len <= unsigned(l_in.write_len); - byte_offset <= unsigned(l_in.write_shift); - sign_extend <= l_in.sign_extend; if l_in.byte_reverse = '1' then brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1; end if; @@ -143,32 +124,6 @@ begin xe := l_in.xerc; end if; - if m_in.write_reg_enable = '1' then - w_out.write_enable <= '1'; - w_out.write_reg <= gpr_to_gspr(m_in.write_reg_nr); - data_in <= m_in.write_reg_data; - rc <= m_in.rc; - xe := m_in.xerc; - end if; - - if m_in.write_xerc_enable = '1' then - c_out.write_xerc_enable <= '1'; - c_out.write_xerc_data <= m_in.xerc; - end if; - - if d_in.write_reg_enable = '1' then - w_out.write_enable <= '1'; - w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr); - data_in <= d_in.write_reg_data; - rc <= d_in.rc; - xe := d_in.xerc; - end if; - - if d_in.write_xerc_enable = '1' then - c_out.write_xerc_enable <= '1'; - c_out.write_xerc_data <= d_in.xerc; - end if; - -- shift and byte-reverse data bytes for i in 0 to 7 loop k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); @@ -177,7 +132,7 @@ begin end loop; for i in 0 to 7 loop j := to_integer(perm(i)) * 8; - data_permuted(i * 8 + 7 downto i * 8) <= data_in(j + 7 downto j); + data_permuted(i * 8 + 7 downto i * 8) <= l_in.write_data(j + 7 downto j); end loop; -- If the data can arrive split over two cycles, this will be correct @@ -199,16 +154,12 @@ begin trim_ctl(i) <= '0' & (negative and sign_extend); end if; end loop; - zero <= not negative; for i in 0 to 7 loop case trim_ctl(i) is when "11" => data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8); when "10" => data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8); - if or data_permuted(i * 8 + 7 downto i * 8) /= '0' then - zero <= '0'; - end if; when "01" => data_trimmed(i * 8 + 7 downto i * 8) <= x"FF"; when others => @@ -217,14 +168,21 @@ begin end loop; -- deliver to regfile - w_out.write_data <= data_trimmed; + if l_in.write_enable = '1' then + w_out.write_data <= data_trimmed; + else + w_out.write_data <= e_in.write_data; + end if; -- Perform CR0 update for RC forms + -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data if rc = '1' then + sign := e_in.write_data(63); + zero := not (or e_in.write_data); c_out.write_cr_enable <= '1'; c_out.write_cr_mask <= num_to_fxm(0); - cf(3) := negative; - cf(2) := not negative and not zero; + cf(3) := sign; + cf(2) := not sign and not zero; cf(1) := zero; cf(0) := xe.so; c_out.write_cr_data(31 downto 28) <= cf;