From 2167186b5fae691b2a165cc5bfaaa79fe4713733 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 10 Dec 2019 20:52:21 +1100 Subject: [PATCH 01/10] Make multiplier hang off the side of execute1 With this, the multiplier isn't a separate pipe that decode2 issues instructions to, but rather is a unit that execute1 sends operands to and which sends the result back to execute1, which then sends it to writeback. Execute1 now sends a stall signal when it gets a multiply instruction until it gets a valid signal back from the multiplier. This all means that we no longer need to mark the multiply instructions as single-issue. Signed-off-by: Paul Mackerras --- Makefile | 4 +-- common.vhdl | 19 +++++++------ core.vhdl | 14 ---------- decode1.vhdl | 26 +++++++++--------- decode2.vhdl | 43 +----------------------------- decode_types.vhdl | 2 +- execute1.vhdl | 68 ++++++++++++++++++++++++++++++++++++++++++++++- multiply.vhdl | 9 +++---- multiply_tb.vhdl | 6 ++--- writeback.vhdl | 32 +++++----------------- 10 files changed, 106 insertions(+), 117 deletions(-) diff --git a/Makefile b/Makefile index e2398c0..720e8d5 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ common.o: decode_types.o control.o: gpr_hazard.o cr_hazard.o common.o sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o -core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o multiply.o writeback.o core_debug.o divider.o +core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o divider.o core_debug.o: common.o countzero.o: countzero_tb.o: common.o glibc_random.o countzero.o @@ -26,7 +26,7 @@ crhelpers.o: common.o decode1.o: common.o decode_types.o decode2.o: decode_types.o common.o helpers.o insn_helpers.o control.o decode_types.o: -execute1.o: decode_types.o common.o helpers.o 
crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o +execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o fetch1.o: common.o fetch2.o: common.o wishbone_types.o glibc_random_helpers.o: diff --git a/common.vhdl b/common.vhdl index a27f4f2..9c18230 100644 --- a/common.vhdl +++ b/common.vhdl @@ -130,7 +130,7 @@ package common is invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0')); - type Decode2ToMultiplyType is record + type Execute1ToMultiplyType is record valid: std_ulogic; insn_type: insn_type_t; write_reg: gpr_index_t; @@ -141,9 +141,9 @@ package common is is_32bit: std_ulogic; xerc: xer_common_t; end record; - constant Decode2ToMultiplyInit : Decode2ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0', - oe => '0', is_32bit => '0', xerc => xerc_init, - others => (others => '0')); + constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0', + oe => '0', is_32bit => '0', xerc => xerc_init, + others => (others => '0')); type Decode2ToDividerType is record valid: std_ulogic; @@ -261,20 +261,19 @@ package common is write_xerc_enable => '0', xerc => xerc_init, others => (others => '0')); - type MultiplyToWritebackType is record + type MultiplyToExecute1Type is record valid: std_ulogic; - write_reg_enable : std_ulogic; write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); write_xerc_enable : std_ulogic; xerc : xer_common_t; rc: std_ulogic; end record; - constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0', - rc => '0', write_xerc_enable => '0', - xerc => xerc_init, - others => (others => '0')); + constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', + rc => '0', write_xerc_enable => '0', + xerc 
=> xerc_init, + others => (others => '0')); type DividerToWritebackType is record valid: std_ulogic; diff --git a/core.vhdl b/core.vhdl index eb0b526..71c10b3 100644 --- a/core.vhdl +++ b/core.vhdl @@ -63,10 +63,6 @@ architecture behave of core is signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal dcache_to_writeback: DcacheToWritebackType; - -- multiply signals - signal decode2_to_multiply: Decode2ToMultiplyType; - signal multiply_to_writeback: MultiplyToWritebackType; - -- divider signals signal decode2_to_divider: Decode2ToDividerType; signal divider_to_writeback: DividerToWritebackType; @@ -115,7 +111,6 @@ architecture behave of core is attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN); - attribute keep_hierarchy of multiply_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of divider_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN); @@ -197,7 +192,6 @@ begin d_in => decode1_to_decode2, e_out => decode2_to_execute1, l_out => decode2_to_loadstore1, - m_out => decode2_to_multiply, d_out => decode2_to_divider, r_in => register_file_to_decode2, r_out => decode2_to_register_file, @@ -265,13 +259,6 @@ begin wishbone_out => wishbone_data_out ); - multiply_0: entity work.multiply - port map ( - clk => clk, - m_in => decode2_to_multiply, - m_out => multiply_to_writeback - ); - divider_0: entity work.divider port map ( clk => clk, @@ -285,7 +272,6 @@ begin clk => clk, e_in => execute1_to_writeback, l_in => dcache_to_writeback, - m_in => multiply_to_writeback, d_in => divider_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, diff --git a/decode1.vhdl b/decode1.vhdl index 51a2643..4e1d063 100644 --- 
a/decode1.vhdl +++ b/decode1.vhdl @@ -54,7 +54,7 @@ architecture behaviour of decode1 is 41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lhzu 32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lwz 33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lwzu - 7 => (MUL, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '1'), -- mulli + 7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli 24 => (ALU, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori 25 => (ALU, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris 20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi @@ -244,19 +244,19 @@ architecture behaviour of decode1 is 2#1100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw 2#0010010000# => (ALU, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf 2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr - 2#0001001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulhd - 2#0000001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', 
'1'), -- mulhdu - 2#0001001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mulhw - 2#0000001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '1'), -- mulhwu + 2#0001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd + 2#0000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu + 2#0001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw + 2#0000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu -- next 4 have reserved bit set - 2#1001001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulhd - 2#1000001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- mulhdu - 2#1001001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mulhw - 2#1000001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '1'), -- mulhwu - 2#0011101001# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulld - 2#1011101001# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulldo - 2#0011101011# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mullw - 2#1011101011# => (MUL, OP_MUL_L64, RA, RB, 
NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mullwo + 2#1001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd + 2#1000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu + 2#1001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw + 2#1000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu + 2#0011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulld + 2#1011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulldo + 2#0011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullw + 2#1011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullwo 2#0111011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nand 2#0001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- neg 2#1001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nego diff --git a/decode2.vhdl b/decode2.vhdl index f6f7101..2da5c41 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -24,7 +24,6 @@ entity decode2 is d_in : in Decode1ToDecode2Type; e_out : out Decode2ToExecute1Type; - m_out : out Decode2ToMultiplyType; d_out : out Decode2ToDividerType; l_out : out 
Decode2ToLoadstore1Type; @@ -39,7 +38,6 @@ end entity decode2; architecture behaviour of decode2 is type reg_type is record e : Decode2ToExecute1Type; - m : Decode2ToMultiplyType; d : Decode2ToDividerType; l : Decode2ToLoadstore1Type; end record; @@ -238,7 +236,7 @@ begin decode2_0: process(clk) begin if rising_edge(clk) then - if rin.e.valid = '1' or rin.l.valid = '1' or rin.m.valid = '1' or rin.d.valid = '1' then + if rin.e.valid = '1' or rin.l.valid = '1' or rin.d.valid = '1' then report "execute " & to_hstring(rin.e.nia); end if; r <= rin; @@ -266,14 +264,12 @@ begin v.e := Decode2ToExecute1Init; v.l := Decode2ToLoadStore1Init; - v.m := Decode2ToMultiplyInit; v.d := Decode2ToDividerInit; mul_a := (others => '0'); mul_b := (others => '0'); --v.e.input_cr := d_in.decode.input_cr; - --v.m.input_cr := d_in.decode.input_cr; --v.e.output_cr := d_in.decode.output_cr; decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1); @@ -323,38 +319,6 @@ begin v.e.insn := d_in.insn; v.e.data_len := length; - -- multiply unit - v.m.insn_type := d_in.decode.insn_type; - mul_a := decoded_reg_a.data; - mul_b := decoded_reg_b.data; - v.m.write_reg := gspr_to_gpr(decoded_reg_o.reg); - v.m.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.m.xerc := c_in.read_xerc_data; - if v.m.insn_type = OP_MUL_L64 then - v.m.oe := decode_oe(d_in.decode.rc, d_in.insn); - end if; - v.m.is_32bit := d_in.decode.is_32bit; - - if d_in.decode.is_32bit = '1' then - if d_in.decode.is_signed = '1' then - v.m.data1 := (others => mul_a(31)); - v.m.data1(31 downto 0) := mul_a(31 downto 0); - v.m.data2 := (others => mul_b(31)); - v.m.data2(31 downto 0) := mul_b(31 downto 0); - else - v.m.data1 := '0' & x"00000000" & mul_a(31 downto 0); - v.m.data2 := '0' & x"00000000" & mul_b(31 downto 0); - end if; - else - if d_in.decode.is_signed = '1' then - v.m.data1 := mul_a(63) & mul_a; - v.m.data2 := mul_b(63) & mul_b; - else - v.m.data1 := '0' & mul_a; - v.m.data2 := '0' & 
mul_b; - end if; - end if; - -- divide unit -- PPC divide and modulus instruction words have these bits in -- the bottom 11 bits: o1dns 010t1 r @@ -438,7 +402,6 @@ begin cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); v.e.valid := '0'; - v.m.valid := '0'; v.d.valid := '0'; v.l.valid := '0'; case d_in.decode.unit is @@ -446,8 +409,6 @@ begin v.e.valid := control_valid_out; when LDST => v.l.valid := control_valid_out; - when MUL => - v.m.valid := control_valid_out; when DIV => v.d.valid := control_valid_out; when NONE => @@ -458,7 +419,6 @@ begin if rst = '1' then v.e := Decode2ToExecute1Init; v.l := Decode2ToLoadStore1Init; - v.m := Decode2ToMultiplyInit; v.d := Decode2ToDividerInit; end if; @@ -468,7 +428,6 @@ begin -- Update outputs e_out <= r.e; l_out <= r.l; - m_out <= r.m; d_out <= r.d; end process; end architecture behaviour; diff --git a/decode_types.vhdl b/decode_types.vhdl index e847fcf..9860406 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -46,7 +46,7 @@ package decode_types is constant TOO_OFFSET : integer := 0; - type unit_t is (NONE, ALU, LDST, MUL, DIV); + type unit_t is (NONE, ALU, LDST, DIV); type length_t is (NONE, is1B, is2B, is4B, is8B); type decode_rom_t is record diff --git a/execute1.vhdl b/execute1.vhdl index 4714ec5..710044f 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -35,6 +35,7 @@ architecture behaviour of execute1 is e : Execute1ToWritebackType; lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); + mul_in_progress : std_ulogic; end record; signal r, rin : reg_type; @@ -48,6 +49,10 @@ architecture behaviour of execute1 is signal logical_result: std_ulogic_vector(63 downto 0); signal countzero_result: std_ulogic_vector(63 downto 0); + -- multiply signals + signal x_to_multiply: Execute1ToMultiplyType; + signal multiply_to_x: MultiplyToExecute1Type; + procedure set_carry(e: inout Execute1ToWritebackType; carry32 : in std_ulogic; carry : in std_ulogic) is @@ -123,6 +128,13 @@ 
begin result => countzero_result ); + multiply_0: entity work.multiply + port map ( + clk => clk, + m_in => x_to_multiply, + m_out => multiply_to_x + ); + execute1_0: process(clk) begin if rising_edge(clk) then @@ -204,6 +216,38 @@ begin end if; v.lr_update := '0'; + v.mul_in_progress := '0'; + + -- signals to multiply unit + x_to_multiply <= Execute1ToMultiplyInit; + x_to_multiply.insn_type <= e_in.insn_type; + x_to_multiply.write_reg <= gspr_to_gpr(e_in.write_reg); + x_to_multiply.rc <= e_in.rc; + x_to_multiply.xerc <= v.e.xerc; + if e_in.insn_type = OP_MUL_L64 then + x_to_multiply.oe <= e_in.oe; + end if; + x_to_multiply.is_32bit <= e_in.is_32bit; + + if e_in.is_32bit = '1' then + if e_in.is_signed = '1' then + x_to_multiply.data1 <= (others => e_in.read_data1(31)); + x_to_multiply.data1(31 downto 0) <= e_in.read_data1(31 downto 0); + x_to_multiply.data2 <= (others => e_in.read_data2(31)); + x_to_multiply.data2(31 downto 0) <= e_in.read_data2(31 downto 0); + else + x_to_multiply.data1 <= '0' & x"00000000" & e_in.read_data1(31 downto 0); + x_to_multiply.data2 <= '0' & x"00000000" & e_in.read_data2(31 downto 0); + end if; + else + if e_in.is_signed = '1' then + x_to_multiply.data1 <= e_in.read_data1(63) & e_in.read_data1; + x_to_multiply.data2 <= e_in.read_data2(63) & e_in.read_data2; + else + x_to_multiply.data1 <= '0' & e_in.read_data1; + x_to_multiply.data2 <= '0' & e_in.read_data2; + end if; + end if; ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq @@ -506,11 +550,19 @@ begin when OP_ICBI => icache_inval <= '1'; + when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => + v.e.valid := '0'; + v.mul_in_progress := '1'; + stall_out <= '1'; + x_to_multiply.valid <= '1'; + when others => terminate_out <= '1'; report "illegal"; end case; + v.e.rc := e_in.rc and e_in.valid; + -- Update LR on the next cycle after a branch link -- -- WARNING: The LR update isn't tracked by our hazard tracker. 
This @@ -536,11 +588,25 @@ begin v.e.write_len := x"8"; v.e.sign_extend := '0'; v.e.valid := '1'; + elsif r.mul_in_progress = '1' then + if multiply_to_x.valid = '1' then + v.e.write_reg := gpr_to_gspr(multiply_to_x.write_reg_nr); + result := multiply_to_x.write_reg_data; + result_en := '1'; + v.e.rc := multiply_to_x.rc; + v.e.xerc := multiply_to_x.xerc; + v.e.write_xerc_enable := multiply_to_x.write_xerc_enable; + v.e.valid := '1'; + v.e.write_len := x"8"; + v.e.sign_extend := '0'; + else + stall_out <= '1'; + v.mul_in_progress := '1'; + end if; end if; v.e.write_data := result; v.e.write_enable := result_en; - v.e.rc := e_in.rc and e_in.valid; -- Update registers rin <= v; diff --git a/multiply.vhdl b/multiply.vhdl index 23339b5..714b844 100644 --- a/multiply.vhdl +++ b/multiply.vhdl @@ -13,13 +13,13 @@ entity multiply is port ( clk : in std_logic; - m_in : in Decode2ToMultiplyType; - m_out : out MultiplyToWritebackType + m_in : in Execute1ToMultiplyType; + m_out : out MultiplyToExecute1Type ); end entity multiply; architecture behaviour of multiply is - signal m: Decode2ToMultiplyType; + signal m: Execute1ToMultiplyType; type multiply_pipeline_stage is record valid : std_ulogic; @@ -64,7 +64,7 @@ begin begin v := r; - m_out <= MultiplyToWritebackInit; + m_out <= MultiplyToExecute1Init; v.multiply_pipeline(0).valid := m.valid; v.multiply_pipeline(0).insn_type := m.insn_type; @@ -107,7 +107,6 @@ begin -- Generate OV/OV32/SO when OE=1 if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then m_out.valid <= '1'; - m_out.write_reg_enable <= '1'; m_out.rc <= v.multiply_pipeline(PIPELINE_DEPTH-1).rc; m_out.write_xerc_enable <= v.multiply_pipeline(PIPELINE_DEPTH-1).oe; diff --git a/multiply_tb.vhdl b/multiply_tb.vhdl index 48f83ab..a76d739 100644 --- a/multiply_tb.vhdl +++ b/multiply_tb.vhdl @@ -17,8 +17,8 @@ architecture behave of multiply_tb is constant pipeline_depth : integer := 4; - signal m1 : Decode2ToMultiplyType; - signal m2 : MultiplyToWritebackType; + signal 
m1 : Execute1ToMultiplyType; + signal m2 : MultiplyToExecute1Type; begin multiply_0: entity work.multiply generic map (PIPELINE_DEPTH => pipeline_depth) @@ -58,7 +58,6 @@ begin wait for clk_period; assert m2.valid = '1'; - assert m2.write_reg_enable = '1'; assert m2.write_reg_nr = "10001"; assert m2.write_reg_data = x"0000000001111000"; assert m2.rc = '0'; @@ -76,7 +75,6 @@ begin wait for clk_period * (pipeline_depth-1); assert m2.valid = '1'; - assert m2.write_reg_enable = '1'; assert m2.write_reg_nr = "10001"; assert m2.write_reg_data = x"0000000001111000"; assert m2.rc = '1'; diff --git a/writeback.vhdl b/writeback.vhdl index 8582166..1323f71 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -12,7 +12,6 @@ entity writeback is e_in : in Execute1ToWritebackType; l_in : in DcacheToWritebackType; - m_in : in MultiplyToWritebackType; d_in : in DividerToWritebackType; w_out : out WritebackToRegisterFileType; @@ -67,32 +66,28 @@ begin begin x := "" & e_in.valid; y := "" & l_in.valid; - z := "" & m_in.valid; - w := "" & d_in.valid; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure; + z := "" & d_in.valid; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; x := "" & e_in.write_enable; y := "" & l_in.write_enable; - z := "" & m_in.write_reg_enable; - w := "" & d_in.write_reg_enable; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure; + z := "" & d_in.write_reg_enable; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; w := "" & e_in.write_cr_enable; x := "" & (e_in.write_enable and e_in.rc); - y := "" & (m_in.valid and m_in.rc); z := "" & (d_in.valid and d_in.rc); - assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity 
failure; + assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(z))) <= 1 severity failure; x := "" & e_in.write_xerc_enable; - y := "" & m_in.write_xerc_enable; z := "" & D_in.write_xerc_enable; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; + assert (to_integer(unsigned(x)) + to_integer(unsigned(z))) <= 1 severity failure; w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; complete_out <= '0'; - if e_in.valid = '1' or l_in.valid = '1' or m_in.valid = '1' or d_in.valid = '1' then + if e_in.valid = '1' or l_in.valid = '1' or d_in.valid = '1' then complete_out <= '1'; end if; @@ -143,19 +138,6 @@ begin xe := l_in.xerc; end if; - if m_in.write_reg_enable = '1' then - w_out.write_enable <= '1'; - w_out.write_reg <= gpr_to_gspr(m_in.write_reg_nr); - data_in <= m_in.write_reg_data; - rc <= m_in.rc; - xe := m_in.xerc; - end if; - - if m_in.write_xerc_enable = '1' then - c_out.write_xerc_enable <= '1'; - c_out.write_xerc_data <= m_in.xerc; - end if; - if d_in.write_reg_enable = '1' then w_out.write_enable <= '1'; w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr); From 39d18d27388ee97ef598e8ee5ce73d30db257b0a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 12 Dec 2019 08:47:42 +1100 Subject: [PATCH 02/10] Make divider hang off the side of execute1 With this, the divider is a unit that execute1 sends operands to and which sends its results back to execute1, which then sends them to writeback. Execute1 now sends a stall signal when it gets a divide or modulus instruction until it gets a valid signal back from the divider. Divide and modulus instructions are no longer marked as single-issue. The data formatting step that used to be done in decode2 for div and mod instructions is now done in execute1. We also do the absolute value operation in that same cycle instead of taking an extra cycle inside the divider for signed operations with a negative operand. 
Signed-off-by: Paul Mackerras --- Makefile | 4 +- common.vhdl | 22 +++++------ core.vhdl | 16 +------- decode1.vhdl | 40 ++++++++++---------- decode2.vhdl | 56 +--------------------------- decode_types.vhdl | 4 +- divider.vhdl | 25 ++----------- divider_tb.vhdl | 43 +++++++++++++-------- execute1.vhdl | 95 ++++++++++++++++++++++++++++++++++++++++++++++- writeback.vhdl | 29 ++------------- 10 files changed, 165 insertions(+), 169 deletions(-) diff --git a/Makefile b/Makefile index 720e8d5..939f48e 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ common.o: decode_types.o control.o: gpr_hazard.o cr_hazard.o common.o sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o -core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o divider.o +core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o core_debug.o: common.o countzero.o: countzero_tb.o: common.o glibc_random.o countzero.o @@ -26,7 +26,7 @@ crhelpers.o: common.o decode1.o: common.o decode_types.o decode2.o: decode_types.o common.o helpers.o insn_helpers.o control.o decode_types.o: -execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o +execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o divider.o fetch1.o: common.o fetch2.o: common.o wishbone_types.o glibc_random_helpers.o: diff --git a/common.vhdl b/common.vhdl index 9c18230..1d0bbac 100644 --- a/common.vhdl +++ b/common.vhdl @@ -145,7 +145,7 @@ package common is oe => '0', is_32bit => '0', xerc => xerc_init, others => (others => '0')); - type Decode2ToDividerType is record + type Execute1ToDividerType is record valid: std_ulogic; write_reg: gpr_index_t; 
dividend: std_ulogic_vector(63 downto 0); @@ -154,14 +154,15 @@ package common is is_32bit: std_ulogic; is_extended: std_ulogic; is_modulus: std_ulogic; + neg_result: std_ulogic; rc: std_ulogic; oe: std_ulogic; xerc: xer_common_t; end record; - constant Decode2ToDividerInit: Decode2ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', - is_extended => '0', is_modulus => '0', - rc => '0', oe => '0', xerc => xerc_init, - others => (others => '0')); + constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', + is_extended => '0', is_modulus => '0', + rc => '0', oe => '0', xerc => xerc_init, + neg_result => '0', others => (others => '0')); type Decode2ToRegisterFileType is record read1_enable : std_ulogic; @@ -275,20 +276,19 @@ package common is xerc => xerc_init, others => (others => '0')); - type DividerToWritebackType is record + type DividerToExecute1Type is record valid: std_ulogic; - write_reg_enable : std_ulogic; write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); write_xerc_enable : std_ulogic; xerc : xer_common_t; rc: std_ulogic; end record; - constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0', - rc => '0', write_xerc_enable => '0', - xerc => xerc_init, - others => (others => '0')); + constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', + rc => '0', write_xerc_enable => '0', + xerc => xerc_init, + others => (others => '0')); type WritebackToRegisterFileType is record write_reg : gspr_index_t; diff --git a/core.vhdl b/core.vhdl index 71c10b3..a38cf36 100644 --- a/core.vhdl +++ b/core.vhdl @@ -63,10 +63,6 @@ architecture behave of core is signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal dcache_to_writeback: DcacheToWritebackType; - -- divider signals - signal decode2_to_divider: Decode2ToDividerType; - signal divider_to_writeback: DividerToWritebackType; - -- local signals signal fetch1_stall_in : 
std_ulogic; signal icache_stall_out : std_ulogic; @@ -111,7 +107,6 @@ architecture behave of core is attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN); - attribute keep_hierarchy of divider_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN); @@ -192,7 +187,6 @@ begin d_in => decode1_to_decode2, e_out => decode2_to_execute1, l_out => decode2_to_loadstore1, - d_out => decode2_to_divider, r_in => register_file_to_decode2, r_out => decode2_to_register_file, c_in => cr_file_to_decode2, @@ -228,6 +222,7 @@ begin execute1_0: entity work.execute1 port map ( clk => clk, + rst => core_rst, flush_out => flush, stall_out => ex1_stall_out, e_in => decode2_to_execute1, @@ -259,20 +254,11 @@ begin wishbone_out => wishbone_data_out ); - divider_0: entity work.divider - port map ( - clk => clk, - rst => core_rst, - d_in => decode2_to_divider, - d_out => divider_to_writeback - ); - writeback_0: entity work.writeback port map ( clk => clk, e_in => execute1_to_writeback, l_in => dcache_to_writeback, - d_in => divider_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, complete_out => complete diff --git a/decode1.vhdl b/decode1.vhdl index 4e1d063..6ac3f01 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -160,22 +160,22 @@ architecture behaviour of decode1 is 2#0100010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt 2#0011110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst -- 2#1111110110# dcbz - 
2#0110001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeu - 2#1110001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeuo - 2#0110001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweu - 2#1110001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweuo - 2#0110101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divde - 2#1110101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeo - 2#0110101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwe - 2#1110101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweo - 2#0111001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdu - 2#1111001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divduo - 2#0111001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwu - 2#1111001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwuo - 2#0111101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divd - 2#1111101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdo - 
2#0111101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divw - 2#1111101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwo + 2#0110001001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdeu + 2#1110001001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdeuo + 2#0110001011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divweu + 2#1110001011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divweuo + 2#0110101001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divde + 2#1110101001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divdeo + 2#0110101011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divwe + 2#1110101011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divweo + 2#0111001001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdu + 2#1111001001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divduo + 2#0111001011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divwu + 2#1111001011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- 
divwuo + 2#0111101001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divd + 2#1111101001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divdo + 2#0111101011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divw + 2#1111101011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divwo 2#0100011100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- eqv 2#1110111010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsb 2#1110011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsh @@ -238,10 +238,10 @@ architecture behaviour of decode1 is -- 2#1001000000# mcrxrx 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf 2#0101010011# => (ALU, OP_MFSPR, SPR, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr - 2#0100001001# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modud - 2#0100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- moduw - 2#1100001001# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsd - 2#1100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw + 2#0100001001# => (ALU, OP_MOD, RA, RB, 
NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- modud + 2#0100001011# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- moduw + 2#1100001001# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- modsd + 2#1100001011# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0'), -- modsw 2#0010010000# => (ALU, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf 2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr 2#0001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd diff --git a/decode2.vhdl b/decode2.vhdl index 2da5c41..a95dae3 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -24,7 +24,6 @@ entity decode2 is d_in : in Decode1ToDecode2Type; e_out : out Decode2ToExecute1Type; - d_out : out Decode2ToDividerType; l_out : out Decode2ToLoadstore1Type; r_in : in RegisterFileToDecode2Type; @@ -38,7 +37,6 @@ end entity decode2; architecture behaviour of decode2 is type reg_type is record e : Decode2ToExecute1Type; - d : Decode2ToDividerType; l : Decode2ToLoadstore1Type; end record; @@ -236,7 +234,7 @@ begin decode2_0: process(clk) begin if rising_edge(clk) then - if rin.e.valid = '1' or rin.l.valid = '1' or rin.d.valid = '1' then + if rin.e.valid = '1' or rin.l.valid = '1' then report "execute " & to_hstring(rin.e.nia); end if; r <= rin; @@ -257,14 +255,12 @@ begin variable decoded_reg_b : decode_input_reg_t; variable decoded_reg_c : decode_input_reg_t; variable decoded_reg_o : decode_output_reg_t; - variable signed_division: std_ulogic; variable length : 
std_ulogic_vector(3 downto 0); begin v := r; v.e := Decode2ToExecute1Init; v.l := Decode2ToLoadStore1Init; - v.d := Decode2ToDividerInit; mul_a := (others => '0'); mul_b := (others => '0'); @@ -319,51 +315,6 @@ begin v.e.insn := d_in.insn; v.e.data_len := length; - -- divide unit - -- PPC divide and modulus instruction words have these bits in - -- the bottom 11 bits: o1dns 010t1 r - -- where o = OE for div instrs, signedness for mod instrs - -- d = 1 for div*, 0 for mod* - -- n = 1 for normal, 0 for extended (dividend << 32/64) - -- s = 1 for signed, 0 for unsigned (for div*) - -- t = 1 for 32-bit, 0 for 64-bit - -- r = RC bit (record condition code) - v.d.write_reg := gspr_to_gpr(decoded_reg_o.reg); - v.d.is_modulus := not d_in.insn(8); - v.d.is_32bit := d_in.insn(2); - if d_in.insn(8) = '1' then - signed_division := d_in.insn(6); - else - signed_division := d_in.insn(10); - end if; - v.d.is_signed := signed_division; - if d_in.insn(2) = '0' then - -- 64-bit forms - if d_in.insn(8) = '1' and d_in.insn(7) = '0' then - v.d.is_extended := '1'; - end if; - v.d.dividend := decoded_reg_a.data; - v.d.divisor := decoded_reg_b.data; - else - -- 32-bit forms - if d_in.insn(8) = '1' and d_in.insn(7) = '0' then -- extended forms - v.d.dividend := decoded_reg_a.data(31 downto 0) & x"00000000"; - elsif signed_division = '1' and decoded_reg_a.data(31) = '1' then - -- sign extend to 64 bits - v.d.dividend := x"ffffffff" & decoded_reg_a.data(31 downto 0); - else - v.d.dividend := x"00000000" & decoded_reg_a.data(31 downto 0); - end if; - if signed_division = '1' and decoded_reg_b.data(31) = '1' then - v.d.divisor := x"ffffffff" & decoded_reg_b.data(31 downto 0); - else - v.d.divisor := x"00000000" & decoded_reg_b.data(31 downto 0); - end if; - end if; - v.d.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.d.xerc := c_in.read_xerc_data; - v.d.oe := decode_oe(d_in.decode.rc, d_in.insn); - -- load/store unit v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg); v.l.addr1 := 
decoded_reg_a.data; @@ -402,15 +353,12 @@ begin cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); v.e.valid := '0'; - v.d.valid := '0'; v.l.valid := '0'; case d_in.decode.unit is when ALU => v.e.valid := control_valid_out; when LDST => v.l.valid := control_valid_out; - when DIV => - v.d.valid := control_valid_out; when NONE => v.e.valid := control_valid_out; v.e.insn_type := OP_ILLEGAL; @@ -419,7 +367,6 @@ begin if rst = '1' then v.e := Decode2ToExecute1Init; v.l := Decode2ToLoadStore1Init; - v.d := Decode2ToDividerInit; end if; -- Update registers @@ -428,6 +375,5 @@ begin -- Update outputs e_out <= r.e; l_out <= r.l; - d_out <= r.d; end process; end architecture behaviour; diff --git a/decode_types.vhdl b/decode_types.vhdl index 9860406..fdc1e6e 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -8,7 +8,7 @@ package decode_types is OP_CNTZ, OP_CRAND, OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC, OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, - OP_DCBZ, OP_DIV, OP_EXTS, + OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, OP_MCRF, OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFSPR, OP_MOD, @@ -46,7 +46,7 @@ package decode_types is constant TOO_OFFSET : integer := 0; - type unit_t is (NONE, ALU, LDST, DIV); + type unit_t is (NONE, ALU, LDST); type length_t is (NONE, is1B, is2B, is4B, is8B); type decode_rom_t is record diff --git a/divider.vhdl b/divider.vhdl index affab85..33d2a0d 100644 --- a/divider.vhdl +++ b/divider.vhdl @@ -10,8 +10,8 @@ entity divider is port ( clk : in std_logic; rst : in std_logic; - d_in : in Decode2ToDividerType; - d_out : out DividerToWritebackType + d_in : in Execute1ToDividerType; + d_out : out DividerToExecute1Type ); end entity divider; @@ -23,7 +23,6 @@ architecture behaviour of divider is signal sresult : std_ulogic_vector(64 downto 0); signal oresult : std_ulogic_vector(63 downto 0); signal 
running : std_ulogic; - signal signcheck : std_ulogic; signal count : unsigned(6 downto 0); signal neg_result : std_ulogic; signal is_modulus : std_ulogic; @@ -48,7 +47,7 @@ begin running <= '0'; count <= "0000000"; elsif d_in.valid = '1' then - if d_in.is_extended = '1' and not (d_in.is_signed = '1' and d_in.dividend(63) = '1') then + if d_in.is_extended = '1' then dend <= '0' & d_in.dividend & x"0000000000000000"; else dend <= '0' & x"0000000000000000" & d_in.dividend; @@ -56,7 +55,7 @@ begin div <= unsigned(d_in.divisor); quot <= (others => '0'); write_reg <= d_in.write_reg; - neg_result <= '0'; + neg_result <= d_in.neg_result; is_modulus <= d_in.is_modulus; extended <= d_in.is_extended; is_32bit <= d_in.is_32bit; @@ -68,20 +67,6 @@ begin running <= '1'; overflow <= '0'; ovf32 <= '0'; - signcheck <= d_in.is_signed and (d_in.dividend(63) or d_in.divisor(63)); - elsif signcheck = '1' then - signcheck <= '0'; - neg_result <= dend(63) xor (div(63) and not is_modulus); - if dend(63) = '1' then - if extended = '1' then - dend <= '0' & std_ulogic_vector(- signed(dend(63 downto 0))) & x"0000000000000000"; - else - dend <= '0' & x"0000000000000000" & std_ulogic_vector(- signed(dend(63 downto 0))); - end if; - end if; - if div(63) = '1' then - div <= unsigned(- signed(div)); - end if; elsif running = '1' then if count = "0111111" then running <= '0'; @@ -151,12 +136,10 @@ begin if rising_edge(clk) then d_out.valid <= '0'; d_out.write_reg_data <= oresult; - d_out.write_reg_enable <= '0'; d_out.write_xerc_enable <= '0'; d_out.xerc <= xerc; if count = "1000000" then d_out.valid <= '1'; - d_out.write_reg_enable <= '1'; d_out.write_xerc_enable <= oe; -- We must test oe because the RC update code in writeback diff --git a/divider_tb.vhdl b/divider_tb.vhdl index 5f809bb..8151315 100644 --- a/divider_tb.vhdl +++ b/divider_tb.vhdl @@ -16,8 +16,8 @@ architecture behave of divider_tb is signal rst : std_ulogic; constant clk_period : time := 10 ns; - signal d1 : Decode2ToDividerType; 
- signal d2 : DividerToWritebackType; + signal d1 : Execute1ToDividerType; + signal d2 : DividerToExecute1Type; begin divider_0: entity work.divider port map (clk => clk, rst => rst, d_in => d1, d_out => d2); @@ -50,6 +50,7 @@ begin d1.is_32bit <= '0'; d1.is_extended <= '0'; d1.is_modulus <= '0'; + d1.neg_result <= '0'; d1.rc <= '0'; wait for clk_period; @@ -65,7 +66,6 @@ begin end loop; assert d2.valid = '1'; - assert d2.write_reg_enable = '1'; assert d2.write_reg_nr = "10001"; assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data); assert d2.rc = '0'; @@ -89,7 +89,6 @@ begin end loop; assert d2.valid = '1'; - assert d2.write_reg_enable = '1'; assert d2.write_reg_nr = "10001"; assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data); assert d2.rc = '1'; @@ -105,9 +104,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.valid <= '1'; wait for clk_period; @@ -142,6 +142,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.valid <= '1'; wait for clk_period; @@ -173,9 +174,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.is_extended <= '1'; d1.valid <= '1'; @@ -216,6 +218,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + 
d1.neg_result <= '0'; d1.is_extended <= '1'; d1.valid <= '1'; @@ -250,9 +253,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -289,6 +293,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -322,9 +327,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 32)) & x"00000000"; rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -365,6 +371,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -398,9 +405,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63); d1.is_extended <= '0'; d1.is_32bit <= '0'; d1.is_modulus <= '1'; @@ -438,6 +446,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '0'; 
d1.is_modulus <= '1'; @@ -472,9 +481,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63); d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.is_modulus <= '1'; @@ -517,6 +527,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.is_modulus <= '1'; diff --git a/execute1.vhdl b/execute1.vhdl index 710044f..7bcffdc 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -13,6 +13,7 @@ use work.ppc_fx_insns.all; entity execute1 is port ( clk : in std_ulogic; + rst : in std_ulogic; -- asynchronous flush_out : out std_ulogic; @@ -36,6 +37,7 @@ architecture behaviour of execute1 is lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; + div_in_progress : std_ulogic; end record; signal r, rin : reg_type; @@ -53,6 +55,10 @@ architecture behaviour of execute1 is signal x_to_multiply: Execute1ToMultiplyType; signal multiply_to_x: MultiplyToExecute1Type; + -- divider signals + signal x_to_divider: Execute1ToDividerType; + signal divider_to_x: DividerToExecute1Type; + procedure set_carry(e: inout Execute1ToWritebackType; carry32 : in std_ulogic; carry : in std_ulogic) is @@ -135,6 +141,14 @@ begin m_out => multiply_to_x ); + divider_0: entity work.divider + port map ( + clk => clk, + rst => rst, + d_in => x_to_divider, + d_out => divider_to_x + ); + execute1_0: process(clk) begin if rising_edge(clk) then @@ -171,6 +185,8 @@ begin variable l : std_ulogic; variable next_nia : std_ulogic_vector(63 downto 0); variable carry_32, carry_64 : std_ulogic; + variable sign1, sign2 : std_ulogic; + variable abs1, abs2 : signed(63 downto 0); 
begin result := (others => '0'); result_with_carry := (others => '0'); @@ -217,6 +233,7 @@ begin v.lr_update := '0'; v.mul_in_progress := '0'; + v.div_in_progress := '0'; -- signals to multiply unit x_to_multiply <= Execute1ToMultiplyInit; @@ -249,6 +266,59 @@ begin end if; end if; + -- signals to divide unit + sign1 := '0'; + sign2 := '0'; + if e_in.is_signed = '1' then + if e_in.is_32bit = '1' then + sign1 := e_in.read_data1(31); + sign2 := e_in.read_data2(31); + else + sign1 := e_in.read_data1(63); + sign2 := e_in.read_data2(63); + end if; + end if; + -- take absolute values + if sign1 = '0' then + abs1 := signed(e_in.read_data1); + else + abs1 := - signed(e_in.read_data1); + end if; + if sign2 = '0' then + abs2 := signed(e_in.read_data2); + else + abs2 := - signed(e_in.read_data2); + end if; + + x_to_divider <= Execute1ToDividerInit; + x_to_divider.write_reg <= gspr_to_gpr(e_in.write_reg); + x_to_divider.is_signed <= e_in.is_signed; + x_to_divider.is_32bit <= e_in.is_32bit; + if e_in.insn_type = OP_MOD then + x_to_divider.is_modulus <= '1'; + end if; + x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus); + x_to_divider.rc <= e_in.rc; + x_to_divider.oe <= e_in.oe; + x_to_divider.xerc <= v.e.xerc; + if e_in.is_32bit = '0' then + -- 64-bit forms + if e_in.insn_type = OP_DIVE then + x_to_divider.is_extended <= '1'; + end if; + x_to_divider.dividend <= std_ulogic_vector(abs1); + x_to_divider.divisor <= std_ulogic_vector(abs2); + else + -- 32-bit forms + x_to_divider.is_extended <= '0'; + if e_in.insn_type = OP_DIVE then -- extended forms + x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000"; + else + x_to_divider.dividend <= x"00000000" & std_ulogic_vector(abs1(31 downto 0)); + end if; + x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0)); + end if; + ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); @@ -550,13 +620,19 @@ begin when 
OP_ICBI => icache_inval <= '1'; - when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => + when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => v.e.valid := '0'; v.mul_in_progress := '1'; stall_out <= '1'; x_to_multiply.valid <= '1'; - when others => + when OP_DIV | OP_DIVE | OP_MOD => + v.e.valid := '0'; + v.div_in_progress := '1'; + stall_out <= '1'; + x_to_divider.valid <= '1'; + + when others => terminate_out <= '1'; report "illegal"; end case; @@ -603,6 +679,21 @@ begin stall_out <= '1'; v.mul_in_progress := '1'; end if; + elsif r.div_in_progress = '1' then + if divider_to_x.valid = '1' then + v.e.write_reg := gpr_to_gspr(divider_to_x.write_reg_nr); + result := divider_to_x.write_reg_data; + result_en := '1'; + v.e.rc := divider_to_x.rc; + v.e.xerc := divider_to_x.xerc; + v.e.write_xerc_enable := divider_to_x.write_xerc_enable; + v.e.valid := '1'; + v.e.write_len := x"8"; + v.e.sign_extend := '0'; + else + stall_out <= '1'; + v.div_in_progress := '1'; + end if; end if; v.e.write_data := result; diff --git a/writeback.vhdl b/writeback.vhdl index 1323f71..08efe91 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -12,7 +12,6 @@ entity writeback is e_in : in Execute1ToWritebackType; l_in : in DcacheToWritebackType; - d_in : in DividerToWritebackType; w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType; @@ -66,28 +65,21 @@ begin begin x := "" & e_in.valid; y := "" & l_in.valid; - z := "" & d_in.valid; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; x := "" & e_in.write_enable; y := "" & l_in.write_enable; - z := "" & d_in.write_reg_enable; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; w := "" & e_in.write_cr_enable; x := "" & (e_in.write_enable and e_in.rc); - z := "" & 
(d_in.valid and d_in.rc); - assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(z))) <= 1 severity failure; - - x := "" & e_in.write_xerc_enable; - z := "" & D_in.write_xerc_enable; - assert (to_integer(unsigned(x)) + to_integer(unsigned(z))) <= 1 severity failure; + assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure; w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; complete_out <= '0'; - if e_in.valid = '1' or l_in.valid = '1' or d_in.valid = '1' then + if e_in.valid = '1' or l_in.valid = '1' then complete_out <= '1'; end if; @@ -138,19 +130,6 @@ begin xe := l_in.xerc; end if; - if d_in.write_reg_enable = '1' then - w_out.write_enable <= '1'; - w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr); - data_in <= d_in.write_reg_data; - rc <= d_in.rc; - xe := d_in.xerc; - end if; - - if d_in.write_xerc_enable = '1' then - c_out.write_xerc_enable <= '1'; - c_out.write_xerc_data <= d_in.xerc; - end if; - -- shift and byte-reverse data bytes for i in 0 to 7 loop k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); From c9a2076dd3c2e26b3d9ddef72cc6e471c503b7d2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 12 Dec 2019 11:21:25 +1100 Subject: [PATCH 03/10] execute1: Remember dest GPR, RC, OE, XER for slow operations For multiply and divide operations, execute1 now records the destination GPR number, RC and OE from the instruction, and the XER value. This means that the multiply and divide units don't need to record those values and then send them back to execute1. This makes the interface to those units a bit simpler. They simply report an overflow signal along with the result value, and execute1 takes care of updating XER if necessary. 
Signed-off-by: Paul Mackerras --- common.vhdl | 33 +++++------------------- decode2.vhdl | 4 ++- divider.vhdl | 25 +----------------- divider_tb.vhdl | 7 ----- execute1.vhdl | 66 ++++++++++++++++++++++++------------------------ multiply.vhdl | 29 ++------------------- multiply_tb.vhdl | 7 ----- 7 files changed, 45 insertions(+), 126 deletions(-) diff --git a/common.vhdl b/common.vhdl index 1d0bbac..639f0f7 100644 --- a/common.vhdl +++ b/common.vhdl @@ -133,21 +133,16 @@ package common is type Execute1ToMultiplyType is record valid: std_ulogic; insn_type: insn_type_t; - write_reg: gpr_index_t; data1: std_ulogic_vector(64 downto 0); data2: std_ulogic_vector(64 downto 0); - rc: std_ulogic; - oe: std_ulogic; is_32bit: std_ulogic; - xerc: xer_common_t; end record; - constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0', - oe => '0', is_32bit => '0', xerc => xerc_init, + constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, + is_32bit => '0', others => (others => '0')); type Execute1ToDividerType is record valid: std_ulogic; - write_reg: gpr_index_t; dividend: std_ulogic_vector(63 downto 0); divisor: std_ulogic_vector(63 downto 0); is_signed: std_ulogic; @@ -155,13 +150,9 @@ package common is is_extended: std_ulogic; is_modulus: std_ulogic; neg_result: std_ulogic; - rc: std_ulogic; - oe: std_ulogic; - xerc: xer_common_t; end record; constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', is_extended => '0', is_modulus => '0', - rc => '0', oe => '0', xerc => xerc_init, neg_result => '0', others => (others => '0')); type Decode2ToRegisterFileType is record @@ -264,30 +255,18 @@ package common is type MultiplyToExecute1Type is record valid: std_ulogic; - - write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); - write_xerc_enable : std_ulogic; - xerc : xer_common_t; - rc: std_ulogic; + overflow : 
std_ulogic; end record; - constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', - rc => '0', write_xerc_enable => '0', - xerc => xerc_init, + constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0', others => (others => '0')); type DividerToExecute1Type is record valid: std_ulogic; - - write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); - write_xerc_enable : std_ulogic; - xerc : xer_common_t; - rc: std_ulogic; + overflow : std_ulogic; end record; - constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', - rc => '0', write_xerc_enable => '0', - xerc => xerc_init, + constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', overflow => '0', others => (others => '0')); type WritebackToRegisterFileType is record diff --git a/decode2.vhdl b/decode2.vhdl index a95dae3..6cd4574 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -300,7 +300,9 @@ begin v.e.read_data3 := decoded_reg_c.data; v.e.write_reg := decoded_reg_o.reg; v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); + if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then + v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); + end if; v.e.cr := c_in.read_cr_data; v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; diff --git a/divider.vhdl b/divider.vhdl index 33d2a0d..aef65a4 100644 --- a/divider.vhdl +++ b/divider.vhdl @@ -29,13 +29,9 @@ architecture behaviour of divider is signal is_32bit : std_ulogic; signal extended : std_ulogic; signal is_signed : std_ulogic; - signal rc : std_ulogic; - signal write_reg : std_ulogic_vector(4 downto 0); signal overflow : std_ulogic; signal ovf32 : std_ulogic; signal did_ovf : std_ulogic; - signal oe : std_ulogic; - signal xerc : xer_common_t; begin divider_0: process(clk) begin @@ -54,15 +50,11 @@ begin end if; div <= unsigned(d_in.divisor); quot <= (others => '0'); - write_reg 
<= d_in.write_reg; neg_result <= d_in.neg_result; is_modulus <= d_in.is_modulus; extended <= d_in.is_extended; is_32bit <= d_in.is_32bit; is_signed <= d_in.is_signed; - rc <= d_in.rc; - oe <= d_in.oe; - xerc <= d_in.xerc; count <= "1111111"; running <= '1'; overflow <= '0'; @@ -98,9 +90,6 @@ begin divider_1: process(all) begin - d_out.write_reg_nr <= write_reg; - d_out.rc <= rc; - if is_modulus = '1' then result <= dend(128 downto 65); else @@ -136,21 +125,9 @@ begin if rising_edge(clk) then d_out.valid <= '0'; d_out.write_reg_data <= oresult; - d_out.write_xerc_enable <= '0'; - d_out.xerc <= xerc; + d_out.overflow <= did_ovf; if count = "1000000" then d_out.valid <= '1'; - d_out.write_xerc_enable <= oe; - - -- We must test oe because the RC update code in writeback - -- will use the xerc value to set CR0:SO so we must not clobber - -- xerc if OE wasn't set. - -- - if oe = '1' then - d_out.xerc.ov <= did_ovf; - d_out.xerc.ov32 <= did_ovf; - d_out.xerc.so <= xerc.so or did_ovf; - end if; end if; end if; end process; diff --git a/divider_tb.vhdl b/divider_tb.vhdl index 8151315..95156a3 100644 --- a/divider_tb.vhdl +++ b/divider_tb.vhdl @@ -43,7 +43,6 @@ begin rst <= '0'; d1.valid <= '1'; - d1.write_reg <= "10001"; d1.dividend <= x"0000000010001000"; d1.divisor <= x"0000000000001111"; d1.is_signed <= '0'; @@ -51,7 +50,6 @@ begin d1.is_extended <= '0'; d1.is_modulus <= '0'; d1.neg_result <= '0'; - d1.rc <= '0'; wait for clk_period; assert d2.valid = '0'; @@ -66,15 +64,12 @@ begin end loop; assert d2.valid = '1'; - assert d2.write_reg_nr = "10001"; assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data); - assert d2.rc = '0'; wait for clk_period; assert d2.valid = '0' report "valid"; d1.valid <= '1'; - d1.rc <= '1'; wait for clk_period; assert d2.valid = '0' report "valid"; @@ -89,9 +84,7 @@ begin end loop; assert d2.valid = '1'; - assert d2.write_reg_nr = "10001"; assert d2.write_reg_data = x"000000000000f001" report "result " & 
to_hstring(d2.write_reg_data); - assert d2.rc = '1'; wait for clk_period; assert d2.valid = '0'; diff --git a/execute1.vhdl b/execute1.vhdl index 7bcffdc..94845d8 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -38,6 +38,10 @@ architecture behaviour of execute1 is next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; div_in_progress : std_ulogic; + slow_op_dest : gpr_index_t; + slow_op_rc : std_ulogic; + slow_op_oe : std_ulogic; + slow_op_xerc : xer_common_t; end record; signal r, rin : reg_type; @@ -187,6 +191,7 @@ begin variable carry_32, carry_64 : std_ulogic; variable sign1, sign2 : std_ulogic; variable abs1, abs2 : signed(63 downto 0); + variable overflow : std_ulogic; begin result := (others => '0'); result_with_carry := (others => '0'); @@ -238,12 +243,6 @@ begin -- signals to multiply unit x_to_multiply <= Execute1ToMultiplyInit; x_to_multiply.insn_type <= e_in.insn_type; - x_to_multiply.write_reg <= gspr_to_gpr(e_in.write_reg); - x_to_multiply.rc <= e_in.rc; - x_to_multiply.xerc <= v.e.xerc; - if e_in.insn_type = OP_MUL_L64 then - x_to_multiply.oe <= e_in.oe; - end if; x_to_multiply.is_32bit <= e_in.is_32bit; if e_in.is_32bit = '1' then @@ -291,16 +290,12 @@ begin end if; x_to_divider <= Execute1ToDividerInit; - x_to_divider.write_reg <= gspr_to_gpr(e_in.write_reg); x_to_divider.is_signed <= e_in.is_signed; x_to_divider.is_32bit <= e_in.is_32bit; if e_in.insn_type = OP_MOD then x_to_divider.is_modulus <= '1'; end if; x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus); - x_to_divider.rc <= e_in.rc; - x_to_divider.oe <= e_in.oe; - x_to_divider.xerc <= v.e.xerc; if e_in.is_32bit = '0' then -- 64-bit forms if e_in.insn_type = OP_DIVE then @@ -342,6 +337,10 @@ begin v.e.write_reg := e_in.write_reg; v.e.write_len := x"8"; v.e.sign_extend := '0'; + v.slow_op_dest := gspr_to_gpr(e_in.write_reg); + v.slow_op_rc := e_in.rc; + v.slow_op_oe := e_in.oe; + v.slow_op_xerc := v.e.xerc; case_0: case e_in.insn_type is @@ -664,35 
+663,36 @@ begin v.e.write_len := x"8"; v.e.sign_extend := '0'; v.e.valid := '1'; - elsif r.mul_in_progress = '1' then - if multiply_to_x.valid = '1' then - v.e.write_reg := gpr_to_gspr(multiply_to_x.write_reg_nr); - result := multiply_to_x.write_reg_data; + elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then + if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or + (r.div_in_progress = '1' and divider_to_x.valid = '1') then + if r.mul_in_progress = '1' then + result := multiply_to_x.write_reg_data; + overflow := multiply_to_x.overflow; + else + result := divider_to_x.write_reg_data; + overflow := divider_to_x.overflow; + end if; result_en := '1'; - v.e.rc := multiply_to_x.rc; - v.e.xerc := multiply_to_x.xerc; - v.e.write_xerc_enable := multiply_to_x.write_xerc_enable; + v.e.write_reg := gpr_to_gspr(v.slow_op_dest); + v.e.rc := v.slow_op_rc; + v.e.xerc := v.slow_op_xerc; + v.e.write_xerc_enable := v.slow_op_oe; + -- We must test oe because the RC update code in writeback + -- will use the xerc value to set CR0:SO so we must not clobber + -- xerc if OE wasn't set. 
+ if v.slow_op_oe = '1' then + v.e.xerc.ov := overflow; + v.e.xerc.ov32 := overflow; + v.e.xerc.so := v.slow_op_xerc.so or overflow; + end if; v.e.valid := '1'; v.e.write_len := x"8"; v.e.sign_extend := '0'; else stall_out <= '1'; - v.mul_in_progress := '1'; - end if; - elsif r.div_in_progress = '1' then - if divider_to_x.valid = '1' then - v.e.write_reg := gpr_to_gspr(divider_to_x.write_reg_nr); - result := divider_to_x.write_reg_data; - result_en := '1'; - v.e.rc := divider_to_x.rc; - v.e.xerc := divider_to_x.xerc; - v.e.write_xerc_enable := divider_to_x.write_xerc_enable; - v.e.valid := '1'; - v.e.write_len := x"8"; - v.e.sign_extend := '0'; - else - stall_out <= '1'; - v.div_in_progress := '1'; + v.mul_in_progress := r.mul_in_progress; + v.div_in_progress := r.div_in_progress; end if; end if; diff --git a/multiply.vhdl b/multiply.vhdl index 714b844..959c114 100644 --- a/multiply.vhdl +++ b/multiply.vhdl @@ -25,19 +25,12 @@ architecture behaviour of multiply is valid : std_ulogic; insn_type : insn_type_t; data : signed(129 downto 0); - write_reg : std_ulogic_vector(4 downto 0); - rc : std_ulogic; - oe : std_ulogic; is_32bit : std_ulogic; - xerc : xer_common_t; end record; constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0', insn_type => OP_ILLEGAL, - rc => '0', oe => '0', is_32bit => '0', - xerc => xerc_init, - data => (others => '0'), - others => (others => '0')); + data => (others => '0')); type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage; constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit); @@ -69,11 +62,7 @@ begin v.multiply_pipeline(0).valid := m.valid; v.multiply_pipeline(0).insn_type := m.insn_type; v.multiply_pipeline(0).data := signed(m.data1) * signed(m.data2); - v.multiply_pipeline(0).write_reg := m.write_reg; - v.multiply_pipeline(0).rc := m.rc; - v.multiply_pipeline(0).oe := m.oe; v.multiply_pipeline(0).is_32bit := m.is_32bit; - 
v.multiply_pipeline(0).xerc := m.xerc; loop_0: for i in 1 to PIPELINE_DEPTH-1 loop v.multiply_pipeline(i) := r.multiply_pipeline(i-1); @@ -101,24 +90,10 @@ begin end case; m_out.write_reg_data <= d2; - m_out.write_reg_nr <= v.multiply_pipeline(PIPELINE_DEPTH-1).write_reg; - m_out.xerc <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc; + m_out.overflow <= ov; - -- Generate OV/OV32/SO when OE=1 if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then m_out.valid <= '1'; - m_out.rc <= v.multiply_pipeline(PIPELINE_DEPTH-1).rc; - m_out.write_xerc_enable <= v.multiply_pipeline(PIPELINE_DEPTH-1).oe; - - -- We must test oe because the RC update code in writeback - -- will use the xerc value to set CR0:SO so we must not clobber - -- xerc if OE wasn't set. - -- - if v.multiply_pipeline(PIPELINE_DEPTH-1).oe = '1' then - m_out.xerc.ov <= ov; - m_out.xerc.ov32 <= ov; - m_out.xerc.so <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc.so or ov; - end if; end if; rin <= v; diff --git a/multiply_tb.vhdl b/multiply_tb.vhdl index a76d739..8f1d795 100644 --- a/multiply_tb.vhdl +++ b/multiply_tb.vhdl @@ -40,10 +40,8 @@ begin m1.valid <= '1'; m1.insn_type <= OP_MUL_L64; - m1.write_reg <= "10001"; m1.data1 <= '0' & x"0000000000001000"; m1.data2 <= '0' & x"0000000000001111"; - m1.rc <= '0'; wait for clk_period; assert m2.valid = '0'; @@ -58,15 +56,12 @@ begin wait for clk_period; assert m2.valid = '1'; - assert m2.write_reg_nr = "10001"; assert m2.write_reg_data = x"0000000001111000"; - assert m2.rc = '0'; wait for clk_period; assert m2.valid = '0'; m1.valid <= '1'; - m1.rc <= '1'; wait for clk_period; assert m2.valid = '0'; @@ -75,9 +70,7 @@ begin wait for clk_period * (pipeline_depth-1); assert m2.valid = '1'; - assert m2.write_reg_nr = "10001"; assert m2.write_reg_data = x"0000000001111000"; - assert m2.rc = '1'; -- test mulld mulld_loop : for i in 0 to 1000 loop From d956846667ef558e51705c0d22152aa912629454 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 12 Dec 2019 15:25:45 +1100 
Subject: [PATCH 04/10] execute1: Move EXTS* instruction back into execute1 This moves the sign extension done by the extsb, extsh and extsw instructions back into execute1. This means that we no longer need any data formatting in writeback for results coming from execute1, so this modifies writeback so the data formatter inputs come directly from the loadstore unit output. The condition code updates for RC=1 form instructions are now done on the value from execute1 rather than the output of the data formatter, which should help timing. Signed-off-by: Paul Mackerras --- common.vhdl | 4 +--- execute1.vhdl | 24 ++++++++++++++---------- writeback.vhdl | 35 ++++++++++++++++------------------- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/common.vhdl b/common.vhdl index 639f0f7..8612389 100644 --- a/common.vhdl +++ b/common.vhdl @@ -240,16 +240,14 @@ package common is write_enable : std_ulogic; write_reg: gspr_index_t; write_data: std_ulogic_vector(63 downto 0); - write_len : std_ulogic_vector(3 downto 0); write_cr_enable : std_ulogic; write_cr_mask : std_ulogic_vector(7 downto 0); write_cr_data : std_ulogic_vector(31 downto 0); write_xerc_enable : std_ulogic; xerc : xer_common_t; - sign_extend: std_ulogic; end record; constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', write_enable => '0', - write_cr_enable => '0', sign_extend => '0', + write_cr_enable => '0', write_xerc_enable => '0', xerc => xerc_init, others => (others => '0')); diff --git a/execute1.vhdl b/execute1.vhdl index 94845d8..1991009 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -192,6 +192,7 @@ begin variable sign1, sign2 : std_ulogic; variable abs1, abs2 : signed(63 downto 0); variable overflow : std_ulogic; + variable negative : std_ulogic; begin result := (others => '0'); result_with_carry := (others => '0'); @@ -335,8 +336,6 @@ begin v.e.valid := '1'; v.e.write_reg := e_in.write_reg; - v.e.write_len := x"8"; - v.e.sign_extend := '0'; 
v.slow_op_dest := gspr_to_gpr(e_in.write_reg); v.slow_op_rc := e_in.rc; v.slow_op_oe := e_in.oe; @@ -438,10 +437,19 @@ begin when OP_CNTZ => result := countzero_result; result_en := '1'; - when OP_EXTS => - v.e.write_len := e_in.data_len; - v.e.sign_extend := '1'; - result := e_in.read_data3; + when OP_EXTS => + -- note data_len is a 1-hot encoding + negative := (e_in.data_len(0) and e_in.read_data3(7)) or + (e_in.data_len(1) and e_in.read_data3(15)) or + (e_in.data_len(2) and e_in.read_data3(31)); + result := (others => negative); + if e_in.data_len(2) = '1' then + result(31 downto 16) := e_in.read_data3(31 downto 16); + end if; + if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then + result(15 downto 8) := e_in.read_data3(15 downto 8); + end if; + result(7 downto 0) := e_in.read_data3(7 downto 0); result_en := '1'; when OP_ISEL => crbit := to_integer(unsigned(insn_bc(e_in.insn))); @@ -660,8 +668,6 @@ begin result_en := '1'; result := r.next_lr; v.e.write_reg := fast_spr_num(SPR_LR); - v.e.write_len := x"8"; - v.e.sign_extend := '0'; v.e.valid := '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or @@ -687,8 +693,6 @@ begin v.e.xerc.so := v.slow_op_xerc.so or overflow; end if; v.e.valid := '1'; - v.e.write_len := x"8"; - v.e.sign_extend := '0'; else stall_out <= '1'; v.mul_in_progress := r.mul_in_progress; diff --git a/writeback.vhdl b/writeback.vhdl index 08efe91..e53f46b 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -42,7 +42,6 @@ architecture behaviour of writeback is signal sign_extend : std_ulogic; signal negative : std_ulogic; signal second_word : std_ulogic; - signal zero : std_ulogic; begin writeback_0: process(clk) begin @@ -62,6 +61,8 @@ begin variable k : unsigned(3 downto 0); variable cf: std_ulogic_vector(3 downto 0); variable xe: xer_common_t; + variable zero : std_ulogic; + variable sign : std_ulogic; begin x := "" & e_in.valid; y := "" & l_in.valid; @@ -85,10 
+86,7 @@ begin rc <= '0'; brev_lenm1 <= "000"; - byte_offset <= "000"; - data_len <= x"8"; partial_write <= '0'; - sign_extend <= '0'; second_word <= '0'; xe := e_in.xerc; data_in <= (others => '0'); @@ -96,9 +94,6 @@ begin if e_in.write_enable = '1' then w_out.write_reg <= e_in.write_reg; w_out.write_enable <= '1'; - data_in <= e_in.write_data; - data_len <= unsigned(e_in.write_len); - sign_extend <= e_in.sign_extend; rc <= e_in.rc; end if; @@ -113,12 +108,11 @@ begin c_out.write_xerc_data <= e_in.xerc; end if; + sign_extend <= l_in.sign_extend; + data_len <= unsigned(l_in.write_len); + byte_offset <= unsigned(l_in.write_shift); if l_in.write_enable = '1' then w_out.write_reg <= gpr_to_gspr(l_in.write_reg); - data_in <= l_in.write_data; - data_len <= unsigned(l_in.write_len); - byte_offset <= unsigned(l_in.write_shift); - sign_extend <= l_in.sign_extend; if l_in.byte_reverse = '1' then brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1; end if; @@ -138,7 +132,7 @@ begin end loop; for i in 0 to 7 loop j := to_integer(perm(i)) * 8; - data_permuted(i * 8 + 7 downto i * 8) <= data_in(j + 7 downto j); + data_permuted(i * 8 + 7 downto i * 8) <= l_in.write_data(j + 7 downto j); end loop; -- If the data can arrive split over two cycles, this will be correct @@ -160,16 +154,12 @@ begin trim_ctl(i) <= '0' & (negative and sign_extend); end if; end loop; - zero <= not negative; for i in 0 to 7 loop case trim_ctl(i) is when "11" => data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8); when "10" => data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8); - if or data_permuted(i * 8 + 7 downto i * 8) /= '0' then - zero <= '0'; - end if; when "01" => data_trimmed(i * 8 + 7 downto i * 8) <= x"FF"; when others => @@ -178,14 +168,21 @@ begin end loop; -- deliver to regfile - w_out.write_data <= data_trimmed; + if l_in.write_enable = '1' then + w_out.write_data <= data_trimmed; + else + w_out.write_data <= e_in.write_data; + end if; 
-- Perform CR0 update for RC forms + -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data if rc = '1' then + sign := e_in.write_data(63); + zero := not (or e_in.write_data); c_out.write_cr_enable <= '1'; c_out.write_cr_mask <= num_to_fxm(0); - cf(3) := negative; - cf(2) := not negative and not zero; + cf(3) := sign; + cf(2) := not sign and not zero; cf(1) := zero; cf(0) := xe.so; c_out.write_cr_data(31 downto 28) <= cf; From d2ca625b3b9c98de607b2a56f8428c70ab343891 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 13 Dec 2019 15:48:54 +1100 Subject: [PATCH 05/10] execute: Do comparisons using the main adder This handles OP_CMP like a subtraction; the main adder computes ~RA + RB + 1, and the condition codes are computed from the results. A direct comparison of the two input operands is used to calculate the EQ bit of the condition result. The LT and GT bits are computed from the MSB of the subtraction result, the carry out from the subtraction, and the MSBs of the operands. For a 32-bit comparison, the 32-bit carry and bit 31 of the result and input operands are used; for a 64-bit comparison, the 64-bit carry and bit 63 of the operands and result are used. It turns out to be more convenient to use the 'signed' field of the decode table to distinguish signed from unsigned comparisons, rather than the insn_type. Therefore this uses OP_CMP for both cmp and cmpl, which also has the benefit of reducing the number of values in insn_type_t. Doing this saves over 200 slice LUTs on the Arty A7-100 and improves timing slightly as well. 
Signed-off-by: Paul Mackerras --- decode1.vhdl | 8 ++--- decode_types.vhdl | 2 +- execute1.vhdl | 87 ++++++++++++++++++++++++++++++----------------- 3 files changed, 60 insertions(+), 37 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 6ac3f01..0e42d1b 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -44,8 +44,8 @@ architecture behaviour of decode1 is 29 => (ALU, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andis. 18 => (ALU, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b 16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc - 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpi - 10 => (ALU, OP_CMPL, RA, CONST_UI, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli + 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi + 10 => (ALU, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzu 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lha @@ -145,10 +145,10 @@ architecture behaviour of decode1 is 2#0000011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- and 2#0000111100# => (ALU, OP_AND, 
NONE, RB, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- andc -- 2#0011111100# bperm - 2#0000000000# => (ALU, OP_CMP, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmp + 2#0000000000# => (ALU, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp 2#0111111100# => (ALU, OP_CMPB, NONE, RB, RS, RA, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb -- 2#0011100000# cmpeqb - 2#0000100000# => (ALU, OP_CMPL, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl + 2#0000100000# => (ALU, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl -- 2#0011000000# cmprb 2#0000111010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- cntlzd 2#0000011010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- cntlzw diff --git a/decode_types.vhdl b/decode_types.vhdl index fdc1e6e..82039bd 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -4,7 +4,7 @@ use ieee.std_logic_1164.all; package decode_types is type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD, OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG, - OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB, + OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB, OP_CNTZ, OP_CRAND, OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC, OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, diff --git a/execute1.vhdl b/execute1.vhdl index 1991009..6889a6a 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -193,6 +193,9 @@ begin variable abs1, abs2 : signed(63 downto 0); variable overflow : std_ulogic; variable negative : std_ulogic; + variable zerohi, 
zerolo : std_ulogic; + variable msb_a, msb_b : std_ulogic; + variable a_lt : std_ulogic; begin result := (others => '0'); result_with_carry := (others => '0'); @@ -348,7 +351,7 @@ begin report "illegal"; when OP_NOP => -- Do nothing - when OP_ADD => + when OP_ADD | OP_CMP => if e_in.invert_a = '0' then a_inv := e_in.read_data1; else @@ -359,15 +362,57 @@ begin result := result_with_carry(63 downto 0); carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32); carry_64 := result_with_carry(64); - if e_in.output_carry = '1' then - set_carry(v.e, carry_32, carry_64); - end if; - if e_in.oe = '1' then - set_ov(v.e, - calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)), - calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31))); - end if; - result_en := '1'; + if e_in.insn_type = OP_ADD then + if e_in.output_carry = '1' then + set_carry(v.e, carry_32, carry_64); + end if; + if e_in.oe = '1' then + set_ov(v.e, + calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)), + calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31))); + end if; + result_en := '1'; + else + -- CMP and CMPL instructions + -- Note, we have done RB - RA, not RA - RB + bf := insn_bf(e_in.insn); + l := insn_l(e_in.insn); + v.e.write_cr_enable := '1'; + crnum := to_integer(unsigned(bf)); + v.e.write_cr_mask := num_to_fxm(crnum); + zerolo := not (or (e_in.read_data1(31 downto 0) xor e_in.read_data2(31 downto 0))); + zerohi := not (or (e_in.read_data1(63 downto 32) xor e_in.read_data2(63 downto 32))); + if zerolo = '1' and (l = '0' or zerohi = '1') then + -- values are equal + newcrf := "001" & v.e.xerc.so; + else + if l = '1' then + -- 64-bit comparison + msb_a := e_in.read_data1(63); + msb_b := e_in.read_data2(63); + else + -- 32-bit comparison + msb_a := e_in.read_data1(31); + msb_b := e_in.read_data2(31); + end if; + if msb_a /= msb_b then + -- Subtraction might overflow, but + -- comparison is clear from MSB difference. 
+ -- for signed, 0 is greater; for unsigned, 1 is greater + a_lt := msb_a xnor e_in.is_signed; + else + -- Subtraction cannot overflow since MSBs are equal. + -- carry = 1 indicates RA is smaller (signed or unsigned) + a_lt := (not l and carry_32) or (l and carry_64); + end if; + newcrf := a_lt & not a_lt & '0' & v.e.xerc.so; + end if; + for i in 0 to 7 loop + lo := i*4; + hi := lo + 3; + v.e.write_cr_data(hi downto lo) := newcrf; + end loop; + end if; when OP_AND | OP_OR | OP_XOR => result := logical_result; result_en := '1'; @@ -412,28 +457,6 @@ begin when OP_CMPB => result := ppc_cmpb(e_in.read_data3, e_in.read_data2); result_en := '1'; - when OP_CMP => - bf := insn_bf(e_in.insn); - l := insn_l(e_in.insn); - v.e.write_cr_enable := '1'; - crnum := to_integer(unsigned(bf)); - v.e.write_cr_mask := num_to_fxm(crnum); - for i in 0 to 7 loop - lo := i*4; - hi := lo + 3; - v.e.write_cr_data(hi downto lo) := ppc_cmp(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so); - end loop; - when OP_CMPL => - bf := insn_bf(e_in.insn); - l := insn_l(e_in.insn); - v.e.write_cr_enable := '1'; - crnum := to_integer(unsigned(bf)); - v.e.write_cr_mask := num_to_fxm(crnum); - for i in 0 to 7 loop - lo := i*4; - hi := lo + 3; - v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so); - end loop; when OP_CNTZ => result := countzero_result; result_en := '1'; From 0c714f1be680ed36373be0ee9c15d30a7cc263b6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 13 Jan 2020 18:13:09 +1100 Subject: [PATCH 06/10] execute: Move popcnt and prty instructions into the logical unit This implements logic in the logical entity to calculate the results of the popcnt* and prty* instructions. We now have one insn_type_t value for the 3 popcnt variants and one for the two prty variants, using the length field of the decode_rom_t to distinguish between them. 
The implementations in logical.vhdl use recursive algorithms rather than the simple functions in ppc_fx_insns.vhdl. This gives a saving of about 140 slice LUTs on the A7-100 and improves timing slightly. Signed-off-by: Paul Mackerras --- decode1.vhdl | 10 ++++---- decode_types.vhdl | 4 ++-- execute1.vhdl | 24 ++++++++----------- logical.vhdl | 60 ++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 76 insertions(+), 22 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 0e42d1b..d2dbd96 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -263,11 +263,11 @@ architecture behaviour of decode1 is 2#0001111100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nor 2#0110111100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- or 2#0110011100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- orc - 2#0001111010# => (ALU, OP_POPCNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb - 2#0111111010# => (ALU, OP_POPCNTD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd - 2#0101111010# => (ALU, OP_POPCNTW, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw - 2#0010111010# => (ALU, OP_PRTYD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd - 2#0010011010# => (ALU, OP_PRTYW, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw + 2#0001111010# => (ALU, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb + 2#0111111010# => (ALU, OP_POPCNT, NONE, NONE, RS, RA, '0', '0',
'0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd + 2#0101111010# => (ALU, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw + 2#0010111010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd + 2#0010011010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw -- 2#0010000000# setb 2#0000011011# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- sld 2#0000011000# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- slw diff --git a/decode_types.vhdl b/decode_types.vhdl index 82039bd..21d8b68 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -14,8 +14,8 @@ package decode_types is OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFSPR, OP_MOD, OP_MTCRF, OP_MTSPR, OP_MUL_L64, OP_MUL_H64, OP_MUL_H32, OP_OR, - OP_POPCNTB, OP_POPCNTD, OP_POPCNTW, OP_PRTYD, - OP_PRTYW, OP_RLC, OP_RLCL, OP_RLCR, OP_SETB, + OP_POPCNT, OP_PRTY, + OP_RLC, OP_RLCL, OP_RLCR, OP_SETB, OP_SHL, OP_SHR, OP_SYNC, OP_TD, OP_TDI, OP_TW, OP_TWI, OP_XOR, OP_SIM_CONFIG diff --git a/execute1.vhdl b/execute1.vhdl index 6889a6a..5a626f8 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -54,6 +54,8 @@ architecture behaviour of execute1 is signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); signal countzero_result: std_ulogic_vector(63 downto 0); + signal popcnt_result: std_ulogic_vector(63 downto 0); + signal parity_result: std_ulogic_vector(63 downto 0); -- multiply signals signal x_to_multiply: Execute1ToMultiplyType; @@ -127,7 +129,10 @@ begin op => e_in.insn_type, invert_in => e_in.invert_a, invert_out => e_in.invert_out, - result => logical_result + result => logical_result, + 
datalen => e_in.data_len, + popcnt => popcnt_result, + parity => parity_result ); countzero_0: entity work.zero_counter @@ -612,20 +617,11 @@ begin -- when others => -- end case; end if; - when OP_POPCNTB => - result := ppc_popcntb(e_in.read_data3); + when OP_POPCNT => + result := popcnt_result; result_en := '1'; - when OP_POPCNTW => - result := ppc_popcntw(e_in.read_data3); - result_en := '1'; - when OP_POPCNTD => - result := ppc_popcntd(e_in.read_data3); - result_en := '1'; - when OP_PRTYD => - result := ppc_prtyd(e_in.read_data3); - result_en := '1'; - when OP_PRTYW => - result := ppc_prtyw(e_in.read_data3); + when OP_PRTY => + result := parity_result; result_en := '1'; when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR => result := rotator_result; diff --git a/logical.vhdl b/logical.vhdl index b92b98d..4dfc13d 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -12,11 +12,29 @@ entity logical is op : in insn_type_t; invert_in : in std_ulogic; invert_out : in std_ulogic; - result : out std_ulogic_vector(63 downto 0) + result : out std_ulogic_vector(63 downto 0); + datalen : in std_logic_vector(3 downto 0); + popcnt : out std_ulogic_vector(63 downto 0); + parity : out std_ulogic_vector(63 downto 0) ); end entity logical; architecture behaviour of logical is + + subtype twobit is unsigned(1 downto 0); + type twobit32 is array(0 to 31) of twobit; + signal pc2 : twobit32; + subtype threebit is unsigned(2 downto 0); + type threebit16 is array(0 to 15) of threebit; + signal pc4 : threebit16; + subtype fourbit is unsigned(3 downto 0); + type fourbit8 is array(0 to 7) of fourbit; + signal pc8 : fourbit8; + subtype sixbit is unsigned(5 downto 0); + type sixbit2 is array(0 to 1) of sixbit; + signal pc32 : sixbit2; + signal par0, par1 : std_ulogic; + begin logical_0: process(all) variable rb_adj, tmp : std_ulogic_vector(63 downto 0); @@ -40,5 +58,45 @@ begin result <= not tmp; end if; + -- population counts + for i in 0 to 31 loop + pc2(i) <= unsigned("0" & rs(i * 2 downto i * 
2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1)); + end loop; + for i in 0 to 15 loop + pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1)); + end loop; + for i in 0 to 7 loop + pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1)); + end loop; + for i in 0 to 1 loop + pc32(i) <= ("00" & pc8(i * 4)) + ("00" & pc8(i * 4 + 1)) + + ("00" & pc8(i * 4 + 2)) + ("00" & pc8(i * 4 + 3)); + end loop; + popcnt <= (others => '0'); + if datalen(3 downto 2) = "00" then + -- popcntb + for i in 0 to 7 loop + popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8(i)); + end loop; + elsif datalen(3) = '0' then + -- popcntw + for i in 0 to 1 loop + popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i)); + end loop; + else + popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1))); + end if; + + -- parity calculations + par0 <= rs(0) xor rs(8) xor rs(16) xor rs(24); + par1 <= rs(32) xor rs(40) xor rs(48) xor rs(56); + parity <= (others => '0'); + if datalen(3) = '1' then + parity(0) <= par0 xor par1; + else + parity(0) <= par0; + parity(32) <= par1; + end if; + end process; end behaviour; From b14d9820116ebe8c39179c8b6c5565d340bdb72c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 13 Jan 2020 13:23:42 +1100 Subject: [PATCH 07/10] execute: Implement bypass from output of execute1 to input This enables back-to-back execution of integer instructions where the first instruction writes a GPR and the second reads the same GPR. This is done with a set of multiplexers at the start of execute1 which enable any of the three input operands to be taken from the output of execute1 (i.e. r.e.write_data) rather than the input from decode2 (i.e. e_in.read_data[123]). This also requires changes to the hazard detection and handling. Decode2 generates a signal indicating that the GPR being written is available for bypass, which is true for instructions that are executed in execute1 (rather than loadstore1/dcache). 
The gpr_hazard module stores this "bypassable" bit, and if the same GPR needs to be read by a subsequent instruction, it outputs a "use_bypass" signal rather than generating a stall. The use_bypass signal is then latched at the output of decode2 and passed down to execute1 to control the input multiplexer. At the moment there is no bypass on the inputs to loadstore1, but that is OK because all load and store instructions are marked as single-issue. Signed-off-by: Paul Mackerras --- common.vhdl | 6 ++- control.vhdl | 19 +++++-- core.vhdl | 9 +++- decode2.vhdl | 21 +++++++- execute1.vhdl | 135 ++++++++++++++++++++++++++---------------------- gpr_hazard.vhdl | 68 +++++++++++++++++------- 6 files changed, 168 insertions(+), 90 deletions(-) diff --git a/common.vhdl b/common.vhdl index 8612389..9c8a942 100644 --- a/common.vhdl +++ b/common.vhdl @@ -109,6 +109,9 @@ package common is read_data1: std_ulogic_vector(63 downto 0); read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); + bypass_data1: std_ulogic; + bypass_data2: std_ulogic; + bypass_data3: std_ulogic; cr: std_ulogic_vector(31 downto 0); xerc: xer_common_t; lr: std_ulogic; @@ -126,7 +129,8 @@ package common is data_len: std_ulogic_vector(3 downto 0); end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := - (valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', oe => '0', invert_a => '0', + (valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', + lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0')); diff --git a/control.vhdl b/control.vhdl index ead3c1f..064ff98 100644 --- a/control.vhdl +++ b/control.vhdl @@ -21,6 +21,7 @@ entity control is gpr_write_valid_in : in std_ulogic; gpr_write_in : in gspr_index_t; + gpr_bypassable : in std_ulogic; 
gpr_a_read_valid_in : in std_ulogic; gpr_a_read_in : in gspr_index_t; @@ -36,7 +37,11 @@ entity control is valid_out : out std_ulogic; stall_out : out std_ulogic; - stopped_out : out std_ulogic + stopped_out : out std_ulogic; + + gpr_bypass_a : out std_ulogic; + gpr_bypass_b : out std_ulogic; + gpr_bypass_c : out std_ulogic ); end entity control; @@ -71,10 +76,12 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, + bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_a_read_valid_in, gpr_read_in => gpr_a_read_in, - stall_out => stall_a_out + stall_out => stall_a_out, + use_bypass => gpr_bypass_a ); gpr_hazard1: entity work.gpr_hazard @@ -87,10 +94,12 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, + bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_b_read_valid_in, gpr_read_in => gpr_b_read_in, - stall_out => stall_b_out + stall_out => stall_b_out, + use_bypass => gpr_bypass_b ); gpr_c_read_in_fmt <= "0" & gpr_c_read_in; @@ -105,10 +114,12 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, + bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_c_read_valid_in, gpr_read_in => gpr_c_read_in_fmt, - stall_out => stall_c_out + stall_out => stall_c_out, + use_bypass => gpr_bypass_c ); cr_hazard0: entity work.cr_hazard diff --git a/core.vhdl b/core.vhdl index a38cf36..aa86689 100644 --- a/core.vhdl +++ b/core.vhdl @@ -9,7 +9,8 @@ use work.wishbone_types.all; entity core is generic ( SIM : boolean := false; - DISABLE_FLATTEN : boolean := false + DISABLE_FLATTEN : boolean := false; + EX1_BYPASS : boolean := true ); port ( clk : in std_logic; @@ -176,6 +177,9 @@ begin decode1_stall_in <= decode2_stall_out; decode2_0: entity work.decode2 + generic map ( + EX1_BYPASS => EX1_BYPASS + ) port map ( clk => clk, rst => core_rst, @@ -220,6 +224,9 @@ begin ); execute1_0: entity work.execute1 + generic map ( + EX1_BYPASS => EX1_BYPASS + ) port map ( clk => clk, rst => core_rst, diff 
--git a/decode2.vhdl b/decode2.vhdl index 6cd4574..6e3bd8a 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -9,6 +9,9 @@ use work.helpers.all; use work.insn_helpers.all; entity decode2 is + generic ( + EX1_BYPASS : boolean := true + ); port ( clk : in std_ulogic; rst : in std_ulogic; @@ -184,15 +187,19 @@ architecture behaviour of decode2 is signal gpr_write_valid : std_ulogic; signal gpr_write : gspr_index_t; + signal gpr_bypassable : std_ulogic; signal gpr_a_read_valid : std_ulogic; signal gpr_a_read :gspr_index_t; + signal gpr_a_bypass : std_ulogic; signal gpr_b_read_valid : std_ulogic; signal gpr_b_read : gspr_index_t; + signal gpr_b_bypass : std_ulogic; signal gpr_c_read_valid : std_ulogic; signal gpr_c_read : gpr_index_t; + signal gpr_c_bypass : std_ulogic; signal cr_write_valid : std_ulogic; begin @@ -213,6 +220,7 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write, + gpr_bypassable => gpr_bypassable, gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -228,7 +236,11 @@ begin valid_out => control_valid_out, stall_out => stall_out, - stopped_out => stopped_out + stopped_out => stopped_out, + + gpr_bypass_a => gpr_a_bypass, + gpr_bypass_b => gpr_b_bypass, + gpr_bypass_c => gpr_c_bypass ); decode2_0: process(clk) @@ -295,9 +307,12 @@ begin v.e.insn_type := d_in.decode.insn_type; v.e.read_reg1 := decoded_reg_a.reg; v.e.read_data1 := decoded_reg_a.data; + v.e.bypass_data1 := gpr_a_bypass; v.e.read_reg2 := decoded_reg_b.reg; v.e.read_data2 := decoded_reg_b.data; + v.e.bypass_data2 := gpr_b_bypass; v.e.read_data3 := decoded_reg_c.data; + v.e.bypass_data3 := gpr_c_bypass; v.e.write_reg := decoded_reg_o.reg; v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then @@ -342,6 +357,10 @@ begin gpr_write_valid <= decoded_reg_o.reg_valid; gpr_write <= decoded_reg_o.reg; + gpr_bypassable <= '0'; + if EX1_BYPASS and d_in.decode.unit = ALU then + 
gpr_bypassable <= '1'; + end if; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; diff --git a/execute1.vhdl b/execute1.vhdl index 5a626f8..d63697c 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -11,6 +11,9 @@ use work.insn_helpers.all; use work.ppc_fx_insns.all; entity execute1 is + generic ( + EX1_BYPASS : boolean := true + ); port ( clk : in std_ulogic; rst : in std_ulogic; @@ -46,6 +49,8 @@ architecture behaviour of execute1 is signal r, rin : reg_type; + signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); + signal ctrl: ctrl_t := (others => (others => '0')); signal ctrl_tmp: ctrl_t := (others => (others => '0')); @@ -109,9 +114,9 @@ begin rotator_0: entity work.rotator port map ( - rs => e_in.read_data3, - ra => e_in.read_data1, - shift => e_in.read_data2(6 downto 0), + rs => c_in, + ra => a_in, + shift => b_in(6 downto 0), insn => e_in.insn, is_32bit => e_in.is_32bit, right_shift => right_shift, @@ -124,8 +129,8 @@ begin logical_0: entity work.logical port map ( - rs => e_in.read_data3, - rb => e_in.read_data2, + rs => c_in, + rb => b_in, op => e_in.insn_type, invert_in => e_in.invert_a, invert_out => e_in.invert_out, @@ -137,7 +142,7 @@ begin countzero_0: entity work.zero_counter port map ( - rs => e_in.read_data3, + rs => c_in, count_right => e_in.insn(10), is_32bit => e_in.is_32bit, result => countzero_result @@ -158,6 +163,10 @@ begin d_out => divider_to_x ); + a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1; + b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; + c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3; + execute1_0: process(clk) begin if rising_edge(clk) then @@ -256,21 +265,21 @@ begin if e_in.is_32bit = '1' then if e_in.is_signed = '1' then - x_to_multiply.data1 <= (others => e_in.read_data1(31)); - x_to_multiply.data1(31 downto 0) <= e_in.read_data1(31 downto 0); - x_to_multiply.data2 <= 
(others => e_in.read_data2(31)); - x_to_multiply.data2(31 downto 0) <= e_in.read_data2(31 downto 0); + x_to_multiply.data1 <= (others => a_in(31)); + x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0); + x_to_multiply.data2 <= (others => b_in(31)); + x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0); else - x_to_multiply.data1 <= '0' & x"00000000" & e_in.read_data1(31 downto 0); - x_to_multiply.data2 <= '0' & x"00000000" & e_in.read_data2(31 downto 0); + x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0); + x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0); end if; else if e_in.is_signed = '1' then - x_to_multiply.data1 <= e_in.read_data1(63) & e_in.read_data1; - x_to_multiply.data2 <= e_in.read_data2(63) & e_in.read_data2; + x_to_multiply.data1 <= a_in(63) & a_in; + x_to_multiply.data2 <= b_in(63) & b_in; else - x_to_multiply.data1 <= '0' & e_in.read_data1; - x_to_multiply.data2 <= '0' & e_in.read_data2; + x_to_multiply.data1 <= '0' & a_in; + x_to_multiply.data2 <= '0' & b_in; end if; end if; @@ -279,23 +288,23 @@ begin sign2 := '0'; if e_in.is_signed = '1' then if e_in.is_32bit = '1' then - sign1 := e_in.read_data1(31); - sign2 := e_in.read_data2(31); + sign1 := a_in(31); + sign2 := b_in(31); else - sign1 := e_in.read_data1(63); - sign2 := e_in.read_data2(63); + sign1 := a_in(63); + sign2 := b_in(63); end if; end if; -- take absolute values if sign1 = '0' then - abs1 := signed(e_in.read_data1); + abs1 := signed(a_in); else - abs1 := - signed(e_in.read_data1); + abs1 := - signed(a_in); end if; if sign2 = '0' then - abs2 := signed(e_in.read_data2); + abs2 := signed(b_in); else - abs2 := - signed(e_in.read_data2); + abs2 := - signed(b_in); end if; x_to_divider <= Execute1ToDividerInit; @@ -358,14 +367,14 @@ begin -- Do nothing when OP_ADD | OP_CMP => if e_in.invert_a = '0' then - a_inv := e_in.read_data1; + a_inv := a_in; else - a_inv := not e_in.read_data1; + a_inv := not a_in; end if; - result_with_carry := ppc_adde(a_inv, 
e_in.read_data2, + result_with_carry := ppc_adde(a_inv, b_in, decode_input_carry(e_in.input_carry, v.e.xerc)); result := result_with_carry(63 downto 0); - carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32); + carry_32 := result(32) xor a_inv(32) xor b_in(32); carry_64 := result_with_carry(64); if e_in.insn_type = OP_ADD then if e_in.output_carry = '1' then @@ -373,8 +382,8 @@ begin end if; if e_in.oe = '1' then set_ov(v.e, - calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)), - calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31))); + calc_ov(a_inv(63), b_in(63), carry_64, result_with_carry(63)), + calc_ov(a_inv(31), b_in(31), carry_32, result_with_carry(31))); end if; result_en := '1'; else @@ -385,20 +394,20 @@ begin v.e.write_cr_enable := '1'; crnum := to_integer(unsigned(bf)); v.e.write_cr_mask := num_to_fxm(crnum); - zerolo := not (or (e_in.read_data1(31 downto 0) xor e_in.read_data2(31 downto 0))); - zerohi := not (or (e_in.read_data1(63 downto 32) xor e_in.read_data2(63 downto 32))); + zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0))); + zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32))); if zerolo = '1' and (l = '0' or zerohi = '1') then -- values are equal newcrf := "001" & v.e.xerc.so; else if l = '1' then -- 64-bit comparison - msb_a := e_in.read_data1(63); - msb_b := e_in.read_data2(63); + msb_a := a_in(63); + msb_b := b_in(63); else -- 32-bit comparison - msb_a := e_in.read_data1(31); - msb_b := e_in.read_data2(31); + msb_a := a_in(31); + msb_b := b_in(31); end if; if msb_a /= msb_b then -- Subtraction might overflow, but @@ -424,25 +433,25 @@ begin when OP_B => f_out.redirect <= '1'; if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(b_in)); else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2)); + f_out.redirect_nia <= 
std_ulogic_vector(signed(e_in.nia) + signed(b_in)); end if; when OP_BC => -- read_data1 is CTR bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' then - result := std_ulogic_vector(unsigned(e_in.read_data1) - 1); + result := std_ulogic_vector(unsigned(a_in) - 1); result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then f_out.redirect <= '1'; if (insn_aa(e_in.insn)) then - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(b_in)); else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2)); + f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); end if; end if; when OP_BCREG => @@ -451,40 +460,40 @@ begin bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' and e_in.insn(10) = '0' then - result := std_ulogic_vector(unsigned(e_in.read_data1) - 1); + result := std_ulogic_vector(unsigned(a_in) - 1); result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then f_out.redirect <= '1'; - f_out.redirect_nia <= e_in.read_data2(63 downto 2) & "00"; + f_out.redirect_nia <= b_in(63 downto 2) & "00"; end if; when OP_CMPB => - result := ppc_cmpb(e_in.read_data3, e_in.read_data2); + result := ppc_cmpb(c_in, b_in); result_en := '1'; when OP_CNTZ => result := countzero_result; result_en := '1'; when OP_EXTS => -- note data_len is a 1-hot encoding - negative := (e_in.data_len(0) and e_in.read_data3(7)) or - (e_in.data_len(1) and e_in.read_data3(15)) or - (e_in.data_len(2) and e_in.read_data3(31)); + negative := (e_in.data_len(0) and c_in(7)) or + (e_in.data_len(1) and c_in(15)) or + (e_in.data_len(2) and c_in(31)); result := (others => negative); if e_in.data_len(2) = '1' then - result(31 downto 16) := 
e_in.read_data3(31 downto 16); + result(31 downto 16) := c_in(31 downto 16); end if; if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then - result(15 downto 8) := e_in.read_data3(15 downto 8); + result(15 downto 8) := c_in(15 downto 8); end if; - result(7 downto 0) := e_in.read_data3(7 downto 0); + result(7 downto 0) := c_in(7 downto 0); result_en := '1'; when OP_ISEL => crbit := to_integer(unsigned(insn_bc(e_in.insn))); if e_in.cr(31-crbit) = '1' then - result := e_in.read_data1; + result := a_in; else - result := e_in.read_data2; + result := b_in; end if; result_en := '1'; when OP_MCRF => @@ -549,7 +558,7 @@ begin end if; when OP_MFSPR => if is_fast_spr(e_in.read_reg1) then - result := e_in.read_data1; + result := a_in; if decode_spr_num(e_in.insn) = SPR_XER then -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer result(63 downto 32) := (others => '0'); @@ -596,19 +605,19 @@ begin crnum := fxm_to_num(insn_fxm(e_in.insn)); v.e.write_cr_mask := num_to_fxm(crnum); end if; - v.e.write_cr_data := e_in.read_data3(31 downto 0); + v.e.write_cr_data := c_in(31 downto 0); when OP_MTSPR => report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & - "=" & to_hstring(e_in.read_data3); + "=" & to_hstring(c_in); if is_fast_spr(e_in.write_reg) then - result := e_in.read_data3; + result := c_in; result_en := '1'; if decode_spr_num(e_in.insn) = SPR_XER then - v.e.xerc.so := e_in.read_data3(63-32); - v.e.xerc.ov := e_in.read_data3(63-33); - v.e.xerc.ca := e_in.read_data3(63-34); - v.e.xerc.ov32 := e_in.read_data3(63-44); - v.e.xerc.ca32 := e_in.read_data3(63-45); + v.e.xerc.so := c_in(63-32); + v.e.xerc.ov := c_in(63-33); + v.e.xerc.ca := c_in(63-34); + v.e.xerc.ov32 := c_in(63-44); + v.e.xerc.ca32 := c_in(63-45); v.e.write_xerc_enable := '1'; end if; else diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl index 705e69d..de4f7d2 100644 --- a/gpr_hazard.vhdl +++ b/gpr_hazard.vhdl @@ -12,18 +12,21 @@ entity gpr_hazard is gpr_write_valid_in : 
in std_ulogic; gpr_write_in : in std_ulogic_vector(5 downto 0); + bypass_avail : in std_ulogic; gpr_read_valid_in : in std_ulogic; gpr_read_in : in std_ulogic_vector(5 downto 0); - stall_out : out std_ulogic + stall_out : out std_ulogic; + use_bypass : out std_ulogic ); end entity gpr_hazard; architecture behaviour of gpr_hazard is type pipeline_entry_type is record - valid : std_ulogic; - gpr : std_ulogic_vector(5 downto 0); + valid : std_ulogic; + bypass : std_ulogic; + gpr : std_ulogic_vector(5 downto 0); end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0', gpr => (others => '0')); + constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0')); type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type; constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); @@ -33,9 +36,7 @@ begin gpr_hazard0: process(clk) begin if rising_edge(clk) then - if stall_in = '0' then - r <= rin; - end if; + r <= rin; end if; end process; @@ -45,22 +46,49 @@ begin v := r; stall_out <= '0'; - loop_0: for i in 0 to PIPELINE_DEPTH-1 loop - if ((r(i).valid = gpr_read_valid_in) and r(i).gpr = gpr_read_in) then - stall_out <= '1'; + use_bypass <= '0'; + if gpr_read_valid_in = '1' then + if r(0).valid = '1' and r(0).gpr = gpr_read_in then + if r(0).bypass = '1' and stall_in = '0' then + use_bypass <= '1'; + else + stall_out <= '1'; + end if; end if; - end loop; + loop_0: for i in 1 to PIPELINE_DEPTH-1 loop + if r(i).valid = '1' and r(i).gpr = gpr_read_in then + if r(i).bypass = '1' then + use_bypass <= '1'; + else + stall_out <= '1'; + end if; + end if; + end loop; + end if; - v(0).valid := gpr_write_valid_in; - v(0).gpr := gpr_write_in; - loop_1: for i in 0 to PIPELINE_DEPTH-2 loop - -- propagate to next slot - v(i+1) := r(i); - end loop; + if stall_in = '0' then + v(0).valid := gpr_write_valid_in; + v(0).bypass := bypass_avail; + v(0).gpr := gpr_write_in; + loop_1: for i in 1 to 
PIPELINE_DEPTH-1 loop + -- propagate to next slot + v(i).valid := r(i-1).valid; + v(i).bypass := r(i-1).bypass; + v(i).gpr := r(i-1).gpr; + end loop; - -- asynchronous output - if gpr_read_valid_in = '0' then - stall_out <= '0'; + else + -- stage 0 stalled, so stage 1 becomes empty + loop_1b: for i in 1 to PIPELINE_DEPTH-1 loop + -- propagate to next slot + if i = 1 then + v(i).valid := '0'; + else + v(i).valid := r(i-1).valid; + v(i).bypass := r(i-1).bypass; + v(i).gpr := r(i-1).gpr; + end if; + end loop; end if; -- update registers From 5422007f83bff7550e8d3064e9c086fa668eb4d9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 14 Jan 2020 10:28:45 +1100 Subject: [PATCH 08/10] Plumb loadstore1 input from execute1 not decode2 This allows us to use the bypass at the input of execute1 for the address and data operands for loadstore1. Signed-off-by: Paul Mackerras --- common.vhdl | 14 +++++++++----- core.vhdl | 6 +++--- decode2.vhdl | 42 +++++++----------------------------------- execute1.vhdl | 26 ++++++++++++++++++++++++++ loadstore1.vhdl | 2 +- 5 files changed, 46 insertions(+), 44 deletions(-) diff --git a/common.vhdl b/common.vhdl index 9c8a942..ffddb0b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -127,12 +127,16 @@ package common is is_signed: std_ulogic; insn: std_ulogic_vector(31 downto 0); data_len: std_ulogic_vector(3 downto 0); + byte_reverse : std_ulogic; + sign_extend : std_ulogic; -- do we need to sign extend? + update : std_ulogic; -- is this an update instruction? 
end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', - is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0')); + is_32bit => '0', is_signed => '0', xerc => xerc_init, + byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0')); type Execute1ToMultiplyType is record valid: std_ulogic; @@ -189,7 +193,7 @@ package common is end record; constant Execute1ToFetch1TypeInit : Execute1ToFetch1Type := (redirect => '0', others => (others => '0')); - type Decode2ToLoadstore1Type is record + type Execute1ToLoadstore1Type is record valid : std_ulogic; load : std_ulogic; -- is this a load or store addr1 : std_ulogic_vector(63 downto 0); @@ -203,9 +207,9 @@ package common is update_reg : gpr_index_t; -- if so, the register to update xerc : xer_common_t; end record; - constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', - sign_extend => '0', update => '0', xerc => xerc_init, - others => (others => '0')); + constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', + sign_extend => '0', update => '0', xerc => xerc_init, + others => (others => '0')); type Loadstore1ToDcacheType is record valid : std_ulogic; diff --git a/core.vhdl b/core.vhdl index aa86689..bc0b16f 100644 --- a/core.vhdl +++ b/core.vhdl @@ -60,7 +60,7 @@ architecture behave of core is signal execute1_to_fetch1: Execute1ToFetch1Type; -- load store signals - signal decode2_to_loadstore1: Decode2ToLoadstore1Type; + signal execute1_to_loadstore1: Execute1ToLoadstore1Type; signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal dcache_to_writeback: DcacheToWritebackType; @@ -190,7 +190,6 @@ begin 
stopped_out => dbg_core_is_stopped, d_in => decode1_to_decode2, e_out => decode2_to_execute1, - l_out => decode2_to_loadstore1, r_in => register_file_to_decode2, r_out => decode2_to_register_file, c_in => cr_file_to_decode2, @@ -233,6 +232,7 @@ begin flush_out => flush, stall_out => ex1_stall_out, e_in => decode2_to_execute1, + l_out => execute1_to_loadstore1, f_out => execute1_to_fetch1, e_out => execute1_to_writeback, icache_inval => ex1_icache_inval, @@ -242,7 +242,7 @@ begin loadstore1_0: entity work.loadstore1 port map ( clk => clk, - l_in => decode2_to_loadstore1, + l_in => execute1_to_loadstore1, l_out => loadstore1_to_dcache ); diff --git a/decode2.vhdl b/decode2.vhdl index 6e3bd8a..582fa5b 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -27,7 +27,6 @@ entity decode2 is d_in : in Decode1ToDecode2Type; e_out : out Decode2ToExecute1Type; - l_out : out Decode2ToLoadstore1Type; r_in : in RegisterFileToDecode2Type; r_out : out Decode2ToRegisterFileType; @@ -40,7 +39,6 @@ end entity decode2; architecture behaviour of decode2 is type reg_type is record e : Decode2ToExecute1Type; - l : Decode2ToLoadstore1Type; end record; signal r, rin : reg_type; @@ -246,7 +244,7 @@ begin decode2_0: process(clk) begin if rising_edge(clk) then - if rin.e.valid = '1' or rin.l.valid = '1' then + if rin.e.valid = '1' then report "execute " & to_hstring(rin.e.nia); end if; r <= rin; @@ -272,7 +270,6 @@ begin v := r; v.e := Decode2ToExecute1Init; - v.l := Decode2ToLoadStore1Init; mul_a := (others => '0'); mul_b := (others => '0'); @@ -331,25 +328,9 @@ begin end if; v.e.insn := d_in.insn; v.e.data_len := length; - - -- load/store unit - v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg); - v.l.addr1 := decoded_reg_a.data; - v.l.addr2 := decoded_reg_b.data; - v.l.data := decoded_reg_c.data; - v.l.write_reg := gspr_to_gpr(decoded_reg_o.reg); - - if d_in.decode.insn_type = OP_LOAD then - v.l.load := '1'; - else - v.l.load := '0'; - end if; - - v.l.length := length; - v.l.byte_reverse := 
d_in.decode.byte_reverse; - v.l.sign_extend := d_in.decode.sign_extend; - v.l.update := d_in.decode.update; - v.l.xerc := c_in.read_xerc_data; + v.e.byte_reverse := d_in.decode.byte_reverse; + v.e.sign_extend := d_in.decode.sign_extend; + v.e.update := d_in.decode.update; -- issue control control_valid_in <= d_in.valid; @@ -373,21 +354,13 @@ begin cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); - v.e.valid := '0'; - v.l.valid := '0'; - case d_in.decode.unit is - when ALU => - v.e.valid := control_valid_out; - when LDST => - v.l.valid := control_valid_out; - when NONE => - v.e.valid := control_valid_out; + v.e.valid := control_valid_out; + if d_in.decode.unit = NONE then v.e.insn_type := OP_ILLEGAL; - end case; + end if; if rst = '1' then v.e := Decode2ToExecute1Init; - v.l := Decode2ToLoadStore1Init; end if; -- Update registers @@ -395,6 +368,5 @@ begin -- Update outputs e_out <= r.e; - l_out <= r.l; end process; end architecture behaviour; diff --git a/execute1.vhdl b/execute1.vhdl index d63697c..e49494f 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -25,6 +25,7 @@ entity execute1 is e_in : in Decode2ToExecute1Type; -- asynchronous + l_out : out Execute1ToLoadstore1Type; f_out : out Execute1ToFetch1Type; e_out : out Execute1ToWritebackType; @@ -210,6 +211,7 @@ begin variable zerohi, zerolo : std_ulogic; variable msb_a, msb_b : std_ulogic; variable a_lt : std_ulogic; + variable lv : Execute1ToLoadstore1Type; begin result := (others => '0'); result_with_carry := (others => '0'); @@ -667,6 +669,10 @@ begin stall_out <= '1'; x_to_divider.valid <= '1'; + when OP_LOAD | OP_STORE => + -- loadstore/dcache has its own port to writeback + v.e.valid := '0'; + when others => terminate_out <= '1'; report "illegal"; @@ -731,11 +737,31 @@ begin v.e.write_data := result; v.e.write_enable := result_en; + -- Outputs to loadstore1 (async) + lv := Execute1ToLoadstore1Init; + if e_in.valid = '1' and (e_in.insn_type = OP_LOAD or e_in.insn_type = 
OP_STORE) then + lv.valid := '1'; + end if; + if e_in.insn_type = OP_LOAD then + lv.load := '1'; + end if; + lv.addr1 := a_in; + lv.addr2 := b_in; + lv.data := c_in; + lv.write_reg := gspr_to_gpr(e_in.write_reg); + lv.length := e_in.data_len; + lv.byte_reverse := e_in.byte_reverse; + lv.sign_extend := e_in.sign_extend; + lv.update := e_in.update; + lv.update_reg := gspr_to_gpr(e_in.read_reg1); + lv.xerc := v.e.xerc; + -- Update registers rin <= v; -- update outputs --f_out <= r.f; + l_out <= lv; e_out <= r.e; flush_out <= f_out.redirect; end process; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 1c16c46..5b61d4c 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -13,7 +13,7 @@ entity loadstore1 is port ( clk : in std_ulogic; - l_in : in Decode2ToLoadstore1Type; + l_in : in Execute1ToLoadstore1Type; l_out : out Loadstore1ToDcacheType ); From e08ca4ab8eba7bec404f82396e41d3b5c616b94d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 14 Jan 2020 21:55:33 +1100 Subject: [PATCH 09/10] countzero: Add a register to help make timing This adds a register in the middle of the countzero computation, so that we now have two cycles to count leading or trailing zeroes instead of just one. Execute1 now outputs a one-cycle stall signal when it encounters a cntlz* or cnttz* instruction. With this, the countzero path no longer fails timing on the Artix-7 at 100MHz. 
Signed-off-by: Paul Mackerras --- countzero.vhdl | 85 ++++++++++++++++++++++++++++++----------------- countzero_tb.vhdl | 10 ++++++ execute1.vhdl | 18 ++++++++-- 3 files changed, 79 insertions(+), 34 deletions(-) diff --git a/countzero.vhdl b/countzero.vhdl index d3960f0..50e6ead 100644 --- a/countzero.vhdl +++ b/countzero.vhdl @@ -6,6 +6,7 @@ library work; entity zero_counter is port ( + clk : in std_logic; rs : in std_ulogic_vector(63 downto 0); count_right : in std_ulogic; is_32bit : in std_ulogic; @@ -14,10 +15,14 @@ entity zero_counter is end entity zero_counter; architecture behaviour of zero_counter is - signal y, z : std_ulogic_vector(3 downto 0); - signal v16 : std_ulogic_vector(15 downto 0); - signal v4 : std_ulogic_vector(3 downto 0); - signal sel : std_ulogic_vector(5 downto 0); + type intermediate_result is record + v16: std_ulogic_vector(15 downto 0); + sel_hi: std_ulogic_vector(1 downto 0); + is_32bit: std_ulogic; + count_right: std_ulogic; + end record; + + signal r, r_in : intermediate_result; -- Return the index of the leftmost or rightmost 1 in a set of 4 bits. -- Assumes v is not "0000"; if it is, return (right ? "11" : "00"). @@ -47,65 +52,83 @@ architecture behaviour of zero_counter is end; begin - zerocounter0: process(all) + zerocounter_0: process(clk) + begin + if rising_edge(clk) then + r <= r_in; + end if; + end process; + + zerocounter_1: process(all) + variable v: intermediate_result; + variable y, z: std_ulogic_vector(3 downto 0); + variable sel: std_ulogic_vector(5 downto 0); + variable v4: std_ulogic_vector(3 downto 0); + begin -- Test 4 groups of 16 bits each. -- The top 2 groups are considered to be zero in 32-bit mode. 
- z(0) <= or (rs(15 downto 0)); - z(1) <= or (rs(31 downto 16)); - z(2) <= or (rs(47 downto 32)); - z(3) <= or (rs(63 downto 48)); + z(0) := or (rs(15 downto 0)); + z(1) := or (rs(31 downto 16)); + z(2) := or (rs(47 downto 32)); + z(3) := or (rs(63 downto 48)); if is_32bit = '0' then - sel(5 downto 4) <= encoder(z, count_right); + v.sel_hi := encoder(z, count_right); else - sel(5) <= '0'; + v.sel_hi(1) := '0'; if count_right = '0' then - sel(4) <= z(1); + v.sel_hi(0) := z(1); else - sel(4) <= not z(0); + v.sel_hi(0) := not z(0); end if; end if; -- Select the leftmost/rightmost non-zero group of 16 bits - case sel(5 downto 4) is + case v.sel_hi is when "00" => - v16 <= rs(15 downto 0); + v.v16 := rs(15 downto 0); when "01" => - v16 <= rs(31 downto 16); + v.v16 := rs(31 downto 16); when "10" => - v16 <= rs(47 downto 32); + v.v16 := rs(47 downto 32); when others => - v16 <= rs(63 downto 48); + v.v16 := rs(63 downto 48); end case; + -- Latch this and do the rest in the next cycle, for the sake of timing + v.is_32bit := is_32bit; + v.count_right := count_right; + r_in <= v; + sel(5 downto 4) := r.sel_hi; + -- Test 4 groups of 4 bits - y(0) <= or (v16(3 downto 0)); - y(1) <= or (v16(7 downto 4)); - y(2) <= or (v16(11 downto 8)); - y(3) <= or (v16(15 downto 12)); - sel(3 downto 2) <= encoder(y, count_right); + y(0) := or (r.v16(3 downto 0)); + y(1) := or (r.v16(7 downto 4)); + y(2) := or (r.v16(11 downto 8)); + y(3) := or (r.v16(15 downto 12)); + sel(3 downto 2) := encoder(y, r.count_right); -- Select the leftmost/rightmost non-zero group of 4 bits case sel(3 downto 2) is when "00" => - v4 <= v16(3 downto 0); + v4 := r.v16(3 downto 0); when "01" => - v4 <= v16(7 downto 4); + v4 := r.v16(7 downto 4); when "10" => - v4 <= v16(11 downto 8); + v4 := r.v16(11 downto 8); when others => - v4 <= v16(15 downto 12); + v4 := r.v16(15 downto 12); end case; - sel(1 downto 0) <= encoder(v4, count_right); + sel(1 downto 0) := encoder(v4, r.count_right); -- sel is now the index of the 
leftmost/rightmost 1 bit in rs if v4 = "0000" then -- operand is zero, return 32 for 32-bit, else 64 - result <= x"00000000000000" & '0' & not is_32bit & is_32bit & "00000"; - elsif count_right = '0' then + result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000"; + elsif r.count_right = '0' then -- return (63 - sel), trimmed to 5 bits in 32-bit mode - result <= x"00000000000000" & "00" & (not sel(5) and not is_32bit) & not sel(4 downto 0); + result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0); else result <= x"00000000000000" & "00" & sel; end if; diff --git a/countzero_tb.vhdl b/countzero_tb.vhdl index 91de334..21529de 100644 --- a/countzero_tb.vhdl +++ b/countzero_tb.vhdl @@ -15,16 +15,26 @@ architecture behave of countzero_tb is signal is_32bit, count_right: std_ulogic := '0'; signal result: std_ulogic_vector(63 downto 0); signal randno: std_ulogic_vector(63 downto 0); + signal clk: std_ulogic; begin zerocounter_0: entity work.zero_counter port map ( + clk => clk, rs => rs, result => result, count_right => count_right, is_32bit => is_32bit ); + clk_process: process + begin + clk <= '0'; + wait for clk_period/2; + clk <= '1'; + wait for clk_period/2; + end process; + stim_process: process variable r: std_ulogic_vector(63 downto 0); begin diff --git a/execute1.vhdl b/execute1.vhdl index e49494f..ae13c72 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -42,6 +42,7 @@ architecture behaviour of execute1 is next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; div_in_progress : std_ulogic; + cntz_in_progress : std_ulogic; slow_op_dest : gpr_index_t; slow_op_rc : std_ulogic; slow_op_oe : std_ulogic; @@ -143,6 +144,7 @@ begin countzero_0: entity work.zero_counter port map ( + clk => clk, rs => c_in, count_right => e_in.insn(10), is_32bit => e_in.is_32bit, @@ -259,6 +261,7 @@ begin v.lr_update := '0'; v.mul_in_progress := '0'; v.div_in_progress := '0'; + v.cntz_in_progress := '0'; -- signals to 
multiply unit x_to_multiply <= Execute1ToMultiplyInit; @@ -473,9 +476,10 @@ begin when OP_CMPB => result := ppc_cmpb(c_in, b_in); result_en := '1'; - when OP_CNTZ => - result := countzero_result; - result_en := '1'; + when OP_CNTZ => + v.e.valid := '0'; + v.cntz_in_progress := '1'; + stall_out <= '1'; when OP_EXTS => -- note data_len is a 1-hot encoding negative := (e_in.data_len(0) and c_in(7)) or @@ -703,6 +707,14 @@ begin result := r.next_lr; v.e.write_reg := fast_spr_num(SPR_LR); v.e.valid := '1'; + elsif r.cntz_in_progress = '1' then + -- cnt[lt]z always takes two cycles + result := countzero_result; + result_en := '1'; + v.e.write_reg := gpr_to_gspr(v.slow_op_dest); + v.e.rc := v.slow_op_rc; + v.e.xerc := v.slow_op_xerc; + v.e.valid := '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or (r.div_in_progress = '1' and divider_to_x.valid = '1') then From 2661b9b985698a4ecd2854befa7c83f4e0c7b02e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 14 Jan 2020 23:20:42 +1100 Subject: [PATCH 10/10] decode1: Mark subfic as pipelined This seems just to have been missed in commit f291efa26690 ("decode1: Mark ALU ops using carry as pipelined"). 
Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/decode1.vhdl b/decode1.vhdl index d2dbd96..f1b5ad4 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -66,7 +66,7 @@ architecture behaviour of decode1 is 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- sthu 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stw 37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwu - 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- subfic + 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic 2 => (ALU, OP_TDI, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi --PPC_TWI 3 26 => (ALU, OP_XOR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xori