From 39d18d27388ee97ef598e8ee5ce73d30db257b0a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 12 Dec 2019 08:47:42 +1100 Subject: [PATCH] Make divider hang off the side of execute1 With this, the divider is a unit that execute1 sends operands to and which sends its results back to execute1, which then sends them to writeback. Execute1 now sends a stall signal when it gets a divide or modulus instruction until it gets a valid signal back from the divider. Divide and modulus instructions are no longer marked as single-issue. The data formatting step that used to be done in decode2 for div and mod instructions is now done in execute1. We also do the absolute value operation in that same cycle instead of taking an extra cycle inside the divider for signed operations with a negative operand. Signed-off-by: Paul Mackerras --- Makefile | 4 +- common.vhdl | 22 +++++------ core.vhdl | 16 +------- decode1.vhdl | 40 ++++++++++---------- decode2.vhdl | 56 +--------------------------- decode_types.vhdl | 4 +- divider.vhdl | 25 ++----------- divider_tb.vhdl | 43 +++++++++++++-------- execute1.vhdl | 95 ++++++++++++++++++++++++++++++++++++++++++++++- writeback.vhdl | 29 ++------------- 10 files changed, 165 insertions(+), 169 deletions(-) diff --git a/Makefile b/Makefile index 720e8d5..939f48e 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ common.o: decode_types.o control.o: gpr_hazard.o cr_hazard.o common.o sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o -core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o divider.o +core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o core_debug.o: common.o countzero.o: countzero_tb.o: common.o glibc_random.o countzero.o @@ -26,7 +26,7 @@ crhelpers.o: common.o 
decode1.o: common.o decode_types.o decode2.o: decode_types.o common.o helpers.o insn_helpers.o control.o decode_types.o: -execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o +execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o multiply.o divider.o fetch1.o: common.o fetch2.o: common.o wishbone_types.o glibc_random_helpers.o: diff --git a/common.vhdl b/common.vhdl index 9c18230..1d0bbac 100644 --- a/common.vhdl +++ b/common.vhdl @@ -145,7 +145,7 @@ package common is oe => '0', is_32bit => '0', xerc => xerc_init, others => (others => '0')); - type Decode2ToDividerType is record + type Execute1ToDividerType is record valid: std_ulogic; write_reg: gpr_index_t; dividend: std_ulogic_vector(63 downto 0); @@ -154,14 +154,15 @@ package common is is_32bit: std_ulogic; is_extended: std_ulogic; is_modulus: std_ulogic; + neg_result: std_ulogic; rc: std_ulogic; oe: std_ulogic; xerc: xer_common_t; end record; - constant Decode2ToDividerInit: Decode2ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', - is_extended => '0', is_modulus => '0', - rc => '0', oe => '0', xerc => xerc_init, - others => (others => '0')); + constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', + is_extended => '0', is_modulus => '0', + rc => '0', oe => '0', xerc => xerc_init, + neg_result => '0', others => (others => '0')); type Decode2ToRegisterFileType is record read1_enable : std_ulogic; @@ -275,20 +276,19 @@ package common is xerc => xerc_init, others => (others => '0')); - type DividerToWritebackType is record + type DividerToExecute1Type is record valid: std_ulogic; - write_reg_enable : std_ulogic; write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); write_xerc_enable : std_ulogic; xerc : xer_common_t; rc: std_ulogic; end record; - constant DividerToWritebackInit : 
DividerToWritebackType := (valid => '0', write_reg_enable => '0', - rc => '0', write_xerc_enable => '0', - xerc => xerc_init, - others => (others => '0')); + constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', + rc => '0', write_xerc_enable => '0', + xerc => xerc_init, + others => (others => '0')); type WritebackToRegisterFileType is record write_reg : gspr_index_t; diff --git a/core.vhdl b/core.vhdl index 71c10b3..a38cf36 100644 --- a/core.vhdl +++ b/core.vhdl @@ -63,10 +63,6 @@ architecture behave of core is signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal dcache_to_writeback: DcacheToWritebackType; - -- divider signals - signal decode2_to_divider: Decode2ToDividerType; - signal divider_to_writeback: DividerToWritebackType; - -- local signals signal fetch1_stall_in : std_ulogic; signal icache_stall_out : std_ulogic; @@ -111,7 +107,6 @@ architecture behave of core is attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN); - attribute keep_hierarchy of divider_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN); @@ -192,7 +187,6 @@ begin d_in => decode1_to_decode2, e_out => decode2_to_execute1, l_out => decode2_to_loadstore1, - d_out => decode2_to_divider, r_in => register_file_to_decode2, r_out => decode2_to_register_file, c_in => cr_file_to_decode2, @@ -228,6 +222,7 @@ begin execute1_0: entity work.execute1 port map ( clk => clk, + rst => core_rst, flush_out => flush, stall_out => ex1_stall_out, e_in => decode2_to_execute1, @@ -259,20 +254,11 @@ begin wishbone_out => wishbone_data_out ); - divider_0: entity work.divider - port map ( - clk => clk, - rst => 
core_rst, - d_in => decode2_to_divider, - d_out => divider_to_writeback - ); - writeback_0: entity work.writeback port map ( clk => clk, e_in => execute1_to_writeback, l_in => dcache_to_writeback, - d_in => divider_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, complete_out => complete diff --git a/decode1.vhdl b/decode1.vhdl index 4e1d063..6ac3f01 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -160,22 +160,22 @@ architecture behaviour of decode1 is 2#0100010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt 2#0011110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst -- 2#1111110110# dcbz - 2#0110001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeu - 2#1110001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeuo - 2#0110001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweu - 2#1110001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweuo - 2#0110101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divde - 2#1110101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeo - 2#0110101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwe - 2#1110101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweo - 2#0111001001# => (DIV, OP_DIV, RA, RB, NONE, 
RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdu - 2#1111001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divduo - 2#0111001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwu - 2#1111001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwuo - 2#0111101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divd - 2#1111101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdo - 2#0111101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divw - 2#1111101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwo + 2#0110001001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdeu + 2#1110001001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdeuo + 2#0110001011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divweu + 2#1110001011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divweuo + 2#0110101001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divde + 2#1110101001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divdeo + 2#0110101011# => (ALU, OP_DIVE, RA, RB, 
NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divwe + 2#1110101011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divweo + 2#0111001001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdu + 2#1111001001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divduo + 2#0111001011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divwu + 2#1111001011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divwuo + 2#0111101001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divd + 2#1111101001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divdo + 2#0111101011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divw + 2#1111101011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divwo 2#0100011100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- eqv 2#1110111010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsb 2#1110011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsh @@ -238,10 +238,10 @@ architecture behaviour of decode1 is -- 2#1001000000# mcrxrx 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', 
'0', '0', NONE, '0', '0'), -- mfcr/mfocrf 2#0101010011# => (ALU, OP_MFSPR, SPR, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr - 2#0100001001# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modud - 2#0100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- moduw - 2#1100001001# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsd - 2#1100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw + 2#0100001001# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- modud + 2#0100001011# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- moduw + 2#1100001001# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- modsd + 2#1100001011# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0'), -- modsw 2#0010010000# => (ALU, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf 2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr 2#0001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd diff --git a/decode2.vhdl b/decode2.vhdl index 2da5c41..a95dae3 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -24,7 +24,6 @@ entity decode2 is d_in : in Decode1ToDecode2Type; e_out : out Decode2ToExecute1Type; 
- d_out : out Decode2ToDividerType; l_out : out Decode2ToLoadstore1Type; r_in : in RegisterFileToDecode2Type; @@ -38,7 +37,6 @@ end entity decode2; architecture behaviour of decode2 is type reg_type is record e : Decode2ToExecute1Type; - d : Decode2ToDividerType; l : Decode2ToLoadstore1Type; end record; @@ -236,7 +234,7 @@ begin decode2_0: process(clk) begin if rising_edge(clk) then - if rin.e.valid = '1' or rin.l.valid = '1' or rin.d.valid = '1' then + if rin.e.valid = '1' or rin.l.valid = '1' then report "execute " & to_hstring(rin.e.nia); end if; r <= rin; @@ -257,14 +255,12 @@ begin variable decoded_reg_b : decode_input_reg_t; variable decoded_reg_c : decode_input_reg_t; variable decoded_reg_o : decode_output_reg_t; - variable signed_division: std_ulogic; variable length : std_ulogic_vector(3 downto 0); begin v := r; v.e := Decode2ToExecute1Init; v.l := Decode2ToLoadStore1Init; - v.d := Decode2ToDividerInit; mul_a := (others => '0'); mul_b := (others => '0'); @@ -319,51 +315,6 @@ begin v.e.insn := d_in.insn; v.e.data_len := length; - -- divide unit - -- PPC divide and modulus instruction words have these bits in - -- the bottom 11 bits: o1dns 010t1 r - -- where o = OE for div instrs, signedness for mod instrs - -- d = 1 for div*, 0 for mod* - -- n = 1 for normal, 0 for extended (dividend << 32/64) - -- s = 1 for signed, 0 for unsigned (for div*) - -- t = 1 for 32-bit, 0 for 64-bit - -- r = RC bit (record condition code) - v.d.write_reg := gspr_to_gpr(decoded_reg_o.reg); - v.d.is_modulus := not d_in.insn(8); - v.d.is_32bit := d_in.insn(2); - if d_in.insn(8) = '1' then - signed_division := d_in.insn(6); - else - signed_division := d_in.insn(10); - end if; - v.d.is_signed := signed_division; - if d_in.insn(2) = '0' then - -- 64-bit forms - if d_in.insn(8) = '1' and d_in.insn(7) = '0' then - v.d.is_extended := '1'; - end if; - v.d.dividend := decoded_reg_a.data; - v.d.divisor := decoded_reg_b.data; - else - -- 32-bit forms - if d_in.insn(8) = '1' and d_in.insn(7) = 
'0' then -- extended forms - v.d.dividend := decoded_reg_a.data(31 downto 0) & x"00000000"; - elsif signed_division = '1' and decoded_reg_a.data(31) = '1' then - -- sign extend to 64 bits - v.d.dividend := x"ffffffff" & decoded_reg_a.data(31 downto 0); - else - v.d.dividend := x"00000000" & decoded_reg_a.data(31 downto 0); - end if; - if signed_division = '1' and decoded_reg_b.data(31) = '1' then - v.d.divisor := x"ffffffff" & decoded_reg_b.data(31 downto 0); - else - v.d.divisor := x"00000000" & decoded_reg_b.data(31 downto 0); - end if; - end if; - v.d.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.d.xerc := c_in.read_xerc_data; - v.d.oe := decode_oe(d_in.decode.rc, d_in.insn); - -- load/store unit v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg); v.l.addr1 := decoded_reg_a.data; @@ -402,15 +353,12 @@ begin cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); v.e.valid := '0'; - v.d.valid := '0'; v.l.valid := '0'; case d_in.decode.unit is when ALU => v.e.valid := control_valid_out; when LDST => v.l.valid := control_valid_out; - when DIV => - v.d.valid := control_valid_out; when NONE => v.e.valid := control_valid_out; v.e.insn_type := OP_ILLEGAL; @@ -419,7 +367,6 @@ begin if rst = '1' then v.e := Decode2ToExecute1Init; v.l := Decode2ToLoadStore1Init; - v.d := Decode2ToDividerInit; end if; -- Update registers @@ -428,6 +375,5 @@ begin -- Update outputs e_out <= r.e; l_out <= r.l; - d_out <= r.d; end process; end architecture behaviour; diff --git a/decode_types.vhdl b/decode_types.vhdl index 9860406..fdc1e6e 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -8,7 +8,7 @@ package decode_types is OP_CNTZ, OP_CRAND, OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC, OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, - OP_DCBZ, OP_DIV, OP_EXTS, + OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, OP_MCRF, OP_MCRXR, OP_MCRXRX, 
OP_MFCR, OP_MFSPR, OP_MOD, @@ -46,7 +46,7 @@ package decode_types is constant TOO_OFFSET : integer := 0; - type unit_t is (NONE, ALU, LDST, DIV); + type unit_t is (NONE, ALU, LDST); type length_t is (NONE, is1B, is2B, is4B, is8B); type decode_rom_t is record diff --git a/divider.vhdl b/divider.vhdl index affab85..33d2a0d 100644 --- a/divider.vhdl +++ b/divider.vhdl @@ -10,8 +10,8 @@ entity divider is port ( clk : in std_logic; rst : in std_logic; - d_in : in Decode2ToDividerType; - d_out : out DividerToWritebackType + d_in : in Execute1ToDividerType; + d_out : out DividerToExecute1Type ); end entity divider; @@ -23,7 +23,6 @@ architecture behaviour of divider is signal sresult : std_ulogic_vector(64 downto 0); signal oresult : std_ulogic_vector(63 downto 0); signal running : std_ulogic; - signal signcheck : std_ulogic; signal count : unsigned(6 downto 0); signal neg_result : std_ulogic; signal is_modulus : std_ulogic; @@ -48,7 +47,7 @@ begin running <= '0'; count <= "0000000"; elsif d_in.valid = '1' then - if d_in.is_extended = '1' and not (d_in.is_signed = '1' and d_in.dividend(63) = '1') then + if d_in.is_extended = '1' then dend <= '0' & d_in.dividend & x"0000000000000000"; else dend <= '0' & x"0000000000000000" & d_in.dividend; @@ -56,7 +55,7 @@ begin div <= unsigned(d_in.divisor); quot <= (others => '0'); write_reg <= d_in.write_reg; - neg_result <= '0'; + neg_result <= d_in.neg_result; is_modulus <= d_in.is_modulus; extended <= d_in.is_extended; is_32bit <= d_in.is_32bit; @@ -68,20 +67,6 @@ begin running <= '1'; overflow <= '0'; ovf32 <= '0'; - signcheck <= d_in.is_signed and (d_in.dividend(63) or d_in.divisor(63)); - elsif signcheck = '1' then - signcheck <= '0'; - neg_result <= dend(63) xor (div(63) and not is_modulus); - if dend(63) = '1' then - if extended = '1' then - dend <= '0' & std_ulogic_vector(- signed(dend(63 downto 0))) & x"0000000000000000"; - else - dend <= '0' & x"0000000000000000" & std_ulogic_vector(- signed(dend(63 downto 0))); - end if; - 
end if; - if div(63) = '1' then - div <= unsigned(- signed(div)); - end if; elsif running = '1' then if count = "0111111" then running <= '0'; @@ -151,12 +136,10 @@ begin if rising_edge(clk) then d_out.valid <= '0'; d_out.write_reg_data <= oresult; - d_out.write_reg_enable <= '0'; d_out.write_xerc_enable <= '0'; d_out.xerc <= xerc; if count = "1000000" then d_out.valid <= '1'; - d_out.write_reg_enable <= '1'; d_out.write_xerc_enable <= oe; -- We must test oe because the RC update code in writeback diff --git a/divider_tb.vhdl b/divider_tb.vhdl index 5f809bb..8151315 100644 --- a/divider_tb.vhdl +++ b/divider_tb.vhdl @@ -16,8 +16,8 @@ architecture behave of divider_tb is signal rst : std_ulogic; constant clk_period : time := 10 ns; - signal d1 : Decode2ToDividerType; - signal d2 : DividerToWritebackType; + signal d1 : Execute1ToDividerType; + signal d2 : DividerToExecute1Type; begin divider_0: entity work.divider port map (clk => clk, rst => rst, d_in => d1, d_out => d2); @@ -50,6 +50,7 @@ begin d1.is_32bit <= '0'; d1.is_extended <= '0'; d1.is_modulus <= '0'; + d1.neg_result <= '0'; d1.rc <= '0'; wait for clk_period; @@ -65,7 +66,6 @@ begin end loop; assert d2.valid = '1'; - assert d2.write_reg_enable = '1'; assert d2.write_reg_nr = "10001"; assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data); assert d2.rc = '0'; @@ -89,7 +89,6 @@ begin end loop; assert d2.valid = '1'; - assert d2.write_reg_enable = '1'; assert d2.write_reg_nr = "10001"; assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data); assert d2.rc = '1'; @@ -105,9 +104,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= 
'1'; + d1.neg_result <= ra(63) xor rb(63); d1.valid <= '1'; wait for clk_period; @@ -142,6 +142,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.valid <= '1'; wait for clk_period; @@ -173,9 +174,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.is_extended <= '1'; d1.valid <= '1'; @@ -216,6 +218,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '1'; d1.valid <= '1'; @@ -250,9 +253,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -289,6 +293,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -322,9 +327,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 32)) & x"00000000"; rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63) xor rb(63); d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -365,6 +371,7 @@ begin 
d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.valid <= '1'; @@ -398,9 +405,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63); d1.is_extended <= '0'; d1.is_32bit <= '0'; d1.is_modulus <= '1'; @@ -438,6 +446,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '0'; d1.is_modulus <= '1'; @@ -472,9 +481,10 @@ begin ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64)); rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64)); - d1.dividend <= ra; - d1.divisor <= rb; + d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra)); + d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb)); d1.is_signed <= '1'; + d1.neg_result <= ra(63); d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.is_modulus <= '1'; @@ -517,6 +527,7 @@ begin d1.dividend <= ra; d1.divisor <= rb; d1.is_signed <= '0'; + d1.neg_result <= '0'; d1.is_extended <= '0'; d1.is_32bit <= '1'; d1.is_modulus <= '1'; diff --git a/execute1.vhdl b/execute1.vhdl index 710044f..7bcffdc 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -13,6 +13,7 @@ use work.ppc_fx_insns.all; entity execute1 is port ( clk : in std_ulogic; + rst : in std_ulogic; -- asynchronous flush_out : out std_ulogic; @@ -36,6 +37,7 @@ architecture behaviour of execute1 is lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); mul_in_progress : std_ulogic; + div_in_progress : std_ulogic; end record; signal r, rin : reg_type; @@ -53,6 +55,10 @@ architecture behaviour of execute1 is signal 
x_to_multiply: Execute1ToMultiplyType; signal multiply_to_x: MultiplyToExecute1Type; + -- divider signals + signal x_to_divider: Execute1ToDividerType; + signal divider_to_x: DividerToExecute1Type; + procedure set_carry(e: inout Execute1ToWritebackType; carry32 : in std_ulogic; carry : in std_ulogic) is @@ -135,6 +141,14 @@ begin m_out => multiply_to_x ); + divider_0: entity work.divider + port map ( + clk => clk, + rst => rst, + d_in => x_to_divider, + d_out => divider_to_x + ); + execute1_0: process(clk) begin if rising_edge(clk) then @@ -171,6 +185,8 @@ begin variable l : std_ulogic; variable next_nia : std_ulogic_vector(63 downto 0); variable carry_32, carry_64 : std_ulogic; + variable sign1, sign2 : std_ulogic; + variable abs1, abs2 : signed(63 downto 0); begin result := (others => '0'); result_with_carry := (others => '0'); @@ -217,6 +233,7 @@ begin v.lr_update := '0'; v.mul_in_progress := '0'; + v.div_in_progress := '0'; -- signals to multiply unit x_to_multiply <= Execute1ToMultiplyInit; @@ -249,6 +266,59 @@ begin end if; end if; + -- signals to divide unit + sign1 := '0'; + sign2 := '0'; + if e_in.is_signed = '1' then + if e_in.is_32bit = '1' then + sign1 := e_in.read_data1(31); + sign2 := e_in.read_data2(31); + else + sign1 := e_in.read_data1(63); + sign2 := e_in.read_data2(63); + end if; + end if; + -- take absolute values + if sign1 = '0' then + abs1 := signed(e_in.read_data1); + else + abs1 := - signed(e_in.read_data1); + end if; + if sign2 = '0' then + abs2 := signed(e_in.read_data2); + else + abs2 := - signed(e_in.read_data2); + end if; + + x_to_divider <= Execute1ToDividerInit; + x_to_divider.write_reg <= gspr_to_gpr(e_in.write_reg); + x_to_divider.is_signed <= e_in.is_signed; + x_to_divider.is_32bit <= e_in.is_32bit; + if e_in.insn_type = OP_MOD then + x_to_divider.is_modulus <= '1'; + end if; + x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus); + x_to_divider.rc <= e_in.rc; + x_to_divider.oe <= e_in.oe; + 
x_to_divider.xerc <= v.e.xerc; + if e_in.is_32bit = '0' then + -- 64-bit forms + if e_in.insn_type = OP_DIVE then + x_to_divider.is_extended <= '1'; + end if; + x_to_divider.dividend <= std_ulogic_vector(abs1); + x_to_divider.divisor <= std_ulogic_vector(abs2); + else + -- 32-bit forms + x_to_divider.is_extended <= '0'; + if e_in.insn_type = OP_DIVE then -- extended forms + x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000"; + else + x_to_divider.dividend <= x"00000000" & std_ulogic_vector(abs1(31 downto 0)); + end if; + x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0)); + end if; + ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); @@ -550,13 +620,19 @@ begin when OP_ICBI => icache_inval <= '1'; - when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => + when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => v.e.valid := '0'; v.mul_in_progress := '1'; stall_out <= '1'; x_to_multiply.valid <= '1'; - when others => + when OP_DIV | OP_DIVE | OP_MOD => + v.e.valid := '0'; + v.div_in_progress := '1'; + stall_out <= '1'; + x_to_divider.valid <= '1'; + + when others => terminate_out <= '1'; report "illegal"; end case; @@ -603,6 +679,21 @@ begin stall_out <= '1'; v.mul_in_progress := '1'; end if; + elsif r.div_in_progress = '1' then + if divider_to_x.valid = '1' then + v.e.write_reg := gpr_to_gspr(divider_to_x.write_reg_nr); + result := divider_to_x.write_reg_data; + result_en := '1'; + v.e.rc := divider_to_x.rc; + v.e.xerc := divider_to_x.xerc; + v.e.write_xerc_enable := divider_to_x.write_xerc_enable; + v.e.valid := '1'; + v.e.write_len := x"8"; + v.e.sign_extend := '0'; + else + stall_out <= '1'; + v.div_in_progress := '1'; + end if; end if; v.e.write_data := result; diff --git a/writeback.vhdl b/writeback.vhdl index 1323f71..08efe91 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -12,7 +12,6 @@ entity writeback is e_in : in Execute1ToWritebackType; l_in : in 
DcacheToWritebackType; - d_in : in DividerToWritebackType; w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType; @@ -66,28 +65,21 @@ begin begin x := "" & e_in.valid; y := "" & l_in.valid; - z := "" & d_in.valid; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; x := "" & e_in.write_enable; y := "" & l_in.write_enable; - z := "" & d_in.write_reg_enable; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; w := "" & e_in.write_cr_enable; x := "" & (e_in.write_enable and e_in.rc); - z := "" & (d_in.valid and d_in.rc); - assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(z))) <= 1 severity failure; - - x := "" & e_in.write_xerc_enable; - z := "" & D_in.write_xerc_enable; - assert (to_integer(unsigned(x)) + to_integer(unsigned(z))) <= 1 severity failure; + assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure; w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; complete_out <= '0'; - if e_in.valid = '1' or l_in.valid = '1' or d_in.valid = '1' then + if e_in.valid = '1' or l_in.valid = '1' then complete_out <= '1'; end if; @@ -138,19 +130,6 @@ begin xe := l_in.xerc; end if; - if d_in.write_reg_enable = '1' then - w_out.write_enable <= '1'; - w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr); - data_in <= d_in.write_reg_data; - rc <= d_in.rc; - xe := d_in.xerc; - end if; - - if d_in.write_xerc_enable = '1' then - c_out.write_xerc_enable <= '1'; - c_out.write_xerc_data <= d_in.xerc; - end if; - -- shift and byte-reverse data bytes for i in 0 to 7 loop k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);