From 955fa561fb7bb0c27b427ea2a84e5bcea2e63342 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 17 Jan 2024 21:05:30 +1100 Subject: [PATCH 01/24] FPU: Move most result_sign computation out of state machine This moves the computation of r.result_sign out of the various states for most instructions. Now the sign is mostly computed in the first cycle (when e_in.valid is true). The set of operations done on r.result_sign in the state machine is now restricted to 5 (other than no change): invert, xor with r.is_subtract, or set to the sign of A, B or C. Similarly, r.is_subtract and r.negate are computed in the first cycle now. Signed-off-by: Paul Mackerras --- fpu.vhdl | 109 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 45 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index f07f9d1..12181cf 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -811,7 +811,6 @@ begin variable mshift : signed(EXP_BITS-1 downto 0); variable need_check : std_ulogic; variable msb : std_ulogic; - variable is_add : std_ulogic; variable set_a : std_ulogic; variable set_a_exp : std_ulogic; variable set_a_mant : std_ulogic; @@ -889,6 +888,7 @@ begin v.divmod := '0'; v.is_sqrt := '0'; v.is_multiply := '0'; + v.is_subtract := '0'; fpin_a := '0'; fpin_b := '0'; fpin_c := '0'; @@ -896,6 +896,8 @@ begin v.use_b := e_in.valid_b; v.use_c := e_in.valid_c; v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN); + v.result_sign := '0'; + v.negate := '0'; case e_in.op is when OP_FP_ARITH => fpin_a := e_in.valid_a; @@ -913,6 +915,25 @@ begin if e_in.insn(5 downto 1) = "01111" then v.round_mode := "001"; end if; + case e_in.insn(5 downto 1) is + when "10100" | "10101" => -- fadd and fsub + v.result_sign := e_in.fra(63); + if unsigned(e_in.fra(62 downto 52)) <= unsigned(e_in.frb(62 downto 52)) then + v.result_sign := e_in.frb(63) xnor e_in.insn(1); + end if; + v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor e_in.insn(1)); + when "11001" => -- fmul + v.result_sign 
:= e_in.fra(63) xor e_in.frc(63); + when "11100" | "11101" | "11110" | "11111" => --fmadd family + v.result_sign := e_in.fra(63) xor e_in.frc(63); + v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor + e_in.frc(63) xor e_in.insn(1)); + v.negate := e_in.insn(2); + when "10010" => -- fdiv + v.result_sign := e_in.fra(63) xor e_in.frb(63); + when others => + v.result_sign := e_in.frb(63); + end case; when OP_FP_CMP => fpin_a := e_in.valid_a; fpin_b := e_in.valid_b; @@ -921,6 +942,12 @@ begin v.fp_rc := e_in.rc; opcbits := e_in.insn(10) & e_in.insn(8) & e_in.insn(4) & e_in.insn(2) & e_in.insn(1); exec_state := misc_decode(to_integer(unsigned(opcbits))); + case opcbits is + when "10110" => -- fcfid + v.result_sign := e_in.frb(63); + when others => + v.result_sign := '0'; + end case; when OP_FP_MOVE => v.fp_rc := e_in.rc; fpin_a := e_in.valid_a; @@ -928,22 +955,49 @@ begin fpin_c := e_in.valid_c; if e_in.insn(5) = '0' then exec_state := DO_FMR; + if e_in.insn(9) = '1' then + v.result_sign := '0'; -- fabs + elsif e_in.insn(8) = '1' then + v.result_sign := '1'; -- fnabs + elsif e_in.insn(7) = '1' then + v.result_sign := e_in.frb(63); -- fmr + elsif e_in.insn(6) = '1' then + v.result_sign := not e_in.frb(63); -- fneg + else + v.result_sign := e_in.fra(63); -- fcpsgn + end if; else exec_state := DO_FSEL; + v.result_sign := e_in.frb(63); end if; when OP_DIV => v.integer_op := '1'; is_32bint := e_in.single; + if e_in.single = '0' then + v.result_sign := e_in.is_signed and (e_in.fra(63) xor e_in.frb(63)); + else + v.result_sign := e_in.is_signed and (e_in.fra(31) xor e_in.frb(31)); + end if; exec_state := DO_IDIVMOD; when OP_DIVE => v.integer_op := '1'; v.divext := '1'; is_32bint := e_in.single; + if e_in.single = '0' then + v.result_sign := e_in.is_signed and (e_in.fra(63) xor e_in.frb(63)); + else + v.result_sign := e_in.is_signed and (e_in.fra(31) xor e_in.frb(31)); + end if; exec_state := DO_IDIVMOD; when OP_MOD => v.integer_op := '1'; v.divmod := '1'; is_32bint := 
e_in.single; + if e_in.single = '0' then + v.result_sign := e_in.is_signed and e_in.fra(63); + else + v.result_sign := e_in.is_signed and e_in.fra(31); + end if; exec_state := DO_IDIVMOD; when others => exec_state := DO_ILLEGAL; @@ -951,7 +1005,6 @@ begin v.quieten_nan := '1'; v.tiny := '0'; v.denorm := '0'; - v.is_subtract := '0'; v.add_bsmall := '0'; v.int_ovf := '0'; v.div_close := '0'; @@ -1096,7 +1149,6 @@ begin case r.state is when IDLE => v.invalid := '0'; - v.negate := '0'; if e_in.valid = '1' then v.opsel_a := AIN_B; v.busy := '1'; @@ -1319,24 +1371,12 @@ begin re_sel2 <= REXP2_B; re_set_result <= '1'; v.quieten_nan := '0'; - if r.insn(9) = '1' then - v.result_sign := '0'; -- fabs - elsif r.insn(8) = '1' then - v.result_sign := '1'; -- fnabs - elsif r.insn(7) = '1' then - v.result_sign := r.b.negative; -- fmr - elsif r.insn(6) = '1' then - v.result_sign := not r.b.negative; -- fneg - else - v.result_sign := r.a.negative; -- fcpsgn - end if; v.writing_fpr := '1'; v.instr_done := '1'; when DO_FRI => -- fri[nzpm] -- r.opsel_a = AIN_B v.result_class := r.b.class; - v.result_sign := r.b.negative; re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to exponent - 52 @@ -1365,7 +1405,6 @@ begin when DO_FRSP => -- r.opsel_a = AIN_B, r.shift = 0 v.result_class := r.b.class; - v.result_sign := r.b.negative; re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to exponent - -126 @@ -1398,7 +1437,6 @@ begin -- instr bit 1: 1=round to zero 0=use fpscr[RN] -- r.opsel_a = AIN_B v.result_class := r.b.class; - v.result_sign := r.b.negative; re_sel2 <= REXP2_B; re_set_result <= '1'; rs_sel1 <= RSH1_B; @@ -1441,12 +1479,10 @@ begin when DO_FCFID => -- r.opsel_a = AIN_B - v.result_sign := '0'; if r.insn(8) = '0' and r.b.negative = '1' then -- fcfid[s] with negative operand, set R = -B opsel_ainv <= '1'; carry_in <= '1'; - v.result_sign := '1'; end if; v.result_class := r.b.class; re_con2 <= RECON2_UNIT; @@ -1462,7 +1498,6 @@ begin when DO_FADD => -- fadd[s] and fsub[s] -- 
r.opsel_a = AIN_A - v.result_sign := r.a.negative; v.result_class := r.a.class; re_sel1 <= REXP1_A; re_set_result <= '1'; @@ -1472,13 +1507,10 @@ begin rs_sel2 <= RSH2_A; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - is_add := r.a.negative xor r.b.negative xor r.insn(1); - v.is_subtract := not is_add; if r.a.class = FINITE and r.b.class = FINITE then v.add_bsmall := r.exp_cmp; v.opsel_a := AIN_B; if r.exp_cmp = '0' then - v.result_sign := r.b.negative xnor r.insn(1); if r.a.exponent = r.b.exponent then v.state := ADD_2; else @@ -1491,7 +1523,7 @@ begin else if r.a.class = NAN or r.b.class = NAN then v.state := NAN_RESULT; - elsif r.a.class = INFINITY and r.b.class = INFINITY and is_add = '0' then + elsif r.a.class = INFINITY and r.b.class = INFINITY and r.is_subtract = '1' then -- invalid operation, construct QNaN v.fpscr(FPSCR_VXISI) := '1'; qnan_result := '1'; @@ -1502,7 +1534,6 @@ begin else -- result is +/- B v.opsel_a := AIN_B; - v.result_sign := r.b.negative xnor r.insn(1); v.state := EXC_RESULT; end if; end if; @@ -1510,7 +1541,6 @@ begin when DO_FMUL => -- fmul[s] -- r.opsel_a = AIN_A unless C is denorm and A isn't - v.result_sign := r.a.negative xor r.c.negative; v.result_class := r.a.class; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; @@ -1550,7 +1580,6 @@ begin v.result_class := r.a.class; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - v.result_sign := r.a.negative xor r.b.negative; re_sel1 <= REXP1_A; re_sel2 <= REXP2_B; re_neg2 <= '1'; @@ -1599,7 +1628,6 @@ begin v.result_sign := r.c.negative; else v.opsel_a := AIN_B; - v.result_sign := r.b.negative; end if; v.quieten_nan := '0'; v.state := EXC_RESULT; @@ -1607,7 +1635,6 @@ begin when DO_FSQRT => -- r.opsel_a = AIN_B v.result_class := r.b.class; - v.result_sign := r.b.negative; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; re_sel2 <= REXP2_B; @@ -1643,7 +1670,6 @@ begin when DO_FRE => -- r.opsel_a = AIN_B v.result_class := r.b.class; - v.result_sign := r.b.negative; 
v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; re_sel2 <= REXP2_B; @@ -1669,7 +1695,6 @@ begin when DO_FRSQRTE => -- r.opsel_a = AIN_B v.result_class := r.b.class; - v.result_sign := r.b.negative; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; re_sel2 <= REXP2_B; @@ -1708,7 +1733,6 @@ begin -- fmadd, fmsub, fnmadd, fnmsub -- r.opsel_a = AIN_A if A is denorm, else AIN_C if C is denorm, -- else AIN_B - v.result_sign := r.a.negative; v.result_class := r.a.class; -- put a.exp + c.exp into result_exp re_sel1 <= REXP1_A; @@ -1718,9 +1742,6 @@ begin rs_sel1 <= RSH1_B; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - is_add := r.a.negative xor r.c.negative xor r.b.negative xor r.insn(1); - v.negate := r.insn(2); - v.is_subtract := not is_add; if r.a.class = FINITE and r.c.class = FINITE and (r.b.class = FINITE or r.b.class = ZERO) then -- Make sure A and C are normalized @@ -1730,13 +1751,13 @@ begin v.state := RENORM_C; elsif r.b.class = ZERO then -- no addend, degenerates to multiply - v.result_sign := r.a.negative xor r.c.negative; f_to_multiply.valid <= '1'; v.is_multiply := '1'; v.state := MULT_1; elsif r.madd_cmp = '0' then -- addend is bigger, do multiply first - v.result_sign := r.b.negative xnor r.insn(1); + -- if subtracting, sign is opposite to initial estimate + v.result_sign := r.result_sign xor r.is_subtract; f_to_multiply.valid <= '1'; v.first := '1'; v.state := FMADD_0; @@ -1753,21 +1774,20 @@ begin v.fpscr(FPSCR_VXIMZ) := '1'; qnan_result := '1'; elsif r.a.class = INFINITY or r.c.class = INFINITY then - if r.b.class = INFINITY and is_add = '0' then + if r.b.class = INFINITY and r.is_subtract = '1' then -- invalid operation, construct QNaN v.fpscr(FPSCR_VXISI) := '1'; qnan_result := '1'; else -- result is infinity v.result_class := INFINITY; - v.result_sign := r.a.negative xor r.c.negative; arith_done := '1'; end if; else -- Here A is zero, C is zero, or B is infinity -- Result is +/-B in all of those cases v.opsel_a := AIN_B; - v.result_sign 
:= r.b.negative xnor r.insn(1); + v.result_sign := r.result_sign xor r.is_subtract; v.state := EXC_RESULT; end if; end if; @@ -1970,7 +1990,7 @@ begin -- product is bigger here -- shift B right and use it as the addend to the multiplier -- for subtract, multiplier does B - A * C - v.result_sign := r.a.negative xor r.c.negative xor r.is_subtract; + v.result_sign := r.result_sign xor r.is_subtract; re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to b.exp - result_exp + 64 @@ -2638,7 +2658,6 @@ begin when DO_IDIVMOD => -- r.opsel_a = AIN_B - v.result_sign := r.is_signed and (r.a.negative xor (r.b.negative and not r.divmod)); if r.b.class = ZERO then -- B is zero, signal overflow v.int_ovf := '1'; @@ -3168,7 +3187,7 @@ begin end case; - rsign := v.result_sign; + rsign := r.result_sign; if zero_divide = '1' then v.fpscr(FPSCR_ZX) := '1'; end if; @@ -3191,10 +3210,10 @@ begin v.writing_fpr := '1'; v.update_fprf := '1'; end if; - if v.is_subtract = '1' and v.result_class = ZERO then + if r.is_subtract = '1' and v.result_class = ZERO then rsign := r.round_mode(0) and r.round_mode(1); end if; - if v.negate = '1' and v.result_class /= NAN then + if r.negate = '1' and v.result_class /= NAN then rsign := not rsign; end if; v.instr_done := '1'; From 71b7df679b46a85e367c83927d5bbb15c78b2892 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 18 Jan 2024 22:06:13 +1100 Subject: [PATCH 02/24] FPU: Calculate quieten_nan in first cycle Signed-off-by: Paul Mackerras --- fpu.vhdl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 12181cf..72385a3 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -898,6 +898,7 @@ begin v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN); v.result_sign := '0'; v.negate := '0'; + v.quieten_nan := '1'; case e_in.op is when OP_FP_ARITH => fpin_a := e_in.valid_a; @@ -953,6 +954,7 @@ begin fpin_a := e_in.valid_a; fpin_b := e_in.valid_b; fpin_c := e_in.valid_c; + v.quieten_nan := '0'; if e_in.insn(5) = 
'0' then exec_state := DO_FMR; if e_in.insn(9) = '1' then @@ -1002,7 +1004,6 @@ begin when others => exec_state := DO_ILLEGAL; end case; - v.quieten_nan := '1'; v.tiny := '0'; v.denorm := '0'; v.add_bsmall := '0'; @@ -1370,7 +1371,6 @@ begin v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; - v.quieten_nan := '0'; v.writing_fpr := '1'; v.instr_done := '1'; @@ -1629,7 +1629,6 @@ begin else v.opsel_a := AIN_B; end if; - v.quieten_nan := '0'; v.state := EXC_RESULT; when DO_FSQRT => @@ -3575,7 +3574,7 @@ begin v.sp_result := r.single_prec; v.int_result := int_result; v.illegal := illegal; - v.nsnan_result := v.quieten_nan; + v.nsnan_result := r.quieten_nan; v.res_sign := rsign; if r.integer_op = '1' then v.cr_mask := num_to_fxm(0); From 27b3e4235347ad0bdc183254bfc7bd901091ae3a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 19 Jan 2024 09:37:16 +1100 Subject: [PATCH 03/24] FPU: Move result_sign computations from state machine to a data path Instead of operating on result_sign directly, the state machine now sets a control variable "rsgn_op" that then directs a tiny ALU to do what's required. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 72385a3..7558493 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -287,6 +287,11 @@ architecture behaviour of fpu is signal rs_neg2 : std_ulogic; signal rs_norm : std_ulogic; + constant RSGN_NOP : std_ulogic_vector(1 downto 0) := "00"; + constant RSGN_INV : std_ulogic_vector(1 downto 0) := "01"; + constant RSGN_SUB : std_ulogic_vector(1 downto 0) := "10"; + constant RSGN_SEL : std_ulogic_vector(1 downto 0) := "11"; + constant arith_decode : decode32 := ( -- indexed by bits 5..1 of opcode 2#01000# => DO_FRI, @@ -851,6 +856,7 @@ begin variable int_result : std_ulogic; variable illegal : std_ulogic; variable rsign : std_ulogic; + variable rsgn_op : std_ulogic_vector(1 downto 0); begin v := r; v.complete := '0'; @@ -1147,6 +1153,8 @@ begin rs_neg2 <= '0'; rs_norm <= '0'; + rsgn_op := RSGN_NOP; + case r.state is when IDLE => v.invalid := '0'; @@ -1625,7 +1633,7 @@ begin when DO_FSEL => if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then v.opsel_a := AIN_C; - v.result_sign := r.c.negative; + rsgn_op := RSGN_SEL; else v.opsel_a := AIN_B; end if; @@ -1756,7 +1764,7 @@ begin elsif r.madd_cmp = '0' then -- addend is bigger, do multiply first -- if subtracting, sign is opposite to initial estimate - v.result_sign := r.result_sign xor r.is_subtract; + rsgn_op := RSGN_SUB; f_to_multiply.valid <= '1'; v.first := '1'; v.state := FMADD_0; @@ -1786,7 +1794,7 @@ begin -- Here A is zero, C is zero, or B is infinity -- Result is +/-B in all of those cases v.opsel_a := AIN_B; - v.result_sign := r.result_sign xor r.is_subtract; + rsgn_op := RSGN_SUB; v.state := EXC_RESULT; end if; end if; @@ -1913,7 +1921,7 @@ begin re_sel2 <= REXP2_NE; if r.r(63) = '1' then -- result is opposite sign to expected - v.result_sign := not r.result_sign; + rsgn_op := RSGN_INV; opsel_ainv <= '1'; carry_in <= '1'; v.state := 
FINISH; @@ -1989,7 +1997,7 @@ begin -- product is bigger here -- shift B right and use it as the addend to the multiplier -- for subtract, multiplier does B - A * C - v.result_sign := r.result_sign xor r.is_subtract; + rsgn_op := RSGN_SUB; re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to b.exp - result_exp + 64 @@ -2031,7 +2039,7 @@ begin when FMADD_5 => -- negate R:S:X if negative if r.r(63) = '1' then - v.result_sign := not r.result_sign; + rsgn_op := RSGN_INV; opsel_ainv <= '1'; carry_in <= not (s_nz or r.x); opsel_s <= S_NEG; @@ -2629,14 +2637,12 @@ begin end if; if r.use_a = '1' and r.a.class = NAN then v.opsel_a := AIN_A; - v.result_sign := r.a.negative; elsif r.use_b = '1' and r.b.class = NAN then v.opsel_a := AIN_B; - v.result_sign := r.b.negative; elsif r.use_c = '1' and r.c.class = NAN then v.opsel_a := AIN_C; - v.result_sign := r.c.negative; end if; + rsgn_op := RSGN_SEL; v.state := EXC_RESULT; when EXC_RESULT => @@ -3186,6 +3192,24 @@ begin end case; + case rsgn_op is + when RSGN_SEL => + case v.opsel_a is + when AIN_A => + v.result_sign := r.a.negative; + when AIN_B => + v.result_sign := r.b.negative; + when AIN_C => + v.result_sign := r.c.negative; + when others => + end case; + when RSGN_SUB => + v.result_sign := r.result_sign xor r.is_subtract; + when RSGN_INV => + v.result_sign := not r.result_sign; + when others => + end case; + rsign := r.result_sign; if zero_divide = '1' then v.fpscr(FPSCR_ZX) := '1'; From 707dd619a039240304a6245480dff05c70b219b6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 5 Feb 2024 14:25:10 +1100 Subject: [PATCH 04/24] FPU: Move NaN/infinity and zero/denorm handling out to separate states This should simplify the DO_* states and hopefully be simpler overall. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 624 +++++++++++++++++++++++++++---------------------------- 1 file changed, 305 insertions(+), 319 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 7558493..45f5fe0 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -41,11 +41,13 @@ architecture behaviour of fpu is class : fp_number_class; negative : std_ulogic; denorm : std_ulogic; + naninf : std_ulogic; + zeroexp : std_ulogic; exponent : signed(EXP_BITS-1 downto 0); -- unbiased mantissa : std_ulogic_vector(63 downto 0); -- 8.56 format end record; - type state_t is (IDLE, DO_ILLEGAL, + type state_t is (IDLE, DO_ILLEGAL, DO_NAN_INF, DO_ZERO_DEN, DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT, DO_FCFID, DO_FCTI, @@ -77,7 +79,7 @@ architecture behaviour of fpu is RENORM_A, RENORM_A2, RENORM_B, RENORM_B2, RENORM_C, RENORM_C2, - NAN_RESULT, EXC_RESULT, + EXC_RESULT, IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3, IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3, IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5, @@ -144,7 +146,9 @@ architecture behaviour of fpu is exp_cmp : std_ulogic; madd_cmp : std_ulogic; add_bsmall : std_ulogic; + is_addition : std_ulogic; is_multiply : std_ulogic; + is_inverse : std_ulogic; is_sqrt : std_ulogic; first : std_ulogic; count : unsigned(1 downto 0); @@ -170,6 +174,8 @@ architecture behaviour of fpu is xerc : xer_common_t; xerc_result : xer_common_t; res_sign : std_ulogic; + res_int : std_ulogic; + exec_state : state_t; end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -567,11 +573,15 @@ architecture behaviour of fpu is begin reg.negative := fpr(63); reg.denorm := '0'; + reg.naninf := '0'; + reg.zeroexp := '0'; exp_nz := or (fpr(62 downto 52)); exp_ao := and (fpr(62 downto 52)); frac_nz := or (fpr(51 downto 0)); low_nz := or (fpr(31 downto 0)); if is_fp = '1' then + reg.naninf := exp_ao; + reg.zeroexp := not exp_nz; reg.denorm := frac_nz and not exp_nz; reg.exponent := 
signed(resize(unsigned(fpr(62 downto 52)), EXP_BITS)) - to_signed(1023, EXP_BITS); if exp_nz = '0' then @@ -724,6 +734,7 @@ begin r.cr_mask <= (others =>'0'); r.cr_result <= (others =>'0'); r.instr_tag.valid <= '0'; + r.exec_state <= IDLE; if rst = '1' then r.fpscr <= (others => '0'); r.comm_fpscr <= (others => '0'); @@ -853,16 +864,21 @@ begin variable rsh_in2 : signed(EXP_BITS-1 downto 0); variable exec_state : state_t; variable opcbits : std_ulogic_vector(4 downto 0); - variable int_result : std_ulogic; variable illegal : std_ulogic; variable rsign : std_ulogic; variable rsgn_op : std_ulogic_vector(1 downto 0); + variable is_nan_inf : std_ulogic; + variable is_zero_den : std_ulogic; + variable sign_inv : std_ulogic; begin v := r; v.complete := '0'; v.do_intr := '0'; is_32bint := '0'; exec_state := IDLE; + is_nan_inf := '0'; + is_zero_den := '0'; + sign_inv := '0'; if r.complete = '1' or r.do_intr = '1' then v.instr_done := '0'; @@ -894,7 +910,9 @@ begin v.divmod := '0'; v.is_sqrt := '0'; v.is_multiply := '0'; + v.is_addition := '0'; v.is_subtract := '0'; + v.is_inverse := '0'; fpin_a := '0'; fpin_b := '0'; fpin_c := '0'; @@ -905,6 +923,7 @@ begin v.result_sign := '0'; v.negate := '0'; v.quieten_nan := '1'; + v.int_result := '0'; case e_in.op is when OP_FP_ARITH => fpin_a := e_in.valid_a; @@ -913,32 +932,40 @@ begin v.longmask := e_in.single; v.fp_rc := e_in.rc; exec_state := arith_decode(to_integer(unsigned(e_in.insn(5 downto 1)))); - if e_in.insn(5 downto 1) = "11001" or e_in.insn(5 downto 3) = "111" then - v.is_multiply := '1'; - end if; if e_in.insn(5 downto 1) = "10110" or e_in.insn(5 downto 1) = "11010" then v.is_sqrt := '1'; end if; - if e_in.insn(5 downto 1) = "01111" then + if e_in.insn(5 downto 1) = "01111" then -- fcti*z v.round_mode := "001"; end if; case e_in.insn(5 downto 1) is when "10100" | "10101" => -- fadd and fsub + v.is_addition := '1'; v.result_sign := e_in.fra(63); if unsigned(e_in.fra(62 downto 52)) <= unsigned(e_in.frb(62 downto 52)) then 
v.result_sign := e_in.frb(63) xnor e_in.insn(1); end if; v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor e_in.insn(1)); when "11001" => -- fmul + v.is_multiply := '1'; v.result_sign := e_in.fra(63) xor e_in.frc(63); when "11100" | "11101" | "11110" | "11111" => --fmadd family + v.is_multiply := '1'; + v.is_addition := '1'; v.result_sign := e_in.fra(63) xor e_in.frc(63); v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor e_in.frc(63) xor e_in.insn(1)); v.negate := e_in.insn(2); when "10010" => -- fdiv + v.is_inverse := '1'; v.result_sign := e_in.fra(63) xor e_in.frb(63); - when others => + when "11000" | "11010" => -- fre and frsqrte + v.is_inverse := '1'; + v.result_sign := e_in.frb(63); + when "01110" | "01111" => -- fcti* + v.int_result := '1'; + v.result_sign := e_in.frb(63); + when others => -- fri* and frsp v.result_sign := e_in.frb(63); end case; when OP_FP_CMP => @@ -950,6 +977,10 @@ begin opcbits := e_in.insn(10) & e_in.insn(8) & e_in.insn(4) & e_in.insn(2) & e_in.insn(1); exec_state := misc_decode(to_integer(unsigned(opcbits))); case opcbits is + when "10010" | "11010" | "10011" => + -- fmrg*, mffs + v.int_result := '1'; + v.result_sign := '0'; when "10110" => -- fcfid v.result_sign := e_in.frb(63); when others => @@ -1023,6 +1054,11 @@ begin v.b := bdec; v.c := cdec; + if e_in.op = OP_FP_ARITH then + is_nan_inf := adec.naninf or bdec.naninf or cdec.naninf; + is_zero_den := adec.zeroexp or bdec.zeroexp or cdec.zeroexp; + end if; + v.exp_cmp := '0'; if adec.exponent > bdec.exponent then v.exp_cmp := '1'; @@ -1137,7 +1173,6 @@ begin rbit_inc := '0'; mult_mask := '0'; rnd_b32 := '0'; - int_result := '0'; illegal := '0'; re_sel1 <= REXP1_ZERO; @@ -1165,32 +1200,176 @@ begin (e_in.valid_b = '0' or e_in.valid_c = '0') then v.opsel_a := AIN_A; end if; - if e_in.op = OP_FP_ARITH then - -- input selection for denorm cases - case e_in.insn(5 downto 1) is - when "10010" => -- fdiv - if v.b.mantissa(UNIT_BIT) = '0' and v.a.mantissa(UNIT_BIT) = '1' then - 
v.opsel_a := AIN_B; - end if; - when "11001" => -- fmul - if v.c.mantissa(UNIT_BIT) = '0' and v.a.mantissa(UNIT_BIT) = '1' then - v.opsel_a := AIN_C; - end if; - when "11100" | "11101" | "11110" | "11111" => -- fmadd etc. - if v.a.mantissa(UNIT_BIT) = '0' then - v.opsel_a := AIN_A; - elsif v.c.mantissa(UNIT_BIT) = '0' then - v.opsel_a := AIN_C; - end if; - when others => - end case; + v.exec_state := exec_state; + if is_nan_inf = '1' then + v.state := DO_NAN_INF; + elsif is_zero_den = '1' then + v.state := DO_ZERO_DEN; + else + v.state := exec_state; end if; - v.state := exec_state; end if; v.x := '0'; v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX); set_s := '1'; + when DO_NAN_INF => + -- At least one floating-point operand is infinity or NaN + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + + if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or + (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or + (r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then + if r.int_result = '1' then + v.state := INT_OFLOW; + else + if r.a.class = NAN then + v.opsel_a := AIN_A; + elsif r.b.class = NAN then + v.opsel_a := AIN_B; + elsif r.c.class = NAN then + v.opsel_a := AIN_C; + end if; + rsgn_op := RSGN_SEL; + v.state := EXC_RESULT; + end if; + + else + if r.a.class = INFINITY then + if r.is_multiply = '1' and r.c.class = ZERO then + -- invalid operation, construct QNaN + v.fpscr(FPSCR_VXIMZ) := '1'; + qnan_result := '1'; + elsif r.is_subtract = '1' and r.b.class = INFINITY then + v.fpscr(FPSCR_VXISI) := '1'; + qnan_result := '1'; + elsif r.is_inverse = '1' and r.b.class = INFINITY then + v.fpscr(FPSCR_VXIDI) := '1'; + qnan_result := '1'; + else + v.result_class := INFINITY; + end if; + arith_done := '1'; + elsif r.c.class = INFINITY then + if r.is_multiply = '1' and r.a.class = ZERO then + -- invalid operation, construct 
QNaN + v.fpscr(FPSCR_VXIMZ) := '1'; + qnan_result := '1'; + elsif r.is_subtract = '1' and r.b.class = INFINITY then + v.fpscr(FPSCR_VXISI) := '1'; + qnan_result := '1'; + else + v.result_class := INFINITY; + end if; + arith_done := '1'; + else + -- r.b.class = INFINITY + if r.int_result = '1' then + -- fcti* + v.state := INT_OFLOW; + elsif r.is_sqrt = '1' and r.b.negative = '1' then + v.fpscr(FPSCR_VXSQRT) := '1'; + qnan_result := '1'; + elsif r.is_inverse = '1' then + -- fdiv, fre, frsqrte + v.result_class := ZERO; + arith_done := '1'; + else + sign_inv := r.is_multiply and r.is_subtract; + v.result_class := INFINITY; + arith_done := '1'; + end if; + end if; + end if; + + when DO_ZERO_DEN => + -- At least one floating point operand is zero or denormalized + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + if (r.use_a = '1' and r.a.class = ZERO) or + (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') or + (r.use_c = '1' and r.c.class = ZERO) then + if r.use_a = '1' and r.a.class = ZERO then + if r.is_inverse = '1' then + -- fdiv; result is 0 unless B=0 + if r.b.class = ZERO then + v.fpscr(FPSCR_VXZDZ) := '1'; + qnan_result := '1'; + else + v.result_class := ZERO; + end if; + arith_done := '1'; + elsif r.is_addition = '1' then + -- result is +/- B + v.opsel_a := AIN_B; + if r.is_multiply = '1' then + rsgn_op := RSGN_SUB; + end if; + v.state := EXC_RESULT; + else + v.result_class := ZERO; + arith_done := '1'; + end if; + elsif r.use_c = '1' and r.c.class = ZERO then + v.opsel_a := AIN_B; + rsgn_op := RSGN_SUB; + v.state := EXC_RESULT; + else + -- B is zero, other operands are finite + if r.int_result = '1' then + -- fcti*, r.opsel_a = AIN_B + arith_done := '1'; + elsif r.is_inverse = '1' then + -- fdiv, fre, frsqrte + v.result_class := INFINITY; + zero_divide := '1'; + arith_done := '1'; + elsif r.is_addition = '1' then + -- fadd, r.opsel_a = AIN_A + v.result_class := FINITE; + re_sel1 <= REXP1_A; + re_set_result <= '1'; + arith_done := '1'; + else + 
-- other things, result is zero + v.result_class := ZERO; + arith_done := '1'; + end if; + end if; + + else + -- some operand is denorm, and/or it's fmadd/fmsub with B=0 + v.opsel_a := AIN_B; + if r.use_a = '1' and (r.use_b = '0' or r.use_c = '0') then + v.opsel_a := AIN_A; + end if; + -- input selection for denorm cases + case r.insn(5 downto 1) is + when "10010" => -- fdiv + if r.b.mantissa(UNIT_BIT) = '0' and r.a.mantissa(UNIT_BIT) = '1' then + v.opsel_a := AIN_B; + end if; + when "11001" => -- fmul + if r.c.mantissa(UNIT_BIT) = '0' and r.a.mantissa(UNIT_BIT) = '1' then + v.opsel_a := AIN_C; + end if; + when "11100" | "11101" | "11110" | "11111" => -- fmadd etc. + if r.a.mantissa(UNIT_BIT) = '0' then + v.opsel_a := AIN_A; + elsif r.c.mantissa(UNIT_BIT) = '0' then + v.opsel_a := AIN_C; + end if; + when others => + end case; + v.state := r.exec_state; + end if; + when DO_ILLEGAL => illegal := '1'; v.instr_done := '1'; @@ -1323,7 +1502,6 @@ begin -- fmrgew, fmrgow opsel_r <= RES_MISC; misc_sel <= "01" & r.insn(8) & '0'; - int_result := '1'; v.writing_fpr := '1'; v.instr_done := '1'; @@ -1355,7 +1533,6 @@ begin v.illegal := '1'; v.writing_fpr := '0'; end case; - int_result := '1'; v.instr_done := '1'; when DO_MTFSF => @@ -1393,21 +1570,12 @@ begin rs_neg2 <= '1'; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - if r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0' then - -- Signalling NAN - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; - if r.b.class = FINITE then - if r.b.exponent >= to_signed(52, EXP_BITS) then - -- integer already, no rounding required - arith_done := '1'; - else - v.state := FRI_1; - v.round_mode := '1' & r.insn(7 downto 6); - end if; - else + if r.b.exponent >= to_signed(52, EXP_BITS) then + -- integer already, no rounding required arith_done := '1'; + else + v.state := FRI_1; + v.round_mode := '1' & r.insn(7 downto 6); end if; when DO_FRSP => @@ -1421,22 +1589,13 @@ begin rs_neg2 <= '1'; v.fpscr(FPSCR_FR) := '0'; 
v.fpscr(FPSCR_FI) := '0'; - if r.b.class = NAN and r.b.mantissa(53) = '0' then - -- Signalling NAN - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; set_x := '1'; - if r.b.class = FINITE then - if r.b.exponent < to_signed(-126, EXP_BITS) then - v.state := ROUND_UFLOW; - elsif r.b.exponent > to_signed(127, EXP_BITS) then - v.state := ROUND_OFLOW; - else - v.state := ROUNDING; - end if; + if r.b.exponent < to_signed(-126, EXP_BITS) then + v.state := ROUND_UFLOW; + elsif r.b.exponent > to_signed(127, EXP_BITS) then + v.state := ROUND_OFLOW; else - arith_done := '1'; + v.state := ROUNDING; end if; when DO_FCTI => @@ -1451,39 +1610,25 @@ begin rs_neg2 <= '1'; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - if r.b.class = NAN and r.b.mantissa(53) = '0' then - -- Signalling NAN - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; - int_result := '1'; - - case r.b.class is - when ZERO => - arith_done := '1'; - when FINITE => - if r.b.exponent >= to_signed(64, EXP_BITS) or - (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then - v.state := INT_OFLOW; - elsif r.b.exponent >= to_signed(52, EXP_BITS) then - -- integer already, no rounding required, - -- shift into final position - -- set shift to exponent - 56 - rs_con2 <= RSCON2_UNIT; - if r.insn(8) = '1' and r.b.negative = '1' then - v.state := INT_OFLOW; - else - v.state := INT_ISHIFT; - end if; - else - -- set shift to exponent - 52 - rs_con2 <= RSCON2_52; - v.state := INT_SHIFT; - end if; - when INFINITY | NAN => + if r.b.exponent >= to_signed(64, EXP_BITS) or + (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then + v.state := INT_OFLOW; + elsif r.b.exponent >= to_signed(52, EXP_BITS) then + -- integer already, no rounding required, + -- shift into final position + -- set shift to exponent - 56 + rs_con2 <= RSCON2_UNIT; + if r.insn(8) = '1' and r.b.negative = '1' then v.state := INT_OFLOW; - end case; + else + v.state := INT_ISHIFT; + end if; + else + -- set shift to 
exponent - 52 + rs_con2 <= RSCON2_52; + v.state := INT_SHIFT; + end if; when DO_FCFID => -- r.opsel_a = AIN_B @@ -1515,35 +1660,17 @@ begin rs_sel2 <= RSH2_A; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - if r.a.class = FINITE and r.b.class = FINITE then - v.add_bsmall := r.exp_cmp; - v.opsel_a := AIN_B; - if r.exp_cmp = '0' then - if r.a.exponent = r.b.exponent then - v.state := ADD_2; - else - v.longmask := '0'; - v.state := ADD_SHIFT; - end if; + v.add_bsmall := r.exp_cmp; + v.opsel_a := AIN_B; + if r.exp_cmp = '0' then + if r.a.exponent = r.b.exponent then + v.state := ADD_2; else - v.state := ADD_1; + v.longmask := '0'; + v.state := ADD_SHIFT; end if; else - if r.a.class = NAN or r.b.class = NAN then - v.state := NAN_RESULT; - elsif r.a.class = INFINITY and r.b.class = INFINITY and r.is_subtract = '1' then - -- invalid operation, construct QNaN - v.fpscr(FPSCR_VXISI) := '1'; - qnan_result := '1'; - arith_done := '1'; - elsif r.a.class = INFINITY or r.b.class = ZERO then - -- result is A; we're already set up to put A into R - arith_done := '1'; - else - -- result is +/- B - v.opsel_a := AIN_B; - v.state := EXC_RESULT; - end if; + v.state := ADD_1; end if; when DO_FMUL => @@ -1555,32 +1682,14 @@ begin re_sel1 <= REXP1_A; re_sel2 <= REXP2_C; re_set_result <= '1'; - if r.a.class = FINITE and r.c.class = FINITE then - -- Renormalize denorm operands - if r.a.mantissa(UNIT_BIT) = '0' then - v.state := RENORM_A; - elsif r.c.mantissa(UNIT_BIT) = '0' then - v.state := RENORM_C; - else - f_to_multiply.valid <= '1'; - v.state := MULT_1; - end if; + -- Renormalize denorm operands + if r.a.mantissa(UNIT_BIT) = '0' then + v.state := RENORM_A; + elsif r.c.mantissa(UNIT_BIT) = '0' then + v.state := RENORM_C; else - if r.a.class = NAN or r.c.class = NAN then - v.state := NAN_RESULT; - elsif (r.a.class = INFINITY and r.c.class = ZERO) or - (r.a.class = ZERO and r.c.class = INFINITY) then - -- invalid operation, construct QNaN - v.fpscr(FPSCR_VXIMZ) := '1'; - qnan_result 
:= '1'; - elsif r.a.class = ZERO or r.a.class = INFINITY then - -- result is +/- A - arith_done := '1'; - else - -- r.c.class is ZERO or INFINITY - v.opsel_a := AIN_C; - v.state := EXC_RESULT; - end if; + f_to_multiply.valid <= '1'; + v.state := MULT_1; end if; when DO_FDIV => @@ -1593,41 +1702,14 @@ begin re_neg2 <= '1'; re_set_result <= '1'; v.count := "00"; - if r.a.class = FINITE and r.b.class = FINITE then - -- Renormalize denorm operands - if r.a.mantissa(UNIT_BIT) = '0' then - v.state := RENORM_A; - elsif r.b.mantissa(UNIT_BIT) = '0' then - v.state := RENORM_B; - else - v.first := '1'; - v.state := DIV_2; - end if; + -- Renormalize denorm operands + if r.a.mantissa(UNIT_BIT) = '0' then + v.state := RENORM_A; + elsif r.b.mantissa(UNIT_BIT) = '0' then + v.state := RENORM_B; else - if r.a.class = NAN or r.b.class = NAN then - v.state := NAN_RESULT; - elsif r.b.class = INFINITY then - if r.a.class = INFINITY then - v.fpscr(FPSCR_VXIDI) := '1'; - qnan_result := '1'; - else - v.result_class := ZERO; - end if; - arith_done := '1'; - elsif r.b.class = ZERO then - if r.a.class = ZERO then - v.fpscr(FPSCR_VXZDZ) := '1'; - qnan_result := '1'; - else - if r.a.class = FINITE then - zero_divide := '1'; - end if; - v.result_class := INFINITY; - end if; - arith_done := '1'; - else -- r.b.class = FINITE, result_class = r.a.class - arith_done := '1'; - end if; + v.first := '1'; + v.state := DIV_2; end if; when DO_FSEL => @@ -1646,33 +1728,18 @@ begin v.fpscr(FPSCR_FI) := '0'; re_sel2 <= REXP2_B; re_set_result <= '1'; - case r.b.class is - when FINITE => - if r.b.negative = '1' then - v.fpscr(FPSCR_VXSQRT) := '1'; - qnan_result := '1'; - elsif r.b.mantissa(UNIT_BIT) = '0' then - v.state := RENORM_B; - elsif r.b.exponent(0) = '0' then - v.state := SQRT_1; - else - -- set shift to 1 - rs_con2 <= RSCON2_1; - v.state := RENORM_B2; - end if; - when NAN => - v.state := NAN_RESULT; - when ZERO => - -- result is B - arith_done := '1'; - when INFINITY => - if r.b.negative = '1' then - 
v.fpscr(FPSCR_VXSQRT) := '1'; - qnan_result := '1'; - -- else result is B - end if; - arith_done := '1'; - end case; + if r.b.negative = '1' then + v.fpscr(FPSCR_VXSQRT) := '1'; + qnan_result := '1'; + elsif r.b.mantissa(UNIT_BIT) = '0' then + v.state := RENORM_B; + elsif r.b.exponent(0) = '0' then + v.state := SQRT_1; + else + -- set shift to 1 + rs_con2 <= RSCON2_1; + v.state := RENORM_B2; + end if; when DO_FRE => -- r.opsel_a = AIN_B @@ -1681,23 +1748,11 @@ begin v.fpscr(FPSCR_FI) := '0'; re_sel2 <= REXP2_B; re_set_result <= '1'; - case r.b.class is - when FINITE => - if r.b.mantissa(UNIT_BIT) = '0' then - v.state := RENORM_B; - else - v.state := FRE_1; - end if; - when NAN => - v.state := NAN_RESULT; - when INFINITY => - v.result_class := ZERO; - arith_done := '1'; - when ZERO => - v.result_class := INFINITY; - zero_divide := '1'; - arith_done := '1'; - end case; + if r.b.mantissa(UNIT_BIT) = '0' then + v.state := RENORM_B; + else + v.state := FRE_1; + end if; when DO_FRSQRTE => -- r.opsel_a = AIN_B @@ -1708,33 +1763,16 @@ begin re_set_result <= '1'; -- set shift to 1 rs_con2 <= RSCON2_1; - case r.b.class is - when FINITE => - if r.b.negative = '1' then - v.fpscr(FPSCR_VXSQRT) := '1'; - qnan_result := '1'; - elsif r.b.mantissa(UNIT_BIT) = '0' then - v.state := RENORM_B; - elsif r.b.exponent(0) = '0' then - v.state := RSQRT_1; - else - v.state := RENORM_B2; - end if; - when NAN => - v.state := NAN_RESULT; - when INFINITY => - if r.b.negative = '1' then - v.fpscr(FPSCR_VXSQRT) := '1'; - qnan_result := '1'; - else - v.result_class := ZERO; - end if; - arith_done := '1'; - when ZERO => - v.result_class := INFINITY; - zero_divide := '1'; - arith_done := '1'; - end case; + if r.b.negative = '1' then + v.fpscr(FPSCR_VXSQRT) := '1'; + qnan_result := '1'; + elsif r.b.mantissa(UNIT_BIT) = '0' then + v.state := RENORM_B; + elsif r.b.exponent(0) = '0' then + v.state := RSQRT_1; + else + v.state := RENORM_B2; + end if; when DO_FMADD => -- fmadd, fmsub, fnmadd, fnmsub @@ 
-1749,54 +1787,25 @@ begin rs_sel1 <= RSH1_B; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - if r.a.class = FINITE and r.c.class = FINITE and - (r.b.class = FINITE or r.b.class = ZERO) then - -- Make sure A and C are normalized - if r.a.mantissa(UNIT_BIT) = '0' then - v.state := RENORM_A; - elsif r.c.mantissa(UNIT_BIT) = '0' then - v.state := RENORM_C; - elsif r.b.class = ZERO then - -- no addend, degenerates to multiply - f_to_multiply.valid <= '1'; - v.is_multiply := '1'; - v.state := MULT_1; - elsif r.madd_cmp = '0' then - -- addend is bigger, do multiply first - -- if subtracting, sign is opposite to initial estimate - rsgn_op := RSGN_SUB; - f_to_multiply.valid <= '1'; - v.first := '1'; - v.state := FMADD_0; - else - -- product is bigger, shift B first - v.state := FMADD_1; - end if; + -- Make sure A and C are normalized + if r.a.mantissa(UNIT_BIT) = '0' then + v.state := RENORM_A; + elsif r.c.mantissa(UNIT_BIT) = '0' then + v.state := RENORM_C; + elsif r.b.class = ZERO then + -- no addend, degenerates to multiply + f_to_multiply.valid <= '1'; + v.state := MULT_1; + elsif r.madd_cmp = '0' then + -- addend is bigger, do multiply first + -- if subtracting, sign is opposite to initial estimate + rsgn_op := RSGN_SUB; + f_to_multiply.valid <= '1'; + v.first := '1'; + v.state := FMADD_0; else - if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then - v.state := NAN_RESULT; - elsif (r.a.class = ZERO and r.c.class = INFINITY) or - (r.a.class = INFINITY and r.c.class = ZERO) then - -- invalid operation, construct QNaN - v.fpscr(FPSCR_VXIMZ) := '1'; - qnan_result := '1'; - elsif r.a.class = INFINITY or r.c.class = INFINITY then - if r.b.class = INFINITY and r.is_subtract = '1' then - -- invalid operation, construct QNaN - v.fpscr(FPSCR_VXISI) := '1'; - qnan_result := '1'; - else - -- result is infinity - v.result_class := INFINITY; - arith_done := '1'; - end if; - else - -- Here A is zero, C is zero, or B is infinity - -- Result is +/-B in all of those 
cases - v.opsel_a := AIN_B; - rsgn_op := RSGN_SUB; - v.state := EXC_RESULT; - end if; + -- product is bigger, shift B first + v.state := FMADD_1; end if; when RENORM_A => @@ -2403,7 +2412,6 @@ begin when others => -- fctidu[z] need_check := r.r(63); end case; - int_result := '1'; if need_check = '1' then v.state := INT_CHECK; else @@ -2430,7 +2438,6 @@ begin v.fpscr(FPSCR_XX) := '1'; end if; end if; - int_result := '1'; arith_done := '1'; when INT_OFLOW => @@ -2441,7 +2448,6 @@ begin end if; v.fpscr(FPSCR_VXCVI) := '1'; invalid := '1'; - int_result := '1'; arith_done := '1'; when FRI_1 => @@ -2627,24 +2633,6 @@ begin re_set_result <= '1'; arith_done := '1'; - when NAN_RESULT => - if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or - (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or - (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then - -- Signalling NAN - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; - if r.use_a = '1' and r.a.class = NAN then - v.opsel_a := AIN_A; - elsif r.use_b = '1' and r.b.class = NAN then - v.opsel_a := AIN_B; - elsif r.use_c = '1' and r.c.class = NAN then - v.opsel_a := AIN_C; - end if; - rsgn_op := RSGN_SEL; - v.state := EXC_RESULT; - when EXC_RESULT => -- r.opsel_a = AIN_A, AIN_B or AIN_C according to which input is the result case r.opsel_a is @@ -3172,7 +3160,6 @@ begin end if; end if; v.cr_result(0) := v.xerc.so; - int_result := '1'; v.writing_fpr := '1'; v.instr_done := '1'; when IDIV_ZERO => @@ -3186,7 +3173,6 @@ begin v.writing_xer := '1'; end if; v.cr_result := "001" & v.xerc_result.so; - int_result := '1'; v.writing_fpr := '1'; v.instr_done := '1'; @@ -3210,7 +3196,7 @@ begin when others => end case; - rsign := r.result_sign; + rsign := r.result_sign xor sign_inv; if zero_divide = '1' then v.fpscr(FPSCR_ZX) := '1'; end if; @@ -3596,7 +3582,7 @@ begin v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); end if; v.sp_result := r.single_prec; - v.int_result 
:= int_result; + v.res_int := r.int_result or r.integer_op; v.illegal := illegal; v.nsnan_result := r.quieten_nan; v.res_sign := rsign; @@ -3627,7 +3613,7 @@ begin end if; -- This mustn't depend on any fields of r that are modified in IDLE state. - if r.int_result = '1' then + if r.res_int = '1' then fp_result <= r.r; else fp_result <= pack_dp(r.res_sign, r.result_class, r.result_exp, r.r, From a3613d863b682d422c1dc412a4137e350c06a44e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 10 Feb 2024 16:53:21 +1100 Subject: [PATCH 05/24] FPU: Simplify sign calculation in FP multiply-add instructions By starting out with result_sign = +/- sign of B, we avoid the need to flip the result sign in a few places. This also simplifies DO_FMADD state a bit by having DO_ZERO_DEN go to DO_FMUL state for floating multiply-add where B is zero. (The RENORM_A2 and RENORM_C2 states already do this.) Signed-off-by: Paul Mackerras --- fpu.vhdl | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 45f5fe0..f0a180f 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -952,7 +952,7 @@ begin when "11100" | "11101" | "11110" | "11111" => --fmadd family v.is_multiply := '1'; v.is_addition := '1'; - v.result_sign := e_in.fra(63) xor e_in.frc(63); + v.result_sign := e_in.frb(63) xnor e_in.insn(1); v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor e_in.frc(63) xor e_in.insn(1)); v.negate := e_in.insn(2); @@ -1253,6 +1253,7 @@ begin v.fpscr(FPSCR_VXIDI) := '1'; qnan_result := '1'; else + sign_inv := r.is_multiply and r.is_subtract; v.result_class := INFINITY; end if; arith_done := '1'; @@ -1265,6 +1266,7 @@ begin v.fpscr(FPSCR_VXISI) := '1'; qnan_result := '1'; else + sign_inv := r.is_multiply and r.is_subtract; v.result_class := INFINITY; end if; arith_done := '1'; @@ -1281,7 +1283,6 @@ begin v.result_class := ZERO; arith_done := '1'; else - sign_inv := r.is_multiply and r.is_subtract; v.result_class := INFINITY; arith_done := '1'; 
end if; @@ -1308,9 +1309,6 @@ begin elsif r.is_addition = '1' then -- result is +/- B v.opsel_a := AIN_B; - if r.is_multiply = '1' then - rsgn_op := RSGN_SUB; - end if; v.state := EXC_RESULT; else v.result_class := ZERO; @@ -1318,7 +1316,6 @@ begin end if; elsif r.use_c = '1' and r.c.class = ZERO then v.opsel_a := AIN_B; - rsgn_op := RSGN_SUB; v.state := EXC_RESULT; else -- B is zero, other operands are finite @@ -1349,6 +1346,14 @@ begin if r.use_a = '1' and (r.use_b = '0' or r.use_c = '0') then v.opsel_a := AIN_A; end if; + if r.use_b = '1' and r.b.class = ZERO and r.use_c = '1' then + -- turn fmadd/sub into fmul + v.opsel_a := AIN_A; + rsgn_op := RSGN_SUB; + v.state := DO_FMUL; + else + v.state := r.exec_state; + end if; -- input selection for denorm cases case r.insn(5 downto 1) is when "10010" => -- fdiv @@ -1367,7 +1372,6 @@ begin end if; when others => end case; - v.state := r.exec_state; end if; when DO_ILLEGAL => @@ -1792,14 +1796,9 @@ begin v.state := RENORM_A; elsif r.c.mantissa(UNIT_BIT) = '0' then v.state := RENORM_C; - elsif r.b.class = ZERO then - -- no addend, degenerates to multiply - f_to_multiply.valid <= '1'; - v.state := MULT_1; elsif r.madd_cmp = '0' then -- addend is bigger, do multiply first -- if subtracting, sign is opposite to initial estimate - rsgn_op := RSGN_SUB; f_to_multiply.valid <= '1'; v.first := '1'; v.state := FMADD_0; @@ -2006,7 +2005,6 @@ begin -- product is bigger here -- shift B right and use it as the addend to the multiplier -- for subtract, multiplier does B - A * C - rsgn_op := RSGN_SUB; re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to b.exp - result_exp + 64 From 9ac71cfbf2a666b3ff5e75fbaa3c9e99cee19597 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 5 Apr 2024 09:34:14 +1100 Subject: [PATCH 06/24] tests/fpu: Add more floating multiply-add tests Add more tests to check that the result sign computations are correct. 
Signed-off-by: Paul Mackerras --- tests/fpu/fpu.c | 71 +++++++++++++++++++++++++++++++++++++++++++++ tests/test_fpu.bin | Bin 31832 -> 32896 bytes 2 files changed, 71 insertions(+) diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 79ba7fa..c13110f 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1496,34 +1496,105 @@ struct fmavals { unsigned long nfma; unsigned long nfms; } fmavals[] = { + /* +0 * +0 +- +0 -> +0, +0, -0, -0 */ { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 }, + /* +0 * NaNC +- +0 -> NaNC, NaNC, NaNC, NaNC */ { 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000 }, + /* +0 * NaNC +- NaNB -> NaNB, NaNB, NaNB, NaNB */ { 0x0000000000000000, 0x7ffc000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000 }, + /* NaNA * NaNC +- NaNB -> NaNA, NaNA, NaNA, NaNA */ { 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000 }, + /* +1.0 * -0 +- +finite B -> +B, -B, -B, +B */ { 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, 0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd }, + /* +1.0 * -1.0 +- (B = +3.818e+190) -> +B, -B, -B, +B */ { 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, 0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd }, + /* +inf * -1.0 +- +finite B -> -inf, -inf, +inf, +inf */ { 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, 0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 }, + /* +inf * +0 +- +finite B -> NaNQ, NaNQ, NaNQ, NaNQ */ { 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000 }, + /* +1.0 * +1.0 +- 
1.00000012 -> +2.00000012, -1.2e-7, -2.00000012, +1.2e-7 */ { 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, 0x4000000010000000, 0xbe80000000000000, 0xc000000010000000, 0x3e80000000000000 }, + /* +(1 + 2^-52) * +(1 + 2^-52) +- +1.0 -> +(2 + 2^-51), +2^-51, -(2 + 2^-51), -2^-51 */ { 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000, 0x4000000000000001, 0x3cc0000000000000, 0xc000000000000001, 0xbcc0000000000000 }, + /* +(1 + 3*2^-52) * +(1 + 2^-51) +- +1.0 -> +(2 + 2^-50), +5 * 2^-52 + 2^-101, -, - */ { 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000, 0x4000000000000002, 0x3cd4000000000002, 0xc000000000000002, 0xbcd4000000000002 }, + /* +2.443e-77 * 2.828 +- -6.909e-77 -> -1.402e-93, +1.382e-76, +1.402e-93, -1.382e-76 */ { 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000, 0xaca765753908cd20, 0x3030000000000000, 0x2ca765753908cd20, 0xb030000000000000 }, + /* +2.443e-77 * 2.828 +- -6.909e-77 -> +9.446e-93, +1.382e-76, -9.446e-93, -1.382e-76 */ { 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000, 0x2cd3b3efbf5e2229, 0x3030000000000000, 0xacd3b3efbf5e2229, 0xb030000000000000 }, + /* +2.443e-77 * 2.828 +- -1.1055e-75 -> -1.0364e-75, +1.1746e-75, +1.0364e-75, -1.1746e-75 */ { 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000, 0xb05e0068a0000000, 0x3061003450000000, 0x305e0068a0000000, 0xb061003450000000 }, + /* +2 * +3 +- 3 -> +9, +3, -9, -3 */ + { 0x4000000000000000, 0x4008000000000000, 0x4008000000000000, + 0x4022000000000000, 0x4008000000000000, 0xc022000000000000, 0xc008000000000000 }, + /* +2 * +3 +- 5 -> +11, +1, -11, -1 */ + { 0x4000000000000000, 0x4008000000000000, 0x4014000000000000, + 0x4026000000000000, 0x3ff0000000000000, 0xc026000000000000, 0xbff0000000000000 }, + /* +2 * +3 +- 7 -> +13, -1, -13, +1 */ + { 0x4000000000000000, 0x4008000000000000, 0x401c000000000000, + 0x402a000000000000, 0xbff0000000000000, 0xc02a000000000000, 0x3ff0000000000000 }, + /* +2 * +3 +- 9 -> +15, -3, -15, +3 */ + {
0x4000000000000000, 0x4008000000000000, 0x4022000000000000, + 0x402e000000000000, 0xc008000000000000, 0xc02e000000000000, 0x4008000000000000 }, + /* +2 * +3 +- -3 -> +3, +9, -3, -9 */ + { 0x4000000000000000, 0x4008000000000000, 0xc008000000000000, + 0x4008000000000000, 0x4022000000000000, 0xc008000000000000, 0xc022000000000000 }, + /* +2 * +3 +- -5 -> +1, +11, -1, -11 */ + { 0x4000000000000000, 0x4008000000000000, 0xc014000000000000, + 0x3ff0000000000000, 0x4026000000000000, 0xbff0000000000000, 0xc026000000000000 }, + /* +2 * +3 +- -7 -> -1, +13, +1, -13 */ + { 0x4000000000000000, 0x4008000000000000, 0xc01c000000000000, + 0xbff0000000000000, 0x402a000000000000, 0x3ff0000000000000, 0xc02a000000000000 }, + /* +2 * +3 +- -9 -> -3, +15, +3, -15 */ + { 0x4000000000000000, 0x4008000000000000, 0xc022000000000000, + 0xc008000000000000, 0x402e000000000000, 0x4008000000000000, 0xc02e000000000000 }, + /* +2 * -3 +- 3 -> -3, -9, +3, +9 */ + { 0x4000000000000000, 0xc008000000000000, 0x4008000000000000, + 0xc008000000000000, 0xc022000000000000, 0x4008000000000000, 0x4022000000000000 }, + /* +2 * -3 +- 5 -> -1, -11, +1, +11 */ + { 0x4000000000000000, 0xc008000000000000, 0x4014000000000000, + 0xbff0000000000000, 0xc026000000000000, 0x3ff0000000000000, 0x4026000000000000 }, + /* +2 * -3 +- 7 -> +1, -13, -1, +13 */ + { 0x4000000000000000, 0xc008000000000000, 0x401c000000000000, + 0x3ff0000000000000, 0xc02a000000000000, 0xbff0000000000000, 0x402a000000000000 }, + /* +2 * -3 +- 9 -> +3, -15, -3, +15 */ + { 0x4000000000000000, 0xc008000000000000, 0x4022000000000000, + 0x4008000000000000, 0xc02e000000000000, 0xc008000000000000, 0x402e000000000000 }, + /* -2 * +3 +- -3 -> -9, -3, +9, +3 */ + { 0xc000000000000000, 0x4008000000000000, 0xc008000000000000, + 0xc022000000000000, 0xc008000000000000, 0x4022000000000000, 0x4008000000000000 }, + /* -2 * +3 +- -5 -> -11, -1, +11, +1 */ + { 0xc000000000000000, 0x4008000000000000, 0xc014000000000000, + 0xc026000000000000, 0xbff0000000000000, 
0x4026000000000000, 0x3ff0000000000000 }, + /* -2 * +3 +- -7 -> -13, +1, +13, -1 */ + { 0xc000000000000000, 0x4008000000000000, 0xc01c000000000000, + 0xc02a000000000000, 0x3ff0000000000000, 0x402a000000000000, 0xbff0000000000000 }, + /* -2 * +3 +- -9 -> -15, +3, +15, -3 */ + { 0xc000000000000000, 0x4008000000000000, 0xc022000000000000, + 0xc02e000000000000, 0x4008000000000000, 0x402e000000000000, 0xc008000000000000 }, + /* -2 * +3 +- +0 -> -6, -6, +6, +6 */ + { 0xc000000000000000, 0x4008000000000000, 0x0000000000000000, + 0xc018000000000000, 0xc018000000000000, 0x4018000000000000, 0x4018000000000000 }, + /* +2 * -3 +- -0 -> -6, -6, +6, +6 */ + { 0x4000000000000000, 0xc008000000000000, 0x8000000000000000, + 0xc018000000000000, 0xc018000000000000, 0x4018000000000000, 0x4018000000000000 }, + /* 2^-1026 * (1.5 * 2^1023) +- -0 -> (1.5 * 2^-3), ditto, -ditto, -ditto */ + { 0x0001000000000000, 0x7fe8000000000000, 0x8000000000000000, + 0x3fc8000000000000, 0x3fc8000000000000, 0xbfc8000000000000, 0xbfc8000000000000 }, }; int test23(long arg) diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index cc6c1ccb06362188008b2c1a3cdc5936a329d9f3..24878af7125b68dfa588e23782443a3a7f773f65 100755 GIT binary patch delta 3912 zcmaJ@3s6+o89rx0VU@K!1zBKU1a>#PU+x0S-UWeWkynFNnUJ{VX?zqqNeUI+g)$v+ zqyc}^Av0;J9d#K zX#HLfA;VXXe~8oOf8^#VD=aKNJF-Mks>GB6SH@~g zvDL5fJzU4V2~UW?!S%Hu$%?J9} zytw2{;ylCaW_kX%cAoFC6Ea>B4jW?aG#n1aZd|x&ZLg*8OSrT&3qDK=9bFk0N~sr` zmS)o==w4b)&%t@bNQjEBrlqhJF%|Y9j>A;^Yjh2CCREdAID@zbl*_8=3TQ$!!W)RQ za077!Oo`QW540n`2PY8+fU2wMlTd>=GCHUR8ak3noG=re09>+G)rJpLCzfae+LJYO z3I>yxCv{-6i|!xtl!cjADTy*8iF9+TiOH%ZhEAABE{JY55a%&WmgPT@w!H&SfjK2h zG1~?$DOEHU_NA!dR$2IuhKIGb|9Cjt@UY1}JnV8WCVtDokn>&acq|r<{&-0w!DEeU zjg7h7MxwhrHm0@Z5NaJ8Gr6_ohK1Q0i32y+-DUfEUxP%Au$N0?#^henlq|^krQZ5J zoY3U65N;y9AL6uj8U?Ggc1Hb5`#dG^KpT%GiPLSRAH$$d5dNlnei>;$U?YP3n9$2fjVP+P#W#VcsvY!#_#Aw=r%1^480A1G;N{J zg2}v*a-*-C4~Nh~aAd_Na|x!dCrOW^L+$Aj-=OuSvu&qJ2$uRHypmnU^>@BRj@fvtw`A;jNmOxpdn(l^Qc(PA^cLZ21? 
z+@lp=t;t#b#+L^}!n;1Y!S>eT=L>(?g|o4~)QC1-pMLNpYV1W=fG0Btd(d(33(qui z2NgeKXQ-5&SPSx>3Eb8I$07GVa*j{<+QLb`HV1pg?6HrdS4&RNsDj2`OaBA5cu(mv z{P}N#eKu`~Z~Y_;EY&Vyp$z%-eQX}^!MC=p%DJak%ihA15@&xW*{8UhM2L-O!`-_p zT-$o_a5E2<%#aVhva2icLo*Uex`rFLZ3B3(IP2;R#UI&6xe&rxBZ9cddCySKY4BsW zh09k7Q7AK{xQ=r5zjuUETj(X8#}`%@k%&ycf?T6W5yo+N#r~D&M;_st@qPpx^2hOubE-TWzg*Mh8fpPmg$BQ0nTXFq9pWz7jCdUS5wF3K z3XLN4Be+zNLNj0*sb)~E(9kZ(MBEQ`h!tAZnN`^sR%=yu zrtykYqB^C6{D!`zs94e%<=DP%(GpVrkLwE^i*XHBwhN(zOu$L>Ylf++5=Hah;bv8y zG6$uX7ZB16?&^FD{PpVeIOjE<*CNjb(bE6tbE+$LxnMHusFSVy@y-|#Ty)qV|iQ~J+-nX1jg}y(ZkkA z_!Ljh_Qr*y8I50rc{tQ@gmc0anDxQWeo2>RAqu2qzgSMx)1-L@|BbPc=4VLA%xQ5B z_VXlhqF|P0+0g74d3m8=N4;d9CC%Y}B8)6)mIDj+lkFtkL-d^BM`S@SEfDM&EAetb zflQ^$(*B2ElI$nh1ril1iE-lka$@0raew@uDhhJ% z#3w=$Bt9^6qLLuZ!6hYLN%sfMob)Ws(hmfgG|2&YMT2uvW`X&%#Gi)CG=5aG#+v$H DLkTV0 delta 2865 zcmZuydr*_v6~8wmKm?S4kw<_8LITMG$?{AHAps18hk&E3!nnIeo0dv#$&Q`IAxNaK z0~WTj$3NH^w5wCxUCef5fOfX7j<(gxcGu#xZgpfV?0{qHy3J~Bv7%qkP2vDfznS~{ z?){x}&pnU(a3AK~T%t-s%tk_LY*)f+4&vYLcGcJj*^OKqa&5@9*-7V+GLN1gYRRR` z(^Yh-@Uvpe78?(lA`#j~SKaIMd-JV$`vj%0q zs(-(Wkg?m7|3~&sy&GjmZV#`>!u$*Mc;H%e4zKSFJdZBc`g}6u99O3I`Bdn)>R?NY z%8t$E<5Hil%@ca|xY@|+fwYaeVnsHw zUgA=hIPR2*1qb7+=uWtVxC?{{Rn!Vw5wqYu#ATRAJPWGC zDmn}f#H;WL;xV9-D!Lo05zkHxNr3Vv(}@*)G4WtdmWiA=C=K1aou&rTGza^WrSt&| zCKn|gz=9Vpf7DePrfL!rVOA13Y~Dds`qEH&6U-&&$Jpe=I*tLe94sggIdLZ_Q*wC3 z0k5T0(s($YB6;J#T{7bmH~-YfU3xk;rm{tli@MVx(=Jv?9NSrv7#~gO60?;_QlCT+ znK6@;>Cc^0*|db}nXZh?>u_vmoD6l?nmd06o235rDy3=+(jhen!jGisr(s;0$K1G! 
z^oM}^jEZ5a%*3d7W#3H*cZ}+yiCfpZ%+*qINVl7$>L$bXx{iS;wS(S*!BoHMwbX!W zII31ZU8}kiwxs=pPQm50ZOm$-JeU3ew#c_dMLYJQZMt+FV;O{xXZR`eCbql1yS^>HVa$guC+R?*2Ew*QA6>!d8k zXA;{zCPTe4HjW3QGA?Q@V^hW{*D$EhqIvS~Kr}~#ffEUwP`(?Jow(K^_i<=ZD4{i@ zFKUfP@GL{p^s-&%U-M(GwiNv$B-!ud%Q|fB0@NWy;+N})&Amnk_GiZN;uFxDec;O+ zpx2>ORm5vQ1HV@NkhX$C-A2j8korsr)q^o7juN=8j};Qfc|$nneHt7&Cd!BL9C^&~ z7-BuVD!F|hz}=iZLZ(I>oWToQH4S(!Pii`1UKg!wW40ZgqmZL54yxWUpjE8~ck@YY zZ46dnP08BdgXh{KiEKkTAN{`T@qMnA4To=YW9cLa^4=7P*RN=`zXRQQl}ziiJhhlDGU#Q6GwB=sg_C!l z1!4Z10#Ve;oU4QIc77q7b0J?%XTY1^ASjDmkq-~RYXyG!I|XVUiG<%4)C(LDD+=KQ zpe)Q~23iW$G#`2k8w8{4R>}L}VL+}dQe#cF7B%3JIaws(h4;aQBLB>XC>eodT?37R z{kk79V|R2l{s@{ESOlf|z?qYGy1=O4AzV*NvzDHBn}^}LR+4Z&EO3!JN5kU@*dtq_fLm} zyJqH@7czVk$!@2l%f)efO0=ls_M{C>p^P0U0TxgPLx#9)GiLr4vZr|+sClY{mDY;f zW#o)sa4Gss=t=8m>mK#M zH^z=tqqj;=;8qctK1IE2CJEv{OcDz|Mjk$KjWw7%k7A$qyPAW1yo6VCA;heYe2I%* zGlm+o6z%OZ=SBHf7iI-L7&F)NS$AT|%%|ezFE|blum6iiWLh&IDvvcr{+Z)E0Rdr> z?bv>aoNN`>iv6?5r3X3ocVW8-w(<02-+@KM z0T5WEc&-$PvtU8wgB>vodJ%hI)FKtne#vo3L5F4d|5peh3vkOK7ij;+adki|<#anJ zDmD1HHdSh}({Bb-R4Xhazn~|0o7Xpr$`0AqttaJwzY|&(fi_s$KF74q!8p!qg!xJX zVsaJ3=BhmEgTbl{@#fi;^l5R%f`E`lm_~UCJgcgpb5LfjQ1#5M_{S1I8A`|ia+|Tw zzB+7Ik^2-rwl;|47dY;ipC|q@WrYzk45I2-Ou3?3L))RLIu9f7$9_6|Qf Date: Mon, 5 Feb 2024 21:57:59 +1100 Subject: [PATCH 07/24] FPU: Reorganize NaN and infinity handling and improve arch compliance The architecture specifies that an invalid operation exception for signalling NaN (VXSNAN) can occur in the same instructions as an invalid operation exception for infinity times zero (VXIMZ) in the case of a multiply-add instruction where B is a signalling NaN, and one of A and C is infinity and the other is zero. This moves the invalid operation tests around so as to handle this case correctly. It also restructures the infinity and NaN cases to simplify the logic a little. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 74 +++++++++++++++++++++++++------------------------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index f0a180f..d7a5e42 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -838,6 +838,7 @@ begin variable set_y : std_ulogic; variable set_s : std_ulogic; variable qnan_result : std_ulogic; + variable invalid_mul : std_ulogic; variable px_nz : std_ulogic; variable pcmpb_eq : std_ulogic; variable pcmpb_lt : std_ulogic; @@ -1217,6 +1218,7 @@ begin -- At least one floating-point operand is infinity or NaN v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; + invalid_mul := '0'; if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or @@ -1225,6 +1227,15 @@ begin v.fpscr(FPSCR_VXSNAN) := '1'; invalid := '1'; end if; + -- Check for this case here since VXIMZ can be set along with VXSNAN + if r.is_multiply = '1' and + ((r.a.class = INFINITY and r.c.class = ZERO) or + (r.a.class = ZERO and r.c.class = INFINITY)) then + v.fpscr(FPSCR_VXIMZ) := '1'; + qnan_result := '1'; + invalid_mul := '1'; + end if; + if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then if r.int_result = '1' then v.state := INT_OFLOW; @@ -1241,51 +1252,32 @@ begin end if; else - if r.a.class = INFINITY then - if r.is_multiply = '1' and r.c.class = ZERO then - -- invalid operation, construct QNaN - v.fpscr(FPSCR_VXIMZ) := '1'; - qnan_result := '1'; - elsif r.is_subtract = '1' and r.b.class = INFINITY then + if (r.a.class = INFINITY or r.c.class = INFINITY) and invalid_mul = '0' then + sign_inv := r.is_multiply and r.is_subtract; + if r.is_subtract = '1' and r.b.class = INFINITY then v.fpscr(FPSCR_VXISI) := '1'; qnan_result := '1'; - elsif r.is_inverse = '1' and r.b.class = INFINITY then - v.fpscr(FPSCR_VXIDI) := '1'; - qnan_result := '1'; - else - sign_inv := r.is_multiply and r.is_subtract; - v.result_class := INFINITY; end if; - arith_done := '1'; - elsif r.c.class = 
INFINITY then - if r.is_multiply = '1' and r.a.class = ZERO then - -- invalid operation, construct QNaN - v.fpscr(FPSCR_VXIMZ) := '1'; - qnan_result := '1'; - elsif r.is_subtract = '1' and r.b.class = INFINITY then - v.fpscr(FPSCR_VXISI) := '1'; - qnan_result := '1'; - else - sign_inv := r.is_multiply and r.is_subtract; - v.result_class := INFINITY; - end if; - arith_done := '1'; + end if; + if r.is_inverse = '1' and r.a.class = INFINITY and r.b.class = INFINITY then + v.fpscr(FPSCR_VXIDI) := '1'; + qnan_result := '1'; + end if; + if r.b.class = INFINITY and r.is_sqrt = '1' and r.b.negative = '1' then + v.fpscr(FPSCR_VXSQRT) := '1'; + qnan_result := '1'; + end if; + if r.b.class = INFINITY and r.is_inverse = '1' then + -- fdiv, fre, frsqrte + v.result_class := ZERO; else - -- r.b.class = INFINITY - if r.int_result = '1' then - -- fcti* - v.state := INT_OFLOW; - elsif r.is_sqrt = '1' and r.b.negative = '1' then - v.fpscr(FPSCR_VXSQRT) := '1'; - qnan_result := '1'; - elsif r.is_inverse = '1' then - -- fdiv, fre, frsqrte - v.result_class := ZERO; - arith_done := '1'; - else - v.result_class := INFINITY; - arith_done := '1'; - end if; + v.result_class := INFINITY; + end if; + if r.b.class = INFINITY and r.int_result = '1' then + -- fcti* + v.state := INT_OFLOW; + else + arith_done := '1'; end if; end if; From 2422585e140d1de47e3ff6d88495657fdbe7f703 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Feb 2024 22:16:13 +1100 Subject: [PATCH 08/24] FPU: Reduce use of r.insn inside the state machine Instead use things derived from the instruction in the first cycle, such as r.is_multiply, r.is_addition, etc. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 60 +++++++++++++++++++++++++------------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index d7a5e42..b602648 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -938,6 +938,8 @@ begin end if; if e_in.insn(5 downto 1) = "01111" then -- fcti*z v.round_mode := "001"; + elsif e_in.insn(5 downto 1) = "01000" then -- fri* + v.round_mode := '1' & e_in.insn(7 downto 6); end if; case e_in.insn(5 downto 1) is when "10100" | "10101" => -- fadd and fsub @@ -1334,36 +1336,29 @@ begin else -- some operand is denorm, and/or it's fmadd/fmsub with B=0 - v.opsel_a := AIN_B; - if r.use_a = '1' and (r.use_b = '0' or r.use_c = '0') then + -- input selection for denorm cases + -- A and C are non-zero if present, + -- B is non-zero if present except for multiply-add + if r.a.zeroexp = '1' and (r.is_multiply or r.is_inverse) = '1' then v.opsel_a := AIN_A; + elsif r.b.zeroexp = '1' and (r.is_inverse or r.is_sqrt) = '1' then + v.opsel_a := AIN_B; + elsif r.c.zeroexp = '1' then + v.opsel_a := AIN_C; + else + v.opsel_a := AIN_B; + if r.use_a = '1' and (r.use_b = '0' or r.use_c = '0' or r.b.class = ZERO) then + v.opsel_a := AIN_A; + end if; end if; - if r.use_b = '1' and r.b.class = ZERO and r.use_c = '1' then - -- turn fmadd/sub into fmul - v.opsel_a := AIN_A; + if r.is_multiply = '1' and r.b.class = ZERO then + -- This will trigger for fmul as well as fmadd/sub, but + -- it doesn't matter since r.is_subtract = 0 for fmul. rsgn_op := RSGN_SUB; v.state := DO_FMUL; else v.state := r.exec_state; end if; - -- input selection for denorm cases - case r.insn(5 downto 1) is - when "10010" => -- fdiv - if r.b.mantissa(UNIT_BIT) = '0' and r.a.mantissa(UNIT_BIT) = '1' then - v.opsel_a := AIN_B; - end if; - when "11001" => -- fmul - if r.c.mantissa(UNIT_BIT) = '0' and r.a.mantissa(UNIT_BIT) = '1' then - v.opsel_a := AIN_C; - end if; - when "11100" | "11101" | "11110" | "11111" => -- fmadd etc. 
- if r.a.mantissa(UNIT_BIT) = '0' then - v.opsel_a := AIN_A; - elsif r.c.mantissa(UNIT_BIT) = '0' then - v.opsel_a := AIN_C; - end if; - when others => - end case; end if; when DO_ILLEGAL => @@ -1571,7 +1566,6 @@ begin arith_done := '1'; else v.state := FRI_1; - v.round_mode := '1' & r.insn(7 downto 6); end if; when DO_FRSP => @@ -1813,9 +1807,9 @@ begin set_a := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; - if r.insn(4) = '1' then + if r.is_multiply = '1' then if r.c.mantissa(UNIT_BIT) = '1' then - if r.insn(3) = '0' or r.b.class = ZERO then + if r.is_addition = '0' or r.b.class = ZERO then v.first := '1'; v.state := MULT_1; else @@ -1867,7 +1861,7 @@ begin set_c := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; - if r.insn(3) = '0' or r.b.class = ZERO then + if r.is_addition = '0' or r.b.class = ZERO then v.first := '1'; v.state := MULT_1; else @@ -2081,16 +2075,16 @@ begin re_set_result <= '1'; end if; v.first := '1'; - if r.insn(4) = '0' then - if r.insn(3) = '0' then - v.state := DIV_2; + if r.is_sqrt = '1' then + if r.is_inverse = '1' then + v.state := RSQRT_1; else v.state := SQRT_1; end if; - elsif r.insn(2) = '0' then - v.state := FRE_1; + elsif r.use_a = '1' then + v.state := DIV_2; else - v.state := RSQRT_1; + v.state := FRE_1; end if; when DIV_2 => From 4e5f856c5517eebb022f17b07cfbe73ff95d422b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 13 Feb 2024 17:17:03 +1100 Subject: [PATCH 09/24] FPU: Factor out some of the common elements of the DO_* states Signed-off-by: Paul Mackerras --- fpu.vhdl | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index b602648..345dc6e 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -146,6 +146,7 @@ architecture behaviour of fpu is exp_cmp : std_ulogic; madd_cmp : std_ulogic; add_bsmall : std_ulogic; + is_arith : std_ulogic; is_addition : std_ulogic; is_multiply : std_ulogic; is_inverse : std_ulogic; @@ -176,6 +177,7 @@ architecture behaviour 
of fpu is res_sign : std_ulogic; res_int : std_ulogic; exec_state : state_t; + cycle_1 : std_ulogic; end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -880,6 +882,7 @@ begin is_nan_inf := '0'; is_zero_den := '0'; sign_inv := '0'; + v.cycle_1 := e_in.valid; if r.complete = '1' or r.do_intr = '1' then v.instr_done := '0'; @@ -925,6 +928,7 @@ begin v.negate := '0'; v.quieten_nan := '1'; v.int_result := '0'; + v.is_arith := '0'; case e_in.op is when OP_FP_ARITH => fpin_a := e_in.valid_a; @@ -932,6 +936,7 @@ begin fpin_c := e_in.valid_c; v.longmask := e_in.single; v.fp_rc := e_in.rc; + v.is_arith := '1'; exec_state := arith_decode(to_integer(unsigned(e_in.insn(5 downto 1)))); if e_in.insn(5 downto 1) = "10110" or e_in.insn(5 downto 1) = "11010" then v.is_sqrt := '1'; @@ -1193,6 +1198,11 @@ begin rsgn_op := RSGN_NOP; + if r.cycle_1 = '1' and r.is_arith = '1' then + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + end if; + case r.state is when IDLE => v.invalid := '0'; @@ -1218,8 +1228,6 @@ begin when DO_NAN_INF => -- At least one floating-point operand is infinity or NaN - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; invalid_mul := '0'; if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or @@ -1285,8 +1293,6 @@ begin when DO_ZERO_DEN => -- At least one floating point operand is zero or denormalized - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; if (r.use_a = '1' and r.a.class = ZERO) or (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') or (r.use_c = '1' and r.c.class = ZERO) then @@ -1559,8 +1565,6 @@ begin rs_sel1 <= RSH1_B; rs_con2 <= RSCON2_52; rs_neg2 <= '1'; - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; if r.b.exponent >= to_signed(52, EXP_BITS) then -- integer already, no rounding required arith_done := '1'; @@ -1577,8 +1581,6 @@ begin rs_sel1 <= RSH1_B; rs_con2 <= RSCON2_MINEXP; rs_neg2 <= '1'; - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; set_x := '1'; if 
r.b.exponent < to_signed(-126, EXP_BITS) then v.state := ROUND_UFLOW; @@ -1598,8 +1600,6 @@ begin re_set_result <= '1'; rs_sel1 <= RSH1_B; rs_neg2 <= '1'; - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; if r.b.exponent >= to_signed(64, EXP_BITS) or (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then @@ -1630,8 +1630,6 @@ begin v.result_class := r.b.class; re_con2 <= RECON2_UNIT; re_set_result <= '1'; - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; if r.b.class = ZERO then arith_done := '1'; else @@ -1648,8 +1646,6 @@ begin rs_sel1 <= RSH1_B; rs_neg1 <= '1'; rs_sel2 <= RSH2_A; - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; v.add_bsmall := r.exp_cmp; v.opsel_a := AIN_B; if r.exp_cmp = '0' then @@ -1667,8 +1663,6 @@ begin -- fmul[s] -- r.opsel_a = AIN_A unless C is denorm and A isn't v.result_class := r.a.class; - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; re_sel1 <= REXP1_A; re_sel2 <= REXP2_C; re_set_result <= '1'; @@ -1685,8 +1679,6 @@ begin when DO_FDIV => -- r.opsel_a = AIN_A unless B is denorm and A isn't v.result_class := r.a.class; - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; re_sel1 <= REXP1_A; re_sel2 <= REXP2_B; re_neg2 <= '1'; @@ -1714,8 +1706,6 @@ begin when DO_FSQRT => -- r.opsel_a = AIN_B v.result_class := r.b.class; - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; re_sel2 <= REXP2_B; re_set_result <= '1'; if r.b.negative = '1' then @@ -1734,8 +1724,6 @@ begin when DO_FRE => -- r.opsel_a = AIN_B v.result_class := r.b.class; - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; re_sel2 <= REXP2_B; re_set_result <= '1'; if r.b.mantissa(UNIT_BIT) = '0' then @@ -1747,8 +1735,6 @@ begin when DO_FRSQRTE => -- r.opsel_a = AIN_B v.result_class := r.b.class; - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to 1 @@ -1775,8 +1761,6 @@ begin re_set_result <= '1'; -- put b.exp into shift rs_sel1 <= RSH1_B; - v.fpscr(FPSCR_FR) := '0'; - 
v.fpscr(FPSCR_FI) := '0'; -- Make sure A and C are normalized if r.a.mantissa(UNIT_BIT) = '0' then v.state := RENORM_A; From cf866ce91080a97f6ef5666f2d47d58d15aaf7d9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 29 Feb 2024 21:39:36 +1100 Subject: [PATCH 10/24] FPU: Simplify logic for setting r.x Since r.x is mostly set from the value in r.r and only once from anything else (r.b.mantissa), move the check to before the input multiplexer for the main adder, so it works on r.r rather than whatever is selected by r.opsel_a. For the case in DO_FRSP where we have B selected by r.opsel_a, we add a new state so that we now get B into R and then check the low bits of R. Signed-off-by: Paul Mackerras --- fpu.vhdl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 345dc6e..1a584d5 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -51,7 +51,7 @@ architecture behaviour of fpu is DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT, DO_FCFID, DO_FCTI, - DO_FRSP, DO_FRI, + DO_FRSP, DO_FRSP_2, DO_FRI, DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD, DO_FRE, DO_FRSQRTE, DO_FSEL, @@ -1577,6 +1577,10 @@ begin v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; + v.state := DO_FRSP_2; + + when DO_FRSP_2 => + -- r.opsel_a = AIN_R, r.shift = 0 -- set shift to exponent - -126 rs_sel1 <= RSH1_B; rs_con2 <= RSCON2_MINEXP; @@ -3269,6 +3273,9 @@ begin else mask := right_mask(unsigned(mshift(5 downto 0))); end if; + if (or (mask and r.r)) = '1' and set_x = '1' then + v.x := '1'; + end if; case r.opsel_a is when AIN_R => in_a0 := r.r; @@ -3279,9 +3286,6 @@ begin when others => in_a0 := r.c.mantissa; end case; - if (or (mask and in_a0)) = '1' and set_x = '1' then - v.x := '1'; - end if; if opsel_ainv = '1' then in_a0 := not in_a0; end if; From 2731384a4be19877c40fedfaeb72565d279577df Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 1 Mar 2024 22:12:46 +1100 Subject: [PATCH 
11/24] FPU: Reduce misc_sel to 3 bits Signed-off-by: Paul Mackerras --- fpu.vhdl | 101 +++++++++++++++++++++++++------------------------------ 1 file changed, 45 insertions(+), 56 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 1a584d5..a309705 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -200,7 +200,7 @@ architecture behaviour of fpu is signal r_lo_nz : std_ulogic; signal r_gt_1 : std_ulogic; signal s_nz : std_ulogic; - signal misc_sel : std_ulogic_vector(3 downto 0); + signal misc_sel : std_ulogic_vector(2 downto 0); signal f_to_multiply : MultiplyInputType; signal multiply_to_f : MultiplyOutputType; signal msel_1 : std_ulogic_vector(1 downto 0); @@ -1150,7 +1150,7 @@ begin opsel_r <= RES_SUM; opsel_s <= S_ZERO; carry_in <= '0'; - misc_sel <= "0000"; + misc_sel <= "000"; fpscr_mask := (others => '1'); update_fx := '0'; arith_done := '0'; @@ -1498,13 +1498,14 @@ begin when DO_FMRG => -- fmrgew, fmrgow opsel_r <= RES_MISC; - misc_sel <= "01" & r.insn(8) & '0'; + misc_sel <= "100"; v.writing_fpr := '1'; v.instr_done := '1'; when DO_MFFS => v.writing_fpr := '1'; opsel_r <= RES_MISC; + misc_sel <= "011"; case r.insn(20 downto 16) is when "00000" => -- mffs @@ -2153,7 +2154,7 @@ begin re_neg1 <= '1'; re_set_result <= '1'; opsel_r <= RES_MISC; - misc_sel <= "0111"; + misc_sel <= "101"; -- set shift to 1 rs_con2 <= RSCON2_1; v.state := NORMALIZE; @@ -2173,7 +2174,7 @@ begin when RSQRT_1 => opsel_r <= RES_MISC; - misc_sel <= "0111"; + misc_sel <= "101"; re_sel1 <= REXP1_BHALF; re_neg1 <= '1'; re_set_result <= '1'; @@ -2186,7 +2187,7 @@ begin -- also transfer B (in R) to A set_a := '1'; opsel_r <= RES_MISC; - misc_sel <= "0111"; + misc_sel <= "101"; msel_1 <= MUL1_B; msel_2 <= MUL2_LUT; f_to_multiply.valid <= '1'; @@ -2399,7 +2400,7 @@ begin else msb := r.r(63); end if; - misc_sel <= '1' & r.insn(9 downto 8) & r.result_sign; + misc_sel <= "110"; if (r.insn(8) = '0' and msb /= r.result_sign) or (r.insn(8) = '1' and msb /= '1') then opsel_r <= RES_MISC; @@ -2414,10 +2415,7 @@ 
begin when INT_OFLOW => opsel_r <= RES_MISC; - misc_sel <= '1' & r.insn(9 downto 8) & r.result_sign; - if r.b.class = NAN then - misc_sel(0) <= '1'; - end if; + misc_sel <= "110"; v.fpscr(FPSCR_VXCVI) := '1'; invalid := '1'; arith_done := '1'; @@ -2515,7 +2513,7 @@ begin re_con2 <= RECON2_MAX; re_set_result <= '1'; opsel_r <= RES_MISC; - misc_sel <= "001" & r.single_prec; + misc_sel <= "010"; arith_done := '1'; else -- enabled overflow exception @@ -2761,7 +2759,7 @@ begin -- less than 0.5, in which case we want to use 0.5, to avoid -- infinite loops in some cases. opsel_r <= RES_MISC; - misc_sel <= "0001"; + misc_sel <= "001"; if multiply_to_f.valid = '1' then v.first := '1'; if r.count = "11" then @@ -2774,7 +2772,7 @@ begin -- Get 0.5 into R; it turns out the generated -- QNaN mantissa is actually what we want opsel_r <= RES_MISC; - misc_sel <= "0001"; + misc_sel <= "001"; v.opsel_a := AIN_A; -- set shift to 64 rs_con2 <= RSCON2_64; @@ -3136,7 +3134,7 @@ begin v.instr_done := '1'; when IDIV_ZERO => opsel_r <= RES_MISC; - misc_sel <= "0101"; + misc_sel <= "000"; v.xerc_result := v.xerc; if r.oe = '1' then v.xerc_result.ov := r.int_ovf; @@ -3176,7 +3174,7 @@ begin invalid := '1'; v.result_class := NAN; rsign := '0'; - misc_sel <= "0001"; + misc_sel <= "001"; opsel_r <= RES_MISC; arith_done := '1'; end if; @@ -3342,50 +3340,41 @@ begin when others => misc := (others => '0'); case misc_sel is - when "0000" => - misc := x"00000000" & (r.fpscr and fpscr_mask); - when "0001" => - -- generated QNaN mantissa + when "000" => + -- zero result, used in idiv logic + when "001" => + -- generated QNaN mantissa; also used for 0.5 in idiv logic misc(QNAN_BIT) := '1'; - when "0010" => - -- mantissa of max representable DP number - misc(UNIT_BIT downto DP_LSB) := (others => '1'); - when "0011" => - -- mantissa of max representable SP number + when "010" => + -- mantissa of max representable number, DP or SP misc(UNIT_BIT downto SP_LSB) := (others => '1'); - when "0100" => - -- 
fmrgow result - misc := r.a.mantissa(31 downto 0) & r.b.mantissa(31 downto 0); - when "0110" => - -- fmrgew result - misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32); - when "0111" => + misc(SP_LSB-1 downto DP_LSB) := (others => not r.single_prec); + when "011" => + -- read FPSCR + misc := x"00000000" & (r.fpscr and fpscr_mask); + when "100" => + -- fmrgow/fmrgew result + if r.insn(8) = '0' then + misc := r.a.mantissa(31 downto 0) & r.b.mantissa(31 downto 0); + else + misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32); + end if; + when "101" => misc := std_ulogic_vector(shift_left(resize(unsigned(inverse_est), 64), UNIT_BIT - 19)); - when "1000" => - -- max positive result for fctiw[z] - misc := x"000000007fffffff"; - when "1001" => - -- max negative result for fctiw[z] - misc := x"ffffffff80000000"; - when "1010" => - -- max positive result for fctiwu[z] - misc := x"00000000ffffffff"; - when "1011" => - -- max negative result for fctiwu[z] - misc := x"0000000000000000"; - when "1100" => - -- max positive result for fctid[z] - misc := x"7fffffffffffffff"; - when "1101" => - -- max negative result for fctid[z] - misc := x"8000000000000000"; - when "1110" => - -- max positive result for fctidu[z] - misc := x"ffffffffffffffff"; - when "1111" => - -- max negative result for fctidu[z] - misc := x"0000000000000000"; + when "110" => + -- max positive or negative result for fcti* + if r.result_sign = '0' and r.b.class /= NAN then + misc := x"000000007fffffff"; + misc(31) := r.insn(8) or r.insn(9); -- unsigned or dword + misc(62 downto 32) := (others => r.insn(9)); -- dword + misc(63) := r.insn(8) and r.insn(9); -- unsigned dword + elsif r.insn(8) = '0' then + misc(63) := '1'; + if r.insn(9) = '0' then + misc(62 downto 31) := (others => '1'); + end if; + end if; when others => end case; result <= misc; From ba2add029af5aea4bc5d0240a7fcb17351db74a7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 5 Mar 2024 16:46:08 +1100 Subject: [PATCH 12/24] FPU: Remove 
need to set opsel_a one cycle ahead Most states set opsel_a directly to select the operand for the A input of the main adder. The exception is the EXC_RESULT state, which uses r.opsel_a set by the previous cycle to indicate which input operand to use as the result. In order to make timing, ensure that the controls that select the inputs to the main adder (opsel_*, etc.) don't depend on any complicated functions of the data (such as px_nz, pcmpb_eq, pcmpb_lt, etc.), but are as far as possible constant for each state. There is now a control called set_r for whether the result is written to r.r, which enables us to avoid setting opsel_b or opsel_r conditionally in some cases. Also, to avoid a data-dependent setting of msel_2 in IDIV_DODIV state, the IDIV_NR1 and IDIV_NR2 states have been reworked so that completion of the required number of iterations is checked in IDIV_NR1 state, and at that point, if the inverse estimate is < 0.5, we go to IDIV_USE0_5 state in order to use 0.5 as the estimate. This means that in the normal case, the inverse estimate is already in Y when we get to IDIV_DODIV state. IDIV_USE0_5 has been reworked to put R (which will contain 0.5) into Y as the inverse estimate. That means that IDIV_DODIV state doesn't have any data-dependent logic to put either P or R into Y. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 239 ++++++++++++++++++++++++++----------------------------- 1 file changed, 113 insertions(+), 126 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index a309705..3914a97 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -185,6 +185,7 @@ architecture behaviour of fpu is signal r, rin : reg_type; signal fp_result : std_ulogic_vector(63 downto 0); + signal opsel_a : std_ulogic_vector(1 downto 0); signal opsel_b : std_ulogic_vector(1 downto 0); signal opsel_r : std_ulogic_vector(1 downto 0); signal opsel_s : std_ulogic_vector(1 downto 0); @@ -838,6 +839,7 @@ begin variable set_b_mant : std_ulogic; variable set_c : std_ulogic; variable set_y : std_ulogic; + variable set_r : std_ulogic; variable set_s : std_ulogic; variable qnan_result : std_ulogic; variable invalid_mul : std_ulogic; @@ -1143,6 +1145,7 @@ begin v.first := '0'; v.doing_ftdiv := "00"; v.opsel_a := AIN_R; + opsel_a <= AIN_R; opsel_ainv <= '0'; opsel_mask <= '0'; opsel_b <= BIN_ZERO; @@ -1166,6 +1169,7 @@ begin set_b := '0'; set_b_mant := '0'; set_c := '0'; + set_r := '1'; set_s := '0'; f_to_multiply.is_signed <= '0'; f_to_multiply.valid <= '0'; @@ -1207,12 +1211,7 @@ begin when IDLE => v.invalid := '0'; if e_in.valid = '1' then - v.opsel_a := AIN_B; v.busy := '1'; - if e_in.op = OP_FP_ARITH and e_in.valid_a = '1' and - (e_in.valid_b = '0' or e_in.valid_c = '0') then - v.opsel_a := AIN_A; - end if; v.exec_state := exec_state; if is_nan_inf = '1' then v.state := DO_NAN_INF; @@ -1293,6 +1292,11 @@ begin when DO_ZERO_DEN => -- At least one floating point operand is zero or denormalized + if r.is_addition = '1' then + opsel_a <= AIN_A; + else + opsel_a <= AIN_B; + end if; if (r.use_a = '1' and r.a.class = ZERO) or (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') or (r.use_c = '1' and r.c.class = ZERO) then @@ -1320,7 +1324,7 @@ begin else -- B is zero, other operands are finite if r.int_result = '1' then - -- fcti*, r.opsel_a = AIN_B + -- fcti* arith_done := '1'; 
elsif r.is_inverse = '1' then -- fdiv, fre, frsqrte @@ -1328,7 +1332,7 @@ begin zero_divide := '1'; arith_done := '1'; elsif r.is_addition = '1' then - -- fadd, r.opsel_a = AIN_A + -- fadd, fsub v.result_class := FINITE; re_sel1 <= REXP1_A; re_set_result <= '1'; @@ -1342,21 +1346,8 @@ begin else -- some operand is denorm, and/or it's fmadd/fmsub with B=0 - -- input selection for denorm cases -- A and C are non-zero if present, -- B is non-zero if present except for multiply-add - if r.a.zeroexp = '1' and (r.is_multiply or r.is_inverse) = '1' then - v.opsel_a := AIN_A; - elsif r.b.zeroexp = '1' and (r.is_inverse or r.is_sqrt) = '1' then - v.opsel_a := AIN_B; - elsif r.c.zeroexp = '1' then - v.opsel_a := AIN_C; - else - v.opsel_a := AIN_B; - if r.use_a = '1' and (r.use_b = '0' or r.use_c = '0' or r.b.class = ZERO) then - v.opsel_a := AIN_A; - end if; - end if; if r.is_multiply = '1' and r.b.class = ZERO then -- This will trigger for fmul as well as fmadd/sub, but -- it doesn't matter since r.is_subtract = 0 for fmul. 
@@ -1389,7 +1380,7 @@ begin re_sel2 <= REXP2_B; re_set_result <= '1'; if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or - (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then + (r.b.class = FINITE and r.b.denorm = '1') then v.cr_result(2) := '1'; end if; if r.a.class = NAN or r.a.class = INFINITY or @@ -1408,7 +1399,7 @@ begin v.instr_done := '1'; v.cr_result := "0000"; if r.b.class = ZERO or r.b.class = INFINITY or - (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then + (r.b.class = FINITE and r.b.denorm = '1') then v.cr_result(2) := '1'; end if; if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO @@ -1418,7 +1409,7 @@ begin when DO_FCMP => -- fcmp[uo] - -- r.opsel_a = AIN_B + opsel_a <= AIN_B; v.instr_done := '1'; update_fx := '1'; re_sel2 <= REXP2_B; @@ -1467,7 +1458,6 @@ begin -- Prepare to subtract mantissas, put B in R v.cr_result := "0000"; v.instr_done := '0'; - v.opsel_a := AIN_A; v.state := CMP_1; end if; v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result; @@ -1550,7 +1540,7 @@ begin v.instr_done := '1'; when DO_FMR => - -- r.opsel_a = AIN_B + opsel_a <= AIN_B; v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; @@ -1558,7 +1548,7 @@ begin v.instr_done := '1'; when DO_FRI => -- fri[nzpm] - -- r.opsel_a = AIN_B + opsel_a <= AIN_B; v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; @@ -1574,14 +1564,15 @@ begin end if; when DO_FRSP => - -- r.opsel_a = AIN_B, r.shift = 0 + -- r.shift = 0 + opsel_a <= AIN_B; v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; v.state := DO_FRSP_2; when DO_FRSP_2 => - -- r.opsel_a = AIN_R, r.shift = 0 + -- r.shift = 0 -- set shift to exponent - -126 rs_sel1 <= RSH1_B; rs_con2 <= RSCON2_MINEXP; @@ -1599,7 +1590,7 @@ begin -- instr bit 9: 1=dword 0=word -- instr bit 8: 1=unsigned 0=signed -- instr bit 1: 1=round to zero 0=use fpscr[RN] - -- r.opsel_a = AIN_B + opsel_a <= AIN_B; v.result_class := r.b.class; re_sel2 <= REXP2_B; 
re_set_result <= '1'; @@ -1626,7 +1617,7 @@ begin end if; when DO_FCFID => - -- r.opsel_a = AIN_B + opsel_a <= AIN_B; if r.insn(8) = '0' and r.b.negative = '1' then -- fcfid[s] with negative operand, set R = -B opsel_ainv <= '1'; @@ -1643,7 +1634,7 @@ begin when DO_FADD => -- fadd[s] and fsub[s] - -- r.opsel_a = AIN_A + opsel_a <= AIN_A; v.result_class := r.a.class; re_sel1 <= REXP1_A; re_set_result <= '1'; @@ -1652,7 +1643,6 @@ begin rs_neg1 <= '1'; rs_sel2 <= RSH2_A; v.add_bsmall := r.exp_cmp; - v.opsel_a := AIN_B; if r.exp_cmp = '0' then if r.a.exponent = r.b.exponent then v.state := ADD_2; @@ -1666,15 +1656,16 @@ begin when DO_FMUL => -- fmul[s] - -- r.opsel_a = AIN_A unless C is denorm and A isn't + opsel_a <= AIN_A; v.result_class := r.a.class; re_sel1 <= REXP1_A; re_sel2 <= REXP2_C; re_set_result <= '1'; -- Renormalize denorm operands - if r.a.mantissa(UNIT_BIT) = '0' then + if r.a.denorm = '1' then v.state := RENORM_A; - elsif r.c.mantissa(UNIT_BIT) = '0' then + elsif r.c.denorm = '1' then + opsel_a <= AIN_C; v.state := RENORM_C; else f_to_multiply.valid <= '1'; @@ -1682,7 +1673,7 @@ begin end if; when DO_FDIV => - -- r.opsel_a = AIN_A unless B is denorm and A isn't + opsel_a <= AIN_A; v.result_class := r.a.class; re_sel1 <= REXP1_A; re_sel2 <= REXP2_B; @@ -1690,9 +1681,10 @@ begin re_set_result <= '1'; v.count := "00"; -- Renormalize denorm operands - if r.a.mantissa(UNIT_BIT) = '0' then + if r.a.denorm = '1' then v.state := RENORM_A; - elsif r.b.mantissa(UNIT_BIT) = '0' then + elsif r.b.denorm = '1' then + opsel_a <= AIN_B; v.state := RENORM_B; else v.first := '1'; @@ -1700,23 +1692,23 @@ begin end if; when DO_FSEL => + rsgn_op := RSGN_SEL; if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then v.opsel_a := AIN_C; - rsgn_op := RSGN_SEL; else v.opsel_a := AIN_B; end if; v.state := EXC_RESULT; when DO_FSQRT => - -- r.opsel_a = AIN_B + opsel_a <= AIN_B; v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; if r.b.negative = 
'1' then v.fpscr(FPSCR_VXSQRT) := '1'; qnan_result := '1'; - elsif r.b.mantissa(UNIT_BIT) = '0' then + elsif r.b.denorm = '1' then v.state := RENORM_B; elsif r.b.exponent(0) = '0' then v.state := SQRT_1; @@ -1727,18 +1719,18 @@ begin end if; when DO_FRE => - -- r.opsel_a = AIN_B + opsel_a <= AIN_B; v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; - if r.b.mantissa(UNIT_BIT) = '0' then + if r.b.denorm = '1' then v.state := RENORM_B; else v.state := FRE_1; end if; when DO_FRSQRTE => - -- r.opsel_a = AIN_B + opsel_a <= AIN_B; v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; @@ -1747,7 +1739,7 @@ begin if r.b.negative = '1' then v.fpscr(FPSCR_VXSQRT) := '1'; qnan_result := '1'; - elsif r.b.mantissa(UNIT_BIT) = '0' then + elsif r.b.denorm = '1' then v.state := RENORM_B; elsif r.b.exponent(0) = '0' then v.state := RSQRT_1; @@ -1757,8 +1749,7 @@ begin when DO_FMADD => -- fmadd, fmsub, fnmadd, fnmsub - -- r.opsel_a = AIN_A if A is denorm, else AIN_C if C is denorm, - -- else AIN_B + opsel_a <= AIN_B; v.result_class := r.a.class; -- put a.exp + c.exp into result_exp re_sel1 <= REXP1_A; @@ -1767,9 +1758,11 @@ begin -- put b.exp into shift rs_sel1 <= RSH1_B; -- Make sure A and C are normalized - if r.a.mantissa(UNIT_BIT) = '0' then + if r.a.denorm = '1' then + opsel_a <= AIN_A; v.state := RENORM_A; - elsif r.c.mantissa(UNIT_BIT) = '0' then + elsif r.c.denorm = '1' then + opsel_a <= AIN_C; v.state := RENORM_C; elsif r.madd_cmp = '0' then -- addend is bigger, do multiply first @@ -1785,18 +1778,13 @@ begin when RENORM_A => rs_norm <= '1'; v.state := RENORM_A2; - if r.use_c = '1' and r.c.denorm = '1' then - v.opsel_a := AIN_C; - else - v.opsel_a := AIN_B; - end if; when RENORM_A2 => - -- r.opsel_a = AIN_C for fmul/fmadd, AIN_B for fdiv set_a := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; if r.is_multiply = '1' then + opsel_a <= AIN_C; if r.c.mantissa(UNIT_BIT) = '1' then if r.is_addition = '0' or r.b.class = ZERO then v.first := 
'1'; @@ -1806,13 +1794,13 @@ begin if new_exp + 1 >= r.b.exponent then v.madd_cmp := '1'; end if; - v.opsel_a := AIN_B; v.state := DO_FMADD; end if; else v.state := RENORM_C; end if; else + opsel_a <= AIN_B; if r.b.mantissa(UNIT_BIT) = '1' then v.first := '1'; v.state := DIV_2; @@ -1839,7 +1827,6 @@ begin re_sel2 <= REXP2_NE; re_set_result <= '1'; end if; - v.opsel_a := AIN_B; v.state := LOOKUP; when RENORM_C => @@ -1858,12 +1845,12 @@ begin if new_exp + 1 >= r.b.exponent then v.madd_cmp := '1'; end if; - v.opsel_a := AIN_B; v.state := DO_FMADD; end if; when ADD_1 => -- transferring B to R + opsel_a <= AIN_B; re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to b.exp - a.exp @@ -1881,15 +1868,14 @@ begin v.x := s_nz; set_x := '1'; v.longmask := r.single_prec; - if r.add_bsmall = '1' then - v.opsel_a := AIN_A; - else - v.opsel_a := AIN_B; - end if; v.state := ADD_2; when ADD_2 => - -- r.opsel_a = AIN_A if r.add_bsmall = 1 else AIN_B + if r.add_bsmall = '1' then + opsel_a <= AIN_A; + else + opsel_a <= AIN_B; + end if; opsel_b <= BIN_R; opsel_binv <= r.is_subtract; carry_in <= r.is_subtract and not r.x; @@ -1931,7 +1917,7 @@ begin end if; when CMP_1 => - -- r.opsel_a = AIN_A + opsel_a <= AIN_A; opsel_b <= BIN_R; opsel_binv <= '1'; carry_in <= '1'; @@ -2033,6 +2019,8 @@ begin when FMADD_6 => -- r.shift = UNIT_BIT (or 0, but only if r is now nonzero) + set_r := '0'; + opsel_r <= RES_SHIFT; re_sel2 <= REXP2_NE; rs_norm <= '1'; if (r.r(UNIT_BIT + 2) or r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then @@ -2043,7 +2031,7 @@ begin else -- R is all zeroes but there are non-zero bits in S -- so shift them into R and set S to 0 - opsel_r <= RES_SHIFT; + set_r := '1'; re_set_result <= '1'; set_s := '1'; v.state := FINISH; @@ -2055,10 +2043,10 @@ begin end if; when LOOKUP => - -- r.opsel_a = AIN_B -- wait one cycle for inverse_table[B] lookup -- if this is a division, compute exponent -- (see comment on RENORM_B2 above) + opsel_a <= AIN_B; if r.use_a = '1' 
then re_sel2 <= REXP2_NE; re_set_result <= '1'; @@ -2136,15 +2124,15 @@ begin end if; when DIV_6 => - -- r.opsel_a = AIN_R -- test if remainder is 0 or >= B + opsel_b <= BIN_RND; + rbit_inc := '1'; if pcmpb_lt = '1' then -- quotient is correct, set X if remainder non-zero + set_r := '0'; v.x := r.p(UNIT_BIT + 2) or px_nz; else -- quotient needs to be incremented by 1 in R-bit position - rbit_inc := '1'; - opsel_b <= BIN_RND; v.x := not pcmpb_eq; end if; v.state := FINISH; @@ -2575,6 +2563,7 @@ begin when ROUNDING_3 => -- r.shift = clz(r.r) - 9 + opsel_r <= RES_SHIFT; mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec); re_sel2 <= REXP2_NE; -- set shift to new_exp - min_exp (== -1022) @@ -2582,11 +2571,11 @@ begin rs_con2 <= RSCON2_MINEXP; rs_neg2 <= '1'; if mant_nz = '0' then + set_r := '0'; v.result_class := ZERO; arith_done := '1'; else -- Renormalize result after rounding - opsel_r <= RES_SHIFT; re_set_result <= '1'; v.denorm := exp_tiny; if new_exp < to_signed(-1022, EXP_BITS) then @@ -2605,6 +2594,7 @@ begin when EXC_RESULT => -- r.opsel_a = AIN_A, AIN_B or AIN_C according to which input is the result + opsel_a <= r.opsel_a; case r.opsel_a is when AIN_B => re_sel2 <= REXP2_B; @@ -2620,7 +2610,7 @@ begin arith_done := '1'; when DO_IDIVMOD => - -- r.opsel_a = AIN_B + opsel_a <= AIN_B; if r.b.class = ZERO then -- B is zero, signal overflow v.int_ovf := '1'; @@ -2657,21 +2647,19 @@ begin -- add the X bit onto R to round up B carry_in <= r.x; -- prepare to do count-leading-zeroes on A - v.opsel_a := AIN_A; v.state := IDIV_CLZA; when IDIV_CLZA => set_b := '1'; -- put R back into B - -- r.opsel_a = AIN_A + opsel_a <= AIN_A; if r.is_signed = '1' and r.a.negative = '1' then opsel_ainv <= '1'; carry_in <= '1'; end if; re_con2 <= RECON2_UNIT; re_set_result <= '1'; - v.opsel_a := AIN_C; v.state := IDIV_CLZA2; when IDIV_CLZA2 => - -- r.opsel_a = AIN_C + opsel_a <= AIN_C; rs_norm <= '1'; -- write the dividend back into A in case we negated it set_a_mant := '1'; @@ -2720,6 
+2708,12 @@ begin msel_inv <= '1'; msel_2 <= MUL2_LUT; set_y := '1'; + -- Get 0.5 into R in case the inverse estimate turns out to be + -- less than 0.5, in which case we want to use 0.5, to avoid + -- infinite loops in some cases. + -- It turns out the generated QNaN mantissa is actually what we want + opsel_r <= RES_MISC; + misc_sel <= "001"; if r.b.mantissa(UNIT_BIT + 1) = '1' then -- rounding up of the mantissa caused overflow, meaning the -- normalized B is 2.0. Since this is outside the range @@ -2740,10 +2734,22 @@ begin msel_2 <= MUL2_P; set_y := r.first; pshift := '1'; - f_to_multiply.valid <= r.first; + -- set shift to 64 + rs_con2 <= RSCON2_64; + if r.first = '1' then + if r.count = "11" then + if r.p(UNIT_BIT) = '0' and r.p(UNIT_BIT - 1) = '0' then + -- inverse estimate is < 0.5, so use 0.5 + v.state := IDIV_USE0_5; + else + v.state := IDIV_DODIV; + end if; + else + f_to_multiply.valid <= r.first; + end if; + end if; if multiply_to_f.valid = '1' then v.first := '1'; - v.count := r.count + 1; v.state := IDIV_NR2; end if; when IDIV_NR2 => @@ -2752,42 +2758,25 @@ begin msel_2 <= MUL2_P; f_to_multiply.valid <= r.first; pshift := '1'; - v.opsel_a := AIN_A; - -- set shift to 64 - rs_con2 <= RSCON2_64; - -- Get 0.5 into R in case the inverse estimate turns out to be - -- less than 0.5, in which case we want to use 0.5, to avoid - -- infinite loops in some cases. 
- opsel_r <= RES_MISC; - misc_sel <= "001"; + if r.first = '1' then + v.count := r.count + 1; + end if; if multiply_to_f.valid = '1' then v.first := '1'; - if r.count = "11" then - v.state := IDIV_DODIV; - else - v.state := IDIV_NR1; - end if; + v.state := IDIV_NR1; end if; when IDIV_USE0_5 => - -- Get 0.5 into R; it turns out the generated - -- QNaN mantissa is actually what we want - opsel_r <= RES_MISC; - misc_sel <= "001"; - v.opsel_a := AIN_A; + -- Put the 0.5 which is in R into Y as the inverse estimate + set_y := '1'; + msel_2 <= MUL2_R; -- set shift to 64 rs_con2 <= RSCON2_64; v.state := IDIV_DODIV; when IDIV_DODIV => - -- r.opsel_a = AIN_A -- r.shift = 64 - -- inverse estimate is in P or in R; copy it to Y - if r.b.mantissa(UNIT_BIT + 1) = '1' or - (r.p(UNIT_BIT) = '0' and r.p(UNIT_BIT - 1) = '0') then - msel_2 <= MUL2_R; - else - msel_2 <= MUL2_P; - end if; - set_y := '1'; + -- inverse estimate is in Y + -- put A (dividend) into R + opsel_a <= AIN_A; -- shift_res is 0 because r.shift = 64; -- put that into B, which now holds the quotient set_b_mant := '1'; @@ -2809,7 +2798,6 @@ begin else -- handle top bit of quotient specially -- for this we need the divisor left-justified in B - v.opsel_a := AIN_C; v.state := IDIV_EXT_TBH; end if; when IDIV_SH32 => @@ -2864,7 +2852,8 @@ begin msel_2 <= MUL2_P; v.inc_quot := not pcmpc_lt and not r.divmod; if r.divmod = '0' then - v.opsel_a := AIN_B; + -- get B into R for IDIV_DIVADJ state + opsel_a <= AIN_B; end if; -- set shift to UNIT_BIT (== 56) rs_con2 <= RSCON2_UNIT; @@ -2894,12 +2883,11 @@ begin -- r.shift = - b.exponent -- shift the quotient estimate right by b.exponent bits opsel_r <= RES_SHIFT; - v.opsel_a := AIN_B; v.first := '1'; v.state := IDIV_DIV7; when IDIV_DIV7 => - -- r.opsel_a = AIN_B -- add shifted quotient delta onto the total quotient + opsel_a <= AIN_B; opsel_b <= BIN_R; v.first := '1'; v.state := IDIV_DIV8; @@ -2923,12 +2911,11 @@ begin msel_1 <= MUL1_Y; msel_2 <= MUL2_P; v.inc_quot := not pcmpc_lt 
and not r.divmod; - if r.divmod = '0' then - v.opsel_a := AIN_B; - end if; -- set shift to UNIT_BIT (== 56) rs_con2 <= RSCON2_UNIT; if r.divmod = '0' then + -- get B into R for IDIV_DIVADJ state + opsel_a <= AIN_B; v.state := IDIV_DIVADJ; elsif pcmpc_eq = '1' then v.state := IDIV_ZERO; @@ -2936,16 +2923,17 @@ begin v.state := IDIV_MODADJ; end if; when IDIV_EXT_TBH => - -- r.opsel_a = AIN_C; get divisor into R and prepare to shift left + -- get divisor into R and prepare to shift left -- set shift to 63 - b.exp + opsel_a <= AIN_C; rs_sel1 <= RSH1_B; rs_neg1 <= '1'; rs_con2 <= RSCON2_63; - v.opsel_a := AIN_A; v.state := IDIV_EXT_TBH2; when IDIV_EXT_TBH2 => - -- r.opsel_a = AIN_A; divisor is in R + -- divisor is in R -- r.shift = 63 - b.exponent; shift and put into B + opsel_a <= AIN_A; set_b_mant := '1'; -- set shift to 64 - UNIT_BIT (== 8) rs_con2 <= RSCON2_64_UNIT; @@ -2966,13 +2954,13 @@ begin -- r.shift = 64 - B.exponent, so is at least 1 opsel_r <= RES_SHIFT; -- top bit of A gets lost in the shift, so handle it specially - v.opsel_a := AIN_B; -- set shift to 63 rs_con2 <= RSCON2_63; v.state := IDIV_EXT_TBH5; when IDIV_EXT_TBH5 => - -- r.opsel_a = AIN_B, r.shift = 63 + -- r.shift = 63 -- shifted dividend is in R, subtract left-justified divisor + opsel_a <= AIN_B; opsel_b <= BIN_R; opsel_ainv <= '1'; carry_in <= '1'; @@ -3004,15 +2992,14 @@ begin msel_2 <= MUL2_R; f_to_multiply.valid <= r.first; pshift := '1'; - v.opsel_a := AIN_B; opsel_r <= RES_MULT; if multiply_to_f.valid = '1' then v.first := '1'; v.state := IDIV_EXTDIV3; end if; when IDIV_EXTDIV3 => - -- r.opsel_a = AIN_B -- delta quotient is in R; add it to B + opsel_a <= AIN_B; opsel_b <= BIN_R; v.first := '1'; v.state := IDIV_EXTDIV4; @@ -3040,12 +3027,11 @@ begin opsel_r <= RES_SHIFT; -- test LS 64b of remainder in P against divisor in C v.inc_quot := not pcmpc_lt; - v.opsel_a := AIN_B; v.state := IDIV_EXTDIV6; when IDIV_EXTDIV6 => - -- r.opsel_a = AIN_B -- shifted remainder is in R, see if it is > 1 -- 
and compute R = R * Y if so + opsel_a <= AIN_B; msel_1 <= MUL1_Y; msel_2 <= MUL2_R; pshift := '1'; @@ -3060,7 +3046,6 @@ begin -- result is in R/S opsel_r <= RES_SHIFT; if pcmpc_lt = '0' then - v.opsel_a := AIN_C; v.state := IDIV_MODSUB; elsif r.result_sign = '0' then v.state := IDIV_DONE; @@ -3068,8 +3053,8 @@ begin v.state := IDIV_DIVADJ; end if; when IDIV_MODSUB => - -- r.opsel_a = AIN_C -- Subtract divisor from remainder + opsel_a <= AIN_C; opsel_ainv <= '1'; carry_in <= '1'; opsel_b <= BIN_R; @@ -3079,7 +3064,7 @@ begin v.state := IDIV_DIVADJ; end if; when IDIV_DIVADJ => - -- result (so far) is on the A input of the adder + -- result (so far) is in R -- set carry to increment quotient if needed -- and also negate R if the answer is negative opsel_ainv <= r.result_sign; @@ -3274,7 +3259,7 @@ begin if (or (mask and r.r)) = '1' and set_x = '1' then v.x := '1'; end if; - case r.opsel_a is + case opsel_a is when AIN_R => in_a0 := r.r; when AIN_A => @@ -3379,7 +3364,9 @@ begin end case; result <= misc; end case; - v.r := result; + if set_r = '1' then + v.r := result; + end if; if set_s = '1' then case opsel_s is when S_NEG => From 850b87c83fe5aa3345f5fde18a17cd8a813af86c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 5 Mar 2024 20:50:45 +1100 Subject: [PATCH 13/24] FPU: Get rid of r.madd_cmp and r.exp_cmp This saves a few LUTs and simplifies the code a little. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 3914a97..8a82e03 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -143,8 +143,6 @@ architecture behaviour of fpu is denorm : std_ulogic; round_mode : std_ulogic_vector(2 downto 0); is_subtract : std_ulogic; - exp_cmp : std_ulogic; - madd_cmp : std_ulogic; add_bsmall : std_ulogic; is_arith : std_ulogic; is_addition : std_ulogic; @@ -1069,15 +1067,6 @@ begin is_zero_den := adec.zeroexp or bdec.zeroexp or cdec.zeroexp; end if; - v.exp_cmp := '0'; - if adec.exponent > bdec.exponent then - v.exp_cmp := '1'; - end if; - v.madd_cmp := '0'; - if (adec.exponent + cdec.exponent + 1) >= bdec.exponent then - v.madd_cmp := '1'; - end if; - v.a_hi := 8x"0"; v.a_lo := 56x"0"; end if; @@ -1448,7 +1437,7 @@ begin v.cr_result := r.a.negative & not r.a.negative & "00"; elsif r.b.class = INFINITY then v.cr_result := not r.b.negative & r.b.negative & "00"; - elsif r.exp_cmp = '1' then + elsif r.a.exponent > r.b.exponent then -- A and B are both finite from here down v.cr_result := r.a.negative & not r.a.negative & "00"; elsif r.a.exponent /= r.b.exponent then @@ -1642,15 +1631,14 @@ begin rs_sel1 <= RSH1_B; rs_neg1 <= '1'; rs_sel2 <= RSH2_A; - v.add_bsmall := r.exp_cmp; - if r.exp_cmp = '0' then - if r.a.exponent = r.b.exponent then - v.state := ADD_2; - else - v.longmask := '0'; - v.state := ADD_SHIFT; - end if; + v.add_bsmall := '0'; + if r.a.exponent = r.b.exponent then + v.state := ADD_2; + elsif r.a.exponent < r.b.exponent then + v.longmask := '0'; + v.state := ADD_SHIFT; else + v.add_bsmall := '1'; v.state := ADD_1; end if; @@ -1764,7 +1752,7 @@ begin elsif r.c.denorm = '1' then opsel_a <= AIN_C; v.state := RENORM_C; - elsif r.madd_cmp = '0' then + elsif (r.a.exponent + r.c.exponent + 1) < r.b.exponent then -- addend is bigger, do multiply first -- if subtracting, sign is opposite to initial estimate 
f_to_multiply.valid <= '1'; @@ -1790,10 +1778,6 @@ begin v.first := '1'; v.state := MULT_1; else - v.madd_cmp := '0'; - if new_exp + 1 >= r.b.exponent then - v.madd_cmp := '1'; - end if; v.state := DO_FMADD; end if; else @@ -1841,10 +1825,6 @@ begin v.first := '1'; v.state := MULT_1; else - v.madd_cmp := '0'; - if new_exp + 1 >= r.b.exponent then - v.madd_cmp := '1'; - end if; v.state := DO_FMADD; end if; From 8648ddb64f2a7c99ad70fe3989879ea0ba8ea4d9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 6 Mar 2024 13:45:58 +1100 Subject: [PATCH 14/24] FPU: Eliminate EXC_RESULT state This lets us remove r.opsel_a and is a step towards moving the handling of exceptional cases out to a separate process. Signed-off-by: Paul Mackerras --- fpu.vhdl | 187 +++++++++++++++++++++++++------------------------------ 1 file changed, 84 insertions(+), 103 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 8a82e03..60640af 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -79,7 +79,6 @@ architecture behaviour of fpu is RENORM_A, RENORM_A2, RENORM_B, RENORM_B2, RENORM_C, RENORM_C2, - EXC_RESULT, IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3, IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3, IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5, @@ -152,7 +151,6 @@ architecture behaviour of fpu is first : std_ulogic; count : unsigned(1 downto 0); doing_ftdiv : std_ulogic_vector(1 downto 0); - opsel_a : std_ulogic_vector(1 downto 0); use_a : std_ulogic; use_b : std_ulogic; use_c : std_ulogic; @@ -872,7 +870,6 @@ begin variable rsgn_op : std_ulogic_vector(1 downto 0); variable is_nan_inf : std_ulogic; variable is_zero_den : std_ulogic; - variable sign_inv : std_ulogic; begin v := r; v.complete := '0'; @@ -881,7 +878,6 @@ begin exec_state := IDLE; is_nan_inf := '0'; is_zero_den := '0'; - sign_inv := '0'; v.cycle_1 := e_in.valid; if r.complete = '1' or r.do_intr = '1' then @@ -1133,7 +1129,6 @@ begin v.update_fprf := '0'; v.first := '0'; v.doing_ftdiv := "00"; - v.opsel_a := AIN_R; opsel_a <= AIN_R; opsel_ainv <= '0'; 
opsel_mask <= '0'; @@ -1216,7 +1211,13 @@ begin when DO_NAN_INF => -- At least one floating-point operand is infinity or NaN - invalid_mul := '0'; + if r.a.class = NAN then + opsel_a <= AIN_A; + elsif r.b.class = NAN then + opsel_a <= AIN_B; + else + opsel_a <= AIN_C; + end if; if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or @@ -1225,33 +1226,34 @@ begin v.fpscr(FPSCR_VXSNAN) := '1'; invalid := '1'; end if; + -- Check for this case here since VXIMZ can be set along with VXSNAN + invalid_mul := '0'; if r.is_multiply = '1' and ((r.a.class = INFINITY and r.c.class = ZERO) or (r.a.class = ZERO and r.c.class = INFINITY)) then v.fpscr(FPSCR_VXIMZ) := '1'; - qnan_result := '1'; invalid_mul := '1'; end if; + if r.int_result = '1' then + opsel_r <= RES_MISC; + misc_sel <= "110"; + v.fpscr(FPSCR_VXCVI) := '1'; + invalid := '1'; + end if; + if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then - if r.int_result = '1' then - v.state := INT_OFLOW; - else - if r.a.class = NAN then - v.opsel_a := AIN_A; - elsif r.b.class = NAN then - v.opsel_a := AIN_B; - elsif r.c.class = NAN then - v.opsel_a := AIN_C; - end if; - rsgn_op := RSGN_SEL; - v.state := EXC_RESULT; - end if; + rsgn_op := RSGN_SEL; + v.result_class := NAN; else - if (r.a.class = INFINITY or r.c.class = INFINITY) and invalid_mul = '0' then - sign_inv := r.is_multiply and r.is_subtract; + if invalid_mul = '1' then + qnan_result := '1'; + elsif (r.a.class = INFINITY or r.c.class = INFINITY) then + if r.is_multiply = '1' then + rsgn_op := RSGN_SUB; + end if; if r.is_subtract = '1' and r.b.class = INFINITY then v.fpscr(FPSCR_VXISI) := '1'; qnan_result := '1'; @@ -1271,67 +1273,55 @@ begin else v.result_class := INFINITY; end if; - if r.b.class = INFINITY and r.int_result = '1' then - -- fcti* - v.state := INT_OFLOW; - else - arith_done := '1'; - end if; end if; + arith_done := '1'; when DO_ZERO_DEN => -- At least one floating point operand is zero or 
denormalized - if r.is_addition = '1' then - opsel_a <= AIN_A; - else + if r.use_a = '1' and r.a.class = ZERO then opsel_a <= AIN_B; - end if; - if (r.use_a = '1' and r.a.class = ZERO) or - (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') or - (r.use_c = '1' and r.c.class = ZERO) then - if r.use_a = '1' and r.a.class = ZERO then - if r.is_inverse = '1' then - -- fdiv; result is 0 unless B=0 - if r.b.class = ZERO then - v.fpscr(FPSCR_VXZDZ) := '1'; - qnan_result := '1'; - else - v.result_class := ZERO; - end if; - arith_done := '1'; - elsif r.is_addition = '1' then - -- result is +/- B - v.opsel_a := AIN_B; - v.state := EXC_RESULT; - else - v.result_class := ZERO; - arith_done := '1'; - end if; - elsif r.use_c = '1' and r.c.class = ZERO then - v.opsel_a := AIN_B; - v.state := EXC_RESULT; + re_sel2 <= REXP2_B; + re_set_result <= '1'; + if r.is_inverse = '1' and r.b.class = ZERO then + -- fdiv with B=0 + v.fpscr(FPSCR_VXZDZ) := '1'; + qnan_result := '1'; + end if; + if r.is_addition = '1' then + -- result is +/- B + v.result_class := r.b.class; else - -- B is zero, other operands are finite - if r.int_result = '1' then - -- fcti* - arith_done := '1'; - elsif r.is_inverse = '1' then - -- fdiv, fre, frsqrte - v.result_class := INFINITY; - zero_divide := '1'; - arith_done := '1'; - elsif r.is_addition = '1' then - -- fadd, fsub - v.result_class := FINITE; - re_sel1 <= REXP1_A; - re_set_result <= '1'; - arith_done := '1'; - else - -- other things, result is zero - v.result_class := ZERO; - arith_done := '1'; - end if; + v.result_class := ZERO; end if; + arith_done := '1'; + elsif r.use_c = '1' and r.c.class = ZERO then + -- fmul or fmadd/sub with C=0 + opsel_a <= AIN_B; + re_sel2 <= REXP2_B; + re_set_result <= '1'; + if r.is_addition = '1' then + v.result_class := r.b.class; + else + v.result_class := ZERO; + end if; + arith_done := '1'; + elsif (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') then + -- B is zero, other operands are finite, not 
fmadd* + opsel_a <= AIN_A; + re_sel1 <= REXP1_A; + re_set_result <= '1'; + if r.is_inverse = '1' then + -- fdiv, fre, frsqrte + v.result_class := INFINITY; + zero_divide := '1'; + elsif r.is_addition = '1' then + -- fadd, fsub + v.result_class := FINITE; + else + -- other things, result is zero + v.result_class := ZERO; + end if; + arith_done := '1'; else -- some operand is denorm, and/or it's fmadd/fmsub with B=0 @@ -1682,11 +1672,16 @@ begin when DO_FSEL => rsgn_op := RSGN_SEL; if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then - v.opsel_a := AIN_C; + opsel_a <= AIN_C; + re_sel2 <= REXP2_C; + v.result_class := r.c.class; else - v.opsel_a := AIN_B; + opsel_a <= AIN_B; + re_sel2 <= REXP2_B; + v.result_class := r.b.class; end if; - v.state := EXC_RESULT; + re_set_result <= '1'; + arith_done := '1'; when DO_FSQRT => opsel_a <= AIN_B; @@ -2572,23 +2567,6 @@ begin re_set_result <= '1'; arith_done := '1'; - when EXC_RESULT => - -- r.opsel_a = AIN_A, AIN_B or AIN_C according to which input is the result - opsel_a <= r.opsel_a; - case r.opsel_a is - when AIN_B => - re_sel2 <= REXP2_B; - v.result_class := r.b.class; - when AIN_C => - re_sel2 <= REXP2_C; - v.result_class := r.c.class; - when others => - re_sel1 <= REXP1_A; - v.result_class := r.a.class; - end case; - re_set_result <= '1'; - arith_done := '1'; - when DO_IDIVMOD => opsel_a <= AIN_B; if r.b.class = ZERO then @@ -3113,25 +3091,28 @@ begin end case; + rsign := r.result_sign; case rsgn_op is when RSGN_SEL => - case v.opsel_a is + case opsel_a is when AIN_A => - v.result_sign := r.a.negative; + rsign := r.a.negative; when AIN_B => - v.result_sign := r.b.negative; + rsign := r.b.negative; when AIN_C => - v.result_sign := r.c.negative; + rsign := r.c.negative; when others => end case; + v.result_sign := rsign; when RSGN_SUB => - v.result_sign := r.result_sign xor r.is_subtract; + rsign := r.result_sign xor r.is_subtract; + v.result_sign := rsign; when RSGN_INV => - v.result_sign := not 
r.result_sign; + rsign := not r.result_sign; + v.result_sign := rsign; when others => end case; - rsign := r.result_sign xor sign_inv; if zero_divide = '1' then v.fpscr(FPSCR_ZX) := '1'; end if; From 70819c4c39d6892d7d9a338a6dd5b798bbb309a7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 7 Mar 2024 13:53:01 +1100 Subject: [PATCH 15/24] FPU: Do renormalization from DO_ZERO_DEN state Instead of having the various DO_* states (DO_FMUL, DO_FDIV, etc.) handle checking for denormalized inputs, we now have DO_ZERO_DEN state check for denormalized inputs and branch to RENORM_{A,B,C} to handle them. This also meant some changes were needed in how fsqrt and frsqrte handled inputs with odd exponent. The DO_FSQRT and DO_FRSQRTE states were very similar and have been combined into one. Signed-off-by: Paul Mackerras --- fpu.vhdl | 219 ++++++++++++++++++++----------------------------------- 1 file changed, 79 insertions(+), 140 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 60640af..ebbb564 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -53,7 +53,7 @@ architecture behaviour of fpu is DO_FCFID, DO_FCTI, DO_FRSP, DO_FRSP_2, DO_FRI, DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD, - DO_FRE, DO_FRSQRTE, + DO_FRE, DO_FSEL, DO_IDIVMOD, FRI_1, @@ -62,10 +62,9 @@ architecture behaviour of fpu is MULT_1, FMADD_0, FMADD_1, FMADD_2, FMADD_3, FMADD_4, FMADD_5, FMADD_6, - LOOKUP, DIV_2, DIV_3, DIV_4, DIV_5, DIV_6, FRE_1, - RSQRT_1, + SQRT_ODD, RSQRT_1, FTDIV_1, SQRT_1, SQRT_2, SQRT_3, SQRT_4, SQRT_5, SQRT_6, SQRT_7, SQRT_8, @@ -76,9 +75,8 @@ architecture behaviour of fpu is ROUND_UFLOW, ROUND_OFLOW, ROUNDING, ROUNDING_2, ROUNDING_3, DENORM, - RENORM_A, RENORM_A2, - RENORM_B, RENORM_B2, - RENORM_C, RENORM_C2, + RENORM_A, RENORM_B, RENORM_C, + RENORM_1, RENORM_2, IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3, IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3, IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5, @@ -174,6 +172,7 @@ architecture behaviour of fpu is res_int : std_ulogic; exec_state : state_t; cycle_1 : 
std_ulogic; + regsel : std_ulogic_vector(1 downto 0); end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -309,7 +308,7 @@ architecture behaviour of fpu is 2#10110# => DO_FSQRT, 2#11000# => DO_FRE, 2#11001# => DO_FMUL, - 2#11010# => DO_FRSQRTE, + 2#11010# => DO_FSQRT, 2#11100# => DO_FMADD, 2#11101# => DO_FMADD, 2#11110# => DO_FMADD, @@ -870,6 +869,7 @@ begin variable rsgn_op : std_ulogic_vector(1 downto 0); variable is_nan_inf : std_ulogic; variable is_zero_den : std_ulogic; + variable set_reg_ind : std_ulogic; begin v := r; v.complete := '0'; @@ -1170,6 +1170,7 @@ begin mult_mask := '0'; rnd_b32 := '0'; illegal := '0'; + set_reg_ind := '0'; re_sel1 <= REXP1_ZERO; re_sel2 <= REXP2_CON; @@ -1208,6 +1209,7 @@ begin v.x := '0'; v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX); set_s := '1'; + v.regsel := AIN_R; when DO_NAN_INF => -- At least one floating-point operand is infinity or NaN @@ -1331,6 +1333,14 @@ begin -- This will trigger for fmul as well as fmadd/sub, but -- it doesn't matter since r.is_subtract = 0 for fmul. 
rsgn_op := RSGN_SUB; + end if; + if r.a.denorm = '1' and (r.is_multiply = '1' or r.is_inverse = '1') then + v.state := RENORM_A; + elsif r.c.denorm = '1' then + v.state := RENORM_C; + elsif r.b.denorm = '1' and (r.is_inverse = '1' or r.is_sqrt = '1') then + v.state := RENORM_B; + elsif r.is_multiply = '1' and r.b.class = ZERO then v.state := DO_FMUL; else v.state := r.exec_state; @@ -1639,16 +1649,8 @@ begin re_sel1 <= REXP1_A; re_sel2 <= REXP2_C; re_set_result <= '1'; - -- Renormalize denorm operands - if r.a.denorm = '1' then - v.state := RENORM_A; - elsif r.c.denorm = '1' then - opsel_a <= AIN_C; - v.state := RENORM_C; - else - f_to_multiply.valid <= '1'; - v.state := MULT_1; - end if; + f_to_multiply.valid <= '1'; + v.state := MULT_1; when DO_FDIV => opsel_a <= AIN_A; @@ -1658,16 +1660,8 @@ begin re_neg2 <= '1'; re_set_result <= '1'; v.count := "00"; - -- Renormalize denorm operands - if r.a.denorm = '1' then - v.state := RENORM_A; - elsif r.b.denorm = '1' then - opsel_a <= AIN_B; - v.state := RENORM_B; - else - v.first := '1'; - v.state := DIV_2; - end if; + v.first := '1'; + v.state := DIV_2; when DO_FSEL => rsgn_op := RSGN_SEL; @@ -1691,14 +1685,13 @@ begin if r.b.negative = '1' then v.fpscr(FPSCR_VXSQRT) := '1'; qnan_result := '1'; - elsif r.b.denorm = '1' then - v.state := RENORM_B; - elsif r.b.exponent(0) = '0' then + end if; + if r.b.exponent(0) = '1' then + v.state := SQRT_ODD; + elsif r.is_inverse = '0' then v.state := SQRT_1; else - -- set shift to 1 - rs_con2 <= RSCON2_1; - v.state := RENORM_B2; + v.state := RSQRT_1; end if; when DO_FRE => @@ -1706,29 +1699,7 @@ begin v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; - if r.b.denorm = '1' then - v.state := RENORM_B; - else - v.state := FRE_1; - end if; - - when DO_FRSQRTE => - opsel_a <= AIN_B; - v.result_class := r.b.class; - re_sel2 <= REXP2_B; - re_set_result <= '1'; - -- set shift to 1 - rs_con2 <= RSCON2_1; - if r.b.negative = '1' then - v.fpscr(FPSCR_VXSQRT) := '1'; - 
qnan_result := '1'; - elsif r.b.denorm = '1' then - v.state := RENORM_B; - elsif r.b.exponent(0) = '0' then - v.state := RSQRT_1; - else - v.state := RENORM_B2; - end if; + v.state := FRE_1; when DO_FMADD => -- fmadd, fmsub, fnmadd, fnmsub @@ -1740,14 +1711,7 @@ begin re_set_result <= '1'; -- put b.exp into shift rs_sel1 <= RSH1_B; - -- Make sure A and C are normalized - if r.a.denorm = '1' then - opsel_a <= AIN_A; - v.state := RENORM_A; - elsif r.c.denorm = '1' then - opsel_a <= AIN_C; - v.state := RENORM_C; - elsif (r.a.exponent + r.c.exponent + 1) < r.b.exponent then + if (r.a.exponent + r.c.exponent + 1) < r.b.exponent then -- addend is bigger, do multiply first -- if subtracting, sign is opposite to initial estimate f_to_multiply.valid <= '1'; @@ -1759,68 +1723,48 @@ begin end if; when RENORM_A => - rs_norm <= '1'; - v.state := RENORM_A2; - - when RENORM_A2 => - set_a := '1'; - re_sel2 <= REXP2_NE; + -- Get A into R + opsel_a <= AIN_A; + v.regsel := AIN_A; + re_sel1 <= REXP1_A; re_set_result <= '1'; - if r.is_multiply = '1' then - opsel_a <= AIN_C; - if r.c.mantissa(UNIT_BIT) = '1' then - if r.is_addition = '0' or r.b.class = ZERO then - v.first := '1'; - v.state := MULT_1; - else - v.state := DO_FMADD; - end if; - else - v.state := RENORM_C; - end if; - else - opsel_a <= AIN_B; - if r.b.mantissa(UNIT_BIT) = '1' then - v.first := '1'; - v.state := DIV_2; - else - v.state := RENORM_B; - end if; - end if; + v.a.denorm := '0'; + v.state := RENORM_1; when RENORM_B => - rs_norm <= '1'; - renorm_sqrt := r.is_sqrt; - v.state := RENORM_B2; - - when RENORM_B2 => - set_b := '1'; - -- For fdiv, we need to increase result_exp by shift rather - -- than decreasing it as for fre/frsqrte and fsqrt. 
- -- We do that by negating r.shift in this cycle and then - -- setting result_exp to new_exp in the next cycle - if r.use_a = '1' then - rs_sel1 <= RSH1_S; - rs_neg1 <= '1'; - else - re_sel2 <= REXP2_NE; - re_set_result <= '1'; - end if; - v.state := LOOKUP; + -- Get B into R + opsel_a <= AIN_B; + v.regsel := AIN_B; + re_sel2 <= REXP2_B; + re_set_result <= '1'; + v.b.denorm := '0'; + v.state := RENORM_1; when RENORM_C => + -- Get C into R + opsel_a <= AIN_C; + v.regsel := AIN_C; + re_sel2 <= REXP2_C; + re_set_result <= '1'; + v.c.denorm := '0'; + v.state := RENORM_1; + + when RENORM_1 => rs_norm <= '1'; - v.state := RENORM_C2; + renorm_sqrt := r.is_sqrt; + v.state := RENORM_2; - when RENORM_C2 => - set_c := '1'; - re_sel2 <= REXP2_NE; - re_set_result <= '1'; - if r.is_addition = '0' or r.b.class = ZERO then - v.first := '1'; - v.state := MULT_1; + when RENORM_2 => + set_reg_ind := '1'; + if r.c.denorm = '1' then + -- must be either fmul or fmadd/sub + v.state := RENORM_C; + elsif r.b.denorm = '1' and r.is_addition = '0' then + v.state := RENORM_B; + elsif r.is_multiply = '1' and r.b.class = ZERO then + v.state := DO_FMUL; else - v.state := DO_FMADD; + v.state := r.exec_state; end if; when ADD_1 => @@ -2017,28 +1961,6 @@ begin v.state := NORMALIZE; end if; - when LOOKUP => - -- wait one cycle for inverse_table[B] lookup - -- if this is a division, compute exponent - -- (see comment on RENORM_B2 above) - opsel_a <= AIN_B; - if r.use_a = '1' then - re_sel2 <= REXP2_NE; - re_set_result <= '1'; - end if; - v.first := '1'; - if r.is_sqrt = '1' then - if r.is_inverse = '1' then - v.state := RSQRT_1; - else - v.state := SQRT_1; - end if; - elsif r.use_a = '1' then - v.state := DIV_2; - else - v.state := FRE_1; - end if; - when DIV_2 => -- compute Y = inverse_table[B] (when count=0); P = 2 - B * Y msel_1 <= MUL1_B; @@ -2135,6 +2057,12 @@ begin v.doing_ftdiv := "10"; end if; + when SQRT_ODD => + -- set shift to 1 + rs_con2 <= RSCON2_1; + v.regsel := AIN_B; + v.state := 
RENORM_2; + when RSQRT_1 => opsel_r <= RES_MISC; misc_sel <= "101"; @@ -3344,6 +3272,17 @@ begin end case; end if; + if set_reg_ind = '1' then + case r.regsel is + when AIN_A => + set_a := '1'; + when AIN_B => + set_b := '1'; + when AIN_C => + set_c := '1'; + when others => + end case; + end if; if set_a = '1' or set_a_exp = '1' then v.a.exponent := new_exp; end if; From 5f0b2d433da9cda0777fbf9bbfe58641c6a2bc57 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 7 Mar 2024 21:01:53 +1100 Subject: [PATCH 16/24] FPU: Simplify calculation of result_class For the various arithmetic operators, we only get to the DO_* states when the inputs are finite (not zero, infinity or NaN), so we can replace setting of v.result_class to r.a.class or r.b.class with an overall setting of it to FINITE in cycle 1 of all those operations. Also, integer division doesn't need to set the result class since the result is integer. Signed-off-by: Paul Mackerras --- fpu.vhdl | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index ebbb564..a0a52a8 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1190,6 +1190,7 @@ begin if r.cycle_1 = '1' and r.is_arith = '1' then v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; + v.result_class := FINITE; end if; case r.state is @@ -1538,7 +1539,6 @@ begin when DO_FRI => -- fri[nzpm] opsel_a <= AIN_B; - v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to exponent - 52 @@ -1555,7 +1555,6 @@ begin when DO_FRSP => -- r.shift = 0 opsel_a <= AIN_B; - v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; v.state := DO_FRSP_2; @@ -1580,7 +1579,6 @@ begin -- instr bit 8: 1=unsigned 0=signed -- instr bit 1: 1=round to zero 0=use fpscr[RN] opsel_a <= AIN_B; - v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; rs_sel1 <= RSH1_B; @@ -1624,7 +1622,6 @@ begin when DO_FADD => -- fadd[s] and fsub[s] opsel_a <= AIN_A; - v.result_class := r.a.class; re_sel1 <=
REXP1_A; re_set_result <= '1'; -- set shift to a.exp - b.exp @@ -1645,7 +1642,6 @@ begin when DO_FMUL => -- fmul[s] opsel_a <= AIN_A; - v.result_class := r.a.class; re_sel1 <= REXP1_A; re_sel2 <= REXP2_C; re_set_result <= '1'; @@ -1654,7 +1650,6 @@ begin when DO_FDIV => opsel_a <= AIN_A; - v.result_class := r.a.class; re_sel1 <= REXP1_A; re_sel2 <= REXP2_B; re_neg2 <= '1'; @@ -1679,7 +1674,6 @@ begin when DO_FSQRT => opsel_a <= AIN_B; - v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; if r.b.negative = '1' then @@ -1696,7 +1690,6 @@ begin when DO_FRE => opsel_a <= AIN_B; - v.result_class := r.b.class; re_sel2 <= REXP2_B; re_set_result <= '1'; v.state := FRE_1; @@ -1704,7 +1697,6 @@ begin when DO_FMADD => -- fmadd, fmsub, fnmadd, fnmsub opsel_a <= AIN_B; - v.result_class := r.a.class; -- put a.exp + c.exp into result_exp re_sel1 <= REXP1_A; re_sel2 <= REXP2_C; @@ -2511,7 +2503,6 @@ begin opsel_ainv <= '1'; carry_in <= '1'; end if; - v.result_class := FINITE; re_con2 <= RECON2_UNIT; re_set_result <= '1'; v.state := IDIV_NORMB; From 0e7c11a0e4970330954dfa681c4968ef4aa53a30 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 8 Mar 2024 14:44:47 +1100 Subject: [PATCH 17/24] FPU: Move result_class logic outside of state machine The various states choose one of four operations (including no-op) to be done on result_class. Some operations have side-effects on arith_done or FPSCR. The DO_NAN_INF and DO_ZERO_DEN states still set result_class directly since their logic is expected to move out to a separate process later. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 98 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 41 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index a0a52a8..baa087f 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -296,6 +296,12 @@ architecture behaviour of fpu is constant RSGN_SUB : std_ulogic_vector(1 downto 0) := "10"; constant RSGN_SEL : std_ulogic_vector(1 downto 0) := "11"; + signal rcls_op : std_ulogic_vector(1 downto 0); + constant RCLS_NOP : std_ulogic_vector(1 downto 0) := "00"; + constant RCLS_SEL : std_ulogic_vector(1 downto 0) := "01"; + constant RCLS_TZERO : std_ulogic_vector(1 downto 0) := "10"; + constant RCLS_TINF : std_ulogic_vector(1 downto 0) := "11"; + constant arith_decode : decode32 := ( -- indexed by bits 5..1 of opcode 2#01000# => DO_FRI, @@ -813,7 +819,6 @@ begin variable arith_done : std_ulogic; variable invalid : std_ulogic; variable zero_divide : std_ulogic; - variable mant_nz : std_ulogic; variable min_exp : signed(EXP_BITS-1 downto 0); variable max_exp : signed(EXP_BITS-1 downto 0); variable bias_exp : signed(EXP_BITS-1 downto 0); @@ -1186,6 +1191,7 @@ begin rs_norm <= '0'; rsgn_op := RSGN_NOP; + rcls_op <= RCLS_NOP; if r.cycle_1 = '1' and r.is_arith = '1' then v.fpscr(FPSCR_FR) := '0'; @@ -1531,7 +1537,7 @@ begin when DO_FMR => opsel_a <= AIN_B; - v.result_class := r.b.class; + rcls_op <= RCLS_SEL; re_sel2 <= REXP2_B; re_set_result <= '1'; v.writing_fpr := '1'; @@ -1605,12 +1611,12 @@ begin when DO_FCFID => opsel_a <= AIN_B; + rcls_op <= RCLS_SEL; if r.insn(8) = '0' and r.b.negative = '1' then -- fcfid[s] with negative operand, set R = -B opsel_ainv <= '1'; carry_in <= '1'; end if; - v.result_class := r.b.class; re_con2 <= RECON2_UNIT; re_set_result <= '1'; if r.b.class = ZERO then @@ -1660,14 +1666,13 @@ begin when DO_FSEL => rsgn_op := RSGN_SEL; + rcls_op <= RCLS_SEL; if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then opsel_a <= AIN_C; re_sel2 <= REXP2_C; - v.result_class := 
r.c.class; else opsel_a <= AIN_B; re_sel2 <= REXP2_B; - v.result_class := r.b.class; end if; re_set_result <= '1'; arith_done := '1'; @@ -1799,6 +1804,7 @@ begin -- check for overflow or negative result (can't get both) -- r.shift = -1 re_sel2 <= REXP2_NE; + rcls_op <= RCLS_TZERO; if r.r(63) = '1' then -- result is opposite sign to expected rsgn_op := RSGN_INV; @@ -1818,10 +1824,6 @@ begin elsif r.r(UNIT_BIT) = '1' then set_x := '1'; v.state := ROUNDING; - elsif (r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then - -- r.x must be zero at this point - v.result_class := ZERO; - arith_done := '1'; else rs_norm <= '1'; v.state := NORMALIZE; @@ -1934,19 +1936,15 @@ begin opsel_r <= RES_SHIFT; re_sel2 <= REXP2_NE; rs_norm <= '1'; + rcls_op <= RCLS_TZERO; if (r.r(UNIT_BIT + 2) or r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then - if s_nz = '0' then - -- must be a subtraction, and r.x must be zero - v.result_class := ZERO; - arith_done := '1'; - else - -- R is all zeroes but there are non-zero bits in S - -- so shift them into R and set S to 0 - set_r := '1'; - re_set_result <= '1'; - set_s := '1'; - v.state := FINISH; - end if; + -- S = 0 case is handled by RCLS_TZERO logic, otherwise... 
+ -- R is all zeroes but there are non-zero bits in S + -- so shift them into R and set S to 0 + set_r := '1'; + re_set_result <= '1'; + set_s := '1'; + v.state := FINISH; elsif r.r(UNIT_BIT + 2 downto UNIT_BIT) = "001" then v.state := FINISH; else @@ -2379,19 +2377,13 @@ begin end if; when ROUND_OFLOW => + rcls_op <= RCLS_TINF; v.fpscr(FPSCR_OX) := '1'; if r.fpscr(FPSCR_OE) = '0' then -- disabled overflow exception -- result depends on rounding mode v.fpscr(FPSCR_XX) := '1'; v.fpscr(FPSCR_FI) := '1'; - if r.round_mode(1 downto 0) = "00" or - (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then - v.result_class := INFINITY; - v.fpscr(FPSCR_FR) := '1'; - else - v.fpscr(FPSCR_FR) := '0'; - end if; -- construct largest representable number re_con2 <= RECON2_MAX; re_set_result <= '1'; @@ -2459,25 +2451,20 @@ begin when ROUNDING_3 => -- r.shift = clz(r.r) - 9 opsel_r <= RES_SHIFT; - mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec); re_sel2 <= REXP2_NE; -- set shift to new_exp - min_exp (== -1022) rs_sel1 <= RSH1_NE; rs_con2 <= RSCON2_MINEXP; rs_neg2 <= '1'; - if mant_nz = '0' then - set_r := '0'; - v.result_class := ZERO; - arith_done := '1'; + rcls_op <= RCLS_TZERO; + -- If the result is zero, that's handled below. 
+ -- Renormalize result after rounding + re_set_result <= '1'; + v.denorm := exp_tiny; + if new_exp < to_signed(-1022, EXP_BITS) then + v.state := DENORM; else - -- Renormalize result after rounding - re_set_result <= '1'; - v.denorm := exp_tiny; - if new_exp < to_signed(-1022, EXP_BITS) then - v.state := DENORM; - else - arith_done := '1'; - end if; + arith_done := '1'; end if; when DENORM => @@ -3032,6 +3019,35 @@ begin when others => end case; + case rcls_op is + when RCLS_SEL => + case opsel_a is + when AIN_A => + v.result_class := r.a.class; + when AIN_B => + v.result_class := r.b.class; + when AIN_C => + v.result_class := r.c.class; + when others => + end case; + when RCLS_TZERO => + if or (r.r(UNIT_BIT + 2 downto 0)) = '0' and s_nz = '0' then + v.result_class := ZERO; + arith_done := '1'; + end if; + when RCLS_TINF => + if r.fpscr(FPSCR_OE) = '0' then + if r.round_mode(1 downto 0) = "00" or + (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then + v.result_class := INFINITY; + v.fpscr(FPSCR_FR) := '1'; + else + v.fpscr(FPSCR_FR) := '0'; + end if; + end if; + when others => + end case; + if zero_divide = '1' then v.fpscr(FPSCR_ZX) := '1'; end if; From bbc485f33657168bec25df5c954d2911f94c5845 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 Mar 2024 12:31:58 +1100 Subject: [PATCH 18/24] FPU: Rework inputs to the main adder With this, the A input no longer has R as an option but now takes the rounding constants and the low-order bits of P (used as an adjustment in the square root algorithm). The B input has either R or zero. Both inputs can be optionally inverted for subtraction. The select inputs to the multiplexers now have 3 bits in opsel_a and 1 bit in opsel_b. The states which need R to be set now explicitly have set_r := 1 even though that is the default, essentially for documentation reasons. Similarly some states set opsel_b <= BIN_R even though that is the default. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 243 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 177 insertions(+), 66 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index baa087f..648bbaa 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -172,7 +172,7 @@ architecture behaviour of fpu is res_int : std_ulogic; exec_state : state_t; cycle_1 : std_ulogic; - regsel : std_ulogic_vector(1 downto 0); + regsel : std_ulogic_vector(2 downto 0); end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -180,8 +180,8 @@ architecture behaviour of fpu is signal r, rin : reg_type; signal fp_result : std_ulogic_vector(63 downto 0); - signal opsel_a : std_ulogic_vector(1 downto 0); - signal opsel_b : std_ulogic_vector(1 downto 0); + signal opsel_a : std_ulogic_vector(2 downto 0); + signal opsel_b : std_ulogic; signal opsel_r : std_ulogic_vector(1 downto 0); signal opsel_s : std_ulogic_vector(1 downto 0); signal opsel_ainv : std_ulogic; @@ -206,15 +206,17 @@ architecture behaviour of fpu is signal inverse_est : std_ulogic_vector(18 downto 0); -- opsel values - constant AIN_R : std_ulogic_vector(1 downto 0) := "00"; - constant AIN_A : std_ulogic_vector(1 downto 0) := "01"; - constant AIN_B : std_ulogic_vector(1 downto 0) := "10"; - constant AIN_C : std_ulogic_vector(1 downto 0) := "11"; - - constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00"; - constant BIN_R : std_ulogic_vector(1 downto 0) := "01"; - constant BIN_RND : std_ulogic_vector(1 downto 0) := "10"; - constant BIN_PS8 : std_ulogic_vector(1 downto 0) := "11"; + constant AIN_ZERO : std_ulogic_vector(2 downto 0) := "000"; + constant AIN_A : std_ulogic_vector(2 downto 0) := "001"; + constant AIN_B : std_ulogic_vector(2 downto 0) := "010"; + constant AIN_C : std_ulogic_vector(2 downto 0) := "011"; + constant AIN_PS8 : std_ulogic_vector(2 downto 0) := "100"; + constant AIN_RND_B32 : std_ulogic_vector(2 downto 0) := "101"; + constant AIN_RND_RBIT : std_ulogic_vector(2 downto 0) := 
"110"; + constant AIN_RND : std_ulogic_vector(2 downto 0) := "111"; + + constant BIN_ZERO : std_ulogic := '0'; + constant BIN_R : std_ulogic := '1'; constant RES_SUM : std_ulogic_vector(1 downto 0) := "00"; constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01"; @@ -857,10 +859,8 @@ begin variable maddend : std_ulogic_vector(127 downto 0); variable sum : std_ulogic_vector(63 downto 0); variable round_inc : std_ulogic_vector(63 downto 0); - variable rbit_inc : std_ulogic; variable mult_mask : std_ulogic; variable sign_bit : std_ulogic; - variable rnd_b32 : std_ulogic; variable rexp_in1 : signed(EXP_BITS-1 downto 0); variable rexp_in2 : signed(EXP_BITS-1 downto 0); variable rexp_cin : std_ulogic; @@ -1134,10 +1134,10 @@ begin v.update_fprf := '0'; v.first := '0'; v.doing_ftdiv := "00"; - opsel_a <= AIN_R; + opsel_a <= AIN_ZERO; opsel_ainv <= '0'; opsel_mask <= '0'; - opsel_b <= BIN_ZERO; + opsel_b <= BIN_R; opsel_binv <= '0'; opsel_r <= RES_SUM; opsel_s <= S_ZERO; @@ -1171,9 +1171,7 @@ begin renorm_sqrt := '0'; shiftin := '0'; shiftin0 := '0'; - rbit_inc := '0'; mult_mask := '0'; - rnd_b32 := '0'; illegal := '0'; set_reg_ind := '0'; @@ -1216,7 +1214,7 @@ begin v.x := '0'; v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX); set_s := '1'; - v.regsel := AIN_R; + v.regsel := AIN_ZERO; when DO_NAN_INF => -- At least one floating-point operand is infinity or NaN @@ -1227,6 +1225,8 @@ begin else opsel_a <= AIN_C; end if; + opsel_b <= BIN_ZERO; + set_r := '1'; if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or @@ -1287,6 +1287,8 @@ begin when DO_ZERO_DEN => -- At least one floating point operand is zero or denormalized + opsel_b <= BIN_ZERO; + set_r := '1'; if r.use_a = '1' and r.a.class = ZERO then opsel_a <= AIN_B; re_sel2 <= REXP2_B; @@ -1406,6 +1408,8 @@ begin when DO_FCMP => -- fcmp[uo] opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; v.instr_done := '1'; update_fx := '1'; re_sel2 <= REXP2_B; @@ -1483,6 
+1487,7 @@ begin when DO_FMRG => -- fmrgew, fmrgow + set_r := '1'; opsel_r <= RES_MISC; misc_sel <= "100"; v.writing_fpr := '1'; @@ -1490,6 +1495,7 @@ begin when DO_MFFS => v.writing_fpr := '1'; + set_r := '1'; opsel_r <= RES_MISC; misc_sel <= "011"; case r.insn(20 downto 16) is @@ -1537,6 +1543,8 @@ begin when DO_FMR => opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; rcls_op <= RCLS_SEL; re_sel2 <= REXP2_B; re_set_result <= '1'; @@ -1545,6 +1553,8 @@ begin when DO_FRI => -- fri[nzpm] opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to exponent - 52 @@ -1561,17 +1571,19 @@ begin when DO_FRSP => -- r.shift = 0 opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; re_sel2 <= REXP2_B; re_set_result <= '1'; v.state := DO_FRSP_2; when DO_FRSP_2 => -- r.shift = 0 - -- set shift to exponent - -126 + -- set shift to exponent - -126 (for ROUND_UFLOW state) rs_sel1 <= RSH1_B; rs_con2 <= RSCON2_MINEXP; rs_neg2 <= '1'; - set_x := '1'; + set_x := '1'; -- uses r.r and r.shift if r.b.exponent < to_signed(-126, EXP_BITS) then v.state := ROUND_UFLOW; elsif r.b.exponent > to_signed(127, EXP_BITS) then @@ -1585,6 +1597,8 @@ begin -- instr bit 8: 1=unsigned 0=signed -- instr bit 1: 1=round to zero 0=use fpscr[RN] opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; re_sel2 <= REXP2_B; re_set_result <= '1'; rs_sel1 <= RSH1_B; @@ -1611,6 +1625,8 @@ begin when DO_FCFID => opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; rcls_op <= RCLS_SEL; if r.insn(8) = '0' and r.b.negative = '1' then -- fcfid[s] with negative operand, set R = -B @@ -1628,6 +1644,8 @@ begin when DO_FADD => -- fadd[s] and fsub[s] opsel_a <= AIN_A; + opsel_b <= BIN_ZERO; + set_r := '1'; re_sel1 <= REXP1_A; re_set_result <= '1'; -- set shift to a.exp - b.exp @@ -1648,6 +1666,8 @@ begin when DO_FMUL => -- fmul[s] opsel_a <= AIN_A; + opsel_b <= BIN_ZERO; + set_r := '1'; re_sel1 <= REXP1_A; re_sel2 <= REXP2_C; re_set_result <= '1'; @@ 
-1656,6 +1676,8 @@ begin when DO_FDIV => opsel_a <= AIN_A; + opsel_b <= BIN_ZERO; + set_r := '1'; re_sel1 <= REXP1_A; re_sel2 <= REXP2_B; re_neg2 <= '1'; @@ -1674,11 +1696,15 @@ begin opsel_a <= AIN_B; re_sel2 <= REXP2_B; end if; + opsel_b <= BIN_ZERO; + set_r := '1'; re_set_result <= '1'; arith_done := '1'; when DO_FSQRT => opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; re_sel2 <= REXP2_B; re_set_result <= '1'; if r.b.negative = '1' then @@ -1694,7 +1720,6 @@ begin end if; when DO_FRE => - opsel_a <= AIN_B; re_sel2 <= REXP2_B; re_set_result <= '1'; v.state := FRE_1; @@ -1702,6 +1727,8 @@ begin when DO_FMADD => -- fmadd, fmsub, fnmadd, fnmsub opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; -- put a.exp + c.exp into result_exp re_sel1 <= REXP1_A; re_sel2 <= REXP2_C; @@ -1722,6 +1749,8 @@ begin when RENORM_A => -- Get A into R opsel_a <= AIN_A; + opsel_b <= BIN_ZERO; + set_r := '1'; v.regsel := AIN_A; re_sel1 <= REXP1_A; re_set_result <= '1'; @@ -1731,6 +1760,8 @@ begin when RENORM_B => -- Get B into R opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; v.regsel := AIN_B; re_sel2 <= REXP2_B; re_set_result <= '1'; @@ -1740,6 +1771,8 @@ begin when RENORM_C => -- Get C into R opsel_a <= AIN_C; + opsel_b <= BIN_ZERO; + set_r := '1'; v.regsel := AIN_C; re_sel2 <= REXP2_C; re_set_result <= '1'; @@ -1767,6 +1800,8 @@ begin when ADD_1 => -- transferring B to R opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to b.exp - a.exp @@ -1779,6 +1814,7 @@ begin when ADD_SHIFT => -- r.shift = - exponent difference, r.longmask = 0 opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; v.x := s_nz; @@ -1795,6 +1831,7 @@ begin opsel_b <= BIN_R; opsel_binv <= r.is_subtract; carry_in <= r.is_subtract and not r.x; + set_r := '1'; -- set shift to -1 rs_con2 <= RSCON2_1; rs_neg2 <= '1'; @@ -1808,12 +1845,15 @@ begin if r.r(63) = '1' then -- result is opposite sign to expected 
rsgn_op := RSGN_INV; - opsel_ainv <= '1'; + opsel_a <= AIN_ZERO; + set_r := '1'; + opsel_binv <= '1'; carry_in <= '1'; v.state := FINISH; elsif r.r(UNIT_BIT + 1) = '1' then -- sum overflowed, shift right opsel_r <= RES_SHIFT; + set_r := '1'; re_set_result <= '1'; set_x := '1'; if exp_huge = '1' then @@ -1834,6 +1874,7 @@ begin opsel_b <= BIN_R; opsel_binv <= '1'; carry_in <= '1'; + set_r := '1'; v.state := CMP_2; when CMP_2 => @@ -1851,6 +1892,7 @@ begin when MULT_1 => f_to_multiply.valid <= r.first; opsel_r <= RES_MULT; + set_r := '1'; if multiply_to_f.valid = '1' then v.state := FINISH; end if; @@ -1867,6 +1909,7 @@ begin rs_sel1 <= RSH1_S; end if; opsel_r <= RES_MULT; + set_r := '1'; opsel_s <= S_MULT; set_s := '1'; if multiply_to_f.valid = '1' then @@ -1901,6 +1944,7 @@ begin when FMADD_3 => -- r.shift = addend exp - product exp opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; v.first := '1'; @@ -1914,6 +1958,7 @@ begin opsel_s <= S_MULT; set_s := '1'; if multiply_to_f.valid = '1' then + set_r := '1'; v.state := FMADD_5; end if; @@ -1921,8 +1966,9 @@ begin -- negate R:S:X if negative if r.r(63) = '1' then rsgn_op := RSGN_INV; - opsel_ainv <= '1'; + opsel_binv <= '1'; carry_in <= not (s_nz or r.x); + set_r := '1'; opsel_s <= S_NEG; set_s := '1'; end if; @@ -1993,8 +2039,9 @@ begin f_to_multiply.valid <= r.first; pshift := '1'; mult_mask := '1'; + opsel_r <= RES_MULT; if multiply_to_f.valid = '1' then - opsel_r <= RES_MULT; + set_r := '1'; v.first := '1'; v.state := DIV_5; end if; @@ -2012,14 +2059,14 @@ begin when DIV_6 => -- test if remainder is 0 or >= B - opsel_b <= BIN_RND; - rbit_inc := '1'; + opsel_a <= AIN_RND_RBIT; if pcmpb_lt = '1' then -- quotient is correct, set X if remainder non-zero set_r := '0'; v.x := r.p(UNIT_BIT + 2) or px_nz; else -- quotient needs to be incremented by 1 in R-bit position + set_r := '1'; v.x := not pcmpb_eq; end if; v.state := FINISH; @@ -2029,6 +2076,7 @@ begin re_neg1 <= '1'; re_set_result <= 
'1'; opsel_r <= RES_MISC; + set_r := '1'; misc_sel <= "101"; -- set shift to 1 rs_con2 <= RSCON2_1; @@ -2056,6 +2104,7 @@ begin when RSQRT_1 => opsel_r <= RES_MISC; misc_sel <= "101"; + set_r := '1'; re_sel1 <= REXP1_BHALF; re_neg1 <= '1'; re_set_result <= '1'; @@ -2069,6 +2118,7 @@ begin set_a := '1'; opsel_r <= RES_MISC; misc_sel <= "101"; + set_r := '1'; msel_1 <= MUL1_B; msel_2 <= MUL2_LUT; f_to_multiply.valid <= '1'; @@ -2083,6 +2133,7 @@ begin -- not expecting multiplier result yet -- r.shift = -1 opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; v.first := '1'; @@ -2094,9 +2145,10 @@ begin set_y := r.first; pshift := '1'; mult_mask := '1'; + opsel_r <= RES_MULT; if multiply_to_f.valid = '1' then -- put result into R - opsel_r <= RES_MULT; + set_r := '1'; v.first := '1'; v.state := SQRT_4; end if; @@ -2139,9 +2191,11 @@ begin -- wait for second multiply (should be here already) pshift := '1'; mult_mask := '1'; + opsel_r <= RES_MULT; + set_r := '1'; if multiply_to_f.valid = '1' then -- put result into R - opsel_r <= RES_MULT; + set_r := '1'; v.first := '1'; v.count := r.count + 1; if r.count < 2 then @@ -2184,7 +2238,8 @@ begin when SQRT_10 => -- Add the bottom 8 bits of P, sign-extended, onto R. 
- opsel_b <= BIN_PS8; + opsel_a <= AIN_PS8; + set_r := '1'; re_sel1 <= REXP1_BHALF; re_set_result <= '1'; -- set shift to 1 @@ -2208,12 +2263,14 @@ begin when SQRT_12 => -- test if remainder is 0 or >= B = 2*R + 1 + set_r := '0'; + carry_in <= '1'; if pcmpb_lt = '1' then -- square root is correct, set X if remainder non-zero v.x := r.p(UNIT_BIT + 2) or px_nz; else -- square root needs to be incremented by 1 - carry_in <= '1'; + set_r := '1'; v.x := not pcmpb_eq; end if; v.state := FINISH; @@ -2221,6 +2278,7 @@ begin when INT_SHIFT => -- r.shift = b.exponent - 52 opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; set_x := '1'; @@ -2232,6 +2290,7 @@ begin when INT_ROUND => -- r.shift = -4 (== 52 - UNIT_BIT) opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign); @@ -2247,14 +2306,16 @@ begin when INT_ISHIFT => -- r.shift = b.exponent - UNIT_BIT; opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; v.state := INT_FINAL; when INT_FINAL => -- Negate if necessary, and increment for rounding if needed - opsel_ainv <= r.result_sign; + opsel_binv <= r.result_sign; carry_in <= r.fpscr(FPSCR_FR) xor r.result_sign; + set_r := '1'; -- Check for possible overflows case r.insn(9 downto 8) is when "00" => -- fctiw[z] @@ -2281,13 +2342,15 @@ begin else msb := r.r(63); end if; + opsel_r <= RES_MISC; misc_sel <= "110"; if (r.insn(8) = '0' and msb /= r.result_sign) or (r.insn(8) = '1' and msb /= '1') then - opsel_r <= RES_MISC; + set_r := '1'; v.fpscr(FPSCR_VXCVI) := '1'; invalid := '1'; else + set_r := '0'; if r.fpscr(FPSCR_FI) = '1' then v.fpscr(FPSCR_XX) := '1'; end if; @@ -2297,6 +2360,7 @@ begin when INT_OFLOW => opsel_r <= RES_MISC; misc_sel <= "110"; + set_r := '1'; v.fpscr(FPSCR_VXCVI) := '1'; invalid := '1'; arith_done := '1'; @@ -2304,6 +2368,7 @@ begin when FRI_1 => -- r.shift = b.exponent - 52 opsel_r <= RES_SHIFT; + set_r := 
'1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; set_x := '1'; @@ -2335,6 +2400,7 @@ begin -- Shift so we have 9 leading zeroes (we know R is non-zero) -- r.shift = clz(r.r) - 7 opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; -- set shift to new_exp - min_exp @@ -2353,10 +2419,12 @@ begin when ROUND_UFLOW => -- r.shift = - amount by which exponent underflows v.tiny := '1'; + opsel_r <= RES_SHIFT; + set_r := '0'; if r.fpscr(FPSCR_UE) = '0' then -- disabled underflow exception case -- have to denormalize before rounding - opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; set_x := '1'; @@ -2379,16 +2447,18 @@ begin when ROUND_OFLOW => rcls_op <= RCLS_TINF; v.fpscr(FPSCR_OX) := '1'; + opsel_r <= RES_MISC; + misc_sel <= "010"; + set_r := '0'; if r.fpscr(FPSCR_OE) = '0' then -- disabled overflow exception -- result depends on rounding mode + set_r := '1'; v.fpscr(FPSCR_XX) := '1'; v.fpscr(FPSCR_FI) := '1'; -- construct largest representable number re_con2 <= RECON2_MAX; re_set_result <= '1'; - opsel_r <= RES_MISC; - misc_sel <= "010"; arith_done := '1'; else -- enabled overflow exception @@ -2401,11 +2471,12 @@ begin when ROUNDING => opsel_mask <= '1'; + set_r := '1'; round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign); v.fpscr(FPSCR_FR downto FPSCR_FI) := round; if round(1) = '1' then -- increment the LSB for the precision - opsel_b <= BIN_RND; + opsel_a <= AIN_RND; -- set shift to -1 rs_con2 <= RSCON2_1; rs_neg2 <= '1'; @@ -2432,8 +2503,10 @@ begin -- r.shift = -1 v.x := '0'; re_sel2 <= REXP2_NE; + opsel_r <= RES_SHIFT; + set_r := '0'; if r.r(UNIT_BIT + 1) = '1' then - opsel_r <= RES_SHIFT; + set_r := '1'; re_set_result <= '1'; if exp_huge = '1' then v.state := ROUND_OFLOW; @@ -2451,6 +2524,7 @@ begin when ROUNDING_3 => -- r.shift = clz(r.r) - 9 opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; -- set shift to new_exp - min_exp (== -1022) rs_sel1 <= RSH1_NE; @@ -2470,12 
+2544,23 @@ begin when DENORM => -- r.shift = result_exp - -1022 opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; arith_done := '1'; when DO_IDIVMOD => opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; + -- take absolute value for signed division + if r.is_signed = '1' and r.b.negative = '1' then + opsel_ainv <= '1'; + carry_in <= '1'; + end if; + -- normalize and round up B to 8.56 format, like fcfid[u] + re_con2 <= RECON2_UNIT; + re_set_result <= '1'; if r.b.class = ZERO then -- B is zero, signal overflow v.int_ovf := '1'; @@ -2484,14 +2569,6 @@ begin -- A is zero, result is zero (both for div and for mod) v.state := IDIV_ZERO; else - -- take absolute value for signed division, and - -- normalize and round up B to 8.56 format, like fcfid[u] - if r.is_signed = '1' and r.b.negative = '1' then - opsel_ainv <= '1'; - carry_in <= '1'; - end if; - re_con2 <= RECON2_UNIT; - re_set_result <= '1'; v.state := IDIV_NORMB; end if; when IDIV_NORMB => @@ -2504,17 +2581,21 @@ begin -- get B into the range [1, 2) in 8.56 format set_x := '1'; -- record if any 1 bits shifted out opsel_r <= RES_SHIFT; + set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; v.state := IDIV_NORMB3; when IDIV_NORMB3 => -- add the X bit onto R to round up B carry_in <= r.x; + set_r := '1'; -- prepare to do count-leading-zeroes on A v.state := IDIV_CLZA; when IDIV_CLZA => set_b := '1'; -- put R back into B opsel_a <= AIN_A; + opsel_b <= BIN_ZERO; + set_r := '1'; if r.is_signed = '1' and r.a.negative = '1' then opsel_ainv <= '1'; carry_in <= '1'; @@ -2523,16 +2604,17 @@ begin re_set_result <= '1'; v.state := IDIV_CLZA2; when IDIV_CLZA2 => - opsel_a <= AIN_C; rs_norm <= '1'; -- write the dividend back into A in case we negated it set_a_mant := '1'; -- while doing the count-leading-zeroes on A, -- also compute A - B to tell us whether A >= B -- (using the original value of B, which is now in C) + opsel_a <= AIN_C; opsel_b <= BIN_R; opsel_ainv <= '1'; carry_in <= 
'1'; + set_r := '1'; v.state := IDIV_CLZA3; when IDIV_CLZA3 => -- save the exponent of A (but don't overwrite the mantissa) @@ -2578,6 +2660,7 @@ begin -- It turns out the generated QNaN mantissa is actually what we want opsel_r <= RES_MISC; misc_sel <= "001"; + set_r := '1'; if r.b.mantissa(UNIT_BIT + 1) = '1' then -- rounding up of the mantissa caused overflow, meaning the -- normalized B is 2.0. Since this is outside the range @@ -2641,6 +2724,8 @@ begin -- inverse estimate is in Y -- put A (dividend) into R opsel_a <= AIN_A; + opsel_b <= BIN_ZERO; + set_r := '1'; -- shift_res is 0 because r.shift = 64; -- put that into B, which now holds the quotient set_b_mant := '1'; @@ -2667,6 +2752,7 @@ begin when IDIV_SH32 => -- r.shift = 32, R contains the dividend opsel_r <= RES_SHIFT; + set_r := '1'; -- set shift to -UNIT_BIT (== -56) rs_con2 <= RSCON2_UNIT; rs_neg2 <= '1'; @@ -2687,12 +2773,14 @@ begin rs_sel1 <= RSH1_B; rs_neg1 <= '1'; if multiply_to_f.valid = '1' then + set_r := '1'; v.state := IDIV_DIV2; end if; when IDIV_DIV2 => -- r.shift = - b.exponent -- shift the quotient estimate right by b.exponent bits opsel_r <= RES_SHIFT; + set_r := '1'; v.first := '1'; v.state := IDIV_DIV3; when IDIV_DIV3 => @@ -2708,6 +2796,7 @@ begin opsel_s <= S_MULT; set_s := '1'; if multiply_to_f.valid = '1' then + set_r := '1'; v.state := IDIV_DIV4; end if; when IDIV_DIV4 => @@ -2718,6 +2807,8 @@ begin if r.divmod = '0' then -- get B into R for IDIV_DIVADJ state opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; end if; -- set shift to UNIT_BIT (== 56) rs_con2 <= RSCON2_UNIT; @@ -2741,18 +2832,21 @@ begin rs_sel1 <= RSH1_B; rs_neg1 <= '1'; if multiply_to_f.valid = '1' then + set_r := '1'; v.state := IDIV_DIV6; end if; when IDIV_DIV6 => -- r.shift = - b.exponent -- shift the quotient estimate right by b.exponent bits opsel_r <= RES_SHIFT; + set_r := '1'; v.first := '1'; v.state := IDIV_DIV7; when IDIV_DIV7 => -- add shifted quotient delta onto the total quotient opsel_a <= 
AIN_B; opsel_b <= BIN_R; + set_r := '1'; v.first := '1'; v.state := IDIV_DIV8; when IDIV_DIV8 => @@ -2768,6 +2862,7 @@ begin opsel_s <= S_MULT; set_s := '1'; if multiply_to_f.valid = '1' then + set_r := '1'; v.state := IDIV_DIV9; end if; when IDIV_DIV9 => @@ -2780,6 +2875,8 @@ begin if r.divmod = '0' then -- get B into R for IDIV_DIVADJ state opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '1'; v.state := IDIV_DIVADJ; elsif pcmpc_eq = '1' then v.state := IDIV_ZERO; @@ -2790,6 +2887,8 @@ begin -- get divisor into R and prepare to shift left -- set shift to 63 - b.exp opsel_a <= AIN_C; + opsel_b <= BIN_ZERO; + set_r := '1'; rs_sel1 <= RSH1_B; rs_neg1 <= '1'; rs_con2 <= RSCON2_63; @@ -2798,6 +2897,8 @@ begin -- divisor is in R -- r.shift = 63 - b.exponent; shift and put into B opsel_a <= AIN_A; + opsel_b <= BIN_ZERO; + set_r := '1'; set_b_mant := '1'; -- set shift to 64 - UNIT_BIT (== 8) rs_con2 <= RSCON2_64_UNIT; @@ -2817,6 +2918,7 @@ begin -- dividend (A) is in R -- r.shift = 64 - B.exponent, so is at least 1 opsel_r <= RES_SHIFT; + set_r := '1'; -- top bit of A gets lost in the shift, so handle it specially -- set shift to 63 rs_con2 <= RSCON2_63; @@ -2828,6 +2930,7 @@ begin opsel_b <= BIN_R; opsel_ainv <= '1'; carry_in <= '1'; + set_r := '1'; -- and put 1<<63 into B as the divisor (S is still 0) shiftin0 := '1'; set_b_mant := '1'; @@ -2848,6 +2951,7 @@ begin -- dividend is in R -- r.shift = 64 - B.exponent opsel_r <= RES_SHIFT; + set_r := '1'; v.first := '1'; v.state := IDIV_EXTDIV2; when IDIV_EXTDIV2 => @@ -2858,6 +2962,7 @@ begin pshift := '1'; opsel_r <= RES_MULT; if multiply_to_f.valid = '1' then + set_r := '1'; v.first := '1'; v.state := IDIV_EXTDIV3; end if; @@ -2865,6 +2970,7 @@ begin -- delta quotient is in R; add it to B opsel_a <= AIN_B; opsel_b <= BIN_R; + set_r := '1'; v.first := '1'; v.state := IDIV_EXTDIV4; when IDIV_EXTDIV4 => @@ -2883,12 +2989,14 @@ begin rs_neg1 <= '1'; rs_con2 <= RSCON2_UNIT; if multiply_to_f.valid = '1' then + set_r := '1'; 
v.state := IDIV_EXTDIV5; end if; when IDIV_EXTDIV5 => -- r.shift = r.b.exponent - 56 -- remainder is in R/S; shift it right r.b.exponent bits opsel_r <= RES_SHIFT; + set_r := '1'; -- test LS 64b of remainder in P against divisor in C v.inc_quot := not pcmpc_lt; v.state := IDIV_EXTDIV6; @@ -2896,6 +3004,8 @@ begin -- shifted remainder is in R, see if it is > 1 -- and compute R = R * Y if so opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := '0'; msel_1 <= MUL1_Y; msel_2 <= MUL2_R; pshift := '1'; @@ -2903,12 +3013,15 @@ begin f_to_multiply.valid <= '1'; v.state := IDIV_EXTDIV2; else + -- Put B (quotient) into R for IDIV_DIVADJ state + set_r := '1'; v.state := IDIV_DIVADJ; end if; when IDIV_MODADJ => -- r.shift = 56 -- result is in R/S opsel_r <= RES_SHIFT; + set_r := '1'; if pcmpc_lt = '0' then v.state := IDIV_MODSUB; elsif r.result_sign = '0' then @@ -2922,6 +3035,7 @@ begin opsel_ainv <= '1'; carry_in <= '1'; opsel_b <= BIN_R; + set_r := '1'; if r.result_sign = '0' then v.state := IDIV_DONE; else @@ -2931,11 +3045,11 @@ begin -- result (so far) is in R -- set carry to increment quotient if needed -- and also negate R if the answer is negative - opsel_ainv <= r.result_sign; + opsel_binv <= r.result_sign; carry_in <= r.inc_quot xor r.result_sign; - rnd_b32 := '1'; + set_r := '1'; if r.divmod = '0' then - opsel_b <= BIN_RND; + opsel_a <= AIN_RND_B32; end if; if r.is_signed = '0' then v.state := IDIV_DONE; @@ -2984,6 +3098,7 @@ begin when IDIV_ZERO => opsel_r <= RES_MISC; misc_sel <= "000"; + set_r := '1'; v.xerc_result := v.xerc; if r.oe = '1' then v.xerc_result.ov := r.int_ovf; @@ -3156,36 +3271,32 @@ begin v.x := '1'; end if; case opsel_a is - when AIN_R => - in_a0 := r.r; when AIN_A => in_a0 := r.a.mantissa; when AIN_B => in_a0 := r.b.mantissa; - when others => + when AIN_C => in_a0 := r.c.mantissa; + when AIN_PS8 => -- 8 LSBs of P sign-extended to 64 + in_a0 := std_ulogic_vector(resize(signed(r.p(7 downto 0)), 64)); + when AIN_RND_B32 => + in_a0 := (32 => 
r.result_sign and r.single_prec, others => '0'); + when AIN_RND_RBIT => + in_a0 := (DP_RBIT => '1', others => '0'); + when AIN_RND => + in_a0 := (SP_LSB => r.single_prec, DP_LSB => not r.single_prec, others => '0'); + when others => + in_a0 := (others => '0'); end case; if opsel_ainv = '1' then in_a0 := not in_a0; end if; in_a <= in_a0; case opsel_b is - when BIN_ZERO => - in_b0 := (others => '0'); when BIN_R => in_b0 := r.r; - when BIN_RND => - if rnd_b32 = '1' then - round_inc := (32 => r.result_sign and r.single_prec, others => '0'); - elsif rbit_inc = '0' then - round_inc := (SP_LSB => r.single_prec, DP_LSB => not r.single_prec, others => '0'); - else - round_inc := (DP_RBIT => '1', others => '0'); - end if; - in_b0 := round_inc; when others => - -- BIN_PS8, 8 LSBs of P sign-extended to 64 - in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 0)), 64)); + in_b0 := (others => '0'); end case; if opsel_binv = '1' then in_b0 := not in_b0; From fcfdbc449c9f77ad20dd159373d7522248bb1c5c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 13 Mar 2024 09:45:46 +1100 Subject: [PATCH 19/24] FPU: Move condition register calculations to an explicit data path Instead of calculating v.cr_result in the state machine, we now have the state machine set a 'cr_op' variable which then controls what computation the CR data path does to set v.cr_result. The CR data path also handles updating the XERC result bits for integer operations (division and modulus). 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 269 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 153 insertions(+), 116 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 648bbaa..066f664 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -304,6 +304,13 @@ architecture behaviour of fpu is constant RCLS_TZERO : std_ulogic_vector(1 downto 0) := "10"; constant RCLS_TINF : std_ulogic_vector(1 downto 0) := "11"; + constant CROP_NONE : std_ulogic_vector(2 downto 0) := "000"; + constant CROP_FCMP : std_ulogic_vector(2 downto 0) := "001"; + constant CROP_MCRFS : std_ulogic_vector(2 downto 0) := "010"; + constant CROP_FTDIV : std_ulogic_vector(2 downto 0) := "100"; + constant CROP_FTSQRT : std_ulogic_vector(2 downto 0) := "101"; + constant CROP_INTRES : std_ulogic_vector(2 downto 0) := "110"; + constant arith_decode : decode32 := ( -- indexed by bits 5..1 of opcode 2#01000# => DO_FRI, @@ -875,6 +882,10 @@ begin variable is_nan_inf : std_ulogic; variable is_zero_den : std_ulogic; variable set_reg_ind : std_ulogic; + variable cr_op : std_ulogic_vector(2 downto 0); + variable cr_result : std_ulogic_vector(3 downto 0); + variable set_cr : std_ulogic; + variable set_fpcc : std_ulogic; begin v := r; v.complete := '0'; @@ -1144,6 +1155,7 @@ begin carry_in <= '0'; misc_sel <= "000"; fpscr_mask := (others => '1'); + cr_op := CROP_NONE; update_fx := '0'; arith_done := '0'; invalid := '0'; @@ -1160,6 +1172,8 @@ begin set_c := '0'; set_r := '1'; set_s := '0'; + set_cr := '0'; + set_fpcc := '0'; f_to_multiply.is_signed <= '0'; f_to_multiply.valid <= '0'; msel_1 <= MUL1_A; @@ -1361,6 +1375,8 @@ begin v.instr_done := '1'; when DO_MCRFS => + cr_op := CROP_MCRFS; + set_cr := '1'; j := to_integer(unsigned(insn_bfa(r.insn))); for i in 0 to 7 loop if i = j then @@ -1373,94 +1389,56 @@ begin v.instr_done := '1'; when DO_FTDIV => - v.cr_result := "0000"; -- set result_exp to the exponent of B re_sel2 <= REXP2_B; re_set_result <= '1'; - if r.a.class = INFINITY or r.b.class = ZERO or 
r.b.class = INFINITY or - (r.b.class = FINITE and r.b.denorm = '1') then - v.cr_result(2) := '1'; - end if; - if r.a.class = NAN or r.a.class = INFINITY or - r.b.class = NAN or r.b.class = ZERO or r.b.class = INFINITY or - (r.a.class = FINITE and r.a.exponent <= to_signed(-970, EXP_BITS)) then - v.cr_result(1) := '1'; - v.instr_done := '1'; - else + cr_op := CROP_FTDIV; + if (r.a.class = ZERO or r.a.class = FINITE) and r.b.class = FINITE then v.doing_ftdiv := "11"; v.first := '1'; v.state := FTDIV_1; - v.instr_done := '0'; + else + set_cr := '1'; + v.instr_done := '1'; end if; when DO_FTSQRT => + cr_op := CROP_FTSQRT; + set_cr := '1'; v.instr_done := '1'; - v.cr_result := "0000"; - if r.b.class = ZERO or r.b.class = INFINITY or - (r.b.class = FINITE and r.b.denorm = '1') then - v.cr_result(2) := '1'; - end if; - if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO - or r.b.negative = '1' or r.b.exponent <= to_signed(-970, EXP_BITS) then - v.cr_result(1) := '1'; - end if; when DO_FCMP => -- fcmp[uo] + -- Prepare to subtract mantissas, put B in R opsel_a <= AIN_B; opsel_b <= BIN_ZERO; set_r := '1'; - v.instr_done := '1'; update_fx := '1'; - re_sel2 <= REXP2_B; - re_set_result <= '1'; - if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or - (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') then - -- Signalling NAN - v.fpscr(FPSCR_VXSNAN) := '1'; - if r.insn(6) = '1' and r.fpscr(FPSCR_VE) = '0' then - v.fpscr(FPSCR_VXVC) := '1'; - end if; - invalid := '1'; - v.cr_result := "0001"; -- unordered - elsif r.a.class = NAN or r.b.class = NAN then - if r.insn(6) = '1' then + cr_op := CROP_FCMP; + if r.a.class = NAN or r.b.class = NAN then + if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or + (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + if r.insn(6) = '1' and r.fpscr(FPSCR_VE) = '0' then + v.fpscr(FPSCR_VXVC) := '1'; + end if; + invalid := '1'; + elsif r.insn(6) = '1' then -- fcmpo 
v.fpscr(FPSCR_VXVC) := '1'; invalid := '1'; end if; - v.cr_result := "0001"; -- unordered - elsif r.a.class = ZERO and r.b.class = ZERO then - v.cr_result := "0010"; -- equal - elsif r.a.negative /= r.b.negative then - v.cr_result := r.a.negative & r.b.negative & "00"; - elsif r.a.class = ZERO then - -- A and B are the same sign from here down - v.cr_result := not r.b.negative & r.b.negative & "00"; - elsif r.a.class = INFINITY then - if r.b.class = INFINITY then - v.cr_result := "0010"; - else - v.cr_result := r.a.negative & not r.a.negative & "00"; - end if; - elsif r.b.class = ZERO then - -- A is finite from here down - v.cr_result := r.a.negative & not r.a.negative & "00"; - elsif r.b.class = INFINITY then - v.cr_result := not r.b.negative & r.b.negative & "00"; - elsif r.a.exponent > r.b.exponent then - -- A and B are both finite from here down - v.cr_result := r.a.negative & not r.a.negative & "00"; - elsif r.a.exponent /= r.b.exponent then - -- A exponent is smaller than B - v.cr_result := not r.a.negative & r.a.negative & "00"; - else - -- Prepare to subtract mantissas, put B in R - v.cr_result := "0000"; - v.instr_done := '0'; + end if; + if r.a.class = FINITE and r.b.class = FINITE and + r.a.negative = r.b.negative and + r.a.exponent = r.b.exponent then v.state := CMP_1; + else + set_cr := '1'; + set_fpcc := '1'; + v.instr_done := '1'; end if; - v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result; when DO_MTFSB => -- mtfsb{0,1} @@ -1878,15 +1856,9 @@ begin v.state := CMP_2; when CMP_2 => - if r.r(63) = '1' then - -- A is smaller in magnitude - v.cr_result := not r.a.negative & r.a.negative & "00"; - elsif (r_hi_nz or r_lo_nz) = '0' then - v.cr_result := "0010"; - else - v.cr_result := r.a.negative & not r.a.negative & "00"; - end if; - v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result; + cr_op := CROP_FCMP; + set_cr := '1'; + set_fpcc := '1'; v.instr_done := '1'; when MULT_1 => @@ -2086,10 +2058,10 @@ begin -- We go through this state up to two times; the 
first sees if -- B.exponent is in the range [-1021,1020], and the second tests -- whether B.exp - A.exp is in the range [-1022,1020]. - v.cr_result(1) := exp_tiny or exp_huge; - -- set shift to a.exp - rs_sel2 <= RSH2_A; + rs_sel2 <= RSH2_A; -- set shift to a.exp + cr_op := CROP_FTDIV; if exp_tiny = '1' or exp_huge = '1' or r.a.class = ZERO or r.first = '0' then + set_cr := '1'; v.instr_done := '1'; else v.doing_ftdiv := "10"; @@ -3057,58 +3029,26 @@ begin v.state := IDIV_OVFCHK; end if; when IDIV_OVFCHK => + opsel_r <= RES_MISC; + misc_sel <= "000"; if r.single_prec = '0' then sign_bit := r.r(63); else sign_bit := r.r(31); end if; v.int_ovf := sign_bit xor r.result_sign; - if v.int_ovf = '1' then - v.state := IDIV_ZERO; - else - v.state := IDIV_DONE; - end if; + set_r := sign_bit xor r.result_sign; + v.state := IDIV_DONE; when IDIV_DONE => - v.xerc_result := v.xerc; - if r.oe = '1' then - v.xerc_result.ov := '0'; - v.xerc_result.ov32 := '0'; - v.writing_xer := '1'; - end if; - if r.m32b = '0' then - v.cr_result(3) := r.r(63); - v.cr_result(2 downto 1) := "00"; - if r.r = 64x"0" then - v.cr_result(1) := '1'; - else - v.cr_result(2) := not r.r(63); - end if; - else - v.cr_result(3) := r.r(31); - v.cr_result(2 downto 1) := "00"; - if r.r(31 downto 0) = 32x"0" then - v.cr_result(1) := '1'; - else - v.cr_result(2) := not r.r(31); - end if; - end if; - v.cr_result(0) := v.xerc.so; + cr_op := CROP_INTRES; + set_cr := '1'; v.writing_fpr := '1'; v.instr_done := '1'; when IDIV_ZERO => opsel_r <= RES_MISC; misc_sel <= "000"; set_r := '1'; - v.xerc_result := v.xerc; - if r.oe = '1' then - v.xerc_result.ov := r.int_ovf; - v.xerc_result.ov32 := r.int_ovf; - v.xerc_result.so := r.xerc.so or r.int_ovf; - v.writing_xer := '1'; - end if; - v.cr_result := "001" & v.xerc_result.so; - v.writing_fpr := '1'; - v.instr_done := '1'; + v.state := IDIV_DONE; end case; @@ -3525,6 +3465,103 @@ begin v.shift := rsh_in1 + rsh_in2 + (rs_neg1 or rs_neg2); end if; + -- Condition register data path 
+ cr_result := "0000"; + case cr_op is + when CROP_FCMP => + if r.a.class = NAN or r.b.class = NAN then + cr_result := "0001"; -- unordered + elsif r.a.class = ZERO and r.b.class = ZERO then + cr_result := "0010"; -- equal + elsif r.a.negative /= r.b.negative then + cr_result := r.a.negative & r.b.negative & "00"; + elsif r.a.class = INFINITY and r.b.class = INFINITY then + -- A and B are the same sign from here down + cr_result := "0010"; + elsif r.a.class = ZERO then + cr_result := not r.b.negative & r.b.negative & "00"; + elsif r.a.class = INFINITY then + cr_result := r.a.negative & not r.a.negative & "00"; + elsif r.b.class = ZERO then + -- A is finite from here down + cr_result := r.a.negative & not r.a.negative & "00"; + elsif r.b.class = INFINITY then + cr_result := not r.b.negative & r.b.negative & "00"; + elsif r.a.exponent > r.b.exponent then + -- A and B are both finite from here down + cr_result := r.a.negative & not r.a.negative & "00"; + elsif r.a.exponent /= r.b.exponent then + -- A exponent is smaller than B + cr_result := not r.a.negative & r.a.negative & "00"; + elsif r.r(63) = '1' then + -- A is smaller in magnitude + cr_result := not r.a.negative & r.a.negative & "00"; + elsif (r_hi_nz or r_lo_nz) = '0' then + cr_result := "0010"; + else + cr_result := r.a.negative & not r.a.negative & "00"; + end if; + when CROP_MCRFS => + j := to_integer(unsigned(insn_bfa(r.insn))); + for i in 0 to 7 loop + if i = j then + k := (7 - i) * 4; + cr_result := r.fpscr(k + 3 downto k); + end if; + end loop; + when CROP_FTDIV => + if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or + (r.b.class = FINITE and r.b.denorm = '1') then + cr_result(2) := '1'; + end if; + if r.a.class = NAN or r.a.class = INFINITY or + r.b.class = NAN or r.b.class = ZERO or r.b.class = INFINITY or + (r.a.class = FINITE and r.a.exponent <= to_signed(-970, EXP_BITS)) or + (r.doing_ftdiv(1) = '1' and (exp_tiny or exp_huge) = '1') then + cr_result(1) := '1'; + end if; + when 
CROP_FTSQRT => + if r.b.class = ZERO or r.b.class = INFINITY or + (r.b.class = FINITE and r.b.denorm = '1') then + cr_result(2) := '1'; + end if; + if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO + or r.b.negative = '1' or r.b.exponent <= to_signed(-970, EXP_BITS) then + cr_result(1) := '1'; + end if; + when CROP_INTRES => + v.xerc_result := v.xerc; + if r.oe = '1' then + v.xerc_result.ov := r.int_ovf; + v.xerc_result.ov32 := r.int_ovf; + v.xerc_result.so := r.xerc.so or r.int_ovf; + v.writing_xer := '1'; + end if; + if r.m32b = '0' then + cr_result(3) := r.r(63); + if r.r = 64x"0" then + cr_result(1) := '1'; + else + cr_result(2) := not r.r(63); + end if; + else + cr_result(3) := r.r(31); + if r.r(31 downto 0) = 32x"0" then + cr_result(1) := '1'; + else + cr_result(2) := not r.r(31); + end if; + end if; + cr_result(0) := v.xerc_result.so; + when others => + end case; + if set_cr = '1' then + v.cr_result := cr_result; + end if; + if set_fpcc = '1' then + v.fpscr(FPSCR_FL downto FPSCR_FU) := cr_result; + end if; + if r.update_fprf = '1' then v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.res_sign, r.result_class, r.r(UNIT_BIT) and not r.denorm); From b1bd2aa86532a6ac583502dc0060c72f861c1a3e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 13 Mar 2024 14:05:52 +1100 Subject: [PATCH 20/24] FPU: Make set_r independent of multiply_to_f.valid Signed-off-by: Paul Mackerras --- fpu.vhdl | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 066f664..6ca5982 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1924,13 +1924,13 @@ begin when FMADD_4 => msel_add <= MULADD_RS; + set_r := '1'; f_to_multiply.valid <= r.first; msel_inv <= r.is_subtract; opsel_r <= RES_MULT; opsel_s <= S_MULT; set_s := '1'; if multiply_to_f.valid = '1' then - set_r := '1'; v.state := FMADD_5; end if; @@ -2012,8 +2012,8 @@ begin pshift := '1'; mult_mask := '1'; opsel_r <= RES_MULT; + set_r := '1'; if multiply_to_f.valid = 
'1' then - set_r := '1'; v.first := '1'; v.state := DIV_5; end if; @@ -2118,9 +2118,8 @@ begin pshift := '1'; mult_mask := '1'; opsel_r <= RES_MULT; + set_r := '1'; if multiply_to_f.valid = '1' then - -- put result into R - set_r := '1'; v.first := '1'; v.state := SQRT_4; end if; @@ -2166,8 +2165,6 @@ begin opsel_r <= RES_MULT; set_r := '1'; if multiply_to_f.valid = '1' then - -- put result into R - set_r := '1'; v.first := '1'; v.count := r.count + 1; if r.count < 2 then @@ -2741,11 +2738,11 @@ begin f_to_multiply.valid <= r.first; pshift := '1'; opsel_r <= RES_MULT; + set_r := '1'; -- set shift to - b.exp rs_sel1 <= RSH1_B; rs_neg1 <= '1'; if multiply_to_f.valid = '1' then - set_r := '1'; v.state := IDIV_DIV2; end if; when IDIV_DIV2 => @@ -2765,10 +2762,10 @@ begin -- store the current quotient estimate in B set_b_mant := r.first; opsel_r <= RES_MULT; + set_r := '1'; opsel_s <= S_MULT; set_s := '1'; if multiply_to_f.valid = '1' then - set_r := '1'; v.state := IDIV_DIV4; end if; when IDIV_DIV4 => @@ -2800,11 +2797,11 @@ begin when IDIV_DIV5 => pshift := '1'; opsel_r <= RES_MULT; + set_r := '1'; -- set shift to - b.exp rs_sel1 <= RSH1_B; rs_neg1 <= '1'; if multiply_to_f.valid = '1' then - set_r := '1'; v.state := IDIV_DIV6; end if; when IDIV_DIV6 => @@ -2831,10 +2828,10 @@ begin -- store the current quotient estimate in B set_b_mant := r.first; opsel_r <= RES_MULT; + set_r := '1'; opsel_s <= S_MULT; set_s := '1'; if multiply_to_f.valid = '1' then - set_r := '1'; v.state := IDIV_DIV9; end if; when IDIV_DIV9 => @@ -2933,8 +2930,8 @@ begin f_to_multiply.valid <= r.first; pshift := '1'; opsel_r <= RES_MULT; + set_r := '1'; if multiply_to_f.valid = '1' then - set_r := '1'; v.first := '1'; v.state := IDIV_EXTDIV3; end if; @@ -2954,6 +2951,7 @@ begin msel_inv <= '1'; f_to_multiply.valid <= r.first; opsel_r <= RES_MULT; + set_r := '1'; opsel_s <= S_MULT; set_s := '1'; -- set shift to UNIT_BIT - b.exp @@ -2961,7 +2959,6 @@ begin rs_neg1 <= '1'; rs_con2 <= RSCON2_UNIT; if 
multiply_to_f.valid = '1' then - set_r := '1'; v.state := IDIV_EXTDIV5; end if; when IDIV_EXTDIV5 => From b4aae8511df964dc50bfc0ec1627a05298f4b7e3 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 Mar 2024 20:41:59 +1100 Subject: [PATCH 21/24] FPU: Move special case handling to a separate process This creates a new fpu_specialcases process that handles most of the logic that was previously in the DO_NAN_INF and DO_ZERO_DEN states. What remains of those states, i.e. the handling of denormalized inputs, is in a new DO_SPECIAL state. The state machine goes into DO_SPECIAL state after IDLE for any arithmetic operation where an input is a NaN, infinity, zero or denormalized value. Doing this means that the rest of the state machine won't try to start any computation which would need to be overridden by the logic to produce the result value selected by the fpu_specialcases process. Signed-off-by: Paul Mackerras --- fpu.vhdl | 360 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 205 insertions(+), 155 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 6ca5982..0698c63 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -47,7 +47,7 @@ architecture behaviour of fpu is mantissa : std_ulogic_vector(63 downto 0); -- 8.56 format end record; - type state_t is (IDLE, DO_ILLEGAL, DO_NAN_INF, DO_ZERO_DEN, + type state_t is (IDLE, DO_ILLEGAL, DO_SPECIAL, DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT, DO_FCFID, DO_FCTI, @@ -92,6 +92,17 @@ architecture behaviour of fpu is type decode32 is array(0 to 31) of state_t; type decode8 is array(0 to 7) of state_t; + type specialcase_t is record + invalid : std_ulogic; + zero_divide : std_ulogic; + new_fpscr : std_ulogic_vector(31 downto 0); + immed_result : std_ulogic; -- result is an input, zero, infinity or NaN + qnan_result : std_ulogic; + result_sel : std_ulogic_vector(2 downto 0); + result_class : fp_number_class; + rsgn_op : std_ulogic_vector(1 downto 0); + end 
record; + type reg_type is record state : state_t; busy : std_ulogic; @@ -172,7 +183,9 @@ architecture behaviour of fpu is res_int : std_ulogic; exec_state : state_t; cycle_1 : std_ulogic; + cycle_1_ar : std_ulogic; regsel : std_ulogic_vector(2 downto 0); + is_nan_inf : std_ulogic; end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -311,6 +324,8 @@ architecture behaviour of fpu is constant CROP_FTSQRT : std_ulogic_vector(2 downto 0) := "101"; constant CROP_INTRES : std_ulogic_vector(2 downto 0) := "110"; + signal scinfo : specialcase_t; + constant arith_decode : decode32 := ( -- indexed by bits 5..1 of opcode 2#01000# => DO_FRI, @@ -806,6 +821,140 @@ begin w_out.intr_vec <= 16#700#; w_out.srr1 <= (47-44 => r.illegal, 47-43 => not r.illegal, others => '0'); + -- This is active in the second cycle of an instruction, and works out if + -- we have a special case where one or more operand is NaN, infinity, or zero, + -- meaning that an exception is generated or a specific value results + -- immediately without further calculation. 
+ fpu_specialcases: process(all) + variable e : specialcase_t; + variable invalid_mul : std_ulogic; + begin + e.invalid := '0'; + e.zero_divide := '0'; + e.new_fpscr := (others => '0'); + e.immed_result := '0'; + e.qnan_result := '0'; + e.result_sel := AIN_ZERO; + e.result_class := FINITE; + e.rsgn_op := RSGN_NOP; + + -- Check if any operand is a signalling NAN + if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or + (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or + (r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then + e.new_fpscr(FPSCR_VXSNAN) := '1'; + e.invalid := '1'; + end if; + + -- Check for this case here since VXIMZ can be set along with VXSNAN + invalid_mul := '0'; + if r.is_multiply = '1' and + ((r.a.class = INFINITY and r.c.class = ZERO) or + (r.a.class = ZERO and r.c.class = INFINITY)) then + e.new_fpscr(FPSCR_VXIMZ) := '1'; + e.invalid := '1'; + invalid_mul := '1'; + end if; + + -- Note that any operand for which r.use_X is 0 will have class = ZERO + if r.is_nan_inf = '1' then + e.immed_result := '1'; + + if r.int_result = '1' then + e.qnan_result := '1'; + e.new_fpscr(FPSCR_VXCVI) := '1'; + + elsif r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then + e.result_class := NAN; + e.rsgn_op := RSGN_SEL; + -- Select the first input that is a NaN + if r.a.class = NAN then + e.result_sel := AIN_A; + elsif r.b.class = NAN then + e.result_sel := AIN_B; + elsif r.c.class = NAN then + e.result_sel := AIN_C; + end if; + + else + -- some operand is an infinity + if invalid_mul = '1' then + e.qnan_result := '1'; + elsif (r.a.class = INFINITY or r.c.class = INFINITY) then + if r.is_multiply = '1' then + e.rsgn_op := RSGN_SUB; + end if; + if r.is_subtract = '1' and r.b.class = INFINITY then + e.new_fpscr(FPSCR_VXISI) := '1'; + e.qnan_result := '1'; + end if; + end if; + if r.is_inverse = '1' and r.a.class = INFINITY and r.b.class = INFINITY then + e.new_fpscr(FPSCR_VXIDI) := '1'; + e.qnan_result := '1'; + end if; + if r.b.class = INFINITY and 
r.is_sqrt = '1' and r.b.negative = '1' then + e.new_fpscr(FPSCR_VXSQRT) := '1'; + e.qnan_result := '1'; + end if; + if r.b.class = INFINITY and r.is_inverse = '1' then + -- fdiv, fre, frsqrte + e.result_class := ZERO; + else + e.result_class := INFINITY; + end if; + end if; + + elsif r.use_a = '1' and r.a.class = ZERO then + e.immed_result := '1'; + if r.is_addition = '1' then + -- result is +/- B + e.result_sel := AIN_B; + e.result_class := r.b.class; + else + e.result_class := ZERO; + end if; + if r.is_inverse = '1' and r.b.class = ZERO then + -- fdiv 0 / 0 + e.new_fpscr(FPSCR_VXZDZ) := '1'; + e.qnan_result := '1'; + end if; + + elsif r.use_c = '1' and r.c.class = ZERO then + -- fmadd/sub A * 0 + B + e.immed_result := '1'; + e.result_sel := AIN_B; + e.result_class := r.b.class; + + elsif r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0' then + -- B is zero, other operands are finite + e.immed_result := '1'; + if r.is_inverse = '1' then + -- fdiv, fre, frsqrte + e.result_class := INFINITY; + e.new_fpscr(FPSCR_ZX) := '1'; + e.zero_divide := '1'; + elsif r.is_addition = '1' then + -- fadd, result is A + e.result_sel := AIN_A; + else + -- other things, result is zero + e.result_class := ZERO; + end if; + end if; + if r.is_sqrt = '1' and r.b.class = FINITE and r.b.negative = '1' then + e.immed_result := '1'; + e.new_fpscr(FPSCR_VXSQRT) := '1'; + e.qnan_result := '1'; + end if; + + if e.qnan_result = '1' then + e.invalid := '1'; + e.result_class := NAN; + end if; + scinfo <= e; + end process; + fpu_1: process(all) variable v : reg_type; variable adec : fpu_reg_type; @@ -895,6 +1044,7 @@ begin is_nan_inf := '0'; is_zero_den := '0'; v.cycle_1 := e_in.valid; + v.cycle_1_ar := '0'; if r.complete = '1' or r.do_intr = '1' then v.instr_done := '0'; @@ -949,6 +1099,7 @@ begin v.longmask := e_in.single; v.fp_rc := e_in.rc; v.is_arith := '1'; + v.cycle_1_ar := '1'; exec_state := arith_decode(to_integer(unsigned(e_in.insn(5 downto 1)))); if e_in.insn(5 downto 1) = 
"10110" or e_in.insn(5 downto 1) = "11010" then v.is_sqrt := '1'; @@ -1205,7 +1356,7 @@ begin rsgn_op := RSGN_NOP; rcls_op <= RCLS_NOP; - if r.cycle_1 = '1' and r.is_arith = '1' then + if r.cycle_1_ar = '1' then v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; v.result_class := FINITE; @@ -1217,10 +1368,9 @@ begin if e_in.valid = '1' then v.busy := '1'; v.exec_state := exec_state; - if is_nan_inf = '1' then - v.state := DO_NAN_INF; - elsif is_zero_den = '1' then - v.state := DO_ZERO_DEN; + v.is_nan_inf := is_nan_inf; + if is_nan_inf = '1' or is_zero_den = '1' then + v.state := DO_SPECIAL; else v.state := exec_state; end if; @@ -1230,144 +1380,25 @@ begin set_s := '1'; v.regsel := AIN_ZERO; - when DO_NAN_INF => - -- At least one floating-point operand is infinity or NaN - if r.a.class = NAN then - opsel_a <= AIN_A; - elsif r.b.class = NAN then - opsel_a <= AIN_B; - else - opsel_a <= AIN_C; - end if; - opsel_b <= BIN_ZERO; - set_r := '1'; - - if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or - (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or - (r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then - -- Signalling NAN - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; - - -- Check for this case here since VXIMZ can be set along with VXSNAN - invalid_mul := '0'; - if r.is_multiply = '1' and - ((r.a.class = INFINITY and r.c.class = ZERO) or - (r.a.class = ZERO and r.c.class = INFINITY)) then - v.fpscr(FPSCR_VXIMZ) := '1'; - invalid_mul := '1'; - end if; - - if r.int_result = '1' then - opsel_r <= RES_MISC; - misc_sel <= "110"; - v.fpscr(FPSCR_VXCVI) := '1'; - invalid := '1'; - end if; - - if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then - rsgn_op := RSGN_SEL; - v.result_class := NAN; - - else - if invalid_mul = '1' then - qnan_result := '1'; - elsif (r.a.class = INFINITY or r.c.class = INFINITY) then - if r.is_multiply = '1' then - rsgn_op := RSGN_SUB; - end if; - if r.is_subtract = '1' and r.b.class = INFINITY then - 
v.fpscr(FPSCR_VXISI) := '1'; - qnan_result := '1'; - end if; - end if; - if r.is_inverse = '1' and r.a.class = INFINITY and r.b.class = INFINITY then - v.fpscr(FPSCR_VXIDI) := '1'; - qnan_result := '1'; - end if; - if r.b.class = INFINITY and r.is_sqrt = '1' and r.b.negative = '1' then - v.fpscr(FPSCR_VXSQRT) := '1'; - qnan_result := '1'; - end if; - if r.b.class = INFINITY and r.is_inverse = '1' then - -- fdiv, fre, frsqrte - v.result_class := ZERO; - else - v.result_class := INFINITY; - end if; - end if; - arith_done := '1'; - - when DO_ZERO_DEN => - -- At least one floating point operand is zero or denormalized - opsel_b <= BIN_ZERO; - set_r := '1'; - if r.use_a = '1' and r.a.class = ZERO then - opsel_a <= AIN_B; - re_sel2 <= REXP2_B; - re_set_result <= '1'; - if r.is_inverse = '1' and r.b.class = ZERO then - -- fdiv with B=0 - v.fpscr(FPSCR_VXZDZ) := '1'; - qnan_result := '1'; - end if; - if r.is_addition = '1' then - -- result is +/- B - v.result_class := r.b.class; - else - v.result_class := ZERO; - end if; - arith_done := '1'; - elsif r.use_c = '1' and r.c.class = ZERO then - -- fmul or fmadd/sub with C=0 - opsel_a <= AIN_B; - re_sel2 <= REXP2_B; - re_set_result <= '1'; - if r.is_addition = '1' then - v.result_class := r.b.class; - else - v.result_class := ZERO; - end if; - arith_done := '1'; - elsif (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') then - -- B is zero, other operands are finite, not fmadd* - opsel_a <= AIN_A; - re_sel1 <= REXP1_A; - re_set_result <= '1'; - if r.is_inverse = '1' then - -- fdiv, fre, frsqrte - v.result_class := INFINITY; - zero_divide := '1'; - elsif r.is_addition = '1' then - -- fadd, fsub - v.result_class := FINITE; - else - -- other things, result is zero - v.result_class := ZERO; - end if; - arith_done := '1'; - + when DO_SPECIAL => + -- At least one floating point operand is NaN, infinity, zero or denormalized + -- Most of the special cases are handled in the fpu_specialcases process + -- and in the code below 
(the scinfo.immed_result = '1' block). + if r.is_multiply = '1' and r.b.class = ZERO then + -- This will trigger for fmul as well as fmadd/sub, but + -- it doesn't matter since r.is_subtract = 0 for fmul. + rsgn_op := RSGN_SUB; + end if; + if r.a.denorm = '1' and (r.is_multiply = '1' or r.is_inverse = '1') then + v.state := RENORM_A; + elsif r.c.denorm = '1' then + v.state := RENORM_C; + elsif r.b.denorm = '1' and (r.is_inverse = '1' or r.is_sqrt = '1') then + v.state := RENORM_B; + elsif r.is_multiply = '1' and r.b.class = ZERO then + v.state := DO_FMUL; else - -- some operand is denorm, and/or it's fmadd/fmsub with B=0 - -- A and C are non-zero if present, - -- B is non-zero if present except for multiply-add - if r.is_multiply = '1' and r.b.class = ZERO then - -- This will trigger for fmul as well as fmadd/sub, but - -- it doesn't matter since r.is_subtract = 0 for fmul. - rsgn_op := RSGN_SUB; - end if; - if r.a.denorm = '1' and (r.is_multiply = '1' or r.is_inverse = '1') then - v.state := RENORM_A; - elsif r.c.denorm = '1' then - v.state := RENORM_C; - elsif r.b.denorm = '1' and (r.is_inverse = '1' or r.is_sqrt = '1') then - v.state := RENORM_B; - elsif r.is_multiply = '1' and r.b.class = ZERO then - v.state := DO_FMUL; - else - v.state := r.exec_state; - end if; + v.state := r.exec_state; end if; when DO_ILLEGAL => @@ -1685,10 +1716,6 @@ begin set_r := '1'; re_sel2 <= REXP2_B; re_set_result <= '1'; - if r.b.negative = '1' then - v.fpscr(FPSCR_VXSQRT) := '1'; - qnan_result := '1'; - end if; if r.b.exponent(0) = '1' then v.state := SQRT_ODD; elsif r.is_inverse = '0' then @@ -3049,6 +3076,37 @@ begin end case; + -- Handle exceptions and special cases for arithmetic operations + if r.cycle_1_ar = '1' then + v.fpscr := r.fpscr or scinfo.new_fpscr; + invalid := scinfo.invalid; + zero_divide := scinfo.zero_divide; + qnan_result := scinfo.qnan_result; + if scinfo.immed_result = '1' then + -- state machine is in the DO_SPECIAL or DO_FSQRT state here + arith_done := 
'1'; + set_r := '1'; + opsel_a <= scinfo.result_sel; + opsel_b <= BIN_ZERO; + if scinfo.qnan_result = '1' then + opsel_r <= RES_MISC; + if r.int_result = '0' then + misc_sel <= "001"; + else + misc_sel <= "110"; + end if; + end if; + rsgn_op := scinfo.rsgn_op; + v.result_class := scinfo.result_class; + if scinfo.result_sel = AIN_B then + re_sel2 <= REXP2_B; + else + re_sel1 <= REXP1_A; + end if; + re_set_result <= '1'; + end if; + end if; + rsign := r.result_sign; case rsgn_op is when RSGN_SEL => @@ -3100,16 +3158,8 @@ begin when others => end case; - if zero_divide = '1' then - v.fpscr(FPSCR_ZX) := '1'; - end if; if qnan_result = '1' then - invalid := '1'; - v.result_class := NAN; rsign := '0'; - misc_sel <= "001"; - opsel_r <= RES_MISC; - arith_done := '1'; end if; if invalid = '1' then v.invalid := '1'; From b63773f6e9f2a3ec5e145630455af6a35fd1a366 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 19 Mar 2024 15:36:50 +1100 Subject: [PATCH 22/24] FPU: Move computation of main adder inputs out of the state machine Signed-off-by: Paul Mackerras --- decode1.vhdl | 4 +- fpu.vhdl | 127 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 79 insertions(+), 52 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 2fb1ad4..1978a27 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -151,8 +151,8 @@ architecture behaviour of decode1 is INSN_fabs => (FPU, FPU, OP_FP_MOVE, NONE, FRB, NONE, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', '0', NONE), INSN_fadd => (FPU, FPU, OP_FP_ARITH, FRA, FRB, NONE, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', '0', NONE), INSN_fadds => (FPU, FPU, OP_FP_ARITH, FRA, FRB, NONE, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', '0', NONE), - INSN_fcfid => (FPU, FPU, OP_FP_MISC, NONE, FRB, NONE, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', 
'0', NONE), - INSN_fcfids => (FPU, FPU, OP_FP_MISC, NONE, FRB, NONE, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', '0', NONE), + INSN_fcfid => (FPU, FPU, OP_FP_MISC, NONE, FRB, NONE, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', '0', NONE), + INSN_fcfids => (FPU, FPU, OP_FP_MISC, NONE, FRB, NONE, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', '0', NONE), INSN_fcfidu => (FPU, FPU, OP_FP_MISC, NONE, FRB, NONE, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', '0', NONE), INSN_fcfidus => (FPU, FPU, OP_FP_MISC, NONE, FRB, NONE, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', '0', NONE), INSN_fcmpo => (FPU, FPU, OP_FP_CMP, FRA, FRB, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', '0', NONE), diff --git a/fpu.vhdl b/fpu.vhdl index 0698c63..28cd55f 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -194,16 +194,16 @@ architecture behaviour of fpu is signal fp_result : std_ulogic_vector(63 downto 0); signal opsel_a : std_ulogic_vector(2 downto 0); - signal opsel_b : std_ulogic; + signal opsel_b : std_ulogic_vector(2 downto 0); + signal opsel_c : std_ulogic_vector(2 downto 0); signal opsel_r : std_ulogic_vector(1 downto 0); signal opsel_s : std_ulogic_vector(1 downto 0); - signal opsel_ainv : std_ulogic; + signal opsel_aneg : std_ulogic; + signal opsel_aabs : std_ulogic; signal opsel_mask : std_ulogic; - signal opsel_binv : std_ulogic; signal in_a : std_ulogic_vector(63 downto 0); signal in_b : std_ulogic_vector(63 downto 0); signal result : std_ulogic_vector(63 downto 0); - signal carry_in : std_ulogic; signal lost_bits : std_ulogic; signal r_hi_nz : std_ulogic; signal r_lo_nz : std_ulogic; @@ -228,8 +228,20 @@ architecture behaviour of fpu is constant AIN_RND_RBIT : std_ulogic_vector(2 downto 0) := "110"; 
constant AIN_RND : std_ulogic_vector(2 downto 0) := "111"; - constant BIN_ZERO : std_ulogic := '0'; - constant BIN_R : std_ulogic := '1'; + constant BIN_ZERO : std_ulogic_vector(2 downto 0) := "000"; + constant BIN_R : std_ulogic_vector(2 downto 0) := "001"; + constant BIN_MINUSR : std_ulogic_vector(2 downto 0) := "100"; + constant BIN_ABSR : std_ulogic_vector(2 downto 0) := "101"; + constant BIN_ADDSUBR : std_ulogic_vector(2 downto 0) := "110"; + constant BIN_RSIGNR : std_ulogic_vector(2 downto 0) := "111"; + + constant CIN_ZERO : std_ulogic_vector(2 downto 0) := "000"; + constant CIN_SUBEXT : std_ulogic_vector(2 downto 0) := "001"; + constant CIN_ABSEXT : std_ulogic_vector(2 downto 0) := "010"; + constant CIN_INC : std_ulogic_vector(2 downto 0) := "011"; + constant CIN_ROUND : std_ulogic_vector(2 downto 0) := "100"; + constant CIN_RNDX : std_ulogic_vector(2 downto 0) := "101"; + constant CIN_RNDQ : std_ulogic_vector(2 downto 0) := "110"; constant RES_SUM : std_ulogic_vector(1 downto 0) := "00"; constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01"; @@ -1035,6 +1047,9 @@ begin variable cr_result : std_ulogic_vector(3 downto 0); variable set_cr : std_ulogic; variable set_fpcc : std_ulogic; + variable asign : std_ulogic; + variable bneg : std_ulogic; + variable ci : std_ulogic; begin v := r; v.complete := '0'; @@ -1297,13 +1312,13 @@ begin v.first := '0'; v.doing_ftdiv := "00"; opsel_a <= AIN_ZERO; - opsel_ainv <= '0'; + opsel_aneg <= '0'; + opsel_aabs <= '0'; opsel_mask <= '0'; opsel_b <= BIN_R; - opsel_binv <= '0'; + opsel_c <= CIN_ZERO; opsel_r <= RES_SUM; opsel_s <= S_ZERO; - carry_in <= '0'; misc_sel <= "000"; fpscr_mask := (others => '1'); cr_op := CROP_NONE; @@ -1634,14 +1649,10 @@ begin when DO_FCFID => opsel_a <= AIN_B; + opsel_aabs <= '1'; opsel_b <= BIN_ZERO; set_r := '1'; rcls_op <= RCLS_SEL; - if r.insn(8) = '0' and r.b.negative = '1' then - -- fcfid[s] with negative operand, set R = -B - opsel_ainv <= '1'; - carry_in <= '1'; - end if; re_con2 <= 
RECON2_UNIT; re_set_result <= '1'; if r.b.class = ZERO then @@ -1833,9 +1844,8 @@ begin else opsel_a <= AIN_B; end if; - opsel_b <= BIN_R; - opsel_binv <= r.is_subtract; - carry_in <= r.is_subtract and not r.x; + opsel_b <= BIN_ADDSUBR; + opsel_c <= CIN_SUBEXT; set_r := '1'; -- set shift to -1 rs_con2 <= RSCON2_1; @@ -1847,13 +1857,12 @@ begin -- r.shift = -1 re_sel2 <= REXP2_NE; rcls_op <= RCLS_TZERO; + opsel_a <= AIN_ZERO; + opsel_b <= BIN_ABSR; if r.r(63) = '1' then -- result is opposite sign to expected rsgn_op := RSGN_INV; - opsel_a <= AIN_ZERO; set_r := '1'; - opsel_binv <= '1'; - carry_in <= '1'; v.state := FINISH; elsif r.r(UNIT_BIT + 1) = '1' then -- sum overflowed, shift right @@ -1876,9 +1885,7 @@ begin when CMP_1 => opsel_a <= AIN_A; - opsel_b <= BIN_R; - opsel_binv <= '1'; - carry_in <= '1'; + opsel_b <= BIN_MINUSR; set_r := '1'; v.state := CMP_2; @@ -1963,10 +1970,10 @@ begin when FMADD_5 => -- negate R:S:X if negative + opsel_b <= BIN_ABSR; + opsel_c <= CIN_ABSEXT; if r.r(63) = '1' then rsgn_op := RSGN_INV; - opsel_binv <= '1'; - carry_in <= not (s_nz or r.x); set_r := '1'; opsel_s <= S_NEG; set_s := '1'; @@ -2260,7 +2267,7 @@ begin when SQRT_12 => -- test if remainder is 0 or >= B = 2*R + 1 set_r := '0'; - carry_in <= '1'; + opsel_c <= CIN_INC; if pcmpb_lt = '1' then -- square root is correct, set X if remainder non-zero v.x := r.p(UNIT_BIT + 2) or px_nz; @@ -2309,8 +2316,8 @@ begin when INT_FINAL => -- Negate if necessary, and increment for rounding if needed - opsel_binv <= r.result_sign; - carry_in <= r.fpscr(FPSCR_FR) xor r.result_sign; + opsel_b <= BIN_RSIGNR; + opsel_c <= CIN_ROUND; set_r := '1'; -- Check for possible overflows case r.insn(9 downto 8) is @@ -2547,13 +2554,9 @@ begin when DO_IDIVMOD => opsel_a <= AIN_B; + opsel_aabs <= '1'; opsel_b <= BIN_ZERO; set_r := '1'; - -- take absolute value for signed division - if r.is_signed = '1' and r.b.negative = '1' then - opsel_ainv <= '1'; - carry_in <= '1'; - end if; -- normalize and round up 
B to 8.56 format, like fcfid[u] re_con2 <= RECON2_UNIT; re_set_result <= '1'; @@ -2583,19 +2586,16 @@ begin v.state := IDIV_NORMB3; when IDIV_NORMB3 => -- add the X bit onto R to round up B - carry_in <= r.x; + opsel_c <= CIN_RNDX; set_r := '1'; -- prepare to do count-leading-zeroes on A v.state := IDIV_CLZA; when IDIV_CLZA => set_b := '1'; -- put R back into B opsel_a <= AIN_A; + opsel_aabs <= '1'; opsel_b <= BIN_ZERO; set_r := '1'; - if r.is_signed = '1' and r.a.negative = '1' then - opsel_ainv <= '1'; - carry_in <= '1'; - end if; re_con2 <= RECON2_UNIT; re_set_result <= '1'; v.state := IDIV_CLZA2; @@ -2608,8 +2608,7 @@ begin -- (using the original value of B, which is now in C) opsel_a <= AIN_C; opsel_b <= BIN_R; - opsel_ainv <= '1'; - carry_in <= '1'; + opsel_aneg <= '1'; set_r := '1'; v.state := IDIV_CLZA3; when IDIV_CLZA3 => @@ -2924,8 +2923,7 @@ begin -- shifted dividend is in R, subtract left-justified divisor opsel_a <= AIN_B; opsel_b <= BIN_R; - opsel_ainv <= '1'; - carry_in <= '1'; + opsel_aneg <= '1'; set_r := '1'; -- and put 1<<63 into B as the divisor (S is still 0) shiftin0 := '1'; @@ -3028,8 +3026,7 @@ begin when IDIV_MODSUB => -- Subtract divisor from remainder opsel_a <= AIN_C; - opsel_ainv <= '1'; - carry_in <= '1'; + opsel_aneg <= '1'; opsel_b <= BIN_R; set_r := '1'; if r.result_sign = '0' then @@ -3041,8 +3038,8 @@ begin -- result (so far) is in R -- set carry to increment quotient if needed -- and also negate R if the answer is negative - opsel_binv <= r.result_sign; - carry_in <= r.inc_quot xor r.result_sign; + opsel_b <= BIN_RSIGNR; + opsel_c <= CIN_RNDQ; set_r := '1'; if r.divmod = '0' then opsel_a <= AIN_RND_B32; @@ -3257,11 +3254,14 @@ begin if (or (mask and r.r)) = '1' and set_x = '1' then v.x := '1'; end if; + asign := '0'; case opsel_a is when AIN_A => in_a0 := r.a.mantissa; + asign := r.a.negative; when AIN_B => in_a0 := r.b.mantissa; + asign := r.b.negative; when AIN_C => in_a0 := r.c.mantissa; when AIN_PS8 => -- 8 LSBs of P 
sign-extended to 64 @@ -3275,18 +3275,45 @@ begin when others => in_a0 := (others => '0'); end case; - if opsel_ainv = '1' then + ci := '0'; + case opsel_c is + when CIN_SUBEXT => + ci := r.is_subtract and r.x; + when CIN_ABSEXT => + ci := r.r(63) and (s_nz or r.x); + when CIN_INC => + ci := '1'; + when CIN_ROUND => + ci := r.fpscr(FPSCR_FR); + when CIN_RNDX => + ci := r.x; + when CIN_RNDQ => + ci := r.inc_quot; + when others => + end case; + if opsel_aneg = '1' or (opsel_aabs = '1' and r.is_signed = '1' and asign = '1') then in_a0 := not in_a0; + ci := not ci; end if; in_a <= in_a0; + in_b0 := r.r; + bneg := '0'; case opsel_b is when BIN_R => - in_b0 := r.r; + when BIN_MINUSR => + bneg := '1'; + when BIN_ABSR => + bneg := r.r(63); + when BIN_ADDSUBR => + bneg := r.is_subtract; + when BIN_RSIGNR => + bneg := r.result_sign; when others => in_b0 := (others => '0'); end case; - if opsel_binv = '1' then + if bneg = '1' then in_b0 := not in_b0; + ci := not ci; end if; in_b <= in_b0; if is_X(r.shift) then @@ -3298,7 +3325,7 @@ begin else shift_res := (others => '0'); end if; - sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in); + sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + ci); if opsel_mask = '1' then sum(DP_LSB - 1 downto 0) := "0000"; if r.single_prec = '1' then From 73505b16262438369b4f68f1e81e8e496b6d49b2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 18 Mar 2025 20:53:27 +1100 Subject: [PATCH 23/24] FPU: Provide a separate path for transferring A/B/C to R The timing path from r.a.class to result showed up as a critical path on the Artix-7, apparently because of transfers of A, B or C to R in special cases (e.g. NaN inputs) and the fsel instruction. To alleviate this, we provide a path via the miscellaneous value multiplexer from A, B and C to R, selected via opsel_R = RES_MISC and misc_sel = 111. A new selector opsel_sel selects which of A, B or C to transfer, using the same encoding as opsel_a. 
This new selector is now also used for the result class when rcls_op = RCLS_SEL and for the result sign when rsgn_op = RSGN_SEL. This reduces the number of things that opsel_a depends on and eases timing in the main adder path. Signed-off-by: Paul Mackerras --- fpu.vhdl | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 28cd55f..4ef2d14 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -201,6 +201,7 @@ architecture behaviour of fpu is signal opsel_aneg : std_ulogic; signal opsel_aabs : std_ulogic; signal opsel_mask : std_ulogic; + signal opsel_sel : std_ulogic_vector(2 downto 0); signal in_a : std_ulogic_vector(63 downto 0); signal in_b : std_ulogic_vector(63 downto 0); signal result : std_ulogic_vector(63 downto 0); @@ -1320,6 +1321,7 @@ begin opsel_r <= RES_SUM; opsel_s <= S_ZERO; misc_sel <= "000"; + opsel_sel <= AIN_ZERO; fpscr_mask := (others => '1'); cr_op := CROP_NONE; update_fx := '0'; @@ -1566,8 +1568,9 @@ begin v.instr_done := '1'; when DO_FMR => - opsel_a <= AIN_B; - opsel_b <= BIN_ZERO; + opsel_r <= RES_MISC; + misc_sel <= "111"; + opsel_sel <= AIN_B; set_r := '1'; rcls_op <= RCLS_SEL; re_sel2 <= REXP2_B; @@ -1652,6 +1655,7 @@ begin opsel_aabs <= '1'; opsel_b <= BIN_ZERO; set_r := '1'; + opsel_sel <= AIN_B; rcls_op <= RCLS_SEL; re_con2 <= RECON2_UNIT; re_set_result <= '1'; @@ -1710,13 +1714,14 @@ begin rsgn_op := RSGN_SEL; rcls_op <= RCLS_SEL; if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then - opsel_a <= AIN_C; + opsel_sel <= AIN_C; re_sel2 <= REXP2_C; else - opsel_a <= AIN_B; + opsel_sel <= AIN_B; re_sel2 <= REXP2_B; end if; - opsel_b <= BIN_ZERO; + opsel_r <= RES_MISC; + misc_sel <= "111"; set_r := '1'; re_set_result <= '1'; arith_done := '1'; @@ -3083,15 +3088,16 @@ begin -- state machine is in the DO_SPECIAL or DO_FSQRT state here arith_done := '1'; set_r := '1'; - opsel_a <= scinfo.result_sel; - opsel_b <= BIN_ZERO; + opsel_r <= RES_MISC; + opsel_sel <= 
scinfo.result_sel; if scinfo.qnan_result = '1' then - opsel_r <= RES_MISC; if r.int_result = '0' then misc_sel <= "001"; else misc_sel <= "110"; end if; + else + misc_sel <= "111"; end if; rsgn_op := scinfo.rsgn_op; v.result_class := scinfo.result_class; @@ -3107,7 +3113,7 @@ begin rsign := r.result_sign; case rsgn_op is when RSGN_SEL => - case opsel_a is + case opsel_sel is when AIN_A => rsign := r.a.negative; when AIN_B => @@ -3128,7 +3134,7 @@ begin case rcls_op is when RCLS_SEL => - case opsel_a is + case opsel_sel is when AIN_A => v.result_class := r.a.class; when AIN_B => @@ -3366,6 +3372,7 @@ begin misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32); end if; when "101" => + -- LUT value misc := std_ulogic_vector(shift_left(resize(unsigned(inverse_est), 64), UNIT_BIT - 19)); when "110" => @@ -3382,6 +3389,16 @@ begin end if; end if; when others => + -- A, B or C, according to opsel_sel + case opsel_sel is + when AIN_A => + misc := r.a.mantissa; + when AIN_B => + misc := r.b.mantissa; + when AIN_C => + misc := r.c.mantissa; + when others => + end case; end case; result <= misc; end case; From 3268ef717cfbc38290b0b49be22cb6679e378fb9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 21 Mar 2025 21:41:39 +1100 Subject: [PATCH 24/24] FPU: Make opsel_a a function of just the state This adds some extra states and transitions so that opsel_a becomes a function only of the current state. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 89 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 37 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 4ef2d14..5648012 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -57,7 +57,7 @@ architecture behaviour of fpu is DO_FSEL, DO_IDIVMOD, FRI_1, - ADD_1, ADD_SHIFT, ADD_2, ADD_3, + ADD_1, ADD_SHIFT, ADD_2, ADD_2B, ADD_3, CMP_1, CMP_2, MULT_1, FMADD_0, FMADD_1, FMADD_2, FMADD_3, @@ -73,7 +73,7 @@ architecture behaviour of fpu is INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, ROUND_UFLOW, ROUND_OFLOW, - ROUNDING, ROUNDING_2, ROUNDING_3, + ROUNDING, ROUND_INC, ROUNDING_2, ROUNDING_3, DENORM, RENORM_A, RENORM_B, RENORM_C, RENORM_1, RENORM_2, @@ -87,7 +87,8 @@ architecture behaviour of fpu is IDIV_EXT_TBH4, IDIV_EXT_TBH5, IDIV_EXTDIV, IDIV_EXTDIV1, IDIV_EXTDIV2, IDIV_EXTDIV3, IDIV_EXTDIV4, IDIV_EXTDIV5, IDIV_EXTDIV6, - IDIV_MODADJ, IDIV_MODSUB, IDIV_DIVADJ, IDIV_OVFCHK, IDIV_DONE, IDIV_ZERO); + IDIV_MODADJ, IDIV_MODADJ_NEG, IDIV_MODSUB, + IDIV_DIVADJ, IDIV_OVFCHK, IDIV_DONE, IDIV_ZERO); type decode32 is array(0 to 31) of state_t; type decode8 is array(0 to 7) of state_t; @@ -1027,7 +1028,6 @@ begin variable mulexp : signed(EXP_BITS-1 downto 0); variable maddend : std_ulogic_vector(127 downto 0); variable sum : std_ulogic_vector(63 downto 0); - variable round_inc : std_ulogic_vector(63 downto 0); variable mult_mask : std_ulogic; variable sign_bit : std_ulogic; variable rexp_in1 : signed(EXP_BITS-1 downto 0); @@ -1678,7 +1678,7 @@ begin rs_sel2 <= RSH2_A; v.add_bsmall := '0'; if r.a.exponent = r.b.exponent then - v.state := ADD_2; + v.state := ADD_2B; elsif r.a.exponent < r.b.exponent then v.longmask := '0'; v.state := ADD_SHIFT; @@ -1841,14 +1841,24 @@ begin v.x := s_nz; set_x := '1'; v.longmask := r.single_prec; - v.state := ADD_2; - - when ADD_2 => if r.add_bsmall = '1' then - opsel_a <= AIN_A; + v.state := ADD_2; else - opsel_a <= AIN_B; + v.state := ADD_2B; end if; + + when ADD_2 => 
+ opsel_a <= AIN_A; + opsel_b <= BIN_ADDSUBR; + opsel_c <= CIN_SUBEXT; + set_r := '1'; + -- set shift to -1 + rs_con2 <= RSCON2_1; + rs_neg2 <= '1'; + v.state := ADD_3; + + when ADD_2B => + opsel_a <= AIN_B; opsel_b <= BIN_ADDSUBR; opsel_c <= CIN_SUBEXT; set_r := '1'; @@ -2484,20 +2494,14 @@ begin v.fpscr(FPSCR_FR downto FPSCR_FI) := round; if round(1) = '1' then -- increment the LSB for the precision - opsel_a <= AIN_RND; - -- set shift to -1 - rs_con2 <= RSCON2_1; - rs_neg2 <= '1'; - v.state := ROUNDING_2; + v.state := ROUND_INC; + elsif r.r(UNIT_BIT) = '0' then + -- result after masking could be zero, or could be a + -- denormalized result that needs to be renormalized + rs_norm <= '1'; + v.state := ROUNDING_3; else - if r.r(UNIT_BIT) = '0' then - -- result after masking could be zero, or could be a - -- denormalized result that needs to be renormalized - rs_norm <= '1'; - v.state := ROUNDING_3; - else - arith_done := '1'; - end if; + arith_done := '1'; end if; if round(0) = '1' then v.fpscr(FPSCR_XX) := '1'; @@ -2506,6 +2510,14 @@ begin end if; end if; + when ROUND_INC => + set_r := '1'; + opsel_a <= AIN_RND; + -- set shift to -1 + rs_con2 <= RSCON2_1; + rs_neg2 <= '1'; + v.state := ROUNDING_2; + when ROUNDING_2 => -- Check for overflow during rounding -- r.shift = -1 @@ -2804,12 +2816,10 @@ begin msel_1 <= MUL1_Y; msel_2 <= MUL2_P; v.inc_quot := not pcmpc_lt and not r.divmod; - if r.divmod = '0' then - -- get B into R for IDIV_DIVADJ state - opsel_a <= AIN_B; - opsel_b <= BIN_ZERO; - set_r := '1'; - end if; + -- if dividing, get B into R for IDIV_DIVADJ state + opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := not r.divmod; -- set shift to UNIT_BIT (== 56) rs_con2 <= RSCON2_UNIT; if pcmpc_lt = '1' or pcmpc_eq = '1' then @@ -2872,11 +2882,11 @@ begin v.inc_quot := not pcmpc_lt and not r.divmod; -- set shift to UNIT_BIT (== 56) rs_con2 <= RSCON2_UNIT; + -- if dividing, get B into R for IDIV_DIVADJ state + opsel_a <= AIN_B; + opsel_b <= BIN_ZERO; + set_r := not 
r.divmod; if r.divmod = '0' then - -- get B into R for IDIV_DIVADJ state - opsel_a <= AIN_B; - opsel_b <= BIN_ZERO; - set_r := '1'; v.state := IDIV_DIVADJ; elsif pcmpc_eq = '1' then v.state := IDIV_ZERO; @@ -3026,8 +3036,15 @@ begin elsif r.result_sign = '0' then v.state := IDIV_DONE; else - v.state := IDIV_DIVADJ; + v.state := IDIV_MODADJ_NEG; end if; + when IDIV_MODADJ_NEG => + -- result (so far) is in R + -- no increment is selected here (opsel_c is left at its default); + -- just negate R since the answer is negative + opsel_b <= BIN_MINUSR; + set_r := '1'; + v.state := IDIV_OVFCHK; when IDIV_MODSUB => -- Subtract divisor from remainder opsel_a <= AIN_C; @@ -3043,12 +3060,10 @@ begin -- result (so far) is in R -- set carry to increment quotient if needed -- and also negate R if the answer is negative + opsel_a <= AIN_RND_B32; opsel_b <= BIN_RSIGNR; opsel_c <= CIN_RNDQ; set_r := '1'; - if r.divmod = '0' then - opsel_a <= AIN_RND_B32; - end if; if r.is_signed = '0' then v.state := IDIV_DONE; else