From 955fa561fb7bb0c27b427ea2a84e5bcea2e63342 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 17 Jan 2024 21:05:30 +1100
Subject: [PATCH 01/24] FPU: Move most result_sign computation out of state
 machine

This moves the computation of r.result_sign out of the various
states for most instructions.  Now the sign is mostly computed in the
first cycle (when e_in.valid is true).

The set of operations done on r.result_sign in the state machine is
now restricted to 5 (other than no change): invert, xor with
r.is_subtract, or set to the sign of A, B or C.

Similarly r.is_subtract and r.negate are computed in the first cycle
now.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 109 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 64 insertions(+), 45 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index f07f9d1..12181cf 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -811,7 +811,6 @@ begin
         variable mshift      : signed(EXP_BITS-1 downto 0);
         variable need_check  : std_ulogic;
         variable msb         : std_ulogic;
-        variable is_add      : std_ulogic;
         variable set_a       : std_ulogic;
         variable set_a_exp   : std_ulogic;
         variable set_a_mant  : std_ulogic;
@@ -889,6 +888,7 @@ begin
             v.divmod := '0';
             v.is_sqrt := '0';
             v.is_multiply := '0';
+            v.is_subtract := '0';
             fpin_a := '0';
             fpin_b := '0';
             fpin_c := '0';
@@ -896,6 +896,8 @@ begin
             v.use_b := e_in.valid_b;
             v.use_c := e_in.valid_c;
             v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN);
+            v.result_sign := '0';
+            v.negate := '0';
             case e_in.op is
                 when OP_FP_ARITH =>
                     fpin_a := e_in.valid_a;
@@ -913,6 +915,25 @@ begin
                     if e_in.insn(5 downto 1) = "01111" then
                         v.round_mode := "001";
                     end if;
+                    case e_in.insn(5 downto 1) is
+                        when "10100" | "10101" =>       -- fadd and fsub
+                            v.result_sign := e_in.fra(63);
+                            if unsigned(e_in.fra(62 downto 52)) <= unsigned(e_in.frb(62 downto 52)) then
+                                v.result_sign := e_in.frb(63) xnor e_in.insn(1);
+                            end if;
+                            v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor e_in.insn(1));
+                        when "11001" =>         -- fmul
+                            v.result_sign := e_in.fra(63) xor e_in.frc(63);
+                        when "11100" | "11101" | "11110" | "11111" =>   --fmadd family
+                            v.result_sign := e_in.fra(63) xor e_in.frc(63);
+                            v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor
+                                                  e_in.frc(63) xor e_in.insn(1));
+                            v.negate := e_in.insn(2);
+                        when "10010" =>         -- fdiv
+                            v.result_sign := e_in.fra(63) xor e_in.frb(63);
+                        when others =>
+                            v.result_sign := e_in.frb(63);
+                    end case;
                 when OP_FP_CMP =>
                     fpin_a := e_in.valid_a;
                     fpin_b := e_in.valid_b;
@@ -921,6 +942,12 @@ begin
                     v.fp_rc := e_in.rc;
                     opcbits := e_in.insn(10) & e_in.insn(8) & e_in.insn(4) & e_in.insn(2) & e_in.insn(1);
                     exec_state := misc_decode(to_integer(unsigned(opcbits)));
+                    case opcbits is
+                        when "10110" =>        -- fcfid
+                            v.result_sign := e_in.frb(63);
+                        when others =>
+                            v.result_sign := '0';
+                    end case;
                 when OP_FP_MOVE =>
                     v.fp_rc := e_in.rc;
                     fpin_a := e_in.valid_a;
@@ -928,22 +955,49 @@ begin
                     fpin_c := e_in.valid_c;
                     if e_in.insn(5) = '0' then
                         exec_state := DO_FMR;
+                        if e_in.insn(9) = '1' then
+                            v.result_sign := '0';              -- fabs
+                        elsif e_in.insn(8) = '1' then
+                            v.result_sign := '1';              -- fnabs
+                        elsif e_in.insn(7) = '1' then
+                            v.result_sign := e_in.frb(63);     -- fmr
+                        elsif e_in.insn(6) = '1' then
+                            v.result_sign := not e_in.frb(63); -- fneg
+                        else
+                            v.result_sign := e_in.fra(63);     -- fcpsgn
+                        end if;
                     else
                         exec_state := DO_FSEL;
+                        v.result_sign := e_in.frb(63);
                     end if;
                 when OP_DIV =>
                     v.integer_op := '1';
                     is_32bint := e_in.single;
+                    if e_in.single = '0' then
+                        v.result_sign := e_in.is_signed and (e_in.fra(63) xor e_in.frb(63));
+                    else
+                        v.result_sign := e_in.is_signed and (e_in.fra(31) xor e_in.frb(31));
+                    end if;
                     exec_state := DO_IDIVMOD;
                 when OP_DIVE =>
                     v.integer_op := '1';
                     v.divext := '1';
                     is_32bint := e_in.single;
+                    if e_in.single = '0' then
+                        v.result_sign := e_in.is_signed and (e_in.fra(63) xor e_in.frb(63));
+                    else
+                        v.result_sign := e_in.is_signed and (e_in.fra(31) xor e_in.frb(31));
+                    end if;
                     exec_state := DO_IDIVMOD;
                 when OP_MOD =>
                     v.integer_op := '1';
                     v.divmod := '1';
                     is_32bint := e_in.single;
+                    if e_in.single = '0' then
+                        v.result_sign := e_in.is_signed and e_in.fra(63);
+                    else
+                        v.result_sign := e_in.is_signed and e_in.fra(31);
+                    end if;
                     exec_state := DO_IDIVMOD;
                 when others =>
                     exec_state := DO_ILLEGAL;
@@ -951,7 +1005,6 @@ begin
             v.quieten_nan := '1';
             v.tiny := '0';
             v.denorm := '0';
-            v.is_subtract := '0';
             v.add_bsmall := '0';
             v.int_ovf := '0';
             v.div_close := '0';
@@ -1096,7 +1149,6 @@ begin
         case r.state is
             when IDLE =>
                 v.invalid := '0';
-                v.negate := '0';
                 if e_in.valid = '1' then
                     v.opsel_a := AIN_B;
                     v.busy := '1';
@@ -1319,24 +1371,12 @@ begin
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 v.quieten_nan := '0';
-                if r.insn(9) = '1' then
-                    v.result_sign := '0';              -- fabs
-                elsif r.insn(8) = '1' then
-                    v.result_sign := '1';              -- fnabs
-                elsif r.insn(7) = '1' then
-                    v.result_sign := r.b.negative;     -- fmr
-                elsif r.insn(6) = '1' then
-                    v.result_sign := not r.b.negative; -- fneg
-                else
-                    v.result_sign := r.a.negative;     -- fcpsgn
-                end if;
                 v.writing_fpr := '1';
                 v.instr_done := '1';
 
             when DO_FRI =>    -- fri[nzpm]
                 -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
-                v.result_sign := r.b.negative;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 -- set shift to exponent - 52
@@ -1365,7 +1405,6 @@ begin
             when DO_FRSP =>
                 -- r.opsel_a = AIN_B, r.shift = 0
                 v.result_class := r.b.class;
-                v.result_sign := r.b.negative;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 -- set shift to exponent - -126
@@ -1398,7 +1437,6 @@ begin
                 -- instr bit 1: 1=round to zero 0=use fpscr[RN]
                 -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
-                v.result_sign := r.b.negative;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 rs_sel1 <= RSH1_B;
@@ -1441,12 +1479,10 @@ begin
 
             when DO_FCFID =>
                 -- r.opsel_a = AIN_B
-                v.result_sign := '0';
                 if r.insn(8) = '0' and r.b.negative = '1' then
                     -- fcfid[s] with negative operand, set R = -B
                     opsel_ainv <= '1';
                     carry_in <= '1';
-                    v.result_sign := '1';
                 end if;
                 v.result_class := r.b.class;
                 re_con2 <= RECON2_UNIT;
@@ -1462,7 +1498,6 @@ begin
             when DO_FADD =>
                 -- fadd[s] and fsub[s]
                 -- r.opsel_a = AIN_A
-                v.result_sign := r.a.negative;
                 v.result_class := r.a.class;
                 re_sel1 <= REXP1_A;
                 re_set_result <= '1';
@@ -1472,13 +1507,10 @@ begin
                 rs_sel2 <= RSH2_A;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                is_add := r.a.negative xor r.b.negative xor r.insn(1);
-                v.is_subtract := not is_add;
                 if r.a.class = FINITE and r.b.class = FINITE then
                     v.add_bsmall := r.exp_cmp;
                     v.opsel_a := AIN_B;
                     if r.exp_cmp = '0' then
-                        v.result_sign := r.b.negative xnor r.insn(1);
                         if r.a.exponent = r.b.exponent then
                             v.state := ADD_2;
                         else
@@ -1491,7 +1523,7 @@ begin
                 else
                     if r.a.class = NAN or r.b.class = NAN then
                         v.state := NAN_RESULT;
-                    elsif r.a.class = INFINITY and r.b.class = INFINITY and is_add = '0' then
+                    elsif r.a.class = INFINITY and r.b.class = INFINITY and r.is_subtract = '1' then
                         -- invalid operation, construct QNaN
                         v.fpscr(FPSCR_VXISI) := '1';
                         qnan_result := '1';
@@ -1502,7 +1534,6 @@ begin
                     else
                         -- result is +/- B
                         v.opsel_a := AIN_B;
-                        v.result_sign := r.b.negative xnor r.insn(1);
                         v.state := EXC_RESULT;
                     end if;
                 end if;
@@ -1510,7 +1541,6 @@ begin
             when DO_FMUL =>
                 -- fmul[s]
                 -- r.opsel_a = AIN_A unless C is denorm and A isn't
-                v.result_sign := r.a.negative xor r.c.negative;
                 v.result_class := r.a.class;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
@@ -1550,7 +1580,6 @@ begin
                 v.result_class := r.a.class;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                v.result_sign := r.a.negative xor r.b.negative;
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_B;
                 re_neg2 <= '1';
@@ -1599,7 +1628,6 @@ begin
                     v.result_sign := r.c.negative;
                 else
                     v.opsel_a := AIN_B;
-                    v.result_sign := r.b.negative;
                 end if;
                 v.quieten_nan := '0';
                 v.state := EXC_RESULT;
@@ -1607,7 +1635,6 @@ begin
             when DO_FSQRT =>
                 -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
-                v.result_sign := r.b.negative;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
                 re_sel2 <= REXP2_B;
@@ -1643,7 +1670,6 @@ begin
             when DO_FRE =>
                 -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
-                v.result_sign := r.b.negative;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
                 re_sel2 <= REXP2_B;
@@ -1669,7 +1695,6 @@ begin
             when DO_FRSQRTE =>
                 -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
-                v.result_sign := r.b.negative;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
                 re_sel2 <= REXP2_B;
@@ -1708,7 +1733,6 @@ begin
                 -- fmadd, fmsub, fnmadd, fnmsub
                 -- r.opsel_a = AIN_A if A is denorm, else AIN_C if C is denorm,
                 -- else AIN_B
-                v.result_sign := r.a.negative;
                 v.result_class := r.a.class;
                 -- put a.exp + c.exp into result_exp
                 re_sel1 <= REXP1_A;
@@ -1718,9 +1742,6 @@ begin
                 rs_sel1 <= RSH1_B;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                is_add := r.a.negative xor r.c.negative xor r.b.negative xor r.insn(1);
-                v.negate := r.insn(2);
-                v.is_subtract := not is_add;
                 if r.a.class = FINITE and r.c.class = FINITE and
                     (r.b.class = FINITE or r.b.class = ZERO) then
                     -- Make sure A and C are normalized
@@ -1730,13 +1751,13 @@ begin
                         v.state := RENORM_C;
                     elsif r.b.class = ZERO then
                         -- no addend, degenerates to multiply
-                        v.result_sign := r.a.negative xor r.c.negative;
                         f_to_multiply.valid <= '1';
                         v.is_multiply := '1';
                         v.state := MULT_1;
                     elsif r.madd_cmp = '0' then
                         -- addend is bigger, do multiply first
-                        v.result_sign := r.b.negative xnor r.insn(1);
+                        -- if subtracting, sign is opposite to initial estimate
+                        v.result_sign := r.result_sign xor r.is_subtract;
                         f_to_multiply.valid <= '1';
                         v.first := '1';
                         v.state := FMADD_0;
@@ -1753,21 +1774,20 @@ begin
                         v.fpscr(FPSCR_VXIMZ) := '1';
                         qnan_result := '1';
                     elsif r.a.class = INFINITY or r.c.class = INFINITY then
-                        if r.b.class = INFINITY and is_add = '0' then
+                        if r.b.class = INFINITY and r.is_subtract = '1' then
                             -- invalid operation, construct QNaN
                             v.fpscr(FPSCR_VXISI) := '1';
                             qnan_result := '1';
                         else
                             -- result is infinity
                             v.result_class := INFINITY;
-                            v.result_sign := r.a.negative xor r.c.negative;
                             arith_done := '1';
                         end if;
                     else
                         -- Here A is zero, C is zero, or B is infinity
                         -- Result is +/-B in all of those cases
                         v.opsel_a := AIN_B;
-                        v.result_sign := r.b.negative xnor r.insn(1);
+                        v.result_sign := r.result_sign xor r.is_subtract;
                         v.state := EXC_RESULT;
                     end if;
                 end if;
@@ -1970,7 +1990,7 @@ begin
                 -- product is bigger here
                 -- shift B right and use it as the addend to the multiplier
                 -- for subtract, multiplier does B - A * C
-                v.result_sign := r.a.negative xor r.c.negative xor r.is_subtract;
+                v.result_sign := r.result_sign xor r.is_subtract;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 -- set shift to b.exp - result_exp + 64
@@ -2638,7 +2658,6 @@ begin
 
             when DO_IDIVMOD =>
                 -- r.opsel_a = AIN_B
-                v.result_sign := r.is_signed and (r.a.negative xor (r.b.negative and not r.divmod));
                 if r.b.class = ZERO then
                     -- B is zero, signal overflow
                     v.int_ovf := '1';
@@ -3168,7 +3187,7 @@ begin
 
         end case;
 
-        rsign := v.result_sign;
+        rsign := r.result_sign;
         if zero_divide = '1' then
             v.fpscr(FPSCR_ZX) := '1';
         end if;
@@ -3191,10 +3210,10 @@ begin
                 v.writing_fpr := '1';
                 v.update_fprf := '1';
             end if;
-            if v.is_subtract = '1' and v.result_class = ZERO then
+            if r.is_subtract = '1' and v.result_class = ZERO then
                 rsign := r.round_mode(0) and r.round_mode(1);
             end if;
-            if v.negate = '1' and v.result_class /= NAN then
+            if r.negate = '1' and v.result_class /= NAN then
                 rsign := not rsign;
             end if;
             v.instr_done := '1';

From 71b7df679b46a85e367c83927d5bbb15c78b2892 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 18 Jan 2024 22:06:13 +1100
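Subject: [PATCH 02/24] FPU: Calculate quieten_nan in first cycle

r.quieten_nan is now set in the first cycle, alongside result_sign and
negate: it defaults to 1 and is cleared for OP_FP_MOVE instructions.
The assignments that previously cleared it inside the state machine
are removed, and nsnan_result is now taken from r.quieten_nan rather
than v.quieten_nan.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>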
---
 fpu.vhdl | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 12181cf..72385a3 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -898,6 +898,7 @@ begin
             v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN);
             v.result_sign := '0';
             v.negate := '0';
+            v.quieten_nan := '1';
             case e_in.op is
                 when OP_FP_ARITH =>
                     fpin_a := e_in.valid_a;
@@ -953,6 +954,7 @@ begin
                     fpin_a := e_in.valid_a;
                     fpin_b := e_in.valid_b;
                     fpin_c := e_in.valid_c;
+                    v.quieten_nan := '0';
                     if e_in.insn(5) = '0' then
                         exec_state := DO_FMR;
                         if e_in.insn(9) = '1' then
@@ -1002,7 +1004,6 @@ begin
                 when others =>
                     exec_state := DO_ILLEGAL;
             end case;
-            v.quieten_nan := '1';
             v.tiny := '0';
             v.denorm := '0';
             v.add_bsmall := '0';
@@ -1370,7 +1371,6 @@ begin
                 v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
-                v.quieten_nan := '0';
                 v.writing_fpr := '1';
                 v.instr_done := '1';
 
@@ -1629,7 +1629,6 @@ begin
                 else
                     v.opsel_a := AIN_B;
                 end if;
-                v.quieten_nan := '0';
                 v.state := EXC_RESULT;
 
             when DO_FSQRT =>
@@ -3575,7 +3574,7 @@ begin
                 v.sp_result := r.single_prec;
                 v.int_result := int_result;
                 v.illegal := illegal;
-                v.nsnan_result := v.quieten_nan;
+                v.nsnan_result := r.quieten_nan;
                 v.res_sign := rsign;
                 if r.integer_op = '1' then
                     v.cr_mask := num_to_fxm(0);

From 27b3e4235347ad0bdc183254bfc7bd901091ae3a Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 19 Jan 2024 09:37:16 +1100
Subject: [PATCH 03/24] FPU: Move result_sign computations from state machine
 to a data path

Instead of operating on result_sign directly, the state machine now
sets a control variable "rsgn_op" that then directs a tiny ALU to do
what's required.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 42 +++++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 72385a3..7558493 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -287,6 +287,11 @@ architecture behaviour of fpu is
     signal rs_neg2       : std_ulogic;
     signal rs_norm       : std_ulogic;
 
+    constant RSGN_NOP : std_ulogic_vector(1 downto 0) := "00";
+    constant RSGN_INV : std_ulogic_vector(1 downto 0) := "01";
+    constant RSGN_SUB : std_ulogic_vector(1 downto 0) := "10";
+    constant RSGN_SEL : std_ulogic_vector(1 downto 0) := "11";
+
     constant arith_decode : decode32 := (
         -- indexed by bits 5..1 of opcode
         2#01000# => DO_FRI,
@@ -851,6 +856,7 @@ begin
         variable int_result  : std_ulogic;
         variable illegal     : std_ulogic;
         variable rsign       : std_ulogic;
+        variable rsgn_op     : std_ulogic_vector(1 downto 0);
     begin
         v := r;
         v.complete := '0';
@@ -1147,6 +1153,8 @@ begin
         rs_neg2 <= '0';
         rs_norm <= '0';
 
+        rsgn_op := RSGN_NOP;
+
         case r.state is
             when IDLE =>
                 v.invalid := '0';
@@ -1625,7 +1633,7 @@ begin
             when DO_FSEL =>
                 if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then
                     v.opsel_a := AIN_C;
-                    v.result_sign := r.c.negative;
+                    rsgn_op := RSGN_SEL;
                 else
                     v.opsel_a := AIN_B;
                 end if;
@@ -1756,7 +1764,7 @@ begin
                     elsif r.madd_cmp = '0' then
                         -- addend is bigger, do multiply first
                         -- if subtracting, sign is opposite to initial estimate
-                        v.result_sign := r.result_sign xor r.is_subtract;
+                        rsgn_op := RSGN_SUB;
                         f_to_multiply.valid <= '1';
                         v.first := '1';
                         v.state := FMADD_0;
@@ -1786,7 +1794,7 @@ begin
                         -- Here A is zero, C is zero, or B is infinity
                         -- Result is +/-B in all of those cases
                         v.opsel_a := AIN_B;
-                        v.result_sign := r.result_sign xor r.is_subtract;
+                        rsgn_op := RSGN_SUB;
                         v.state := EXC_RESULT;
                     end if;
                 end if;
@@ -1913,7 +1921,7 @@ begin
                 re_sel2 <= REXP2_NE;
                 if r.r(63) = '1' then
                     -- result is opposite sign to expected
-                    v.result_sign := not r.result_sign;
+                    rsgn_op := RSGN_INV;
                     opsel_ainv <= '1';
                     carry_in <= '1';
                     v.state := FINISH;
@@ -1989,7 +1997,7 @@ begin
                 -- product is bigger here
                 -- shift B right and use it as the addend to the multiplier
                 -- for subtract, multiplier does B - A * C
-                v.result_sign := r.result_sign xor r.is_subtract;
+                rsgn_op := RSGN_SUB;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 -- set shift to b.exp - result_exp + 64
@@ -2031,7 +2039,7 @@ begin
             when FMADD_5 =>
                 -- negate R:S:X if negative
                 if r.r(63) = '1' then
-                    v.result_sign := not r.result_sign;
+                    rsgn_op := RSGN_INV;
                     opsel_ainv <= '1';
                     carry_in <= not (s_nz or r.x);
                     opsel_s <= S_NEG;
@@ -2629,14 +2637,12 @@ begin
                 end if;
                 if r.use_a = '1' and r.a.class = NAN then
                     v.opsel_a := AIN_A;
-                    v.result_sign := r.a.negative;
                 elsif r.use_b = '1' and r.b.class = NAN then
                     v.opsel_a := AIN_B;
-                    v.result_sign := r.b.negative;
                 elsif r.use_c = '1' and r.c.class = NAN then
                     v.opsel_a := AIN_C;
-                    v.result_sign := r.c.negative;
                 end if;
+                rsgn_op := RSGN_SEL;
                 v.state := EXC_RESULT;
 
             when EXC_RESULT =>
@@ -3186,6 +3192,24 @@ begin
 
         end case;
 
+        case rsgn_op is
+            when RSGN_SEL =>
+                case v.opsel_a is
+                    when AIN_A =>
+                        v.result_sign := r.a.negative;
+                    when AIN_B =>
+                        v.result_sign := r.b.negative;
+                    when AIN_C =>
+                        v.result_sign := r.c.negative;
+                    when others =>
+                end case;
+            when RSGN_SUB =>
+                v.result_sign := r.result_sign xor r.is_subtract;
+            when RSGN_INV =>
+                v.result_sign := not r.result_sign;
+            when others =>
+        end case;
+
         rsign := r.result_sign;
         if zero_divide = '1' then
             v.fpscr(FPSCR_ZX) := '1';

From 707dd619a039240304a6245480dff05c70b219b6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 5 Feb 2024 14:25:10 +1100
Subject: [PATCH 04/24] FPU: Move NaN/infinity and zero/denorm handling out to
 separate states

This should simplify the DO_* states and hopefully be simpler overall.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 624 +++++++++++++++++++++++++++----------------------------
 1 file changed, 305 insertions(+), 319 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 7558493..45f5fe0 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -41,11 +41,13 @@ architecture behaviour of fpu is
         class    : fp_number_class;
         negative : std_ulogic;
         denorm   : std_ulogic;
+        naninf   : std_ulogic;
+        zeroexp  : std_ulogic;
         exponent : signed(EXP_BITS-1 downto 0);         -- unbiased
         mantissa : std_ulogic_vector(63 downto 0);      -- 8.56 format
     end record;
 
-    type state_t is (IDLE, DO_ILLEGAL,
+    type state_t is (IDLE, DO_ILLEGAL, DO_NAN_INF, DO_ZERO_DEN,
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
                      DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
                      DO_FCFID, DO_FCTI,
@@ -77,7 +79,7 @@ architecture behaviour of fpu is
                      RENORM_A, RENORM_A2,
                      RENORM_B, RENORM_B2,
                      RENORM_C, RENORM_C2,
-                     NAN_RESULT, EXC_RESULT,
+                     EXC_RESULT,
                      IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3,
                      IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3,
                      IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5,
@@ -144,7 +146,9 @@ architecture behaviour of fpu is
         exp_cmp      : std_ulogic;
         madd_cmp     : std_ulogic;
         add_bsmall   : std_ulogic;
+        is_addition  : std_ulogic;
         is_multiply  : std_ulogic;
+        is_inverse   : std_ulogic;
         is_sqrt      : std_ulogic;
         first        : std_ulogic;
         count        : unsigned(1 downto 0);
@@ -170,6 +174,8 @@ architecture behaviour of fpu is
         xerc         : xer_common_t;
         xerc_result  : xer_common_t;
         res_sign     : std_ulogic;
+        res_int      : std_ulogic;
+        exec_state   : state_t;
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -567,11 +573,15 @@ architecture behaviour of fpu is
     begin
         reg.negative := fpr(63);
         reg.denorm := '0';
+        reg.naninf := '0';
+        reg.zeroexp := '0';
         exp_nz := or (fpr(62 downto 52));
         exp_ao := and (fpr(62 downto 52));
         frac_nz := or (fpr(51 downto 0));
         low_nz := or (fpr(31 downto 0));
         if is_fp = '1' then
+            reg.naninf := exp_ao;
+            reg.zeroexp := not exp_nz;
             reg.denorm := frac_nz and not exp_nz;
             reg.exponent := signed(resize(unsigned(fpr(62 downto 52)), EXP_BITS)) - to_signed(1023, EXP_BITS);
             if exp_nz = '0' then
@@ -724,6 +734,7 @@ begin
                 r.cr_mask <= (others =>'0');
                 r.cr_result <= (others =>'0');
                 r.instr_tag.valid <= '0';
+                r.exec_state <= IDLE;
                 if rst = '1' then
                     r.fpscr <= (others => '0');
                     r.comm_fpscr <= (others => '0');
@@ -853,16 +864,21 @@ begin
         variable rsh_in2     : signed(EXP_BITS-1 downto 0);
         variable exec_state  : state_t;
         variable opcbits     : std_ulogic_vector(4 downto 0);
-        variable int_result  : std_ulogic;
         variable illegal     : std_ulogic;
         variable rsign       : std_ulogic;
         variable rsgn_op     : std_ulogic_vector(1 downto 0);
+        variable is_nan_inf  : std_ulogic;
+        variable is_zero_den : std_ulogic;
+        variable sign_inv    : std_ulogic;
     begin
         v := r;
         v.complete := '0';
         v.do_intr := '0';
         is_32bint := '0';
         exec_state := IDLE;
+        is_nan_inf := '0';
+        is_zero_den := '0';
+        sign_inv := '0';
 
         if r.complete = '1' or r.do_intr = '1' then
             v.instr_done := '0';
@@ -894,7 +910,9 @@ begin
             v.divmod := '0';
             v.is_sqrt := '0';
             v.is_multiply := '0';
+            v.is_addition := '0';
             v.is_subtract := '0';
+            v.is_inverse := '0';
             fpin_a := '0';
             fpin_b := '0';
             fpin_c := '0';
@@ -905,6 +923,7 @@ begin
             v.result_sign := '0';
             v.negate := '0';
             v.quieten_nan := '1';
+            v.int_result := '0';
             case e_in.op is
                 when OP_FP_ARITH =>
                     fpin_a := e_in.valid_a;
@@ -913,32 +932,40 @@ begin
                     v.longmask := e_in.single;
                     v.fp_rc := e_in.rc;
                     exec_state := arith_decode(to_integer(unsigned(e_in.insn(5 downto 1))));
-                    if e_in.insn(5 downto 1) = "11001" or e_in.insn(5 downto 3) = "111" then
-                        v.is_multiply := '1';
-                    end if;
                     if e_in.insn(5 downto 1) = "10110" or e_in.insn(5 downto 1) = "11010" then
                         v.is_sqrt := '1';
                     end if;
-                    if e_in.insn(5 downto 1) = "01111" then
+                    if e_in.insn(5 downto 1) = "01111" then   -- fcti*z
                         v.round_mode := "001";
                     end if;
                     case e_in.insn(5 downto 1) is
                         when "10100" | "10101" =>       -- fadd and fsub
+                            v.is_addition := '1';
                             v.result_sign := e_in.fra(63);
                             if unsigned(e_in.fra(62 downto 52)) <= unsigned(e_in.frb(62 downto 52)) then
                                 v.result_sign := e_in.frb(63) xnor e_in.insn(1);
                             end if;
                             v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor e_in.insn(1));
                         when "11001" =>         -- fmul
+                            v.is_multiply := '1';
                             v.result_sign := e_in.fra(63) xor e_in.frc(63);
                         when "11100" | "11101" | "11110" | "11111" =>   --fmadd family
+                            v.is_multiply := '1';
+                            v.is_addition := '1';
                             v.result_sign := e_in.fra(63) xor e_in.frc(63);
                             v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor
                                                   e_in.frc(63) xor e_in.insn(1));
                             v.negate := e_in.insn(2);
                         when "10010" =>         -- fdiv
+                            v.is_inverse := '1';
                             v.result_sign := e_in.fra(63) xor e_in.frb(63);
-                        when others =>
+                        when "11000" | "11010" =>       -- fre and frsqrte
+                            v.is_inverse := '1';
+                            v.result_sign := e_in.frb(63);
+                        when "01110" | "01111" =>       -- fcti*
+                            v.int_result := '1';
+                            v.result_sign := e_in.frb(63);
+                        when others =>                  -- fri* and frsp
                             v.result_sign := e_in.frb(63);
                     end case;
                 when OP_FP_CMP =>
@@ -950,6 +977,10 @@ begin
                     opcbits := e_in.insn(10) & e_in.insn(8) & e_in.insn(4) & e_in.insn(2) & e_in.insn(1);
                     exec_state := misc_decode(to_integer(unsigned(opcbits)));
                     case opcbits is
+                        when "10010" | "11010" | "10011" =>
+                            -- fmrg*, mffs
+                            v.int_result := '1';
+                            v.result_sign := '0';
                         when "10110" =>        -- fcfid
                             v.result_sign := e_in.frb(63);
                         when others =>
@@ -1023,6 +1054,11 @@ begin
             v.b := bdec;
             v.c := cdec;
 
+            if e_in.op = OP_FP_ARITH then
+                is_nan_inf := adec.naninf or bdec.naninf or cdec.naninf;
+                is_zero_den := adec.zeroexp or bdec.zeroexp or cdec.zeroexp;
+            end if;
+
             v.exp_cmp := '0';
             if adec.exponent > bdec.exponent then
                 v.exp_cmp := '1';
@@ -1137,7 +1173,6 @@ begin
         rbit_inc := '0';
         mult_mask := '0';
         rnd_b32 := '0';
-        int_result := '0';
         illegal := '0';
 
         re_sel1 <= REXP1_ZERO;
@@ -1165,32 +1200,176 @@ begin
                         (e_in.valid_b = '0' or e_in.valid_c = '0') then
                         v.opsel_a := AIN_A;
                     end if;
-                    if e_in.op = OP_FP_ARITH then
-                        -- input selection for denorm cases
-                        case e_in.insn(5 downto 1) is
-                            when "10010" =>         -- fdiv
-                                if v.b.mantissa(UNIT_BIT) = '0' and v.a.mantissa(UNIT_BIT) = '1' then
-                                    v.opsel_a := AIN_B;
-                                end if;
-                            when "11001" =>         -- fmul
-                                if v.c.mantissa(UNIT_BIT) = '0' and v.a.mantissa(UNIT_BIT) = '1' then
-                                    v.opsel_a := AIN_C;
-                                end if;
-                            when "11100" | "11101" | "11110" | "11111" =>   -- fmadd etc.
-                                if v.a.mantissa(UNIT_BIT) = '0' then
-                                    v.opsel_a := AIN_A;
-                                elsif v.c.mantissa(UNIT_BIT) = '0' then
-                                    v.opsel_a := AIN_C;
-                                end if;
-                            when others =>
-                        end case;
+                    v.exec_state := exec_state;
+                    if is_nan_inf = '1' then
+                        v.state := DO_NAN_INF;
+                    elsif is_zero_den = '1' then
+                        v.state := DO_ZERO_DEN;
+                    else
+                        v.state := exec_state;
                     end if;
-                    v.state := exec_state;
                 end if;
                 v.x := '0';
                 v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
                 set_s := '1';
 
+            when DO_NAN_INF =>
+                -- At least one floating-point operand is infinity or NaN (entered when is_nan_inf = '1')
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+
+                if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
+                    (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or
+                    (r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then
+                    -- Signalling NaN: a NaN operand with QNAN_BIT clear raises VXSNAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then
+                    if r.int_result = '1' then  -- integer-result op: a NaN input goes to INT_OFLOW
+                        v.state := INT_OFLOW;
+                    else
+                        if r.a.class = NAN then  -- NaN operand selection priority: A, then B, then C
+                            v.opsel_a := AIN_A;
+                        elsif r.b.class = NAN then
+                            v.opsel_a := AIN_B;
+                        elsif r.c.class = NAN then
+                            v.opsel_a := AIN_C;
+                        end if;
+                        rsgn_op := RSGN_SEL;
+                        v.state := EXC_RESULT;  -- EXC_RESULT uses r.opsel_a to pick the result operand
+                    end if;
+
+                else
+                    if r.a.class = INFINITY then  -- no NaN operand, A is infinity
+                        if r.is_multiply = '1' and r.c.class = ZERO then
+                            -- infinity * zero: invalid operation, construct QNaN
+                            v.fpscr(FPSCR_VXIMZ) := '1';
+                            qnan_result := '1';
+                        elsif r.is_subtract = '1' and r.b.class = INFINITY then  -- effective subtraction, B also infinity
+                            v.fpscr(FPSCR_VXISI) := '1';
+                            qnan_result := '1';
+                        elsif r.is_inverse = '1' and r.b.class = INFINITY then  -- is_inverse op with A and B both infinity
+                            v.fpscr(FPSCR_VXIDI) := '1';
+                            qnan_result := '1';
+                        else
+                            v.result_class := INFINITY;
+                        end if;
+                        arith_done := '1';
+                    elsif r.c.class = INFINITY then  -- A is not infinity, C is
+                        if r.is_multiply = '1' and r.a.class = ZERO then
+                            -- zero * infinity: invalid operation, construct QNaN
+                            v.fpscr(FPSCR_VXIMZ) := '1';
+                            qnan_result := '1';
+                        elsif r.is_subtract = '1' and r.b.class = INFINITY then  -- effective subtraction, B also infinity
+                            v.fpscr(FPSCR_VXISI) := '1';
+                            qnan_result := '1';
+                        else
+                            v.result_class := INFINITY;
+                        end if;
+                        arith_done := '1';
+                    else
+                        -- A and C are not infinity, so r.b.class = INFINITY
+                        if r.int_result = '1' then
+                            -- fcti*
+                            v.state := INT_OFLOW;
+                        elsif r.is_sqrt = '1' and r.b.negative = '1' then  -- is_sqrt op on negative infinity
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                        elsif r.is_inverse = '1' then
+                            -- fdiv, fre, frsqrte: B is infinity, result is zero
+                            v.result_class := ZERO;
+                            arith_done := '1';
+                        else
+                            sign_inv := r.is_multiply and r.is_subtract;  -- inverts the final sign (rsign = r.result_sign xor sign_inv)
+                            v.result_class := INFINITY;
+                            arith_done := '1';
+                        end if;
+                    end if;
+                end if;
+
+            when DO_ZERO_DEN =>
+                -- At least one floating point operand is zero or denormalized (entered when is_zero_den = '1')
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if (r.use_a = '1' and r.a.class = ZERO) or
+                    (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') or  -- B = 0 with is_multiply set goes to the else branch
+                    (r.use_c = '1' and r.c.class = ZERO) then
+                    if r.use_a = '1' and r.a.class = ZERO then  -- A is zero
+                        if r.is_inverse = '1' then
+                            -- fdiv with A = 0: result is 0 unless B = 0 (then invalid, VXZDZ)
+                            if r.b.class = ZERO then
+                                v.fpscr(FPSCR_VXZDZ) := '1';
+                                qnan_result := '1';
+                            else
+                                v.result_class := ZERO;
+                            end if;
+                            arith_done := '1';
+                        elsif r.is_addition = '1' then
+                            -- A = 0 in an is_addition op: result is +/- B
+                            v.opsel_a := AIN_B;
+                            if r.is_multiply = '1' then
+                                rsgn_op := RSGN_SUB;  -- if subtracting, sign is opposite to the initial estimate
+                            end if;
+                            v.state := EXC_RESULT;  -- EXC_RESULT uses r.opsel_a to pick the result operand
+                        else
+                            v.result_class := ZERO;  -- neither inverse nor addition: result is zero
+                            arith_done := '1';
+                        end if;
+                    elsif r.use_c = '1' and r.c.class = ZERO then  -- C is zero, A is not
+                        v.opsel_a := AIN_B;  -- NOTE(review): B is taken as the result; confirm this is right for fmul, which has no addend
+                        rsgn_op := RSGN_SUB;
+                        v.state := EXC_RESULT;
+                    else
+                        -- B is zero, other operands are finite
+                        if r.int_result = '1' then
+                            -- fcti*, r.opsel_a = AIN_B
+                            arith_done := '1';
+                        elsif r.is_inverse = '1' then
+                            -- fdiv, fre, frsqrte with B = 0: infinity result plus zero-divide
+                            v.result_class := INFINITY;
+                            zero_divide := '1';  -- FPSCR_ZX is set from zero_divide after the state case
+                            arith_done := '1';
+                        elsif r.is_addition = '1' then
+                            -- fadd/fsub with B = 0, r.opsel_a = AIN_A; exponent is taken from A
+                            v.result_class := FINITE;  -- NOTE(review): if A is denormal the precomputed result_sign follows B; confirm the sign used here is A's
+                            re_sel1 <= REXP1_A;
+                            re_set_result <= '1';
+                            arith_done := '1';
+                        else
+                            -- other things, result is zero
+                            v.result_class := ZERO;
+                            arith_done := '1';
+                        end if;
+                    end if;
+
+                else
+                    -- some operand is denorm, and/or it's fmadd/fmsub with B=0
+                    v.opsel_a := AIN_B;  -- default operand selection, refined below
+                    if r.use_a = '1' and (r.use_b = '0' or r.use_c = '0') then
+                        v.opsel_a := AIN_A;
+                    end if;
+                    -- input selection for denorm cases (UNIT_BIT = '0' marks an unnormalized mantissa)
+                    case r.insn(5 downto 1) is
+                        when "10010" =>         -- fdiv
+                            if r.b.mantissa(UNIT_BIT) = '0' and r.a.mantissa(UNIT_BIT) = '1' then
+                                v.opsel_a := AIN_B;
+                            end if;
+                        when "11001" =>         -- fmul
+                            if r.c.mantissa(UNIT_BIT) = '0' and r.a.mantissa(UNIT_BIT) = '1' then
+                                v.opsel_a := AIN_C;
+                            end if;
+                        when "11100" | "11101" | "11110" | "11111" =>   -- fmadd etc.
+                            if r.a.mantissa(UNIT_BIT) = '0' then
+                                v.opsel_a := AIN_A;
+                            elsif r.c.mantissa(UNIT_BIT) = '0' then
+                                v.opsel_a := AIN_C;
+                            end if;
+                        when others =>
+                    end case;
+                    v.state := r.exec_state;  -- continue with the operation's own state saved in exec_state
+                end if;
+
             when DO_ILLEGAL =>
                 illegal := '1';
                 v.instr_done := '1';
@@ -1323,7 +1502,6 @@ begin
                 -- fmrgew, fmrgow
                 opsel_r <= RES_MISC;
                 misc_sel <= "01" & r.insn(8) & '0';
-                int_result := '1';
                 v.writing_fpr := '1';
                 v.instr_done := '1';
 
@@ -1355,7 +1533,6 @@ begin
                         v.illegal := '1';
                         v.writing_fpr := '0';
                 end case;
-                int_result := '1';
                 v.instr_done := '1';
 
             when DO_MTFSF =>
@@ -1393,21 +1570,12 @@ begin
                 rs_neg2 <= '1';
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                if r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0' then
-                    -- Signalling NAN
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
-                if r.b.class = FINITE then
-                    if r.b.exponent >= to_signed(52, EXP_BITS) then
-                        -- integer already, no rounding required
-                        arith_done := '1';
-                    else
-                        v.state := FRI_1;
-                        v.round_mode := '1' & r.insn(7 downto 6);
-                    end if;
-                else
+                if r.b.exponent >= to_signed(52, EXP_BITS) then
+                    -- integer already, no rounding required
                     arith_done := '1';
+                else
+                    v.state := FRI_1;
+                    v.round_mode := '1' & r.insn(7 downto 6);
                 end if;
 
             when DO_FRSP =>
@@ -1421,22 +1589,13 @@ begin
                 rs_neg2 <= '1';
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                if r.b.class = NAN and r.b.mantissa(53) = '0' then
-                    -- Signalling NAN
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
                 set_x := '1';
-                if r.b.class = FINITE then
-                    if r.b.exponent < to_signed(-126, EXP_BITS) then
-                        v.state := ROUND_UFLOW;
-                    elsif r.b.exponent > to_signed(127, EXP_BITS) then
-                        v.state := ROUND_OFLOW;
-                    else
-                        v.state := ROUNDING;
-                    end if;
+                if r.b.exponent < to_signed(-126, EXP_BITS) then
+                    v.state := ROUND_UFLOW;
+                elsif r.b.exponent > to_signed(127, EXP_BITS) then
+                    v.state := ROUND_OFLOW;
                 else
-                    arith_done := '1';
+                    v.state := ROUNDING;
                 end if;
 
             when DO_FCTI =>
@@ -1451,39 +1610,25 @@ begin
                 rs_neg2 <= '1';
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                if r.b.class = NAN and r.b.mantissa(53) = '0' then
-                    -- Signalling NAN
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
 
-                int_result := '1';
-
-                case r.b.class is
-                    when ZERO =>
-                        arith_done := '1';
-                    when FINITE =>
-                        if r.b.exponent >= to_signed(64, EXP_BITS) or
-                            (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then
-                            v.state := INT_OFLOW;
-                        elsif r.b.exponent >= to_signed(52, EXP_BITS) then
-                            -- integer already, no rounding required,
-                            -- shift into final position
-                            -- set shift to exponent - 56
-                            rs_con2 <= RSCON2_UNIT;
-                            if r.insn(8) = '1' and r.b.negative = '1' then
-                                v.state := INT_OFLOW;
-                            else
-                                v.state := INT_ISHIFT;
-                            end if;
-                        else
-                            -- set shift to exponent - 52
-                            rs_con2 <= RSCON2_52;
-                            v.state := INT_SHIFT;
-                        end if;
-                    when INFINITY | NAN =>
+                if r.b.exponent >= to_signed(64, EXP_BITS) or
+                    (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then
+                    v.state := INT_OFLOW;
+                elsif r.b.exponent >= to_signed(52, EXP_BITS) then
+                    -- integer already, no rounding required,
+                    -- shift into final position
+                    -- set shift to exponent - 56
+                    rs_con2 <= RSCON2_UNIT;
+                    if r.insn(8) = '1' and r.b.negative = '1' then
                         v.state := INT_OFLOW;
-                end case;
+                    else
+                        v.state := INT_ISHIFT;
+                    end if;
+                else
+                    -- set shift to exponent - 52
+                    rs_con2 <= RSCON2_52;
+                    v.state := INT_SHIFT;
+                end if;
 
             when DO_FCFID =>
                 -- r.opsel_a = AIN_B
@@ -1515,35 +1660,17 @@ begin
                 rs_sel2 <= RSH2_A;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                if r.a.class = FINITE and r.b.class = FINITE then
-                    v.add_bsmall := r.exp_cmp;
-                    v.opsel_a := AIN_B;
-                    if r.exp_cmp = '0' then
-                        if r.a.exponent = r.b.exponent then
-                            v.state := ADD_2;
-                        else
-                            v.longmask := '0';
-                            v.state := ADD_SHIFT;
-                        end if;
+                v.add_bsmall := r.exp_cmp;
+                v.opsel_a := AIN_B;
+                if r.exp_cmp = '0' then
+                    if r.a.exponent = r.b.exponent then
+                        v.state := ADD_2;
                     else
-                        v.state := ADD_1;
+                        v.longmask := '0';
+                        v.state := ADD_SHIFT;
                     end if;
                 else
-                    if r.a.class = NAN or r.b.class = NAN then
-                        v.state := NAN_RESULT;
-                    elsif r.a.class = INFINITY and r.b.class = INFINITY and r.is_subtract = '1' then
-                        -- invalid operation, construct QNaN
-                        v.fpscr(FPSCR_VXISI) := '1';
-                        qnan_result := '1';
-                        arith_done := '1';
-                    elsif r.a.class = INFINITY or r.b.class = ZERO then
-                        -- result is A; we're already set up to put A into R
-                        arith_done := '1';
-                    else
-                        -- result is +/- B
-                        v.opsel_a := AIN_B;
-                        v.state := EXC_RESULT;
-                    end if;
+                    v.state := ADD_1;
                 end if;
 
             when DO_FMUL =>
@@ -1555,32 +1682,14 @@ begin
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_C;
                 re_set_result <= '1';
-                if r.a.class = FINITE and r.c.class = FINITE then
-                    -- Renormalize denorm operands
-                    if r.a.mantissa(UNIT_BIT) = '0' then
-                        v.state := RENORM_A;
-                    elsif r.c.mantissa(UNIT_BIT) = '0' then
-                        v.state := RENORM_C;
-                    else
-                        f_to_multiply.valid <= '1';
-                        v.state := MULT_1;
-                    end if;
+                -- Renormalize denorm operands
+                if r.a.mantissa(UNIT_BIT) = '0' then
+                    v.state := RENORM_A;
+                elsif r.c.mantissa(UNIT_BIT) = '0' then
+                    v.state := RENORM_C;
                 else
-                    if r.a.class = NAN or r.c.class = NAN then
-                        v.state := NAN_RESULT;
-                    elsif (r.a.class = INFINITY and r.c.class = ZERO) or
-                        (r.a.class = ZERO and r.c.class = INFINITY) then
-                        -- invalid operation, construct QNaN
-                        v.fpscr(FPSCR_VXIMZ) := '1';
-                        qnan_result := '1';
-                    elsif r.a.class = ZERO or r.a.class = INFINITY then
-                        -- result is +/- A
-                        arith_done := '1';
-                    else
-                        -- r.c.class is ZERO or INFINITY
-                        v.opsel_a := AIN_C;
-                        v.state := EXC_RESULT;
-                    end if;
+                    f_to_multiply.valid <= '1';
+                    v.state := MULT_1;
                 end if;
 
             when DO_FDIV =>
@@ -1593,41 +1702,14 @@ begin
                 re_neg2 <= '1';
                 re_set_result <= '1';
                 v.count := "00";
-                if r.a.class = FINITE and r.b.class = FINITE then
-                    -- Renormalize denorm operands
-                    if r.a.mantissa(UNIT_BIT) = '0' then
-                        v.state := RENORM_A;
-                    elsif r.b.mantissa(UNIT_BIT) = '0' then
-                        v.state := RENORM_B;
-                    else
-                        v.first := '1';
-                        v.state := DIV_2;
-                    end if;
+                -- Renormalize denorm operands
+                if r.a.mantissa(UNIT_BIT) = '0' then
+                    v.state := RENORM_A;
+                elsif r.b.mantissa(UNIT_BIT) = '0' then
+                    v.state := RENORM_B;
                 else
-                    if r.a.class = NAN or r.b.class = NAN then
-                        v.state := NAN_RESULT;
-                    elsif r.b.class = INFINITY then
-                        if r.a.class = INFINITY then
-                            v.fpscr(FPSCR_VXIDI) := '1';
-                            qnan_result := '1';
-                        else
-                            v.result_class := ZERO;
-                        end if;
-                        arith_done := '1';
-                    elsif r.b.class = ZERO then
-                        if r.a.class = ZERO then
-                            v.fpscr(FPSCR_VXZDZ) := '1';
-                            qnan_result := '1';
-                        else
-                            if r.a.class = FINITE then
-                                zero_divide := '1';
-                            end if;
-                            v.result_class := INFINITY;
-                        end if;
-                        arith_done := '1';
-                    else -- r.b.class = FINITE, result_class = r.a.class
-                        arith_done := '1';
-                    end if;
+                    v.first := '1';
+                    v.state := DIV_2;
                 end if;
 
             when DO_FSEL =>
@@ -1646,33 +1728,18 @@ begin
                 v.fpscr(FPSCR_FI) := '0';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
-                case r.b.class is
-                    when FINITE =>
-                        if r.b.negative = '1' then
-                            v.fpscr(FPSCR_VXSQRT) := '1';
-                            qnan_result := '1';
-                        elsif r.b.mantissa(UNIT_BIT) = '0' then
-                            v.state := RENORM_B;
-                        elsif r.b.exponent(0) = '0' then
-                            v.state := SQRT_1;
-                        else
-                            -- set shift to 1
-                            rs_con2 <= RSCON2_1;
-                            v.state := RENORM_B2;
-                        end if;
-                    when NAN =>
-                        v.state := NAN_RESULT;
-                    when ZERO =>
-                        -- result is B
-                        arith_done := '1';
-                    when INFINITY =>
-                        if r.b.negative = '1' then
-                            v.fpscr(FPSCR_VXSQRT) := '1';
-                            qnan_result := '1';
-                        -- else result is B
-                        end if;
-                        arith_done := '1';
-                end case;
+                if r.b.negative = '1' then
+                    v.fpscr(FPSCR_VXSQRT) := '1';
+                    qnan_result := '1';
+                elsif r.b.mantissa(UNIT_BIT) = '0' then
+                    v.state := RENORM_B;
+                elsif r.b.exponent(0) = '0' then
+                    v.state := SQRT_1;
+                else
+                    -- set shift to 1
+                    rs_con2 <= RSCON2_1;
+                    v.state := RENORM_B2;
+                end if;
 
             when DO_FRE =>
                 -- r.opsel_a = AIN_B
@@ -1681,23 +1748,11 @@ begin
                 v.fpscr(FPSCR_FI) := '0';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
-                case r.b.class is
-                    when FINITE =>
-                        if r.b.mantissa(UNIT_BIT) = '0' then
-                            v.state := RENORM_B;
-                        else
-                            v.state := FRE_1;
-                        end if;
-                    when NAN =>
-                        v.state := NAN_RESULT;
-                    when INFINITY =>
-                        v.result_class := ZERO;
-                        arith_done := '1';
-                    when ZERO =>
-                        v.result_class := INFINITY;
-                        zero_divide := '1';
-                        arith_done := '1';
-                end case;
+                if r.b.mantissa(UNIT_BIT) = '0' then
+                    v.state := RENORM_B;
+                else
+                    v.state := FRE_1;
+                end if;
 
             when DO_FRSQRTE =>
                 -- r.opsel_a = AIN_B
@@ -1708,33 +1763,16 @@ begin
                 re_set_result <= '1';
                 -- set shift to 1
                 rs_con2 <= RSCON2_1;
-                case r.b.class is
-                    when FINITE =>
-                        if r.b.negative = '1' then
-                            v.fpscr(FPSCR_VXSQRT) := '1';
-                            qnan_result := '1';
-                        elsif r.b.mantissa(UNIT_BIT) = '0' then
-                            v.state := RENORM_B;
-                        elsif r.b.exponent(0) = '0' then
-                            v.state := RSQRT_1;
-                        else
-                            v.state := RENORM_B2;
-                        end if;
-                    when NAN =>
-                        v.state := NAN_RESULT;
-                    when INFINITY =>
-                        if r.b.negative = '1' then
-                            v.fpscr(FPSCR_VXSQRT) := '1';
-                            qnan_result := '1';
-                        else
-                            v.result_class := ZERO;
-                        end if;
-                        arith_done := '1';
-                    when ZERO =>
-                        v.result_class := INFINITY;
-                        zero_divide := '1';
-                        arith_done := '1';
-                end case;
+                if r.b.negative = '1' then
+                    v.fpscr(FPSCR_VXSQRT) := '1';
+                    qnan_result := '1';
+                elsif r.b.mantissa(UNIT_BIT) = '0' then
+                    v.state := RENORM_B;
+                elsif r.b.exponent(0) = '0' then
+                    v.state := RSQRT_1;
+                else
+                    v.state := RENORM_B2;
+                end if;
 
             when DO_FMADD =>
                 -- fmadd, fmsub, fnmadd, fnmsub
@@ -1749,54 +1787,25 @@ begin
                 rs_sel1 <= RSH1_B;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                if r.a.class = FINITE and r.c.class = FINITE and
-                    (r.b.class = FINITE or r.b.class = ZERO) then
-                    -- Make sure A and C are normalized
-                    if r.a.mantissa(UNIT_BIT) = '0' then
-                        v.state := RENORM_A;
-                    elsif r.c.mantissa(UNIT_BIT) = '0' then
-                        v.state := RENORM_C;
-                    elsif r.b.class = ZERO then
-                        -- no addend, degenerates to multiply
-                        f_to_multiply.valid <= '1';
-                        v.is_multiply := '1';
-                        v.state := MULT_1;
-                    elsif r.madd_cmp = '0' then
-                        -- addend is bigger, do multiply first
-                        -- if subtracting, sign is opposite to initial estimate
-                        rsgn_op := RSGN_SUB;
-                        f_to_multiply.valid <= '1';
-                        v.first := '1';
-                        v.state := FMADD_0;
-                    else
-                        -- product is bigger, shift B first
-                        v.state := FMADD_1;
-                    end if;
+                -- Make sure A and C are normalized
+                if r.a.mantissa(UNIT_BIT) = '0' then
+                    v.state := RENORM_A;
+                elsif r.c.mantissa(UNIT_BIT) = '0' then
+                    v.state := RENORM_C;
+                elsif r.b.class = ZERO then
+                    -- no addend, degenerates to multiply
+                    f_to_multiply.valid <= '1';
+                    v.state := MULT_1;
+                elsif r.madd_cmp = '0' then
+                    -- addend is bigger, do multiply first
+                    -- if subtracting, sign is opposite to initial estimate
+                    rsgn_op := RSGN_SUB;
+                    f_to_multiply.valid <= '1';
+                    v.first := '1';
+                    v.state := FMADD_0;
                 else
-                    if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then
-                        v.state := NAN_RESULT;
-                    elsif (r.a.class = ZERO and r.c.class = INFINITY) or
-                        (r.a.class = INFINITY and r.c.class = ZERO) then
-                        -- invalid operation, construct QNaN
-                        v.fpscr(FPSCR_VXIMZ) := '1';
-                        qnan_result := '1';
-                    elsif r.a.class = INFINITY or r.c.class = INFINITY then
-                        if r.b.class = INFINITY and r.is_subtract = '1' then
-                            -- invalid operation, construct QNaN
-                            v.fpscr(FPSCR_VXISI) := '1';
-                            qnan_result := '1';
-                        else
-                            -- result is infinity
-                            v.result_class := INFINITY;
-                            arith_done := '1';
-                        end if;
-                    else
-                        -- Here A is zero, C is zero, or B is infinity
-                        -- Result is +/-B in all of those cases
-                        v.opsel_a := AIN_B;
-                        rsgn_op := RSGN_SUB;
-                        v.state := EXC_RESULT;
-                    end if;
+                    -- product is bigger, shift B first
+                    v.state := FMADD_1;
                 end if;
 
             when RENORM_A =>
@@ -2403,7 +2412,6 @@ begin
                     when others =>      -- fctidu[z]
                         need_check := r.r(63);
                 end case;
-                int_result := '1';
                 if need_check = '1' then
                     v.state := INT_CHECK;
                 else
@@ -2430,7 +2438,6 @@ begin
                         v.fpscr(FPSCR_XX) := '1';
                     end if;
                 end if;
-                int_result := '1';
                 arith_done := '1';
 
             when INT_OFLOW =>
@@ -2441,7 +2448,6 @@ begin
                 end if;
                 v.fpscr(FPSCR_VXCVI) := '1';
                 invalid := '1';
-                int_result := '1';
                 arith_done := '1';
 
             when FRI_1 =>
@@ -2627,24 +2633,6 @@ begin
                 re_set_result <= '1';
                 arith_done := '1';
 
-            when NAN_RESULT =>
-                if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
-                    (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or
-                    (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then
-                    -- Signalling NAN
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
-                if r.use_a = '1' and r.a.class = NAN then
-                    v.opsel_a := AIN_A;
-                elsif r.use_b = '1' and r.b.class = NAN then
-                    v.opsel_a := AIN_B;
-                elsif r.use_c = '1' and r.c.class = NAN then
-                    v.opsel_a := AIN_C;
-                end if;
-                rsgn_op := RSGN_SEL;
-                v.state := EXC_RESULT;
-
             when EXC_RESULT =>
                 -- r.opsel_a = AIN_A, AIN_B or AIN_C according to which input is the result
                 case r.opsel_a is
@@ -3172,7 +3160,6 @@ begin
                     end if;
                 end if;
                 v.cr_result(0) := v.xerc.so;
-                int_result := '1';
                 v.writing_fpr := '1';
                 v.instr_done := '1';
             when IDIV_ZERO =>
@@ -3186,7 +3173,6 @@ begin
                     v.writing_xer := '1';
                 end if;
                 v.cr_result := "001" & v.xerc_result.so;
-                int_result := '1';
                 v.writing_fpr := '1';
                 v.instr_done := '1';
 
@@ -3210,7 +3196,7 @@ begin
             when others =>
         end case;
 
-        rsign := r.result_sign;
+        rsign := r.result_sign xor sign_inv;
         if zero_divide = '1' then
             v.fpscr(FPSCR_ZX) := '1';
         end if;
@@ -3596,7 +3582,7 @@ begin
                     v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
                 end if;
                 v.sp_result := r.single_prec;
-                v.int_result := int_result;
+                v.res_int := r.int_result or r.integer_op;
                 v.illegal := illegal;
                 v.nsnan_result := r.quieten_nan;
                 v.res_sign := rsign;
@@ -3627,7 +3613,7 @@ begin
         end if;
 
         -- This mustn't depend on any fields of r that are modified in IDLE state.
-        if r.int_result = '1' then
+        if r.res_int = '1' then
             fp_result <= r.r;
         else
             fp_result <= pack_dp(r.res_sign, r.result_class, r.result_exp, r.r,

From a3613d863b682d422c1dc412a4137e350c06a44e Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 10 Feb 2024 16:53:21 +1100
Subject: [PATCH 05/24] FPU: Simplify sign calculation in FP multiply-add
 instructions

By starting out with result_sign = +/- sign of B, we avoid the need to
flip the result sign in a few places.

This also simplifies DO_FMADD state a bit by having DO_ZERO_DEN go to
DO_FMUL state for floating multiply-add where B is zero.  (The
RENORM_A2 and RENORM_C2 states already do this.)

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 45f5fe0..f0a180f 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -952,7 +952,7 @@ begin
                         when "11100" | "11101" | "11110" | "11111" =>   --fmadd family
                             v.is_multiply := '1';
                             v.is_addition := '1';
-                            v.result_sign := e_in.fra(63) xor e_in.frc(63);
+                            v.result_sign := e_in.frb(63) xnor e_in.insn(1);
                             v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor
                                                   e_in.frc(63) xor e_in.insn(1));
                             v.negate := e_in.insn(2);
@@ -1253,6 +1253,7 @@ begin
                             v.fpscr(FPSCR_VXIDI) := '1';
                             qnan_result := '1';
                         else
+                            sign_inv := r.is_multiply and r.is_subtract;
                             v.result_class := INFINITY;
                         end if;
                         arith_done := '1';
@@ -1265,6 +1266,7 @@ begin
                             v.fpscr(FPSCR_VXISI) := '1';
                             qnan_result := '1';
                         else
+                            sign_inv := r.is_multiply and r.is_subtract;
                             v.result_class := INFINITY;
                         end if;
                         arith_done := '1';
@@ -1281,7 +1283,6 @@ begin
                             v.result_class := ZERO;
                             arith_done := '1';
                         else
-                            sign_inv := r.is_multiply and r.is_subtract;
                             v.result_class := INFINITY;
                             arith_done := '1';
                         end if;
@@ -1308,9 +1309,6 @@ begin
                         elsif r.is_addition = '1' then
                             -- result is +/- B
                             v.opsel_a := AIN_B;
-                            if r.is_multiply = '1' then
-                                rsgn_op := RSGN_SUB;
-                            end if;
                             v.state := EXC_RESULT;
                         else
                             v.result_class := ZERO;
@@ -1318,7 +1316,6 @@ begin
                         end if;
                     elsif r.use_c = '1' and r.c.class = ZERO then
                         v.opsel_a := AIN_B;
-                        rsgn_op := RSGN_SUB;
                         v.state := EXC_RESULT;
                     else
                         -- B is zero, other operands are finite
@@ -1349,6 +1346,14 @@ begin
                     if r.use_a = '1' and (r.use_b = '0' or r.use_c = '0') then
                         v.opsel_a := AIN_A;
                     end if;
+                    if r.use_b = '1' and r.b.class = ZERO and r.use_c = '1' then
+                        -- turn fmadd/sub into fmul
+                        v.opsel_a := AIN_A;
+                        rsgn_op := RSGN_SUB;
+                        v.state := DO_FMUL;
+                    else
+                        v.state := r.exec_state;
+                    end if;
                     -- input selection for denorm cases
                     case r.insn(5 downto 1) is
                         when "10010" =>         -- fdiv
@@ -1367,7 +1372,6 @@ begin
                             end if;
                         when others =>
                     end case;
-                    v.state := r.exec_state;
                 end if;
 
             when DO_ILLEGAL =>
@@ -1792,14 +1796,9 @@ begin
                     v.state := RENORM_A;
                 elsif r.c.mantissa(UNIT_BIT) = '0' then
                     v.state := RENORM_C;
-                elsif r.b.class = ZERO then
-                    -- no addend, degenerates to multiply
-                    f_to_multiply.valid <= '1';
-                    v.state := MULT_1;
                 elsif r.madd_cmp = '0' then
                     -- addend is bigger, do multiply first
                     -- if subtracting, sign is opposite to initial estimate
-                    rsgn_op := RSGN_SUB;
                     f_to_multiply.valid <= '1';
                     v.first := '1';
                     v.state := FMADD_0;
@@ -2006,7 +2005,6 @@ begin
                 -- product is bigger here
                 -- shift B right and use it as the addend to the multiplier
                 -- for subtract, multiplier does B - A * C
-                rsgn_op := RSGN_SUB;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 -- set shift to b.exp - result_exp + 64

From 9ac71cfbf2a666b3ff5e75fbaa3c9e99cee19597 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 5 Apr 2024 09:34:14 +1100
Subject: [PATCH 06/24] tests/fpu: Add more floating multiply-add tests

Add more tests to check that the result sign computations are correct.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 tests/fpu/fpu.c    |  71 +++++++++++++++++++++++++++++++++++++++++++++
 tests/test_fpu.bin | Bin 31832 -> 32896 bytes
 2 files changed, 71 insertions(+)

diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 79ba7fa..c13110f 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1496,34 +1496,105 @@ struct fmavals {
 	unsigned long nfma;
 	unsigned long nfms;
 } fmavals[] = {
+	/* +0 * +0 +- +0 -> +0, +0, -0, -0 */
 	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
 	  0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 },
+	/* +0 * NaNC +- +0 -> NaNC, NaNC, NaNC, NaNC */
 	{ 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000,
 	  0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000 },
+	/* +0 * NaNC +- NaNB -> NaNB, NaNB, NaNB, NaNB */
 	{ 0x0000000000000000, 0x7ffc000000000000, 0x7ffb000000000000,
 	  0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000 },
+	/* NaNA * NaNC +- NaNB -> NaNA, NaNA, NaNA, NaNA */
 	{ 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000,
 	  0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000 },
+	/* +1.0 * -0 +- +finite B -> +B, -B, -B, +B */
 	{ 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, 
 	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
+	/* +1.0 * -1.0 +- (B = +3.818e+190) -> +B, -B, -B, +B */
 	{ 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, 
 	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
+	/* +inf * -1.0 +- +finite B -> -inf, -inf, +inf, +inf */
 	{ 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, 
 	  0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	/* +inf * +0 +- +finite B -> NaNQ, NaNQ, NaNQ, NaNQ */
 	{ 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, 
 	  0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000 },
+	/* +1.0 * +1.0 +- 1.00000012 -> +2.00000012, -1.2e-7, -2.00000012, +1.2e-7 */
 	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, 
 	  0x4000000010000000, 0xbe80000000000000, 0xc000000010000000, 0x3e80000000000000 },
+	/* +(1 + 2^-52) * +(1 + 2^-52) +- +1.0 -> +(2 + 2^-51), +2^-51, -(2 + 2^-51), -2^-51 */
 	{ 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000,
 	  0x4000000000000001, 0x3cc0000000000000, 0xc000000000000001, 0xbcc0000000000000 },
+	/* +(1 + 3*2^-52) * +(1 + 2^-51) +- +1.0 -> +(2 + 2^-50), +5 * 2^-52 + 2^-101, -, - */
 	{ 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000,
 	  0x4000000000000002, 0x3cd4000000000002, 0xc000000000000002, 0xbcd4000000000002 },
+	/* +2.443e-77 * 2.828 +- -6.909e-77 -> -1.402e-93, +1.382e-76, +1.402e-93, -1.382e-76 */
 	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000,
 	  0xaca765753908cd20, 0x3030000000000000, 0x2ca765753908cd20, 0xb030000000000000 },
+	/* +2.443e-77 * 2.828 +- -6.909e-77 -> +9.446e-93, +1.382e-76, -9.446e-93, -1.382e-76 */
 	{ 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000,
 	  0x2cd3b3efbf5e2229, 0x3030000000000000, 0xacd3b3efbf5e2229, 0xb030000000000000 },
+	/* +2.443e-77 * 2.828 +- -1.1055e-75 -> -1.0364e-75, +1.1746e-75, +1.0364e-75, -1.1746e-75 */
 	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000,
 	  0xb05e0068a0000000, 0x3061003450000000, 0x305e0068a0000000, 0xb061003450000000 },
+	/* +2 * +3 +- 3 -> +9, +3, -9, -3 */
+	{ 0x4000000000000000, 0x4008000000000000, 0x4008000000000000,
+	  0x4022000000000000, 0x4008000000000000, 0xc022000000000000, 0xc008000000000000 },
+	/* +2 * +3 +- 5 -> +11, +1, -11, -1 */
+	{ 0x4000000000000000, 0x4008000000000000, 0x4014000000000000,
+	  0x4026000000000000, 0x3ff0000000000000, 0xc026000000000000, 0xbff0000000000000 },
+	/* +2 * +3 +- 7 -> +13, -1, -13, +1 */
+	{ 0x4000000000000000, 0x4008000000000000, 0x401c000000000000,
+	  0x402a000000000000, 0xbff0000000000000, 0xc02a000000000000, 0x3ff0000000000000 },
+	/* +2 * +3 +- 9 -> +15, -3, -15, +3 */
+	{ 0x4000000000000000, 0x4008000000000000, 0x4022000000000000,
+	  0x402e000000000000, 0xc008000000000000, 0xc02e000000000000, 0x4008000000000000 },
+	/* +2 * +3 +- -3 -> +3, +9, -3, -9 */
+	{ 0x4000000000000000, 0x4008000000000000, 0xc008000000000000,
+	  0x4008000000000000, 0x4022000000000000, 0xc008000000000000, 0xc022000000000000 },
+	/* +2 * +3 +- -5 -> +1, +11, -1, -11 */
+	{ 0x4000000000000000, 0x4008000000000000, 0xc014000000000000,
+	  0x3ff0000000000000, 0x4026000000000000, 0xbff0000000000000, 0xc026000000000000 },
+	/* +2 * +3 +- -7 -> -1, +13, +1, -13 */
+	{ 0x4000000000000000, 0x4008000000000000, 0xc01c000000000000,
+	  0xbff0000000000000, 0x402a000000000000, 0x3ff0000000000000, 0xc02a000000000000 },
+	/* +2 * +3 +- -9 -> -3, +15, +3, -15 */
+	{ 0x4000000000000000, 0x4008000000000000, 0xc022000000000000,
+	  0xc008000000000000, 0x402e000000000000, 0x4008000000000000, 0xc02e000000000000 },
+	/* +2 * -3 +- 3 -> -3, -9, +3, +9 */
+	{ 0x4000000000000000, 0xc008000000000000, 0x4008000000000000,
+	  0xc008000000000000, 0xc022000000000000, 0x4008000000000000, 0x4022000000000000 },
+	/* +2 * -3 +- 5 -> -1, -11, +1, +11 */
+	{ 0x4000000000000000, 0xc008000000000000, 0x4014000000000000,
+	  0xbff0000000000000, 0xc026000000000000, 0x3ff0000000000000, 0x4026000000000000 },
+	/* +2 * -3 +- 7 -> +1, -13, -1, +13 */
+	{ 0x4000000000000000, 0xc008000000000000, 0x401c000000000000,
+	  0x3ff0000000000000, 0xc02a000000000000, 0xbff0000000000000, 0x402a000000000000 },
+	/* +2 * -3 +- 9 -> +3, -15, -3, +15 */
+	{ 0x4000000000000000, 0xc008000000000000, 0x4022000000000000,
+	  0x4008000000000000, 0xc02e000000000000, 0xc008000000000000, 0x402e000000000000 },
+	/* -2 * +3 +- -3 -> -9, -3, +9, +3 */
+	{ 0xc000000000000000, 0x4008000000000000, 0xc008000000000000,
+	  0xc022000000000000, 0xc008000000000000, 0x4022000000000000, 0x4008000000000000 },
+	/* -2 * +3 +- -5 -> -11, -1, +11, +1 */
+	{ 0xc000000000000000, 0x4008000000000000, 0xc014000000000000,
+	  0xc026000000000000, 0xbff0000000000000, 0x4026000000000000, 0x3ff0000000000000 },
+	/* -2 * +3 +- -7 -> -13, +1, +13, -1 */
+	{ 0xc000000000000000, 0x4008000000000000, 0xc01c000000000000,
+	  0xc02a000000000000, 0x3ff0000000000000, 0x402a000000000000, 0xbff0000000000000 },
+	/* -2 * +3 +- -9 -> -15, +3, +15, -3 */
+	{ 0xc000000000000000, 0x4008000000000000, 0xc022000000000000,
+	  0xc02e000000000000, 0x4008000000000000, 0x402e000000000000, 0xc008000000000000 },
+	/* -2 * +3 +- +0 -> -6, -6, +6, +6 */
+	{ 0xc000000000000000, 0x4008000000000000, 0x0000000000000000,
+	  0xc018000000000000, 0xc018000000000000, 0x4018000000000000, 0x4018000000000000 },
+	/* +2 * -3 +- -0 -> -6, -6, +6, +6 */
+	{ 0x4000000000000000, 0xc008000000000000, 0x8000000000000000,
+	  0xc018000000000000, 0xc018000000000000, 0x4018000000000000, 0x4018000000000000 },
+	/* 2^-1026 * (1.5 * 2^1023) +- -0 -> (1.5 * 2^-3), ditto, -ditto, -ditto */
+	{ 0x0001000000000000, 0x7fe8000000000000, 0x8000000000000000,
+	  0x3fc8000000000000, 0x3fc8000000000000, 0xbfc8000000000000, 0xbfc8000000000000 },
 };
 
 int test23(long arg)
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index cc6c1ccb06362188008b2c1a3cdc5936a329d9f3..24878af7125b68dfa588e23782443a3a7f773f65 100755
GIT binary patch
delta 3912
zcmaJ@3s6+o89rx0VU@K!1zBKU1a>#PU+x0S-UWeWkynFNnUJ{VX?zqqNeUI+g)$v+
zqyc}^Av0;J9d#<AS&Pop665M5t)^;YCl6DN#fIsCk%VNd!TPxU&wVVillIKq@7()+
z|9{T^ICt+}nWf%tMH(TTjS!dnPhqZ3oS*Z0U2a0wA-55^jmT|$j&u*E<<m2REqQeI
zSpV1$JRLs0m5`b#LZ<$L#uhE{))TzhA@EaIZ}EIBduA82-Qg=o`}1h~!PO5R(b^>K
zX#HLfA;VXXe~<Q6y%)8eSBK`bFkz!!A6$yeQuyA6hmplOEiGEx%nXN@%iYpqaxWn@
zod(L;UZ`}rXL#M+!+l!Y*B3>8oOf8^#VD=aK<AuZs0@fx6mud@bC;3yaEJRi8(W+$
z`t?N@E^O|c<aJXWKQxX0Icfoo4$US`F?j_Rge45uqgPWO6vd>NJF-Mks>GB6SH@~g
zvDL5fJzU4V2~UW?!S<N6#GjyqMHJ*WB{@`--;GIG)aEPi7Po?b#pEk}>%Hu$%?J9}
zytw2{;ylCaW_kX%cAoFC6Ea>B4jW?aG#n1aZd|x&ZLg*8OSrT&3qDK=9bFk0N~sr`
zmS)o==w4b)&%t@bNQjEBrlqhJF%|Y9j>A;^Yjh2CCREdAID@zbl*_8=3TQ$!!W)RQ
za077!Oo`QW540n`2PY8+fU2wMlTd>=GCHUR8ak3noG=re09>+G)rJpLCzfae+LJYO
z3I>yxCv{-6i|!xtl!cjADTy*8iF9+TiOH%ZhEAABE{JY55a%&WmgPT@w!H&SfjK2h
zG1~?$DOEHU_NA!dR$2IuhKIGb|9Cjt@UY1}JnV8WCVtDokn>&acq|r<{&-0w!DEeU
zjg7h7MxwhrHm0@Z5NaJ8Gr6_ohK1Q0i32y+-DUfEUxP%Au$N0?#^henlq|^krQZ5J
zoY3U65N;y9AL6uj8U?Ggc1Hb5`#dG^KpT%GiPLSRAH$$d5dNlnei>;$U<oH)OU*sL
zmXht3bwp=59`;OEGc@UcN{_*L{S$1B6hj_;5t<B7sKVQyv9PEwW7LCi%wWeReS_$L
z+*CDtBkEIM!k(N=h3Kdx;%t`+MToy--~c@J*ils^ZW)268xKoq0xU~Ys}e-U!I(&J
zq{X}DF;a-h-?2Aaox!Xu#O?h)o4a?!GzO$J?kB=d<4X{cZj3S|&X4AI(E3<9Y)an|
z6c9D4bAy+T#0stbH(y&u+>?YP3n9$2fjVP+P#W#VcsvY!#_#Aw=r%1^480A1G;N{J
zg2}v*a-*-C4~Nh~aAd_Na|x!dCrOW^L+$Aj-=OuSvu&qJ2$uRHypm<d_MONwM86SD
zoZXMq({~tdX02B;v)MMjWExgwH_!+;ki9i}ZPeTuWiR92Ly(tKESX*}!BumzF}ede
zwb7lC^XoG7Yf$9w#4Y=C<5}wu=2p>nU^>@BRj@fvtw<Pxoq1(U`5@06$Ij2-O;&Wf
zwNWU#VVe(>`A;jNmOxpdn(l^Q<X184OZlR8|I~cz5l|I8tx&08OP-nz!p?#sW_`B6
zOfSH6L4(2;fl8vnKL9O-g0#EPthm1z-Yu+GY+fvAIRjuW%42R%6`5%c^c6KI_AL^n
zez+q_=H+HA{;K5-ijS3|v<uEI7rgGEGz2<J164x1<;QHHe_C8ZZfhr4v}IQDy)bO+
z0Ecz8Qb)=%W*_!&L(pVV)01$@`v1Gw1m9Tgv=?%V(-pcfp)P~)YO#>c(PA^cLZ21?
z+@lp=t;t#b#+L^}!n;1Y!S>eT=L>(?g|o4~)QC1-pMLNpYV1W=fG0Btd(d(33(qui
z2NgeKXQ-5&SPSx>3Eb8I$07GVa*j{<+QLb`HV1pg?6HrdS4&RNsDj2`OaBA5cu(mv
z{P}N#eKu`~Z~Y_;EY&Vyp$z%-eQX}^!MC=p%DJak%ihA15@&xW*{8UhM2L-O!`-_p
zT-$o_a5E2<%#aVhva2icLo*Uex`rFLZ3B3(IP2;R#UI&6xe&rxBZ9cddCySKY4BsW
zh09k7Q7AK{xQ=r5zjuUETj(X8#}`%@k%&ycf?T6W5yo+N#r~D&M;_st@qPpx<uP%8
zKqXaD*xZHJIoMF1PG5$>^2hOubE-TWzg*Mh8fpPmg$BQ0nTXFq9pWz7jCdUS5wF3K
z3XLN4Be+zNLNj0*sb)~E(9kZ(MBEQ`h!<cp;%(?hOoAhbE8rsHCYVOt1FA|5mN*k}
z0_qSWU^8L?^ds(qBbA!CiBEVQYcC8f;_VPZZo!R8!vf2vJnsZ$l>tAZnN`^sR%=yu
zrtykYqB^C6{D!`zs94e%<=DP%(GpVrkLwE^i*XHBwhN(zOu$L>Ylf++5=Hah;bv8y
zG6$uX7ZB16?&^FD{PpVeIOjE<*CNjb(<cP|a&^9f`y5=(V&I(N7=MkkB5ud{T(qni
z#}^VZ0Q;OV)DI_|4W^_Cp5MdP!NK&5Fhbr$u2*2h$T;L8AhRYLThdySPj|o|-W%ae
zO$oN)0p908rYoC<LM!5a=to=urx3?s8nFwKYqMzr)FE168{%F#g2=(=h_^wtG9MYo
z%9yywZ+YG;?QF!E!>bE6tbE+$LxnMHusFSVy@y-|#Ty)qV|iQ~J+-nX1jg}y(ZkkA
z_!Ljh_Qr*y8I50rc{tQ@gmc0anDxQWeo2>RAqu2qzgSMx)1-L@|BbPc=4VLA%xQ5B
z_VXlhqF|P0+0g74d3m8=N4;d9CC%Y}B8)6)mIDj+lkFtkL-d^BM`S@SEfDM&EAetb
zflQ^$(*B2ElI$nh1ril1<RL`PG<ZWf5IL|wrW_H5oN2&M+DOh^_6w{d`RS!u@WcHB
znae$tTae~Q5l3*w!5hjgl<nk%<aOkXgZ=azA#DG^A(AtaBbPHKer7diC+mUC{DM5!
zN=|$Z_LEzf<`;DgP}rXYdTIZDN4bZA%p^NGGda`X4dnm>iE-lka$@0raew@uDhhJ%
z#3w=$Bt9^6qLLuZ!6hYLN%sfMob)Ws(hmfgG|2&YMT2uvW`X&%#Gi)CG=5aG#+v$H
DLkTV0

delta 2865
zcmZuydr*_v6~8wmKm?S4kw<_8LITMG$?{AHAps18hk&E3!nnIeo0dv#$&Q`IAxNaK
z0~WTj$3NH^w5wCxUCef5fOfX7j<(gxcGu#xZgpfV?0{qHy3J~Bv7%qkP2vDfznS~{
z?){x}&pnU(a3AK~T%t-s%tk_LY*)f+4&vYLcGcJj*^OKqa&5@9*-7V+GLN1gYRRR`
z<AdimbTxX?^n_G<3Gx03m31=Wt|!>(^Yh-@Uvpe78?(lA`#j~SKaIMd-JV$`vj%0q
zs(-(Wkg?m7|3~&sy&GjmZV#`>!u$*Mc;H%e4zKSFJdZBc`g}6u99O3I`Bdn)>R?NY
z%8t$E<5Hi<sm%C2OMpMqud@I13zZF8MKV)xl9yr(FDA;V%FdLDRqe_o-R71SneiT1
z=4Hm1;THy+HPsV;6bWcdq=r}n!SgU7EMT}3vx9n|Fg8W)&It&Eneg&(CCrAGS)b*)
z&CaI_T_JuB4#g@HYfywq^ovJ=Vtzn;AvR^5)l+^ruoT>l%@ca|xY@|+fwYaeVnsHw
zUgA=hIPR2*<GM|R%o!r#`x{L(8a~?C7TVt2qwBv9*EZ$AOcH;hPRysY54LX7P#GNF
zR7F3BX~c~nimRe#XhzJ0lZXr8jr%#>1qb7+=uWtVxC?{{Rn!Vw5wqYu#ATRAJPWGC
zDmn}f#H;WL;xV9-D!Lo05zkHxNr3Vv(}@*)G4WtdmWiA=C=K1aou&rTGza^WrSt&|
zCKn|gz=9Vpf7DePrfL!rVOA13Y~Dds`qEH&6U-&&$Jpe=I*tLe94sggIdLZ_Q*wC3
z0k5T0(s($YB6;J#T{7bmH~-YfU3xk;rm{tli@MVx(=Jv?9NSrv7#~gO60?;_QlCT+
znK6@;>Cc^0*|db}nXZh?>u_vmoD6l?nmd06o235rDy3=+(jhen!jGisr(s;0$K1G!
z^oM}^jEZ5a%*3d7W#3H*cZ}+yiCfpZ%+*qINVl7$>L$bXx{iS;wS(S*!BoHMwbX!W
zII31ZU8}kiwxs=pPQm50ZOm$-JeU3ew#c_dMLYJQZMt+FV;O{x<t8lO14IjCr%Tvb
zsY&05)tX8NkvNH19l`Vwa?#KVWf>XZR`eCbql1yS^>HVa$guC+R?*2Ew*QA6>!d8k
zXA;{zCPTe4HjW3QGA?Q@V^hW{*D$EhqIvS~Kr}~#ffEUwP`(?Jow(K^_i<=ZD4{i@
zFKUfP@GL{p^s-&%U-M(GwiNv$B-!ud%Q|fB0@NWy;+N})&Amnk_GiZN;uFxDec;O+
zpx2>ORm5vQ1HV@NkhX$C-A2j8korsr)q^o7juN=8j};Qfc|$nneHt7&Cd!BL9C^&~
z7-BuVD!F|hz}=iZLZ(I>oWToQH4S(!Pii`1UKg!wW40ZgqmZL54yxWUpjE8~ck@YY
zZ46dnP08BdgXh{KiEKkTAN{`T@qMnA<GwP<Fb6CjxHC?@T1hX0_th?eXag<-4xN}k
zXYc{Mom<Lw&Xrs>4To=YW9cLa^4=7P*RN=`zXRQQl}ziiJhhlDGU#Q6GwB=sg_C!l
z1!4Z10#Ve;oU4QIc77q7b0J?%XTY1^ASjDmkq-~RYXyG!I|XVUiG<%4)C(LDD+=KQ
zpe)Q~23iW$G#`2k8w8{4R>}L}VL+}dQe#cF7B%3JIaws(h4;aQBLB>XC>eodT?37R
z{kk79V|R2l{s@{ESOlf|z?qYGy1=O4AzV*NvzDHBn}^}LR+4Z&EO3!J<tJQDiOaS4
zh^zhT^mNDCYHxuDdJ|rt*~J+G1<rVp+x9_ttJojmSg{&U-;Ls*>N5kU@*dtq_fLm}
zyJqH@7czVk$!@2l%f)efO0=ls_M{C>p^P0U0TxgPLx#9)GiLr4vZr|+sClY{mDY;f
zW#o)sa<v8bxLPe%gvr7uZWw+;MY_fw-QY83sJkQq-;V=u+$h5*>4Gss=t=8m>mK#M
zH^z=tqqj;=;8qctK1IE2CJEv{OcDz|Mjk$KjWw7%k7A$qyPAW1yo6VCA;heYe2I%*
zGlm+o6z%OZ=SBHf7iI-L7&F)NS$AT|%%|ezFE|blum6iiWLh&IDvvcr{+Z)E0Rdr>
z?bv>aoNN`>iv6?5r3X3ocVW8-w(<<T9{bB*p#v~guE7nzST3bn5L8HUhba(ufd#P#
z?1&!dMf@v_R!Dive}O9%DKs4xk*Wj1W+^=Y3d9kxApQaDh+ji5;zk%nEP>02-+@KM
z0T5WEc&-$PvtU8wgB>vodJ%hI)FKtne#vo3L5F4d|5peh3vkOK7ij;+adki|<#anJ
zDmD1HHdSh}({Bb-R4Xhazn~|0o7Xpr$`0AqttaJwzY|&(fi_s$KF74q!8p!qg!xJX
zVsaJ3=BhmEgTbl{@#fi;^l5R%f`E`lm_~UCJgcgpb5LfjQ1#5M_{S1I8A`|ia+|Tw
zzB+7Ik^2-rwl;|47dY;ipC|q@WrYzk45I2-Ou3?3L))RLIu9f7$9_6|Qf<Ij^D*|b
zKvAQ?+q4Psedt9r!xZ8yEFwBVT&uzSS`hQ$$A}{^idX_S5Pcx1%TtYf%W*xcSH^zi
zLymI?xlH_<u{pt3_lnqq62`EPMdikJ5l+^{3T{5)xYmhFb@dT&6Ypouwhd7KdicZ}
IyVBDC2fW16X8-^I


From 7812a55b6ce73ba1388b0cdc767afe3de43b6a56 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 5 Feb 2024 21:57:59 +1100
Subject: [PATCH 07/24] FPU: Reorganize NaN and infinity handling and improve
 arch compliance

The architecture specifies that an invalid operation exception for
signalling NaN (VXSNAN) can occur in the same instruction as an
invalid operation exception for infinity times zero (VXIMZ) in the
case of a multiply-add instruction where B is a signalling NaN, and
one of A and C is infinity and the other is zero.  This moves the
invalid operation tests around so as to handle this case correctly.
It also restructures the infinity and NaN cases to simplify the logic
a little.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 74 +++++++++++++++++++++++++-------------------------------
 1 file changed, 33 insertions(+), 41 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index f0a180f..d7a5e42 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -838,6 +838,7 @@ begin
         variable set_y       : std_ulogic;
         variable set_s       : std_ulogic;
         variable qnan_result : std_ulogic;
+        variable invalid_mul : std_ulogic;
         variable px_nz       : std_ulogic;
         variable pcmpb_eq    : std_ulogic;
         variable pcmpb_lt    : std_ulogic;
@@ -1217,6 +1218,7 @@ begin
                 -- At least one floating-point operand is infinity or NaN
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
+                invalid_mul := '0';
 
                 if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
                     (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or
@@ -1225,6 +1227,15 @@ begin
                     v.fpscr(FPSCR_VXSNAN) := '1';
                     invalid := '1';
                 end if;
+                -- Check for this case here since VXIMZ can be set along with VXSNAN
+                if r.is_multiply = '1' and
+                    ((r.a.class = INFINITY and r.c.class = ZERO) or
+                     (r.a.class = ZERO and r.c.class = INFINITY)) then
+                    v.fpscr(FPSCR_VXIMZ) := '1';
+                    qnan_result := '1';
+                    invalid_mul := '1';
+                end if;
+
                 if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then
                     if r.int_result = '1' then
                         v.state := INT_OFLOW;
@@ -1241,51 +1252,32 @@ begin
                     end if;
 
                 else
-                    if r.a.class = INFINITY then
-                        if r.is_multiply = '1' and r.c.class = ZERO then
-                            -- invalid operation, construct QNaN
-                            v.fpscr(FPSCR_VXIMZ) := '1';
-                            qnan_result := '1';
-                        elsif r.is_subtract = '1' and r.b.class = INFINITY then
+                    if (r.a.class = INFINITY or r.c.class = INFINITY) and invalid_mul = '0' then
+                        sign_inv := r.is_multiply and r.is_subtract;
+                        if r.is_subtract = '1' and r.b.class = INFINITY then
                             v.fpscr(FPSCR_VXISI) := '1';
                             qnan_result := '1';
-                        elsif r.is_inverse = '1' and r.b.class = INFINITY then
-                            v.fpscr(FPSCR_VXIDI) := '1';
-                            qnan_result := '1';
-                        else
-                            sign_inv := r.is_multiply and r.is_subtract;
-                            v.result_class := INFINITY;
                         end if;
-                        arith_done := '1';
-                    elsif r.c.class = INFINITY then
-                        if r.is_multiply = '1' and r.a.class = ZERO then
-                            -- invalid operation, construct QNaN
-                            v.fpscr(FPSCR_VXIMZ) := '1';
-                            qnan_result := '1';
-                        elsif r.is_subtract = '1' and r.b.class = INFINITY then
-                            v.fpscr(FPSCR_VXISI) := '1';
-                            qnan_result := '1';
-                        else
-                            sign_inv := r.is_multiply and r.is_subtract;
-                            v.result_class := INFINITY;
-                        end if;
-                        arith_done := '1';
+                    end if;
+                    if r.is_inverse = '1' and r.a.class = INFINITY and r.b.class = INFINITY then
+                        v.fpscr(FPSCR_VXIDI) := '1';
+                        qnan_result := '1';
+                    end if;
+                    if r.b.class = INFINITY and r.is_sqrt = '1' and r.b.negative = '1' then
+                        v.fpscr(FPSCR_VXSQRT) := '1';
+                        qnan_result := '1';
+                    end if;
+                    if r.b.class = INFINITY and r.is_inverse = '1' then
+                        -- fdiv, fre, frsqrte
+                        v.result_class := ZERO;
                     else
-                        -- r.b.class = INFINITY
-                        if r.int_result = '1' then
-                            -- fcti*
-                            v.state := INT_OFLOW;
-                        elsif r.is_sqrt = '1' and r.b.negative = '1' then
-                            v.fpscr(FPSCR_VXSQRT) := '1';
-                            qnan_result := '1';
-                        elsif r.is_inverse = '1' then
-                            -- fdiv, fre, frsqrte
-                            v.result_class := ZERO;
-                            arith_done := '1';
-                        else
-                            v.result_class := INFINITY;
-                            arith_done := '1';
-                        end if;
+                        v.result_class := INFINITY;
+                    end if;
+                    if r.b.class = INFINITY and r.int_result = '1' then
+                        -- fcti*
+                        v.state := INT_OFLOW;
+                    else
+                        arith_done := '1';
                     end if;
                 end if;
 

From 2422585e140d1de47e3ff6d88495657fdbe7f703 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 12 Feb 2024 22:16:13 +1100
Subject: [PATCH 08/24] FPU: Reduce use of r.insn inside the state machine

Instead use things derived from the instruction in the first cycle,
such as r.is_multiply, r.is_addition, etc.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 60 +++++++++++++++++++++++++-------------------------------
 1 file changed, 27 insertions(+), 33 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index d7a5e42..b602648 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -938,6 +938,8 @@ begin
                     end if;
                     if e_in.insn(5 downto 1) = "01111" then   -- fcti*z
                         v.round_mode := "001";
+                    elsif e_in.insn(5 downto 1) = "01000" then   -- fri*
+                        v.round_mode := '1' & e_in.insn(7 downto 6);
                     end if;
                     case e_in.insn(5 downto 1) is
                         when "10100" | "10101" =>       -- fadd and fsub
@@ -1334,36 +1336,29 @@ begin
 
                 else
                     -- some operand is denorm, and/or it's fmadd/fmsub with B=0
-                    v.opsel_a := AIN_B;
-                    if r.use_a = '1' and (r.use_b = '0' or r.use_c = '0') then
+                    -- input selection for denorm cases
+                    -- A and C are non-zero if present,
+                    -- B is non-zero if present except for multiply-add
+                    if r.a.zeroexp = '1' and (r.is_multiply or r.is_inverse) = '1' then
                         v.opsel_a := AIN_A;
+                    elsif r.b.zeroexp = '1' and (r.is_inverse or r.is_sqrt) = '1' then
+                        v.opsel_a := AIN_B;
+                    elsif r.c.zeroexp = '1' then
+                        v.opsel_a := AIN_C;
+                    else
+                        v.opsel_a := AIN_B;
+                        if r.use_a = '1' and (r.use_b = '0' or r.use_c = '0' or r.b.class = ZERO) then
+                            v.opsel_a := AIN_A;
+                        end if;
                     end if;
-                    if r.use_b = '1' and r.b.class = ZERO and r.use_c = '1' then
-                        -- turn fmadd/sub into fmul
-                        v.opsel_a := AIN_A;
+                    if r.is_multiply = '1' and r.b.class = ZERO then
+                        -- This will trigger for fmul as well as fmadd/sub, but
+                        -- it doesn't matter since r.is_subtract = 0 for fmul.
                         rsgn_op := RSGN_SUB;
                         v.state := DO_FMUL;
                     else
                         v.state := r.exec_state;
                     end if;
-                    -- input selection for denorm cases
-                    case r.insn(5 downto 1) is
-                        when "10010" =>         -- fdiv
-                            if r.b.mantissa(UNIT_BIT) = '0' and r.a.mantissa(UNIT_BIT) = '1' then
-                                v.opsel_a := AIN_B;
-                            end if;
-                        when "11001" =>         -- fmul
-                            if r.c.mantissa(UNIT_BIT) = '0' and r.a.mantissa(UNIT_BIT) = '1' then
-                                v.opsel_a := AIN_C;
-                            end if;
-                        when "11100" | "11101" | "11110" | "11111" =>   -- fmadd etc.
-                            if r.a.mantissa(UNIT_BIT) = '0' then
-                                v.opsel_a := AIN_A;
-                            elsif r.c.mantissa(UNIT_BIT) = '0' then
-                                v.opsel_a := AIN_C;
-                            end if;
-                        when others =>
-                    end case;
                 end if;
 
             when DO_ILLEGAL =>
@@ -1571,7 +1566,6 @@ begin
                     arith_done := '1';
                 else
                     v.state := FRI_1;
-                    v.round_mode := '1' & r.insn(7 downto 6);
                 end if;
 
             when DO_FRSP =>
@@ -1813,9 +1807,9 @@ begin
                 set_a := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
-                if r.insn(4) = '1' then
+                if r.is_multiply = '1' then
                     if r.c.mantissa(UNIT_BIT) = '1' then
-                        if r.insn(3) = '0' or r.b.class = ZERO then
+                        if r.is_addition = '0' or r.b.class = ZERO then
                             v.first := '1';
                             v.state := MULT_1;
                         else
@@ -1867,7 +1861,7 @@ begin
                 set_c := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
-                if r.insn(3) = '0' or r.b.class = ZERO then
+                if r.is_addition = '0' or r.b.class = ZERO then
                     v.first := '1';
                     v.state := MULT_1;
                 else
@@ -2081,16 +2075,16 @@ begin
                     re_set_result <= '1';
                 end if;
                 v.first := '1';
-                if r.insn(4) = '0' then
-                    if r.insn(3) = '0' then
-                        v.state := DIV_2;
+                if r.is_sqrt = '1' then
+                    if r.is_inverse = '1' then
+                        v.state := RSQRT_1;
                     else
                         v.state := SQRT_1;
                     end if;
-                elsif r.insn(2) = '0' then
-                    v.state := FRE_1;
+                elsif r.use_a = '1' then
+                    v.state := DIV_2;
                 else
-                    v.state := RSQRT_1;
+                    v.state := FRE_1;
                 end if;
 
             when DIV_2 =>

From 4e5f856c5517eebb022f17b07cfbe73ff95d422b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 13 Feb 2024 17:17:03 +1100
Subject: [PATCH 09/24] FPU: Factor out some of the common elements of the DO_*
 states

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 36 ++++++++++--------------------------
 1 file changed, 10 insertions(+), 26 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index b602648..345dc6e 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -146,6 +146,7 @@ architecture behaviour of fpu is
         exp_cmp      : std_ulogic;
         madd_cmp     : std_ulogic;
         add_bsmall   : std_ulogic;
+        is_arith     : std_ulogic;
         is_addition  : std_ulogic;
         is_multiply  : std_ulogic;
         is_inverse   : std_ulogic;
@@ -176,6 +177,7 @@ architecture behaviour of fpu is
         res_sign     : std_ulogic;
         res_int      : std_ulogic;
         exec_state   : state_t;
+        cycle_1      : std_ulogic;
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -880,6 +882,7 @@ begin
         is_nan_inf := '0';
         is_zero_den := '0';
         sign_inv := '0';
+        v.cycle_1 := e_in.valid;
 
         if r.complete = '1' or r.do_intr = '1' then
             v.instr_done := '0';
@@ -925,6 +928,7 @@ begin
             v.negate := '0';
             v.quieten_nan := '1';
             v.int_result := '0';
+            v.is_arith := '0';
             case e_in.op is
                 when OP_FP_ARITH =>
                     fpin_a := e_in.valid_a;
@@ -932,6 +936,7 @@ begin
                     fpin_c := e_in.valid_c;
                     v.longmask := e_in.single;
                     v.fp_rc := e_in.rc;
+                    v.is_arith := '1';
                     exec_state := arith_decode(to_integer(unsigned(e_in.insn(5 downto 1))));
                     if e_in.insn(5 downto 1) = "10110" or e_in.insn(5 downto 1) = "11010" then
                         v.is_sqrt := '1';
@@ -1193,6 +1198,11 @@ begin
 
         rsgn_op := RSGN_NOP;
 
+        if r.cycle_1 = '1' and r.is_arith = '1' then
+            v.fpscr(FPSCR_FR) := '0';
+            v.fpscr(FPSCR_FI) := '0';
+        end if;
+
         case r.state is
             when IDLE =>
                 v.invalid := '0';
@@ -1218,8 +1228,6 @@ begin
 
             when DO_NAN_INF =>
                 -- At least one floating-point operand is infinity or NaN
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 invalid_mul := '0';
 
                 if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
@@ -1285,8 +1293,6 @@ begin
 
             when DO_ZERO_DEN =>
                 -- At least one floating point operand is zero or denormalized
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 if (r.use_a = '1' and r.a.class = ZERO) or
                     (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') or
                     (r.use_c = '1' and r.c.class = ZERO) then
@@ -1559,8 +1565,6 @@ begin
                 rs_sel1 <= RSH1_B;
                 rs_con2 <= RSCON2_52;
                 rs_neg2 <= '1';
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 if r.b.exponent >= to_signed(52, EXP_BITS) then
                     -- integer already, no rounding required
                     arith_done := '1';
@@ -1577,8 +1581,6 @@ begin
                 rs_sel1 <= RSH1_B;
                 rs_con2 <= RSCON2_MINEXP;
                 rs_neg2 <= '1';
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 set_x := '1';
                 if r.b.exponent < to_signed(-126, EXP_BITS) then
                     v.state := ROUND_UFLOW;
@@ -1598,8 +1600,6 @@ begin
                 re_set_result <= '1';
                 rs_sel1 <= RSH1_B;
                 rs_neg2 <= '1';
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
 
                 if r.b.exponent >= to_signed(64, EXP_BITS) or
                     (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then
@@ -1630,8 +1630,6 @@ begin
                 v.result_class := r.b.class;
                 re_con2 <= RECON2_UNIT;
                 re_set_result <= '1';
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 if r.b.class = ZERO then
                     arith_done := '1';
                 else
@@ -1648,8 +1646,6 @@ begin
                 rs_sel1 <= RSH1_B;
                 rs_neg1 <= '1';
                 rs_sel2 <= RSH2_A;
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 v.add_bsmall := r.exp_cmp;
                 v.opsel_a := AIN_B;
                 if r.exp_cmp = '0' then
@@ -1667,8 +1663,6 @@ begin
                 -- fmul[s]
                 -- r.opsel_a = AIN_A unless C is denorm and A isn't
                 v.result_class := r.a.class;
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_C;
                 re_set_result <= '1';
@@ -1685,8 +1679,6 @@ begin
             when DO_FDIV =>
                 -- r.opsel_a = AIN_A unless B is denorm and A isn't
                 v.result_class := r.a.class;
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_B;
                 re_neg2 <= '1';
@@ -1714,8 +1706,6 @@ begin
             when DO_FSQRT =>
                 -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 if r.b.negative = '1' then
@@ -1734,8 +1724,6 @@ begin
             when DO_FRE =>
                 -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 if r.b.mantissa(UNIT_BIT) = '0' then
@@ -1747,8 +1735,6 @@ begin
             when DO_FRSQRTE =>
                 -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 -- set shift to 1
@@ -1775,8 +1761,6 @@ begin
                 re_set_result <= '1';
                 -- put b.exp into shift
                 rs_sel1 <= RSH1_B;
-                v.fpscr(FPSCR_FR) := '0';
-                v.fpscr(FPSCR_FI) := '0';
                 -- Make sure A and C are normalized
                 if r.a.mantissa(UNIT_BIT) = '0' then
                     v.state := RENORM_A;

From cf866ce91080a97f6ef5666f2d47d58d15aaf7d9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 29 Feb 2024 21:39:36 +1100
Subject: [PATCH 10/24] FPU: Simplify logic for setting r.x

Since r.x is mostly set from the value in r.r and only once from
anything else (r.b.mantissa), move the check to before the input
multiplexer for the main adder, so it works on r.r rather than
whatever is selected by r.opsel_a.

For the case in DO_FRSP where we have B selected by r.opsel_a, we add
a new state so that we now get B into R and then check the low bits of
R.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 345dc6e..1a584d5 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -51,7 +51,7 @@ architecture behaviour of fpu is
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
                      DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
                      DO_FCFID, DO_FCTI,
-                     DO_FRSP, DO_FRI,
+                     DO_FRSP, DO_FRSP_2, DO_FRI,
                      DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD,
                      DO_FRE, DO_FRSQRTE,
                      DO_FSEL,
@@ -1577,6 +1577,10 @@ begin
                 v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
+                v.state := DO_FRSP_2;
+
+            when DO_FRSP_2 =>
+                -- r.opsel_a = AIN_R, r.shift = 0
                 -- set shift to exponent - -126
                 rs_sel1 <= RSH1_B;
                 rs_con2 <= RSCON2_MINEXP;
@@ -3269,6 +3273,9 @@ begin
         else
             mask := right_mask(unsigned(mshift(5 downto 0)));
         end if;
+        if (or (mask and r.r)) = '1' and set_x = '1' then
+            v.x := '1';
+        end if;
         case r.opsel_a is
             when AIN_R =>
                 in_a0 := r.r;
@@ -3279,9 +3286,6 @@ begin
             when others =>
                 in_a0 := r.c.mantissa;
         end case;
-        if (or (mask and in_a0)) = '1' and set_x = '1' then
-            v.x := '1';
-        end if;
         if opsel_ainv = '1' then
             in_a0 := not in_a0;
         end if;

From 2731384a4be19877c40fedfaeb72565d279577df Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 1 Mar 2024 22:12:46 +1100
Subject: [PATCH 11/24] FPU: Reduce misc_sel to 3 bits

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 101 +++++++++++++++++++++++++------------------------------
 1 file changed, 45 insertions(+), 56 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 1a584d5..a309705 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -200,7 +200,7 @@ architecture behaviour of fpu is
     signal r_lo_nz       : std_ulogic;
     signal r_gt_1        : std_ulogic;
     signal s_nz          : std_ulogic;
-    signal misc_sel      : std_ulogic_vector(3 downto 0);
+    signal misc_sel      : std_ulogic_vector(2 downto 0);
     signal f_to_multiply : MultiplyInputType;
     signal multiply_to_f : MultiplyOutputType;
     signal msel_1        : std_ulogic_vector(1 downto 0);
@@ -1150,7 +1150,7 @@ begin
         opsel_r <= RES_SUM;
         opsel_s <= S_ZERO;
         carry_in <= '0';
-        misc_sel <= "0000";
+        misc_sel <= "000";
         fpscr_mask := (others => '1');
         update_fx := '0';
         arith_done := '0';
@@ -1498,13 +1498,14 @@ begin
             when DO_FMRG =>
                 -- fmrgew, fmrgow
                 opsel_r <= RES_MISC;
-                misc_sel <= "01" & r.insn(8) & '0';
+                misc_sel <= "100";
                 v.writing_fpr := '1';
                 v.instr_done := '1';
 
             when DO_MFFS =>
                 v.writing_fpr := '1';
                 opsel_r <= RES_MISC;
+                misc_sel <= "011";
                 case r.insn(20 downto 16) is
                     when "00000" =>
                         -- mffs
@@ -2153,7 +2154,7 @@ begin
                 re_neg1 <= '1';
                 re_set_result <= '1';
                 opsel_r <= RES_MISC;
-                misc_sel <= "0111";
+                misc_sel <= "101";
                 -- set shift to 1
                 rs_con2 <= RSCON2_1;
                 v.state := NORMALIZE;
@@ -2173,7 +2174,7 @@ begin
 
             when RSQRT_1 =>
                 opsel_r <= RES_MISC;
-                misc_sel <= "0111";
+                misc_sel <= "101";
                 re_sel1 <= REXP1_BHALF;
                 re_neg1 <= '1';
                 re_set_result <= '1';
@@ -2186,7 +2187,7 @@ begin
                 -- also transfer B (in R) to A
                 set_a := '1';
                 opsel_r <= RES_MISC;
-                misc_sel <= "0111";
+                misc_sel <= "101";
                 msel_1 <= MUL1_B;
                 msel_2 <= MUL2_LUT;
                 f_to_multiply.valid <= '1';
@@ -2399,7 +2400,7 @@ begin
                 else
                     msb := r.r(63);
                 end if;
-                misc_sel <= '1' & r.insn(9 downto 8) & r.result_sign;
+                misc_sel <= "110";
                 if (r.insn(8) = '0' and msb /= r.result_sign) or
                     (r.insn(8) = '1' and msb /= '1') then
                     opsel_r <= RES_MISC;
@@ -2414,10 +2415,7 @@ begin
 
             when INT_OFLOW =>
                 opsel_r <= RES_MISC;
-                misc_sel <= '1' & r.insn(9 downto 8) & r.result_sign;
-                if r.b.class = NAN then
-                    misc_sel(0) <= '1';
-                end if;
+                misc_sel <= "110";
                 v.fpscr(FPSCR_VXCVI) := '1';
                 invalid := '1';
                 arith_done := '1';
@@ -2515,7 +2513,7 @@ begin
                     re_con2 <= RECON2_MAX;
                     re_set_result <= '1';
                     opsel_r <= RES_MISC;
-                    misc_sel <= "001" & r.single_prec;
+                    misc_sel <= "010";
                     arith_done := '1';
                 else
                     -- enabled overflow exception
@@ -2761,7 +2759,7 @@ begin
                 -- less than 0.5, in which case we want to use 0.5, to avoid
                 -- infinite loops in some cases.
                 opsel_r <= RES_MISC;
-                misc_sel <= "0001";
+                misc_sel <= "001";
                 if multiply_to_f.valid = '1' then
                     v.first := '1';
                     if r.count = "11" then
@@ -2774,7 +2772,7 @@ begin
                 -- Get 0.5 into R; it turns out the generated
                 -- QNaN mantissa is actually what we want
                 opsel_r <= RES_MISC;
-                misc_sel <= "0001";
+                misc_sel <= "001";
                 v.opsel_a := AIN_A;
                 -- set shift to 64
                 rs_con2 <= RSCON2_64;
@@ -3136,7 +3134,7 @@ begin
                 v.instr_done := '1';
             when IDIV_ZERO =>
                 opsel_r <= RES_MISC;
-                misc_sel <= "0101";
+                misc_sel <= "000";
                 v.xerc_result := v.xerc;
                 if r.oe = '1' then
                     v.xerc_result.ov := r.int_ovf;
@@ -3176,7 +3174,7 @@ begin
             invalid := '1';
             v.result_class := NAN;
             rsign := '0';
-            misc_sel <= "0001";
+            misc_sel <= "001";
             opsel_r <= RES_MISC;
             arith_done := '1';
         end if;
@@ -3342,50 +3340,41 @@ begin
             when others =>
                 misc := (others => '0');
                 case misc_sel is
-                    when "0000" =>
-                        misc := x"00000000" & (r.fpscr and fpscr_mask);
-                    when "0001" =>
-                        -- generated QNaN mantissa
+                    when "000" =>
+                        -- zero result, used in idiv logic
+                    when "001" =>
+                        -- generated QNaN mantissa; also used for 0.5 in idiv logic
                         misc(QNAN_BIT) := '1';
-                    when "0010" =>
-                        -- mantissa of max representable DP number
-                        misc(UNIT_BIT downto DP_LSB) := (others => '1');
-                    when "0011" =>
-                        -- mantissa of max representable SP number
+                    when "010" =>
+                        -- mantissa of max representable number, DP or SP
                         misc(UNIT_BIT downto SP_LSB) := (others => '1');
-                    when "0100" =>
-                        -- fmrgow result
-                        misc := r.a.mantissa(31 downto 0) & r.b.mantissa(31 downto 0);
-                    when "0110" =>
-                        -- fmrgew result
-                        misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32);
-                    when "0111" =>
+                        misc(SP_LSB-1 downto DP_LSB) := (others => not r.single_prec);
+                    when "011" =>
+                        -- read FPSCR
+                        misc := x"00000000" & (r.fpscr and fpscr_mask);
+                    when "100" =>
+                        -- fmrgow/fmrgew result
+                        if r.insn(8) = '0' then
+                            misc := r.a.mantissa(31 downto 0) & r.b.mantissa(31 downto 0);
+                        else
+                            misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32);
+                        end if;
+                    when "101" =>
                         misc := std_ulogic_vector(shift_left(resize(unsigned(inverse_est), 64),
                                                              UNIT_BIT - 19));
-                    when "1000" =>
-                        -- max positive result for fctiw[z]
-                        misc := x"000000007fffffff";
-                    when "1001" =>
-                        -- max negative result for fctiw[z]
-                        misc := x"ffffffff80000000";
-                    when "1010" =>
-                        -- max positive result for fctiwu[z]
-                        misc := x"00000000ffffffff";
-                    when "1011" =>
-                        -- max negative result for fctiwu[z]
-                        misc := x"0000000000000000";
-                    when "1100" =>
-                        -- max positive result for fctid[z]
-                        misc := x"7fffffffffffffff";
-                    when "1101" =>
-                        -- max negative result for fctid[z]
-                        misc := x"8000000000000000";
-                    when "1110" =>
-                        -- max positive result for fctidu[z]
-                        misc := x"ffffffffffffffff";
-                    when "1111" =>
-                        -- max negative result for fctidu[z]
-                        misc := x"0000000000000000";
+                    when "110" =>
+                        -- saturated result for fcti*; insn(9): 1=dword, insn(8): 1=unsigned
+                        if r.result_sign = '0' and r.b.class /= NAN then
+                            misc := x"000000007fffffff";  -- positive: start from max signed word
+                            misc(31) := r.insn(8) or r.insn(9);  -- unsigned or dword
+                            misc(62 downto 32) := (others => r.insn(9));  -- dword
+                            misc(63) := r.insn(8) and r.insn(9);  -- unsigned dword only
+                        elsif r.insn(8) = '0' then
+                            misc(63) := '1';  -- negative or NaN, signed: most negative value
+                            if r.insn(9) = '0' then
+                                misc(62 downto 31) := (others => '1');  -- word: 0xffffffff80000000
+                            end if;
+                        end if;  -- negative or NaN, unsigned: misc stays all zeros
                     when others =>
                 end case;
                 result <= misc;

From ba2add029af5aea4bc5d0240a7fcb17351db74a7 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 5 Mar 2024 16:46:08 +1100
Subject: [PATCH 12/24] FPU: Remove need to set opsel_a one cycle ahead

Most states set opsel_a directly to select the operand for the A input
of the main adder.  The exception is the EXC_RESULT state, which uses
r.opsel_a set by the previous cycle to indicate which input operand to
use as the result.

In order to make timing, ensure that the controls that select the
inputs to the main adder (opsel_*, etc.) don't depend on any
complicated functions of the data (such as px_nz, pcmpb_eq, pcmpb_lt,
etc.), but are as far as possible constant for each state.  There is
now a control called set_r for whether the result is written to r.r,
which enables us to avoid setting opsel_b or opsel_r conditionally in
some cases.

Also, to avoid a data-dependent setting of msel_2 in IDIV_DODIV state,
the IDIV_NR1 and IDIV_NR2 states have been reworked so that completion
of the required number of iterations is checked in IDIV_NR1 state, and
at that point, if the inverse estimate is < 0.5, we go to IDIV_USE0_5
state in order to use 0.5 as the estimate.  This means that in the
normal case, the inverse estimate is already in Y when we get to
IDIV_DODIV state.  IDIV_USE0_5 has been reworked to put R (which will
contain 0.5) into Y as the inverse estimate.  That means that
IDIV_DODIV state doesn't have any data-dependent logic to put either P
or R into Y.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 239 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 113 insertions(+), 126 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index a309705..3914a97 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -185,6 +185,7 @@ architecture behaviour of fpu is
     signal r, rin : reg_type;
 
     signal fp_result     : std_ulogic_vector(63 downto 0);
+    signal opsel_a       : std_ulogic_vector(1 downto 0);
     signal opsel_b       : std_ulogic_vector(1 downto 0);
     signal opsel_r       : std_ulogic_vector(1 downto 0);
     signal opsel_s       : std_ulogic_vector(1 downto 0);
@@ -838,6 +839,7 @@ begin
         variable set_b_mant  : std_ulogic;
         variable set_c       : std_ulogic;
         variable set_y       : std_ulogic;
+        variable set_r       : std_ulogic;
         variable set_s       : std_ulogic;
         variable qnan_result : std_ulogic;
         variable invalid_mul : std_ulogic;
@@ -1143,6 +1145,7 @@ begin
         v.first := '0';
         v.doing_ftdiv := "00";
         v.opsel_a := AIN_R;
+        opsel_a <= AIN_R;
         opsel_ainv <= '0';
         opsel_mask <= '0';
         opsel_b <= BIN_ZERO;
@@ -1166,6 +1169,7 @@ begin
         set_b := '0';
         set_b_mant := '0';
         set_c := '0';
+        set_r := '1';
         set_s := '0';
         f_to_multiply.is_signed <= '0';
         f_to_multiply.valid <= '0';
@@ -1207,12 +1211,7 @@ begin
             when IDLE =>
                 v.invalid := '0';
                 if e_in.valid = '1' then
-                    v.opsel_a := AIN_B;
                     v.busy := '1';
-                    if e_in.op = OP_FP_ARITH and e_in.valid_a = '1' and
-                        (e_in.valid_b = '0' or e_in.valid_c = '0') then
-                        v.opsel_a := AIN_A;
-                    end if;
                     v.exec_state := exec_state;
                     if is_nan_inf = '1' then
                         v.state := DO_NAN_INF;
@@ -1293,6 +1292,11 @@ begin
 
             when DO_ZERO_DEN =>
                 -- At least one floating point operand is zero or denormalized
+                if r.is_addition = '1' then
+                    opsel_a <= AIN_A;
+                else
+                    opsel_a <= AIN_B;
+                end if;
                 if (r.use_a = '1' and r.a.class = ZERO) or
                     (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') or
                     (r.use_c = '1' and r.c.class = ZERO) then
@@ -1320,7 +1324,7 @@ begin
                     else
                         -- B is zero, other operands are finite
                         if r.int_result = '1' then
-                            -- fcti*, r.opsel_a = AIN_B
+                            -- fcti*
                             arith_done := '1';
                         elsif r.is_inverse = '1' then
                             -- fdiv, fre, frsqrte
@@ -1328,7 +1332,7 @@ begin
                             zero_divide := '1';
                             arith_done := '1';
                         elsif r.is_addition = '1' then
-                            -- fadd, r.opsel_a = AIN_A
+                            -- fadd, fsub
                             v.result_class := FINITE;
                             re_sel1 <= REXP1_A;
                             re_set_result <= '1';
@@ -1342,21 +1346,8 @@ begin
 
                 else
                     -- some operand is denorm, and/or it's fmadd/fmsub with B=0
-                    -- input selection for denorm cases
                     -- A and C are non-zero if present,
                     -- B is non-zero if present except for multiply-add
-                    if r.a.zeroexp = '1' and (r.is_multiply or r.is_inverse) = '1' then
-                        v.opsel_a := AIN_A;
-                    elsif r.b.zeroexp = '1' and (r.is_inverse or r.is_sqrt) = '1' then
-                        v.opsel_a := AIN_B;
-                    elsif r.c.zeroexp = '1' then
-                        v.opsel_a := AIN_C;
-                    else
-                        v.opsel_a := AIN_B;
-                        if r.use_a = '1' and (r.use_b = '0' or r.use_c = '0' or r.b.class = ZERO) then
-                            v.opsel_a := AIN_A;
-                        end if;
-                    end if;
                     if r.is_multiply = '1' and r.b.class = ZERO then
                         -- This will trigger for fmul as well as fmadd/sub, but
                         -- it doesn't matter since r.is_subtract = 0 for fmul.
@@ -1389,7 +1380,7 @@ begin
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or
-                    (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then
+                    (r.b.class = FINITE and r.b.denorm = '1') then
                     v.cr_result(2) := '1';
                 end if;
                 if r.a.class = NAN or r.a.class = INFINITY or
@@ -1408,7 +1399,7 @@ begin
                 v.instr_done := '1';
                 v.cr_result := "0000";
                 if r.b.class = ZERO or r.b.class = INFINITY or
-                    (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then
+                    (r.b.class = FINITE and r.b.denorm = '1') then
                     v.cr_result(2) := '1';
                 end if;
                 if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO
@@ -1418,7 +1409,7 @@ begin
 
             when DO_FCMP =>
                 -- fcmp[uo]
-                -- r.opsel_a = AIN_B
+                opsel_a <= AIN_B;
                 v.instr_done := '1';
                 update_fx := '1';
                 re_sel2 <= REXP2_B;
@@ -1467,7 +1458,6 @@ begin
                     -- Prepare to subtract mantissas, put B in R
                     v.cr_result := "0000";
                     v.instr_done := '0';
-                    v.opsel_a := AIN_A;
                     v.state := CMP_1;
                 end if;
                 v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;
@@ -1550,7 +1540,7 @@ begin
                 v.instr_done := '1';
 
             when DO_FMR =>
-                -- r.opsel_a = AIN_B
+                opsel_a <= AIN_B;
                 v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
@@ -1558,7 +1548,7 @@ begin
                 v.instr_done := '1';
 
             when DO_FRI =>    -- fri[nzpm]
-                -- r.opsel_a = AIN_B
+                opsel_a <= AIN_B;
                 v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
@@ -1574,14 +1564,15 @@ begin
                 end if;
 
             when DO_FRSP =>
-                -- r.opsel_a = AIN_B, r.shift = 0
+                -- r.shift = 0
+                opsel_a <= AIN_B;
                 v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 v.state := DO_FRSP_2;
 
             when DO_FRSP_2 =>
-                -- r.opsel_a = AIN_R, r.shift = 0
+                -- r.shift = 0
                 -- set shift to exponent - -126
                 rs_sel1 <= RSH1_B;
                 rs_con2 <= RSCON2_MINEXP;
@@ -1599,7 +1590,7 @@ begin
                 -- instr bit 9: 1=dword 0=word
                 -- instr bit 8: 1=unsigned 0=signed
                 -- instr bit 1: 1=round to zero 0=use fpscr[RN]
-                -- r.opsel_a = AIN_B
+                opsel_a <= AIN_B;
                 v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
@@ -1626,7 +1617,7 @@ begin
                 end if;
 
             when DO_FCFID =>
-                -- r.opsel_a = AIN_B
+                opsel_a <= AIN_B;
                 if r.insn(8) = '0' and r.b.negative = '1' then
                     -- fcfid[s] with negative operand, set R = -B
                     opsel_ainv <= '1';
@@ -1643,7 +1634,7 @@ begin
 
             when DO_FADD =>
                 -- fadd[s] and fsub[s]
-                -- r.opsel_a = AIN_A
+                opsel_a <= AIN_A;
                 v.result_class := r.a.class;
                 re_sel1 <= REXP1_A;
                 re_set_result <= '1';
@@ -1652,7 +1643,6 @@ begin
                 rs_neg1 <= '1';
                 rs_sel2 <= RSH2_A;
                 v.add_bsmall := r.exp_cmp;
-                v.opsel_a := AIN_B;
                 if r.exp_cmp = '0' then
                     if r.a.exponent = r.b.exponent then
                         v.state := ADD_2;
@@ -1666,15 +1656,16 @@ begin
 
             when DO_FMUL =>
                 -- fmul[s]
-                -- r.opsel_a = AIN_A unless C is denorm and A isn't
+                opsel_a <= AIN_A;
                 v.result_class := r.a.class;
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_C;
                 re_set_result <= '1';
                 -- Renormalize denorm operands
-                if r.a.mantissa(UNIT_BIT) = '0' then
+                if r.a.denorm = '1' then
                     v.state := RENORM_A;
-                elsif r.c.mantissa(UNIT_BIT) = '0' then
+                elsif r.c.denorm = '1' then
+                    opsel_a <= AIN_C;
                     v.state := RENORM_C;
                 else
                     f_to_multiply.valid <= '1';
@@ -1682,7 +1673,7 @@ begin
                 end if;
 
             when DO_FDIV =>
-                -- r.opsel_a = AIN_A unless B is denorm and A isn't
+                opsel_a <= AIN_A;
                 v.result_class := r.a.class;
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_B;
@@ -1690,9 +1681,10 @@ begin
                 re_set_result <= '1';
                 v.count := "00";
                 -- Renormalize denorm operands
-                if r.a.mantissa(UNIT_BIT) = '0' then
+                if r.a.denorm = '1' then
                     v.state := RENORM_A;
-                elsif r.b.mantissa(UNIT_BIT) = '0' then
+                elsif r.b.denorm = '1' then
+                    opsel_a <= AIN_B;
                     v.state := RENORM_B;
                 else
                     v.first := '1';
@@ -1700,23 +1692,23 @@ begin
                 end if;
 
             when DO_FSEL =>
+                rsgn_op := RSGN_SEL;
                 if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then
                     v.opsel_a := AIN_C;
-                    rsgn_op := RSGN_SEL;
                 else
                     v.opsel_a := AIN_B;
                 end if;
                 v.state := EXC_RESULT;
 
             when DO_FSQRT =>
-                -- r.opsel_a = AIN_B
+                opsel_a <= AIN_B;
                 v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 if r.b.negative = '1' then
                     v.fpscr(FPSCR_VXSQRT) := '1';
                     qnan_result := '1';
-                elsif r.b.mantissa(UNIT_BIT) = '0' then
+                elsif r.b.denorm = '1' then
                     v.state := RENORM_B;
                 elsif r.b.exponent(0) = '0' then
                     v.state := SQRT_1;
@@ -1727,18 +1719,18 @@ begin
                 end if;
 
             when DO_FRE =>
-                -- r.opsel_a = AIN_B
+                opsel_a <= AIN_B;
                 v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
-                if r.b.mantissa(UNIT_BIT) = '0' then
+                if r.b.denorm = '1' then
                     v.state := RENORM_B;
                 else
                     v.state := FRE_1;
                 end if;
 
             when DO_FRSQRTE =>
-                -- r.opsel_a = AIN_B
+                opsel_a <= AIN_B;
                 v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
@@ -1747,7 +1739,7 @@ begin
                 if r.b.negative = '1' then
                     v.fpscr(FPSCR_VXSQRT) := '1';
                     qnan_result := '1';
-                elsif r.b.mantissa(UNIT_BIT) = '0' then
+                elsif r.b.denorm = '1' then
                     v.state := RENORM_B;
                 elsif r.b.exponent(0) = '0' then
                     v.state := RSQRT_1;
@@ -1757,8 +1749,7 @@ begin
 
             when DO_FMADD =>
                 -- fmadd, fmsub, fnmadd, fnmsub
-                -- r.opsel_a = AIN_A if A is denorm, else AIN_C if C is denorm,
-                -- else AIN_B
+                opsel_a <= AIN_B;
                 v.result_class := r.a.class;
                 -- put a.exp + c.exp into result_exp
                 re_sel1 <= REXP1_A;
@@ -1767,9 +1758,11 @@ begin
                 -- put b.exp into shift
                 rs_sel1 <= RSH1_B;
                 -- Make sure A and C are normalized
-                if r.a.mantissa(UNIT_BIT) = '0' then
+                if r.a.denorm = '1' then
+                    opsel_a <= AIN_A;
                     v.state := RENORM_A;
-                elsif r.c.mantissa(UNIT_BIT) = '0' then
+                elsif r.c.denorm = '1' then
+                    opsel_a <= AIN_C;
                     v.state := RENORM_C;
                 elsif r.madd_cmp = '0' then
                     -- addend is bigger, do multiply first
@@ -1785,18 +1778,13 @@ begin
             when RENORM_A =>
                 rs_norm <= '1';
                 v.state := RENORM_A2;
-                if r.use_c = '1' and r.c.denorm = '1' then
-                    v.opsel_a := AIN_C;
-                else
-                    v.opsel_a := AIN_B;
-                end if;
 
             when RENORM_A2 =>
-                -- r.opsel_a = AIN_C for fmul/fmadd, AIN_B for fdiv
                 set_a := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 if r.is_multiply = '1' then
+                    opsel_a <= AIN_C;
                     if r.c.mantissa(UNIT_BIT) = '1' then
                         if r.is_addition = '0' or r.b.class = ZERO then
                             v.first := '1';
@@ -1806,13 +1794,13 @@ begin
                             if new_exp + 1 >= r.b.exponent then
                                 v.madd_cmp := '1';
                             end if;
-                            v.opsel_a := AIN_B;
                             v.state := DO_FMADD;
                         end if;
                     else
                         v.state := RENORM_C;
                     end if;
                 else
+                    opsel_a <= AIN_B;
                     if r.b.mantissa(UNIT_BIT) = '1' then
                         v.first := '1';
                         v.state := DIV_2;
@@ -1839,7 +1827,6 @@ begin
                     re_sel2 <= REXP2_NE;
                     re_set_result <= '1';
                 end if;
-                v.opsel_a := AIN_B;
                 v.state := LOOKUP;
 
             when RENORM_C =>
@@ -1858,12 +1845,12 @@ begin
                     if new_exp + 1 >= r.b.exponent then
                         v.madd_cmp := '1';
                     end if;
-                    v.opsel_a := AIN_B;
                     v.state := DO_FMADD;
                 end if;
 
             when ADD_1 =>
                 -- transferring B to R
+                opsel_a <= AIN_B;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 -- set shift to b.exp - a.exp
@@ -1881,15 +1868,14 @@ begin
                 v.x := s_nz;
                 set_x := '1';
                 v.longmask := r.single_prec;
-                if r.add_bsmall = '1' then
-                    v.opsel_a := AIN_A;
-                else
-                    v.opsel_a := AIN_B;
-                end if;
                 v.state := ADD_2;
 
             when ADD_2 =>
-                -- r.opsel_a = AIN_A if r.add_bsmall = 1 else AIN_B
+                if r.add_bsmall = '1' then
+                    opsel_a <= AIN_A;
+                else
+                    opsel_a <= AIN_B;
+                end if;
                 opsel_b <= BIN_R;
                 opsel_binv <= r.is_subtract;
                 carry_in <= r.is_subtract and not r.x;
@@ -1931,7 +1917,7 @@ begin
                 end if;
 
             when CMP_1 =>
-                -- r.opsel_a = AIN_A
+                opsel_a <= AIN_A;
                 opsel_b <= BIN_R;
                 opsel_binv <= '1';
                 carry_in <= '1';
@@ -2033,6 +2019,8 @@ begin
 
             when FMADD_6 =>
                 -- r.shift = UNIT_BIT (or 0, but only if r is now nonzero)
+                set_r := '0';
+                opsel_r <= RES_SHIFT;
                 re_sel2 <= REXP2_NE;
                 rs_norm <= '1';
                 if (r.r(UNIT_BIT + 2) or r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then
@@ -2043,7 +2031,7 @@ begin
                     else
                         -- R is all zeroes but there are non-zero bits in S
                         -- so shift them into R and set S to 0
-                        opsel_r <= RES_SHIFT;
+                        set_r := '1';
                         re_set_result <= '1';
                         set_s := '1';
                         v.state := FINISH;
@@ -2055,10 +2043,10 @@ begin
                 end if;
 
             when LOOKUP =>
-                -- r.opsel_a = AIN_B
                 -- wait one cycle for inverse_table[B] lookup
                 -- if this is a division, compute exponent
                 -- (see comment on RENORM_B2 above)
+                opsel_a <= AIN_B;
                 if r.use_a = '1' then
                     re_sel2 <= REXP2_NE;
                     re_set_result <= '1';
@@ -2136,15 +2124,15 @@ begin
                 end if;
 
             when DIV_6 =>
-                -- r.opsel_a = AIN_R
                 -- test if remainder is 0 or >= B
+                opsel_b <= BIN_RND;
+                rbit_inc := '1';
                 if pcmpb_lt = '1' then
                     -- quotient is correct, set X if remainder non-zero
+                    set_r := '0';
                     v.x := r.p(UNIT_BIT + 2) or px_nz;
                 else
                     -- quotient needs to be incremented by 1 in R-bit position
-                    rbit_inc := '1';
-                    opsel_b <= BIN_RND;
                     v.x := not pcmpb_eq;
                 end if;
                 v.state := FINISH;
@@ -2575,6 +2563,7 @@ begin
 
             when ROUNDING_3 =>
                 -- r.shift = clz(r.r) - 9
+                opsel_r <= RES_SHIFT;
                 mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec);
                 re_sel2 <= REXP2_NE;
                 -- set shift to new_exp - min_exp (== -1022)
@@ -2582,11 +2571,11 @@ begin
                 rs_con2 <= RSCON2_MINEXP;
                 rs_neg2 <= '1';
                 if mant_nz = '0' then
+                    set_r := '0';
                     v.result_class := ZERO;
                     arith_done := '1';
                 else
                     -- Renormalize result after rounding
-                    opsel_r <= RES_SHIFT;
                     re_set_result <= '1';
                     v.denorm := exp_tiny;
                     if new_exp < to_signed(-1022, EXP_BITS) then
@@ -2605,6 +2594,7 @@ begin
 
             when EXC_RESULT =>
                 -- r.opsel_a = AIN_A, AIN_B or AIN_C according to which input is the result
+                opsel_a <= r.opsel_a;
                 case r.opsel_a is
                     when AIN_B =>
                         re_sel2 <= REXP2_B;
@@ -2620,7 +2610,7 @@ begin
                 arith_done := '1';
 
             when DO_IDIVMOD =>
-                -- r.opsel_a = AIN_B
+                opsel_a <= AIN_B;
                 if r.b.class = ZERO then
                     -- B is zero, signal overflow
                     v.int_ovf := '1';
@@ -2657,21 +2647,19 @@ begin
                 -- add the X bit onto R to round up B
                 carry_in <= r.x;
                 -- prepare to do count-leading-zeroes on A
-                v.opsel_a := AIN_A;
                 v.state := IDIV_CLZA;
             when IDIV_CLZA =>
                 set_b := '1';           -- put R back into B
-                -- r.opsel_a = AIN_A
+                opsel_a <= AIN_A;
                 if r.is_signed = '1' and r.a.negative = '1' then
                     opsel_ainv <= '1';
                     carry_in <= '1';
                 end if;
                 re_con2 <= RECON2_UNIT;
                 re_set_result <= '1';
-                v.opsel_a := AIN_C;
                 v.state := IDIV_CLZA2;
             when IDIV_CLZA2 =>
-                -- r.opsel_a = AIN_C
+                opsel_a <= AIN_C;
                 rs_norm <= '1';
                 -- write the dividend back into A in case we negated it
                 set_a_mant := '1';
@@ -2720,6 +2708,12 @@ begin
                 msel_inv <= '1';
                 msel_2 <= MUL2_LUT;
                 set_y := '1';
+                -- Get 0.5 into R in case the inverse estimate turns out to be
+                -- less than 0.5, in which case we want to use 0.5, to avoid
+                -- infinite loops in some cases.
+                -- It turns out the generated QNaN mantissa is actually what we want
+                opsel_r <= RES_MISC;
+                misc_sel <= "001";
                 if r.b.mantissa(UNIT_BIT + 1) = '1' then
                     -- rounding up of the mantissa caused overflow, meaning the
                     -- normalized B is 2.0.  Since this is outside the range
@@ -2740,10 +2734,22 @@ begin
                 msel_2 <= MUL2_P;
                 set_y := r.first;
                 pshift := '1';
-                f_to_multiply.valid <= r.first;
+                -- set shift to 64
+                rs_con2 <= RSCON2_64;
+                if r.first = '1' then
+                    if r.count = "11" then
+                        if r.p(UNIT_BIT) = '0' and r.p(UNIT_BIT - 1) = '0' then
+                            -- inverse estimate is < 0.5, so use 0.5
+                            v.state := IDIV_USE0_5;
+                        else
+                            v.state := IDIV_DODIV;
+                        end if;
+                    else
+                        f_to_multiply.valid <= r.first;
+                    end if;
+                end if;
                 if multiply_to_f.valid = '1' then
                     v.first := '1';
-                    v.count := r.count + 1;
                     v.state := IDIV_NR2;
                 end if;
             when IDIV_NR2 =>
@@ -2752,42 +2758,25 @@ begin
                 msel_2 <= MUL2_P;
                 f_to_multiply.valid <= r.first;
                 pshift := '1';
-                v.opsel_a := AIN_A;
-                -- set shift to 64
-                rs_con2 <= RSCON2_64;
-                -- Get 0.5 into R in case the inverse estimate turns out to be
-                -- less than 0.5, in which case we want to use 0.5, to avoid
-                -- infinite loops in some cases.
-                opsel_r <= RES_MISC;
-                misc_sel <= "001";
+                if r.first = '1' then
+                    v.count := r.count + 1;
+                end if;
                 if multiply_to_f.valid = '1' then
                     v.first := '1';
-                    if r.count = "11" then
-                        v.state := IDIV_DODIV;
-                    else
-                        v.state := IDIV_NR1;
-                    end if;
+                    v.state := IDIV_NR1;
                 end if;
             when IDIV_USE0_5 =>
-                -- Get 0.5 into R; it turns out the generated
-                -- QNaN mantissa is actually what we want
-                opsel_r <= RES_MISC;
-                misc_sel <= "001";
-                v.opsel_a := AIN_A;
+                -- Put the 0.5 which is in R into Y as the inverse estimate
+                set_y := '1';
+                msel_2 <= MUL2_R;
                 -- set shift to 64
                 rs_con2 <= RSCON2_64;
                 v.state := IDIV_DODIV;
             when IDIV_DODIV =>
-                -- r.opsel_a = AIN_A
                 -- r.shift = 64
-                -- inverse estimate is in P or in R; copy it to Y
-                if r.b.mantissa(UNIT_BIT + 1) = '1' or
-                    (r.p(UNIT_BIT) = '0' and r.p(UNIT_BIT - 1) = '0') then
-                    msel_2 <= MUL2_R;
-                else
-                    msel_2 <= MUL2_P;
-                end if;
-                set_y := '1';
+                -- inverse estimate is in Y
+                -- put A (dividend) into R
+                opsel_a <= AIN_A;
                 -- shift_res is 0 because r.shift = 64;
                 -- put that into B, which now holds the quotient
                 set_b_mant := '1';
@@ -2809,7 +2798,6 @@ begin
                 else
                     -- handle top bit of quotient specially
                     -- for this we need the divisor left-justified in B
-                    v.opsel_a := AIN_C;
                     v.state := IDIV_EXT_TBH;
                 end if;
             when IDIV_SH32 =>
@@ -2864,7 +2852,8 @@ begin
                 msel_2 <= MUL2_P;
                 v.inc_quot := not pcmpc_lt and not r.divmod;
                 if r.divmod = '0' then
-                    v.opsel_a := AIN_B;
+                    -- get B into R for IDIV_DIVADJ state
+                    opsel_a <= AIN_B;
                 end if;
                 -- set shift to UNIT_BIT (== 56)
                 rs_con2 <= RSCON2_UNIT;
@@ -2894,12 +2883,11 @@ begin
                 -- r.shift = - b.exponent
                 -- shift the quotient estimate right by b.exponent bits
                 opsel_r <= RES_SHIFT;
-                v.opsel_a := AIN_B;
                 v.first := '1';
                 v.state := IDIV_DIV7;
             when IDIV_DIV7 =>
-                -- r.opsel_a = AIN_B
                 -- add shifted quotient delta onto the total quotient
+                opsel_a <= AIN_B;
                 opsel_b <= BIN_R;
                 v.first := '1';
                 v.state := IDIV_DIV8;
@@ -2923,12 +2911,11 @@ begin
                 msel_1 <= MUL1_Y;
                 msel_2 <= MUL2_P;
                 v.inc_quot := not pcmpc_lt and not r.divmod;
-                if r.divmod = '0' then
-                    v.opsel_a := AIN_B;
-                end if;
                 -- set shift to UNIT_BIT (== 56)
                 rs_con2 <= RSCON2_UNIT;
                 if r.divmod = '0' then
+                    -- get B into R for IDIV_DIVADJ state
+                    opsel_a <= AIN_B;
                     v.state := IDIV_DIVADJ;
                 elsif pcmpc_eq = '1' then
                     v.state := IDIV_ZERO;
@@ -2936,16 +2923,17 @@ begin
                     v.state := IDIV_MODADJ;
                 end if;
             when IDIV_EXT_TBH =>
-                -- r.opsel_a = AIN_C; get divisor into R and prepare to shift left
+                -- get divisor into R and prepare to shift left
                 -- set shift to 63 - b.exp
+                opsel_a <= AIN_C;
                 rs_sel1 <= RSH1_B;
                 rs_neg1 <= '1';
                 rs_con2 <= RSCON2_63;
-                v.opsel_a := AIN_A;
                 v.state := IDIV_EXT_TBH2;
             when IDIV_EXT_TBH2 =>
-                -- r.opsel_a = AIN_A; divisor is in R
+                -- divisor is in R
                 -- r.shift = 63 - b.exponent; shift and put into B
+                opsel_a <= AIN_A;
                 set_b_mant := '1';
                 -- set shift to 64 - UNIT_BIT (== 8)
                 rs_con2 <= RSCON2_64_UNIT;
@@ -2966,13 +2954,13 @@ begin
                 -- r.shift = 64 - B.exponent, so is at least 1
                 opsel_r <= RES_SHIFT;
                 -- top bit of A gets lost in the shift, so handle it specially
-                v.opsel_a := AIN_B;
                 -- set shift to 63
                 rs_con2 <= RSCON2_63;
                 v.state := IDIV_EXT_TBH5;
             when IDIV_EXT_TBH5 =>
-                -- r.opsel_a = AIN_B, r.shift = 63
+                -- r.shift = 63
                 -- shifted dividend is in R, subtract left-justified divisor
+                opsel_a <= AIN_B;
                 opsel_b <= BIN_R;
                 opsel_ainv <= '1';
                 carry_in <= '1';
@@ -3004,15 +2992,14 @@ begin
                 msel_2 <= MUL2_R;
                 f_to_multiply.valid <= r.first;
                 pshift := '1';
-                v.opsel_a := AIN_B;
                 opsel_r <= RES_MULT;
                 if multiply_to_f.valid = '1' then
                     v.first := '1';
                     v.state := IDIV_EXTDIV3;
                 end if;
             when IDIV_EXTDIV3 =>
-                -- r.opsel_a = AIN_B
                 -- delta quotient is in R; add it to B
+                opsel_a <= AIN_B;
                 opsel_b <= BIN_R;
                 v.first := '1';
                 v.state := IDIV_EXTDIV4;
@@ -3040,12 +3027,11 @@ begin
                 opsel_r <= RES_SHIFT;
                 -- test LS 64b of remainder in P against divisor in C
                 v.inc_quot := not pcmpc_lt;
-                v.opsel_a := AIN_B;
                 v.state := IDIV_EXTDIV6;
             when IDIV_EXTDIV6 =>
-                -- r.opsel_a = AIN_B
                 -- shifted remainder is in R, see if it is > 1
                 -- and compute R = R * Y if so
+                opsel_a <= AIN_B;
                 msel_1 <= MUL1_Y;
                 msel_2 <= MUL2_R;
                 pshift := '1';
@@ -3060,7 +3046,6 @@ begin
                 -- result is in R/S
                 opsel_r <= RES_SHIFT;
                 if pcmpc_lt = '0' then
-                    v.opsel_a := AIN_C;
                     v.state := IDIV_MODSUB;
                 elsif r.result_sign = '0' then
                     v.state := IDIV_DONE;
@@ -3068,8 +3053,8 @@ begin
                     v.state := IDIV_DIVADJ;
                 end if;
             when IDIV_MODSUB =>
-                -- r.opsel_a = AIN_C
                 -- Subtract divisor from remainder
+                opsel_a <= AIN_C;
                 opsel_ainv <= '1';
                 carry_in <= '1';
                 opsel_b <= BIN_R;
@@ -3079,7 +3064,7 @@ begin
                     v.state := IDIV_DIVADJ;
                 end if;
             when IDIV_DIVADJ =>
-                -- result (so far) is on the A input of the adder
+                -- result (so far) is in R
                 -- set carry to increment quotient if needed
                 -- and also negate R if the answer is negative
                 opsel_ainv <= r.result_sign;
@@ -3274,7 +3259,7 @@ begin
         if (or (mask and r.r)) = '1' and set_x = '1' then
             v.x := '1';
         end if;
-        case r.opsel_a is
+        case opsel_a is
             when AIN_R =>
                 in_a0 := r.r;
             when AIN_A =>
@@ -3379,7 +3364,9 @@ begin
                 end case;
                 result <= misc;
         end case;
-        v.r := result;
+        if set_r = '1' then
+            v.r := result;
+        end if;
         if set_s = '1' then
             case opsel_s is
                 when S_NEG =>

From 850b87c83fe5aa3345f5fde18a17cd8a813af86c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 5 Mar 2024 20:50:45 +1100
Subject: [PATCH 13/24] FPU: Get rid of r.madd_cmp and r.exp_cmp

This saves a few LUTs and simplifies the code a little.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 38 +++++++++-----------------------------
 1 file changed, 9 insertions(+), 29 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 3914a97..8a82e03 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -143,8 +143,6 @@ architecture behaviour of fpu is
         denorm       : std_ulogic;
         round_mode   : std_ulogic_vector(2 downto 0);
         is_subtract  : std_ulogic;
-        exp_cmp      : std_ulogic;
-        madd_cmp     : std_ulogic;
         add_bsmall   : std_ulogic;
         is_arith     : std_ulogic;
         is_addition  : std_ulogic;
@@ -1069,15 +1067,6 @@ begin
                 is_zero_den := adec.zeroexp or bdec.zeroexp or cdec.zeroexp;
             end if;
 
-            v.exp_cmp := '0';
-            if adec.exponent > bdec.exponent then
-                v.exp_cmp := '1';
-            end if;
-            v.madd_cmp := '0';
-            if (adec.exponent + cdec.exponent + 1) >= bdec.exponent then
-                v.madd_cmp := '1';
-            end if;
-
             v.a_hi := 8x"0";
             v.a_lo := 56x"0";
         end if;
@@ -1448,7 +1437,7 @@ begin
                     v.cr_result := r.a.negative & not r.a.negative & "00";
                 elsif r.b.class = INFINITY then
                     v.cr_result := not r.b.negative & r.b.negative & "00";
-                elsif r.exp_cmp = '1' then
+                elsif r.a.exponent > r.b.exponent then
                     -- A and B are both finite from here down
                     v.cr_result := r.a.negative & not r.a.negative & "00";
                 elsif r.a.exponent /= r.b.exponent then
@@ -1642,15 +1631,14 @@ begin
                 rs_sel1 <= RSH1_B;
                 rs_neg1 <= '1';
                 rs_sel2 <= RSH2_A;
-                v.add_bsmall := r.exp_cmp;
-                if r.exp_cmp = '0' then
-                    if r.a.exponent = r.b.exponent then
-                        v.state := ADD_2;
-                    else
-                        v.longmask := '0';
-                        v.state := ADD_SHIFT;
-                    end if;
+                v.add_bsmall := '0';
+                if r.a.exponent = r.b.exponent then
+                    v.state := ADD_2;
+                elsif r.a.exponent < r.b.exponent then
+                    v.longmask := '0';
+                    v.state := ADD_SHIFT;
                 else
+                    v.add_bsmall := '1';
                     v.state := ADD_1;
                 end if;
 
@@ -1764,7 +1752,7 @@ begin
                 elsif r.c.denorm = '1' then
                     opsel_a <= AIN_C;
                     v.state := RENORM_C;
-                elsif r.madd_cmp = '0' then
+                elsif (r.a.exponent + r.c.exponent + 1) < r.b.exponent then
                     -- addend is bigger, do multiply first
                     -- if subtracting, sign is opposite to initial estimate
                     f_to_multiply.valid <= '1';
@@ -1790,10 +1778,6 @@ begin
                             v.first := '1';
                             v.state := MULT_1;
                         else
-                            v.madd_cmp := '0';
-                            if new_exp + 1 >= r.b.exponent then
-                                v.madd_cmp := '1';
-                            end if;
                             v.state := DO_FMADD;
                         end if;
                     else
@@ -1841,10 +1825,6 @@ begin
                     v.first := '1';
                     v.state := MULT_1;
                 else
-                    v.madd_cmp := '0';
-                    if new_exp + 1 >= r.b.exponent then
-                        v.madd_cmp := '1';
-                    end if;
                     v.state := DO_FMADD;
                 end if;
 

From 8648ddb64f2a7c99ad70fe3989879ea0ba8ea4d9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 6 Mar 2024 13:45:58 +1100
Subject: [PATCH 14/24] FPU: Eliminate EXC_RESULT state

This lets us remove r.opsel_a and is a step towards moving the
handling of exceptional cases out to a separate process.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 187 +++++++++++++++++++++++++------------------------------
 1 file changed, 84 insertions(+), 103 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 8a82e03..60640af 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -79,7 +79,6 @@ architecture behaviour of fpu is
                      RENORM_A, RENORM_A2,
                      RENORM_B, RENORM_B2,
                      RENORM_C, RENORM_C2,
-                     EXC_RESULT,
                      IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3,
                      IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3,
                      IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5,
@@ -152,7 +151,6 @@ architecture behaviour of fpu is
         first        : std_ulogic;
         count        : unsigned(1 downto 0);
         doing_ftdiv  : std_ulogic_vector(1 downto 0);
-        opsel_a      : std_ulogic_vector(1 downto 0);
         use_a        : std_ulogic;
         use_b        : std_ulogic;
         use_c        : std_ulogic;
@@ -872,7 +870,6 @@ begin
         variable rsgn_op     : std_ulogic_vector(1 downto 0);
         variable is_nan_inf  : std_ulogic;
         variable is_zero_den : std_ulogic;
-        variable sign_inv    : std_ulogic;
     begin
         v := r;
         v.complete := '0';
@@ -881,7 +878,6 @@ begin
         exec_state := IDLE;
         is_nan_inf := '0';
         is_zero_den := '0';
-        sign_inv := '0';
         v.cycle_1 := e_in.valid;
 
         if r.complete = '1' or r.do_intr = '1' then
@@ -1133,7 +1129,6 @@ begin
         v.update_fprf := '0';
         v.first := '0';
         v.doing_ftdiv := "00";
-        v.opsel_a := AIN_R;
         opsel_a <= AIN_R;
         opsel_ainv <= '0';
         opsel_mask <= '0';
@@ -1216,7 +1211,13 @@ begin
 
             when DO_NAN_INF =>
                 -- At least one floating-point operand is infinity or NaN
-                invalid_mul := '0';
+                if r.a.class = NAN then
+                    opsel_a <= AIN_A;
+                elsif r.b.class = NAN then
+                    opsel_a <= AIN_B;
+                else
+                    opsel_a <= AIN_C;
+                end if;
 
                 if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
                     (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or
@@ -1225,33 +1226,34 @@ begin
                     v.fpscr(FPSCR_VXSNAN) := '1';
                     invalid := '1';
                 end if;
+
                 -- Check for this case here since VXIMZ can be set along with VXSNAN
+                invalid_mul := '0';
                 if r.is_multiply = '1' and
                     ((r.a.class = INFINITY and r.c.class = ZERO) or
                      (r.a.class = ZERO and r.c.class = INFINITY)) then
                     v.fpscr(FPSCR_VXIMZ) := '1';
-                    qnan_result := '1';
                     invalid_mul := '1';
                 end if;
 
+                if r.int_result = '1' then
+                    opsel_r <= RES_MISC;
+                    misc_sel <= "110";
+                    v.fpscr(FPSCR_VXCVI) := '1';
+                    invalid := '1';
+                end if;
+
                 if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then
-                    if r.int_result = '1' then
-                        v.state := INT_OFLOW;
-                    else
-                        if r.a.class = NAN then
-                            v.opsel_a := AIN_A;
-                        elsif r.b.class = NAN then
-                            v.opsel_a := AIN_B;
-                        elsif r.c.class = NAN then
-                            v.opsel_a := AIN_C;
-                        end if;
-                        rsgn_op := RSGN_SEL;
-                        v.state := EXC_RESULT;
-                    end if;
+                    rsgn_op := RSGN_SEL;
+                    v.result_class := NAN;
 
                 else
-                    if (r.a.class = INFINITY or r.c.class = INFINITY) and invalid_mul = '0' then
-                        sign_inv := r.is_multiply and r.is_subtract;
+                    if invalid_mul = '1' then
+                        qnan_result := '1';
+                    elsif (r.a.class = INFINITY or r.c.class = INFINITY) then
+                        if r.is_multiply = '1' then
+                            rsgn_op := RSGN_SUB;
+                        end if;
                         if r.is_subtract = '1' and r.b.class = INFINITY then
                             v.fpscr(FPSCR_VXISI) := '1';
                             qnan_result := '1';
@@ -1271,67 +1273,55 @@ begin
                     else
                         v.result_class := INFINITY;
                     end if;
-                    if r.b.class = INFINITY and r.int_result = '1' then
-                        -- fcti*
-                        v.state := INT_OFLOW;
-                    else
-                        arith_done := '1';
-                    end if;
                 end if;
+                arith_done := '1';
 
             when DO_ZERO_DEN =>
                 -- At least one floating point operand is zero or denormalized
-                if r.is_addition = '1' then
-                    opsel_a <= AIN_A;
-                else
+                if r.use_a = '1' and r.a.class = ZERO then
                     opsel_a <= AIN_B;
-                end if;
-                if (r.use_a = '1' and r.a.class = ZERO) or
-                    (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') or
-                    (r.use_c = '1' and r.c.class = ZERO) then
-                    if r.use_a = '1' and r.a.class = ZERO then
-                        if r.is_inverse = '1' then
-                            -- fdiv; result is 0 unless B=0
-                            if r.b.class = ZERO then
-                                v.fpscr(FPSCR_VXZDZ) := '1';
-                                qnan_result := '1';
-                            else
-                                v.result_class := ZERO;
-                            end if;
-                            arith_done := '1';
-                        elsif r.is_addition = '1' then
-                            -- result is +/- B
-                            v.opsel_a := AIN_B;
-                            v.state := EXC_RESULT;
-                        else
-                            v.result_class := ZERO;
-                            arith_done := '1';
-                        end if;
-                    elsif r.use_c = '1' and r.c.class = ZERO then
-                        v.opsel_a := AIN_B;
-                        v.state := EXC_RESULT;
+                    re_sel2 <= REXP2_B;
+                    re_set_result <= '1';
+                    if r.is_inverse = '1' and r.b.class = ZERO then
+                        -- fdiv with B=0
+                        v.fpscr(FPSCR_VXZDZ) := '1';
+                        qnan_result := '1';
+                    end if;
+                    if r.is_addition = '1' then
+                        -- result is +/- B
+                        v.result_class := r.b.class;
                     else
-                        -- B is zero, other operands are finite
-                        if r.int_result = '1' then
-                            -- fcti*
-                            arith_done := '1';
-                        elsif r.is_inverse = '1' then
-                            -- fdiv, fre, frsqrte
-                            v.result_class := INFINITY;
-                            zero_divide := '1';
-                            arith_done := '1';
-                        elsif r.is_addition = '1' then
-                            -- fadd, fsub
-                            v.result_class := FINITE;
-                            re_sel1 <= REXP1_A;
-                            re_set_result <= '1';
-                            arith_done := '1';
-                        else
-                            -- other things, result is zero
-                            v.result_class := ZERO;
-                            arith_done := '1';
-                        end if;
+                        v.result_class := ZERO;
                     end if;
+                    arith_done := '1';
+                elsif r.use_c = '1' and r.c.class = ZERO then
+                    -- fmul or fmadd/sub with C=0
+                    opsel_a <= AIN_B;
+                    re_sel2 <= REXP2_B;
+                    re_set_result <= '1';
+                    if r.is_addition = '1' then
+                        v.result_class := r.b.class;
+                    else
+                        v.result_class := ZERO;
+                    end if;
+                    arith_done := '1';
+                elsif (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') then
+                    -- B is zero, other operands are finite, not fmadd*
+                    opsel_a <= AIN_A;
+                    re_sel1 <= REXP1_A;
+                    re_set_result <= '1';
+                    if r.is_inverse = '1' then
+                        -- fdiv, fre, frsqrte
+                        v.result_class := INFINITY;
+                        zero_divide := '1';
+                    elsif r.is_addition = '1' then
+                        -- fadd, fsub
+                        v.result_class := FINITE;
+                    else
+                        -- other things, result is zero
+                        v.result_class := ZERO;
+                    end if;
+                    arith_done := '1';
 
                 else
                     -- some operand is denorm, and/or it's fmadd/fmsub with B=0
@@ -1682,11 +1672,16 @@ begin
             when DO_FSEL =>
                 rsgn_op := RSGN_SEL;
                 if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then
-                    v.opsel_a := AIN_C;
+                    opsel_a <= AIN_C;
+                    re_sel2 <= REXP2_C;
+                    v.result_class := r.c.class;
                 else
-                    v.opsel_a := AIN_B;
+                    opsel_a <= AIN_B;
+                    re_sel2 <= REXP2_B;
+                    v.result_class := r.b.class;
                 end if;
-                v.state := EXC_RESULT;
+                re_set_result <= '1';
+                arith_done := '1';
 
             when DO_FSQRT =>
                 opsel_a <= AIN_B;
@@ -2572,23 +2567,6 @@ begin
                 re_set_result <= '1';
                 arith_done := '1';
 
-            when EXC_RESULT =>
-                -- r.opsel_a = AIN_A, AIN_B or AIN_C according to which input is the result
-                opsel_a <= r.opsel_a;
-                case r.opsel_a is
-                    when AIN_B =>
-                        re_sel2 <= REXP2_B;
-                        v.result_class := r.b.class;
-                    when AIN_C =>
-                        re_sel2 <= REXP2_C;
-                        v.result_class := r.c.class;
-                    when others =>
-                        re_sel1 <= REXP1_A;
-                        v.result_class := r.a.class;
-                end case;
-                re_set_result <= '1';
-                arith_done := '1';
-
             when DO_IDIVMOD =>
                 opsel_a <= AIN_B;
                 if r.b.class = ZERO then
@@ -3113,25 +3091,28 @@ begin
 
         end case;
 
+        rsign := r.result_sign;
         case rsgn_op is
             when RSGN_SEL =>
-                case v.opsel_a is
+                case opsel_a is
                     when AIN_A =>
-                        v.result_sign := r.a.negative;
+                        rsign := r.a.negative;
                     when AIN_B =>
-                        v.result_sign := r.b.negative;
+                        rsign := r.b.negative;
                     when AIN_C =>
-                        v.result_sign := r.c.negative;
+                        rsign := r.c.negative;
                     when others =>
                 end case;
+                v.result_sign := rsign;
             when RSGN_SUB =>
-                v.result_sign := r.result_sign xor r.is_subtract;
+                rsign := r.result_sign xor r.is_subtract;
+                v.result_sign := rsign;
             when RSGN_INV =>
-                v.result_sign := not r.result_sign;
+                rsign := not r.result_sign;
+                v.result_sign := rsign;
             when others =>
         end case;
 
-        rsign := r.result_sign xor sign_inv;
         if zero_divide = '1' then
             v.fpscr(FPSCR_ZX) := '1';
         end if;

From 70819c4c39d6892d7d9a338a6dd5b798bbb309a7 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 7 Mar 2024 13:53:01 +1100
Subject: [PATCH 15/24] FPU: Do renormalization from DO_ZERO_DEN state

Instead of having the various DO_* states (DO_FMUL, DO_FDIV, etc.)
handle checking for denormalized inputs, we now have the DO_ZERO_DEN
state check for denormalized inputs and branch to RENORM_{A,B,C} to
handle them.

This also meant some changes were needed in how fsqrt and frsqrte
handled inputs with odd exponent.  The DO_FSQRT and DO_FRSQRTE states
were very similar and have been combined into one.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 219 ++++++++++++++++++++-----------------------------------
 1 file changed, 79 insertions(+), 140 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 60640af..ebbb564 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -53,7 +53,7 @@ architecture behaviour of fpu is
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRSP_2, DO_FRI,
                      DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD,
-                     DO_FRE, DO_FRSQRTE,
+                     DO_FRE,
                      DO_FSEL,
                      DO_IDIVMOD,
                      FRI_1,
@@ -62,10 +62,9 @@ architecture behaviour of fpu is
                      MULT_1,
                      FMADD_0, FMADD_1, FMADD_2, FMADD_3,
                      FMADD_4, FMADD_5, FMADD_6,
-                     LOOKUP,
                      DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                      FRE_1,
-                     RSQRT_1,
+                     SQRT_ODD, RSQRT_1,
                      FTDIV_1,
                      SQRT_1, SQRT_2, SQRT_3, SQRT_4,
                      SQRT_5, SQRT_6, SQRT_7, SQRT_8,
@@ -76,9 +75,8 @@ architecture behaviour of fpu is
                      ROUND_UFLOW, ROUND_OFLOW,
                      ROUNDING, ROUNDING_2, ROUNDING_3,
                      DENORM,
-                     RENORM_A, RENORM_A2,
-                     RENORM_B, RENORM_B2,
-                     RENORM_C, RENORM_C2,
+                     RENORM_A, RENORM_B, RENORM_C,
+                     RENORM_1, RENORM_2,
                      IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3,
                      IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3,
                      IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5,
@@ -174,6 +172,7 @@ architecture behaviour of fpu is
         res_int      : std_ulogic;
         exec_state   : state_t;
         cycle_1      : std_ulogic;
+        regsel       : std_ulogic_vector(1 downto 0);
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -309,7 +308,7 @@ architecture behaviour of fpu is
         2#10110# => DO_FSQRT,
         2#11000# => DO_FRE,
         2#11001# => DO_FMUL,
-        2#11010# => DO_FRSQRTE,
+        2#11010# => DO_FSQRT,
         2#11100# => DO_FMADD,
         2#11101# => DO_FMADD,
         2#11110# => DO_FMADD,
@@ -870,6 +869,7 @@ begin
         variable rsgn_op     : std_ulogic_vector(1 downto 0);
         variable is_nan_inf  : std_ulogic;
         variable is_zero_den : std_ulogic;
+        variable set_reg_ind : std_ulogic;
     begin
         v := r;
         v.complete := '0';
@@ -1170,6 +1170,7 @@ begin
         mult_mask := '0';
         rnd_b32 := '0';
         illegal := '0';
+        set_reg_ind := '0';
 
         re_sel1 <= REXP1_ZERO;
         re_sel2 <= REXP2_CON;
@@ -1208,6 +1209,7 @@ begin
                 v.x := '0';
                 v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
                 set_s := '1';
+                v.regsel := AIN_R;
 
             when DO_NAN_INF =>
                 -- At least one floating-point operand is infinity or NaN
@@ -1331,6 +1333,14 @@ begin
                         -- This will trigger for fmul as well as fmadd/sub, but
                         -- it doesn't matter since r.is_subtract = 0 for fmul.
                         rsgn_op := RSGN_SUB;
+                    end if;
+                    if r.a.denorm = '1' and (r.is_multiply = '1' or r.is_inverse = '1') then
+                        v.state := RENORM_A;
+                    elsif r.c.denorm = '1' then
+                        v.state := RENORM_C;
+                    elsif r.b.denorm = '1' and (r.is_inverse = '1' or r.is_sqrt = '1') then
+                        v.state := RENORM_B;
+                    elsif r.is_multiply = '1' and r.b.class = ZERO then
                         v.state := DO_FMUL;
                     else
                         v.state := r.exec_state;
@@ -1639,16 +1649,8 @@ begin
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_C;
                 re_set_result <= '1';
-                -- Renormalize denorm operands
-                if r.a.denorm = '1' then
-                    v.state := RENORM_A;
-                elsif r.c.denorm = '1' then
-                    opsel_a <= AIN_C;
-                    v.state := RENORM_C;
-                else
-                    f_to_multiply.valid <= '1';
-                    v.state := MULT_1;
-                end if;
+                f_to_multiply.valid <= '1';
+                v.state := MULT_1;
 
             when DO_FDIV =>
                 opsel_a <= AIN_A;
@@ -1658,16 +1660,8 @@ begin
                 re_neg2 <= '1';
                 re_set_result <= '1';
                 v.count := "00";
-                -- Renormalize denorm operands
-                if r.a.denorm = '1' then
-                    v.state := RENORM_A;
-                elsif r.b.denorm = '1' then
-                    opsel_a <= AIN_B;
-                    v.state := RENORM_B;
-                else
-                    v.first := '1';
-                    v.state := DIV_2;
-                end if;
+                v.first := '1';
+                v.state := DIV_2;
 
             when DO_FSEL =>
                 rsgn_op := RSGN_SEL;
@@ -1691,14 +1685,13 @@ begin
                 if r.b.negative = '1' then
                     v.fpscr(FPSCR_VXSQRT) := '1';
                     qnan_result := '1';
-                elsif r.b.denorm = '1' then
-                    v.state := RENORM_B;
-                elsif r.b.exponent(0) = '0' then
+                end if;
+                if r.b.exponent(0) = '1' then
+                    v.state := SQRT_ODD;
+                elsif r.is_inverse = '0' then
                     v.state := SQRT_1;
                 else
-                    -- set shift to 1
-                    rs_con2 <= RSCON2_1;
-                    v.state := RENORM_B2;
+                    v.state := RSQRT_1;
                 end if;
 
             when DO_FRE =>
@@ -1706,29 +1699,7 @@ begin
                 v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
-                if r.b.denorm = '1' then
-                    v.state := RENORM_B;
-                else
-                    v.state := FRE_1;
-                end if;
-
-            when DO_FRSQRTE =>
-                opsel_a <= AIN_B;
-                v.result_class := r.b.class;
-                re_sel2 <= REXP2_B;
-                re_set_result <= '1';
-                -- set shift to 1
-                rs_con2 <= RSCON2_1;
-                if r.b.negative = '1' then
-                    v.fpscr(FPSCR_VXSQRT) := '1';
-                    qnan_result := '1';
-                elsif r.b.denorm = '1' then
-                    v.state := RENORM_B;
-                elsif r.b.exponent(0) = '0' then
-                    v.state := RSQRT_1;
-                else
-                    v.state := RENORM_B2;
-                end if;
+                v.state := FRE_1;
 
             when DO_FMADD =>
                 -- fmadd, fmsub, fnmadd, fnmsub
@@ -1740,14 +1711,7 @@ begin
                 re_set_result <= '1';
                 -- put b.exp into shift
                 rs_sel1 <= RSH1_B;
-                -- Make sure A and C are normalized
-                if r.a.denorm = '1' then
-                    opsel_a <= AIN_A;
-                    v.state := RENORM_A;
-                elsif r.c.denorm = '1' then
-                    opsel_a <= AIN_C;
-                    v.state := RENORM_C;
-                elsif (r.a.exponent + r.c.exponent + 1) < r.b.exponent then
+                if (r.a.exponent + r.c.exponent + 1) < r.b.exponent then
                     -- addend is bigger, do multiply first
                     -- if subtracting, sign is opposite to initial estimate
                     f_to_multiply.valid <= '1';
@@ -1759,68 +1723,48 @@ begin
                 end if;
 
             when RENORM_A =>
-                rs_norm <= '1';
-                v.state := RENORM_A2;
-
-            when RENORM_A2 =>
-                set_a := '1';
-                re_sel2 <= REXP2_NE;
+                -- Get A into R
+                opsel_a <= AIN_A;
+                v.regsel := AIN_A;
+                re_sel1 <= REXP1_A;
                 re_set_result <= '1';
-                if r.is_multiply = '1' then
-                    opsel_a <= AIN_C;
-                    if r.c.mantissa(UNIT_BIT) = '1' then
-                        if r.is_addition = '0' or r.b.class = ZERO then
-                            v.first := '1';
-                            v.state := MULT_1;
-                        else
-                            v.state := DO_FMADD;
-                        end if;
-                    else
-                        v.state := RENORM_C;
-                    end if;
-                else
-                    opsel_a <= AIN_B;
-                    if r.b.mantissa(UNIT_BIT) = '1' then
-                        v.first := '1';
-                        v.state := DIV_2;
-                    else
-                        v.state := RENORM_B;
-                    end if;
-                end if;
+                v.a.denorm := '0';
+                v.state := RENORM_1;
 
             when RENORM_B =>
-                rs_norm <= '1';
-                renorm_sqrt := r.is_sqrt;
-                v.state := RENORM_B2;
-
-            when RENORM_B2 =>
-                set_b := '1';
-                -- For fdiv, we need to increase result_exp by shift rather
-                -- than decreasing it as for fre/frsqrte and fsqrt.
-                -- We do that by negating r.shift in this cycle and then
-                -- setting result_exp to new_exp in the next cycle
-                if r.use_a = '1' then
-                    rs_sel1 <= RSH1_S;
-                    rs_neg1 <= '1';
-                else
-                    re_sel2 <= REXP2_NE;
-                    re_set_result <= '1';
-                end if;
-                v.state := LOOKUP;
+                -- Get B into R
+                opsel_a <= AIN_B;
+                v.regsel := AIN_B;
+                re_sel2 <= REXP2_B;
+                re_set_result <= '1';
+                v.b.denorm := '0';
+                v.state := RENORM_1;
 
             when RENORM_C =>
+                -- Get C into R
+                opsel_a <= AIN_C;
+                v.regsel := AIN_C;
+                re_sel2 <= REXP2_C;
+                re_set_result <= '1';
+                v.c.denorm := '0';
+                v.state := RENORM_1;
+
+            when RENORM_1 =>
                 rs_norm <= '1';
-                v.state := RENORM_C2;
+                renorm_sqrt := r.is_sqrt;
+                v.state := RENORM_2;
 
-            when RENORM_C2 =>
-                set_c := '1';
-                re_sel2 <= REXP2_NE;
-                re_set_result <= '1';
-                if r.is_addition = '0' or r.b.class = ZERO then
-                    v.first := '1';
-                    v.state := MULT_1;
+            when RENORM_2 =>
+                set_reg_ind := '1';
+                if r.c.denorm = '1' then
+                    -- must be either fmul or fmadd/sub
+                    v.state := RENORM_C;
+                elsif r.b.denorm = '1' and r.is_addition = '0' then
+                    v.state := RENORM_B;
+                elsif r.is_multiply = '1' and r.b.class = ZERO then
+                    v.state := DO_FMUL;
                 else
-                    v.state := DO_FMADD;
+                    v.state := r.exec_state;
                 end if;
 
             when ADD_1 =>
@@ -2017,28 +1961,6 @@ begin
                     v.state := NORMALIZE;
                 end if;
 
-            when LOOKUP =>
-                -- wait one cycle for inverse_table[B] lookup
-                -- if this is a division, compute exponent
-                -- (see comment on RENORM_B2 above)
-                opsel_a <= AIN_B;
-                if r.use_a = '1' then
-                    re_sel2 <= REXP2_NE;
-                    re_set_result <= '1';
-                end if;
-                v.first := '1';
-                if r.is_sqrt = '1' then
-                    if r.is_inverse = '1' then
-                        v.state := RSQRT_1;
-                    else
-                        v.state := SQRT_1;
-                    end if;
-                elsif r.use_a = '1' then
-                    v.state := DIV_2;
-                else
-                    v.state := FRE_1;
-                end if;
-
             when DIV_2 =>
                 -- compute Y = inverse_table[B] (when count=0); P = 2 - B * Y
                 msel_1 <= MUL1_B;
@@ -2135,6 +2057,12 @@ begin
                     v.doing_ftdiv := "10";
                 end if;
 
+            when SQRT_ODD =>
+                -- set shift to 1
+                rs_con2 <= RSCON2_1;
+                v.regsel := AIN_B;
+                v.state := RENORM_2;
+
             when RSQRT_1 =>
                 opsel_r <= RES_MISC;
                 misc_sel <= "101";
@@ -3344,6 +3272,17 @@ begin
             end case;
         end if;
 
+        if set_reg_ind = '1' then
+            case r.regsel is
+                when AIN_A =>
+                    set_a := '1';
+                when AIN_B =>
+                    set_b := '1';
+                when AIN_C =>
+                    set_c := '1';
+                when others =>
+            end case;
+        end if;
         if set_a = '1' or set_a_exp = '1' then
             v.a.exponent := new_exp;
         end if;

From 5f0b2d433da9cda0777fbf9bbfe58641c6a2bc57 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 7 Mar 2024 21:01:53 +1100
Subject: [PATCH 16/24] FPU: Simplify calculation of result_class

For the various arithmetic operators, we only get to the DO_* states
when the inputs are finite (not zero, infinity or NaN), so we can
replace setting of v.result_class to r.a.class or r.b.class with an
overall setting of it to FINITE in cycle 1 of all those operations.

Also, integer division doesn't need to set the result class since the
result is integer.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index ebbb564..a0a52a8 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -1190,6 +1190,7 @@ begin
         if r.cycle_1 = '1' and r.is_arith = '1' then
             v.fpscr(FPSCR_FR) := '0';
             v.fpscr(FPSCR_FI) := '0';
+            v.result_class := FINITE;
         end if;
 
         case r.state is
@@ -1538,7 +1539,6 @@ begin
 
             when DO_FRI =>    -- fri[nzpm]
                 opsel_a <= AIN_B;
-                v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 -- set shift to exponent - 52
@@ -1555,7 +1555,6 @@ begin
             when DO_FRSP =>
                 -- r.shift = 0
                 opsel_a <= AIN_B;
-                v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 v.state := DO_FRSP_2;
@@ -1580,7 +1579,6 @@ begin
                 -- instr bit 8: 1=unsigned 0=signed
                 -- instr bit 1: 1=round to zero 0=use fpscr[RN]
                 opsel_a <= AIN_B;
-                v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 rs_sel1 <= RSH1_B;
@@ -1624,7 +1622,6 @@ begin
             when DO_FADD =>
                 -- fadd[s] and fsub[s]
                 opsel_a <= AIN_A;
-                v.result_class := r.a.class;
                 re_sel1 <= REXP1_A;
                 re_set_result <= '1';
                 -- set shift to a.exp - b.exp
@@ -1645,7 +1642,6 @@ begin
             when DO_FMUL =>
                 -- fmul[s]
                 opsel_a <= AIN_A;
-                v.result_class := r.a.class;
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_C;
                 re_set_result <= '1';
@@ -1654,7 +1650,6 @@ begin
 
             when DO_FDIV =>
                 opsel_a <= AIN_A;
-                v.result_class := r.a.class;
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_B;
                 re_neg2 <= '1';
@@ -1679,7 +1674,6 @@ begin
 
             when DO_FSQRT =>
                 opsel_a <= AIN_B;
-                v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 if r.b.negative = '1' then
@@ -1696,7 +1690,6 @@ begin
 
             when DO_FRE =>
                 opsel_a <= AIN_B;
-                v.result_class := r.b.class;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 v.state := FRE_1;
@@ -1704,7 +1697,6 @@ begin
             when DO_FMADD =>
                 -- fmadd, fmsub, fnmadd, fnmsub
                 opsel_a <= AIN_B;
-                v.result_class := r.a.class;
                 -- put a.exp + c.exp into result_exp
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_C;
@@ -2511,7 +2503,6 @@ begin
                         opsel_ainv <= '1';
                         carry_in <= '1';
                     end if;
-                    v.result_class := FINITE;
                     re_con2 <= RECON2_UNIT;
                     re_set_result <= '1';
                     v.state := IDIV_NORMB;

From 0e7c11a0e4970330954dfa681c4968ef4aa53a30 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 8 Mar 2024 14:44:47 +1100
Subject: [PATCH 17/24] FPU: Move result_class logic outside of state machine

The various states choose one of four operations (including no-op) to
be done on result_class.  Some operations have side-effects on
arith_done or FPSCR.  The DO_NAN_INF and DO_ZERO_DEN states still set
result_class directly since their logic is expected to move out to a
separate process later.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 98 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 57 insertions(+), 41 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index a0a52a8..baa087f 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -296,6 +296,12 @@ architecture behaviour of fpu is
     constant RSGN_SUB : std_ulogic_vector(1 downto 0) := "10";
     constant RSGN_SEL : std_ulogic_vector(1 downto 0) := "11";
 
+    signal  rcls_op      : std_ulogic_vector(1 downto 0);
+    constant RCLS_NOP    : std_ulogic_vector(1 downto 0) := "00";
+    constant RCLS_SEL    : std_ulogic_vector(1 downto 0) := "01";
+    constant RCLS_TZERO  : std_ulogic_vector(1 downto 0) := "10";
+    constant RCLS_TINF   : std_ulogic_vector(1 downto 0) := "11";
+
     constant arith_decode : decode32 := (
         -- indexed by bits 5..1 of opcode
         2#01000# => DO_FRI,
@@ -813,7 +819,6 @@ begin
         variable arith_done  : std_ulogic;
         variable invalid     : std_ulogic;
         variable zero_divide : std_ulogic;
-        variable mant_nz     : std_ulogic;
         variable min_exp     : signed(EXP_BITS-1 downto 0);
         variable max_exp     : signed(EXP_BITS-1 downto 0);
         variable bias_exp    : signed(EXP_BITS-1 downto 0);
@@ -1186,6 +1191,7 @@ begin
         rs_norm <= '0';
 
         rsgn_op := RSGN_NOP;
+        rcls_op <= RCLS_NOP;
 
         if r.cycle_1 = '1' and r.is_arith = '1' then
             v.fpscr(FPSCR_FR) := '0';
@@ -1531,7 +1537,7 @@ begin
 
             when DO_FMR =>
                 opsel_a <= AIN_B;
-                v.result_class := r.b.class;
+                rcls_op <= RCLS_SEL;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 v.writing_fpr := '1';
@@ -1605,12 +1611,12 @@ begin
 
             when DO_FCFID =>
                 opsel_a <= AIN_B;
+                rcls_op <= RCLS_SEL;
                 if r.insn(8) = '0' and r.b.negative = '1' then
                     -- fcfid[s] with negative operand, set R = -B
                     opsel_ainv <= '1';
                     carry_in <= '1';
                 end if;
-                v.result_class := r.b.class;
                 re_con2 <= RECON2_UNIT;
                 re_set_result <= '1';
                 if r.b.class = ZERO then
@@ -1660,14 +1666,13 @@ begin
 
             when DO_FSEL =>
                 rsgn_op := RSGN_SEL;
+                rcls_op <= RCLS_SEL;
                 if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then
                     opsel_a <= AIN_C;
                     re_sel2 <= REXP2_C;
-                    v.result_class := r.c.class;
                 else
                     opsel_a <= AIN_B;
                     re_sel2 <= REXP2_B;
-                    v.result_class := r.b.class;
                 end if;
                 re_set_result <= '1';
                 arith_done := '1';
@@ -1799,6 +1804,7 @@ begin
                 -- check for overflow or negative result (can't get both)
                 -- r.shift = -1
                 re_sel2 <= REXP2_NE;
+                rcls_op <= RCLS_TZERO;
                 if r.r(63) = '1' then
                     -- result is opposite sign to expected
                     rsgn_op := RSGN_INV;
@@ -1818,10 +1824,6 @@ begin
                 elsif r.r(UNIT_BIT) = '1' then
                     set_x := '1';
                     v.state := ROUNDING;
-                elsif (r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then
-                    -- r.x must be zero at this point
-                    v.result_class := ZERO;
-                    arith_done := '1';
                 else
                     rs_norm <= '1';
                     v.state := NORMALIZE;
@@ -1934,19 +1936,15 @@ begin
                 opsel_r <= RES_SHIFT;
                 re_sel2 <= REXP2_NE;
                 rs_norm <= '1';
+                rcls_op <= RCLS_TZERO;
                 if (r.r(UNIT_BIT + 2) or r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then
-                    if s_nz = '0' then
-                        -- must be a subtraction, and r.x must be zero
-                        v.result_class := ZERO;
-                        arith_done := '1';
-                    else
-                        -- R is all zeroes but there are non-zero bits in S
-                        -- so shift them into R and set S to 0
-                        set_r := '1';
-                        re_set_result <= '1';
-                        set_s := '1';
-                        v.state := FINISH;
-                    end if;
+                    -- S = 0 case is handled by RCLS_TZERO logic, otherwise...
+                    -- R is all zeroes but there are non-zero bits in S
+                    -- so shift them into R and set S to 0
+                    set_r := '1';
+                    re_set_result <= '1';
+                    set_s := '1';
+                    v.state := FINISH;
                 elsif r.r(UNIT_BIT + 2 downto UNIT_BIT) = "001" then
                     v.state := FINISH;
                 else
@@ -2379,19 +2377,13 @@ begin
                 end if;
 
             when ROUND_OFLOW =>
+                rcls_op <= RCLS_TINF;
                 v.fpscr(FPSCR_OX) := '1';
                 if r.fpscr(FPSCR_OE) = '0' then
                     -- disabled overflow exception
                     -- result depends on rounding mode
                     v.fpscr(FPSCR_XX) := '1';
                     v.fpscr(FPSCR_FI) := '1';
-                    if r.round_mode(1 downto 0) = "00" or
-                        (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then
-                        v.result_class := INFINITY;
-                        v.fpscr(FPSCR_FR) := '1';
-                    else
-                        v.fpscr(FPSCR_FR) := '0';
-                    end if;
                     -- construct largest representable number
                     re_con2 <= RECON2_MAX;
                     re_set_result <= '1';
@@ -2459,25 +2451,20 @@ begin
             when ROUNDING_3 =>
                 -- r.shift = clz(r.r) - 9
                 opsel_r <= RES_SHIFT;
-                mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec);
                 re_sel2 <= REXP2_NE;
                 -- set shift to new_exp - min_exp (== -1022)
                 rs_sel1 <= RSH1_NE;
                 rs_con2 <= RSCON2_MINEXP;
                 rs_neg2 <= '1';
-                if mant_nz = '0' then
-                    set_r := '0';
-                    v.result_class := ZERO;
-                    arith_done := '1';
+                rcls_op <= RCLS_TZERO;
+                -- If the result is zero, that's handled below.
+                -- Renormalize result after rounding
+                re_set_result <= '1';
+                v.denorm := exp_tiny;
+                if new_exp < to_signed(-1022, EXP_BITS) then
+                    v.state := DENORM;
                 else
-                    -- Renormalize result after rounding
-                    re_set_result <= '1';
-                    v.denorm := exp_tiny;
-                    if new_exp < to_signed(-1022, EXP_BITS) then
-                        v.state := DENORM;
-                    else
-                        arith_done := '1';
-                    end if;
+                    arith_done := '1';
                 end if;
 
             when DENORM =>
@@ -3032,6 +3019,35 @@ begin
             when others =>
         end case;
 
+        case rcls_op is
+            when RCLS_SEL =>
+                case opsel_a is
+                    when AIN_A =>
+                        v.result_class := r.a.class;
+                    when AIN_B =>
+                        v.result_class := r.b.class;
+                    when AIN_C =>
+                        v.result_class := r.c.class;
+                    when others =>
+                end case;
+            when RCLS_TZERO =>
+                if or (r.r(UNIT_BIT + 2 downto 0)) = '0' and s_nz = '0' then
+                    v.result_class := ZERO;
+                    arith_done := '1';
+                end if;
+            when RCLS_TINF =>
+                if r.fpscr(FPSCR_OE) = '0' then
+                    if r.round_mode(1 downto 0) = "00" or
+                        (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then
+                        v.result_class := INFINITY;
+                        v.fpscr(FPSCR_FR) := '1';
+                    else
+                        v.fpscr(FPSCR_FR) := '0';
+                    end if;
+                end if;
+            when others =>
+        end case;
+
         if zero_divide = '1' then
             v.fpscr(FPSCR_ZX) := '1';
         end if;

From bbc485f33657168bec25df5c954d2911f94c5845 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 11 Mar 2024 12:31:58 +1100
Subject: [PATCH 18/24] FPU: Rework inputs to the main adder

With this, the A input no longer has R as an option but now takes the
rounding constants and the low-order bits of P (used as an adjustment
in the square root algorithm).  The B input has either R or zero.
Both inputs can be optionally inverted for subtraction.  The select
inputs to the multiplexers now have 3 bits in opsel_a and 1 bit in
opsel_b.

The states which need R to be set now explicitly have set_r := '1' even
though that is the default, essentially for documentation reasons.
Similarly, some states set opsel_b <= BIN_R even though that is the
default.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 243 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 177 insertions(+), 66 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index baa087f..648bbaa 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -172,7 +172,7 @@ architecture behaviour of fpu is
         res_int      : std_ulogic;
         exec_state   : state_t;
         cycle_1      : std_ulogic;
-        regsel       : std_ulogic_vector(1 downto 0);
+        regsel       : std_ulogic_vector(2 downto 0);
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -180,8 +180,8 @@ architecture behaviour of fpu is
     signal r, rin : reg_type;
 
     signal fp_result     : std_ulogic_vector(63 downto 0);
-    signal opsel_a       : std_ulogic_vector(1 downto 0);
-    signal opsel_b       : std_ulogic_vector(1 downto 0);
+    signal opsel_a       : std_ulogic_vector(2 downto 0);
+    signal opsel_b       : std_ulogic;
     signal opsel_r       : std_ulogic_vector(1 downto 0);
     signal opsel_s       : std_ulogic_vector(1 downto 0);
     signal opsel_ainv    : std_ulogic;
@@ -206,15 +206,17 @@ architecture behaviour of fpu is
     signal inverse_est   : std_ulogic_vector(18 downto 0);
 
     -- opsel values
-    constant AIN_R    : std_ulogic_vector(1 downto 0) := "00";
-    constant AIN_A    : std_ulogic_vector(1 downto 0) := "01";
-    constant AIN_B    : std_ulogic_vector(1 downto 0) := "10";
-    constant AIN_C    : std_ulogic_vector(1 downto 0) := "11";
-
-    constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
-    constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
-    constant BIN_RND  : std_ulogic_vector(1 downto 0) := "10";
-    constant BIN_PS8  : std_ulogic_vector(1 downto 0) := "11";
+    constant AIN_ZERO     : std_ulogic_vector(2 downto 0) := "000";
+    constant AIN_A        : std_ulogic_vector(2 downto 0) := "001";
+    constant AIN_B        : std_ulogic_vector(2 downto 0) := "010";
+    constant AIN_C        : std_ulogic_vector(2 downto 0) := "011";
+    constant AIN_PS8      : std_ulogic_vector(2 downto 0) := "100";
+    constant AIN_RND_B32  : std_ulogic_vector(2 downto 0) := "101";
+    constant AIN_RND_RBIT : std_ulogic_vector(2 downto 0) := "110";
+    constant AIN_RND      : std_ulogic_vector(2 downto 0) := "111";
+
+    constant BIN_ZERO  : std_ulogic := '0';
+    constant BIN_R     : std_ulogic := '1';
 
     constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
     constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
@@ -857,10 +859,8 @@ begin
         variable maddend     : std_ulogic_vector(127 downto 0);
         variable sum         : std_ulogic_vector(63 downto 0);
         variable round_inc   : std_ulogic_vector(63 downto 0);
-        variable rbit_inc    : std_ulogic;
         variable mult_mask   : std_ulogic;
         variable sign_bit    : std_ulogic;
-        variable rnd_b32     : std_ulogic;
         variable rexp_in1    : signed(EXP_BITS-1 downto 0);
         variable rexp_in2    : signed(EXP_BITS-1 downto 0);
         variable rexp_cin    : std_ulogic;
@@ -1134,10 +1134,10 @@ begin
         v.update_fprf := '0';
         v.first := '0';
         v.doing_ftdiv := "00";
-        opsel_a <= AIN_R;
+        opsel_a <= AIN_ZERO;
         opsel_ainv <= '0';
         opsel_mask <= '0';
-        opsel_b <= BIN_ZERO;
+        opsel_b <= BIN_R;
         opsel_binv <= '0';
         opsel_r <= RES_SUM;
         opsel_s <= S_ZERO;
@@ -1171,9 +1171,7 @@ begin
         renorm_sqrt := '0';
         shiftin := '0';
         shiftin0 := '0';
-        rbit_inc := '0';
         mult_mask := '0';
-        rnd_b32 := '0';
         illegal := '0';
         set_reg_ind := '0';
 
@@ -1216,7 +1214,7 @@ begin
                 v.x := '0';
                 v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
                 set_s := '1';
-                v.regsel := AIN_R;
+                v.regsel := AIN_ZERO;
 
             when DO_NAN_INF =>
                 -- At least one floating-point operand is infinity or NaN
@@ -1227,6 +1225,8 @@ begin
                 else
                     opsel_a <= AIN_C;
                 end if;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
 
                 if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
                     (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or
@@ -1287,6 +1287,8 @@ begin
 
             when DO_ZERO_DEN =>
                 -- At least one floating point operand is zero or denormalized
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 if r.use_a = '1' and r.a.class = ZERO then
                     opsel_a <= AIN_B;
                     re_sel2 <= REXP2_B;
@@ -1406,6 +1408,8 @@ begin
             when DO_FCMP =>
                 -- fcmp[uo]
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 v.instr_done := '1';
                 update_fx := '1';
                 re_sel2 <= REXP2_B;
@@ -1483,6 +1487,7 @@ begin
 
             when DO_FMRG =>
                 -- fmrgew, fmrgow
+                set_r := '1';
                 opsel_r <= RES_MISC;
                 misc_sel <= "100";
                 v.writing_fpr := '1';
@@ -1490,6 +1495,7 @@ begin
 
             when DO_MFFS =>
                 v.writing_fpr := '1';
+                set_r := '1';
                 opsel_r <= RES_MISC;
                 misc_sel <= "011";
                 case r.insn(20 downto 16) is
@@ -1537,6 +1543,8 @@ begin
 
             when DO_FMR =>
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 rcls_op <= RCLS_SEL;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
@@ -1545,6 +1553,8 @@ begin
 
             when DO_FRI =>    -- fri[nzpm]
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 -- set shift to exponent - 52
@@ -1561,17 +1571,19 @@ begin
             when DO_FRSP =>
                 -- r.shift = 0
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 v.state := DO_FRSP_2;
 
             when DO_FRSP_2 =>
                 -- r.shift = 0
-                -- set shift to exponent - -126
+                -- set shift to exponent - -126 (for ROUND_UFLOW state)
                 rs_sel1 <= RSH1_B;
                 rs_con2 <= RSCON2_MINEXP;
                 rs_neg2 <= '1';
-                set_x := '1';
+                set_x := '1';   -- uses r.r and r.shift
                 if r.b.exponent < to_signed(-126, EXP_BITS) then
                     v.state := ROUND_UFLOW;
                 elsif r.b.exponent > to_signed(127, EXP_BITS) then
@@ -1585,6 +1597,8 @@ begin
                 -- instr bit 8: 1=unsigned 0=signed
                 -- instr bit 1: 1=round to zero 0=use fpscr[RN]
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 rs_sel1 <= RSH1_B;
@@ -1611,6 +1625,8 @@ begin
 
             when DO_FCFID =>
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 rcls_op <= RCLS_SEL;
                 if r.insn(8) = '0' and r.b.negative = '1' then
                     -- fcfid[s] with negative operand, set R = -B
@@ -1628,6 +1644,8 @@ begin
             when DO_FADD =>
                 -- fadd[s] and fsub[s]
                 opsel_a <= AIN_A;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 re_sel1 <= REXP1_A;
                 re_set_result <= '1';
                 -- set shift to a.exp - b.exp
@@ -1648,6 +1666,8 @@ begin
             when DO_FMUL =>
                 -- fmul[s]
                 opsel_a <= AIN_A;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_C;
                 re_set_result <= '1';
@@ -1656,6 +1676,8 @@ begin
 
             when DO_FDIV =>
                 opsel_a <= AIN_A;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_B;
                 re_neg2 <= '1';
@@ -1674,11 +1696,15 @@ begin
                     opsel_a <= AIN_B;
                     re_sel2 <= REXP2_B;
                 end if;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 re_set_result <= '1';
                 arith_done := '1';
 
             when DO_FSQRT =>
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 if r.b.negative = '1' then
@@ -1694,7 +1720,6 @@ begin
                 end if;
 
             when DO_FRE =>
-                opsel_a <= AIN_B;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 v.state := FRE_1;
@@ -1702,6 +1727,8 @@ begin
             when DO_FMADD =>
                 -- fmadd, fmsub, fnmadd, fnmsub
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 -- put a.exp + c.exp into result_exp
                 re_sel1 <= REXP1_A;
                 re_sel2 <= REXP2_C;
@@ -1722,6 +1749,8 @@ begin
             when RENORM_A =>
                 -- Get A into R
                 opsel_a <= AIN_A;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 v.regsel := AIN_A;
                 re_sel1 <= REXP1_A;
                 re_set_result <= '1';
@@ -1731,6 +1760,8 @@ begin
             when RENORM_B =>
                 -- Get B into R
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 v.regsel := AIN_B;
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
@@ -1740,6 +1771,8 @@ begin
             when RENORM_C =>
                 -- Get C into R
                 opsel_a <= AIN_C;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 v.regsel := AIN_C;
                 re_sel2 <= REXP2_C;
                 re_set_result <= '1';
@@ -1767,6 +1800,8 @@ begin
             when ADD_1 =>
                 -- transferring B to R
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
                 -- set shift to b.exp - a.exp
@@ -1779,6 +1814,7 @@ begin
             when ADD_SHIFT =>
                 -- r.shift = - exponent difference, r.longmask = 0
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 v.x := s_nz;
@@ -1795,6 +1831,7 @@ begin
                 opsel_b <= BIN_R;
                 opsel_binv <= r.is_subtract;
                 carry_in <= r.is_subtract and not r.x;
+                set_r := '1';
                 -- set shift to -1
                 rs_con2 <= RSCON2_1;
                 rs_neg2 <= '1';
@@ -1808,12 +1845,15 @@ begin
                 if r.r(63) = '1' then
                     -- result is opposite sign to expected
                     rsgn_op := RSGN_INV;
-                    opsel_ainv <= '1';
+                    opsel_a <= AIN_ZERO;
+                    set_r := '1';
+                    opsel_binv <= '1';
                     carry_in <= '1';
                     v.state := FINISH;
                 elsif r.r(UNIT_BIT + 1) = '1' then
                     -- sum overflowed, shift right
                     opsel_r <= RES_SHIFT;
+                    set_r := '1';
                     re_set_result <= '1';
                     set_x := '1';
                     if exp_huge = '1' then
@@ -1834,6 +1874,7 @@ begin
                 opsel_b <= BIN_R;
                 opsel_binv <= '1';
                 carry_in <= '1';
+                set_r := '1';
                 v.state := CMP_2;
 
             when CMP_2 =>
@@ -1851,6 +1892,7 @@ begin
             when MULT_1 =>
                 f_to_multiply.valid <= r.first;
                 opsel_r <= RES_MULT;
+                set_r := '1';
                 if multiply_to_f.valid = '1' then
                     v.state := FINISH;
                 end if;
@@ -1867,6 +1909,7 @@ begin
                     rs_sel1 <= RSH1_S;
                 end if;
                 opsel_r <= RES_MULT;
+                set_r := '1';
                 opsel_s <= S_MULT;
                 set_s := '1';
                 if multiply_to_f.valid = '1' then
@@ -1901,6 +1944,7 @@ begin
             when FMADD_3 =>
                 -- r.shift = addend exp - product exp
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 v.first := '1';
@@ -1914,6 +1958,7 @@ begin
                 opsel_s <= S_MULT;
                 set_s := '1';
                 if multiply_to_f.valid = '1' then
+                    set_r := '1';
                     v.state := FMADD_5;
                 end if;
 
@@ -1921,8 +1966,9 @@ begin
                 -- negate R:S:X if negative
                 if r.r(63) = '1' then
                     rsgn_op := RSGN_INV;
-                    opsel_ainv <= '1';
+                    opsel_binv <= '1';
                     carry_in <= not (s_nz or r.x);
+                    set_r := '1';
                     opsel_s <= S_NEG;
                     set_s := '1';
                 end if;
@@ -1993,8 +2039,9 @@ begin
                 f_to_multiply.valid <= r.first;
                 pshift := '1';
                 mult_mask := '1';
+                opsel_r <= RES_MULT;
                 if multiply_to_f.valid = '1' then
-                    opsel_r <= RES_MULT;
+                    set_r := '1';
                     v.first := '1';
                     v.state := DIV_5;
                 end if;
@@ -2012,14 +2059,14 @@ begin
 
             when DIV_6 =>
                 -- test if remainder is 0 or >= B
-                opsel_b <= BIN_RND;
-                rbit_inc := '1';
+                opsel_a <= AIN_RND_RBIT;
                 if pcmpb_lt = '1' then
                     -- quotient is correct, set X if remainder non-zero
                     set_r := '0';
                     v.x := r.p(UNIT_BIT + 2) or px_nz;
                 else
                     -- quotient needs to be incremented by 1 in R-bit position
+                    set_r := '1';
                     v.x := not pcmpb_eq;
                 end if;
                 v.state := FINISH;
@@ -2029,6 +2076,7 @@ begin
                 re_neg1 <= '1';
                 re_set_result <= '1';
                 opsel_r <= RES_MISC;
+                set_r := '1';
                 misc_sel <= "101";
                 -- set shift to 1
                 rs_con2 <= RSCON2_1;
@@ -2056,6 +2104,7 @@ begin
             when RSQRT_1 =>
                 opsel_r <= RES_MISC;
                 misc_sel <= "101";
+                set_r := '1';
                 re_sel1 <= REXP1_BHALF;
                 re_neg1 <= '1';
                 re_set_result <= '1';
@@ -2069,6 +2118,7 @@ begin
                 set_a := '1';
                 opsel_r <= RES_MISC;
                 misc_sel <= "101";
+                set_r := '1';
                 msel_1 <= MUL1_B;
                 msel_2 <= MUL2_LUT;
                 f_to_multiply.valid <= '1';
@@ -2083,6 +2133,7 @@ begin
                 -- not expecting multiplier result yet
                 -- r.shift = -1
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 v.first := '1';
@@ -2094,9 +2145,10 @@ begin
                 set_y := r.first;
                 pshift := '1';
                 mult_mask := '1';
+                opsel_r <= RES_MULT;
                 if multiply_to_f.valid = '1' then
                     -- put result into R
-                    opsel_r <= RES_MULT;
+                    set_r := '1';
                     v.first := '1';
                     v.state := SQRT_4;
                 end if;
@@ -2139,9 +2191,11 @@ begin
                 -- wait for second multiply (should be here already)
                 pshift := '1';
                 mult_mask := '1';
+                opsel_r <= RES_MULT;
+                set_r := '1';
                 if multiply_to_f.valid = '1' then
                     -- put result into R
-                    opsel_r <= RES_MULT;
+                    set_r := '1';
                     v.first := '1';
                     v.count := r.count + 1;
                     if r.count < 2 then
@@ -2184,7 +2238,8 @@ begin
 
             when SQRT_10 =>
                 -- Add the bottom 8 bits of P, sign-extended, onto R.
-                opsel_b <= BIN_PS8;
+                opsel_a <= AIN_PS8;
+                set_r := '1';
                 re_sel1 <= REXP1_BHALF;
                 re_set_result <= '1';
                 -- set shift to 1
@@ -2208,12 +2263,14 @@ begin
 
             when SQRT_12 =>
                 -- test if remainder is 0 or >= B = 2*R + 1
+                set_r := '0';
+                carry_in <= '1';
                 if pcmpb_lt = '1' then
                     -- square root is correct, set X if remainder non-zero
                     v.x := r.p(UNIT_BIT + 2) or px_nz;
                 else
                     -- square root needs to be incremented by 1
-                    carry_in <= '1';
+                    set_r := '1';
                     v.x := not pcmpb_eq;
                 end if;
                 v.state := FINISH;
@@ -2221,6 +2278,7 @@ begin
             when INT_SHIFT =>
                 -- r.shift = b.exponent - 52
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 set_x := '1';
@@ -2232,6 +2290,7 @@ begin
             when INT_ROUND =>
                 -- r.shift = -4 (== 52 - UNIT_BIT)
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign);
@@ -2247,14 +2306,16 @@ begin
             when INT_ISHIFT =>
                 -- r.shift = b.exponent - UNIT_BIT;
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 v.state := INT_FINAL;
 
             when INT_FINAL =>
                 -- Negate if necessary, and increment for rounding if needed
-                opsel_ainv <= r.result_sign;
+                opsel_binv <= r.result_sign;
                 carry_in <= r.fpscr(FPSCR_FR) xor r.result_sign;
+                set_r := '1';
                 -- Check for possible overflows
                 case r.insn(9 downto 8) is
                     when "00" =>        -- fctiw[z]
@@ -2281,13 +2342,15 @@ begin
                 else
                     msb := r.r(63);
                 end if;
+                opsel_r <= RES_MISC;
                 misc_sel <= "110";
                 if (r.insn(8) = '0' and msb /= r.result_sign) or
                     (r.insn(8) = '1' and msb /= '1') then
-                    opsel_r <= RES_MISC;
+                    set_r := '1';
                     v.fpscr(FPSCR_VXCVI) := '1';
                     invalid := '1';
                 else
+                    set_r := '0';
                     if r.fpscr(FPSCR_FI) = '1' then
                         v.fpscr(FPSCR_XX) := '1';
                     end if;
@@ -2297,6 +2360,7 @@ begin
             when INT_OFLOW =>
                 opsel_r <= RES_MISC;
                 misc_sel <= "110";
+                set_r := '1';
                 v.fpscr(FPSCR_VXCVI) := '1';
                 invalid := '1';
                 arith_done := '1';
@@ -2304,6 +2368,7 @@ begin
             when FRI_1 =>
                 -- r.shift = b.exponent - 52
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 set_x := '1';
@@ -2335,6 +2400,7 @@ begin
                 -- Shift so we have 9 leading zeroes (we know R is non-zero)
                 -- r.shift = clz(r.r) - 7
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 -- set shift to new_exp - min_exp
@@ -2353,10 +2419,12 @@ begin
             when ROUND_UFLOW =>
                 -- r.shift = - amount by which exponent underflows
                 v.tiny := '1';
+                opsel_r <= RES_SHIFT;
+                set_r := '0';
                 if r.fpscr(FPSCR_UE) = '0' then
                     -- disabled underflow exception case
                     -- have to denormalize before rounding
-                    opsel_r <= RES_SHIFT;
+                    set_r := '1';
                     re_sel2 <= REXP2_NE;
                     re_set_result <= '1';
                     set_x := '1';
@@ -2379,16 +2447,18 @@ begin
             when ROUND_OFLOW =>
                 rcls_op <= RCLS_TINF;
                 v.fpscr(FPSCR_OX) := '1';
+                opsel_r <= RES_MISC;
+                misc_sel <= "010";
+                set_r := '0';
                 if r.fpscr(FPSCR_OE) = '0' then
                     -- disabled overflow exception
                     -- result depends on rounding mode
+                    set_r := '1';
                     v.fpscr(FPSCR_XX) := '1';
                     v.fpscr(FPSCR_FI) := '1';
                     -- construct largest representable number
                     re_con2 <= RECON2_MAX;
                     re_set_result <= '1';
-                    opsel_r <= RES_MISC;
-                    misc_sel <= "010";
                     arith_done := '1';
                 else
                     -- enabled overflow exception
@@ -2401,11 +2471,12 @@ begin
 
             when ROUNDING =>
                 opsel_mask <= '1';
+                set_r := '1';
                 round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign);
                 v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
                 if round(1) = '1' then
                     -- increment the LSB for the precision
-                    opsel_b <= BIN_RND;
+                    opsel_a <= AIN_RND;
                     -- set shift to -1
                     rs_con2 <= RSCON2_1;
                     rs_neg2 <= '1';
@@ -2432,8 +2503,10 @@ begin
                 -- r.shift = -1
                 v.x := '0';
                 re_sel2 <= REXP2_NE;
+                opsel_r <= RES_SHIFT;
+                set_r := '0';
                 if r.r(UNIT_BIT + 1) = '1' then
-                    opsel_r <= RES_SHIFT;
+                    set_r := '1';
                     re_set_result <= '1';
                     if exp_huge = '1' then
                         v.state := ROUND_OFLOW;
@@ -2451,6 +2524,7 @@ begin
             when ROUNDING_3 =>
                 -- r.shift = clz(r.r) - 9
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 -- set shift to new_exp - min_exp (== -1022)
                 rs_sel1 <= RSH1_NE;
@@ -2470,12 +2544,23 @@ begin
             when DENORM =>
                 -- r.shift = result_exp - -1022
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 arith_done := '1';
 
             when DO_IDIVMOD =>
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
+                -- take absolute value for signed division
+                if r.is_signed = '1' and r.b.negative = '1' then
+                    opsel_ainv <= '1';
+                    carry_in <= '1';
+                end if;
+                -- normalize and round up B to 8.56 format, like fcfid[u]
+                re_con2 <= RECON2_UNIT;
+                re_set_result <= '1';
                 if r.b.class = ZERO then
                     -- B is zero, signal overflow
                     v.int_ovf := '1';
@@ -2484,14 +2569,6 @@ begin
                     -- A is zero, result is zero (both for div and for mod)
                     v.state := IDIV_ZERO;
                 else
-                    -- take absolute value for signed division, and
-                    -- normalize and round up B to 8.56 format, like fcfid[u]
-                    if r.is_signed = '1' and r.b.negative = '1' then
-                        opsel_ainv <= '1';
-                        carry_in <= '1';
-                    end if;
-                    re_con2 <= RECON2_UNIT;
-                    re_set_result <= '1';
                     v.state := IDIV_NORMB;
                 end if;
             when IDIV_NORMB =>
@@ -2504,17 +2581,21 @@ begin
                 -- get B into the range [1, 2) in 8.56 format
                 set_x := '1';           -- record if any 1 bits shifted out
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 re_sel2 <= REXP2_NE;
                 re_set_result <= '1';
                 v.state := IDIV_NORMB3;
             when IDIV_NORMB3 =>
                 -- add the X bit onto R to round up B
                 carry_in <= r.x;
+                set_r := '1';
                 -- prepare to do count-leading-zeroes on A
                 v.state := IDIV_CLZA;
             when IDIV_CLZA =>
                 set_b := '1';           -- put R back into B
                 opsel_a <= AIN_A;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 if r.is_signed = '1' and r.a.negative = '1' then
                     opsel_ainv <= '1';
                     carry_in <= '1';
@@ -2523,16 +2604,17 @@ begin
                 re_set_result <= '1';
                 v.state := IDIV_CLZA2;
             when IDIV_CLZA2 =>
-                opsel_a <= AIN_C;
                 rs_norm <= '1';
                 -- write the dividend back into A in case we negated it
                 set_a_mant := '1';
                 -- while doing the count-leading-zeroes on A,
                 -- also compute A - B to tell us whether A >= B
                 -- (using the original value of B, which is now in C)
+                opsel_a <= AIN_C;
                 opsel_b <= BIN_R;
                 opsel_ainv <= '1';
                 carry_in <= '1';
+                set_r := '1';
                 v.state := IDIV_CLZA3;
             when IDIV_CLZA3 =>
                 -- save the exponent of A (but don't overwrite the mantissa)
@@ -2578,6 +2660,7 @@ begin
                 -- It turns out the generated QNaN mantissa is actually what we want
                 opsel_r <= RES_MISC;
                 misc_sel <= "001";
+                set_r := '1';
                 if r.b.mantissa(UNIT_BIT + 1) = '1' then
                     -- rounding up of the mantissa caused overflow, meaning the
                     -- normalized B is 2.0.  Since this is outside the range
@@ -2641,6 +2724,8 @@ begin
                 -- inverse estimate is in Y
                 -- put A (dividend) into R
                 opsel_a <= AIN_A;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 -- shift_res is 0 because r.shift = 64;
                 -- put that into B, which now holds the quotient
                 set_b_mant := '1';
@@ -2667,6 +2752,7 @@ begin
             when IDIV_SH32 =>
                 -- r.shift = 32, R contains the dividend
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 -- set shift to -UNIT_BIT (== -56)
                 rs_con2 <= RSCON2_UNIT;
                 rs_neg2 <= '1';
@@ -2687,12 +2773,14 @@ begin
                 rs_sel1 <= RSH1_B;
                 rs_neg1 <= '1';
                 if multiply_to_f.valid = '1' then
+                    set_r := '1';
                     v.state := IDIV_DIV2;
                 end if;
             when IDIV_DIV2 =>
                 -- r.shift = - b.exponent
                 -- shift the quotient estimate right by b.exponent bits
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 v.first := '1';
                 v.state := IDIV_DIV3;
             when IDIV_DIV3 =>
@@ -2708,6 +2796,7 @@ begin
                 opsel_s <= S_MULT;
                 set_s := '1';
                 if multiply_to_f.valid = '1' then
+                    set_r := '1';
                     v.state := IDIV_DIV4;
                 end if;
             when IDIV_DIV4 =>
@@ -2718,6 +2807,8 @@ begin
                 if r.divmod = '0' then
                     -- get B into R for IDIV_DIVADJ state
                     opsel_a <= AIN_B;
+                    opsel_b <= BIN_ZERO;
+                    set_r := '1';
                 end if;
                 -- set shift to UNIT_BIT (== 56)
                 rs_con2 <= RSCON2_UNIT;
@@ -2741,18 +2832,21 @@ begin
                 rs_sel1 <= RSH1_B;
                 rs_neg1 <= '1';
                 if multiply_to_f.valid = '1' then
+                    set_r := '1';
                     v.state := IDIV_DIV6;
                 end if;
             when IDIV_DIV6 =>
                 -- r.shift = - b.exponent
                 -- shift the quotient estimate right by b.exponent bits
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 v.first := '1';
                 v.state := IDIV_DIV7;
             when IDIV_DIV7 =>
                 -- add shifted quotient delta onto the total quotient
                 opsel_a <= AIN_B;
                 opsel_b <= BIN_R;
+                set_r := '1';
                 v.first := '1';
                 v.state := IDIV_DIV8;
             when IDIV_DIV8 =>
@@ -2768,6 +2862,7 @@ begin
                 opsel_s <= S_MULT;
                 set_s := '1';
                 if multiply_to_f.valid = '1' then
+                    set_r := '1';
                     v.state := IDIV_DIV9;
                 end if;
             when IDIV_DIV9 =>
@@ -2780,6 +2875,8 @@ begin
                 if r.divmod = '0' then
                     -- get B into R for IDIV_DIVADJ state
                     opsel_a <= AIN_B;
+                    opsel_b <= BIN_ZERO;
+                    set_r := '1';
                     v.state := IDIV_DIVADJ;
                 elsif pcmpc_eq = '1' then
                     v.state := IDIV_ZERO;
@@ -2790,6 +2887,8 @@ begin
                 -- get divisor into R and prepare to shift left
                 -- set shift to 63 - b.exp
                 opsel_a <= AIN_C;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 rs_sel1 <= RSH1_B;
                 rs_neg1 <= '1';
                 rs_con2 <= RSCON2_63;
@@ -2798,6 +2897,8 @@ begin
                 -- divisor is in R
                 -- r.shift = 63 - b.exponent; shift and put into B
                 opsel_a <= AIN_A;
+                opsel_b <= BIN_ZERO;
+                set_r := '1';
                 set_b_mant := '1';
                 -- set shift to 64 - UNIT_BIT (== 8)
                 rs_con2 <= RSCON2_64_UNIT;
@@ -2817,6 +2918,7 @@ begin
                 -- dividend (A) is in R
                 -- r.shift = 64 - B.exponent, so is at least 1
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 -- top bit of A gets lost in the shift, so handle it specially
                 -- set shift to 63
                 rs_con2 <= RSCON2_63;
@@ -2828,6 +2930,7 @@ begin
                 opsel_b <= BIN_R;
                 opsel_ainv <= '1';
                 carry_in <= '1';
+                set_r := '1';
                 -- and put 1<<63 into B as the divisor (S is still 0)
                 shiftin0 := '1';
                 set_b_mant := '1';
@@ -2848,6 +2951,7 @@ begin
                 -- dividend is in R
                 -- r.shift = 64 - B.exponent
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 v.first := '1';
                 v.state := IDIV_EXTDIV2;
             when IDIV_EXTDIV2 =>
@@ -2858,6 +2962,7 @@ begin
                 pshift := '1';
                 opsel_r <= RES_MULT;
                 if multiply_to_f.valid = '1' then
+                    set_r := '1';
                     v.first := '1';
                     v.state := IDIV_EXTDIV3;
                 end if;
@@ -2865,6 +2970,7 @@ begin
                 -- delta quotient is in R; add it to B
                 opsel_a <= AIN_B;
                 opsel_b <= BIN_R;
+                set_r := '1';
                 v.first := '1';
                 v.state := IDIV_EXTDIV4;
             when IDIV_EXTDIV4 =>
@@ -2883,12 +2989,14 @@ begin
                 rs_neg1 <= '1';
                 rs_con2 <= RSCON2_UNIT;
                 if multiply_to_f.valid = '1' then
+                    set_r := '1';
                     v.state := IDIV_EXTDIV5;
                 end if;
             when IDIV_EXTDIV5 =>
                 -- r.shift = r.b.exponent - 56
                 -- remainder is in R/S; shift it right r.b.exponent bits
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 -- test LS 64b of remainder in P against divisor in C
                 v.inc_quot := not pcmpc_lt;
                 v.state := IDIV_EXTDIV6;
@@ -2896,6 +3004,8 @@ begin
                 -- shifted remainder is in R, see if it is > 1
                 -- and compute R = R * Y if so
                 opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := '0';
                 msel_1 <= MUL1_Y;
                 msel_2 <= MUL2_R;
                 pshift := '1';
@@ -2903,12 +3013,15 @@ begin
                     f_to_multiply.valid <= '1';
                     v.state := IDIV_EXTDIV2;
                 else
+                    -- Put B (quotient) into R for IDIV_DIVADJ state
+                    set_r := '1';
                     v.state := IDIV_DIVADJ;
                 end if;
             when IDIV_MODADJ =>
                 -- r.shift = 56
                 -- result is in R/S
                 opsel_r <= RES_SHIFT;
+                set_r := '1';
                 if pcmpc_lt = '0' then
                     v.state := IDIV_MODSUB;
                 elsif r.result_sign = '0' then
@@ -2922,6 +3035,7 @@ begin
                 opsel_ainv <= '1';
                 carry_in <= '1';
                 opsel_b <= BIN_R;
+                set_r := '1';
                 if r.result_sign = '0' then
                     v.state := IDIV_DONE;
                 else
@@ -2931,11 +3045,11 @@ begin
                 -- result (so far) is in R
                 -- set carry to increment quotient if needed
                 -- and also negate R if the answer is negative
-                opsel_ainv <= r.result_sign;
+                opsel_binv <= r.result_sign;
                 carry_in <= r.inc_quot xor r.result_sign;
-                rnd_b32 := '1';
+                set_r := '1';
                 if r.divmod = '0' then
-                    opsel_b <= BIN_RND;
+                    opsel_a <= AIN_RND_B32;
                 end if;
                 if r.is_signed = '0' then
                     v.state := IDIV_DONE;
@@ -2984,6 +3098,7 @@ begin
             when IDIV_ZERO =>
                 opsel_r <= RES_MISC;
                 misc_sel <= "000";
+                set_r := '1';
                 v.xerc_result := v.xerc;
                 if r.oe = '1' then
                     v.xerc_result.ov := r.int_ovf;
@@ -3156,36 +3271,32 @@ begin
             v.x := '1';
         end if;
         case opsel_a is
-            when AIN_R =>
-                in_a0 := r.r;
             when AIN_A =>
                 in_a0 := r.a.mantissa;
             when AIN_B =>
                 in_a0 := r.b.mantissa;
-            when others =>
+            when AIN_C =>
                 in_a0 := r.c.mantissa;
+            when AIN_PS8 =>     -- 8 LSBs of P sign-extended to 64
+                in_a0 := std_ulogic_vector(resize(signed(r.p(7 downto 0)), 64));
+            when AIN_RND_B32 =>
+                in_a0 := (32 => r.result_sign and r.single_prec, others => '0');
+            when AIN_RND_RBIT =>
+                in_a0 := (DP_RBIT => '1', others => '0');
+            when AIN_RND =>
+                in_a0 := (SP_LSB => r.single_prec, DP_LSB => not r.single_prec, others => '0');
+            when others =>
+                in_a0 := (others => '0');
         end case;
         if opsel_ainv = '1' then
             in_a0 := not in_a0;
         end if;
         in_a <= in_a0;
         case opsel_b is
-            when BIN_ZERO =>
-                in_b0 := (others => '0');
             when BIN_R =>
                 in_b0 := r.r;
-            when BIN_RND =>
-                if rnd_b32 = '1' then
-                    round_inc := (32 => r.result_sign and r.single_prec, others => '0');
-                elsif rbit_inc = '0' then
-                    round_inc := (SP_LSB => r.single_prec, DP_LSB => not r.single_prec, others => '0');
-                else
-                    round_inc := (DP_RBIT => '1', others => '0');
-                end if;
-                in_b0 := round_inc;
             when others =>
-                -- BIN_PS8, 8 LSBs of P sign-extended to 64
-                in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 0)), 64));
+                in_b0 := (others => '0');
         end case;
         if opsel_binv = '1' then
             in_b0 := not in_b0;

From fcfdbc449c9f77ad20dd159373d7522248bb1c5c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 13 Mar 2024 09:45:46 +1100
Subject: [PATCH 19/24] FPU: Move condition register calculations to an
 explicit data path

Instead of calculating v.cr_result in the state machine, we now have
the state machine set a 'cr_op' variable which then controls what
computation the CR data path does to set v.cr_result.  The CR data
path also handles updating the XERC result bits for integer operations
(division and modulus).

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 269 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 153 insertions(+), 116 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 648bbaa..066f664 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -304,6 +304,13 @@ architecture behaviour of fpu is
     constant RCLS_TZERO  : std_ulogic_vector(1 downto 0) := "10";
     constant RCLS_TINF   : std_ulogic_vector(1 downto 0) := "11";
 
+    constant CROP_NONE   : std_ulogic_vector(2 downto 0) := "000";
+    constant CROP_FCMP   : std_ulogic_vector(2 downto 0) := "001";
+    constant CROP_MCRFS  : std_ulogic_vector(2 downto 0) := "010";
+    constant CROP_FTDIV  : std_ulogic_vector(2 downto 0) := "100";
+    constant CROP_FTSQRT : std_ulogic_vector(2 downto 0) := "101";
+    constant CROP_INTRES : std_ulogic_vector(2 downto 0) := "110";
+
     constant arith_decode : decode32 := (
         -- indexed by bits 5..1 of opcode
         2#01000# => DO_FRI,
@@ -875,6 +882,10 @@ begin
         variable is_nan_inf  : std_ulogic;
         variable is_zero_den : std_ulogic;
         variable set_reg_ind : std_ulogic;
+        variable cr_op       : std_ulogic_vector(2 downto 0);
+        variable cr_result   : std_ulogic_vector(3 downto 0);
+        variable set_cr      : std_ulogic;
+        variable set_fpcc    : std_ulogic;
     begin
         v := r;
         v.complete := '0';
@@ -1144,6 +1155,7 @@ begin
         carry_in <= '0';
         misc_sel <= "000";
         fpscr_mask := (others => '1');
+        cr_op := CROP_NONE;
         update_fx := '0';
         arith_done := '0';
         invalid := '0';
@@ -1160,6 +1172,8 @@ begin
         set_c := '0';
         set_r := '1';
         set_s := '0';
+        set_cr := '0';
+        set_fpcc := '0';
         f_to_multiply.is_signed <= '0';
         f_to_multiply.valid <= '0';
         msel_1 <= MUL1_A;
@@ -1361,6 +1375,8 @@ begin
                 v.instr_done := '1';
 
             when DO_MCRFS =>
+                cr_op := CROP_MCRFS;
+                set_cr := '1';
                 j := to_integer(unsigned(insn_bfa(r.insn)));
                 for i in 0 to 7 loop
                     if i = j then
@@ -1373,94 +1389,56 @@ begin
                 v.instr_done := '1';
 
             when DO_FTDIV =>
-                v.cr_result := "0000";
                 -- set result_exp to the exponent of B
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
-                if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or
-                    (r.b.class = FINITE and r.b.denorm = '1') then
-                    v.cr_result(2) := '1';
-                end if;
-                if r.a.class = NAN or r.a.class = INFINITY or
-                    r.b.class = NAN or r.b.class = ZERO or r.b.class = INFINITY or
-                    (r.a.class = FINITE and r.a.exponent <= to_signed(-970, EXP_BITS)) then
-                    v.cr_result(1) := '1';
-                    v.instr_done := '1';
-                else
+                cr_op := CROP_FTDIV;
+                if (r.a.class = ZERO or r.a.class = FINITE) and r.b.class = FINITE then
                     v.doing_ftdiv := "11";
                     v.first := '1';
                     v.state := FTDIV_1;
-                    v.instr_done := '0';
+                else
+                    set_cr := '1';
+                    v.instr_done := '1';
                 end if;
 
             when DO_FTSQRT =>
+                cr_op := CROP_FTSQRT;
+                set_cr := '1';
                 v.instr_done := '1';
-                v.cr_result := "0000";
-                if r.b.class = ZERO or r.b.class = INFINITY or
-                    (r.b.class = FINITE and r.b.denorm = '1') then
-                    v.cr_result(2) := '1';
-                end if;
-                if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO
-                    or r.b.negative = '1' or r.b.exponent <= to_signed(-970, EXP_BITS) then
-                    v.cr_result(1) := '1';
-                end if;
 
             when DO_FCMP =>
                 -- fcmp[uo]
+                -- Prepare to subtract mantissas, put B in R
                 opsel_a <= AIN_B;
                 opsel_b <= BIN_ZERO;
                 set_r := '1';
-                v.instr_done := '1';
                 update_fx := '1';
-                re_sel2 <= REXP2_B;
-                re_set_result <= '1';
-                if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
-                    (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') then
-                    -- Signalling NAN
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    if r.insn(6) = '1' and r.fpscr(FPSCR_VE) = '0' then
-                        v.fpscr(FPSCR_VXVC) := '1';
-                    end if;
-                    invalid := '1';
-                    v.cr_result := "0001";          -- unordered
-                elsif r.a.class = NAN or r.b.class = NAN then
-                    if r.insn(6) = '1' then
+                cr_op := CROP_FCMP;
+                if r.a.class = NAN or r.b.class = NAN then
+                    if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
+                        (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') then
+                        -- Signalling NAN
+                        v.fpscr(FPSCR_VXSNAN) := '1';
+                        if r.insn(6) = '1' and r.fpscr(FPSCR_VE) = '0' then
+                            v.fpscr(FPSCR_VXVC) := '1';
+                        end if;
+                        invalid := '1';
+                    elsif r.insn(6) = '1' then
                         -- fcmpo
                         v.fpscr(FPSCR_VXVC) := '1';
                         invalid := '1';
                     end if;
-                    v.cr_result := "0001";          -- unordered
-                elsif r.a.class = ZERO and r.b.class = ZERO then
-                    v.cr_result := "0010";          -- equal
-                elsif r.a.negative /= r.b.negative then
-                    v.cr_result := r.a.negative & r.b.negative & "00";
-                elsif r.a.class = ZERO then
-                    -- A and B are the same sign from here down
-                    v.cr_result := not r.b.negative & r.b.negative & "00";
-                elsif r.a.class = INFINITY then
-                    if r.b.class = INFINITY then
-                        v.cr_result := "0010";
-                    else
-                        v.cr_result := r.a.negative & not r.a.negative & "00";
-                    end if;
-                elsif r.b.class = ZERO then
-                    -- A is finite from here down
-                    v.cr_result := r.a.negative & not r.a.negative & "00";
-                elsif r.b.class = INFINITY then
-                    v.cr_result := not r.b.negative & r.b.negative & "00";
-                elsif r.a.exponent > r.b.exponent then
-                    -- A and B are both finite from here down
-                    v.cr_result := r.a.negative & not r.a.negative & "00";
-                elsif r.a.exponent /= r.b.exponent then
-                    -- A exponent is smaller than B
-                    v.cr_result := not r.a.negative & r.a.negative & "00";
-                else
-                    -- Prepare to subtract mantissas, put B in R
-                    v.cr_result := "0000";
-                    v.instr_done := '0';
+                end if;
+                if r.a.class = FINITE and r.b.class = FINITE and
+                    r.a.negative = r.b.negative and
+                    r.a.exponent = r.b.exponent then
                     v.state := CMP_1;
+                else
+                    set_cr := '1';
+                    set_fpcc := '1';
+                    v.instr_done := '1';
                 end if;
-                v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;
 
             when DO_MTFSB =>
                 -- mtfsb{0,1}
@@ -1878,15 +1856,9 @@ begin
                 v.state := CMP_2;
 
             when CMP_2 =>
-                if r.r(63) = '1' then
-                    -- A is smaller in magnitude
-                    v.cr_result := not r.a.negative & r.a.negative & "00";
-                elsif (r_hi_nz or r_lo_nz) = '0' then
-                    v.cr_result := "0010";
-                else
-                    v.cr_result := r.a.negative & not r.a.negative & "00";
-                end if;
-                v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;
+                cr_op := CROP_FCMP;
+                set_cr := '1';
+                set_fpcc := '1';
                 v.instr_done := '1';
 
             when MULT_1 =>
@@ -2086,10 +2058,10 @@ begin
                 -- We go through this state up to two times; the first sees if
                 -- B.exponent is in the range [-1021,1020], and the second tests
                 -- whether B.exp - A.exp is in the range [-1022,1020].
-                v.cr_result(1) := exp_tiny or exp_huge;
-                -- set shift to a.exp
-                rs_sel2 <= RSH2_A;
+                rs_sel2 <= RSH2_A;                -- set shift to a.exp
+                cr_op := CROP_FTDIV;
                 if exp_tiny = '1' or exp_huge = '1' or r.a.class = ZERO or r.first = '0' then
+                    set_cr := '1';
                     v.instr_done := '1';
                 else
                     v.doing_ftdiv := "10";
@@ -3057,58 +3029,26 @@ begin
                     v.state := IDIV_OVFCHK;
                 end if;
             when IDIV_OVFCHK =>
+                opsel_r <= RES_MISC;
+                misc_sel <= "000";
                 if r.single_prec = '0' then
                     sign_bit := r.r(63);
                 else
                     sign_bit := r.r(31);
                 end if;
                 v.int_ovf := sign_bit xor r.result_sign;
-                if v.int_ovf = '1' then
-                    v.state := IDIV_ZERO;
-                else
-                    v.state := IDIV_DONE;
-                end if;
+                set_r := sign_bit xor r.result_sign;
+                v.state := IDIV_DONE;
             when IDIV_DONE =>
-                v.xerc_result := v.xerc;
-                if r.oe = '1' then
-                    v.xerc_result.ov := '0';
-                    v.xerc_result.ov32 := '0';
-                    v.writing_xer := '1';
-                end if;
-                if r.m32b = '0' then
-                    v.cr_result(3) := r.r(63);
-                    v.cr_result(2 downto 1) := "00";
-                    if r.r = 64x"0" then
-                        v.cr_result(1) := '1';
-                    else
-                        v.cr_result(2) := not r.r(63);
-                    end if;
-                else
-                    v.cr_result(3) := r.r(31);
-                    v.cr_result(2 downto 1) := "00";
-                    if r.r(31 downto 0) = 32x"0" then
-                        v.cr_result(1) := '1';
-                    else
-                        v.cr_result(2) := not r.r(31);
-                    end if;
-                end if;
-                v.cr_result(0) := v.xerc.so;
+                cr_op := CROP_INTRES;
+                set_cr := '1';
                 v.writing_fpr := '1';
                 v.instr_done := '1';
             when IDIV_ZERO =>
                 opsel_r <= RES_MISC;
                 misc_sel <= "000";
                 set_r := '1';
-                v.xerc_result := v.xerc;
-                if r.oe = '1' then
-                    v.xerc_result.ov := r.int_ovf;
-                    v.xerc_result.ov32 := r.int_ovf;
-                    v.xerc_result.so := r.xerc.so or r.int_ovf;
-                    v.writing_xer := '1';
-                end if;
-                v.cr_result := "001" & v.xerc_result.so;
-                v.writing_fpr := '1';
-                v.instr_done := '1';
+                v.state := IDIV_DONE;
 
         end case;
 
@@ -3525,6 +3465,103 @@ begin
             v.shift := rsh_in1 + rsh_in2 + (rs_neg1 or rs_neg2);
         end if;
 
+        -- Condition register data path
+        cr_result := "0000";
+        case cr_op is
+            when CROP_FCMP =>
+                if r.a.class = NAN or r.b.class = NAN then
+                    cr_result := "0001";          -- unordered
+                elsif r.a.class = ZERO and r.b.class = ZERO then
+                    cr_result := "0010";          -- equal
+                elsif r.a.negative /= r.b.negative then
+                    cr_result := r.a.negative & r.b.negative & "00";
+                elsif r.a.class = INFINITY and r.b.class = INFINITY then
+                    -- A and B are the same sign from here down
+                    cr_result := "0010";
+                elsif r.a.class = ZERO then
+                    cr_result := not r.b.negative & r.b.negative & "00";
+                elsif r.a.class = INFINITY then
+                    cr_result := r.a.negative & not r.a.negative & "00";
+                elsif r.b.class = ZERO then
+                    -- A is finite from here down
+                    cr_result := r.a.negative & not r.a.negative & "00";
+                elsif r.b.class = INFINITY then
+                    cr_result := not r.b.negative & r.b.negative & "00";
+                elsif r.a.exponent > r.b.exponent then
+                    -- A and B are both finite from here down
+                    cr_result := r.a.negative & not r.a.negative & "00";
+                elsif r.a.exponent /= r.b.exponent then
+                    -- A exponent is smaller than B
+                    cr_result := not r.a.negative & r.a.negative & "00";
+                elsif r.r(63) = '1' then
+                    -- A is smaller in magnitude
+                    cr_result := not r.a.negative & r.a.negative & "00";
+                elsif (r_hi_nz or r_lo_nz) = '0' then
+                    cr_result := "0010";
+                else
+                    cr_result := r.a.negative & not r.a.negative & "00";
+                end if;
+            when CROP_MCRFS =>
+                j := to_integer(unsigned(insn_bfa(r.insn)));
+                for i in 0 to 7 loop
+                    if i = j then
+                        k := (7 - i) * 4;
+                        cr_result := r.fpscr(k + 3 downto k);
+                    end if;
+                end loop;
+            when CROP_FTDIV =>
+                if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or
+                    (r.b.class = FINITE and r.b.denorm = '1') then
+                    cr_result(2) := '1';
+                end if;
+                if r.a.class = NAN or r.a.class = INFINITY or
+                    r.b.class = NAN or r.b.class = ZERO or r.b.class = INFINITY or
+                    (r.a.class = FINITE and r.a.exponent <= to_signed(-970, EXP_BITS)) or
+                    (r.doing_ftdiv(1) = '1' and (exp_tiny or exp_huge) = '1') then
+                    cr_result(1) := '1';
+                end if;
+            when CROP_FTSQRT =>
+                if r.b.class = ZERO or r.b.class = INFINITY or
+                    (r.b.class = FINITE and r.b.denorm = '1') then
+                    cr_result(2) := '1';
+                end if;
+                if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO
+                    or r.b.negative = '1' or r.b.exponent <= to_signed(-970, EXP_BITS) then
+                    cr_result(1) := '1';
+                end if;
+            when CROP_INTRES =>
+                v.xerc_result := v.xerc;
+                if r.oe = '1' then
+                    v.xerc_result.ov := r.int_ovf;
+                    v.xerc_result.ov32 := r.int_ovf;
+                    v.xerc_result.so := r.xerc.so or r.int_ovf;
+                    v.writing_xer := '1';
+                end if;
+                if r.m32b = '0' then
+                    cr_result(3) := r.r(63);
+                    if r.r = 64x"0" then
+                        cr_result(1) := '1';
+                    else
+                        cr_result(2) := not r.r(63);
+                    end if;
+                else
+                    cr_result(3) := r.r(31);
+                    if r.r(31 downto 0) = 32x"0" then
+                        cr_result(1) := '1';
+                    else
+                        cr_result(2) := not r.r(31);
+                    end if;
+                end if;
+                cr_result(0) := v.xerc_result.so;
+            when others =>
+        end case;
+        if set_cr = '1' then
+            v.cr_result := cr_result;
+        end if;
+        if set_fpcc = '1' then
+            v.fpscr(FPSCR_FL downto FPSCR_FU) := cr_result;
+        end if;
+
         if r.update_fprf = '1' then
             v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.res_sign, r.result_class,
                                                              r.r(UNIT_BIT) and not r.denorm);

From b1bd2aa86532a6ac583502dc0060c72f861c1a3e Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 13 Mar 2024 14:05:52 +1100
Subject: [PATCH 20/24] FPU: Make set_r independent of multiply_to_f.valid

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 066f664..6ca5982 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -1924,13 +1924,13 @@ begin
 
             when FMADD_4 =>
                 msel_add <= MULADD_RS;
+                set_r := '1';
                 f_to_multiply.valid <= r.first;
                 msel_inv <= r.is_subtract;
                 opsel_r <= RES_MULT;
                 opsel_s <= S_MULT;
                 set_s := '1';
                 if multiply_to_f.valid = '1' then
-                    set_r := '1';
                     v.state := FMADD_5;
                 end if;
 
@@ -2012,8 +2012,8 @@ begin
                 pshift := '1';
                 mult_mask := '1';
                 opsel_r <= RES_MULT;
+                set_r := '1';
                 if multiply_to_f.valid = '1' then
-                    set_r := '1';
                     v.first := '1';
                     v.state := DIV_5;
                 end if;
@@ -2118,9 +2118,8 @@ begin
                 pshift := '1';
                 mult_mask := '1';
                 opsel_r <= RES_MULT;
+                set_r := '1';
                 if multiply_to_f.valid = '1' then
-                    -- put result into R
-                    set_r := '1';
                     v.first := '1';
                     v.state := SQRT_4;
                 end if;
@@ -2166,8 +2165,6 @@ begin
                 opsel_r <= RES_MULT;
                 set_r := '1';
                 if multiply_to_f.valid = '1' then
-                    -- put result into R
-                    set_r := '1';
                     v.first := '1';
                     v.count := r.count + 1;
                     if r.count < 2 then
@@ -2741,11 +2738,11 @@ begin
                 f_to_multiply.valid <= r.first;
                 pshift := '1';
                 opsel_r <= RES_MULT;
+                set_r := '1';
                 -- set shift to - b.exp
                 rs_sel1 <= RSH1_B;
                 rs_neg1 <= '1';
                 if multiply_to_f.valid = '1' then
-                    set_r := '1';
                     v.state := IDIV_DIV2;
                 end if;
             when IDIV_DIV2 =>
@@ -2765,10 +2762,10 @@ begin
                 -- store the current quotient estimate in B
                 set_b_mant := r.first;
                 opsel_r <= RES_MULT;
+                set_r := '1';
                 opsel_s <= S_MULT;
                 set_s := '1';
                 if multiply_to_f.valid = '1' then
-                    set_r := '1';
                     v.state := IDIV_DIV4;
                 end if;
             when IDIV_DIV4 =>
@@ -2800,11 +2797,11 @@ begin
             when IDIV_DIV5 =>
                 pshift := '1';
                 opsel_r <= RES_MULT;
+                set_r := '1';
                 -- set shift to - b.exp
                 rs_sel1 <= RSH1_B;
                 rs_neg1 <= '1';
                 if multiply_to_f.valid = '1' then
-                    set_r := '1';
                     v.state := IDIV_DIV6;
                 end if;
             when IDIV_DIV6 =>
@@ -2831,10 +2828,10 @@ begin
                 -- store the current quotient estimate in B
                 set_b_mant := r.first;
                 opsel_r <= RES_MULT;
+                set_r := '1';
                 opsel_s <= S_MULT;
                 set_s := '1';
                 if multiply_to_f.valid = '1' then
-                    set_r := '1';
                     v.state := IDIV_DIV9;
                 end if;
             when IDIV_DIV9 =>
@@ -2933,8 +2930,8 @@ begin
                 f_to_multiply.valid <= r.first;
                 pshift := '1';
                 opsel_r <= RES_MULT;
+                set_r := '1';
                 if multiply_to_f.valid = '1' then
-                    set_r := '1';
                     v.first := '1';
                     v.state := IDIV_EXTDIV3;
                 end if;
@@ -2954,6 +2951,7 @@ begin
                 msel_inv <= '1';
                 f_to_multiply.valid <= r.first;
                 opsel_r <= RES_MULT;
+                set_r := '1';
                 opsel_s <= S_MULT;
                 set_s := '1';
                 -- set shift to UNIT_BIT - b.exp
@@ -2961,7 +2959,6 @@ begin
                 rs_neg1 <= '1';
                 rs_con2 <= RSCON2_UNIT;
                 if multiply_to_f.valid = '1' then
-                    set_r := '1';
                     v.state := IDIV_EXTDIV5;
                 end if;
             when IDIV_EXTDIV5 =>

From b4aae8511df964dc50bfc0ec1627a05298f4b7e3 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 14 Mar 2024 20:41:59 +1100
Subject: [PATCH 21/24] FPU: Move special case handling to a separate process

This creates a new fpu_specialcases process that handles most of the
logic that was previously in the DO_NAN_INF and DO_ZERO_DEN states.
What remains of those states, i.e. the handling of denormalized
inputs, is in a new DO_SPECIAL state.  The state machine goes into
DO_SPECIAL state after IDLE for any arithmetic operation where an
input is a NaN, infinity, zero or denormalized value.  Doing this
means that the rest of the state machine won't try to start any
computation which would need to be overridden by the logic to produce
the result value selected by the fpu_specialcases process.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 360 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 205 insertions(+), 155 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 6ca5982..0698c63 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -47,7 +47,7 @@ architecture behaviour of fpu is
         mantissa : std_ulogic_vector(63 downto 0);      -- 8.56 format
     end record;
 
-    type state_t is (IDLE, DO_ILLEGAL, DO_NAN_INF, DO_ZERO_DEN,
+    type state_t is (IDLE, DO_ILLEGAL, DO_SPECIAL,
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
                      DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
                      DO_FCFID, DO_FCTI,
@@ -92,6 +92,17 @@ architecture behaviour of fpu is
     type decode32 is array(0 to 31) of state_t;
     type decode8 is array(0 to 7) of state_t;
 
+    type specialcase_t is record         -- result of the fpu_specialcases process (signal scinfo)
+        invalid       : std_ulogic;      -- invalid operation: SNaN operand, inf * 0, or qnan_result set
+        zero_divide   : std_ulogic;      -- set together with FPSCR_ZX (inverse operation with B = 0)
+        new_fpscr     : std_ulogic_vector(31 downto 0);  -- exception bits that fpu_1 ORs into the FPSCR
+        immed_result  : std_ulogic;      -- result is an input, zero, infinity or NaN
+        qnan_result   : std_ulogic;      -- forces invalid = 1 and result_class = NAN
+        result_sel    : std_ulogic_vector(2 downto 0);   -- AIN_* code driven onto opsel_a; default AIN_ZERO
+        result_class  : fp_number_class; -- class that fpu_1 copies into v.result_class
+        rsgn_op       : std_ulogic_vector(1 downto 0);   -- RSGN_* operation for the result sign
+    end record;
+
     type reg_type is record
         state        : state_t;
         busy         : std_ulogic;
@@ -172,7 +183,9 @@ architecture behaviour of fpu is
         res_int      : std_ulogic;
         exec_state   : state_t;
         cycle_1      : std_ulogic;
+        cycle_1_ar   : std_ulogic;
         regsel       : std_ulogic_vector(2 downto 0);
+        is_nan_inf   : std_ulogic;
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -311,6 +324,8 @@ architecture behaviour of fpu is
     constant CROP_FTSQRT : std_ulogic_vector(2 downto 0) := "101";
     constant CROP_INTRES : std_ulogic_vector(2 downto 0) := "110";
 
+    signal scinfo : specialcase_t;
+
     constant arith_decode : decode32 := (
         -- indexed by bits 5..1 of opcode
         2#01000# => DO_FRI,
@@ -806,6 +821,140 @@ begin
     w_out.intr_vec <= 16#700#;
     w_out.srr1 <= (47-44 => r.illegal, 47-43 => not r.illegal, others => '0');
 
+    -- Combinational process; fpu_1 uses its output (scinfo) in the second cycle
+    -- of an arithmetic instruction.  It works out whether one or more operands
+    -- being NaN, infinity or zero (or a negative square-root operand) means that
+    -- an exception is generated or a specific value results without further calculation.
+    fpu_specialcases: process(all)
+        variable e : specialcase_t;             -- value driven onto scinfo at the end
+        variable invalid_mul  : std_ulogic;     -- set for a multiply of infinity by zero
+    begin
+        e.invalid := '0';               -- defaults: no exception and no immediate result
+        e.zero_divide := '0';
+        e.new_fpscr := (others => '0');
+        e.immed_result := '0';
+        e.qnan_result := '0';
+        e.result_sel := AIN_ZERO;
+        e.result_class := FINITE;
+        e.rsgn_op := RSGN_NOP;
+
+        -- Check if any operand is a signalling NaN (class NAN with QNAN_BIT clear)
+        if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
+            (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or
+            (r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then
+            e.new_fpscr(FPSCR_VXSNAN) := '1';
+            e.invalid := '1';
+        end if;
+
+        -- Check for infinity * zero (A * C) here since VXIMZ can be set along with VXSNAN
+        invalid_mul := '0';
+        if r.is_multiply = '1' and
+            ((r.a.class = INFINITY and r.c.class = ZERO) or
+             (r.a.class = ZERO and r.c.class = INFINITY)) then
+            e.new_fpscr(FPSCR_VXIMZ) := '1';
+            e.invalid := '1';
+            invalid_mul := '1';
+        end if;
+
+        -- Note that any operand for which r.use_X is 0 will have class = ZERO
+        if r.is_nan_inf = '1' then      -- registered copy of fpu_1's is_nan_inf flag
+            e.immed_result := '1';
+
+            if r.int_result = '1' then  -- integer-result operation with a NaN or infinity input
+                e.qnan_result := '1';
+                e.new_fpscr(FPSCR_VXCVI) := '1';
+
+            elsif r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then
+                e.result_class := NAN;
+                e.rsgn_op := RSGN_SEL;
+                -- Select the first input that is a NaN, in the order A, B, C
+                if r.a.class = NAN then
+                    e.result_sel := AIN_A;
+                elsif r.b.class = NAN then
+                    e.result_sel := AIN_B;
+                elsif r.c.class = NAN then
+                    e.result_sel := AIN_C;
+                end if;
+
+            else
+                -- some operand is an infinity, and no operand is a NaN
+                if invalid_mul = '1' then
+                    e.qnan_result := '1';
+                elsif (r.a.class = INFINITY or r.c.class = INFINITY) then
+                    if r.is_multiply = '1' then
+                        e.rsgn_op := RSGN_SUB;
+                    end if;
+                    if r.is_subtract = '1' and r.b.class = INFINITY then
+                        e.new_fpscr(FPSCR_VXISI) := '1';
+                        e.qnan_result := '1';
+                    end if;
+                end if;
+                if r.is_inverse = '1' and r.a.class = INFINITY and r.b.class = INFINITY then
+                    e.new_fpscr(FPSCR_VXIDI) := '1';    -- infinity / infinity
+                    e.qnan_result := '1';
+                end if;
+                if r.b.class = INFINITY and r.is_sqrt = '1' and r.b.negative = '1' then
+                    e.new_fpscr(FPSCR_VXSQRT) := '1';   -- square root of -infinity
+                    e.qnan_result := '1';
+                end if;
+                if r.b.class = INFINITY and r.is_inverse = '1' then
+                    -- fdiv, fre, frsqrte with B infinite: result is zero
+                    e.result_class := ZERO;
+                else
+                    e.result_class := INFINITY;
+                end if;
+            end if;
+
+        elsif r.use_a = '1' and r.a.class = ZERO then
+            e.immed_result := '1';
+            if r.is_addition = '1' then
+                -- result is +/- B
+                e.result_sel := AIN_B;
+                e.result_class := r.b.class;
+            else
+                e.result_class := ZERO;
+            end if;
+            if r.is_inverse = '1' and r.b.class = ZERO then
+                -- fdiv 0 / 0
+                e.new_fpscr(FPSCR_VXZDZ) := '1';
+                e.qnan_result := '1';
+            end if;
+
+        elsif r.use_c = '1' and r.c.class = ZERO then
+            -- fmul or fmadd/sub with C = 0: result is B (class ZERO when B is unused)
+            e.immed_result := '1';
+            e.result_sel := AIN_B;
+            e.result_class := r.b.class;
+
+        elsif r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0' then
+            -- B is zero, other operands are finite and (if used) non-zero
+            e.immed_result := '1';
+            if r.is_inverse = '1' then
+                -- fdiv, fre, frsqrte
+                e.result_class := INFINITY;
+                e.new_fpscr(FPSCR_ZX) := '1';
+                e.zero_divide := '1';
+            elsif r.is_addition = '1' then
+                -- fadd/fsub, result is A (result_class keeps its FINITE default)
+                e.result_sel := AIN_A;
+            else
+                -- other things, result is zero
+                e.result_class := ZERO;
+            end if;
+        end if;
+        if r.is_sqrt = '1' and r.b.class = FINITE and r.b.negative = '1' then   -- checked independently of the chain above
+            e.immed_result := '1';
+            e.new_fpscr(FPSCR_VXSQRT) := '1';
+            e.qnan_result := '1';
+        end if;
+
+        if e.qnan_result = '1' then     -- overrides any invalid/result_class chosen above
+            e.invalid := '1';
+            e.result_class := NAN;
+        end if;
+        scinfo <= e;
+    end process;
+
     fpu_1: process(all)
         variable v           : reg_type;
         variable adec        : fpu_reg_type;
@@ -895,6 +1044,7 @@ begin
         is_nan_inf := '0';
         is_zero_den := '0';
         v.cycle_1 := e_in.valid;
+        v.cycle_1_ar := '0';
 
         if r.complete = '1' or r.do_intr = '1' then
             v.instr_done := '0';
@@ -949,6 +1099,7 @@ begin
                     v.longmask := e_in.single;
                     v.fp_rc := e_in.rc;
                     v.is_arith := '1';
+                    v.cycle_1_ar := '1';
                     exec_state := arith_decode(to_integer(unsigned(e_in.insn(5 downto 1))));
                     if e_in.insn(5 downto 1) = "10110" or e_in.insn(5 downto 1) = "11010" then
                         v.is_sqrt := '1';
@@ -1205,7 +1356,7 @@ begin
         rsgn_op := RSGN_NOP;
         rcls_op <= RCLS_NOP;
 
-        if r.cycle_1 = '1' and r.is_arith = '1' then
+        if r.cycle_1_ar = '1' then
             v.fpscr(FPSCR_FR) := '0';
             v.fpscr(FPSCR_FI) := '0';
             v.result_class := FINITE;
@@ -1217,10 +1368,9 @@ begin
                 if e_in.valid = '1' then
                     v.busy := '1';
                     v.exec_state := exec_state;
-                    if is_nan_inf = '1' then
-                        v.state := DO_NAN_INF;
-                    elsif is_zero_den = '1' then
-                        v.state := DO_ZERO_DEN;
+                    v.is_nan_inf := is_nan_inf;
+                    if is_nan_inf = '1' or is_zero_den = '1' then
+                        v.state := DO_SPECIAL;
                     else
                         v.state := exec_state;
                     end if;
@@ -1230,144 +1380,25 @@ begin
                 set_s := '1';
                 v.regsel := AIN_ZERO;
 
-            when DO_NAN_INF =>
-                -- At least one floating-point operand is infinity or NaN
-                if r.a.class = NAN then
-                    opsel_a <= AIN_A;
-                elsif r.b.class = NAN then
-                    opsel_a <= AIN_B;
-                else
-                    opsel_a <= AIN_C;
-                end if;
-                opsel_b <= BIN_ZERO;
-                set_r := '1';
-
-                if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or
-                    (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or
-                    (r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then
-                    -- Signalling NAN
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
-
-                -- Check for this case here since VXIMZ can be set along with VXSNAN
-                invalid_mul := '0';
-                if r.is_multiply = '1' and
-                    ((r.a.class = INFINITY and r.c.class = ZERO) or
-                     (r.a.class = ZERO and r.c.class = INFINITY)) then
-                    v.fpscr(FPSCR_VXIMZ) := '1';
-                    invalid_mul := '1';
-                end if;
-
-                if r.int_result = '1' then
-                    opsel_r <= RES_MISC;
-                    misc_sel <= "110";
-                    v.fpscr(FPSCR_VXCVI) := '1';
-                    invalid := '1';
-                end if;
-
-                if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then
-                    rsgn_op := RSGN_SEL;
-                    v.result_class := NAN;
-
-                else
-                    if invalid_mul = '1' then
-                        qnan_result := '1';
-                    elsif (r.a.class = INFINITY or r.c.class = INFINITY) then
-                        if r.is_multiply = '1' then
-                            rsgn_op := RSGN_SUB;
-                        end if;
-                        if r.is_subtract = '1' and r.b.class = INFINITY then
-                            v.fpscr(FPSCR_VXISI) := '1';
-                            qnan_result := '1';
-                        end if;
-                    end if;
-                    if r.is_inverse = '1' and r.a.class = INFINITY and r.b.class = INFINITY then
-                        v.fpscr(FPSCR_VXIDI) := '1';
-                        qnan_result := '1';
-                    end if;
-                    if r.b.class = INFINITY and r.is_sqrt = '1' and r.b.negative = '1' then
-                        v.fpscr(FPSCR_VXSQRT) := '1';
-                        qnan_result := '1';
-                    end if;
-                    if r.b.class = INFINITY and r.is_inverse = '1' then
-                        -- fdiv, fre, frsqrte
-                        v.result_class := ZERO;
-                    else
-                        v.result_class := INFINITY;
-                    end if;
-                end if;
-                arith_done := '1';
-
-            when DO_ZERO_DEN =>
-                -- At least one floating point operand is zero or denormalized
-                opsel_b <= BIN_ZERO;
-                set_r := '1';
-                if r.use_a = '1' and r.a.class = ZERO then
-                    opsel_a <= AIN_B;
-                    re_sel2 <= REXP2_B;
-                    re_set_result <= '1';
-                    if r.is_inverse = '1' and r.b.class = ZERO then
-                        -- fdiv with B=0
-                        v.fpscr(FPSCR_VXZDZ) := '1';
-                        qnan_result := '1';
-                    end if;
-                    if r.is_addition = '1' then
-                        -- result is +/- B
-                        v.result_class := r.b.class;
-                    else
-                        v.result_class := ZERO;
-                    end if;
-                    arith_done := '1';
-                elsif r.use_c = '1' and r.c.class = ZERO then
-                    -- fmul or fmadd/sub with C=0
-                    opsel_a <= AIN_B;
-                    re_sel2 <= REXP2_B;
-                    re_set_result <= '1';
-                    if r.is_addition = '1' then
-                        v.result_class := r.b.class;
-                    else
-                        v.result_class := ZERO;
-                    end if;
-                    arith_done := '1';
-                elsif (r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0') then
-                    -- B is zero, other operands are finite, not fmadd*
-                    opsel_a <= AIN_A;
-                    re_sel1 <= REXP1_A;
-                    re_set_result <= '1';
-                    if r.is_inverse = '1' then
-                        -- fdiv, fre, frsqrte
-                        v.result_class := INFINITY;
-                        zero_divide := '1';
-                    elsif r.is_addition = '1' then
-                        -- fadd, fsub
-                        v.result_class := FINITE;
-                    else
-                        -- other things, result is zero
-                        v.result_class := ZERO;
-                    end if;
-                    arith_done := '1';
-
+            when DO_SPECIAL =>
+                -- At least one floating point operand is NaN, infinity, zero or denormalized
+                -- Most of the special cases are handled in the fpu_specialcases process
+                -- and in the code below (the scinfo.immed_result = '1' block).
+                if r.is_multiply = '1' and r.b.class = ZERO then
+                    -- This will trigger for fmul as well as fmadd/sub, but
+                    -- it doesn't matter since r.is_subtract = 0 for fmul.
+                    rsgn_op := RSGN_SUB;
+                end if;
+                if r.a.denorm = '1' and (r.is_multiply = '1' or r.is_inverse = '1') then
+                    v.state := RENORM_A;
+                elsif r.c.denorm = '1' then
+                    v.state := RENORM_C;
+                elsif r.b.denorm = '1' and (r.is_inverse = '1' or r.is_sqrt = '1') then
+                    v.state := RENORM_B;
+                elsif r.is_multiply = '1' and r.b.class = ZERO then
+                    v.state := DO_FMUL;
                 else
-                    -- some operand is denorm, and/or it's fmadd/fmsub with B=0
-                    -- A and C are non-zero if present,
-                    -- B is non-zero if present except for multiply-add
-                    if r.is_multiply = '1' and r.b.class = ZERO then
-                        -- This will trigger for fmul as well as fmadd/sub, but
-                        -- it doesn't matter since r.is_subtract = 0 for fmul.
-                        rsgn_op := RSGN_SUB;
-                    end if;
-                    if r.a.denorm = '1' and (r.is_multiply = '1' or r.is_inverse = '1') then
-                        v.state := RENORM_A;
-                    elsif r.c.denorm = '1' then
-                        v.state := RENORM_C;
-                    elsif r.b.denorm = '1' and (r.is_inverse = '1' or r.is_sqrt = '1') then
-                        v.state := RENORM_B;
-                    elsif r.is_multiply = '1' and r.b.class = ZERO then
-                        v.state := DO_FMUL;
-                    else
-                        v.state := r.exec_state;
-                    end if;
+                    v.state := r.exec_state;
                 end if;
 
             when DO_ILLEGAL =>
@@ -1685,10 +1716,6 @@ begin
                 set_r := '1';
                 re_sel2 <= REXP2_B;
                 re_set_result <= '1';
-                if r.b.negative = '1' then
-                    v.fpscr(FPSCR_VXSQRT) := '1';
-                    qnan_result := '1';
-                end if;
                 if r.b.exponent(0) = '1' then
                     v.state := SQRT_ODD;
                 elsif r.is_inverse = '0' then
@@ -3049,6 +3076,37 @@ begin
 
         end case;
 
+        -- Handle exceptions and special cases for arithmetic operations
+        if r.cycle_1_ar = '1' then
+            v.fpscr := r.fpscr or scinfo.new_fpscr;
+            invalid := scinfo.invalid;
+            zero_divide := scinfo.zero_divide;
+            qnan_result := scinfo.qnan_result;
+            if scinfo.immed_result = '1' then
+                -- state machine is in the DO_SPECIAL or DO_FSQRT state here
+                arith_done := '1';
+                set_r := '1';
+                opsel_a <= scinfo.result_sel;
+                opsel_b <= BIN_ZERO;
+                if scinfo.qnan_result = '1' then
+                    opsel_r <= RES_MISC;
+                    if r.int_result = '0' then
+                        misc_sel <= "001";
+                    else
+                        misc_sel <= "110";
+                    end if;
+                end if;
+                rsgn_op := scinfo.rsgn_op;
+                v.result_class := scinfo.result_class;
+                if scinfo.result_sel = AIN_B then
+                    re_sel2 <= REXP2_B;
+                else
+                    re_sel1 <= REXP1_A;
+                end if;
+                re_set_result <= '1';
+            end if;
+        end if;
+
         rsign := r.result_sign;
         case rsgn_op is
             when RSGN_SEL =>
@@ -3100,16 +3158,8 @@ begin
             when others =>
         end case;
 
-        if zero_divide = '1' then
-            v.fpscr(FPSCR_ZX) := '1';
-        end if;
         if qnan_result = '1' then
-            invalid := '1';
-            v.result_class := NAN;
             rsign := '0';
-            misc_sel <= "001";
-            opsel_r <= RES_MISC;
-            arith_done := '1';
         end if;
         if invalid = '1' then
             v.invalid := '1';

From b63773f6e9f2a3ec5e145630455af6a35fd1a366 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 19 Mar 2024 15:36:50 +1100
Subject: [PATCH 22/24] FPU: Move computation of main adder inputs out of the
 state machine

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl |   4 +-
 fpu.vhdl     | 127 +++++++++++++++++++++++++++++++--------------------
 2 files changed, 79 insertions(+), 52 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 2fb1ad4..1978a27 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -151,8 +151,8 @@ architecture behaviour of decode1 is
         INSN_fabs        =>  (FPU,  FPU,  OP_FP_MOVE,   NONE,       FRB, NONE,        NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', '0', NONE),
         INSN_fadd        =>  (FPU,  FPU,  OP_FP_ARITH,  FRA,        FRB, NONE,        NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', '0', NONE),
         INSN_fadds       =>  (FPU,  FPU,  OP_FP_ARITH,  FRA,        FRB, NONE,        NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', '0', NONE),
-        INSN_fcfid       =>  (FPU,  FPU,  OP_FP_MISC,   NONE,       FRB, NONE,        NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', '0', NONE),
-        INSN_fcfids      =>  (FPU,  FPU,  OP_FP_MISC,   NONE,       FRB, NONE,        NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', '0', NONE),
+        INSN_fcfid       =>  (FPU,  FPU,  OP_FP_MISC,   NONE,       FRB, NONE,        NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', '0', NONE),
+        INSN_fcfids      =>  (FPU,  FPU,  OP_FP_MISC,   NONE,       FRB, NONE,        NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', '0', NONE),
         INSN_fcfidu      =>  (FPU,  FPU,  OP_FP_MISC,   NONE,       FRB, NONE,        NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', '0', NONE),
         INSN_fcfidus     =>  (FPU,  FPU,  OP_FP_MISC,   NONE,       FRB, NONE,        NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', '0', NONE),
         INSN_fcmpo       =>  (FPU,  FPU,  OP_FP_CMP,    FRA,        FRB, NONE,        NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', '0', NONE),
diff --git a/fpu.vhdl b/fpu.vhdl
index 0698c63..28cd55f 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -194,16 +194,16 @@ architecture behaviour of fpu is
 
     signal fp_result     : std_ulogic_vector(63 downto 0);
     signal opsel_a       : std_ulogic_vector(2 downto 0);
-    signal opsel_b       : std_ulogic;
+    signal opsel_b       : std_ulogic_vector(2 downto 0);
+    signal opsel_c       : std_ulogic_vector(2 downto 0);
     signal opsel_r       : std_ulogic_vector(1 downto 0);
     signal opsel_s       : std_ulogic_vector(1 downto 0);
-    signal opsel_ainv    : std_ulogic;
+    signal opsel_aneg    : std_ulogic;
+    signal opsel_aabs    : std_ulogic;
     signal opsel_mask    : std_ulogic;
-    signal opsel_binv    : std_ulogic;
     signal in_a          : std_ulogic_vector(63 downto 0);
     signal in_b          : std_ulogic_vector(63 downto 0);
     signal result        : std_ulogic_vector(63 downto 0);
-    signal carry_in      : std_ulogic;
     signal lost_bits     : std_ulogic;
     signal r_hi_nz       : std_ulogic;
     signal r_lo_nz       : std_ulogic;
@@ -228,8 +228,20 @@ architecture behaviour of fpu is
     constant AIN_RND_RBIT : std_ulogic_vector(2 downto 0) := "110";
     constant AIN_RND      : std_ulogic_vector(2 downto 0) := "111";
 
-    constant BIN_ZERO  : std_ulogic := '0';
-    constant BIN_R     : std_ulogic := '1';
+    constant BIN_ZERO     : std_ulogic_vector(2 downto 0) := "000";
+    constant BIN_R        : std_ulogic_vector(2 downto 0) := "001";
+    constant BIN_MINUSR   : std_ulogic_vector(2 downto 0) := "100";
+    constant BIN_ABSR     : std_ulogic_vector(2 downto 0) := "101";
+    constant BIN_ADDSUBR  : std_ulogic_vector(2 downto 0) := "110";
+    constant BIN_RSIGNR   : std_ulogic_vector(2 downto 0) := "111";
+
+    constant CIN_ZERO     : std_ulogic_vector(2 downto 0) := "000";
+    constant CIN_SUBEXT   : std_ulogic_vector(2 downto 0) := "001";
+    constant CIN_ABSEXT   : std_ulogic_vector(2 downto 0) := "010";
+    constant CIN_INC      : std_ulogic_vector(2 downto 0) := "011";
+    constant CIN_ROUND    : std_ulogic_vector(2 downto 0) := "100";
+    constant CIN_RNDX     : std_ulogic_vector(2 downto 0) := "101";
+    constant CIN_RNDQ     : std_ulogic_vector(2 downto 0) := "110";
 
     constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
     constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
@@ -1035,6 +1047,9 @@ begin
         variable cr_result   : std_ulogic_vector(3 downto 0);
         variable set_cr      : std_ulogic;
         variable set_fpcc    : std_ulogic;
+        variable asign       : std_ulogic;
+        variable bneg        : std_ulogic;
+        variable ci          : std_ulogic;
     begin
         v := r;
         v.complete := '0';
@@ -1297,13 +1312,13 @@ begin
         v.first := '0';
         v.doing_ftdiv := "00";
         opsel_a <= AIN_ZERO;
-        opsel_ainv <= '0';
+        opsel_aneg <= '0';
+        opsel_aabs <= '0';
         opsel_mask <= '0';
         opsel_b <= BIN_R;
-        opsel_binv <= '0';
+        opsel_c <= CIN_ZERO;
         opsel_r <= RES_SUM;
         opsel_s <= S_ZERO;
-        carry_in <= '0';
         misc_sel <= "000";
         fpscr_mask := (others => '1');
         cr_op := CROP_NONE;
@@ -1634,14 +1649,10 @@ begin
 
             when DO_FCFID =>
                 opsel_a <= AIN_B;
+                opsel_aabs <= '1';
                 opsel_b <= BIN_ZERO;
                 set_r := '1';
                 rcls_op <= RCLS_SEL;
-                if r.insn(8) = '0' and r.b.negative = '1' then
-                    -- fcfid[s] with negative operand, set R = -B
-                    opsel_ainv <= '1';
-                    carry_in <= '1';
-                end if;
                 re_con2 <= RECON2_UNIT;
                 re_set_result <= '1';
                 if r.b.class = ZERO then
@@ -1833,9 +1844,8 @@ begin
                 else
                     opsel_a <= AIN_B;
                 end if;
-                opsel_b <= BIN_R;
-                opsel_binv <= r.is_subtract;
-                carry_in <= r.is_subtract and not r.x;
+                opsel_b <= BIN_ADDSUBR;
+                opsel_c <= CIN_SUBEXT;
                 set_r := '1';
                 -- set shift to -1
                 rs_con2 <= RSCON2_1;
@@ -1847,13 +1857,12 @@ begin
                 -- r.shift = -1
                 re_sel2 <= REXP2_NE;
                 rcls_op <= RCLS_TZERO;
+                opsel_a <= AIN_ZERO;
+                opsel_b <= BIN_ABSR;
                 if r.r(63) = '1' then
                     -- result is opposite sign to expected
                     rsgn_op := RSGN_INV;
-                    opsel_a <= AIN_ZERO;
                     set_r := '1';
-                    opsel_binv <= '1';
-                    carry_in <= '1';
                     v.state := FINISH;
                 elsif r.r(UNIT_BIT + 1) = '1' then
                     -- sum overflowed, shift right
@@ -1876,9 +1885,7 @@ begin
 
             when CMP_1 =>
                 opsel_a <= AIN_A;
-                opsel_b <= BIN_R;
-                opsel_binv <= '1';
-                carry_in <= '1';
+                opsel_b <= BIN_MINUSR;
                 set_r := '1';
                 v.state := CMP_2;
 
@@ -1963,10 +1970,10 @@ begin
 
             when FMADD_5 =>
                 -- negate R:S:X if negative
+                opsel_b <= BIN_ABSR;
+                opsel_c <= CIN_ABSEXT;
                 if r.r(63) = '1' then
                     rsgn_op := RSGN_INV;
-                    opsel_binv <= '1';
-                    carry_in <= not (s_nz or r.x);
                     set_r := '1';
                     opsel_s <= S_NEG;
                     set_s := '1';
@@ -2260,7 +2267,7 @@ begin
             when SQRT_12 =>
                 -- test if remainder is 0 or >= B = 2*R + 1
                 set_r := '0';
-                carry_in <= '1';
+                opsel_c <= CIN_INC;
                 if pcmpb_lt = '1' then
                     -- square root is correct, set X if remainder non-zero
                     v.x := r.p(UNIT_BIT + 2) or px_nz;
@@ -2309,8 +2316,8 @@ begin
 
             when INT_FINAL =>
                 -- Negate if necessary, and increment for rounding if needed
-                opsel_binv <= r.result_sign;
-                carry_in <= r.fpscr(FPSCR_FR) xor r.result_sign;
+                opsel_b <= BIN_RSIGNR;
+                opsel_c <= CIN_ROUND;
                 set_r := '1';
                 -- Check for possible overflows
                 case r.insn(9 downto 8) is
@@ -2547,13 +2554,9 @@ begin
 
             when DO_IDIVMOD =>
                 opsel_a <= AIN_B;
+                opsel_aabs <= '1';
                 opsel_b <= BIN_ZERO;
                 set_r := '1';
-                -- take absolute value for signed division
-                if r.is_signed = '1' and r.b.negative = '1' then
-                    opsel_ainv <= '1';
-                    carry_in <= '1';
-                end if;
                 -- normalize and round up B to 8.56 format, like fcfid[u]
                 re_con2 <= RECON2_UNIT;
                 re_set_result <= '1';
@@ -2583,19 +2586,16 @@ begin
                 v.state := IDIV_NORMB3;
             when IDIV_NORMB3 =>
                 -- add the X bit onto R to round up B
-                carry_in <= r.x;
+                opsel_c <= CIN_RNDX;
                 set_r := '1';
                 -- prepare to do count-leading-zeroes on A
                 v.state := IDIV_CLZA;
             when IDIV_CLZA =>
                 set_b := '1';           -- put R back into B
                 opsel_a <= AIN_A;
+                opsel_aabs <= '1';
                 opsel_b <= BIN_ZERO;
                 set_r := '1';
-                if r.is_signed = '1' and r.a.negative = '1' then
-                    opsel_ainv <= '1';
-                    carry_in <= '1';
-                end if;
                 re_con2 <= RECON2_UNIT;
                 re_set_result <= '1';
                 v.state := IDIV_CLZA2;
@@ -2608,8 +2608,7 @@ begin
                 -- (using the original value of B, which is now in C)
                 opsel_a <= AIN_C;
                 opsel_b <= BIN_R;
-                opsel_ainv <= '1';
-                carry_in <= '1';
+                opsel_aneg <= '1';
                 set_r := '1';
                 v.state := IDIV_CLZA3;
             when IDIV_CLZA3 =>
@@ -2924,8 +2923,7 @@ begin
                 -- shifted dividend is in R, subtract left-justified divisor
                 opsel_a <= AIN_B;
                 opsel_b <= BIN_R;
-                opsel_ainv <= '1';
-                carry_in <= '1';
+                opsel_aneg <= '1';
                 set_r := '1';
                 -- and put 1<<63 into B as the divisor (S is still 0)
                 shiftin0 := '1';
@@ -3028,8 +3026,7 @@ begin
             when IDIV_MODSUB =>
                 -- Subtract divisor from remainder
                 opsel_a <= AIN_C;
-                opsel_ainv <= '1';
-                carry_in <= '1';
+                opsel_aneg <= '1';
                 opsel_b <= BIN_R;
                 set_r := '1';
                 if r.result_sign = '0' then
@@ -3041,8 +3038,8 @@ begin
                 -- result (so far) is in R
                 -- set carry to increment quotient if needed
                 -- and also negate R if the answer is negative
-                opsel_binv <= r.result_sign;
-                carry_in <= r.inc_quot xor r.result_sign;
+                opsel_b <= BIN_RSIGNR;
+                opsel_c <= CIN_RNDQ;
                 set_r := '1';
                 if r.divmod = '0' then
                     opsel_a <= AIN_RND_B32;
@@ -3257,11 +3254,14 @@ begin
         if (or (mask and r.r)) = '1' and set_x = '1' then
             v.x := '1';
         end if;
+        asign := '0';
         case opsel_a is
             when AIN_A =>
                 in_a0 := r.a.mantissa;
+                asign := r.a.negative;
             when AIN_B =>
                 in_a0 := r.b.mantissa;
+                asign := r.b.negative;
             when AIN_C =>
                 in_a0 := r.c.mantissa;
             when AIN_PS8 =>     -- 8 LSBs of P sign-extended to 64
@@ -3275,18 +3275,45 @@ begin
             when others =>
                 in_a0 := (others => '0');
         end case;
-        if opsel_ainv = '1' then
+        ci := '0';
+        case opsel_c is
+            when CIN_SUBEXT =>
+                ci := r.is_subtract and r.x;
+            when CIN_ABSEXT =>
+                ci := r.r(63) and (s_nz or r.x);
+            when CIN_INC =>
+                ci := '1';
+            when CIN_ROUND =>
+                ci := r.fpscr(FPSCR_FR);
+            when CIN_RNDX =>
+                ci := r.x;
+            when CIN_RNDQ =>
+                ci := r.inc_quot;
+            when others =>
+        end case;
+        if opsel_aneg = '1' or (opsel_aabs = '1' and r.is_signed = '1' and asign = '1') then
             in_a0 := not in_a0;
+            ci := not ci;
         end if;
         in_a <= in_a0;
+        in_b0 := r.r;
+        bneg := '0';
         case opsel_b is
             when BIN_R =>
-                in_b0 := r.r;
+            when BIN_MINUSR =>
+                bneg := '1';
+            when BIN_ABSR =>
+                bneg := r.r(63);
+            when BIN_ADDSUBR =>
+                bneg := r.is_subtract;
+            when BIN_RSIGNR =>
+                bneg := r.result_sign;
             when others =>
                 in_b0 := (others => '0');
         end case;
-        if opsel_binv = '1' then
+        if bneg = '1' then
             in_b0 := not in_b0;
+            ci := not ci;
         end if;
         in_b <= in_b0;
 	if is_X(r.shift) then
@@ -3298,7 +3325,7 @@ begin
         else
             shift_res := (others => '0');
         end if;
-        sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
+        sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + ci);
         if opsel_mask = '1' then
             sum(DP_LSB - 1 downto 0) := "0000";
             if r.single_prec = '1' then

From 73505b16262438369b4f68f1e81e8e496b6d49b2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 18 Mar 2025 20:53:27 +1100
Subject: [PATCH 23/24] FPU: Provide a separate path for transferring A/B/C to
 R

The timing path from r.a.class to result showed up as a critical path
on the Artix-7, apparently because of transfers of A, B or C to R in
special cases (e.g. NaN inputs) and the fsel instruction.  To
alleviate this, we provide a path via the miscellaneous value
multiplexer from A, B and C to R, selected via opsel_r = RES_MISC and
misc_sel = "111".  A new selector opsel_sel selects which of A, B or C
to transfer, using the same encoding as opsel_a.  This new selector is
now also used for the result class when rcls_op = RCLS_SEL and for the
result sign when rsgn_op = RSGN_SEL.  This reduces the number of
things that opsel_a depends on and eases timing in the main adder
path.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 28cd55f..4ef2d14 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -201,6 +201,7 @@ architecture behaviour of fpu is
     signal opsel_aneg    : std_ulogic;
     signal opsel_aabs    : std_ulogic;
     signal opsel_mask    : std_ulogic;
+    signal opsel_sel     : std_ulogic_vector(2 downto 0);
     signal in_a          : std_ulogic_vector(63 downto 0);
     signal in_b          : std_ulogic_vector(63 downto 0);
     signal result        : std_ulogic_vector(63 downto 0);
@@ -1320,6 +1321,7 @@ begin
         opsel_r <= RES_SUM;
         opsel_s <= S_ZERO;
         misc_sel <= "000";
+        opsel_sel <= AIN_ZERO;
         fpscr_mask := (others => '1');
         cr_op := CROP_NONE;
         update_fx := '0';
@@ -1566,8 +1568,9 @@ begin
                 v.instr_done := '1';
 
             when DO_FMR =>
-                opsel_a <= AIN_B;
-                opsel_b <= BIN_ZERO;
+                opsel_r <= RES_MISC;
+                misc_sel <= "111";
+                opsel_sel <= AIN_B;
                 set_r := '1';
                 rcls_op <= RCLS_SEL;
                 re_sel2 <= REXP2_B;
@@ -1652,6 +1655,7 @@ begin
                 opsel_aabs <= '1';
                 opsel_b <= BIN_ZERO;
                 set_r := '1';
+                opsel_sel <= AIN_B;
                 rcls_op <= RCLS_SEL;
                 re_con2 <= RECON2_UNIT;
                 re_set_result <= '1';
@@ -1710,13 +1714,14 @@ begin
                 rsgn_op := RSGN_SEL;
                 rcls_op <= RCLS_SEL;
                 if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then
-                    opsel_a <= AIN_C;
+                    opsel_sel <= AIN_C;
                     re_sel2 <= REXP2_C;
                 else
-                    opsel_a <= AIN_B;
+                    opsel_sel <= AIN_B;
                     re_sel2 <= REXP2_B;
                 end if;
-                opsel_b <= BIN_ZERO;
+                opsel_r <= RES_MISC;
+                misc_sel <= "111";
                 set_r := '1';
                 re_set_result <= '1';
                 arith_done := '1';
@@ -3083,15 +3088,16 @@ begin
                 -- state machine is in the DO_SPECIAL or DO_FSQRT state here
                 arith_done := '1';
                 set_r := '1';
-                opsel_a <= scinfo.result_sel;
-                opsel_b <= BIN_ZERO;
+                opsel_r <= RES_MISC;
+                opsel_sel <= scinfo.result_sel;
                 if scinfo.qnan_result = '1' then
-                    opsel_r <= RES_MISC;
                     if r.int_result = '0' then
                         misc_sel <= "001";
                     else
                         misc_sel <= "110";
                     end if;
+                else
+                    misc_sel <= "111";
                 end if;
                 rsgn_op := scinfo.rsgn_op;
                 v.result_class := scinfo.result_class;
@@ -3107,7 +3113,7 @@ begin
         rsign := r.result_sign;
         case rsgn_op is
             when RSGN_SEL =>
-                case opsel_a is
+                case opsel_sel is
                     when AIN_A =>
                         rsign := r.a.negative;
                     when AIN_B =>
@@ -3128,7 +3134,7 @@ begin
 
         case rcls_op is
             when RCLS_SEL =>
-                case opsel_a is
+                case opsel_sel is
                     when AIN_A =>
                         v.result_class := r.a.class;
                     when AIN_B =>
@@ -3366,6 +3372,7 @@ begin
                             misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32);
                         end if;
                     when "101" =>
+                        -- LUT value
                         misc := std_ulogic_vector(shift_left(resize(unsigned(inverse_est), 64),
                                                              UNIT_BIT - 19));
                     when "110" =>
@@ -3382,6 +3389,16 @@ begin
                             end if;
                         end if;
                     when others =>
+                        -- A, B or C, according to opsel_sel
+                        case opsel_sel is
+                            when AIN_A =>
+                                misc := r.a.mantissa;
+                            when AIN_B =>
+                                misc := r.b.mantissa;
+                            when AIN_C =>
+                                misc := r.c.mantissa;
+                            when others =>
+                        end case;
                 end case;
                 result <= misc;
         end case;

From 3268ef717cfbc38290b0b49be22cb6679e378fb9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 21 Mar 2025 21:41:39 +1100
Subject: [PATCH 24/24] FPU: Make opsel_a a function of just the state

This adds some extra states and transitions so that opsel_a becomes
a function only of the current state.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 89 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 52 insertions(+), 37 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 4ef2d14..5648012 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -57,7 +57,7 @@ architecture behaviour of fpu is
                      DO_FSEL,
                      DO_IDIVMOD,
                      FRI_1,
-                     ADD_1, ADD_SHIFT, ADD_2, ADD_3,
+                     ADD_1, ADD_SHIFT, ADD_2, ADD_2B, ADD_3,
                      CMP_1, CMP_2,
                      MULT_1,
                      FMADD_0, FMADD_1, FMADD_2, FMADD_3,
@@ -73,7 +73,7 @@ architecture behaviour of fpu is
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
                      ROUND_UFLOW, ROUND_OFLOW,
-                     ROUNDING, ROUNDING_2, ROUNDING_3,
+                     ROUNDING, ROUND_INC, ROUNDING_2, ROUNDING_3,
                      DENORM,
                      RENORM_A, RENORM_B, RENORM_C,
                      RENORM_1, RENORM_2,
@@ -87,7 +87,8 @@ architecture behaviour of fpu is
                      IDIV_EXT_TBH4, IDIV_EXT_TBH5,
                      IDIV_EXTDIV, IDIV_EXTDIV1, IDIV_EXTDIV2, IDIV_EXTDIV3,
                      IDIV_EXTDIV4, IDIV_EXTDIV5, IDIV_EXTDIV6,
-                     IDIV_MODADJ, IDIV_MODSUB, IDIV_DIVADJ, IDIV_OVFCHK, IDIV_DONE, IDIV_ZERO);
+                     IDIV_MODADJ, IDIV_MODADJ_NEG, IDIV_MODSUB,
+                     IDIV_DIVADJ, IDIV_OVFCHK, IDIV_DONE, IDIV_ZERO);
 
     type decode32 is array(0 to 31) of state_t;
     type decode8 is array(0 to 7) of state_t;
@@ -1027,7 +1028,6 @@ begin
         variable mulexp      : signed(EXP_BITS-1 downto 0);
         variable maddend     : std_ulogic_vector(127 downto 0);
         variable sum         : std_ulogic_vector(63 downto 0);
-        variable round_inc   : std_ulogic_vector(63 downto 0);
         variable mult_mask   : std_ulogic;
         variable sign_bit    : std_ulogic;
         variable rexp_in1    : signed(EXP_BITS-1 downto 0);
@@ -1678,7 +1678,7 @@ begin
                 rs_sel2 <= RSH2_A;
                 v.add_bsmall := '0';
                 if r.a.exponent = r.b.exponent then
-                    v.state := ADD_2;
+                    v.state := ADD_2B;
                 elsif r.a.exponent < r.b.exponent then
                     v.longmask := '0';
                     v.state := ADD_SHIFT;
@@ -1841,14 +1841,24 @@ begin
                 v.x := s_nz;
                 set_x := '1';
                 v.longmask := r.single_prec;
-                v.state := ADD_2;
-
-            when ADD_2 =>
                 if r.add_bsmall = '1' then
-                    opsel_a <= AIN_A;
+                    v.state := ADD_2;
                 else
-                    opsel_a <= AIN_B;
+                    v.state := ADD_2B;
                 end if;
+
+            when ADD_2 =>
+                opsel_a <= AIN_A;
+                opsel_b <= BIN_ADDSUBR;
+                opsel_c <= CIN_SUBEXT;
+                set_r := '1';
+                -- set shift to -1
+                rs_con2 <= RSCON2_1;
+                rs_neg2 <= '1';
+                v.state := ADD_3;
+
+            when ADD_2B =>
+                opsel_a <= AIN_B;
                 opsel_b <= BIN_ADDSUBR;
                 opsel_c <= CIN_SUBEXT;
                 set_r := '1';
@@ -2484,20 +2494,14 @@ begin
                 v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
                 if round(1) = '1' then
                     -- increment the LSB for the precision
-                    opsel_a <= AIN_RND;
-                    -- set shift to -1
-                    rs_con2 <= RSCON2_1;
-                    rs_neg2 <= '1';
-                    v.state := ROUNDING_2;
+                    v.state := ROUND_INC;
+                elsif r.r(UNIT_BIT) = '0' then
+                    -- result after masking could be zero, or could be a
+                    -- denormalized result that needs to be renormalized
+                    rs_norm <= '1';
+                    v.state := ROUNDING_3;
                 else
-                    if r.r(UNIT_BIT) = '0' then
-                        -- result after masking could be zero, or could be a
-                        -- denormalized result that needs to be renormalized
-                        rs_norm <= '1';
-                        v.state := ROUNDING_3;
-                    else
-                        arith_done := '1';
-                    end if;
+                    arith_done := '1';
                 end if;
                 if round(0) = '1' then
                     v.fpscr(FPSCR_XX) := '1';
@@ -2506,6 +2510,14 @@ begin
                     end if;
                 end if;
 
+            when ROUND_INC =>
+                set_r := '1';
+                opsel_a <= AIN_RND;
+                -- set shift to -1
+                rs_con2 <= RSCON2_1;
+                rs_neg2 <= '1';
+                v.state := ROUNDING_2;
+
             when ROUNDING_2 =>
                 -- Check for overflow during rounding
                 -- r.shift = -1
@@ -2804,12 +2816,10 @@ begin
                 msel_1 <= MUL1_Y;
                 msel_2 <= MUL2_P;
                 v.inc_quot := not pcmpc_lt and not r.divmod;
-                if r.divmod = '0' then
-                    -- get B into R for IDIV_DIVADJ state
-                    opsel_a <= AIN_B;
-                    opsel_b <= BIN_ZERO;
-                    set_r := '1';
-                end if;
+                -- if dividing, get B into R for IDIV_DIVADJ state
+                opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := not r.divmod;
                 -- set shift to UNIT_BIT (== 56)
                 rs_con2 <= RSCON2_UNIT;
                 if pcmpc_lt = '1' or pcmpc_eq = '1' then
@@ -2872,11 +2882,11 @@ begin
                 v.inc_quot := not pcmpc_lt and not r.divmod;
                 -- set shift to UNIT_BIT (== 56)
                 rs_con2 <= RSCON2_UNIT;
+                -- if dividing, get B into R for IDIV_DIVADJ state
+                opsel_a <= AIN_B;
+                opsel_b <= BIN_ZERO;
+                set_r := not r.divmod;
                 if r.divmod = '0' then
-                    -- get B into R for IDIV_DIVADJ state
-                    opsel_a <= AIN_B;
-                    opsel_b <= BIN_ZERO;
-                    set_r := '1';
                     v.state := IDIV_DIVADJ;
                 elsif pcmpc_eq = '1' then
                     v.state := IDIV_ZERO;
@@ -3026,8 +3036,15 @@ begin
                 elsif r.result_sign = '0' then
                     v.state := IDIV_DONE;
                 else
-                    v.state := IDIV_DIVADJ;
+                    v.state := IDIV_MODADJ_NEG;
                 end if;
+            when IDIV_MODADJ_NEG =>
+                -- result (so far) is in R
+                -- no quotient-increment carry is selected here (opsel_c is not set);
+                -- just negate R since the answer is negative
+                opsel_b <= BIN_MINUSR;
+                set_r := '1';
+                v.state := IDIV_OVFCHK;
             when IDIV_MODSUB =>
                 -- Subtract divisor from remainder
                 opsel_a <= AIN_C;
@@ -3043,12 +3060,10 @@ begin
                 -- result (so far) is in R
                 -- set carry to increment quotient if needed
                 -- and also negate R if the answer is negative
+                opsel_a <= AIN_RND_B32;
                 opsel_b <= BIN_RSIGNR;
                 opsel_c <= CIN_RNDQ;
                 set_r := '1';
-                if r.divmod = '0' then
-                    opsel_a <= AIN_RND_B32;
-                end if;
                 if r.is_signed = '0' then
                     v.state := IDIV_DONE;
                 else