From 59992eab907f9431a99ab4de987abd722e9d3098 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 11 Dec 2025 13:15:00 +1100
Subject: [PATCH] FPU: Avoid doing overflow processing twice in OE=1 case

Split the ROUND_OFLOW state into two, one which handles the OE=0 case
(disabled overflow exception) and one which handles the OE=1 case
(enabled overflow exception).  This avoids a loop in the state diagram
and prevents us from adding the exponent bias twice.

Also correct a bug in the ROUNDING_3 state where r.shift was set wrongly
for single-precision operations whose result is denormal in
double-precision format.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl           | 109 +++++++++++++++++++++------------------------
 tests/fpu/fpu.c    |   3 ++
 tests/test_fpu.bin | Bin 33464 -> 33560 bytes
 3 files changed, 55 insertions(+), 57 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index f49f02d..07617af 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -72,7 +72,7 @@ architecture behaviour of fpu is
                      INT_SHIFT, INT_ROUND, INT_ISHIFT,
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
-                     ROUND_UFLOW, NORM_UFLOW, ROUND_OFLOW,
+                     ROUND_UFLOW, NORM_UFLOW, ROUND_OFLOW_DIS, ROUND_OFLOW_EN,
                      ROUNDING, ROUND_INC, ROUNDING_2, ROUNDING_3,
                      DENORM,
                      RENORM_A, RENORM_B, RENORM_C,
@@ -315,6 +315,7 @@ architecture behaviour of fpu is
     constant RSCON2_63      : std_ulogic_vector(3 downto 0) := "0111";
     constant RSCON2_64      : std_ulogic_vector(3 downto 0) := "1000";
     constant RSCON2_MINEXP  : std_ulogic_vector(3 downto 0) := "1001";
+    constant RSCON2_DPMINX  : std_ulogic_vector(3 downto 0) := "1010";
 
     signal rs_sel1       : std_ulogic_vector(1 downto 0);
     signal rs_sel2       : std_ulogic;
@@ -1633,10 +1634,10 @@ begin
                 rs_con2 <= RSCON2_MINEXP;
                 rs_neg2 <= '1';
                 set_x := '1';   -- uses r.r and r.shift
-                if r.result_exp < to_signed(-126, EXP_BITS) then
+                if exp_tiny = '1' then
                     v.state := ROUND_UFLOW;
-                elsif r.result_exp > to_signed(127, EXP_BITS) then
-                    v.state := ROUND_OFLOW;
+                elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then
+                    v.state := ROUND_OFLOW_DIS;
                 else
                     v.state := ROUNDING;
                 end if;
@@ -2406,6 +2407,7 @@ begin
                 v.state := ROUNDING;
 
             when FINISH =>
+                -- r.shift = 0
                 if r.is_multiply = '1' and px_nz = '1' then
                     v.x := '1';
                 end if;
@@ -2420,8 +2422,8 @@ begin
                     set_x := '1';
                     if exp_tiny = '1' then
                         v.state := ROUND_UFLOW;
-                    elsif exp_huge = '1' then
-                        v.state := ROUND_OFLOW;
+                    elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then
+                        v.state := ROUND_OFLOW_DIS;
                     else
                         v.state := ROUNDING;
                     end if;
@@ -2441,8 +2443,8 @@ begin
                 set_x := '1';
                 if exp_tiny = '1' then
                     v.state := ROUND_UFLOW;
-                elsif exp_huge = '1' then
-                    v.state := ROUND_OFLOW;
+                elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then
+                    v.state := ROUND_OFLOW_DIS;
                 else
                     v.state := ROUNDING;
                 end if;
@@ -2485,30 +2487,20 @@ begin
                 set_x := '1';
                 v.state := ROUNDING;
 
-            when ROUND_OFLOW =>
+            when ROUND_OFLOW_DIS =>
+                -- disabled overflow exception
+                -- result depends on rounding mode
                 rcls_op <= RCLS_TINF;
                 v.fpscr(FPSCR_OX) := '1';
                 opsel_r <= RES_MISC;
                 misc_sel <= "010";
-                set_r := '0';
-                if r.fpscr(FPSCR_OE) = '0' then
-                    -- disabled overflow exception
-                    -- result depends on rounding mode
-                    set_r := '1';
-                    v.fpscr(FPSCR_XX) := '1';
-                    v.fpscr(FPSCR_FI) := '1';
-                    -- construct largest representable number
-                    re_con2 <= RECON2_MAX;
-                    re_set_result <= '1';
-                    arith_done := '1';
-                else
-                    -- enabled overflow exception
-                    re_sel1 <= REXP1_R;
-                    re_con2 <= RECON2_BIAS;
-                    re_neg2 <= '1';
-                    re_set_result <= '1';
-                    v.state := ROUNDING;
-                end if;
+                set_r := '1';
+                v.fpscr(FPSCR_XX) := '1';
+                v.fpscr(FPSCR_FI) := '1';
+                -- construct largest representable number
+                re_con2 <= RECON2_MAX;
+                re_set_result <= '1';
+                arith_done := '1';
 
             when ROUNDING =>
                 opsel_mask <= '1';
@@ -2527,6 +2519,8 @@ begin
                     -- denormalized result that needs to be renormalized
                     rs_norm <= '1';
                     v.state := ROUNDING_3;
+                elsif r.result_exp > max_exp then
+                    v.state := ROUND_OFLOW_EN;
                 else
                     arith_done := '1';
                 end if;
@@ -2540,49 +2534,40 @@ begin
             when ROUND_INC =>
                 set_r := '1';
                 opsel_a <= AIN_RND;
-                -- set shift to -1
-                rs_con2 <= RSCON2_1;
-                rs_neg2 <= '1';
                 v.state := ROUNDING_2;
 
             when ROUNDING_2 =>
                 -- Check for overflow during rounding
-                -- r.shift = -1
-                v.x := '0';
-                re_sel2 <= REXP2_NE;
-                opsel_r <= RES_SHIFT;
-                set_r := '0';
-                if r.r(UNIT_BIT + 1) = '1' then
-                    set_r := '1';
-                    re_set_result <= '1';
-                    if exp_huge = '1' then
-                        v.state := ROUND_OFLOW;
-                    else
-                        arith_done := '1';
-                    end if;
-                elsif r.r(UNIT_BIT) = '0' then
+                -- r.shift = 0
+                if r.r(UNIT_BIT + 1) = '1' or r.r(UNIT_BIT) = '0' then
                     -- Do CLZ so we can renormalize the result
                     rs_norm <= '1';
                     v.state := ROUNDING_3;
+                elsif exp_huge = '1' then
+                    v.state := ROUND_OFLOW_EN;
                 else
                     arith_done := '1';
                 end if;
 
             when ROUNDING_3 =>
-                -- r.shift = clz(r.r) - 9
+                -- r.shift = clz(r.r) - 7
                 opsel_r <= RES_SHIFT;
                 set_r := '1';
                 re_sel2 <= REXP2_NE;
-                -- set shift to new_exp - min_exp (== -1022)
+                -- set shift to new_exp - DP min_exp (== -1022)
                 rs_sel1 <= RSH1_NE;
-                rs_con2 <= RSCON2_MINEXP;
+                rs_con2 <= RSCON2_DPMINX;
                 rs_neg2 <= '1';
                 rcls_op <= RCLS_TZERO;
                 -- If the result is zero, that's handled below.
                 -- Renormalize result after rounding
                 v.denorm := exp_tiny;
                 re_set_result <= '1';
-                if new_exp < to_signed(-1022, EXP_BITS) then
+                if exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then
+                    v.state := ROUND_OFLOW_DIS;
+                elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '1' then
+                    v.state := ROUND_OFLOW_EN;
+                elsif new_exp < to_signed(-1022, EXP_BITS) then
                     v.state := DENORM;
                 else
                     arith_done := '1';
@@ -2596,6 +2581,16 @@ begin
                 re_set_result <= '1';
                 arith_done := '1';
 
+            when ROUND_OFLOW_EN =>
+                -- enabled overflow exception
+                -- rounding and normalization have been done
+                v.fpscr(FPSCR_OX) := '1';
+                re_sel1 <= REXP1_R;
+                re_con2 <= RECON2_BIAS;
+                re_neg2 <= '1';
+                re_set_result <= '1';
+                arith_done := '1';
+
             when DO_IDIVMOD =>
                 opsel_a <= AIN_B;
                 opsel_aabs <= '1';
@@ -3201,14 +3196,12 @@ begin
                     arith_done := '1';
                 end if;
             when RCLS_TINF =>
-                if r.fpscr(FPSCR_OE) = '0' then
-                    if r.round_mode(1 downto 0) = "00" or
-                        (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then
-                        v.result_class := INFINITY;
-                        v.fpscr(FPSCR_FR) := '1';
-                    else
-                        v.fpscr(FPSCR_FR) := '0';
-                    end if;
+                if r.round_mode(1 downto 0) = "00" or
+                    (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then
+                    v.result_class := INFINITY;
+                    v.fpscr(FPSCR_FR) := '1';
+                else
+                    v.fpscr(FPSCR_FR) := '0';
                 end if;
             when others =>
         end case;
@@ -3593,6 +3586,8 @@ begin
                         rsh_in2 := to_signed(64, EXP_BITS);
                     when RSCON2_MINEXP =>
                         rsh_in2 := min_exp;
+                    when RSCON2_DPMINX =>
+                        rsh_in2 := to_signed(-1022, EXP_BITS);
                     when others =>
                         rsh_in2 := to_signed(0, EXP_BITS);
                 end case;
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 5c46b6f..5f0131c 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -682,6 +682,9 @@ struct roundvals {
 	{ FPS_RN_NEAR,			0x37c12345b0000000, 0x37c1234400000000, FPS_FI },
 	{ FPS_RN_NEAR,			0x0000008800000088, 0,			FPS_FI },
 	{ FPS_RN_NEAR,			0xc2000000c2000000, 0xc2000000c0000000,	FPS_FI },
+	{ FPS_RN_NEAR|FPS_OE,		0xefffffffffffffff, 0xe400000000000000,	FPS_FR|FPS_FI },
+	{ FPS_RN_NEAR|FPS_OE,		0xff0000ff43434343, 0xf30000ff40000000,	FPS_FI },
+	{ FPS_RN_NEAR|FPS_OE,		0xfc00fc0139fffcff, 0xf000fc0140000000,	FPS_FR|FPS_FI },
 };
 
 int test8(long arg)
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index f68ea11be39a32ba6dab54ab64e3109f8f4b185a..229e70f68afa50d2af321912e16afa1f774477ee 100755
GIT binary patch
delta 548
zcmZ9GO(;ZB6vxlKGlO}N$5WanA59@&kx6-K>h^2~8D(L?24!JkD>4`wdUiA2%`7Y|
zC8?P!Y-A&a6dN0|V(~0|WNGr8`)2ZW>fU?)=XdV;pTkv}TBBtEECkRuep=W!hwo%8
zS*B5lC+j;XoTNvdOMR*c=m5PIfOP>d)>R9!2w-CJaa9XRY6=58&XAtpUfFiPr1BKy
z*=}TW31Itn??2;dwbEl^SDas}Z8V$_pXv=!OhM5N##F)Y%ZG`{JGJI2G_ok#qnQQa
zrs$o$!e&1nmRz#N7qkYSf&_;-bR$pQ4n5LpyhXxTmP1Ycs0Ed{n26<86oHB%MOx2P
zuN68g)a`p{dq(UxcX()cS3I>xDDewDc%YD&cnkGYvcf~G!J86+bvYx!r`i0E5^rpj
zkOUtv{3OMv41Y-im)dbdbbF93BzV%$>SV<QOfQB;{hMG-{jK?#QG*jKBUqBN^0qj3
zqOs?=dz4`=!_LJ^PQe2}fs?X)p$EgpHk1sncL4f_+gV#DS;$I`?<@Z2*Q8abi(Bq5
DCyBEw

delta 456
zcmbQy#<ZiANsxiT!Ipu+EAbe+S0#{6Wtb=^%+%04QO$^{;qb<&cl?qYHYr&$FnGRX
zV0d|&fg!;{fuX^V0m$H(oG2h{3zFs7>;zPC5-5A>_J#ldy+N`dyaFn>&Tm0I1H-D@
zThQeuZxj$`<=Cub$ujw@fIicUjhp`oTw~<o*vtg9jUi#frpfn&!WcOwn+j{o0xf#U
z1hgJ#*-Hiwt(rUkn;a^BKdEO1iA}B*R+n70$>}A~FtF?#s4NRucCWBJ=ZZ~EjUYog
zCO;K6=3?2L)BthfW+jmzE|Aq_a&7EjC#Y^?WxB9&^EPcGR;CwgH@`CQV`7xpY-zlm
zi;-dTM_X?uMyAQO4(gl|o0U32W(Z8qb*M)0Hh*-8W@HqY?CT`WC^0$HNt#yx=%_U;
z3^l9#9cl$8PjnJi1&J^-I9T#9Xj;N$fMNp3{K+Ss<bfu<bTa0JSjDLWwr;bg^M0$z
N*BCW67u2m_0RS}Zmo5MR