From e60840eabcec6c2fb39fa421f9ee69a23d44a7d5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 18 Nov 2025 22:42:37 +0000 Subject: [PATCH 01/38] FPU: Make sure FR and FI in FPSCR get reset on special-case arith instructions Arithmetic instructions where the result is determined without doing any actual computation (i.e. the input(s) are NaNs, infinities, zeroes etc.) weren't resetting FR and FI properly. This combines the two blocks that handle the r.cycle_1_ar = 1 case to fix it. Signed-off-by: Paul Mackerras --- fpu.vhdl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 16583cb..e1bc8fd 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1354,12 +1354,6 @@ begin rsgn_op := RSGN_NOP; rcls_op <= RCLS_NOP; - if r.cycle_1_ar = '1' then - v.fpscr(FPSCR_FR) := '0'; - v.fpscr(FPSCR_FI) := '0'; - v.result_class := FINITE; - end if; - case r.state is when IDLE => v.invalid := '0'; @@ -3077,6 +3071,9 @@ begin -- Handle exceptions and special cases for arithmetic operations if r.cycle_1_ar = '1' then v.fpscr := r.fpscr or scinfo.new_fpscr; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + v.result_class := FINITE; invalid := scinfo.invalid; zero_divide := scinfo.zero_divide; qnan_result := scinfo.qnan_result; From 7b1febcbd3f2a5e52c1c446778c0877e7e124f8b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 18 Nov 2025 22:49:45 +0000 Subject: [PATCH 02/38] tests/fpu: Check setting of FR and FI in FPSCR by frsp instruction Signed-off-by: Paul Mackerras --- tests/fpu/fpu.c | 54 +++++++++++++++++++++++++++------------------ tests/test_fpu.bin | Bin 32896 -> 33072 bytes 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 5e45038..4ecfd2a 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -21,6 +21,8 @@ #define FPS_VE 0x80 #define FPS_VXCVI 0x100 #define FPS_VXSOFT 0x400 +#define FPS_FI 0x20000 +#define FPS_FR 0x40000 extern int trapit(long arg, int (*func)(long)); 
extern void do_rfid(unsigned long msr); @@ -653,29 +655,30 @@ struct roundvals { unsigned long fpscr; unsigned long dpval; unsigned long spval; + unsigned long fpscr_fir; } roundvals[] = { - { FPS_RN_NEAR, 0, 0 }, - { FPS_RN_CEIL, 0x8000000000000000, 0x8000000000000000 }, - { FPS_RN_NEAR, 0x402123456789abcd, 0x4021234560000000 }, - { FPS_RN_ZERO, 0x402123456789abcd, 0x4021234560000000 }, - { FPS_RN_CEIL, 0x402123456789abcd, 0x4021234580000000 }, - { FPS_RN_FLOOR, 0x402123456789abcd, 0x4021234560000000 }, - { FPS_RN_NEAR, 0x402123457689abcd, 0x4021234580000000 }, - { FPS_RN_ZERO, 0x402123457689abcd, 0x4021234560000000 }, - { FPS_RN_CEIL, 0x402123457689abcd, 0x4021234580000000 }, - { FPS_RN_FLOOR, 0x402123457689abcd, 0x4021234560000000 }, - { FPS_RN_NEAR, 0x4021234570000000, 0x4021234580000000 }, - { FPS_RN_NEAR, 0x4021234550000000, 0x4021234540000000 }, - { FPS_RN_NEAR, 0x7ff123456789abcd, 0x7ff9234560000000 }, - { FPS_RN_ZERO, 0x7ffa3456789abcde, 0x7ffa345660000000 }, - { FPS_RN_FLOOR, 0x7ff0000000000000, 0x7ff0000000000000 }, - { FPS_RN_NEAR, 0x47e1234550000000, 0x47e1234540000000 }, - { FPS_RN_NEAR, 0x47f1234550000000, 0x7ff0000000000000 }, - { FPS_RN_ZERO, 0x47f1234550000000, 0x47efffffe0000000 }, - { FPS_RN_CEIL, 0x47f1234550000000, 0x7ff0000000000000 }, - { FPS_RN_FLOOR, 0x47f1234550000000, 0x47efffffe0000000 }, - { FPS_RN_NEAR, 0x38012345b0000000, 0x38012345c0000000 }, - { FPS_RN_NEAR, 0x37c12345b0000000, 0x37c1234400000000 }, + { FPS_RN_NEAR|FPS_FI|FPS_FR, 0, 0, 0 }, + { FPS_RN_CEIL|FPS_FI|FPS_FR, 0x8000000000000000, 0x8000000000000000, 0 }, + { FPS_RN_NEAR|FPS_FR, 0x402123456789abcd, 0x4021234560000000, FPS_FI }, + { FPS_RN_ZERO|FPS_FR, 0x402123456789abcd, 0x4021234560000000, FPS_FI }, + { FPS_RN_CEIL, 0x402123456789abcd, 0x4021234580000000, FPS_FR|FPS_FI }, + { FPS_RN_FLOOR, 0x402123456789abcd, 0x4021234560000000, FPS_FI }, + { FPS_RN_NEAR, 0x402123457689abcd, 0x4021234580000000, FPS_FR|FPS_FI }, + { FPS_RN_ZERO|FPS_FR|FPS_FI, 0x402123457689abcd, 
0x4021234560000000, FPS_FI }, + { FPS_RN_CEIL|FPS_FR, 0x402123457689abcd, 0x4021234580000000, FPS_FR|FPS_FI }, + { FPS_RN_FLOOR, 0x402123457689abcd, 0x4021234560000000, FPS_FI }, + { FPS_RN_NEAR, 0x4021234570000000, 0x4021234580000000, FPS_FR|FPS_FI }, + { FPS_RN_NEAR, 0x4021234550000000, 0x4021234540000000, FPS_FI }, + { FPS_RN_NEAR|FPS_FR|FPS_FI, 0x7ff123456789abcd, 0x7ff9234560000000, 0 }, + { FPS_RN_ZERO|FPS_FR, 0x7ffa3456789abcde, 0x7ffa345660000000, 0 }, + { FPS_RN_FLOOR|FPS_FR|FPS_FI, 0x7ff0000000000000, 0x7ff0000000000000, 0 }, + { FPS_RN_NEAR, 0x47e1234550000000, 0x47e1234540000000, FPS_FI }, + { FPS_RN_NEAR, 0x47f1234550000000, 0x7ff0000000000000, FPS_FR|FPS_FI }, + { FPS_RN_ZERO, 0x47f1234550000000, 0x47efffffe0000000, FPS_FI }, + { FPS_RN_CEIL, 0x47f1234550000000, 0x7ff0000000000000, FPS_FR|FPS_FI }, + { FPS_RN_FLOOR, 0x47f1234550000000, 0x47efffffe0000000, FPS_FI }, + { FPS_RN_NEAR, 0x38012345b0000000, 0x38012345c0000000, FPS_FR|FPS_FI }, + { FPS_RN_NEAR, 0x37c12345b0000000, 0x37c1234400000000, FPS_FI }, }; int test8(long arg) @@ -696,6 +699,13 @@ int test8(long arg) } if (check_fprf(result, true, fpscr)) return i + 0x101; + if ((fpscr & (FPS_FR|FPS_FI)) != roundvals[i].fpscr_fir) { + print_string("\r\n"); + print_hex(i, 4, " "); + print_hex(fpscr, 8, " "); + print_hex(roundvals[i].fpscr_fir, 8, " "); + return i + 0x201; + } } return 0; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index b2a293c223beb4f42d028d6301a081fde8194082..09be7e4f66691e241c7320c43798fda38093f4d8 100755 GIT binary patch delta 6298 zcmai24Nz29mcH*n^J9d7AQBszh8E;s6p+_8KaGMkAOwwUbT+z842dSuO4doX3Eg(% zv{PnEEuEuPB&!BowKS_1y_%{xrCBv`*KDHkcgn;+@E4*OUCj)RKX1SL`XR5K*~zPF z?)}a=_uO;NJ?Gr_@U)3tOT{$CtR;-qH2qg-O$)|mPg_kBV|LKq0_`o(-fCt`*QDhO zht~XRf;g?zx9ShiyVHFsX2z=f80-56(vp#R>KP`>6?)&fTe4isG5>Md<-W;~{~EIY zbMCl?XHO0(weR^)s-{9S@_kj1ibL~UCP-P>pkB&vn-aZ%e9W1o869wSKO(zEH*2`18K(0Qn^=J=okJ-L*DV4S4!=rC8Gm`w>5jE3pW(a;acI>z0$fH6sa$Y9vITx z{2O=-z)9Eu4wN2^N^1F^cKe0kD;>TH-4keaLa=a` 
z5)=MKXu;pqP`i_b&$IW@!Gztx=hJY~cSgk1>O|x44uC|HKg;QOQo>kS_DU~+$aB%5 zcR3wdUpOa4eKM1=r0g-wW3{qm$#L&Xk`wju+ufz18I9pAJUfB4Szl%uW+OATbf(Y* z!-NqlQ<=>NB|OKkrY&*UyXbub>TWS@HB1#NQt7tAKwZ{g8lIGnaw|+~EI3pqGLK|m ze9^))-Er@*yYPuzo5z}Z`8*1|RA7O*r3P5BCv={3#Tfb`DTyj$pHGNnR`b@NY8T%x zZ&ix5~6?Z8}s@?|J9uAIBbm7k_&qmd#eJx^zt`e(^W0x+@u?~9y;}T@$-P~RzYKDJ1dRGG z=HAa=yhY-Amw_g*C6e85l;}J_&B?8zE`=^92U@ce#mKD*!zv^DP3wjLJg;?S0G`t7 z3c#k;g#kFIbq+AM#>lR;a%;F=-Uc+*1UbFQv?=AyD8`KDG|xM(FxIrD$B6tIW2#O0 ztAID5)c1t?1J1Qgr!}TgV!M$(HH{O3=m*mu!tL0|F(t@VNgKwr>Nq+~iK#^q?<}5& za0mk@aH3{vN{z`~p!a8O!yUIXkIhp~tWl*+LKCc3=XnU+=LXu8TB;jqV74|&E{HM7 zvd3cYI%MI=;kfsM+sHaC!EAl&)wI6WFt+4oY|SLnr%iI7&2YZLmX^!%tHr!2J77gT zM01qV-BipE$ipqhVK;#m!VXyMi`QGinXA`vWtox~)r#!JqFcBS1SVTN%0XUndlQ*0 zoe6!8%b70LknZFcMjo)0Wwvm<0ysyOt>(yq>K2KWE4PXRM5HI?>CU$?w<+^XryX-2 z?M+Kcz4K(Lm3!s!-+QPACU`>dhgpl+@yXIwG0fRXkJHM894Z;B7outQSiRwAq`r}r zKYa4vu9-&Gb7WNsdy?=%_oH3^+3o!PF@2Dc>0S;$hqI+to)W9M&o51JasLnh%rEVP zY)(%{sd66V5VK9id<~cBgiI-Tj3p{^&&u#*a4CB93SVMPo@Zp@l>D0wA)bG$)e>Dn z)6+@VOV`rN5-U+Deji$&`Y<=C;}K(f&t_10hF*xEIT?EK?!&+9&xL!x?9X1v{E|QG z6g|J-&$|%P`m>qLnSU1N#M6YViK4BG-praI?4feHq{|&Elu!=-j-yF|?9v_TkJXRJ0BTda zKI7G}gFeoi%uUKmn$5SfFP1HJxA;k+t3SM^l*lq*o;rj0Jz%BltNdnQq{LHxBe1|q z&Hhp{xzY7@1*I#0xNTy?MhObTHi=T9`Z&`_@3bIXmuJ=Xz0yJs6{ z+C7ic{=!k>&Ik0z!l~SuiTNndkUwMy;`=QZPzE)3Az2?qLVV!F^o z{S)&eE}*#lIoa%tqBZ820JS~W0@a=X)~YqCY9st=*A}G#Fm>}SAdJID}lNe0&8{c3-{~V{xe0Ir=sB{PkK?e5;-4F$@wGSxU9Z&AbI&a z8WMQTWmTctOWJ_73H>gxHldqQjewzOM3K$Q1yk` zw9)C%JDn}$vgoS^J8Wf7V25?7c(B7RT=KyOv>^ADLj1?j?0fAY-B`S^t?f}kF z@$Ojh=7Vm1HS|Lg(}u_KJXl1G#!bB;`o>P zUhSpsIAbKwg?3-chkRV^qyH#t4Ie@its8~objG?b(I-AgV6220!u5x`P{vom>m^nXv(dt@_xp=6iB^aNIT3Qgk-G)CyiweLLIXMYmiMf;WBNQvb zvsJqmO@1@tef-~vKgNGF-^y}>hGxZlJ!m^LG<@c= z0um}>;;ctyd5p^8qZ7Y|qm)^ZCO$e!Gb%E53EyEYwN;FYn=vesO}=#T!ugQ$=MW73 zvd!ARqg@pS@xXU<48vE)={AO2$0@SXAjDEmWs*=#b(KltYbR(?r9oVAf>vSJbAonZ zIDdkUK`xm3A(u@NQ4_9i5saR8T*} z-#$f=RR(eGDaysL?-bQzxO$2fRT;4H%BrNewWnoSF&6e3@rw;cI8M`{suZ#JG+nPs 
z6T&FMHcou)43*l_n$sb203v74sFAG;FK3?!8^tM+4bkOG?EVlS0FdL+z6IH}f0X48 z%vlv=%QpmV6h9FlTOMg`R3qk>mmrd$V=R~s+KP0ul-Xlj{@gs!*K+i|ET86jrb17& z&8=0{FbAqC5d4tx5F0kP@EZeKF=+Go@(OPH$07f!TOX`(z$3h?dT{>S58MeWL|?(Wx6jigd^_gV z1nW8gqsUxSmU|VmJ(q^U%`{!PIs|me(pwD8l#- zXh%Tvsx%&EFK7}a)@Ft5!K~vd)z@Z5@VNB??ztw*#WcM(KHmOMPtE1lDgh6J0pzPK z3Ymr3zzw=xo2grl=~t9EElc+~X6;lrEi0rKGwDaFpOzUCAC3h0k#DQq|#t6&5r*2hi_DxlC7B9{iDdqsMx>H|NRHtcbplXAUe@93PJI&q`VK} z=MS<3RRr6-|F^aghqf8yd;D^$YGnh(%t3yDnY8+Y(Tn_|sw5v8)UB*qj|NThdwduF z4Cfv{;G^;kdVF!;$Bus}_>005BT)117e|jn`QgZ7k9A==e|DQB@T1O}Um9Jfurh`R%hYDpBamG2j&$ez0{y*31 BKl1

4>L%D(Z@pA#zBrD5(L}e;KJ;(HDX4Z z5gDf-g(#E@E}X>}yupPXz(WCBYnP#fKZ%?GD~w=q_(Xt&B@4U|U@Zlq-t&4I)I(B9 zSGC?d=ia{ezWeTd(=Fcy(bFnsFlH@btg7*MVO32SS9m;Cjf^#bdkx%c;9gt7TH7;b z2q)TKoG$J!Ike@Ir`--;nuW2-0mcUY2yMy4JhcpyjS4?-@unl! z4;Q;MvNfvgVC`-1dX5um_6HdN#0;}Jm_@?DIB##XC$qK2y0QqJ5_P8 zUwk~;Fm8=+ZmZuF-H4u{^DXmmw8lqiG1I5@lTw&z!U&oZcX!J^ND(oQ*d;0~iRKnS)l^XWKVy|?+pBs)!)zI6Pd8Gq> z`kUb~%aYA2t?<+B8hZRsy;6#wuGG-sE46C<@=X|-SZZ4KSCz$>NSpq7LHWyh;t z>2jZ9S*KAhUx`iVqlm=9$p0I&tY77op7AR;X=wJGR|@H)U5SNy4SNafF}j>wC=5{B z#B`cH@mb*w%yNXQ^zp>Q!X#RqQWV;s&g|>b=wM353yqla+H)kgxEwjhT(kA>KF^pj zcM|hhtt?e?4E~2(&xHPddC%Z&`NZIme0A{spm3K1y;U&smAf=7YmtuWawoD@>yKEL z#l*}_!>N>Qm>zX*60;vt%9=IwX0$kbAJOZEJh5Xf?J~>@-V7;iE>8HBF_^-QnZh2L zW?VG$y<(Ahq>ic& z0q!K9>P;s4#`rXCH0i}R)9BZxQg_GNV@3P{$KYOfANq@rJVM9ypS#a#@x|rtFSYn= znfs^~|IzH;tHqxOxwiw$@_WTR3!TUYx7!L%U@ebUF1muLKGi8!rqcP;z-T@|oAPMF z@sx@E-1&9@Uhd@GJti-6HU(g_b7cSya@GU$XiV(7lSjktR)Et)6XfbN(OYS&C^tP^ z&zQ-Q;aSrh&Klc!;z@6=P76qSK9ar&J&&>fNZ98}q!rX`ju&ql=`HhAL8RZAKhY_L zP!`l|p-q#VAsl^7y7a=@uC>dN5@F~Jc617Q^ZbKRcRu!O9FvWFzm6vPPxiXoZdX2A zuI%9+m79&zal1Ot$Nt_+p)Kj9;-VBfot|X$ee_uouNTMNi*8gzQ83%-?9CW(hO?yu zF;ztrmNEOMby=<$w!tPaxyYjw$aVMl6lTw4!hqvib~CHXboJsVARpMuvYR;W29{;3 zC1R+uNn$qTXQBff3Jmj1*AdL^$~@b(5A$KVm0{FLnDJmdG%zh$NT+F&=LqZQjmgPE zJ+)0v7E4Cy!^ufv>L?w|%64Bsdd4&E=e^<{%pK2_HS@X=a6T|LF_-h1=p-(Ko{eMv z4A;4QU#>2NjIk6&?@1lL;~ztZv1COqu}04mI`K+bY=;sri>+$s=#%O7>{WC-GgBy` z#H=|&8`Wner@Vn$HZj|S(cinCH?gaoTT0ldA)#aB;O{$Ji>N&-S+tD&!$l?D`{#?g zfA^m+>I`)L!J^`nMIC^Wc2SwMI{OFlYCOH1Qy}j6h<4>XE8M3$Imr_o@x1Uxl=YmV z#N0w*J1xw8MVvBBpX9zQd`YQO7Vs^c@~&`>BJ(B-2griIZ&D-vE}>0%IpVY*4*zG~ z>w-{8>G?U8{E+EHwXeJtc);vYG2a+|2(@#J!-BUpKSEo91ynje!Fs}$J7VbVd~?)+ zC}wX}k7UL=ihRGN)A@6GDASDf+)c+?xoFc(zBi5ijt70pd3FMr*Uck*2$-M9kMJH~ z)Wc)k4lJ-zi@!I_%_iD3%~`lQa_rZD*YxfWJ}cr4##l>dINQ~^C54S9m^wxnyL{m& z@@+ylIFeV;oawQ1{gGWMS~-YC-u==Y%omNt?g3UJ1;?wn9avyw|QJRf|WP2jaT zSP87nK|QcG2bI7)2mUrNH+NCV%tA47g#J*JL|bR(3aNB#W|3Hbhel@^UhG(V1zxau 
zSjWPFz%|)`xo6(3JR%pT_QGQ~x=xl^0g=`;O-PD_BFbeaWD>$HQG6vPXO^u{bR?I@VUS5z=#LgW8F-a40_ zrp=Bn(E6$WHqcKcu-0bJc-}r9sd8A|uv}pwwH3|~+UZV#F~HQO8-b>pfVHOds;Q-{81gCtI~Y&=r~#`B~yhZS{%MfTO>n( ztvkJewypweZ8^sIZC#}Umigj^VQ(@}6q78}hU}xf(vux*pz0{A64Ltj+$tP|7ZOQLOK$UzB9H`~7bb z{g2*h*Yt306W$l`Etu?f$?~$|+tBg&(%Kh6#`6#Yt3&*-K%|?%y)UkZp1(EOzwDLP zDV*~$7Y?owTwAwXJqsPg-#EBF+BjPIZNY2I;{c={DlJ{+9x2C{6<%}sw_FI$@bU`0 z)T=lY_|40k^4G2W*}1m6}yN4tOK*Y85{P+y%Tu#p79nTS~Cw zlS3sYK%oJ+M8y%n`+%pY_-WwVz$q%$151@&X@ZI)fjz(vl@Hqu#ZkbWz_(N!4Xm%i z_mGNXn7ct*4C0hZ#6n>=@G%v~0rvwRP;oqPel;?n;soH0b;y8<6M?^2hzzJW3D^Ps z)hbQ~?f_l_j2|6|g<<Z50^PTd$fZ`~zy&^p_vDL!#{ zB4Z`Y6yEr&vZ`h;Ua$BQyqb3SsCI601^zd%KbW~bTwrb9in;h%RZ}p&m$Y0EzF)>a z!RZ4q9h_^#|0hn*oiRa)LYJ$?6;ZZ|HkREiDLyI7c!^UkyoNI_*PoK*IzPoYXT@Cu zZoh`ZHy`VzL*>zNw;`FNO8DsF^V9UF@(gkRX%Z{4Lr(s;EFYn174dP;kBeY)|9s{0 zrBK^bSUzI{V|87$w89`dx~K(%>kRF}@S8Jqyuu)yq{|gX;T}D#Fp3kuCc`{~SpGGY zV))tDv=qa(uc>98LHLy3nP(Jw>DWA@IQ}eMfn?@c8pZI^Su)Hwh~BeQiedj*S~}k# zbW`(uqcB46%r}aYy6HG1HQjUt!@qXZD2AQgWUw3HxWsOZt2rmjO0bC6ggYjfvFFdx zT6>zf`y9Px&%l-*vriQ#o~Q5a8B2H$I-s)kye#)(&T4cv_O4JC(GY2CX;2b?)iag} zQ9tx*{zsPgalJ}KZ!@I zZ&PrwAzL1vb;?Z zA{LAkxh9fTGs`p;X5tEbnrMR z%O@3`Ht6Jo*%`nzfRS!$bV9a*>86m{99;)y_ioAZ4&F#qSF6|E2J_PQ8XqC1aY!|7 zugwXm!^}rt*3QWK*`2Y}@O1GkTmt98$gfP?R@`<8?WdX#Ib!j|JU8E!Xdc7c!ZGAkM)+dm4eG)BO9|z{d z`gnYwH-hf~cMUSpnJ^Wmah>|@rdKM=ckusL)=*&{#(z`%*OgCy?Ra74UlpE@L6rJ( o3jM+vL+?4`sKc26i+h~$LJ*yBCQ_O!f#%{qD`Xtr=-L Date: Wed, 19 Nov 2025 13:32:14 +0000 Subject: [PATCH 03/38] FPU: Fix setting of r.x Having computed rormr, use it. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fpu.vhdl b/fpu.vhdl index e1bc8fd..4662c9b 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -3255,7 +3255,7 @@ begin if mshift >= to_signed(64, EXP_BITS) then mshift := to_signed(63, EXP_BITS); end if; - v.x := v.x or r.r(to_integer(unsigned(mshift(5 downto 0)))); + v.x := v.x or rormr(to_integer(unsigned(mshift(5 downto 0)))); end if; asign := '0'; case opsel_a is From 577bbb8f5db64ee2c48a52e51582d23e1b48ff16 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 19 Nov 2025 13:36:04 +0000 Subject: [PATCH 04/38] tests/fpu: Add test case for denorm input in frsp test Signed-off-by: Paul Mackerras --- tests/fpu/fpu.c | 1 + tests/test_fpu.bin | Bin 33072 -> 33104 bytes 2 files changed, 1 insertion(+) diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 4ecfd2a..ed2b619 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -679,6 +679,7 @@ struct roundvals { { FPS_RN_FLOOR, 0x47f1234550000000, 0x47efffffe0000000, FPS_FI }, { FPS_RN_NEAR, 0x38012345b0000000, 0x38012345c0000000, FPS_FR|FPS_FI }, { FPS_RN_NEAR, 0x37c12345b0000000, 0x37c1234400000000, FPS_FI }, + { FPS_RN_NEAR, 0x0000008800000088, 0, FPS_FI }, }; int test8(long arg) diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 09be7e4f66691e241c7320c43798fda38093f4d8..35d5f45ff2e11747fcad8d954deb7de6fd06b1e0 100755 GIT binary patch delta 152 zcmdnc#B`yFX@iFVqsL@V0c}~2jY^gs8=Wi}7*3vMU^sRA!vFu?3=9kjAnY)CqJT7` z!{oIB{)`Qq{|ek^1Zuh~)C?pOg`F9lCNC8>2TI=svK=-nigHwMy!VCIuvt;Wn+vEaSFV~JNWE0u%*yDo fd9JoTE0A~Az@LdxY_qJfx98-0K(#AsR Date: Sat, 6 Dec 2025 11:15:11 +1100 Subject: [PATCH 05/38] FPU: Fix setting of r.x for single-precision operations The fp_rounding function expects r.x to have been set based on the lower 31 bits of r.r, not 29 as presently done, so change 28 to SP_RBIT-1 (SP_RBIT is 31). Also add a test to check. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 2 +- tests/fpu/fpu.c | 1 + tests/test_fpu.bin | Bin 33104 -> 33136 bytes 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fpu.vhdl b/fpu.vhdl index 4662c9b..0246faf 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -3239,7 +3239,7 @@ begin -- If shifting right, test if bits of R will be shifted out of significance if r.longmask = '1' then - mshift := to_signed(28, EXP_BITS); + mshift := to_signed(SP_RBIT - 1, EXP_BITS); else mshift := to_signed(-1, EXP_BITS); end if; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index ed2b619..b89674f 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -680,6 +680,7 @@ struct roundvals { { FPS_RN_NEAR, 0x38012345b0000000, 0x38012345c0000000, FPS_FR|FPS_FI }, { FPS_RN_NEAR, 0x37c12345b0000000, 0x37c1234400000000, FPS_FI }, { FPS_RN_NEAR, 0x0000008800000088, 0, FPS_FI }, + { FPS_RN_NEAR, 0xc2000000c2000000, 0xc2000000c0000000, FPS_FI }, }; int test8(long arg) diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 35d5f45ff2e11747fcad8d954deb7de6fd06b1e0..e75740c0a82a2075a48977cfa7e0d1a7b3563f8b 100755 GIT binary patch delta 4616 zcmai14NO#57QSy}fU$rx{D>Bi!2yK=6~xLL2Xv-@9exD5mQ>Qx4yA0dyKXVI*`YAb zV5bQtu+cq|XjZItOM{6dC1@hMWf!)(v0K?lV&Hm z_ucQi=bU@)x#zrhXLz4xW}oIjtKjHq`|~oPjtcduadkbb%`H98ni>223&#HO^{^xr zGsYaeIq0B1MsQHKzDzjYJFovzFtlbeJD+Ol(cIzBIvNZAZ75UN1*SS8Pk4F4n!|6`QB%pUa;ndsK+wq-rX1n^$~>YNLv^!6jZ1 zsu+1NYA5aT;sEwEvy*kRF?|2SMQg(qUNP!hSlei@spxYKuQ)_Q%d-s+{<|m~@|stC zUuJt$toWi=%;jwOP8Dmc^osX=+;Dh;iniBy#qI@qmP_YNz@QU-dWZ@JQ?WpsL zXKvAV5dLA XOY+_fNlL`9F6dBvn#w0DJ{aN16>if72%Y2QM%dpO(lxRx3*ool}2w>zUy5NhhXeq`|3K z>BiK@OM;u6D2I)6pSw2)rBvvcE-jWdTVG@;W+O8-bW_TzEJJ$|v-ik(%^Ww^>~$We zU8~j!_0+NIxquSPQESeM-^T|~aD1|Gf->VPvOdhwFppTjr&Hv4Ze=HC%|A-T8LZle zu}_b=6kvh5MF%`P9aPhFl=|Y0^dNj$Vi>cU;oC#_`0(Q;g)Uxk8NXYN@*TIp|5?6A z3NV`1`>kQb8V6Bxc>0|8K649LqZyd5!FSKsn4wCekw)T|QHxP86wndl7I*!gehZg( zPIbB`QC$4V{c}#d=f0xGU)Q+LtMONP?lWrqg~{Ei#-A;5w*yPkhdDe3PQ-%SZ3ZW> 
zy^r>-X*x%h35|k|t|a(7a}xE*oe74Lk-gV=#1Gdr^0Ho>&ueV(!=}caet1b^B`|l! z$R--OGu&i&zRA?##8SKX4MCI)JbnDUhNn03`D#FwT`FnbkJwA zqXpzLMGL2>-IO6{=Es;9TjDa){=`NtM}4GA$`1AHsX;u1*-JRhblR4*ML11u z=nXWK6eAp`J4tVBYk|xPcEXY%7uJV4JbnjwPT?#nzy-1aOZ}ds79Df>oD=WM=awtW zUdx`A_E`lcSv>ODPq;nQv4(^W%3S+b_hHxwVf(B+9LKr6?RhS4yYD{#FvO;DJ{h^n z=OAa}nCozvcF1hT9Aj~^+~YDlth;49G4i~~sy&a%M9ZoD!lY!t>wyS~i4u*Nk@+!Xx0citVq{f564!19{lcs3LO3v7$oAs?d2b%@&5 zTT<%xT(fWsC%^UAzM)__3V0h|qVLy7NA-afG+WGX<2}qX*2u0|R#04)NjO5btUpxq zV&yR#yTc3kn=O2OzBZ2m-`jcc>V2O9R`0tLSiSFdV7~8##51`@DR+ZgNT>b{FWF?z z`MF?!9X|iPwgK~Xcs20#xnB+7`2Ne4{SQ;+Gp}kg!{{#?o}|mq*!>J1x$SRoKk(xQ z-vOsKc#*0$zRI23xIJC%T<0x+TkXJVy=ltLE)%M#E!$u3%A~*Ed0@3(EQN|?LK{_y zWq#}F9@Bep(_gQgM#M7B;8J=;jG+Q^mY;GigwK{>a&RUHvNKTn^Iig?W`o`r=cz8Z? zgC~L2n`e}g`<}ZCwZ?BYk8U)dUguIq+!&a{8RKq}q}OsjfsChT^}!RE<6G3i+F4#% zaP11Xk2UW?ZXvS=zz@n?<4@cLaNXcWzLJWYk)#>OcrK4B2$$cKY6rZOySc_amycUD zt~vZFuf@wPzW^6H1qbr0{Z?F#6&wWo32=vkgMnv(+Z3z=wm#<-T?$?b+ycBy!A}4W z16L_HggM;e3OlZw3K0qc2XL-}!+^Vh(-pi7cm_C5!Fph^&?|;2*Z}MSo|pgE?8^xU zz5skz!4bgvBHV=)9Ld}caR-PYg@}T{ao~OhF9-GkcPls=IHMR5Q1A-imU2Wu!7G8! zJ&y<|_(@n~JLa!q)Qpj!kLXz;|mVdb5ur##t zi;`5nKrzl)(OSWsQgOIOvqb92kBG_`lB7gshCiFotTd9pR+uKCAXOXkr6irE%!25s z7nX*xXRluy8x6EQ5z93M#tu+bLA-F5Td92>mIl zLVuZDTjMozm+08mRYD^5gDRmh^l#HV`lA&0T)c3Oa?wvy75YeWq0gZX^siGt`c@i4 zf12jek5ZhSFP3YMk7~UvNwRbB&xjToz}O+$Z8r(y)NWrZFzUCjgG+bqYpeKn2QWW; zMUrk}%t~}N_JNQW<}ldyI^^AAtBfUM$_KgDe@Rjomn)Rzj%o73tdLaVB8xH+(~(-n zf~lb}JFuLOuF|`O>npgCRt`rbX_Ol&3M+^5G0g{p!S1fYYWdf%1=<4bSdeevEM(6@ zb_b%&N@dwv2;~LZY)^2Bpi? 
zIx8j)04CCW(Uzw#V|3yAVzhXw_`nT=%g31iO3>!Ov2l8{xG23Oh_RO^BJ!;h;t zz>R|Q%FIFrz74K~ZWO0#Kf>rV8A{e^uVQq83NQ)@#v;^ElC8626oS|D91YqdX{aPZ z>jZO^CQDLvT^OCZBT2*ZT>7>M-BmCp->R8l9b?mAF3K`(kP*SOOsSbVFp1MDMtc~{ z0BK88br&!izbi?Hd45xQsa`h=#{QjZ4Q=vLgoWBmQ?)xVI!fnDvr=|^zqnr>1739( zzscctn7hDqKA P7M>c+ruTL=bp`wn59n60 delta 4596 zcmai04@{It7M}qY*ebXypjbc_S5ywDC@SB&AiD*0`HxkunwINsC|BB(oW?e{9<00A zkHjX@V4fzJlU91tXi}OIXrgy=r`M9$t6b%3FYQ%SidLx7*wbF=we-8+eEV6x^>XPZ zGdsWEy!YnKoA>6MJNv>e?hETUz$^ON|2U7ksMM61;_6#s+u!%7jj{LMXY8+^UR0EF z#+X|)2OTseaR>F9s(GM)#`FOV(#Pw*&ySIE8>q*Ph1{U zM=^F#T6}KC;g!sFPDz_ql-CLsWnUp<*X%V_gZ2r>QTyQ;rQCb#xZnHTxIdlR;xl6c zkoJztDlNpGZ3rji&ErZJnu=r4(vkkT<_n$G+v3rH>}*Z;5$k9XsujrS?gBdBM8* zr~fl+V(H_uR5D4wSd?ehv2(Y}(#(V^oS~!F)XUP_6Z9p7znwK9Ic2FSD11amzwxvz zrBBe_heL!5c}A8DH|dp7;fh9C z8vI$hrj=m1MD@ech?^)Y#Jts@G zLAq5(-+N4!VuG|yN1xv!OGDRTdo^_=NARyGCHXge5Bl^dcMCJtyC>;fa(~3-W!hnP zq*&%214yw3Gn6i-CNEbMSy}{Aq@y><*W%6mQ_FhOt6LdM&05L=4hKto9KPcrn%rhazCd{P+bLN+7;K4vt zeaitFOtnx+{Ji8?=CHjUUg{HT<-EMa_BA#n{1;_JCX02oW_vJb`&N%%Evmck_`Mi& z*lt6S3EN_K!uBOxpLSPK2_-O$%`CQ({KSNP=}f6@Pm9)DWze@~A; zEb^b$++ia%fMy>WMsBnz@0-(n6i-MHLEX zqM>AAFE)3D;QHqF5M0#k3&GarrVt$7Tn8+iv9R%G;f$~=>OaI8nYx$!%EXw(wk)vI z7tK5e$1KRJrPlH#{}XKwN_-$H7;Rx}Gaa-p5r@+ewwl~{=b8xcYoT!(~$Wo2M?=ue!{_B5Yjt%oLYbO4X>j; znR&dC`Z9mb@Aao;EsWqpWXs7c6-CsD1IwNYEuaPsi@=B?s})#W1juV~F_ZvveBDHW zUW=;tc2IAQ)tnK>T&>zA!*I+28q2BQF6_H^EB4CUO1VW`Am80k54{ao9N2yQ0_7F^?sJ>wSq#03>0{D!|CV0BIH(%KBn zS)nah#8zJ?b+5M9ifc|-c>9Y`?I{hbQMmPc|0wYBM<$!a3h`xjlvx0&{_MmMcpg@0 zdmaxGk-ipo#lDbM2ds}oAFw_W z-M}Ie!7Q};uF>W-e!hxE);!_VU)d)oLU(@zSga#TL#!|SY6B-C61>K(zDv}!_9?!W zhS%1I*y_0%YU?oYgSJ}1>1|!4UF)6_ey!V*t@o?%^H5vez)C+y>+Pr0w(78NjkLj* zMCt2uLl*UohblJ#>y^(?+xlulwvoBIX=H5zeYyVU9yKAvHw{T~@H0(cD~DEKeUj_~ z)+dPvSR{%5a4W%yM7f{tN2$l&%tt9LKeOz9>6eU!mcCEJ_e;M?U|s3g0qaZuL=?T5 zv&0}iJtZ`lA9`X#*VIh}M8yb$Zv*Rt5J3e6FPb}1d-$wOOed~?D&-o)`%DgYy&RPa4hgK;A{=g1D*g*(Xa_PrxY(y z4V!_zz%zJDi4DyMJ_|gh;W*&vGQ7?;9MAl2sRqQbMkGL>6L?U=3xKZy_iA_{@QQLo 
zK*Nc^Ej5UMh8F>!+KdQj_+j9kkZ;m(l9&gs1GeC7SpO==|j)Hqb$Kj34*3h})xCFTPS8zDnCxy=r2$u`g_!b zKAGCl7gI0#9W;#o5KW;!Lou#Yex0(>=ToICHQ~@DMNyrDe-^ZO1nVfZx~zPHdR)tS zG!45}!lfD4@`e?dKa2T~KhlC&X>_tz`72>Xu{CDrUUzW6*e&ElU~fbA&_5LA7{+W9 zWSw2%uY|qIA?u8F)N0`g%1aPQM&UrdS&8^D;e@LKD6y=hzvk`X8F2CMX zO&v@XBhr!1NEa3t`#BGtPba?8-Rf1Q_WHJ(Fm}S%SHs z%Ctkq1?Kr%dd3UJbX&(5PJnrX3{{!Nk1@J1r6`>u-KnO^WQ=|Y@%vJ@hCu>zi4Ip~ z8X7U$PeWC?8O2}C?pMS>?0EoO0RQ5*0?mPjPjGc+!q$H&ieSV~NC(gw$x|I?2;5ba aMjGMmw6`X@Keon~Mb~&r|M`{^5&s3pv`tL_ From 0e11f80f2fb41b8fb8c7f79e410312f202d54ff2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 6 Dec 2025 13:32:56 +1100 Subject: [PATCH 06/38] FPU: Set FPSCR[FPRF] to zero for convert to integer operations This seems to be what P9 does. Signed-off-by: Paul Mackerras --- fpu.vhdl | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 0246faf..2005500 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -713,19 +713,24 @@ architecture behaviour of fpu is end; -- Determine result flags to write into the FPSCR - function result_flags(sign: std_ulogic; class: fp_number_class; unitbit: std_ulogic) + function result_flags(sign: std_ulogic; class: fp_number_class; int_result: std_ulogic; + unitbit: std_ulogic) return std_ulogic_vector is begin - case class is - when ZERO => - return sign & "0010"; - when FINITE => - return (not unitbit) & sign & (not sign) & "00"; - when INFINITY => - return '0' & sign & (not sign) & "01"; - when NAN => - return "10001"; - end case; + if int_result = '1' then + return "00000"; + else + case class is + when ZERO => + return sign & "0010"; + when FINITE => + return (not unitbit) & sign & (not sign) & "00"; + when INFINITY => + return '0' & sign & (not sign) & "01"; + when NAN => + return "10001"; + end case; + end if; end; begin @@ -3651,7 +3656,7 @@ begin end if; if r.update_fprf = '1' then - v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.res_sign, r.result_class, + v.fpscr(FPSCR_C downto 
FPSCR_FU) := result_flags(r.res_sign, r.result_class, r.int_result, r.r(UNIT_BIT) and not r.denorm); end if; From d540171f60eff051a419cf290b90143e0c137bd0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 6 Dec 2025 14:08:48 +1100 Subject: [PATCH 07/38] FPU: Ignore Rc bit for mffs* variants other than plain mffs Bit 0 of the instruction is Rc for mffs but reserved for the other mffs* instructions. Signed-off-by: Paul Mackerras --- fpu.vhdl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 2005500..06596bb 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1150,10 +1150,18 @@ begin opcbits := e_in.insn(10) & e_in.insn(8) & e_in.insn(4) & e_in.insn(2) & e_in.insn(1); exec_state := misc_decode(to_integer(unsigned(opcbits))); case opcbits is - when "10010" | "11010" | "10011" => - -- fmrg*, mffs + when "10010" | "11010" => + -- fmrg* v.int_result := '1'; v.result_sign := '0'; + when "10011" => + -- mffs* + v.int_result := '1'; + v.result_sign := '0'; + if e_in.insn(20 downto 16) /= "00000" then + -- mffs* variants other than mffs have bit 0 reserved + v.rc := '0'; + end if; when "10110" => -- fcfid v.result_sign := e_in.frb(63); when others => From 066e38b8ea9208ff0a643cf816efb76c36a2a9c4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 6 Dec 2025 16:26:52 +1100 Subject: [PATCH 08/38] FPU: Do proper over/underflow handling for single-precision [fm]add The ADD_3 state incorporated some of the logic of the FINISH state, but in some cases assumed the result couldn't overflow or underflow - which is not true for single precision operations, if the input operands are outside the single precision range. Fix this, and simplify things, by having ADD_3 always go to FINISH state, which does the full overflow and underflow checking. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 06596bb..e37ceb3 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1866,25 +1866,14 @@ begin -- result is opposite sign to expected rsgn_op := RSGN_INV; set_r := '1'; - v.state := FINISH; elsif r.r(UNIT_BIT + 1) = '1' then -- sum overflowed, shift right opsel_r <= RES_SHIFT; set_r := '1'; re_set_result <= '1'; set_x := '1'; - if exp_huge = '1' then - v.state := ROUND_OFLOW; - else - v.state := ROUNDING; - end if; - elsif r.r(UNIT_BIT) = '1' then - set_x := '1'; - v.state := ROUNDING; - else - rs_norm <= '1'; - v.state := NORMALIZE; end if; + v.state := FINISH; when CMP_1 => opsel_a <= AIN_A; From dcd85164c60b35a8e9edd5d520c10e26ff663345 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 6 Dec 2025 17:09:46 +1100 Subject: [PATCH 09/38] FPU: Make fsel not alter FPSCR fsel is a move-type instruction, and hence shouldn't affect FPSCR. Set v.writing_fpr and v.instr_done, rather than setting arith_done, to achieve this. Signed-off-by: Paul Mackerras --- fpu.vhdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fpu.vhdl b/fpu.vhdl index e37ceb3..58e913e 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1712,7 +1712,8 @@ begin misc_sel <= "111"; set_r := '1'; re_set_result <= '1'; - arith_done := '1'; + v.writing_fpr := '1'; + v.instr_done := '1'; when DO_FSQRT => opsel_a <= AIN_B; From 37b1afc7f7a28516cd469f600cbc493843ded537 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 6 Dec 2025 17:28:31 +1100 Subject: [PATCH 10/38] FPU: Make fri* instructions set FPSCR[FR,FI] to zero As required by the ISA. Also, never generate an inexact exception. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 58e913e..554ca8c 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -187,6 +187,7 @@ architecture behaviour of fpu is cycle_1_ar : std_ulogic; regsel : std_ulogic_vector(2 downto 0); is_nan_inf : std_ulogic; + zero_fri : std_ulogic; end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -1093,6 +1094,7 @@ begin v.quieten_nan := '1'; v.int_result := '0'; v.is_arith := '0'; + v.zero_fri := '0'; case e_in.op is when OP_FP_ARITH => fpin_a := e_in.valid_a; @@ -1138,7 +1140,10 @@ begin when "01110" | "01111" => -- fcti* v.int_result := '1'; v.result_sign := e_in.frb(63); - when others => -- fri* and frsp + when "01000" => -- fri* + v.zero_fri := '1'; + v.result_sign := e_in.frb(63); + when others => -- frsp v.result_sign := e_in.frb(63); end case; when OP_FP_CMP => @@ -2469,7 +2474,11 @@ begin opsel_mask <= '1'; set_r := '1'; round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign); - v.fpscr(FPSCR_FR downto FPSCR_FI) := round; + if r.zero_fri = '0' then + v.fpscr(FPSCR_FR downto FPSCR_FI) := round; + else + v.fpscr(FPSCR_FR downto FPSCR_FI) := "00"; -- for fri* instructions + end if; if round(1) = '1' then -- increment the LSB for the precision v.state := ROUND_INC; @@ -2481,7 +2490,7 @@ begin else arith_done := '1'; end if; - if round(0) = '1' then + if round(0) = '1' and r.zero_fri = '0' then v.fpscr(FPSCR_XX) := '1'; if r.tiny = '1' then v.fpscr(FPSCR_UX) := '1'; From 82825a11badfbd65b18271ee3cd099248cd18de1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 6 Dec 2025 17:55:11 +1100 Subject: [PATCH 11/38] FPU: Set result sign correctly for denorm +/- 0 case Signed-off-by: Paul Mackerras --- fpu.vhdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fpu.vhdl b/fpu.vhdl index 554ca8c..106cee6 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -911,6 +911,7 @@ begin -- result is +/- 
B e.result_sel := AIN_B; e.result_class := r.b.class; + -- r.result_sign is already correct else e.result_class := ZERO; end if; @@ -937,6 +938,7 @@ begin elsif r.is_addition = '1' then -- fadd, result is A e.result_sel := AIN_A; + e.rsgn_op := RSGN_SEL; else -- other things, result is zero e.result_class := ZERO; From ca792f3b1357e882df5dcb91c138e2d29e109b81 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 6 Dec 2025 18:09:20 +1100 Subject: [PATCH 12/38] FPU: Make convert-to-integer-word instructions behave like P9 The fctiw* instructions return a copy of the value in bits 31..0 in bits 63..32 of the result on P9, rather than a sign or zero extension of the word result. Make the FPU do the same. Signed-off-by: Paul Mackerras --- fpu.vhdl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fpu.vhdl b/fpu.vhdl index 106cee6..daccc86 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1625,6 +1625,7 @@ begin re_set_result <= '1'; rs_sel1 <= RSH1_B; rs_neg2 <= '1'; + v.single_prec := not r.insn(9); if r.b.exponent >= to_signed(64, EXP_BITS) or (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then @@ -3686,7 +3687,8 @@ begin if r.fp_rc = '1' then v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); end if; - v.sp_result := r.single_prec; + -- set sp_result for fctiw* + v.sp_result := r.single_prec and not r.integer_op; v.res_int := r.int_result or r.integer_op; v.illegal := illegal; v.nsnan_result := r.quieten_nan; @@ -3720,6 +3722,9 @@ begin -- This mustn't depend on any fields of r that are modified in IDLE state. 
if r.res_int = '1' then fp_result <= r.r; + if r.sp_result = '1' then + fp_result(63 downto 32) <= r.r(31 downto 0); + end if; else fp_result <= pack_dp(r.res_sign, r.result_class, r.result_exp, r.r, r.sp_result, r.nsnan_result); From de71a6119c348d310b8934f8a05f6aec231b29b1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 6 Dec 2025 18:23:27 +1100 Subject: [PATCH 13/38] FPU: Make FPSCR bit 11 always read as 0 Bit 11 (52 in BE numbering) is a reserved bit. Signed-off-by: Paul Mackerras --- fpu.vhdl | 4 ++-- tests/fpu/fpu.c | 1 + tests/test_fpu.bin | Bin 33136 -> 33136 bytes 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index daccc86..05acf3b 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1323,7 +1323,7 @@ begin opsel_s <= S_ZERO; misc_sel <= "000"; opsel_sel <= AIN_ZERO; - fpscr_mask := (others => '1'); + fpscr_mask := x"FFFFF7FF"; -- ignore bit 11 (52 BE), it's reserved cr_op := CROP_NONE; update_fx := '0'; arith_done := '0'; @@ -1428,7 +1428,7 @@ begin fpscr_mask(k + 3 downto k) := "0000"; end if; end loop; - v.fpscr := r.fpscr and (fpscr_mask or x"6007F8FF"); + v.fpscr := r.fpscr and (fpscr_mask or x"6007F0FF"); v.instr_done := '1'; when DO_FTDIV => diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index b89674f..784d81f 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -274,6 +274,7 @@ void set_fpscr(unsigned long fpscr) unsigned long fpscr_eval(unsigned long val) { val &= ~0x60000000; /* clear FEX and VX */ + val &= ~0x00000800; /* clear reserved bit 52 (BE) */ if (val & 0x1f80700) /* test all VX* bits */ val |= 0x20000000; if ((val >> 25) & (val >> 3) & 0x1f) diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index e75740c0a82a2075a48977cfa7e0d1a7b3563f8b..13aad7d991130e9b93d1a92205d76ed09e19bf49 100755 GIT binary patch delta 2915 zcmZuz4@^_n8o#HbN)t&${#3@FwssX*DJ@o@GMzHy4Tfy^+%k#idEdFc-Mc04 zCii#m?|0Alo$q|-JLjI&HgXE$+x|*2!IMMD7&Pu>ELPG#^FCd1sn8Vs1YZo%%LsMT>@7Pt6rLZ4S4s%szxFsD1A 
zi}ZF|wsTQl^BZ+h`nUS6*#8-f!~bn}(1Y40?kewxp`E4J&-hLtF|e2ZGwTCvv4uO{ z$Sg)7yR5KkG^sQ(g?(jyPX9_>dpPeR#}!uyEv0RsvrL(lW(DCgB~J6%G9Z&+DBsx2 zT6y7y_L^svEomOmKg-|H1M1?qLZMF!=MDY8Ha*yelGjVz>j_v?h^o^twj_FTS=_qyi3*QV}Sfj4SRc)Z2JKjV%%GaprA3hLV%?E!` zvZ+{~e#gc=34@`+sF-?^v}%E#io=RdkOkT9#fY_+nSeY3cgZO;?;X`B-u6T9#8*}?(rn@jUJDpCiOTAM{5qV z7;DVr0?2Ds2A1bkjM0CmdmKS7wO)me+IHfFXsucH%?fO)ElD#^ETo%XLoPK}gT~TM zM&PK$NU|VmN$*^~!TQ&`>DIViqxBTntVUdMK5M$Q6TksESyKGGXd#r_^ ztILt~EyL^9^!R=A={`Dv%Sy@R7!G;gIwUuUH#C>He zM;0H_x#f?Z$_#u030scu6I4UvJ4VjkPX4Xr=;bO~hZtA_wU1AH)QSEYGDycdkjrUz%`j zMXbQvF+=J8q{K0;5${3NK9*<)@VyAG?aRq4i&OfH_=P*j_JlKu+ zzxCNwcxV|$yaZ#2HzAIg0V{|VkoBAj&m0EC0Jss)gAZ{Ah7o^(F~lN>BkqG0#MdCJ zS%rrm1L75MBi;oc;uaW2+zn%hFG3t~5LOUBhAfAQ+yDb&1-Komyullk7Ku@-@tKP` zc^kqGE%_Q^j#Amf8}PZK5<`0IFyeX4&{B%7JS}>{LAZrSoD^^km5ZQrs>mYPouzs8 z-%`rlv1>QpxtlPZ5O&te{5Ro=vrf^7Vg$t^c6@t~XF~z?{p7kz}`;@qB1UoPlGlygU delta 2881 zcmZuz4@{G18vowTPI?EM{HX;ilv)sKTmA@0OZnFzM@!2evo#rUMi@sXxm*s9k<_DD zP?wVnY?wz9O=erD(ojPI{zypG{IT7J2^3w{~Ftw#sunrnYt6?5mP5Ti2Q#up-fUfJh(YK|W*c!E^SFmat0d~-0`rFJ$ zb$Sg_Zqs)&70v5~UBBb)6kq3vHAd>SRzKx#CLOglOM2_&&8_~E6x{I{e7Xz;Sj0uc zGWx?A$AKxCcGgSYOr51&8&0!vE#S|ST0`T!5nGAWY;{HYc_JnSGb>m;&1W9ZJlWZ5 zO_qXo{eQWe^S_pE!TdwW!}oP-zzf^U(xknE`gdkwp0S<8&QW#N-{a1}BUUajYN27Lw zzkZj-6%_&Tg?{~yxm%L7p*gKmOYi^lX@if@cqpRrdFtCWlF;ry7Srl`LuvF%Pg;Hd zVMZ^Nnkrcn^_Y&UjGViD+N1taQkk`JYZ@W=v+4KBq@UlVd~>eqlb^%y>!{s)P}LG) zi!@_4F$?`(csqEW zj5mQ}2Y&H)fitPx)*DjY&~t`@`Q2=eQg`RFcBgc3eDyoiu3Vpg~&n7FVqk!+=N}Ozy!Elfe~=I0>j`! 
zfpFa!dhU|9Frcu!O0Se`qqV|x+;;Iry6{5879qiNOVLXo1Q%02KdvIhBd`fM8Y9Bq zp~FQ7Sw77aRk0wc_e4iL`+ao8e(;SEyI_+?yhUAm4k|2h;Y9Z&;|5d}>ntI~BZ-cD z7aevOTpnVDyu}TQ1F@kHlf}u3xmEgAaavT2*~RD>H^JpGwo|6HK{5Di#L&_yYoa24 zmC8$^W2iR7_;DdRMk774HsH)SY`Hjwhi%cTD1*1W(kvTBpP=iu2DVOdCAsVfm6b$~ zKKvkhbPKpV|0~pA(x5b|BxjJ~OVsp3$sTdW?7EgvMWNC}RlWOdBueC*b%D#PtPNaV zWnOSmW%816z$Pl~`3X5g7wz5bE6OZ=aZ}jlqi>4C5&rz9Xc1g?Q|tqm-xL?m1qAH5Xgzjjaf zrrX3{uTLBBbN*K8-{8|bw&PR~@5D-#ha?+eeB&!7uzDHVOMw$kzRfZ5p;w z*uMIVyZcby;=fS%;{Z`rdA#DpJ^FXTw(*Yg?Ju!gZBPej$lBucKw$<~&_yJA{57Y#9-o`1PYz(xbguY=u&)P0H9$IloMf zdJQ#KC*c#Nt6Imd&@k{CO#l~Y78pzGz(P|0M#tWyBH#(~0xwY)@N*glKBEa>2F(IJ zv<`fS)HOPMkrV-MkQcZ@UBDz72A0wU@V7Jz9H4dJWl}qJ>@F1n3(4!$sRusie0wDO zbo?f5lBCnr?=-OcH0sP!O0%2~(3B^MR-GE>T0vpd%u1mV4;}qWg&NeZ81JUrfRFOIHkL4 z*Yv#?NeStf;B^PKz37X_gZ2zI=|26@T@Ssc+=*N7KEM~8c#B8u;yat+xIpFgJ86q2 zMtOOGb0_WdBvX+`!=|auQ-B5ZdF+t>x+j-?Nvpt1lve c%nTho!ggA;$DALc8C)Gl8)L>_X?!dHf4;Y8)c^nh From fb71f62b835d7e744a71ee85d2b7198fca8fac1c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 6 Dec 2025 20:53:38 +1100 Subject: [PATCH 14/38] FPU: Round finite special-case results to single precision if required When a special case is detected, such as a zero operand to an add, and the operation is a single-precision operation such as fadds, we need to round the result to single precision instead of just returning the relevant input operand unmodified. This accomplishes that by going to DO_FRSP_2 state from the special-case code for single-precision operations that return a finite floating-point result. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 05acf3b..a7f2495 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1606,9 +1606,9 @@ begin rs_con2 <= RSCON2_MINEXP; rs_neg2 <= '1'; set_x := '1'; -- uses r.r and r.shift - if r.b.exponent < to_signed(-126, EXP_BITS) then + if r.result_exp < to_signed(-126, EXP_BITS) then v.state := ROUND_UFLOW; - elsif r.b.exponent > to_signed(127, EXP_BITS) then + elsif r.result_exp > to_signed(127, EXP_BITS) then v.state := ROUND_OFLOW; else v.state := ROUNDING; @@ -3094,7 +3094,6 @@ begin qnan_result := scinfo.qnan_result; if scinfo.immed_result = '1' then -- state machine is in the DO_SPECIAL or DO_FSQRT state here - arith_done := '1'; set_r := '1'; opsel_r <= RES_MISC; opsel_sel <= scinfo.result_sel; @@ -3104,8 +3103,15 @@ begin else misc_sel <= "110"; end if; + arith_done := '1'; else misc_sel <= "111"; + if r.single_prec = '1' and scinfo.result_class = FINITE and r.int_result = '0' then + -- we have to do the equivalent of frsp on the result + v.state := DO_FRSP_2; + else + arith_done := '1'; + end if; end if; rsgn_op := scinfo.rsgn_op; v.result_class := scinfo.result_class; From 8a204f1058392db0fb2f1b9c7e1f5e4b718c87a1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 8 Dec 2025 08:07:28 +1100 Subject: [PATCH 15/38] FPU: Set FPSCR exception summary based on individual invalid exception bits Rather than setting FPSCR[FX] to 1 when FPSCR[VX] transitions from 0 to 1, this sets it when any of the individual invalid exception bits (VXSNAN, VXISI, VXIDI, VXZDZ, VXIMZ, VXVC, VXSOFT, VXSQRT, VXCVI) transitions from 0 to 1. This better matches the ISA and P9 behaviour.
Signed-off-by: Paul Mackerras --- fpu.vhdl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index a7f2495..cca49ff 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -144,7 +144,7 @@ architecture behaviour of fpu is int_result : std_ulogic; cr_result : std_ulogic_vector(3 downto 0); cr_mask : std_ulogic_vector(7 downto 0); - old_exc : std_ulogic_vector(4 downto 0); + old_exc : std_ulogic_vector(12 downto 0); update_fprf : std_ulogic; quieten_nan : std_ulogic; nsnan_result : std_ulogic; @@ -1388,7 +1388,7 @@ begin end if; end if; v.x := '0'; - v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX); + v.old_exc := r.fpscr(FPSCR_OX downto FPSCR_VXVC) & r.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI); set_s := '1'; v.regsel := AIN_ZERO; @@ -3681,7 +3681,8 @@ begin v.fpscr(FPSCR_FEX) := or (v.fpscr(FPSCR_VX downto FPSCR_XX) and v.fpscr(FPSCR_VE downto FPSCR_XE)); if update_fx = '1' and - (v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then + ((v.fpscr(FPSCR_OX downto FPSCR_VXVC) & v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI)) and + not r.old_exc) /= 13x"0" then v.fpscr(FPSCR_FX) := '1'; end if; From f252dba43d5bd5835c5bda941ca2f130f5e030ef Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 8 Dec 2025 14:15:24 +1100 Subject: [PATCH 16/38] FPU: Only apply zero subtraction result sign rule when result is exactly zero The rule in the ISA about the sign of the result of a subtraction when the magnitude of the result is zero only applies when the operands are equal in magnitude but opposite in sign, i.e. when the result is exactly zero. Add a check using FPSCR[FI] to exclude the cases where the exact result is non-zero but gets truncated to zero by rounding. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fpu.vhdl b/fpu.vhdl index cca49ff..515ac8e 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -3189,7 +3189,7 @@ begin v.writing_fpr := '1'; v.update_fprf := '1'; end if; - if r.is_subtract = '1' and v.result_class = ZERO then + if r.is_subtract = '1' and v.result_class = ZERO and v.fpscr(FPSCR_FI) = '0' then rsign := r.round_mode(0) and r.round_mode(1); end if; if r.negate = '1' and v.result_class /= NAN then From 0478fe41dd95062c7494d40e3459df6323919cee Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 8 Dec 2025 15:03:43 +1100 Subject: [PATCH 17/38] FPU: Reset FPSCR[FR,FI] at beginning of fcfid* Otherwise a non-zero setting from a previous instruction won't get cleared. Signed-off-by: Paul Mackerras --- fpu.vhdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fpu.vhdl b/fpu.vhdl index 515ac8e..2b5d556 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1655,6 +1655,8 @@ begin rcls_op <= RCLS_SEL; re_con2 <= RECON2_UNIT; re_set_result <= '1'; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; if r.b.class = ZERO then arith_done := '1'; else From e4715812227160743d4c4ac08bc6f56e0e37cbc9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 8 Dec 2025 19:12:03 +1100 Subject: [PATCH 18/38] FPU: Do result processing on denorm short-circuit results when FPSCR[UE] is set Results that are tiny (i.e., in the denorm range) need special processing when underflow exceptions are enabled, including in the cases where the result is just one of the input operands, such as for a fmadd with A or C equal to zero. To make sure this gets done, go to FINISH state rather than returning the relevant input operand as the result. The same logic is now used when the result needs to be rounded to single precision. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 2b5d556..84cec59 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -98,6 +98,7 @@ architecture behaviour of fpu is zero_divide : std_ulogic; new_fpscr : std_ulogic_vector(31 downto 0); immed_result : std_ulogic; -- result is an input, zero, infinity or NaN + need_finish : std_ulogic; -- result needs further processing qnan_result : std_ulogic; result_sel : std_ulogic_vector(2 downto 0); result_class : fp_number_class; @@ -833,6 +834,7 @@ begin e.zero_divide := '0'; e.new_fpscr := (others => '0'); e.immed_result := '0'; + e.need_finish := '0'; e.qnan_result := '0'; e.result_sel := AIN_ZERO; e.result_class := FINITE; @@ -912,6 +914,10 @@ begin e.result_sel := AIN_B; e.result_class := r.b.class; -- r.result_sign is already correct + if r.b.class = FINITE and r.int_result = '0' and + (r.single_prec = '1' or (r.fpscr(FPSCR_UE) = '1' and r.b.denorm = '1')) then + e.need_finish := '1'; + end if; else e.result_class := ZERO; end if; @@ -926,6 +932,10 @@ begin e.immed_result := '1'; e.result_sel := AIN_B; e.result_class := r.b.class; + if r.b.class = FINITE and r.int_result = '0' and + (r.single_prec = '1' or (r.fpscr(FPSCR_UE) = '1' and r.b.denorm = '1')) then + e.need_finish := '1'; + end if; elsif r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0' then -- B is zero, other operands are finite @@ -939,6 +949,10 @@ begin -- fadd, result is A e.result_sel := AIN_A; e.rsgn_op := RSGN_SEL; + if r.a.class = FINITE and r.int_result = '0' and + (r.single_prec = '1' or (r.fpscr(FPSCR_UE) = '1' and r.a.denorm = '1')) then + e.need_finish := '1'; + end if; else -- other things, result is zero e.result_class := ZERO; @@ -3108,9 +3122,9 @@ begin arith_done := '1'; else misc_sel <= "111"; - if r.single_prec = '1' and scinfo.result_class = FINITE and r.int_result = '0' then - -- we have to do the equivalent of frsp on the result - 
v.state := DO_FRSP_2; + if scinfo.need_finish = '1' then + -- we have to do rounding or underflow exception processing on the result + v.state := FINISH; else arith_done := '1'; end if; From 32919435a3cfe585573195bd4819e2ee5fc9fdcd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 9 Dec 2025 11:20:23 +1100 Subject: [PATCH 19/38] FPU: Allow mtfsb* to set FPSCR[FX] implicitly If mtfsb1 causes an individual exception bit to go from 0 to 1, that should set FX as well. Arrange for this by setting update_fx to 1. Also make sure mcrfs doesn't copy the reserved FPSCR bit. Signed-off-by: Paul Mackerras --- fpu.vhdl | 3 ++- tests/fpu/fpu.c | 6 +++--- tests/test_fpu.bin | Bin 33136 -> 33136 bytes 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 84cec59..84f968b 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1438,7 +1438,7 @@ begin for i in 0 to 7 loop if i = j then k := (7 - i) * 4; - v.cr_result := r.fpscr(k + 3 downto k); + v.cr_result := r.fpscr(k + 3 downto k) and fpscr_mask(k + 3 downto k); fpscr_mask(k + 3 downto k) := "0000"; end if; end loop; @@ -1505,6 +1505,7 @@ begin v.fpscr(31 - i) := r.insn(6); end if; end loop; + update_fx := '1'; v.instr_done := '1'; when DO_MTFSFI => diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 784d81f..824e764 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -351,15 +351,15 @@ int test4(long arg) fpscr = fpscr_eval((fpscr & 0x0fffffff) | 0x70000000); if (get_fpscr() != fpscr) return 16 * i + 27; - asm("mtfsb0 21"); + asm("mtfsb0 21"); /* VXSOFT */ fpscr = fpscr_eval(fpscr & ~(1 << (31-21))); if (get_fpscr() != fpscr) return 16 * i + 28; asm("mtfsb1 21"); - fpscr = fpscr_eval(fpscr | (1 << (31-21))); + fpscr = fpscr_eval(fpscr | (1 << (31-21)) | (1ul << 31)); if (get_fpscr() != fpscr) return 16 * i + 29; - asm("mtfsb0 24"); + asm("mtfsb0 24"); /* OE */ fpscr = fpscr_eval(fpscr & ~(1 << (31-24))); if (get_fpscr() != fpscr) return 16 * i + 30; diff --git a/tests/test_fpu.bin 
b/tests/test_fpu.bin index 13aad7d991130e9b93d1a92205d76ed09e19bf49..4280dd2468e42c411dccace1f3377962c0b207f4 100755 GIT binary patch delta 1631 zcmZ9Me@I(b6vyv*t3@wh_f}7d538f$UlRfsQzPj zu#vGPdrmS-Zv>b^XRXmyv`L$;k4JM@Objo75%tTQ$$96OsEhNncpgq-U?;2rd4)%Ch6JM)Y>*FgQeAnL3^+MLOnn1150;AYr?1Q_wV621%d~PgoO|GTq zb)8t5*BG%fuVG@I*Cjcsd)Qtd6^_4-ui1)mtG*1zFxR9rU0F>}<^nPQeUkL8gC`d| zGcJZm;|aFOOU+@(bP5bOZ)&1I^US*3k;mz)44~hv&g~^t={JLC&8LLp*?S?F3t=3y zEIRGjU+K=n#7egfY`6G?&I2j8Tb4p$VH5XTj%jBfrTa}1EB#8aLi7p4uachzM?|%d zwTTV3V_JC0Z+j)(k1nU^qk45&_29&iRi`!44CRSeyyS5nC#^nsirF?jgwSczX?vE_ zJvxY$vE9InHeW_n7Hf%Pt}PFr*v!11?Yj2JUSxMt( z|DqzV;tzISO%)EK%kVJ$2KLDO;s$nbR@}gMh?O_+8jd`{Df!S zZuk(V-0J*$%aU}SHs-CG`ll+!JS*7HdIWPknHdu+lGK8JPa&E-1@s w=S^=*QjEty?I`s|>h)OV(^YjpktDA0mP0*8JwjSF&<8cHUFJR6@7po_2h=EoqW}N^ delta 1672 zcmZ9MVMtq76vyw$49U8sjdN-fXKO00#^{n5V-k~CGv^eNn1)qW8Lp97*gp87Y(h3; z6I}-@DAS!5I;dmmaMZCvB^CCeLfsS`te~u;3Uk$|tDUt(rHoR?+qqdXUc7J*@ArS_ zoOjN9=e|UrB+(}s?1zrAf%-g{!MVIfP@*dT9(Zso|1+4v-h+*B8-F4@h1se`xPh%? zFJg>r2R>BY0|QQ~8zsB%;l6@5U?%P_2ukD?#jJ@j7QorVui$*)DHw|z57o;wody5t zClZmcfd#EMWx;T3N~>W3t2ch>=)63T6%35;3hq$Bxv=eT!CjXq4l7xotGY;dtB?uy z!G8pIKVyRRt6+E_<(Iw3?-2RDOMYJRJH`U%qAVC_Px-Fl+^Z$IrF0j_OUjOy&+qot zF}7rXIj+!~0H)DpFuhi|NhNHIM3gLQ4Xu0}X-To1^UjZv2F}k`BHTwpnwc1=i8`jS zF3EzS!5sY3aRjuuV0aTQV1=>7?|dY$SGuv%M` z9&uAv+;as-jNPz|no2#KL4Re5QMExu?Mz3qc#%|`pE&tM(s&~-VqnK@fmDXeI9ge+ z8Tl(SYB9yhRyV$m+(CsYAI9;h$pXXJX6lwMZV7HV8tu6Ut~z-0(f3D^FJ%FtD&IBv zFte{QVsT$FVsT$X#JsP;R8>pZS{0ER53r^&T&~H*?W!Y^SOV46W!Yb>XEt;J{pOtP zE>d}0CsGOSVwd@})Vx2P;IX-k3RhL@%$FWy#(zT0KY<`zb?_I&8#zmwINL0KSbbV* zRHU zirF=K=t5VGPSd%P8PP^8F7p-+)cBwR7s=9)+jT9yf21N(Lw=T6Jml@f;$iR;^I;Hw z0VipEAkt%c1+Uw?VG*?slgqrEc`iFsoIaOroJHr-ODsN@*KpWTB5jbRU(fFjHB8`B zhfXuTl&n0;4-n5bLM-kz3mcqoLkwe13w)0Y&Vq*F`+}R-EhX>U{0Pu<=ScfT@=-Y5 z&kX4kNv->TZI-lA(tiC#aQC3C_OK*+A0NM@#(vfF@t)e8=U_HI5WxEXhOf|Aeue-^yBt=Oz<=I zdh8&_36CD`;wIT^Sm@P56MD(6V-MN$c%5t!E|R^4ie^3hhEB4tV<*{F9431i?=+j> zB!Vv|cWO%zqPzxLrL^m48&K=hX*;$Bfh&9|X-8>?NGk`rpvAR|v Date: Tue, 9 Dec 2025 
11:35:12 +1100 Subject: [PATCH 20/38] FPU: Normalize B for fmadd family instructions If B is denormalized, but the A*C product is much smaller, then the result is B; in the UE=1 case we need to normalize the result, and the left shift to do that can bring in low-order product bits from S and corrupt the result. To avoid this, make sure B is normalized. Signed-off-by: Paul Mackerras --- fpu.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fpu.vhdl b/fpu.vhdl index 84f968b..c539e9d 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1824,7 +1824,7 @@ begin if r.c.denorm = '1' then -- must be either fmul or fmadd/sub v.state := RENORM_C; - elsif r.b.denorm = '1' and r.is_addition = '0' then + elsif r.b.denorm = '1' and (r.is_addition = '0' or r.is_multiply = '1') then v.state := RENORM_B; elsif r.is_multiply = '1' and r.b.class = ZERO then v.state := DO_FMUL; From e5651e2eab9adf780a229878124b9c96f9fc13c2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 9 Dec 2025 16:12:05 +1100 Subject: [PATCH 21/38] FPU: Avoid adding bias twice in UE=1 underflow case In case of underflow with UE=1, ROUND_UFLOW state adds the exponent bias and then goes to NORMALIZE state if the value is not normalized. Then NORMALIZE state will go back to ROUND_UFLOW if the exponent is still tiny, resulting in the bias getting added twice. To avoid this, if ROUND_UFLOW needs to do normalization, it goes to a new NORM_UFLOW state which does the normalization and goes to ROUNDING state. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index c539e9d..a005c81 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -72,7 +72,7 @@ architecture behaviour of fpu is INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, - ROUND_UFLOW, ROUND_OFLOW, + ROUND_UFLOW, NORM_UFLOW, ROUND_OFLOW, ROUNDING, ROUND_INC, ROUNDING_2, ROUNDING_3, DENORM, RENORM_A, RENORM_B, RENORM_C, @@ -2459,12 +2459,22 @@ begin re_set_result <= '1'; if r.r(UNIT_BIT) = '0' then rs_norm <= '1'; - v.state := NORMALIZE; + v.state := NORM_UFLOW; else v.state := ROUNDING; end if; end if; + when NORM_UFLOW => + -- normalize for UE=1 underflow case + -- r.shift = clz(r.r) - 7 + opsel_r <= RES_SHIFT; + set_r := '1'; + re_sel2 <= REXP2_NE; + re_set_result <= '1'; + set_x := '1'; + v.state := ROUNDING; + when ROUND_OFLOW => rcls_op <= RCLS_TINF; v.fpscr(FPSCR_OX) := '1'; @@ -2560,8 +2570,8 @@ begin rcls_op <= RCLS_TZERO; -- If the result is zero, that's handled below. -- Renormalize result after rounding - re_set_result <= '1'; v.denorm := exp_tiny; + re_set_result <= '1'; if new_exp < to_signed(-1022, EXP_BITS) then v.state := DENORM; else From f3b9566ae2f2268be31dfb0f684ed977e05252bc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 9 Dec 2025 19:38:59 +1100 Subject: [PATCH 22/38] FPU: Round to single precision for fcfid[u]s The fcfids and fcfidus instructions weren't rounding to single precision because r.longmask wasn't getting set. To fix this, set v.longmask to e_in.single for the fcfid* instructions. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 1 + 1 file changed, 1 insertion(+) diff --git a/fpu.vhdl b/fpu.vhdl index a005c81..5227406 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1185,6 +1185,7 @@ begin end if; when "10110" => -- fcfid v.result_sign := e_in.frb(63); + v.longmask := e_in.single; when others => v.result_sign := '0'; end case; From 41988e3b5ffdef3973ae42e1b0d1400dc26ff58d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 Dec 2025 08:37:02 +1100 Subject: [PATCH 23/38] FPU: Fix comparison of remainder in square root code The square root procedure needs to compare B - R^2 with 2R + 1 to decide whether to increment the square root estimate R by 1. It currently does this by putting 2R + 1 in B and using the pcmpb_lt and pcmpb_eq signals. This is not correct because the comparisons that generate those signals have a 2-bit shift embedded into them. Instead, put 2R + 1 into C and use pcmpc_lt/eq, which don't have the 2-bit shift. Signed-off-by: Paul Mackerras --- fpu.vhdl | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 5227406..b97e768 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1303,6 +1303,9 @@ begin end if; -- Compare P with zero and with B + -- This has a 2-bit shift in it (p(59..4) compared to b(57..2)) + -- because it's used in the FP division code to determine whether + -- to increment the quotient at bit 2 (DP_RBIT). 
px_nz := or (r.p(UNIT_BIT + 1 downto 4)); pcmpb_eq := '0'; if r.p(59 downto 4) = r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT) then @@ -1314,6 +1317,9 @@ begin elsif unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT)) then pcmpb_lt := '1'; end if; + -- Compare P with zero and with C + -- This is used in the square root and integer division code + -- to decide whether to increment the result by 1 pcmpc_eq := '0'; if r.p = r.c.mantissa then pcmpc_eq := '1'; @@ -2271,29 +2277,29 @@ begin when SQRT_11 => -- compute P = A - R * R (remainder) - -- also put 2 * R + 1 into B for comparison with P + -- also put 2 * R + 1 into C for comparison with P msel_1 <= MUL1_R; msel_2 <= MUL2_R; msel_add <= MULADD_A; msel_inv <= '1'; f_to_multiply.valid <= r.first; shiftin := '1'; - set_b := r.first; + set_c := r.first; if multiply_to_f.valid = '1' then v.state := SQRT_12; end if; when SQRT_12 => - -- test if remainder is 0 or >= B = 2*R + 1 + -- test if remainder is 0 or >= C = 2*R + 1 set_r := '0'; opsel_c <= CIN_INC; - if pcmpb_lt = '1' then + if pcmpc_lt = '1' then -- square root is correct, set X if remainder non-zero v.x := r.p(UNIT_BIT + 2) or px_nz; else -- square root needs to be incremented by 1 set_r := '1'; - v.x := not pcmpb_eq; + v.x := not pcmpc_eq; end if; v.state := FINISH; From a18c462b27f526af36a8fd1ce6af315fe1d60a95 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 Dec 2025 09:34:20 +1100 Subject: [PATCH 24/38] FPU: Ignore stale P contents in short-circuit multiply-add When a multiply-add is done with A or C equal to zero, the actual multiplication operation is not done, hence P is not valid, so in FINISH state we shouldn't set X based on P being non-zero. Fix this by clearing the is_multiply flag in the short-circuit case. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 1 + 1 file changed, 1 insertion(+) diff --git a/fpu.vhdl b/fpu.vhdl index b97e768..6990413 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -3129,6 +3129,7 @@ begin if scinfo.immed_result = '1' then -- state machine is in the DO_SPECIAL or DO_FSQRT state here set_r := '1'; + v.is_multiply := '0'; -- P is not valid opsel_r <= RES_MISC; opsel_sel <= scinfo.result_sel; if scinfo.qnan_result = '1' then From baf8f5f8c613cdb53f5546d092f9df0e7f82a163 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 Dec 2025 11:02:23 +1100 Subject: [PATCH 25/38] FPU: Force reserved FPSCR bit 11 to zero This ensures that the reserved FPSCR bit can never be set, by clearing it at the end of the fpu_1 process. Also remove a redundant setting of cr_result in the mcrfs code. Signed-off-by: Paul Mackerras --- fpu.vhdl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 6990413..1168993 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1344,7 +1344,7 @@ begin opsel_s <= S_ZERO; misc_sel <= "000"; opsel_sel <= AIN_ZERO; - fpscr_mask := x"FFFFF7FF"; -- ignore bit 11 (52 BE), it's reserved + fpscr_mask := x"FFFFFFFF"; cr_op := CROP_NONE; update_fx := '0'; arith_done := '0'; @@ -1445,7 +1445,6 @@ begin for i in 0 to 7 loop if i = j then k := (7 - i) * 4; - v.cr_result := r.fpscr(k + 3 downto k) and fpscr_mask(k + 3 downto k); fpscr_mask(k + 3 downto k) := "0000"; end if; end loop; @@ -3772,6 +3771,9 @@ begin r.sp_result, r.nsnan_result); end if; + -- Make sure the reserved bit 11 (52) of FPSCR can never be set + v.fpscr(11) := '0'; + rin <= v; end process; From 009ee1c9c504b33dcc3d87a1a7bb7e725dea65d5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 Dec 2025 11:47:32 +1100 Subject: [PATCH 26/38] FPU: Renormalize frsp operand if denormalized This arranges for the frsp operand to be renormalized if necessary. 
Without this, we can incorrectly get X set to 1 for denormalized operands, and hence the rounding may be done incorrectly. To make things clearer, we now have an explicit flag indicating when the B operand needs to be in normalized form. Signed-off-by: Paul Mackerras --- fpu.vhdl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 1168993..8124dad 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -159,6 +159,7 @@ architecture behaviour of fpu is is_multiply : std_ulogic; is_inverse : std_ulogic; is_sqrt : std_ulogic; + do_renorm_b : std_ulogic; first : std_ulogic; count : unsigned(1 downto 0); doing_ftdiv : std_ulogic_vector(1 downto 0); @@ -1098,6 +1099,7 @@ begin v.is_addition := '0'; v.is_subtract := '0'; v.is_inverse := '0'; + v.do_renorm_b := '0'; fpin_a := '0'; fpin_b := '0'; fpin_c := '0'; @@ -1150,17 +1152,20 @@ begin when "10010" => -- fdiv v.is_inverse := '1'; v.result_sign := e_in.fra(63) xor e_in.frb(63); + v.do_renorm_b := '1'; when "11000" | "11010" => -- fre and frsqrte v.is_inverse := '1'; v.result_sign := e_in.frb(63); + v.do_renorm_b := '1'; when "01110" | "01111" => -- fcti* v.int_result := '1'; v.result_sign := e_in.frb(63); when "01000" => -- fri* v.zero_fri := '1'; v.result_sign := e_in.frb(63); - when others => -- frsp + when others => -- frsp and fsqrt v.result_sign := e_in.frb(63); + v.do_renorm_b := '1'; end case; when OP_FP_CMP => fpin_a := e_in.valid_a; @@ -1426,7 +1431,7 @@ begin v.state := RENORM_A; elsif r.c.denorm = '1' then v.state := RENORM_C; - elsif r.b.denorm = '1' and (r.is_inverse = '1' or r.is_sqrt = '1') then + elsif r.b.denorm = '1' and r.do_renorm_b = '1' then v.state := RENORM_B; elsif r.is_multiply = '1' and r.b.class = ZERO then v.state := DO_FMUL; @@ -1830,7 +1835,7 @@ begin if r.c.denorm = '1' then -- must be either fmul or fmadd/sub v.state := RENORM_C; - elsif r.b.denorm = '1' and (r.is_addition = '0' or r.is_multiply = '1') then + elsif r.b.denorm = '1' and r.do_renorm_b = 
'1' then v.state := RENORM_B; elsif r.is_multiply = '1' and r.b.class = ZERO then v.state := DO_FMUL; From b8f7cbd894b64410e3f639da2da923d78465b0cf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 Dec 2025 20:14:35 +1100 Subject: [PATCH 27/38] FPU: Record bits shifted out of addend in fmadd-family instructions If the addend is smaller than the product and thus needs to be shifted right, record if any bits are lost from the right end in r.x, so that the result gets rounded correctly. Also add a test that checks one such case. Signed-off-by: Paul Mackerras --- fpu.vhdl | 1 + tests/fpu/fpu.c | 72 ++++++++++++++++++++++++--------------------- tests/test_fpu.bin | Bin 33136 -> 33464 bytes 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 8124dad..d120cd8 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1970,6 +1970,7 @@ begin -- r.shift = addend exp - product exp + 64, r.r = r.b.mantissa set_s := '1'; opsel_s <= S_SHIFT; + set_x := '1'; -- set shift to r.shift - 64 rs_sel1 <= RSH1_S; rs_con2 <= RSCON2_64; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 824e764..5c46b6f 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1504,110 +1504,114 @@ struct fmavals { unsigned long ra; unsigned long rc; unsigned long rb; + unsigned long fpscr; unsigned long fma; unsigned long fms; unsigned long nfma; unsigned long nfms; } fmavals[] = { /* +0 * +0 +- +0 -> +0, +0, -0, -0 */ - { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, FPS_RN_NEAR, 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 }, /* +0 * NaNC +- +0 -> NaNC, NaNC, NaNC, NaNC */ - { 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000, + { 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000, FPS_RN_NEAR, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000 }, /* +0 * NaNC +- NaNB -> NaNB, NaNB, NaNB, NaNB */ - { 0x0000000000000000, 
0x7ffc000000000000, 0x7ffb000000000000, + { 0x0000000000000000, 0x7ffc000000000000, 0x7ffb000000000000, FPS_RN_NEAR, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000 }, /* NaNA * NaNC +- NaNB -> NaNA, NaNA, NaNA, NaNA */ - { 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000, + { 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000, FPS_RN_NEAR, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000 }, /* +1.0 * -0 +- +finite B -> +B, -B, -B, +B */ - { 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, + { 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, FPS_RN_NEAR, 0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd }, /* +1.0 * -1.0 +- (B = +3.818e+190) -> +B, -B, -B, +B */ - { 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, + { 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, FPS_RN_NEAR, 0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd }, /* +inf * -1.0 +- +finite B -> -inf, -inf, +inf, +inf */ - { 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, + { 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, FPS_RN_NEAR, 0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 }, /* +inf * +0 +- +finite B -> NaNQ, NaNQ, NaNQ, NaNQ */ - { 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, + { 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, FPS_RN_NEAR, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000 }, /* +1.0 * +1.0 +- 1.00000012 -> +2.00000012, +1.2e-7, -2.00000012, -1.2e-7 */ - { 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, + { 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, FPS_RN_NEAR, 0x4000000010000000, 0xbe80000000000000, 0xc000000010000000, 0x3e80000000000000 }, /* +(1 + 2^-52) * +(1 + 2^-52) +- +1.0 -> +(2 + 2^-51), +2^-51, -(2 + 2^-51), -2^-51 */ - { 0x3ff0000000000001, 
0x3ff0000000000001, 0x3ff0000000000000, + { 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000, FPS_RN_NEAR, 0x4000000000000001, 0x3cc0000000000000, 0xc000000000000001, 0xbcc0000000000000 }, /* +(1 + 3*2^-52) * +(1 + 2^-51) +- +1.0 -> +(2 + 2^-50), +5 * 2^-52 + 2^-101, -, - */ - { 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000, + { 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000, FPS_RN_NEAR, 0x4000000000000002, 0x3cd4000000000002, 0xc000000000000002, 0xbcd4000000000002 }, /* +2.443e-77 * 2.828 +- 6.909e-77 -> -1.402e-93, +1.382e-76, +1.402e-93, -1.382e-76 */ - { 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000, + { 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000, FPS_RN_NEAR, 0xaca765753908cd20, 0x3030000000000000, 0x2ca765753908cd20, 0xb030000000000000 }, /* +2.443e-77 * 2.828 +- 6.909e-77 -> +9.446e-93, +1.382e-76, -9.446e-93, -1.382e-76 */ - { 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000, + { 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000, FPS_RN_NEAR, 0x2cd3b3efbf5e2229, 0x3030000000000000, 0xacd3b3efbf5e2229, 0xb030000000000000 }, /* +2.443e-77 * 2.828 +- -1.1055e-75 -> -1.0364e-75, +1.1746e-75, +1.0364e-75, -1.1746e-75 */ - { 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000, + { 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000, FPS_RN_NEAR, 0xb05e0068a0000000, 0x3061003450000000, 0x305e0068a0000000, 0xb061003450000000 }, /* +2 * +3 +- 3 -> +9, +3, -9, -3 */ - { 0x4000000000000000, 0x4008000000000000, 0x4008000000000000, + { 0x4000000000000000, 0x4008000000000000, 0x4008000000000000, FPS_RN_NEAR, 0x4022000000000000, 0x4008000000000000, 0xc022000000000000, 0xc008000000000000 }, /* +2 * +3 +- 5 -> +11, +1, -11, -1 */ - { 0x4000000000000000, 0x4008000000000000, 0x4014000000000000, + { 0x4000000000000000, 0x4008000000000000, 0x4014000000000000, FPS_RN_NEAR, 0x4026000000000000, 0x3ff0000000000000, 0xc026000000000000, 0xbff0000000000000 }, /* +2 * +3 +- 7 -> +13, -1, 
-13, +1 */ - { 0x4000000000000000, 0x4008000000000000, 0x401c000000000000, + { 0x4000000000000000, 0x4008000000000000, 0x401c000000000000, FPS_RN_NEAR, 0x402a000000000000, 0xbff0000000000000, 0xc02a000000000000, 0x3ff0000000000000 }, /* +2 * +3 +- 9 -> +15, -3, -15, +3 */ - { 0x4000000000000000, 0x4008000000000000, 0x4022000000000000, + { 0x4000000000000000, 0x4008000000000000, 0x4022000000000000, FPS_RN_NEAR, 0x402e000000000000, 0xc008000000000000, 0xc02e000000000000, 0x4008000000000000 }, /* +2 * +3 +- -3 -> +3, +9, -3, -9 */ - { 0x4000000000000000, 0x4008000000000000, 0xc008000000000000, + { 0x4000000000000000, 0x4008000000000000, 0xc008000000000000, FPS_RN_NEAR, 0x4008000000000000, 0x4022000000000000, 0xc008000000000000, 0xc022000000000000 }, /* +2 * +3 +- -5 -> +1, +11, -1, -11 */ - { 0x4000000000000000, 0x4008000000000000, 0xc014000000000000, + { 0x4000000000000000, 0x4008000000000000, 0xc014000000000000, FPS_RN_NEAR, 0x3ff0000000000000, 0x4026000000000000, 0xbff0000000000000, 0xc026000000000000 }, /* +2 * +3 +- -7 -> -1, +13, +1, -13 */ - { 0x4000000000000000, 0x4008000000000000, 0xc01c000000000000, + { 0x4000000000000000, 0x4008000000000000, 0xc01c000000000000, FPS_RN_NEAR, 0xbff0000000000000, 0x402a000000000000, 0x3ff0000000000000, 0xc02a000000000000 }, /* +2 * +3 +- -9 -> -3, +15, +3, -15 */ - { 0x4000000000000000, 0x4008000000000000, 0xc022000000000000, + { 0x4000000000000000, 0x4008000000000000, 0xc022000000000000, FPS_RN_NEAR, 0xc008000000000000, 0x402e000000000000, 0x4008000000000000, 0xc02e000000000000 }, /* +2 * -3 +- 3 -> -3, -9, +3, +9 */ - { 0x4000000000000000, 0xc008000000000000, 0x4008000000000000, + { 0x4000000000000000, 0xc008000000000000, 0x4008000000000000, FPS_RN_NEAR, 0xc008000000000000, 0xc022000000000000, 0x4008000000000000, 0x4022000000000000 }, /* +2 * -3 +- 5 -> -1, -11, +1, +11 */ - { 0x4000000000000000, 0xc008000000000000, 0x4014000000000000, + { 0x4000000000000000, 0xc008000000000000, 0x4014000000000000, FPS_RN_NEAR, 
0xbff0000000000000, 0xc026000000000000, 0x3ff0000000000000, 0x4026000000000000 }, /* +2 * -3 +- 7 -> +1, -13, -1, +13 */ - { 0x4000000000000000, 0xc008000000000000, 0x401c000000000000, + { 0x4000000000000000, 0xc008000000000000, 0x401c000000000000, FPS_RN_NEAR, 0x3ff0000000000000, 0xc02a000000000000, 0xbff0000000000000, 0x402a000000000000 }, /* +2 * -3 +- 9 -> +3, -15, -3, +15 */ - { 0x4000000000000000, 0xc008000000000000, 0x4022000000000000, + { 0x4000000000000000, 0xc008000000000000, 0x4022000000000000, FPS_RN_NEAR, 0x4008000000000000, 0xc02e000000000000, 0xc008000000000000, 0x402e000000000000 }, /* -2 * +3 +- -3 -> -9, -3, +9, +3 */ - { 0xc000000000000000, 0x4008000000000000, 0xc008000000000000, + { 0xc000000000000000, 0x4008000000000000, 0xc008000000000000, FPS_RN_NEAR, 0xc022000000000000, 0xc008000000000000, 0x4022000000000000, 0x4008000000000000 }, /* -2 * +3 +- -5 -> -11, -1, +11, +1 */ - { 0xc000000000000000, 0x4008000000000000, 0xc014000000000000, + { 0xc000000000000000, 0x4008000000000000, 0xc014000000000000, FPS_RN_NEAR, 0xc026000000000000, 0xbff0000000000000, 0x4026000000000000, 0x3ff0000000000000 }, /* -2 * +3 +- -7 -> -13, +1, +13, -1 */ - { 0xc000000000000000, 0x4008000000000000, 0xc01c000000000000, + { 0xc000000000000000, 0x4008000000000000, 0xc01c000000000000, FPS_RN_NEAR, 0xc02a000000000000, 0x3ff0000000000000, 0x402a000000000000, 0xbff0000000000000 }, /* -2 * +3 +- -9 -> -15, +3, +15, -3 */ - { 0xc000000000000000, 0x4008000000000000, 0xc022000000000000, + { 0xc000000000000000, 0x4008000000000000, 0xc022000000000000, FPS_RN_NEAR, 0xc02e000000000000, 0x4008000000000000, 0x402e000000000000, 0xc008000000000000 }, /* -2 * +3 +- +0 -> -6, -6, +6, +6 */ - { 0xc000000000000000, 0x4008000000000000, 0x0000000000000000, + { 0xc000000000000000, 0x4008000000000000, 0x0000000000000000, FPS_RN_NEAR, 0xc018000000000000, 0xc018000000000000, 0x4018000000000000, 0x4018000000000000 }, /* +2 * -3 +- -0 -> -6, -6, +6, +6 */ - { 0x4000000000000000, 0xc008000000000000, 
0x8000000000000000, + { 0x4000000000000000, 0xc008000000000000, 0x8000000000000000, FPS_RN_NEAR, 0xc018000000000000, 0xc018000000000000, 0x4018000000000000, 0x4018000000000000 }, /* 2^-1026 * (1.5 * 2^1023) +- -0 -> (1.5 * 2^-3), ditto, -ditto, -ditto */ - { 0x0001000000000000, 0x7fe8000000000000, 0x8000000000000000, + { 0x0001000000000000, 0x7fe8000000000000, 0x8000000000000000, FPS_RN_NEAR, 0x3fc8000000000000, 0x3fc8000000000000, 0xbfc8000000000000, 0xbfc8000000000000 }, + /* 1 * -1 + tiny -> -1 + delta, -1, 1 - delta, 1 */ + { 0x3ff0000000000000, 0xbff0000000000000, 0x00000000b2200102, FPS_RN_CEIL, + 0xbfefffffffffffff, 0xbff0000000000000, 0x3fefffffffffffff, 0x3ff0000000000000 }, }; int test23(long arg) @@ -1617,8 +1621,8 @@ int test23(long arg) struct fmavals *vp = fmavals; unsigned long fpscr; - set_fpscr(FPS_RN_NEAR); for (i = 0; i < sizeof(fmavals) / sizeof(fmavals[0]); ++i, ++vp) { + set_fpscr(vp->fpscr); asm("lfd 6,0(%0); lfd 7,8(%0); lfd 8,16(%0); fmadd 0,6,7,8; stfd 0,0(%1)" : : "b" (&vp->ra), "b" (results) : "memory"); asm("fmsub 1,6,7,8; fnmadd 2,6,7,8; fnmsub 3,6,7,8; stfd 1,8(%0); stfd 2,16(%0); stfd 3,24(%0)" diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 4280dd2468e42c411dccace1f3377962c0b207f4..f68ea11be39a32ba6dab54ab64e3109f8f4b185a 100755 GIT binary patch delta 1143 zcmZute`r%z6h7}XuCGDsvzZz?%+iYK4}ZmM>Wk05zCP&goZ@d>ZY!kqt=dncx+mTALF+KuRy9)QgBt^d)U zM$=sTVtU|>R%|VDFq%}h;p_-oR31o9nIX?3_R+&K6JCYPmqJg+OFRV zy+-QQF@*S7QIh+lDD(3QmPh0_~gSDkaJDg>eZ-3zA z>#5T-hvS#1?cG64eZ|oGa)lLYlPTeEL!{UESXb4Ug!vzsUek#*D%4xW z8_x{L&{)k~)ey4of1cmpPd{lcNu~SR=2Zq1v`Wyo%r5pC1N1!j4pr~HjWhl1(cV%C zfwB3z2qH4l4WHA^Dldo#3Zo9=^s<)C8*vFa9SxVGo6$$%@&<*!_%#P~Weyr=M2$a3 zPC1|^xST5jBe?QEobs=pCc+*uaV{Jvwbz$yu_LhBeSbO2_LYPn^j*;)UY6E`x$YF|B}2@iPtN%&~I(rJKP5_nt1phE$t1Hb~z#c(F$^h`W2kk z*RtUWHKTva>7ojwJlZ`>uJf~~g7te9gYT$$+y%K+Xpyi zlyPCl5|gj1^L314bAz~0Kg}lN_oo z--=H~182uKV)YsUvSK;7Y8EpQNycx^GPE{6X7Zpq(^$;0u%ga4c^Hc*=#tDjS*f_B 
zos27ZLOQo%ROh82{e^;FcPWzb&1%ZHxPgHo^7vuB>}e6TrJa(N&*SO3H5 z|C({i&1)NH-Hzhr^F=kw5!9+ZUv&8WfS!d;-nMuPaI4RPZ9W0VeR_fwpZE$fO?{eb z#{%!3TZ|e)bzN?I+E@0m}~8Vc@WUGT}YJ7!dIY!P`=ZcLR1z1_b0= z1Z*S3f`vJLyv2%1n)x8{gqY?Hl9zh^2yuiVLXyBArI|29ND}x~6Tah0)N8G_%3r@Z BT=4(^ From d33f31509b1f3259e0a350bf174192191c03b99e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 Dec 2025 21:02:06 +1100 Subject: [PATCH 28/38] FPU: Clear S in ADD_SHIFT state Otherwise, if this is a multiply-add instruction and the result needs to be shifted left, bits of the product in S will contaminate the final result. Signed-off-by: Paul Mackerras --- fpu.vhdl | 1 + 1 file changed, 1 insertion(+) diff --git a/fpu.vhdl b/fpu.vhdl index d120cd8..014f9ee 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1865,6 +1865,7 @@ begin re_set_result <= '1'; v.x := s_nz; set_x := '1'; + set_s := '1'; v.longmask := r.single_prec; if r.add_bsmall = '1' then v.state := ADD_2; From 37edba4da759e8a969bfcd86ca88908ab2c60b38 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Dec 2025 09:01:41 +1100 Subject: [PATCH 29/38] FPU: Normalize B operand for multiply-add instructions Otherwise the result can get rounded incorrectly when B is denorm but the A * C product is much smaller. Signed-off-by: Paul Mackerras --- fpu.vhdl | 1 + 1 file changed, 1 insertion(+) diff --git a/fpu.vhdl b/fpu.vhdl index 014f9ee..53027ff 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1149,6 +1149,7 @@ begin v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor e_in.frc(63) xor e_in.insn(1)); v.negate := e_in.insn(2); + v.do_renorm_b := '1'; when "10010" => -- fdiv v.is_inverse := '1'; v.result_sign := e_in.fra(63) xor e_in.frb(63); From 9f27f60b268f3c6a9e1559c6229c2674ba77f996 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Dec 2025 09:19:12 +1100 Subject: [PATCH 30/38] FPU: Clear FPSCR[FR,FI] on overflow in convert-to-integer instructions Also simplify INT_CHECK state by going to INT_OFLOW on overflow. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 53027ff..f49f02d 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -2377,26 +2377,22 @@ begin else msb := r.r(63); end if; - opsel_r <= RES_MISC; - misc_sel <= "110"; if (r.insn(8) = '0' and msb /= r.result_sign) or (r.insn(8) = '1' and msb /= '1') then - set_r := '1'; - v.fpscr(FPSCR_VXCVI) := '1'; - invalid := '1'; + v.state := INT_OFLOW; else - set_r := '0'; if r.fpscr(FPSCR_FI) = '1' then v.fpscr(FPSCR_XX) := '1'; end if; + arith_done := '1'; end if; - arith_done := '1'; when INT_OFLOW => opsel_r <= RES_MISC; misc_sel <= "110"; set_r := '1'; v.fpscr(FPSCR_VXCVI) := '1'; + v.fpscr(FPSCR_FR downto FPSCR_FI) := "00"; invalid := '1'; arith_done := '1'; From 59992eab907f9431a99ab4de987abd722e9d3098 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Dec 2025 13:15:00 +1100 Subject: [PATCH 31/38] FPU: Avoid doing overflow processing twice in OE=1 case Split the ROUND_OFLOW state into two, one which handles the OE=0 case (disabled overflow exception) and one which handles the OE=1 case (enabled overflow exception). This avoids a loop in the state diagram and prevents us from adding the exponent bias twice. Also correct a bug in ROUNDING_3 state where for single-precision operations which yield a result which is denormal in double-precision format, r.shift was set wrongly. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 109 +++++++++++++++++++++------------------------ tests/fpu/fpu.c | 3 ++ tests/test_fpu.bin | Bin 33464 -> 33560 bytes 3 files changed, 55 insertions(+), 57 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index f49f02d..07617af 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -72,7 +72,7 @@ architecture behaviour of fpu is INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, - ROUND_UFLOW, NORM_UFLOW, ROUND_OFLOW, + ROUND_UFLOW, NORM_UFLOW, ROUND_OFLOW_DIS, ROUND_OFLOW_EN, ROUNDING, ROUND_INC, ROUNDING_2, ROUNDING_3, DENORM, RENORM_A, RENORM_B, RENORM_C, @@ -315,6 +315,7 @@ architecture behaviour of fpu is constant RSCON2_63 : std_ulogic_vector(3 downto 0) := "0111"; constant RSCON2_64 : std_ulogic_vector(3 downto 0) := "1000"; constant RSCON2_MINEXP : std_ulogic_vector(3 downto 0) := "1001"; + constant RSCON2_DPMINX : std_ulogic_vector(3 downto 0) := "1010"; signal rs_sel1 : std_ulogic_vector(1 downto 0); signal rs_sel2 : std_ulogic; @@ -1633,10 +1634,10 @@ begin rs_con2 <= RSCON2_MINEXP; rs_neg2 <= '1'; set_x := '1'; -- uses r.r and r.shift - if r.result_exp < to_signed(-126, EXP_BITS) then + if exp_tiny = '1' then v.state := ROUND_UFLOW; - elsif r.result_exp > to_signed(127, EXP_BITS) then - v.state := ROUND_OFLOW; + elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then + v.state := ROUND_OFLOW_DIS; else v.state := ROUNDING; end if; @@ -2406,6 +2407,7 @@ begin v.state := ROUNDING; when FINISH => + -- r.shift = 0 if r.is_multiply = '1' and px_nz = '1' then v.x := '1'; end if; @@ -2420,8 +2422,8 @@ begin set_x := '1'; if exp_tiny = '1' then v.state := ROUND_UFLOW; - elsif exp_huge = '1' then - v.state := ROUND_OFLOW; + elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then + v.state := ROUND_OFLOW_DIS; else v.state := ROUNDING; end if; @@ -2441,8 +2443,8 @@ begin set_x := '1'; if exp_tiny = '1' then v.state := ROUND_UFLOW; - elsif exp_huge = '1' then - v.state := ROUND_OFLOW; + elsif exp_huge = '1' 
and r.fpscr(FPSCR_OE) = '0' then + v.state := ROUND_OFLOW_DIS; else v.state := ROUNDING; end if; @@ -2485,30 +2487,20 @@ begin set_x := '1'; v.state := ROUNDING; - when ROUND_OFLOW => + when ROUND_OFLOW_DIS => + -- disabled overflow exception + -- result depends on rounding mode rcls_op <= RCLS_TINF; v.fpscr(FPSCR_OX) := '1'; opsel_r <= RES_MISC; misc_sel <= "010"; - set_r := '0'; - if r.fpscr(FPSCR_OE) = '0' then - -- disabled overflow exception - -- result depends on rounding mode - set_r := '1'; - v.fpscr(FPSCR_XX) := '1'; - v.fpscr(FPSCR_FI) := '1'; - -- construct largest representable number - re_con2 <= RECON2_MAX; - re_set_result <= '1'; - arith_done := '1'; - else - -- enabled overflow exception - re_sel1 <= REXP1_R; - re_con2 <= RECON2_BIAS; - re_neg2 <= '1'; - re_set_result <= '1'; - v.state := ROUNDING; - end if; + set_r := '1'; + v.fpscr(FPSCR_XX) := '1'; + v.fpscr(FPSCR_FI) := '1'; + -- construct largest representable number + re_con2 <= RECON2_MAX; + re_set_result <= '1'; + arith_done := '1'; when ROUNDING => opsel_mask <= '1'; @@ -2527,6 +2519,8 @@ begin -- denormalized result that needs to be renormalized rs_norm <= '1'; v.state := ROUNDING_3; + elsif r.result_exp > max_exp then + v.state := ROUND_OFLOW_EN; else arith_done := '1'; end if; @@ -2540,49 +2534,40 @@ begin when ROUND_INC => set_r := '1'; opsel_a <= AIN_RND; - -- set shift to -1 - rs_con2 <= RSCON2_1; - rs_neg2 <= '1'; v.state := ROUNDING_2; when ROUNDING_2 => -- Check for overflow during rounding - -- r.shift = -1 - v.x := '0'; - re_sel2 <= REXP2_NE; - opsel_r <= RES_SHIFT; - set_r := '0'; - if r.r(UNIT_BIT + 1) = '1' then - set_r := '1'; - re_set_result <= '1'; - if exp_huge = '1' then - v.state := ROUND_OFLOW; - else - arith_done := '1'; - end if; - elsif r.r(UNIT_BIT) = '0' then + -- r.shift = 0 + if r.r(UNIT_BIT + 1) = '1' or r.r(UNIT_BIT) = '0' then -- Do CLZ so we can renormalize the result rs_norm <= '1'; v.state := ROUNDING_3; + elsif exp_huge = '1' then + v.state := 
ROUND_OFLOW_EN; else arith_done := '1'; end if; when ROUNDING_3 => - -- r.shift = clz(r.r) - 9 + -- r.shift = clz(r.r) - 7 opsel_r <= RES_SHIFT; set_r := '1'; re_sel2 <= REXP2_NE; - -- set shift to new_exp - min_exp (== -1022) + -- set shift to new_exp - DP min_exp (== -1022) rs_sel1 <= RSH1_NE; - rs_con2 <= RSCON2_MINEXP; + rs_con2 <= RSCON2_DPMINX; rs_neg2 <= '1'; rcls_op <= RCLS_TZERO; -- If the result is zero, that's handled below. -- Renormalize result after rounding v.denorm := exp_tiny; re_set_result <= '1'; - if new_exp < to_signed(-1022, EXP_BITS) then + if exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then + v.state := ROUND_OFLOW_DIS; + elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '1' then + v.state := ROUND_OFLOW_EN; + elsif new_exp < to_signed(-1022, EXP_BITS) then v.state := DENORM; else arith_done := '1'; @@ -2596,6 +2581,16 @@ begin re_set_result <= '1'; arith_done := '1'; + when ROUND_OFLOW_EN => + -- enabled overflow exception + -- rounding and normalization has been done + v.fpscr(FPSCR_OX) := '1'; + re_sel1 <= REXP1_R; + re_con2 <= RECON2_BIAS; + re_neg2 <= '1'; + re_set_result <= '1'; + arith_done := '1'; + when DO_IDIVMOD => opsel_a <= AIN_B; opsel_aabs <= '1'; @@ -3201,14 +3196,12 @@ begin arith_done := '1'; end if; when RCLS_TINF => - if r.fpscr(FPSCR_OE) = '0' then - if r.round_mode(1 downto 0) = "00" or - (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then - v.result_class := INFINITY; - v.fpscr(FPSCR_FR) := '1'; - else - v.fpscr(FPSCR_FR) := '0'; - end if; + if r.round_mode(1 downto 0) = "00" or + (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then + v.result_class := INFINITY; + v.fpscr(FPSCR_FR) := '1'; + else + v.fpscr(FPSCR_FR) := '0'; end if; when others => end case; @@ -3593,6 +3586,8 @@ begin rsh_in2 := to_signed(64, EXP_BITS); when RSCON2_MINEXP => rsh_in2 := min_exp; + when RSCON2_DPMINX => + rsh_in2 := to_signed(-1022, EXP_BITS); when others => rsh_in2 := to_signed(0, EXP_BITS); end case; diff --git 
a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 5c46b6f..5f0131c 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -682,6 +682,9 @@ struct roundvals { { FPS_RN_NEAR, 0x37c12345b0000000, 0x37c1234400000000, FPS_FI }, { FPS_RN_NEAR, 0x0000008800000088, 0, FPS_FI }, { FPS_RN_NEAR, 0xc2000000c2000000, 0xc2000000c0000000, FPS_FI }, + { FPS_RN_NEAR|FPS_OE, 0xefffffffffffffff, 0xe400000000000000, FPS_FR|FPS_FI }, + { FPS_RN_NEAR|FPS_OE, 0xff0000ff43434343, 0xf30000ff40000000, FPS_FI }, + { FPS_RN_NEAR|FPS_OE, 0xfc00fc0139fffcff, 0xf000fc0140000000, FPS_FR|FPS_FI }, }; int test8(long arg) diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index f68ea11be39a32ba6dab54ab64e3109f8f4b185a..229e70f68afa50d2af321912e16afa1f774477ee 100755 GIT binary patch delta 548 zcmZ9GO(;ZB6vxlKGlO}N$5WanA59@&kx6-K>h^2~8D(L?24!JkD>4`wdUiA2%`7Y| zC8?P!Y-A&a6dN0|V(~0|WNGr8`)2ZW>fU?)=XdV;pTkv}TBBtEECkRuep=W!hwo%8 zS*B5lC+j;XoTNvdOMR*c=m5PIfOP>d)>R9!2w-CJaa9XRY6=58&XAtpUfFiPr1BKy z*=}TW31Itn??2;dwbEl^SDas}Z8V$_pXv=!OhM5N##F)Y%ZG`{JGJI2G_ok#qnQQa zrs$o$!e&1nmRz#N7qkYSf&_;-bR$pQ4n5LpyhXxTmP1Ycs0Ed{n26<86oHB%MOx2P zuN68g)a`p{dq(UxcX()cS3I>xDDewDc%YD&cnkGYvcf~G!J86+bvYx!r`i0E5^rpj zkOUtv{3OMv41Y-im)dbdbbF93BzV%$>SV;zPC5-5A>_J#ldy+N`dyaFn>&Tm0I1H-D@ zThQeuZxj$`<=Cub$ujw@fIicUjhp`oTw~}A~FtF?#s4NRucCWBJ=ZZ~EjUYog zCO;K6=3?2L)BthfW+jmzE|Aq_a&7EjC#Y^?WxB9&^EPcGR;CwgH@`CQV`7xpY-zlm zi;-dTM_X?uMyAQO4(gl|o0U32W(Z8qb*M)0Hh*-8W@HqY?CT`WC^0$HNt#yx=%_U; z3^l9#9cl$8PjnJi1&J^-I9T#9Xj;N$fMNp3{K+Ss Date: Fri, 12 Dec 2025 10:12:10 +1100 Subject: [PATCH 32/38] FPU: Be more careful about preserving low-order bits in multiply-add instrs Add code to check whether bits of S which don't get shifted into R are non-zero, and set X if they are, so that rounding in multiply-add instructions works correctly. This needs to be done after normalization in the case of very small results, where potentially all the non-zero bits in S do get shifted into R. Also fix an incorrect test case, and add another multiply-add test case. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 22 +++++++++++++++++++--- tests/fpu/fpu.c | 7 +++++-- tests/test_fpu.bin | Bin 33560 -> 33624 bytes 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 07617af..7d8060a 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1003,6 +1003,7 @@ begin variable exp_huge : std_ulogic; variable clz : std_ulogic_vector(5 downto 0); variable set_x : std_ulogic; + variable set_xs : std_ulogic; variable mshift : signed(EXP_BITS-1 downto 0); variable need_check : std_ulogic; variable msb : std_ulogic; @@ -1056,6 +1057,7 @@ begin variable bneg : std_ulogic; variable ci : std_ulogic; variable rormr : std_ulogic_vector(63 downto 0); + variable sorms : std_ulogic_vector(55 downto 0); begin v := r; v.complete := '0'; @@ -1358,6 +1360,7 @@ begin invalid := '0'; zero_divide := '0'; set_x := '0'; + set_xs := '0'; qnan_result := '0'; set_a := '0'; set_a_exp := '0'; @@ -1931,6 +1934,8 @@ begin f_to_multiply.valid <= r.first; opsel_r <= RES_MULT; set_r := '1'; + opsel_s <= S_MULT; + set_s := '1'; if multiply_to_f.valid = '1' then v.state := FINISH; end if; @@ -1971,6 +1976,7 @@ begin when FMADD_2 => -- Product is potentially bigger here -- r.shift = addend exp - product exp + 64, r.r = r.b.mantissa + -- R contains B, S contains 0 set_s := '1'; opsel_s <= S_SHIFT; set_x := '1'; @@ -2408,9 +2414,6 @@ begin when FINISH => -- r.shift = 0 - if r.is_multiply = '1' and px_nz = '1' then - v.x := '1'; - end if; -- set shift to new_exp - min_exp (N.B. 
rs_norm overrides this) rs_sel1 <= RSH1_NE; rs_con2 <= RSCON2_MINEXP; @@ -2420,6 +2423,7 @@ begin v.state := NORMALIZE; else set_x := '1'; + set_xs := r.is_multiply; if exp_tiny = '1' then v.state := ROUND_UFLOW; elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then @@ -2441,6 +2445,7 @@ begin rs_con2 <= RSCON2_MINEXP; rs_neg2 <= '1'; set_x := '1'; + set_xs := r.is_multiply; if exp_tiny = '1' then v.state := ROUND_UFLOW; elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then @@ -2485,6 +2490,7 @@ begin re_sel2 <= REXP2_NE; re_set_result <= '1'; set_x := '1'; + set_xs := r.is_multiply; v.state := ROUNDING; when ROUND_OFLOW_DIS => @@ -3309,6 +3315,16 @@ begin end if; v.x := v.x or rormr(to_integer(unsigned(mshift(5 downto 0)))); end if; + -- Test if there are non-zero bits in S which won't get shifted into R + if set_xs = '1' and not is_X(r.shift) and r.shift < to_signed(56, EXP_BITS) then + if r.shift > to_signed(0, EXP_BITS) then + mshift := to_signed(55, EXP_BITS) - r.shift; + else + mshift := to_signed(55, EXP_BITS); + end if; + sorms := r.s or std_ulogic_vector(- signed(r.s)); + v.x := v.x or sorms(to_integer(unsigned(mshift(5 downto 0)))); + end if; asign := '0'; case opsel_a is when AIN_A => diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 5f0131c..89fb44f 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1543,9 +1543,9 @@ struct fmavals { /* +(1 + 2^-52) * +(1 + 2^-52) +- +1.0 -> +(2 + 2^-51), +2^-51, -(2 + 2^-51), -2^-51 */ { 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000, FPS_RN_NEAR, 0x4000000000000001, 0x3cc0000000000000, 0xc000000000000001, 0xbcc0000000000000 }, - /* +(1 + 3*2^-52) * +(1 + 2^-51) +- +1.0 -> +(2 + 2^-50), +5 * 2^-52 + 2^-101, -, - */ + /* +(1 + 3*2^-52) * +(1 + 2^-51) +- +1.0 -> +(2 + 3*2^-51), +5 * 2^-52 + 2^-101, -, - */ { 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000, FPS_RN_NEAR, - 0x4000000000000002, 0x3cd4000000000002, 0xc000000000000002, 0xbcd4000000000002 }, + 0x4000000000000003, 
0x3cd4000000000002, 0xc000000000000003, 0xbcd4000000000002 }, /* +2.443e-77 * 2.828 +- 6.909e-77 -> -1.402e-93, +1.382e-76, +1.402e-93, -1.382e-76 */ { 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000, FPS_RN_NEAR, 0xaca765753908cd20, 0x3030000000000000, 0x2ca765753908cd20, 0xb030000000000000 }, @@ -1615,6 +1615,9 @@ struct fmavals { /* 1 * -1 + tiny -> -1 + delta, -1, 1 - delta, 1 */ { 0x3ff0000000000000, 0xbff0000000000000, 0x00000000b2200102, FPS_RN_CEIL, 0xbfefffffffffffff, 0xbff0000000000000, 0x3fefffffffffffff, 0x3ff0000000000000 }, + /* from random exec tests */ + { 0x43eff79000000000, 0x00000000000000ff, 0x0000000000000081, FPS_RN_CEIL, + 0x014fd79870000001, 0x014fd79870000000, 0x814fd79870000001, 0x814fd79870000000 }, }; int test23(long arg) diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 229e70f68afa50d2af321912e16afa1f774477ee..e6a21b8d72d9e5f5afd9e0892d5f88ed437c4233 100755 GIT binary patch delta 334 zcmbQy#&n~NX@iFVW58rj0d3iU%}SOXo1H8f7*3vMU^sRA!vFu?3=9kjAnY-DqJT7` z$K2TI=svOP8{iUe~pc1$jnd(Ff#adMLU zJjNZH85MrBf^@2eGcrjmpKPb8wz)?A03(yZ;>~uNWlW3(o40BkgH*gUaA#!FSiRZE z$cm9kVgBYk<2FXd1(Uy;{AN;^viXxK6C-2AGk{Kr+!1NN(1( zy3WX{%wVVU;$M@)jL9D@r6+6H>|j!uy!nyMHYq`7pjUvvfeB1qv0>i4GW;^D&4lmo so&SS{7#JF%G(-sl10w@N!HnzvKng-b`5CNC8>2TI=svN<*@iUe~pdQ2{rd(CuW!sI0R zd5jgC85MrBf^@2eGcw&+HrY;7ZF7zK0Y;_|i#FS7mN79JY~HGE3{vsZz@3rl#j4Fl zMpleW59V#oGj3yKOql%DY=G!!NT=zRD=P J`Fw1rDgYd{U)lfw From f631dcd7001c9950d8ec680d09b470007020549d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 12 Dec 2025 12:44:13 +1100 Subject: [PATCH 33/38] FPU: Set FPRF correctly on multiply result that underflows rcls_op being set to RCLS_TZERO was not detecting a zero result after rounding for a multiply result that underflows, because S still had low bits of the product. To fix this, remove the 's_nz = 0' from the RCLS_TZERO test. We can't then use this test in the FMADD_6 state, but we really shouldn't be testing for zero there, before rounding, so remove that. 
Also simplify FMADD_6 state by not setting rs_norm and going always to FINISH state rather than going to NORMALIZE state. Add a test for this case (actually a fmadd with B=0). While here, remove a pointless assignment to f_to_multiply.valid in MULT_1 state, since r.first is never set here. Signed-off-by: Paul Mackerras --- fpu.vhdl | 16 ++++------------ tests/fpu/fpu.c | 2 ++ tests/test_fpu.bin | Bin 33624 -> 33688 bytes 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 7d8060a..48c021d 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1931,7 +1931,6 @@ begin v.instr_done := '1'; when MULT_1 => - f_to_multiply.valid <= r.first; opsel_r <= RES_MULT; set_r := '1'; opsel_s <= S_MULT; @@ -2022,25 +2021,18 @@ begin v.state := FMADD_6; when FMADD_6 => - -- r.shift = UNIT_BIT (or 0, but only if r is now nonzero) + -- r.shift = UNIT_BIT set_r := '0'; opsel_r <= RES_SHIFT; re_sel2 <= REXP2_NE; - rs_norm <= '1'; - rcls_op <= RCLS_TZERO; if (r.r(UNIT_BIT + 2) or r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then - -- S = 0 case is handled by RCLS_TZERO logic, otherwise... 
- -- R is all zeroes but there are non-zero bits in S + -- R is all zeroes but there may be non-zero bits in S -- so shift them into R and set S to 0 set_r := '1'; re_set_result <= '1'; set_s := '1'; - v.state := FINISH; - elsif r.r(UNIT_BIT + 2 downto UNIT_BIT) = "001" then - v.state := FINISH; - else - v.state := NORMALIZE; end if; + v.state := FINISH; when DIV_2 => -- compute Y = inverse_table[B] (when count=0); P = 2 - B * Y @@ -3197,7 +3189,7 @@ begin when others => end case; when RCLS_TZERO => - if or (r.r(UNIT_BIT + 2 downto 0)) = '0' and s_nz = '0' then + if or (r.r(UNIT_BIT + 2 downto 0)) = '0' then v.result_class := ZERO; arith_done := '1'; end if; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 89fb44f..ccf07f8 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1618,6 +1618,8 @@ struct fmavals { /* from random exec tests */ { 0x43eff79000000000, 0x00000000000000ff, 0x0000000000000081, FPS_RN_CEIL, 0x014fd79870000001, 0x014fd79870000000, 0x814fd79870000001, 0x814fd79870000000 }, + { 0x00000000ffffffff, 0x1fc771af627f62ab, 0x8000000000000000, FPS_RN_ZERO, + 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 }, }; int test23(long arg) diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index e6a21b8d72d9e5f5afd9e0892d5f88ed437c4233..2a7845bd7939f18fa5a2444a80996f887ced14ab 100755 GIT binary patch delta 660 zcmcc7#x$dyNtJ=Y!Ipu+EAbe+S0#{6WoYqAWMD`Die&@EvVmgRsSOiVjTsv@M!x5- zpRifUl7YeVB?H6D(?Dew3JeW?3_u3Q7LfSM+i(8=_XhDn7^IeCixW`%Bv9?C+ZT}J zI-qjv{1((RFs!=01&iD}zXqUO*XY2e}lk0@lq`Edcy#zWNEIkJ* z%>t3$CoIR+vDv8+WGv9YXTnCDEL)NqK-`YaiXy>Wj3*|S%Dravn4Bm-pYg|LMup$3 zAm^xtGcrXipKPb8wz)?A03&0;W?Rj2CdLh$w`v=M6udNWXJpLSY-nW7$e6G>*SMXL z@xkP;CclB4kEYCwj5{WOH9ODPG5M(ZdBz`;r&^o_l8Kf;a{ z3i~-pF-dHhoaH3tCIAeAH7pD@tNa~mL2RZ&UX?5inwH37%nS~eJZNG9DE!H%oa8_z zzH&0+gILe0198G;E$2Qd$^SqAJs!v*9cw8RD27(60$rl;rChG?)Y(5{`sR{rQ Cu;lmv delta 634 zcmZutO=uHA6n<|KsR?3&5tImPtRyzEYE86uwF%1#x~>p~9*Q2s2)!5-FJfDwS!1!Q 
z2dNA~4uZW2MFe>YIf)1gwqW(75IqPYu8Pt^PeJ3Gq+k>eJ3HUcyziT-PKvizMI6A$ z0@#J;UG^CNN1Q8m0pKKaMdpgk6-VG^JcIDy*pE>8D0S-qhdcmpj%@=maPq(#x5az) zZ_Vamh1I9LE}86q!QR>Ws}9Xt*6!!0%K$U=hyT&O=N;Cr*Qi4;uu`RW&5AMF_*nL}IC&|Oh}tZ1?UsCdS@yT( z#}O@Pk$G8-<$<3f>A(b?aul>^)7qgRy7IjC6<5l_#yeOej5SuMI`Lr?>+i~3GC`#+ zPZ0Yko4g?K96j2aQ3kK?AYu>gOz%UC(V5I7;r1Jv=ig;-ynbD_W%F@ From 80c81b58efeb7d77e6b300c813b234d18af2c92a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 12 Dec 2025 16:44:43 +1100 Subject: [PATCH 34/38] FPU: Generate correct result sign when B is denormal If a subtraction A - B is done where A is in normalized form with an exponent of -1022, and B is denormal, an inconsistency arises between the comparison of the raw exponents in the first cycle, which sees A.exp (0x001) > B.exp (0x000), and the comparison in DO_FADD state, which sees r.a.exponent (-1022) = r.b.exponent (-1022). Consequently we get r.add_bsmall = 0 and the subtraction is done the wrong way around, yielding the wrong sign for the result. Fix this by setting r.add_bsmall according to the comparison of raw exponents in the first cycle and then using it in DO_FADD state. Also add a test case for this. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 15 ++++---- tests/fpu/fpu.c | 92 +++++++++++++++++++++++---------------------- tests/test_fpu.bin | Bin 33688 -> 34432 bytes 3 files changed, 55 insertions(+), 52 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 48c021d..3bc7b3e 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1102,6 +1102,7 @@ begin v.is_addition := '0'; v.is_subtract := '0'; v.is_inverse := '0'; + v.add_bsmall := '0'; v.do_renorm_b := '0'; fpin_a := '0'; fpin_b := '0'; @@ -1140,6 +1141,8 @@ begin v.result_sign := e_in.fra(63); if unsigned(e_in.fra(62 downto 52)) <= unsigned(e_in.frb(62 downto 52)) then v.result_sign := e_in.frb(63) xnor e_in.insn(1); + else + v.add_bsmall := '1'; end if; v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor e_in.insn(1)); when "11001" => -- fmul @@ -1255,7 +1258,6 @@ begin end case; v.tiny := '0'; v.denorm := '0'; - v.add_bsmall := '0'; v.int_ovf := '0'; v.div_close := '0'; @@ -1705,15 +1707,13 @@ begin rs_sel1 <= RSH1_B; rs_neg1 <= '1'; rs_sel2 <= RSH2_A; - v.add_bsmall := '0'; - if r.a.exponent = r.b.exponent then + if r.add_bsmall = '1' then + v.state := ADD_1; + elsif r.a.exponent = r.b.exponent then v.state := ADD_2B; - elsif r.a.exponent < r.b.exponent then + elsif v.add_bsmall = '0' then v.longmask := '0'; v.state := ADD_SHIFT; - else - v.add_bsmall := '1'; - v.state := ADD_1; end if; when DO_FMUL => @@ -1856,6 +1856,7 @@ begin re_sel2 <= REXP2_B; re_set_result <= '1'; -- set shift to b.exp - a.exp + -- (N.B., shift can be 0 if B is denorm and A's exp is -1022) rs_sel1 <= RSH1_B; rs_sel2 <= RSH2_A; rs_neg2 <= '1'; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index ccf07f8..b72e069 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -975,51 +975,53 @@ struct addvals { unsigned long val_b; unsigned long sum; unsigned long diff; + unsigned long fpscr; } addvals[] = { - { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, - { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 
0x0000000000000000 }, - { 0x3fdfffffffffffff, 0x0000000000000000, 0x3fdfffffffffffff, 0x3fdfffffffffffff }, - { 0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000, 0x0000000000000000 }, - { 0xbff0000000000000, 0xbff0000000000000, 0xc000000000000000, 0x0000000000000000 }, - { 0x402123456789abcd, 0x4021000000000000, 0x403111a2b3c4d5e6, 0x3fb1a2b3c4d5e680 }, - { 0x4061200000000000, 0x406123456789abcd, 0x407121a2b3c4d5e6, 0xbfba2b3c4d5e6800 }, - { 0x4061230000000000, 0x3fa4560000000000, 0x4061244560000000, 0x406121baa0000000 }, - { 0xc061230000000000, 0x3fa4560000000000, 0xc06121baa0000000, 0xc061244560000000 }, - { 0x4061230000000000, 0xbfa4560000000000, 0x406121baa0000000, 0x4061244560000000 }, - { 0xc061230000000000, 0xbfa4560000000000, 0xc061244560000000, 0xc06121baa0000000 }, - { 0x3fa1230000000000, 0x4064560000000000, 0x4064571230000000, 0xc06454edd0000000 }, - { 0xbfa1230000000000, 0x4064560000000000, 0x406454edd0000000, 0xc064571230000000 }, - { 0x3fa1230000000000, 0xc064560000000000, 0xc06454edd0000000, 0x4064571230000000 }, - { 0xbfa1230000000000, 0xc064560000000000, 0xc064571230000000, 0x406454edd0000000 }, - { 0x6780000000000001, 0x6470000000000000, 0x6780000000000009, 0x677ffffffffffff2 }, - { 0x6780000000000001, 0x6460000000000000, 0x6780000000000005, 0x677ffffffffffffa }, - { 0x6780000000000001, 0x6450000000000000, 0x6780000000000003, 0x677ffffffffffffe }, - { 0x6780000000000001, 0x6440000000000000, 0x6780000000000002, 0x6780000000000000 }, - { 0x7ff8888888888888, 0x7ff9999999999999, 0x7ff8888888888888, 0x7ff8888888888888 }, - { 0xfff8888888888888, 0x7ff9999999999999, 0xfff8888888888888, 0xfff8888888888888 }, - { 0x7ff8888888888888, 0x7ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888 }, - { 0x7ff8888888888888, 0x0000000000000000, 0x7ff8888888888888, 0x7ff8888888888888 }, - { 0x7ff8888888888888, 0x0001111111111111, 0x7ff8888888888888, 0x7ff8888888888888 }, - { 0x7ff8888888888888, 0x3ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888 }, 
- { 0x7ff0000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999 }, - { 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000 }, - { 0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000, 0x7ff0000000000000 }, - { 0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 }, - { 0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 }, - { 0x7ff0000000000000, 0x8002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 }, - { 0x7ff0000000000000, 0xc002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 }, - { 0x0000000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999 }, - { 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 }, - { 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 }, - { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, - { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, - { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 }, - { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 }, - { 0x8002222222222222, 0x0001111111111111, 0x8001111111111111, 0x8003333333333333 }, - { 0x0000022222222222, 0x0000111111111111, 0x0000133333333333, 0x80000eeeeeeeeeef }, - { 0x401ffffffbfffefe, 0x406b8265196bd89e, 0x406c8265194bd896, 0xc06a8265198bd8a6 }, - { 0x4030020000000004, 0xbf110001ffffffff, 0x403001fbbfff8004, 0x4030020440008004 }, - { 0x3fdfffffffffffff, 0x3fe0000000000000, 0x3ff0000000000000, 0xbc90000000000000 }, + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, FPS_RN_NEAR }, + { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000, FPS_RN_NEAR }, + { 0x3fdfffffffffffff, 0x0000000000000000, 0x3fdfffffffffffff, 0x3fdfffffffffffff, FPS_RN_NEAR }, + { 0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000, 
0x0000000000000000, FPS_RN_NEAR }, + { 0xbff0000000000000, 0xbff0000000000000, 0xc000000000000000, 0x0000000000000000, FPS_RN_NEAR }, + { 0x402123456789abcd, 0x4021000000000000, 0x403111a2b3c4d5e6, 0x3fb1a2b3c4d5e680, FPS_RN_NEAR }, + { 0x4061200000000000, 0x406123456789abcd, 0x407121a2b3c4d5e6, 0xbfba2b3c4d5e6800, FPS_RN_NEAR }, + { 0x4061230000000000, 0x3fa4560000000000, 0x4061244560000000, 0x406121baa0000000, FPS_RN_NEAR }, + { 0xc061230000000000, 0x3fa4560000000000, 0xc06121baa0000000, 0xc061244560000000, FPS_RN_NEAR }, + { 0x4061230000000000, 0xbfa4560000000000, 0x406121baa0000000, 0x4061244560000000, FPS_RN_NEAR }, + { 0xc061230000000000, 0xbfa4560000000000, 0xc061244560000000, 0xc06121baa0000000, FPS_RN_NEAR }, + { 0x3fa1230000000000, 0x4064560000000000, 0x4064571230000000, 0xc06454edd0000000, FPS_RN_NEAR }, + { 0xbfa1230000000000, 0x4064560000000000, 0x406454edd0000000, 0xc064571230000000, FPS_RN_NEAR }, + { 0x3fa1230000000000, 0xc064560000000000, 0xc06454edd0000000, 0x4064571230000000, FPS_RN_NEAR }, + { 0xbfa1230000000000, 0xc064560000000000, 0xc064571230000000, 0x406454edd0000000, FPS_RN_NEAR }, + { 0x6780000000000001, 0x6470000000000000, 0x6780000000000009, 0x677ffffffffffff2, FPS_RN_NEAR }, + { 0x6780000000000001, 0x6460000000000000, 0x6780000000000005, 0x677ffffffffffffa, FPS_RN_NEAR }, + { 0x6780000000000001, 0x6450000000000000, 0x6780000000000003, 0x677ffffffffffffe, FPS_RN_NEAR }, + { 0x6780000000000001, 0x6440000000000000, 0x6780000000000002, 0x6780000000000000, FPS_RN_NEAR }, + { 0x7ff8888888888888, 0x7ff9999999999999, 0x7ff8888888888888, 0x7ff8888888888888, FPS_RN_NEAR }, + { 0xfff8888888888888, 0x7ff9999999999999, 0xfff8888888888888, 0xfff8888888888888, FPS_RN_NEAR }, + { 0x7ff8888888888888, 0x7ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888, FPS_RN_NEAR }, + { 0x7ff8888888888888, 0x0000000000000000, 0x7ff8888888888888, 0x7ff8888888888888, FPS_RN_NEAR }, + { 0x7ff8888888888888, 0x0001111111111111, 0x7ff8888888888888, 0x7ff8888888888888, 
FPS_RN_NEAR }, + { 0x7ff8888888888888, 0x3ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888, FPS_RN_NEAR }, + { 0x7ff0000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999, FPS_RN_NEAR }, + { 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000, FPS_RN_NEAR }, + { 0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000, 0x7ff0000000000000, FPS_RN_NEAR }, + { 0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, FPS_RN_NEAR }, + { 0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, FPS_RN_NEAR }, + { 0x7ff0000000000000, 0x8002222222222222, 0x7ff0000000000000, 0x7ff0000000000000, FPS_RN_NEAR }, + { 0x7ff0000000000000, 0xc002222222222222, 0x7ff0000000000000, 0x7ff0000000000000, FPS_RN_NEAR }, + { 0x0000000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999, FPS_RN_NEAR }, + { 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000, FPS_RN_NEAR }, + { 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000, FPS_RN_NEAR }, + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, FPS_RN_NEAR }, + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, FPS_RN_NEAR }, + { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000, FPS_RN_NEAR }, + { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000, FPS_RN_NEAR }, + { 0x8002222222222222, 0x0001111111111111, 0x8001111111111111, 0x8003333333333333, FPS_RN_NEAR }, + { 0x0000022222222222, 0x0000111111111111, 0x0000133333333333, 0x80000eeeeeeeeeef, FPS_RN_NEAR }, + { 0x401ffffffbfffefe, 0x406b8265196bd89e, 0x406c8265194bd896, 0xc06a8265198bd8a6, FPS_RN_NEAR }, + { 0x4030020000000004, 0xbf110001ffffffff, 0x403001fbbfff8004, 0x4030020440008004, FPS_RN_NEAR }, + { 0x3fdfffffffffffff, 0x3fe0000000000000, 0x3ff0000000000000, 0xbc90000000000000, FPS_RN_NEAR }, + { 
0x001000100010000f, 0x00000000000000ff, 0x001000100010010e, 0x00100010000fff10, FPS_RN_CEIL }, }; int test13(long arg) @@ -1029,8 +1031,8 @@ int test13(long arg) struct addvals *vp = addvals; unsigned long fpscr; - set_fpscr(FPS_RN_NEAR); for (i = 0; i < sizeof(addvals) / sizeof(addvals[0]); ++i, ++vp) { + set_fpscr(vp->fpscr); asm("lfd 5,0(%0); lfd 6,8(%0); fadd 7,5,6; fsub 8,5,6; stfd 7,0(%1); stfd 8,8(%1)" : : "b" (&vp->val_a), "b" (results) : "memory"); fpscr = get_fpscr(); diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 2a7845bd7939f18fa5a2444a80996f887ced14ab..ed714b776907036c03b56b2d80d67a521a382909 100755 GIT binary patch delta 1459 zcmbVKT}V@582--Z&wdC!Y3TA|9T%->8Maa~a~&s?tyobYB?PnF&gQ0*gtD+am8b=2 z?Y$ZFx4MXs8p^42X`_pNp`V+&Fl<3cix|=$hf2yMwAlzB`F z*D;9qB0@&xd}iiDF1J;Rr6bjjif&T>kkme!d8X5PRW0dF3?ei!bL$UU*2@#^^h{jS z4Dz5M5uVVi@Z@db8~uVZkBXS=Bh7=^k~TalLVanql}yiIvLV?ZgIcQxUgb`({a@Y= z=E*rYR%)*}KqRubEMHLNd_|rvwVHcH_i?E&X*gHvG!KdUcq!}^1>2&XW@-FE>##vy zV_6%8*$Hq}*@c1SDGKktCb+5&VjRCMh%1~0cyda3U%QFIJCcIaxt_wUSB0K60m`wu z7`0x>RnAL&5lT0*=kGQ$=n6TmOf9PXT#Md37;8e`;Op96$_L$bjV1@0ZLC2XS8jl* zx-XPNxLMy~z`Q7YYVu%wLIf9UH^)WWDXHaI5qj7ox={GUwo{m-z3vJe7op2tvFR*v z;6^?gnApx9l{miZIh>EG8uYn5#5OGWEl2+n!y~s<>iFL6EK zh&7iGd)2to-ip;ly6jbiRHEiX&4-%LYQYt~7)vVS_ z376zlf&}%}vm08rWG&|?$_`sR)l1;D#TN*;>>~wf>pNm?m(O&}-l`zU*&DS8V$I(2 znPC+n~247RH?dXqtJ@l&Ijd zQwzp&JiB5b$z_6LE)m+1`^XFAE3&;tgo8*Oa-+roPqEG+{3#LAkut=O^di&9G9tRo z;Bbr3f~1{Pfwo!+{cZtf-FoQ2s;N-Hh(~}p&ko4*B!JsvhAWk72zs-z{0^u&_*xTW)c(24chpREA`~O1>kQC? 
zbrhLLgnALOkV+(gOw{Z7K=@nkgg0S91DXasq&58iVug+WUhG7Z0sKw)^N?v|xhV Date: Fri, 12 Dec 2025 18:51:13 +1100 Subject: [PATCH 35/38] FPU: Improve accuracy in multiply-add almost-cancellation cases There are two paths for multiply-add instructions; one where the product is larger or nearly the same as the addend, which does the addition/subtraction in the multiplier with 128-bit accuracy; the other is used when the addend is clearly larger, which shifts the product right before doing the addition/subtraction in 64-bit arithmetic. The threshold for the second path is that B_exp has to be greater than A_exp + C_exp + 1, the +1 being because the product mantissa can be greater than 2. This increases the +1 to +2 to make sure that the 128-bit path is used when there is any chance of cancellation of the high-order bits of the sum. With the +1 threshold we could still get close to cancellation when the mantissas of A and C were nearly 2 and the mantissa of B was 1. This improves accuracy and avoids the need to do a 120-bit subtraction in the second path. Signed-off-by: Paul Mackerras --- fpu.vhdl | 14 +++++++++----- tests/fpu/fpu.c | 2 ++ tests/test_fpu.bin | Bin 34432 -> 34496 bytes 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 3bc7b3e..272e475 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1786,14 +1786,16 @@ begin re_set_result <= '1'; -- put b.exp into shift rs_sel1 <= RSH1_B; - if (r.a.exponent + r.c.exponent + 1) < r.b.exponent then - -- addend is bigger, do multiply first + if (r.a.exponent + r.c.exponent + 2) < r.b.exponent then + -- addend is definitely bigger, do multiply first -- if subtracting, sign is opposite to initial estimate f_to_multiply.valid <= '1'; v.first := '1'; v.state := FMADD_0; else - -- product is bigger, shift B first + -- product may be bigger, or the answer might be + -- close to 0; shift B first so the multiplier does + -- the add/subtract operation. 
v.state := FMADD_1; end if; @@ -1961,8 +1963,8 @@ begin end if; when FMADD_1 => - -- shift is b.exp, so new_exp is a.exp + c.exp - b.exp - -- product is bigger here + -- shift is b.exp, so new_exp is a.exp + c.exp - b.exp (>= -2) + -- product may be bigger here -- shift B right and use it as the addend to the multiplier -- for subtract, multiplier does B - A * C re_sel2 <= REXP2_B; @@ -3342,6 +3344,8 @@ begin ci := '0'; case opsel_c is when CIN_SUBEXT => + -- Used with opsel_b = BIN_ADDSUBR, which will invert it if + -- r.subtract = 1, hence we use r.x here, rather than not r.x. ci := r.is_subtract and r.x; when CIN_ABSEXT => ci := r.r(63) and (s_nz or r.x); diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index b72e069..535d77a 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1622,6 +1622,8 @@ struct fmavals { 0x014fd79870000001, 0x014fd79870000000, 0x814fd79870000001, 0x814fd79870000000 }, { 0x00000000ffffffff, 0x1fc771af627f62ab, 0x8000000000000000, FPS_RN_ZERO, 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 }, + { 0x41efffffffe00000, 0xc1efffffffe00000, 0x43f0000000000000, FPS_RN_CEIL, + 0x41fffffffff00000, 0xc3ffffffffe00000, 0xc1fffffffff00000, 0x43ffffffffe00000 }, }; int test23(long arg) diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index ed714b776907036c03b56b2d80d67a521a382909..3d6dcce8e98d9d097e26d3664fb1a4b82f2d3d1a 100755 GIT binary patch delta 698 zcmZvYPiPZS5XNU-B2A*0U<6BqSPO|wEJYKoi8f)~fPW&ygNLFAF@;_X3Ld0ZqFH;f z3f1ZZ$w9E^AQF9=*OOF5(F(R+REUCtimPJNf*^{n^OAs3Tv%r2`+Ym}vBnsi8e;(f zEdh|uK4?o1QyqlU=`6qj(sHEbNXrcx7Xnc7LtX*nWukt#iPWmJAPFH7JSh>lrWhM#$rRvT9urC>d*x73F4{NI*vP=HN(}Okf z)AJmi4#_AEo#U^T^)rX$z22~&Zkvqq(R~wkg!3=05$)KIw*^~W<0rjA-;IiHQtUdu z+{E?TT1yMxdm~P}95B>a7OR)#G*)qh8VsOx7J3L!Cbc;aa^pAY(X3qhoVD>hIqGOG2(=n-)KYp#=rK?
z;6r|Y(+rDUeo*&q^U!~|YKMGjw{}AH>~4D44xxCf zW&BKOX(0VvmXdj>*|9c)p{w#nxN^cXj6{wr{uk+WkP6VN$8>n7h7 z3TNurHrY&AOAcuFOD3TEfcC#+@X)Hc^S{ZV;`ft!X0X`gI$<@bBA^t|*U+0$zj!GJ59CCHR=Z#86!5^YL+uG&e*(F+Zd$arGYymW58xZBWp%RkIlKp<&2CD zo1dC+GcqP@{%Fd~$hc$jSF`hs9g~lmpJ)0paq={aQ%pblCMQ_}sou?cR@WFgRT%7a zUi@ovm@)aIrSxPCn;k%{4{i7vnHsij{$%UL#MHEHvYmq(*NkmSogfEK*fu%Op$dt+ z`IAEwBNI^A&q<1D#PEu|YfZ?!)g`sAZze6pE&2-4Cl7&Ij5?PFy!NHOTO-ulV kKlzlC9LU61PDXqX>p68GPS~vFd_iimRj}x0|F}X80DtP}OaK4? From f8a11420cace770513c2645c9e670090f895bfa9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 13 Dec 2025 11:31:31 +1100 Subject: [PATCH 36/38] FPU: Check for rounding overflow in 32-bit convert-to-integer operations Without this, rounding a value of 0xFFFFFFFF up, giving 0x100000000, will yield an incorrect result of zero. Signed-off-by: Paul Mackerras --- fpu.vhdl | 8 ++++++-- tests/fpu/fpu.c | 3 +++ tests/test_fpu.bin | Bin 34496 -> 34616 bytes 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 272e475..751b400 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -2356,10 +2356,13 @@ begin -- Check for possible overflows case r.insn(9 downto 8) is when "00" => -- fctiw[z] + -- check bit 32 in case of rounding overflow need_check := r.r(31) or (r.r(30) and not r.result_sign); when "01" => -- fctiwu[z] - need_check := r.r(31); + -- check bit 32 in case of rounding overflow + need_check := r.r(31) or r.r(31); when "10" => -- fctid[z] + -- can't get rounding overflow for 64-bit conversion need_check := r.r(63) or (r.r(62) and not r.result_sign); when others => -- fctidu[z] need_check := r.r(63); @@ -2380,7 +2383,8 @@ begin msb := r.r(63); end if; if (r.insn(8) = '0' and msb /= r.result_sign) or - (r.insn(8) = '1' and msb /= '1') then + (r.insn(8) = '1' and msb /= '1') or + (r.insn(9) = '0' and r.r(32) /= r.result_sign) then v.state := INT_OFLOW; else if r.fpscr(FPSCR_FI) = '1' then diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 535d77a..7a4b935 100644 --- 
a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -756,6 +756,8 @@ struct cvtivals { { 0x7ff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, { 0xfff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, { 0xbfd123456789abcd, 0, 0, 0, 0, {0, 0, 0, 0} }, + { 0x41effffffff00081, 0x100000000, 0x100000000, 0x7fffffff, 0xffffffff, { 0, 0, 1, 1 } }, + { 0xc1e0000000000000, 0xffffffff80000000, 0x0000000000000000, 0x80000000, 0x00000000, { 0, 1, 0, 1 } }, }; #define GET_VXCVI() ((get_fpscr() >> 8) & 1) @@ -830,6 +832,7 @@ struct cvtivals cvtizvals[] = { { 0xfff0000000000000, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, { 0x7ff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, { 0xfff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, + { 0xc1e0000000000000, 0xffffffff80000000, 0x0000000000000000, 0x80000000, 0x00000000, { 0, 1, 0, 1 } }, }; int test10(long arg) diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 3d6dcce8e98d9d097e26d3664fb1a4b82f2d3d1a..0ae6099b6e9a2a79e0396313b794f59f4f938822 100755 GIT binary patch delta 3934 zcmai1eNa=`6~FgMAXv0O_z)ub;5!5`h(a(R2?~U7+!aTyR#>&-SUc{jJGdkyxau5fAPrOhR?OuMjr`BY8xYGbL4X4^4V0;y5q~A2WdJ& zQwYhi5n}s>kW#IJSPKaXwNU0QNn_gDjtsQuLhavfU*q{are$}uLwB<9poNfQw@**j zL92J5s1~hi;Hvj5uDrc%-1}SVo0>q3LWB+uqmoih2i)~tLTh2JM6Noeu`BJ0P#cF! 
z!UfofF~-!#Z`W769A>W%e6{=@sQxkOIXa%33Z2D_hhGQ`_tU)ii`KL)l7?|^yCNN-8$uLbOA**Y?R9`_@|o^DU!(1gfV&~@ z(2bBFOZ3f*AjVc+`I6-`5Ls1G-VPti;;9B&WZB$UI6RddLeMBz(|kBDuS7XTp{566 ztsn?>5V>RZZAj($?a{uzqG&0U|<9VyDy!~q(A^p0F=eI2i(yVhMdn*Sb zZIxp3OPgP=4y2Lu<`k<;eq_6W5?*H&7W&X)LYL8m+A1Ezm`QQ8X#m?pgIG^uwlUlT z-I0=zPGt2&i47EOI*qbXkYh|oP<{n8O75veiG{ahK@6|Q|5~|tB8L_7gJXo8y}1mUqoiEgIQ%Ipm{X3!t*97_5i9IPw(EKR zP--#C74I%Edt=4vf7lV58O!)+Quc`H(9Xaxo5O7GjPiN&QAbFKU^m%@?c705#A*_aMgRc%!dFOv(Hd`zZ=D*FsA|Wx`b- zVr+iX_5C{W=U+RHW3`Tus>@=s|MJcdGCo7miS6EZ6Du=Yo?>6qUERtd%^fWN@FrAm=nod3A();RKwm*b zVlu6Rm5G%ikg%|g0goRD|86D;`s;ZEI13y3dRZ}Lf*MZqc+zf4I*|f zQZHgwB>SNWGhcyab1JD4dgr8bgCh`}Br~wp3ww$=-g)Gj?^VPs96Qi#8EkoHk#nv= zCv2U&ic?O9TS>t%GB=vuhoB_6i`(!2J@o(01^ z*A=urbe-Ld*m?F5*fB4U)o$Lq?9l1tWkUa8U$Gv#^uRIOaq!L{cJ8Sb#Lfq=2{G#_ z=O(B|j`h_h$8Oca30)=a0#&LyXX0hD?Y`^Fq}72ZUM9^9JH1RA5j($3cEP^XC@#T6 zXsrI!86362|59b~&VBHzU&zJE4lVO*L+o6M5qHQ+TaJZnPK&ZE#W#;Pp^skj_lFKo z5u3*DF=GXOF_O{>sl~+e@1>TajkP1R@jmKU!Xk89&JHCm)E}K6<~~9@`)SyOJQcXg zH(UjBPjGHf51)4dTUvt!V)l4)B*M>_^8zm3gQvd1P#@&kW}g2QrQ0D*ALY5bBeae! 
zsR1_XE1l3Y{Q-Io8Vz641}Mmo(%rB=LprSizbTKq5!10Ey}1DzGb9m=nL_j_UPP7h zMuVKUGls4VIeM^VBu=x(^Nh8}R4rHs2Nw+cU-^dT@kd*@ym4gegV4NClGb;H=a1VJ zglQJzz2z#;E1bA;lxJ2>_<;oShw7DjQY-$HO^jA+H*dHysQ3Im}u(~tJU)=W7k zz6PIW$~doXXvfE#Zul-!hF@=^nR5KZ_0N*i<1iP;9w@NAcaSmfu6ex~Lu{4T$p6?<}s@-3pwBjk7FAATEA55?h)r(O&1y>6b@sAHJlKA&M!%ufd z9?|&J<81_0kvh)w1J5&tT`X%W-VMmbvQ5B?ajYswp7VS;bmI@7Y^@LQA9%n${IgY6 z?A{hzygWwMe*tZ*JYM*mxU7Oe3B`GdZJ&R1e@cEe89|$W8<@rsL3ko*!5e4O2^uYa M+G|^V)g1Bv0imrXV>;uAcc?!s`%gQbaEKf0N8q#Q3EmiAdLa>=dcG<sP~!qh|8k?3=Zlz>y=`vUN2LcGgdDqZ za<&ZG1A@hp&Ip|ddIFNUu~QvW0e_|Q_ofoFutFz?tCET{r=frTOH=|GGG&~^Eu0oa zxHxnP9dHqIyzM5x(`bEs*D;4-V3);&{{D213l|Wi``uEJAnlSV{n}3F)C&v2uqGO3Z%NrGNwm=Di4 zXyc-xKm0{{3Q`qm^IM{bxt&)}uz2o9*ICC-!JCRCYK6m!d~Q_~{9ADlnP#Pi9)d1q zEz&xbhE71GDw`{bgd?gY2o0+?P$!hAgE7uZbuA~2fIDg?P)BSCbygqNN{CaN`upWY8gUtQejb^IF&>Q^Mf$xL-oi07N>6vg>L_Jmm3-RwvN4A(g{&9s z*0{07xH~J{bPCS-3A9FKi$fK}To26|vPhnH>c>u<*Ap_Jw?4e?P?1)>i0rSu8||u< zkh;;p!o?tuUhL?Lv%N}ons|O)7SHd@ZmM3d4MA1)|M()FKbRGOuJ6MTtT)y>RwL#^ z&grey`}KarK4Kl(^6_1|qMJi%?JoSZrm5K~@ zziS{Ld|z_|U%P9w|1hb#vxm)cppXCJZN4NG8nEU}C*8R}sU z4`wkNrt==^EI}C|;ey_59aa)m!aQ<8t|b=dES(^sZc0#z)lJjWx>*ss84ksVL0_yt z?SkRh?C{*lnOJI{#wRZk3DKWP`U3x&@^by=kRj~LMDs|IK_v;soViXTQbBI_} z7)=TCf-8s#ZHzo6*}4j$(jx79!Lm?qvR3yCIcmMye*5<5tAjjAib$Uxd^} zui;VEyeN$xhR+r$6WJEb_}KDK?-?7B_q4tb(#+-9JeJHf?&<^he38P-yl2?kybF0x z^Dr1h^F~P3CR0DKXuTJwv#iJOdRv#l3GH(1ttV)0fJB|Qbp~pC)?3sp>jtRNE$1Hj z!IV}G!#WvPH3~^OZ~r$9dCyUfyr=(XAa`*QOWxv@tnn5rEkf;JO|km1-*H>Jn{O-f zp7pc_dCz((L7vrR`*5+No;r89v&E3NdbP?_p_tNbT6{PA`Y^Bh~2&^YI_S>t$CNc(^gq?0uw zTd);*&lC|~n96zu%fH+ZBW=TDDS*(C34U<&$Q+67Bs3djThAa$IwP9jzdd~k5aTrG zSZ|`wJ%}CQcGzZe(vA380jqtA^&)l{u}hcvq8cpk6wDRVXAQ($@?6o@Azd;=MO^2) z5f}(kEx=#%ijdBTJ?tAGT$ue5QlQ-^1EaCR@2Lci8b846R)Oi`@PJP4+b}{5L?yBO zYH6Xp0TaWlDW)$3)H(9p0KC~A7ZKaBsJ&zDU@gkeT} z8xTA0fvrM1462sN7R2`O{0nXg``Yl;!u!i&xVt@Yepx&f!GmSVcvTF~RdQ{AgC)5N z?qn~l#m~)NsLNHLLQAfaCO{XCtKb%nHh6;LAqdY?au+Ybk~{_X^(9!Fr=VeA&r{N6 z(1PO@=)!R?+(NBSzJe*#y89I<@)gt{4Eai$4r_3H73?@RKuf-oJAN7dkgwpbT!wzs 
zN`NQ%$^{KqcpmpfcvLtBiU@fJQq7TE-xYYt9L-H!fjV;%dUnE`%ynOdeseTVCoa%X z0;vUh9JdPN(E<(Eb`Abspx{QX!#4%drCB$4o=wS~bbJHx`e#OLBSmJJpNW`$?eiP) zeo@Pu-*O*Nsj1HNoRe0~eU9YZ?!R^O#&sXl4}|0wqn-689K*ZtfkKhE3E40>T9}=5 z1*vnlzw@8%)(B$Vh~XVXc#3hHL~Qal1Qit~7W)$N>JZQGb&Huv1!6Z5Yj9(%2(}|u z42?w^@gby6LRV3;xDTmr7)Q!qLP#$}6=#agNcn@MSR=L~)d$T;eTdX?_!Ox=q>5p@ zSmP)0Bjo-)p1%n7=0MPu$i!Mic7wV^Bi@MA3s8^LKBUe;J5t?94Zt8$6G)jsTB;Gp z&c*P+gw)GO?T4z;w78eYzw?{L!hG*W%!yz2#NunjHywIPHE|pN%kvCj57TzyYeOu7 z?Fc>$W0~0t>g9VtQ7Hxf4PUs8e^%?td^!@!UWtbrcy~))De1Vqbp%NM E4@ZF3`2YX_ From 1ad88486550b07dc91c41655fc961330ae909ee5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 14 Dec 2025 08:42:23 +1100 Subject: [PATCH 37/38] FPU: Improve zero result detection and simplify final states This improves detection of results that are exactly zero in FINISH state by noting that on entry to FINISH state, if R is zero then X must also be zero, so no rounding needs to be done and no underflow exists. Therefore we can set rcls_op = RCLS_TZERO to test for zero and exit early if R = 0. The RCLS_TZERO test now tests the whole of R just in case. The rest of the following states have been streamlined and simplified. In cases of underflow, we only need to take action before rounding in the UE=0 case (disabled underflow exception), where we need to denormalize before rounding. For enabled underflow cases we just use the existing NORMALIZE state, which lets us remove NORM_UFLOW state. On entry to ROUNDING state, R can be zero or denorm only for round to integer instructions (fri*) or for disabled underflow exception cases. Note that in case of underflow with UE=0, the exception is only actually signalled if there is loss of accuracy, i.e. if FPSCR[FI] will be set. This is now done at the end of ROUNDING state. For underflow with UE=1, we go to a new ROUND_UFLOW_EN state to adjust the exponent from ROUNDING, ROUNDING_2 or ROUNDING_3 state. 
In the ROUNDING* states, we avoid shifting left to normalize a result with exponent <= -1022, because if we did we would then just need to denormalize again. This lets us get rid of DENORM state. Finally, noticing that DO_FRSP_2 state does much the same as FINISH state lets us remove DO_FRSP_2 state and go to FINISH state from DO_FRSP. Signed-off-by: Paul Mackerras --- fpu.vhdl | 136 +++++++++++++++++++++------------------------ tests/fpu/fpu.c | 2 + tests/test_fpu.bin | Bin 34616 -> 34680 bytes 3 files changed, 66 insertions(+), 72 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 751b400..52b35c2 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -51,7 +51,7 @@ architecture behaviour of fpu is DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT, DO_FCFID, DO_FCTI, - DO_FRSP, DO_FRSP_2, DO_FRI, + DO_FRSP, DO_FRI, DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD, DO_FRE, DO_FSEL, @@ -72,9 +72,9 @@ architecture behaviour of fpu is INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, - ROUND_UFLOW, NORM_UFLOW, ROUND_OFLOW_DIS, ROUND_OFLOW_EN, + ROUND_UFLOW_DIS, ROUND_UFLOW_EN, + ROUND_OFLOW_DIS, ROUND_OFLOW_EN, ROUNDING, ROUND_INC, ROUNDING_2, ROUNDING_3, - DENORM, RENORM_A, RENORM_B, RENORM_C, RENORM_1, RENORM_2, IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3, @@ -776,6 +776,9 @@ begin end if; else assert not (r.state /= IDLE and e_in.valid = '1') severity failure; + assert not (rin.state = FINISH and rin.r = 64x"0" and rin.x = '1'); + assert not (rin.state = ROUNDING and rin.r(UNIT_BIT) = '0' and + not (rin.tiny = '1' or rin.zero_fri = '1')); r <= rin; end if; end if; @@ -1630,22 +1633,7 @@ begin set_r := '1'; re_sel2 <= REXP2_B; re_set_result <= '1'; - v.state := DO_FRSP_2; - - when DO_FRSP_2 => - -- r.shift = 0 - -- set shift to exponent - -126 (for ROUND_UFLOW state) - rs_sel1 <= RSH1_B; - rs_con2 <= RSCON2_MINEXP; - rs_neg2 <= '1'; - set_x := '1'; -- uses r.r and r.shift - if exp_tiny = '1' then - v.state 
:= ROUND_UFLOW; - elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then - v.state := ROUND_OFLOW_DIS; - else - v.state := ROUNDING; - end if; + v.state := FINISH; when DO_FCTI => -- instr bit 9: 1=dword 0=word @@ -2414,17 +2402,20 @@ begin when FINISH => -- r.shift = 0 -- set shift to new_exp - min_exp (N.B. rs_norm overrides this) + -- assert that if r.r = 0 then r.x = 0 also rs_sel1 <= RSH1_NE; rs_con2 <= RSCON2_MINEXP; rs_neg2 <= '1'; + rcls_op <= RCLS_TZERO; if r.r(63 downto UNIT_BIT) /= std_ulogic_vector(to_unsigned(1, 64 - UNIT_BIT)) then rs_norm <= '1'; v.state := NORMALIZE; else set_x := '1'; set_xs := r.is_multiply; - if exp_tiny = '1' then - v.state := ROUND_UFLOW; + v.tiny := exp_tiny; + if exp_tiny = '1' and r.fpscr(FPSCR_UE) = '0' then + v.state := ROUND_UFLOW_DIS; elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then v.state := ROUND_OFLOW_DIS; else @@ -2445,51 +2436,25 @@ begin rs_neg2 <= '1'; set_x := '1'; set_xs := r.is_multiply; - if exp_tiny = '1' then - v.state := ROUND_UFLOW; + v.tiny := exp_tiny; + if exp_tiny = '1' and r.fpscr(FPSCR_UE) = '0' then + v.state := ROUND_UFLOW_DIS; elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then v.state := ROUND_OFLOW_DIS; else v.state := ROUNDING; end if; - when ROUND_UFLOW => + when ROUND_UFLOW_DIS => -- r.shift = - amount by which exponent underflows - v.tiny := '1'; + -- disabled underflow exception case + -- have to denormalize before rounding opsel_r <= RES_SHIFT; set_r := '0'; - if r.fpscr(FPSCR_UE) = '0' then - -- disabled underflow exception case - -- have to denormalize before rounding - set_r := '1'; - re_sel2 <= REXP2_NE; - re_set_result <= '1'; - set_x := '1'; - v.state := ROUNDING; - else - -- enabled underflow exception case - -- if denormalized, have to normalize before rounding - v.fpscr(FPSCR_UX) := '1'; - re_sel1 <= REXP1_R; - re_con2 <= RECON2_BIAS; - re_set_result <= '1'; - if r.r(UNIT_BIT) = '0' then - rs_norm <= '1'; - v.state := NORM_UFLOW; - else - v.state := ROUNDING; - end if; - 
end if; - - when NORM_UFLOW => - -- normalize for UE=1 underflow case - -- r.shift = clz(r.r) - 7 - opsel_r <= RES_SHIFT; set_r := '1'; re_sel2 <= REXP2_NE; re_set_result <= '1'; set_x := '1'; - set_xs := r.is_multiply; v.state := ROUNDING; when ROUND_OFLOW_DIS => @@ -2508,6 +2473,8 @@ begin arith_done := '1'; when ROUNDING => + -- r.r can be zero or denorm here for fri* instructions, + -- and for disabled underflow exception cases. opsel_mask <= '1'; set_r := '1'; round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign); @@ -2520,10 +2487,22 @@ begin -- increment the LSB for the precision v.state := ROUND_INC; elsif r.r(UNIT_BIT) = '0' then - -- result after masking could be zero, or could be a - -- denormalized result that needs to be renormalized - rs_norm <= '1'; + -- Result after masking could be zero, or could be a + -- denormalized result that needs to be renormalized, + -- but only for fri* instructions and for disabled + -- underflow exception cases. + -- For fri* instructions, result_exp is 52. + -- For disabled underflow exception cases for DP operations, + -- result_exp is -1022 and there is no point renormalizing + -- since it will just get denormalized again, but we do need + -- to check for a zero result in a subsequent cycle + -- after R is masked. 
+ if r.result_exp > to_signed(-1022, EXP_BITS) then + rs_norm <= '1'; + end if; v.state := ROUNDING_3; + elsif r.tiny = '1' and r.fpscr(FPSCR_UE) = '1' then + v.state := ROUND_UFLOW_EN; elsif r.result_exp > max_exp then v.state := ROUND_OFLOW_EN; else @@ -2531,9 +2510,9 @@ begin end if; if round(0) = '1' and r.zero_fri = '0' then v.fpscr(FPSCR_XX) := '1'; - if r.tiny = '1' then - v.fpscr(FPSCR_UX) := '1'; - end if; + end if; + if round(0) = '1' and r.tiny = '1' then + v.fpscr(FPSCR_UX) := '1'; end if; when ROUND_INC => @@ -2544,18 +2523,30 @@ begin when ROUNDING_2 => -- Check for overflow during rounding -- r.shift = 0 - if r.r(UNIT_BIT + 1) = '1' or r.r(UNIT_BIT) = '0' then + if r.r(UNIT_BIT + 1) = '1' then -- Do CLZ so we can renormalize the result rs_norm <= '1'; v.state := ROUNDING_3; + elsif r.r(UNIT_BIT) = '0' then + -- R is non-zero (we just incremented it) + -- If result_exp is -1022 here, don't normalize since + -- we would then need to denormalize again. + if r.result_exp > to_signed(-1022, EXP_BITS) then + rs_norm <= '1'; + end if; + v.state := ROUNDING_3; elsif exp_huge = '1' then v.state := ROUND_OFLOW_EN; + elsif r.tiny = '1' and r.fpscr(FPSCR_UE) = '1' then + v.state := ROUND_UFLOW_EN; else arith_done := '1'; end if; when ROUNDING_3 => - -- r.shift = clz(r.r) - 7 + -- r.shift = clz(r.r) - 7 (or 0, or -7, if r.r is 0) + -- Note clz may be done on the value before being masked + -- to the result precision. 
opsel_r <= RES_SHIFT; set_r := '1'; re_sel2 <= REXP2_NE; @@ -2572,20 +2563,12 @@ begin v.state := ROUND_OFLOW_DIS; elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '1' then v.state := ROUND_OFLOW_EN; - elsif new_exp < to_signed(-1022, EXP_BITS) then - v.state := DENORM; + elsif r.tiny = '1' and r.fpscr(FPSCR_UE) = '1' then + v.state := ROUND_UFLOW_EN; else arith_done := '1'; end if; - when DENORM => - -- r.shift = result_exp - -1022 - opsel_r <= RES_SHIFT; - set_r := '1'; - re_sel2 <= REXP2_NE; - re_set_result <= '1'; - arith_done := '1'; - when ROUND_OFLOW_EN => -- enabled overflow exception -- rounding and normalization has been done @@ -2596,6 +2579,15 @@ begin re_set_result <= '1'; arith_done := '1'; + when ROUND_UFLOW_EN => + -- enabled underflow exception + -- rounding and normalization has been done + v.fpscr(FPSCR_UX) := '1'; + re_sel1 <= REXP1_R; + re_con2 <= RECON2_BIAS; + re_set_result <= '1'; + arith_done := '1'; + when DO_IDIVMOD => opsel_a <= AIN_B; opsel_aabs <= '1'; @@ -3196,7 +3188,7 @@ begin when others => end case; when RCLS_TZERO => - if or (r.r(UNIT_BIT + 2 downto 0)) = '0' then + if or (r.r) = '0' then v.result_class := ZERO; arith_done := '1'; end if; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 7a4b935..a123f62 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1627,6 +1627,8 @@ struct fmavals { 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 }, { 0x41efffffffe00000, 0xc1efffffffe00000, 0x43f0000000000000, FPS_RN_CEIL, 0x41fffffffff00000, 0xc3ffffffffe00000, 0xc1fffffffff00000, 0x43ffffffffe00000 }, + { 0x3ff0000000000000, 0x000060fbffffefc1, 0x000060fbffffefc1, FPS_RN_NEAR, + 0x0000c1f7ffffdf82, 0x0000000000000000, 0x8000c1f7ffffdf82, 0x8000000000000000 }, }; int test23(long arg) diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 0ae6099b6e9a2a79e0396313b794f59f4f938822..cd1d6476befa3ae556c49f6fea75b975b3fb7b09 100755 GIT binary patch delta 284 
zcmdnd$MmC*X@iFVW5Hxk0d3iW?Mjv%+np>K7*3vMU^sRA!vFu?3=9kjAe=FIqJT7G z#^kjE{){U&{}s5;2-I{}s2NBm3Oh4qOzX;kHBWPGuiSN%C7(~V`5`P9`m%V-8LF;3W=t8ENYu+YGrk?Mjv%+np>K7*3vMU^sRA!vFu?3=9kjAgnQYqJT7` z#^kjE{)`!${|ek^1Zuh~)C?pOg`F9-CNC8>2TI=svNbjSa+QuLS3k}>E896q;G_+=9 z?AWYoT+YbYuz9WtHzVVM%_~ir8JT`ep1jWN9Mg$Olk3dSF Date: Thu, 4 Dec 2025 08:48:27 +1100 Subject: [PATCH 38/38] FPU: Update committed FPSCR value correctly The committed FPSCR is updated in the cycle where an FPU instruction signals completion. Since we update the FPRF field in the FPSCR in that same cycle, the value put into r.comm_fpscr needs to include the new FPRF value. Otherwise, a subsequent flush (for example, due to the following instruction being an illegal instruction that has to be emulated) will drop the FPSCR update. Signed-off-by: Paul Mackerras --- fpu.vhdl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fpu.vhdl b/fpu.vhdl index 52b35c2..190f4a3 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1077,7 +1077,6 @@ begin v.writing_fpr := '0'; v.writing_cr := '0'; v.writing_xer := '0'; - v.comm_fpscr := r.fpscr; v.illegal := '0'; end if; @@ -3728,6 +3727,10 @@ begin v.fpscr(FPSCR_FX) := '1'; end if; + if r.complete = '1' or r.do_intr = '1' then + v.comm_fpscr := v.fpscr; + end if; + if v.instr_done = '1' then if r.state /= IDLE then v.state := IDLE;