From dc1544db691a82dccdd6f6d43224d833dd4a1433 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 1 Aug 2020 19:17:36 +1000
Subject: [PATCH] FPU: Implement floating multiply-add instructions

This implements fmadd, fmsub, fnmadd, fnmsub and their
single-precision counterparts.  The single-precision versions operate
the same as the double-precision versions until the final rounding and
overflow/underflow steps.

This adds an S register to store the low bits of the product.  S
shifts into R on left shifts, and can be negated, but doesn't do any
other arithmetic.

This adds a test for the double-precision versions of these
instructions.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   8 ++
 fpu.vhdl                   | 244 +++++++++++++++++++++++++++++++++++--
 tests/fpu/fpu.c            |  71 +++++++++++
 tests/test_fpu.bin         | Bin 29632 -> 30416 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 314 insertions(+), 10 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index bd7f0f3..5d6a557 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -423,6 +423,10 @@ architecture behaviour of decode1 is
         2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
         2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
         2#11010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- frsqrtes
+        2#11100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmsubs
+        2#11101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmadds
+        2#11110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmsubs
+        2#11111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmadds
         others => illegal_inst
         );
 
@@ -485,6 +489,10 @@ architecture behaviour of decode1 is
         2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
         2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
         2#1010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- frsqrte
+        2#1100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmsub
+        2#1101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmadd
+        2#1110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmsub
+        2#1111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmadd
         others => illegal_inst
         );
 
diff --git a/fpu.vhdl b/fpu.vhdl
index 90670e9..5e30386 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -40,13 +40,15 @@ architecture behaviour of fpu is
                      DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
-                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT,
+                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD,
                      DO_FRE, DO_FRSQRTE,
                      DO_FSEL,
                      FRI_1,
                      ADD_SHIFT, ADD_2, ADD_3,
                      CMP_1, CMP_2,
                      MULT_1,
+                     FMADD_1, FMADD_2, FMADD_3,
+                     FMADD_4, FMADD_5, FMADD_6,
                      LOOKUP,
                      DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                      FRE_1,
@@ -82,6 +84,7 @@ architecture behaviour of fpu is
         b            : fpu_reg_type;
         c            : fpu_reg_type;
         r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
+        s            : std_ulogic_vector(55 downto 0);  -- extended fraction
         x            : std_ulogic;
         p            : std_ulogic_vector(63 downto 0);  -- 8.56 format
         y            : std_ulogic_vector(63 downto 0);  -- 8.56 format
@@ -101,6 +104,7 @@ architecture behaviour of fpu is
         round_mode   : std_ulogic_vector(2 downto 0);
         is_subtract  : std_ulogic;
         exp_cmp      : std_ulogic;
+        madd_cmp     : std_ulogic;
         add_bsmall   : std_ulogic;
         is_multiply  : std_ulogic;
         is_sqrt      : std_ulogic;
@@ -117,6 +121,7 @@ architecture behaviour of fpu is
     signal opsel_a       : std_ulogic_vector(1 downto 0);
     signal opsel_b       : std_ulogic_vector(1 downto 0);
     signal opsel_r       : std_ulogic_vector(1 downto 0);
+    signal opsel_s       : std_ulogic_vector(1 downto 0);
     signal opsel_ainv    : std_ulogic;
     signal opsel_amask   : std_ulogic;
     signal opsel_binv    : std_ulogic;
@@ -127,6 +132,7 @@ architecture behaviour of fpu is
     signal lost_bits     : std_ulogic;
     signal r_hi_nz       : std_ulogic;
     signal r_lo_nz       : std_ulogic;
+    signal s_nz          : std_ulogic;
     signal misc_sel      : std_ulogic_vector(3 downto 0);
     signal f_to_multiply : MultiplyInputType;
     signal multiply_to_f : MultiplyOutputType;
@@ -152,6 +158,11 @@ architecture behaviour of fpu is
     constant RES_MULT  : std_ulogic_vector(1 downto 0) := "10";
     constant RES_MISC  : std_ulogic_vector(1 downto 0) := "11";
 
+    constant S_ZERO  : std_ulogic_vector(1 downto 0) := "00";
+    constant S_NEG   : std_ulogic_vector(1 downto 0) := "01";
+    constant S_SHIFT : std_ulogic_vector(1 downto 0) := "10";
+    constant S_MULT  : std_ulogic_vector(1 downto 0) := "11";
+
     -- msel values
     constant MUL1_A : std_ulogic_vector(1 downto 0) := "00";
     constant MUL1_B : std_ulogic_vector(1 downto 0) := "01";
@@ -163,9 +174,10 @@ architecture behaviour of fpu is
     constant MUL2_P   : std_ulogic_vector(1 downto 0) := "10";
     constant MUL2_R   : std_ulogic_vector(1 downto 0) := "11";
 
-    constant MULADD_ZERO : std_ulogic_vector(1 downto 0) := "00";
+    constant MULADD_ZERO  : std_ulogic_vector(1 downto 0) := "00";
     constant MULADD_CONST : std_ulogic_vector(1 downto 0) := "01";
     constant MULADD_A     : std_ulogic_vector(1 downto 0) := "10";
+    constant MULADD_RS    : std_ulogic_vector(1 downto 0) := "11";
 
     -- Inverse lookup table, indexed by the top 8 fraction bits
     -- The first 256 entries are the reciprocal (1/x) lookup table,
@@ -597,20 +609,22 @@ begin
         variable need_check  : std_ulogic;
         variable msb         : std_ulogic;
         variable is_add      : std_ulogic;
-        variable qnan_result : std_ulogic;
         variable longmask    : std_ulogic;
         variable set_a       : std_ulogic;
         variable set_b       : std_ulogic;
         variable set_c       : std_ulogic;
-        variable px_nz       : std_ulogic;
-        variable maddend     : std_ulogic_vector(127 downto 0);
         variable set_y       : std_ulogic;
+        variable set_s       : std_ulogic;
+        variable qnan_result : std_ulogic;
+        variable px_nz       : std_ulogic;
         variable pcmpb_eq    : std_ulogic;
         variable pcmpb_lt    : std_ulogic;
         variable pshift      : std_ulogic;
         variable renorm_sqrt : std_ulogic;
         variable sqrt_exp    : signed(EXP_BITS-1 downto 0);
         variable shiftin     : std_ulogic;
+        variable mulexp      : signed(EXP_BITS-1 downto 0);
+        variable maddend     : std_ulogic_vector(127 downto 0);
     begin
         v := r;
         illegal := '0';
@@ -657,10 +671,15 @@ begin
             if adec.exponent > bdec.exponent then
                 v.exp_cmp := '1';
             end if;
+            v.madd_cmp := '0';
+            if (adec.exponent + cdec.exponent + 1) >= bdec.exponent then
+                v.madd_cmp := '1';
+            end if;
         end if;
 
         r_hi_nz <= or (r.r(55 downto 31));
         r_lo_nz <= or (r.r(30 downto 2));
+        s_nz <= or (r.s);
 
         if r.single_prec = '0' then
             if r.doing_ftdiv(1) = '0' then
@@ -711,6 +730,7 @@ begin
         opsel_b <= BIN_ZERO;
         opsel_binv <= '0';
         opsel_r <= RES_SUM;
+        opsel_s <= S_ZERO;
         carry_in <= '0';
         misc_sel <= "0000";
         fpscr_mask := (others => '1');
@@ -725,6 +745,7 @@ begin
         set_a := '0';
         set_b := '0';
         set_c := '0';
+        set_s := '0';
         f_to_multiply.is_32bit <= '0';
         f_to_multiply.valid <= '0';
         msel_1 <= MUL1_A;
@@ -802,12 +823,15 @@ begin
                         when "11010" =>
                             v.is_sqrt := '1';
                             v.state := DO_FRSQRTE;
+                        when "11100" | "11101" | "11110" | "11111" =>
+                            v.state := DO_FMADD;
                         when others =>
                             illegal := '1';
                     end case;
                 end if;
                 v.x := '0';
                 v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
+                set_s := '1';
 
             when DO_MCRFS =>
                 j := to_integer(unsigned(insn_bfa(r.insn)));
@@ -1416,6 +1440,99 @@ begin
                         arith_done := '1';
                 end case;
 
+            when DO_FMADD =>
+                -- fmadd, fmsub, fnmadd, fnmsub
+                opsel_a <= AIN_A;
+                v.result_sign := r.a.negative;
+                v.result_class := r.a.class;
+                v.result_exp := r.a.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                is_add := r.a.negative xor r.c.negative xor r.b.negative xor r.insn(1);
+                if r.a.class = FINITE and r.c.class = FINITE and
+                    (r.b.class = FINITE or r.b.class = ZERO) then
+                    v.is_subtract := not is_add;
+                    mulexp := r.a.exponent + r.c.exponent;
+                    v.result_exp := mulexp;
+                    opsel_a <= AIN_B;
+                    -- Make sure A and C are normalized
+                    if r.a.mantissa(54) = '0' then
+                        opsel_a <= AIN_A;
+                        v.state := RENORM_A;
+                    elsif r.c.mantissa(54) = '0' then
+                        opsel_a <= AIN_C;
+                        v.state := RENORM_C;
+                    elsif r.b.class = ZERO then
+                        -- no addend, degenerates to multiply
+                        v.result_sign := r.a.negative xor r.c.negative xor r.insn(2);
+                        f_to_multiply.valid <= '1';
+                        v.is_multiply := '1';
+                        v.state := MULT_1;
+                    elsif r.madd_cmp = '0' then
+                        -- addend is bigger, do multiply first
+                        v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
+                        f_to_multiply.valid <= '1';
+                        v.state := FMADD_1;
+                    else
+                        -- product is bigger, shift B right and use it as the
+                        -- addend to the multiplier
+                        v.shift := r.b.exponent - mulexp + to_signed(64, EXP_BITS);
+                        -- for subtract, multiplier does B - A * C
+                        v.result_sign := not (r.a.negative xor r.c.negative xor r.insn(2) xor is_add);
+                        v.result_exp := r.b.exponent;
+                        v.state := FMADD_2;
+                    end if;
+                else
+                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
+                        (r.b.class = NAN and r.b.mantissa(53) = '0') or
+                        (r.c.class = NAN and r.c.mantissa(53) = '0') then
+                        -- Signalling NAN
+                        v.fpscr(FPSCR_VXSNAN) := '1';
+                        invalid := '1';
+                    end if;
+                    if r.a.class = NAN then
+                        -- nothing to do, result is A
+                    elsif r.b.class = NAN then
+                        -- result is B
+                        v.result_class := NAN;
+                        v.result_sign := r.b.negative;
+                        opsel_a <= AIN_B;
+                    elsif r.c.class = NAN then
+                        -- result is C
+                        v.result_class := NAN;
+                        v.result_sign := r.c.negative;
+                        opsel_a <= AIN_C;
+                    elsif (r.a.class = ZERO and r.c.class = INFINITY) or
+                        (r.a.class = INFINITY and r.c.class = ZERO) then
+                        -- invalid operation, construct QNaN
+                        v.fpscr(FPSCR_VXIMZ) := '1';
+                        qnan_result := '1';
+                    elsif r.a.class = INFINITY or r.c.class = INFINITY then
+                        if r.b.class = INFINITY and is_add = '0' then
+                            -- invalid operation, construct QNaN
+                            v.fpscr(FPSCR_VXISI) := '1';
+                            qnan_result := '1';
+                        else
+                            -- result is infinity
+                            v.result_class := INFINITY;
+                            v.result_sign := r.a.negative xor r.c.negative xor r.insn(2);
+                        end if;
+                    else
+                        -- Here A is zero, C is zero, or B is infinity
+                        -- Result is +/-B in all of those cases
+                        v.result_class := r.b.class;
+                        v.result_exp := r.b.exponent;
+                        if v.result_class /= ZERO or is_add = '1' then
+                            v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
+                        else
+                            -- have to be careful about rule for 0 - 0 result sign
+                            v.result_sign := (r.round_mode(1) and r.round_mode(0)) xor r.insn(2);
+                        end if;
+                        opsel_a <= AIN_B;
+                    end if;
+                    arith_done := '1';
+                end if;
+
             when RENORM_A =>
                 renormalize := '1';
                 v.state := RENORM_A2;
@@ -1426,8 +1543,16 @@ begin
                 if r.insn(4) = '1' then
                     opsel_a <= AIN_C;
                     if r.c.mantissa(54) = '1' then
-                        v.first := '1';
-                        v.state := MULT_1;
+                        if r.insn(3) = '0' or r.b.class = ZERO then
+                            v.first := '1';
+                            v.state := MULT_1;
+                        else
+                            v.madd_cmp := '0';
+                            if new_exp + 1 >= r.b.exponent then
+                                v.madd_cmp := '1';
+                            end if;
+                            v.state := DO_FMADD;
+                        end if;
                     else
                         v.state := RENORM_C;
                     end if;
@@ -1462,11 +1587,20 @@ begin
             when RENORM_C2 =>
                 set_c := '1';
                 v.result_exp := new_exp;
-                v.first := '1';
-                v.state := MULT_1;
+                if r.insn(3) = '0' or r.b.class = ZERO then
+                    v.first := '1';
+                    v.state := MULT_1;
+                else
+                    v.madd_cmp := '0';
+                    if new_exp + 1 >= r.b.exponent then
+                        v.madd_cmp := '1';
+                    end if;
+                    v.state := DO_FMADD;
+                end if;
 
             when ADD_SHIFT =>
                 opsel_r <= RES_SHIFT;
+                v.x := s_nz;
                 set_x := '1';
                 longmask := '0';
                 v.state := ADD_2;
@@ -1545,6 +1679,78 @@ begin
                     v.state := FINISH;
                 end if;
 
+            when FMADD_1 =>
+                -- Addend is bigger here
+                v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
+                -- note v.shift is at most -2 here
+                v.shift := r.result_exp - r.b.exponent;
+                opsel_r <= RES_MULT;
+                opsel_s <= S_MULT;
+                set_s := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.state := ADD_SHIFT;
+                end if;
+
+            when FMADD_2 =>
+                -- Product is potentially bigger here
+                set_s := '1';
+                opsel_s <= S_SHIFT;
+                v.shift := r.shift - to_signed(64, EXP_BITS);
+                v.state := FMADD_3;
+
+            when FMADD_3 =>
+                opsel_r <= RES_SHIFT;
+                v.first := '1';
+                v.state := FMADD_4;
+
+            when FMADD_4 =>
+                msel_add <= MULADD_RS;
+                f_to_multiply.valid <= r.first;
+                msel_inv <= r.is_subtract;
+                opsel_r <= RES_MULT;
+                opsel_s <= S_MULT;
+                set_s := '1';
+                v.shift := to_signed(56, EXP_BITS);
+                if multiply_to_f.valid = '1' then
+                    if multiply_to_f.result(121) = '1' then
+                        v.state := FMADD_5;
+                    else
+                        v.state := FMADD_6;
+                    end if;
+                end if;
+
+            when FMADD_5 =>
+                -- negate R:S:X
+                v.result_sign := not r.result_sign;
+                opsel_ainv <= '1';
+                carry_in <= not (s_nz or r.x);
+                opsel_s <= S_NEG;
+                set_s := '1';
+                v.shift := to_signed(56, EXP_BITS);
+                v.state := FMADD_6;
+
+            when FMADD_6 =>
+                if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then
+                    if s_nz = '0' then
+                        -- must be a subtraction, and r.x must be zero
+                        v.result_class := ZERO;
+                        v.result_sign := r.round_mode(1) and r.round_mode(0);
+                        arith_done := '1';
+                    else
+                        -- R is all zeroes but there are non-zero bits in S
+                        -- so shift them into R and set S to 0
+                        opsel_r <= RES_SHIFT;
+                        set_s := '1';
+                        -- stay in state FMADD_6
+                    end if;
+                elsif r.r(56 downto 54) = "001" then
+                    v.state := FINISH;
+                else
+                    renormalize := '1';
+                    v.state := NORMALIZE;
+                end if;
+
             when LOOKUP =>
                 opsel_a <= AIN_B;
                 -- wait one cycle for inverse_table[B] lookup
@@ -2097,6 +2303,9 @@ begin
             when MULADD_A =>
                 -- addend is A in 16.112 format
                 maddend(121 downto 58) := r.a.mantissa;
+            when MULADD_RS =>
+                -- addend is concatenation of R and S in 16.112 format
+                maddend := "000000" & r.r & r.s & "00";
             when others =>
         end case;
         if msel_inv = '1' then
@@ -2167,7 +2376,7 @@ begin
         end if;
         in_b <= in_b0;
         if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
-            shift_res := shifter_64(r.r & shiftin & 55x"00000000000000",
+            shift_res := shifter_64(r.r & (shiftin or r.s(55)) & r.s(54 downto 0),
                                     std_ulogic_vector(r.shift(6 downto 0)));
         else
             shift_res := (others => '0');
@@ -2230,6 +2439,21 @@ begin
                 result <= misc;
         end case;
         v.r := result;
+        if set_s = '1' then
+            case opsel_s is
+                when S_NEG =>
+                    v.s := std_ulogic_vector(unsigned(not r.s) + (not r.x));
+                when S_MULT =>
+                    v.s := multiply_to_f.result(57 downto 2);
+                when S_SHIFT =>
+                    v.s := shift_res(63 downto 8);
+                    if shift_res(7 downto 0) /= x"00" then
+                        v.x := '1';
+                    end if;
+                when others =>
+                    v.s := (others => '0');
+            end case;
+        end if;
 
         if set_a = '1' then
             v.a.exponent := new_exp;
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index b72b01e..52f21d0 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1338,6 +1338,76 @@ int fpu_test_22(void)
 	return trapit(0, test22);
 }
 
+struct fmavals {	/* one multiply-add test vector; every field is the raw 64-bit image of a double */
+	unsigned long ra;	/* loaded into f6 by test23: FRA, first multiplicand */
+	unsigned long rc;	/* loaded into f7 by test23: FRC, second multiplicand */
+	unsigned long rb;	/* loaded into f8 by test23: FRB, the addend */
+	unsigned long fma;	/* expected fmadd result */
+	unsigned long fms;	/* expected fmsub result */
+	unsigned long nfma;	/* expected fnmadd result */
+	unsigned long nfms;	/* expected fnmsub result */
+} fmavals[] = {	/* each row: ra, rc, rb on the first line; fma, fms, nfma, nfms on the second */
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,	/* all operands +0 */
+	  0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 },
+	{ 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000,	/* NaN in rc only: every result is rc */
+	  0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000 },
+	{ 0x0000000000000000, 0x7ffc000000000000, 0x7ffb000000000000,	/* NaNs in rc and rb: every result is rb */
+	  0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000 },
+	{ 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000,	/* NaNs in all three: every result is ra */
+	  0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000 },
+	{ 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, /* 1.0 * -0.0: results are +/-rb */
+	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
+	{ 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, /* 1.0 * -1.0 with a far larger rb: results round to +/-rb */
+	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
+	{ 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, /* +infinity * -1.0: results are infinities */
+	  0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, /* +infinity * +0: every result is the NaN 0x7ff8000000000000 */
+	  0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, /* 1.0 * 1.0 with rb slightly above 1.0 */
+	  0x4000000010000000, 0xbe80000000000000, 0xc000000010000000, 0x3e80000000000000 },
+	{ 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000,	/* (1.0 + 1 ulp) squared with rb = 1.0 */
+	  0x4000000000000001, 0x3cc0000000000000, 0xc000000000000001, 0xbcc0000000000000 },
+	{ 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000,	/* (1.0 + 3 ulp) * (1.0 + 2 ulp) with rb = 1.0 */
+	  0x4000000000000002, 0x3cd4000000000002, 0xc000000000000002, 0xbcd4000000000002 },
+	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000,	/* ra * rc almost cancels rb in fmadd */
+	  0xaca765753908cd20, 0x3030000000000000, 0x2ca765753908cd20, 0xb030000000000000 },
+	{ 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000,	/* as the previous row with ra one ulp larger; fmadd sign flips */
+	  0x2cd3b3efbf5e2229, 0x3030000000000000, 0xacd3b3efbf5e2229, 0xb030000000000000 },
+	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000,	/* rb larger in magnitude than ra * rc */
+	  0xb05e0068a0000000, 0x3061003450000000, 0x305e0068a0000000, 0xb061003450000000 },
+};
+
+int test23(long arg)	/* checks fmadd/fmsub/fnmadd/fnmsub on every fmavals[] row; returns 0, or the 1-based index of the first mismatching row; arg is unused */
+{
+	long i;
+	unsigned long results[4];	/* raw result bits, in order: fmadd, fmsub, fnmadd, fnmsub */
+	struct fmavals *vp = fmavals;
+
+	set_fpscr(FPS_RN_NEAR);	/* presumably selects round-to-nearest; confirm against the definition of FPS_RN_NEAR */
+	for (i = 0; i < sizeof(fmavals) / sizeof(fmavals[0]); ++i, ++vp) {
+		asm("lfd 6,0(%0); lfd 7,8(%0); lfd 8,16(%0); fmadd 0,6,7,8; stfd 0,0(%1)"	/* f6,f7,f8 = ra,rc,rb; results[0] = fmadd */
+		    : : "b" (&vp->ra), "b" (results) : "memory");
+		asm("fmsub 1,6,7,8; fnmadd 2,6,7,8; fnmsub 3,6,7,8; stfd 1,8(%0); stfd 2,16(%0); stfd 3,24(%0)"	/* reuses f6-f8 left by the asm above; results[1..3] = fmsub, fnmadd, fnmsub */
+		    : : "b" (results) : "memory");	/* NOTE(review): neither asm lists the FP registers it writes as clobbers; confirm this matches the convention used elsewhere in this file */
+		if (results[0] != vp->fma || results[1] != vp->fms ||
+		    results[2] != vp->nfma || results[3] != vp->nfms) {
+			print_hex(i, 2, " ");	/* on mismatch, print the row index followed by the four observed results */
+			print_hex(results[0], 16, " ");
+			print_hex(results[1], 16, " ");
+			print_hex(results[2], 16, " ");
+			print_hex(results[3], 16, "\r\n");
+			return i + 1;	/* non-zero failure code: failing row index + 1 */
+		}
+	}
+	return 0;
+}
+
+int fpu_test_23(void)	/* entry point registered with do_test(23, ...): calls enable_fp(), then runs test23 through trapit() */
+{
+	enable_fp();
+	return trapit(0, test23);	/* the 0 is presumably passed on as test23's unused arg; confirm against trapit() */
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1385,6 +1455,7 @@ int main(void)
 	do_test(20, fpu_test_20);
 	do_test(21, fpu_test_21);
 	do_test(22, fpu_test_22);
+	do_test(23, fpu_test_23);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index e3783415a22e72bae0935c2959ab8c4a5bf5a316..50831cb20db40951fb7b0e508677cfddb62e46c6 100755
GIT binary patch
delta 3557
zcmai04Ny~87QXM2#HcY13gIut2Lw!_vZ;SOOn`(cA|N6~mvvhjbgZadhdQ=J7fII1
zcC@1uIo&RGsUlMiW37XYGm9Oy(b+oaYM0%rMFGXuuHY)AKWkB`Z_j;6)UrC>nalT`
z?|%2*^X|Ft5&w0r{T7!@h&`Q<+_IA+a@Qj+CvW7I5#mH`HEOF-TfN-<#`ffC++VjZ
znM$`#KUnk0Gp@qAgfv3>{*F$8C+-4*MB|mT&YPl`$KrsNl@{=xLhtF$_F-csxXyus
zMi(LbIuAW&tiQmGV{diV2gd??kT7So7fx%;bnpYXttmqA<%Snf+L{bI#!VdQ&BEk}
zv#<_Mk4w@1oJ4XCit&S@=(qOW59Frtf;wWRdH{zA$?OZdiz2c{mDDv^W%yJ}>R@NY
zGR_M$a%$vJ1IalqEzy)IikmVC@mm{UUZj=oH^7^bs}#iP5k9&B`=V07ppE8AAYHqF
z%ZF|F_ixaxT|f<qkP`K}+`hmg+=1_+QlTeG#W~?t)Li+%Z#}|FNvnuf(Y6<%Iyzh4
zT;vfhLU>GST}-iDS?m$!u>fjgRNPH?KW47HW06NV49~`<0w1g5YGF$3TzSr~J%UNn
zT4Gg{l)$mrY`JcsM|cm8qtPCRVi6R?Wy=qhc!ZIX*dC|i%Aq?BGm4ajpT8~U;RD)O
zU3E)5!g{zGm&$E}?Ybn$&=ql!$R=^`!%>}2R<9#D;56!>cD!!P3mBY_W~WJ<*Wh4c
z9K9F^7si*;h0!omZyq~8j^ymd6O_i2t1Gc^YxOC#awRnBv#Bf&Zs^Ul;$6^ADAKxD
zE)&LMp?Ff(D~e4v4RH%-te%ifXUD?H32SI;1I*-e$1<b*XO?l3O%^cK@n)JI1IPFc
zv~4HMikG%n6>p~NqT${64Q#`)31-?B1sMq$)L94B2{qKc64Z%WHkO>2O&heZCh-J|
zrW(wc%pT#@J7OM3T*9Fd%c=@VyigwrrAe`{)sRmscfln?7BlOU%#qCe$H8Bk_rt4(
zIH*j@rzL-ZlSxyV_imDzUK@ugOn_}ksgyK8b+R<@ak80C9tW3`-(t2^6EW|xlI>{X
zB<Pu_q9qN`KT&!pz0pkl;c(x$n3n8-CZi<&3q@5xY~2HkO_KPg$xKT$=$z$ZCE!W?
z9iCimt4SyK)x<8nDtp7`ZM475-rpW8PdLO=ZA#MDrXeFw-E}C22vK$nI+BO(JF<s{
z94`+kiLpaNayp_(IrhF@o;do5V6Lhln;fhfRI0`vq4M6>E<4#h_`4s(JVX~7%X(Qo
zUknJn#$Y@@7+ZpII>y2y^=Bt*LM>DVCzhe`8Ir@}=EmUWo5f9bBgU3sT#2y-V<8xq
zi=#K$IU-sGVoKwI;}fCB6fu7@-Xaa*2LFj?v+#Q{@4~(NE(=zB6cHz|il>bZT<gLC
zSCxe<lWsv$P@9GCNlcJiFr}^FFk88cu+i*u)ndvjvdPZQqaHbX>_y+P?~ipd)7Yaw
z&hspKGH{F#o8%krQQ|sdq;=AzaXcpXV?1Mpz5#Y#)l$NZnA|xyJ_#tNg2j_Rer$Fd
zzkPakzd_H_vYR2T^OWqu(e!9`L-5{|pSZ)YJ!KkxGe1k2MqBs#?xwsR=4uQSsCQ7j
z`k%dF%BuE0Hj|ewt|%_GHRCw(CTu^9Iy+c79#9J&IJ<neP}@s4R56_upcVCtlJ@%t
znjf{#Q2WzmG4HjX*hP&On4jG_JnV3GJ05J_fJU%H@m!iOe%cC7$G(<Pcnf6t9$`0r
zi}39d*hh|fm-g0ac{SrY=|3ZJD!p)Gx@+Wv$d^0gd2WpND}nI2Wjx7UkA{6St#iBy
zefwpkYTp8~QHh+MP_CEZm3>qT!xvZeK(k9e%`ajwr!I(;wlYj6#I%KwEx%&PD80U#
z)KbpnIWLOWP+(1i9ZjV8f+!X;0UOio*lST6KxB<+#n@l@QWTq+fQ@P8jC6=1&k!5a
zs<5k2GcW-g!&>Y*)Vc_P)oJmxy#s2}QfSvjIEGMs3HsCYwCWP5XX&ZC6H*YGJD~uf
zy%W|T40b{d0^bD(5OTWUbA*a6=trpU0<~37{auiP(AxzC2%2tKgCKN64T7^94j^pq
zhR+dNx}hJTqZ`yVJtdbR1;Kb3_luK<5mJcE7OQAQ+I$(-*kUs}kdiB+s6$|l$9@wv
zC2AE+m_5=#c5%<C?ct8<LN&>{qHwUs*2wVKuE0rKf<i#L?h5qV%!+cP?N>mbo~(EW
zX-*I1rY9(lAhlnGwMefaJ$Ds$AXUln@V<f*=oOHvzJ`8Ook(Y218s(0;Xx|<24*2`
zMtbHOC`Eb(sq^0w9YFeHfJP_?*?b-P(I6m=?v>~ZNSh^Uaw8S`L{Z={`MGK*KFuy}
z?I<;I-~){gl=1`g<im{2#Mm>)8gD!-{%oAogWCQZ@JVI{v}DTU#u4~2gTtB6O{qY3
z=GMa_WH-fw+BwwNMAq`L*P|u_lRZz~i_8h@?GtqoN<t#O7sVEq)*LmDY0;O|WVXVA
z*Qs65nQDI!#m#I5Y;0k6rDZ>gqK66CjM(ki*8%64G^H6wn8F%!yRjcZZ8BRP8`Ivw
ze(t}bSm`Ts+*QHbub6y=g}2r4C1-+vtPBT#vBEWkzfd1-q4-G{@`lFm1?N8v?*@It
z=DA2{;7K@uzgMN^KKtR86>}2*uwocq55^%}5wM5m4SYJxADS=u4Y=U&iNQ=CPK1G_
zQSj&Iqg^DlRA7t2pz}%iAO-daJRwrd3S-BP&3z32u-K0sn>!;=yn)OR2J}#z7@Fv}
z1*_~282m@ryUjMBaBz{<Wv5bj4u8o=%^!@9oUyuI-r1<IoDHBoFwD2`XO(HME%)vZ
z#1>01W>*lKr6uV9zj>yF(oetjeG&3m0&&(Cp@mu=vti+M;%vcH8_P*4LyXHT!I&Y7
LTYUDH+!6l+$lio8

delta 2528
zcmah}4Qx}_6~6DrHgW5@4vAyOG0wA-II&666e@U$^J6;@j6;K$W)!h5a-fBDP{t<4
zLYCPU@+PPRNqQ;`wk!?IR2gCu#caw@hms0KEt{%Lq8%l~2_;>VKr7RBG=!47bDtUg
z84_2z@B8j|zIWcae{b|-Vdx{lPKdjj5P$2)BL7~*=gDq=D<MJDwxhNkwe8PGcK6tq
z3uk&ZEu)&{uOB-5SlHcVs~}|N@93075@{qzbTL^QyTMokAEz{LMaug&dM}KPek<m~
zJb!BJ4-;}??92mVpEO1=_R`qld$H6vNLtVsg9|z*ojwY;bWI3xe?}vv#f9*j%p!Hn
zi^Y+ky9+L4mSw+KKzy$=>l|Z4u9<~YZMqm}vsP$>7$T&0=AOGL%lnA0>zGS{v+{RA
zU)B~O22{UHUz$&R!*YsawT$hnC1l9e3~Ti+YR-pW>3^t-v_z#p&cX>p85Cq21szmp
zuNM?>5TBhep1q#FVudop9_3tPRJsi}4J%>Npb?Dlv0;sJ*QTiSf~@T_YN+p72pQ{@
z!KSG6K4j#qd^x9CIrMZ?TFVE}ouk2$ewVXG+5Nq!^jCPov=T&<M({zYX^oOR6O|mY
zcFLrogU`S@Q@!$HASxY!VKlmPQT#46=GH5BY>Y~3SscyP2qKK<u2lXLC-^S3ntvwz
z0H1}hU{(nqf^tb*5y&Mzm@F{Er6p!n77h*?V19{>_L#t8X`yNZge=a*N)z!N#T<Mw
zr|hA2+?%tOGCI8tW-axKH*zpXx5WtS@^mx^(301bo!quXT7o+wlGbU)j#cSML_+&W
zUOoNN2wHI`9XJ9Z(Z84*m3LVKHz`u4%c4`QFcP0Ym@5i`B^QdV8|mP0VV_n0=4q>w
zmKY#m?cfz`vN>ra8+O|B;J7W12D)IxcF2_6c3f(|b1nS0`Twh7XMR0x*27=&-{h}%
z6gaWi<I<0Bvj%~Lr8A4RbOwl33S`0Af~~Zz7i@)IZroa!GX8Szo5trd(P)C9!gbVl
z7_|0f-0ZbGY42hzY6DFkhLMywkK$AY*o$7`meWNziwsB<ZJ@p*u%cL=+SX#H&<?j9
zE{HhH)c0$cP3b0wlb(MJ<{g!CVBi?^Ipi1K!GMkj${@Hj>!~?hO&t;D_=X?&j5WM>
zXCYjUvwDQEJU%nWqx@EoU?3UpEGJv!+3N3Um3TGO_joz3Kqm|@bqR07-KFs`X*(`y
z$UZmUrF7(cI}d)!FPy7{YsI+D5AwW#o*Et_RkH8fo^;H!TFz4~$0gDld4OlJWJQxe
zLUO_VSloH=lmfl^D7;sC`hnH`Ir;GFdeHN*>Ry!dJfym{X!?3}Hz4QnFN8lpMmhdl
zVRiX(I`I2=NBN$#u&zzspy@ew_4;dR>7D7<`J1BrQ)6s%l@a44X6v~*o4OK8Tp$q_
zoS)@F^u9(rI{$Z{ic!yywV&VD%2B(E@9HO6!%x3p4Q^a3?&n7w7dwNWj{6_Kh(;(k
zh=L=2^vP|4Mnp3|9W=Ii`sHj<%eh(p^^;(F44PJi)%Qp7%PXE1z7zXEBBH-lB>uf<
zsFYl*W44)-3etIEJ^4jCa{6ZaUIlSfd|gW;?h4I)%`N+M-x$MLX}w<lR6z+Ljst`o
zc$^oL9*bKlT7+=czZjcBfj1p?G?6LP0$jkxH8*zg5@Sh3-niC`{Y%saxPXmoZP>ft
zXG}!ojcc9Qdr>Rk0yd7_*iWK%g%G$?X{Eu-U~!ev9b?dlFf|4tmxXGsKp%o`9L^y~
z<B&iIj>7`N{&BEWS?H;8s6n_i4lM{I0U?Cq1oR;U5^xToJpl=X{sb%_yp;e;wS^`V
zP=jEcfEEPz1cVUUCZG?YdjifO3{Jp8wMlh24QDY4mKsydAkwx;#>{*c*hK9AKy4Rl
zyST7Q?IEuUby|0ZClt^U?}W<3C!0ji8<Vi7#-=hV2)Qx|eKk&%52<t&-l?&xb|XD?
z73R^<i*(=vu+`dB=aE)j13%ImNawD>PipNdqmq!_Q_zR18|ldpVWifgdJgH1f5R-&
z{Yanu2()gC>Lk*)QgjsQv(sEvevUK=!|ptlQAJ2(2Kvxig;Y0-)TA!Jl7l$x2el{P
zv>(~$vtRA#Dvb7`My|slPmP)nr*PN}2R(JAC5v!XKKW{e`1ZR|D@Khwd0U6Q8MTdY
z*VCYkAv*(4d5g@W$l5+-tY1LGYORQCNWZTot5hC5Bf(0u{^Dnh?dKV=@x=U``e4|b
zDGb89-lf6~5b82>$bT3s;VH0j*DKhKw-{@WKT&r}6L;5L65!PzIpTr9Ep2?m=8mlY
E0#G$xNB{r;

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 9b97cb5..ed759a5 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -20,3 +20,4 @@ test 19:PASS
 test 20:PASS
 test 21:PASS
 test 22:PASS
+test 23:PASS