FPU: Implement floating multiply-add instructions

This implements fmadd, fmsub, fnmadd, fnmsub and their single-precision counterparts. The single-precision versions operate the same as the double-precision versions until the final rounding and overflow/underflow steps. This adds an S register to store the low bits of the product. S shifts into R on left shifts, and can be negated, but doesn't do any other arithmetic. This adds a test for the double-precision versions of these instructions. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
6 years ago · dc1544db69
parent c083b9507d
commit dc1544db69
5 changed files with 314 additions and 10 deletions
--- a/decode1.vhdl
+++ b/decode1.vhdl
@ -423,6 +423,10 @@ architecture behaviour of decode1 is
        2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
        2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
        2#11010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- frsqrtes
        2#11100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmsubs
        2#11101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmadds
        2#11110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmsubs
        2#11111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmadds
        others => illegal_inst
        );
@ -485,6 +489,10 @@ architecture behaviour of decode1 is
        2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
        2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
        2#1010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- frsqrte
        2#1100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmsub
        2#1101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmadd
        2#1110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmsub
        2#1111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmadd
        others => illegal_inst
        );
--- a/fpu.vhdl
+++ b/fpu.vhdl
@ -40,13 +40,15 @@ architecture behaviour of fpu is
                     DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
                     DO_FCFID, DO_FCTI,
                     DO_FRSP, DO_FRI,
-                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT,
+                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD,
                     DO_FRE, DO_FRSQRTE,
                     DO_FSEL,
                     FRI_1,
                     ADD_SHIFT, ADD_2, ADD_3,
                     CMP_1, CMP_2,
                     MULT_1,
                     FMADD_1, FMADD_2, FMADD_3,
                     FMADD_4, FMADD_5, FMADD_6,
                     LOOKUP,
                     DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                     FRE_1,
@ -82,6 +84,7 @@ architecture behaviour of fpu is
        b            : fpu_reg_type;
        c            : fpu_reg_type;
        r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
        s            : std_ulogic_vector(55 downto 0);  -- extended fraction
        x            : std_ulogic;
        p            : std_ulogic_vector(63 downto 0);  -- 8.56 format
        y            : std_ulogic_vector(63 downto 0);  -- 8.56 format
@ -101,6 +104,7 @@ architecture behaviour of fpu is
        round_mode   : std_ulogic_vector(2 downto 0);
        is_subtract  : std_ulogic;
        exp_cmp      : std_ulogic;
        madd_cmp     : std_ulogic;
        add_bsmall   : std_ulogic;
        is_multiply  : std_ulogic;
        is_sqrt      : std_ulogic;
@ -117,6 +121,7 @@ architecture behaviour of fpu is
    signal opsel_a       : std_ulogic_vector(1 downto 0);
    signal opsel_b       : std_ulogic_vector(1 downto 0);
    signal opsel_r       : std_ulogic_vector(1 downto 0);
    signal opsel_s       : std_ulogic_vector(1 downto 0);
    signal opsel_ainv    : std_ulogic;
    signal opsel_amask   : std_ulogic;
    signal opsel_binv    : std_ulogic;
@ -127,6 +132,7 @@ architecture behaviour of fpu is
    signal lost_bits     : std_ulogic;
    signal r_hi_nz       : std_ulogic;
    signal r_lo_nz       : std_ulogic;
    signal s_nz          : std_ulogic;
    signal misc_sel      : std_ulogic_vector(3 downto 0);
    signal f_to_multiply : MultiplyInputType;
    signal multiply_to_f : MultiplyOutputType;
@ -152,6 +158,11 @@ architecture behaviour of fpu is
    constant RES_MULT  : std_ulogic_vector(1 downto 0) := "10";
    constant RES_MISC  : std_ulogic_vector(1 downto 0) := "11";
    constant S_ZERO  : std_ulogic_vector(1 downto 0) := "00";
    constant S_NEG   : std_ulogic_vector(1 downto 0) := "01";
    constant S_SHIFT : std_ulogic_vector(1 downto 0) := "10";
    constant S_MULT  : std_ulogic_vector(1 downto 0) := "11";
    -- msel values
    constant MUL1_A : std_ulogic_vector(1 downto 0) := "00";
    constant MUL1_B : std_ulogic_vector(1 downto 0) := "01";
@ -163,9 +174,10 @@ architecture behaviour of fpu is
    constant MUL2_P   : std_ulogic_vector(1 downto 0) := "10";
    constant MUL2_R   : std_ulogic_vector(1 downto 0) := "11";
-    constant MULADD_ZERO : std_ulogic_vector(1 downto 0) := "00";
+    constant MULADD_ZERO  : std_ulogic_vector(1 downto 0) := "00";
    constant MULADD_CONST : std_ulogic_vector(1 downto 0) := "01";
    constant MULADD_A     : std_ulogic_vector(1 downto 0) := "10";
    constant MULADD_RS    : std_ulogic_vector(1 downto 0) := "11";
    -- Inverse lookup table, indexed by the top 8 fraction bits
    -- The first 256 entries are the reciprocal (1/x) lookup table,
@ -597,20 +609,22 @@ begin
        variable need_check  : std_ulogic;
        variable msb         : std_ulogic;
        variable is_add      : std_ulogic;
        variable qnan_result : std_ulogic;
        variable longmask    : std_ulogic;
        variable set_a       : std_ulogic;
        variable set_b       : std_ulogic;
        variable set_c       : std_ulogic;
        variable px_nz       : std_ulogic;
        variable maddend     : std_ulogic_vector(127 downto 0);
        variable set_y       : std_ulogic;
        variable set_s       : std_ulogic;
        variable qnan_result : std_ulogic;
        variable px_nz       : std_ulogic;
        variable pcmpb_eq    : std_ulogic;
        variable pcmpb_lt    : std_ulogic;
        variable pshift      : std_ulogic;
        variable renorm_sqrt : std_ulogic;
        variable sqrt_exp    : signed(EXP_BITS-1 downto 0);
        variable shiftin     : std_ulogic;
        variable mulexp      : signed(EXP_BITS-1 downto 0);
        variable maddend     : std_ulogic_vector(127 downto 0);
    begin
        v := r;
        illegal := '0';
@ -657,10 +671,15 @@ begin
            if adec.exponent > bdec.exponent then
                v.exp_cmp := '1';
            end if;
            v.madd_cmp := '0';
            if (adec.exponent + cdec.exponent + 1) >= bdec.exponent then
                v.madd_cmp := '1';
            end if;
        end if;
        r_hi_nz <= or (r.r(55 downto 31));
        r_lo_nz <= or (r.r(30 downto 2));
        s_nz <= or (r.s);
        if r.single_prec = '0' then
            if r.doing_ftdiv(1) = '0' then
@ -711,6 +730,7 @@ begin
        opsel_b <= BIN_ZERO;
        opsel_binv <= '0';
        opsel_r <= RES_SUM;
        opsel_s <= S_ZERO;
        carry_in <= '0';
        misc_sel <= "0000";
        fpscr_mask := (others => '1');
@ -725,6 +745,7 @@ begin
        set_a := '0';
        set_b := '0';
        set_c := '0';
        set_s := '0';
        f_to_multiply.is_32bit <= '0';
        f_to_multiply.valid <= '0';
        msel_1 <= MUL1_A;
@ -802,12 +823,15 @@ begin
                        when "11010" =>
                            v.is_sqrt := '1';
                            v.state := DO_FRSQRTE;
                        when "11100" | "11101" | "11110" | "11111" =>
                            v.state := DO_FMADD;
                        when others =>
                            illegal := '1';
                    end case;
                end if;
                v.x := '0';
                v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
                set_s := '1';
            when DO_MCRFS =>
                j := to_integer(unsigned(insn_bfa(r.insn)));
@ -1416,6 +1440,99 @@ begin
                        arith_done := '1';
                end case;
            when DO_FMADD =>
                -- fmadd, fmsub, fnmadd, fnmsub
                opsel_a <= AIN_A;
                v.result_sign := r.a.negative;
                v.result_class := r.a.class;
                v.result_exp := r.a.exponent;
                v.fpscr(FPSCR_FR) := '0';
                v.fpscr(FPSCR_FI) := '0';
                is_add := r.a.negative xor r.c.negative xor r.b.negative xor r.insn(1);
                if r.a.class = FINITE and r.c.class = FINITE and
                    (r.b.class = FINITE or r.b.class = ZERO) then
                    v.is_subtract := not is_add;
                    mulexp := r.a.exponent + r.c.exponent;
                    v.result_exp := mulexp;
                    opsel_a <= AIN_B;
                    -- Make sure A and C are normalized
                    if r.a.mantissa(54) = '0' then
                        opsel_a <= AIN_A;
                        v.state := RENORM_A;
                    elsif r.c.mantissa(54) = '0' then
                        opsel_a <= AIN_C;
                        v.state := RENORM_C;
                    elsif r.b.class = ZERO then
                        -- no addend, degenerates to multiply
                        v.result_sign := r.a.negative xor r.c.negative xor r.insn(2);
                        f_to_multiply.valid <= '1';
                        v.is_multiply := '1';
                        v.state := MULT_1;
                    elsif r.madd_cmp = '0' then
                        -- addend is bigger, do multiply first
                        v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
                        f_to_multiply.valid <= '1';
                        v.state := FMADD_1;
                    else
                        -- product is bigger, shift B right and use it as the
                        -- addend to the multiplier
                        v.shift := r.b.exponent - mulexp + to_signed(64, EXP_BITS);
                        -- for subtract, multiplier does B - A * C
                        v.result_sign := not (r.a.negative xor r.c.negative xor r.insn(2) xor is_add);
                        v.result_exp := r.b.exponent;
                        v.state := FMADD_2;
                    end if;
                else
                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
                        (r.b.class = NAN and r.b.mantissa(53) = '0') or
                        (r.c.class = NAN and r.c.mantissa(53) = '0') then
                        -- Signalling NAN
                        v.fpscr(FPSCR_VXSNAN) := '1';
                        invalid := '1';
                    end if;
                    if r.a.class = NAN then
                        -- nothing to do, result is A
                    elsif r.b.class = NAN then
                        -- result is B
                        v.result_class := NAN;
                        v.result_sign := r.b.negative;
                        opsel_a <= AIN_B;
                    elsif r.c.class = NAN then
                        -- result is C
                        v.result_class := NAN;
                        v.result_sign := r.c.negative;
                        opsel_a <= AIN_C;
                    elsif (r.a.class = ZERO and r.c.class = INFINITY) or
                        (r.a.class = INFINITY and r.c.class = ZERO) then
                        -- invalid operation, construct QNaN
                        v.fpscr(FPSCR_VXIMZ) := '1';
                        qnan_result := '1';
                    elsif r.a.class = INFINITY or r.c.class = INFINITY then
                        if r.b.class = INFINITY and is_add = '0' then
                            -- invalid operation, construct QNaN
                            v.fpscr(FPSCR_VXISI) := '1';
                            qnan_result := '1';
                        else
                            -- result is infinity
                            v.result_class := INFINITY;
                            v.result_sign := r.a.negative xor r.c.negative xor r.insn(2);
                        end if;
                    else
                        -- Here A is zero, C is zero, or B is infinity
                        -- Result is +/-B in all of those cases
                        v.result_class := r.b.class;
                        v.result_exp := r.b.exponent;
                        if v.result_class /= ZERO or is_add = '1' then
                            v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
                        else
                            -- have to be careful about rule for 0 - 0 result sign
                            v.result_sign := (r.round_mode(1) and r.round_mode(0)) xor r.insn(2);
                        end if;
                        opsel_a <= AIN_B;
                    end if;
                    arith_done := '1';
                end if;
            when RENORM_A =>
                renormalize := '1';
                v.state := RENORM_A2;
@ -1426,8 +1543,16 @@ begin
                if r.insn(4) = '1' then
                    opsel_a <= AIN_C;
                    if r.c.mantissa(54) = '1' then
-                        v.first := '1';
+                        if r.insn(3) = '0' or r.b.class = ZERO then
-                        v.state := MULT_1;
+                            v.first := '1';
                            v.state := MULT_1;
                        else
                            v.madd_cmp := '0';
                            if new_exp + 1 >= r.b.exponent then
                                v.madd_cmp := '1';
                            end if;
                            v.state := DO_FMADD;
                        end if;
                    else
                        v.state := RENORM_C;
                    end if;
@ -1462,11 +1587,20 @@ begin
            when RENORM_C2 =>
                set_c := '1';
                v.result_exp := new_exp;
-                v.first := '1';
+                if r.insn(3) = '0' or r.b.class = ZERO then
-                v.state := MULT_1;
+                    v.first := '1';
                    v.state := MULT_1;
                else
                    v.madd_cmp := '0';
                    if new_exp + 1 >= r.b.exponent then
                        v.madd_cmp := '1';
                    end if;
                    v.state := DO_FMADD;
                end if;
            when ADD_SHIFT =>
                opsel_r <= RES_SHIFT;
                v.x := s_nz;
                set_x := '1';
                longmask := '0';
                v.state := ADD_2;
@ -1545,6 +1679,78 @@ begin
                    v.state := FINISH;
                end if;
            when FMADD_1 =>
                -- Addend is bigger here
                v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
                -- note v.shift is at most -2 here
                v.shift := r.result_exp - r.b.exponent;
                opsel_r <= RES_MULT;
                opsel_s <= S_MULT;
                set_s := '1';
                f_to_multiply.valid <= r.first;
                if multiply_to_f.valid = '1' then
                    v.state := ADD_SHIFT;
                end if;
            when FMADD_2 =>
                -- Product is potentially bigger here
                set_s := '1';
                opsel_s <= S_SHIFT;
                v.shift := r.shift - to_signed(64, EXP_BITS);
                v.state := FMADD_3;
            when FMADD_3 =>
                opsel_r <= RES_SHIFT;
                v.first := '1';
                v.state := FMADD_4;
            when FMADD_4 =>
                msel_add <= MULADD_RS;
                f_to_multiply.valid <= r.first;
                msel_inv <= r.is_subtract;
                opsel_r <= RES_MULT;
                opsel_s <= S_MULT;
                set_s := '1';
                v.shift := to_signed(56, EXP_BITS);
                if multiply_to_f.valid = '1' then
                    if multiply_to_f.result(121) = '1' then
                        v.state := FMADD_5;
                    else
                        v.state := FMADD_6;
                    end if;
                end if;
            when FMADD_5 =>
                -- negate R:S:X
                v.result_sign := not r.result_sign;
                opsel_ainv <= '1';
                carry_in <= not (s_nz or r.x);
                opsel_s <= S_NEG;
                set_s := '1';
                v.shift := to_signed(56, EXP_BITS);
                v.state := FMADD_6;
            when FMADD_6 =>
                if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then
                    if s_nz = '0' then
                        -- must be a subtraction, and r.x must be zero
                        v.result_class := ZERO;
                        v.result_sign := r.round_mode(1) and r.round_mode(0);
                        arith_done := '1';
                    else
                        -- R is all zeroes but there are non-zero bits in S
                        -- so shift them into R and set S to 0
                        opsel_r <= RES_SHIFT;
                        set_s := '1';
                        -- stay in state FMADD_6
                    end if;
                elsif r.r(56 downto 54) = "001" then
                    v.state := FINISH;
                else
                    renormalize := '1';
                    v.state := NORMALIZE;
                end if;
            when LOOKUP =>
                opsel_a <= AIN_B;
                -- wait one cycle for inverse_table[B] lookup
@ -2097,6 +2303,9 @@ begin
            when MULADD_A =>
                -- addend is A in 16.112 format
                maddend(121 downto 58) := r.a.mantissa;
            when MULADD_RS =>
                -- addend is concatenation of R and S in 16.112 format
                maddend := "000000" & r.r & r.s & "00";
            when others =>
        end case;
        if msel_inv = '1' then
@ -2167,7 +2376,7 @@ begin
        end if;
        in_b <= in_b0;
        if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
-            shift_res := shifter_64(r.r & shiftin & 55x"00000000000000",
+            shift_res := shifter_64(r.r & (shiftin or r.s(55)) & r.s(54 downto 0),
                                    std_ulogic_vector(r.shift(6 downto 0)));
        else
            shift_res := (others => '0');
@ -2230,6 +2439,21 @@ begin
                result <= misc;
        end case;
        v.r := result;
        -- Update the S register (the 56-bit extension of R declared as
        -- "extended fraction") only when the current state asked for it via
        -- set_s; opsel_s selects the new value.
        if set_s = '1' then
            case opsel_s is
                when S_NEG =>
                    -- S part of negating R:S:X: invert S and add 1 only when
                    -- X is 0, i.e. when the +1 of the negation carries out of
                    -- the X bit.  The matching carry into R is supplied by the
                    -- FMADD_5 state as carry_in = not (s_nz or r.x).
                    v.s := std_ulogic_vector(unsigned(not r.s) + (not r.x));
                when S_MULT =>
                    -- Take bits 57..2 of the multiplier result, the same bit
                    -- positions S occupies when R:S is fed to the multiplier
                    -- as the MULADD_RS addend.
                    v.s := multiply_to_f.result(57 downto 2);
                when S_SHIFT =>
                    -- Capture bits 63..8 of the shifter output in S and fold
                    -- any non-zero bits among the low 8 into the sticky bit X.
                    v.s := shift_res(63 downto 8);
                    if shift_res(7 downto 0) /= x"00" then
                        v.x := '1';
                    end if;
                when others =>
                    -- S_ZERO (the default opsel_s value): clear S
                    v.s := (others => '0');
            end case;
        end if;
        if set_a = '1' then
            v.a.exponent := new_exp;
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@ -1338,6 +1338,76 @@ int fpu_test_22(void)
 	return trapit(0, test22);
 }
 /*
  * Test vectors for test23 (fmadd/fmsub/fnmadd/fnmsub).
  * Every field is a raw 64-bit pattern that test23 moves to or from a
  * floating-point register with lfd/stfd, so the values are the bit
  * images of doubles rather than integers.
  */
 struct fmavals {
 	unsigned long ra;	/* operand loaded into FPR 6 (first multiplicand) */
 	unsigned long rc;	/* operand loaded into FPR 7 (second multiplicand) */
 	unsigned long rb;	/* operand loaded into FPR 8 (addend) */
 	unsigned long fma;	/* expected result of fmadd */
 	unsigned long fms;	/* expected result of fmsub */
 	unsigned long nfma;	/* expected result of fnmadd */
 	unsigned long nfms;	/* expected result of fnmsub */
 } fmavals[] = {
 	/* all-zero operands: +0 for fmadd/fmsub, -0 for the negated forms */
 	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
 	  0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 },
 	/* NaN operands: the expected result is ra's NaN if ra is a NaN, */
 	/* otherwise rb's NaN, otherwise rc's NaN */
 	{ 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000,
 	  0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000 },
 	{ 0x0000000000000000, 0x7ffc000000000000, 0x7ffb000000000000,
 	  0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000 },
 	{ 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000,
 	  0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000 },
 	/* 1.0 * -0.0 and 1.0 * -1.0 with a very large rb: expected results are +/-rb */
 	{ 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, 
 	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
 	{ 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, 
 	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
 	/* +infinity * -1.0: -infinity, or +infinity for the negated forms */
 	{ 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, 
 	  0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
 	/* +infinity * +0: all four forms expect the quiet NaN 0x7ff8000000000000 */
 	{ 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, 
 	  0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000 },
 	/* finite operands; in several rows the product and rb nearly cancel in */
 	/* either the add or the subtract form, giving a much smaller result */
 	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, 
 	  0x4000000010000000, 0xbe80000000000000, 0xc000000010000000, 0x3e80000000000000 },
 	{ 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000,
 	  0x4000000000000001, 0x3cc0000000000000, 0xc000000000000001, 0xbcc0000000000000 },
 	{ 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000,
 	  0x4000000000000002, 0x3cd4000000000002, 0xc000000000000002, 0xbcd4000000000002 },
 	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000,
 	  0xaca765753908cd20, 0x3030000000000000, 0x2ca765753908cd20, 0xb030000000000000 },
 	{ 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000,
 	  0x2cd3b3efbf5e2229, 0x3030000000000000, 0xacd3b3efbf5e2229, 0xb030000000000000 },
 	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000,
 	  0xb05e0068a0000000, 0x3061003450000000, 0x305e0068a0000000, 0xb061003450000000 },
 };
 /*
  * Run fmadd, fmsub, fnmadd and fnmsub on every entry of fmavals[] and
  * compare the four stored results with the expected bit patterns.
  *
  * arg is not used.
  * Returns 0 when every entry matches; otherwise prints the entry index and
  * the four actual results in hex and returns the 1-based index of the first
  * failing entry.
  */
 int test23(long arg)
 {
 	unsigned long results[4];
 	long i = 0;
 	/* FPS_RN_NEAR is presumably round-to-nearest -- confirm against its definition */
 	set_fpscr(FPS_RN_NEAR);
 	while (i < sizeof(fmavals) / sizeof(fmavals[0])) {
 		struct fmavals *vp = &fmavals[i];
 		/* load ra/rc/rb into FPRs 6/7/8, then fmadd -> results[0] */
 		asm("lfd 6,0(%0); lfd 7,8(%0); lfd 8,16(%0); fmadd 0,6,7,8; stfd 0,0(%1)"
 		    : : "b" (&vp->ra), "b" (results) : "memory");
 		/*
 		 * NOTE(review): this statement reuses FPRs 6-8 as left by the
 		 * previous asm and neither declares FPR clobbers; presumably the
 		 * build keeps the compiler away from the FPRs -- confirm.
 		 */
 		asm("fmsub 1,6,7,8; fnmadd 2,6,7,8; fnmsub 3,6,7,8; stfd 1,8(%0); stfd 2,16(%0); stfd 3,24(%0)"
 		    : : "b" (results) : "memory");
 		if (results[0] == vp->fma && results[1] == vp->fms &&
 		    results[2] == vp->nfma && results[3] == vp->nfms) {
 			/* all four results match, move on to the next entry */
 			++i;
 			continue;
 		}
 		/* mismatch: report the entry index and what was actually computed */
 		print_hex(i, 2, " ");
 		print_hex(results[0], 16, " ");
 		print_hex(results[1], 16, " ");
 		print_hex(results[2], 16, " ");
 		print_hex(results[3], 16, "\r\n");
 		return i + 1;
 	}
 	return 0;
 }
 /*
  * Entry point for test 23: calls enable_fp() and then runs test23 through
  * trapit() with argument 0, returning whatever trapit() reports.
  */
 int fpu_test_23(void)
 {
 	int rc;
 	enable_fp();
 	rc = trapit(0, test23);
 	return rc;
 }
 int fail = 0;
 void do_test(int num, int (*test)(void))
@ -1385,6 +1455,7 @@ int main(void)
 	do_test(20, fpu_test_20);
 	do_test(21, fpu_test_21);
 	do_test(22, fpu_test_22);
 	do_test(23, fpu_test_23);
 	return fail;
 }
--- a/tests/test_fpu.bin
+++ b/tests/test_fpu.bin
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@ -20,3 +20,4 @@ test 19:PASS
 test 20:PASS
 test 21:PASS
 test 22:PASS
 test 23:PASS