Merge pull request #457 from paulusmack/fixes

FPU fixes, mostly for bugs found by comparing results from random instruction sequences (generated by simple_random) with POWER9.
3 weeks ago · a1624a50da
parent 0b3df8ab00 09b340e845
commit a1624a50da
3 changed files with 414 additions and 309 deletions
--- a/fpu.vhdl
+++ b/fpu.vhdl
@ -51,7 +51,7 @@ architecture behaviour of fpu is
                     DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
                     DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
                     DO_FCFID, DO_FCTI,
-                     DO_FRSP, DO_FRSP_2, DO_FRI,
+                     DO_FRSP, DO_FRI,
                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD,
                     DO_FRE,
                     DO_FSEL,
@ -72,9 +72,9 @@ architecture behaviour of fpu is
                     INT_SHIFT, INT_ROUND, INT_ISHIFT,
                     INT_FINAL, INT_CHECK, INT_OFLOW,
                     FINISH, NORMALIZE,
-                     ROUND_UFLOW, ROUND_OFLOW,
+                     ROUND_UFLOW_DIS, ROUND_UFLOW_EN,
                     ROUND_OFLOW_DIS, ROUND_OFLOW_EN,
                     ROUNDING, ROUND_INC, ROUNDING_2, ROUNDING_3,
                     DENORM,
                     RENORM_A, RENORM_B, RENORM_C,
                     RENORM_1, RENORM_2,
                     IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3,
@ -98,6 +98,7 @@ architecture behaviour of fpu is
        zero_divide   : std_ulogic;
        new_fpscr     : std_ulogic_vector(31 downto 0);
        immed_result  : std_ulogic;      -- result is an input, zero, infinity or NaN
        need_finish   : std_ulogic;      -- result needs further processing
        qnan_result   : std_ulogic;
        result_sel    : std_ulogic_vector(2 downto 0);
        result_class  : fp_number_class;
@ -144,7 +145,7 @@ architecture behaviour of fpu is
        int_result   : std_ulogic;
        cr_result    : std_ulogic_vector(3 downto 0);
        cr_mask      : std_ulogic_vector(7 downto 0);
-        old_exc      : std_ulogic_vector(4 downto 0);
+        old_exc      : std_ulogic_vector(12 downto 0);
        update_fprf  : std_ulogic;
        quieten_nan  : std_ulogic;
        nsnan_result : std_ulogic;
@ -158,6 +159,7 @@ architecture behaviour of fpu is
        is_multiply  : std_ulogic;
        is_inverse   : std_ulogic;
        is_sqrt      : std_ulogic;
        do_renorm_b  : std_ulogic;
        first        : std_ulogic;
        count        : unsigned(1 downto 0);
        doing_ftdiv  : std_ulogic_vector(1 downto 0);
@ -187,6 +189,7 @@ architecture behaviour of fpu is
        cycle_1_ar   : std_ulogic;
        regsel       : std_ulogic_vector(2 downto 0);
        is_nan_inf   : std_ulogic;
        zero_fri     : std_ulogic;
    end record;
    type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@ -312,6 +315,7 @@ architecture behaviour of fpu is
    constant RSCON2_63      : std_ulogic_vector(3 downto 0) := "0111";
    constant RSCON2_64      : std_ulogic_vector(3 downto 0) := "1000";
    constant RSCON2_MINEXP  : std_ulogic_vector(3 downto 0) := "1001";
    constant RSCON2_DPMINX  : std_ulogic_vector(3 downto 0) := "1010";
    signal rs_sel1       : std_ulogic_vector(1 downto 0);
    signal rs_sel2       : std_ulogic;
@ -713,9 +717,13 @@ architecture behaviour of fpu is
    end;
    -- Determine result flags to write into the FPSCR
-    function result_flags(sign: std_ulogic; class: fp_number_class; unitbit: std_ulogic)
+    function result_flags(sign: std_ulogic; class: fp_number_class; int_result: std_ulogic;
                          unitbit: std_ulogic)
        return std_ulogic_vector is
    begin
        if int_result = '1' then
            return "00000";
        else
            case class is
                when ZERO =>
                    return sign & "0010";
@ -726,6 +734,7 @@ architecture behaviour of fpu is
                when NAN =>
                    return "10001";
            end case;
        end if;
    end;
 begin
@ -767,6 +776,9 @@ begin
                end if;
            else
                assert not (r.state /= IDLE and e_in.valid = '1') severity failure;
                assert not (rin.state = FINISH and rin.r = 64x"0" and rin.x = '1');
                assert not (rin.state = ROUNDING and rin.r(UNIT_BIT) = '0' and
                            not (rin.tiny = '1' or rin.zero_fri = '1'));
                r <= rin;
            end if;
        end if;
@ -827,6 +839,7 @@ begin
        e.zero_divide := '0';
        e.new_fpscr := (others => '0');
        e.immed_result := '0';
        e.need_finish := '0';
        e.qnan_result := '0';
        e.result_sel := AIN_ZERO;
        e.result_class := FINITE;
@ -905,6 +918,11 @@ begin
                -- result is +/- B
                e.result_sel := AIN_B;
                e.result_class := r.b.class;
                -- r.result_sign is already correct
                if r.b.class = FINITE and r.int_result = '0' and
                    (r.single_prec = '1' or (r.fpscr(FPSCR_UE) = '1' and r.b.denorm = '1')) then
                    e.need_finish := '1';
                end if;
            else
                e.result_class := ZERO;
            end if;
@ -919,6 +937,10 @@ begin
            e.immed_result := '1';
            e.result_sel := AIN_B;
            e.result_class := r.b.class;
            if r.b.class = FINITE and r.int_result = '0' and
                (r.single_prec = '1' or (r.fpscr(FPSCR_UE) = '1' and r.b.denorm = '1')) then
                e.need_finish := '1';
            end if;
        elsif r.use_b = '1' and r.b.class = ZERO and r.is_multiply = '0' then
            -- B is zero, other operands are finite
@ -931,6 +953,11 @@ begin
            elsif r.is_addition = '1' then
                -- fadd, result is A
                e.result_sel := AIN_A;
                e.rsgn_op := RSGN_SEL;
                if r.a.class = FINITE and r.int_result = '0' and
                    (r.single_prec = '1' or (r.fpscr(FPSCR_UE) = '1' and r.a.denorm = '1')) then
                    e.need_finish := '1';
                end if;
            else
                -- other things, result is zero
                e.result_class := ZERO;
@ -979,6 +1006,7 @@ begin
        variable exp_huge    : std_ulogic;
        variable clz         : std_ulogic_vector(5 downto 0);
        variable set_x       : std_ulogic;
        variable set_xs      : std_ulogic;
        variable mshift      : signed(EXP_BITS-1 downto 0);
        variable need_check  : std_ulogic;
        variable msb         : std_ulogic;
@ -1032,6 +1060,7 @@ begin
        variable bneg        : std_ulogic;
        variable ci          : std_ulogic;
        variable rormr       : std_ulogic_vector(63 downto 0);
        variable sorms       : std_ulogic_vector(55 downto 0);
    begin
        v := r;
        v.complete := '0';
@ -1048,7 +1077,6 @@ begin
            v.writing_fpr := '0';
            v.writing_cr := '0';
            v.writing_xer := '0';
            v.comm_fpscr := r.fpscr;
            v.illegal := '0';
        end if;
@ -1076,6 +1104,8 @@ begin
            v.is_addition := '0';
            v.is_subtract := '0';
            v.is_inverse := '0';
            v.add_bsmall := '0';
            v.do_renorm_b := '0';
            fpin_a := '0';
            fpin_b := '0';
            fpin_c := '0';
@ -1088,6 +1118,7 @@ begin
            v.quieten_nan := '1';
            v.int_result := '0';
            v.is_arith := '0';
            v.zero_fri := '0';
            case e_in.op is
                when OP_FP_ARITH =>
                    fpin_a := e_in.valid_a;
@ -1112,6 +1143,8 @@ begin
                            v.result_sign := e_in.fra(63);
                            if unsigned(e_in.fra(62 downto 52)) <= unsigned(e_in.frb(62 downto 52)) then
                                v.result_sign := e_in.frb(63) xnor e_in.insn(1);
                            else
                                v.add_bsmall := '1';
                            end if;
                            v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor e_in.insn(1));
                        when "11001" =>         -- fmul
@ -1124,17 +1157,24 @@ begin
                            v.is_subtract := not (e_in.fra(63) xor e_in.frb(63) xor
                                                  e_in.frc(63) xor e_in.insn(1));
                            v.negate := e_in.insn(2);
                            v.do_renorm_b := '1';
                        when "10010" =>         -- fdiv
                            v.is_inverse := '1';
                            v.result_sign := e_in.fra(63) xor e_in.frb(63);
                            v.do_renorm_b := '1';
                        when "11000" | "11010" =>       -- fre and frsqrte
                            v.is_inverse := '1';
                            v.result_sign := e_in.frb(63);
                            v.do_renorm_b := '1';
                        when "01110" | "01111" =>       -- fcti*
                            v.int_result := '1';
                            v.result_sign := e_in.frb(63);
-                        when others =>                  -- fri* and frsp
+                        when "01000" =>                 -- fri*
                            v.zero_fri := '1';
                            v.result_sign := e_in.frb(63);
                        when others =>                  -- frsp and fsqrt
                            v.result_sign := e_in.frb(63);
                            v.do_renorm_b := '1';
                    end case;
                when OP_FP_CMP =>
                    fpin_a := e_in.valid_a;
@ -1145,12 +1185,21 @@ begin
                    opcbits := e_in.insn(10) & e_in.insn(8) & e_in.insn(4) & e_in.insn(2) & e_in.insn(1);
                    exec_state := misc_decode(to_integer(unsigned(opcbits)));
                    case opcbits is
-                        when "10010" | "11010" | "10011" =>
+                        when "10010" | "11010" =>
-                            -- fmrg*, mffs
+                            -- fmrg*
                            v.int_result := '1';
                            v.result_sign := '0';
                        when "10011" =>
                            -- mffs*
                            v.int_result := '1';
                            v.result_sign := '0';
                            if e_in.insn(20 downto 16) /= "00000" then
                                -- mffs* variants other than mffs have bit 0 reserved
                                v.rc := '0';
                            end if;
                        when "10110" =>        -- fcfid
                            v.result_sign := e_in.frb(63);
                            v.longmask := e_in.single;
                        when others =>
                            v.result_sign := '0';
                    end case;
@ -1211,7 +1260,6 @@ begin
            end case;
            v.tiny := '0';
            v.denorm := '0';
            v.add_bsmall := '0';
            v.int_ovf := '0';
            v.div_close := '0';
@ -1268,6 +1316,9 @@ begin
        end if;
        -- Compare P with zero and with B
        -- This has a 2-bit shift in it (p(59..4) compared to b(57..2))
        -- because it's used in the FP division code to determine whether
        -- to increment the quotient at bit 2 (DP_RBIT).
        px_nz := or (r.p(UNIT_BIT + 1 downto 4));
        pcmpb_eq := '0';
        if r.p(59 downto 4) = r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT) then
@ -1279,6 +1330,9 @@ begin
        elsif unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT)) then
            pcmpb_lt := '1';
        end if;
        -- Compare P with zero and with C
        -- This is used in the square root and integer division code
        -- to decide whether to increment the result by 1
        pcmpc_eq := '0';
        if r.p = r.c.mantissa then
            pcmpc_eq := '1';
@ -1303,13 +1357,14 @@ begin
        opsel_s <= S_ZERO;
        misc_sel <= "000";
        opsel_sel <= AIN_ZERO;
-        fpscr_mask := (others => '1');
+        fpscr_mask := x"FFFFFFFF";
        cr_op := CROP_NONE;
        update_fx := '0';
        arith_done := '0';
        invalid := '0';
        zero_divide := '0';
        set_x := '0';
        set_xs := '0';
        qnan_result := '0';
        set_a := '0';
        set_a_exp := '0';
@ -1354,12 +1409,6 @@ begin
        rsgn_op := RSGN_NOP;
        rcls_op <= RCLS_NOP;
        if r.cycle_1_ar = '1' then
            v.fpscr(FPSCR_FR) := '0';
            v.fpscr(FPSCR_FI) := '0';
            v.result_class := FINITE;
        end if;
        case r.state is
            when IDLE =>
                v.invalid := '0';
@ -1374,7 +1423,7 @@ begin
                    end if;
                end if;
                v.x := '0';
-                v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
+                v.old_exc := r.fpscr(FPSCR_OX downto FPSCR_VXVC) & r.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI);
                set_s := '1';
                v.regsel := AIN_ZERO;
@ -1391,7 +1440,7 @@ begin
                    v.state := RENORM_A;
                elsif r.c.denorm = '1' then
                    v.state := RENORM_C;
-                elsif r.b.denorm = '1' and (r.is_inverse = '1' or r.is_sqrt = '1') then
+                elsif r.b.denorm = '1' and r.do_renorm_b = '1' then
                    v.state := RENORM_B;
                elsif r.is_multiply = '1' and r.b.class = ZERO then
                    v.state := DO_FMUL;
@ -1410,11 +1459,10 @@ begin
                for i in 0 to 7 loop
                    if i = j then
                        k := (7 - i) * 4;
                        v.cr_result := r.fpscr(k + 3 downto k);
                        fpscr_mask(k + 3 downto k) := "0000";
                    end if;
                end loop;
-                v.fpscr := r.fpscr and (fpscr_mask or x"6007F8FF");
+                v.fpscr := r.fpscr and (fpscr_mask or x"6007F0FF");
                v.instr_done := '1';
            when DO_FTDIV =>
@ -1477,6 +1525,7 @@ begin
                        v.fpscr(31 - i) := r.insn(6);
                    end if;
                end loop;
                update_fx := '1';
                v.instr_done := '1';
            when DO_MTFSFI =>
@ -1583,22 +1632,7 @@ begin
                set_r := '1';
                re_sel2 <= REXP2_B;
                re_set_result <= '1';
-                v.state := DO_FRSP_2;
+                v.state := FINISH;
            when DO_FRSP_2 =>
                -- r.shift = 0
                -- set shift to exponent - -126 (for ROUND_UFLOW state)
                rs_sel1 <= RSH1_B;
                rs_con2 <= RSCON2_MINEXP;
                rs_neg2 <= '1';
                set_x := '1';   -- uses r.r and r.shift
                if r.b.exponent < to_signed(-126, EXP_BITS) then
                    v.state := ROUND_UFLOW;
                elsif r.b.exponent > to_signed(127, EXP_BITS) then
                    v.state := ROUND_OFLOW;
                else
                    v.state := ROUNDING;
                end if;
            when DO_FCTI =>
                -- instr bit 9: 1=dword 0=word
@ -1611,6 +1645,7 @@ begin
                re_set_result <= '1';
                rs_sel1 <= RSH1_B;
                rs_neg2 <= '1';
                v.single_prec := not r.insn(9);
                if r.b.exponent >= to_signed(64, EXP_BITS) or
                    (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then
@ -1640,6 +1675,8 @@ begin
                rcls_op <= RCLS_SEL;
                re_con2 <= RECON2_UNIT;
                re_set_result <= '1';
                v.fpscr(FPSCR_FR) := '0';
                v.fpscr(FPSCR_FI) := '0';
                if r.b.class = ZERO then
                    arith_done := '1';
                else
@ -1657,15 +1694,13 @@ begin
                rs_sel1 <= RSH1_B;
                rs_neg1 <= '1';
                rs_sel2 <= RSH2_A;
-                v.add_bsmall := '0';
+                if r.add_bsmall = '1' then
-                if r.a.exponent = r.b.exponent then
+                    v.state := ADD_1;
                elsif r.a.exponent = r.b.exponent then
                    v.state := ADD_2B;
-                elsif r.a.exponent < r.b.exponent then
+                elsif v.add_bsmall = '0' then
                    v.longmask := '0';
                    v.state := ADD_SHIFT;
                else
                    v.add_bsmall := '1';
                    v.state := ADD_1;
                end if;
            when DO_FMUL =>
@ -1705,7 +1740,8 @@ begin
                misc_sel <= "111";
                set_r := '1';
                re_set_result <= '1';
-                arith_done := '1';
+                v.writing_fpr := '1';
                v.instr_done := '1';
            when DO_FSQRT =>
                opsel_a <= AIN_B;
@ -1737,14 +1773,16 @@ begin
                re_set_result <= '1';
                -- put b.exp into shift
                rs_sel1 <= RSH1_B;
-                if (r.a.exponent + r.c.exponent + 1) < r.b.exponent then
+                if (r.a.exponent + r.c.exponent + 2) < r.b.exponent then
-                    -- addend is bigger, do multiply first
+                    -- addend is definitely bigger, do multiply first
                    -- if subtracting, sign is opposite to initial estimate
                    f_to_multiply.valid <= '1';
                    v.first := '1';
                    v.state := FMADD_0;
                else
-                    -- product is bigger, shift B first
+                    -- product may be bigger, or the answer might be
                    -- close to 0; shift B first so the multiplier does
                    -- the add/subtract operation.
                    v.state := FMADD_1;
                end if;
@ -1791,7 +1829,7 @@ begin
                if r.c.denorm = '1' then
                    -- must be either fmul or fmadd/sub
                    v.state := RENORM_C;
-                elsif r.b.denorm = '1' and r.is_addition = '0' then
+                elsif r.b.denorm = '1' and r.do_renorm_b = '1' then
                    v.state := RENORM_B;
                elsif r.is_multiply = '1' and r.b.class = ZERO then
                    v.state := DO_FMUL;
@ -1807,6 +1845,7 @@ begin
                re_sel2 <= REXP2_B;
                re_set_result <= '1';
                -- set shift to b.exp - a.exp
                -- (N.B., shift can be 0 if B is denorm and A's exp is -1022)
                rs_sel1 <= RSH1_B;
                rs_sel2 <= RSH2_A;
                rs_neg2 <= '1';
@ -1821,6 +1860,7 @@ begin
                re_set_result <= '1';
                v.x := s_nz;
                set_x := '1';
                set_s := '1';
                v.longmask := r.single_prec;
                if r.add_bsmall = '1' then
                    v.state := ADD_2;
@ -1859,25 +1899,14 @@ begin
                    -- result is opposite sign to expected
                    rsgn_op := RSGN_INV;
                    set_r := '1';
                    v.state := FINISH;
                elsif r.r(UNIT_BIT + 1) = '1' then
                    -- sum overflowed, shift right
                    opsel_r <= RES_SHIFT;
                    set_r := '1';
                    re_set_result <= '1';
                    set_x := '1';
                    if exp_huge = '1' then
                        v.state := ROUND_OFLOW;
                    else
                        v.state := ROUNDING;
                    end if;
                elsif r.r(UNIT_BIT) = '1' then
                    set_x := '1';
                    v.state := ROUNDING;
                else
                    rs_norm <= '1';
                    v.state := NORMALIZE;
                end if;
                v.state := FINISH;
            when CMP_1 =>
                opsel_a <= AIN_A;
@ -1892,9 +1921,10 @@ begin
                v.instr_done := '1';
            when MULT_1 =>
                f_to_multiply.valid <= r.first;
                opsel_r <= RES_MULT;
                set_r := '1';
                opsel_s <= S_MULT;
                set_s := '1';
                if multiply_to_f.valid = '1' then
                    v.state := FINISH;
                end if;
@ -1920,8 +1950,8 @@ begin
                end if;
            when FMADD_1 =>
-                -- shift is b.exp, so new_exp is a.exp + c.exp - b.exp
+                -- shift is b.exp, so new_exp is a.exp + c.exp - b.exp (>= -2)
-                -- product is bigger here
+                -- product may be bigger here
                -- shift B right and use it as the addend to the multiplier
                -- for subtract, multiplier does B - A * C
                re_sel2 <= REXP2_B;
@ -1935,8 +1965,10 @@ begin
            when FMADD_2 =>
                -- Product is potentially bigger here
                -- r.shift = addend exp - product exp + 64, r.r = r.b.mantissa
                -- R contains B, S contains 0
                set_s := '1';
                opsel_s <= S_SHIFT;
                set_x := '1';
                -- set shift to r.shift - 64
                rs_sel1 <= RSH1_S;
                rs_con2 <= RSCON2_64;
@ -1979,25 +2011,18 @@ begin
                v.state := FMADD_6;
            when FMADD_6 =>
-                -- r.shift = UNIT_BIT (or 0, but only if r is now nonzero)
+                -- r.shift = UNIT_BIT
                set_r := '0';
                opsel_r <= RES_SHIFT;
                re_sel2 <= REXP2_NE;
                rs_norm <= '1';
                rcls_op <= RCLS_TZERO;
                if (r.r(UNIT_BIT + 2) or r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then
-                    -- S = 0 case is handled by RCLS_TZERO logic, otherwise...
+                    -- R is all zeroes but there may be non-zero bits in S
                    -- R is all zeroes but there are non-zero bits in S
                    -- so shift them into R and set S to 0
                    set_r := '1';
                    re_set_result <= '1';
                    set_s := '1';
                    v.state := FINISH;
                elsif r.r(UNIT_BIT + 2 downto UNIT_BIT) = "001" then
                    v.state := FINISH;
                else
                    v.state := NORMALIZE;
                end if;
                v.state := FINISH;
            when DIV_2 =>
                -- compute Y = inverse_table[B] (when count=0); P = 2 - B * Y
@ -2248,29 +2273,29 @@ begin
            when SQRT_11 =>
                -- compute P = A - R * R (remainder)
-                -- also put 2 * R + 1 into B for comparison with P
+                -- also put 2 * R + 1 into C for comparison with P
                msel_1 <= MUL1_R;
                msel_2 <= MUL2_R;
                msel_add <= MULADD_A;
                msel_inv <= '1';
                f_to_multiply.valid <= r.first;
                shiftin := '1';
-                set_b := r.first;
+                set_c := r.first;
                if multiply_to_f.valid = '1' then
                    v.state := SQRT_12;
                end if;
            when SQRT_12 =>
-                -- test if remainder is 0 or >= B = 2*R + 1
+                -- test if remainder is 0 or >= C = 2*R + 1
                set_r := '0';
                opsel_c <= CIN_INC;
-                if pcmpb_lt = '1' then
+                if pcmpc_lt = '1' then
                    -- square root is correct, set X if remainder non-zero
                    v.x := r.p(UNIT_BIT + 2) or px_nz;
                else
                    -- square root needs to be incremented by 1
                    set_r := '1';
-                    v.x := not pcmpb_eq;
+                    v.x := not pcmpc_eq;
                end if;
                v.state := FINISH;
@ -2318,10 +2343,13 @@ begin
                -- Check for possible overflows
                case r.insn(9 downto 8) is
                    when "00" =>        -- fctiw[z]
                        -- check bit 32 in case of rounding overflow
                        need_check := r.r(31) or (r.r(30) and not r.result_sign);
                    when "01" =>        -- fctiwu[z]
-                        need_check := r.r(31);
+                        -- check bit 32 in case of rounding overflow
                        need_check := r.r(31) or r.r(31);
                    when "10" =>        -- fctid[z]
                        -- can't get rounding overflow for 64-bit conversion
                        need_check := r.r(63) or (r.r(62) and not r.result_sign);
                    when others =>      -- fctidu[z]
                        need_check := r.r(63);
@ -2341,26 +2369,23 @@ begin
                else
                    msb := r.r(63);
                end if;
                opsel_r <= RES_MISC;
                misc_sel <= "110";
                if (r.insn(8) = '0' and msb /= r.result_sign) or
-                    (r.insn(8) = '1' and msb /= '1') then
+                    (r.insn(8) = '1' and msb /= '1') or
-                    set_r := '1';
+                    (r.insn(9) = '0' and r.r(32) /= r.result_sign) then
-                    v.fpscr(FPSCR_VXCVI) := '1';
+                    v.state := INT_OFLOW;
                    invalid := '1';
                else
                    set_r := '0';
                    if r.fpscr(FPSCR_FI) = '1' then
                        v.fpscr(FPSCR_XX) := '1';
                    end if;
                end if;
                    arith_done := '1';
                end if;
            when INT_OFLOW =>
                opsel_r <= RES_MISC;
                misc_sel <= "110";
                set_r := '1';
                v.fpscr(FPSCR_VXCVI) := '1';
                v.fpscr(FPSCR_FR downto FPSCR_FI) := "00";
                invalid := '1';
                arith_done := '1';
@ -2374,22 +2399,24 @@ begin
                v.state := ROUNDING;
            when FINISH =>
-                if r.is_multiply = '1' and px_nz = '1' then
+                -- r.shift = 0
                    v.x := '1';
                end if;
                -- set shift to new_exp - min_exp (N.B. rs_norm overrides this)
                -- assert that if r.r = 0 then r.x = 0 also
                rs_sel1 <= RSH1_NE;
                rs_con2 <= RSCON2_MINEXP;
                rs_neg2 <= '1';
                rcls_op <= RCLS_TZERO;
                if r.r(63 downto UNIT_BIT) /= std_ulogic_vector(to_unsigned(1, 64 - UNIT_BIT)) then
                    rs_norm <= '1';
                    v.state := NORMALIZE;
                else
                    set_x := '1';
-                    if exp_tiny = '1' then
+                    set_xs := r.is_multiply;
-                        v.state := ROUND_UFLOW;
+                    v.tiny := exp_tiny;
-                    elsif exp_huge = '1' then
+                    if exp_tiny = '1' and r.fpscr(FPSCR_UE) = '0' then
-                        v.state := ROUND_OFLOW;
+                        v.state := ROUND_UFLOW_DIS;
                    elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then
                        v.state := ROUND_OFLOW_DIS;
                    else
                        v.state := ROUNDING;
                    end if;
@ -2407,51 +2434,35 @@ begin
                rs_con2 <= RSCON2_MINEXP;
                rs_neg2 <= '1';
                set_x := '1';
-                if exp_tiny = '1' then
+                set_xs := r.is_multiply;
-                    v.state := ROUND_UFLOW;
+                v.tiny := exp_tiny;
-                elsif exp_huge = '1' then
+                if exp_tiny = '1' and r.fpscr(FPSCR_UE) = '0' then
-                    v.state := ROUND_OFLOW;
+                    v.state := ROUND_UFLOW_DIS;
                elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then
                    v.state := ROUND_OFLOW_DIS;
                else
                    v.state := ROUNDING;
                end if;
-            when ROUND_UFLOW =>
+            when ROUND_UFLOW_DIS =>
                -- r.shift = - amount by which exponent underflows
                v.tiny := '1';
                opsel_r <= RES_SHIFT;
                set_r := '0';
                if r.fpscr(FPSCR_UE) = '0' then
                -- disabled underflow exception case
                -- have to denormalize before rounding
                opsel_r <= RES_SHIFT;
                set_r := '0';
                set_r := '1';
                re_sel2 <= REXP2_NE;
                re_set_result <= '1';
                set_x := '1';
                v.state := ROUNDING;
                else
                    -- enabled underflow exception case
                    -- if denormalized, have to normalize before rounding
                    v.fpscr(FPSCR_UX) := '1';
                    re_sel1 <= REXP1_R;
                    re_con2 <= RECON2_BIAS;
                    re_set_result <= '1';
                    if r.r(UNIT_BIT) = '0' then
                        rs_norm <= '1';
                        v.state := NORMALIZE;
                    else
                        v.state := ROUNDING;
                    end if;
                end if;
-            when ROUND_OFLOW =>
+            when ROUND_OFLOW_DIS =>
                -- disabled overflow exception
                -- result depends on rounding mode
                rcls_op <= RCLS_TINF;
                v.fpscr(FPSCR_OX) := '1';
                opsel_r <= RES_MISC;
                misc_sel <= "010";
                set_r := '0';
                if r.fpscr(FPSCR_OE) = '0' then
                    -- disabled overflow exception
                    -- result depends on rounding mode
                set_r := '1';
                v.fpscr(FPSCR_XX) := '1';
                v.fpscr(FPSCR_FI) := '1';
@ -2459,94 +2470,120 @@ begin
                re_con2 <= RECON2_MAX;
                re_set_result <= '1';
                arith_done := '1';
                else
                    -- enabled overflow exception
                    re_sel1 <= REXP1_R;
                    re_con2 <= RECON2_BIAS;
                    re_neg2 <= '1';
                    re_set_result <= '1';
                    v.state := ROUNDING;
                end if;
            when ROUNDING =>
                -- r.r can be zero or denorm here for fri* instructions,
                -- and for disabled underflow exception cases.
                opsel_mask <= '1';
                set_r := '1';
                round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign);
                if r.zero_fri = '0' then
                    v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
                else
                    v.fpscr(FPSCR_FR downto FPSCR_FI) := "00";  -- for fri* instructions
                end if;
                if round(1) = '1' then
                    -- increment the LSB for the precision
                    v.state := ROUND_INC;
                elsif r.r(UNIT_BIT) = '0' then
-                    -- result after masking could be zero, or could be a
+                    -- Result after masking could be zero, or could be a
-                    -- denormalized result that needs to be renormalized
+                    -- denormalized result that needs to be renormalized,
                    -- but only for fri* instructions and for disabled
                    -- underflow exception cases.
                    -- For fri* instructions, result_exp is 52.
                    -- For disabled underflow exception cases for DP operations,
                    -- result_exp is -1022 and there is no point renormalizing
                    -- since it will just get denormalized again, but we do need
                    -- to check for a zero result in a subsequent cycle
                    -- after R is masked.
                    if r.result_exp > to_signed(-1022, EXP_BITS) then
                        rs_norm <= '1';
                    end if;
                    v.state := ROUNDING_3;
                elsif r.tiny = '1' and r.fpscr(FPSCR_UE) = '1' then
                    v.state := ROUND_UFLOW_EN;
                elsif r.result_exp > max_exp then
                    v.state := ROUND_OFLOW_EN;
                else
                    arith_done := '1';
                end if;
-                if round(0) = '1' then
+                if round(0) = '1' and r.zero_fri = '0' then
                    v.fpscr(FPSCR_XX) := '1';
                    if r.tiny = '1' then
                        v.fpscr(FPSCR_UX) := '1';
                end if;
                if round(0) = '1' and r.tiny = '1' then
                    v.fpscr(FPSCR_UX) := '1';
                end if;
            when ROUND_INC =>
                set_r := '1';
                opsel_a <= AIN_RND;
                -- set shift to -1
                rs_con2 <= RSCON2_1;
                rs_neg2 <= '1';
                v.state := ROUNDING_2;
            when ROUNDING_2 =>
                -- Check for overflow during rounding
-                -- r.shift = -1
+                -- r.shift = 0
                v.x := '0';
                re_sel2 <= REXP2_NE;
                opsel_r <= RES_SHIFT;
                set_r := '0';
                if r.r(UNIT_BIT + 1) = '1' then
                    set_r := '1';
                    re_set_result <= '1';
                    if exp_huge = '1' then
                        v.state := ROUND_OFLOW;
                    else
                        arith_done := '1';
                    end if;
                elsif r.r(UNIT_BIT) = '0' then
                    -- Do CLZ so we can renormalize the result
                    rs_norm <= '1';
                    v.state := ROUNDING_3;
                elsif r.r(UNIT_BIT) = '0' then
                    -- R is non-zero (we just incremented it)
                    -- If result_exp is -1022 here, don't normalize since
                    -- we would then need to denormalize again.
                    if r.result_exp > to_signed(-1022, EXP_BITS) then
                        rs_norm <= '1';
                    end if;
                    v.state := ROUNDING_3;
                elsif exp_huge = '1' then
                    v.state := ROUND_OFLOW_EN;
                elsif r.tiny = '1' and r.fpscr(FPSCR_UE) = '1' then
                    v.state := ROUND_UFLOW_EN;
                else
                    arith_done := '1';
                end if;
            when ROUNDING_3 =>
-                -- r.shift = clz(r.r) - 9
+                -- r.shift = clz(r.r) - 7 (or 0, or -7, if r.r is 0)
                -- Note clz may be done on the value before being masked
                -- to the result precision.
                opsel_r <= RES_SHIFT;
                set_r := '1';
                re_sel2 <= REXP2_NE;
-                -- set shift to new_exp - min_exp (== -1022)
+                -- set shift to new_exp - DP min_exp (== -1022)
                rs_sel1 <= RSH1_NE;
-                rs_con2 <= RSCON2_MINEXP;
+                rs_con2 <= RSCON2_DPMINX;
                rs_neg2 <= '1';
                rcls_op <= RCLS_TZERO;
                -- If the result is zero, that's handled below.
                -- Renormalize result after rounding
                re_set_result <= '1';
                v.denorm := exp_tiny;
-                if new_exp < to_signed(-1022, EXP_BITS) then
+                re_set_result <= '1';
-                    v.state := DENORM;
+                if exp_huge = '1' and r.fpscr(FPSCR_OE) = '0' then
                    v.state := ROUND_OFLOW_DIS;
                elsif exp_huge = '1' and r.fpscr(FPSCR_OE) = '1' then
                    v.state := ROUND_OFLOW_EN;
                elsif r.tiny = '1' and r.fpscr(FPSCR_UE) = '1' then
                    v.state := ROUND_UFLOW_EN;
                else
                    arith_done := '1';
                end if;
-            when DENORM =>
+            when ROUND_OFLOW_EN =>
-                -- r.shift = result_exp - -1022
+                -- enabled overflow exception
-                opsel_r <= RES_SHIFT;
+                -- rounding and normalization has been done
-                set_r := '1';
+                v.fpscr(FPSCR_OX) := '1';
-                re_sel2 <= REXP2_NE;
+                re_sel1 <= REXP1_R;
                re_con2 <= RECON2_BIAS;
                re_neg2 <= '1';
                re_set_result <= '1';
                arith_done := '1';
            when ROUND_UFLOW_EN =>
                -- enabled underflow exception
                -- rounding and normalization has been done
                v.fpscr(FPSCR_UX) := '1';
                re_sel1 <= REXP1_R;
                re_con2 <= RECON2_BIAS;
                re_set_result <= '1';
                arith_done := '1';
@ -3077,13 +3114,16 @@ begin
        -- Handle exceptions and special cases for arithmetic operations
        if r.cycle_1_ar = '1' then
            v.fpscr := r.fpscr or scinfo.new_fpscr;
            v.fpscr(FPSCR_FR) := '0';
            v.fpscr(FPSCR_FI) := '0';
            v.result_class := FINITE;
            invalid := scinfo.invalid;
            zero_divide := scinfo.zero_divide;
            qnan_result := scinfo.qnan_result;
            if scinfo.immed_result = '1' then
                -- state machine is in the DO_SPECIAL or DO_FSQRT state here
                arith_done := '1';
                set_r := '1';
                v.is_multiply := '0';           -- P is not valid
                opsel_r <= RES_MISC;
                opsel_sel <= scinfo.result_sel;
                if scinfo.qnan_result = '1' then
@ -3092,8 +3132,15 @@ begin
                    else
                        misc_sel <= "110";
                    end if;
                    arith_done := '1';
                else
                    misc_sel <= "111";
                    if scinfo.need_finish = '1' then
                        -- we have to do rounding or underflow exception processing on the result
                        v.state := FINISH;
                    else
                        arith_done := '1';
                    end if;
                end if;
                rsgn_op := scinfo.rsgn_op;
                v.result_class := scinfo.result_class;
@ -3140,12 +3187,11 @@ begin
                    when others =>
                end case;
            when RCLS_TZERO =>
-                if or (r.r(UNIT_BIT + 2 downto 0)) = '0' and s_nz = '0' then
+                if or (r.r) = '0' then
                    v.result_class := ZERO;
                    arith_done := '1';
                end if;
            when RCLS_TINF =>
                if r.fpscr(FPSCR_OE) = '0' then
                if r.round_mode(1 downto 0) = "00" or
                    (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then
                    v.result_class := INFINITY;
@ -3153,7 +3199,6 @@ begin
                else
                    v.fpscr(FPSCR_FR) := '0';
                end if;
                end if;
            when others =>
        end case;
@ -3171,7 +3216,7 @@ begin
                v.writing_fpr := '1';
                v.update_fprf := '1';
            end if;
-            if r.is_subtract = '1' and v.result_class = ZERO then
+            if r.is_subtract = '1' and v.result_class = ZERO and v.fpscr(FPSCR_FI) = '0' then
                rsign := r.round_mode(0) and r.round_mode(1);
            end if;
            if r.negate = '1' and v.result_class /= NAN then
@ -3242,7 +3287,7 @@ begin
        -- If shifting right, test if bits of R will be shifted out of significance
        if r.longmask = '1' then
-            mshift := to_signed(28, EXP_BITS);
+            mshift := to_signed(SP_RBIT - 1, EXP_BITS);
        else
            mshift := to_signed(-1, EXP_BITS);
        end if;
@ -3258,7 +3303,17 @@ begin
            if mshift >= to_signed(64, EXP_BITS) then
                mshift := to_signed(63, EXP_BITS);
            end if;
-            v.x := v.x or r.r(to_integer(unsigned(mshift(5 downto 0))));
+            v.x := v.x or rormr(to_integer(unsigned(mshift(5 downto 0))));
        end if;
        -- Test if there are non-zero bits in S which won't get shifted into R
        if set_xs = '1' and not is_X(r.shift) and r.shift < to_signed(56, EXP_BITS) then
            if r.shift > to_signed(0, EXP_BITS) then
                mshift := to_signed(55, EXP_BITS) - r.shift;
            else
                mshift := to_signed(55, EXP_BITS);
            end if;
            sorms := r.s or std_ulogic_vector(- signed(r.s));
            v.x := v.x or sorms(to_integer(unsigned(mshift(5 downto 0))));
        end if;
        asign := '0';
        case opsel_a is
@ -3284,6 +3339,8 @@ begin
        ci := '0';
        case opsel_c is
            when CIN_SUBEXT =>
                -- Used with opsel_b = BIN_ADDSUBR, which will invert it if
                -- r.subtract = 1, hence we use r.x here, rather than not r.x.
                ci := r.is_subtract and r.x;
            when CIN_ABSEXT =>
                ci := r.r(63) and (s_nz or r.x);
@ -3537,6 +3594,8 @@ begin
                        rsh_in2 := to_signed(64, EXP_BITS);
                    when RSCON2_MINEXP =>
                        rsh_in2 := min_exp;
                    when RSCON2_DPMINX =>
                        rsh_in2 := to_signed(-1022, EXP_BITS);
                    when others =>
                        rsh_in2 := to_signed(0, EXP_BITS);
                end case;
@ -3654,7 +3713,7 @@ begin
        end if;
        if r.update_fprf = '1' then
-            v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.res_sign, r.result_class,
+            v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.res_sign, r.result_class, r.int_result,
                                                             r.r(UNIT_BIT) and not r.denorm);
        end if;
@ -3663,10 +3722,15 @@ begin
        v.fpscr(FPSCR_FEX) := or (v.fpscr(FPSCR_VX downto FPSCR_XX) and
                                  v.fpscr(FPSCR_VE downto FPSCR_XE));
        if update_fx = '1' and
-            (v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then
+            ((v.fpscr(FPSCR_OX downto FPSCR_VXVC) & v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI)) and
             not r.old_exc) /= 13x"0" then
            v.fpscr(FPSCR_FX) := '1';
        end if;
        if r.complete = '1' or r.do_intr = '1' then
            v.comm_fpscr := v.fpscr;
        end if;
        if v.instr_done = '1' then
            if r.state /= IDLE then
                v.state := IDLE;
@ -3675,7 +3739,8 @@ begin
                if r.fp_rc = '1' then
                    v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
                end if;
-                v.sp_result := r.single_prec;
+                -- set sp_result for fctiw*
                v.sp_result := r.single_prec and not r.integer_op;
                v.res_int := r.int_result or r.integer_op;
                v.illegal := illegal;
                v.nsnan_result := r.quieten_nan;
@ -3709,11 +3774,17 @@ begin
        -- This mustn't depend on any fields of r that are modified in IDLE state.
        if r.res_int = '1' then
            fp_result <= r.r;
            if r.sp_result = '1' then
                fp_result(63 downto 32) <= r.r(31 downto 0);
            end if;
        else
            fp_result <= pack_dp(r.res_sign, r.result_class, r.result_exp, r.r,
                                 r.sp_result, r.nsnan_result);
        end if;
        -- Make sure the reserved bit 11 (52) of FPSCR can never be set
        v.fpscr(11) := '0';
        rin <= v;
    end process;
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@ -21,6 +21,8 @@
 #define FPS_VE		0x80
 #define FPS_VXCVI	0x100
 #define FPS_VXSOFT	0x400
 #define FPS_FI		0x20000
 #define FPS_FR		0x40000
 extern int trapit(long arg, int (*func)(long));
 extern void do_rfid(unsigned long msr);
@ -272,6 +274,7 @@ void set_fpscr(unsigned long fpscr)
 unsigned long fpscr_eval(unsigned long val)
 {
 	val &= ~0x60000000;	/* clear FEX and VX */
 	val &= ~0x00000800;	/* clear reserved bit 52 (BE) */
 	if (val & 0x1f80700)	/* test all VX* bits */
 		val |= 0x20000000;
 	if ((val >> 25) & (val >> 3) & 0x1f)
@ -348,15 +351,15 @@ int test4(long arg)
 		fpscr = fpscr_eval((fpscr & 0x0fffffff) | 0x70000000);
 		if (get_fpscr() != fpscr)
 			return 16 * i + 27;
-		asm("mtfsb0 21");
+		asm("mtfsb0 21");	/* VXSOFT */
 		fpscr = fpscr_eval(fpscr & ~(1 << (31-21)));
 		if (get_fpscr() != fpscr)
 			return 16 * i + 28;
 		asm("mtfsb1 21");
-		fpscr = fpscr_eval(fpscr | (1 << (31-21)));
+		fpscr = fpscr_eval(fpscr | (1 << (31-21)) | (1ul << 31));
 		if (get_fpscr() != fpscr)
 			return 16 * i + 29;
-		asm("mtfsb0 24");
+		asm("mtfsb0 24");	/* OE */
 		fpscr = fpscr_eval(fpscr & ~(1 << (31-24)));
 		if (get_fpscr() != fpscr)
 			return 16 * i + 30;
@ -653,29 +656,35 @@ struct roundvals {
 	unsigned long fpscr;
 	unsigned long dpval;
 	unsigned long spval;
 	unsigned long fpscr_fir;
 } roundvals[] = {
-	{ FPS_RN_NEAR,  0, 0 },
+	{ FPS_RN_NEAR|FPS_FI|FPS_FR,	0, 0, 0 },
-	{ FPS_RN_CEIL,  0x8000000000000000, 0x8000000000000000 },
+	{ FPS_RN_CEIL|FPS_FI|FPS_FR,	0x8000000000000000, 0x8000000000000000, 0 },
-	{ FPS_RN_NEAR,  0x402123456789abcd, 0x4021234560000000 },
+	{ FPS_RN_NEAR|FPS_FR,		0x402123456789abcd, 0x4021234560000000, FPS_FI },
-	{ FPS_RN_ZERO,  0x402123456789abcd, 0x4021234560000000 },
+	{ FPS_RN_ZERO|FPS_FR,		0x402123456789abcd, 0x4021234560000000, FPS_FI },
-	{ FPS_RN_CEIL,  0x402123456789abcd, 0x4021234580000000 },
+	{ FPS_RN_CEIL,			0x402123456789abcd, 0x4021234580000000, FPS_FR|FPS_FI },
-	{ FPS_RN_FLOOR, 0x402123456789abcd, 0x4021234560000000 },
+	{ FPS_RN_FLOOR,			0x402123456789abcd, 0x4021234560000000, FPS_FI },
-	{ FPS_RN_NEAR,  0x402123457689abcd, 0x4021234580000000 },
+	{ FPS_RN_NEAR,			0x402123457689abcd, 0x4021234580000000, FPS_FR|FPS_FI },
-	{ FPS_RN_ZERO,  0x402123457689abcd, 0x4021234560000000 },
+	{ FPS_RN_ZERO|FPS_FR|FPS_FI,	0x402123457689abcd, 0x4021234560000000, FPS_FI },
-	{ FPS_RN_CEIL,  0x402123457689abcd, 0x4021234580000000 },
+	{ FPS_RN_CEIL|FPS_FR,		0x402123457689abcd, 0x4021234580000000, FPS_FR|FPS_FI },
-	{ FPS_RN_FLOOR, 0x402123457689abcd, 0x4021234560000000 },
+	{ FPS_RN_FLOOR,			0x402123457689abcd, 0x4021234560000000, FPS_FI },
-	{ FPS_RN_NEAR,  0x4021234570000000, 0x4021234580000000 },
+	{ FPS_RN_NEAR,			0x4021234570000000, 0x4021234580000000, FPS_FR|FPS_FI },
-	{ FPS_RN_NEAR,  0x4021234550000000, 0x4021234540000000 },
+	{ FPS_RN_NEAR,			0x4021234550000000, 0x4021234540000000, FPS_FI },
-	{ FPS_RN_NEAR,  0x7ff123456789abcd, 0x7ff9234560000000 },
+	{ FPS_RN_NEAR|FPS_FR|FPS_FI,	0x7ff123456789abcd, 0x7ff9234560000000, 0 },
-	{ FPS_RN_ZERO,  0x7ffa3456789abcde, 0x7ffa345660000000 },
+	{ FPS_RN_ZERO|FPS_FR,		0x7ffa3456789abcde, 0x7ffa345660000000, 0 },
-	{ FPS_RN_FLOOR, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ FPS_RN_FLOOR|FPS_FR|FPS_FI,	0x7ff0000000000000, 0x7ff0000000000000, 0 },
-	{ FPS_RN_NEAR,  0x47e1234550000000, 0x47e1234540000000 },
+	{ FPS_RN_NEAR,			0x47e1234550000000, 0x47e1234540000000, FPS_FI },
-	{ FPS_RN_NEAR,  0x47f1234550000000, 0x7ff0000000000000 },
+	{ FPS_RN_NEAR,			0x47f1234550000000, 0x7ff0000000000000, FPS_FR|FPS_FI },
-	{ FPS_RN_ZERO,  0x47f1234550000000, 0x47efffffe0000000 },
+	{ FPS_RN_ZERO,			0x47f1234550000000, 0x47efffffe0000000, FPS_FI },
-	{ FPS_RN_CEIL,  0x47f1234550000000, 0x7ff0000000000000 },
+	{ FPS_RN_CEIL,			0x47f1234550000000, 0x7ff0000000000000, FPS_FR|FPS_FI },
-	{ FPS_RN_FLOOR, 0x47f1234550000000, 0x47efffffe0000000 },
+	{ FPS_RN_FLOOR,			0x47f1234550000000, 0x47efffffe0000000, FPS_FI },
-	{ FPS_RN_NEAR,  0x38012345b0000000, 0x38012345c0000000 },
+	{ FPS_RN_NEAR,			0x38012345b0000000, 0x38012345c0000000, FPS_FR|FPS_FI },
-	{ FPS_RN_NEAR,  0x37c12345b0000000, 0x37c1234400000000 },
+	{ FPS_RN_NEAR,			0x37c12345b0000000, 0x37c1234400000000, FPS_FI },
 	{ FPS_RN_NEAR,			0x0000008800000088, 0,			FPS_FI },
 	{ FPS_RN_NEAR,			0xc2000000c2000000, 0xc2000000c0000000,	FPS_FI },
 	{ FPS_RN_NEAR|FPS_OE,		0xefffffffffffffff, 0xe400000000000000,	FPS_FR|FPS_FI },
 	{ FPS_RN_NEAR|FPS_OE,		0xff0000ff43434343, 0xf30000ff40000000,	FPS_FI },
 	{ FPS_RN_NEAR|FPS_OE,		0xfc00fc0139fffcff, 0xf000fc0140000000,	FPS_FR|FPS_FI },
 };
 int test8(long arg)
@ -696,6 +705,13 @@ int test8(long arg)
 		}
 		if (check_fprf(result, true, fpscr))
 			return i + 0x101;
 		if ((fpscr & (FPS_FR|FPS_FI)) != roundvals[i].fpscr_fir) {
 			print_string("\r\n");
 			print_hex(i, 4, " ");
 			print_hex(fpscr, 8, " ");
 			print_hex(roundvals[i].fpscr_fir, 8, " ");
 			return i + 0x201;
 		}
 	}
 	return 0;
 }
@ -740,6 +756,8 @@ struct cvtivals {
 	{ 0x7ff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
 	{ 0xfff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
 	{ 0xbfd123456789abcd, 0, 0, 0, 0, {0, 0, 0, 0} },
 	{ 0x41effffffff00081, 0x100000000, 0x100000000, 0x7fffffff, 0xffffffff, { 0, 0, 1, 1 } },
 	{ 0xc1e0000000000000, 0xffffffff80000000, 0x0000000000000000, 0x80000000, 0x00000000, { 0, 1, 0, 1 } },
 };
 #define GET_VXCVI()	((get_fpscr() >> 8) & 1)
@ -814,6 +832,7 @@ struct cvtivals cvtizvals[] = {
 	{ 0xfff0000000000000, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
 	{ 0x7ff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
 	{ 0xfff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
 	{ 0xc1e0000000000000, 0xffffffff80000000, 0x0000000000000000, 0x80000000, 0x00000000, { 0, 1, 0, 1 } },
 };
 int test10(long arg)
@ -959,51 +978,53 @@ struct addvals {
 	unsigned long val_b;
 	unsigned long sum;
 	unsigned long diff;
 	unsigned long fpscr;
 } addvals[] = {
-	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, FPS_RN_NEAR },
-	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000, FPS_RN_NEAR },
-	{ 0x3fdfffffffffffff, 0x0000000000000000, 0x3fdfffffffffffff, 0x3fdfffffffffffff },
+	{ 0x3fdfffffffffffff, 0x0000000000000000, 0x3fdfffffffffffff, 0x3fdfffffffffffff, FPS_RN_NEAR },
-	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000, 0x0000000000000000 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000, 0x0000000000000000, FPS_RN_NEAR },
-	{ 0xbff0000000000000, 0xbff0000000000000, 0xc000000000000000, 0x0000000000000000 },
+	{ 0xbff0000000000000, 0xbff0000000000000, 0xc000000000000000, 0x0000000000000000, FPS_RN_NEAR },
-	{ 0x402123456789abcd, 0x4021000000000000, 0x403111a2b3c4d5e6, 0x3fb1a2b3c4d5e680 },
+	{ 0x402123456789abcd, 0x4021000000000000, 0x403111a2b3c4d5e6, 0x3fb1a2b3c4d5e680, FPS_RN_NEAR },
-	{ 0x4061200000000000, 0x406123456789abcd, 0x407121a2b3c4d5e6, 0xbfba2b3c4d5e6800 },
+	{ 0x4061200000000000, 0x406123456789abcd, 0x407121a2b3c4d5e6, 0xbfba2b3c4d5e6800, FPS_RN_NEAR },
-	{ 0x4061230000000000, 0x3fa4560000000000, 0x4061244560000000, 0x406121baa0000000 },
+	{ 0x4061230000000000, 0x3fa4560000000000, 0x4061244560000000, 0x406121baa0000000, FPS_RN_NEAR },
-	{ 0xc061230000000000, 0x3fa4560000000000, 0xc06121baa0000000, 0xc061244560000000 },
+	{ 0xc061230000000000, 0x3fa4560000000000, 0xc06121baa0000000, 0xc061244560000000, FPS_RN_NEAR },
-	{ 0x4061230000000000, 0xbfa4560000000000, 0x406121baa0000000, 0x4061244560000000 },
+	{ 0x4061230000000000, 0xbfa4560000000000, 0x406121baa0000000, 0x4061244560000000, FPS_RN_NEAR },
-	{ 0xc061230000000000, 0xbfa4560000000000, 0xc061244560000000, 0xc06121baa0000000 },
+	{ 0xc061230000000000, 0xbfa4560000000000, 0xc061244560000000, 0xc06121baa0000000, FPS_RN_NEAR },
-	{ 0x3fa1230000000000, 0x4064560000000000, 0x4064571230000000, 0xc06454edd0000000 },
+	{ 0x3fa1230000000000, 0x4064560000000000, 0x4064571230000000, 0xc06454edd0000000, FPS_RN_NEAR },
-	{ 0xbfa1230000000000, 0x4064560000000000, 0x406454edd0000000, 0xc064571230000000 },
+	{ 0xbfa1230000000000, 0x4064560000000000, 0x406454edd0000000, 0xc064571230000000, FPS_RN_NEAR },
-	{ 0x3fa1230000000000, 0xc064560000000000, 0xc06454edd0000000, 0x4064571230000000 },
+	{ 0x3fa1230000000000, 0xc064560000000000, 0xc06454edd0000000, 0x4064571230000000, FPS_RN_NEAR },
-	{ 0xbfa1230000000000, 0xc064560000000000, 0xc064571230000000, 0x406454edd0000000 },
+	{ 0xbfa1230000000000, 0xc064560000000000, 0xc064571230000000, 0x406454edd0000000, FPS_RN_NEAR },
-	{ 0x6780000000000001, 0x6470000000000000, 0x6780000000000009, 0x677ffffffffffff2 },
+	{ 0x6780000000000001, 0x6470000000000000, 0x6780000000000009, 0x677ffffffffffff2, FPS_RN_NEAR },
-	{ 0x6780000000000001, 0x6460000000000000, 0x6780000000000005, 0x677ffffffffffffa },
+	{ 0x6780000000000001, 0x6460000000000000, 0x6780000000000005, 0x677ffffffffffffa, FPS_RN_NEAR },
-	{ 0x6780000000000001, 0x6450000000000000, 0x6780000000000003, 0x677ffffffffffffe },
+	{ 0x6780000000000001, 0x6450000000000000, 0x6780000000000003, 0x677ffffffffffffe, FPS_RN_NEAR },
-	{ 0x6780000000000001, 0x6440000000000000, 0x6780000000000002, 0x6780000000000000 },
+	{ 0x6780000000000001, 0x6440000000000000, 0x6780000000000002, 0x6780000000000000, FPS_RN_NEAR },
-	{ 0x7ff8888888888888, 0x7ff9999999999999, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x7ff9999999999999, 0x7ff8888888888888, 0x7ff8888888888888, FPS_RN_NEAR },
-	{ 0xfff8888888888888, 0x7ff9999999999999, 0xfff8888888888888, 0xfff8888888888888 },
+	{ 0xfff8888888888888, 0x7ff9999999999999, 0xfff8888888888888, 0xfff8888888888888, FPS_RN_NEAR },
-	{ 0x7ff8888888888888, 0x7ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x7ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888, FPS_RN_NEAR },
-	{ 0x7ff8888888888888, 0x0000000000000000, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x0000000000000000, 0x7ff8888888888888, 0x7ff8888888888888, FPS_RN_NEAR },
-	{ 0x7ff8888888888888, 0x0001111111111111, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x0001111111111111, 0x7ff8888888888888, 0x7ff8888888888888, FPS_RN_NEAR },
-	{ 0x7ff8888888888888, 0x3ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x3ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888, FPS_RN_NEAR },
-	{ 0x7ff0000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999 },
+	{ 0x7ff0000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999, FPS_RN_NEAR },
-	{ 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000, FPS_RN_NEAR },
-	{ 0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000, 0x7ff0000000000000, FPS_RN_NEAR },
-	{ 0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, FPS_RN_NEAR },
-	{ 0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, FPS_RN_NEAR },
-	{ 0x7ff0000000000000, 0x8002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x8002222222222222, 0x7ff0000000000000, 0x7ff0000000000000, FPS_RN_NEAR },
-	{ 0x7ff0000000000000, 0xc002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0xc002222222222222, 0x7ff0000000000000, 0x7ff0000000000000, FPS_RN_NEAR },
-	{ 0x0000000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999 },
+	{ 0x0000000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999, FPS_RN_NEAR },
-	{ 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 },
+	{ 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000, FPS_RN_NEAR },
-	{ 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 },
+	{ 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000, FPS_RN_NEAR },
-	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, FPS_RN_NEAR },
-	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, FPS_RN_NEAR },
-	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000, FPS_RN_NEAR },
-	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000, FPS_RN_NEAR },
-	{ 0x8002222222222222, 0x0001111111111111, 0x8001111111111111, 0x8003333333333333 },
+	{ 0x8002222222222222, 0x0001111111111111, 0x8001111111111111, 0x8003333333333333, FPS_RN_NEAR },
-	{ 0x0000022222222222, 0x0000111111111111, 0x0000133333333333, 0x80000eeeeeeeeeef },
+	{ 0x0000022222222222, 0x0000111111111111, 0x0000133333333333, 0x80000eeeeeeeeeef, FPS_RN_NEAR },
-	{ 0x401ffffffbfffefe, 0x406b8265196bd89e, 0x406c8265194bd896, 0xc06a8265198bd8a6 },
+	{ 0x401ffffffbfffefe, 0x406b8265196bd89e, 0x406c8265194bd896, 0xc06a8265198bd8a6, FPS_RN_NEAR },
-	{ 0x4030020000000004, 0xbf110001ffffffff, 0x403001fbbfff8004, 0x4030020440008004 },
+	{ 0x4030020000000004, 0xbf110001ffffffff, 0x403001fbbfff8004, 0x4030020440008004, FPS_RN_NEAR },
-	{ 0x3fdfffffffffffff, 0x3fe0000000000000, 0x3ff0000000000000, 0xbc90000000000000 },
+	{ 0x3fdfffffffffffff, 0x3fe0000000000000, 0x3ff0000000000000, 0xbc90000000000000, FPS_RN_NEAR },
 	{ 0x001000100010000f, 0x00000000000000ff, 0x001000100010010e, 0x00100010000fff10, FPS_RN_CEIL },
 };
 int test13(long arg)
@ -1013,8 +1034,8 @@ int test13(long arg)
 	struct addvals *vp = addvals;
 	unsigned long fpscr;
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(addvals) / sizeof(addvals[0]); ++i, ++vp) {
 		set_fpscr(vp->fpscr);
 		asm("lfd 5,0(%0); lfd 6,8(%0); fadd 7,5,6; fsub 8,5,6; stfd 7,0(%1); stfd 8,8(%1)"
 		    : : "b" (&vp->val_a), "b" (results) : "memory");
 		fpscr = get_fpscr();
@ -1491,110 +1512,123 @@ struct fmavals {
 	unsigned long ra;
 	unsigned long rc;
 	unsigned long rb;
 	unsigned long fpscr;
 	unsigned long fma;
 	unsigned long fms;
 	unsigned long nfma;
 	unsigned long nfms;
 } fmavals[] = {
 	/* +0 * +0 +- +0 -> +0, +0, -0, -0 */
-	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, FPS_RN_NEAR,
 	  0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 },
 	/* +0 * NaNC +- +0 -> NaNC, NaNC, NaNC, NaNC */
-	{ 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000,
+	{ 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000, FPS_RN_NEAR,
 	  0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000 },
 	/* +0 * NaNC +- NaNB -> NaNB, NaNB, NaNB, NaNB */
-	{ 0x0000000000000000, 0x7ffc000000000000, 0x7ffb000000000000,
+	{ 0x0000000000000000, 0x7ffc000000000000, 0x7ffb000000000000, FPS_RN_NEAR,
 	  0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000 },
 	/* NaNA * NaNC +- NaNB -> NaNA, NaNA, NaNA, NaNA */
-	{ 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000,
+	{ 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000, FPS_RN_NEAR,
 	  0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000 },
 	/* +1.0 * -0 +- +finite B -> +B, -B, -B, +B */
-	{ 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, 
+	{ 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, FPS_RN_NEAR,
 	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
 	/* +1.0 * -1.0 +- (B = +3.818e+190) -> +B, -B, -B, +B */
-	{ 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, 
+	{ 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, FPS_RN_NEAR,
 	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
 	/* +inf * -1.0 +- +finite B -> -inf, -inf, +inf, +inf */
-	{ 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, 
+	{ 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, FPS_RN_NEAR,
 	  0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
 	/* +inf * +0 +- +finite B -> NaNQ, NaNQ, NaNQ, NaNQ */
-	{ 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, 
+	{ 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, FPS_RN_NEAR,
 	  0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000 },
 	/* +1.0 * +1.0 +- 1.00000012 -> +2.00000012, -1.2e-7, -2.00000012, +1.2e-7 */
-	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, 
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, FPS_RN_NEAR,
 	  0x4000000010000000, 0xbe80000000000000, 0xc000000010000000, 0x3e80000000000000 },
 	/* +(1 + 2^-52) * +(1 + 2^-52) +- +1.0 -> +(2 + 2^-51), +2^-51, -(2 + 2^-51), -2^-51 */
-	{ 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000,
+	{ 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000, FPS_RN_NEAR,
 	  0x4000000000000001, 0x3cc0000000000000, 0xc000000000000001, 0xbcc0000000000000 },
-	/* +(1 + 3*2^-52) * +(1 + 2^-51) +- +1.0 -> +(2 + 2^-50), +5 * 2^-52 + 2^-101, -, - */
+	/* +(1 + 3*2^-52) * +(1 + 2^-51) +- +1.0 -> +(2 + 3*2^-51), +5 * 2^-52 + 2^-101, -, - */
-	{ 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000,
+	{ 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000, FPS_RN_NEAR,
-	  0x4000000000000002, 0x3cd4000000000002, 0xc000000000000002, 0xbcd4000000000002 },
+	  0x4000000000000003, 0x3cd4000000000002, 0xc000000000000003, 0xbcd4000000000002 },
 	/* +2.443e-77 * 2.828 +- -6.909e-77 -> -1.402e-93, +1.382e-76, +1.402e-93, -1.382e-76 */
-	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000,
+	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000, FPS_RN_NEAR,
 	  0xaca765753908cd20, 0x3030000000000000, 0x2ca765753908cd20, 0xb030000000000000 },
 	/* +2.443e-77 * 2.828 +- -6.909e-77 -> +9.446e-93, +1.382e-76, -9.446e-93, -1.382e-76 */
-	{ 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000,
+	{ 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000, FPS_RN_NEAR,
 	  0x2cd3b3efbf5e2229, 0x3030000000000000, 0xacd3b3efbf5e2229, 0xb030000000000000 },
 	/* +2.443e-77 * 2.828 +- -1.1055e-75 -> -1.0364e-75, +1.1746e-75, +1.0364e-75, -1.1746e-75 */
-	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000,
+	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000, FPS_RN_NEAR,
 	  0xb05e0068a0000000, 0x3061003450000000, 0x305e0068a0000000, 0xb061003450000000 },
 	/* +2 * +3 +- 3 -> +9, +3, -9, -3 */
-	{ 0x4000000000000000, 0x4008000000000000, 0x4008000000000000,
+	{ 0x4000000000000000, 0x4008000000000000, 0x4008000000000000, FPS_RN_NEAR,
 	  0x4022000000000000, 0x4008000000000000, 0xc022000000000000, 0xc008000000000000 },
 	/* +2 * +3 +- 5 -> +11, +1, -11, -1 */
-	{ 0x4000000000000000, 0x4008000000000000, 0x4014000000000000,
+	{ 0x4000000000000000, 0x4008000000000000, 0x4014000000000000, FPS_RN_NEAR,
 	  0x4026000000000000, 0x3ff0000000000000, 0xc026000000000000, 0xbff0000000000000 },
 	/* +2 * +3 +- 7 -> +13, -1, -13, +1 */
-	{ 0x4000000000000000, 0x4008000000000000, 0x401c000000000000,
+	{ 0x4000000000000000, 0x4008000000000000, 0x401c000000000000, FPS_RN_NEAR,
 	  0x402a000000000000, 0xbff0000000000000, 0xc02a000000000000, 0x3ff0000000000000 },
 	/* +2 * +3 +- 9 -> +15, -3, -15, +3 */
-	{ 0x4000000000000000, 0x4008000000000000, 0x4022000000000000,
+	{ 0x4000000000000000, 0x4008000000000000, 0x4022000000000000, FPS_RN_NEAR,
 	  0x402e000000000000, 0xc008000000000000, 0xc02e000000000000, 0x4008000000000000 },
 	/* +2 * +3 +- -3 -> +3, +9, -3, -9 */
-	{ 0x4000000000000000, 0x4008000000000000, 0xc008000000000000,
+	{ 0x4000000000000000, 0x4008000000000000, 0xc008000000000000, FPS_RN_NEAR,
 	  0x4008000000000000, 0x4022000000000000, 0xc008000000000000, 0xc022000000000000 },
 	/* +2 * +3 +- -5 -> +1, +11, -1, -11 */
-	{ 0x4000000000000000, 0x4008000000000000, 0xc014000000000000,
+	{ 0x4000000000000000, 0x4008000000000000, 0xc014000000000000, FPS_RN_NEAR,
 	  0x3ff0000000000000, 0x4026000000000000, 0xbff0000000000000, 0xc026000000000000 },
 	/* +2 * +3 +- -7 -> -1, +13, +1, -13 */
-	{ 0x4000000000000000, 0x4008000000000000, 0xc01c000000000000,
+	{ 0x4000000000000000, 0x4008000000000000, 0xc01c000000000000, FPS_RN_NEAR,
 	  0xbff0000000000000, 0x402a000000000000, 0x3ff0000000000000, 0xc02a000000000000 },
 	/* +2 * +3 +- -9 -> -3, +15, +3, -15 */
-	{ 0x4000000000000000, 0x4008000000000000, 0xc022000000000000,
+	{ 0x4000000000000000, 0x4008000000000000, 0xc022000000000000, FPS_RN_NEAR,
 	  0xc008000000000000, 0x402e000000000000, 0x4008000000000000, 0xc02e000000000000 },
 	/* +2 * -3 +- 3 -> -3, -9, +3, +9 */
-	{ 0x4000000000000000, 0xc008000000000000, 0x4008000000000000,
+	{ 0x4000000000000000, 0xc008000000000000, 0x4008000000000000, FPS_RN_NEAR,
 	  0xc008000000000000, 0xc022000000000000, 0x4008000000000000, 0x4022000000000000 },
 	/* +2 * -3 +- 5 -> -1, -11, +1, +11 */
-	{ 0x4000000000000000, 0xc008000000000000, 0x4014000000000000,
+	{ 0x4000000000000000, 0xc008000000000000, 0x4014000000000000, FPS_RN_NEAR,
 	  0xbff0000000000000, 0xc026000000000000, 0x3ff0000000000000, 0x4026000000000000 },
 	/* +2 * -3 +- 7 -> +1, -13, -1, +13 */
-	{ 0x4000000000000000, 0xc008000000000000, 0x401c000000000000,
+	{ 0x4000000000000000, 0xc008000000000000, 0x401c000000000000, FPS_RN_NEAR,
 	  0x3ff0000000000000, 0xc02a000000000000, 0xbff0000000000000, 0x402a000000000000 },
 	/* +2 * -3 +- 9 -> +3, -15, -3, +15 */
-	{ 0x4000000000000000, 0xc008000000000000, 0x4022000000000000,
+	{ 0x4000000000000000, 0xc008000000000000, 0x4022000000000000, FPS_RN_NEAR,
 	  0x4008000000000000, 0xc02e000000000000, 0xc008000000000000, 0x402e000000000000 },
 	/* -2 * +3 +- -3 -> -9, -3, +9, +3 */
-	{ 0xc000000000000000, 0x4008000000000000, 0xc008000000000000,
+	{ 0xc000000000000000, 0x4008000000000000, 0xc008000000000000, FPS_RN_NEAR,
 	  0xc022000000000000, 0xc008000000000000, 0x4022000000000000, 0x4008000000000000 },
 	/* -2 * +3 +- -5 -> -11, -1, +11, +1 */
-	{ 0xc000000000000000, 0x4008000000000000, 0xc014000000000000,
+	{ 0xc000000000000000, 0x4008000000000000, 0xc014000000000000, FPS_RN_NEAR,
 	  0xc026000000000000, 0xbff0000000000000, 0x4026000000000000, 0x3ff0000000000000 },
 	/* -2 * +3 +- -7 -> -13, +1, +13, -1 */
-	{ 0xc000000000000000, 0x4008000000000000, 0xc01c000000000000,
+	{ 0xc000000000000000, 0x4008000000000000, 0xc01c000000000000, FPS_RN_NEAR,
 	  0xc02a000000000000, 0x3ff0000000000000, 0x402a000000000000, 0xbff0000000000000 },
 	/* -2 * +3 +- -9 -> -15, +3, +15, -3 */
-	{ 0xc000000000000000, 0x4008000000000000, 0xc022000000000000,
+	{ 0xc000000000000000, 0x4008000000000000, 0xc022000000000000, FPS_RN_NEAR,
 	  0xc02e000000000000, 0x4008000000000000, 0x402e000000000000, 0xc008000000000000 },
 	/* -2 * +3 +- +0 -> -6, -6, +6, +6 */
-	{ 0xc000000000000000, 0x4008000000000000, 0x0000000000000000,
+	{ 0xc000000000000000, 0x4008000000000000, 0x0000000000000000, FPS_RN_NEAR,
 	  0xc018000000000000, 0xc018000000000000, 0x4018000000000000, 0x4018000000000000 },
 	/* +2 * -3 +- -0 -> -6, -6, +6, +6 */
-	{ 0x4000000000000000, 0xc008000000000000, 0x8000000000000000,
+	{ 0x4000000000000000, 0xc008000000000000, 0x8000000000000000, FPS_RN_NEAR,
 	  0xc018000000000000, 0xc018000000000000, 0x4018000000000000, 0x4018000000000000 },
 	/* 2^-1026 * (1.5 * 2^1023) +- -0 -> (1.5 * 2^-3), ditto, -ditto, -ditto */
-	{ 0x0001000000000000, 0x7fe8000000000000, 0x8000000000000000,
+	{ 0x0001000000000000, 0x7fe8000000000000, 0x8000000000000000, FPS_RN_NEAR,
 	  0x3fc8000000000000, 0x3fc8000000000000, 0xbfc8000000000000, 0xbfc8000000000000 },
 	/* 1 * -1 + tiny -> -1 + delta, -1, 1 - delta, 1 */
 	{ 0x3ff0000000000000, 0xbff0000000000000, 0x00000000b2200102, FPS_RN_CEIL,
 	  0xbfefffffffffffff, 0xbff0000000000000, 0x3fefffffffffffff, 0x3ff0000000000000 },
 	/* from random exec tests */
 	{ 0x43eff79000000000, 0x00000000000000ff, 0x0000000000000081, FPS_RN_CEIL,
 	  0x014fd79870000001, 0x014fd79870000000, 0x814fd79870000001, 0x814fd79870000000 },
 	{ 0x00000000ffffffff, 0x1fc771af627f62ab, 0x8000000000000000, FPS_RN_ZERO,
 	  0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 },
 	{ 0x41efffffffe00000, 0xc1efffffffe00000, 0x43f0000000000000, FPS_RN_CEIL,
 	  0x41fffffffff00000, 0xc3ffffffffe00000, 0xc1fffffffff00000, 0x43ffffffffe00000 },
 	{ 0x3ff0000000000000, 0x000060fbffffefc1, 0x000060fbffffefc1, FPS_RN_NEAR,
 	  0x0000c1f7ffffdf82, 0x0000000000000000, 0x8000c1f7ffffdf82, 0x8000000000000000 },
 };
 int test23(long arg)
@ -1604,8 +1638,8 @@ int test23(long arg)
 	struct fmavals *vp = fmavals;
 	unsigned long fpscr;
 	set_fpscr(FPS_RN_NEAR);
 	for (i = 0; i < sizeof(fmavals) / sizeof(fmavals[0]); ++i, ++vp) {
 		set_fpscr(vp->fpscr);
 		asm("lfd 6,0(%0); lfd 7,8(%0); lfd 8,16(%0); fmadd 0,6,7,8; stfd 0,0(%1)"
 		    : : "b" (&vp->ra), "b" (results) : "memory");
 		asm("fmsub 1,6,7,8; fnmadd 2,6,7,8; fnmsub 3,6,7,8; stfd 1,8(%0); stfd 2,16(%0); stfd 3,24(%0)"
--- a/tests/test_fpu.bin
+++ b/tests/test_fpu.bin