From 9e8fb293edd59f355cc1fd020f96dafee0af867c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 16 Jul 2020 15:51:57 +1000
Subject: [PATCH] FPU: Implement floating convert from integer instructions

This implements fcfid, fcfidu, fcfids and fcfidus, which convert
a 64-bit integer value in an FPR into a floating-point value.
This brings in a lot of the datapath that will be needed in
future, including the shifter, adder, mask generator and
count-leading-zeroes logic, along with the machinery for rounding
to single-precision or double-precision, detecting inexact results,
signalling inexact-result exceptions, and updating result flags
in the FPSCR.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |  19 ++
 fpu.vhdl                   | 506 ++++++++++++++++++++++++++++++++++++-
 tests/fpu/fpu.c            |  87 ++++++-
 tests/test_fpu.bin         | Bin 12504 -> 13504 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 587 insertions(+), 26 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 5f5fb80..83444cf 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -55,6 +55,7 @@ architecture behaviour of decode1 is
     type op_19_subop_array_t is array(0 to 7) of decode_rom_t;
     type op_30_subop_array_t is array(0 to 15) of decode_rom_t;
     type op_31_subop_array_t is array(0 to 1023) of decode_rom_t;
+    type op_59_subop_array_t is array(0 to 31) of decode_rom_t;
     type minor_rom_array_2_t is array(0 to 3) of decode_rom_t;
     type op_63_subop_array_0_t is array(0 to 511) of decode_rom_t;
 
@@ -410,6 +411,13 @@ architecture behaviour of decode1 is
         others   => decode_rom_init
         );
 
+    constant decode_op_59_array : op_59_subop_array_t := (
+        --             unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                          op                               in   out   A   out  in    out  len        ext                                pipe
+        2#01110#  =>  (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fcfid[u]s
+        others => illegal_inst
+        );
+
     constant decode_op_62_array : minor_rom_array_2_t := (
         --              unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
         --                            op                                           in   out   A   out  in    out  len        ext                                 pipe
@@ -433,6 +441,8 @@ architecture behaviour of decode1 is
         2#100000010#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/8=fmr
         2#100000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/8=fnabs
         2#100001000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  8/8=fabs
+        2#111011010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 26/14=fcfid
+        2#111011110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 30/14=fcfidu
         others => illegal_inst
         );
 
@@ -586,6 +596,15 @@ begin
         when 58 =>
             v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0))));
 
+        when 59 =>
+            if HAS_FPU then
+                -- floating point operations, mostly single-precision
+                v.decode := decode_op_59_array(to_integer(unsigned(f_in.insn(5 downto 1))));
+                if f_in.insn(5) = '0' and not std_match(f_in.insn(10 downto 1), "11-1001110") then
+                    vi.override := '1';
+                end if;
+            end if;
+
         when 62 =>
             v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0))));
 
diff --git a/fpu.vhdl b/fpu.vhdl
index 3711b35..fecb7bb 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -37,7 +37,12 @@ architecture behaviour of fpu is
 
     type state_t is (IDLE,
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
-                     DO_FMR);
+                     DO_FMR,
+                     DO_FCFID,
+                     FINISH, NORMALIZE,
+                     ROUND_UFLOW, ROUND_OFLOW,
+                     ROUNDING, ROUNDING_2, ROUNDING_3,
+                     DENORM);
 
     type reg_type is record
         state        : state_t;
@@ -54,21 +59,121 @@ architecture behaviour of fpu is
         fpscr        : std_ulogic_vector(31 downto 0);
         a            : fpu_reg_type;
         b            : fpu_reg_type;
-        r            : std_ulogic_vector(63 downto 0);
+        r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
+        x            : std_ulogic;
         result_sign  : std_ulogic;
         result_class : fp_number_class;
         result_exp   : signed(EXP_BITS-1 downto 0);
+        shift        : signed(EXP_BITS-1 downto 0);
         writing_back : std_ulogic;
         int_result   : std_ulogic;
         cr_result    : std_ulogic_vector(3 downto 0);
         cr_mask      : std_ulogic_vector(7 downto 0);
+        old_exc      : std_ulogic_vector(4 downto 0);
+        update_fprf  : std_ulogic;
+        tiny         : std_ulogic;
+        denorm       : std_ulogic;
+        round_mode   : std_ulogic_vector(2 downto 0);
     end record;
 
     signal r, rin : reg_type;
 
     signal fp_result     : std_ulogic_vector(63 downto 0);
+    signal opsel_a       : std_ulogic_vector(1 downto 0);
+    signal opsel_b       : std_ulogic_vector(1 downto 0);
     signal opsel_r       : std_ulogic_vector(1 downto 0);
+    signal opsel_ainv    : std_ulogic;
+    signal opsel_amask   : std_ulogic;
+    signal in_a          : std_ulogic_vector(63 downto 0);
+    signal in_b          : std_ulogic_vector(63 downto 0);
     signal result        : std_ulogic_vector(63 downto 0);
+    signal carry_in      : std_ulogic;
+    signal lost_bits     : std_ulogic;
+    signal r_hi_nz       : std_ulogic;
+    signal r_lo_nz       : std_ulogic;
+    signal misc_sel      : std_ulogic_vector(3 downto 0);
+
+    -- opsel values
+    constant AIN_R    : std_ulogic_vector(1 downto 0) := "00";
+    constant AIN_A    : std_ulogic_vector(1 downto 0) := "01";
+    constant AIN_B    : std_ulogic_vector(1 downto 0) := "10";
+
+    constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
+    constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
+    constant BIN_MASK : std_ulogic_vector(1 downto 0) := "10";
+
+    constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
+    constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
+    constant RES_MISC  : std_ulogic_vector(1 downto 0) := "11";
+
+    -- Left and right shifter with 120 bit input and 64 bit output.
+    -- Shifts inp left by shift bits and returns the upper 64 bits of
+    -- the result.  The shift parameter is interpreted as a signed
+    -- number in the range -64..63, with negative values indicating
+    -- right shifts.  Positions that fall outside inp read as zero.
+    function shifter_64(inp: std_ulogic_vector(119 downto 0);
+                        shift: std_ulogic_vector(6 downto 0))
+        return std_ulogic_vector is
+        variable s1 : std_ulogic_vector(94 downto 0);  -- output of the coarse stage
+        variable s2 : std_ulogic_vector(70 downto 0);  -- output of the middle stage
+        variable result : std_ulogic_vector(63 downto 0);
+    begin
+        case shift(6 downto 5) is       -- coarse stage: 0, +32, -64 or -32
+            when "00" =>                -- no coarse shift
+                s1 := inp(119 downto 25);
+            when "01" =>                -- left by 32, zero-filled below inp(0)
+                s1 := inp(87 downto 0) & "0000000";
+            when "10" =>                -- right by 64, zero-filled on the left
+                s1 := x"0000000000000000" & inp(119 downto 89);
+            when others =>              -- right by 32, zero-filled on the left
+                s1 := x"00000000" & inp(119 downto 57);
+        end case;
+        case shift(4 downto 3) is       -- middle stage: further left by 0, 8, 16 or 24
+            when "00" =>
+                s2 := s1(94 downto 24);
+            when "01" =>
+                s2 := s1(86 downto 16);
+            when "10" =>
+                s2 := s1(78 downto 8);
+            when others =>
+                s2 := s1(70 downto 0);
+        end case;
+        case shift(2 downto 0) is       -- fine stage: further left by 0..7
+            when "000" =>
+                result := s2(70 downto 7);
+            when "001" =>
+                result := s2(69 downto 6);
+            when "010" =>
+                result := s2(68 downto 5);
+            when "011" =>
+                result := s2(67 downto 4);
+            when "100" =>
+                result := s2(66 downto 3);
+            when "101" =>
+                result := s2(65 downto 2);
+            when "110" =>
+                result := s2(64 downto 1);
+            when others =>
+                result := s2(63 downto 0);
+        end case;
+        return result;
+    end;
+
+    -- Generate a mask with 0-bits on the left and 1-bits on the right which
+    -- selects the bits that will be lost in doing a right shift.  The shift
+    -- parameter is the bottom 6 bits of a negative shift count,
+    -- indicating a right shift.
+    function right_mask(shift: unsigned(5 downto 0)) return std_ulogic_vector is
+        variable result: std_ulogic_vector(63 downto 0);
+    begin
+        result := (others => '0');
+        for i in 0 to 63 loop
+            if i >= shift then          -- sets bits (63 - shift) downto 0
+                result(63 - i) := '1';
+            end if;
+        end loop;
+        return result;                  -- shift = 0 gives all ones
+    end;
 
     -- Split a DP floating-point number into components and work out its class.
     -- If is_int = 1, the input is considered an integer
@@ -112,7 +217,8 @@ architecture behaviour of fpu is
 
     -- Construct a DP floating-point result from components
     function pack_dp(sign: std_ulogic; class: fp_number_class; exp: signed(EXP_BITS-1 downto 0);
-                     mantissa: std_ulogic_vector) return std_ulogic_vector is
+                     mantissa: std_ulogic_vector; single_prec: std_ulogic)
+        return std_ulogic_vector is
         variable result : std_ulogic_vector(63 downto 0);
     begin
         result := (others => '0');
@@ -124,16 +230,76 @@ architecture behaviour of fpu is
                     -- normalized number
                     result(62 downto 52) := std_ulogic_vector(resize(exp, 11) + 1023);
                 end if;
-                result(51 downto 0) := mantissa(53 downto 2);
+                result(51 downto 29) := mantissa(53 downto 31);
+                if single_prec = '0' then
+                    result(28 downto 0) := mantissa(30 downto 2);
+                end if;
             when INFINITY =>
                 result(62 downto 52) := "11111111111";
             when NAN =>
                 result(62 downto 52) := "11111111111";
-                result(51 downto 0) := mantissa(53 downto 2);
+                result(51 downto 29) := mantissa(53 downto 31);
+                if single_prec = '0' then
+                    result(28 downto 0) := mantissa(30 downto 2);
+                end if;
         end case;
         return result;
     end;
 
+    -- Determine whether to increment when rounding
+    -- Returns rounding_inc & inexact (bit 1 = increment, bit 0 = inexact)
+    -- Assumes x includes the bottom 29 bits of the mantissa already
+    -- if single_prec = 1 (usually arranged by setting set_x = 1 earlier).
+    function fp_rounding(mantissa: std_ulogic_vector(63 downto 0); x: std_ulogic;
+                         single_prec: std_ulogic; rn: std_ulogic_vector(2 downto 0);
+                         sign: std_ulogic)
+        return std_ulogic_vector is
+        variable grx : std_ulogic_vector(2 downto 0);  -- two bits below the LSB, then x
+        variable ret : std_ulogic_vector(1 downto 0);
+        variable lsb : std_ulogic;                     -- lowest bit kept at this precision
+    begin
+        if single_prec = '0' then
+            grx := mantissa(1 downto 0) & x;
+            lsb := mantissa(2);
+        else
+            grx := mantissa(30 downto 29) & x;
+            lsb := mantissa(31);
+        end if;
+        ret(1) := '0';          -- default: do not increment
+        ret(0) := or (grx);     -- inexact if anything below the LSB is set
+        case rn(1 downto 0) is
+            when "00" =>        -- round to nearest
+                if grx = "100" and rn(2) = '0' then  -- rn(2) = 1 bypasses ties-to-even
+                    ret(1) := lsb; -- tie, round to even
+                else
+                    ret(1) := grx(2);  -- increment when the first discarded bit is set
+                end if;
+            when "01" =>        -- round towards zero: never increments
+            when others =>      -- round towards +/- inf
+                if rn(0) = sign then
+                    -- round towards greater magnitude
+                    ret(1) := ret(0);  -- increment only if the result is inexact
+                end if;
+        end case;
+        return ret;
+    end;
+
+    -- Determine the 5 result flag bits to write into the FPSCR (assigned to FPSCR_C downto FPSCR_FU)
+    function result_flags(sign: std_ulogic; class: fp_number_class; unitbit: std_ulogic)
+        return std_ulogic_vector is
+    begin
+        case class is
+            when ZERO =>
+                return sign & "0010";                             -- 00010 for +0, 10010 for -0
+            when FINITE =>
+                return (not unitbit) & sign & (not sign) & "00";  -- x0100 for +, x1000 for -; x = 1 when unitbit is 0
+            when INFINITY =>
+                return '0' & sign & (not sign) & "01";            -- 00101 for +inf, 01001 for -inf
+            when NAN =>
+                return "10001";                                   -- same value regardless of sign
+        end case;
+    end;
+
 begin
     fpu_0: process(clk)
     begin
@@ -174,6 +340,25 @@ begin
         variable j, k        : integer;
         variable flm         : std_ulogic_vector(7 downto 0);
         variable int_input   : std_ulogic;
+        variable mask        : std_ulogic_vector(63 downto 0);
+        variable in_a0       : std_ulogic_vector(63 downto 0);
+        variable in_b0       : std_ulogic_vector(63 downto 0);
+        variable misc        : std_ulogic_vector(63 downto 0);
+        variable shift_res   : std_ulogic_vector(63 downto 0);
+        variable round       : std_ulogic_vector(1 downto 0);
+        variable update_fx   : std_ulogic;
+        variable arith_done  : std_ulogic;
+        variable mant_nz     : std_ulogic;
+        variable min_exp     : signed(EXP_BITS-1 downto 0);
+        variable max_exp     : signed(EXP_BITS-1 downto 0);
+        variable bias_exp    : signed(EXP_BITS-1 downto 0);
+        variable new_exp     : signed(EXP_BITS-1 downto 0);
+        variable exp_tiny    : std_ulogic;
+        variable exp_huge    : std_ulogic;
+        variable renormalize : std_ulogic;
+        variable clz         : std_ulogic_vector(5 downto 0);
+        variable set_x       : std_ulogic;
+        variable mshift      : signed(EXP_BITS-1 downto 0);
     begin
         v := r;
         illegal := '0';
@@ -199,16 +384,53 @@ begin
             if e_in.op = OP_FPOP_I then
                 int_input := '1';
             end if;
+            v.tiny := '0';
+            v.denorm := '0';
+            v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN);
             adec := decode_dp(e_in.fra, int_input);
             bdec := decode_dp(e_in.frb, int_input);
             v.a := adec;
             v.b := bdec;
         end if;
 
+        r_hi_nz <= or (r.r(55 downto 31));
+        r_lo_nz <= or (r.r(30 downto 2));
+
+        if r.single_prec = '0' then
+            max_exp := to_signed(1023, EXP_BITS);
+            min_exp := to_signed(-1022, EXP_BITS);
+            bias_exp := to_signed(1536, EXP_BITS);
+        else
+            max_exp := to_signed(127, EXP_BITS);
+            min_exp := to_signed(-126, EXP_BITS);
+            bias_exp := to_signed(192, EXP_BITS);
+        end if;
+        new_exp := r.result_exp - r.shift;
+        exp_tiny := '0';
+        exp_huge := '0';
+        if new_exp < min_exp then
+            exp_tiny := '1';
+        end if;
+        if new_exp > max_exp then
+            exp_huge := '1';
+        end if;
+
         v.writing_back := '0';
         v.instr_done := '0';
-        opsel_r <= "00";
+        v.update_fprf := '0';
+        v.shift := to_signed(0, EXP_BITS);
+        opsel_a <= AIN_R;
+        opsel_ainv <= '0';
+        opsel_amask <= '0';
+        opsel_b <= BIN_ZERO;
+        opsel_r <= RES_SUM;
+        carry_in <= '0';
+        misc_sel <= "0000";
         fpscr_mask := (others => '1');
+        update_fx := '0';
+        arith_done := '0';
+        renormalize := '0';
+        set_x := '0';
 
         case r.state is
             when IDLE =>
@@ -230,10 +452,15 @@ begin
                             end if;
                         when "01000" =>
                             v.state := DO_FMR;
+                        when "01110" =>
+                            -- fcfid[u][s]
+                            v.state := DO_FCFID;
                         when others =>
                             illegal := '1';
                     end case;
                 end if;
+                v.x := '0';
+                v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
 
             when DO_MCRFS =>
                 j := to_integer(unsigned(insn_bfa(r.insn)));
@@ -276,7 +503,7 @@ begin
             when DO_MFFS =>
                 v.int_result := '1';
                 v.writing_back := '1';
-                opsel_r <= "10";
+                opsel_r <= RES_MISC;
                 case r.insn(20 downto 16) is
                     when "00000" =>
                         -- mffs
@@ -322,6 +549,7 @@ begin
                 v.state := IDLE;
 
             when DO_FMR =>
+                opsel_a <= AIN_B;
                 v.result_class := r.b.class;
                 v.result_exp := r.b.exponent;
                 if r.insn(9) = '1' then
@@ -339,29 +567,281 @@ begin
                 v.instr_done := '1';
                 v.state := IDLE;
 
+            when DO_FCFID =>
+                v.result_sign := '0';
+                opsel_a <= AIN_B;
+                if r.insn(8) = '0' and r.b.negative = '1' then
+                    -- fcfid[s] with negative operand, set R = -B
+                    opsel_ainv <= '1';
+                    carry_in <= '1';
+                    v.result_sign := '1';
+                end if;
+                v.result_class := r.b.class;
+                v.result_exp := to_signed(54, EXP_BITS);
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = ZERO then
+                    arith_done := '1';
+                else
+                    v.state := FINISH;
+                end if;
+
+            when FINISH =>
+                if r.r(63 downto 54) /= "0000000001" then
+                    renormalize := '1';
+                    v.state := NORMALIZE;
+                else
+                    set_x := '1';
+                    if exp_tiny = '1' then
+                        v.shift := new_exp - min_exp;
+                        v.state := ROUND_UFLOW;
+                    elsif exp_huge = '1' then
+                        v.state := ROUND_OFLOW;
+                    else
+                        v.shift := to_signed(-2, EXP_BITS);
+                        v.state := ROUNDING;
+                    end if;
+                end if;
+
+            when NORMALIZE =>
+                -- Shift so we have 9 leading zeroes (we know R is non-zero)
+                opsel_r <= RES_SHIFT;
+                set_x := '1';
+                if exp_tiny = '1' then
+                    v.shift := new_exp - min_exp;
+                    v.state := ROUND_UFLOW;
+                elsif exp_huge = '1' then
+                    v.state := ROUND_OFLOW;
+                else
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                end if;
+
+            when ROUND_UFLOW =>
+                v.tiny := '1';
+                if r.fpscr(FPSCR_UE) = '0' then
+                    -- disabled underflow exception case
+                    -- have to denormalize before rounding
+                    opsel_r <= RES_SHIFT;
+                    set_x := '1';
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                else
+                    -- enabled underflow exception case
+                    -- if denormalized, have to normalize before rounding
+                    v.fpscr(FPSCR_UX) := '1';
+                    v.result_exp := r.result_exp + bias_exp;
+                    if r.r(54) = '0' then
+                        renormalize := '1';
+                        v.state := NORMALIZE;
+                    else
+                        v.shift := to_signed(-2, EXP_BITS);
+                        v.state := ROUNDING;
+                    end if;
+                end if;
+
+            when ROUND_OFLOW =>
+                v.fpscr(FPSCR_OX) := '1';
+                if r.fpscr(FPSCR_OE) = '0' then
+                    -- disabled overflow exception
+                    -- result depends on rounding mode
+                    v.fpscr(FPSCR_XX) := '1';
+                    v.fpscr(FPSCR_FI) := '1';
+                    if r.round_mode(1 downto 0) = "00" or
+                        (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then
+                        v.result_class := INFINITY;
+                        v.fpscr(FPSCR_FR) := '1';
+                    else
+                        v.fpscr(FPSCR_FR) := '0';
+                    end if;
+                    -- construct largest representable number
+                    v.result_exp := max_exp;
+                    opsel_r <= RES_MISC;
+                    misc_sel <= "001" & r.single_prec;
+                    arith_done := '1';
+                else
+                    -- enabled overflow exception
+                    v.result_exp := r.result_exp - bias_exp;
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                end if;
+
+            when ROUNDING =>
+                opsel_amask <= '1';
+                round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign);
+                v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
+                if round(1) = '1' then
+                    -- set mask to increment the LSB for the precision
+                    opsel_b <= BIN_MASK;
+                    carry_in <= '1';
+                    v.shift := to_signed(-1, EXP_BITS);
+                    v.state := ROUNDING_2;
+                else
+                    if r.r(54) = '0' then
+                        -- result after masking could be zero, or could be a
+                        -- denormalized result that needs to be renormalized
+                        renormalize := '1';
+                        v.state := ROUNDING_3;
+                    else
+                        arith_done := '1';
+                    end if;
+                end if;
+                if round(0) = '1' then
+                    v.fpscr(FPSCR_XX) := '1';
+                    if r.tiny = '1' then
+                        v.fpscr(FPSCR_UX) := '1';
+                    end if;
+                end if;
+
+            when ROUNDING_2 =>
+                -- Check for overflow during rounding
+                v.x := '0';
+                if r.r(55) = '1' then
+                    opsel_r <= RES_SHIFT;
+                    if exp_huge = '1' then
+                        v.state := ROUND_OFLOW;
+                    else
+                        arith_done := '1';
+                    end if;
+                elsif r.r(54) = '0' then
+                    -- Do CLZ so we can renormalize the result
+                    renormalize := '1';
+                    v.state := ROUNDING_3;
+                else
+                    arith_done := '1';
+                end if;
+
+            when ROUNDING_3 =>
+                mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec);
+                if mant_nz = '0' then
+                    v.result_class := ZERO;
+                    arith_done := '1';
+                else
+                    -- Renormalize result after rounding
+                    opsel_r <= RES_SHIFT;
+                    v.denorm := exp_tiny;
+                    v.shift := new_exp - to_signed(-1022, EXP_BITS);
+                    if new_exp < to_signed(-1022, EXP_BITS) then
+                        v.state := DENORM;
+                    else
+                        arith_done := '1';
+                    end if;
+                end if;
+
+            when DENORM =>
+                opsel_r <= RES_SHIFT;
+                arith_done := '1';
+
         end case;
 
+        if arith_done = '1' then
+            v.writing_back := '1';
+            v.update_fprf := '1';
+            v.instr_done := '1';
+            v.state := IDLE;
+            update_fx := '1';
+        end if;
+
         -- Data path.
+        -- This has A and B input multiplexers, an adder, a shifter,
+        -- count-leading-zeroes logic, and a result mux.
+        if r.single_prec = '1' then
+            mshift := r.shift + to_signed(-29, EXP_BITS);
+        else
+            mshift := r.shift;
+        end if;
+        if mshift < to_signed(-64, EXP_BITS) then
+            mask := (others => '1');
+        elsif mshift >= to_signed(0, EXP_BITS) then
+            mask := (others => '0');
+        else
+            mask := right_mask(unsigned(mshift(5 downto 0)));
+        end if;
+        case opsel_a is
+            when AIN_R =>
+                in_a0 := r.r;
+            when AIN_A =>
+                in_a0 := r.a.mantissa;
+            when others =>
+                in_a0 := r.b.mantissa;
+        end case;
+        if (or (mask and in_a0)) = '1' and set_x = '1' then
+            v.x := '1';
+        end if;
+        if opsel_ainv = '1' then
+            in_a0 := not in_a0;
+        end if;
+        if opsel_amask = '1' then
+            in_a0 := in_a0 and not mask;
+        end if;
+        in_a <= in_a0;
+        case opsel_b is
+            when BIN_ZERO =>
+                in_b0 := (others => '0');
+            when BIN_R =>
+                in_b0 := r.r;
+            when BIN_MASK =>
+                in_b0 := mask;
+            when others =>
+                in_b0 := (others => '0');
+        end case;
+        in_b <= in_b0;
+        if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
+            shift_res := shifter_64(r.r & x"00000000000000",
+                                    std_ulogic_vector(r.shift(6 downto 0)));
+        else
+            shift_res := (others => '0');
+        end if;
         case opsel_r is
-            when "00" =>
-                result <= r.b.mantissa;
-            when "10" =>
-                result <= x"00000000" & (r.fpscr and fpscr_mask);
+            when RES_SUM =>
+                result <= std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
+            when RES_SHIFT =>
+                result <= shift_res;
             when others =>
-                result <= (others => '0');
+                case misc_sel is
+                    when "0000" =>
+                        misc := x"00000000" & (r.fpscr and fpscr_mask);
+                    when "0010" =>
+                        -- mantissa of max representable DP number
+                        misc := x"007ffffffffffffc";
+                    when "0011" =>
+                        -- mantissa of max representable SP number
+                        misc := x"007fffff80000000";
+                    when others =>
+                        misc := x"0000000000000000";
+                end case;
+                result <= misc;
         end case;
         v.r := result;
 
+        if opsel_r = RES_SHIFT then
+            v.result_exp := new_exp;
+        end if;
+
+        if renormalize = '1' then
+            clz := count_left_zeroes(r.r);
+            v.shift := resize(signed('0' & clz) - 9, EXP_BITS);
+        end if;
+
         if r.int_result = '1' then
             fp_result <= r.r;
         else
-            fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r);
+            fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r,
+                                 r.single_prec);
+        end if;
+        if r.update_fprf = '1' then
+            v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class,
+                                                             r.r(54) and not r.denorm);
         end if;
 
         v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or
                              (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI)));
         v.fpscr(FPSCR_FEX) := or (v.fpscr(FPSCR_VX downto FPSCR_XX) and
                                   v.fpscr(FPSCR_VE downto FPSCR_XE));
+        if update_fx = '1' and
+            (v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then
+            v.fpscr(FPSCR_FX) := '1';
+        end if;
         if r.rc = '1' then
             v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
         end if;
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 46668f8..80751d1 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -64,7 +64,7 @@ void print_string(const char *str)
 		putchar(*str);
 }
 
-void print_hex(unsigned long val, int ndigits)
+void print_hex(unsigned long val, int ndigits, const char *str)
 {
 	int i, x;
 
@@ -75,6 +75,7 @@ void print_hex(unsigned long val, int ndigits)
 		else
 			putchar(x + '0');
 	}
+	print_string(str);
 }
 
 // i < 100
@@ -201,12 +202,9 @@ int sp_to_dp(long arg)
 	asm("lfs 20,0(%0); stfd 20,0(%1)"
 	    : : "b" (&sp_dp_equiv[arg].sp), "b" (&dp) : "memory");
 	if (dp != sp_dp_equiv[arg].dp) {
-		print_hex(sp_dp_equiv[arg].sp, 8);
-		print_string(" ");
-		print_hex(dp, 16);
-		print_string(" ");
-		print_hex(sp_dp_equiv[arg].dp, 16);
-		print_string(" ");
+		print_hex(sp_dp_equiv[arg].sp, 8, " ");
+		print_hex(dp, 16, " ");
+		print_hex(sp_dp_equiv[arg].dp, 16, " ");
 	}
 	return dp != sp_dp_equiv[arg].dp;
 }
@@ -465,12 +463,77 @@ int test6(long arg)
 	return 0;
 }
 
+struct int_fp_equiv {		// one test vector for test7: an integer and its expected conversions
+	long		ival;	// integer operand, loaded into an FPR with lfd
+	unsigned long	fp;	// expected bit pattern stored by stfd after fcfid
+	unsigned long	fp_u;	// expected bit pattern stored by stfd after fcfidu
+	unsigned long	fp_s;	// expected bit pattern stored by stfd after fcfids
+	unsigned long	fp_us;	// expected bit pattern stored by stfd after fcfidus
+} intvals[] = {
+	{ 0,  0, 0, 0, 0 },
+	{ 1,  0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },
+	{ -1, 0xbff0000000000000, 0x43f0000000000000, 0xbff0000000000000, 0x43f0000000000000 },	// unsigned forms see 2^64 - 1, expected to round to 2^64
+	{ 2,  0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000 },
+	{ -2, 0xc000000000000000, 0x43f0000000000000, 0xc000000000000000, 0x43f0000000000000 },	// unsigned forms see 2^64 - 2, expected to round to 2^64
+	{ 0x12345678, 0x41b2345678000000, 0x41b2345678000000, 0x41b2345680000000, 0x41b2345680000000 },	// 29 significant bits: exact in double, rounded up in single
+	{ 0x0008000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000 },
+	{ 0x0010000000000000, 0x4330000000000000, 0x4330000000000000, 0x4330000000000000, 0x4330000000000000 },
+	{ 0x0020000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000 },	// 2^53; rows below add low-order bits to exercise rounding
+	{ 0x0020000000000001, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000 },	// double: halfway case, expected result is the even neighbour (down)
+	{ 0x0020000000000002, 0x4340000000000001, 0x4340000000000001, 0x4340000000000000, 0x4340000000000000 },	// double: exact
+	{ 0x0020000000000003, 0x4340000000000002, 0x4340000000000002, 0x4340000000000000, 0x4340000000000000 },	// double: halfway case, expected result is the even neighbour (up)
+	{ 0x0020000010000000, 0x4340000008000000, 0x4340000008000000, 0x4340000000000000, 0x4340000000000000 },	// single: below halfway, rounds down
+	{ 0x0020000020000000, 0x4340000010000000, 0x4340000010000000, 0x4340000000000000, 0x4340000000000000 },	// single: halfway case, expected result is the even neighbour (down)
+	{ 0x0020000030000000, 0x4340000018000000, 0x4340000018000000, 0x4340000020000000, 0x4340000020000000 },	// single: above halfway, rounds up
+	{ 0x0020000040000000, 0x4340000020000000, 0x4340000020000000, 0x4340000020000000, 0x4340000020000000 },	// single: exact
+	{ 0x0020000080000000, 0x4340000040000000, 0x4340000040000000, 0x4340000040000000, 0x4340000040000000 },
+	{ 0x0040000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 },	// 2^54; rows below sweep the low three bits
+	{ 0x0040000000000001, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 },	// double: below halfway, rounds down
+	{ 0x0040000000000002, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 },	// double: halfway case, expected result is the even neighbour (down)
+	{ 0x0040000000000003, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 },	// double: above halfway, rounds up
+	{ 0x0040000000000004, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 },	// double: exact
+	{ 0x0040000000000005, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 },	// double: below halfway, rounds down
+	{ 0x0040000000000006, 0x4350000000000002, 0x4350000000000002, 0x4350000000000000, 0x4350000000000000 },	// double: halfway case, expected result is the even neighbour (up)
+	{ 0x0040000000000007, 0x4350000000000002, 0x4350000000000002, 0x4350000000000000, 0x4350000000000000 },	// double: above halfway, rounds up
+};
+
+int test7(long arg)	// checks fcfid/fcfidu/fcfids/fcfidus against intvals[]; arg is unused
+{
+	long i;
+	unsigned long results[4];	// stfd outputs, in order: fcfid, fcfidu, fcfids, fcfidus
+
+	for (i = 0; i < sizeof(intvals) / sizeof(intvals[0]); ++i) {
+		asm("lfd%U0%X0 3,%0; fcfid 6,3; fcfidu 7,3; stfd 6,0(%1); stfd 7,8(%1)"
+		    : : "m" (intvals[i].ival), "b" (results) : "memory");	// f3 = ival; results[0..1] = fcfid, fcfidu
+		asm("fcfids 9,3; stfd 9,16(%0); fcfidus 10,3; stfd 10,24(%0)"
+		    : : "b" (results) : "memory");	// reuses f3 from the asm above (FPRs are not declared as clobbers); results[2..3] = fcfids, fcfidus
+		if (results[0] != intvals[i].fp ||
+		    results[1] != intvals[i].fp_u ||
+		    results[2] != intvals[i].fp_s ||
+		    results[3] != intvals[i].fp_us) {
+			print_string("\r\n");	// on mismatch, print the four actual results on a new line
+			print_hex(results[0], 16, " ");
+			print_hex(results[1], 16, " ");
+			print_hex(results[2], 16, " ");
+			print_hex(results[3], 16, " ");
+			return i + 1;	// nonzero failure code: 1-based index of the failing intvals[] entry
+		}
+	}
+	return 0;	// every entry matched
+}
+
 int fpu_test_6(void)
 {
 	enable_fp();
 	return trapit(0, test6);
 }
 
+int fpu_test_7(void)	// entry point passed to do_test(7, ...) in main
+{
+	enable_fp();	// defined elsewhere in this file; called before each FP test here
+	return trapit(0, test7);	// runs test7 through trapit with arg 0 and returns its result
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -484,12 +547,9 @@ void do_test(int num, int (*test)(void))
 	} else {
 		fail = 1;
 		print_string("FAIL ");
-		print_hex(ret, 5);
-		print_string(" SRR0=");
-		print_hex(mfspr(SRR0), 16);
-		print_string(" SRR1=");
-		print_hex(mfspr(SRR1), 16);
-		print_string("\r\n");
+		print_hex(ret, 5, " SRR0=");
+		print_hex(mfspr(SRR0), 16, " SRR1=");
+		print_hex(mfspr(SRR1), 16, "\r\n");
 	}
 }
 
@@ -503,6 +563,7 @@ int main(void)
 	do_test(4, fpu_test_4);
 	do_test(5, fpu_test_5);
 	do_test(6, fpu_test_6);
+	do_test(7, fpu_test_7);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 4fb260e1d5a3e4f37deea64098b82e55151aceb2..25d50c77a0d990a40320f8baa84752efe3a6d996 100755
GIT binary patch
delta 3104
zcma)7du&tJ9sb?x;5dP}H{{X0h;I^;#MH~&Qjmi~xz2;RrVUw}wi^PGlu~FxT_;h=
zY)-FB?TJnu3Oa1+9|0OfqpYeX7@4#OZPSOQ(rH^eX;T<op&go>F_;6>=8lZB@7#Nw
zB?Ocm>FRgB?|kR?I_KUy(LerFBayL|DBAza{ODfT?X)V|Pt*r)7r0&Ec5P3rI@q{U
zdG6q*d(=Vw<bmJbad)b|m1y$Z-#FhuP>V-PwYO42@MnHS^x?UW=B%*;T6G;1E-0>a
zCrqO6o_j?qG|L<HRixO5+6p!MBKxQI5X`A)@ra^Ema?l$o0Jr*Dp_6ry<&=-<aJrj
z$3v4h?BH68Jz26^O%$^uCBJ}fQ|Ucr4{8*7$I=~J$N9c>MB|~&Y`8R}wrT9s(x=s$
z-7H!bRrQ_h2W8!=cL)1RS+{bCtt#J`H|C}Y(>yvGE$=M<3{D=o&NskG8;T6l)5+lQ
zB<n6~DbPsCevIWJHdX!&<uL2NvqAm+B>T>tujh>~v0PNJ2P*WUBtT^WMMRS>vLa8l
zserMwyBLa955!}4y%k@h@yEvUIU3{fdnsWUR2Ph0d(T9abea!e%J6qCW%*~9MjWpA
z9yD(C^bgJZ@_mmKP=UXOUN9abU+Y+Y!@xA>q3^rwXeXcxfKS_vMgpWH9K_7hBJ;V$
z6d6YLR#uJA$FeVe9xA>1wy9Gcj_~liL@)i(MQbN@3QtmxFWyG`!Yvsy&iS!67bSwR
zJ+B1=G@SKWF^1DR^`|xPl@hyXz`_N<rB^xp_NV($lZ)0(#;Sbg0OwBywEMTk{YXS)
zh9Qz8g^V^j(H<O8U2znAy^@U95y$-lSnt!U`*k4{+t4&1+Aj*wNoj4St;>GYyCXhS
zJ3c++xiC%i!Gw#_6GOH7WsRE84tb2r6%;w^qBBSo1qI9%@tvrM^T*o~d$t9a;Vy{`
zMRlSIQJsP69t01YZkE-pl{g^rmw<zq3ys(({3v7w$MUzuMYX#NDWV~0vlVnje30)m
zDu4riNhCn%Ac7J<3#<?iHRIU)oZ3-z@TcO^u+x>Lb^DQO?%H+zhi~}9^&4gY;R=a+
zM96QlsmhYdEl&IR*I@5g<vf67M%l8eqj*8ytXioIvZ<<->S*#*WA!r*Q(QLDib30d
zdgh{KML9Yy1oesaSFld0`h#1*i&iZ{$*-bSvFm>}4}u#}zhzwz(TY*<0~Yt~xb6QK
zTKyXGPly_3Vtg&j*&UubrIp?7DKW+M5$~Fc+&i}=gW~!K>;O(Ykqmx;1`C`A+`K&*
z6dwVB7Xd#Dd{$y7@S!J@!7~!)1AhYiGhiLnii*_K<2W=WEnX>|65Zg~j<RC6SgVoW
zBv*?4&SMtLpi`Ulhn1q#=Rwk={W?YWLh$ptP*<vc@{K%-zrK+MT$roxy7uPb2xoH+
zT;wG)oMGv-Ihd`f-&kga^7@H<FX0{hvPcFoj?_O|sADchQ6c+xeGBt9jG04!<@}<}
zk_L-ywEGh0J+n9i^HFe_SzIsXNqoKaS=>&{2f=wICu|(gtKdEnTxUU;jw+q*Kymo_
zuNS(h{XE<1EmC_gvuC_5YTp%h1ZLt2`;E6&rN6T)FnJl~US6yC*{{9T&ZI)rn_(@>
ztDMI%J)U9R%d75tAJcziIIj^QFmyZ<;HJTelw{Zrw}G(TBC0biQk#axm|`Q#YfVwL
zS4E?yIDc7e=|HC8!;T~BQ3adAZX<$p*E_@KKmAj9F;O!lz0jYA{`jr*eVg6X1+g<s
zB2)s}d#|Ef2uTsyg+6%R0k`4vZl`c6grgApnXj>-{Ugj0*XB0(BDj~q<th~imIZeR
zTqovYJb-5jzJl*xV+R`7cZy6m1M2_c`~$JR-fF{9C5p&hC@8%6A-ggn;dVc?r=dL|
zv|Vl;Ic{ZIlh+r-FLS>jF+^O^vNymb1%V*~8F*^gmZrOWJzsDx7%_x8f#(Tue$fC7
z!5zXAKXtO{1t)U`maz*;nJHI=0waUJ?yW7CL$1FqZO6?Ry_r1>3Fg@wwm*|g!MJTR
z(jm>@*rWF@l^RTc9L79n6B6evmRwS*OFy@>q8nnE0V%wl)h!tsEp_v)-NA)*c4zSG
zvpgCdmNjqT*kK8DvNsqVIcsM!CjpdU1xDExpR<t8(LrSCptty(CG#d>Kng1)pR?F)
zZOmCi>T>vYHX(J;Tl;Ue_>KK9Q?f^oZ1sFATeqN2c5FeNL)4E1$adUgp9#$moM*XO
iShwi^)H$V2{faf|k!L(VaDGm>fR25vl(2@tPyP$jogY;I

delta 1800
zcmZvce{54#6vxkd+q&0{br09FK?}TYrK~z;2TI=PVC~qjlx15~z@WyJ!Q_W1XiNr{
zy_VEum`GeMA^tJhM6(1!%*du8@ka(B>hSkJhRE0=PP2*3QHc(jT)($1izJ@p_TKY3
z-+S&o=e^g#otMIHBL8ZlQ25jA(0<I@sUj36Y6ja4wi|5s_Gm??dxf;GbIWqsvG~l9
zZ}KZ*#g#-8mw)4Y13@i@8mp<KsNh3i5M8|d?L9GeM72AiP@tG>h#Ew%U4ApIP?OoH
zxhTdis`txkl>Mnbim5*|w^x!cs4Q{cQYpq36s(+QRw;OfJEu7x^iAAJ9j?Y$Yr#r+
zCZ8QI_z=1+*5$S#3kA<5btkGhKU7UL=p**F)hC~|uo3Gn`SdOpvW4VJ&$45-26?!Z
z{bFm7>e<8d8f9-DJ5aQM^~|f!`VJ{JvdY32q@UTz!V-Dmuk7o>Q*t~PidA{RTnb*v
zrl6>PjLo%|7@Uv!+U)I-RVv@Ck~A&$4`g#xySQu)Mg3lK>QgxsTsbpjtjYFlG?Tf^
zK{5Yk@>C9FmmHko+_xv4@pE*?3qGFe5)L6LD&vRzRn%9b_sU9SJ#vY5ByZquH`>(b
zRi8D{ZD{1ggXlU>bod8_swOlFOi+_25~M?c@+l+2`H5<UqWbiUANvA0pzYc_#trmS
zqwu&2zAVkIfGHG7viWH?1ZGWeoDm*xLO}{uPqZ!Y9OOJ_r&nFy61i!nAamG?Sef1G
zq?cOsyRF!Ro!{TSV|dBXbBHe<$Zm-|&BFGA`Rh%oHm;&(eRX;*nst=DZts@%v1N`G
zQiO#aE9Ajw|1rn&8EEy^Dh0XyP$B>LH2?8hZ-x@-y2Uq0q@j9h78(^AiWPUh3Z8|^
z+HU`3oWokgiV)j_b+3FNBC0-_<{J3MB!4BzJJY-r{EgJ(V4oG)@QNsgM^4=wZ&{Uu
z+wa&K*B3$~d>MXiOI*)O`x)?~+v55^=!1x7!Y|z(*RQAjEckoji>`~f3I5U6xc(J<
zJW*ZLB*(VmX-MOuR*j;pu+U}^+>!o>lo;FOG|aPJ|A_N3DKB<aCoL4#D6}7fH6^~f
zSn<T^Op2Upq=O39@_A){Cf>IHvJCQ9&Y5NYq)xrZXP=Zb+LA}P%7{FN@wz=IE`u|Q
zU7Ji|sS!KmDwp$e*n6%IS~dJ*i~$Q%F|doKz?uYr3FgNryUzI(W?_Ooj-gC)u3;7?
z*bWRUm`ebdm?L7IWEYpX<x7(+v7}rc`<)fL%cZASr`vA0B%=5fwqT}p7Vq3Zd$-hX
zp_)-szyVYX-f@F-@kvviMML}%L`?y>BEAjrKE(H?;$nZ3h1Sl-<05b3|Kqr_o!*6@
z2?CK!n1*o{?1=<(FI{9B#<DBHLQ5AJrHEwWbN>kHHms@69H8TJw^WzBc5RM@0;40Q
zKrTX}i(Y7zKRE9b+O^OIx-#F&5F2pGfTigf3)PK|Tp1R36`g8^z5{yEvDw_ImY}2n
z%P%$8bU-r<jV3fO(e*B{aj?4;2-XKS1y(CWuzHLOVA@T#rF31D2G^Q_hw~Tb7qCX>
z{mIVPqX;x_6h(-1Iy4Uawf6{3s{2o}u~L_(1F%Ctm^e?=I|CLM0493n$9Rracpk9z
o+~QmiVL~0n7-0K61zBJIjb8S5dR{j5_jraSX7Vm%xevYnFKc-|0{{R3

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index a49bb9b..340756c 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -4,3 +4,4 @@ test 03:PASS
 test 04:PASS
 test 05:PASS
 test 06:PASS
+test 07:PASS