From 45cd8f4fc375185544309ffd16d73a7dc5ce1dce Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 28 Aug 2020 12:49:48 +1000
Subject: [PATCH 01/30] core: Add support for floating-point loads and stores

This extends the register file so it can hold FPR values, and
implements the FP loads and stores that do not require conversion
between single and double precision.

We now have the FP, FE0 and FE1 bits in MSR.  FP loads and stores
cause an FP unavailable interrupt if MSR[FP] = 0.

The FPU facilities are optional and their presence is controlled by
the HAS_FPU generic passed down from the top-level board file.  It
defaults to true for all except the A7-35 boards.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl               | 34 ++++++++++++++++------
 control.vhdl              |  7 ++---
 core.vhdl                 | 13 ++++++---
 decode1.vhdl              | 19 ++++++++++++
 decode2.vhdl              | 30 ++++++++++++++-----
 decode_types.vhdl         |  5 ++--
 execute1.vhdl             | 20 +++++++++++--
 fpga/top-arty.vhdl        |  2 ++
 fpga/top-generic.vhdl     |  2 ++
 fpga/top-nexys-video.vhdl |  2 ++
 gpr_hazard.vhdl           | 13 +++++----
 insn_helpers.vhdl         | 24 +++++++++++++++
 loadstore1.vhdl           | 18 ++++++++++--
 microwatt.core            | 14 +++++++++
 register_file.vhdl        | 61 ++++++++++++++++++++++++++-------------
 scripts/fmt_log/fmt_log.c | 14 ++++-----
 soc.vhdl                  |  2 ++
 writeback.vhdl            |  2 +-
 18 files changed, 217 insertions(+), 65 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 1ca1178..14bdcf7 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -13,8 +13,11 @@ package common is
     constant MSR_SF  : integer := (63 - 0);     -- Sixty-Four bit mode
     constant MSR_EE  : integer := (63 - 48);    -- External interrupt Enable
     constant MSR_PR  : integer := (63 - 49);    -- PRoblem state
+    constant MSR_FP  : integer := (63 - 50);    -- Floating Point available
+    constant MSR_FE0 : integer := (63 - 52);    -- Floating-point Exception mode 0
     constant MSR_SE  : integer := (63 - 53);    -- Single-step bit of TE field
     constant MSR_BE  : integer := (63 - 54);    -- Branch trace bit of TE field
+    constant MSR_FE1 : integer := (63 - 55);    -- Floating-point Exception mode 1
     constant MSR_IR  : integer := (63 - 58);    -- Instruction Relocation
     constant MSR_DR  : integer := (63 - 59);    -- Data Relocation
     constant MSR_RI  : integer := (63 - 62);    -- Recoverable Interrupt
@@ -53,8 +56,11 @@ package common is
     -- GPR indices in the register file (GPR only)
     subtype gpr_index_t is std_ulogic_vector(4 downto 0);
 
-    -- Extended GPR indice (can hold an SPR)
-    subtype gspr_index_t is std_ulogic_vector(5 downto 0);
+    -- Extended GPR index (can hold an SPR or an FPR)
+    subtype gspr_index_t is std_ulogic_vector(6 downto 0);
+
+    -- FPR indices
+    subtype fpr_index_t is std_ulogic_vector(4 downto 0);
 
     -- Some SPRs are stored in the register file, they use the magic
     -- GPR numbers above 31.
@@ -64,6 +70,9 @@ package common is
     -- indicates if this is indeed a fast SPR. If clear, then
     -- the SPR is not stored in the GPR file.
     --
+    -- FPRs are also stored in the register file, using GSPR
+    -- numbers from 64 to 95.
+    --
     function fast_spr_num(spr: spr_num_t) return gspr_index_t;
 
     -- Indices conversion functions
@@ -71,6 +80,7 @@ package common is
     function gpr_to_gspr(i: gpr_index_t) return gspr_index_t;
     function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t;
     function is_fast_spr(s: gspr_index_t) return std_ulogic;
+    function fpr_to_gspr(f: fpr_index_t) return gspr_index_t;
 
     -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are
     -- in the CR file as a kind of CR extension (with a separate write
@@ -226,7 +236,7 @@ package common is
 	read2_enable : std_ulogic;
 	read2_reg : gspr_index_t;
 	read3_enable : std_ulogic;
-	read3_reg : gpr_index_t;
+	read3_reg : gspr_index_t;
     end record;
 
     type RegisterFileToDecode2Type is record
@@ -264,7 +274,7 @@ package common is
 	addr1 : std_ulogic_vector(63 downto 0);
 	addr2 : std_ulogic_vector(63 downto 0);
 	data : std_ulogic_vector(63 downto 0);		-- data to write, unused for read
-	write_reg : gpr_index_t;
+	write_reg : gspr_index_t;
 	length : std_ulogic_vector(3 downto 0);
         ci : std_ulogic;                                -- cache-inhibited load/store
 	byte_reverse : std_ulogic;
@@ -282,7 +292,8 @@ package common is
                                                                      sign_extend => '0', update => '0', xerc => xerc_init,
                                                                      reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0',
                                                                      nia => (others => '0'), insn => (others => '0'),
-                                                                     addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), length => (others => '0'),
+                                                                     addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
+                                                                     write_reg => (others => '0'), length => (others => '0'),
                                                                      mode_32bit => '0', others => (others => '0'));
 
     type Loadstore1ToExecute1Type is record
@@ -369,7 +380,7 @@ package common is
     type Loadstore1ToWritebackType is record
 	valid : std_ulogic;
 	write_enable: std_ulogic;
-	write_reg : gpr_index_t;
+	write_reg : gspr_index_t;
 	write_data : std_ulogic_vector(63 downto 0);
 	xerc : xer_common_t;
         rc : std_ulogic;
@@ -473,10 +484,10 @@ package body common is
            n := 13;
        when others =>
            n := 0;
-           return "000000";
+           return "0000000";
        end case;
        tmp := std_ulogic_vector(to_unsigned(n, 5));
-       return "1" & tmp;
+       return "01" & tmp;
     end;
 
     function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is
@@ -486,7 +497,7 @@ package body common is
 
     function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is
     begin
-	return "0" & i;
+	return "00" & i;
     end;
 
     function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is
@@ -502,4 +513,9 @@ package body common is
     begin
 	return s(5);
     end;
+
+    function fpr_to_gspr(f: fpr_index_t) return gspr_index_t is
+    begin
+        return "10" & f;
+    end;
 end common;
diff --git a/control.vhdl b/control.vhdl
index d04576a..4f67ad4 100644
--- a/control.vhdl
+++ b/control.vhdl
@@ -34,7 +34,7 @@ entity control is
         gpr_b_read_in       : in gspr_index_t;
 
         gpr_c_read_valid_in : in std_ulogic;
-        gpr_c_read_in       : in gpr_index_t;
+        gpr_c_read_in       : in gspr_index_t;
 
         cr_read_in          : in std_ulogic;
         cr_write_in         : in std_ulogic;
@@ -70,7 +70,6 @@ architecture rtl of control is
     signal gpr_write_valid : std_ulogic := '0';
     signal cr_write_valid  : std_ulogic := '0';
 
-    signal gpr_c_read_in_fmt : std_ulogic_vector(5 downto 0);
 begin
     gpr_hazard0: entity work.gpr_hazard
         generic map (
@@ -122,8 +121,6 @@ begin
             use_bypass         => gpr_bypass_b
             );
 
-    gpr_c_read_in_fmt <= "0" & gpr_c_read_in;
-
     gpr_hazard2: entity work.gpr_hazard
         generic map (
             PIPELINE_DEPTH => PIPELINE_DEPTH
@@ -140,7 +137,7 @@ begin
             gpr_write_in       => gpr_write_in,
             bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_c_read_valid_in,
-            gpr_read_in        => gpr_c_read_in_fmt,
+            gpr_read_in        => gpr_c_read_in,
 
             ugpr_write_valid   => update_gpr_write_valid,
             ugpr_write_reg     => update_gpr_write_reg,
diff --git a/core.vhdl b/core.vhdl
index c7dd3f6..81e11c8 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -11,6 +11,7 @@ entity core is
         SIM : boolean := false;
 	DISABLE_FLATTEN : boolean := false;
         EX1_BYPASS : boolean := true;
+        HAS_FPU : boolean := true;
 	ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
         LOG_LENGTH : natural := 512
         );
@@ -244,6 +245,7 @@ begin
     decode2_0: entity work.decode2
         generic map (
             EX1_BYPASS => EX1_BYPASS,
+            HAS_FPU => HAS_FPU,
             LOG_LENGTH => LOG_LENGTH
             )
         port map (
@@ -267,6 +269,7 @@ begin
     register_file_0: entity work.register_file
         generic map (
             SIM => SIM,
+            HAS_FPU => HAS_FPU,
             LOG_LENGTH => LOG_LENGTH
             )
         port map (
@@ -280,7 +283,7 @@ begin
             dbg_gpr_data => dbg_gpr_data,
 	    sim_dump => terminate,
 	    sim_dump_done => sim_cr_dump,
-            log_out => log_data(255 downto 185)
+            log_out => log_data(255 downto 184)
 	    );
 
     cr_file_0: entity work.cr_file
@@ -294,12 +297,13 @@ begin
             d_out => cr_file_to_decode2,
             w_in => writeback_to_cr_file,
             sim_dump => sim_cr_dump,
-            log_out => log_data(184 downto 172)
+            log_out => log_data(183 downto 171)
             );
 
     execute1_0: entity work.execute1
         generic map (
             EX1_BYPASS => EX1_BYPASS,
+            HAS_FPU => HAS_FPU,
             LOG_LENGTH => LOG_LENGTH
             )
         port map (
@@ -324,6 +328,7 @@ begin
 
     loadstore1_0: entity work.loadstore1
         generic map (
+            HAS_FPU => HAS_FPU,
             LOG_LENGTH => LOG_LENGTH
             )
         port map (
@@ -368,7 +373,7 @@ begin
             stall_out => dcache_stall_out,
             wishbone_in => wishbone_data_in,
             wishbone_out => wishbone_data_out,
-            log_out => log_data(171 downto 152)
+            log_out => log_data(170 downto 151)
             );
 
     writeback_0: entity work.writeback
@@ -381,7 +386,7 @@ begin
             complete_out => complete
             );
 
-    log_data(151 downto 150) <= "00";
+    log_data(150) <= '0';
     log_data(139 downto 135) <= "00000";
 
     debug_0: entity work.core_debug
diff --git a/decode1.vhdl b/decode1.vhdl
index a7d5910..75da175 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -72,6 +72,10 @@ architecture behaviour of decode1 is
         10 =>       (ALU,    OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
         34 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbz
         35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu
+        50 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd
+        51 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu
+--      48 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs
+--      49 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu
         42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha
         43 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau
         40 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz
@@ -87,6 +91,10 @@ architecture behaviour of decode1 is
         17 =>       (ALU,    OP_SC,        NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sc
         38 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb
         39 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu
+        54 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd
+        55 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu
+--      52 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs
+--      53 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu
         44 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth
         45 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu
         36 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw
@@ -272,6 +280,12 @@ architecture behaviour of decode1 is
         2#1101110101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldcix
         2#0000110101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux
         2#0000010101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx
+        2#1001010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfdx
+        2#1001110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux
+        2#1101010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax
+        2#1101110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx
+--      2#1000010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx
+--      2#1000110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux
         2#0001110100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx
         2#0101110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux
         2#0101010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax
@@ -350,6 +364,11 @@ architecture behaviour of decode1 is
         2#0011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- stdcx
         2#0010110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux
         2#0010010101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx
+        2#1011010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx
+        2#1011110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdux
+        2#1111010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx
+--      2#1010010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx
+--      2#1010110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux
         2#1110010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx
         2#1110110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix
         2#1011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- sthcx
diff --git a/decode2.vhdl b/decode2.vhdl
index a2a602c..6cc74c7 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -11,6 +11,7 @@ use work.insn_helpers.all;
 entity decode2 is
     generic (
         EX1_BYPASS : boolean := true;
+        HAS_FPU : boolean := true;
         -- Non-zero to enable log data collection
         LOG_LENGTH : natural := 0
         );
@@ -73,7 +74,7 @@ architecture behaviour of decode2 is
             -- If it's all 0, we don't treat it as a dependency as slow SPRs
             -- operations are single issue.
             --
-            assert is_fast_spr(ispr) =  '1' or ispr = "000000"
+            assert is_fast_spr(ispr) =  '1' or ispr = "0000000"
                 report "Decode A says SPR but ISPR is invalid:" &
                 to_hstring(ispr) severity failure;
             return (is_fast_spr(ispr), ispr, reg_data);
@@ -118,7 +119,7 @@ architecture behaviour of decode2 is
                 -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
                 -- If it's all 0, we don't treat it as a dependency as slow SPRs
                 -- operations are single issue.
-                assert is_fast_spr(ispr) = '1' or ispr = "000000"
+                assert is_fast_spr(ispr) = '1' or ispr = "0000000"
                     report "Decode B says SPR but ISPR is invalid:" &
                     to_hstring(ispr) severity failure;
                 ret := (is_fast_spr(ispr), ispr, reg_data);
@@ -137,6 +138,12 @@ architecture behaviour of decode2 is
                 return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data);
             when RCR =>
                 return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data);
+            when FRS =>
+                if HAS_FPU then
+                    return ('1', fpr_to_gspr(insn_frt(insn_in)), reg_data);
+                else
+                    return ('0', (others => '0'), (others => '0'));
+                end if;
             when NONE =>
                 return ('0', (others => '0'), (others => '0'));
         end case;
@@ -150,16 +157,22 @@ architecture behaviour of decode2 is
                 return ('1', gpr_to_gspr(insn_rt(insn_in)));
             when RA =>
                 return ('1', gpr_to_gspr(insn_ra(insn_in)));
+            when FRT =>
+                if HAS_FPU then
+                    return ('1', fpr_to_gspr(insn_frt(insn_in)));
+                else
+                    return ('0', "0000000");
+                end if;
             when SPR =>
                 -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
                 -- If it's all 0, we don't treat it as a dependency as slow SPRs
                 -- operations are single issue.
-                assert is_fast_spr(ispr) = '1' or ispr = "000000"
+                assert is_fast_spr(ispr) = '1' or ispr = "0000000"
                     report "Decode B says SPR but ISPR is invalid:" &
                     to_hstring(ispr) severity failure;
                 return (is_fast_spr(ispr), ispr);
             when NONE =>
-                return ('0', "000000");
+                return ('0', "0000000");
         end case;
     end;
 
@@ -212,7 +225,7 @@ architecture behaviour of decode2 is
     signal gpr_b_bypass : std_ulogic;
 
     signal gpr_c_read_valid : std_ulogic;
-    signal gpr_c_read : gpr_index_t;
+    signal gpr_c_read : gspr_index_t;
     signal gpr_c_bypass : std_ulogic;
 
     signal cr_write_valid  : std_ulogic;
@@ -284,8 +297,9 @@ begin
                        else gpr_to_gspr(insn_ra(d_in.insn));
     r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR
                        else gpr_to_gspr(insn_rb(d_in.insn));
-    r_out.read3_reg <= insn_rcreg(d_in.insn) when d_in.decode.input_reg_c = RCR
-                       else insn_rs(d_in.insn);
+    r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR
+                       else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU
+                       else gpr_to_gspr(insn_rs(d_in.insn));
 
     c_out.read <= d_in.decode.input_cr;
 
@@ -394,7 +408,7 @@ begin
         gpr_b_read <= decoded_reg_b.reg;
 
         gpr_c_read_valid <= decoded_reg_c.reg_valid;
-        gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg);
+        gpr_c_read <= decoded_reg_c.reg;
 
         cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
         cr_bypass_avail <= '0';
diff --git a/decode_types.vhdl b/decode_types.vhdl
index ef654c3..8c20441 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -10,6 +10,7 @@ package decode_types is
 			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS,
 			 OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
 			 OP_LOAD, OP_STORE,
+                         OP_FPLOAD, OP_FPSTORE,
 			 OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD,
 			 OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64,
 			 OP_MUL_H64, OP_MUL_H32, OP_OR,
@@ -24,8 +25,8 @@ package decode_types is
     type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA);
     type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
                            CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR);
-    type input_reg_c_t is (NONE, RS, RCR);
-    type output_reg_a_t is (NONE, RT, RA, SPR);
+    type input_reg_c_t is (NONE, RS, RCR, FRS);
+    type output_reg_a_t is (NONE, RT, RA, SPR, FRT);
     type rc_t is (NONE, ONE, RC);
     type carry_in_t is (ZERO, CA, OV, ONE);
 
diff --git a/execute1.vhdl b/execute1.vhdl
index 04cc970..4d6a9cc 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -13,6 +13,7 @@ use work.ppc_fx_insns.all;
 entity execute1 is
     generic (
         EX1_BYPASS : boolean := true;
+        HAS_FPU : boolean := true;
         -- Non-zero to enable log data collection
         LOG_LENGTH : natural := 0
         );
@@ -542,6 +543,9 @@ begin
             ctrl_tmp.msr(MSR_PR) <= '0';
             ctrl_tmp.msr(MSR_SE) <= '0';
             ctrl_tmp.msr(MSR_BE) <= '0';
+            ctrl_tmp.msr(MSR_FP) <= '0';
+            ctrl_tmp.msr(MSR_FE0) <= '0';
+            ctrl_tmp.msr(MSR_FE1) <= '0';
             ctrl_tmp.msr(MSR_IR) <= '0';
             ctrl_tmp.msr(MSR_DR) <= '0';
             ctrl_tmp.msr(MSR_RI) <= '0';
@@ -578,7 +582,19 @@ begin
             -- set bit 45 to indicate privileged instruction type interrupt
             ctrl_tmp.srr1(63 - 45) <= '1';
             report "privileged instruction";
-            
+
+        elsif not HAS_FPU and valid_in = '1' and
+            (e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then
+            -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations
+            illegal := '1';
+
+        elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and
+            (e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then
+            -- generate a floating-point unavailable interrupt
+            exception := '1';
+            v.f.redirect_nia := std_logic_vector(to_unsigned(16#800#, 64));
+            report "FP unavailable interrupt";
+
 	elsif valid_in = '1' and e_in.unit = ALU then
 
 	    report "execute nia " & to_hstring(e_in.nia);
@@ -1225,7 +1241,7 @@ begin
         lv.addr1 := a_in;
         lv.addr2 := b_in;
         lv.data := c_in;
-        lv.write_reg := gspr_to_gpr(e_in.write_reg);
+        lv.write_reg := e_in.write_reg;
         lv.length := e_in.data_len;
         lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE);
         lv.sign_extend := e_in.sign_extend;
diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl
index a4d253d..8a3dc7a 100644
--- a/fpga/top-arty.vhdl
+++ b/fpga/top-arty.vhdl
@@ -14,6 +14,7 @@ entity toplevel is
         RAM_INIT_FILE      : string   := "firmware.hex";
         RESET_LOW          : boolean  := true;
         CLK_FREQUENCY      : positive := 100000000;
+        HAS_FPU            : boolean  := true;
         USE_LITEDRAM       : boolean  := false;
         NO_BRAM            : boolean  := false;
         DISABLE_FLATTEN_CORE : boolean := false;
@@ -168,6 +169,7 @@ begin
             RAM_INIT_FILE      => RAM_INIT_FILE,
             SIM                => false,
             CLK_FREQ           => CLK_FREQUENCY,
+            HAS_FPU            => HAS_FPU,
             HAS_DRAM           => USE_LITEDRAM,
             DRAM_SIZE          => 256 * 1024 * 1024,
             DRAM_INIT_SIZE     => PAYLOAD_SIZE,
diff --git a/fpga/top-generic.vhdl b/fpga/top-generic.vhdl
index 2300456..2ad0dd3 100644
--- a/fpga/top-generic.vhdl
+++ b/fpga/top-generic.vhdl
@@ -11,6 +11,7 @@ entity toplevel is
 	RESET_LOW     : boolean  := true;
 	CLK_INPUT     : positive := 100000000;
 	CLK_FREQUENCY : positive := 100000000;
+        HAS_FPU       : boolean  := true;
 	DISABLE_FLATTEN_CORE : boolean := false;
         UART_IS_16550 : boolean  := true
 	);
@@ -68,6 +69,7 @@ begin
 	    RAM_INIT_FILE => RAM_INIT_FILE,
 	    SIM           => false,
 	    CLK_FREQ      => CLK_FREQUENCY,
+            HAS_FPU       => HAS_FPU,
 	    DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
             UART0_IS_16550     => UART_IS_16550
 	    )
diff --git a/fpga/top-nexys-video.vhdl b/fpga/top-nexys-video.vhdl
index 745ef79..1942b10 100644
--- a/fpga/top-nexys-video.vhdl
+++ b/fpga/top-nexys-video.vhdl
@@ -14,6 +14,7 @@ entity toplevel is
 	RAM_INIT_FILE : string   := "firmware.hex";
 	RESET_LOW     : boolean  := true;
 	CLK_FREQUENCY : positive := 100000000;
+        HAS_FPU       : boolean  := true;
 	USE_LITEDRAM  : boolean  := false;
 	NO_BRAM       : boolean  := false;
 	DISABLE_FLATTEN_CORE : boolean := false;
@@ -120,6 +121,7 @@ begin
 	    RAM_INIT_FILE => RAM_INIT_FILE,
 	    SIM           => false,
 	    CLK_FREQ      => CLK_FREQUENCY,
+            HAS_FPU       => HAS_FPU,
 	    HAS_DRAM      => USE_LITEDRAM,
 	    DRAM_SIZE     => 512 * 1024 * 1024,
             DRAM_INIT_SIZE => PAYLOAD_SIZE,
diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl
index 0fa66c5..fec03c7 100644
--- a/gpr_hazard.vhdl
+++ b/gpr_hazard.vhdl
@@ -2,6 +2,9 @@ library ieee;
 use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 
+library work;
+use work.common.all;
+
 entity gpr_hazard is
     generic (
         PIPELINE_DEPTH : natural := 1
@@ -15,13 +18,13 @@ entity gpr_hazard is
         issuing            : in std_ulogic;
 
         gpr_write_valid_in : in std_ulogic;
-        gpr_write_in       : in std_ulogic_vector(5 downto 0);
+        gpr_write_in       : in gspr_index_t;
         bypass_avail       : in std_ulogic;
         gpr_read_valid_in  : in std_ulogic;
-        gpr_read_in        : in std_ulogic_vector(5 downto 0);
+        gpr_read_in        : in gspr_index_t;
 
         ugpr_write_valid   : in std_ulogic;
-        ugpr_write_reg     : in std_ulogic_vector(5 downto 0);
+        ugpr_write_reg     : in gspr_index_t;
 
         stall_out          : out std_ulogic;
         use_bypass         : out std_ulogic
@@ -31,9 +34,9 @@ architecture behaviour of gpr_hazard is
     type pipeline_entry_type is record
         valid  : std_ulogic;
         bypass : std_ulogic;
-        gpr    : std_ulogic_vector(5 downto 0);
+        gpr    : gspr_index_t;
         ugpr_valid : std_ulogic;
-        ugpr   : std_ulogic_vector(5 downto 0);
+        ugpr   : gspr_index_t;
     end record;
     constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'),
                                                            ugpr_valid => '0', ugpr => (others => '0'));
diff --git a/insn_helpers.vhdl b/insn_helpers.vhdl
index 592acb0..be3892a 100644
--- a/insn_helpers.vhdl
+++ b/insn_helpers.vhdl
@@ -37,6 +37,10 @@ package insn_helpers is
     function insn_sh (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_me (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_mb (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_frt (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_fra (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_frb (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_frc (insn_in : std_ulogic_vector) return std_ulogic_vector;
 end package insn_helpers;
 
 package body insn_helpers is
@@ -214,4 +218,24 @@ package body insn_helpers is
     begin
         return insn_in(5) & insn_in(10 downto 6);
     end;
+
+    function insn_frt(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(25 downto 21);
+    end;
+
+    function insn_fra(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(20 downto 16);
+    end;
+
+    function insn_frb(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(15 downto 11);
+    end;
+
+    function insn_frc(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(10 downto 6);
+    end;
 end package body insn_helpers;
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index e36025c..ec20319 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -5,12 +5,15 @@ use ieee.numeric_std.all;
 library work;
 use work.decode_types.all;
 use work.common.all;
+use work.insn_helpers.all;
+use work.helpers.all;
 
 -- 2 cycle LSU
 -- We calculate the address in the first cycle
 
 entity loadstore1 is
     generic (
+        HAS_FPU : boolean := true;
         -- Non-zero to enable log data collection
         LOG_LENGTH : natural := 0
         );
@@ -58,7 +61,7 @@ architecture behave of loadstore1 is
 	addr         : std_ulogic_vector(63 downto 0);
 	store_data   : std_ulogic_vector(63 downto 0);
 	load_data    : std_ulogic_vector(63 downto 0);
-	write_reg    : gpr_index_t;
+	write_reg    : gspr_index_t;
 	length       : std_ulogic_vector(3 downto 0);
 	byte_reverse : std_ulogic;
 	sign_extend  : std_ulogic;
@@ -431,6 +434,17 @@ begin
                     v.align_intr := v.nc;
                     req := '1';
                     v.dcbz := '1';
+                when OP_FPSTORE =>
+                    if HAS_FPU then
+                        req := '1';
+                    end if;
+                when OP_FPLOAD =>
+                    if HAS_FPU then
+                        v.load := '1';
+                        req := '1';
+                        -- Allow an extra cycle for RA update
+                        v.extra_cycle := l_in.update;
+                    end if;
                 when OP_TLBIE =>
                     mmureq := '1';
                     v.tlbie := '1';
@@ -523,7 +537,7 @@ begin
             l_out.write_data <= r.sprval;
         elsif do_update = '1' then
             l_out.write_enable <= '1';
-            l_out.write_reg <= r.update_reg;
+            l_out.write_reg <= gpr_to_gspr(r.update_reg);
             l_out.write_data <= r.addr;
         else
             l_out.write_enable <= write_enable;
diff --git a/microwatt.core b/microwatt.core
index cd24a06..3b47339 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -132,6 +132,7 @@ targets:
       - disable_flatten_core
       - log_length=2048
       - uart_is_16550
+      - has_fpu
     tools:
       vivado: {part : xc7a100tcsg324-1}
     toplevel : toplevel
@@ -215,6 +216,7 @@ targets:
       - spi_flash_offset=10485760
       - log_length=2048
       - uart_is_16550
+      - has_fpu
     tools:
       vivado: {part : xc7a200tsbg484-1}
     toplevel : toplevel
@@ -231,6 +233,7 @@ targets:
       - spi_flash_offset=10485760
       - log_length=2048
       - uart_is_16550
+      - has_fpu
     generate: [litedram_nexys_video]
     tools:
       vivado: {part : xc7a200tsbg484-1}
@@ -249,6 +252,7 @@ targets:
       - log_length=512
       - uart_is_16550
       - has_uart1
+      - has_fpu=false
     tools:
       vivado: {part : xc7a35ticsg324-1L}
     toplevel : toplevel
@@ -267,6 +271,7 @@ targets:
       - log_length=512
       - uart_is_16550
       - has_uart1
+      - has_fpu=false
     generate: [litedram_arty, liteeth_arty]
     tools:
       vivado: {part : xc7a35ticsg324-1L}
@@ -285,6 +290,7 @@ targets:
       - log_length=2048
       - uart_is_16550
       - has_uart1
+      - has_fpu
     tools:
       vivado: {part : xc7a100ticsg324-1L}
     toplevel : toplevel
@@ -303,6 +309,7 @@ targets:
       - log_length=2048
       - uart_is_16550
       - has_uart1
+      - has_fpu
     generate: [litedram_arty, liteeth_arty]
     tools:
       vivado: {part : xc7a100ticsg324-1L}
@@ -320,6 +327,7 @@ targets:
       - disable_flatten_core
       - log_length=512
       - uart_is_16550
+      - has_fpu=false
     tools:
       vivado: {part : xc7a35tcpg236-1}
     toplevel : toplevel
@@ -380,6 +388,12 @@ parameters:
     paramtype   : generic
     default     : 100000000
 
+  has_fpu:
+    datatype    : bool
+    description : Include a floating-point unit in the core
+    paramtype   : generic
+    default     : true
+
   disable_flatten_core:
     datatype    : bool
     description : Prevent Vivado from flattening the main core components
diff --git a/register_file.vhdl b/register_file.vhdl
index 10f28a4..32c8490 100644
--- a/register_file.vhdl
+++ b/register_file.vhdl
@@ -8,6 +8,7 @@ use work.common.all;
 entity register_file is
     generic (
         SIM : boolean := false;
+        HAS_FPU : boolean := true;
         -- Non-zero to enable log data collection
         LOG_LENGTH : natural := 0
         );
@@ -28,12 +29,12 @@ entity register_file is
         sim_dump      : in std_ulogic;
         sim_dump_done : out std_ulogic;
 
-        log_out       : out std_ulogic_vector(70 downto 0)
+        log_out       : out std_ulogic_vector(71 downto 0)
         );
 end entity register_file;
 
 architecture behaviour of register_file is
-    type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0);
+    type regfile is array(0 to 127) of std_ulogic_vector(63 downto 0);
     signal registers : regfile := (others => (others => '0'));
     signal rd_port_b : std_ulogic_vector(63 downto 0);
     signal dbg_data : std_ulogic_vector(63 downto 0);
@@ -41,53 +42,73 @@ architecture behaviour of register_file is
 begin
     -- synchronous writes
     register_write_0: process(clk)
+        variable w_addr : gspr_index_t;
     begin
         if rising_edge(clk) then
             if w_in.write_enable = '1' then
-		if w_in.write_reg(5) = '0' then
-		    report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
-		else
-		    report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
-		end if;
+                w_addr := w_in.write_reg;
+                if HAS_FPU and w_addr(6) = '1' then
+                    report "Writing FPR " & to_hstring(w_addr(4 downto 0)) & " " & to_hstring(w_in.write_data);
+                else
+                    w_addr(6) := '0';
+                    if w_addr(5) = '0' then
+                        report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data);
+                    else
+                        report "Writing GSPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data);
+                    end if;
+                end if;
                 assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
-                registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data;
+                registers(to_integer(unsigned(w_addr))) <= w_in.write_data;
             end if;
         end if;
     end process register_write_0;
 
     -- asynchronous reads
     register_read_0: process(all)
-        variable b_addr : gspr_index_t;
+        variable a_addr, b_addr, c_addr : gspr_index_t;
+        variable w_addr : gspr_index_t;
     begin
+        a_addr := d_in.read1_reg;
+        b_addr := d_in.read2_reg;
+        c_addr := d_in.read3_reg;
+        w_addr := w_in.write_reg;
+        if not HAS_FPU then
+            -- Make it obvious that we only want 64 GSPRs for a no-FPU implementation
+            a_addr(6) := '0';
+            b_addr(6) := '0';
+            c_addr(6) := '0';
+            w_addr(6) := '0';
+        end if;
         if d_in.read1_enable = '1' then
-            report "Reading GPR " & to_hstring(d_in.read1_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read1_reg))));
+            report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr))));
         end if;
         if d_in.read2_enable = '1' then
-            report "Reading GPR " & to_hstring(d_in.read2_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read2_reg))));
+            report "Reading GPR " & to_hstring(b_addr) & " " & to_hstring(registers(to_integer(unsigned(b_addr))));
         end if;
         if d_in.read3_enable = '1' then
-            report "Reading GPR " & to_hstring(d_in.read3_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read3_reg))));
+            report "Reading GPR " & to_hstring(c_addr) & " " & to_hstring(registers(to_integer(unsigned(c_addr))));
         end if;
-        d_out.read1_data <= registers(to_integer(unsigned(d_in.read1_reg)));
+        d_out.read1_data <= registers(to_integer(unsigned(a_addr)));
         -- B read port is multiplexed with reads from the debug circuitry
         if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then
             b_addr := dbg_gpr_addr;
-        else
-            b_addr := d_in.read2_reg;
+            if not HAS_FPU then
+                b_addr(6) := '0';
+            end if;
         end if;
         rd_port_b <= registers(to_integer(unsigned(b_addr)));
         d_out.read2_data <= rd_port_b;
-        d_out.read3_data <= registers(to_integer(unsigned(gpr_to_gspr(d_in.read3_reg))));
+        d_out.read3_data <= registers(to_integer(unsigned(c_addr)));
 
         -- Forward any written data
         if w_in.write_enable = '1' then
-            if d_in.read1_reg = w_in.write_reg then
+            if a_addr = w_addr then
                 d_out.read1_data <= w_in.write_data;
             end if;
-            if d_in.read2_reg = w_in.write_reg then
+            if b_addr = w_addr then
                 d_out.read2_data <= w_in.write_data;
             end if;
-            if gpr_to_gspr(d_in.read3_reg) = w_in.write_reg then
+            if c_addr = w_addr then
                 d_out.read3_data <= w_in.write_data;
             end if;
         end if;
@@ -136,7 +157,7 @@ begin
     end generate;
 
     rf_log: if LOG_LENGTH > 0 generate
-        signal log_data : std_ulogic_vector(70 downto 0);
+        signal log_data : std_ulogic_vector(71 downto 0);
     begin
         reg_log: process(clk)
         begin
diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c
index 146346d..eca4bf0 100644
--- a/scripts/fmt_log/fmt_log.c
+++ b/scripts/fmt_log/fmt_log.c
@@ -58,7 +58,7 @@ struct log_entry {
 	u64	ls_lo_valid: 1;
 	u64	ls_eo_except: 1;
 	u64	ls_stall_out: 1;
-	u64	pad2: 2;
+	u64	pad2: 1;
 	u64	dc_state: 3;
 	u64	dc_ra_valid: 1;
 	u64	dc_tlb_way: 3;
@@ -74,7 +74,7 @@ struct log_entry {
 	u64	cr_wr_mask: 8;
 	u64	cr_wr_data: 4;
 	u64	cr_wr_enable: 1;
-	u64	reg_wr_reg: 6;
+	u64	reg_wr_reg: 7;
 	u64	reg_wr_enable: 1;
 
 	u64	reg_wr_data;
@@ -90,11 +90,11 @@ const char *ops[64] =
 	"illegal", "nop    ", "add    ", "and    ", "attn   ", "b      ", "bc     ", "bcreg  ",
 	"bperm  ", "cmp    ", "cmpb   ", "cmpeqb ", "cmprb  ", "cntz   ", "crop   ", "darn   ",
 	"dcbf   ", "dcbst  ", "dcbt   ", "dcbtst ", "dcbz   ", "div    ", "dive   ", "exts   ",
-	"extswsl", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ", "st     ", "mcrxrx ",
-	"mfcr   ", "mfmsr  ", "mfspr  ", "mod    ", "mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ",
-	"mulh64 ", "mulh32 ", "or     ", "popcnt ", "prty   ", "rfid   ", "rlc    ", "rlcl   ",
-	"rlcr   ", "sc     ", "setb   ", "shl    ", "shr    ", "sync   ", "tlbie  ", "trap   ",
-	"xor    ", "bcd    ", "addg6s ", "ffail  ", "?60    ", "?61    ", "?62    ", "?63    "
+	"extswsl", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ", "st     ", "fpload ",
+	"fpstore", "mcrxrx ", "mfcr   ", "mfmsr  ", "mfspr  ", "mod    ", "mtcrf  ", "mtmsr  ",
+	"mtspr  ", "mull64 ", "mulh64 ", "mulh32 ", "or     ", "popcnt ", "prty   ", "rfid   ",
+	"rlc    ", "rlcl   ", "rlcr   ", "sc     ", "setb   ", "shl    ", "shr    ", "sync   ",
+	"tlbie  ", "trap   ", "xor    ", "bcd    ", "addg6s ", "ffail  ", "?62    ", "?63    "
 };
 
 const char *spr_names[13] =
diff --git a/soc.vhdl b/soc.vhdl
index 0a70026..7ab146f 100644
--- a/soc.vhdl
+++ b/soc.vhdl
@@ -52,6 +52,7 @@ entity soc is
 	RAM_INIT_FILE      : string;
 	CLK_FREQ           : positive;
 	SIM                : boolean;
+        HAS_FPU            : boolean := true;
 	DISABLE_FLATTEN_CORE : boolean := false;
 	HAS_DRAM           : boolean  := false;
 	DRAM_SIZE          : integer := 0;
@@ -253,6 +254,7 @@ begin
     processor: entity work.core
 	generic map(
 	    SIM => SIM,
+            HAS_FPU => HAS_FPU,
 	    DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
 	    ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'),
             LOG_LENGTH => LOG_LENGTH
diff --git a/writeback.vhdl b/writeback.vhdl
index 053a8ba..d0230d8 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -80,7 +80,7 @@ begin
             end if;
 
             if l_in.write_enable = '1' then
-                w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
+                w_out.write_reg <= l_in.write_reg;
                 w_out.write_data <= l_in.write_data;
                 w_out.write_enable <= '1';
             end if;

From bcac4b9b2fafe976eb4d2ce2d022cc0cbb33c5de Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 1 Jul 2020 18:03:19 +1000
Subject: [PATCH 02/30] tests: Add a test for FP loads and stores

This tests that floating-point unavailable exceptions occur as expected
on FP loads and stores, and that the simple FP loads and stores appear
to give reasonable results.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 tests/fpu/Makefile         |   3 +
 tests/fpu/fpu.c            | 196 +++++++++++++++++++++++++++++++++++++
 tests/fpu/head.S           | 120 +++++++++++++++++++++++
 tests/fpu/powerpc.lds      |  27 +++++
 tests/test_fpu.bin         | Bin 0 -> 8208 bytes
 tests/test_fpu.console_out |   2 +
 tests/update_console_tests |   2 +-
 7 files changed, 349 insertions(+), 1 deletion(-)
 create mode 100644 tests/fpu/Makefile
 create mode 100644 tests/fpu/fpu.c
 create mode 100644 tests/fpu/head.S
 create mode 100644 tests/fpu/powerpc.lds
 create mode 100755 tests/test_fpu.bin
 create mode 100644 tests/test_fpu.console_out

diff --git a/tests/fpu/Makefile b/tests/fpu/Makefile
new file mode 100644
index 0000000..fd8344e
--- /dev/null
+++ b/tests/fpu/Makefile
@@ -0,0 +1,3 @@
+TEST=fpu
+
+include ../Makefile.test
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
new file mode 100644
index 0000000..d61b36e
--- /dev/null
+++ b/tests/fpu/fpu.c
@@ -0,0 +1,196 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "console.h"
+
+#define MSR_FP	0x2000
+#define MSR_FE0	0x800
+#define MSR_FE1	0x100
+
+extern int trapit(long arg, int (*func)(long));
+
+#define SRR0	26
+#define SRR1	27
+
+static inline unsigned long mfspr(int sprnum)	/* read special-purpose register number sprnum */
+{
+	long val;
+
+	__asm__ volatile("mfspr %0,%1" : "=r" (val) : "i" (sprnum));	/* "i": sprnum must be a compile-time constant */
+	return val;	/* implicitly converted from long to unsigned long */
+}
+
+static inline void mtspr(int sprnum, unsigned long val)	/* write val to special-purpose register number sprnum */
+{
+	__asm__ volatile("mtspr %0,%1" : : "i" (sprnum), "r" (val));	/* "i": sprnum must be a compile-time constant */
+}
+
+void disable_fp(void)
+{
+	unsigned long msr;
+	/* volatile: the MSR must be re-read on every call, never CSE'd, hoisted or dropped */
+	__asm__ volatile("mfmsr %0" : "=r" (msr));
+	msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1);	/* clear the FP, FE0 and FE1 bits */
+	__asm__ volatile("mtmsrd %0" : : "r" (msr));
+}
+
+void enable_fp(void)
+{
+	unsigned long msr;
+	/* volatile: the MSR must be re-read on every call, never CSE'd, hoisted or dropped */
+	__asm__ volatile("mfmsr %0" : "=r" (msr));
+	msr |= MSR_FP;	/* set the FP bit only; FE0 and FE1 are left unchanged */
+	__asm__ volatile("mtmsrd %0" : : "r" (msr));
+}
+
+void print_string(const char *str)
+{
+	for (const char *p = str; *p != '\0'; p++)	/* stop at the terminating NUL */
+		putchar(*p);
+}
+
+void print_hex(unsigned long val, int ndigits)
+{
+	int shift = (ndigits - 1) * 4;
+	int nibble;
+
+	/* Emit one lowercase hex digit per nibble, most-significant nibble first. */
+	while (shift >= 0) {
+		nibble = (val >> shift) & 0xf;
+		putchar(nibble < 10 ? nibble + '0' : nibble + 'a' - 10);
+		shift -= 4;
+	}
+}
+
+// Prints "test NN:" with NN as two decimal digits; only correct for 0 <= i < 100
+void print_test_number(int i)
+{
+	print_string("test ");
+	putchar('0' + i / 10);
+	putchar('0' + i % 10);
+	putchar(':');
+}
+
+unsigned long foo = 0x3ff8000000000000ul;
+unsigned long foow;
+int fooi = -76543;
+int fooiw;
+
+int do_fp_op(long arg)	/* run the FP load/store sequence selected by arg; always returns 0 */
+{
+	switch (arg) {	/* NOTE(review): FPRs 27-31 are written without clobbers; presumably the compiler never uses FPRs here, confirm */
+	case 0:		/* load the doubleword foo into FPR 31 */
+		__asm__("lfd 31,0(%0)" : : "b" (&foo));
+		break;
+	case 1:		/* store FPR 31 to foow */
+		__asm__("stfd 31,0(%0)" : : "b" (&foow) : "memory");
+		break;
+	case 2:		/* copy foo to foow through FPR 30 */
+		__asm__("lfd 30,0(%0); stfd 30,0(%1)"
+			: : "b" (&foo), "b" (&foow) : "memory");
+		break;
+	case 3:		/* lfiwax the word fooi into FPR 29, then store the doubleword to foow */
+		__asm__("lfiwax 29,0,%0; stfd 29,0(%1)"
+			: : "r" (&fooi), "b" (&foow) : "memory");
+		break;
+	case 4:		/* lfiwzx the word fooi into FPR 28, then store the doubleword to foow */
+		__asm__("lfiwzx 28,0,%0; stfd 28,0(%1)"
+			: : "r" (&fooi), "b" (&foow) : "memory");
+		break;
+	case 5:		/* load the doubleword foow into FPR 27, then stfiwx it as a word to fooiw */
+		__asm__("lfdx 27,0,%0; stfiwx 27,0,%1"
+			: : "r" (&foow), "r" (&fooiw) : "memory");
+		break;
+	}
+	return 0;	/* any other arg value does nothing */
+}
+
+
+int fpu_test_1(void)	/* returns 0 on success, otherwise a code identifying the failed step */
+{
+	int ret;
+
+	disable_fp();
+	/* these should give a FP unavailable exception */
+	ret = trapit(0, do_fp_op);
+	if (ret != 0x800)	/* trapit returns the vector number when an exception stub fires */
+		return 1;
+	ret = trapit(1, do_fp_op);
+	if (ret != 0x800)
+		return 2;
+	enable_fp();
+	/* these should succeed */
+	ret = trapit(0, do_fp_op);
+	if (ret)
+		return ret | 3;	/* unexpected exception: vector number OR'd with the step code */
+	ret = trapit(1, do_fp_op);
+	if (ret)
+		return ret | 4;
+	if (foow != foo)	/* the lfd/stfd pair through FPR 31 must reproduce foo exactly */
+		return 5;
+	return 0;
+}
+
+int fpu_test_2(void)	/* returns 0 on success, otherwise a code identifying the failed step */
+{
+	int ret;
+
+	enable_fp();
+	foow = ~0;	/* poison foow so a store that never happens is detected */
+	ret = trapit(2, do_fp_op);
+	if (ret)
+		return ret | 1;
+	if (foow != foo)
+		return 2;
+	foow = ~0;
+	ret = trapit(3, do_fp_op);
+	if (ret)
+		return ret | 3;
+	if (foow != fooi)	/* fooi is converted to unsigned long, so the expected value is sign-extended */
+		return 4;
+	foow = ~0;
+	ret = trapit(4, do_fp_op);
+	if (ret)
+		return ret | 5;
+	if (foow != (unsigned int)fooi)	/* expected value is the zero-extended 32-bit pattern */
+		return 6;
+	ret = trapit(5, do_fp_op);	/* foow is not re-poisoned: case 5 loads from it */
+	if (ret)
+		return ret | 7;
+	if (fooiw != fooi)
+		return 8;
+	return 0;
+}
+
+int fail = 0;
+
+void do_test(int num, int (*test)(void))
+{
+	int status;
+
+	print_test_number(num);
+	status = test();
+	if (status == 0) {
+		print_string("PASS\r\n");
+		return;
+	}
+	fail = 1;	/* remembered so main() can return a failing exit status */
+	print_string("FAIL ");
+	print_hex(status, 4);
+	print_string(" SRR0=");
+	print_hex(mfspr(SRR0), 16);
+	print_string(" SRR1=");
+	print_hex(mfspr(SRR1), 16);
+	print_string("\r\n");
+}
+
+int main(void)	/* runs both FP tests; returns 0 if all passed, 1 otherwise */
+{
+	console_init();
+
+	do_test(1, fpu_test_1);
+	do_test(2, fpu_test_2);
+
+	return fail;	/* set to 1 by do_test on any failure */
+}
diff --git a/tests/fpu/head.S b/tests/fpu/head.S
new file mode 100644
index 0000000..498606b
--- /dev/null
+++ b/tests/fpu/head.S
@@ -0,0 +1,120 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Load an immediate 64-bit value into a register */
+#define LOAD_IMM64(r, e)			\
+	lis     r,(e)@highest;			\
+	ori     r,r,(e)@higher;			\
+	rldicr  r,r, 32, 31;			\
+	oris    r,r, (e)@h;			\
+	ori     r,r, (e)@l;
+
+	.section ".head","ax"
+
+	/*
+	 * Microwatt currently enters in LE mode at 0x0, so we don't need to
+	 * do any endian fix ups
+	 */
+	. = 0
+.global _start
+_start:
+	LOAD_IMM64(%r10,__bss_start)
+	LOAD_IMM64(%r11,__bss_end)
+	subf	%r11,%r10,%r11		/* r11 = BSS size in bytes */
+	addi	%r11,%r11,63
+	srdi.	%r11,%r11,6		/* r11 = number of 64-byte blocks, rounded up */
+	beq	2f			/* empty BSS: skip the clearing loop */
+	mtctr	%r11
+1:	dcbz	0,%r10			/* zero the block at r10; the loop steps in 64-byte units */
+	addi	%r10,%r10,64
+	bdnz	1b
+
+2:	LOAD_IMM64(%r1,__stack_top)
+	li	%r0,0
+	stdu	%r0,-16(%r1)		/* push a 16-byte frame whose first doubleword is 0 */
+	LOAD_IMM64(%r10, die)
+	mtsprg0	%r10			/* the exception stubs branch to the address held in SPRG0 */
+	LOAD_IMM64(%r12, main)
+	mtctr	%r12
+	bctrl				/* call main with r12 holding its address */
+die:	attn // terminate on exit
+	b .
+
+.global trapit
+trapit:
+	mflr	%r0
+	std	%r0,16(%r1)		/* save LR at offset 16 of the caller's frame */
+	stdu	%r1,-256(%r1)		/* allocate a 256-byte frame */
+	mtsprg1	%r1			/* stash the stack pointer so 'ret' can recover it */
+	r = 14
+	.rept	18			/* save r14..r31 at r*8(r1) */
+	std	r,r*8(%r1)
+	r = r + 1
+	.endr
+	mfcr	%r0
+	stw	%r0,13*8(%r1)		/* save CR in the frame */
+	LOAD_IMM64(%r10, ret)
+	mtsprg0	%r10			/* exceptions taken inside the call now land at 'ret' */
+	mr	%r12,%r4
+	mtctr	%r4
+	bctrl				/* call the function passed in r4; r3 is left untouched for it */
+ret:					/* reached on normal return, or from an exception stub with r3 = vector */
+	mfsprg1	%r1
+	LOAD_IMM64(%r10, die)
+	mtsprg0	%r10			/* later exceptions go to 'die' again */
+	r = 14
+	.rept	18			/* restore r14..r31 */
+	ld	r,r*8(%r1)
+	r = r + 1
+	.endr
+	lwz	%r0,13*8(%r1)
+	mtcr	%r0			/* restore CR */
+	ld	%r0,256+16(%r1)		/* reload the saved LR */
+	addi	%r1,%r1,256		/* pop the frame */
+	mtlr	%r0
+	blr				/* r3 is returned to the caller unchanged */
+
+#define EXCEPTION(nr)	/* stub placed at address nr: jump to SPRG0 target with r3 = nr */	\
+	.= nr			;\
+	mfsprg0	%r0		;\
+	mtctr	%r0		;\
+	li	%r3,nr		;\
+	bctr
+
+	EXCEPTION(0x300)
+	EXCEPTION(0x380)
+	EXCEPTION(0x400)
+	EXCEPTION(0x480)
+	EXCEPTION(0x500)
+	EXCEPTION(0x600)
+	EXCEPTION(0x700)
+	EXCEPTION(0x800)
+	EXCEPTION(0x900)
+	EXCEPTION(0x980)
+	EXCEPTION(0xa00)
+	EXCEPTION(0xb00)
+	EXCEPTION(0xc00)
+	EXCEPTION(0xd00)
+	EXCEPTION(0xe00)
+	EXCEPTION(0xe20)
+	EXCEPTION(0xe40)
+	EXCEPTION(0xe60)
+	EXCEPTION(0xe80)
+	EXCEPTION(0xf00)
+	EXCEPTION(0xf20)
+	EXCEPTION(0xf40)
+	EXCEPTION(0xf60)
+	EXCEPTION(0xf80)
diff --git a/tests/fpu/powerpc.lds b/tests/fpu/powerpc.lds
new file mode 100644
index 0000000..99611ab
--- /dev/null
+++ b/tests/fpu/powerpc.lds
@@ -0,0 +1,27 @@
+SECTIONS
+{
+	. = 0;			/* image is linked at address 0 */
+	_start = .;
+	.head : {
+		KEEP(*(.head))	/* keep .head even if nothing references it */
+	}
+	. = ALIGN(0x1000);
+	.text : { *(.text) *(.text.*) *(.rodata) *(.rodata.*) }	/* read-only data is merged into .text */
+	. = ALIGN(0x1000);
+	.data : { *(.data) *(.data.*) *(.got) *(.toc) }
+	. = ALIGN(0x80);
+	__bss_start = .;	/* start of the region cleared by _start */
+	.bss : {
+		*(.dynsbss)
+		*(.sbss)
+		*(.scommon)
+		*(.dynbss)
+		*(.bss)
+		*(.common)
+		*(.bss.*)
+	}
+	. = ALIGN(0x80);
+	__bss_end = .;		/* end of the region cleared by _start */
+	. = . + 0x4000;		/* reserve 0x4000 bytes above BSS for the stack */
+	__stack_top = .;	/* initial stack pointer loaded by _start */
+}
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
new file mode 100755
index 0000000000000000000000000000000000000000..885368a70cc97cc929c62934658553ed4b131143
GIT binary patch
literal 8208
zcmeHMU2GKB6+W}O_O2ac9EqxKp~)<!t=F>D<)xa9G0dzDCB}sfBy#!?Wi|$>;6ySi
z54-7XcI>(mRF>*ieQ=}%NTseKL{Tg8zyl9<6Gfzb0V#=)XktwKus<X<s@g8Ql-uvz
znK7Hik)Ob->?>W*%sJmZ=R4<~duQjGNbe;YNPZ9;NaK3MRtJ(qCS)U!jX*YXgu)}q
z#EZu=yJ#f#i)$n4O*C%g{6{mlhse@tAXXL>CsKD1h0PCw;S{ciDH%42NV_AHGS&H*
zhQC~2B12@u&m{az!q4Op?HiNHoc~lN9I*G352K0p=6oTmjg+z?!sT4wc59So@1|(}
zu1!kiB!w$eWK^apSvgB%m3Qf#%AaVqa*5_EGo)0nP`G-PjOuktR&UZ+HBawU7iqRy
zrupjkh^(KIGF8uQo8oXwbxc~|VlkIoe7jYT^I`y>0{9fbrwBep@F{{%5qyf^Qv{zP
z_!PmX1U@D3DS=N3d`jR`0-qB2l)%MS?`*W%6eUkeSj+iCmP*LOdV8c9$i>?*&rKHj
zO-<#xd(6%AZ?Cd{3aoPctL&c|Rww?)Ty77!^XiWp*LeKFH6DLxjmO`#I`Mn!&t7H!
z)Vw<JKeQn|W`UP~dSZ>o-?GNz->}BxZ(TlqZ+)mM==Bx!<_dcIYxIwLJMiM8jVr{z
zf?i)iZ?2%XzefM4<5x)EL1*Jy19f!v)0iPdytjLFAa?kh!>M!M?#gUB7|G=Pk&KFO
zqjr$#M#qOfy@-AS9Usu#OtQDehzftht>%8Wnp2MK#%&??Neq-zc0A@_KLqx#H?G#{
zLKt&l|Jf|jxf>Uk(S5n!hVEB4rd^%mgUo5zy)bBIFASMk%h5?<{6~DE$8w}*v_vFx
zSWz?e-_@)mHIiIT=jDgbA<AODD#Wc_G%9-dOyU~OB>n(jsp<kanc55gaN<-*jZw(o
z-B05>>lEl>uFmJ`I-xsNGxl4qKE3&<)sFKOKSh!kgON;>!dhxyM`j{Odp%oUeChaM
zYzwdfU_V;Q2l6ZD!&kt0PJ#3;>#`U(8sOK>C#OWQ75s7Mo@4u0Bklb&;t^t|=gxh{
z{*-;IM~LwqKJcf}35{s3s?u2bHP=7x^UQ(IgB0oj-(f2OzL#N(Xczo*UBnA=d?I2k
zh;iL7>thUOLUYyc*4g1${IGM$jfLoCX(MF+z9)cj&Gyy;!OmN-!#vo={1*;aj|<yC
zEhkS%8@*#}qmlVz++mDs!2KWHF+?5Rj1SVV+3)(CmNokl#zn>`(PTxDyAZ~7I(jE}
z{;%fFsf*8-YC@sxq-$H}`Mc6UduM%|6EYWS?bOY|qQKddtg(fj)aEy=XWGP&S7#T4
z0yXwD_}bFuC~W@l-iO(p!N?0u)YQF&#`GbI#4ZLqGWUe&dCv8B1^%Lt=WG4qY^`?g
zJ-_Tzh*x$r)6Jf%sbS6Y{T+<`q<TLarL5LW;%T1eJBkxeVlit)X;@>~r>^W7<m;My
z&+)$>kXR>ob-p=X_A`sx<2gMJU$_?9$tZhgfw)JVWpN%ht~0;u;HJYa#^o`-#AjtG
z)T~^0$T4GX0%O}-f0KkfK(>~uex7|B_pHl9_bl#nzSnWKXluW8<%^JSm-3%W`D?Dc
z4Ea=T{_5t9`OkxY!Ht>M46k|h_=Lu}^VZNkJa3Tkg|U{%9JgP|a}0H7eTogGH$RMf
zOV%WrCwcFr$SJ~nM^U##U4>1U+w%dk&h1w-3Pq0JM!gap%S+i)rnR$i_<3i)>ngOl
z*y01M&e#y<kMTv|-TfGDowgE>?qztWOFX&?lE(qlJ<I$mhmeEQcQzh<p7$!rVd&xg
z+2i2t!CbFG)ULO=UlZ-@ZbN-=y&+{<0`=caPjzB_>l^Tl7u&wnKi|v7aIJ|^u~Cny
z&Bt)QYibwTyE2?x)Y*>}<YW<8FAse6c+d>>suW71rzPh-l%W~5ml{K9!~}W7kt5HX
z3*OIJuh;2lhR@!yN6I<Ar4i2FRT@ZRp6_nqGjlxu_E0DjD0ijb+c=Q6VMo?D#<j?E
z`tQHVcw4AW4Ip<Xu}3;NCwcGtODK)^gQGv-$o!<oT9#zG#Cf%G?tt?g1O~CV<8<ep
z$uZK8-V`DfYgcRHgx&gg>Qb&`Z2a3`8-6-)?VkDp$ICpUblTX4n0)ZX`(;SRH;{b5
z*YykiP0R)Beb}p!Q^~8iR9CR*wi8Ewc0$E{!Z>tjLo?AH<Jm!eRu3IK7zKO2ZpW2%
zd4c~2xO*<IVUB^<Bjj>Vfv&{+3(NVQb>4pCU||?ddHo;-Qiig*MWHr4x1+!NZy^>~
ziN-Si-IV%|5Gt<RSf-;-K(+-}ZrjkY5NHyzzq6pf$)Gbze{1wf<Bgv*w^Dqr(RjkK
zWkIduZH<kFs1G>fZU1TC83jnAHSrw4#@uJ0O1QLp9<af?Lhwt-wc97{_DiS)c0%<L
z`m4aD7`GdC+34;X<Yy0T5`13*mbolp_eg9)`foHyg{l*{mWmMXGj30-4vuG(LHO-!
zH0BTx$3F<nd0<S&>~2+o+1E<_u;Ap1#mnXSI>wx2C7PG#LF^6_j)*uW9sM$7Ca#j_
z8RfL^LG<xCHzpqcD*7U@JdPX7Z1ezLs@>qojb*<>-|}BKPb^$->ls+jz<LJ$FEdcy
LrF?u}F!lcd%>`Zx

literal 0
HcmV?d00001

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
new file mode 100644
index 0000000..0c39ae3
--- /dev/null
+++ b/tests/test_fpu.console_out
@@ -0,0 +1,2 @@
+test 01:PASS
+test 02:PASS
diff --git a/tests/update_console_tests b/tests/update_console_tests
index 906b0cc..a5e6ffc 100755
--- a/tests/update_console_tests
+++ b/tests/update_console_tests
@@ -3,7 +3,7 @@
 # Script to update console related tests from source
 #
 
-for i in sc illegal decrementer xics privileged mmu misc modes reservation trace ; do
+for i in sc illegal decrementer xics privileged mmu misc modes reservation trace fpu ; do
     cd $i
     make
     cd -

From 9d285a265cf9fab8f5f17d6d4588d9545e555e68 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 28 Aug 2020 13:35:05 +1000
Subject: [PATCH 03/30] core: Add support for single-precision FP loads and
 stores

This adds code to loadstore1 to convert between single-precision and
double-precision formats, and implements the lfs* and stfs*
instructions.  The conversion processes are described in Power ISA
v3.1 Book 1 sections 4.6.2 and 4.6.3.

These conversions take one cycle, so lfs* and stfs* are one cycle
slower than lfd* and stfd*.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl     |   3 +-
 countzero.vhdl  |  37 +--------
 decode1.vhdl    |  16 ++--
 execute1.vhdl   |   1 +
 helpers.vhdl    |  53 ++++++++++++
 loadstore1.vhdl | 210 +++++++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 263 insertions(+), 57 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 14bdcf7..e1ba844 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -287,6 +287,7 @@ package common is
         virt_mode : std_ulogic;                         -- do translation through TLB
         priv_mode : std_ulogic;                         -- privileged mode (MSR[PR] = 0)
         mode_32bit : std_ulogic;                        -- trim addresses to 32 bits
+        is_32bit : std_ulogic;
     end record;
     constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
                                                                      sign_extend => '0', update => '0', xerc => xerc_init,
@@ -294,7 +295,7 @@ package common is
                                                                      nia => (others => '0'), insn => (others => '0'),
                                                                      addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
                                                                      write_reg => (others => '0'), length => (others => '0'),
-                                                                     mode_32bit => '0', others => (others => '0'));
+                                                                     mode_32bit => '0', is_32bit => '0', others => (others => '0'));
 
     type Loadstore1ToExecute1Type is record
         busy : std_ulogic;
diff --git a/countzero.vhdl b/countzero.vhdl
index 18aa043..b46f108 100644
--- a/countzero.vhdl
+++ b/countzero.vhdl
@@ -3,6 +3,7 @@ use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 
 library work;
+use work.helpers.all;
 
 entity zero_counter is
     port (
@@ -15,42 +16,6 @@ entity zero_counter is
 end entity zero_counter;
 
 architecture behaviour of zero_counter is
-    -- Reverse the order of bits in a word
-    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
-        variable ret: std_ulogic_vector(a'left downto a'right);
-    begin
-        for i in a'right to a'left loop
-            ret(a'left + a'right - i) := a(i);
-        end loop;
-        return ret;
-    end;
-
-    -- If there is only one bit set in a doubleword, return its bit number
-    -- (counting from the right).  Each bit of the result is obtained by
-    -- ORing together 32 bits of the input:
-    --  bit 0 = a[1] or a[3] or a[5] or ...
-    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
-    --  bit 2 = a[4..7] or a[12..15] or ...
-    --  bit 5 = a[32..63] ORed together
-    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
-        variable ret: std_ulogic_vector(5 downto 0);
-        variable stride: natural;
-        variable bit: std_ulogic;
-        variable k: natural;
-    begin
-        stride := 2;
-        for i in 0 to 5 loop
-            bit := '0';
-            for j in 0 to (64 / stride) - 1 loop
-                k := j * stride;
-                bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
-            end loop;
-            ret(i) := bit;
-            stride := stride * 2;
-        end loop;
-        return ret;
-    end;
-
     signal inp : std_ulogic_vector(63 downto 0);
     signal sum : std_ulogic_vector(64 downto 0);
     signal msb_r : std_ulogic;
diff --git a/decode1.vhdl b/decode1.vhdl
index 75da175..29f0e50 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -74,8 +74,8 @@ architecture behaviour of decode1 is
         35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu
         50 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd
         51 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu
---      48 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs
---      49 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu
+        48 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs
+        49 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu
         42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha
         43 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau
         40 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz
@@ -93,8 +93,8 @@ architecture behaviour of decode1 is
         39 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu
         54 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd
         55 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu
---      52 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs
---      53 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu
+        52 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs
+        53 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu
         44 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth
         45 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu
         36 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw
@@ -284,8 +284,8 @@ architecture behaviour of decode1 is
         2#1001110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux
         2#1101010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax
         2#1101110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx
---      2#1000010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx
---      2#1000110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux
+        2#1000010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx
+        2#1000110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux
         2#0001110100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx
         2#0101110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux
         2#0101010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax
@@ -367,8 +367,8 @@ architecture behaviour of decode1 is
         2#1011010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx
         2#1011110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdux
         2#1111010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx
---      2#1010010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx
---      2#1010110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux
+        2#1010010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx
+        2#1010110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux
         2#1110010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx
         2#1110110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix
         2#1011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- sthcx
diff --git a/execute1.vhdl b/execute1.vhdl
index 4d6a9cc..9d9b711 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -1259,6 +1259,7 @@ begin
         lv.virt_mode := ctrl.msr(MSR_DR);
         lv.priv_mode := not ctrl.msr(MSR_PR);
         lv.mode_32bit := not ctrl.msr(MSR_SF);
+        lv.is_32bit := e_in.is_32bit;
 
 	-- Update registers
 	rin <= v;
diff --git a/helpers.vhdl b/helpers.vhdl
index fe91938..834e386 100644
--- a/helpers.vhdl
+++ b/helpers.vhdl
@@ -25,6 +25,10 @@ package helpers is
     function byte_reverse(val: std_ulogic_vector(63 downto 0); size: integer) return std_ulogic_vector;
 
     function sign_extend(val: std_ulogic_vector(63 downto 0); size: natural) return std_ulogic_vector;
+
+    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector;
+    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
+    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector;
 end package helpers;
 
 package body helpers is
@@ -206,4 +210,53 @@ package body helpers is
         return std_ulogic_vector(ret);
 
     end;
+
+    -- Reverse the order of bits in a vector of any width (expects a "downto" range)
+    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(a'left downto a'right);
+    begin
+        for i in a'right to a'left loop
+            ret(a'left + a'right - i) := a(i);
+        end loop;
+        return ret;
+    end;
+
+    -- If there is only one bit set in a doubleword, return its bit number
+    -- (counting from the right).  Each bit of the result is obtained by
+    -- ORing together 32 bits of the input:
+    --  bit 0 = a[1] or a[3] or a[5] or ...
+    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
+    --  bit 2 = a[4..7] or a[12..15] or ...
+    --  bit 5 = a[32..63] ORed together
+    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(5 downto 0);
+        variable stride: natural;
+        variable bit: std_ulogic;
+        variable k: natural;
+    begin
+        stride := 2;
+        for i in 0 to 5 loop
+            bit := '0';
+            for j in 0 to (64 / stride) - 1 loop
+                k := j * stride;
+                bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
+            end loop;
+            ret(i) := bit;
+            stride := stride * 2;
+        end loop;
+        return ret;
+    end;
+
+    -- Count leading zeroes of val (at most 64 bits wide); the result is a 6-bit count.
+    -- Assumes the value passed in is not zero (if it is, zero is returned)
+    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is
+        variable rev: std_ulogic_vector(val'left downto val'right);
+        variable sum: std_ulogic_vector(val'left downto val'right);
+        variable onehot: std_ulogic_vector(val'left downto val'right);
+    begin
+        rev := bit_reverse(val);
+        sum := std_ulogic_vector(- signed(rev));
+        onehot := sum and rev;
+        return bit_number(std_ulogic_vector(resize(unsigned(onehot), 64)));
+    end;
 end package body helpers;
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index ec20319..919ba0e 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -45,10 +45,12 @@ architecture behave of loadstore1 is
 
     -- State machine for unaligned loads/stores
     type state_t is (IDLE,              -- ready for instruction
+                     FPR_CONV,          -- converting double to float for store
                      SECOND_REQ,        -- send 2nd request of unaligned xfer
                      ACK_WAIT,          -- waiting for ack from dcache
                      MMU_LOOKUP,        -- waiting for MMU to look up translation
                      TLBIE_WAIT,        -- waiting for MMU to finish doing a tlbie
+                     FINISH_LFS,        -- write back converted SP data for lfs*
                      COMPLETE           -- extra cycle to complete an operation
                      );
 
@@ -89,6 +91,11 @@ architecture behave of loadstore1 is
         do_update    : std_ulogic;
         extra_cycle  : std_ulogic;
         mode_32bit   : std_ulogic;
+        load_sp      : std_ulogic;
+        ld_sp_data   : std_ulogic_vector(31 downto 0);
+        ld_sp_nz     : std_ulogic;
+        ld_sp_lz     : std_ulogic_vector(5 downto 0);
+        st_sp_data   : std_ulogic_vector(31 downto 0);
     end record;
 
     type byte_sel_t is array(0 to 7) of std_ulogic;
@@ -98,6 +105,9 @@ architecture behave of loadstore1 is
     signal r, rin : reg_stage_t;
     signal lsu_sum : std_ulogic_vector(63 downto 0);
 
+    signal store_sp_data : std_ulogic_vector(31 downto 0);
+    signal load_dp_data  : std_ulogic_vector(63 downto 0);
+
     -- Generate byte enables from sizes
     function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
     begin
@@ -128,6 +138,72 @@ architecture behave of loadstore1 is
 					    to_integer(unsigned(address))));
     end function xfer_data_sel;
 
+    -- 23-bit right shifter for DP -> SP float conversions
+    function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+        return std_ulogic_vector is
+        variable fs1   : std_ulogic_vector(22 downto 0);
+        variable fs2   : std_ulogic_vector(22 downto 0);
+    begin
+        case shift(1 downto 0) is
+            when "00" =>
+                fs1 := frac;
+            when "01" =>
+                fs1 := '0' & frac(22 downto 1);
+            when "10" =>
+                fs1 := "00" & frac(22 downto 2);
+            when others =>
+                fs1 := "000" & frac(22 downto 3);
+        end case;
+        case shift(4 downto 2) is
+            when "000" =>
+                fs2 := fs1;
+            when "001" =>
+                fs2 := x"0" & fs1(22 downto 4);
+            when "010" =>
+                fs2 := x"00" & fs1(22 downto 8);
+            when "011" =>
+                fs2 := x"000" & fs1(22 downto 12);
+            when "100" =>
+                fs2 := x"0000" & fs1(22 downto 16);
+            when others =>
+                fs2 := x"00000" & fs1(22 downto 20);
+        end case;
+        return fs2;
+    end;
+
+    -- 23-bit left shifter for SP -> DP float conversions
+    function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+        return std_ulogic_vector is
+        variable fs1   : std_ulogic_vector(22 downto 0);
+        variable fs2   : std_ulogic_vector(22 downto 0);
+    begin
+        case shift(1 downto 0) is
+            when "00" =>
+                fs1 := frac;
+            when "01" =>
+                fs1 := frac(21 downto 0) & '0';
+            when "10" =>
+                fs1 := frac(20 downto 0) & "00";
+            when others =>
+                fs1 := frac(19 downto 0) & "000";
+        end case;
+        case shift(4 downto 2) is
+            when "000" =>
+                fs2 := fs1;
+            when "001" =>
+                fs2 := fs1(18 downto 0) & x"0" ;
+            when "010" =>
+                fs2 := fs1(14 downto 0) & x"00";
+            when "011" =>
+                fs2 := fs1(10 downto 0) & x"000";
+            when "100" =>
+                fs2 := fs1(6 downto 0) & x"0000";
+            when others =>
+                fs2 := fs1(2 downto 0) & x"00000";
+        end case;
+        return fs2;
+    end;
+
 begin
     -- Calculate the address in the first cycle
     lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
@@ -145,6 +221,59 @@ begin
         end if;
     end process;
 
+    ls_fp_conv: if HAS_FPU generate
+        -- Convert DP data to SP for stfs
+        dp_to_sp: process(all)
+            variable exp   : unsigned(10 downto 0);
+            variable frac  : std_ulogic_vector(22 downto 0);
+            variable shift : unsigned(4 downto 0);
+        begin
+            store_sp_data(31) <= l_in.data(63);
+            store_sp_data(30 downto 0) <= (others => '0');
+            exp := unsigned(l_in.data(62 downto 52));
+            if exp > 896 then
+                store_sp_data(30) <= l_in.data(62);
+                store_sp_data(29 downto 0) <= l_in.data(58 downto 29);
+            elsif exp >= 874 then
+                -- denormalization required
+                frac := '1' & l_in.data(51 downto 30);
+                shift := 0 - exp(4 downto 0);
+                store_sp_data(22 downto 0) <= shifter_23r(frac, shift);
+            end if;
+        end process;
+
+        -- Convert SP data to DP for lfs
+        sp_to_dp: process(all)
+            variable exp     : unsigned(7 downto 0);
+            variable exp_dp  : unsigned(10 downto 0);
+            variable exp_nz  : std_ulogic;
+            variable exp_ao  : std_ulogic;
+            variable frac    : std_ulogic_vector(22 downto 0);
+            variable frac_shift : unsigned(4 downto 0);
+        begin
+            frac := r.ld_sp_data(22 downto 0);
+            exp := unsigned(r.ld_sp_data(30 downto 23));
+            exp_nz := or (r.ld_sp_data(30 downto 23));
+            exp_ao := and (r.ld_sp_data(30 downto 23));
+            frac_shift := (others => '0');
+            if exp_ao = '1' then
+                exp_dp := to_unsigned(2047, 11);    -- infinity or NaN
+            elsif exp_nz = '1' then
+                exp_dp := 896 + resize(exp, 11);    -- finite normalized value
+            elsif r.ld_sp_nz = '0' then
+                exp_dp := to_unsigned(0, 11);       -- zero
+            else
+                -- denormalized SP operand, need to normalize
+                exp_dp := 896 - resize(unsigned(r.ld_sp_lz), 11);
+                frac_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1;
+            end if;
+            load_dp_data(63) <= r.ld_sp_data(31);
+            load_dp_data(62 downto 52) <= std_ulogic_vector(exp_dp);
+            load_dp_data(51 downto 29) <= shifter_23l(frac, frac_shift);
+            load_dp_data(28 downto 0) <= (others => '0');
+        end process;
+    end generate;
+
     loadstore1_1: process(all)
         variable v : reg_stage_t;
         variable brev_lenm1 : unsigned(2 downto 0);
@@ -165,6 +294,9 @@ begin
         variable data_permuted : std_ulogic_vector(63 downto 0);
         variable data_trimmed : std_ulogic_vector(63 downto 0);
         variable store_data : std_ulogic_vector(63 downto 0);
+        variable data_in : std_ulogic_vector(63 downto 0);
+        variable byte_rev : std_ulogic;
+        variable length : std_ulogic_vector(3 downto 0);
         variable use_second : byte_sel_t;
         variable trim_ctl : trim_ctl_t;
         variable negative : std_ulogic;
@@ -176,6 +308,8 @@ begin
         variable mmu_mtspr : std_ulogic;
         variable itlb_fault : std_ulogic;
         variable misaligned : std_ulogic;
+        variable fp_reg_conv : std_ulogic;
+        variable lfs_done : std_ulogic;
     begin
         v := r;
         req := '0';
@@ -185,8 +319,10 @@ begin
         sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
         dsisr := (others => '0');
         mmureq := '0';
+        fp_reg_conv := '0';
 
         write_enable := '0';
+        lfs_done := '0';
 
         do_update := r.do_update;
         v.do_update := '0';
@@ -245,19 +381,38 @@ begin
             end case;
         end loop;
 
-        -- Byte reversing and rotating for stores
-        -- Done in the first cycle (when l_in.valid = 1)
+        if HAS_FPU then
+            -- Single-precision FP conversion
+            v.st_sp_data := store_sp_data;
+            v.ld_sp_data := data_trimmed(31 downto 0);
+            v.ld_sp_nz := or (data_trimmed(22 downto 0));
+            v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
+        end if;
+
+        -- Byte reversing and rotating for stores.
+        -- Done in the first cycle (when l_in.valid = 1) for integer stores
+        -- and DP float stores, and in the second cycle for SP float stores.
         store_data := r.store_data;
-        if l_in.valid = '1' then
-            byte_offset := unsigned(lsu_sum(2 downto 0));
+        if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then
+            if HAS_FPU and r.state = FPR_CONV then
+                data_in := x"00000000" & r.st_sp_data;
+                byte_offset := unsigned(r.addr(2 downto 0));
+                byte_rev := r.byte_reverse;
+                length := r.length;
+            else
+                data_in := l_in.data;
+                byte_offset := unsigned(lsu_sum(2 downto 0));
+                byte_rev := l_in.byte_reverse;
+                length := l_in.length;
+            end if;
             brev_lenm1 := "000";
-            if l_in.byte_reverse = '1' then
-                brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
+            if byte_rev = '1' then
+                brev_lenm1 := unsigned(length(2 downto 0)) - 1;
             end if;
             for i in 0 to 7 loop
                 k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1;
                 j := to_integer(k) * 8;
-                store_data(i * 8 + 7 downto i * 8) := l_in.data(j + 7 downto j);
+                store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j);
             end loop;
         end if;
         v.store_data := store_data;
@@ -292,6 +447,14 @@ begin
         case r.state is
         when IDLE =>
 
+        when FPR_CONV =>
+            req := '1';
+            if r.second_bytes /= "00000000" then
+                v.state := SECOND_REQ;
+            else
+                v.state := ACK_WAIT;
+            end if;
+
         when SECOND_REQ =>
             req := '1';
             v.state := ACK_WAIT;
@@ -323,8 +486,13 @@ begin
                         v.load_data := data_permuted;
                     end if;
                 else
-                    write_enable := r.load;
-                    if r.extra_cycle = '1' then
+                    write_enable := r.load and not r.load_sp;
+                    if HAS_FPU and r.load_sp = '1' then
+                        -- SP to DP conversion takes a cycle
+                        -- Write back rA update in this cycle if needed
+                        do_update := r.update;
+                        v.state := FINISH_LFS;
+                    elsif r.extra_cycle = '1' then
                         -- loads with rA update need an extra cycle
                         v.state := COMPLETE;
                         v.do_update := r.update;
@@ -362,6 +530,9 @@ begin
 
         when TLBIE_WAIT =>
 
+        when FINISH_LFS =>
+            lfs_done := '1';
+
         when COMPLETE =>
             exception := r.align_intr;
 
@@ -395,6 +566,7 @@ begin
             v.nc := l_in.ci;
             v.virt_mode := l_in.virt_mode;
             v.priv_mode := l_in.priv_mode;
+            v.load_sp := '0';
             v.wait_dcache := '0';
             v.wait_mmu := '0';
             v.do_update := '0';
@@ -436,14 +608,24 @@ begin
                     v.dcbz := '1';
                 when OP_FPSTORE =>
                     if HAS_FPU then
-                        req := '1';
+                        if l_in.is_32bit = '1' then
+                            v.state := FPR_CONV;
+                            fp_reg_conv := '1';
+                        else
+                            req := '1';
+                        end if;
                     end if;
                 when OP_FPLOAD =>
                     if HAS_FPU then
                         v.load := '1';
                         req := '1';
-                        -- Allow an extra cycle for RA update
+                        -- Allow an extra cycle for SP->DP precision conversion
+                        -- or RA update
                         v.extra_cycle := l_in.update;
+                        if l_in.is_32bit = '1' then
+                            v.load_sp := '1';
+                            v.extra_cycle := '1';
+                        end if;
                     end if;
                 when OP_TLBIE =>
                     mmureq := '1';
@@ -500,7 +682,7 @@ begin
                 end if;
             end if;
 
-            v.busy := req or mmureq or mmu_mtspr;
+            v.busy := req or mmureq or mmu_mtspr or fp_reg_conv;
         end if;
 
         -- Update outputs to dcache
@@ -539,6 +721,10 @@ begin
             l_out.write_enable <= '1';
             l_out.write_reg <= gpr_to_gspr(r.update_reg);
             l_out.write_data <= r.addr;
+        elsif lfs_done = '1' then
+            l_out.write_enable <= '1';
+            l_out.write_reg <= r.write_reg;
+            l_out.write_data <= load_dp_data;
         else
             l_out.write_enable <= write_enable;
             l_out.write_reg <= r.write_reg;

From 76ec1a2f0aba7863d5704cf56f9bf07e1435cdaf Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 2 Jul 2020 19:55:30 +1000
Subject: [PATCH 04/30] tests/fpu: Add tests for lfs and stfs instructions

This exercises the single-to-double and double-to-single conversions,
including denormalized cases.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 tests/fpu/fpu.c            |  69 ++++++++++++++++++++++++++++++++++++-
 tests/test_fpu.bin         | Bin 8208 -> 8384 bytes
 tests/test_fpu.console_out |   1 +
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index d61b36e..86636b6 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -163,6 +163,72 @@ int fpu_test_2(void)
 	return 0;
 }
 
+struct sp_dp_equiv {
+	unsigned int sp;
+	unsigned long dp;
+} sp_dp_equiv[] = {
+	{ 0, 0 },
+	{ 0x80000000, 0x8000000000000000 },
+	{ 0x7f800000, 0x7ff0000000000000 },
+	{ 0xff800000, 0xfff0000000000000 },
+	{ 0x7f812345, 0x7ff02468a0000000 },
+	{ 0x456789ab, 0x40acf13560000000 },
+	{ 0x12345678, 0x3a468acf00000000 },
+	{ 0x00400000, 0x3800000000000000 },
+	{ 0x00200000, 0x37f0000000000000 },
+	{ 0x00000002, 0x36b0000000000000 },
+	{ 0x00000001, 0x36a0000000000000 },
+};
+
+int sp_to_dp(long arg)
+{
+	unsigned long dp;
+
+	__asm__("lfs 20,0(%0); stfd 20,0(%1)"
+		: : "b" (&sp_dp_equiv[arg].sp), "b" (&dp) : "memory");
+	if (dp != sp_dp_equiv[arg].dp) {
+		print_hex(sp_dp_equiv[arg].sp, 8);
+		print_string(" ");
+		print_hex(dp, 16);
+		print_string(" ");
+		print_hex(sp_dp_equiv[arg].dp, 16);
+		print_string(" ");
+	}
+	return dp != sp_dp_equiv[arg].dp;
+}
+
+int dp_to_sp(long arg)
+{
+	unsigned int sp;
+
+	__asm__("lfd 21,0(%0); stfs 21,0(%1)"
+		: : "b" (&sp_dp_equiv[arg].dp), "b" (&sp) : "memory");
+	return sp != sp_dp_equiv[arg].sp;
+}
+
+int fpu_test_3(void)
+{
+	int i, n, ret;
+
+	n = sizeof(sp_dp_equiv) / sizeof(sp_dp_equiv[0]);
+	enable_fp();
+	for (i = 0; i < n; ++i) {
+		ret = trapit(i, sp_to_dp);
+		if (ret != 0) {
+			if (ret == 1)
+				ret += i;
+			return ret;
+		}
+		ret = trapit(i, dp_to_sp);
+		if (ret != 0) {
+			if (ret == 1)
+				ret += i + 0x10000;
+			return ret;
+		}
+	}
+	return 0;
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -176,7 +242,7 @@ void do_test(int num, int (*test)(void))
 	} else {
 		fail = 1;
 		print_string("FAIL ");
-		print_hex(ret, 4);
+		print_hex(ret, 5);
 		print_string(" SRR0=");
 		print_hex(mfspr(SRR0), 16);
 		print_string(" SRR1=");
@@ -191,6 +257,7 @@ int main(void)
 
 	do_test(1, fpu_test_1);
 	do_test(2, fpu_test_2);
+	do_test(3, fpu_test_3);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 885368a70cc97cc929c62934658553ed4b131143..fb2de320a0017dc482942a6d0b32d6b5b480a309 100755
GIT binary patch
delta 1747
zcmZuxZERCz6h8NMbQR{jZAa<4QMwNnHYYA@%)1p?tBlM}$3h6vAI8px2tyN8esrVk
z-b?lah#^~ELuN8yAd;y>x5y7;g4r-J42>T|jC_C1WX;A>P_bCx^?BP{gv67a+;g7y
zd7pdEdEa~c0%O6&M1g0CLLJ}chPv?FKqa9LqA;+{z%~Qhyg~0@Tr27Quhhy{T%&t_
zcw(7ilr|8>&(Cn)N?;3DjOQ9i7x=_?M5oUG^vD|9qj*XM#Rkf3o30V<K0jbmGIA7l
z8|;+gVLn%$I$05kDmv&?)=&_t+MeJNls`U2G<<rgyEB&HB}Oz9iPcdg)^j$pPNK+0
zAL(uEH)X3dhrMJklH;?i)9#fgX4!7Lx9YBqTB4~*O`@>6Z#GkQSJ9?e9_<L`P-Vw2
z4{Bkj?e(rrppWvBB<ExPoA)zx#b92COIpa5I~K`xHn!eTA)R644kzn#)JXk^YQ3-C
z$WElJT1e+oB)rO$MtctDGOaR~TE^4ThnhIw(L^-nPi#HzpWHI&|L7iXjZPodqJMm&
z{XUbCI+lAG(<-YjhMYteAsdiLQ;+3FVgV{q6IOI~y*IsQ)Z8QZZj*n3r}--|BK3RD
zi&`bR<ex_?mkeNGEK?f}kR&~a)jOA32cTzISf{g14p`Y&POogYuyf8TZG28<CgQXw
zD|&D~*Zc*y4ro3q!PyL+Alh}#Ml0ez3dX6uGS-3umL{|q=ZBhXq^mo-kNbmoUv&CI
zP0Z9s9a9SMr%dc3FdN0vSfPnc04un~@s5tNsdjX0qo#P&Q@Ne<O*P63k=QB}C8~#s
z26~(>mRFG1-q#&b6_S!aW6*Cw<JZ)Etp-DzOMH<SoFu2Wh1*eI4F)5y8&U>q#1*+}
z#x;25WDd2k8K;}+1$Ej0gvHb$K9%HGr;k`{v6DD8iF_CWg*ktH#Sl1Tl=hDTpMwUY
zsq5Me#A0n2Yabv!B6r6`^Jh$a1h|pLx25spCjKSxzW)x1ZFAZ~qOq%J?2cI^ZV^s5
ztZsR`S6v9M$Yta&zSFBdZsr!`JKyb9@1+M{qKaB&gZ0d>m|L9wnkj5E*k|*59!~3B
zhomq&>-s#u*tn*WFVx|qP#1W67c{Q68Kv=~ITZV34Q&^(e8Sd+FR9^S%tC>Nya!ky
z%~Kx9nakbg%2ZHZO;ouHx9Dk69NyCHUN8CBguAx*3O7<y{9r1<$5WVkN=!g}?!c#+
zQrNQg%AcH1Vw*4p4k;S>i}Q8?zyub6&j9n`DNJDN;7MS`0)UmnH;Ujk+gn&_?L&0z
zHv77;qERT03ICh(VLa*i$AT0b%-LtzFCUQX!Jc4#VC~#|3SPc+@q#rtPbekeYLd-D
z$t@ARoMbLfX@gK&h4e7~E+dHlN2&d_d}_`JWop_AemM(z%B+KkqIX!Q$6GLl(EShR
zTZBk+z7IvLVS}D>X&oE)IOm_lQm<HoErAE%DvNubta>xWxj=*|a9udCY6uDw*!%E-
zyI-s<dQY|{o1`=MIXfsV&~(6A)3FP`rmQ6p>pYvGd){1TWY>o_mM64NcfHYUu7{W3
pJQ|3uuXAMiKki=Ln5i*~5H6|jK?1#JWb!_g(@4EpuL%y-{|jszH2nYo

delta 1074
zcmY+CUr1AN6vxlKnp08R=9=xV^XjD4Ak&b$vbsvenq~!A^tL&Q!iwr?)@(OXu!z_=
zJxJ(F4<o{esE3FiEF$^?l2I`lL6#3nwe+D{&ir+L+lo4H_?`1P-+O-N_q)IG)~MnH
zP&NVh0*^JmURn>T4qpJkPppSn53!zu5OX#O5IfK$$}4VNnpoZ#RV5F=Z1OW>?Lcbo
zQMJtjS^o4nz_a9w|7v(iYnHgAm?WuAH3D!UIh>c|+JS#q#b;VGuGbwFx^Pry5j|h=
zp3W|szT#V*ef59_yrIl=L;%11C6(K`*K#CW2K}vt;0nC@+YTjtv+-LIcyF<qG-Kl%
z=a+KwsEWt+M&TF^>Z`^3MHtuDi_R2I=-Y%T)KxawZ+!W4qFOiatAU_@YrZdy*In6&
zEe1#VlV#w2AcB`4d>SW>793kqj28`c(g~_GFFuWI*0{P#pu}y3bIN{jdG2Ux`xY77
zxJSg7hSelD?0gXl%32W8lQg7__IQ(7iZnn(E(KP7M-vRs3%Lz;&qcxw5R^H4muDO3
z4kq()1JuZmu-v#SlAw+XWz=;h&89xZ3yQ-bI=TQ3`d{E@th?nsX;f8;1&9}dDu*&3
zBcl}a+6>ipP#hOehqY82%=0<o>$3bzmLJRWH1TNecBnmI(r9X&%}>Qxe+&Opj*|m2
zKON6S$`>gwr`$nv@|pToY_6&=9mzf&@DHhYsOs#$X{zd~up475Z&r+`A7r5S=>T6Z
z={ijtwhyV&>{ua$ukV6B9*Yl)d+9BB{$&NAc*_1VC9Bi^m*a$~)0mr}&JEx?4{&}h
zAC7iuH1ETx*^T37118Mg&DXv&Hcd5dT5{4u=m%pCTDftikPi^E(#oxld@uRMA9%_l
z6@@8O(s;#EU&<?E<Q+e8fhw;kJo*zQt5m`(AOPIOK5K2!DvHV$FmAOQIw%}lV9d`2
z+bSHo*0b%Hu-3T}1cx{z`!iWbhOvPhf>pFC<oD5OTO-Wj9$TfUklv)58gk>`Tqd{v
YVk~s~oNZ9N9knT9X?lzBdWmtxZwLKkHUIzs

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 0c39ae3..623335d 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -1,2 +1,3 @@
 test 01:PASS
 test 02:PASS
+test 03:PASS

From 856e9e955f0e5ddcd64c6d328f279e12a5973574 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 28 Aug 2020 20:01:00 +1000
Subject: [PATCH 05/30] core: Add framework for an FPU

This adds the skeleton of a floating-point unit and implements the
mffs and mtfsf instructions.

Execute1 sends FP instructions to the FPU and receives busy,
exception, FP interrupt and illegal interrupt signals from it.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 Makefile                  |   2 +-
 common.vhdl               |  69 ++++++++++++++
 core.vhdl                 |  34 +++++++
 decode1.vhdl              |  18 ++++
 decode2.vhdl              |  11 ++-
 decode_types.vhdl         |   9 +-
 execute1.vhdl             |  82 +++++++++++++----
 fpu.vhdl                  | 185 ++++++++++++++++++++++++++++++++++++++
 microwatt.core            |   1 +
 scripts/fmt_log/fmt_log.c |  12 +--
 writeback.vhdl            |  27 +++++-
 11 files changed, 417 insertions(+), 33 deletions(-)
 create mode 100644 fpu.vhdl

diff --git a/Makefile b/Makefile
index b584895..9fe2106 100644
--- a/Makefile
+++ b/Makefile
@@ -48,7 +48,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
 	cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
 	logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
 	loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
-	core.vhdl
+	core.vhdl fpu.vhdl
 
 soc_files = $(core_files) wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
 	wishbone_debug_master.vhdl xics.vhdl syscon.vhdl soc.vhdl \
diff --git a/common.vhdl b/common.vhdl
index e1ba844..f91ac18 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -94,6 +94,38 @@ package common is
     end record;
     constant xerc_init : xer_common_t := (others => '0');
 
+    -- FPSCR bit numbers
+    constant FPSCR_FX     : integer := 63 - 32;
+    constant FPSCR_FEX    : integer := 63 - 33;
+    constant FPSCR_VX     : integer := 63 - 34;
+    constant FPSCR_OX     : integer := 63 - 35;
+    constant FPSCR_UX     : integer := 63 - 36;
+    constant FPSCR_ZX     : integer := 63 - 37;
+    constant FPSCR_XX     : integer := 63 - 38;
+    constant FPSCR_VXSNAN : integer := 63 - 39;
+    constant FPSCR_VXISI  : integer := 63 - 40;
+    constant FPSCR_VXIDI  : integer := 63 - 41;
+    constant FPSCR_VXZDZ  : integer := 63 - 42;
+    constant FPSCR_VXIMZ  : integer := 63 - 43;
+    constant FPSCR_VXVC   : integer := 63 - 44;
+    constant FPSCR_FR     : integer := 63 - 45;
+    constant FPSCR_FI     : integer := 63 - 46;
+    constant FPSCR_C      : integer := 63 - 47;
+    constant FPSCR_FL     : integer := 63 - 48;
+    constant FPSCR_FG     : integer := 63 - 49;
+    constant FPSCR_FE     : integer := 63 - 50;
+    constant FPSCR_FU     : integer := 63 - 51;
+    constant FPSCR_VXSOFT : integer := 63 - 53;
+    constant FPSCR_VXSQRT : integer := 63 - 54;
+    constant FPSCR_VXCVI  : integer := 63 - 55;
+    constant FPSCR_VE     : integer := 63 - 56;
+    constant FPSCR_OE     : integer := 63 - 57;
+    constant FPSCR_UE     : integer := 63 - 58;
+    constant FPSCR_ZE     : integer := 63 - 59;
+    constant FPSCR_XE     : integer := 63 - 60;
+    constant FPSCR_NI     : integer := 63 - 61;
+    constant FPSCR_RN     : integer := 63 - 63;
+
     type irq_state_t is (WRITE_SRR0, WRITE_SRR1);
 
     -- For now, fixed 16 sources, make this either a parametric
@@ -413,6 +445,43 @@ package common is
                                    write_cr_data => (others => '0'), write_reg => (others => '0'),
                                    exc_write_reg => (others => '0'), exc_write_data => (others => '0'));
 
+    type Execute1ToFPUType is record
+        valid   : std_ulogic;
+        op      : insn_type_t;
+        nia     : std_ulogic_vector(63 downto 0);
+        insn    : std_ulogic_vector(31 downto 0);
+        single  : std_ulogic;
+        fe_mode : std_ulogic_vector(1 downto 0);
+        fra     : std_ulogic_vector(63 downto 0);
+        frb     : std_ulogic_vector(63 downto 0);
+        frc     : std_ulogic_vector(63 downto 0);
+        frt     : gspr_index_t;
+        rc      : std_ulogic;
+        out_cr  : std_ulogic;
+    end record;
+    constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'),
+                                                       insn  => (others => '0'), fe_mode => "00", rc => '0',
+                                                       fra => (others => '0'), frb => (others => '0'),
+                                                       frc => (others => '0'), frt => (others => '0'),
+                                                       single => '0', out_cr => '0');
+
+    type FPUToExecute1Type is record
+        busy      : std_ulogic;
+        exception : std_ulogic;
+        interrupt : std_ulogic;
+        illegal   : std_ulogic;
+    end record;
+
+    type FPUToWritebackType is record
+        valid           : std_ulogic;
+        write_enable    : std_ulogic;
+        write_reg       : gspr_index_t;
+        write_data      : std_ulogic_vector(63 downto 0);
+        write_cr_enable : std_ulogic;
+        write_cr_mask   : std_ulogic_vector(7 downto 0);
+        write_cr_data   : std_ulogic_vector(31 downto 0);
+    end record;
+
     type DividerToExecute1Type is record
 	valid: std_ulogic;
 	write_reg_data: std_ulogic_vector(63 downto 0);
diff --git a/core.vhdl b/core.vhdl
index 81e11c8..b905297 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -80,6 +80,11 @@ architecture behave of core is
     signal mmu_to_dcache: MmuToDcacheType;
     signal dcache_to_mmu: DcacheToMmuType;
 
+    -- FPU signals
+    signal execute1_to_fpu: Execute1ToFPUType;
+    signal fpu_to_execute1: FPUToExecute1Type;
+    signal fpu_to_writeback: FPUToWritebackType;
+
     -- local signals
     signal fetch1_stall_in : std_ulogic;
     signal icache_stall_out : std_ulogic;
@@ -109,6 +114,7 @@ architecture behave of core is
     signal rst_dec1    : std_ulogic := '1';
     signal rst_dec2    : std_ulogic := '1';
     signal rst_ex1     : std_ulogic := '1';
+    signal rst_fpu     : std_ulogic := '1';
     signal rst_ls1     : std_ulogic := '1';
     signal rst_dbg     : std_ulogic := '1';
     signal alt_reset_d : std_ulogic;
@@ -171,6 +177,7 @@ begin
             rst_dec1    <= core_rst;
             rst_dec2    <= core_rst;
             rst_ex1     <= core_rst;
+            rst_fpu     <= core_rst;
             rst_ls1     <= core_rst;
             rst_dbg     <= rst;
             alt_reset_d <= alt_reset;
@@ -225,6 +232,7 @@ begin
 
     decode1_0: entity work.decode1
         generic map(
+            HAS_FPU => HAS_FPU,
             LOG_LENGTH => LOG_LENGTH
             )
         port map (
@@ -313,9 +321,11 @@ begin
 	    busy_out => ex1_busy_out,
             e_in => decode2_to_execute1,
             l_in => loadstore1_to_execute1,
+            fp_in => fpu_to_execute1,
             ext_irq_in => ext_irq,
             l_out => execute1_to_loadstore1,
             f_out => execute1_to_fetch1,
+            fp_out => execute1_to_fpu,
             e_out => execute1_to_writeback,
 	    icache_inval => ex1_icache_inval,
             dbg_msr_out => msr,
@@ -326,6 +336,29 @@ begin
             log_wr_addr => log_wr_addr
             );
 
+    with_fpu: if HAS_FPU generate  -- instantiate the FPU only when the HAS_FPU generic is true
+    begin
+        fpu_0: entity work.fpu
+            port map (
+                clk => clk,
+                rst => rst_fpu,  -- registered copy of core_rst
+                e_in => execute1_to_fpu,  -- instruction and operands from execute1
+                e_out => fpu_to_execute1,  -- busy / exception / interrupt / illegal status
+                w_out => fpu_to_writeback  -- FPR and CR results
+                );
+    end generate;
+
+    no_fpu: if not HAS_FPU generate  -- without an FPU, tie its status and enable outputs inactive
+    begin
+        fpu_to_execute1.busy <= '0';
+        fpu_to_execute1.exception <= '0';
+        fpu_to_execute1.interrupt <= '0';
+        fpu_to_execute1.illegal <= '0';
+        fpu_to_writeback.valid <= '0';
+        fpu_to_writeback.write_enable <= '0';
+        fpu_to_writeback.write_cr_enable <= '0';  -- NOTE(review): write_reg, write_data, write_cr_mask, write_cr_data are left undriven here
+    end generate;
+
     loadstore1_0: entity work.loadstore1
         generic map (
             HAS_FPU => HAS_FPU,
@@ -381,6 +414,7 @@ begin
             clk => clk,
             e_in => execute1_to_writeback,
             l_in => loadstore1_to_writeback,
+            fp_in => fpu_to_writeback,
             w_out => writeback_to_register_file,
             c_out => writeback_to_cr_file,
             complete_out => complete
diff --git a/decode1.vhdl b/decode1.vhdl
index 29f0e50..afd37ef 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -8,6 +8,7 @@ use work.decode_types.all;
 
 entity decode1 is
     generic (
+        HAS_FPU : boolean := true;
         -- Non-zero to enable log data collection
         LOG_LENGTH : natural := 0
         );
@@ -55,6 +56,7 @@ architecture behaviour of decode1 is
     type op_30_subop_array_t is array(0 to 15) of decode_rom_t;
     type op_31_subop_array_t is array(0 to 1023) of decode_rom_t;
     type minor_rom_array_2_t is array(0 to 3) of decode_rom_t;
+    type op_63_subop_array_0_t is array(0 to 511) of decode_rom_t;
 
     constant major_decode_rom_array : major_rom_array_t := (
         --          unit     internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
@@ -416,6 +418,15 @@ architecture behaviour of decode1 is
         others   => decode_rom_init
         );
 
+    -- indexed by bits 4..1 and 10..6 of instruction word
+    constant decode_op_63l_array : op_63_subop_array_0_t := (
+        --                unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                             op                               in   out   A   out  in    out  len        ext                                pipe
+        2#011110010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 18/7=mffs family
+        2#011110110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 22/7=mtfsf
+        others => illegal_inst
+        );
+
     --                                        unit   internal         in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
     --                                                     op                                              in   out   A   out  in    out  len        ext                                 pipe
     constant nop_instr      : decode_rom_t := (ALU,  OP_NOP,          NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
@@ -569,6 +580,13 @@ begin
         when 62 =>
             v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0))));
 
+        when 63 =>
+            if HAS_FPU then
+                -- floating point operations, general and double-precision
+                v.decode := decode_op_63l_array(to_integer(unsigned(f_in.insn(4 downto 1) & f_in.insn(10 downto 6))));
+                vi.override := f_in.insn(5);
+            end if;
+
         when others =>
         end case;
 
diff --git a/decode2.vhdl b/decode2.vhdl
index 6cc74c7..8b2ab8c 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -93,6 +93,12 @@ architecture behaviour of decode2 is
         case t is
             when RB =>
                 ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data);
+            when FRB =>
+                if HAS_FPU then
+                    ret := ('1', fpr_to_gspr(insn_frb(insn_in)), reg_data);
+                else
+                    ret := ('0', (others => '0'), (others => '0'));
+                end if;
             when CONST_UI =>
                 ret := ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64)));
             when CONST_SI =>
@@ -296,6 +302,7 @@ begin
     r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR
                        else gpr_to_gspr(insn_ra(d_in.insn));
     r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR
+                       else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU
                        else gpr_to_gspr(insn_rb(d_in.insn));
     r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR
                        else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU
@@ -321,7 +328,7 @@ begin
         mul_b := (others => '0');
 
         --v.e.input_cr := d_in.decode.input_cr;
-        --v.e.output_cr := d_in.decode.output_cr;
+        v.e.output_cr := d_in.decode.output_cr;
         
         decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1,
                                              d_in.nia);
@@ -412,7 +419,7 @@ begin
 
         cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
         cr_bypass_avail <= '0';
-        if EX1_BYPASS then
+        if EX1_BYPASS and d_in.decode.unit = ALU then
             cr_bypass_avail <= d_in.decode.output_cr;
         end if;
 
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 8c20441..5eaef50 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -7,8 +7,9 @@ package decode_types is
 			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
 			 OP_CNTZ, OP_CROP,
 			 OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
-			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS,
-			 OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
+			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI,
+                         OP_FPOP, OP_FPOP_I,
+                         OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
 			 OP_LOAD, OP_STORE,
                          OP_FPLOAD, OP_FPSTORE,
 			 OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD,
@@ -24,7 +25,7 @@ package decode_types is
 			 );
     type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA);
     type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
-                           CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR);
+                           CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB);
     type input_reg_c_t is (NONE, RS, RCR, FRS);
     type output_reg_a_t is (NONE, RT, RA, SPR, FRT);
     type rc_t is (NONE, ONE, RC);
@@ -48,7 +49,7 @@ package decode_types is
 
     constant TOO_OFFSET : integer := 0;
 
-    type unit_t is (NONE, ALU, LDST);
+    type unit_t is (NONE, ALU, LDST, FPU);
     type length_t is (NONE, is1B, is2B, is4B, is8B);
 
     type decode_rom_t is record
diff --git a/execute1.vhdl b/execute1.vhdl
index 9d9b711..29713b2 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -27,12 +27,14 @@ entity execute1 is
 
 	e_in  : in Decode2ToExecute1Type;
         l_in  : in Loadstore1ToExecute1Type;
+        fp_in : in FPUToExecute1Type;
 
 	ext_irq_in : std_ulogic;
 
 	-- asynchronous
         l_out : out Execute1ToLoadstore1Type;
 	f_out : out Execute1ToFetch1Type;
+        fp_out : out Execute1ToFPUType;
 
 	e_out : out Execute1ToWritebackType;
 
@@ -54,6 +56,7 @@ architecture behaviour of execute1 is
         f : Execute1ToFetch1Type;
         busy: std_ulogic;
         terminate: std_ulogic;
+        fp_exception_next : std_ulogic;
         trace_next : std_ulogic;
         prev_op : insn_type_t;
 	lr_update : std_ulogic;
@@ -72,7 +75,8 @@ architecture behaviour of execute1 is
     end record;
     constant reg_type_init : reg_type :=
         (e => Execute1ToWritebackInit, f => Execute1ToFetch1Init,
-         busy => '0', lr_update => '0', terminate => '0', trace_next => '0', prev_op => OP_ILLEGAL,
+         busy => '0', lr_update => '0', terminate => '0',
+         fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
          mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
          slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
          next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0'));
@@ -268,7 +272,7 @@ begin
     b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2;
     c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3;
 
-    busy_out <= l_in.busy or r.busy;
+    busy_out <= l_in.busy or r.busy or fp_in.busy;
     valid_in <= e_in.valid and not busy_out;
 
     terminate_out <= r.terminate;
@@ -334,6 +338,7 @@ begin
         variable spr_val : std_ulogic_vector(63 downto 0);
         variable addend : std_ulogic_vector(127 downto 0);
         variable do_trace : std_ulogic;
+        variable fv : Execute1ToFPUType;
     begin
 	result := (others => '0');
 	sum_with_carry := (others => '0');
@@ -347,6 +352,7 @@ begin
 	v.e := Execute1ToWritebackInit;
         lv := Execute1ToLoadstore1Init;
         v.f.redirect := '0';
+        fv := Execute1ToFPUInit;
 
 	-- XER forwarding. To avoid having to track XER hazards, we
 	-- use the previously latched value.
@@ -522,9 +528,11 @@ begin
         exception_nextpc := '0';
         v.e.exc_write_enable := '0';
         v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
-        v.e.exc_write_data := e_in.nia;
         if valid_in = '1' then
+            v.e.exc_write_data := e_in.nia;
             v.last_nia := e_in.nia;
+        else
+            v.e.exc_write_data := r.last_nia;
         end if;
 
         v.e.mode_32bit := not ctrl.msr(MSR_SF);
@@ -552,18 +560,27 @@ begin
             ctrl_tmp.msr(MSR_LE) <= '1';
             v.e.valid := '1';
             v.trace_next := '0';
+            v.fp_exception_next := '0';
 	    report "Writing SRR1: " & to_hstring(ctrl.srr1);
 
-        elsif r.trace_next = '1' and valid_in = '1' then
-            -- Generate a trace interrupt rather than executing the next instruction
-            -- or taking any asynchronous interrupt
-            v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64));
-            ctrl_tmp.srr1(63 - 33) <= '1';
-            if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
-                r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
-                ctrl_tmp.srr1(63 - 35) <= '1';
-            elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
-                ctrl_tmp.srr1(63 - 36) <= '1';
+        elsif valid_in = '1' and ((HAS_FPU and r.fp_exception_next = '1') or r.trace_next = '1') then
+            if HAS_FPU and r.fp_exception_next = '1' then
+                -- This is used for FP-type program interrupts that
+                -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
+                v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+                ctrl_tmp.srr1(63 - 43) <= '1';
+                ctrl_tmp.srr1(63 - 47) <= '1';
+            else
+                -- Generate a trace interrupt rather than executing the next instruction
+                -- or taking any asynchronous interrupt
+                v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64));
+                ctrl_tmp.srr1(63 - 33) <= '1';
+                if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
+                    r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
+                    ctrl_tmp.srr1(63 - 35) <= '1';
+                elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
+                    ctrl_tmp.srr1(63 - 36) <= '1';
+                end if;
             end if;
             exception := '1';
 
@@ -589,7 +606,7 @@ begin
             illegal := '1';
 
         elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and
-            (e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then
+            (e_in.unit = FPU or e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then
             -- generate a floating-point unavailable interrupt
             exception := '1';
             v.f.redirect_nia := std_logic_vector(to_unsigned(16#800#, 64));
@@ -809,6 +826,10 @@ begin
                 is_branch := '1';
                 taken_branch := '1';
                 abs_branch := '1';
+                if HAS_FPU then
+                    v.fp_exception_next := fp_in.exception and
+                                           (a_in(MSR_FE0) or a_in(MSR_FE1));
+                end if;
                 do_trace := '0';
 
             when OP_CNTZ =>
@@ -980,6 +1001,10 @@ begin
                         ctrl_tmp.msr(MSR_IR) <= '1';
                         ctrl_tmp.msr(MSR_DR) <= '1';
                     end if;
+                    if HAS_FPU then
+                        v.fp_exception_next := fp_in.exception and
+                                               (c_in(MSR_FE0) or c_in(MSR_FE1));
+                    end if;
                 end if;
 	    when OP_MTSPR =>
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
@@ -1096,6 +1121,8 @@ begin
                 lv.valid := '1';
             elsif e_in.unit = NONE then
                 illegal := '1';
+            elsif HAS_FPU and e_in.unit = FPU then
+                fv.valid := '1';
             end if;
 
         elsif r.f.redirect = '1' then
@@ -1170,7 +1197,17 @@ begin
             v.e.valid := '1';
 	end if;
 
-        if illegal = '1' then
+        -- Generate FP-type program interrupt.  fp_in.interrupt will only
+        -- be set during the execution of a FP instruction.
+        -- The case where MSR[FE0,FE1] goes from zero to non-zero is
+        -- handled above by mtmsrd and rfid setting v.fp_exception_next.
+        if HAS_FPU and fp_in.interrupt = '1' then
+            v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+            ctrl_tmp.srr1(63 - 43) <= '1';
+            exception := '1';
+        end if;
+
+        if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then
             exception := '1';
             v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
             -- Since we aren't doing Hypervisor emulation assist (0xe40) we
@@ -1216,7 +1253,6 @@ begin
             end if;
             v.e.exc_write_enable := '1';
             v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
-            v.e.exc_write_data := r.last_nia;
             report "ldst exception writing srr0=" & to_hstring(r.last_nia);
         end if;
 
@@ -1261,6 +1297,19 @@ begin
         lv.mode_32bit := not ctrl.msr(MSR_SF);
         lv.is_32bit := e_in.is_32bit;
 
+        -- Outputs to FPU
+        fv.op := e_in.insn_type;
+        fv.nia := e_in.nia;
+        fv.insn := e_in.insn;
+        fv.single := e_in.is_32bit;
+        fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1);
+        fv.fra := a_in;
+        fv.frb := b_in;
+        fv.frc := c_in;
+        fv.frt := e_in.write_reg;
+        fv.rc := e_in.rc;
+        fv.out_cr := e_in.output_cr;
+
 	-- Update registers
 	rin <= v;
 
@@ -1268,6 +1317,7 @@ begin
 	f_out <= r.f;
         l_out <= lv;
 	e_out <= r.e;
+        fp_out <= fv;
 	flush_out <= f_out.redirect;
 
         exception_log <= exception;
diff --git a/fpu.vhdl b/fpu.vhdl
new file mode 100644
index 0000000..b05ec9d
--- /dev/null
+++ b/fpu.vhdl
@@ -0,0 +1,185 @@
+-- Floating-point unit for Microwatt
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.insn_helpers.all;
+use work.decode_types.all;
+use work.crhelpers.all;
+use work.helpers.all;
+use work.common.all;
+
+entity fpu is  -- floating-point unit; at this stage it only implements mffs and mtfsf
+    port (
+        clk : in std_ulogic;
+        rst : in std_ulogic;  -- synchronous, active high
+
+        e_in  : in  Execute1toFPUType;  -- instruction and operands from execute1
+        e_out : out FPUToExecute1Type;  -- busy / exception / interrupt / illegal back to execute1
+
+        w_out : out FPUToWritebackType  -- register-file and CR write requests to writeback
+        );
+end entity fpu;
+
+architecture behaviour of fpu is
+
+    type state_t is (IDLE,
+                     DO_MFFS, DO_MTFSF);
+
+    type reg_type is record
+        state        : state_t;  -- IDLE, or the operation currently being executed
+        busy         : std_ulogic;  -- drives e_out.busy
+        instr_done   : std_ulogic;  -- the instruction finished in the previous cycle
+        do_intr      : std_ulogic;  -- drives e_out.interrupt
+        op           : insn_type_t;  -- latched e_in.op
+        insn         : std_ulogic_vector(31 downto 0);  -- latched instruction word
+        dest_fpr     : gspr_index_t;  -- latched e_in.frt
+        fe_mode      : std_ulogic;  -- OR of the two e_in.fe_mode bits
+        rc           : std_ulogic;  -- latched e_in.rc
+        is_cmp       : std_ulogic;  -- latched e_in.out_cr
+        single_prec  : std_ulogic;  -- latched e_in.single
+        fpscr        : std_ulogic_vector(31 downto 0);  -- the FPSCR register (32 bits held here)
+        b            : std_ulogic_vector(63 downto 0);  -- latched e_in.frb
+        writing_back : std_ulogic;  -- drives w_out.write_enable
+        cr_result    : std_ulogic_vector(3 downto 0);  -- 4-bit value replicated into w_out.write_cr_data
+        cr_mask      : std_ulogic_vector(7 downto 0);  -- drives w_out.write_cr_mask
+    end record;
+
+    signal r, rin : reg_type;
+
+    signal fp_result     : std_ulogic_vector(63 downto 0);
+
+begin
+    fpu_0: process(clk)  -- state register: loads rin on every rising clock edge
+    begin
+        if rising_edge(clk) then
+            if rst = '1' then  -- synchronous reset; only the fields below are reset
+                r.state <= IDLE;
+                r.busy <= '0';
+                r.instr_done <= '0';
+                r.do_intr <= '0';
+                r.fpscr <= (others => '0');
+                r.writing_back <= '0';
+            else
+                assert not (r.state /= IDLE and e_in.valid = '1') severity failure;  -- a new instruction must not arrive while one is in progress
+                r <= rin;
+            end if;
+        end if;
+    end process;
+
+    e_out.busy <= r.busy;
+    e_out.exception <= r.fpscr(FPSCR_FEX);  -- current FEX summary bit
+    e_out.interrupt <= r.do_intr;
+
+    w_out.valid <= r.instr_done and not r.do_intr;  -- no completion is signalled when an interrupt is raised
+    w_out.write_enable <= r.writing_back;
+    w_out.write_reg <= r.dest_fpr;
+    w_out.write_data <= fp_result;
+    w_out.write_cr_enable <= r.instr_done and r.rc;  -- CR is written only for Rc=1 instructions
+    w_out.write_cr_mask <= r.cr_mask;
+    w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result &  -- same 4 bits in all 8 CR fields;
+                           r.cr_result & r.cr_result & r.cr_result & r.cr_result;   -- write_cr_mask picks the field(s)
+
+    fpu_1: process(all)  -- combinational next-state logic: computes rin, fp_result and e_out.illegal
+        variable v           : reg_type;
+        variable illegal     : std_ulogic;
+        variable k           : integer;  -- bit offset of the FPSCR field being written
+        variable flm         : std_ulogic_vector(7 downto 0);  -- per-field write mask for mtfsf
+    begin
+        v := r;
+        illegal := '0';
+        v.busy := '0';
+
+        -- capture incoming instruction
+        if e_in.valid = '1' then
+            v.insn := e_in.insn;
+            v.op := e_in.op;
+            v.fe_mode := or (e_in.fe_mode);  -- '1' if either FE bit is set
+            v.dest_fpr := e_in.frt;
+            v.single_prec := e_in.single;
+            v.rc := e_in.rc;
+            v.is_cmp := e_in.out_cr;
+            v.cr_mask := num_to_fxm(1);  -- presumably selects CR field 1 -- TODO confirm against num_to_fxm
+            v.b := e_in.frb;
+        end if;
+
+        v.writing_back := '0';  -- single-cycle pulses, re-asserted below when needed
+        v.instr_done := '0';
+
+        case r.state is
+            when IDLE =>
+                if e_in.valid = '1' then
+                    case e_in.insn(5 downto 1) is
+                        when "00111" =>
+                            if e_in.insn(8) = '0' then  -- insn(8) separates the mffs family from mtfsf
+                                v.state := DO_MFFS;
+                            else
+                                v.state := DO_MTFSF;
+                            end if;
+                        when others =>
+                            illegal := '1';
+                    end case;
+                end if;
+
+            when DO_MFFS =>
+                v.writing_back := '1';  -- cancelled below if the sub-opcode turns out to be illegal
+                case r.insn(20 downto 16) is
+                    when "00000" =>
+                        -- mffs
+                    when others =>
+                        illegal := '1';
+                end case;
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when DO_MTFSF =>
+                if r.insn(25) = '1' then  -- insn(25) set: write every field
+                    flm := x"FF";
+                elsif r.insn(16) = '1' then  -- insn(16) set: none of the 32 bits held in r.fpscr is written
+                    flm := x"00";
+                else
+                    flm := r.insn(24 downto 17);  -- mask taken from the instruction
+                end if;
+                for i in 0 to 7 loop
+                    k := i * 4;
+                    if flm(i) = '1' then  -- flm(i) enables the 4-bit field at bits k+3..k
+                        v.fpscr(k + 3 downto k) := r.b(k + 3 downto k);
+                    end if;
+                end loop;
+                v.instr_done := '1';
+                v.state := IDLE;
+
+        end case;
+
+        -- Data path.
+        -- Just enough to read FPSCR for now.
+        fp_result <= x"00000000" & r.fpscr;  -- FPSCR zero-extended to 64 bits
+
+        v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or  -- VX and FEX are recomputed
+                             (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI)));   -- every cycle from the other bits
+        v.fpscr(FPSCR_FEX) := or (v.fpscr(FPSCR_VX downto FPSCR_XX) and
+                                  v.fpscr(FPSCR_VE downto FPSCR_XE));
+        if r.rc = '1' then
+            v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);  -- 4 FPSCR bits returned in CR for Rc=1
+        end if;
+
+        if illegal = '1' then  -- abandon the instruction; execute1 sees e_out.illegal this cycle
+            v.instr_done := '0';
+            v.do_intr := '0';
+            v.writing_back := '0';
+            v.busy := '0';
+            v.state := IDLE;
+        else
+            v.do_intr := v.instr_done and v.fpscr(FPSCR_FEX) and r.fe_mode;  -- enabled exception pending at completion
+            if v.state /= IDLE or v.do_intr = '1' then
+                v.busy := '1';
+            end if;
+        end if;
+
+        rin <= v;
+        e_out.illegal <= illegal;
+    end process;
+
+end architecture behaviour;
diff --git a/microwatt.core b/microwatt.core
index 3b47339..7f2068d 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -23,6 +23,7 @@ filesets:
       - cr_hazard.vhdl
       - control.vhdl
       - execute1.vhdl
+      - fpu.vhdl
       - loadstore1.vhdl
       - mmu.vhdl
       - dcache.vhdl
diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c
index eca4bf0..c61c8a5 100644
--- a/scripts/fmt_log/fmt_log.c
+++ b/scripts/fmt_log/fmt_log.c
@@ -84,17 +84,17 @@ struct log_entry {
 #define FLGA(i, y, z)	(log.i? y: z)
 #define PNIA(f)		(full_nia[log.f] & 0xff)
 
-const char *units[4] = { "--", "al", "ls", "?3" };
+const char *units[4] = { "--", "al", "ls", "fp" };
 const char *ops[64] =
 {
 	"illegal", "nop    ", "add    ", "and    ", "attn   ", "b      ", "bc     ", "bcreg  ",
 	"bperm  ", "cmp    ", "cmpb   ", "cmpeqb ", "cmprb  ", "cntz   ", "crop   ", "darn   ",
 	"dcbf   ", "dcbst  ", "dcbt   ", "dcbtst ", "dcbz   ", "div    ", "dive   ", "exts   ",
-	"extswsl", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ", "st     ", "fpload ",
-	"fpstore", "mcrxrx ", "mfcr   ", "mfmsr  ", "mfspr  ", "mod    ", "mtcrf  ", "mtmsr  ",
-	"mtspr  ", "mull64 ", "mulh64 ", "mulh32 ", "or     ", "popcnt ", "prty   ", "rfid   ",
-	"rlc    ", "rlcl   ", "rlcr   ", "sc     ", "setb   ", "shl    ", "shr    ", "sync   ",
-	"tlbie  ", "trap   ", "xor    ", "bcd    ", "addg6s ", "ffail  ", "?62    ", "?63    "
+	"extswsl", "fpop   ", "fpopi  ", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ",
+	"st     ", "fpload ", "fpstore", "mcrxrx ", "mfcr   ", "mfmsr  ", "mfspr  ", "mod    ",
+	"mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ", "mulh64 ", "mulh32 ", "or     ", "popcnt ",
+	"prty   ", "rfid   ", "rlc    ", "rlcl   ", "rlcr   ", "sc     ", "setb   ", "shl    ",
+	"shr    ", "sync   ", "tlbie  ", "trap   ", "xor    ", "bcd    ", "addg6s ", "ffail  ",
 };
 
 const char *spr_names[13] =
diff --git a/writeback.vhdl b/writeback.vhdl
index d0230d8..95de0ec 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -12,6 +12,7 @@ entity writeback is
 
         e_in         : in Execute1ToWritebackType;
         l_in         : in Loadstore1ToWritebackType;
+        fp_in        : in FPUToWritebackType;
 
         w_out        : out WritebackToRegisterFileType;
         c_out        : out WritebackToCrFileType;
@@ -31,15 +32,21 @@ begin
             -- Do consistency checks only on the clock edge
             x(0) := e_in.valid;
             y(0) := l_in.valid;
-            assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
+            w(0) := fp_in.valid;
+            assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) +
+                    to_integer(unsigned(w))) <= 1 severity failure;
 
             x(0) := e_in.write_enable or e_in.exc_write_enable;
             y(0) := l_in.write_enable;
-            assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
+            w(0) := fp_in.write_enable;
+            assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) +
+                    to_integer(unsigned(w))) <= 1 severity failure;
 
             w(0) := e_in.write_cr_enable;
             x(0) := (e_in.write_enable and e_in.rc);
-            assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure;
+            y(0) := fp_in.write_cr_enable;
+            assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) +
+                    to_integer(unsigned(y))) <= 1 severity failure;
         end if;
     end process;
 
@@ -53,7 +60,7 @@ begin
         c_out <= WritebackToCrFileInit;
 
         complete_out <= '0';
-        if e_in.valid = '1' or l_in.valid = '1' then
+        if e_in.valid = '1' or l_in.valid = '1' or fp_in.valid = '1' then
             complete_out <= '1';
         end if;
 
@@ -79,6 +86,18 @@ begin
                 c_out.write_xerc_data <= e_in.xerc;
             end if;
 
+            if fp_in.write_enable = '1' then
+                w_out.write_reg <= fp_in.write_reg;
+                w_out.write_data <= fp_in.write_data;
+                w_out.write_enable <= '1';
+            end if;
+
+            if fp_in.write_cr_enable = '1' then
+                c_out.write_cr_enable <= '1';
+                c_out.write_cr_mask <= fp_in.write_cr_mask;
+                c_out.write_cr_data <= fp_in.write_cr_data;
+            end if;
+
             if l_in.write_enable = '1' then
                 w_out.write_reg <= l_in.write_reg;
                 w_out.write_data <= l_in.write_data;

From bf1d9e9531aea859d6ba1218a42f3c125845b320 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 15 Jul 2020 12:46:18 +1000
Subject: [PATCH 06/30] tests/fpu: Add tests for basic FPSCR function and
 interrupt generation

This tests mffs, mtfsf and the generation of floating-point type
program interrupts that occur as a result of mtfsf.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 tests/fpu/fpu.c            | 198 +++++++++++++++++++++++++++++++++----
 tests/fpu/head.S           |  12 +++
 tests/test_fpu.bin         | Bin 8384 -> 12504 bytes
 tests/test_fpu.console_out |   2 +
 4 files changed, 192 insertions(+), 20 deletions(-)

diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 86636b6..54811ed 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -4,11 +4,15 @@
 
 #include "console.h"
 
+#define asm	__asm__ volatile
+
 #define MSR_FP	0x2000
 #define MSR_FE0	0x800
 #define MSR_FE1	0x100
 
 extern int trapit(long arg, int (*func)(long));
+extern void do_rfid(unsigned long msr);
+extern void do_blr(void);
 
 #define SRR0	26
 #define SRR1	27
@@ -17,31 +21,41 @@ static inline unsigned long mfspr(int sprnum)
 {
 	long val;
 
-	__asm__ volatile("mfspr %0,%1" : "=r" (val) : "i" (sprnum));
+	asm("mfspr %0,%1" : "=r" (val) : "i" (sprnum));
 	return val;
 }
 
 static inline void mtspr(int sprnum, unsigned long val)
 {
-	__asm__ volatile("mtspr %0,%1" : : "i" (sprnum), "r" (val));
+	asm("mtspr %0,%1" : : "i" (sprnum), "r" (val));
 }
 
 void disable_fp(void)
 {
 	unsigned long msr;
 
-	__asm__("mfmsr %0" : "=r" (msr));
+	asm("mfmsr %0" : "=r" (msr));
 	msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1);
-	__asm__("mtmsrd %0" : : "r" (msr));
+	asm("mtmsrd %0" : : "r" (msr));
 }
 
 void enable_fp(void)
 {
 	unsigned long msr;
 
-	__asm__("mfmsr %0" : "=r" (msr));
+	asm("mfmsr %0" : "=r" (msr));
 	msr |= MSR_FP;
-	__asm__("mtmsrd %0" : : "r" (msr));
+	msr &= ~(MSR_FE0 | MSR_FE1);
+	asm("mtmsrd %0" : : "r" (msr));
+}
+
+void enable_fp_interrupts(void)	/* set MSR[FE0] and MSR[FE1], leaving other MSR bits as they are */
+{
+	unsigned long msr;
+
+	asm("mfmsr %0" : "=r" (msr));
+	msr |= MSR_FE0 | MSR_FE1;
+	asm("mtmsrd %0" : : "r" (msr));
+}
 
 void print_string(const char *str)
@@ -81,26 +95,26 @@ int do_fp_op(long arg)
 {
 	switch (arg) {
 	case 0:
-		__asm__("lfd 31,0(%0)" : : "b" (&foo));
+		asm("lfd 31,0(%0)" : : "b" (&foo));
 		break;
 	case 1:
-		__asm__("stfd 31,0(%0)" : : "b" (&foow) : "memory");
+		asm("stfd 31,0(%0)" : : "b" (&foow) : "memory");
 		break;
 	case 2:
-		__asm__("lfd 30,0(%0); stfd 30,0(%1)"
-			: : "b" (&foo), "b" (&foow) : "memory");
+		asm("lfd 30,0(%0); stfd 30,0(%1)"
+		    : : "b" (&foo), "b" (&foow) : "memory");
 		break;
 	case 3:
-		__asm__("lfiwax 29,0,%0; stfd 29,0(%1)"
-			: : "r" (&fooi), "b" (&foow) : "memory");
+		asm("lfiwax 29,0,%0; stfd 29,0(%1)"
+		    : : "r" (&fooi), "b" (&foow) : "memory");
 		break;
 	case 4:
-		__asm__("lfiwzx 28,0,%0; stfd 28,0(%1)"
-			: : "r" (&fooi), "b" (&foow) : "memory");
+		asm("lfiwzx 28,0,%0; stfd 28,0(%1)"
+		    : : "r" (&fooi), "b" (&foow) : "memory");
 		break;
 	case 5:
-		__asm__("lfdx 27,0,%0; stfiwx 27,0,%1"
-			: : "r" (&foow), "r" (&fooiw) : "memory");
+		asm("lfdx 27,0,%0; stfiwx 27,0,%1"
+		    : : "r" (&foow), "r" (&fooiw) : "memory");
 		break;
 	}
 	return 0;
@@ -184,8 +198,8 @@ int sp_to_dp(long arg)
 {
 	unsigned long dp;
 
-	__asm__("lfs 20,0(%0); stfd 20,0(%1)"
-		: : "b" (&sp_dp_equiv[arg].sp), "b" (&dp) : "memory");
+	asm("lfs 20,0(%0); stfd 20,0(%1)"
+	    : : "b" (&sp_dp_equiv[arg].sp), "b" (&dp) : "memory");
 	if (dp != sp_dp_equiv[arg].dp) {
 		print_hex(sp_dp_equiv[arg].sp, 8);
 		print_string(" ");
@@ -201,8 +215,8 @@ int dp_to_sp(long arg)
 {
 	unsigned int sp;
 
-	__asm__("lfd 21,0(%0); stfs 21,0(%1)"
-		: : "b" (&sp_dp_equiv[arg].dp), "b" (&sp) : "memory");
+	asm("lfd 21,0(%0); stfs 21,0(%1)"
+	    : : "b" (&sp_dp_equiv[arg].dp), "b" (&sp) : "memory");
 	return sp != sp_dp_equiv[arg].sp;
 }
 
@@ -229,6 +243,148 @@ int fpu_test_3(void)
 	return 0;
 }
 
+unsigned long get_fpscr(void)
+{
+	unsigned long ret;
+
+	asm("mffs 10; stfd 10,0(%0)" : : "b" (&ret) : "memory");
+	return ret;
+}
+
+void set_fpscr(unsigned long fpscr)
+{
+	asm("lfd%U0%X0 7,%0; mtfsf 0,7,1,0" : : "m" (fpscr));
+}
+
+unsigned long fpscr_eval(unsigned long val)
+{
+	val &= ~0x60000000;	/* clear FEX and VX */
+	if (val & 0x1f80700)	/* test all VX* bits */
+		val |= 0x20000000;
+	if ((val >> 25) & (val >> 3) & 0x1f)
+		val |= 0x40000000;
+	return val;
+}
+
+unsigned int test4vals[] = {
+	0xdeadbeef, 0x1324679a, 0, 0xffffffff, 0xabcd
+};
+
+int test4(long arg)
+{
+	unsigned long fsi, fpscr;
+	long i;
+	unsigned long cr;
+
+	/* check we can do basic mtfsf and mffs */
+	i = 1;
+	for (fsi = 1; fsi < 0x100; fsi <<= 1) {
+		asm("lfd 7,0(%0); mtfsf 0,7,1,0" : : "b" (&fsi));
+		if (get_fpscr() != fsi)
+			return i;
+		++i;
+		fpscr = fsi;
+	}
+	for (i = 0; i < sizeof(test4vals) / sizeof(test4vals[0]); ++i) {
+		fsi = test4vals[i];
+		asm("lfd 7,0(%0); mtfsf 0x55,7,0,0" : : "b" (&fsi));
+		fpscr = fpscr_eval((fpscr & 0xf0f0f0f0) | (fsi & 0x0f0f0f0f));
+		if (get_fpscr() != fpscr)
+			return 16 * i + 16;
+		asm("mtfsf 0xaa,7,0,0");
+		fpscr = fpscr_eval((fpscr & 0x0f0f0f0f) | (fsi & 0xf0f0f0f0));
+		if (get_fpscr() != fpscr)
+			return 16 * i + 17;
+		asm("mffs. 6; mfcr %0" : "=r" (cr) : : "cr1");
+		if (((cr >> 24) & 0xf) != ((fpscr >> 28) & 0x1f))
+			return 16 * i + 18;
+	}
+	return 0;
+}
+
+int fpu_test_4(void)
+{
+	enable_fp();
+	return trapit(0, test4);
+}
+
+int test5a(long arg)
+{
+	set_fpscr(0);
+	enable_fp_interrupts();
+	set_fpscr(0x80);	/* set VE */
+	set_fpscr(0x480);	/* set VXSOFT */
+	set_fpscr(0);
+	return 1;		/* not supposed to get here */
+}
+
+int test5b(long arg)
+{
+	unsigned long msr;
+
+	enable_fp();
+	set_fpscr(0x80);	/* set VE */
+	set_fpscr(0x480);	/* set VXSOFT */
+	asm("mfmsr %0" : "=r" (msr));
+	msr |= MSR_FE0 | MSR_FE1;
+	asm("mtmsrd %0; xori 4,4,0" : : "r" (msr));
+	set_fpscr(0);
+	return 1;		/* not supposed to get here */
+}
+
+int test5c(long arg)
+{
+	unsigned long msr;
+
+	enable_fp();
+	set_fpscr(0x80);	/* set VE */
+	set_fpscr(0x480);	/* set VXSOFT */
+	asm("mfmsr %0" : "=r" (msr));
+	msr |= MSR_FE0 | MSR_FE1;
+	do_rfid(msr);
+	set_fpscr(0);
+	return 1;		/* not supposed to get here */
+}
+
+int fpu_test_5(void)
+{
+	int ret;
+	unsigned int *ip;
+
+	enable_fp();
+	ret = trapit(0, test5a);
+	if (ret != 0x700)
+		return 1;
+	ip = (unsigned int *)mfspr(SRR0);
+	/* check it's a mtfsf 0,7,1,0 instruction */
+	if (*ip != (63u << 26) + (1 << 25) + (7 << 11) + (711 << 1))
+		return 2;
+	if ((mfspr(SRR1) & 0x783f0000) != (1 << (63 - 43)))
+		return 3;
+
+	ret = trapit(0, test5b);
+	if (ret != 0x700)
+		return 4;
+	ip = (unsigned int *)mfspr(SRR0);
+	/* check it's an xori 4,4,0 instruction */
+	if (*ip != 0x68840000)
+		return 5;
+	if ((mfspr(SRR1) & 0x783f0000) != (1 << (63 - 43)) + (1 << (63 - 47)))
+		return 6;
+
+	ret = trapit(0, test5c);
+	if (ret != 0x700)
+		return 7;
+	ip = (unsigned int *)mfspr(SRR0);
+	/* check it's the destination of the rfid */
+	if (ip != (void *)&do_blr)
+		return 8;
+	if ((mfspr(SRR1) & 0x783f0000) != (1 << (63 - 43)) + (1 << (63 - 47)))
+		return 9;
+
+	return 0;
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -258,6 +414,8 @@ int main(void)
 	do_test(1, fpu_test_1);
 	do_test(2, fpu_test_2);
 	do_test(3, fpu_test_3);
+	do_test(4, fpu_test_4);
+	do_test(5, fpu_test_5);
 
 	return fail;
 }
diff --git a/tests/fpu/head.S b/tests/fpu/head.S
index 498606b..938fca0 100644
--- a/tests/fpu/head.S
+++ b/tests/fpu/head.S
@@ -87,6 +87,18 @@ ret:
 	mtlr	%r0
 	blr
 
+	.global do_rfid
+do_rfid:
+	mtsrr1	%r3
+	LOAD_IMM64(%r4, do_blr)
+	mtsrr0	%r4
+	rfid
+	blr
+
+	.global do_blr
+do_blr:
+	blr
+
 #define EXCEPTION(nr)		\
 	.= nr			;\
 	mfsprg0	%r0		;\
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index fb2de320a0017dc482942a6d0b32d6b5b480a309..6bac86156f19b99c44efc42659d3d5f6219d6486 100755
GIT binary patch
delta 3504
zcmaJ^e@v9;9sj;}z#Rx22M81nJ-{Gm(a_eI7f#^LLW_uoI*6Uyk~{8P{h3H-x7TvV
zd&kbCOALj+c5_xMT?o^yOK{r6EwaGu8cb?pTj#K?3d$b}t6po<9vtxO^W43she`J)
zFYohwzMt>s`F@}8^Wzw{k2y<;Y}G`K&X07B2l3rO&onxT9KiMf+XHOR4&O7qrOyfN
zz1yGFOctK$&J@Z9v>hHI-+H2e<0D-_!q@$w5O54=k9k%QHJQn`g}-Ot6GY*!+Qc;D
z;07^~O3LcUZ)a>nb-6>y$@okdRHiZ-^#f$~vTL?VWxdgq%D%Hzip>@;nJrGmX3SRh
z8xt+?9?<ALAvO_a>~<|Vm^Fih&?l@Ybb_%>Qg~8E(Bk}k<y~Wb(ZqeWiO|=OxsvqS
z@-phO`QvCK%5~%Nw7nVBz#RoRzO5u?Hmy}hIb)!PGf-x1*m`dvDwW~CZDp)$y#{1*
z1Vo}+VKezdOO->J%QgK5WgxRuxU3knvXt@63gI_sJtwp(^;r!Ss%m>WHH<`5|4<91
zTB2d==qp3kNyl;PiFxEO-7iZIFUmKRp{$i@4zQ&=fe3P1`8uoaX;p@&Jpq|yxbT2@
z+6Q(8#QOh612t2TsEE0Ra@>@~rfO*Ju}|O61LDkd)JNd+q|=sP#XJpYs=P!S{e15F
z78>?7k-q#w1pkS*R%(1LgQ!}|`L<P_e&Def`ga7x)2dzq>@pV&#0e&<f`=ivH-d-d
zT8aJ2hy|F$1)FO<EpdX`2=K?MppM(2w>ILp-OHG!`oov>m3odN5K{z?21pzWt&%-@
z>ar)JQ-K+gHkEkCQmFx}f<J;!z#p=$r?^>-ET%Bq<bw}pamGv&qutfC&_sO;@Nw`}
zO34<dTQGYMN3g{_J`P_?nC-g*TBRm8H&%n#{ICAM>JASjyBw`*$~=wuC>E>hJ$!dX
za^cya5g%R5rZgG#eEY=W;M<32JhFB8QbdMW#L+B=us?n%^LU<3&GNvKAD+$VHofy`
ziz-do8EGOMF6$yrj!ET{oNR3V&g^H4$5W_b-P|R)MrUbFrsQ%H`EA?CQh82S;+<p6
z`ZJyKX?97nTByntWg)vpGuW=wEZr(kCdUPlu(+f_HWKE6NCll9#@g1WAo26<E?%C<
zxv>vB`&xKGt{_=VA-4RS&gI#h6pgPrD0?c%C337%#i~%+)oPm;Balq;<MpH=3bLZg
z?gZOBcF*-7bKZ?rL9G((&QOp=XZuN;RItm_rFTg$Ct6x5<dPl;mR7!g$?S4(sM#g`
z^X_hmABxM#(r%`%&17@A&7`y$vtqU7d)TR`1}Ew&Ak{Lg;4<=CiRL1!xE1v+qmgL}
zmUuFtd`@>q?n4@T;nQ!p)FBYFGP0C$zmJ*U77W^OM3EyH-0m_MsMcT@uhCIo4PQUQ
zzGADHws#l|c`n&t_`c0dPOFX$SIq@KXk%etmCPJf?%aDFYesi@%E%dt2Jxpk@ybnE
z9Cmr6f{U`i4`kG42z9U_sB8o!pcd=Qt`<J;ZrHo4TjmkaJ@LHDBQ|W5rGgUgCGff5
zp{So6$!4AT&Nd?oHVUUV6NcwXu~&;<v>d!$x?7@V%Og%EIQ?8W!H+t<a~(VE9_2l@
zXTctTT^ua&I@N&+@r}(IA150e2RPBB<OM`HGR;r!Q1GU_8f5gb&Az<Uec(F`62>6*
zd*-BFivn2+5femscJJ#-4)TmXz>&^pf|rL6lmku(dOXNbgT;&QOm}K74x@t4A8E>Q
z+2z7v6b^2kOA^TI*a1Ph`&cj!_EWHTCx{TSMnuF@mv94Gobj{t6nH-He3UxpBOfaz
zTjvJH#STqisKfn7uI+NvLtC?Q<Lk0CW$|iO?v}6yLeg7}Ug<{`uK~+6mPJ0q)<P_`
zo#xDfOQQWQH^01i#{a^F6rXVcxO#B;@hbMYJQJ{U-iar8Ch*3gfumVopahrWZz}P%
zCy0e0o(J(T5fhKPJiRJ!^Ao&f;PrsF8N5=6`jFucm0P58^Ki*%UKr}otaM3I{*Y{-
z)4#}|rf98}L!cP~d>bWFZMlBF%f_$femJWKfZ$7Cd_wWliJwN%qu3v+4=YtU#qux?
zDI0bszehls9?<GN$7i@SE;eZL!)z2B@)!3W0-i`dQ3~IYN6_ZBE3oZDdqDF#mXc3A
zt>R<ATO)W!1n*Vxao|0X^&AOQ`64I99d|8G;6Qmw6FnFCWzVlT$r%s};llg;7g0-F
zKulBnTJ(Ed0dXF6<n{#gMR!2_O6@12??eBFV#zJX9dI)@U)i6#FVgxo{<h_<5zZ;S
zc`pltXQ%R>769eszly`19LgK{Ys-3=efB7G&h|0?>?!u%>>wMNJ<ld)hnWx_WkumD
z%pM+N&aiSJKaZ2AK%N453gjt}r$C+pc^c$tkf%YOR<7nZZQc15W0PvJ@H2<X@rw5u
zYmUQgXdeJ(j>B5e_5oX=V*I-kZ_j<@)q+w@-2>%V!J5S8AY-SLmrOabZEH$8*>Byv
z@nw>Ii3!9Y-cgt%*GYZ4*=X;4F`YIf*@1I=27avsVPAnSy^vl3`$?e_j9W=|Zm$6T
z?bnR$N1N&+_U3eY{*iw(>~-J7?9Gb_Jk$Lk907rg@yCoe0PHw0o(a_@+v%k6e7X%o
zZ;1}fvpfaP_&3UIL2=R*H0hyIY${EfL$m3jvc<G2sT}v>#}AcG(=wTl(}J$~+r_iD
z6IeO0*w~!s0k#%c9oqb9!rKGP4eWg_Iy@fxTj@&Lk<_=B{5Iux$w&3S*BwiGC-Hc~
z3GKU@lLF$jr}qh)Vl(|k?&sIX{&Vf$S3mm{lTK!6j6-nvdu711EIR?34*cHl0gMUc
z1=yyHnO3a20PqbC;^BeuB~dZMSbzg~%JACo_RT1zg{xLyg^5G_;dCF~yTHo%<ly1h
v5#GRmG16Io;UUf0p2A-~`twtbl4SmS-h+Sq<>s+hR%CG^!*lu1yhZdsV!!GV

delta 1150
zcmYk6e`s4(6vxkfSzco`AwM9Av(2x@IBOhb>mP4Tx4vvqN}J&*L&ii%EdxjWBb)mp
z#@I9?XeTHu$3Ln}46>EsAEVtSh%iX;2lG#n4*%QQRnX00MV+bHynb)`g1vCxJ@<3I
z@0@$?dC8CC^>l*B+)Fev{%yy|8(5D~bYz?;2Q~>d2{w6bIy#%!Eyib$?$U1g7vJ}a
zE9pc|T(;Srp4xuHPCYv^R7p#@e{V9+e)kWe#&_H!4tc}eGf$DXc+k_uIZt;6;(5(c
zYRHwQl&eFOmnk=i#`iMXxFF?dp}AH@YQz4U93>cnB74YTOJ?Y_seA&yUwFymbDr6v
z{*G5XVdnz}4YxHPv(uaHw~`%)V-X3YtfzkecQdfNvYL|e^q_`-l&=~@H=7ZvG`jgc
zZ&>Sh@Fj0TEc3GWfSBdSeY?fWe9$-CXGMIvgNBz{5kDP5lp&&eYW>u;)LQ;r>cT^m
zarb<=@W)r>pZFW!Q_giA2fJNukwxC{WpFvJKti7buh@7X@C3k%fnA!Z^SgnN=CSc-
zfu2XKTuXMo5!k2A6xkmf^JnZdjEhl2OmD-3%t5O0RB*d?w#Xj^Lq~<yQsfS;mrH##
zVO;H)DBTmAYyj1+Sau&pjkRrCqG|1D|F=lBM%Afq=7i*Z-Ldjqal$b1DqKIKi+AU3
zj?#6M-0BoMCgm%GHE5_3n_UFA)3i}&-Y(yRt$Y{Z`w;fLRxG*UTekQDc%{YPZ1HOr
z{{s9hdc9dQ@o1p4{Sm3tt>APURz<3x>I2E2s2bgPrQP4`$*SSB_BQxuUauMt`CurC
zYcGc0X+5`dSNJ%rx5GO{jz13{5w$C>$TI?HAbOG)qW%0>EXdhdcI?)lQm$FYabrzl
zF8n3sp)JgW{VP~x3p)(E0oG-)IM!E{eV^~d657@K?CsfUum3IOMXtpC{8EpzQ)$*Q
zjRyC|V|q{!9cl1bJfUY`o^S9}yjMR3Grqx>;-O3g+H=?zHl+L-Ha+h$J?W`WM7GmK
z2lVvK^rNKm*P#jiBV}G`o^wT@dBH^)J&i4wVqLw5;ha%UjCPUN!K#~xc3?FzjV7-?
l+57Bk_%$n}hLTUK36?}2H5B^*bMAqZ1^jiK(OB>M@;^jvhnoNZ

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 623335d..99d32e6 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -1,3 +1,5 @@
 test 01:PASS
 test 02:PASS
 test 03:PASS
+test 04:PASS
+test 05:PASS

From fc2968f13279524cfc44581ebf4c308bd78611c9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 29 Aug 2020 20:34:55 +1000
Subject: [PATCH 07/30] FPU: Implement remaining FPSCR-related instructions

This implements mcrfs, mtfsfi, mtfsb0/1, mffsce, mffscrn, mffscrni and
mffsl.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl      |  4 +++
 fpu.vhdl          | 81 ++++++++++++++++++++++++++++++++++++++++++++---
 insn_helpers.vhdl |  6 ++++
 3 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index afd37ef..343c0c3 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -422,6 +422,10 @@ architecture behaviour of decode1 is
     constant decode_op_63l_array : op_63_subop_array_0_t := (
         --                unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
         --                             op                               in   out   A   out  in    out  len        ext                                pipe
+        2#000000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  2/0=mcrfs
+        2#011000001#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/6=mtfsb1
+        2#011000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/6=mtfsb0
+        2#011000100#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/6=mtfsfi
         2#011110010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 18/7=mffs family
         2#011110110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 22/7=mtfsf
         others => illegal_inst
diff --git a/fpu.vhdl b/fpu.vhdl
index b05ec9d..047bf2d 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -26,7 +26,7 @@ end entity fpu;
 architecture behaviour of fpu is
 
     type state_t is (IDLE,
-                     DO_MFFS, DO_MTFSF);
+                     DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF);
 
     type reg_type is record
         state        : state_t;
@@ -42,6 +42,7 @@ architecture behaviour of fpu is
         single_prec  : std_ulogic;
         fpscr        : std_ulogic_vector(31 downto 0);
         b            : std_ulogic_vector(63 downto 0);
+        r            : std_ulogic_vector(63 downto 0);
         writing_back : std_ulogic;
         cr_result    : std_ulogic_vector(3 downto 0);
         cr_mask      : std_ulogic_vector(7 downto 0);
@@ -77,13 +78,14 @@ begin
     w_out.write_enable <= r.writing_back;
     w_out.write_reg <= r.dest_fpr;
     w_out.write_data <= fp_result;
-    w_out.write_cr_enable <= r.instr_done and r.rc;
+    w_out.write_cr_enable <= r.instr_done and (r.rc or r.is_cmp);
     w_out.write_cr_mask <= r.cr_mask;
     w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result &
                            r.cr_result & r.cr_result & r.cr_result & r.cr_result;
 
     fpu_1: process(all)
         variable v           : reg_type;
+        variable fpscr_mask  : std_ulogic_vector(31 downto 0);
         variable illegal     : std_ulogic;
         variable j, k        : integer;
         variable flm         : std_ulogic_vector(7 downto 0);
@@ -101,17 +103,30 @@ begin
             v.single_prec := e_in.single;
             v.rc := e_in.rc;
             v.is_cmp := e_in.out_cr;
-            v.cr_mask := num_to_fxm(1);
+            if e_in.out_cr = '0' then
+                v.cr_mask := num_to_fxm(1);
+            else
+                v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn))));
+            end if;
             v.b := e_in.frb;
         end if;
 
         v.writing_back := '0';
         v.instr_done := '0';
+        fpscr_mask := (others => '1');
 
         case r.state is
             when IDLE =>
                 if e_in.valid = '1' then
                     case e_in.insn(5 downto 1) is
+                        when "00000" =>
+                            v.state := DO_MCRFS;
+                        when "00110" =>
+                            if e_in.insn(8) = '0' then
+                                v.state := DO_MTFSB;
+                            else
+                                v.state := DO_MTFSFI;
+                            end if;
                         when "00111" =>
                             if e_in.insn(8) = '0' then
                                 v.state := DO_MFFS;
@@ -123,11 +138,67 @@ begin
                     end case;
                 end if;
 
+            when DO_MCRFS =>
+                j := to_integer(unsigned(insn_bfa(r.insn)));
+                for i in 0 to 7 loop
+                    if i = j then
+                        k := (7 - i) * 4;
+                        v.cr_result := r.fpscr(k + 3 downto k);
+                        fpscr_mask(k + 3 downto k) := "0000";
+                    end if;
+                end loop;
+                v.fpscr := r.fpscr and (fpscr_mask or x"6007F8FF");
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when DO_MTFSB =>
+                -- mtfsb{0,1}
+                j := to_integer(unsigned(insn_bt(r.insn)));
+                for i in 0 to 31 loop
+                    if i = j then
+                        v.fpscr(31 - i) := r.insn(6);
+                    end if;
+                end loop;
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when DO_MTFSFI =>
+                -- mtfsfi
+                j := to_integer(unsigned(insn_bf(r.insn)));
+                if r.insn(16) = '0' then
+                    for i in 0 to 7 loop
+                        if i = j then
+                            k := (7 - i) * 4;
+                            v.fpscr(k + 3 downto k) := insn_u(r.insn);
+                        end if;
+                    end loop;
+                end if;
+                v.instr_done := '1';
+                v.state := IDLE;
+
             when DO_MFFS =>
                 v.writing_back := '1';
                 case r.insn(20 downto 16) is
                     when "00000" =>
                         -- mffs
+                    when "00001" =>
+                        -- mffsce
+                        v.fpscr(FPSCR_VE downto FPSCR_XE) := "00000";
+                    when "10100" | "10101" =>
+                        -- mffscdrn[i] (but we don't implement DRN)
+                        fpscr_mask := x"000000FF";
+                    when "10110" =>
+                        -- mffscrn
+                        fpscr_mask := x"000000FF";
+                        v.fpscr(FPSCR_RN+1 downto FPSCR_RN) :=
+                            r.b(FPSCR_RN+1 downto FPSCR_RN);
+                    when "10111" =>
+                        -- mffscrni
+                        fpscr_mask := x"000000FF";
+                        v.fpscr(FPSCR_RN+1 downto FPSCR_RN) := r.insn(12 downto 11);
+                    when "11000" =>
+                        -- mffsl
+                        fpscr_mask := x"0007F0FF";
                     when others =>
                         illegal := '1';
                 end case;
@@ -155,7 +226,9 @@ begin
 
         -- Data path.
         -- Just enough to read FPSCR for now.
-        fp_result <= x"00000000" & r.fpscr;
+        v.r := x"00000000" & (r.fpscr and fpscr_mask);
+
+        fp_result <= r.r;
 
         v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or
                              (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI)));
diff --git a/insn_helpers.vhdl b/insn_helpers.vhdl
index be3892a..519aa76 100644
--- a/insn_helpers.vhdl
+++ b/insn_helpers.vhdl
@@ -41,6 +41,7 @@ package insn_helpers is
     function insn_fra (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_frb (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_frc (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_u (insn_in : std_ulogic_vector) return std_ulogic_vector;
 end package insn_helpers;
 
 package body insn_helpers is
@@ -238,4 +239,9 @@ package body insn_helpers is
     begin
         return insn_in(10 downto 6);
     end;
+
+    function insn_u(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(15 downto 12);
+    end;
 end package body insn_helpers;

From cb27353f37331e5eb34c19d283334ce36bcea27b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 31 Aug 2020 13:10:51 +1000
Subject: [PATCH 08/30] tests/fpu: Test remaining FPSCR-related instructions

This adds tests for mffsce, mffscrn, mffscrni, mffsl, mcrfs, mtfsfi,
mtfsb0 and mtfsb1.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 tests/fpu/fpu.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 2 deletions(-)

diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 54811ed..f9c4245 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -272,9 +272,9 @@ unsigned int test4vals[] = {
 
 int test4(long arg)
 {
-	unsigned long fsi, fpscr;
+	unsigned long fsi, fso, fpscr;
 	long i;
-	unsigned long cr;
+	unsigned long cr, mask;
 
 	/* check we can do basic mtfsf and mffs */
 	i = 1;
@@ -298,6 +298,59 @@ int test4(long arg)
 		asm("mffs. 6; mfcr %0" : "=r" (cr) : : "cr1");
 		if (((cr >> 24) & 0xf) != ((fpscr >> 28) & 0x1f))
 			return 16 * i + 18;
+		asm("mffsce 12; stfd 12,0(%0)" : : "b" (&fso) : "memory");
+		if (fso != fpscr)
+			return 16 * i + 19;
+		fpscr = fpscr_eval(fpscr & ~0xf8);
+		if (get_fpscr() != fpscr)
+			return 16 * i + 20;
+		asm("lfd 7,0(%0); mtfsf 0xff,7,0,0" : : "b" (&fsi));
+		fpscr = fpscr_eval(fsi);
+		fsi = ~fsi;
+		asm("lfd 14,0(%0); mffscrn 15,14; stfd 15,0(%1)"
+		    : : "b" (&fsi), "b" (&fso) : "memory");
+		if (fso != (fpscr & 0xff))
+			return 16 * i + 21;
+		fpscr = (fpscr & ~3) | (fsi & 3);
+		if (get_fpscr() != fpscr)
+			return 16 * i + 22;
+		fso = ~fso;
+		asm("mffscrni 16,1; stfd 16,0(%0)" : : "b" (&fso) : "memory");
+		if (fso != (fpscr & 0xff))
+			return 16 * i + 23;
+		fpscr = (fpscr & ~3) | 1;
+		if (get_fpscr() != fpscr)
+			return 16 * i + 24;
+		asm("mffsl 17; stfd 17,0(%0)" : : "b" (&fso) : "memory");
+		mask = ((1 << (63-45+1)) - (1 << (63-51))) | ((1 << (63-56+1)) - (1 << (63-63)));
+		if (fso != (fpscr & mask))
+			return 16 * i + 25;
+		asm("mcrfs 0,3; mcrfs 7,0; mfcr %0" : "=r" (cr) : : "cr0", "cr7");
+		fso = fpscr_eval(fpscr & ~0x80000);
+		if (((cr >> 28) & 0xf) != ((fpscr >> 16) & 0xf) ||
+		    ((cr >> 0) & 0xf) != ((fso >> 28) & 0xf))
+			return 16 * i + 26;
+		fpscr = fso & 0x6fffffff;
+		asm("mtfsfi 0,7,0");
+		fpscr = fpscr_eval((fpscr & 0x0fffffff) | 0x70000000);
+		if (get_fpscr() != fpscr)
+			return 16 * i + 27;
+		asm("mtfsb0 21");
+		fpscr = fpscr_eval(fpscr & ~(1 << (31-21)));
+		if (get_fpscr() != fpscr)
+			return 16 * i + 28;
+		asm("mtfsb1 21");
+		fpscr = fpscr_eval(fpscr | (1 << (31-21)));
+		if (get_fpscr() != fpscr)
+			return 16 * i + 29;
+		asm("mtfsb0 24");
+		fpscr = fpscr_eval(fpscr & ~(1 << (31-24)));
+		if (get_fpscr() != fpscr)
+			return 16 * i + 30;
+		asm("mtfsb1. 24; mfcr %0" : "=r" (cr));
+		fpscr = fpscr_eval(fpscr | (1 << (31-24)));
+		if (get_fpscr() != fpscr || ((cr >> 24) & 0xf) != ((fpscr >> 28) & 0xf))
+			return 16 * i + 31;
 	}
 	return 0;
 }

From b628af6176bd0bfa0289fa823ec205f48988ec53 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 15 Jul 2020 14:28:06 +1000
Subject: [PATCH 09/30] FPU: Implement fmr and related instructions

This implements fmr, fneg, fabs, fnabs and fcpsgn and adds tests
for them.

This adds logic to unpack and repack floating-point data from the
64-bit packed form (as stored in memory and the register file) into
the unpacked form in the fpu_reg_type record.  This is not strictly
necessary for fmr et al., but will be useful for when we do actual
arithmetic.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   5 ++
 decode2.vhdl               |   3 +
 decode_types.vhdl          |   2 +-
 fpu.vhdl                   | 144 ++++++++++++++++++++++++++++++++++---
 tests/fpu/fpu.c            |  34 +++++++++
 tests/test_fpu.bin         | Bin 12504 -> 12504 bytes
 tests/test_fpu.console_out |   1 +
 7 files changed, 180 insertions(+), 9 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 343c0c3..5f5fb80 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -428,6 +428,11 @@ architecture behaviour of decode1 is
         2#011000100#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/6=mtfsfi
         2#011110010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 18/7=mffs family
         2#011110110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 22/7=mtfsf
+        2#100000000#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/8=fcpsgn
+        2#100000001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/8=fneg
+        2#100000010#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/8=fmr
+        2#100000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/8=fnabs
+        2#100001000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  8/8=fabs
         others => illegal_inst
         );
 
diff --git a/decode2.vhdl b/decode2.vhdl
index 8b2ab8c..ec8232f 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -80,6 +80,8 @@ architecture behaviour of decode2 is
             return (is_fast_spr(ispr), ispr, reg_data);
         elsif t = CIA then
             return ('0', (others => '0'), instr_addr);
+        elsif HAS_FPU and t = FRA then
+            return ('1', fpr_to_gspr(insn_fra(insn_in)), reg_data);
         else
             return ('0', (others => '0'), (others => '0'));
         end if;
@@ -300,6 +302,7 @@ begin
     end process;
 
     r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR
+                       else fpr_to_gspr(insn_fra(d_in.insn)) when d_in.decode.input_reg_a = FRA and HAS_FPU
                        else gpr_to_gspr(insn_ra(d_in.insn));
     r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR
                        else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 5eaef50..08fdc4a 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -23,7 +23,7 @@ package decode_types is
                          OP_BCD, OP_ADDG6S,
                          OP_FETCH_FAILED
 			 );
-    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA);
+    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA);
     type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
                            CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB);
     type input_reg_c_t is (NONE, RS, RCR, FRS);
diff --git a/fpu.vhdl b/fpu.vhdl
index 047bf2d..3711b35 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -24,9 +24,20 @@ entity fpu is
 end entity fpu;
 
 architecture behaviour of fpu is
+    type fp_number_class is (ZERO, FINITE, INFINITY, NAN);
+
+    constant EXP_BITS : natural := 13;
+
+    type fpu_reg_type is record
+        class    : fp_number_class;
+        negative : std_ulogic;
+        exponent : signed(EXP_BITS-1 downto 0);         -- unbiased
+        mantissa : std_ulogic_vector(63 downto 0);      -- 10.54 format
+    end record;
 
     type state_t is (IDLE,
-                     DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF);
+                     DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
+                     DO_FMR);
 
     type reg_type is record
         state        : state_t;
@@ -41,9 +52,14 @@ architecture behaviour of fpu is
         is_cmp       : std_ulogic;
         single_prec  : std_ulogic;
         fpscr        : std_ulogic_vector(31 downto 0);
-        b            : std_ulogic_vector(63 downto 0);
+        a            : fpu_reg_type;
+        b            : fpu_reg_type;
         r            : std_ulogic_vector(63 downto 0);
+        result_sign  : std_ulogic;
+        result_class : fp_number_class;
+        result_exp   : signed(EXP_BITS-1 downto 0);
         writing_back : std_ulogic;
+        int_result   : std_ulogic;
         cr_result    : std_ulogic_vector(3 downto 0);
         cr_mask      : std_ulogic_vector(7 downto 0);
     end record;
@@ -51,6 +67,72 @@ architecture behaviour of fpu is
     signal r, rin : reg_type;
 
     signal fp_result     : std_ulogic_vector(63 downto 0);
+    signal opsel_r       : std_ulogic_vector(1 downto 0);
+    signal result        : std_ulogic_vector(63 downto 0);
+
+    -- Split a DP floating-point number into components and work out its class.
+    -- If is_int = 1, the input is considered an integer
+    function decode_dp(fpr: std_ulogic_vector(63 downto 0); is_int: std_ulogic) return fpu_reg_type is
+        variable r       : fpu_reg_type;
+        variable exp_nz  : std_ulogic;
+        variable exp_ao  : std_ulogic;
+        variable frac_nz : std_ulogic;
+        variable cls     : std_ulogic_vector(2 downto 0);
+    begin
+        r.negative := fpr(63);
+        exp_nz := or (fpr(62 downto 52));
+        exp_ao := and (fpr(62 downto 52));
+        frac_nz := or (fpr(51 downto 0));
+        if is_int = '0' then
+            r.exponent := signed(resize(unsigned(fpr(62 downto 52)), EXP_BITS)) - to_signed(1023, EXP_BITS);
+            if exp_nz = '0' then
+                r.exponent := to_signed(-1022, EXP_BITS);
+            end if;
+            r.mantissa := "000000000" & exp_nz & fpr(51 downto 0) & "00";
+            cls := exp_ao & exp_nz & frac_nz;
+            case cls is
+                when "000"  => r.class := ZERO;
+                when "001"  => r.class := FINITE;    -- denormalized
+                when "010"  => r.class := FINITE;
+                when "011"  => r.class := FINITE;
+                when "110"  => r.class := INFINITY;
+                when others => r.class := NAN;
+            end case;
+        else
+            r.mantissa := fpr;
+            r.exponent := (others => '0');
+            if (fpr(63) or exp_nz or frac_nz) = '1' then
+                r.class := FINITE;
+            else
+                r.class := ZERO;
+            end if;
+        end if;
+        return r;
+    end;
+
+    -- Construct a DP floating-point result from components
+    function pack_dp(sign: std_ulogic; class: fp_number_class; exp: signed(EXP_BITS-1 downto 0);
+                     mantissa: std_ulogic_vector) return std_ulogic_vector is
+        variable result : std_ulogic_vector(63 downto 0);
+    begin
+        result := (others => '0');
+        result(63) := sign;
+        case class is
+            when ZERO =>
+            when FINITE =>
+                if mantissa(54) = '1' then
+                    -- normalized number
+                    result(62 downto 52) := std_ulogic_vector(resize(exp, 11) + 1023);
+                end if;
+                result(51 downto 0) := mantissa(53 downto 2);
+            when INFINITY =>
+                result(62 downto 52) := "11111111111";
+            when NAN =>
+                result(62 downto 52) := "11111111111";
+                result(51 downto 0) := mantissa(53 downto 2);
+        end case;
+        return result;
+    end;
 
 begin
     fpu_0: process(clk)
@@ -85,14 +167,18 @@ begin
 
     fpu_1: process(all)
         variable v           : reg_type;
+        variable adec        : fpu_reg_type;
+        variable bdec        : fpu_reg_type;
         variable fpscr_mask  : std_ulogic_vector(31 downto 0);
         variable illegal     : std_ulogic;
         variable j, k        : integer;
         variable flm         : std_ulogic_vector(7 downto 0);
+        variable int_input   : std_ulogic;
     begin
         v := r;
         illegal := '0';
         v.busy := '0';
+        int_input := '0';
 
         -- capture incoming instruction
         if e_in.valid = '1' then
@@ -101,6 +187,7 @@ begin
             v.fe_mode := or (e_in.fe_mode);
             v.dest_fpr := e_in.frt;
             v.single_prec := e_in.single;
+            v.int_result := '0';
             v.rc := e_in.rc;
             v.is_cmp := e_in.out_cr;
             if e_in.out_cr = '0' then
@@ -108,11 +195,19 @@ begin
             else
                 v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn))));
             end if;
-            v.b := e_in.frb;
+            int_input := '0';
+            if e_in.op = OP_FPOP_I then
+                int_input := '1';
+            end if;
+            adec := decode_dp(e_in.fra, int_input);
+            bdec := decode_dp(e_in.frb, int_input);
+            v.a := adec;
+            v.b := bdec;
         end if;
 
         v.writing_back := '0';
         v.instr_done := '0';
+        opsel_r <= "00";
         fpscr_mask := (others => '1');
 
         case r.state is
@@ -133,6 +228,8 @@ begin
                             else
                                 v.state := DO_MTFSF;
                             end if;
+                        when "01000" =>
+                            v.state := DO_FMR;
                         when others =>
                             illegal := '1';
                     end case;
@@ -177,7 +274,9 @@ begin
                 v.state := IDLE;
 
             when DO_MFFS =>
+                v.int_result := '1';
                 v.writing_back := '1';
+                opsel_r <= "10";
                 case r.insn(20 downto 16) is
                     when "00000" =>
                         -- mffs
@@ -191,7 +290,7 @@ begin
                         -- mffscrn
                         fpscr_mask := x"000000FF";
                         v.fpscr(FPSCR_RN+1 downto FPSCR_RN) :=
-                            r.b(FPSCR_RN+1 downto FPSCR_RN);
+                            r.b.mantissa(FPSCR_RN+1 downto FPSCR_RN);
                     when "10111" =>
                         -- mffscrni
                         fpscr_mask := x"000000FF";
@@ -216,19 +315,48 @@ begin
                 for i in 0 to 7 loop
                     k := i * 4;
                     if flm(i) = '1' then
-                        v.fpscr(k + 3 downto k) := r.b(k + 3 downto k);
+                        v.fpscr(k + 3 downto k) := r.b.mantissa(k + 3 downto k);
                     end if;
                 end loop;
                 v.instr_done := '1';
                 v.state := IDLE;
 
+            when DO_FMR =>
+                v.result_class := r.b.class;
+                v.result_exp := r.b.exponent;
+                if r.insn(9) = '1' then
+                    v.result_sign := '0';              -- fabs
+                elsif r.insn(8) = '1' then
+                    v.result_sign := '1';              -- fnabs
+                elsif r.insn(7) = '1' then
+                    v.result_sign := r.b.negative;     -- fmr
+                elsif r.insn(6) = '1' then
+                    v.result_sign := not r.b.negative; -- fneg
+                else
+                    v.result_sign := r.a.negative;     -- fcpsgn
+                end if;
+                v.writing_back := '1';
+                v.instr_done := '1';
+                v.state := IDLE;
+
         end case;
 
         -- Data path.
-        -- Just enough to read FPSCR for now.
-        v.r := x"00000000" & (r.fpscr and fpscr_mask);
+        case opsel_r is
+            when "00" =>
+                result <= r.b.mantissa;
+            when "10" =>
+                result <= x"00000000" & (r.fpscr and fpscr_mask);
+            when others =>
+                result <= (others => '0');
+        end case;
+        v.r := result;
 
-        fp_result <= r.r;
+        if r.int_result = '1' then
+            fp_result <= r.r;
+        else
+            fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r);
+        end if;
 
         v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or
                              (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI)));
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index f9c4245..46668f8 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -438,6 +438,39 @@ int fpu_test_5(void)
 	return 0;
 }
 
+#define SIGN	0x8000000000000000ul
+
+int test6(long arg)	// checks fmr, fneg, fabs, fnabs and fcpsgn; arg is not used
+{
+	long i;
+	unsigned long results[6];	// 64-bit images written by the stfd instructions below
+	unsigned long v;		// bit pattern of the current input value
+
+	for (i = 0; i < sizeof(sp_dp_equiv) / sizeof(sp_dp_equiv[0]); ++i) {
+		v = sp_dp_equiv[i].dp;
+		asm("lfd%U0%X0 3,%0; fmr 6,3; fneg 7,3; stfd 6,0(%1); stfd 7,8(%1)"
+		    : : "m" (sp_dp_equiv[i].dp), "b" (results) : "memory");	// results[0] = fmr, results[1] = fneg
+		asm("fabs 9,6; fnabs 10,6; stfd 9,16(%0); stfd 10,24(%0)"
+		    : : "b" (results) : "memory");	// results[2] = fabs, results[3] = fnabs
+		asm("fcpsgn 4,9,3; stfd 4,32(%0); fcpsgn 5,10,3; stfd 5,40(%0)"
+		    : : "b" (results) : "memory");	// NOTE(review): f3, f6, f9, f10 live across asm statements but are not declared as clobbers
+		if (results[0] != v ||			// fmr: unchanged
+		    results[1] != (v ^ SIGN) ||		// fneg: sign flipped
+		    results[2] != (v & ~SIGN) ||	// fabs: sign cleared
+		    results[3] != (v | SIGN) ||		// fnabs: sign set
+		    results[4] != (v & ~SIGN) ||	// fcpsgn taking its sign from f9 (the fabs result)
+		    results[5] != (v | SIGN))		// fcpsgn taking its sign from f10 (the fnabs result)
+			return i + 1;	// 1-based index of the first failing table entry
+	}
+	return 0;	// every entry matched
+}
+
+int fpu_test_6(void)	// entry point registered as test 6 in main()
+{
+	enable_fp();	// defined elsewhere in fpu.c; presumably sets MSR[FP] -- confirm
+	return trapit(0, test6);	// trapit is defined elsewhere; assumed to call test6(0) -- confirm
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -469,6 +502,7 @@ int main(void)
 	do_test(3, fpu_test_3);
 	do_test(4, fpu_test_4);
 	do_test(5, fpu_test_5);
+	do_test(6, fpu_test_6);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 6bac86156f19b99c44efc42659d3d5f6219d6486..4fb260e1d5a3e4f37deea64098b82e55151aceb2 100755
GIT binary patch
delta 1357
zcmY+DZA@Eb6vxlK<#v}WG?!9J8E>U;(1ME74;PuCMVy6BWMm|o>B7s<b#c+jm!h|}
zkUPT?vp~+iXtqcUqAXjo1wKHc$%tgpM3c>^=@Lk`j2Q|PyUh>06d%t6-3L!{^8Ek5
z^Yq+%p652vo#-C<Tr91)abZHiF{>Qg5B7Qh=$bHiqe@(?XDqIgRU@xR(c5fJ%>g~K
z<OxS)2qrD9?x+mmpGIsqGwrr}8Gu^}8U9Kr2kkq`^Zp$LJa=ty_^x6*pRvPd3B`6L
zQw<Y|+pIDP?OsiRp@0Cr>h!)|%?|Ol1Pi>Cj5w;Ip!&3BHsrs@?l1enHvsKbT7o=d
z{(93!%fK=VBq8Y2=82_wvtX4rLwaLZ`?Ra1d1D3S`eO{5_yFk2kDtz4XCXeFz^9ZA
zwaqZ(k8Cf+1lsT@wO13fwQ8yjG8W=>-r!EQK2Q?YZ1m;^@ODq{^G55SQjLV1(Rl@e
zPkVinLX){e%vG?*>=WH7ykIUDGI-5gB`&VvpXMg<i#0q@a8T@9!?uDp-_Irp&aui3
z#=dT3Y^V|7W&?zOXjtmH)Npx+-eGk-7J2exEVYWa3OuG6+8%kJ@1ogN+$uO>yn0$!
zJ6G|9<*<}^Pgiqs(h@Q*ysN9TE4XE8H69@tUcuJFR<SM*x2y*+T_&3^Z?h0R?+~Bw
z#co|qCaJ@-Xt&BjC(agrAU@nepVc9jtzet=ilF1BwLuE*hu|XC7rh~*@qAH*a0Vxf
zYJ?X2qo`5b_z11G4&fgR+ny7lAHTGfKl6?eg8b0i(=n|sd)+Nr_7Xk#<*eohc@ZDl
zz7o#jx#AZDZECjoTZ1U~V^5jmsM6o7!k}aH+$IZ25LYAH4`Oqq8S6P@quLqL6XM78
zQN`8CEVmQy+~uF_^3g1>CO(3Jq7r@xeO__z{rExf58+ggu9nfob1#x_?$y=2tT&L4
zpVZZzfA{J+<g5F1HI?-e`47o2X1$U8MY@Gwr(Q1)<@DLt7+a#V;Gs~~Ql^Qub3nnF
zpRzW^SejNIoE@W-*BP_Z%7e30lonzx4k-Al@fDK}r*WKAVjay+U+(WDm;Pq#I~;Ub
z@whWz$`JtG#<R{6$wf-XQD=#}nbhzj#`smit0sGX;M3Pk&}0mdEu}n3_3L<pexs8D
zyyGk>9w*iQm@!`GVW*rS_9jZM3Tc7VCG@(S#vBpg`^VVf@<??gWsJINq%Km!c+KSu
z43k=T^8e{B5nCqqv^AgjI<YjdX3{*4P|g#xW4=2iSqwBkcDpM|RZ^w&UmD|^_C`5w
zg67>(P;-=ehp9Nsixj*uzlIDkodb$$N<ZcMc;8)9IJU_cCp`G>IOQ}}dJ3dpwit^{
M1w2P(G}it0AGf;B!2kdN

delta 1040
zcmY+COH30{6o&6inE_F-&;l*57U_dWi4|Q8j|doQ8VZO6Vq9P-pn?IV35%*gflLf0
zYHAKFG@@V-#T5i%T!?{~7#3pC7{i7HViyJ|g0O%fT+hX_@K4Unf4+0)KF*yv=bY1f
zkM|{vJ{VGhA+q_}?kaT<LaI5U&mtiTOCjmXcklQ-9I2KT12sNSOTk(vr4CldPo$|p
z{bpQ4tA=zHC`hVvJUF$@N|GeEBe|_nN$dx6^{&ERWH~>26v|a;eAFtms}i|E=u?^b
z89#hfo#F5LAyl2oSNh?!x-`udPV7@OX^Yb5d6ahL5i*%GeRU$oSN$ZXZx{ElJm_(6
zeDKV#z#FwGyc&nQf5`i2cm<Z!m&BF^S+cD_Y1CPv|B5Va!=osNICxo>hL>R}sz_8L
zytoY6(M3UXM~EGix_IFWuGk0b(KomZd?rA9jL26m!=0GdTnn7g<OtGXVxIw*<{Y;G
zQyLvt1&f+=E(^l6dHjeAO10&j4+gZylis>&B_-ju*o;kk)bV|1&toMXDU?v%cyMiN
z39M^X+ANXSS@*NCn;8Mmx+g<I>@&^@6R{_`#<BI-mkNH;1&<St@zXANpBS4ti_<r4
zQ^yd0t5r!lq+#@w*5uA1uiWRi_xZyBUvNRF&Qz)8|E-4|&1E<2cFIx$-YCm?%qwbS
zDJ+mHF!$EV(k||v)q^mX>SW0u$OX(hFrOdm)j5LB*ZV2;;aQlJ7#0j^mC`Z>P>khc
zv>?;tn~AXsj1FW522ku>kKqNIAzAdIEBpo54Qb*e+OjoxYDg2-&`!gQAx=<ogsg!d
zofX|CWEkUwGIWJdVvN%^qwD!iX*`Z+(qqWTo<OHjC%i(p056Qm;w(C{0iTVgLx0c~
zZ%{e{1BPf&nIZ%WPsk@oH6<5Rpwn;uzx{e-7GwuYup_l1JB6$eJyS7;7g-(LH#vj_
zbdxY|(y4<Kc=h;)vqKVFxIUtw{Q@Y>hSYZmyBS2W30W_#+my--pi(gMG5Vp(oSyLw
lD~vF)(H$5+ky+TOP>l63vqPzSY|3m48JjhCb0Av3{sp(8UgZD)

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 99d32e6..a49bb9b 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -3,3 +3,4 @@ test 02:PASS
 test 03:PASS
 test 04:PASS
 test 05:PASS
+test 06:PASS

From 9e8fb293edd59f355cc1fd020f96dafee0af867c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 16 Jul 2020 15:51:57 +1000
Subject: [PATCH 10/30] FPU: Implement floating convert from integer
 instructions

This implements fcfid, fcfidu, fcfids and fcfidus, which convert
64-bit integer values in an FPR into a floating-point value.
This brings in a lot of the datapath that will be needed in
future, including the shifter, adder, mask generator and
count-leading-zeroes logic, along with the machinery for rounding
to single-precision or double-precision, detecting inexact results,
signalling inexact-result exceptions, and updating result flags
in the FPSCR.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |  19 ++
 fpu.vhdl                   | 506 ++++++++++++++++++++++++++++++++++++-
 tests/fpu/fpu.c            |  87 ++++++-
 tests/test_fpu.bin         | Bin 12504 -> 13504 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 587 insertions(+), 26 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 5f5fb80..83444cf 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -55,6 +55,7 @@ architecture behaviour of decode1 is
     type op_19_subop_array_t is array(0 to 7) of decode_rom_t;
     type op_30_subop_array_t is array(0 to 15) of decode_rom_t;
     type op_31_subop_array_t is array(0 to 1023) of decode_rom_t;
+    type op_59_subop_array_t is array(0 to 31) of decode_rom_t;
     type minor_rom_array_2_t is array(0 to 3) of decode_rom_t;
     type op_63_subop_array_0_t is array(0 to 511) of decode_rom_t;
 
@@ -410,6 +411,13 @@ architecture behaviour of decode1 is
         others   => decode_rom_init
         );
 
+    constant decode_op_59_array : op_59_subop_array_t := (
+        --             unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                          op                               in   out   A   out  in    out  len        ext                                pipe
+        2#01110#  =>  (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fcfid[u]s
+        others => illegal_inst
+        );
+
     constant decode_op_62_array : minor_rom_array_2_t := (
         --              unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
         --                            op                                           in   out   A   out  in    out  len        ext                                 pipe
@@ -433,6 +441,8 @@ architecture behaviour of decode1 is
         2#100000010#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/8=fmr
         2#100000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/8=fnabs
         2#100001000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  8/8=fabs
+        2#111011010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 26/14=fcfid
+        2#111011110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 30/14=fcfidu
         others => illegal_inst
         );
 
@@ -586,6 +596,15 @@ begin
         when 58 =>
             v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0))));
 
+        when 59 =>
+            if HAS_FPU then
+                -- floating point operations, mostly single-precision
+                v.decode := decode_op_59_array(to_integer(unsigned(f_in.insn(5 downto 1))));
+                if f_in.insn(5) = '0' and not std_match(f_in.insn(10 downto 1), "11-1001110") then
+                    vi.override := '1';
+                end if;
+            end if;
+
         when 62 =>
             v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0))));
 
diff --git a/fpu.vhdl b/fpu.vhdl
index 3711b35..fecb7bb 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -37,7 +37,12 @@ architecture behaviour of fpu is
 
     type state_t is (IDLE,
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
-                     DO_FMR);
+                     DO_FMR,
+                     DO_FCFID,
+                     FINISH, NORMALIZE,
+                     ROUND_UFLOW, ROUND_OFLOW,
+                     ROUNDING, ROUNDING_2, ROUNDING_3,
+                     DENORM);
 
     type reg_type is record
         state        : state_t;
@@ -54,21 +59,121 @@ architecture behaviour of fpu is
         fpscr        : std_ulogic_vector(31 downto 0);
         a            : fpu_reg_type;
         b            : fpu_reg_type;
-        r            : std_ulogic_vector(63 downto 0);
+        r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
+        x            : std_ulogic;
         result_sign  : std_ulogic;
         result_class : fp_number_class;
         result_exp   : signed(EXP_BITS-1 downto 0);
+        shift        : signed(EXP_BITS-1 downto 0);
         writing_back : std_ulogic;
         int_result   : std_ulogic;
         cr_result    : std_ulogic_vector(3 downto 0);
         cr_mask      : std_ulogic_vector(7 downto 0);
+        old_exc      : std_ulogic_vector(4 downto 0);
+        update_fprf  : std_ulogic;
+        tiny         : std_ulogic;
+        denorm       : std_ulogic;
+        round_mode   : std_ulogic_vector(2 downto 0);
     end record;
 
     signal r, rin : reg_type;
 
     signal fp_result     : std_ulogic_vector(63 downto 0);
+    signal opsel_a       : std_ulogic_vector(1 downto 0);
+    signal opsel_b       : std_ulogic_vector(1 downto 0);
     signal opsel_r       : std_ulogic_vector(1 downto 0);
+    signal opsel_ainv    : std_ulogic;
+    signal opsel_amask   : std_ulogic;
+    signal in_a          : std_ulogic_vector(63 downto 0);
+    signal in_b          : std_ulogic_vector(63 downto 0);
     signal result        : std_ulogic_vector(63 downto 0);
+    signal carry_in      : std_ulogic;
+    signal lost_bits     : std_ulogic;
+    signal r_hi_nz       : std_ulogic;
+    signal r_lo_nz       : std_ulogic;
+    signal misc_sel      : std_ulogic_vector(3 downto 0);
+
+    -- opsel values
+    constant AIN_R    : std_ulogic_vector(1 downto 0) := "00";
+    constant AIN_A    : std_ulogic_vector(1 downto 0) := "01";
+    constant AIN_B    : std_ulogic_vector(1 downto 0) := "10";
+
+    constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
+    constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
+    constant BIN_MASK : std_ulogic_vector(1 downto 0) := "10";
+
+    constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
+    constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
+    constant RES_MISC  : std_ulogic_vector(1 downto 0) := "11";
+
+    -- Left and right shifter with 120 bit input and 64 bit output.
+    -- Shifts inp left by shift bits and returns the upper 64 bits of
+    -- the result.  The shift parameter is interpreted as a signed
+    -- number in the range -64..63, with negative values indicating
+    -- right shifts.  Vacated bit positions are filled with zeroes.
+    function shifter_64(inp: std_ulogic_vector(119 downto 0);
+                        shift: std_ulogic_vector(6 downto 0))
+        return std_ulogic_vector is
+        variable s1 : std_ulogic_vector(94 downto 0);  -- output of the coarse stage
+        variable s2 : std_ulogic_vector(70 downto 0);  -- output of the middle stage
+        variable result : std_ulogic_vector(63 downto 0);
+    begin
+        case shift(6 downto 5) is  -- coarse stage: -64, -32, 0 or +32
+            when "00" =>
+                s1 := inp(119 downto 25);  -- no shift
+            when "01" =>
+                s1 := inp(87 downto 0) & "0000000";  -- left by 32
+            when "10" =>
+                s1 := x"0000000000000000" & inp(119 downto 89);  -- right by 64
+            when others =>
+                s1 := x"00000000" & inp(119 downto 57);  -- "11": right by 32
+        end case;
+        case shift(4 downto 3) is  -- middle stage: left by 0, 8, 16 or 24
+            when "00" =>
+                s2 := s1(94 downto 24);
+            when "01" =>
+                s2 := s1(86 downto 16);
+            when "10" =>
+                s2 := s1(78 downto 8);
+            when others =>
+                s2 := s1(70 downto 0);
+        end case;
+        case shift(2 downto 0) is  -- fine stage: left by 0 to 7
+            when "000" =>
+                result := s2(70 downto 7);
+            when "001" =>
+                result := s2(69 downto 6);
+            when "010" =>
+                result := s2(68 downto 5);
+            when "011" =>
+                result := s2(67 downto 4);
+            when "100" =>
+                result := s2(66 downto 3);
+            when "101" =>
+                result := s2(65 downto 2);
+            when "110" =>
+                result := s2(64 downto 1);
+            when others =>
+                result := s2(63 downto 0);
+        end case;
+        return result;
+    end;
+
+    -- Generate a mask with 0-bits on the left and 1-bits on the right which
+    -- selects the bits that will be lost in doing a right shift.  The shift
+    -- parameter is the bottom 6 bits of a negative shift count,
+    -- indicating a right shift.
+    function right_mask(shift: unsigned(5 downto 0)) return std_ulogic_vector is
+        variable result: std_ulogic_vector(63 downto 0);
+    begin
+        result := (others => '0');
+        for i in 0 to 63 loop
+            if i >= shift then
+                result(63 - i) := '1';  -- overall, bits (63 - shift) downto 0 end up set
+            end if;
+        end loop;
+        return result;
+    end;
 
     -- Split a DP floating-point number into components and work out its class.
     -- If is_int = 1, the input is considered an integer
@@ -112,7 +217,8 @@ architecture behaviour of fpu is
 
     -- Construct a DP floating-point result from components
     function pack_dp(sign: std_ulogic; class: fp_number_class; exp: signed(EXP_BITS-1 downto 0);
-                     mantissa: std_ulogic_vector) return std_ulogic_vector is
+                     mantissa: std_ulogic_vector; single_prec: std_ulogic)
+        return std_ulogic_vector is
         variable result : std_ulogic_vector(63 downto 0);
     begin
         result := (others => '0');
@@ -124,16 +230,76 @@ architecture behaviour of fpu is
                     -- normalized number
                     result(62 downto 52) := std_ulogic_vector(resize(exp, 11) + 1023);
                 end if;
-                result(51 downto 0) := mantissa(53 downto 2);
+                result(51 downto 29) := mantissa(53 downto 31);
+                if single_prec = '0' then
+                    result(28 downto 0) := mantissa(30 downto 2);
+                end if;
             when INFINITY =>
                 result(62 downto 52) := "11111111111";
             when NAN =>
                 result(62 downto 52) := "11111111111";
-                result(51 downto 0) := mantissa(53 downto 2);
+                result(51 downto 29) := mantissa(53 downto 31);
+                if single_prec = '0' then
+                    result(28 downto 0) := mantissa(30 downto 2);
+                end if;
         end case;
         return result;
     end;
 
+    -- Determine whether to increment when rounding
+    -- Returns rounding_inc & inexact (bit 1 = increment, bit 0 = inexact)
+    -- Assumes x includes the bottom 29 bits of the mantissa already
+    -- if single_prec = 1 (usually arranged by setting set_x = 1 earlier).
+    function fp_rounding(mantissa: std_ulogic_vector(63 downto 0); x: std_ulogic;
+                         single_prec: std_ulogic; rn: std_ulogic_vector(2 downto 0);
+                         sign: std_ulogic)
+        return std_ulogic_vector is
+        variable grx : std_ulogic_vector(2 downto 0);  -- two bits below the LSB, then x
+        variable ret : std_ulogic_vector(1 downto 0);
+        variable lsb : std_ulogic;  -- least significant bit kept at this precision
+    begin
+        if single_prec = '0' then
+            grx := mantissa(1 downto 0) & x;
+            lsb := mantissa(2);
+        else
+            grx := mantissa(30 downto 29) & x;
+            lsb := mantissa(31);
+        end if;
+        ret(1) := '0';
+        ret(0) := or (grx);  -- inexact if any of the three bits is set
+        case rn(1 downto 0) is
+            when "00" =>        -- round to nearest
+                if grx = "100" and rn(2) = '0' then
+                    ret(1) := lsb; -- tie, round to even
+                else
+                    ret(1) := grx(2);  -- also taken for a tie when rn(2) = '1'
+                end if;
+            when "01" =>        -- round towards zero
+            when others =>      -- round towards +/- inf
+                if rn(0) = sign then  -- "10" with sign = 0, or "11" with sign = 1
+                    -- round towards greater magnitude
+                    ret(1) := ret(0);
+                end if;
+        end case;
+        return ret;
+    end;
+
+    -- Determine result flags to write into the FPSCR (a 5-bit vector)
+    function result_flags(sign: std_ulogic; class: fp_number_class; unitbit: std_ulogic)
+        return std_ulogic_vector is
+    begin
+        case class is
+            when ZERO =>
+                return sign & "0010";  -- 00010 for +0, 10010 for -0
+            when FINITE =>
+                return (not unitbit) & sign & (not sign) & "00";  -- top bit set when unitbit = 0
+            when INFINITY =>
+                return '0' & sign & (not sign) & "01";  -- 00101 for +inf, 01001 for -inf
+            when NAN =>
+                return "10001";  -- the same code for every NaN, regardless of sign
+        end case;
+    end;
+
 begin
     fpu_0: process(clk)
     begin
@@ -174,6 +340,25 @@ begin
         variable j, k        : integer;
         variable flm         : std_ulogic_vector(7 downto 0);
         variable int_input   : std_ulogic;
+        variable mask        : std_ulogic_vector(63 downto 0);
+        variable in_a0       : std_ulogic_vector(63 downto 0);
+        variable in_b0       : std_ulogic_vector(63 downto 0);
+        variable misc        : std_ulogic_vector(63 downto 0);
+        variable shift_res   : std_ulogic_vector(63 downto 0);
+        variable round       : std_ulogic_vector(1 downto 0);
+        variable update_fx   : std_ulogic;
+        variable arith_done  : std_ulogic;
+        variable mant_nz     : std_ulogic;
+        variable min_exp     : signed(EXP_BITS-1 downto 0);
+        variable max_exp     : signed(EXP_BITS-1 downto 0);
+        variable bias_exp    : signed(EXP_BITS-1 downto 0);
+        variable new_exp     : signed(EXP_BITS-1 downto 0);
+        variable exp_tiny    : std_ulogic;
+        variable exp_huge    : std_ulogic;
+        variable renormalize : std_ulogic;
+        variable clz         : std_ulogic_vector(5 downto 0);
+        variable set_x       : std_ulogic;
+        variable mshift      : signed(EXP_BITS-1 downto 0);
     begin
         v := r;
         illegal := '0';
@@ -199,16 +384,53 @@ begin
             if e_in.op = OP_FPOP_I then
                 int_input := '1';
             end if;
+            v.tiny := '0';
+            v.denorm := '0';
+            v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN);
             adec := decode_dp(e_in.fra, int_input);
             bdec := decode_dp(e_in.frb, int_input);
             v.a := adec;
             v.b := bdec;
         end if;
 
+        r_hi_nz <= or (r.r(55 downto 31));
+        r_lo_nz <= or (r.r(30 downto 2));
+
+        if r.single_prec = '0' then
+            max_exp := to_signed(1023, EXP_BITS);
+            min_exp := to_signed(-1022, EXP_BITS);
+            bias_exp := to_signed(1536, EXP_BITS);
+        else
+            max_exp := to_signed(127, EXP_BITS);
+            min_exp := to_signed(-126, EXP_BITS);
+            bias_exp := to_signed(192, EXP_BITS);
+        end if;
+        new_exp := r.result_exp - r.shift;
+        exp_tiny := '0';
+        exp_huge := '0';
+        if new_exp < min_exp then
+            exp_tiny := '1';
+        end if;
+        if new_exp > max_exp then
+            exp_huge := '1';
+        end if;
+
         v.writing_back := '0';
         v.instr_done := '0';
-        opsel_r <= "00";
+        v.update_fprf := '0';
+        v.shift := to_signed(0, EXP_BITS);
+        opsel_a <= AIN_R;
+        opsel_ainv <= '0';
+        opsel_amask <= '0';
+        opsel_b <= BIN_ZERO;
+        opsel_r <= RES_SUM;
+        carry_in <= '0';
+        misc_sel <= "0000";
         fpscr_mask := (others => '1');
+        update_fx := '0';
+        arith_done := '0';
+        renormalize := '0';
+        set_x := '0';
 
         case r.state is
             when IDLE =>
@@ -230,10 +452,15 @@ begin
                             end if;
                         when "01000" =>
                             v.state := DO_FMR;
+                        when "01110" =>
+                            -- fcfid[u][s]
+                            v.state := DO_FCFID;
                         when others =>
                             illegal := '1';
                     end case;
                 end if;
+                v.x := '0';
+                v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
 
             when DO_MCRFS =>
                 j := to_integer(unsigned(insn_bfa(r.insn)));
@@ -276,7 +503,7 @@ begin
             when DO_MFFS =>
                 v.int_result := '1';
                 v.writing_back := '1';
-                opsel_r <= "10";
+                opsel_r <= RES_MISC;
                 case r.insn(20 downto 16) is
                     when "00000" =>
                         -- mffs
@@ -322,6 +549,7 @@ begin
                 v.state := IDLE;
 
             when DO_FMR =>
+                opsel_a <= AIN_B;
                 v.result_class := r.b.class;
                 v.result_exp := r.b.exponent;
                 if r.insn(9) = '1' then
@@ -339,29 +567,281 @@ begin
                 v.instr_done := '1';
                 v.state := IDLE;
 
+            when DO_FCFID =>
+                v.result_sign := '0';
+                opsel_a <= AIN_B;
+                if r.insn(8) = '0' and r.b.negative = '1' then
+                    -- fcfid[s] with negative operand, set R = -B
+                    opsel_ainv <= '1';
+                    carry_in <= '1';
+                    v.result_sign := '1';
+                end if;
+                v.result_class := r.b.class;
+                v.result_exp := to_signed(54, EXP_BITS);
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = ZERO then
+                    arith_done := '1';
+                else
+                    v.state := FINISH;
+                end if;
+
+            when FINISH =>
+                if r.r(63 downto 54) /= "0000000001" then
+                    renormalize := '1';
+                    v.state := NORMALIZE;
+                else
+                    set_x := '1';
+                    if exp_tiny = '1' then
+                        v.shift := new_exp - min_exp;
+                        v.state := ROUND_UFLOW;
+                    elsif exp_huge = '1' then
+                        v.state := ROUND_OFLOW;
+                    else
+                        v.shift := to_signed(-2, EXP_BITS);
+                        v.state := ROUNDING;
+                    end if;
+                end if;
+
+            when NORMALIZE =>
+                -- Shift so we have 9 leading zeroes (we know R is non-zero)
+                opsel_r <= RES_SHIFT;
+                set_x := '1';
+                if exp_tiny = '1' then
+                    v.shift := new_exp - min_exp;
+                    v.state := ROUND_UFLOW;
+                elsif exp_huge = '1' then
+                    v.state := ROUND_OFLOW;
+                else
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                end if;
+
+            when ROUND_UFLOW =>
+                v.tiny := '1';
+                if r.fpscr(FPSCR_UE) = '0' then
+                    -- disabled underflow exception case
+                    -- have to denormalize before rounding
+                    opsel_r <= RES_SHIFT;
+                    set_x := '1';
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                else
+                    -- enabled underflow exception case
+                    -- if denormalized, have to normalize before rounding
+                    v.fpscr(FPSCR_UX) := '1';
+                    v.result_exp := r.result_exp + bias_exp;
+                    if r.r(54) = '0' then
+                        renormalize := '1';
+                        v.state := NORMALIZE;
+                    else
+                        v.shift := to_signed(-2, EXP_BITS);
+                        v.state := ROUNDING;
+                    end if;
+                end if;
+
+            when ROUND_OFLOW =>
+                v.fpscr(FPSCR_OX) := '1';
+                if r.fpscr(FPSCR_OE) = '0' then
+                    -- disabled overflow exception
+                    -- result depends on rounding mode
+                    v.fpscr(FPSCR_XX) := '1';
+                    v.fpscr(FPSCR_FI) := '1';
+                    if r.round_mode(1 downto 0) = "00" or
+                        (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then
+                        v.result_class := INFINITY;
+                        v.fpscr(FPSCR_FR) := '1';
+                    else
+                        v.fpscr(FPSCR_FR) := '0';
+                    end if;
+                    -- construct largest representable number
+                    v.result_exp := max_exp;
+                    opsel_r <= RES_MISC;
+                    misc_sel <= "001" & r.single_prec;
+                    arith_done := '1';
+                else
+                    -- enabled overflow exception
+                    v.result_exp := r.result_exp - bias_exp;
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                end if;
+
+            when ROUNDING =>
+                opsel_amask <= '1';
+                round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign);
+                v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
+                if round(1) = '1' then
+                    -- set mask to increment the LSB for the precision
+                    opsel_b <= BIN_MASK;
+                    carry_in <= '1';
+                    v.shift := to_signed(-1, EXP_BITS);
+                    v.state := ROUNDING_2;
+                else
+                    if r.r(54) = '0' then
+                        -- result after masking could be zero, or could be a
+                        -- denormalized result that needs to be renormalized
+                        renormalize := '1';
+                        v.state := ROUNDING_3;
+                    else
+                        arith_done := '1';
+                    end if;
+                end if;
+                if round(0) = '1' then
+                    v.fpscr(FPSCR_XX) := '1';
+                    if r.tiny = '1' then
+                        v.fpscr(FPSCR_UX) := '1';
+                    end if;
+                end if;
+
+            when ROUNDING_2 =>
+                -- Check for overflow during rounding
+                v.x := '0';
+                if r.r(55) = '1' then
+                    opsel_r <= RES_SHIFT;
+                    if exp_huge = '1' then
+                        v.state := ROUND_OFLOW;
+                    else
+                        arith_done := '1';
+                    end if;
+                elsif r.r(54) = '0' then
+                    -- Do CLZ so we can renormalize the result
+                    renormalize := '1';
+                    v.state := ROUNDING_3;
+                else
+                    arith_done := '1';
+                end if;
+
+            when ROUNDING_3 =>
+                mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec);
+                if mant_nz = '0' then
+                    v.result_class := ZERO;
+                    arith_done := '1';
+                else
+                    -- Renormalize result after rounding
+                    opsel_r <= RES_SHIFT;
+                    v.denorm := exp_tiny;
+                    v.shift := new_exp - to_signed(-1022, EXP_BITS);
+                    if new_exp < to_signed(-1022, EXP_BITS) then
+                        v.state := DENORM;
+                    else
+                        arith_done := '1';
+                    end if;
+                end if;
+
+            when DENORM =>
+                opsel_r <= RES_SHIFT;
+                arith_done := '1';
+
         end case;
 
+        if arith_done = '1' then
+            v.writing_back := '1';
+            v.update_fprf := '1';
+            v.instr_done := '1';
+            v.state := IDLE;
+            update_fx := '1';
+        end if;
+
         -- Data path.
+        -- This has A and B input multiplexers, an adder, a shifter,
+        -- count-leading-zeroes logic, and a result mux.
+        if r.single_prec = '1' then
+            mshift := r.shift + to_signed(-29, EXP_BITS);
+        else
+            mshift := r.shift;
+        end if;
+        if mshift < to_signed(-64, EXP_BITS) then
+            mask := (others => '1');
+        elsif mshift >= to_signed(0, EXP_BITS) then
+            mask := (others => '0');
+        else
+            mask := right_mask(unsigned(mshift(5 downto 0)));
+        end if;
+        case opsel_a is
+            when AIN_R =>
+                in_a0 := r.r;
+            when AIN_A =>
+                in_a0 := r.a.mantissa;
+            when others =>
+                in_a0 := r.b.mantissa;
+        end case;
+        if (or (mask and in_a0)) = '1' and set_x = '1' then
+            v.x := '1';
+        end if;
+        if opsel_ainv = '1' then
+            in_a0 := not in_a0;
+        end if;
+        if opsel_amask = '1' then
+            in_a0 := in_a0 and not mask;
+        end if;
+        in_a <= in_a0;
+        case opsel_b is
+            when BIN_ZERO =>
+                in_b0 := (others => '0');
+            when BIN_R =>
+                in_b0 := r.r;
+            when BIN_MASK =>
+                in_b0 := mask;
+            when others =>
+                in_b0 := (others => '0');
+        end case;
+        in_b <= in_b0;
+        if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
+            shift_res := shifter_64(r.r & x"00000000000000",
+                                    std_ulogic_vector(r.shift(6 downto 0)));
+        else
+            shift_res := (others => '0');
+        end if;
         case opsel_r is
-            when "00" =>
-                result <= r.b.mantissa;
-            when "10" =>
-                result <= x"00000000" & (r.fpscr and fpscr_mask);
+            when RES_SUM =>
+                result <= std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
+            when RES_SHIFT =>
+                result <= shift_res;
             when others =>
-                result <= (others => '0');
+                case misc_sel is
+                    when "0000" =>
+                        misc := x"00000000" & (r.fpscr and fpscr_mask);
+                    when "0010" =>
+                        -- mantissa of max representable DP number
+                        misc := x"007ffffffffffffc";
+                    when "0011" =>
+                        -- mantissa of max representable SP number
+                        misc := x"007fffff80000000";
+                    when others =>
+                        misc := x"0000000000000000";
+                end case;
+                result <= misc;
         end case;
         v.r := result;
 
+        if opsel_r = RES_SHIFT then
+            v.result_exp := new_exp;
+        end if;
+
+        if renormalize = '1' then
+            clz := count_left_zeroes(r.r);
+            v.shift := resize(signed('0' & clz) - 9, EXP_BITS);
+        end if;
+
         if r.int_result = '1' then
             fp_result <= r.r;
         else
-            fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r);
+            fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r,
+                                 r.single_prec);
+        end if;
+        if r.update_fprf = '1' then
+            v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class,
+                                                             r.r(54) and not r.denorm);
         end if;
 
         v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or
                              (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI)));
         v.fpscr(FPSCR_FEX) := or (v.fpscr(FPSCR_VX downto FPSCR_XX) and
                                   v.fpscr(FPSCR_VE downto FPSCR_XE));
+        if update_fx = '1' and
+            (v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then
+            v.fpscr(FPSCR_FX) := '1';
+        end if;
         if r.rc = '1' then
             v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
         end if;
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 46668f8..80751d1 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -64,7 +64,7 @@ void print_string(const char *str)
 		putchar(*str);
 }
 
-void print_hex(unsigned long val, int ndigits)
+void print_hex(unsigned long val, int ndigits, const char *str)
 {
 	int i, x;
 
@@ -75,6 +75,7 @@ void print_hex(unsigned long val, int ndigits)
 		else
 			putchar(x + '0');
 	}
+	print_string(str);
 }
 
 // i < 100
@@ -201,12 +202,9 @@ int sp_to_dp(long arg)
 	asm("lfs 20,0(%0); stfd 20,0(%1)"
 	    : : "b" (&sp_dp_equiv[arg].sp), "b" (&dp) : "memory");
 	if (dp != sp_dp_equiv[arg].dp) {
-		print_hex(sp_dp_equiv[arg].sp, 8);
-		print_string(" ");
-		print_hex(dp, 16);
-		print_string(" ");
-		print_hex(sp_dp_equiv[arg].dp, 16);
-		print_string(" ");
+		print_hex(sp_dp_equiv[arg].sp, 8, " ");
+		print_hex(dp, 16, " ");
+		print_hex(sp_dp_equiv[arg].dp, 16, " ");
 	}
 	return dp != sp_dp_equiv[arg].dp;
 }
@@ -465,12 +463,77 @@ int test6(long arg)
 	return 0;
 }
 
+struct int_fp_equiv {
+	long		ival;
+	unsigned long	fp;
+	unsigned long	fp_u;
+	unsigned long	fp_s;
+	unsigned long	fp_us;
+} intvals[] = {
+	{ 0,  0, 0, 0, 0 },
+	{ 1,  0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },
+	{ -1, 0xbff0000000000000, 0x43f0000000000000, 0xbff0000000000000, 0x43f0000000000000 },
+	{ 2,  0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000 },
+	{ -2, 0xc000000000000000, 0x43f0000000000000, 0xc000000000000000, 0x43f0000000000000 },
+	{ 0x12345678, 0x41b2345678000000, 0x41b2345678000000, 0x41b2345680000000, 0x41b2345680000000 },
+	{ 0x0008000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000 },
+	{ 0x0010000000000000, 0x4330000000000000, 0x4330000000000000, 0x4330000000000000, 0x4330000000000000 },
+	{ 0x0020000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000000000001, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000000000002, 0x4340000000000001, 0x4340000000000001, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000000000003, 0x4340000000000002, 0x4340000000000002, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000010000000, 0x4340000008000000, 0x4340000008000000, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000020000000, 0x4340000010000000, 0x4340000010000000, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000030000000, 0x4340000018000000, 0x4340000018000000, 0x4340000020000000, 0x4340000020000000 },
+	{ 0x0020000040000000, 0x4340000020000000, 0x4340000020000000, 0x4340000020000000, 0x4340000020000000 },
+	{ 0x0020000080000000, 0x4340000040000000, 0x4340000040000000, 0x4340000040000000, 0x4340000040000000 },
+	{ 0x0040000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000001, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000002, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000003, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000004, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000005, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000006, 0x4350000000000002, 0x4350000000000002, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000007, 0x4350000000000002, 0x4350000000000002, 0x4350000000000000, 0x4350000000000000 },
+};
+
+int test7(long arg)
+{
+	long i;
+	unsigned long results[4];
+
+	for (i = 0; i < sizeof(intvals) / sizeof(intvals[0]); ++i) {
+		asm("lfd%U0%X0 3,%0; fcfid 6,3; fcfidu 7,3; stfd 6,0(%1); stfd 7,8(%1)"
+		    : : "m" (intvals[i].ival), "b" (results) : "memory");
+		asm("fcfids 9,3; stfd 9,16(%0); fcfidus 10,3; stfd 10,24(%0)"
+		    : : "b" (results) : "memory");
+		if (results[0] != intvals[i].fp ||
+		    results[1] != intvals[i].fp_u ||
+		    results[2] != intvals[i].fp_s ||
+		    results[3] != intvals[i].fp_us) {
+			print_string("\r\n");
+			print_hex(results[0], 16, " ");
+			print_hex(results[1], 16, " ");
+			print_hex(results[2], 16, " ");
+			print_hex(results[3], 16, " ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
 int fpu_test_6(void)
 {
 	enable_fp();
 	return trapit(0, test6);
 }
 
+int fpu_test_7(void)
+{
+	enable_fp();
+	return trapit(0, test7);
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -484,12 +547,9 @@ void do_test(int num, int (*test)(void))
 	} else {
 		fail = 1;
 		print_string("FAIL ");
-		print_hex(ret, 5);
-		print_string(" SRR0=");
-		print_hex(mfspr(SRR0), 16);
-		print_string(" SRR1=");
-		print_hex(mfspr(SRR1), 16);
-		print_string("\r\n");
+		print_hex(ret, 5, " SRR0=");
+		print_hex(mfspr(SRR0), 16, " SRR1=");
+		print_hex(mfspr(SRR1), 16, "\r\n");
 	}
 }
 
@@ -503,6 +563,7 @@ int main(void)
 	do_test(4, fpu_test_4);
 	do_test(5, fpu_test_5);
 	do_test(6, fpu_test_6);
+	do_test(7, fpu_test_7);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 4fb260e1d5a3e4f37deea64098b82e55151aceb2..25d50c77a0d990a40320f8baa84752efe3a6d996 100755
GIT binary patch
delta 3104
zcma)7du&tJ9sb?x;5dP}H{{X0h;I^;#MH~&Qjmi~xz2;RrVUw}wi^PGlu~FxT_;h=
zY)-FB?TJnu3Oa1+9|0OfqpYeX7@4#OZPSOQ(rH^eX;T<op&go>F_;6>=8lZB@7#Nw
zB?Ocm>FRgB?|kR?I_KUy(LerFBayL|DBAza{ODfT?X)V|Pt*r)7r0&Ec5P3rI@q{U
zdG6q*d(=Vw<bmJbad)b|m1y$Z-#FhuP>V-PwYO42@MnHS^x?UW=B%*;T6G;1E-0>a
zCrqO6o_j?qG|L<HRixO5+6p!MBKxQI5X`A)@ra^Ema?l$o0Jr*Dp_6ry<&=-<aJrj
z$3v4h?BH68Jz26^O%$^uCBJ}fQ|Ucr4{8*7$I=~J$N9c>MB|~&Y`8R}wrT9s(x=s$
z-7H!bRrQ_h2W8!=cL)1RS+{bCtt#J`H|C}Y(>yvGE$=M<3{D=o&NskG8;T6l)5+lQ
zB<n6~DbPsCevIWJHdX!&<uL2NvqAm+B>T>tujh>~v0PNJ2P*WUBtT^WMMRS>vLa8l
zserMwyBLa955!}4y%k@h@yEvUIU3{fdnsWUR2Ph0d(T9abea!e%J6qCW%*~9MjWpA
z9yD(C^bgJZ@_mmKP=UXOUN9abU+Y+Y!@xA>q3^rwXeXcxfKS_vMgpWH9K_7hBJ;V$
z6d6YLR#uJA$FeVe9xA>1wy9Gcj_~liL@)i(MQbN@3QtmxFWyG`!Yvsy&iS!67bSwR
zJ+B1=G@SKWF^1DR^`|xPl@hyXz`_N<rB^xp_NV($lZ)0(#;Sbg0OwBywEMTk{YXS)
zh9Qz8g^V^j(H<O8U2znAy^@U95y$-lSnt!U`*k4{+t4&1+Aj*wNoj4St;>GYyCXhS
zJ3c++xiC%i!Gw#_6GOH7WsRE84tb2r6%;w^qBBSo1qI9%@tvrM^T*o~d$t9a;Vy{`
zMRlSIQJsP69t01YZkE-pl{g^rmw<zq3ys(({3v7w$MUzuMYX#NDWV~0vlVnje30)m
zDu4riNhCn%Ac7J<3#<?iHRIU)oZ3-z@TcO^u+x>Lb^DQO?%H+zhi~}9^&4gY;R=a+
zM96QlsmhYdEl&IR*I@5g<vf67M%l8eqj*8ytXioIvZ<<->S*#*WA!r*Q(QLDib30d
zdgh{KML9Yy1oesaSFld0`h#1*i&iZ{$*-bSvFm>}4}u#}zhzwz(TY*<0~Yt~xb6QK
zTKyXGPly_3Vtg&j*&UubrIp?7DKW+M5$~Fc+&i}=gW~!K>;O(Ykqmx;1`C`A+`K&*
z6dwVB7Xd#Dd{$y7@S!J@!7~!)1AhYiGhiLnii*_K<2W=WEnX>|65Zg~j<RC6SgVoW
zBv*?4&SMtLpi`Ulhn1q#=Rwk={W?YWLh$ptP*<vc@{K%-zrK+MT$roxy7uPb2xoH+
zT;wG)oMGv-Ihd`f-&kga^7@H<FX0{hvPcFoj?_O|sADchQ6c+xeGBt9jG04!<@}<}
zk_L-ywEGh0J+n9i^HFe_SzIsXNqoKaS=>&{2f=wICu|(gtKdEnTxUU;jw+q*Kymo_
zuNS(h{XE<1EmC_gvuC_5YTp%h1ZLt2`;E6&rN6T)FnJl~US6yC*{{9T&ZI)rn_(@>
ztDMI%J)U9R%d75tAJcziIIj^QFmyZ<;HJTelw{Zrw}G(TBC0biQk#axm|`Q#YfVwL
zS4E?yIDc7e=|HC8!;T~BQ3adAZX<$p*E_@KKmAj9F;O!lz0jYA{`jr*eVg6X1+g<s
zB2)s}d#|Ef2uTsyg+6%R0k`4vZl`c6grgApnXj>-{Ugj0*XB0(BDj~q<th~imIZeR
zTqovYJb-5jzJl*xV+R`7cZy6m1M2_c`~$JR-fF{9C5p&hC@8%6A-ggn;dVc?r=dL|
zv|Vl;Ic{ZIlh+r-FLS>jF+^O^vNymb1%V*~8F*^gmZrOWJzsDx7%_x8f#(Tue$fC7
z!5zXAKXtO{1t)U`maz*;nJHI=0waUJ?yW7CL$1FqZO6?Ry_r1>3Fg@wwm*|g!MJTR
z(jm>@*rWF@l^RTc9L79n6B6evmRwS*OFy@>q8nnE0V%wl)h!tsEp_v)-NA)*c4zSG
zvpgCdmNjqT*kK8DvNsqVIcsM!CjpdU1xDExpR<t8(LrSCptty(CG#d>Kng1)pR?F)
zZOmCi>T>vYHX(J;Tl;Ue_>KK9Q?f^oZ1sFATeqN2c5FeNL)4E1$adUgp9#$moM*XO
iShwi^)H$V2{faf|k!L(VaDGm>fR25vl(2@tPyP$jogY;I

delta 1800
zcmZvce{54#6vxkd+q&0{br09FK?}TYrK~z;2TI=PVC~qjlx15~z@WyJ!Q_W1XiNr{
zy_VEum`GeMA^tJhM6(1!%*du8@ka(B>hSkJhRE0=PP2*3QHc(jT)($1izJ@p_TKY3
z-+S&o=e^g#otMIHBL8ZlQ25jA(0<I@sUj36Y6ja4wi|5s_Gm??dxf;GbIWqsvG~l9
zZ}KZ*#g#-8mw)4Y13@i@8mp<KsNh3i5M8|d?L9GeM72AiP@tG>h#Ew%U4ApIP?OoH
zxhTdis`txkl>Mnbim5*|w^x!cs4Q{cQYpq36s(+QRw;OfJEu7x^iAAJ9j?Y$Yr#r+
zCZ8QI_z=1+*5$S#3kA<5btkGhKU7UL=p**F)hC~|uo3Gn`SdOpvW4VJ&$45-26?!Z
z{bFm7>e<8d8f9-DJ5aQM^~|f!`VJ{JvdY32q@UTz!V-Dmuk7o>Q*t~PidA{RTnb*v
zrl6>PjLo%|7@Uv!+U)I-RVv@Ck~A&$4`g#xySQu)Mg3lK>QgxsTsbpjtjYFlG?Tf^
zK{5Yk@>C9FmmHko+_xv4@pE*?3qGFe5)L6LD&vRzRn%9b_sU9SJ#vY5ByZquH`>(b
zRi8D{ZD{1ggXlU>bod8_swOlFOi+_25~M?c@+l+2`H5<UqWbiUANvA0pzYc_#trmS
zqwu&2zAVkIfGHG7viWH?1ZGWeoDm*xLO}{uPqZ!Y9OOJ_r&nFy61i!nAamG?Sef1G
zq?cOsyRF!Ro!{TSV|dBXbBHe<$Zm-|&BFGA`Rh%oHm;&(eRX;*nst=DZts@%v1N`G
zQiO#aE9Ajw|1rn&8EEy^Dh0XyP$B>LH2?8hZ-x@-y2Uq0q@j9h78(^AiWPUh3Z8|^
z+HU`3oWokgiV)j_b+3FNBC0-_<{J3MB!4BzJJY-r{EgJ(V4oG)@QNsgM^4=wZ&{Uu
z+wa&K*B3$~d>MXiOI*)O`x)?~+v55^=!1x7!Y|z(*RQAjEckoji>`~f3I5U6xc(J<
zJW*ZLB*(VmX-MOuR*j;pu+U}^+>!o>lo;FOG|aPJ|A_N3DKB<aCoL4#D6}7fH6^~f
zSn<T^Op2Upq=O39@_A){Cf>IHvJCQ9&Y5NYq)xrZXP=Zb+LA}P%7{FN@wz=IE`u|Q
zU7Ji|sS!KmDwp$e*n6%IS~dJ*i~$Q%F|doKz?uYr3FgNryUzI(W?_Ooj-gC)u3;7?
z*bWRUm`ebdm?L7IWEYpX<x7(+v7}rc`<)fL%cZASr`vA0B%=5fwqT}p7Vq3Zd$-hX
zp_)-szyVYX-f@F-@kvviMML}%L`?y>BEAjrKE(H?;$nZ3h1Sl-<05b3|Kqr_o!*6@
z2?CK!n1*o{?1=<(FI{9B#<DBHLQ5AJrHEwWbN>kHHms@69H8TJw^WzBc5RM@0;40Q
zKrTX}i(Y7zKRE9b+O^OIx-#F&5F2pGfTigf3)PK|Tp1R36`g8^z5{yEvDw_ImY}2n
z%P%$8bU-r<jV3fO(e*B{aj?4;2-XKS1y(CWuzHLOVA@T#rF31D2G^Q_hw~Tb7qCX>
z{mIVPqX;x_6h(-1Iy4Uawf6{3s{2o}u~L_(1F%Ctm^e?=I|CLM0493n$9Rracpk9z
o+~QmiVL~0n7-0K61zBJIjb8S5dR{j5_jraSX7Vm%xevYnFKc-|0{{R3

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index a49bb9b..340756c 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -4,3 +4,4 @@ test 03:PASS
 test 04:PASS
 test 05:PASS
 test 06:PASS
+test 07:PASS

From 34b5d4a7b51bdae4e9994d11225b8216312eb793 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 31 Aug 2020 17:26:33 +1000
Subject: [PATCH 11/30] FPU: Implement the frsp instruction

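This adds the invalid operation exception (FPSCR[VXSNAN]) that is
raised when frsp is given a signalling NaN as input, and with it the
ability to convert a signalling NaN to a quiet NaN when the result is
packed.  If the invalid operation exception is enabled (FPSCR[VE] = 1),
neither the result nor FPRF is written.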

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl |  1 +
 fpu.vhdl     | 49 ++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 83444cf..284fb08 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -441,6 +441,7 @@ architecture behaviour of decode1 is
         2#100000010#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/8=fmr
         2#100000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/8=fnabs
         2#100001000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  8/8=fabs
+        2#110000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), --  0/12=frsp
         2#111011010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 26/14=fcfid
         2#111011110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 30/14=fcfidu
         others => illegal_inst
diff --git a/fpu.vhdl b/fpu.vhdl
index fecb7bb..7576562 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -39,6 +39,7 @@ architecture behaviour of fpu is
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
                      DO_FMR,
                      DO_FCFID,
+                     DO_FRSP,
                      FINISH, NORMALIZE,
                      ROUND_UFLOW, ROUND_OFLOW,
                      ROUNDING, ROUNDING_2, ROUNDING_3,
@@ -71,6 +72,7 @@ architecture behaviour of fpu is
         cr_mask      : std_ulogic_vector(7 downto 0);
         old_exc      : std_ulogic_vector(4 downto 0);
         update_fprf  : std_ulogic;
+        quieten_nan  : std_ulogic;
         tiny         : std_ulogic;
         denorm       : std_ulogic;
         round_mode   : std_ulogic_vector(2 downto 0);
@@ -217,7 +219,7 @@ architecture behaviour of fpu is
 
     -- Construct a DP floating-point result from components
     function pack_dp(sign: std_ulogic; class: fp_number_class; exp: signed(EXP_BITS-1 downto 0);
-                     mantissa: std_ulogic_vector; single_prec: std_ulogic)
+                     mantissa: std_ulogic_vector; single_prec: std_ulogic; quieten_nan: std_ulogic)
         return std_ulogic_vector is
         variable result : std_ulogic_vector(63 downto 0);
     begin
@@ -238,7 +240,8 @@ architecture behaviour of fpu is
                 result(62 downto 52) := "11111111111";
             when NAN =>
                 result(62 downto 52) := "11111111111";
-                result(51 downto 29) := mantissa(53 downto 31);
+                result(51) := quieten_nan or mantissa(53);
+                result(50 downto 29) := mantissa(52 downto 31);
                 if single_prec = '0' then
                     result(28 downto 0) := mantissa(30 downto 2);
                 end if;
@@ -348,6 +351,7 @@ begin
         variable round       : std_ulogic_vector(1 downto 0);
         variable update_fx   : std_ulogic;
         variable arith_done  : std_ulogic;
+        variable invalid     : std_ulogic;
         variable mant_nz     : std_ulogic;
         variable min_exp     : signed(EXP_BITS-1 downto 0);
         variable max_exp     : signed(EXP_BITS-1 downto 0);
@@ -384,6 +388,7 @@ begin
             if e_in.op = OP_FPOP_I then
                 int_input := '1';
             end if;
+            v.quieten_nan := '1';
             v.tiny := '0';
             v.denorm := '0';
             v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN);
@@ -429,6 +434,7 @@ begin
         fpscr_mask := (others => '1');
         update_fx := '0';
         arith_done := '0';
+        invalid := '0';
         renormalize := '0';
         set_x := '0';
 
@@ -452,6 +458,8 @@ begin
                             end if;
                         when "01000" =>
                             v.state := DO_FMR;
+                        when "01100" =>
+                            v.state := DO_FRSP;
                         when "01110" =>
                             -- fcfid[u][s]
                             v.state := DO_FCFID;
@@ -552,6 +560,7 @@ begin
                 opsel_a <= AIN_B;
                 v.result_class := r.b.class;
                 v.result_exp := r.b.exponent;
+                v.quieten_nan := '0';
                 if r.insn(9) = '1' then
                     v.result_sign := '0';              -- fabs
                 elsif r.insn(8) = '1' then
@@ -567,6 +576,33 @@ begin
                 v.instr_done := '1';
                 v.state := IDLE;
 
+            when DO_FRSP =>
+                opsel_a <= AIN_B;
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.result_exp := r.b.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                    -- Signalling NAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                set_x := '1';
+                if r.b.class = FINITE then
+                    if r.b.exponent < to_signed(-126, EXP_BITS) then
+                        v.shift := r.b.exponent - to_signed(-126, EXP_BITS);
+                        v.state := ROUND_UFLOW;
+                    elsif r.b.exponent > to_signed(127, EXP_BITS) then
+                        v.state := ROUND_OFLOW;
+                    else
+                        v.shift := to_signed(-2, EXP_BITS);
+                        v.state := ROUNDING;
+                    end if;
+                else
+                    arith_done := '1';
+                end if;
+
             when DO_FCFID =>
                 v.result_sign := '0';
                 opsel_a <= AIN_B;
@@ -735,8 +771,11 @@ begin
         end case;
 
         if arith_done = '1' then
-            v.writing_back := '1';
-            v.update_fprf := '1';
+            -- Enabled invalid exception doesn't write result or FPRF
+            if (invalid and r.fpscr(FPSCR_VE)) = '0' then
+                v.writing_back := '1';
+                v.update_fprf := '1';
+            end if;
             v.instr_done := '1';
             v.state := IDLE;
             update_fx := '1';
@@ -827,7 +866,7 @@ begin
             fp_result <= r.r;
         else
             fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r,
-                                 r.single_prec);
+                                 r.single_prec, r.quieten_nan);
         end if;
         if r.update_fprf = '1' then
             v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class,

From 36130f1db351eec8bf19b00f4e2f9dd92276810b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sun, 19 Jul 2020 11:53:01 +1000
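Subject: [PATCH 12/30] tests/fpu: Add tests for frsp

Test 8 loads the FPSCR from a table entry, executes frsp on a
double-precision input and compares the result with the expected
value.  The table covers all four rounding modes as well as zero,
NaN, infinity, overflow and denormalized-result cases.

This also replaces the magic FPSCR values in test 5 with named FPS_*
constants and prints the FPSCR when a test fails.
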
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 tests/fpu/fpu.c            |  89 ++++++++++++++++++++++++++++++++-----
 tests/test_fpu.bin         | Bin 13504 -> 14032 bytes
 tests/test_fpu.console_out |   1 +
 3 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 80751d1..aff6d6c 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -10,6 +10,17 @@
 #define MSR_FE0	0x800
 #define MSR_FE1	0x100
 
+#define FPS_RN_NEAR	0
+#define FPS_RN_ZERO	1
+#define FPS_RN_CEIL	2
+#define FPS_RN_FLOOR	3
+#define FPS_XE		0x8
+#define FPS_ZE		0x10
+#define FPS_UE		0x20
+#define FPS_OE		0x40
+#define FPS_VE		0x80
+#define FPS_VXSOFT	0x400
+
 extern int trapit(long arg, int (*func)(long));
 extern void do_rfid(unsigned long msr);
 extern void do_blr(void);
@@ -363,8 +374,8 @@ int test5a(long arg)
 {
 	set_fpscr(0);
 	enable_fp_interrupts();
-	set_fpscr(0x80);	/* set VE */
-	set_fpscr(0x480);	/* set VXSOFT */
+	set_fpscr(FPS_VE);		/* set VE */
+	set_fpscr(FPS_VXSOFT | FPS_VE);	/* set VXSOFT */
 	set_fpscr(0);
 	return 1;		/* not supposed to get here */
 }
@@ -374,8 +385,8 @@ int test5b(long arg)
 	unsigned long msr;
 
 	enable_fp();
-	set_fpscr(0x80);	/* set VE */
-	set_fpscr(0x480);	/* set VXSOFT */
+	set_fpscr(FPS_VE);		/* set VE */
+	set_fpscr(FPS_VXSOFT | FPS_VE);	/* set VXSOFT */
 	asm("mfmsr %0" : "=r" (msr));
 	msr |= MSR_FE0 | MSR_FE1;
 	asm("mtmsrd %0; xori 4,4,0" : : "r" (msr));
@@ -388,8 +399,8 @@ int test5c(long arg)
 	unsigned long msr;
 
 	enable_fp();
-	set_fpscr(0x80);	/* set VE */
-	set_fpscr(0x480);	/* set VXSOFT */
+	set_fpscr(FPS_VE);		/* set VE */
+	set_fpscr(FPS_VXSOFT | FPS_VE);	/* set VXSOFT */
 	asm("mfmsr %0" : "=r" (msr));
 	msr |= MSR_FE0 | MSR_FE1;
 	do_rfid(msr);
@@ -463,6 +474,12 @@ int test6(long arg)
 	return 0;
 }
 
+int fpu_test_6(void)
+{
+	enable_fp();
+	return trapit(0, test6);
+}
+
 struct int_fp_equiv {
 	long		ival;
 	unsigned long	fp;
@@ -522,16 +539,63 @@ int test7(long arg)
 	return 0;
 }
 
-int fpu_test_6(void)
+int fpu_test_7(void)
 {
 	enable_fp();
-	return trapit(0, test6);
+	return trapit(0, test7);
 }
 
-int fpu_test_7(void)
+struct roundvals {
+	unsigned long fpscr;
+	unsigned long dpval;
+	unsigned long spval;
+} roundvals[] = {
+	{ FPS_RN_NEAR,  0, 0 },
+	{ FPS_RN_CEIL,  0x8000000000000000, 0x8000000000000000 },
+	{ FPS_RN_NEAR,  0x402123456789abcd, 0x4021234560000000 },
+	{ FPS_RN_ZERO,  0x402123456789abcd, 0x4021234560000000 },
+	{ FPS_RN_CEIL,  0x402123456789abcd, 0x4021234580000000 },
+	{ FPS_RN_FLOOR, 0x402123456789abcd, 0x4021234560000000 },
+	{ FPS_RN_NEAR,  0x402123457689abcd, 0x4021234580000000 },
+	{ FPS_RN_ZERO,  0x402123457689abcd, 0x4021234560000000 },
+	{ FPS_RN_CEIL,  0x402123457689abcd, 0x4021234580000000 },
+	{ FPS_RN_FLOOR, 0x402123457689abcd, 0x4021234560000000 },
+	{ FPS_RN_NEAR,  0x4021234570000000, 0x4021234580000000 },
+	{ FPS_RN_NEAR,  0x4021234550000000, 0x4021234540000000 },
+	{ FPS_RN_NEAR,  0x7ff123456789abcd, 0x7ff9234560000000 },
+	{ FPS_RN_ZERO,  0x7ffa3456789abcde, 0x7ffa345660000000 },
+	{ FPS_RN_FLOOR, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ FPS_RN_NEAR,  0x47e1234550000000, 0x47e1234540000000 },
+	{ FPS_RN_NEAR,  0x47f1234550000000, 0x7ff0000000000000 },
+	{ FPS_RN_ZERO,  0x47f1234550000000, 0x47efffffe0000000 },
+	{ FPS_RN_CEIL,  0x47f1234550000000, 0x7ff0000000000000 },
+	{ FPS_RN_FLOOR, 0x47f1234550000000, 0x47efffffe0000000 },
+	{ FPS_RN_NEAR,  0x38012345b0000000, 0x38012345c0000000 },
+	{ FPS_RN_NEAR,  0x37c12345b0000000, 0x37c1234400000000 },
+};
+
+int test8(long arg)
+{
+	long i;
+	unsigned long result;
+
+	for (i = 0; i < sizeof(roundvals) / sizeof(roundvals[0]); ++i) {
+		asm("lfd 3,0(%0); lfd 4,8(%0); mtfsf 0,3,1,0; frsp 6,4; stfd 6,0(%1)"
+		    : : "b" (&roundvals[i]), "b" (&result) : "memory");
+		if (result != roundvals[i].spval) {
+			print_string("\r\n");
+			print_hex(i, 4, " ");
+			print_hex(result, 16, " ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_8(void)
 {
 	enable_fp();
-	return trapit(0, test7);
+	return trapit(0, test8);
 }
 
 int fail = 0;
@@ -549,7 +613,9 @@ void do_test(int num, int (*test)(void))
 		print_string("FAIL ");
 		print_hex(ret, 5, " SRR0=");
 		print_hex(mfspr(SRR0), 16, " SRR1=");
-		print_hex(mfspr(SRR1), 16, "\r\n");
+		print_hex(mfspr(SRR1), 16, " FPSCR=");
+		enable_fp();
+		print_hex(get_fpscr(), 8, "\r\n");
 	}
 }
 
@@ -564,6 +630,7 @@ int main(void)
 	do_test(5, fpu_test_5);
 	do_test(6, fpu_test_6);
 	do_test(7, fpu_test_7);
+	do_test(8, fpu_test_8);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 25d50c77a0d990a40320f8baa84752efe3a6d996..81d18542064550fb7064a3683cbcdc7f7048c285 100755
GIT binary patch
delta 2539
zcma)8du)?c6hGg$wsbH?$Jo|wCA3}X9^o;0?Hgm*!Zu2WQjrie20J!hA_*G64GXPU
zDKk2=Ts3H-um$%AVjKYyjf4z?W@5zg2Zn^#(2;Bc+F^o&!9IWIYm0?Q;z{n#`Tfqh
z=W*|CyRHpAo18?hN+NIF_oKW`&}*p7TSw#vw-MY%a2wZz%37T>rPo@YpCO;J9d7w~
z+!GOdInls{8>(7Mh~{OAEGVZC^FRDZboxTiLq5@B%C{lHh=@O)4h4zczOXkQVbZdZ
z-y9LAO(n8%x43Ou26e=n(ILruGsUg(g;GT1WzNaIG@d+%RojrNc2x|F#EP3DVr}Le
zxo5oiH1iw8JwIVa)+ZU{IjP09KdGv_o+Rq3I4;^IRLJjSh|3dRl9f%uo8^@n#Rpjh
z1{(2fYN-?_vYgV(;#$_EwD5YW)RVQGT%!ZRl<koE#O&;)vd$#7oAbo}>=z`zxSw4n
z_FHusV~5pRl$eJjP(mBS%5y@>DVz1(U9~oZ3|6w;vC`G<mP#54=^n-%WZglj#dQr5
zG=<lThMcOYPyMfr{s5)c`*(<+a^7=3nil1T6D3__QzSIjIToGI)o&R~p8Mi!bAdD{
zE}37Hn^ML6+>-HY(#X?>)8cFFbM9=!4Q$VyCU<QW2Xkw57I=j(*DTWV8nU~$)+-Zn
z@K&npSJn1;CJHHt+nrY<zc5Ce$y*g1Qm+o@kgL2SWih5A|InNKCVDGOBn|P^SxG8S
zBU-@cwVudBw4vKb&D|{%X~b;px<}M~-b9NBFhakXipxnU33zCiTUA%j<LdhReBSNz
zbXw)ll*ZxTf~C|pG#fm+viAe@b6b-^NEB3#j;as%a1ITl$s>_k5f&Nw(``LS#Vwsh
zOU?Jx+Fyo8kS3^%CUp|;_(;^{XXd_T;0DP*7tJ=s?Na2rDBjKACw(oZT4qY8M4e@(
z{MGIwA6mAh1kcCv_Tw0?ccd5tU;WKp;3**skI)N0MeMC@;CT!^I_=*Y8{-_f4taYZ
zaj;SFCpGT%2iy?2FTfp(s<o5x^!SE3z8k!eLK!^NgW_QXih$XSCu<BYmSeN<W92uo
zesx%p@DXGz16S3Dl|fuB;}qafLs+>Q$2#DmHDTp!9P5EM1D}jz1MvH6!^+_}9tC_6
zxGj#2<j+zZ>%vMa2z(|KJ`Opux?iN_SkoC;;$H&Bh*)6@rv8gA+uEfr@rJ!~Y~)vk
ztll~+d7BV5zo4QzVjno9qrkzXw8IEn{?^!}!;?woQt%Yh9&j!#&h!u#CkmEkY4sda
zh>Dwunx}I|s9|@k(6K2R9m*(cjqllo%PPf}j$><=45(@!JS-+GY$VI?s_J6`!ki0h
zCAcUw3v<h{W?ffRD>Msp_z;i_oP!An>j0MTx;Q=ADeG>CTa%~Brkld<oaWwhQ&ssz
zQgymD!@YA+IxR4`0WDY#iKr8P)>M;T3jaQ7CnByG-0ZIce^fl<%-1)-y2w^A3`_Ut
z<8vN>e+vAK*pHf=b6GkqG`Imd=PCHZkMZ+xeehp|pJQ39SVQ3Yz-=a)gsxKEv`<=?
z?m}u`tXVz*i%e7+72ZO-p$euiDqbxtF>HZ(C@R_uore7|9fRUbp?z96Ov^1*J&t|k
ziRben->T8+<i_-4Vy7BKT<0xuyU^}7r(o{C9=?bv;7s5Wy>kC9aAt5-oEowOYdJWN
zc&@0{bpY0Z+p5~jt{QEGeUcYNs}1fe?lCLVsRl;uq<Rp$@^Acbacu1N>P26Xqofby
z9ws4NlHbxw{KAEqfD~c5u)2rDjN+24dvGzu!r>v-;+5h|ebHUq(~;KVYq}$bQoT;}
zO8OusPL@y{BsBf>dGosJ0-FoGpEa&VlSlbPo+LiR@fhkq<KuLY3v7!Q&^9C$Nbp+W
zqyh>4(E<rxi;ovzwK+L|67Q=n2x^nL;sKg3Ho+kN1L2A91QQKiKK^mzidlCxo^@if
zSi&gtkc8F%3S3<T3;8vzk@y3Cka?mYHY5rpIeJy~&qaxelX#fX<id$WT0EQy>8vi0
a_$#z4u9NtaWelncVp^P&#)&brj{O5VEAAEm

delta 1740
zcmaKse{54#6vxkd>q<99ef!a|jxM})UALBIR(`&@AuVjPb^|6^j3FQ$8w;|Lpcw|V
zqZQi(435n+5@M7Mg3C0RfZIPRG1I64{6qXhgJL8g8YYMv5ZyA^^Ly70;SW8@$-U=u
zzW3g9PjBD(?ZeMIiG0h6f{kCz3hu_Zg_Z;xi9%qlV69-STcS(4or|QM-Rl?1UFxaM
zZytFpZZ9Vqxp+g@YYA%MN_=HGMFl_mHPN|?=O4;(r!ub)3KJAJ*F+;kuU|ZzR4CKC
zQ7A9Y=M=X*{u=+Ktiv!E%<PxsKo;N1Dv{!xpS?8qjZ6xh(hJ9ReYj#|Dsi|H=a%fH
zax{~VWPbwP`kaN9)d~eZo7Np$q3e5B5Diys;G;Pea;3sQ=WLU0Z5*@&C6y0Zii~|z
zf$ep~pIV%fgMYQm&qyV;T)+cl^01QYka~DY?pni;nF8#!<#Au`28sAy?lSMUIAqSG
zUK_;<l?o?Rx_qW{IZf43-<0S|KdBv*=4xz~)Q$sS?8)=;4oe1)&NWHz@T+sPGyj!_
z!j#q&`i_h94yrR08&1w_>Z3wUqL4P6k*woRo|+m%U}dYdNRoKDb(4I^#7C^|tevwd
za1_OfBCb03>_S(6w3f-c+BrSH#^6DgPg?W1Ex$Q8+TNrUpqVN)Ueonsl?p{Q=zhzu
zk=xR_Y~FJmu;r}Y2bDt7xVUlkJ-zn($*G8!B3e4>^M__Ng<mjIKpa*-8VP%;NfT^i
zl0A;1pH9xbxWoXzXY0jJ^;E$ksf#-b7Rdv#!4C^wN{MtOPPsO&U%uX-VhXp9>$MVk
zD|XiP&6S6dr?|a)Be<wF6}9%YzpL$<{3X&2)-S&tp1H#Sc*it*B`jJecoa<B4>!4g
z#<S31e__1LmWf~26e*Vzx54A76fNJnIi^|hUPUY;u4;;DX~{SRakM$6O`wM&Z$MnS
zC8mi7D&kbcI}rbnjE#tUT4LJSWIPM;Pl!K4j5nmAj+Brb=WX_qwB%pI6z8|>kq07s
z(J^U=zbraACw@gEHQ1<9a5q$Y9TnAa`^X6ch2LLG9VX1>&rQ1x<SKuVrI4>Y<396E
z>y(FLmW$U~rcZZxh+O-L_CJckLO7$P_!%B9*4}guwY>eiu3rTfrobVkw(GiHF91w1
zAAA686r(V~8sIT}JyeXs1ltPl0&@re6Jr?OcZ2gwoN~h-yt1TB4&CGzOUk4Re7R&^
z#33R7Czgn@#_Y}X@9CBHm}%uDqu&Np1#TGCb@4E%+N>h~B(g?7LXmGq-t#~C81fCs
z??|i{``4Rk)y(=$bR2@?{~-`}`U3eo$cw{-sqhP6cfrJcsVdD+z0xYP4_mKE6fW+4
z7aHFfe_U!eI%ITxjIWfsjSZNd9^)*R(-_5c@hyJBWj79B>bcFGF1!CcrqSCExAiL6
z0kD}8L|NltJz!O$1XvwBBL(I1n5)*e0W<Y)T|XmIHPfZ_O2Kq9RU7@6{Il1YsRolo
z2lqjH8rqXWo9OT+4!a$04d3W}0>Z?R;$@rwiwOWLh5O*P2|nj`yXq%&T_9mX9)&l9
jdC&-9g7v_|g9Qsl41?YUhowQIH`T!N7n`_f=?DJ+=ocTF

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 340756c..25e791c 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -5,3 +5,4 @@ test 04:PASS
 test 05:PASS
 test 06:PASS
 test 07:PASS
+test 08:PASS

From 03d1aa968a76f338c4caf9c742e9e59d8a8d13e0 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 22 Jul 2020 12:19:12 +1000
Subject: [PATCH 13/30] FPU: Implement floating convert to integer instructions

This implements fctiw, fctiwz, fctiwu, fctiwuz, fctid, fctidz, fctidu
and fctiduz, and adds tests for them.

There are some subtleties around the setting of the inexact (XX) and
invalid conversion (VXCVI) flags in the FPSCR.  If the rounded value
ends up being out of range, we need to set VXCVI and not XX.  For a
conversion to unsigned word or doubleword of a negative value that
rounds to zero, we need to set XX and not VXCVI.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   8 ++
 fpu.vhdl                   | 157 ++++++++++++++++++++++++++++++++++++-
 tests/fpu/fpu.c            | 157 +++++++++++++++++++++++++++++++++++++
 tests/test_fpu.console_out |   2 +
 4 files changed, 321 insertions(+), 3 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 284fb08..c659e3e 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -442,8 +442,16 @@ architecture behaviour of decode1 is
         2#100000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/8=fnabs
         2#100001000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  8/8=fabs
         2#110000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), --  0/12=frsp
+        2#111000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/14=fctiw
+        2#111000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/14=fctiwu
+        2#111011001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 25/14=fctid
         2#111011010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 26/14=fcfid
+        2#111011101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 29/14=fctidu
         2#111011110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 30/14=fcfidu
+        2#111100000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/15=fctiwz
+        2#111100100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/15=fctiwuz
+        2#111111001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 25/15=fctidz
+        2#111111101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 29/15=fctiduz
         others => illegal_inst
         );
 
diff --git a/fpu.vhdl b/fpu.vhdl
index 7576562..6301fa7 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -38,8 +38,10 @@ architecture behaviour of fpu is
     type state_t is (IDLE,
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
                      DO_FMR,
-                     DO_FCFID,
+                     DO_FCFID, DO_FCTI,
                      DO_FRSP,
+                     INT_SHIFT, INT_ROUND, INT_ISHIFT,
+                     INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
                      ROUND_UFLOW, ROUND_OFLOW,
                      ROUNDING, ROUNDING_2, ROUNDING_3,
@@ -363,6 +365,8 @@ begin
         variable clz         : std_ulogic_vector(5 downto 0);
         variable set_x       : std_ulogic;
         variable mshift      : signed(EXP_BITS-1 downto 0);
+        variable need_check  : std_ulogic;
+        variable msb         : std_ulogic;
     begin
         v := r;
         illegal := '0';
@@ -461,8 +465,15 @@ begin
                         when "01100" =>
                             v.state := DO_FRSP;
                         when "01110" =>
-                            -- fcfid[u][s]
-                            v.state := DO_FCFID;
+                            if int_input = '1' then
+                                -- fcfid[u][s]
+                                v.state := DO_FCFID;
+                            else
+                                v.state := DO_FCTI;
+                            end if;
+                        when "01111" =>
+                            v.round_mode := "001";
+                            v.state := DO_FCTI;
                         when others =>
                             illegal := '1';
                     end case;
@@ -603,6 +614,47 @@ begin
                     arith_done := '1';
                 end if;
 
+            when DO_FCTI =>
+                -- instr bit 9: 1=dword 0=word
+                -- instr bit 8: 1=unsigned 0=signed
+                -- instr bit 1: 1=round to zero 0=use fpscr[RN]
+                opsel_a <= AIN_B;
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.result_exp := r.b.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                    -- Signalling NAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+
+                v.int_result := '1';
+                case r.b.class is
+                    when ZERO =>
+                        arith_done := '1';
+                    when FINITE =>
+                        if r.b.exponent >= to_signed(64, EXP_BITS) or
+                            (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then
+                            v.state := INT_OFLOW;
+                        elsif r.b.exponent >= to_signed(52, EXP_BITS) then
+                            -- integer already, no rounding required,
+                            -- shift into final position
+                            v.shift := r.b.exponent - to_signed(54, EXP_BITS);
+                            if r.insn(8) = '1' and r.b.negative = '1' then
+                                v.state := INT_OFLOW;
+                            else
+                                v.state := INT_ISHIFT;
+                            end if;
+                        else
+                            v.shift := r.b.exponent - to_signed(52, EXP_BITS);
+                            v.state := INT_SHIFT;
+                        end if;
+                    when INFINITY | NAN =>
+                        v.state := INT_OFLOW;
+                end case;
+
             when DO_FCFID =>
                 v.result_sign := '0';
                 opsel_a <= AIN_B;
@@ -622,6 +674,81 @@ begin
                     v.state := FINISH;
                 end if;
 
+            when INT_SHIFT =>          -- fcti*: entered from DO_FCTI when the exponent is below 52
+                opsel_r <= RES_SHIFT;  -- select RES_SHIFT as the result source for this cycle
+                set_x := '1';          -- NOTE(review): presumably captures shifted-out bits in r.x for rounding; confirm
+                v.state := INT_ROUND;  -- rounding decision is made in the next state
+                v.shift := to_signed(-2, EXP_BITS);    -- presumably the shift amount applied during INT_ROUND; confirm
+
+            when INT_ROUND =>          -- fcti*: compute the rounding decision for the shifted value
+                opsel_r <= RES_SHIFT;
+                round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign);
+                v.fpscr(FPSCR_FR downto FPSCR_FI) := round;    -- the two returned bits land in FPSCR[FR] and FPSCR[FI]
+                -- Unsigned forms (insn bit 8 set): a negative value that does not round to 0 is out of range
+                if r.insn(8) = '1' and r.result_sign = '1' and
+                    (r_hi_nz or r_lo_nz or v.fpscr(FPSCR_FR)) = '1' then     -- uses the FR bit just computed (v, not r)
+                    v.state := INT_OFLOW;
+                else
+                    v.state := INT_FINAL;
+                end if;
+
+            when INT_ISHIFT =>         -- fcti*: entered from DO_FCTI when exponent >= 52 (already an integer)
+                opsel_r <= RES_SHIFT;  -- shift amount was set in DO_FCTI as exponent - 54
+                v.state := INT_FINAL;  -- goes straight to INT_FINAL, bypassing INT_ROUND
+
+            when INT_FINAL =>
+                -- Negate if necessary, and increment for rounding if needed
+                opsel_ainv <= r.result_sign;
+                carry_in <= r.fpscr(FPSCR_FR) xor r.result_sign;     -- assuming ainv = bitwise invert: sign=1,FR=0 gives -x; sign=1,FR=1 gives -(x+1)
+                -- Check for possible overflows; only decided here from the top bits of r.r, confirmed in INT_CHECK
+                case r.insn(9 downto 8) is     -- bit 9: 1=dword 0=word, bit 8: 1=unsigned 0=signed
+                    when "00" =>        -- fctiw[z]
+                        need_check := r.r(31) or (r.r(30) and not r.result_sign);
+                    when "01" =>        -- fctiwu[z]
+                        need_check := r.r(31);
+                    when "10" =>        -- fctid[z]
+                        need_check := r.r(63) or (r.r(62) and not r.result_sign);
+                    when others =>      -- fctidu[z]
+                        need_check := r.r(63);
+                end case;
+                if need_check = '1' then
+                    v.state := INT_CHECK;
+                else
+                    if r.fpscr(FPSCR_FI) = '1' then    -- FI was set in INT_ROUND: result is inexact
+                        v.fpscr(FPSCR_XX) := '1';
+                    end if;
+                    arith_done := '1';
+                end if;
+
+            when INT_CHECK =>          -- fcti*: reached from INT_FINAL when need_check was set
+                if r.insn(9) = '0' then        -- word forms: examine bit 31
+                    msb := r.r(31);
+                else                           -- doubleword forms: examine bit 63
+                    msb := r.r(63);
+                end if;
+                misc_sel <= '1' & r.insn(9 downto 8) & r.result_sign;  -- picks the "1xxx" saturation constant
+                if (r.insn(8) = '0' and msb /= r.result_sign) or       -- signed: top bit disagrees with the sign
+                    (r.insn(8) = '1' and msb /= '1') then              -- unsigned: top bit is no longer set
+                    opsel_r <= RES_MISC;       -- overflow: return the saturation constant
+                    v.fpscr(FPSCR_VXCVI) := '1';
+                    invalid := '1';
+                else
+                    if r.fpscr(FPSCR_FI) = '1' then    -- in range but inexact
+                        v.fpscr(FPSCR_XX) := '1';
+                    end if;
+                end if;
+                arith_done := '1';
+
+            when INT_OFLOW =>          -- fcti*: operand is out of range, infinite or NaN
+                opsel_r <= RES_MISC;   -- result is one of the "1xxx" saturation constants
+                misc_sel <= '1' & r.insn(9 downto 8) & r.result_sign;
+                if r.b.class = NAN then
+                    misc_sel(0) <= '1';        -- NaN always selects the "negative" constant, whatever its sign
+                end if;
+                v.fpscr(FPSCR_VXCVI) := '1';
+                invalid := '1';
+                arith_done := '1';
+
             when FINISH =>
                 if r.r(63 downto 54) /= "0000000001" then
                     renormalize := '1';
@@ -846,6 +973,30 @@ begin
                     when "0011" =>
                         -- mantissa of max representable SP number
                         misc := x"007fffff80000000";
+                    when "1000" =>      -- "1xxx": '1' & insn(9 downto 8) & sign, driven by INT_CHECK / INT_OFLOW
+                        -- max positive result for fctiw[z]
+                        misc := x"000000007fffffff";
+                    when "1001" =>
+                        -- most negative result for fctiw[z] (sign-extended to 64 bits)
+                        misc := x"ffffffff80000000";
+                    when "1010" =>
+                        -- max positive result for fctiwu[z]
+                        misc := x"00000000ffffffff";
+                    when "1011" =>
+                        -- result for negative or NaN operand of fctiwu[z]: saturates to 0
+                        misc := x"0000000000000000";
+                    when "1100" =>
+                        -- max positive result for fctid[z]
+                        misc := x"7fffffffffffffff";
+                    when "1101" =>
+                        -- most negative result for fctid[z]
+                        misc := x"8000000000000000";
+                    when "1110" =>
+                        -- max positive result for fctidu[z]
+                        misc := x"ffffffffffffffff";
+                    when "1111" =>
+                        -- result for negative or NaN operand of fctidu[z]: saturates to 0
+                        misc := x"0000000000000000";
                     when others =>
                         misc := x"0000000000000000";
                 end case;
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index aff6d6c..3c6a9bd 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -19,6 +19,7 @@
 #define FPS_UE		0x20
 #define FPS_OE		0x40
 #define FPS_VE		0x80
+#define FPS_VXCVI	0x100
 #define FPS_VXSOFT	0x400
 
 extern int trapit(long arg, int (*func)(long));
@@ -598,6 +599,160 @@ int fpu_test_8(void)
 	return trapit(0, test8);
 }
 
+struct cvtivals {	/* one FP-to-integer conversion test vector (tests 9 and 10) */
+	unsigned long dval;	/* input: bit pattern loaded with lfd */
+	long lval;	/* expected fctid[z] result */
+	unsigned long ulval;	/* expected fctidu[z] result */
+	int ival;	/* expected fctiw[z] result */
+	unsigned int uival;	/* expected fctiwu[z] result */
+	unsigned char invalids[4];	/* expected VXCVI after each of the four, in the order above */
+} cvtivals[] = {	/* expectations for the non-z forms with FPSCR[RN] = nearest (test9) */
+	{ 0x0000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} },	/* +0.0 */
+	{ 0x8000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} },	/* -0.0 */
+	{ 0x3fdfffffffffffff, 0, 0, 0, 0, {0, 0, 0, 0} },	/* just below 0.5 */
+	{ 0x3ff0000000000000, 1, 1, 1, 1, {0, 0, 0, 0} },	/* 1.0 */
+	{ 0xbff0000000000000, -1, 0, -1, 0, {0, 1, 0, 1} },	/* -1.0: invalid for the unsigned forms */
+	{ 0x402123456789abcd, 9, 9, 9, 9, {0, 0, 0, 0} },
+	{ 0x406123456789abcd, 137, 137, 137, 137, {0, 0, 0, 0} },
+	{ 0x409123456789abcd, 1097, 1097, 1097, 1097, {0, 0, 0, 0} },
+	{ 0x41c123456789abcd, 0x22468acf, 0x22468acf, 0x22468acf, 0x22468acf, {0, 0, 0, 0} },
+	{ 0x41d123456789abcd, 0x448d159e, 0x448d159e, 0x448d159e, 0x448d159e, {0, 0, 0, 0} },
+	{ 0x41e123456789abcd, 0x891a2b3c, 0x891a2b3c, 0x7fffffff, 0x891a2b3c, {0, 0, 1, 0} },
+	{ 0x41f123456789abcd, 0x112345679, 0x112345679, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0xc1f123456789abcd, -0x112345679, 0, 0x80000000, 0, {0, 1, 1, 1} },
+	{ 0x432123456789abcd, 0x891a2b3c4d5e6, 0x891a2b3c4d5e6, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x433123456789abcd, 0x1123456789abcd, 0x1123456789abcd, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x434123456789abcd, 0x22468acf13579a, 0x22468acf13579a, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43c123456789abcd, 0x22468acf13579a00, 0x22468acf13579a00, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43d123456789abcd, 0x448d159e26af3400, 0x448d159e26af3400, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43e123456789abcd, 0x7fffffffffffffff, 0x891a2b3c4d5e6800, 0x7fffffff, 0xffffffff, {1, 0, 1, 1} },
+	{ 0x43f123456789abcd, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} },
+	{ 0xc3f123456789abcd, 0x8000000000000000, 0, 0x80000000, 0, {1, 1, 1, 1} },
+	{ 0x7ff0000000000000, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} },	/* +infinity */
+	{ 0xfff0000000000000, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },	/* -infinity */
+	{ 0x7ff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },	/* NaN, sign clear */
+	{ 0xfff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },	/* NaN, sign set */
+	{ 0xbfd123456789abcd, 0, 0, 0, 0, {0, 0, 0, 0} },	/* small negative that rounds to 0: valid even when unsigned */
+};
+
+#define GET_VXCVI()	((get_fpscr() >> 8) & 1)	/* FPSCR bit 8, the same bit as FPS_VXCVI (0x100) */
+
+int test9(long arg)	/* checks fctid, fctidu, fctiw, fctiwu against cvtivals[]; arg is not used */
+{
+	long i;
+	int ires;
+	unsigned int ures;
+	long lres;
+	unsigned long ulres;
+	unsigned char inv[4];	/* VXCVI observed after each of the four conversions */
+	struct cvtivals *vp = cvtivals;
+
+	for (i = 0; i < sizeof(cvtivals) / sizeof(cvtivals[0]); ++i, ++vp) {
+		set_fpscr(FPS_RN_NEAR);	/* FPSCR is rewritten before every conversion */
+		asm("lfd 3,0(%0); fctid 4,3; stfd 4,0(%1)"
+		    : : "b" (&vp->dval), "b" (&lres) : "memory");	/* f3 = input, reused by the asm statements below */
+		inv[0] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctidu 5,3; stfd 5,0(%0)" : : "b" (&ulres) : "memory");	/* NOTE(review): no FPR clobbers declared; relies on f3 surviving */
+		inv[1] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctiw 6,3; stfiwx 6,0,%0" : : "b" (&ires) : "memory");	/* stfiwx stores a 32-bit word into ires */
+		inv[2] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctiwu 7,3; stfiwx 7,0,%0" : : "b" (&ures) : "memory");
+		inv[3] = GET_VXCVI();
+
+		if (lres != vp->lval || ulres != vp->ulval || ires != vp->ival || ures != vp->uival ||
+		    inv[0] != vp->invalids[0] || inv[1] != vp->invalids[1] ||
+		    inv[2] != vp->invalids[2] || inv[3] != vp->invalids[3]) {
+			print_hex(lres, 16, inv[0]? "V ": "  ");	/* "V" marks a conversion that set VXCVI */
+			print_hex(ulres, 16, inv[1]? "V ": "  ");
+			print_hex(ires, 8, inv[2]? "V ": "  ");
+			print_hex(ures, 8, inv[3]? "V ": "  ");
+			return i + 1;	/* 1-based index of the first failing vector */
+		}
+	}
+	return 0;	/* every vector matched */
+}
+
+int fpu_test_9(void)	/* entry point registered as test 9 via do_test() in main() */
+{
+	enable_fp();	/* NOTE(review): presumably sets MSR[FP] so FP instructions do not trap; confirm */
+	return trapit(0, test9);	/* run test9 through trapit() with arg 0 */
+}
+
+struct cvtivals cvtizvals[] = {	/* expectations for the round-toward-zero forms fcti*z (test10) */
+	{ 0x0000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} },	/* +0.0 */
+	{ 0x8000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} },	/* -0.0 */
+	{ 0x3fdfffffffffffff, 0, 0, 0, 0, {0, 0, 0, 0} },	/* just below 0.5 */
+	{ 0x3ff0000000000000, 1, 1, 1, 1, {0, 0, 0, 0} },	/* 1.0 */
+	{ 0xbff0000000000000, -1, 0, -1, 0, {0, 1, 0, 1} },	/* -1.0: invalid for the unsigned forms */
+	{ 0x402123456789abcd, 8, 8, 8, 8, {0, 0, 0, 0} },	/* truncates to 8 (cvtivals expects 9) */
+	{ 0x406123456789abcd, 137, 137, 137, 137, {0, 0, 0, 0} },
+	{ 0x409123456789abcd, 1096, 1096, 1096, 1096, {0, 0, 0, 0} },	/* truncates to 1096 (cvtivals expects 1097) */
+	{ 0x41c123456789abcd, 0x22468acf, 0x22468acf, 0x22468acf, 0x22468acf, {0, 0, 0, 0} },
+	{ 0x41d123456789abcd, 0x448d159e, 0x448d159e, 0x448d159e, 0x448d159e, {0, 0, 0, 0} },
+	{ 0x41e123456789abcd, 0x891a2b3c, 0x891a2b3c, 0x7fffffff, 0x891a2b3c, {0, 0, 1, 0} },
+	{ 0x41f123456789abcd, 0x112345678, 0x112345678, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0xc1f123456789abcd, -0x112345678, 0, 0x80000000, 0, {0, 1, 1, 1} },
+	{ 0x432123456789abcd, 0x891a2b3c4d5e6, 0x891a2b3c4d5e6, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x433123456789abcd, 0x1123456789abcd, 0x1123456789abcd, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x434123456789abcd, 0x22468acf13579a, 0x22468acf13579a, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43c123456789abcd, 0x22468acf13579a00, 0x22468acf13579a00, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43d123456789abcd, 0x448d159e26af3400, 0x448d159e26af3400, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43e123456789abcd, 0x7fffffffffffffff, 0x891a2b3c4d5e6800, 0x7fffffff, 0xffffffff, {1, 0, 1, 1} },
+	{ 0x43f123456789abcd, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} },
+	{ 0xc3f123456789abcd, 0x8000000000000000, 0, 0x80000000, 0, {1, 1, 1, 1} },
+	{ 0x7ff0000000000000, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} },	/* +infinity */
+	{ 0xfff0000000000000, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },	/* -infinity */
+	{ 0x7ff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },	/* NaN, sign clear */
+	{ 0xfff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },	/* NaN, sign set */
+};
+
+int test10(long arg)	/* checks fctidz, fctiduz, fctiwz, fctiwuz against cvtizvals[]; arg is not used */
+{
+	long i;
+	int ires;
+	unsigned int ures;
+	long lres;
+	unsigned long ulres;
+	unsigned char inv[4];	/* VXCVI observed after each of the four conversions */
+	struct cvtivals *vp = cvtizvals;
+
+	for (i = 0; i < sizeof(cvtizvals) / sizeof(cvtizvals[0]); ++i, ++vp) {
+		set_fpscr(FPS_RN_NEAR);	/* RN = nearest, yet the table expects truncated results from the z forms */
+		asm("lfd 3,0(%0); fctidz 4,3; stfd 4,0(%1)"
+		    : : "b" (&vp->dval), "b" (&lres) : "memory");	/* f3 = input, reused by the asm statements below */
+		inv[0] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctiduz 5,3; stfd 5,0(%0)" : : "b" (&ulres) : "memory");	/* NOTE(review): no FPR clobbers declared; relies on f3 surviving */
+		inv[1] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctiwz 6,3; stfiwx 6,0,%0" : : "b" (&ires) : "memory");	/* stfiwx stores a 32-bit word into ires */
+		inv[2] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctiwuz 7,3; stfiwx 7,0,%0" : : "b" (&ures) : "memory");
+		inv[3] = GET_VXCVI();
+
+		if (lres != vp->lval || ulres != vp->ulval || ires != vp->ival || ures != vp->uival ||
+		    inv[0] != vp->invalids[0] || inv[1] != vp->invalids[1] ||
+		    inv[2] != vp->invalids[2] || inv[3] != vp->invalids[3]) {
+			print_hex(lres, 16, inv[0]? "V ": "  ");	/* "V" marks a conversion that set VXCVI */
+			print_hex(ulres, 16, inv[1]? "V ": "  ");
+			print_hex(ires, 8, inv[2]? "V ": "  ");
+			print_hex(ures, 8, inv[3]? "V ": "  ");
+			return i + 1;	/* 1-based index of the first failing vector */
+		}
+	}
+	return 0;	/* every vector matched */
+}
+
+int fpu_test_10(void)	/* entry point registered as test 10 via do_test() in main() */
+{
+	enable_fp();	/* NOTE(review): presumably sets MSR[FP] so FP instructions do not trap; confirm */
+	return trapit(0, test10);	/* run test10 through trapit() with arg 0 */
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -631,6 +786,8 @@ int main(void)
 	do_test(6, fpu_test_6);
 	do_test(7, fpu_test_7);
 	do_test(8, fpu_test_8);
+	do_test(9, fpu_test_9);
+	do_test(10, fpu_test_10);
 
 	return fail;
 }
diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 25e791c..3e84260 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -6,3 +6,5 @@ test 05:PASS
 test 06:PASS
 test 07:PASS
 test 08:PASS
+test 09:PASS
+test 10:PASS

From 0ad2aa30149d0a6e2d3082e841f6fe5079209067 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 22 Jul 2020 16:13:12 +1000
Subject: [PATCH 14/30] FPU: Implement floating round-to-integer instructions

This implements frin, friz, frip and frim, and adds tests for them.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   4 +++
 fpu.vhdl                   |  40 +++++++++++++++++++--
 tests/fpu/fpu.c            |  71 +++++++++++++++++++++++++++++++++++++
 tests/test_fpu.bin         | Bin 14032 -> 21208 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 114 insertions(+), 2 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index c659e3e..a42899d 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -441,6 +441,10 @@ architecture behaviour of decode1 is
         2#100000010#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/8=fmr
         2#100000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/8=fnabs
         2#100001000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  8/8=fabs
+        2#100001100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 12/8=frin
+        2#100001101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 13/8=friz
+        2#100001110#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 14/8=frip
+        2#100001111#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 15/8=frim
         2#110000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), --  0/12=frsp
         2#111000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/14=fctiw
         2#111000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/14=fctiwu
diff --git a/fpu.vhdl b/fpu.vhdl
index 6301fa7..371fdc5 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -39,7 +39,8 @@ architecture behaviour of fpu is
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
                      DO_FMR,
                      DO_FCFID, DO_FCTI,
-                     DO_FRSP,
+                     DO_FRSP, DO_FRI,
+                     FRI_1,
                      INT_SHIFT, INT_ROUND, INT_ISHIFT,
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
@@ -461,7 +462,11 @@ begin
                                 v.state := DO_MTFSF;
                             end if;
                         when "01000" =>
-                            v.state := DO_FMR;
+                            if e_in.insn(9 downto 8) /= "11" then
+                                v.state := DO_FMR;
+                            else
+                                v.state := DO_FRI;
+                            end if;
                         when "01100" =>
                             v.state := DO_FRSP;
                         when "01110" =>
@@ -587,6 +592,31 @@ begin
                 v.instr_done := '1';
                 v.state := IDLE;
 
+            when DO_FRI =>    -- fri[nzpm]: round operand B to an integral value, result stays floating-point
+                opsel_a <= AIN_B;
+                v.result_class := r.b.class;   -- result starts out as a copy of B's class, sign and exponent
+                v.result_sign := r.b.negative;
+                v.result_exp := r.b.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                    -- Signalling NAN (mantissa bit 53 clear)
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                if r.b.class = FINITE then
+                    if r.b.exponent >= to_signed(52, EXP_BITS) then
+                        -- integer already, no rounding required
+                        arith_done := '1';
+                    else
+                        v.shift := r.b.exponent - to_signed(52, EXP_BITS);     -- negative here, since exponent < 52
+                        v.state := FRI_1;
+                        v.round_mode := '1' & r.insn(7 downto 6);      -- mode comes from the instruction, not FPSCR; leading '1': meaning not visible here
+                    end if;
+                else           -- any non-FINITE class: nothing to round
+                    arith_done := '1';
+                end if;
+
             when DO_FRSP =>
                 opsel_a <= AIN_B;
                 v.result_class := r.b.class;
@@ -749,6 +779,12 @@ begin
                 invalid := '1';
                 arith_done := '1';
 
+            when FRI_1 =>              -- fri*: second step, entered from DO_FRI when the exponent is below 52
+                opsel_r <= RES_SHIFT;  -- select RES_SHIFT as the result source for this cycle
+                set_x := '1';          -- NOTE(review): presumably captures shifted-out bits in r.x for rounding; confirm
+                v.shift := to_signed(-2, EXP_BITS);    -- same value INT_SHIFT loads before INT_ROUND
+                v.state := ROUNDING;   -- hands over to the ROUNDING state
+
             when FINISH =>
                 if r.r(63 downto 54) /= "0000000001" then
                     renormalize := '1';
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 3c6a9bd..d24fe14 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -753,6 +753,76 @@ int fpu_test_10(void)
 	return trapit(0, test10);
 }
 
+struct frivals {	/* one round-to-integer test vector (test 11) */
+	unsigned long val;	/* input: bit pattern loaded with lfd */
+	unsigned long nval;	/* expected frin result */
+	unsigned long zval;	/* expected friz result */
+	unsigned long pval;	/* expected frip result */
+	unsigned long mval;	/* expected frim result */
+} frivals[] = {
+	{ 0x0000000000000000, 0, 0, 0, 0 },	/* +0.0 */
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 },	/* -0.0 keeps its sign */
+	{ 0x3fdfffffffffffff, 0, 0, 0x3ff0000000000000, 0 },	/* just below 0.5: only frip gives 1.0 */
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },	/* 1.0 */
+	{ 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000 },	/* -1.0 */
+	{ 0x402123456789abcd, 0x4022000000000000, 0x4020000000000000, 0x4022000000000000, 0x4020000000000000 },
+	{ 0x406123456789abcd, 0x4061200000000000, 0x4061200000000000, 0x4061400000000000, 0x4061200000000000 },
+	{ 0x409123456789abcd, 0x4091240000000000, 0x4091200000000000, 0x4091240000000000, 0x4091200000000000 },
+	{ 0x41c123456789abcd, 0x41c1234567800000, 0x41c1234567800000, 0x41c1234568000000, 0x41c1234567800000 },
+	{ 0x41d123456789abcd, 0x41d1234567800000, 0x41d1234567800000, 0x41d1234567c00000, 0x41d1234567800000 },
+	{ 0x41e123456789abcd, 0x41e1234567800000, 0x41e1234567800000, 0x41e1234567a00000, 0x41e1234567800000 },
+	{ 0x41f123456789abcd, 0x41f1234567900000, 0x41f1234567800000, 0x41f1234567900000, 0x41f1234567800000 },
+	{ 0xc1f123456789abcd, 0xc1f1234567900000, 0xc1f1234567800000, 0xc1f1234567800000, 0xc1f1234567900000 },
+	{ 0xc1f1234567880000, 0xc1f1234567900000, 0xc1f1234567800000, 0xc1f1234567800000, 0xc1f1234567900000 },	/* exact tie: frin result is the one farther from zero */
+	{ 0x432123456789abcd, 0x432123456789abce, 0x432123456789abcc, 0x432123456789abce, 0x432123456789abcc },
+	{ 0x433123456789abcd, 0x433123456789abcd, 0x433123456789abcd, 0x433123456789abcd, 0x433123456789abcd },	/* from here on the inputs are already integral */
+	{ 0x434123456789abcd, 0x434123456789abcd, 0x434123456789abcd, 0x434123456789abcd, 0x434123456789abcd },
+	{ 0x43c123456789abcd, 0x43c123456789abcd, 0x43c123456789abcd, 0x43c123456789abcd, 0x43c123456789abcd },
+	{ 0x43d123456789abcd, 0x43d123456789abcd, 0x43d123456789abcd, 0x43d123456789abcd, 0x43d123456789abcd },
+	{ 0x43e123456789abcd, 0x43e123456789abcd, 0x43e123456789abcd, 0x43e123456789abcd, 0x43e123456789abcd },
+	{ 0x43f123456789abcd, 0x43f123456789abcd, 0x43f123456789abcd, 0x43f123456789abcd, 0x43f123456789abcd },
+	{ 0xc3f123456789abcd, 0xc3f123456789abcd, 0xc3f123456789abcd, 0xc3f123456789abcd, 0xc3f123456789abcd },
+	{ 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },	/* +infinity */
+	{ 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000 },	/* -infinity */
+	{ 0x7ff123456789abcd, 0x7ff923456789abcd, 0x7ff923456789abcd, 0x7ff923456789abcd, 0x7ff923456789abcd },	/* signalling NaN in, quiet NaN expected */
+	{ 0xfff923456789abcd, 0xfff923456789abcd, 0xfff923456789abcd, 0xfff923456789abcd, 0xfff923456789abcd },	/* quiet NaN passes through */
+};
+
+int test11(long arg)	/* checks frin, friz, frip, frim against frivals[]; arg is not used */
+{
+	long i;
+	unsigned long results[4];	/* [0]=frin [1]=friz [2]=frip [3]=frim, stored at byte offsets 0/8/16/24 */
+	struct frivals *vp = frivals;
+
+	for (i = 0; i < sizeof(frivals) / sizeof(frivals[0]); ++i, ++vp) {
+		set_fpscr(FPS_RN_FLOOR);	/* each FPSCR mode differs from the instruction's own; apparently checks RN is ignored */
+		asm("lfd 3,0(%0); frin 4,3; stfd 4,0(%1)"
+		    : : "b" (&vp->val), "b" (results) : "memory");	/* f3 = input, reused by the asm statements below */
+		set_fpscr(FPS_RN_NEAR);
+		asm("friz 5,3; stfd 5,8(%0)" : : "b" (results) : "memory");	/* NOTE(review): no FPR clobbers declared; relies on f3 surviving */
+		set_fpscr(FPS_RN_ZERO);
+		asm("frip 5,3; stfd 5,16(%0)" : : "b" (results) : "memory");
+		set_fpscr(FPS_RN_CEIL);
+		asm("frim 5,3; stfd 5,24(%0)" : : "b" (results) : "memory");
+		if (results[0] != vp->nval || results[1] != vp->zval ||
+		    results[2] != vp->pval || results[3] != vp->mval) {
+			print_hex(i, 2, "\r\n");	/* failing vector index first, then the four observed results */
+			print_hex(results[0], 16, " ");
+			print_hex(results[1], 16, " ");
+			print_hex(results[2], 16, " ");
+			print_hex(results[3], 16, " ");
+			return i + 1;	/* 1-based index of the first failing vector */
+		}
+	}
+	return 0;	/* every vector matched */
+}
+
+int fpu_test_11(void)	/* entry point registered as test 11 via do_test() in main() */
+{
+	enable_fp();	/* NOTE(review): presumably sets MSR[FP] so FP instructions do not trap; confirm */
+	return trapit(0, test11);	/* run test11 through trapit() with arg 0 */
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -788,6 +858,7 @@ int main(void)
 	do_test(8, fpu_test_8);
 	do_test(9, fpu_test_9);
 	do_test(10, fpu_test_10);
+	do_test(11, fpu_test_11);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 81d18542064550fb7064a3683cbcdc7f7048c285..d2320cd960e8c39417367d3b8d7fc730494c9eca 100755
GIT binary patch
literal 21208
zcmeHP4RBP~bv~=#6+hAvlUV#1o)7^^QBu(%&J&W*TS;Ic{IlUFV8?M*UNJ^Mrg%cy
zm3hmq$hL+IC1C6pJ75bBR&7G5(~_w@1F;m_2_#MnDNaR9u!KpfACoL;n=DA?^*i_8
zCq1nsMBK(q2fdl6bMHCdJ@?#m&pq$HyAOzrGNPK=AEehbfUcqCHMK;3@YaL39=!Ey
zBFop;`c|(CmDBp-ub*DuFqyV`o0IE9@2?`$pqk>iMHNI^DN&C92kALM(7MQ;96u51
zmn~PE68hMS{+cU&A)*NS*@1p`pr0LU4$s|D+nl^Tl#>#<my*z$sJc1HZF<NRcUd{{
z=E`~IMryqhq>d{Qa$Vg<Iaha)_v&7%z1l>LS6`$9S6`;qtFKVU)fRH~o}irGQ{?S!
zquSnfYV7T#1HCb7?Tu4M?;Ij?ExC3j+BVxIZ42h^Fky?ywlr7FGZS(47+_Bq?CF9%
z-LR({_H@IZZrIZed%9syH|*(#Jw33e2ln*9o*vlK1ABU4PY>+rfh`g3a+c|FxjM;}
zqc<nJO^wiZ@@DiMDJ6=+w^8^uTK9%4>Tgbd#+*%bH)Hq~-WvP!2YaBo!h?CvFe%xu
zYaDkU@n!cPxyk)g%1s{sP41skZ_fBX;&2<%ogIIac8iZc{T3g;`xYO6#?2YOz5XIM
zxqljWbH+a;A$`OQ+y6BF79anFTYUTzZ}IVG4nKZ-eQ2Y|jZx(OQRI;k<bUjRV2_Wo
zMj8Joa$^*^e-wFS1o<C({4O#|sii_srQEzKYV=xGMOmS*rnv6*K=9Dig3#n;`Jv|I
z{E&usqih&ch#N1Xv>Nv&+<1ZJ%O8JyRxwf6k5Sas`>5ZuNATWx-?El6hHuZFieia9
z3G9#FIh7!zZL^Ax+}ji;I`q!bVPwC)Hv-u=-r1{UvJdiSL+_Eh{oy04{9#kd(4D;O
zF{0+bI42PFJW)~XrrhGmH3gxV<$~QW{fOwrpMEBLof)%o8iO?fb3O&kEoTEOTohQF
z_<n8c8&4YA65hr0L!Kv+iY+UmqpAlhQ$we$Ry~EH!HaY1gKnzsnq2eAkeljH?8<KG
z^JTa95&fXeO}}dM{HgYVXP)tahvuBkrp0GH+1vXx`l~ii_MyH!YHmAa<#drRA7>z2
zy|6_;njR460&SMJD>!Rzi1Y%p)7omZTOFMS6*Q1<uGxmVWf{ep@g)tNz)^L9`386x
z8_ILdS5S_Na)EgmW!CE<ckv2wj?+JX%6ml3q=l@X8O6FUKUC+^LW%wm^5020Y5BFo
zVQioAc~0Ew=(Vg1<>a$U_-yAi&UW@=&Vs!cF}Hqw;;&`AxY)n$<iaZ2YOu@&+2u=8
zmZW5PkTp8mo>uaPY4v6{e3nf4wMWzQLq(LM2bboCTBcC7t?LB`b^@>|z@F)ks|qo<
z@yC<8a4yzGMuB<E+UiY3zbY=-Kdy)if7BY{v86))cQ78yYANZuCT;cXxlm$RTT7E*
zKl!4Fk#~DFYK(tU^^bBV+so%_F3N?y0n-P2<JnYARF3|6eN}YH_W9;T%vmyYu@2Vt
zi}z(ewjLf!GW5Kn#zOREp@X(PUy_1$N!w<LnA!(DwvMNV(Se*$yGiIMYh)d1!v1`-
z4((D=UZdI&<rZ?E=@jr+sXnvC+W!*T<u{Tq!z_aD+-Q?0<<fV_gMIhr?Nr?=eD{1g
z{8diWQu5>j$4f5y_Lc1a0P)dtFdY2P*Ta{3<am#!zl(UuEu1d=mdrLFmcwF=81Uaw
z=nK9V6>VNsZ5YpS;iBZib0YS`r5yin!7dk_FBwk@^Y((@;<Phmu$>&U9k4S4Hu1c2
z%xZzCzEbs}p>JS1?K3<cEG}1s|2SfW*MS}X$NL6!`R<SE8A_K}1L?_yXE3(rqy-_O
zC5<BXuT_6#XPLY3nBadgR35DPnun+?nb!qk^&t4RZB?tI`m5d2XDKX)zNkZAbS;%}
zT$3@*+#vY9;qsvM7n5*qo5p-!xw*oj?-=yeD}Cuo-+a;Tfskb-mA&D0Bfm;JFgLJU
z+DFWb7npANKLcfZO%{HF<6K-r)|2h){k;SJng98y-qmHt{EhkA*0a)~NB?3}Kky;-
zOmyg3{a{qT|3m6|^XkC3JYSCLlRl)LkV8+WKdPVX{BZvHtV7S_FGcmn521(i`K2q?
z(g@}lYvGDU9Vd2{(X~j*;5t7yC__FkaIU?U<HL#P_~4v>BWuZCM_;?4e*Gq81pF_k
zvHhX_I_*w;7ACH5qIU6&<GcnxAn!|G97lc`%|A}Op?pENjl5RmbM?LcXDV{x4wah?
z_T?KNqw2CO(cXX2ncotgVP4>59==!-oE9>JpZcD;z+<gCnr<QA&Mcfw5rggFT+DOR
zY2kMa3%_Gn9X*`$_?dBM-%H_z>G>-&C<FPq(O5<K#YfX~Lw%N2@@l5E;bq{8!0&Y0
zlJHFU?j)){0N)`yVDGjCd$$WElYjeK7;%E<^ZxG>ZU4EO%DObdJ_q}xQskD;#CQw%
ze5sovdfWOJ^ddSC&li0i=+vmT(*yn#Rd-WJ)H6_@{96lpYdifIn;XyF;M9ENv-%>>
zl7P7ZcJ$|{1RL4!Q^+W8PF{lijJ@a%-uJeb1eeYYaa^+f8o7$|=*xE;S@DRuZ$?Ys
zzL~rSJb^u6XWPCR+9eO=o!vLnxP(XkSvQ@8ofuD10e+*}iaq(mrS0nj`W@I$uzfr>
z9s`ffz}W5qAG9-1jjc@8i&Xv7sOu;*hTE|o`#H&DJ?x=s5BkhgeeS~ZFB|nDRga*q
zp{}d?M)BOVO7bvwg=){J_8RP>Avv7=bGySov`hMjeZzYt_RR$1-w^#ek^uZ%f5J`M
zPwc#du4QX2!cQJrf;sN!%toKQ=dE`5B~?6!pD#^OJ_+`|7v?$tEcom%l*PC|iMl;*
z-d`QXT5yq75_QI3!dhV4rxIaqGsN7w@qWT{n@sIG_9w_sO?ndg<+1D*WAPRn)Dg!m
zavqPxGcAHKe51{v)_%TkT2r5!4uN-|pSKG<kKi@-^G<=64IbVd5EHI0%Fe@Ypd1s=
zV?P<u+Ytji-X+*SB*umhy61rp+8-M^s&2=|B-A-J7{jsQLf!V!M)+tK)~W}6=c&H8
ziP)%zPo*tE5gYZ04Ow67h>iF)Ij7f)4djhp_Hl697OdHaOUJ>torrbzCqGB>@tplx
z?={S4SRY5$Ox_zejFb7phjm`0-ig*I-U7tX{i@74@2!cBb&KDW@eWkHGsXA-`>3-q
zs}?>T{F|RMe@{W%TG+sAn)jj<xln><MG4|*0sK)?N*xPyYF&U_)Qi|u^Bq3%;0&Jc
zr`n!ET(RuJ4<8HK@&{)&LH0^pL%(cuzic|*4UfFmVt%P#W)8@vwD%qP=XMhlzm#*+
zg&&53JcpE>Z#E$wh!zxbOvDfmybtDb9A%!<Pve<RR3hY@BZk9d0JD(wQR%RHIG6qs
z&sJ<Pg&Z*xNz}ubR$yGTMeM<w;Qz@JIDX2zL~cQTUXA$3rt-#8@{0F@YSgErj@%;Z
zI_esgZ>LikW+BQsTyD!SHI!#@xjn<2jB+lQJ2Ol-%DivS!#bWy3kx@0+iGwg8MLOy
zBPlud-QMwo=hYLvv3Y$+&FgWIR|e1PLuy_R^vee3b%vVPzwDO{%<Bv_ul11qq4T;j
zKKi`Q17@^&ohbBNe_p$OKX_iTP(QMHopPOd#Tt&sEV=d)*K!c=)LY$w;Of-;l@t#I
zf8)wu$?qPd1?nmm)(5}exh}|O8Z#;XQH{3VLq?!3QHL{b_+9J@4s$psxR9svu(!>l
z#uB1FvG!j;uFAvr92CnH=QT&u-wX2^NXek;gIOmx^RvdH{5)>=mepv)Vg}xE$sdb7
z5Xj1+@~o`(h3V9^ke@FtPs}RO=z+~yS=o^N{$b?yl60$OL0{~<4_m#F1!2p-pzijr
z&7V0@Z(>a&N3jiu`t(!h!+J}=3{HDJ>_V>Kyvpw?25ge&3t|0EEwF;~XdUD`%T{~<
zd%E;j0;XQQBpjTU8#;;o0k?Y_If2{p+0&icd#ffP*PcL|Z>F^L&B5M%)?N7x7v`)B
z+UHY@`4}(<S#L@sa_qOxBLCr;CC{%|ejDVxHuaZtL$#`(uW){H^fMV4KQMAk*I-|3
z5AQYOvGZZx%b?y%bnfKNO}|4Qz39W+rKQymvg6~>F&#R1Y=_fx8+bga3FG+$FmdQ(
z8{^eD<HWq|oN)U!qG>-LPUr2wp8&otJrHzJ$p2l=hn)ld$%gzz$akjb&`~0E=r5kc
zxmA9x(??T)ivSm)34IZYSaZnOQsIAnb5iv9=DNRQ-2wk1<b$NSOCAe$=ZBK!uETSn
zHXeMUCKOzrAId_!+H*1$y5>XIgh#1Qa|MF5F;K_n5<K3I!#?NyO#{{q><((WrZ)x5
z4m|4_cM0p?iJJ~wJ9OL)9Hj*OuPWS|3Aj6edj&YWWLWy%fVo@Y_9x(G0JjUcdw`om
zP2mW}w?<(fRal(K7)gtdZ%&#U2nMGf4`<O!U$w_L6YYIy&+8|Z$_w2e$Q*wVI<S8D
zaz!!+=Xl;z@E)R7?8zGAPa%Kdtg9=Vym9**Gn8@e7B;c=#rq8JKWKy4dvv0`c$azc
zx7;tzEwD~m?uMN9%-X#he168J;+f~!AopNzg4dk%n0WW#=WaIm-x0jc{X7l4{owrs
z=O<r-O$PD<>*0MIKR;n}f8U#c6QkqtCG1OiY#kU|Lz3JJ@mUYQE3aJ^)p^}>ozDxa
zmPhr=m~*c4nPFr_RPRuAJ`2oS8P!j#I-lunM7>qj`MmGRFGTh4sCqi;=TJYO>U^Fz
zdsS55t?C)5Z$Q0K)yJWJ6!k|{eLU)YsMn&7wWhOAllDA}edqPh${06YoF|gsjdOmS
zDQ+jcR~CtVPeIpYyb~NwA@k6^S_o_H(f4uINVKj~@MieUg=v8mGOrhOc@(yL0BbPT
zjTmEmH}L8=3FZ2MxEjfh!eQ*t`t2y_5_a6@N1X9qbFd9=i8B_a)vw@lW?{psp=a#Q
z=Mcv>@0%^l`L6%&Hwn*4tQqmH50~ku5F6uYMjr16C!)@KN6w@8p<=0HjIIdw#ldOo
zOm1@|yCB5A<>w)vi^hQMcLWNy9n?Z)8o6t6Hwc>-Q;1r$GD>qdU`*iWOFsO}zTh)g
zmKzB&YB9>7*3b9!Jr{J+YS6Z(0rPxiI)68c`V-6Dp_F(*!|Pc!4H4)O>k;iHKt?q8
zX~st^O~wFx=iW0*`J6;K>HF>mob5~beUg`q_t6RcJVW5@*f?~btfvEmv8Z+i#)0|c
zG16U6Tb8>xTXT#Pdgc6LB<kw-l^7epH_pb-XB@lR{{#@k@~o~Rde%D|V@g6_d{)Ez
z6BoXZ9k5sR3wb-{g5^obpYW-$tGhq0>|O<hTwW8$NxEsxS2r>53;YL{rU^nm@ABo0
zS?pbcZ%M&lzHC_$M#yv?)5X5!mCMR#mxb?aEqrHd;X7Lk-`QID&ep<rwidp#weX#-
zh3{-Fd}nLnJ6j9i*{b(C&Qt6cwC_gyZnW=4`);)FM*D8G??L+>wC_Rt9<=Y_e86W)
ze3k?~;#`S!aG7OxZe||Nyv5~uY#F5ly{>5!T*SO=+_T@atcy}CcyK=b7kEAg&p^58
zqGiP#JS;?-@Jq|m96X#$P&Rn86_2TjI}f~b%&W@q(sQn=%%Vx&E%%Shq>6Xb@SY)l
zFFgqHIpC0>i7J8f?oWO;={XmW-b~-;0rtLo{ucr+@Mi$){FP<B2rBXWQXDOeWvazF
zG?@-a3iSH_!?LOwzbI3K-n(Exnio};&xU*(<V<O?u<$w87HJ~OtH9rZQ>BsG*Fk>t
zI`Syw=dL3^2zg4^$ok)ayzn~mi;%CljywhV%Y;0Lv0(f>1Ise0GR=#s7(e7^Mv!x?
zR6$<YZCQ-vOT)byy!qg5WQLsE%1i^AebR4Y{(*@ABm8D#xc4eCK?Cy-ToZ8ECk>r{
zqZ;7FX&eC@FL{8R0lp7-j+-Iz;(suc7Kv~nwt<rIJA4mf6+WVB6@z}*@W_A-jLQRV
z!~2$XjN3?DWu_0fhX&((z|kek@(<{{D{~`on+D^8!0ErXER*#uLcbi)GPd9$j0WCk
z-!=ia12~OwxT<g;1#d5SZQO@g4^NH%n{j`evDf`{`ZH<YN*xsKO|GQ-MAzqkaqgX;
zpKbg1pS{hFWj%e+lZOY?|3;6P{}i0aTppNz=B);=1w8JDue)(?051mKVe~6v!uv$#
zo{3LRxUTg}6O2FeZ$D-Gd7h6!-`)RhS?wreU2=3i=c)mI5yHVcezlAX8N;2x((hEB
zl570WGO0?$INQtDa5?Z8+H8~J{v25Jd&`O{9@}ByZUWB_D(3Pz*Rx5LVjXcLSMX#a
zG)4Ojpl5*Pc6@Q)5!{{N74m%IV&0QnxT4l1m&dL#4~%(W%mZT{81ulG2gW=w=7BK}
zjCo+p17jZe|Kx$lXvSLNdY6CKyn!oLu3j0*N$v80g4k<PuVvXhu8XouHvP%Bw?DEd
zH^<0O04KG}ERwid1BbBUa#Fj@0*TvgEaCTVTuy42nJ;mz-hchZ;#pjGQoB5WJMDe;
z<9}VnbtkpU1Go-v>8GX&b8L#?hGkiLNA`Q!+wg_?A7hS98HXFbaS5QpwLbpRk1t&l
zeC|bD5%JgM#N|}}>g_lF>G|FGmkElZNf>v6zIl~W?<6(;Ni;mpTlvhY+k_izIy|nm
zl5BvY!{bg@()@pzBmAZ4@VJgj@c9bU+K298zj2uv2p6Yc`f)5%SC}c>;>5i!T_%n?
zR8f{~oMgsCTQs*RimG3wPvPu1j@ZgTMcLLT{d*;><U-~wC*W>S-|5$sj{hs-v};(M
z>;TKUdHKMtta4Jj%p&RM8_x&TDN>=6+T{U#H!mLqRX(sOoFC>HNX!S@#`1yi%dMIZ
zWE98CU2k8rbBAQz*x!hPjYv7>Mqc-^^1<EM^--zHU$GIXT+ca%)IUp6ZAjf-_X4NY
zO1Vvqfx5))mh#z(YD4Q1*Xl(Y$3DbKX|1y^ai<l|Nn6lvpe}J83g@Ivzzx(T?lN$@
zz-P++53MU)tHOZ_oP&z?sBh!;qvE8#oXUTZ`>x8j6iuv4oV{Ndje6C-Vl>*SaHG-F
z3O5?<P`J_PWrZ7+zOMW|A{B8*vG5I2OW{VOs=u(LS4N|j!i`8}f203(4T6lkeq}yT
zdv@wyzm;lS_Ij?wPoK`qP?aH+_p*E`d5femr4L`Hh$~jf&p}xx6#vnSvP~nZZc~Wt
zoq?wGj705U+=>)#B&w;d4TT#@rT<39Nxx|7J;|t)?S|@07CmYt@rLr}P#j<wgy<K8
z7IK?HGzbrnzyUA#HciA8DI7k%5ICEnv8dbBP&m6j5{)Pv@S^=llpx2=mrwb5C{_NF
zu<PKI-$unrKN}?V`qX?3?N@FWN5`d#9!JNeDV(&^r{;HPzv(xKQv)4Br2c+(M3ro+
z_RTp4_<nxxmVU7J(*t~4kIXlMA7Q-o-%z~jhxPwC<4_lPhklzc`gQ2H`HA{%zQEi1
zK{q<}JNQ1uHxv~jY5(OgD$sKU-}K7s{IO?{Pp%3+Q+r%enwZ}O7y3KyLJ9f>l~4pP
zp1aJ)73<fJip|I8U%$D_9<PHh@Q!{5;UToImlx%_bW~atb?{q1$CavFNo|3m5l0<m
z{{I*3i^mtS>>B^;nnk&dw!CTW>{yt$1xoJ!?B&CTBOmiw$%28kD^Rdc8&F5F&pE~m
g{>yjTewU!8JAd=Nzd!%XBeQSgIzJD7@@*;kAE#Kl9{>OV

delta 2116
zcmZvde@xRy6u|F$lv)s2`Bkx$TH0D^!SZ7Zei4+~p|YZtEsDq%Q>=)SnJi9JT&2}j
zs>`y3J+n9xHzmp+#u@yTxI`CR)G+oBF4<xNy2%;?s8BW0PN27UZDDGB$>qKGxp#N(
zy}NIF`;^eX-9U(C2_d%no3m^!s2j*4TRkBTWKGDLkTo@U7PTAR7IwGS718V2=f2+k
zR&uvhj3v693bKD0W7Z{k4p7L8+{B|TA!Ot?TrX;f$4=0Y`@(GCA!AkCk1%UVB=%Q~
z?HlNSZBB;!oV6c!6LNas;v441>>ix^a^PrmE^-Emab)<QPd+aZv*XA34bDBXHh3m~
zm*+d1XfQeA=!!A!leJy2CF=x^yQs_+WcMZ*8!9IbXieEckEX(tlm&4<Op+(;0d496
zV>I+m3aLEDvV)BEl#YZVCCPn5rHmaYqgXa>u_M)#)Q@3=l#fI=IGt*Zmw7r%B^VL!
zfoG{s!3T9|x!KWOk0q1JYvElx%W<c2LV8NCe0;Uk?|`nfV)|7w+)Y~}bb~ovNyY78
zOV34TSGt<+*$UsL+vz8ba37Vw5sDS_g!iFFVOB`wJcY0b9)S>nSjtE@bSN?<^<HrV
zggH4`(c|Sr2;Rfv*G;f0UI<n#I<`Sb!H+S`QO0_jyy6DFV!C5|#VuTM;|0`ZG`>z(
zbE=m)TqMrv*f#>9jP<xt&D=Ugcj6n9x;fWi+KE$Bap7r)KXS1=`ImB!T-M_RP$)Oj
zmN?j@)Xj}n(2zhXPhdW{0I$83U;BBbnf5fpkaC4Wg|p#C@a<w|TCp@!LFFE($^1ak
z+w2sx@H#c5eweY&GCA>xxcKMH74)OoAXV}FH>wmg_yz1z*;06T>=bWSXPG1t`n$?R
zH^)Orm5HHy)k=Fm#!R>A@Fc7Hb0Xv}AZ{_9uz43}IbGjLBehls+^z!R6gfK>WsC6w
zTB3FV$rcCWU`TC-9&JNHv|s^W^G{Hy%cScV*mVkeT?SipMf7qTbm(mKY6$M@is``+
zsP$WDRtVZqU7CV1y`45p!P5EP(;Jg;cfNrhn*^!BK$|DQjB52H)EG=OdlC*9)=W2-
ze;&LN{z5fme{lH(j2aB?+mQl?ar!}5jLdcUIjiDn@jGB_Q`z4*Pkh?;6Ucd^VzE)f
z&8NgY6aTpXK-NX~x~6*_Kz=REKA&MhWIrOi6kt{J@nu@D5j?&Zxr}(+yt^}pdogh4
zif3MX<i+sZ`SX5{Mmo-)xPTqry2&dF_|m4o-3q5ye2kBP`!P80+3XeXMScF|<}`T4
zTTwrjtZ;~*BDfX>QVck<#VekR`m@j<ME^w8m!Yq3^os51Yp@dDCF<KW3<-0x`1iwo
zRrt4(IAlIJpR@Z7bg}U>d|hXaW}ye}7;DikHmy(e4T(f!tJe@)3$E-il`i*bN6tuy
z>+~A3Plh)AMb;uA#*&v=46&3X%rHwhPX3Ay%G{Ow7vlfS^m&9B4-;}YpEEqGHFw81
zd4uj%BaHbGagpP|Frpe|teOK9&MY`;kp)n5;cP9AcOEdNLCuA;tvD>m^c<jY?ZQ!w
zY><PBxMk!Ap(3dud0Fd*_#|Te3qD+!NF@(p+d?yyKY|WahaN%SLN)Dv1f!@fjzPRx
zO$W!ogi10FHdJ}z&}UXkuj6C!j>D)~sq~{cIL?@gAA_R75k%%62X($$s-lDx1i+fF
zl~$lx9e{25Ch1Nz&jg?&-yl7Xrv5S9#DHEjRZn0H%>bI+Phe?*)|!F8J(8!d)|-b+
zj%@nSaIcVMAgkb6psd2N7Fp#}C@Qo`PojD985}4yr1hg&`yBs_c-ZnJ4fezdR|@qe
zKf*&CqVO&GmDJ+DmX`yR92^!Ly+MdCGMOfEiX$#O+{5+1jOAhZTsS*|Bj?41jYY!}
N2(2u-U|Drk`X5<{!7%^;

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 3e84260..3a5a601 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -8,3 +8,4 @@ test 07:PASS
 test 08:PASS
 test 09:PASS
 test 10:PASS
+test 11:PASS

From 4807d0bdb6bda1154cc39a619d0432de5ec14571 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 22 Jul 2020 20:51:31 +1000
Subject: [PATCH 15/30] FPU: Implement fmrgew and fmrgow and add tests for them

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   2 ++
 fpu.vhdl                   |  27 +++++++++++++++++++++++----
 tests/fpu/fpu.c            |  21 +++++++++++++++++++++
 tests/test_fpu.bin         | Bin 21208 -> 21208 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index a42899d..34170dd 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -434,6 +434,8 @@ architecture behaviour of decode1 is
         2#011000001#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/6=mtfsb1
         2#011000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/6=mtfsb0
         2#011000100#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/6=mtfsfi
+        2#011011010#  => (FPU,   OP_FPOP_I,     FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 26/6=fmrgow
+        2#011011110#  => (FPU,   OP_FPOP_I,     FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 30/6=fmrgew
         2#011110010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 18/7=mffs family
         2#011110110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 22/7=mtfsf
         2#100000000#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/8=fcpsgn
diff --git a/fpu.vhdl b/fpu.vhdl
index 371fdc5..e97461c 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -37,7 +37,7 @@ architecture behaviour of fpu is
 
     type state_t is (IDLE,
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
-                     DO_FMR,
+                     DO_FMR, DO_FMRG,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
                      FRI_1,
@@ -450,10 +450,14 @@ begin
                         when "00000" =>
                             v.state := DO_MCRFS;
                         when "00110" =>
-                            if e_in.insn(8) = '0' then
-                                v.state := DO_MTFSB;
+                            if e_in.insn(10) = '0' then
+                                if e_in.insn(8) = '0' then
+                                    v.state := DO_MTFSB;
+                                else
+                                    v.state := DO_MTFSFI;
+                                end if;
                             else
-                                v.state := DO_MTFSFI;
+                                v.state := DO_FMRG;
                             end if;
                         when "00111" =>
                             if e_in.insn(8) = '0' then
@@ -524,6 +528,15 @@ begin
                 v.instr_done := '1';
                 v.state := IDLE;
 
+            when DO_FMRG =>
+                -- fmrgew, fmrgow
+                opsel_r <= RES_MISC;
+                misc_sel <= "01" & r.insn(8) & '0';
+                v.int_result := '1';
+                v.writing_back := '1';
+                v.instr_done := '1';
+                v.state := IDLE;
+
             when DO_MFFS =>
                 v.int_result := '1';
                 v.writing_back := '1';
@@ -1009,6 +1022,12 @@ begin
                     when "0011" =>
                         -- mantissa of max representable SP number
                         misc := x"007fffff80000000";
+                    when "0100" =>
+                        -- fmrgow result
+                        misc := r.a.mantissa(31 downto 0) & r.b.mantissa(31 downto 0);
+                    when "0110" =>
+                        -- fmrgew result
+                        misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32);
                     when "1000" =>
                         -- max positive result for fctiw[z]
                         misc := x"000000007fffffff";
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index d24fe14..e7a1334 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -823,6 +823,26 @@ int fpu_test_11(void)
 	return trapit(0, test11);
 }
 
+int test12(long arg)
+{
+	unsigned long vals[2];
+	unsigned long results[2];
+
+	vals[0] = 0xf0f0f0f05a5a5a5aul;
+	vals[1] = 0x0123456789abcdeful;
+	asm("lfd 5,0(%0); lfd 6,8(%0); fmrgew 7,5,6; fmrgow 8,5,6; stfd 7,0(%1); stfd 8,8(%1)"
+	    : : "b" (vals), "b" (results) : "memory");
+	if (results[0] != 0xf0f0f0f001234567ul || results[1] != 0x5a5a5a5a89abcdeful)
+		return 1;
+	return 0;
+}
+
+int fpu_test_12(void)
+{
+	enable_fp();
+	return trapit(0, test12);
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -859,6 +879,7 @@ int main(void)
 	do_test(9, fpu_test_9);
 	do_test(10, fpu_test_10);
 	do_test(11, fpu_test_11);
+	do_test(12, fpu_test_12);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index d2320cd960e8c39417367d3b8d7fc730494c9eca..668ff65367cbf02294b638e77fd93f9173db9532 100755
GIT binary patch
delta 1932
zcmZ`&ZA?>V6n<~v*22&fC{Vsz$_5oGEpziCAC@SjQfJWV<`^m+BT9ma%Ot6;qiF7u
znUDo{7Nd(UxQNk=C1YUX4_q{pMHiRM#UHqsMHFYt$1IypoWi^3ZR?C|JITp?p68r<
z&bjBl??|sS(kpd$X^s^3e{?X>9}1Z&Z}94}P{``}Zh>`oLxZ&i?g?@8LyKgp+*T9O
zzu0eWx%-3FGXov5GR96_)Uit!yO!O4-1V|N%h>pcj>TUKg`B8o4oa*^35DuVvu?LV
zs%zO+Q|DpzR-cLOThq{Bstj;X0DaQKO`jC^);0S$w>b3atv)g&b!N9raW`YENn)ll
zri&?F#r8S9kH-9>`fBP=%9X5iB`J52F@w3D=(CeFm$Y8mOzV=1H1AJRM{<s)KAk>G
zenEPPhLcxnrY9)gP^vjQK~;ve)@&nlo#*Bm&Ocwl`Mwp5ohw)0JX=0if292AEO&b+
zj{CiTeCw~F(}seD3CQ-|Qo~9j1r1xZ`nOf(%W+zpQmb_~sETKtK252Um%gDYh^M%<
zeL&}fv^=#gR!wCt>M1bM$yB{mO;Xx23Z*8<g^jB6kj%yxX=$2Xc5POb{xI`_u_je%
z4YLbiBOX;ThgpJAuZ`WJD%b9y^AfVB>5`@%ayLGp0_#%p*Q?5bNT}YZlX~fj@p<Yn
zu9s{;MVixhXwLYxq^JJ$a`|c!bG<^-=@w}nrDSZCE>U|%ku*%lGMrKy-OQ+!d}Ph+
zOlfOYvgf$l#5@7)szT2?m6103CbL!=A~|a)5pfy&60z7w?5C{7(gm8$x{%)1t9T#X
z@~?jUulIk>u9bSooU;qP4(8<8_Qx}q_~Z2<zt7H^mB9tgzV@iL9ADNN#dc;^uG45v
zIZi-!?%+anNWkF`-Dle9m)x|rXOvz=%p3O?cUNJ4#wv<MW@uiH4mr26M#d_`afNw0
z|10M_awOIf9<B$2<derlSl@lq;Kq3+Mca)tqGWlvWFgwmFDY^y$oc=#XwICLKd-VF
zHu&J#LDo=N-UCfs9F<y1CBs0Ar7y<6qZ!{W1!nl@#EBT4@8s{|6<jAQne(?Qbci#`
zJ8%HH*olm?`W=z~H4N>9W)JRU<bNBX_lLFj{?Sa(jzarjoV(jGjuZP3`8P1G7?HXQ
z?Ed|l$X^xbQ`}wstEwXL$a&R(Pl8Y5aEW*fc;QA>8IAH-@E^giL^<w+VtfskD$2Fs
z6|dvcMEL@675rF~>sV8*av8#bs1Sz)!zNYP8|Crfjo|M@`E%gM!97u)06q!6ZlJwz
zd#wK^=VMr&Fg?5&eh*r`5MV-c!tJ*>58)|HXdB=OW1OE90!(Pz;XTkycnTAm58ekY
zUkEVq?1m3PyN)NhY+3RdfSuH8E0S@RnU4<JinK-vgZxV0+cIPYFpGk=OzkEhQ$UDX
zFQ6*|4*~9sz%zh{B5(xoTm;Sl1|ra?!Cb*eSwMv@Ey<KO0d-SwNoIx@C=ld43v&oF
z!MmV^f|OH|BYy>0LDeM|c?77DT1$##T?}Jq=x|AaycEbz!-zHl71C6RMXLf0O;V~o
zPumB0Z;GnydFfXHUB5lu)GD+IK-)mO?JKqRSjNWg(jI%&$BtN=7i`z`(~-nhyP)}?
ziT8x9f%ib`BZtE+hk$gn*HK`|lF|J>=L`#AHH%D%)gABBkRxCEkVYNJDUbi;Tov!b
t3gJ$88^x6tE$(^1xln|OhMn+1XvLxsOlY6LFV75YDLtAnP~tq5^fx0>#!mnM

delta 1739
zcmZ{jZ%mt26vpo@eLI*yV04sH7zMgfgs~QkUIn%?D#3LzI6*;O1|rD}Rnd&3Q(U{{
zjoF7~89B?MiHl%hQL`;>bcsta0o^`mHWLgV_5pPW({2m`{sSEI`rNj08R|(+-skz<
z^WJ;jbK7f!{MsNtcEoVH=-km0>2zgXmmy`2wq|7W1!~F2<72cxBQJHt%IX7mrh}Zd
ziosbrm9dL!w4705*f>j@%o_|#SyX4<#J{6g%(aI1Zqs>lxuN4W{bYW!__Br7kIMWd
zDL>pM<)LkijZ_|g?(@p&hW9H^-j%+<%s?>k>zBb<+P%IgC5-C8lpZA)sdxPzlf7LR
zO*8a~rHmYzS*C^-T_nxW_RJlcw^<jc$@H0qp<cd4KW6UWrL@^n!EaKVrB)Nq>!Kg(
z5xQ<k)|y|?MN8B<vy!#6J-SGbx-Pg#t1iBxgITqjdABYO(t@Qz8$;8xG-};QuUdEU
zZm1H&V1(wZ=Q*SP>`Lv@I#ypzli7v5hBVtAevvwCCHw;Q+C03IzO~hHKUs4^>pR;;
z?qAYpXKfJ-p(>4q#I;U3n^VV6)9*RM=~&k=2(@O6t$mUEAP>`Q?j>vIpa|TV3O=#&
zulK*+P{)r^dfq;q^-kV~&HLA~dd665!+fyE#o9%9O?yvAoIQqXYl&+Y%NJK@EU%JV
zDa9U6!3oLikVl=q)JfmiE&lt+piuoL{V9E0(LZBVrK)m%RZSvlZY)b5W7W}h!P-Lq
zRWphj&JIMI>q8@EdMckVz8m@Brd3<2W1E)LmX*;hoj88=ma4{$n*VQ&<_av1RXeL@
zZ!X^7NEH<qE*h2-slZXrjYCb2(+NR;JHDMqmgMBjsf5IykA6|F*dJ9xmh#1_M65F^
z=<NkpGtsbZnm=}5m%&aM-t19aHJJr|F3R5ehuOhSf?d5OecfosgL$a>>u6WCNFk>y
zxZe=FRs3_R)SqwfE?sb`4m?z4fcHMFiv|3+luv*UHR@tA?v3!yCS8ohy$0U}KN|NY
z_~Xy$;#}OXfuDfykNZT{`mk_3tBc+^NJ4=h{$Sj%g%88O9`_HxufVqrbr<h72B)N)
z#;{bRAxn|Jfi)<AP|SmLO-nhCrwYXyk;yltwBxBlvAxJFFoyyN_3S}9!6p>&Ce^SH
zxHr|6UUQ&jZ7SRTgNeGz614%WP0-tACE7StgwB+en2a1Z_%nT9maRD;57XT;o5>4x
zB%+|{CCG40>W~vLIRI(A87;U3SsIhmkd0BP84c)(?ryPZ4ybX;b=j)DP^Gt|%uzie
z?8s)YhrqfNi1l5Y>OtQP(Y9-%--enHb-N0+0jNGY>B0v<t<Y6hk;W1j+egb-bwc?m
z-(6^GfMT<>!(CtsK#Dnf!(CwQgFG^KZ<byZg~8sXGwxc`3Y2r6&bqg{)sVeL#y(rP
z*Q{dAU?X7aMHM@dez1A!DEDc@P&?^Dd678+#r}w2<m0J!e9mh$U+yR#20X0ZM5qnb
zufVk^rLF+NiS!`*sJfzL^Eg%%QK70s$Ou>|#;iiI<4P<`85ruXIGQ~4X@!RmZSj1(
F?r+oEbrS#p

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 3a5a601..d926abc 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -9,3 +9,4 @@ test 08:PASS
 test 09:PASS
 test 10:PASS
 test 11:PASS
+test 12:PASS

From 86b826cd7e4cc8ffde6a324a90d4481cbc910ebd Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 23 Jul 2020 17:56:15 +1000
Subject: [PATCH 16/30] FPU: Implement fadd[s] and fsub[s] and add tests for
 them

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |  19 ++++-
 fpu.vhdl                   | 150 +++++++++++++++++++++++++++++++++++-
 tests/fpu/fpu.c            | 154 +++++++++++++++++++++++++++++++++++++
 tests/test_fpu.bin         | Bin 21208 -> 24024 bytes
 tests/test_fpu.console_out |   2 +
 5 files changed, 322 insertions(+), 3 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 34170dd..737d83c 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -58,6 +58,7 @@ architecture behaviour of decode1 is
     type op_59_subop_array_t is array(0 to 31) of decode_rom_t;
     type minor_rom_array_2_t is array(0 to 3) of decode_rom_t;
     type op_63_subop_array_0_t is array(0 to 511) of decode_rom_t;
+    type op_63_subop_array_1_t is array(0 to 15) of decode_rom_t;  -- one entry per value of the 4-bit index insn(4 downto 1)
 
     constant major_decode_rom_array : major_rom_array_t := (
         --          unit     internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
@@ -415,6 +416,8 @@ architecture behaviour of decode1 is
         --             unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
         --                          op                               in   out   A   out  in    out  len        ext                                pipe
         2#01110#  =>  (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fcfid[u]s
+        2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
+        2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
         others => illegal_inst
         );
 
@@ -461,6 +464,15 @@ architecture behaviour of decode1 is
         others => illegal_inst
         );
 
+    -- indexed by bits 4..1 of instruction word
+    constant decode_op_63h_array : op_63_subop_array_1_t := (
+        --            unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                         op                               in   out   A   out  in    out  len        ext                                pipe
+        2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
+        2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
+        others => illegal_inst
+        );
+
     --                                        unit   internal         in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
     --                                                     op                                              in   out   A   out  in    out  len        ext                                 pipe
     constant nop_instr      : decode_rom_t := (ALU,  OP_NOP,          NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
@@ -626,8 +638,11 @@ begin
         when 63 =>
             if HAS_FPU then
                 -- floating point operations, general and double-precision
-                v.decode := decode_op_63l_array(to_integer(unsigned(f_in.insn(4 downto 1) & f_in.insn(10 downto 6))));
-                vi.override := f_in.insn(5);
+                if f_in.insn(5) = '0' then
+                    v.decode := decode_op_63l_array(to_integer(unsigned(f_in.insn(4 downto 1) & f_in.insn(10 downto 6))));
+                else
+                    v.decode := decode_op_63h_array(to_integer(unsigned(f_in.insn(4 downto 1))));
+                end if;
             end if;
 
         when others =>
diff --git a/fpu.vhdl b/fpu.vhdl
index e97461c..e9edfb4 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -40,7 +40,9 @@ architecture behaviour of fpu is
                      DO_FMR, DO_FMRG,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
+                     DO_FADD,
                      FRI_1,
+                     ADD_SHIFT, ADD_2, ADD_3,
                      INT_SHIFT, INT_ROUND, INT_ISHIFT,
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
@@ -79,6 +81,9 @@ architecture behaviour of fpu is
         tiny         : std_ulogic;
         denorm       : std_ulogic;
         round_mode   : std_ulogic_vector(2 downto 0);
+        is_subtract  : std_ulogic;
+        exp_cmp      : std_ulogic;
+        add_bsmall   : std_ulogic;
     end record;
 
     signal r, rin : reg_type;
@@ -89,6 +94,7 @@ architecture behaviour of fpu is
     signal opsel_r       : std_ulogic_vector(1 downto 0);
     signal opsel_ainv    : std_ulogic;
     signal opsel_amask   : std_ulogic;
+    signal opsel_binv    : std_ulogic;
     signal in_a          : std_ulogic_vector(63 downto 0);
     signal in_b          : std_ulogic_vector(63 downto 0);
     signal result        : std_ulogic_vector(63 downto 0);
@@ -368,6 +374,9 @@ begin
         variable mshift      : signed(EXP_BITS-1 downto 0);
         variable need_check  : std_ulogic;
         variable msb         : std_ulogic;
+        variable is_add      : std_ulogic;
+        variable qnan_result : std_ulogic;
+        variable longmask    : std_ulogic;
     begin
         v := r;
         illegal := '0';
@@ -397,10 +406,16 @@ begin
             v.tiny := '0';
             v.denorm := '0';
             v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN);
+            v.is_subtract := '0';
+            v.add_bsmall := '0';
             adec := decode_dp(e_in.fra, int_input);
             bdec := decode_dp(e_in.frb, int_input);
             v.a := adec;
             v.b := bdec;
+            v.exp_cmp := '0';
+            if adec.exponent > bdec.exponent then
+                v.exp_cmp := '1';
+            end if;
         end if;
 
         r_hi_nz <= or (r.r(55 downto 31));
@@ -433,6 +448,7 @@ begin
         opsel_ainv <= '0';
         opsel_amask <= '0';
         opsel_b <= BIN_ZERO;
+        opsel_binv <= '0';
         opsel_r <= RES_SUM;
         carry_in <= '0';
         misc_sel <= "0000";
@@ -442,6 +458,8 @@ begin
         invalid := '0';
         renormalize := '0';
         set_x := '0';
+        qnan_result := '0';
+        longmask := r.single_prec;
 
         case r.state is
             when IDLE =>
@@ -483,6 +501,8 @@ begin
                         when "01111" =>
                             v.round_mode := "001";
                             v.state := DO_FCTI;
+                        when "10100" | "10101" =>
+                            v.state := DO_FADD;
                         when others =>
                             illegal := '1';
                     end case;
@@ -717,6 +737,117 @@ begin
                     v.state := FINISH;
                 end if;
 
+            when DO_FADD =>
+                -- fadd[s] and fsub[s]
+                opsel_a <= AIN_A;
+                v.result_sign := r.a.negative;
+                v.result_class := r.a.class;
+                v.result_exp := r.a.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                is_add := r.a.negative xor r.b.negative xor r.insn(1);
+                if r.a.class = FINITE and r.b.class = FINITE then
+                    v.is_subtract := not is_add;
+                    v.add_bsmall := r.exp_cmp;
+                    if r.exp_cmp = '0' then
+                        v.shift := r.a.exponent - r.b.exponent;
+                        v.result_sign := r.b.negative xnor r.insn(1);
+                        if r.a.exponent = r.b.exponent then
+                            v.state := ADD_2;
+                        else
+                            v.state := ADD_SHIFT;
+                        end if;
+                    else
+                        opsel_a <= AIN_B;
+                        v.shift := r.b.exponent - r.a.exponent;
+                        v.result_exp := r.b.exponent;
+                        v.state := ADD_SHIFT;
+                    end if;
+                else
+                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
+                        (r.b.class = NAN and r.b.mantissa(53) = '0') then
+                        -- Signalling NAN
+                        v.fpscr(FPSCR_VXSNAN) := '1';
+                        invalid := '1';
+                    end if;
+                    if r.a.class = NAN then
+                        -- nothing to do, result is A
+                    elsif r.b.class = NAN then
+                        v.result_class := NAN;
+                        v.result_sign := r.b.negative;
+                        opsel_a <= AIN_B;
+                    elsif r.a.class = INFINITY and r.b.class = INFINITY and is_add = '0' then
+                        -- invalid operation, construct QNaN
+                        v.fpscr(FPSCR_VXISI) := '1';
+                        qnan_result := '1';
+                    elsif r.a.class = ZERO and r.b.class = ZERO and is_add = '0' then
+                        -- return -0 for rounding to -infinity
+                        v.result_sign := r.round_mode(1) and r.round_mode(0);
+                    elsif r.a.class = INFINITY or r.b.class = ZERO then
+                        -- nothing to do, result is A
+                    else
+                        -- result is +/- B
+                        v.result_sign := r.b.negative xnor r.insn(1);
+                        v.result_class := r.b.class;
+                        v.result_exp := r.b.exponent;
+                        opsel_a <= AIN_B;
+                    end if;
+                    arith_done := '1';
+                end if;
+
+            when ADD_SHIFT =>
+                opsel_r <= RES_SHIFT;
+                set_x := '1';
+                longmask := '0';
+                v.state := ADD_2;
+
+            when ADD_2 =>
+                if r.add_bsmall = '1' then
+                    opsel_a <= AIN_A;
+                else
+                    opsel_a <= AIN_B;
+                end if;
+                opsel_b <= BIN_R;
+                opsel_binv <= r.is_subtract;
+                carry_in <= r.is_subtract and not r.x;
+                v.shift := to_signed(-1, EXP_BITS);
+                v.state := ADD_3;
+
+            when ADD_3 =>
+                -- check for overflow or negative result (can't get both)
+                if r.r(63) = '1' then
+                    -- result is opposite sign to expected
+                    v.result_sign := not r.result_sign;
+                    opsel_ainv <= '1';
+                    carry_in <= '1';
+                    v.state := FINISH;
+                elsif r.r(55) = '1' then
+                    -- sum overflowed, shift right
+                    opsel_r <= RES_SHIFT;
+                    set_x := '1';
+                    v.shift := to_signed(-2, EXP_BITS);
+                    if exp_huge = '1' then
+                        v.state := ROUND_OFLOW;
+                    else
+                        v.state := ROUNDING;
+                    end if;
+                elsif r.r(54) = '1' then
+                    set_x := '1';
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                elsif (r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then
+                    -- r.x must be zero at this point
+                    v.result_class := ZERO;
+                    if r.is_subtract = '1' then
+                        -- set result sign depending on rounding mode
+                        v.result_sign := r.round_mode(1) and r.round_mode(0);
+                    end if;
+                    arith_done := '1';
+                else
+                    renormalize := '1';
+                    v.state := NORMALIZE;
+                end if;
+
             when INT_SHIFT =>
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
@@ -927,6 +1058,10 @@ begin
                 mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec);
                 if mant_nz = '0' then
                     v.result_class := ZERO;
+                    if r.is_subtract = '1' then
+                        -- set result sign depending on rounding mode
+                        v.result_sign := r.round_mode(1) and r.round_mode(0);
+                    end if;
                     arith_done := '1';
                 else
                     -- Renormalize result after rounding
@@ -946,6 +1081,13 @@ begin
 
         end case;
 
+        if qnan_result = '1' then
+            invalid := '1';
+            v.result_class := NAN;
+            v.result_sign := '0';
+            misc_sel <= "0001";
+            opsel_r <= RES_MISC;
+        end if;
         if arith_done = '1' then
             -- Enabled invalid exception doesn't write result or FPRF
             if (invalid and r.fpscr(FPSCR_VE)) = '0' then
@@ -960,7 +1102,7 @@ begin
         -- Data path.
         -- This has A and B input multiplexers, an adder, a shifter,
         -- count-leading-zeroes logic, and a result mux.
-        if r.single_prec = '1' then
+        if longmask = '1' then
             mshift := r.shift + to_signed(-29, EXP_BITS);
         else
             mshift := r.shift;
@@ -1000,6 +1142,9 @@ begin
             when others =>
                 in_b0 := (others => '0');
         end case;
+        if opsel_binv = '1' then
+            in_b0 := not in_b0;
+        end if;
         in_b <= in_b0;
         if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
             shift_res := shifter_64(r.r & x"00000000000000",
@@ -1016,6 +1161,9 @@ begin
                 case misc_sel is
                     when "0000" =>
                         misc := x"00000000" & (r.fpscr and fpscr_mask);
+                    when "0001" =>
+                        -- generated QNaN mantissa
+                        misc := x"0020000000000000";
                     when "0010" =>
                         -- mantissa of max representable DP number
                         misc := x"007ffffffffffffc";
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index e7a1334..8f7407a 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -843,6 +843,158 @@ int fpu_test_12(void)
 	return trapit(0, test12);
 }
 
+struct addvals {
+	unsigned long val_a;
+	unsigned long val_b;
+	unsigned long sum;
+	unsigned long diff;
+} addvals[] = {
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x3fdfffffffffffff, 0x0000000000000000, 0x3fdfffffffffffff, 0x3fdfffffffffffff },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000, 0x0000000000000000 },
+	{ 0xbff0000000000000, 0xbff0000000000000, 0xc000000000000000, 0x0000000000000000 },
+	{ 0x402123456789abcd, 0x4021000000000000, 0x403111a2b3c4d5e6, 0x3fb1a2b3c4d5e680 },
+	{ 0x4061200000000000, 0x406123456789abcd, 0x407121a2b3c4d5e6, 0xbfba2b3c4d5e6800 },
+	{ 0x4061230000000000, 0x3fa4560000000000, 0x4061244560000000, 0x406121baa0000000 },
+	{ 0xc061230000000000, 0x3fa4560000000000, 0xc06121baa0000000, 0xc061244560000000 },
+	{ 0x4061230000000000, 0xbfa4560000000000, 0x406121baa0000000, 0x4061244560000000 },
+	{ 0xc061230000000000, 0xbfa4560000000000, 0xc061244560000000, 0xc06121baa0000000 },
+	{ 0x3fa1230000000000, 0x4064560000000000, 0x4064571230000000, 0xc06454edd0000000 },
+	{ 0xbfa1230000000000, 0x4064560000000000, 0x406454edd0000000, 0xc064571230000000 },
+	{ 0x3fa1230000000000, 0xc064560000000000, 0xc06454edd0000000, 0x4064571230000000 },
+	{ 0xbfa1230000000000, 0xc064560000000000, 0xc064571230000000, 0x406454edd0000000 },
+	{ 0x6780000000000001, 0x6470000000000000, 0x6780000000000009, 0x677ffffffffffff2 },
+	{ 0x6780000000000001, 0x6460000000000000, 0x6780000000000005, 0x677ffffffffffffa },
+	{ 0x6780000000000001, 0x6450000000000000, 0x6780000000000003, 0x677ffffffffffffe },
+	{ 0x6780000000000001, 0x6440000000000000, 0x6780000000000002, 0x6780000000000000 },
+	{ 0x7ff8888888888888, 0x7ff9999999999999, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0xfff8888888888888, 0x7ff9999999999999, 0xfff8888888888888, 0xfff8888888888888 },
+	{ 0x7ff8888888888888, 0x7ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x0000000000000000, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x0001111111111111, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x3ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff0000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999 },
+	{ 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x8002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0xc002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x0000000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999 },
+	{ 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 },
+	{ 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8002222222222222, 0x0001111111111111, 0x8001111111111111, 0x8003333333333333 },
+	{ 0x0000022222222222, 0x0000111111111111, 0x0000133333333333, 0x80000eeeeeeeeeef },
+	{ 0x401ffffffbfffefe, 0x406b8265196bd89e, 0x406c8265194bd896, 0xc06a8265198bd8a6 },
+	{ 0x4030020000000004, 0xbf110001ffffffff, 0x403001fbbfff8004, 0x4030020440008004 },
+	{ 0x3fdfffffffffffff, 0x3fe0000000000000, 0x3ff0000000000000, 0xbc90000000000000 },
+};
+
+int test13(long arg)
+{
+	long i;
+	unsigned long results[2];
+	struct addvals *vp = addvals;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(addvals) / sizeof(addvals[0]); ++i, ++vp) {
+		asm("lfd 5,0(%0); lfd 6,8(%0); fadd 7,5,6; fsub 8,5,6; stfd 7,0(%1); stfd 8,8(%1)"
+		    : : "b" (&vp->val_a), "b" (results) : "memory");
+		if (results[0] != vp->sum || results[1] != vp->diff) {
+			print_hex(i, 2, " ");
+			print_hex(results[0], 16, " ");
+			print_hex(results[1], 16, "\r\n");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_13(void)
+{
+	enable_fp();
+	return trapit(0, test13);
+}
+
+struct addvals sp_addvals[] = {
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x3fdfffffffffffff, 0x0000000000000000, 0x3fe0000000000000, 0x3fe0000000000000 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000, 0x0000000000000000 },
+	{ 0xbff0000000000000, 0xbff0000000000000, 0xc000000000000000, 0x0000000000000000 },
+	{ 0x402123456789abcd, 0x4021000000000000, 0x403111a2c0000000, 0x3fb1a2b000000000 },
+	{ 0x4061200000000000, 0x406123456789abcd, 0x407121a2c0000000, 0xbfba2b0000000000 },
+	{ 0x4061230000000000, 0x3fa4560000000000, 0x4061244560000000, 0x406121baa0000000 },
+	{ 0xc061230000000000, 0x3fa4560000000000, 0xc06121baa0000000, 0xc061244560000000 },
+	{ 0x4061230000000000, 0xbfa4560000000000, 0x406121baa0000000, 0x4061244560000000 },
+	{ 0xc061230000000000, 0xbfa4560000000000, 0xc061244560000000, 0xc06121baa0000000 },
+	{ 0x3fa1230000000000, 0x4064560000000000, 0x4064571240000000, 0xc06454edc0000000 },
+	{ 0xbfa1230000000000, 0x4064560000000000, 0x406454edc0000000, 0xc064571240000000 },
+	{ 0x3fa1230000000000, 0xc064560000000000, 0xc06454edc0000000, 0x4064571240000000 },
+	{ 0xbfa1230000000000, 0xc064560000000000, 0xc064571240000000, 0x406454edc0000000 },
+	{ 0x6780000000000001, 0x6470000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x6780000000000001, 0x6460000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x6780000000000001, 0x6450000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x6780000000000001, 0x6440000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x7ff8888888888888, 0x7ff9999999999999, 0x7ff8888880000000, 0x7ff8888880000000 },
+	{ 0xfff8888888888888, 0x7ff9999999999999, 0xfff8888880000000, 0xfff8888880000000 },
+	{ 0x7ff8888888888888, 0x7ff0000000000000, 0x7ff8888880000000, 0x7ff8888880000000 },
+	{ 0x7ff8888888888888, 0x0000000000000000, 0x7ff8888880000000, 0x7ff8888880000000 },
+	{ 0x7ff8888888888888, 0x0001111111111111, 0x7ff8888880000000, 0x7ff8888880000000 },
+	{ 0x7ff8888888888888, 0x3ff0000000000000, 0x7ff8888880000000, 0x7ff8888880000000 },
+	{ 0x7ff0000000000000, 0x7ff9999999999999, 0x7ff9999980000000, 0x7ff9999980000000 },
+	{ 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x8002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0xc002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x0000000000000000, 0x7ff9999999999999, 0x7ff9999980000000, 0x7ff9999980000000 },
+	{ 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 },
+	{ 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8002222222222222, 0x0001111111111111, 0x0000000000000000, 0x8000000000000000 },
+	{ 0x0000022222222222, 0x0000111111111111, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x47dc000020000000, 0x47ec03ffe0000000, 0x7ff0000000000000, 0xc7dc07ffa0000000 },
+	{ 0x47dbffffe0000000, 0x47eff7ffe0000000, 0x7ff0000000000000, 0xc7e1f80000000000 },
+	{ 0x47efffffc0000000, 0xc7efffffc0000000, 0x0000000000000000, 0x7ff0000000000000 },
+};
+
+int test14(long arg)
+{
+	long i;
+	unsigned long results[2];
+	struct addvals *vp = sp_addvals;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(sp_addvals) / sizeof(sp_addvals[0]); ++i, ++vp) {
+		asm("lfd 5,0(%0); frsp 5,5; lfd 6,8(%0); frsp 6,6; "
+		    "fadds 7,5,6; fsubs 8,5,6; stfd 7,0(%1); stfd 8,8(%1)"
+		    : : "b" (&vp->val_a), "b" (results) : "memory");
+		if (results[0] != vp->sum || results[1] != vp->diff) {
+			print_hex(i, 2, " ");
+			print_hex(results[0], 16, " ");
+			print_hex(results[1], 16, "\r\n");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_14(void)
+{
+	enable_fp();
+	return trapit(0, test14);
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -880,6 +1032,8 @@ int main(void)
 	do_test(10, fpu_test_10);
 	do_test(11, fpu_test_11);
 	do_test(12, fpu_test_12);
+	do_test(13, fpu_test_13);
+	do_test(14, fpu_test_14);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 668ff65367cbf02294b638e77fd93f9173db9532..623db3f690b0aad69472a9a8740964239799983c 100755
GIT binary patch
delta 4711
zcmd5;eNa@_6+iE>;9`{Q^0|BnyNCpkfdy={3d(~hA|!|=wMlB|q9`^x5ofHU4PoCf
zB!j6qn3J@ci5q0piPkz=Y{%M8T{BKKw&`f6A7uPOqX-Eo=@?xFxjpy3_mD4R@<(TS
zX6~MQf9H44J?DO$eeHXs_CH9;0LpBDlFC0%DA`1M1yqz&0=NlVP1tI}R<BSiwk79C
z-fb^u%L8eLw;p@av#}|0CP4RJi0QI}S_agycm;O8hp|#V*092v8ut_8o;iPZluac+
z2g_PK0Q=7$8OL_BOeNdf=eO%NZ4C%;+q`(jGF{q<cP!6KJKIXEWwN|I4&M)ZD%e{{
zp@Dm26P^jnh`bmJ#fO>0&zLW_dr*ta;>D2g?}wZv6F`2q&R!f|I8kkC%{5U@^-bsv
zUoLr3j>wMO83V;9gp1aE#_IC{e7WIEu_R)ayd?(Tig-1sW@Urx*I(fNs0>Vsv`J@i
zcH~0oBsN7Zl=EK3OOa)=DH=1PHUu7B+Tgl}UqwBQ9Z{2{cI=IsALw4z;QEQcYHX8a
zR~6RTiUQxQXmEXo*61AUh%mhqJ(nx%o_nm60?6aLe*4D_t_23c`pPg%ySu?9k>zPA
z0#8Iw!`Gr0%RBv85}h=`T?@g>-P5o)y3GU;P>da6HnhZ>&=(Uo`D8d0&$jX{0rxe$
z7#1rRgkw-_h5UO9u8d6!{cRW&@1kuG_T;*amYvudn<2lr7XKbwB(Dj>n7A}~`!+0!
zTO8?LyWBN}cH0h>U5vHnSwM9W@nBq$d>|C>#Jwh~@8QZRC83-c_<*O?nrG5TA5Te>
zk67{Mlx(@N2@~VD#<<t+ac%7X+_PX{jK?$aMRKzRLlY`^9&}giGc6$b`H9=XanBA{
zPH&wJgXy4%OR+a#!5g6Rd)kp}2Z{8)(`NYObei+}K#DW)0YF!~8O)zqpu5Ws1>N9E
zt1XB60v?_$!qXyg!Y;p~pOum(US${0&)Uz5N>hd<%)@?gpOs4BaUGekoE$k^Tb_hs
z8#zzWoqxpY6$uyxlR9%fgF=3^?R@jUbvn(z9GQ+SrDcGpWOMokD6Jk!tM|L4^*xGM
zPpeGiHI0DLX<0|5)e`dPwAPGBt0p1*xm9%N{FKhoDZNBD_h0>k9w%+mL|_SgqxMbI
zZee+!r`8F}#pb=IW3y<3HtT&SZRh}=OUMmfFd2#~^}ETRr}`$w+gm*~bOMv0Ug5{L
zk^20YUh(rHkekjip4NDrnZ%hxHh?_A8_hA3p9tY+wy?XPQXPjA>iX&~KmIzAK?_AY
z7Z*7&blnppqix<y(I#VDV(qvQzj^2J5myrDaS_iJejXEXG?9i!yarRJ-j!0(k(49N
zz#k;dplbVe(uM$!svTmNpI!NKZ-BYBy^l|{i)Y5zvb;vJgEw*83Bvha2hyH3>ks*B
z!uHBPtK~R9lkJ3G5!mZPSPNmF5;m=imHw2rQK9YQeEzCv|E>H64K2Q#i0F*6OZ)I}
z$|*XN`n<_>QD(f<;3}Xp#K$2tuBCC7C`%3(FJrQIS=T#}_7L8W6MtXeHhb|msh;4W
zQQNGv=cUQst1hsYRNA3r6A|8@o;%N**nPkRwfh&s8)oXuADcIs=n)Uc0;uXt8p0I8
zvy2#HO!~qIu~bqzK<Z|I&6zwHv$w4-y+ZQrzRcJ)0=QVH5ee3JFjme15{@a<4iVNz
znhVEXptj`-V;^yVgk!6y*{?EYC(VUpwbarHOX2_t*Ui-O2)h7)Kgw{(eOIwABSRke
z91oIOe+@5X#LFLC!$DFl*D*dbUOs&t=aA~VjuoVAomfXo=|nH7Rh{@JQq7%sDKlO=
ziWf6tgI<)W0C&>V*g)!;Z{T2NOwe}fx^G~-GiJ_F>K?tpSR9Xu#7^z6g!u@o;lSMB
z0{BpRCPc9o)Rl)o;pLzL9(o^fCUxNkXJU{ofHLbsuQM&En7WN!_%~;A(0b}ZH{K`0
z2h{a<V`5ff&`IiU`2tI_;uqP;-b2HaU+Nn_Kct-2_)uCgF$4-|nJ2VpEGB@49>#iz
z4+f|DA%{mZgE-IPx~!=&OR3k=`*7>$lC_4g1HE`Kt1##&b@OgfcTN^R6bPhj`yOV`
zv2eovK^ULO<ymSegqd(fc4^Q?>bY;@@3ND_cT+d~Yg$So#=H<a1^+tsWhbS<9mX2@
zR1)q{TqVL91VY|OE7Y#t!8ti;skXa}afl12tJKm7OXq$`IQA~Jxoz8WwgzLSV#fSp
zivyUD%k+_?i7_@bs>P8L#X`c4hl(RT;M2+Q>900d&ztH{w6R0!{O8HGR%@1`iq2up
ztG!auc$J#nu}*<i70=~OKXzQqn_8*OU$UF?-Sd*fxaF8b$EQ~M1W&WK9P>B?z=@jS
zp#$#Fm}ocxc<6xJjd`4>&)>xt23S(1(62M}heDt2eWA)nW17E<&ok_2J!DX5$caAE
zoM1TMV`HM>Xt)DzH|BAkq8*P~O*lzSRj|-%^?$?xc&bO*-Bom5JVa5e?~h9;uL{r|
z432A8s$|_R6!~cs;`i(ja?fLoM``xZE+~Gc6AaCxBI$dS1KmzfTZn%Q&niTEq+R!)
z-<UDVfo>Sez=%}W=qXKz_78=pvzey-WR1cli)^^H)J$KDsFvEyi9+NN10Pu;V*y4v
zA6gV}(V{81;Ta3QRce(Oo|EPqj|`tuEc~%~z;JA&s%tezNPw!X3!vcu0uBSXb(16n
z)WJd4&sc)uZC@R|qJFjVi}q5|Ym~k1KPUaFkLnVD?4<7}#w4&FH>vD+zvNU*Dkvs7
zQ8O@Ry!MWK&Zjfeb@Lx7?98@I<+c?&r0}8m7b;N`4Q@yNcdH2G-p4BfU*}Nk*oq+R
zydrS<KdlIgUJ-ncs0cozB5<NlWHPcM7!wUgqbq_jkMqV<1mTa~m8Sik(G`JkZhI7=
zyy}}BC}Zr#12wKX%~u;6#S0Qzw+~eWzCJfFoj9f<Ji>vVkqB?R1w6(x`R4g_doxBw
zC|uHx49{T4j(dDTmGu9nBIwSIMLgf4B4m!K2wLPrzoRMwJ<o_TJHUnc#OPoFJzSHB
vvu|M&E}WtPeOjK^?00Vsx6|j-6>j%2CUCA#@gO7c-_m4y?2|Ba@xK28+c^CS

delta 2035
zcmai#e@s(X6vxkdl-jaVU-?lAlv)rwK&+)8J_NBKu&yYexMXumip*qATv(<Kwn~}l
z6pS;%Sr%h9ha?(D*p#8k;*8N{Zqe-zml%V=Ii~1ZbOsj$oXXv~h2V4~p5*qN`#Im9
zd+xpOwY}d7y@vz~fMYR$tNO<ot_Je!q0&_i;3l?_*hXR-*EdzRS~3M}U2PXXOX=Qo
za;A4|XHpJ8;Ac|GW@st{3M34&zB`PS^7#ZS%$d-hC+&s4i_hesIVtDsvUV>(XW#K@
za(By`D7UL`-(+rL4G3`?5HCn`g*F_KDuuQlm#$0{4RN?X@<lZkQfuHI+ldz<t@^dG
z;Ou6JV~m~24UA88CLwC0vb7bI0$385RIiLG41?y5Tm|h(Y{mmoH3Fg-ZPzClz<D-U
zqGJhT+m`^G$@RUCuINRg#(+Db-%?6%_~fq!u`|Ys$$Ep}z{UD<Aqzj&my2Dm;Wd4k
z=+mP$W@~858lU_C@5L;{{uqtm#J^%*3I(@M-VkJU294NTjg5wqP(_tbzJj{gg>A7#
zp}*4Nhww`5d%`^WoHN7i0d<Z01a6D#QAm;C#QsPFwwn}K5pN8?6amiVGx^TIJ%GPO
z#)~T=@YDDz@w^sC;?p9|>A-oA3I_9*#(Jp@?MAEETZ`+AC89@%oyIiLup2KMSL*v~
zYvcqvG&5BD8SBWCph+fSenN?OO2QopZ;P-OM-p5STp0S8m2#0{LbTG9CjKxJ8%=gm
z)r{Ym_Qdtq9+K-H-tfNs=zkSg&ngkywb(JM3NI!`E%#G734}4e@zjT`^zy@TuMNC%
z7_eE#XViFF!Y1}tG352wpho6w%OqPw*SBFZwn3tN6*H1@BT~b`Sv|RHet*z}jY%Ee
zA!<nj+a0{`F^c&H=l#Q<lH7F8QfTATY1E|2ryM|DQ1`4xMRjgI;B^)U^D<O7O{0mB
z&yq-h#^4UxsoP}io{bSJUU<4&$w%E<(39+$w&PdEp5Ji>X`a_{c(BfMIu49J-SIMP
znKLS=@k9EFh{6+=EP5RWEn7ppuqk+n{xSB)?ZY7|&ym0R7tO(!ld*MqD#`;UwYC!H
zXB|psCT*H{M9vXAEWYpIoZpFocz2L}FvZNoJ}36kAS>NMO*!cJxSl^|bpAU21Whfz
znS^MIF$*oIoBNZ-Oj`Z|h`7$*rZ;POYz+Q>fm?-WPxq>)W<4j<D}`{pE`!-sZ3b5Z
z3F54|#h4W6Qh=wk9JZ+_7Qa_DC?GZGzbphCIhrZv2<oJ#=D6Ql9vxgNIUOLi1)ya<
zZ$^bZ<JKy{n|z(I0Rr448YF^YVig>aaps_5yTRBHd2XDoqM;dJ><9;BoV`h-otT+C
zH_kjXx`-unK*s%E8mEX|1;9RAqF8qm$8A>8a|^8*R&n?iR%Mt(<Db|_&g{nn<eYvy
zO|H(5*U0Vj<2bo9el*!lqTi4C<fH&rk&^>>+8(ETRit$Sc+DP{Ur(Vwz?hM*PiCgE
zo7fPsIu44|1@MKiRO`?cG*)P#@ONbaZ}S*w@&++GGfCM?;hI6L%1l$LLIA!U#66i7
zWfq0D+jx=$YbZ>)gV!naQF!VOhGm)5yh;~Mhp{1RcHAY3oOhobPZ4GP#8%zK@vM9`
zpV|}v1Nb;=>AZT1Hs5>FG(XGT#5}~flG`#G?Zmop(}Ggv5Jjr{czS`wXbhzm?lT4g
zDT=jb+Sj{UC{h;C6dg8Wbn^&f%{1Z0EAv}_gc;e9V(loFWTy%3_)d0Y?D0{?(s>Os
kF8hSWFZ54W-*Yf~hq~ujc2r2u(c)np9xc}3{Iy^H3;nCTwg3PC

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index d926abc..440cd77 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -10,3 +10,5 @@ test 09:PASS
 test 10:PASS
 test 11:PASS
 test 12:PASS
+test 13:PASS
+test 14:PASS

From e6a5f237bc02de146e2416cf3d8bec7473e33483 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 27 Jul 2020 18:27:50 +1000
Subject: [PATCH 17/30] FPU: Implement fmul[s]

This implements the fmul and fmuls instructions.

For fmul[s] with denormalized operands we normalize the inputs
before doing the multiplication, to eliminate the need for doing
count-leading-zeroes on P.  This adds 3 cycles to the execution
time when one operand is denormalized, or 5 cycles when both are.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   2 +
 decode2.vhdl               |   7 ++
 decode_types.vhdl          |   2 +-
 fpu.vhdl                   | 182 ++++++++++++++++++++++++++++++++++++-
 tests/fpu/fpu.c            |  80 ++++++++++++++++
 tests/test_fpu.bin         | Bin 24024 -> 24272 bytes
 tests/test_fpu.console_out |   2 +
 7 files changed, 271 insertions(+), 4 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 737d83c..721c478 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -418,6 +418,7 @@ architecture behaviour of decode1 is
         2#01110#  =>  (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fcfid[u]s
         2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
         2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
+        2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
         others => illegal_inst
         );
 
@@ -470,6 +471,7 @@ architecture behaviour of decode1 is
         --                         op                               in   out   A   out  in    out  len        ext                                pipe
         2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
         2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
+        2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
         others => illegal_inst
         );
 
diff --git a/decode2.vhdl b/decode2.vhdl
index ec8232f..9443212 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -152,6 +152,12 @@ architecture behaviour of decode2 is
                 else
                     return ('0', (others => '0'), (others => '0'));
                 end if;
+            when FRC =>
+                if HAS_FPU then
+                    return ('1', fpr_to_gspr(insn_frc(insn_in)), reg_data);
+                else
+                    return ('0', (others => '0'), (others => '0'));
+                end if;
             when NONE =>
                 return ('0', (others => '0'), (others => '0'));
         end case;
@@ -308,6 +314,7 @@ begin
                        else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU
                        else gpr_to_gspr(insn_rb(d_in.insn));
     r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR
+                       else fpr_to_gspr(insn_frc(d_in.insn)) when d_in.decode.input_reg_c = FRC and HAS_FPU
                        else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU
                        else gpr_to_gspr(insn_rs(d_in.insn));
 
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 08fdc4a..72609bf 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -26,7 +26,7 @@ package decode_types is
     type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA);
     type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
                            CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB);
-    type input_reg_c_t is (NONE, RS, RCR, FRS);
+    type input_reg_c_t is (NONE, RS, RCR, FRC, FRS);
     type output_reg_a_t is (NONE, RT, RA, SPR, FRT);
     type rc_t is (NONE, ONE, RC);
     type carry_in_t is (ZERO, CA, OV, ONE);
diff --git a/fpu.vhdl b/fpu.vhdl
index e9edfb4..209daa0 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -40,15 +40,18 @@ architecture behaviour of fpu is
                      DO_FMR, DO_FMRG,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
-                     DO_FADD,
+                     DO_FADD, DO_FMUL,
                      FRI_1,
                      ADD_SHIFT, ADD_2, ADD_3,
+                     MULT_1,
                      INT_SHIFT, INT_ROUND, INT_ISHIFT,
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
                      ROUND_UFLOW, ROUND_OFLOW,
                      ROUNDING, ROUNDING_2, ROUNDING_3,
-                     DENORM);
+                     DENORM,
+                     RENORM_A, RENORM_A2,
+                     RENORM_C, RENORM_C2);
 
     type reg_type is record
         state        : state_t;
@@ -65,8 +68,10 @@ architecture behaviour of fpu is
         fpscr        : std_ulogic_vector(31 downto 0);
         a            : fpu_reg_type;
         b            : fpu_reg_type;
+        c            : fpu_reg_type;
         r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
         x            : std_ulogic;
+        p            : std_ulogic_vector(63 downto 0);  -- 8.56 format
         result_sign  : std_ulogic;
         result_class : fp_number_class;
         result_exp   : signed(EXP_BITS-1 downto 0);
@@ -84,6 +89,8 @@ architecture behaviour of fpu is
         is_subtract  : std_ulogic;
         exp_cmp      : std_ulogic;
         add_bsmall   : std_ulogic;
+        is_multiply  : std_ulogic;
+        first        : std_ulogic;
     end record;
 
     signal r, rin : reg_type;
@@ -103,11 +110,17 @@ architecture behaviour of fpu is
     signal r_hi_nz       : std_ulogic;
     signal r_lo_nz       : std_ulogic;
     signal misc_sel      : std_ulogic_vector(3 downto 0);
+    signal f_to_multiply : MultiplyInputType;
+    signal multiply_to_f : MultiplyOutputType;
+    signal msel_1        : std_ulogic_vector(1 downto 0);
+    signal msel_2        : std_ulogic_vector(1 downto 0);
+    signal msel_inv      : std_ulogic;
 
     -- opsel values
     constant AIN_R    : std_ulogic_vector(1 downto 0) := "00";
     constant AIN_A    : std_ulogic_vector(1 downto 0) := "01";
     constant AIN_B    : std_ulogic_vector(1 downto 0) := "10";
+    constant AIN_C    : std_ulogic_vector(1 downto 0) := "11";
 
     constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
     constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
@@ -115,8 +128,17 @@ architecture behaviour of fpu is
 
     constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
     constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
+    constant RES_MULT  : std_ulogic_vector(1 downto 0) := "10";
     constant RES_MISC  : std_ulogic_vector(1 downto 0) := "11";
 
+    -- msel values
+    constant MUL1_A : std_ulogic_vector(1 downto 0) := "00";
+    constant MUL1_B : std_ulogic_vector(1 downto 0) := "01";
+    constant MUL1_R : std_ulogic_vector(1 downto 0) := "11";
+
+    constant MUL2_C   : std_ulogic_vector(1 downto 0) := "00";
+    constant MUL2_R   : std_ulogic_vector(1 downto 0) := "11";
+
     -- Left and right shifter with 120 bit input and 64 bit output.
     -- Shifts inp left by shift bits and returns the upper 64 bits of
     -- the result.  The shift parameter is interpreted as a signed
@@ -313,6 +335,13 @@ architecture behaviour of fpu is
     end;
 
 begin
+    fpu_multiply_0: entity work.multiply
+        port map (
+            clk => clk,
+            m_in => f_to_multiply,
+            m_out => multiply_to_f
+            );
+
     fpu_0: process(clk)
     begin
         if rising_edge(clk) then
@@ -347,6 +376,7 @@ begin
         variable v           : reg_type;
         variable adec        : fpu_reg_type;
         variable bdec        : fpu_reg_type;
+        variable cdec        : fpu_reg_type;
         variable fpscr_mask  : std_ulogic_vector(31 downto 0);
         variable illegal     : std_ulogic;
         variable j, k        : integer;
@@ -377,6 +407,10 @@ begin
         variable is_add      : std_ulogic;
         variable qnan_result : std_ulogic;
         variable longmask    : std_ulogic;
+        variable set_a       : std_ulogic;
+        variable set_c       : std_ulogic;
+        variable px_nz       : std_ulogic;
+        variable maddend     : std_ulogic_vector(127 downto 0);
     begin
         v := r;
         illegal := '0';
@@ -407,11 +441,15 @@ begin
             v.denorm := '0';
             v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN);
             v.is_subtract := '0';
+            v.is_multiply := '0';
             v.add_bsmall := '0';
             adec := decode_dp(e_in.fra, int_input);
             bdec := decode_dp(e_in.frb, int_input);
+            cdec := decode_dp(e_in.frc, int_input);
             v.a := adec;
             v.b := bdec;
+            v.c := cdec;
+
             v.exp_cmp := '0';
             if adec.exponent > bdec.exponent then
                 v.exp_cmp := '1';
@@ -440,10 +478,14 @@ begin
             exp_huge := '1';
         end if;
 
+        -- Compare P with zero
+        px_nz := or (r.p(57 downto 4));
+
         v.writing_back := '0';
         v.instr_done := '0';
         v.update_fprf := '0';
         v.shift := to_signed(0, EXP_BITS);
+        v.first := '0';
         opsel_a <= AIN_R;
         opsel_ainv <= '0';
         opsel_amask <= '0';
@@ -460,6 +502,13 @@ begin
         set_x := '0';
         qnan_result := '0';
         longmask := r.single_prec;
+        set_a := '0';
+        set_c := '0';
+        f_to_multiply.is_32bit <= '0';
+        f_to_multiply.valid <= '0';
+        msel_1 <= MUL1_A;
+        msel_2 <= MUL2_C;
+        msel_inv <= '0';
 
         case r.state is
             when IDLE =>
@@ -503,6 +552,9 @@ begin
                             v.state := DO_FCTI;
                         when "10100" | "10101" =>
                             v.state := DO_FADD;
+                        when "11001" =>
+                            v.is_multiply := '1';
+                            v.state := DO_FMUL;
                         when others =>
                             illegal := '1';
                     end case;
@@ -795,6 +847,81 @@ begin
                     arith_done := '1';
                 end if;
 
+            when DO_FMUL =>
+                -- fmul[s]
+                opsel_a <= AIN_A;
+                v.result_sign := r.a.negative;
+                v.result_class := r.a.class;
+                v.result_exp := r.a.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.a.class = FINITE and r.c.class = FINITE then
+                    v.result_sign := r.a.negative xor r.c.negative;
+                    v.result_exp := r.a.exponent + r.c.exponent;
+                    -- Renormalize denorm operands
+                    if r.a.mantissa(54) = '0' then
+                        v.state := RENORM_A;
+                    elsif r.c.mantissa(54) = '0' then
+                        opsel_a <= AIN_C;
+                        v.state := RENORM_C;
+                    else
+                        f_to_multiply.valid <= '1';
+                        v.state := MULT_1;
+                    end if;
+                else
+                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
+                        (r.c.class = NAN and r.c.mantissa(53) = '0') then
+                        -- Signalling NAN
+                        v.fpscr(FPSCR_VXSNAN) := '1';
+                        invalid := '1';
+                    end if;
+                    if r.a.class = NAN then
+                    -- result is A
+                    elsif r.c.class = NAN then
+                        v.result_class := NAN;
+                        v.result_sign := r.c.negative;
+                        opsel_a <= AIN_C;
+                    elsif (r.a.class = INFINITY and r.c.class = ZERO) or
+                        (r.a.class = ZERO and r.c.class = INFINITY) then
+                        -- invalid operation, construct QNaN
+                        v.fpscr(FPSCR_VXIMZ) := '1';
+                        qnan_result := '1';
+                    elsif r.a.class = ZERO or r.a.class = INFINITY then
+                        -- result is +/- A
+                        v.result_sign := r.a.negative xor r.c.negative;
+                    else
+                        -- r.c.class is ZERO or INFINITY
+                        v.result_class := r.c.class;
+                        v.result_sign := r.a.negative xor r.c.negative;
+                    end if;
+                    arith_done := '1';
+                end if;
+
+            when RENORM_A =>
+                renormalize := '1';
+                v.state := RENORM_A2;
+
+            when RENORM_A2 =>
+                set_a := '1';
+                v.result_exp := new_exp;
+                opsel_a <= AIN_C;
+                if r.c.mantissa(54) = '1' then
+                    v.first := '1';
+                    v.state := MULT_1;
+                else
+                    v.state := RENORM_C;
+                end if;
+
+            when RENORM_C =>
+                renormalize := '1';
+                v.state := RENORM_C2;
+
+            when RENORM_C2 =>
+                set_c := '1';
+                v.result_exp := new_exp;
+                v.first := '1';
+                v.state := MULT_1;
+
             when ADD_SHIFT =>
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
@@ -848,6 +975,13 @@ begin
                     v.state := NORMALIZE;
                 end if;
 
+            when MULT_1 =>
+                f_to_multiply.valid <= r.first;
+                opsel_r <= RES_MULT;
+                if multiply_to_f.valid = '1' then
+                    v.state := FINISH;
+                end if;
+
             when INT_SHIFT =>
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
@@ -930,6 +1064,9 @@ begin
                 v.state := ROUNDING;
 
             when FINISH =>
+                if r.is_multiply = '1' and px_nz = '1' then
+                    v.x := '1';
+                end if;
                 if r.r(63 downto 54) /= "0000000001" then
                     renormalize := '1';
                     v.state := NORMALIZE;
@@ -1099,6 +1236,32 @@ begin
             update_fx := '1';
         end if;
 
+        -- Multiplier data path
+        case msel_1 is
+            when MUL1_A =>
+                f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00";
+            when MUL1_B =>
+                f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00";
+            when others =>
+                f_to_multiply.data1 <= r.r(61 downto 0) & "00";
+        end case;
+        case msel_2 is
+            when MUL2_C =>
+                f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00";
+            when others =>
+                f_to_multiply.data2 <= r.r(61 downto 0) & "00";
+        end case;
+        maddend := (others => '0');
+        if msel_inv = '1' then
+            f_to_multiply.addend <= not maddend;
+        else
+            f_to_multiply.addend <= maddend;
+        end if;
+        f_to_multiply.not_result <= msel_inv;
+        if multiply_to_f.valid = '1' then
+            v.p := multiply_to_f.result(63 downto 0);
+        end if;
+
         -- Data path.
         -- This has A and B input multiplexers, an adder, a shifter,
         -- count-leading-zeroes logic, and a result mux.
@@ -1119,8 +1282,10 @@ begin
                 in_a0 := r.r;
             when AIN_A =>
                 in_a0 := r.a.mantissa;
-            when others =>
+            when AIN_B =>
                 in_a0 := r.b.mantissa;
+            when others =>
+                in_a0 := r.c.mantissa;
         end case;
         if (or (mask and in_a0)) = '1' and set_x = '1' then
             v.x := '1';
@@ -1157,6 +1322,8 @@ begin
                 result <= std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
             when RES_SHIFT =>
                 result <= shift_res;
+            when RES_MULT =>
+                result <= multiply_to_f.result(121 downto 58);
             when others =>
                 case misc_sel is
                     when "0000" =>
@@ -1207,6 +1374,15 @@ begin
         end case;
         v.r := result;
 
+        if set_a = '1' then
+            v.a.exponent := new_exp;
+            v.a.mantissa := shift_res;
+        end if;
+        if set_c = '1' then
+            v.c.exponent := new_exp;
+            v.c.mantissa := shift_res;
+        end if;
+
         if opsel_r = RES_SHIFT then
             v.result_exp := new_exp;
         end if;
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 8f7407a..305359a 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -205,6 +205,7 @@ struct sp_dp_equiv {
 	{ 0x00200000, 0x37f0000000000000 },
 	{ 0x00000002, 0x36b0000000000000 },
 	{ 0x00000001, 0x36a0000000000000 },
+	{ 0x7f7fffff, 0x47efffffe0000000 },
 };
 
 int sp_to_dp(long arg)
@@ -995,6 +996,83 @@ int fpu_test_14(void)
 	return trapit(0, test14);
 }
 
+struct mulvals {
+	unsigned long val_a;	/* first multiplicand: raw 64-bit pattern that test15 loads with lfd */
+	unsigned long val_b;	/* second multiplicand: test15 loads it with lfd from 8 bytes past val_a */
+	unsigned long prod;	/* expected fmul result; test15 compares it bit-for-bit */
+} mulvals[] = {
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },
+	{ 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 },
+	{ 0xbf4fff801fffffff, 0x6d7fffff8000007f, 0xecdfff7fa001fffe },
+	{ 0x3fbd50275a65ed80, 0x0010000000000000, 0x0001d50275a65ed8 },
+};
+
+int test15(long arg)
+{
+	long i;
+	unsigned long result;	/* raw bits of the fmul result, written by stfd */
+	struct mulvals *vp = mulvals;
+	/* Run fmul on every mulvals entry; return 0 if all products match, else 1 + index of the first mismatch. */
+	set_fpscr(FPS_RN_NEAR);	/* presumably selects round-to-nearest; FPS_RN_NEAR is defined elsewhere -- confirm */
+	for (i = 0; i < sizeof(mulvals) / sizeof(mulvals[0]); ++i, ++vp) {
+		asm("lfd 5,0(%0); lfd 6,8(%0); fmul 7,5,6; stfd 7,0(%1)"	/* f5 = val_a, f6 = doubleword at val_a+8, f7 = f5 * f6 -> result */
+		    : : "b" (&vp->val_a), "b" (&result) : "memory");	/* NOTE(review): f5-f7 are written but not named in the clobber list */
+		if (result != vp->prod) {
+			print_hex(i, 2, " ");	/* failure report: entry index, then the raw result */
+			print_hex(result, 16, " ");
+			return i + 1;	/* nonzero even when entry 0 fails, so 0 can mean success */
+		}
+	}
+	return 0;
+}
+
+int fpu_test_15(void)
+{
+	enable_fp();	/* presumably sets MSR[FP] so the FP instructions in test15 do not trap -- confirm against enable_fp */
+	return trapit(0, test15);
+}
+
+struct mulvals_sp {
+	unsigned int val_a;	/* first multiplicand: raw 32-bit pattern that test16 loads with lfs */
+	unsigned int val_b;	/* second multiplicand: test16 loads it with lfs from 4 bytes past val_a */
+	unsigned int prod;	/* expected fmuls result; test16 compares it bit-for-bit after stfs */
+} mulvals_sp[] = {
+	{ 0x00000000, 0x00000000, 0x00000000 },
+	{ 0x80000000, 0x80000000, 0x00000000 },
+	{ 0x3f800000, 0x3f800000, 0x3f800000 },
+	{ 0xbf800000, 0x3f800000, 0xbf800000 },
+	{ 0xbe7ff801, 0x6d7fffff, 0xec7ff800 },
+	{ 0xc100003d, 0xfe803ff8, 0x7f800000 },
+	{ 0x4f780080, 0x389003ff, 0x488b8427 },
+};
+
+int test16(long arg)
+{
+	long i;
+	unsigned int result;	/* raw 32-bit pattern of the fmuls result, written by stfs */
+	struct mulvals_sp *vp = mulvals_sp;
+	/* Run fmuls on every mulvals_sp entry; return 0 if all products match, else 1 + index of the first mismatch. */
+	set_fpscr(FPS_RN_NEAR);	/* presumably selects round-to-nearest; FPS_RN_NEAR is defined elsewhere -- confirm */
+	for (i = 0; i < sizeof(mulvals_sp) / sizeof(mulvals_sp[0]); ++i, ++vp) {
+		asm("lfs 5,0(%0); lfs 6,4(%0); fmuls 7,5,6; stfs 7,0(%1)"	/* f5 = val_a, f6 = word at val_a+4, f7 = f5 * f6 -> result */
+		    : : "b" (&vp->val_a), "b" (&result) : "memory");	/* NOTE(review): f5-f7 are written but not named in the clobber list */
+		if (result != vp->prod) {
+			print_hex(i, 2, " ");	/* failure report: entry index, then the raw result (8 hex digits) */
+			print_hex(result, 8, " ");
+			return i + 1;	/* nonzero even when entry 0 fails, so 0 can mean success */
+		}
+	}
+	return 0;
+}
+
+int fpu_test_16(void)
+{
+	enable_fp();	/* presumably sets MSR[FP] so the FP instructions in test16 do not trap -- confirm against enable_fp */
+	return trapit(0, test16);
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1034,6 +1112,8 @@ int main(void)
 	do_test(12, fpu_test_12);
 	do_test(13, fpu_test_13);
 	do_test(14, fpu_test_14);
+	do_test(15, fpu_test_15);
+	do_test(16, fpu_test_16);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 623db3f690b0aad69472a9a8740964239799983c..1e0e29e0c174fe7fcc993071b35fef6086d1a0c0 100755
GIT binary patch
delta 2764
zcmai0du)@}6+hn>+nCHT&ck^+!OlW}kQpbT=||%RCryH#R7qf|v}O<zC@hG!BGOug
zF4zXS73_f$PK%;y!T~`9s8yk%QrhxJ(KI#P*uo#1n0ST6qz(-<)p)oo`0gCrqe;ue
zk&e%~zwh_m`@83!@5&cHl}1iW`2db*0czX+Jg0UW#f`A8whf?_uq}jbA#BS=@4BA+
z63O4wTp|^o*z@;!MgH970F#%dgs1}|vm5-C%RB$T`0!r<zPL0p>lyAbWLb#eCC2S~
zuLt0_mwICiLu58+P50v$h5{*scMU5ObYP~bptV14@%GSwE3rYA2XpY<<b{$S&n1_p
zKFkK^31JBdF;q4gj><6jQJ+#OPiJFI%BRGWjm5^(CUAZh;T>Ed#Lg7}LuH51Wh|55
zGvQmtVo8S|8x?iCPkH|)9!s-gUTV7B`b#WNt(TjcaZhT!^jEx++92m;p*8JwwR5dc
z`4+!Pdm2a6baHzO-cDPodda73j$m8U6QoV(POnktt@A1W!o-ZH_hvMyeXTyl#RKfl
z(5c(oe2T=QB;&WDjzrmMpGwnupYrhpNhnP^ANn5FZSW}_3>T{}&^F%0RhbJgoUvL;
zplpf!&IGzMKU8(6f)huR)3Gl{g^gLJgeC&cCxMeo7{DvZS@H!V?#pVCT}gN^t01x3
z2+jjEU*vhdyEBAErloRc2d+2O$PZHRn8}^$?bx7XQ)$i678K&3-2h&N80FbDvONXg
z%6?UD--q|IYm=C$e#n{++Er1~>YM_(DjD55=Q6z=hn3Fn#yzeFKbop}evNES!h`eA
z@zR^j1+?a2<>7xsodh1`<eUv%jbKqsiP&%6BzN|r-crfL)s`qR@drek0ry(U_nJ)}
z4LD-m<J#_hhnsS=%nTC6qn~SG`122?8xH`27Y*Q=G{f~EC<R@1*jd3lkRv;YYPP|F
z(2I0PTms62Tml!3XUc1z2Tln@Svo9U-Y1}vDlRKQt~o22Gee1C-z+6VDi|IjB~zqi
zL<_DRq{ItK{Qp&Qfck=zcv*>-l=RSerjq0Os1n_el;DUZrG6J3lL4d<_tE5iQTNa9
z!X6vdIUdA(uJv$yv|k#hJzX}~5UKx1G3*)Y(4(>3#%H3(;@oGX&G<_0fTx=dO$R$2
zT)ke(xk|V0@i$pk$k-yP-s3pK#3`p|0QLy)haBGH;mADCMxHCQ_s4LOlpREpz#UQC
zN9ta}6F)_%2dncQFV1-P$>P4b^JH=7i39KiaYu<XQ{3-y_k#P93wO|!+JqnFm&k+r
z2X5xS9_Q(c9#AkOu7A}Zr|oM0n%|7xgF>vgAED>q&+S=7IM-E8^*J8<y51zLUw)&D
zajyR$;T;j|H;=F(!qyVDU_#WrMh)PgQZb)DI;wRdcgLf}mk|+dX=Z5{6PJ8R_1|fq
zM@{gj7k!F_E(IPNXsm1YDT$Fw!lLkHBKy||_okZloSR}7yVj4zg`T;OMgyM|{#;7%
zk11fTZ8JmdHX>wN%c}jklkcgZ>sSM9*HSK@({58yE6>E@=yT%sj$)1o&oFC@?^+v-
zk)`qq0T%BD*u9j+X#E3WYm4OZT@fNc0GolFNYHduh(-p;7;}(!6E;nejj<;3m#+zN
zgaI<f+Bq2&!pxD3u`Y5QVR;OYaqK2H5q22>FWD^e$S8(wR{8REv=&+ArZH?O%8|Rq
z&`rTRj{7Mb8OJjeM#k|fh0r*LDVPJ8Q=B6^16WRBYXDm)^ajvPVJLw6DFg%bUY1HG
zF?glOq^3l<foF;{H5V!EzJXVZGs{Di25$&qVu564a$Ow2G+|pAsG3^=e(5>ABeBBW
zsE5jHnhM^BjX3rREG@~^{DRUo6WCHxpxH&~-~{d|$=95u)b=HQLWIkdE(+opr79J`
zsUXHL%k>medS^0r0xR{6G{0Zk7_Wy4TH+^BC5^;6c~c0w7Xkjq`PFRBuZh$3l@Nn7
zIW4m}y~Ht0&Ell_r5U6=difNQ4o%@3%W^XVl!l0J<~hl#$U}swZee&?c{~ptYJj(I
z)3DOIi_*O5?`z0s|3|_K31ceT^W?_~TQiNXK2@s$4HfF&c;=~mQz4}<-x0zg5v5vh
zraH*a>7hzfK`r92nHG266=E09LB>;an@$ntV}NWCxr2NSR+ldMS@%657-3_!m;4Z6
zg*-hOV{en6x-UfMz=6^+4SrCq9a!qvY{XG%4iE5y7v|znl?MN|Dg&3+Yd!Q=iEa9<
z5MsVpG@PYBZC)WfuY`rv=MmSYytg(sFKj#=;~C&$OaE)~KVmCJU*=zL%sCMEk)Ms_
zXQhzm49^Q8&lLUb{4BJ>8wSvOJznT+5OLkgLf`M4ctJ7^^uDl2Yal8qi5PuSU$4ZW
Hwdwx_YSOlq

delta 2147
zcmai#3rv$&6vxl^(blR^ze-z5skKx@Ktu$AkB3+gv48=#nMIAVqVq9zlO@jJMB16o
z$ifVIbR$M9Gis*OWUiXv%m@;)Y;JBWSqz&~0g;z4Tzues+?`t(K4#)cPR_Z%`~S{8
z-|21J@}1CnT8IR&&j83T{eDFL28zp|B)=5ENo+N-)x=gWbC+z7OccD^7bXfZ$G81D
zW}4TQ3ebDuI%5SuV(yYRJGJ~%&VT#~;OvE#=k{=$6c$bjHz{rwx?KRDT&NvVNd9Wz
zG<xx@6eCD@SDF(v8mv?boQDFUH%Y!cZILL}nsImVB*BYK!O4b;Vc<B<!Uq^@Oz(X%
z*oNdqjV@Vi3Bz37Y0`=M1pQtUIL`QW`)4w?c_u((x(@U7>0-4Bx9Agu(RfNPE0=rZ
zV|{oeBn~4CMzLifW*Q2`<^{OTP$*>JMMIIO2*tRN^~(B19{B;@4oShbkU+6@G2RTB
zqgdjR-|@3DW03GRRvL4aJ4-zBAE*sY*%3NVdFL&UoW~bf6B?*&b$VohR*9<Ls4@Km
zTxD7>?8SD|WN~6QYQnxzfF2w>CYZ4;%pBCL1IG+4IQTF+@nUe8n5Dx=bBTCL!Y$?)
z?a5$pd_i^m`;nF95}q+n73Z(Tzs<SgnqW*DS7~srT`G^GAz7ichq3(`61Zhj{4y?A
zJUSK)miI*WF5F_tAIpWx$E<*h6oaA;ON@9zi<<FGCg<9Na{0q6uDnP8E7dnXSFDjR
z!P>-+zGsc0GY`s-|6v6Jxa1QfmR1x)xLiF3lf##bHJ_t5JevzU!Uu)g7ld^hwAnJT
zWrD77CAB1hFu-TE-~lT*|L}<`o@$N)HlEi#JZf+@#r>{|cv$K`Yu6A<1E*Xyl*iNg
zkAg5EB2CD{cO!hRGHNgoHrx5TmQ&7$vUq^sr1V(MqG(tz)1;CnllA~I{JIx4YO166
z*O}qpm!Z`C63tlo_I*B`N`J?7)Ljl9pNPWyI6Ly?&3%0D)y<WX=2gwj@Yi`obD<=9
zwz)M}HR-+(jUPoN3e)jqRHE3p*Vh-dKEUN3{D67}So^g@0qTm@e*Wa(x0kUc8Cz)|
zyteIUi1WEBX+AYWH|Z|1L*j=Oob&nD5^wjjjl-;t*g0Yodsx8-G%`DlitG6WqgiYD
z?HE4%00}WZ#3~#{?Ue37D{1)`L-a1`*<rA1cy1p0xlw!Z^*Gna;g#>txHp6#Z-)%l
z{8B6AZy-TheEMv!t@o$`DvlJvCN<?^lX`;!qEnw`0boxJ9A<XEPI_*Q-^Lf|{YOQ_
z0z_8<R88g0sJ%YRv=YH};4)*K1h`4$B!YEqj1_Z0#+jY`YhwKrxp6j+yt<vSFF7FN
zY$dt1gE1>bZk$z+#}bR+fQ-i)@(f~s0pP=gaIwDwEs1gBqbpcQVO=LyCR)Tjo#>@d
zcNH5b{BRX7QRu&lPbe6>&@$a3+Pg55!pbfzp-|I>l@#i`&`Y7M3mYh?yU~(lQq_v|
z%H5cmWXi0kw7#1$Gw+7XO8zskMq*_g<c!RMeZnk_U7J-|tby#ysx01gKWPGc@UtYF
z${0ZJ)PoI4F)9b8TYK<kQlx4frO=C}WSeRar31Z~O=&Zw)qS`s*;aU+(&lSJ<DR1_
zruzGYWur8ZMMsPt>_k!kc<A5lI`8B^&5EZQ-pO>*%)NoO6kF&bO6zVsop=symk~RP
zi&JK)>M6^(g(WGO3H+k~CB13?(>Lc#N9=cE{6TJW$)kuVZlgiYSCv!hyo1Z-Nb><o
zb$96~fh4mvR_cDE269wcv_xluIU4RUwv`s#cxnGlBIfZEDofE$-ibHl$y1E?8RLW-
zSG&n$iN*5rWSs3J&!Iosa^GvI=T$yu>Io6|&5!mK+Z}qJ`^_k|@m9mJOS@M@1oI~t
K*}1QH2JkPg+}GOx

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 440cd77..04c6c08 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -12,3 +12,5 @@ test 11:PASS
 test 12:PASS
 test 13:PASS
 test 14:PASS
+test 15:PASS
+test 16:PASS

From 9cce9362519b69106936ce437bd1dfe4f27dee4b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 28 Jul 2020 16:07:25 +1000
Subject: [PATCH 18/30] FPU: Implement fdiv[s]

This implements floating-point division A/B by a process that starts
with normalizing both inputs if necessary.  Then an estimate of 1/B
from a lookup table is refined by 3 Newton-Raphson iterations and then
multiplied by A to get a quotient.  The remainder is calculated as
A - R * B (where R is the result, i.e. the quotient) and the remainder
is compared to 0 and to B to see whether the quotient needs to be
incremented by 1.  The calculations of 1 / B are done with 56 fraction
bits and intermediate results are truncated rather than rounded,
meaning that the final estimate of 1 / B is always correct or a little
bit low, never too high, and thus the calculated quotient is correct
or 1 unit too low.  Doing the estimate of 1 / B with sufficient
precision that the quotient is always correct to the last bit without
needing any adjustment would require many more bits of precision.

This implements fdivs by computing a double-precision quotient and
then rounding it to single precision.  It would be possible to
optimize this by e.g. doing only 2 iterations of Newton-Raphson and
then doing the remainder calculation and adjustment at single
precision rather than double precision.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   2 +
 fpu.vhdl                   | 292 +++++++++++++++++++++++++++++++++++--
 tests/fpu/fpu.c            |  39 +++++
 tests/test_fpu.bin         | Bin 24272 -> 24416 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 323 insertions(+), 11 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 721c478..ddcbb3c 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -416,6 +416,7 @@ architecture behaviour of decode1 is
         --             unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
         --                          op                               in   out   A   out  in    out  len        ext                                pipe
         2#01110#  =>  (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fcfid[u]s
+        2#10010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fdivs
         2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
         2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
         2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
@@ -469,6 +470,7 @@ architecture behaviour of decode1 is
     constant decode_op_63h_array : op_63_subop_array_1_t := (
         --            unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
         --                         op                               in   out   A   out  in    out  len        ext                                pipe
+        2#0010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fdiv
         2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
         2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
         2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
diff --git a/fpu.vhdl b/fpu.vhdl
index 209daa0..2584e1c 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -40,10 +40,12 @@ architecture behaviour of fpu is
                      DO_FMR, DO_FMRG,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
-                     DO_FADD, DO_FMUL,
+                     DO_FADD, DO_FMUL, DO_FDIV,
                      FRI_1,
                      ADD_SHIFT, ADD_2, ADD_3,
                      MULT_1,
+                     LOOKUP,
+                     DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                      INT_SHIFT, INT_ROUND, INT_ISHIFT,
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
@@ -51,6 +53,7 @@ architecture behaviour of fpu is
                      ROUNDING, ROUNDING_2, ROUNDING_3,
                      DENORM,
                      RENORM_A, RENORM_A2,
+                     RENORM_B, RENORM_B2,
                      RENORM_C, RENORM_C2);
 
     type reg_type is record
@@ -72,6 +75,7 @@ architecture behaviour of fpu is
         r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
         x            : std_ulogic;
         p            : std_ulogic_vector(63 downto 0);  -- 8.56 format
+        y            : std_ulogic_vector(63 downto 0);  -- 8.56 format
         result_sign  : std_ulogic;
         result_class : fp_number_class;
         result_exp   : signed(EXP_BITS-1 downto 0);
@@ -91,8 +95,11 @@ architecture behaviour of fpu is
         add_bsmall   : std_ulogic;
         is_multiply  : std_ulogic;
         first        : std_ulogic;
+        count        : unsigned(1 downto 0);
     end record;
 
+    type lookup_table is array(0 to 255) of std_ulogic_vector(17 downto 0);
+
     signal r, rin : reg_type;
 
     signal fp_result     : std_ulogic_vector(63 downto 0);
@@ -114,7 +121,9 @@ architecture behaviour of fpu is
     signal multiply_to_f : MultiplyOutputType;
     signal msel_1        : std_ulogic_vector(1 downto 0);
     signal msel_2        : std_ulogic_vector(1 downto 0);
+    signal msel_add      : std_ulogic_vector(1 downto 0);
     signal msel_inv      : std_ulogic;
+    signal inverse_est   : std_ulogic_vector(18 downto 0);
 
     -- opsel values
     constant AIN_R    : std_ulogic_vector(1 downto 0) := "00";
@@ -134,11 +143,61 @@ architecture behaviour of fpu is
     -- msel values
     constant MUL1_A : std_ulogic_vector(1 downto 0) := "00";
     constant MUL1_B : std_ulogic_vector(1 downto 0) := "01";
+    constant MUL1_Y : std_ulogic_vector(1 downto 0) := "10";
     constant MUL1_R : std_ulogic_vector(1 downto 0) := "11";
 
     constant MUL2_C   : std_ulogic_vector(1 downto 0) := "00";
+    constant MUL2_LUT : std_ulogic_vector(1 downto 0) := "01";
+    constant MUL2_P   : std_ulogic_vector(1 downto 0) := "10";
     constant MUL2_R   : std_ulogic_vector(1 downto 0) := "11";
 
+    constant MULADD_ZERO : std_ulogic_vector(1 downto 0) := "00";
+    constant MULADD_CONST : std_ulogic_vector(1 downto 0) := "01";
+    constant MULADD_A     : std_ulogic_vector(1 downto 0) := "10";
+
+    -- Inverse lookup table, indexed by the top 8 fraction bits
+    -- Output range is [0.5, 1) in 0.19 format, though the top
+    -- bit isn't stored since it is always 1.
+    -- Each output value is the inverse of the center of the input
+    -- range for the value, i.e. entry 0 is 1 / (1 + 1/512),
+    -- entry 1 is 1 / (1 + 3/512), etc.
+    signal inverse_table : lookup_table := (
+        -- 1/x lookup table
+        -- Unit bit is assumed to be 1, so input range is [1, 2)
+        18x"3fc01", 18x"3f411", 18x"3ec31", 18x"3e460", 18x"3dc9f", 18x"3d4ec", 18x"3cd49", 18x"3c5b5",
+        18x"3be2f", 18x"3b6b8", 18x"3af4f", 18x"3a7f4", 18x"3a0a7", 18x"39968", 18x"39237", 18x"38b14",
+        18x"383fe", 18x"37cf5", 18x"375f9", 18x"36f0a", 18x"36828", 18x"36153", 18x"35a8a", 18x"353ce",
+        18x"34d1e", 18x"3467a", 18x"33fe3", 18x"33957", 18x"332d7", 18x"32c62", 18x"325f9", 18x"31f9c",
+        18x"3194a", 18x"31303", 18x"30cc7", 18x"30696", 18x"30070", 18x"2fa54", 18x"2f443", 18x"2ee3d",
+        18x"2e841", 18x"2e250", 18x"2dc68", 18x"2d68b", 18x"2d0b8", 18x"2caee", 18x"2c52e", 18x"2bf79",
+        18x"2b9cc", 18x"2b429", 18x"2ae90", 18x"2a900", 18x"2a379", 18x"29dfb", 18x"29887", 18x"2931b",
+        18x"28db8", 18x"2885e", 18x"2830d", 18x"27dc4", 18x"27884", 18x"2734d", 18x"26e1d", 18x"268f6",
+        18x"263d8", 18x"25ec1", 18x"259b3", 18x"254ac", 18x"24fad", 18x"24ab7", 18x"245c8", 18x"240e1",
+        18x"23c01", 18x"23729", 18x"23259", 18x"22d90", 18x"228ce", 18x"22413", 18x"21f60", 18x"21ab4",
+        18x"2160f", 18x"21172", 18x"20cdb", 18x"2084b", 18x"203c2", 18x"1ff40", 18x"1fac4", 18x"1f64f",
+        18x"1f1e1", 18x"1ed79", 18x"1e918", 18x"1e4be", 18x"1e069", 18x"1dc1b", 18x"1d7d4", 18x"1d392",
+        18x"1cf57", 18x"1cb22", 18x"1c6f3", 18x"1c2ca", 18x"1bea7", 18x"1ba8a", 18x"1b672", 18x"1b261",
+        18x"1ae55", 18x"1aa50", 18x"1a64f", 18x"1a255", 18x"19e60", 18x"19a70", 18x"19686", 18x"192a2",
+        18x"18ec3", 18x"18ae9", 18x"18715", 18x"18345", 18x"17f7c", 18x"17bb7", 18x"177f7", 18x"1743d",
+        18x"17087", 18x"16cd7", 18x"1692c", 18x"16585", 18x"161e4", 18x"15e47", 18x"15ab0", 18x"1571d",
+        18x"1538e", 18x"15005", 18x"14c80", 18x"14900", 18x"14584", 18x"1420d", 18x"13e9b", 18x"13b2d",
+        18x"137c3", 18x"1345e", 18x"130fe", 18x"12da2", 18x"12a4a", 18x"126f6", 18x"123a7", 18x"1205c",
+        18x"11d15", 18x"119d2", 18x"11694", 18x"11359", 18x"11023", 18x"10cf1", 18x"109c2", 18x"10698",
+        18x"10372", 18x"10050", 18x"0fd31", 18x"0fa17", 18x"0f700", 18x"0f3ed", 18x"0f0de", 18x"0edd3",
+        18x"0eacb", 18x"0e7c7", 18x"0e4c7", 18x"0e1ca", 18x"0ded2", 18x"0dbdc", 18x"0d8eb", 18x"0d5fc",
+        18x"0d312", 18x"0d02b", 18x"0cd47", 18x"0ca67", 18x"0c78a", 18x"0c4b1", 18x"0c1db", 18x"0bf09",
+        18x"0bc3a", 18x"0b96e", 18x"0b6a5", 18x"0b3e0", 18x"0b11e", 18x"0ae5f", 18x"0aba3", 18x"0a8eb",
+        18x"0a636", 18x"0a383", 18x"0a0d4", 18x"09e28", 18x"09b80", 18x"098da", 18x"09637", 18x"09397",
+        18x"090fb", 18x"08e61", 18x"08bca", 18x"08936", 18x"086a5", 18x"08417", 18x"0818c", 18x"07f04",
+        18x"07c7e", 18x"079fc", 18x"0777c", 18x"074ff", 18x"07284", 18x"0700d", 18x"06d98", 18x"06b26",
+        18x"068b6", 18x"0664a", 18x"063e0", 18x"06178", 18x"05f13", 18x"05cb1", 18x"05a52", 18x"057f5",
+        18x"0559a", 18x"05342", 18x"050ed", 18x"04e9a", 18x"04c4a", 18x"049fc", 18x"047b0", 18x"04567",
+        18x"04321", 18x"040dd", 18x"03e9b", 18x"03c5c", 18x"03a1f", 18x"037e4", 18x"035ac", 18x"03376",
+        18x"03142", 18x"02f11", 18x"02ce2", 18x"02ab5", 18x"0288b", 18x"02663", 18x"0243d", 18x"02219",
+        18x"01ff7", 18x"01dd8", 18x"01bbb", 18x"019a0", 18x"01787", 18x"01570", 18x"0135b", 18x"01149",
+        18x"00f39", 18x"00d2a", 18x"00b1e", 18x"00914", 18x"0070c", 18x"00506", 18x"00302", 18x"00100"
+        );
+
     -- Left and right shifter with 120 bit input and 64 bit output.
     -- Shifts inp left by shift bits and returns the upper 64 bits of
     -- the result.  The shift parameter is interpreted as a signed
@@ -359,6 +418,14 @@ begin
         end if;
     end process;
 
+    -- synchronous reads from lookup table
+    lut_access: process(clk)
+    begin
+        if rising_edge(clk) then
+            inverse_est <= '1' & inverse_table(to_integer(unsigned(r.b.mantissa(53 downto 46))));
+        end if;
+    end process;
+
     e_out.busy <= r.busy;
     e_out.exception <= r.fpscr(FPSCR_FEX);
     e_out.interrupt <= r.do_intr;
@@ -391,6 +458,7 @@ begin
         variable update_fx   : std_ulogic;
         variable arith_done  : std_ulogic;
         variable invalid     : std_ulogic;
+        variable zero_divide : std_ulogic;
         variable mant_nz     : std_ulogic;
         variable min_exp     : signed(EXP_BITS-1 downto 0);
         variable max_exp     : signed(EXP_BITS-1 downto 0);
@@ -408,9 +476,14 @@ begin
         variable qnan_result : std_ulogic;
         variable longmask    : std_ulogic;
         variable set_a       : std_ulogic;
+        variable set_b       : std_ulogic;
         variable set_c       : std_ulogic;
         variable px_nz       : std_ulogic;
         variable maddend     : std_ulogic_vector(127 downto 0);
+        variable set_y       : std_ulogic;
+        variable pcmpb_eq    : std_ulogic;
+        variable pcmpb_lt    : std_ulogic;
+        variable pshift      : std_ulogic;
     begin
         v := r;
         illegal := '0';
@@ -478,8 +551,16 @@ begin
             exp_huge := '1';
         end if;
 
-        -- Compare P with zero
+        -- Compare P with zero and with B
         px_nz := or (r.p(57 downto 4));
+        pcmpb_eq := '0';
+        if r.p(59 downto 4) = r.b.mantissa(55 downto 0) then
+            pcmpb_eq := '1';
+        end if;
+        pcmpb_lt := '0';
+        if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(55 downto 0)) then
+            pcmpb_lt := '1';
+        end if;
 
         v.writing_back := '0';
         v.instr_done := '0';
@@ -498,18 +579,22 @@ begin
         update_fx := '0';
         arith_done := '0';
         invalid := '0';
+        zero_divide := '0';
         renormalize := '0';
         set_x := '0';
         qnan_result := '0';
         longmask := r.single_prec;
         set_a := '0';
+        set_b := '0';
         set_c := '0';
         f_to_multiply.is_32bit <= '0';
         f_to_multiply.valid <= '0';
         msel_1 <= MUL1_A;
         msel_2 <= MUL2_C;
+        msel_add <= MULADD_ZERO;
         msel_inv <= '0';
-
+        set_y := '0';
+        pshift := '0';
         case r.state is
             when IDLE =>
                 if e_in.valid = '1' then
@@ -550,6 +635,8 @@ begin
                         when "01111" =>
                             v.round_mode := "001";
                             v.state := DO_FCTI;
+                        when "10010" =>
+                            v.state := DO_FDIV;
                         when "10100" | "10101" =>
                             v.state := DO_FADD;
                         when "11001" =>
@@ -897,6 +984,63 @@ begin
                     arith_done := '1';
                 end if;
 
+            when DO_FDIV =>
+                opsel_a <= AIN_A;
+                -- Default result: class of A, sign(A) xor sign(B), exponent exp(A) - exp(B);
+                -- the non-finite / zero operand branches below override class and sign as needed.
+                v.result_class := r.a.class;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                v.result_sign := r.a.negative xor r.b.negative;
+                v.result_exp := r.a.exponent - r.b.exponent;
+                v.count := "00";
+                if r.a.class = FINITE and r.b.class = FINITE then
+                    -- Renormalize denorm operands
+                    if r.a.mantissa(54) = '0' then
+                        v.state := RENORM_A;
+                    elsif r.b.mantissa(54) = '0' then
+                        opsel_a <= AIN_B;
+                        v.state := RENORM_B;
+                    else
+                        v.first := '1';
+                        v.state := DIV_2;
+                    end if;
+                else
+                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
+                        (r.b.class = NAN and r.b.mantissa(53) = '0') then
+                        -- Signalling NAN
+                        v.fpscr(FPSCR_VXSNAN) := '1';
+                        invalid := '1';
+                    end if;
+                    if r.a.class = NAN then
+                        -- result is A
+                        v.result_sign := r.a.negative;
+                    elsif r.b.class = NAN then
+                        v.result_class := NAN;
+                        v.result_sign := r.b.negative;
+                        opsel_a <= AIN_B;
+                    elsif r.b.class = INFINITY then
+                        if r.a.class = INFINITY then
+                            v.fpscr(FPSCR_VXIDI) := '1';
+                            qnan_result := '1';
+                        else
+                            v.result_class := ZERO;
+                        end if;
+                    elsif r.b.class = ZERO then
+                        if r.a.class = ZERO then
+                            v.fpscr(FPSCR_VXZDZ) := '1';
+                            qnan_result := '1';
+                        else
+                            if r.a.class = FINITE then
+                                zero_divide := '1';
+                            end if;
+                            v.result_class := INFINITY;
+                        end if;
+                    -- else r.b.class = FINITE, result_class = r.a.class
+                    end if;
+                    arith_done := '1';
+                end if;
+
             when RENORM_A =>
                 renormalize := '1';
                 v.state := RENORM_A2;
@@ -904,14 +1048,33 @@ begin
             when RENORM_A2 =>
                 set_a := '1';
                 v.result_exp := new_exp;
-                opsel_a <= AIN_C;
-                if r.c.mantissa(54) = '1' then
-                    v.first := '1';
-                    v.state := MULT_1;
+                if r.insn(4) = '1' then
+                    opsel_a <= AIN_C;
+                    if r.c.mantissa(54) = '1' then
+                        v.first := '1';
+                        v.state := MULT_1;
+                    else
+                        v.state := RENORM_C;
+                    end if;
                 else
-                    v.state := RENORM_C;
+                        opsel_a <= AIN_B;
+                        if r.b.mantissa(54) = '1' then
+                            v.first := '1';
+                            v.state := DIV_2;
+                        else
+                            v.state := RENORM_B;
+                    end if;
                 end if;
 
+            when RENORM_B =>
+                renormalize := '1';
+                v.state := RENORM_B2;
+
+            when RENORM_B2 =>
+                set_b := '1';
+                v.result_exp := r.result_exp + r.shift;
+                v.state := LOOKUP;
+
             when RENORM_C =>
                 renormalize := '1';
                 v.state := RENORM_C2;
@@ -982,6 +1145,82 @@ begin
                     v.state := FINISH;
                 end if;
 
+            when LOOKUP =>
+                opsel_a <= AIN_B;
+                -- wait one cycle for inverse_table[B] lookup
+                v.first := '1';
+                v.state := DIV_2;
+
+            when DIV_2 =>
+                -- compute Y = inverse_table[B] (when count=0); P = 2 - B * Y
+                msel_1 <= MUL1_B;
+                msel_add <= MULADD_CONST;
+                msel_inv <= '1';
+                if r.count = 0 then
+                    msel_2 <= MUL2_LUT;
+                else
+                    msel_2 <= MUL2_P;
+                end if;
+                set_y := r.first;
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.count := r.count + 1;
+                    v.state := DIV_3;
+                end if;
+
+            when DIV_3 =>
+                -- compute Y = P = P * Y
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    if r.count = 3 then
+                        v.state := DIV_4;
+                    else
+                        v.state := DIV_2;
+                    end if;
+                end if;
+
+            when DIV_4 =>
+                -- compute R = P = A * Y (quotient)
+                msel_1 <= MUL1_A;
+                msel_2 <= MUL2_P;
+                set_y := r.first;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    opsel_r <= RES_MULT;
+                    v.first := '1';
+                    v.state := DIV_5;
+                end if;
+
+            when DIV_5 =>
+                -- compute P = A - B * R (remainder)
+                msel_1 <= MUL1_B;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.state := DIV_6;
+                end if;
+
+            when DIV_6 =>
+                -- test if remainder is 0 or >= B
+                if pcmpb_lt = '1' then
+                    -- quotient is correct, set X if remainder non-zero
+                    v.x := r.p(58) or px_nz;
+                else
+                    -- quotient needs to be incremented by 1
+                    carry_in <= '1';
+                    v.x := not pcmpb_eq;
+                end if;
+                v.state := FINISH;
+
             when INT_SHIFT =>
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
@@ -1218,6 +1457,9 @@ begin
 
         end case;
 
+        if zero_divide = '1' then
+            v.fpscr(FPSCR_ZX) := '1';
+        end if;
         if qnan_result = '1' then
             invalid := '1';
             v.result_class := NAN;
@@ -1227,7 +1469,9 @@ begin
         end if;
         if arith_done = '1' then
             -- Enabled invalid exception doesn't write result or FPRF
-            if (invalid and r.fpscr(FPSCR_VE)) = '0' then
+            -- Neither does enabled zero-divide exception
+            if (invalid and r.fpscr(FPSCR_VE)) = '0' and
+                (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then
                 v.writing_back := '1';
                 v.update_fprf := '1';
             end if;
@@ -1236,30 +1480,52 @@ begin
             update_fx := '1';
         end if;
 
-        -- Multiplier data path
+        -- Multiplier and divide/square root data path
         case msel_1 is
             when MUL1_A =>
                 f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00";
             when MUL1_B =>
                 f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00";
+            when MUL1_Y =>
+                f_to_multiply.data1 <= r.y;
             when others =>
                 f_to_multiply.data1 <= r.r(61 downto 0) & "00";
         end case;
         case msel_2 is
             when MUL2_C =>
                 f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00";
+            when MUL2_LUT =>
+                f_to_multiply.data2 <= x"00" & inverse_est & '0' & x"000000000";
+            when MUL2_P =>
+                f_to_multiply.data2 <= r.p;
             when others =>
                 f_to_multiply.data2 <= r.r(61 downto 0) & "00";
         end case;
         maddend := (others => '0');
+        case msel_add is
+            when MULADD_CONST =>
+                -- addend is 2.0 in 16.112 format
+                maddend(113) := '1';                -- 2.0
+            when MULADD_A =>
+                -- addend is A in 16.112 format
+                maddend(121 downto 58) := r.a.mantissa;
+            when others =>
+        end case;
         if msel_inv = '1' then
             f_to_multiply.addend <= not maddend;
         else
             f_to_multiply.addend <= maddend;
         end if;
         f_to_multiply.not_result <= msel_inv;
+        if set_y = '1' then
+            v.y := f_to_multiply.data2;
+        end if;
         if multiply_to_f.valid = '1' then
-            v.p := multiply_to_f.result(63 downto 0);
+            if pshift = '0' then
+                v.p := multiply_to_f.result(63 downto 0);
+            else
+                v.p := multiply_to_f.result(119 downto 56);
+            end if;
         end if;
 
         -- Data path.
@@ -1378,6 +1644,10 @@ begin
             v.a.exponent := new_exp;
             v.a.mantissa := shift_res;
         end if;
+        if set_b = '1' then
+            v.b.exponent := new_exp;
+            v.b.mantissa := shift_res;
+        end if;
         if set_c = '1' then
             v.c.exponent := new_exp;
             v.c.mantissa := shift_res;
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 305359a..cbb0ee2 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1007,6 +1007,7 @@ struct mulvals {
 	{ 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 },
 	{ 0xbf4fff801fffffff, 0x6d7fffff8000007f, 0xecdfff7fa001fffe },
 	{ 0x3fbd50275a65ed80, 0x0010000000000000, 0x0001d50275a65ed8 },
+	{ 0x3fe95d8937acf1ce, 0x0000000000000001, 0x0000000000000001 },
 };
 
 int test15(long arg)
@@ -1073,6 +1074,43 @@ int fpu_test_16(void)
 	return trapit(0, test16);
 }
 
+struct divvals {
+	unsigned long val_a;	/* dividend, as a raw IEEE-754 double bit pattern */
+	unsigned long val_b;	/* divisor; must directly follow val_a (loaded from offset 8) */
+	unsigned long quot;	/* expected bit pattern of val_a / val_b */
+} divvals[] = {
+	{ 0x3ff0000000000000, 0x0000000000000000, 0x7ff0000000000000 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },
+	{ 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 },
+	{ 0x4000000000000000, 0x4008000000000000, 0x3fe5555555555555 },
+	{ 0xc01fff0007ffffff, 0xc03ffffffdffffbf, 0x3fcfff0009fff041 },
+};
+
+int test17(long arg)
+{
+	long i;
+	unsigned long result;
+	struct divvals *vp = divvals;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(divvals) / sizeof(divvals[0]); ++i, ++vp) {
+		asm("lfd 5,0(%0); lfd 6,8(%0); fdiv 7,5,6; stfd 7,0(%1)"
+		    : : "b" (&vp->val_a), "b" (&result) : "memory");
+		if (result != vp->quot) {	/* compare raw bits against the expected quotient */
+			print_hex(i, 2, " ");
+			print_hex(result, 16, " ");
+			return i + 1;	/* 1-based index of the failing vector */
+		}
+	}
+	return 0;
+}
+
+int fpu_test_17(void)
+{
+	enable_fp();
+	return trapit(0, test17);
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1114,6 +1152,7 @@ int main(void)
 	do_test(14, fpu_test_14);
 	do_test(15, fpu_test_15);
 	do_test(16, fpu_test_16);
+	do_test(17, fpu_test_17);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 1e0e29e0c174fe7fcc993071b35fef6086d1a0c0..dc5af293a27cd0d08eda8272695afbe7e9ba89c5 100755
GIT binary patch
delta 2394
zcmai0drVVj6hHUUmSS3Mh4QGSFAxE-77@6tC>0swp~xH>r&FuwoO{I?XK+Jcrkb)u
z10J{F!a&Vt*s>*<?&abbd$`D^jwSvf@r67l;A5~Uc250vzFVTsnRt?ubI$Mkedjy(
zeCOV_^BbYNP0$0lX8{zg`YE(%3)!_b_4;rTJoO1PMA%oqG)<UxXxG;h3Ws;vvj7Hv
z0q{sxs3`$*>Y8Qj%oWCpc^t%Yvx3|X;{JT5^P#l>*6X5db<Me10m^u-q^26+?K2Hy
ztYAj)G+>8rs_1LNQQbl^{vv&eC?2!md*KOTd@ny~7(2qJ8?+WEIK*rL#@bzjcY}4*
zp*A8@3$y}|GZ^GP9Wgg7u=9XRLG|Cl_an-M1{4fwh7+++@SVK>Kn`Quasb+0SFq6F
z5|6~<4#P{z^<L@IA^bSXf$@>iLJQ7{Tp%>!&d3E~#4<b|St8Cfp(ARuvTnIo`Wr{0
zGO;^KBedexr~<`Wue4r<HbiTMbyyXhuY7r>SL#B2Oy=&GGUaiPS1RNI*2VBW@5el)
zY+UV?4r8h@4Z95rv>G)+8BQ}&R)a#nJ-~{o$ZUQInl)Z2iFld9EWBu(jJu2r#iKXz
zMPq!ZrxL=-J@ZjJ$*))v1qHj~OxQiiq&1QMPB`xhc>3^M_#|;}Bqo?vi288cVM?B;
zjf8^z)C+k%p{j!V|Gmi}`ZnU9rhM^713nR(Eb4aS^RWvf>o%54vD9xXtQuhKK(-EQ
zBqE-S%@-90j5NP01aP~#cp~R1@9?q@WGjN49&@s&iNNs|ny_7%YuRP2+xVeW@lS7c
z;qCt=9J1sK1DI<4ftN9AO*U}uom;F}fNJTp(DKSsut|NGZd)g+_hOSRm-BjTL7x60
zUI1<OW!g}1%L8;Uq;P8hYwc-#OPf7ebn0-(zF63YbK+!db6j#*#RMo2fVng{VxMbQ
zKCrbyej~bFRuCxNN5|CAaV*}6O?I=xH}*VE*dG|Y8GI)42Y8dZD1NZ^_VIM613Tj)
zv<=iN9SDKn=nhhGBrbef*SO$ql@3=|I-y*C;G<*M6Ep;e$9yL>Aq7+8UBVW8CEj1X
zj|$MhHaG8j3;F!u^ilqHxKcQlNVoPVj-EKPX$>G-<~__&Q=Y3gc%E7Ex&*6g9>JL?
zbr4B_D!Jf&RNN*!G#Q2c_+-MPi(7vE@x^5k=W)du<UEflZsf237T1WiQ*H{+;#-Mn
z!W#TMF-<IJ_75d)4ykSp-pD|JUA%lSL|xf>jqkL|<HOkM?5}9OrrBP<mvG)!CH1F$
z?D9_#c2InylH<Jp#|S?xV{hKWItlxbu*v<b_;nf#H?@lM`H)ku^?V%e9bQR9bVgZ)
zF28<Ss1U=y1bP~RvZY>0kY7cc#9!TQXmd;nYm`6E;PK(pj_Ui7zsvEAplxjKk-%EC
z$_hnWe3<2QX?@N1!A}%W`SAkS>ch93;gdNMLi4VHq^#hIdn?Cu55{=dxxf%SDn11u
zsTQDiIxoWKk4;@ERBt-VSRch)bfm<AymO3|a)5+mZqgdUhRJf_SQ+Va=NW6^013xl
zARQ*mN|p=9DoJk>7S90^ZtF-j7Z^JWfd9;}iR~BAoaPXZcVh{eyo*?sW)@$*h)rZR
z^k5sAh8{dmro9L6k{Reh^Gvg->&4k*q+VP>#?y;cWPH8YM5eVD+sK^m#q(sK5ATvm
z?4x~?lp;VWdFe(~fZUcooSklri4LKv`WZ8k;bJ9CAxw?0rCU^`<dyZ~yXkgS1-YC0
zu`NAW)k1D(KVC~uP@N*T;CD1;*j2a5bzj0<a^n>Mr!L`(8FuA7ay3|=VNq=$@Ax2o
zkzrQtA$QLZ-pw$M<L7-vVdCX`;aLig@X)LTCDn|!%s68rc^}XZI_GoABc-v1)+@Lo
zGgsv$*L{uL*{Qt!$0_Xm<Ni}|teddYgz=qR=8;|}OpOMqSed5+Sc7Y&DTWQ?MPH{;
z5Qy;Pc&mO+mG~6ym*SFU-eAnfx03LTd}vw-^KyVBk<?8ZjaQ|qN%KY-;|Ledt|9dh
zmcsLraBL50g?~=gZY4G^N%EJv9a<cnugB?mYW#9h5;{v%$#P2KsojfY>8Jk#EEJff
v{aeQFFqYFkXPAWn`!&a3F~32rJvs78X6^F99Qhd3=NqtFQ2H&452^nJR9`?N

delta 2173
zcmah~e@s(X6u$4#*5X=hf%2o2(w2fyp@1;@pr}=pwXgv?wrmC!#BG>4BFo(BR%kWV
zfEk2(wuBfRV$`T{22xFMQG;1x!Zr>5w?*er3InDL62<CtcjvZ7jWh8ihjY$%zkA;I
z&O5-!1#b8}rzFI-gpl(3OY_RNWBh>llrg+#gD-tC@9FWa&F8ZEJAYW<aCR7q2$}qu
zkUBv}Je33!MNbI3cAL@)R)?__MPcp`a(}uu{KA<Be2RFzqGv^s2RlwzdfbHUxz_!J
z74Ast5@1NNhz}lsImH?b{&Hm{&ktx}Urc%=yUU*BgrS%m)#GGh@2C18rGwVVf5SQy
zAdj`kXYhoOWs_m<>e$jqPsbsv2=(uT1F;QU0&uE))p-rEUku$pw2ac7%Lo~?9tMZX
z%AeH0Zq>))Mz8Sw6r6}RgE20FtAizRRa^~p#8vUXtc4qKmAo+t%<-R#?d!b41DK7s
zz(~A|YlJ)TcF`uUusH;6O^|aRLTf^q*to$f{07QI%ihEqvA51EI9LH)iR{jS#5cq>
z8@<9=$WvQDr<QRt$W~j#S5e?D=vRNi$?<0zR3=HdZ{eLJzo;vY*r6^(0~s2Pqy~%2
z(NL~2@K;sPq*>29qT!0h6unAC?7i57jUtfV>cqj$Pd4+;W++ZB6FrK>9wV9r*q^N6
zyM1sfc}<+Bxj{(Aaq39@1f_>c6vQJSJx*K3m&C$)Z4;`!tgVP<Jn>^%!8oEY$CP4<
zTp2^`9HCY(^lDSUl_KNYI$%#qr`pqeL~#By>UKQ(U+=}bGEN8Ebw4s)OZ6rd<34^!
zD>&j7PS0y_RTI6SffoHoyt5m|^(z?9WH6!DBZBe;UakUK^=j}MY;p;)h4$rWgrmg+
zR}J~>o+Q;2*|;DqG67~&OTm>Y;}p=I8alp^YU2B%U@FxN!^T*78@{E2a6$GRCSleX
zlRY$_Ef=Y)o^iW!Nkd5AnI~8=uGsM>d2UE^UYeEL44<U=-Cd|vMt0iRyt*-GbG<Xi
z*4CQEune5{%Q!>GS&C<bl!SOMa-`VjNT|<}(78bBJul%z3p<d+k=BslgD7qj^rr*o
zgH;P(E>3mt)#7IFzFOR6<Ph=-amSJLytuESJ>x#-fE}6n_&awdGoK&$&Oep;d4&65
z_=`*g>CL~6Mo3+af3usq(1)gUW63c*Cp`w=YQ))G#W<fnoHyIOU5Fj!zjQI2&3^##
z<`DMPQ*02ib%>>n(~52PX@w0(#rSO3aMnt;4o@FmfJDfR*KtFBrFk9~sY5bb4j%Gy
z7Dj49e<ews6zX%_&no|9&fA<^({oE8y7GD*Dc>#yN1jzK?J-QA6cN{nD$*haUtUZa
zLn3gAWhApGd}3>1nBw^whVrUZ;YS&>2+3?Gq&<g;kovP1Z{XbC-zg1X%}jxg98z<G
z(rN~vG0cYEh1d*6W(=!A-|`2g#~6Udutt`QP^x1QjbScy8Dd5Tpt0D6u0iZNA@Ev(
zo*y0obAg$^eiPPXsJR851zO&93r=D1jKUQR$3|fq!|*6Xy{_ehqhQ9M3qU0XdjOg+
zYz;sshVB5I!Y~+sD;OpMFpWVm22q7tUKoSQLbc>q9yN?XQ=vLBi0Q-_r5dIUO@}Uv
zAY=x53sWSyn3asfr9y+`JxpuIVY<*HX~T429JH2n$!Sb;|AcajL2@0_j0yM@Q;~>}
za}%)7Vi0FxdKWHQQY6)wot}bmi&p#@rY=}5B+X}!>&8lP8_a?_(un0bEDysrAypl~
zEQtR@8Hw3S^dMrQJMd6g3NwO8Y!egm4LlG^v)eE;&OCb&Hv9dEWg*5cGJ6xf53yR<
zTvQ<@5_}CfTeMJ<h1r(7l-f8Xtcubpm$&kzQDmiLIewt6`I=1UJxbeH2Q=20Ey-=f
zybM6gK)0dS!t&xps;)Uo8N!UQJ?MvFM{$h!+<i)&{@&u<Vt-$8EeAR8XZovcX1Rav
Nt-VtJ!FLCx{{h#p-|+wd

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 04c6c08..a8e2dcb 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -14,3 +14,4 @@ test 13:PASS
 test 14:PASS
 test 15:PASS
 test 16:PASS
+test 17:PASS

From 4ad5ab92038412d46ef0dc2477e079219b8d7ced Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 29 Jul 2020 17:34:03 +1000
Subject: [PATCH 19/30] FPU: Implement fre[s]

This just returns the value from the inverse lookup table.  The result
is accurate to better than one part in 512 (the architecture requires
1/256).

This also adds a simple test, which relies on the particular values in
the inverse lookup table, so it is not a general test.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   2 ++
 fpu.vhdl                   |  48 ++++++++++++++++++++++++++++++++++++-
 tests/fpu/fpu.c            |  38 +++++++++++++++++++++++++++++
 tests/test_fpu.bin         | Bin 24416 -> 24512 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index ddcbb3c..c0c3465 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -419,6 +419,7 @@ architecture behaviour of decode1 is
         2#10010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fdivs
         2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
         2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
+        2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
         2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
         others => illegal_inst
         );
@@ -473,6 +474,7 @@ architecture behaviour of decode1 is
         2#0010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fdiv
         2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
         2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
+        2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
         2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
         others => illegal_inst
         );
diff --git a/fpu.vhdl b/fpu.vhdl
index 2584e1c..fee1776 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -41,11 +41,13 @@ architecture behaviour of fpu is
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
                      DO_FADD, DO_FMUL, DO_FDIV,
+                     DO_FRE,
                      FRI_1,
                      ADD_SHIFT, ADD_2, ADD_3,
                      MULT_1,
                      LOOKUP,
                      DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
+                     FRE_1,
                      INT_SHIFT, INT_ROUND, INT_ISHIFT,
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
@@ -639,6 +641,8 @@ begin
                             v.state := DO_FDIV;
                         when "10100" | "10101" =>
                             v.state := DO_FADD;
+                        when "11000" =>
+                            v.state := DO_FRE;
                         when "11001" =>
                             v.is_multiply := '1';
                             v.state := DO_FMUL;
@@ -1041,6 +1045,36 @@ begin
                     arith_done := '1';
                 end if;
 
+            when DO_FRE =>                                      -- fre[s]: reciprocal estimate of operand B
+                opsel_a <= AIN_B;
+                v.result_class := r.b.class;                    -- default: result takes B's class ...
+                v.result_sign := r.b.negative;                  -- ... and B's sign
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then  -- NaN with mantissa bit 53 clear: signalling NaN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                case r.b.class is
+                    when FINITE =>
+                        v.result_exp := - r.b.exponent;         -- exponent of 1/B is the negated exponent of B
+                        if r.b.mantissa(54) = '0' then          -- mantissa bit 54 clear: renormalize B first
+                            v.state := RENORM_B;
+                        else
+                            v.state := FRE_1;                   -- go straight to fetching the estimate
+                        end if;
+                    when NAN =>
+                        -- result is B
+                        arith_done := '1';
+                    when INFINITY =>
+                        v.result_class := ZERO;                 -- 1/infinity = zero, sign of B kept
+                        arith_done := '1';
+                    when ZERO =>
+                        v.result_class := INFINITY;             -- 1/zero = infinity, sign of B kept
+                        zero_divide := '1';                     -- flag the divide-by-zero condition
+                        arith_done := '1';
+                end case;
+
             when RENORM_A =>
                 renormalize := '1';
                 v.state := RENORM_A2;
@@ -1149,7 +1183,11 @@ begin
                 opsel_a <= AIN_B;
                 -- wait one cycle for inverse_table[B] lookup
                 v.first := '1';
-                v.state := DIV_2;
+                if r.insn(4) = '0' then
+                    v.state := DIV_2;
+                else
+                    v.state := FRE_1;
+                end if;
 
             when DIV_2 =>
                 -- compute Y = inverse_table[B] (when count=0); P = 2 - B * Y
@@ -1221,6 +1259,12 @@ begin
                 end if;
                 v.state := FINISH;
 
+            when FRE_1 =>                                       -- fre: load the table estimate, then normalize
+                opsel_r <= RES_MISC;
+                misc_sel <= "0111";                             -- misc case "0111" supplies inverse_est padded to 64 bits
+                v.shift := to_signed(1, EXP_BITS);              -- shift amount of 1 for the following step
+                v.state := NORMALIZE;
+
             when INT_SHIFT =>
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
@@ -1609,6 +1653,8 @@ begin
                     when "0110" =>
                         -- fmrgew result
                         misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32);
+                    when "0111" =>
+                        misc := 10x"000" & inverse_est & 35x"000000000";
                     when "1000" =>
                         -- max positive result for fctiw[z]
                         misc := x"000000007fffffff";
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index cbb0ee2..e62ce27 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1111,6 +1111,43 @@ int fpu_test_17(void)
 	return trapit(0, test17);
 }
 
+struct recipvals {	/* fre test vectors: raw IEEE-754 double bit patterns */
+	unsigned long val;	/* operand loaded into the FPR given to fre */
+	unsigned long inv;	/* exact bit pattern expected back from fre */
+} recipvals[] = {
+	{ 0x0000000000000000, 0x7ff0000000000000 },	/* +0.0 -> +infinity */
+	{ 0xfff0000000000000, 0x8000000000000000 },	/* -infinity -> -0.0 */
+	{ 0x3ff0000000000000, 0x3feff00400000000 },	/* 1.0 -> just under 1.0 (~0.9998) */
+	{ 0xbff0000000000000, 0xbfeff00400000000 },	/* -1.0 -> ~-0.9998 */
+	{ 0x4008000000000000, 0x3fd54e3800000000 },	/* 3.0 -> ~0.3329 */
+	{ 0xc03ffffffdffffbf, 0xbfa0040000000000 },	/* just under -32.0 in magnitude -> ~-0.0313 */
+};
+
+int test18(long arg)	/* fre check over recipvals[]; arg is unused; returns 0 or 1-based index of first mismatch */
+{
+	long i;
+	unsigned long result;	/* raw bits stored back by stfd */
+	struct recipvals *vp = recipvals;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(recipvals) / sizeof(recipvals[0]); ++i, ++vp) {
+		asm("lfd 6,0(%0); fre 7,6; stfd 7,0(%1)"	/* f6 = val; f7 = fre(f6); result = f7 */
+		    : : "b" (&vp->val), "b" (&result) : "memory", "fr6", "fr7");	/* f6/f7 are overwritten, so declare them */
+		if (result != vp->inv) {
+			print_hex(i, 2, " ");	/* report failing index ... */
+			print_hex(result, 16, " ");	/* ... and the value actually produced */
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_18(void)	/* entry point for test 18 (fre) */
+{
+	enable_fp();
+	return trapit(0, test18);	/* invoke test18 through trapit with argument 0; return trapit's result */
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1153,6 +1190,7 @@ int main(void)
 	do_test(15, fpu_test_15);
 	do_test(16, fpu_test_16);
 	do_test(17, fpu_test_17);
+	do_test(18, fpu_test_18);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index dc5af293a27cd0d08eda8272695afbe7e9ba89c5..572aad0ef8df74ffd3de84f0d0435c5bf1706b02 100755
GIT binary patch
delta 2463
zcmai0eQcBE7C-O1wsyGGZftE=y6#=K;aail#^~OCv2M&!IvtA_KQK-=U}hJ%+~soh
z0(T$9R0G~XhUZ3!!B~irX+$O6HPb(25@IGI!9Os&0d*UNbSPuQO?nl4?m6v(yIvAb
za(aHx@A<vYbDr~@_f5w>k%o^+g#f-9fI$0acLlan?0~jFJ3t$8n~2*)+@_A;zQUza
zaNpXca@>7%@9BF=jbTRxz~s3fglGa%vm3+p71Mi||NIMpljqL-&m8VGS{*EjlVqwn
z*az^^x&3pJ<Z{qv3FAqlON!yF@efigvMPPGET2*EwS|jiV;GMw^qR61@E;X<aUq7Q
zCVx$CPE&?VudG+FG4o@Y){wJGO=s=k|1=>yR4c^uwE)9aWvv*<s+3RI@x`o-y3HFy
z>XFO%j@gX`CW|zUHKx^46!*~YllYBkwOpNpZu4{6(bkaqAG~HR$1!u76vr#(mD(qt
z38{Zfa9x%(xvUd=Esfg2r$g$$F+IEd<?J=u%=VDFiU-)A%}>3ay;2)p7g9gM5^Fgs
z)->q=-fvy0U9%yiI+@F;iHj!6w3<q(rgenW@r$^_>Xl~Exu_7owmvNN5V}v&;v<V9
z8jA(|*p_R<BAZQj-VFY=nHaD+<h5qpVtYzDi(lIAOaIaY{)1ElzMbge-fb~V%_)}q
zw_;^Zqb5X?IA*ipft(Gd{ae?oIaE#s+G9c-sy0GUC7sFMD4oDo`=8~<U&Yh*KnCly
zH#yHk)taQnmFr49l?i?cL{$h6+H<ix_keZ(*0<Dczg*~Bb>sgQEma!jfrYq3InJpp
z&vVi4Z>cwb7EKcLse^Z|?{0xS_00_I&3i`b#EW?=Sm$!MvRIe?JKYfW<SoLGBgi6M
zzKia$B>x{g_$1#Yt46$>{}PX$D{!R-?@6X<!E5;|3U%P4UWK;9DXV(ouW;XvRVg5G
z|BiS(V>bX|qk6pJFjw@?9mWF(<IAom4@5Y|RVt+M{Jc*Yl6x4zS@9+wDJXv^Of@rt
z6#t5DqUR^k^z|>l_Ic=}3qa)ZX(tzepYv#$?(6oz`a}iJ&T*C03!l$n4^>6QlENzK
z1^iQCq;G(-N`vQpd>0=O=4^J(@+(obgt;QR%iC#MNwb{h0ICzZ-)r==PDNs!n#8;c
z?ZMk<GSn;*Nzj`}I84a}@#tcdj^G3L-afgvf4p;YFOcR=$+aiexkGX_B$}Vx0PZfj
zE_L8e=TfN)|3$w8kxA#@Qu>CH*At72@$V0(=(}%R;RlsOYY4HfdYI-R?AUjacrL4!
z$}=+85ZP0r>9D+`n|Ut(8RE4G?(es_QR1%Av(Bh!dX}2SN0nkdH#3zwoqOZf<|8CT
zk6Do>Bk69lWaIBa2KB&!Cqn8qdhZl^n0rMIuZz8GQVfi3iElBq>BG3LxOc(4;pO5(
z(ka~IdEa0QPpF^-+7$?FCtaYVvQZy)Ounmu?sry04`cbbe!GU6c0QLvuSnW0&X>?}
zV<$B{#*-z@rsS3dO8}g^0d^O2Ry5@<eKfMGR3r6mJ|{$!NH!xm38DJD5G_oQG3O)S
zM%*+-Hs;o_e<Q>I6J*SFlE;WsD6%ovO+G_h0TW~#`^h!m3UM9)?=H)ehrUI3nOi>b
z9X>^&dJOlL+2svm_#TDMaXd|-e;lVM42`2<nOz<mM>hri1U6IfOyCv@k5AxU3c(3{
zkHWwNo~CeS0;ed%C(z)v%gP0Gd+m}RA6jP9$^cy#u-R+XSyBK-F5nigb-7AVi3(xk
zT*(yj2Z<{pu8WC=)H--WT4C^|*Y&m-p#D2u9UrNSG=GiaXI_VHKf!-Samwq`jS!rU
zqP={vZiZmzMGTZXbT$p$!6p0)!3Kgem+-Z6hifyzCG>%h@-8b4Ewug{QpW-V)X^4q
zCo}0M%?~lWRBqP|5*(aFw`$jo67*kAz#9b5C!kZ&QdZv!F*tv{*5B4EAijY#fX}M=
z*_#Qip??ettH~P3gT%GrQFVoGgwXUA{8_Dc8+9~s^|u1?9TyO{i#UFW?Ll&txS^}K
zu_B<|MKBXTs<_uSNND6oA=)HTJYZ1hjB=9!8gzA(M%PlCYi34>UAzPtugu-pIV(hn
z39=$`ANlAk)>pc+(tZ+xDK?e|$rapLxlpTK7h+rFU}e1qi`P0MExr#;=x#{G+6H3<
v{okPeo4%Tebg*M{<3t3|`*^H2pRGM6#7!Y;hj|Q|@G<;KvP6uJwdnr~<V#Li

delta 2208
zcmah~3rtgI6#nm}tw;(h0_9ywc?cDdhrlfbEdv383EK#piI3uAn~!XaI0p=Yk;c?*
zy1L^G8cfv;L(JU3E<_jOkj+IIE-W!bqfVX^d@#(EF-8BKQ;bvR=1I;y-~avJJ@>ok
z{*;dIguW9(1Y@QQ#`4R4oRMFJbxr+>{4&PO$gM|iJ#y>UupJRe0^6}9Nt}vp+y3(l
zi8i~=z}V=;8=My~R8!jQFB-<T5r5_<#=0){Jh2DcRRKDZOrm6LaeW<QA6;yGD4A}@
z%#C*FQt1UN+)=$MSRIQzi$w8s2<-8S6m53s^h!~?hp@ah9y-Z+hjH}5^uuw>!#hP}
zA&}$!t%yBDpG5W9V3v2%DLk-%^M(bCbr^3fg?yhh@pv$7@mZ%RUu%&LUxP!waZsKh
zgDADP&;l9iBB2Sk;nf>1tBb^WK@jJ=Np4$ak^X{PzNygXs}x${rf;s?yv8D}b#mo?
zN>O?fYW;HL8&+7P3*hOW`k8-;yxVM%7SRH1{&d$~|6F;~N{e(15&}{|8=w?QAT}UZ
zu3T-AqLE8d{DO`%CgE<tCgBvk8#qUJ3HAm$WUu?OJg|iYL3vP+dk`)@_hNZ8^C4)9
z2o!g#p*?81;0Zp#b3EPEEN?%yac)P}R$AfZ;JKo;0vdvI#2<X1E4Wr|t5_uk<Mgzw
zY=rXz=_*z)p<=lvN033Y<{fcz7pOz>XOT{RpBIouHm#`$(F;o08}faCt)f|~yf;|4
zXzE{$s<k=d$Smm8cG82lgz9}reE%*l5Llh`&5Ttw#Vk}p^SVIj3oR5}Ay$_~(h8j(
z%{5D&Pe~_La74F<6pO?3LOfK3eM}2QxPE5k3oK7y+^B>P!?K_z+(a8E!u4W@2ULf}
zL3f0AXd}LdiV2f6-h@ASLGQf>b@AxLjqwn2a{+`%FaJ|+(?d;TX6e{N9FIO@AU4t{
zRKeOvN8LU&r(_K#nn(*`nq2Y(okL>`aZxzYXK6f9GavUDOLyv?*0^Gyn|G*BhI3zH
zW%bX{%#zNaNMN<jA?`zS8{zP55cWf6)U(YkyZwA~2Gl&yoR725|C;lXZvFA3xh6OG
zYW6x9kN!kh0-N<o!fI&4Yo23R|GrCI>-1L{ndJS~54yP4w2sl;TKoZFX3kfpAI1H8
zyKcvB<Y~5YoKXkPoDSVt<PM6}HN?|5IF0-<C-=c4t_Qhh<RXT7!F%}CO*k&n(*eVI
zd(x43^za%KLcFh5xZrS~>npfvQA|b}>@Yt*(yht)gU8HPJG>oV`}lGq{(#T}yA!_l
zXtEDTOq*Y(W%*U8D@{zxvD$T`U&>g`p(3`Ku-NHZC1cTs$2k{fGAJK$CZ|sI#2R`N
z7ptcogvT%zy_K=8bIBY&B`v8`M$lnPk_mMiFL6GEO9~Yxs#*4B&Wnk_AkKtYkK8y`
z3gSvIm;TCm3lSK^y@ffBoE9quaW$Az$b}PuLDhz-yu$e<#=t8%RP49{DamnScOSfg
zCA%MXC2PbD{m_o3d;of}G!DQ7mW}~Xr)b2H0Z7538iZ0T(jaWYVjhHDSgeE4j-_=F
zda;}zgb6Hc2-K+>F=_}>QZ=Q;L!4KVnS~zlo|(0;c(5!*CN6_8Cq>3u{$ros$Ik^X
z!<>_rLW>!Loa-=jrD_z#2up@xELEqdM7VJn)RJD&g0N>8a->MbIfQw?L4~AKOd&K~
zg*^zv@r#_h3Y`cG5h_RFnxs*bBkUdp4}(U&6X8y%GK46OB8s{W)dr2?62jvI17?Pk
z7dd0aaMBRwABE@({7*~fD6%o}kAbzqeM6SQg2*%m4-E4Y>5aN^+4K89Z$w-la_5nw
z8z~AgZzJbA4()0A@@xfTt3hs@?Ng4(?=~J60Tl~9v^Y*N7cz}u`gwOax6-{B)RGQX
z3vw1BFrqL`n0^zm-KdW)oZy@&3X<1jnvsj4#u&uy#H@7m8*^kw(hCLnbV;<M*wn6u
MKEcgV{#vo?-z=&8<^TWy

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index a8e2dcb..a5c08ea 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -15,3 +15,4 @@ test 14:PASS
 test 15:PASS
 test 16:PASS
 test 17:PASS
+test 18:PASS

From 4cd9301da6b26984247ada52115dcfbd866a8388 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 29 Jul 2020 20:26:39 +1000
Subject: [PATCH 20/30] FPU: Implement fsel

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl |  1 +
 fpu.vhdl     | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/decode1.vhdl b/decode1.vhdl
index c0c3465..09aaf91 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -474,6 +474,7 @@ architecture behaviour of decode1 is
         2#0010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fdiv
         2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
         2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
+        2#0111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsel
         2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
         2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
         others => illegal_inst
diff --git a/fpu.vhdl b/fpu.vhdl
index fee1776..59e6f5d 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -42,6 +42,7 @@ architecture behaviour of fpu is
                      DO_FRSP, DO_FRI,
                      DO_FADD, DO_FMUL, DO_FDIV,
                      DO_FRE,
+                     DO_FSEL,
                      FRI_1,
                      ADD_SHIFT, ADD_2, ADD_3,
                      MULT_1,
@@ -641,6 +642,8 @@ begin
                             v.state := DO_FDIV;
                         when "10100" | "10101" =>
                             v.state := DO_FADD;
+                        when "10111" =>
+                            v.state := DO_FSEL;
                         when "11000" =>
                             v.state := DO_FRE;
                         when "11001" =>
@@ -1045,6 +1048,24 @@ begin
                     arith_done := '1';
                 end if;
 
+            when DO_FSEL =>                                     -- fsel: result is C if A >= 0, otherwise B
+                opsel_a <= AIN_A;                               -- overridden by both branches below
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then  -- A is +/-0, or positive and not a NaN
+                    v.result_sign := r.c.negative;              -- pass operand C through unchanged
+                    v.result_exp := r.c.exponent;
+                    v.result_class := r.c.class;
+                    opsel_a <= AIN_C;
+                else                                            -- A is negative (non-zero) or a NaN
+                    v.result_sign := r.b.negative;              -- pass operand B through unchanged
+                    v.result_exp := r.b.exponent;
+                    v.result_class := r.b.class;
+                    opsel_a <= AIN_B;
+                end if;
+                v.quieten_nan := '0';                           -- presumably leaves a selected NaN's bits untouched; confirm at the use of quieten_nan
+                arith_done := '1';
+
             when DO_FRE =>
                 opsel_a <= AIN_B;
                 v.result_class := r.b.class;

From 49f3d1e77a14e8c2d8c35c1316b8d35754a1b428 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 30 Jul 2020 10:00:25 +1000
Subject: [PATCH 21/30] FPU: Implement fcmpu and fcmpo

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl |  2 ++
 fpu.vhdl     | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 09aaf91..ba9964e 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -436,6 +436,8 @@ architecture behaviour of decode1 is
     constant decode_op_63l_array : op_63_subop_array_0_t := (
         --                unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
         --                             op                               in   out   A   out  in    out  len        ext                                pipe
+        2#000000000#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  0/0=fcmpu
+        2#000000001#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  1/0=fcmpo
         2#000000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  2/0=mcrfs
         2#011000001#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/6=mtfsb1
         2#011000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/6=mtfsb0
diff --git a/fpu.vhdl b/fpu.vhdl
index 59e6f5d..c726be3 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -37,7 +37,7 @@ architecture behaviour of fpu is
 
     type state_t is (IDLE,
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
-                     DO_FMR, DO_FMRG,
+                     DO_FMR, DO_FMRG, DO_FCMP,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
                      DO_FADD, DO_FMUL, DO_FDIV,
@@ -45,6 +45,7 @@ architecture behaviour of fpu is
                      DO_FSEL,
                      FRI_1,
                      ADD_SHIFT, ADD_2, ADD_3,
+                     CMP_1, CMP_2,
                      MULT_1,
                      LOOKUP,
                      DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
@@ -603,7 +604,11 @@ begin
                 if e_in.valid = '1' then
                     case e_in.insn(5 downto 1) is
                         when "00000" =>
-                            v.state := DO_MCRFS;
+                            if e_in.insn(7) = '1' then
+                                v.state := DO_MCRFS;
+                            else
+                                v.state := DO_FCMP;
+                            end if;
                         when "00110" =>
                             if e_in.insn(10) = '0' then
                                 if e_in.insn(8) = '0' then
@@ -669,6 +674,62 @@ begin
                 v.instr_done := '1';
                 v.state := IDLE;
 
+            when DO_FCMP =>
+                -- fcmp[uo]: compare A with B; cr_result bits are LT & GT & EQ & UN
+                v.instr_done := '1';                            -- default: finish in this cycle
+                v.state := IDLE;
+                update_fx := '1';
+                opsel_a <= AIN_B;                               -- with RES_SUM below: puts B in R for CMP_1
+                opsel_r <= RES_SUM;
+                v.result_exp := r.b.exponent;
+                if (r.a.class = NAN and r.a.mantissa(53) = '0') or
+                    (r.b.class = NAN and r.b.mantissa(53) = '0') then
+                    -- Signalling NAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    if r.insn(6) = '1' and r.fpscr(FPSCR_VE) = '0' then  -- fcmpo with FPSCR[VE] clear
+                        v.fpscr(FPSCR_VXVC) := '1';
+                    end if;
+                    invalid := '1';
+                    v.cr_result := "0001";          -- unordered
+                elsif r.a.class = NAN or r.b.class = NAN then   -- quiet NaN operand(s) only
+                    if r.insn(6) = '1' then
+                        -- fcmpo
+                        v.fpscr(FPSCR_VXVC) := '1';
+                        invalid := '1';
+                    end if;
+                    v.cr_result := "0001";          -- unordered
+                elsif r.a.class = ZERO and r.b.class = ZERO then  -- zeros compare equal regardless of sign
+                    v.cr_result := "0010";          -- equal
+                elsif r.a.negative /= r.b.negative then
+                    v.cr_result := r.a.negative & r.b.negative & "00";  -- signs differ: LT if A negative, GT if B negative
+                elsif r.a.class = ZERO then
+                    -- A and B are the same sign from here down
+                    v.cr_result := not r.b.negative & r.b.negative & "00";  -- A = 0, B non-zero: LT if B positive, else GT
+                elsif r.a.class = INFINITY then
+                    if r.b.class = INFINITY then
+                        v.cr_result := "0010";                  -- same-signed infinities: equal
+                    else
+                        v.cr_result := r.a.negative & not r.a.negative & "00";  -- A has the larger magnitude
+                    end if;
+                elsif r.b.class = ZERO then
+                    -- A is finite from here down
+                    v.cr_result := r.a.negative & not r.a.negative & "00";  -- A has the larger magnitude
+                elsif r.b.class = INFINITY then
+                    v.cr_result := not r.b.negative & r.b.negative & "00";  -- B has the larger magnitude
+                elsif r.exp_cmp = '1' then                      -- presumably set when A's exponent exceeds B's; computed outside this hunk
+                    -- A and B are both finite from here down
+                    v.cr_result := r.a.negative & not r.a.negative & "00";
+                elsif r.a.exponent /= r.b.exponent then
+                    -- A exponent is smaller than B
+                    v.cr_result := not r.a.negative & r.a.negative & "00";
+                else
+                    -- Prepare to subtract mantissas, put B in R
+                    v.cr_result := "0000";                      -- undecided; CMP_2 fills it in
+                    v.instr_done := '0';
+                    v.state := CMP_1;
+                end if;
+                v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;  -- mirror the outcome into FPSCR bits FL..FU
+
             when DO_MTFSB =>
                 -- mtfsb{0,1}
                 j := to_integer(unsigned(insn_bt(r.insn)));
@@ -1193,6 +1254,26 @@ begin
                     v.state := NORMALIZE;
                 end if;
 
+            when CMP_1 =>                                       -- fcmp, equal exponents: set up A - R (R was loaded with B in DO_FCMP)
+                opsel_a <= AIN_A;
+                opsel_b <= BIN_R;
+                opsel_binv <= '1';                              -- invert the B input ...
+                carry_in <= '1';                                -- ... and add 1: two's-complement subtraction
+                v.state := CMP_2;
+
+            when CMP_2 =>                                       -- fcmp: decide the outcome from the difference in R
+                if r.r(63) = '1' then                           -- top bit of R set: the difference is negative
+                    -- A is smaller in magnitude
+                    v.cr_result := not r.a.negative & r.a.negative & "00";
+                elsif (r_hi_nz or r_lo_nz) = '0' then           -- presumably both halves of R are zero (flags defined outside this hunk)
+                    v.cr_result := "0010";                      -- equal
+                else
+                    v.cr_result := r.a.negative & not r.a.negative & "00";  -- A is larger in magnitude
+                end if;
+                v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;  -- mirror the outcome into FPSCR bits FL..FU
+                v.instr_done := '1';
+                v.state := IDLE;
+
             when MULT_1 =>
                 f_to_multiply.valid <= r.first;
                 opsel_r <= RES_MULT;

From e1bbb786c078b7ecdae7d027860d44949eb87bb2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 30 Jul 2020 13:38:09 +1000
Subject: [PATCH 22/30] tests/fpu: Add tests for fsel and fcmpu

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 tests/fpu/fpu.c            |  98 +++++++++++++++++++++++++++++++++++++
 tests/test_fpu.bin         | Bin 24512 -> 25024 bytes
 tests/test_fpu.console_out |   2 +
 3 files changed, 100 insertions(+)

diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index e62ce27..06da475 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1148,6 +1148,102 @@ int fpu_test_18(void)
 	return trapit(0, test18);
 }
 
+#define RES_B	0x7ffaaaaaaaaaaaaa	/* marker pattern loaded as fsel's FRB operand (NaN encoding) */
+#define RES_C	0x000bbbbbbbbbbbbb	/* marker pattern loaded as fsel's FRC operand (denormal encoding) */
+
+struct selvals {	/* fsel test vectors */
+	unsigned long val;	/* bit pattern of the FRA (selector) operand */
+	unsigned long result;	/* expected output: RES_C when val >= 0, else RES_B */
+} selvals[] = {
+	{ 0x0000000000000000, RES_C },	/* +0.0 */
+	{ 0x8000000000000000, RES_C },	/* -0.0 */
+	{ 0x3ff0000000000000, RES_C },	/* 1.0 */
+	{ 0xbff0000000000000, RES_B },	/* -1.0 */
+	{ 0x7ff0000000000000, RES_C },	/* +infinity */
+	{ 0xfff0000000000000, RES_B },	/* -infinity */
+	{ 0x7ff8000000000000, RES_B },	/* quiet NaN, sign clear */
+	{ 0xfff8000000000000, RES_B },	/* quiet NaN, sign set */
+	{ 0x0000000000000001, RES_C },	/* smallest positive denormal */
+	{ 0x8000000000000001, RES_B },	/* smallest-magnitude negative denormal */
+	{ 0xffffffffffffffff, RES_B },	/* NaN, all bits set */
+};
+
+int test19(long arg)	/* fsel check over selvals[]; arg is unused; returns 0 or 1-based index of first mismatch */
+{
+	long i;
+	unsigned long result;	/* raw bits stored back by stfd */
+	unsigned long frb = RES_B;	/* in-memory copies so lfd can load the marker patterns */
+	unsigned long frc = RES_C;
+	struct selvals *vp = selvals;
+
+	for (i = 0; i < sizeof(selvals) / sizeof(selvals[0]); ++i, ++vp) {
+		asm("lfd 6,0(%0); lfd 10,0(%1); lfd 22,0(%2); fsel 0,6,22,10; stfd 0,0(%3)"	/* f0 = fsel(FRA=f6, FRC=f22, FRB=f10) */
+		    : : "b" (&vp->val), "b" (&frb), "b" (&frc), "b" (&result) : "memory", "fr0", "fr6", "fr10", "fr22");	/* declare every FPR written */
+		if (result != vp->result) {
+			print_hex(i, 2, " ");	/* report failing index ... */
+			print_hex(result, 16, " ");	/* ... and the value actually produced */
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_19(void)	/* entry point for test 19 (fsel) */
+{
+	enable_fp();
+	return trapit(0, test19);	/* invoke test19 through trapit with argument 0; return trapit's result */
+}
+
+#define LT	8	/* 4-bit CR field encoding: less than */
+#define GT	4	/* greater than */
+#define EQ	2	/* equal */
+#define UN	1	/* unordered */
+
+struct cmpvals {	/* fcmpu test vectors */
+	unsigned long vala, valb;	/* bit patterns of the two operands; test20 loads valb from 8 bytes past vala */
+	unsigned long result;	/* expected CR field value: one of LT/GT/EQ/UN */
+} cmpvals[] = {
+	{ 0x0000000000000000, 0x0000000000000000, EQ },	/* +0.0 vs +0.0 */
+	{ 0x8000000000000000, 0x0000000000000000, EQ },	/* -0.0 vs +0.0 */
+	{ 0x3ff0000000000000, 0x3ff0000000000000, EQ },	/* 1.0 vs 1.0 */
+	{ 0x3ff0000000000001, 0x3ff0000000000000, GT },	/* 1.0 + 1 ulp vs 1.0 */
+	{ 0x3ff0000000000000, 0x3ff0000000000001, LT },	/* 1.0 vs 1.0 + 1 ulp */
+	{ 0xbff0000000000000, 0x3ff0000000000000, LT },	/* -1.0 vs 1.0 */
+	{ 0x7ff0000000000000, 0x7ff0000000000000, EQ },	/* +infinity vs +infinity */
+	{ 0xfff0000000000000, 0x7ff0000000000000, LT },	/* -infinity vs +infinity */
+	{ 0x7ff8000000000000, 0x7ff0000000000000, UN },	/* quiet NaN vs +infinity */
+	{ 0xfff8000000000000, 0x7ff0000000000000, UN },	/* quiet NaN (sign set) vs +infinity */
+	{ 0x0000000000000001, 0x0000000000000001, EQ },	/* identical denormals */
+	{ 0x8000000000000001, 0x7ff0000000000000, LT },	/* negative denormal vs +infinity */
+	{ 0xffffffffffffffff, 0x7ff0000000000000, UN },	/* all-ones NaN vs +infinity */
+	{ 0xffffffffffffffff, 0xffffffffffffffff, UN },	/* NaN vs the same NaN */
+};
+
+int test20(long arg)	/* fcmpu check over cmpvals[]; arg is unused; returns 0 or 1-based index of first mismatch */
+{
+	long i;
+	unsigned long cr;	/* whole CR as read by mfcr */
+	struct cmpvals *vp = cmpvals;
+
+	for (i = 0; i < sizeof(cmpvals) / sizeof(cmpvals[0]); ++i, ++vp) {
+		asm("lfd 6,0(%1); lfd 10,8(%1); fcmpu 7,6,10; mfcr %0"	/* f6 = vala; f10 = valb; CR7 = compare; cr = CR */
+		    : "=r" (cr) : "b" (&vp->vala) : "memory", "cr7", "fr6", "fr10");	/* fcmpu overwrites CR7; the loads overwrite f6/f10 */
+		cr &= 0xf;	/* CR field 7 is the low 4 bits of the mfcr result */
+		if (cr != vp->result) {
+			print_hex(i, 2, " ");	/* report failing index ... */
+			print_hex(cr, 1, " ");	/* ... and the CR7 value actually produced */
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_20(void)	/* entry point for test 20 (fcmpu) */
+{
+	enable_fp();
+	return trapit(0, test20);	/* invoke test20 through trapit with argument 0; return trapit's result */
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1191,6 +1287,8 @@ int main(void)
 	do_test(16, fpu_test_16);
 	do_test(17, fpu_test_17);
 	do_test(18, fpu_test_18);
+	do_test(19, fpu_test_19);
+	do_test(20, fpu_test_20);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 572aad0ef8df74ffd3de84f0d0435c5bf1706b02..0f9e03a588b27413f4d3522e6be6362f86d9204d 100755
GIT binary patch
delta 3674
zcmai03s93+7C!$^5`rQPk_RA<5D<YX4XE(PpafJl!iub|w6@d@vb3&t9NG~FY&2OX
zI_g-#o<7uVDP+`9W_4w;j=N5`P`eJY?hJNj9lAaUXm>%a6KiGTWBvEsKM4)2+uoVW
zIp;gyJ@>nh|Awy5xvsNZI)G(9Kxy^4@X|)IwXnLh8o)-_I>OcwwyxH>dT;t{&b{~L
zTwa~ivFA_EI4n&D6Trw{i7Dv8Sq|i=+!A*AwkVdd*pC&P{M_@z?Y-PLA!nsJ$IDwC
z0PUAgJ|%aj+)24zm-hv7{y8Ab7VE}d)w5hHj;fyLT02Xl%XwaviU(q5M7WDd8rUpN
z*c+3rU10!ghnV6M#V+&66TdQ*8>7^DQC3O;6pjSg%hkn`tW9pSjMh}zge~eC&W*e#
zSEES*YqwOwT_}oM3jw;!U*ZytnU|&DcFkJFjyGJwv76Yg%f`AK8D?nJyt4`yXv_Hx
zuizeSId>bcYM1f$WX#rWl&@Lk5+2}PT|Rnrk^GKd;vHRyykU(?__YMq#YXZKYjI0#
ziM+4cC47z1ary7Yy&yN%xP&Dvz~;C}Ui~X<i7SyqtxNbL=EUctK0cB^U5nZACGxu0
zTtcP<o**#eb^LRD9(pvg===W`%V+@w?9>+7T*7|rj-M@Mj)i10Xx3j)CZDUtH>XX<
zq-o3fZ%6TAe45f`hlm<m{WsV$?GLh^IIv=KY68|JBuv>&fn3Y>6KprIWBN2cA`VX_
ztm1#8L2Y8z)XlMAZKFMu^66U|hFfuYVm9Abk6RN<_&KrIo0!Gh5JwZ2Yy0bKghV<j
zJyZ{iqPsu^PJxK4l1lguIy{`To<Fh=waKMZ8BzWpwkCIh%ujkNIg7Vzu_f8yc<9j2
z_lbj#+S@a$CxgXwIBd!0bK#kd6xRZH%nVln=VP)mAmzbi(9h2R&UEHXX0;Ip1;*7w
zw?J_%!}P>q@=qUN_xyYhfb~4Ux!&U|6psPI`<1X{#0YJ}kdb9CfUU(vOcpQ6IwUka
zy6&(*oA1@fVwsWFMgA~b*HVf(a0n;EYwR{i5e{m=%7JKBL8j>=oS&BJh$8818D6pb
z5nBS2;M~R}c!<UM*v({eX!`R|v0Y4uE{BSw=)<qQ4PHsg|0vOul20G89c~+LxOdjU
zk&;6c2bEYziBBylK}TwQ3J<`6r2x8$g~Pnk&g8@c(2hn(aw?ketK>b^{q_EJe)e$U
z*&PxpXvRZ?lO4{W^|Q6VABfq0uy`BA0{M)?;(CgMit0$w1?<+JE$R!*LR;;7{{h?0
zwqbTtqzV+X>cLyX6un7Uuu270>A}CGtl@r(*@ohAwT}A6*V<3<#9G@33$1kpI}ES$
z)(<g0H8(p{*x%@TVxpKhChRkcLkFs(aZ_p~@BI+JO3j@WI#S~w>0~37G*UicWEJM5
zy~y|M!Ch(N2fjKwe#)g3hfaAAM`)mR9~Mu~9yjCB`{M`hr#N(=7eAi9aH)sBHFn)D
z(sjH0y;yeX(G!P}F76}{liB5snC$=fE4&W75o)9heSeZ4n@8`>NWeGJ5me!<bo11s
zQ@~mscoNu?+P57yrMn%&bY_vT)xyruOFrAa`}f$xZJx<6b|z2bTqMo{$^jHeyonsP
zna89$^QAo9H0M(|Q-z&G;$Vxkf>u&{5Rc8ksTXN|p=v$5NNqYcWZ0joIPtrmR{Y@Z
zPb>aAaR7c)ag5MGq)^2dCgJfJYtfszo7;(-vu1PeVMo>+uC4Q*SsTL~KL1G$(^q`!
zaG26wcbkpYOFx>TxViu~)(GG>>|IGX+i*E;oN}Z8(|(7r!~7<D@B>p3-Y8*jk72Qd
z)e<&kNGz+T+shsk#%CWR9YQp_Mq`r~5D|?!Js%O)8TDK`m&JacV(7lyc-bWkN^kqt
z0yV`mUvUW+Db`ap_5|{7TwtUhd-f25P3gw<Mn_0AYIJjbxIgE_6!(w-`qFAWlr|Ep
zGS|G=Z5a8T4D9X8V6&2Z{tIQJjGo(2EQ}uA$T7?!@z|5>=dc(r<u21mv!~4j$lM9A
z^SOYMd9y2J%<XKRE#n+BzZS(C6tbqGPHcEgSOo*9G0Z|;_=hNZ$+E_<7pSW*i{cRm
zP-EB{>PHCElVy!zcIqbyOJe{vW}B&hN!S$tjG3FlH(bHIx!JtyD!xYM@m2g_ZW5n%
z4NsBD7{E(pDh6<rOv3<b^OE@10n8)QGk}$3yaTwAOsoe#AY<|1DKcw3c!^B22S>@A
z_MkRDiT8LgkBo8<EAx{a#z9fEGEIx4^wC9IeijAAiXsXVsK<r@y!0b^f$~nuH~7aJ
zY5ejiSU5KRCCc~zpFG=^hA_~9aYz)|WUT3_@1{Y*5ccLLDNc~<8p1pI21P%)2ZvBA
zWGR&F;I3ndkgk|RZo_qaOE4&^$gR792L!`P7rC-;0w-D;Q3RiGi=u2KZuO)nC>l@{
zO$NxI8gx;hi+IxMEs=OF{K6;;F;_^;YZ6sDt$yMbV7HK}06D;hf8k9bNuEaT9GqfG
zR#?e<JdB%6N%A+yZN?l^vb>qRG;E~wN%BtL#1>PMqMuyrE&Re{hzAA0N4I{sDT_#x
zny@3c@QJBdQAMusEx8MF*k&E1u;<PXd&RKRg!K`|MzZ#E>K6zD{B&NKA|V{0>N`}J
z(=}FdRrf^E$x+gO(){ySWKPYj`d$=w5Xc%U$*z7MVJ-$xV;f~*9p5oOJ8Q#zQDg{f
zjP0bpgRq&bE;WX|N4<5FZbj!^^Lh@OR%UiqSS)g^dr`*&k13V9IQm~jYW`7P`fr^z
zZ=nqDF4ly33zayyQp=M`S){WD$pMlWunwi)3H<drf_Okm5Fg{eTcTddz+k-q&*>+6
z$LfiJ7Q}<~f_Tt!c#+HN{QctY7{?i@(Y5k7=lIxLI3fPLU1|@<Bu~hT<MTepgm^3;
Zkd_I-0>L!}2L!bSW0E|hvuS0H;=dWe4%7eu

delta 2747
zcmai0eN0qW7QgQ@Z;E!L!^aFC!tjQV5fG+`0uN@u!4C>UMQV+$THP6`Ev?3oxXa=$
zI@4xsqzM)|EwxQUN3tcXX&c)i4Q-lTx-|{Dny`Opv}ikw(*|10(iL%9wRg{bGb#`p
zPjWct{Lb&5d+xdSjShV&4E<9u5n`(##P0m+A^SeGJBh>YB*cN(F2r^rwri)`am2J-
z@E+MxL__9}4uAHDYpNrojF5?cBU6@$yPlw<^(xrtRmSRg9L83ag}JAYdwO(aUM_@n
zKCJh;2su7FaG%^{y&H3{jvkHV!h4V?N0Jv#YnKW=Fr$4^=<(TO>nR<P;O+P&3%ska
zHR71+fYb42y-p%EA2DNyu_4REZ{g0gUWiF3iHXJ(AuA^$><tO47KS={Eh;?IuN}~n
z&?tC;>WXyd45a1@r9|&a#$H}Y$dF|P)aoqsqyb*hy{O*(qDTJV2RNQ+hAe#&o!SbO
z`g$7N0*CeW!ZUDLzn)stz?|43UfAZ5e}SJ9OTnM0p`q<CnOH48^PES1K>?eSG;~E1
z97w7b2mauZFG6f`X;<<_F~RAPYk2^j$+*l3J;~K#aEC|!6qcowf|R16?=*rrrCQwh
ztVhmOz!NDNQM1z{kHHg*ONHyuzWC3=e*Bvq?Pwzl8Xe;IU{3v5m6Swkz#pFkQkJSK
zEp^eQL{hUkfe(f_f?&*COgAS&b=r1%E*|>Q^5V|wNzE}FhLSfOXdUl?K$@9$?u9r*
zHTCFWiy@C{yWkbW2K~{!jdB{!Tq4eK#(J$<;+By(iNupSC`sQ#pMDei((Q4aDBfmu
zoTLhqsFX*~B!F4U;GKcrsG;OAOd1b6kM4a>ZoPTMReNiG<@e+UaxAnAakS$(xGUe$
z7>NrbaV*9%`kce_0>%Py$pa5Hwlx!@+z*#CzO3qu>{5sojW<~x+Hrmkz9JT5Uj5FQ
zct^K?y;r3HtNabdGMj|Az@D{gPOW0(&90S=@%&n`^J)#k*{tVj=nW{zE;7$<Z1XSS
z5%7Y%v09Ap9ECSArG*cT>2N!%lzO{iGP|f?_UdCHWwo2Hj>gqTFrK|yhIOW=Xln=b
zndWXgabxbb7cri_(EW(AZ9(QzXjp>pwHvlCG0)k$cwU9<c@@sUr%P6^^WwA65+U>-
zd~f?_Sl#)XzqtzWrWO*G&TngCVfdXkOu5<$NuzRu=OS1+e#NIEej$!RhT0rU+`&bp
z#u<6N{4;hRggtpC_&lf0)sL5`AurqbrJg{a&-110{0GC5&oO>U_v36sP9^3Du`0az
z96s4HrA~#C$EWAM4<}C6AxR(ylv8-IV>g`2g}4tMp4)L}J9gZA-){eY?ZMq1M9zb{
zbt-ip&}{{hW_Q~U$Clm@cEVfv%Y|k*o4-PM*GCIlqFjUFOB@feu^;=QqT6o$#1~4+
zZ!}{&tV5V5-i#yPA<pM4;=G0<A7An$tLvkGX}j~GjUX;6*kA8r7ZCdy*X9OU-JkFo
z*>H56&p%3>eJp=LcQ+qKA`~V{bb|WwmYD>Nglzs(M!h@6yYx~F_fwUB$cQ7_3&)FG
z3+_cl%X@`0aBlgDMc#l+B)d~0_I=3eT47o1&6s#!McR(9C;Ow(r~iuHry{v!v#}^*
zE7RP?Yzj|1Cx*D<^*Uw0tb9Up+X-oZly`)Cv=ndhjTEZ{*X~iqf(UZaq9T!4FEQ4@
z0ThneP+Jk3Ld%6?8@XO)te*oY9BV=yM@&M?g=1}~(}-np0EOF5RMj_(T_OaYEH%=>
zZ=j{rOiz9beQ2zHIA3a@&-!5oP16|YWdrRTgAz1@W3UO0e+*jCL<gW7O<@2&LGyF~
z&ZBV$U<OTp0Q6-BIud{qG@$@&LL*&)7Bn?ipu5blsrd?Ht-PnTF@>>J2R6iz)#@q?
zQ&E$m2pRsJJU^C<c`3*k=W>xyA4e=F2=;P=x*6R+24PQmhWaSFuLq&KJWoB0?o<%I
zDmSU8(QWz;{)dEA6<*2rkYdRY*PuHMZ&)(byV1)ZhksZMVkf#+Va$@Q9zbtk0_v>>
z@dCOvu+Ew;-a_vZ_wpo>kmes58{iE+7Sn)ddRN%CFoslptFC%DJo+`r!D`*sjFjEz
zZp8mxNt}+d2Gxz2<0_<7RD!lbB@Saa1<4hw3$$uX{`cN_`3$lUJA@ct$mMZV8L>fV
zuc%WWLNDPOd|hGE4WK)GjWLISjI}WmHb(0}SDBeRJ<Zr5z7~a-<gc&kI%6IVpyZ(1
zP%nV9a%q8PhB1zC;cPdmgjhbWi^8$jQRN$WvA!=VpA&o|l~0BLfL}1fYg==D4K|zT
S^H=-h;i8c4du{77_5T6UOU>y3

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index a5c08ea..aea206f 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -16,3 +16,5 @@ test 15:PASS
 test 16:PASS
 test 17:PASS
 test 18:PASS
+test 19:PASS
+test 20:PASS

From 394f993e75abaf1d23eba046c8bec59664439b3d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 30 Jul 2020 16:11:58 +1000
Subject: [PATCH 23/30] FPU: Implement frsqrte[s] and a test for frsqrte

This implements frsqrte by table lookup.  We first normalize the input
if necessary and adjust so that the exponent is even, giving us a
mantissa value in the range [1.0, 4.0), which is then used to look up
an entry in a 768-entry table.  The 768 entries are appended to the
table for reciprocal estimates, giving a table of 1024 entries in
total.  frsqrtes is implemented identically to frsqrte.

The estimate supplied is accurate to 1 part in 1024 or better.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   2 +
 fpu.vhdl                   | 194 +++++++++++++++++++++++++++++++++++--
 tests/fpu/fpu.c            |  48 +++++++++
 tests/test_fpu.bin         | Bin 25024 -> 29376 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 239 insertions(+), 6 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index ba9964e..7163ff9 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -421,6 +421,7 @@ architecture behaviour of decode1 is
         2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
         2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
         2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
+        2#11010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- frsqrtes
         others => illegal_inst
         );
 
@@ -479,6 +480,7 @@ architecture behaviour of decode1 is
         2#0111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsel
         2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
         2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
+        2#1010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- frsqrte
         others => illegal_inst
         );
 
diff --git a/fpu.vhdl b/fpu.vhdl
index c726be3..0cbd43f 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -41,7 +41,7 @@ architecture behaviour of fpu is
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
                      DO_FADD, DO_FMUL, DO_FDIV,
-                     DO_FRE,
+                     DO_FRE, DO_FRSQRTE,
                      DO_FSEL,
                      FRI_1,
                      ADD_SHIFT, ADD_2, ADD_3,
@@ -50,6 +50,7 @@ architecture behaviour of fpu is
                      LOOKUP,
                      DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                      FRE_1,
+                     RSQRT_1,
                      INT_SHIFT, INT_ROUND, INT_ISHIFT,
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
@@ -98,11 +99,12 @@ architecture behaviour of fpu is
         exp_cmp      : std_ulogic;
         add_bsmall   : std_ulogic;
         is_multiply  : std_ulogic;
+        is_sqrt      : std_ulogic;
         first        : std_ulogic;
         count        : unsigned(1 downto 0);
     end record;
 
-    type lookup_table is array(0 to 255) of std_ulogic_vector(17 downto 0);
+    type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
 
     signal r, rin : reg_type;
 
@@ -160,6 +162,8 @@ architecture behaviour of fpu is
     constant MULADD_A     : std_ulogic_vector(1 downto 0) := "10";
 
     -- Inverse lookup table, indexed by the top 8 fraction bits
+    -- The first 256 entries are the reciprocal (1/x) lookup table,
+    -- and the remaining 768 entries are the reciprocal square root table.
     -- Output range is [0.5, 1) in 0.19 format, though the top
     -- bit isn't stored since it is always 1.
     -- Each output value is the inverse of the center of the input
@@ -199,7 +203,109 @@ architecture behaviour of fpu is
         18x"04321", 18x"040dd", 18x"03e9b", 18x"03c5c", 18x"03a1f", 18x"037e4", 18x"035ac", 18x"03376",
         18x"03142", 18x"02f11", 18x"02ce2", 18x"02ab5", 18x"0288b", 18x"02663", 18x"0243d", 18x"02219",
         18x"01ff7", 18x"01dd8", 18x"01bbb", 18x"019a0", 18x"01787", 18x"01570", 18x"0135b", 18x"01149",
-        18x"00f39", 18x"00d2a", 18x"00b1e", 18x"00914", 18x"0070c", 18x"00506", 18x"00302", 18x"00100"
+        18x"00f39", 18x"00d2a", 18x"00b1e", 18x"00914", 18x"0070c", 18x"00506", 18x"00302", 18x"00100",
+        -- 1/sqrt(x) lookup table
+        -- Input is in the range [1, 4), i.e. two bits to the left of the
+        -- binary point.  Those 2 bits index the following 3 blocks of 256 values.
+        -- 1.0 ... 1.9999
+        18x"3fe00", 18x"3fa06", 18x"3f612", 18x"3f224", 18x"3ee3a", 18x"3ea58", 18x"3e67c", 18x"3e2a4",
+        18x"3ded2", 18x"3db06", 18x"3d73e", 18x"3d37e", 18x"3cfc2", 18x"3cc0a", 18x"3c85a", 18x"3c4ae",
+        18x"3c106", 18x"3bd64", 18x"3b9c8", 18x"3b630", 18x"3b29e", 18x"3af10", 18x"3ab86", 18x"3a802",
+        18x"3a484", 18x"3a108", 18x"39d94", 18x"39a22", 18x"396b6", 18x"3934e", 18x"38fea", 18x"38c8c",
+        18x"38932", 18x"385dc", 18x"3828a", 18x"37f3e", 18x"37bf6", 18x"378b2", 18x"37572", 18x"37236",
+        18x"36efe", 18x"36bca", 18x"3689a", 18x"36570", 18x"36248", 18x"35f26", 18x"35c06", 18x"358ea",
+        18x"355d4", 18x"352c0", 18x"34fb0", 18x"34ca4", 18x"3499c", 18x"34698", 18x"34398", 18x"3409c",
+        18x"33da2", 18x"33aac", 18x"337bc", 18x"334cc", 18x"331e2", 18x"32efc", 18x"32c18", 18x"32938",
+        18x"3265a", 18x"32382", 18x"320ac", 18x"31dd8", 18x"31b0a", 18x"3183e", 18x"31576", 18x"312b0",
+        18x"30fee", 18x"30d2e", 18x"30a74", 18x"307ba", 18x"30506", 18x"30254", 18x"2ffa4", 18x"2fcf8",
+        18x"2fa4e", 18x"2f7a8", 18x"2f506", 18x"2f266", 18x"2efca", 18x"2ed2e", 18x"2ea98", 18x"2e804",
+        18x"2e572", 18x"2e2e4", 18x"2e058", 18x"2ddce", 18x"2db48", 18x"2d8c6", 18x"2d646", 18x"2d3c8",
+        18x"2d14c", 18x"2ced4", 18x"2cc5e", 18x"2c9ea", 18x"2c77a", 18x"2c50c", 18x"2c2a2", 18x"2c038",
+        18x"2bdd2", 18x"2bb70", 18x"2b90e", 18x"2b6b0", 18x"2b454", 18x"2b1fa", 18x"2afa4", 18x"2ad4e",
+        18x"2aafc", 18x"2a8ac", 18x"2a660", 18x"2a414", 18x"2a1cc", 18x"29f86", 18x"29d42", 18x"29b00",
+        18x"298c2", 18x"29684", 18x"2944a", 18x"29210", 18x"28fda", 18x"28da6", 18x"28b74", 18x"28946",
+        18x"28718", 18x"284ec", 18x"282c4", 18x"2809c", 18x"27e78", 18x"27c56", 18x"27a34", 18x"27816",
+        18x"275fa", 18x"273e0", 18x"271c8", 18x"26fb0", 18x"26d9c", 18x"26b8a", 18x"2697a", 18x"2676c",
+        18x"26560", 18x"26356", 18x"2614c", 18x"25f46", 18x"25d42", 18x"25b40", 18x"2593e", 18x"25740",
+        18x"25542", 18x"25348", 18x"2514e", 18x"24f58", 18x"24d62", 18x"24b6e", 18x"2497c", 18x"2478c",
+        18x"2459e", 18x"243b0", 18x"241c6", 18x"23fde", 18x"23df6", 18x"23c10", 18x"23a2c", 18x"2384a",
+        18x"2366a", 18x"2348c", 18x"232ae", 18x"230d2", 18x"22efa", 18x"22d20", 18x"22b4a", 18x"22976",
+        18x"227a2", 18x"225d2", 18x"22402", 18x"22234", 18x"22066", 18x"21e9c", 18x"21cd2", 18x"21b0a",
+        18x"21944", 18x"2177e", 18x"215ba", 18x"213fa", 18x"21238", 18x"2107a", 18x"20ebc", 18x"20d00",
+        18x"20b46", 18x"2098e", 18x"207d6", 18x"20620", 18x"2046c", 18x"202b8", 18x"20108", 18x"1ff58",
+        18x"1fda8", 18x"1fbfc", 18x"1fa50", 18x"1f8a4", 18x"1f6fc", 18x"1f554", 18x"1f3ae", 18x"1f208",
+        18x"1f064", 18x"1eec2", 18x"1ed22", 18x"1eb82", 18x"1e9e4", 18x"1e846", 18x"1e6aa", 18x"1e510",
+        18x"1e378", 18x"1e1e0", 18x"1e04a", 18x"1deb4", 18x"1dd20", 18x"1db8e", 18x"1d9fc", 18x"1d86c",
+        18x"1d6de", 18x"1d550", 18x"1d3c4", 18x"1d238", 18x"1d0ae", 18x"1cf26", 18x"1cd9e", 18x"1cc18",
+        18x"1ca94", 18x"1c910", 18x"1c78c", 18x"1c60a", 18x"1c48a", 18x"1c30c", 18x"1c18e", 18x"1c010",
+        18x"1be94", 18x"1bd1a", 18x"1bba0", 18x"1ba28", 18x"1b8b2", 18x"1b73c", 18x"1b5c6", 18x"1b452",
+        18x"1b2e0", 18x"1b16e", 18x"1affe", 18x"1ae8e", 18x"1ad20", 18x"1abb4", 18x"1aa46", 18x"1a8dc",
+        -- 2.0 ... 2.9999
+        18x"1a772", 18x"1a608", 18x"1a4a0", 18x"1a33a", 18x"1a1d4", 18x"1a070", 18x"19f0c", 18x"19da8",
+        18x"19c48", 18x"19ae6", 18x"19986", 18x"19828", 18x"196ca", 18x"1956e", 18x"19412", 18x"192b8",
+        18x"1915e", 18x"19004", 18x"18eae", 18x"18d56", 18x"18c00", 18x"18aac", 18x"18958", 18x"18804",
+        18x"186b2", 18x"18562", 18x"18412", 18x"182c2", 18x"18174", 18x"18026", 18x"17eda", 18x"17d8e",
+        18x"17c44", 18x"17afa", 18x"179b2", 18x"1786a", 18x"17724", 18x"175de", 18x"17498", 18x"17354",
+        18x"17210", 18x"170ce", 18x"16f8c", 18x"16e4c", 18x"16d0c", 18x"16bcc", 18x"16a8e", 18x"16950",
+        18x"16814", 18x"166d8", 18x"1659e", 18x"16464", 18x"1632a", 18x"161f2", 18x"160ba", 18x"15f84",
+        18x"15e4e", 18x"15d1a", 18x"15be6", 18x"15ab2", 18x"15980", 18x"1584e", 18x"1571c", 18x"155ec",
+        18x"154bc", 18x"1538e", 18x"15260", 18x"15134", 18x"15006", 18x"14edc", 18x"14db0", 18x"14c86",
+        18x"14b5e", 18x"14a36", 18x"1490e", 18x"147e6", 18x"146c0", 18x"1459a", 18x"14476", 18x"14352",
+        18x"14230", 18x"1410c", 18x"13fea", 18x"13eca", 18x"13daa", 18x"13c8a", 18x"13b6c", 18x"13a4e",
+        18x"13930", 18x"13814", 18x"136f8", 18x"135dc", 18x"134c2", 18x"133a8", 18x"1328e", 18x"13176",
+        18x"1305e", 18x"12f48", 18x"12e30", 18x"12d1a", 18x"12c06", 18x"12af2", 18x"129de", 18x"128ca",
+        18x"127b8", 18x"126a6", 18x"12596", 18x"12486", 18x"12376", 18x"12266", 18x"12158", 18x"1204a",
+        18x"11f3e", 18x"11e32", 18x"11d26", 18x"11c1a", 18x"11b10", 18x"11a06", 18x"118fc", 18x"117f4",
+        18x"116ec", 18x"115e4", 18x"114de", 18x"113d8", 18x"112d2", 18x"111ce", 18x"110ca", 18x"10fc6",
+        18x"10ec2", 18x"10dc0", 18x"10cbe", 18x"10bbc", 18x"10abc", 18x"109bc", 18x"108bc", 18x"107be",
+        18x"106c0", 18x"105c2", 18x"104c4", 18x"103c8", 18x"102cc", 18x"101d0", 18x"100d6", 18x"0ffdc",
+        18x"0fee2", 18x"0fdea", 18x"0fcf0", 18x"0fbf8", 18x"0fb02", 18x"0fa0a", 18x"0f914", 18x"0f81e",
+        18x"0f72a", 18x"0f636", 18x"0f542", 18x"0f44e", 18x"0f35a", 18x"0f268", 18x"0f176", 18x"0f086",
+        18x"0ef94", 18x"0eea4", 18x"0edb4", 18x"0ecc6", 18x"0ebd6", 18x"0eae8", 18x"0e9fa", 18x"0e90e",
+        18x"0e822", 18x"0e736", 18x"0e64a", 18x"0e55e", 18x"0e474", 18x"0e38a", 18x"0e2a0", 18x"0e1b8",
+        18x"0e0d0", 18x"0dfe8", 18x"0df00", 18x"0de1a", 18x"0dd32", 18x"0dc4c", 18x"0db68", 18x"0da82",
+        18x"0d99e", 18x"0d8ba", 18x"0d7d6", 18x"0d6f4", 18x"0d612", 18x"0d530", 18x"0d44e", 18x"0d36c",
+        18x"0d28c", 18x"0d1ac", 18x"0d0cc", 18x"0cfee", 18x"0cf0e", 18x"0ce30", 18x"0cd54", 18x"0cc76",
+        18x"0cb9a", 18x"0cabc", 18x"0c9e0", 18x"0c906", 18x"0c82a", 18x"0c750", 18x"0c676", 18x"0c59c",
+        18x"0c4c4", 18x"0c3ea", 18x"0c312", 18x"0c23a", 18x"0c164", 18x"0c08c", 18x"0bfb6", 18x"0bee0",
+        18x"0be0a", 18x"0bd36", 18x"0bc62", 18x"0bb8c", 18x"0baba", 18x"0b9e6", 18x"0b912", 18x"0b840",
+        18x"0b76e", 18x"0b69c", 18x"0b5cc", 18x"0b4fa", 18x"0b42a", 18x"0b35a", 18x"0b28a", 18x"0b1bc",
+        18x"0b0ee", 18x"0b01e", 18x"0af50", 18x"0ae84", 18x"0adb6", 18x"0acea", 18x"0ac1e", 18x"0ab52",
+        18x"0aa86", 18x"0a9bc", 18x"0a8f0", 18x"0a826", 18x"0a75c", 18x"0a694", 18x"0a5ca", 18x"0a502",
+        18x"0a43a", 18x"0a372", 18x"0a2aa", 18x"0a1e4", 18x"0a11c", 18x"0a056", 18x"09f90", 18x"09ecc",
+        -- 3.0 ... 3.9999
+        18x"09e06", 18x"09d42", 18x"09c7e", 18x"09bba", 18x"09af6", 18x"09a32", 18x"09970", 18x"098ae",
+        18x"097ec", 18x"0972a", 18x"09668", 18x"095a8", 18x"094e8", 18x"09426", 18x"09368", 18x"092a8",
+        18x"091e8", 18x"0912a", 18x"0906c", 18x"08fae", 18x"08ef0", 18x"08e32", 18x"08d76", 18x"08cba",
+        18x"08bfe", 18x"08b42", 18x"08a86", 18x"089ca", 18x"08910", 18x"08856", 18x"0879c", 18x"086e2",
+        18x"08628", 18x"08570", 18x"084b6", 18x"083fe", 18x"08346", 18x"0828e", 18x"081d8", 18x"08120",
+        18x"0806a", 18x"07fb4", 18x"07efe", 18x"07e48", 18x"07d92", 18x"07cde", 18x"07c2a", 18x"07b76",
+        18x"07ac2", 18x"07a0e", 18x"0795a", 18x"078a8", 18x"077f4", 18x"07742", 18x"07690", 18x"075de",
+        18x"0752e", 18x"0747c", 18x"073cc", 18x"0731c", 18x"0726c", 18x"071bc", 18x"0710c", 18x"0705e",
+        18x"06fae", 18x"06f00", 18x"06e52", 18x"06da4", 18x"06cf6", 18x"06c4a", 18x"06b9c", 18x"06af0",
+        18x"06a44", 18x"06998", 18x"068ec", 18x"06840", 18x"06796", 18x"066ea", 18x"06640", 18x"06596",
+        18x"064ec", 18x"06442", 18x"0639a", 18x"062f0", 18x"06248", 18x"061a0", 18x"060f8", 18x"06050",
+        18x"05fa8", 18x"05f00", 18x"05e5a", 18x"05db4", 18x"05d0e", 18x"05c68", 18x"05bc2", 18x"05b1c",
+        18x"05a76", 18x"059d2", 18x"0592e", 18x"05888", 18x"057e4", 18x"05742", 18x"0569e", 18x"055fa",
+        18x"05558", 18x"054b6", 18x"05412", 18x"05370", 18x"052ce", 18x"0522e", 18x"0518c", 18x"050ec",
+        18x"0504a", 18x"04faa", 18x"04f0a", 18x"04e6a", 18x"04dca", 18x"04d2c", 18x"04c8c", 18x"04bee",
+        18x"04b50", 18x"04ab0", 18x"04a12", 18x"04976", 18x"048d8", 18x"0483a", 18x"0479e", 18x"04700",
+        18x"04664", 18x"045c8", 18x"0452c", 18x"04490", 18x"043f6", 18x"0435a", 18x"042c0", 18x"04226",
+        18x"0418a", 18x"040f0", 18x"04056", 18x"03fbe", 18x"03f24", 18x"03e8c", 18x"03df2", 18x"03d5a",
+        18x"03cc2", 18x"03c2a", 18x"03b92", 18x"03afa", 18x"03a62", 18x"039cc", 18x"03934", 18x"0389e",
+        18x"03808", 18x"03772", 18x"036dc", 18x"03646", 18x"035b2", 18x"0351c", 18x"03488", 18x"033f2",
+        18x"0335e", 18x"032ca", 18x"03236", 18x"031a2", 18x"03110", 18x"0307c", 18x"02fea", 18x"02f56",
+        18x"02ec4", 18x"02e32", 18x"02da0", 18x"02d0e", 18x"02c7c", 18x"02bec", 18x"02b5a", 18x"02aca",
+        18x"02a38", 18x"029a8", 18x"02918", 18x"02888", 18x"027f8", 18x"0276a", 18x"026da", 18x"0264a",
+        18x"025bc", 18x"0252e", 18x"024a0", 18x"02410", 18x"02384", 18x"022f6", 18x"02268", 18x"021da",
+        18x"0214e", 18x"020c0", 18x"02034", 18x"01fa8", 18x"01f1c", 18x"01e90", 18x"01e04", 18x"01d78",
+        18x"01cee", 18x"01c62", 18x"01bd8", 18x"01b4c", 18x"01ac2", 18x"01a38", 18x"019ae", 18x"01924",
+        18x"0189c", 18x"01812", 18x"01788", 18x"01700", 18x"01676", 18x"015ee", 18x"01566", 18x"014de",
+        18x"01456", 18x"013ce", 18x"01346", 18x"012c0", 18x"01238", 18x"011b2", 18x"0112c", 18x"010a4",
+        18x"0101e", 18x"00f98", 18x"00f12", 18x"00e8c", 18x"00e08", 18x"00d82", 18x"00cfe", 18x"00c78",
+        18x"00bf4", 18x"00b70", 18x"00aec", 18x"00a68", 18x"009e4", 18x"00960", 18x"008dc", 18x"00858",
+        18x"007d6", 18x"00752", 18x"006d0", 18x"0064e", 18x"005cc", 18x"0054a", 18x"004c8", 18x"00446",
+        18x"003c4", 18x"00342", 18x"002c2", 18x"00240", 18x"001c0", 18x"00140", 18x"000c0", 18x"00040"
         );
 
     -- Left and right shifter with 120 bit input and 64 bit output.
@@ -424,9 +530,17 @@ begin
 
     -- synchronous reads from lookup table
     lut_access: process(clk)
+        variable addrhi : std_ulogic_vector(1 downto 0);
+        variable addr   : std_ulogic_vector(9 downto 0);
     begin
         if rising_edge(clk) then
-            inverse_est <= '1' & inverse_table(to_integer(unsigned(r.b.mantissa(53 downto 46))));
+            if r.is_sqrt = '1' then
+                addrhi := r.b.mantissa(55 downto 54);
+            else
+                addrhi := "00";
+            end if;
+            addr := addrhi & r.b.mantissa(53 downto 46);
+            inverse_est <= '1' & inverse_table(to_integer(unsigned(addr)));
         end if;
     end process;
 
@@ -488,6 +602,8 @@ begin
         variable pcmpb_eq    : std_ulogic;
         variable pcmpb_lt    : std_ulogic;
         variable pshift      : std_ulogic;
+        variable renorm_sqrt : std_ulogic;
+        variable sqrt_exp    : signed(EXP_BITS-1 downto 0);
     begin
         v := r;
         illegal := '0';
@@ -519,6 +635,7 @@ begin
             v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN);
             v.is_subtract := '0';
             v.is_multiply := '0';
+            v.is_sqrt := '0';
             v.add_bsmall := '0';
             adec := decode_dp(e_in.fra, int_input);
             bdec := decode_dp(e_in.frb, int_input);
@@ -599,6 +716,7 @@ begin
         msel_inv <= '0';
         set_y := '0';
         pshift := '0';
+        renorm_sqrt := '0';
         case r.state is
             when IDLE =>
                 if e_in.valid = '1' then
@@ -654,6 +772,9 @@ begin
                         when "11001" =>
                             v.is_multiply := '1';
                             v.state := DO_FMUL;
+                        when "11010" =>
+                            v.is_sqrt := '1';
+                            v.state := DO_FRSQRTE;
                         when others =>
                             illegal := '1';
                     end case;
@@ -1157,6 +1278,48 @@ begin
                         arith_done := '1';
                 end case;
 
+            when DO_FRSQRTE =>
+                opsel_a <= AIN_B;
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                v.shift := to_signed(1, EXP_BITS);
+                case r.b.class is
+                    when FINITE =>
+                        v.result_exp := r.b.exponent;
+                        if r.b.negative = '1' then
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                            arith_done := '1';
+                        elsif r.b.mantissa(54) = '0' then
+                            v.state := RENORM_B;
+                        elsif r.b.exponent(0) = '0' then
+                            v.state := RSQRT_1;
+                        else
+                            v.state := RENORM_B2;
+                        end if;
+                    when NAN =>
+                        -- result is B
+                        arith_done := '1';
+                    when INFINITY =>
+                        if r.b.negative = '1' then
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                        else
+                            v.result_class := ZERO;
+                        end if;
+                        arith_done := '1';
+                    when ZERO =>
+                        v.result_class := INFINITY;
+                        zero_divide := '1';
+                        arith_done := '1';
+                end case;
+
             when RENORM_A =>
                 renormalize := '1';
                 v.state := RENORM_A2;
@@ -1184,11 +1347,16 @@ begin
 
             when RENORM_B =>
                 renormalize := '1';
+                renorm_sqrt := r.is_sqrt;
                 v.state := RENORM_B2;
 
             when RENORM_B2 =>
                 set_b := '1';
-                v.result_exp := r.result_exp + r.shift;
+                if r.is_sqrt = '0' then
+                    v.result_exp := r.result_exp + r.shift;
+                else
+                    v.result_exp := new_exp;
+                end if;
                 v.state := LOOKUP;
 
             when RENORM_C =>
@@ -1287,8 +1455,10 @@ begin
                 v.first := '1';
                 if r.insn(4) = '0' then
                     v.state := DIV_2;
-                else
+                elsif r.insn(2) = '0' then
                     v.state := FRE_1;
+                else
+                    v.state := RSQRT_1;
                 end if;
 
             when DIV_2 =>
@@ -1367,6 +1537,14 @@ begin
                 v.shift := to_signed(1, EXP_BITS);
                 v.state := NORMALIZE;
 
+            when RSQRT_1 =>
+                opsel_r <= RES_MISC;
+                misc_sel <= "0111";
+                sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1);
+                v.result_exp := - sqrt_exp;
+                v.shift := to_signed(1, EXP_BITS);
+                v.state := NORMALIZE;
+
             when INT_SHIFT =>
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
@@ -1807,6 +1985,10 @@ begin
 
         if renormalize = '1' then
             clz := count_left_zeroes(r.r);
+            if renorm_sqrt = '1' then
+                -- make denormalized value end up with even exponent
+                clz(0) := '1';
+            end if;
             v.shift := resize(signed('0' & clz) - 9, EXP_BITS);
         end if;
 
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index 06da475..d9c5c06 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1244,6 +1244,53 @@ int fpu_test_20(void)
 	return trapit(0, test20);
 }
 
+struct isqrtvals {
+	unsigned long val;	/* frsqrte operand: raw 64-bit image that test21 loads with lfd */
+	unsigned long inv;	/* expected raw 64-bit image that test21 reads back after stfd */
+} isqrtvals[] = {
+	{ 0x0000000000000000, 0x7ff0000000000000 },	/* +0 -> +Inf */
+	{ 0x8000000000000000, 0xfff0000000000000 },	/* -0 -> -Inf */
+	{ 0xfff0000000000000, 0x7ff8000000000000 },	/* -Inf -> quiet NaN */
+	{ 0x7ff0000000000000, 0x0000000000000000 },	/* +Inf -> +0 */
+	{ 0xfff123456789abcd, 0xfff923456789abcd },	/* signalling NaN -> same NaN with the quiet bit set */
+	{ 0x3ff0000000000000, 0x3feff80000000000 },	/* 1.0 -> just under 1.0 */
+	{ 0x4000000000000000, 0x3fe69dc800000000 },	/* 2.0 -> ~0.707 */
+	{ 0x4010000000000000, 0x3fdff80000000000 },	/* 4.0 -> just under 0.5 */
+	{ 0xbff0000000000000, 0x7ff8000000000000 },	/* -1.0 -> quiet NaN */
+	{ 0x4008000000000000, 0x3fe2781800000000 },	/* 3.0 -> ~0.577 */
+	{ 0x7fd0000000000000, 0x1ffff80000000000 },	/* 2^1022 -> just under 2^-511 */
+	{ 0x0008000000000000, 0x5fe69dc800000000 },	/* denormal 2^-1023 -> ~2^511.5 */
+	{ 0x0004000000000000, 0x5feff80000000000 },	/* denormal 2^-1024 -> just under 2^512 */
+	{ 0x0002000000000000, 0x5ff69dc800000000 },	/* denormal 2^-1025 -> ~2^512.5 */
+	{ 0x0000000000000002, 0x61769dc800000000 },	/* denormal 2^-1073 -> ~2^536.5 */
+	{ 0x0000000000000001, 0x617ff80000000000 },	/* smallest denormal 2^-1074 -> just under 2^537 */
+};
+
+int test21(long arg)	/* arg is not used by this test */
+{
+	long i;
+	unsigned long result;
+	struct isqrtvals *vp = isqrtvals;
+	/* Run frsqrte on every isqrtvals[] entry; the result must match vp->inv bit-for-bit. */
+	set_fpscr(FPS_RN_NEAR);	/* NOTE(review): presumably selects round-to-nearest -- confirm */
+	for (i = 0; i < sizeof(isqrtvals) / sizeof(isqrtvals[0]); ++i, ++vp) {
+		asm("lfd 6,0(%0); frsqrte 7,6; stfd 7,0(%1)"
+		    : : "b" (&vp->val), "b" (&result) : "memory");	/* f6 = val; f7 = frsqrte(f6); result = f7 */
+		if (result != vp->inv) {
+			print_hex(i, 2, " ");	/* index of the failing table entry */
+			print_hex(result, 16, " ");	/* value actually produced */
+			return i + 1;	/* 1-based index of the failing entry, so it never collides with 0 */
+		}
+	}
+	return 0;	/* every entry matched */
+}
+
+int fpu_test_21(void)
+{
+	enable_fp();	/* NOTE(review): presumably sets MSR[FP] so the FP insns in test21 do not trap -- confirm */
+	return trapit(0, test21);
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1289,6 +1336,7 @@ int main(void)
 	do_test(18, fpu_test_18);
 	do_test(19, fpu_test_19);
 	do_test(20, fpu_test_20);
+	do_test(21, fpu_test_21);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 0f9e03a588b27413f4d3522e6be6362f86d9204d..0253720609c301172916dc31fbe1acd21c7c41bd 100755
GIT binary patch
delta 3862
zcmb7H3s6+o89w*It}7xgA`7f6>@LVlL`2jJEV#TBkwJVEsPO^u8J%XdrUqMNiL=S5
z;}Ff?Qfzf9qD;q{3=U}~X|07c&7jd1XA%b=;T5#2Qi9fywLWgYvwMM6X`P;#!~cKZ
z|2Y3W=ihTLEq@VOZVFM1S<@NIFaLa8{w~N3Y)5`MV`bpBg4+si>xSwbhoWW+&4*Sd
ziQ1$yHE&MxZ_kocEFva{y_G4;S?Tds(rFjYYA~lTcKdq2EH7rr!Kbz%E5-UQ=g)t^
z*oEuu&)BF&JJke_Ja`NgR@)fcf4xre(7Lj*vfu`~pbejh6?wkCsC5;NdK%JZ^BR1a
z5t&(8qgU*1M}2d?{}%$M8%0f}Q|@^r%f0iNm7)TEECxr?uL7cl2KqE$W~}0}I)de$
zkxfonZb`fS)LpZ-p*KyI_gF+$CLvTmB~C1gpaoO@h)^+bx=9H&m|0${EA;q$xX)*-
zCC#_+PicKtDh!+}`kLvTz$L@ZS|&Jq;5`0`yqM=k!IFKoU3&ixoeGK;+9^yICbkt&
zhOSUNyO?Sqk_)I)7cR`Ar@DkByOHJLDftkp1^i)F3sVXi&VBUJXzt`pS`#!KdG`nD
zRZWF<=^@<<iZMJFG0`KUeF+V?BJ{y})f+#wORw<=<8u+a|2?`!!7Fyj#GSCrDFg*C
zvrAXWKjejF{`qh;C^C!uLU>`RA$nECa=WyMM?AmGYLAFjE9{cMoe*>ZL3uy0OP|mz
z{R_)na*i&;iYY<QmszRTt3EEaOR-}`q<chcE1`3)h&zZlP-K@5Q>*@k!ruRWbd7CG
z?b6b*oHlqkMV8v7@!U!4v6Gk42?p9Xb@TJ&NfJV*3o~d&XenxcCv>H5uYRn9r=id&
zRRK)r$z%xCD5=j=u3@d1tD{CkM2Nzrhp@bQEQiLy52uR%aC!|1V#h|B9JWCBAlM_j
z*Kk|W>tS1T-5b|Q2K*w7tlS~X$1Pe`Eg@<kY=M{@Omo9aP+(1XkZ_n9!}BL|Z<VXJ
zk6XOmo}w{Mv;<LtaVk9XjcbKp(<!5C<!i94*OA)98$ZFcj90VA6eoThNFTVkE2c)V
zdm~kub=*{IULYQrLY-#5<hf~PvPAmQ`UnfR9gm1Zr6;BBXc&Rnqz}ift1MzBX>S1i
zEmBX9BMc)k6NMv`ADP9YY9iwTdDPoDf6u5R)Dtn49Fgvz1X>mCM_JRNc*vG%apIT$
zs4tvmL}*+|X9{&RFfEB&%~5e;s6Q1(@h9RX)<s!4e+~S4tw&Ts^i1s7nrPQP)I`UL
zO_S&k(Mps-%dE;>wrF?WFZqqxon~YLzd75I=||DwGdfUp#iMU*iLBoF-2?aOKzH!d
zxe4s~G>_?%=op*Q2ZM6!BYq!NOs%Sv<yS2}NbSI(@ymm^V>*JbJbH@6Hpz#{;d~=Q
zCb+tztZ2|ynaI|;Hdm!^skp!%`nc_h*znDM9-SmIHYl4?YP_vas4T`X+@HUvw_}=Z
zgLnjAw#Ukw$_bs9p7}3+^`^yhE(Q%ZmX85G8L%;Carr&(qd}gB_j#UlmtA7z)noWf
zmcD|Mz^Yu2a}1?7(fQZ|!!>?QtK-J5@wIy|uQ3-sFRRhy%JY&MpA0@*V+Ykw@Ap;C
zQot+k7j!s&ws4Ft#U~5Lo5dOPywG{6i5WJw$#tj>9QNHtFHPmyyF8Qo9mx+~13POl
zA36!1Z?_8DrrqT3s;l4{#h;=<+`so=@PBo2KO5nU;Ld>a?~xb(1Q(qZJIMX{?+Hg>
z;~h1!_+B`=URwX-wH4y!?jrtL6wp=LY~1(w02qD+_XFR<MZ~ce>@m4^sYk)y!0o`7
z6|7>mGAVe0U20VbH5@Dp?a~<qj{~*?*DF{9d=dDNg2w|-$+JsuDcA>i74UWiPXImv
zT&`eW;C|qx3Z7^Kk-W$*Wh#Upa3%081y2J04A`V#EwFFCUGi73Kky1*LBW%Oj{x6O
zJ{$so`+$2CJOwy@F?K+~!GR#&0MQD<h_m4rzu54q<2{{$e;08Ws(e;y8t7Egn`5v^
zvv;7I8fPa67wN0n{8f~YTr#ntS7OHeawE&%g~<1k({dZkw@-Vs%2P|&ZVhztQ_U`K
z7MtS9dErX)9pS7lKkYM^CeK;OKTJj|i;QP1b{}K=W^xW|?VMBG+&;%!utj#ravw-O
zw3u*Y55N_10)ul_%*3m*JP64L=T>0)UX$hDasq>Mn=y}pGeYvgxk}8_;37GJ!R0~B
zv*2zpM$6}!#BJAU=ez{5@&+|RJh(wu=7ou&U(*nTv6FOCn3&f|b0M~MQZYnbC+&o2
z>7-f+M<;y*q3xn85K<QnL6mioE+tId+eLFBn!2bMqN9s;La=VCg^1~<k06S==}JnN
zt)g3&T_1$Gen$Vys)c?mOPz^ALorM7B0mOu1$36{mk9eZKbAdWuYle8U+jDn>R>PG
zk!7BX4<qJjaHZh3aUy3tEl&+pw}U&;L$9Zr)lR6N^-yhUoI1uE5xw+zYLt2f)Vf}}
z4~I&q2frbGnpu4UYGfZ}LA?mIyN|Y{MX8-oSKXvqutpVQKfgui(!%0$psx6bEcf$e
z7y1?9QGO$A^6_JtUFysU?$S2HC)7cOmN4}Js7-FQ32M%5&c=QTwd1xdf9RQH^|MKw
z@Yzh;E#c}=HFoC?)mqFUIZzAkj&?{65|n}~rOTFV^)YCUe$uCBB{-l~42<p=-^It^
zs=@IfJ_<0k<51rK?N85Fl|l8TtLf2!2cWgwm*s=_tjBTj;}lCMFe4&1^nonz<*^t%
zCGYlw;Ov~hh{3dSg~~Ig2PS_f%bekZdsk!TQGG^$s%!|&(A=7_QD|<@DEYtt6btn3
zE3wT*RyDrnay0Zo_IUb!*(B;*qs4z~mwvZzQ*OjP*)1Q+SQTRs{|;d|zjaHc&bFhs
z6s)zn?Oh5U{Id%)r_C)jE_-kV{&{nab~o>o!xg(rv}GT?wbo5|yM^;x1>+M0qwVO>
jTDRslw;kPC&bN7FIBn&0SqP`Um-<snj=H(3B+KVNVG~f*

delta 3338
zcmai0eNa@_6~FJnt_uNNWck{#%kBcpr-(|(!(EmIKZ4kjH8D|<C`f4${X<hR*aC~r
zYe${Ls3%R$mxUd|Sf_TXI2k(~8BCpyF}B4&>X52@sVOK@Ok#`G^7^~WLnzYb&fGoc
z_q*qud+xpGzPp{bh0fm#cE;R!j1^b>VNUUG$SStExPq~Aa9hD`1-G@TY4c(GV!?N~
zbdhLSc<SIAbHh6eB@IhRS;qDhC`w^ohKHi`FD&%gav2-CHmoSC7-I0Q_Z8;4-{$;h
zpEGvpTGxM#(Lw#QR#+6lVx*+W%h-WyM^y{GKN>5K@zEuHd?;q*@ow2G>js<YANr*{
zhY)5#WLDl5klHZV++H01xxnc*QCB~v^gmLRfqdp6d&KLaK7rnkNEUqbX~cq5)#Mc$
zD>|iE#}uX0F*H7rv)(u0P?Wt+k(EnuH5$^yA{(tR{0^>SWV%&#jkL0&^L|%fKFsqO
z>vUXr?VRJ5he{%I#6T<kH1fqsV?7hb`e7XOZ)FvajfACr+hpnF9Xb)6EOe4N$}BdO
zkt?c1+_`}cLJV)9o~U@CfX1UT7uCnJBCL`(p-I3Gb32)u$Ykt~9!<s0UQHXL(-HST
zv{6&NQI;Ok{pb|a43E%h9{V=Yh~L8)W7O1D%hDF^VVU94@UN*kJT<ap<wlt11f04m
zWa$eEk9}&Idk?3a5?`XQSf1FDSfeJZQkGV7kD1fFHqFEEJy{aC5u7f;iB-wcZ)u_N
zscHT=HZ{%JD#|qSX-bVo&GzrhQtE6TdDA>nen1_5k2~;KRxV5L(Rt%jiJkoC)Ep^T
z?`m$OpSDR?aL843<=N_)<ULv$moC&&ZQMGf^{2SfsEe_)ohi9YcFmg@($E%DoK6iL
zL~ofkiF+bx#FP-Lat$%8s0GuZCU7Sj`JG5JXNq05RBB!!WK)aTfSEoq*F<&KR!SyR
zw}n*<DvHmkXH62!a^hErJEN&Fe(hvPeq*AUJGU)hWdjd;{3wmY7tiDV8h_>boPiU*
z#g??7`Y2W;FvTI$4ilg1h-H)TK7D2J=U}s@iLH@TV&&Dn%lZ<}>k_yz1N9`u2Z@t4
zOlDh~vAgzTsqWED?>Us$j3j$(E5yqYWKUR&IE@K4JkFhjG_2`kX$Q(gU|#9WoXUC+
zvq~4|QCi~je!EPu`U5(WSjf$<CQg|D1jjwy{CYUdjg*ySf=fx#0$QIEN(Ykc+~;&s
znqZ(CNxbSxk}}27pHq%~5g)0sr-@~Hde_bu)?wewIa@LoI~NP%Tw{WbjwXjvLsBT6
zPPX|oNKOkK4V&04Jxk4Pr51Y}OnXv{-|fTZ@+-_M=l(a!s;iI2y;)dPFjM0B!h)Xh
zmr1#2)LWm$D*cOn5In&x!DnPh)w8exWwbmsd$MMJNqbV;y@S}N5VqIN&wUg+Kl=w?
z^DmtvgL5gUiP>zfz{Z6!#+-h;nKnAaIq8p+=O33?Mbj)c5z>Ab39QkdcQbNq4WVNh
zHI$WZolNu!txumlQPWr7p6J8--=64SV8ht|O>~}g3P#fty-Xiv42P(fO~5m=pMExf
zvG5LkHa}ZvX&;~ed;pFxXQ9hGHgSZ5xITB=0(AA;hk2xlcS%t;JK3&EiTP}Y*MsMK
zuEE~vc1@K0Tj1KnAJtFZOFj7Ae(uLlxEOF%;6ewKRkip6xv`Dhp5FnSgO}IOlgXVh
z<nIL9|9mQf0w(H*KScs+NdGdvv3PVRpS@w=`}kmR9DwnZS7fPQ#eu+Gz*khPVcv2n
z=2clbuM%1qIM>S3DHYEFmVsMTtOLFbd|1Unz=n0Qv`59kz-7QYRQwEZD{zI1Lx6{Y
zSF1SG3nF{HEET9k7;ruCLKV*iz5r}hu^u?2RF=Y391gq&SWxji;17ZCtB-XA@Gane
z6&rvvHed&UEjS7twz^#OZ5})lW#a!mIE+*Oy6AlL$>KL=VO`ljMD?7`&J?<+KbyY^
zOP*UB>Kl-lrMSYvig&~F*mI8MKHJc*0$Kfu7g>W2y7;MXcObqX)42eAu|uA4ZoeJ>
z%1qdBOy-;y_~Sp7Sz-obsrwn*zkqYF*qifeAXnSFa{>i##utim3y?QGI;`1aa2`&e
zan6k{{Z&y$A$jB68g#=oMLEt1G|p9{9|vcF<c)Ln=x4wsasrLZL+BU4U1y9oNLI1#
zIyFj}qI`pX4e|H}^+;whv4_SXQhLdpYZg7d<btT{rHv5Hz0?SCv6qfPjP_CoL`)y`
zK)Cy89HP39%uCGTp+0g!ob98H5Pf~r2%+nzV-Q*W)B&-spL!sg`pN7tYS#rY*4<Ao
zhtX9BHETdo;&|R@7W6m4N#M3~Vp&iD`-QMH%pG3P=m}$mz1jl4yx+h^9-wy}Hf=Z5
z)&c5pq-k}57`jO#4!brRYTZpTI&IoAsM~K*AygS^;BBgbit}Whx9Nz}uI+~E{E|Ar
zG7V!p|3+Ur&Dun$*@G0EXVw%!eN3zK;<ekL9U7v{JhS!?)Wkbn)trHPmd;_k8(Ps_
z8pyN7GA&~t-kmzBWys9{?l_rTOSNUtq<ds`xfbywYz4eHGPMt!I}5H09CzgHJLs3e
zvA@$*SFt8;4r67Ml%E_~1Wo_7qVWH+Y<ZXkTP40s75NFNW#1^uK9Iceki7D`z{#9I
zOF?(@PABuzQ+Ga46wdI*t((yIfy>}=(Kz=GdULzreqErqx23jwJlbgL6~gJyTf(Vx
Ljkdk9x-j@Z46(Zz

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index aea206f..b6bc733 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -18,3 +18,4 @@ test 17:PASS
 test 18:PASS
 test 19:PASS
 test 20:PASS
+test 21:PASS

From c350bc1f25733d2dc2a6ce6f23172b78744cb9b1 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 31 Jul 2020 12:02:55 +1000
Subject: [PATCH 24/30] FPU: Implement fsqrt[s] and add a test for fsqrt

This implements the floating-point square-root calculation using a
table lookup of the inverse square root approximation, followed by
three iterations of Goldschmidt's algorithm, which gives estimates of
both sqrt(FRB) and 1/sqrt(FRB).  Then the residual is calculated as
FRB - R * R, where R is the sqrt(FRB) estimate, and that is multiplied
by the 1/sqrt(FRB) estimate to get an adjustment to R.  The residual
and the adjustment can be negative, and since we have an unsigned
multiplier, the upper bits can be wrong.  In practice the adjustment
fits into an 8-bit signed value, and the bottom 8 bits of the
adjustment product are correct, so we sign-extend them, divide by 4
(because R is in 10.54 format) and add them to R.

Finally the residual is calculated again and compared to 2*R+1 to see
if a final increment is needed.  Then the result is rounded and
written back.

This implements fsqrts as fsqrt, but with rounding to single precision
and underflow/overflow calculation using the single-precision exponent
range.  This could be optimized later.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   2 +
 fpu.vhdl                   | 217 ++++++++++++++++++++++++++++++++++++-
 tests/fpu/fpu.c            |  48 ++++++++
 tests/test_fpu.bin         | Bin 29376 -> 29632 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 262 insertions(+), 6 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 7163ff9..e821469 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -419,6 +419,7 @@ architecture behaviour of decode1 is
         2#10010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fdivs
         2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
         2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
+        2#10110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsqrts
         2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
         2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
         2#11010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- frsqrtes
@@ -477,6 +478,7 @@ architecture behaviour of decode1 is
         2#0010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fdiv
         2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
         2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
+        2#0110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsqrt
         2#0111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsel
         2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
         2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
diff --git a/fpu.vhdl b/fpu.vhdl
index 0cbd43f..244454e 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -40,7 +40,7 @@ architecture behaviour of fpu is
                      DO_FMR, DO_FMRG, DO_FCMP,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
-                     DO_FADD, DO_FMUL, DO_FDIV,
+                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT,
                      DO_FRE, DO_FRSQRTE,
                      DO_FSEL,
                      FRI_1,
@@ -51,6 +51,9 @@ architecture behaviour of fpu is
                      DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                      FRE_1,
                      RSQRT_1,
+                     SQRT_1, SQRT_2, SQRT_3, SQRT_4,
+                     SQRT_5, SQRT_6, SQRT_7, SQRT_8,
+                     SQRT_9, SQRT_10, SQRT_11, SQRT_12,
                      INT_SHIFT, INT_ROUND, INT_ISHIFT,
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
@@ -140,6 +143,7 @@ architecture behaviour of fpu is
     constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
     constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
     constant BIN_MASK : std_ulogic_vector(1 downto 0) := "10";
+    constant BIN_PS6  : std_ulogic_vector(1 downto 0) := "11";
 
     constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
     constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
@@ -604,6 +608,7 @@ begin
         variable pshift      : std_ulogic;
         variable renorm_sqrt : std_ulogic;
         variable sqrt_exp    : signed(EXP_BITS-1 downto 0);
+        variable shiftin     : std_ulogic;
     begin
         v := r;
         illegal := '0';
@@ -717,6 +722,7 @@ begin
         set_y := '0';
         pshift := '0';
         renorm_sqrt := '0';
+        shiftin := '0';
         case r.state is
             when IDLE =>
                 if e_in.valid = '1' then
@@ -765,6 +771,9 @@ begin
                             v.state := DO_FDIV;
                         when "10100" | "10101" =>
                             v.state := DO_FADD;
+                        when "10110" =>
+                            v.is_sqrt := '1';
+                            v.state := DO_FSQRT;
                         when "10111" =>
                             v.state := DO_FSEL;
                         when "11000" =>
@@ -1248,6 +1257,43 @@ begin
                 v.quieten_nan := '0';
                 arith_done := '1';
 
+            when DO_FSQRT =>
+                opsel_a <= AIN_B;
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                case r.b.class is
+                    when FINITE =>
+                        v.result_exp := r.b.exponent;
+                        if r.b.negative = '1' then
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                            arith_done := '1';
+                        elsif r.b.mantissa(54) = '0' then
+                            v.state := RENORM_B;
+                        elsif r.b.exponent(0) = '0' then
+                            v.state := SQRT_1;
+                        else
+                            v.shift := to_signed(1, EXP_BITS);
+                            v.state := RENORM_B2;
+                        end if;
+                    when NAN | ZERO =>
+                        -- result is B
+                        arith_done := '1';
+                    when INFINITY =>
+                        if r.b.negative = '1' then
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                        -- else result is B
+                        end if;
+                        arith_done := '1';
+                end case;
+
             when DO_FRE =>
                 opsel_a <= AIN_B;
                 v.result_class := r.b.class;
@@ -1454,7 +1500,11 @@ begin
                 -- wait one cycle for inverse_table[B] lookup
                 v.first := '1';
                 if r.insn(4) = '0' then
-                    v.state := DIV_2;
+                    if r.insn(3) = '0' then
+                        v.state := DIV_2;
+                    else
+                        v.state := SQRT_1;
+                    end if;
                 elsif r.insn(2) = '0' then
                     v.state := FRE_1;
                 else
@@ -1545,6 +1595,156 @@ begin
                 v.shift := to_signed(1, EXP_BITS);
                 v.state := NORMALIZE;
 
+            when SQRT_1 =>
+                -- put invsqr[B] in R and compute P = invsqr[B] * B
+                -- also transfer B (in R) to A
+                set_a := '1';
+                opsel_r <= RES_MISC;
+                misc_sel <= "0111";
+                msel_1 <= MUL1_B;
+                msel_2 <= MUL2_LUT;
+                f_to_multiply.valid <= '1';
+                v.shift := to_signed(-1, EXP_BITS);
+                v.count := "00";
+                v.state := SQRT_2;
+
+            when SQRT_2 =>
+                -- shift R right one place
+                -- not expecting multiplier result yet
+                opsel_r <= RES_SHIFT;
+                v.first := '1';
+                v.state := SQRT_3;
+
+            when SQRT_3 =>
+                -- put R into Y, wait for product from multiplier
+                msel_2 <= MUL2_R;
+                set_y := r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    -- put result into R
+                    opsel_r <= RES_MULT;
+                    v.first := '1';
+                    v.state := SQRT_4;
+                end if;
+
+            when SQRT_4 =>
+                -- compute 1.5 - Y * P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                msel_add <= MULADD_CONST;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    v.state := SQRT_5;
+                end if;
+
+            when SQRT_5 =>
+                -- compute Y = Y * P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= '1';
+                v.first := '1';
+                v.state := SQRT_6;
+
+            when SQRT_6 =>
+                -- pipeline in R = R * P
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.state := SQRT_7;
+                end if;
+
+            when SQRT_7 =>
+                -- first multiply is done, put result in Y
+                msel_2 <= MUL2_P;
+                set_y := r.first;
+                -- wait for second multiply (should be here already)
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    -- put result into R
+                    opsel_r <= RES_MULT;
+                    v.first := '1';
+                    v.count := r.count + 1;
+                    if r.count < 2 then
+                        v.state := SQRT_4;
+                    else
+                        v.first := '1';
+                        v.state := SQRT_8;
+                    end if;
+                end if;
+
+            when SQRT_8 =>
+                -- compute P = A - R * R, which can be +ve or -ve
+                -- we arranged for B to be put into A earlier
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.state := SQRT_9;
+                end if;
+
+            when SQRT_9 =>
+                -- compute P = P * Y
+                -- since Y is an estimate of 1/sqrt(B), this makes P an
+                -- estimate of the adjustment needed to R.  Since the error
+                -- could be negative and we have an unsigned multiplier, the
+                -- upper bits can be wrong, but it turns out the lowest 8 bits
+                -- are correct and are all we need (given 3 iterations through
+                -- SQRT_4 to SQRT_7).
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.state := SQRT_10;
+                end if;
+
+            when SQRT_10 =>
+                -- Add the bottom 8 bits of P, sign-extended,
+                -- divided by 4, onto R.
+                -- The division by 4 is because R is 10.54 format
+                -- whereas P is 8.56 format.
+                opsel_b <= BIN_PS6;
+                sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1);
+                v.result_exp := sqrt_exp;
+                v.shift := to_signed(1, EXP_BITS);
+                v.first := '1';
+                v.state := SQRT_11;
+
+            when SQRT_11 =>
+                -- compute P = A - R * R (remainder)
+                -- also put 2 * R + 1 into B for comparison with P
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                shiftin := '1';
+                set_b := r.first;
+                if multiply_to_f.valid = '1' then
+                    v.state := SQRT_12;
+                end if;
+
+            when SQRT_12 =>
+                -- test if remainder is 0 or >= B = 2*R + 1
+                if pcmpb_lt = '1' then
+                    -- square root is correct, set X if remainder non-zero
+                    v.x := r.p(58) or px_nz;
+                else
+                    -- square root needs to be incremented by 1
+                    carry_in <= '1';
+                    v.x := not pcmpb_eq;
+                end if;
+                v.state := FINISH;
+
             when INT_SHIFT =>
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
@@ -1828,8 +2028,12 @@ begin
         maddend := (others => '0');
         case msel_add is
             when MULADD_CONST =>
-                -- addend is 2.0 in 16.112 format
-                maddend(113) := '1';                -- 2.0
+                -- addend is 2.0 or 1.5 in 16.112 format
+                if r.is_sqrt = '0' then
+                    maddend(113) := '1';                -- 2.0
+                else
+                    maddend(112 downto 111) := "11";    -- 1.5
+                end if;
             when MULADD_A =>
                 -- addend is A in 16.112 format
                 maddend(121 downto 58) := r.a.mantissa;
@@ -1895,14 +2099,15 @@ begin
             when BIN_MASK =>
                 in_b0 := mask;
             when others =>
-                in_b0 := (others => '0');
+                -- BIN_PS6, 6 LSBs of P/4 sign-extended to 64
+                in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 2)), 64));
         end case;
         if opsel_binv = '1' then
             in_b0 := not in_b0;
         end if;
         in_b <= in_b0;
         if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
-            shift_res := shifter_64(r.r & x"00000000000000",
+            shift_res := shifter_64(r.r & shiftin & 55x"00000000000000",
                                     std_ulogic_vector(r.shift(6 downto 0)));
         else
             shift_res := (others => '0');
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index d9c5c06..b72b01e 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1291,6 +1291,53 @@ int fpu_test_21(void)
 	return trapit(0, test21);
 }
 
+struct sqrtvals {	/* fsqrt test vectors, as raw IEEE-754 double bit patterns */
+	unsigned long val;	/* operand given to fsqrt */
+	unsigned long inv;	/* expected fsqrt result (field name notwithstanding) */
+} sqrtvals[] = {
+	{ 0x0000000000000000, 0x0000000000000000 },	/* +0 -> +0 */
+	{ 0x8000000000000000, 0x8000000000000000 },	/* -0 -> -0 */
+	{ 0xfff0000000000000, 0x7ff8000000000000 },	/* -infinity -> default quiet NaN */
+	{ 0x7ff0000000000000, 0x7ff0000000000000 },	/* +infinity -> +infinity */
+	{ 0xfff123456789abcd, 0xfff923456789abcd },	/* signalling NaN -> same NaN, quietened */
+	{ 0x3ff0000000000000, 0x3ff0000000000000 },	/* 1.0 -> 1.0 */
+	{ 0x4000000000000000, 0x3ff6a09e667f3bcd },	/* 2.0 -> sqrt(2) */
+	{ 0x4010000000000000, 0x4000000000000000 },	/* 4.0 -> 2.0 */
+	{ 0xbff0000000000000, 0x7ff8000000000000 },	/* -1.0 -> default quiet NaN */
+	{ 0x4008000000000000, 0x3ffbb67ae8584caa },	/* 3.0 -> sqrt(3) */
+	{ 0x7fd0000000000000, 0x5fe0000000000000 },	/* 2^1022 -> 2^511 */
+	{ 0x0008000000000000, 0x1ff6a09e667f3bcd },	/* denormal 2^-1023 -> sqrt(2) * 2^-512 */
+	{ 0x0004000000000000, 0x1ff0000000000000 },	/* denormal 2^-1024 -> 2^-512 */
+	{ 0x0002000000000000, 0x1fe6a09e667f3bcd },	/* denormal 2^-1025 -> sqrt(2) * 2^-513 */
+	{ 0x0000000000000002, 0x1e66a09e667f3bcd },	/* denormal 2^-1073 -> sqrt(2) * 2^-537 */
+	{ 0x0000000000000001, 0x1e60000000000000 },	/* smallest denormal 2^-1074 -> 2^-537 */
+};
+
+int test22(long arg)	/* fsqrt test: returns 0 on success, else 1-based index of first failing sqrtvals entry */
+{
+	long i;
+	unsigned long result;
+	struct sqrtvals *vp = sqrtvals;
+
+	set_fpscr(FPS_RN_NEAR);	/* presumably selects round-to-nearest, which the inexact expected values need -- confirm */
+	for (i = 0; i < sizeof(sqrtvals) / sizeof(sqrtvals[0]); ++i, ++vp) {
+		asm("lfd 6,0(%0); fsqrt 7,6; stfd 7,0(%1)"	/* f6 = vp->val; f7 = fsqrt(f6); result = f7 */
+		    : : "b" (&vp->val), "b" (&result) : "memory");
+		if (result != vp->inv) {	/* bit-exact comparison against the expected pattern */
+			print_hex(i, 2, " ");	/* report the failing index ... */
+			print_hex(result, 16, " ");	/* ... and the value actually produced */
+			return i + 1;	/* non-zero so that entry 0 failing is distinguishable from success */
+		}
+	}
+	return 0;	/* every entry matched */
+}
+
+int fpu_test_22(void)	/* entry point for test 22 (fsqrt), as registered in main() */
+{
+	enable_fp();	/* NOTE(review): presumably enables MSR[FP] so FP instructions do not trap -- helper not visible here */
+	return trapit(0, test22);	/* runs test22 through trapit (defined elsewhere in fpu.c) and returns its result */
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1337,6 +1384,7 @@ int main(void)
 	do_test(19, fpu_test_19);
 	do_test(20, fpu_test_20);
 	do_test(21, fpu_test_21);
+	do_test(22, fpu_test_22);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 0253720609c301172916dc31fbe1acd21c7c41bd..e3783415a22e72bae0935c2959ab8c4a5bf5a316 100755
GIT binary patch
delta 2670
zcmai04Qx}_6~6Cb8;82-3&cOMWBbK%jBOm601Gd*;}}|kNg#M#HYn|=Q)m+cXjEbd
zSQ>lCs{&JL!f7XlXqu*`VvKgsu!^aW1%xJG>y}QG(S<lk2{h0sGi}zoqs`s9&mPbX
ztw*{#-}%mW?>+atdoA}~<$6En9E1ca2&u0B!_?|+=$|H=tLq5~Bex#8^~kM%I=Z>t
zF_$~mzG@DynR|5iUmhrp+sX;K-akfZ4M8;<LVRI);6=ut`y(Oe`}@9;gWZCu02M)0
zjMqjZgzW0yn^6ezY9wrkcb^yj%=6A7*f(P)7l+d`%5)p;BzTk-q$ut6UY|^FC&VE;
zw~P<lp(^)X)bZLm+TubIJSFS8Dk<GjNl34^4wh)We10K3uWeFASI49`Z@?k_Z19z;
z!J`v+VHx;!wY*{}?AFzC%`l)_&c9Ruv-Mk)soI$ICEV1{hatU&H-unZU!~l%DkiOy
z$xQ|gAAAB@3{}eTqcQ12$jO_(C$CP~`@NX7gay!+r@_iz$y=;!`+iLN9h4g9gJ{%n
zK`1sZR+1Gl$t80~kUI`1k-HFzNxy_ss0)}Fx6HIyxnX5Y%3|Df^B1%R&*j5TOX6cO
z>C_19GnH{4z>@qT_}ug;_d`Up_^TtZCI4+j$V7q=vgbq4oUh8o-~~N2nQeT#5q6tb
z^I3WrH@kVI5sVf;KiUARELDmVc_g^UVt^BtwYp@(8p({^7D@dmrCmOOL?x7!S*y6O
z;5qBN)0sf|H7lvhr${HOMR%6MK!O~h-WWV<Ho;!8l`U>7aPuBL)D~=K(RqxbI!F{*
zL2LVsG1+iH+Is6s<mA8pXSal{im%gxqwqAFccajaWgU>7{gT#jBqAM~x~4fq3Zzgj
zxb26S;-cNHWs0ArzNuIXF1razilR*Pc9EOkKOMK<#iAyMn;U_b9Y1G*Z6?;41GO`s
zmsgwVzJIHbK8`?K<#Bk;Q3*?(K9=0&baQ8bbFRoa*u}2=kXIxeyZUE6u+C{M9?hJp
zguIqoFp)kP;!)PeQjG6DfPm{f2qVtiWhrc`K)BTVc#n5}Mr$tKnv9g-QKu1VWyf{h
zM}q9Bg_4ox60$~afi}ZEg!dvVgG;b2&9KPj<<7xRUELAVctFyS9Rc>S)QDLXFMP`0
zlGnpHCw}F7Y1W~}&vL{k>;7A##&s6S>r}{jiPT5$p_w5Spok+aa>1Qg+!+mYc$%PO
zmXo^)kIlMwg}+UHdxh<&`L+r-%Io~!3a3j;P;_^NWAKXS3+`9&axwn!!MWmj-0R(w
z#apICghu%rji%`ECvQwqH>*Ek6Gi;?Buwd3J_F`S+}3{cM*49mu}32IoZayv)V{%Q
zZ~o7Vaw4B2b3eJwl_2*u&eaalnjhnv4`92Po?UG0cMiKCw>Q6lLMYLToU6NN&I&G{
z{XppOBAlp+Nk{Q@ia41S#>bx;-J=t;8P3oA+N$I5&fG}Woyb40i|d2oc}MP#e<TsH
zx?UvJ+fdc<9q*#J?fPK_X+E@^Jf}v?pH^>E;G4Lcn?eHRn%i7J*0Fb!*02~XWy`hl
zes&KbuAPMJe2`UynH^<6P%zkisZ7B|a{or@2!_lA9H=HE$b}ez#<&0u@e-v;^voDn
zhvO^c4l)9bag8|IKBQDc&x~=+IQAoFX9ODaZ8#1icbO1)Tq@whe}@)nHou`C-a<Fh
z4++V_Yc9hiI$;1T<rZEV06)6$0IWvWIsh%`jtoEtx=RDlgN`I1fzFwLNpztESRS(Q
zTNB_%*O`FT=*}jf1zj=$9q0^$(1R{82nlqJgD{D%Z4mb}DW?!}9Ff<k+>20v3a`n3
z98u#CrTOd}Xd;e3A-4&+O^kRXYXNzcdpJ9gv!ErEO%@KS7O>+cQFCGle&MyL3<^Rn
z4?&ODtqLNPhGE?6P;EwdWEhM-n`%G8gCD^{gl7=?u0RvQF@&iru*c_68I*)<9)TWY
z0|*aag=;>GYAwPI*I-(OMb(OM-p4Q-;bDYlWvJ{!_yl~n!m64;lpKYg3ga{pi!-PQ
ziC$-677KkC3UIQ*sthCQgyD+9yjDaLH|~7aM^LvPIRa+?!>SQPZDU~Z`%BsP_Dsdb
zjNfSz;{wPzkz*5?EyYoX+)DVsU!#g6I(8Erl@4tm!p2W2?c`9gC|krn@{dD(Wug1R
zXOy<GxoGk>=z|lK#$*PUM<2j(d;;F8bi2kd#VBS>9%bO4lzQ00XpDOahoPGb{52b1
z+~DdCg;v_&eNGRNCm(=|KQcmVgBdo4?u)$VjXcxQ`8oc}VI~A*m);Fw^7rXDg@3b{
zsRM_Wt{mO|&aI5TFT)Q#naPvPde(HTNIR0u#C#@y&B~`Ur5WNGOQ!bBgs|lwJ#QxV
PHP~k`f}*|m>vZbhTi11S

delta 2324
zcmai!4NOy46vywo@+!KnD=MXhQXZvXsajD{9#Wv4h!qf<%@4Mz0iEBQ!MPc9l@ik}
zY-0$XZ9%eSgo;_5nqs#^<44d;Go5p8aR#SC@ncq<OdZbZHt)_YM(1MUNlwoH{_el`
zoqOI(J1z?ymxMGz?74&#RU95zv<B-!vZ|<pkaA?pku68Id|}P1#<X!lpz-CgR6Vx6
z{)gwr_>9?vTsiX_W5opBJk`GEv+Wx=Kk^+RN6(yiY!B+y3CZYiqNBI8#zV;aXPO>3
z)REIjxz-mrsy<Gs)&$K%hYLP9ICO$$ZVGX>v*a*i9XVI-_dibU1J$qz)Rh8-!}g$$
z#*U3mG!o~5h_5xDvGw_cbmS;uYHSWw8sXj8WeV?Hue|*#Y>l&mNu#CTmO!4Sl<qBt
zdQGX23FkD^saFDP+-lKZ>Xq-p&A2QG#wqEE>ChWjD8BiUS6&){m0Be&c?GJqg`)Na
zuY4L}bXgm9GsLPFz4BDPfCimXm;;~brijj&UU@f+iO&KlUMUb56+cDXHOnifN0=X3
zCG101=km&%-~jsU37kz!m?E0Wyz(Gq;}k#Qy%O(0yneON295d=bkrr-uiqI}8&8~2
zZq`FXqW;+uTpZC5Cw~_gp*<ym9?`+6#CdeG1|}FRG*<`n4SBS45wsc#X`dFZ8mcwH
zMGNFaoT@}BLX5TA)TBm6M|Dymj^t3%o*~>I{=<s-W>o*?M#(~Laj;+7$`4zXY@x4e
z;FILFd^N;qp}S%s&v={$7lA58!(m!VA*~w*D^d>f)mW1yHn^xwUU`=l3&bOTHE@C3
zMUv%On46jab*2VxGNf9tw>J5m@Z+XWhrqnlX70F`YN01%@JUJVVQLn*(=s{TnPw66
za4ziwzF0OKTT@3Y`%*?gaJVgUwAmu~p*?*x_)U6nrYoVvoWVDKGF#~1D!6a9LNI;U
zv=H80O@uHXer_KtKK0jqPX_ion6M<?v+kMxS-Wm{+!<s+<R0P!Hj$LecL(?HXW-qI
zfWwj_d=2X?0nZJ*mXfTu^NEI0^N~CL;2(C*NY2doDxTy?L{A>>5n_w@{_ly#bMD^a
z=j2BAWl~Y|1kVupXEX_<I?`|pHs^-}BS1I@Ge<tTxurLs-rQvLJgvFp$T?4G?w=cv
zH`fW9M%@uUhDPf+p#=_EGlkZ`J?rWL9<n&{dO~5=b8X9jX!qW0{D%_${g86T=G&ri
zpZJW8ZOHjpMI4WMahqKA*9FgMWLxN3H|Ko(TI7c!?Cpn4Le`EfriT^3b%zz(aZ=pR
zze*f+4F5JBKD-u<kP#;dx<K-{*@99+GjD|Y=7%<FBo){C2mf->_`jCv8T4pXG`>|h
z2G_^?pY@%WiBwb}k)k!|+BzX;vd?&BTNH6`ole$8qoxO=*F=%@?EkU>#Gb8u$m|iH
z`mr@wvZlvIo@W|KNcww(yf>P6gwLqYniU9TMF}2LCu0{8@=#-<k@O*Rae%>@9W&!J
zV>ht!;A{q_@)yRw-~fZOC73P9B&<9*b7O8pX5s*Y*9OeJ$j%Z1%d?Yd<r%2Ywo><5
z*pH>}Ec9j@sQw(NCK{;J1y(H1E-1xP*##@GG<88emX0pifhE)h$FZopp%;tX4XPXi
zE$;>^mfCJ8#p3UV6<9jEp&knf!VWCvARNcy3PLZIsvxLr2HFyYQd_)OHGq&XylzX#
zYeMDfVNB1v$B;0$AuB^x$w9#&2l-r>q_W32s$DAbe7C~EFL?qzn|t7}&8P^Y{H6zb
zZ5D+&3Qsr>hTJs843tghp(xj=aHDLv0B@k&jM8)wno%A@8N3Jw(HllN=MwZHl|({5
z`V};J21NnN8JA%q$|WfEA()4<4yC_e`cW2K;Z!_{vJ<Z7C5d5FOCTmcNz^O&CCBHd
z=n7DkTzmB93h)AD$jV@4{v@#lRS0(HPZ||MRn_}w82k<YjjRS4{|%24O!YwQ0n+Wo
zigHxSTd>xirmsWUaf`7A0UeW763&axR*?dQ1HXe4vfiTaW2}}Rfx%Da-(mwYF9#TA
zOgk61fjTU)nYS6^hzGaVU^>C!7%G<k$yimO!f{v?sC3K`V8d(a0hg=HsQrEa_O9l|
PrlII04CDJRxgz9WV#Otp

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index b6bc733..9b97cb5 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -19,3 +19,4 @@ test 18:PASS
 test 19:PASS
 test 20:PASS
 test 21:PASS
+test 22:PASS

From c083b9507dafee86c5c0c14d6e1ef8c56a2b8a3f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 31 Jul 2020 16:46:12 +1000
Subject: [PATCH 25/30] FPU: Implement ftdiv and ftsqrt

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl |  2 ++
 fpu.vhdl     | 68 ++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index e821469..bd7f0f3 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -441,6 +441,8 @@ architecture behaviour of decode1 is
         2#000000000#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  0/0=fcmpu
         2#000000001#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  1/0=fcmpo
         2#000000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  2/0=mcrfs
+        2#000000100#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  4/0=ftdiv
+        2#000000101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  5/0=ftsqrt
         2#011000001#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/6=mtfsb1
         2#011000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/6=mtfsb0
         2#011000100#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/6=mtfsfi
diff --git a/fpu.vhdl b/fpu.vhdl
index 244454e..90670e9 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -37,7 +37,7 @@ architecture behaviour of fpu is
 
     type state_t is (IDLE,
                      DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,
-                     DO_FMR, DO_FMRG, DO_FCMP,
+                     DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
                      DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT,
@@ -51,6 +51,7 @@ architecture behaviour of fpu is
                      DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                      FRE_1,
                      RSQRT_1,
+                     FTDIV_1,
                      SQRT_1, SQRT_2, SQRT_3, SQRT_4,
                      SQRT_5, SQRT_6, SQRT_7, SQRT_8,
                      SQRT_9, SQRT_10, SQRT_11, SQRT_12,
@@ -105,6 +106,7 @@ architecture behaviour of fpu is
         is_sqrt      : std_ulogic;
         first        : std_ulogic;
         count        : unsigned(1 downto 0);
+        doing_ftdiv  : std_ulogic_vector(1 downto 0);
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -642,6 +644,8 @@ begin
             v.is_multiply := '0';
             v.is_sqrt := '0';
             v.add_bsmall := '0';
+            v.doing_ftdiv := "00";
+
             adec := decode_dp(e_in.fra, int_input);
             bdec := decode_dp(e_in.frb, int_input);
             cdec := decode_dp(e_in.frc, int_input);
@@ -659,8 +663,16 @@ begin
         r_lo_nz <= or (r.r(30 downto 2));
 
         if r.single_prec = '0' then
-            max_exp := to_signed(1023, EXP_BITS);
-            min_exp := to_signed(-1022, EXP_BITS);
+            if r.doing_ftdiv(1) = '0' then
+                max_exp := to_signed(1023, EXP_BITS);
+            else
+                max_exp := to_signed(1020, EXP_BITS);
+            end if;
+            if r.doing_ftdiv(0) = '0' then
+                min_exp := to_signed(-1022, EXP_BITS);
+            else
+                min_exp := to_signed(-1021, EXP_BITS);
+            end if;
             bias_exp := to_signed(1536, EXP_BITS);
         else
             max_exp := to_signed(127, EXP_BITS);
@@ -728,7 +740,13 @@ begin
                 if e_in.valid = '1' then
                     case e_in.insn(5 downto 1) is
                         when "00000" =>
-                            if e_in.insn(7) = '1' then
+                            if e_in.insn(8) = '1' then
+                                if e_in.insn(6) = '0' then
+                                    v.state := DO_FTDIV;
+                                else
+                                    v.state := DO_FTSQRT;
+                                end if;
+                            elsif e_in.insn(7) = '1' then
                                 v.state := DO_MCRFS;
                             else
                                 v.state := DO_FCMP;
@@ -804,6 +822,38 @@ begin
                 v.instr_done := '1';
                 v.state := IDLE;
 
+            when DO_FTDIV =>                     -- ftdiv: cr_result(2) = fg_flag, cr_result(1) = fe_flag
+                v.instr_done := '1';
+                v.state := IDLE;
+                v.cr_result := "0000";
+                if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or
+                    (r.b.class = FINITE and r.b.mantissa(54) = '0') then  -- B denormal: unit bit (54) clear
+                    v.cr_result(2) := '1';
+                end if;
+                if r.a.class = NAN or r.a.class = INFINITY or
+                    r.b.class = NAN or r.b.class = ZERO or r.b.class = INFINITY or
+                    (r.a.class = FINITE and r.a.exponent <= to_signed(-970, EXP_BITS)) then
+                    v.cr_result(1) := '1';
+                else
+                    v.doing_ftdiv := "11";       -- narrow both exponent limits for the range checks in FTDIV_1
+                    v.first := '1';
+                    v.state := FTDIV_1;
+                    v.instr_done := '0';         -- not finished yet: exponent range checks follow
+                end if;
+
+            when DO_FTSQRT =>                    -- ftsqrt: cr_result(2) = fg_flag, cr_result(1) = fe_flag
+                v.instr_done := '1';
+                v.state := IDLE;
+                v.cr_result := "0000";
+                if r.b.class = ZERO or r.b.class = INFINITY or
+                    (r.b.class = FINITE and r.b.mantissa(54) = '0') then  -- B denormal: unit bit (54) clear
+                    v.cr_result(2) := '1';
+                end if;
+                if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO
+                    or r.b.negative = '1' or r.b.exponent <= to_signed(-970, EXP_BITS) then
+                    v.cr_result(1) := '1';       -- flag must be set here; cr_result was cleared above
+                end if;
+
             when DO_FCMP =>
                 -- fcmp[uo]
                 v.instr_done := '1';
@@ -1587,6 +1637,16 @@ begin
                 v.shift := to_signed(1, EXP_BITS);
                 v.state := NORMALIZE;
 
+            when FTDIV_1 =>
+                v.cr_result(1) := exp_tiny or exp_huge;
+                if exp_tiny = '1' or exp_huge = '1' or r.a.class = ZERO or r.first = '0' then
+                    v.instr_done := '1';
+                    v.state := IDLE;
+                else
+                    v.shift := r.a.exponent;
+                    v.doing_ftdiv := "10";
+                end if;
+
             when RSQRT_1 =>
                 opsel_r <= RES_MISC;
                 misc_sel <= "0111";

From dc1544db691a82dccdd6f6d43224d833dd4a1433 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 1 Aug 2020 19:17:36 +1000
Subject: [PATCH 26/30] FPU: Implement floating multiply-add instructions

This implements fmadd, fmsub, fnmadd, fnmsub and their
single-precision counterparts.  The single-precision versions operate
the same as the double-precision versions until the final rounding and
overflow/underflow steps.

This adds an S register to store the low bits of the product.  S
shifts into R on left shifts, and can be negated, but doesn't do any
other arithmetic.

This adds a test for the double-precision versions of these
instructions.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl               |   8 ++
 fpu.vhdl                   | 244 +++++++++++++++++++++++++++++++++++--
 tests/fpu/fpu.c            |  71 +++++++++++
 tests/test_fpu.bin         | Bin 29632 -> 30416 bytes
 tests/test_fpu.console_out |   1 +
 5 files changed, 314 insertions(+), 10 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index bd7f0f3..5d6a557 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -423,6 +423,10 @@ architecture behaviour of decode1 is
         2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
         2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
         2#11010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- frsqrtes
+        2#11100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmsubs
+        2#11101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmadds
+        2#11110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmsubs
+        2#11111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmadds
         others => illegal_inst
         );
 
@@ -485,6 +489,10 @@ architecture behaviour of decode1 is
         2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
         2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
         2#1010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- frsqrte
+        2#1100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmsub
+        2#1101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmadd
+        2#1110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmsub
+        2#1111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmadd
         others => illegal_inst
         );
 
diff --git a/fpu.vhdl b/fpu.vhdl
index 90670e9..5e30386 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -40,13 +40,15 @@ architecture behaviour of fpu is
                      DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
-                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT,
+                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD,
                      DO_FRE, DO_FRSQRTE,
                      DO_FSEL,
                      FRI_1,
                      ADD_SHIFT, ADD_2, ADD_3,
                      CMP_1, CMP_2,
                      MULT_1,
+                     FMADD_1, FMADD_2, FMADD_3,
+                     FMADD_4, FMADD_5, FMADD_6,
                      LOOKUP,
                      DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                      FRE_1,
@@ -82,6 +84,7 @@ architecture behaviour of fpu is
         b            : fpu_reg_type;
         c            : fpu_reg_type;
         r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
+        s            : std_ulogic_vector(55 downto 0);  -- extended fraction
         x            : std_ulogic;
         p            : std_ulogic_vector(63 downto 0);  -- 8.56 format
         y            : std_ulogic_vector(63 downto 0);  -- 8.56 format
@@ -101,6 +104,7 @@ architecture behaviour of fpu is
         round_mode   : std_ulogic_vector(2 downto 0);
         is_subtract  : std_ulogic;
         exp_cmp      : std_ulogic;
+        madd_cmp     : std_ulogic;
         add_bsmall   : std_ulogic;
         is_multiply  : std_ulogic;
         is_sqrt      : std_ulogic;
@@ -117,6 +121,7 @@ architecture behaviour of fpu is
     signal opsel_a       : std_ulogic_vector(1 downto 0);
     signal opsel_b       : std_ulogic_vector(1 downto 0);
     signal opsel_r       : std_ulogic_vector(1 downto 0);
+    signal opsel_s       : std_ulogic_vector(1 downto 0);
     signal opsel_ainv    : std_ulogic;
     signal opsel_amask   : std_ulogic;
     signal opsel_binv    : std_ulogic;
@@ -127,6 +132,7 @@ architecture behaviour of fpu is
     signal lost_bits     : std_ulogic;
     signal r_hi_nz       : std_ulogic;
     signal r_lo_nz       : std_ulogic;
+    signal s_nz          : std_ulogic;
     signal misc_sel      : std_ulogic_vector(3 downto 0);
     signal f_to_multiply : MultiplyInputType;
     signal multiply_to_f : MultiplyOutputType;
@@ -152,6 +158,11 @@ architecture behaviour of fpu is
     constant RES_MULT  : std_ulogic_vector(1 downto 0) := "10";
     constant RES_MISC  : std_ulogic_vector(1 downto 0) := "11";
 
+    constant S_ZERO  : std_ulogic_vector(1 downto 0) := "00";
+    constant S_NEG   : std_ulogic_vector(1 downto 0) := "01";
+    constant S_SHIFT : std_ulogic_vector(1 downto 0) := "10";
+    constant S_MULT  : std_ulogic_vector(1 downto 0) := "11";
+
     -- msel values
     constant MUL1_A : std_ulogic_vector(1 downto 0) := "00";
     constant MUL1_B : std_ulogic_vector(1 downto 0) := "01";
@@ -163,9 +174,10 @@ architecture behaviour of fpu is
     constant MUL2_P   : std_ulogic_vector(1 downto 0) := "10";
     constant MUL2_R   : std_ulogic_vector(1 downto 0) := "11";
 
-    constant MULADD_ZERO : std_ulogic_vector(1 downto 0) := "00";
+    constant MULADD_ZERO  : std_ulogic_vector(1 downto 0) := "00";
     constant MULADD_CONST : std_ulogic_vector(1 downto 0) := "01";
     constant MULADD_A     : std_ulogic_vector(1 downto 0) := "10";
+    constant MULADD_RS    : std_ulogic_vector(1 downto 0) := "11";
 
     -- Inverse lookup table, indexed by the top 8 fraction bits
     -- The first 256 entries are the reciprocal (1/x) lookup table,
@@ -597,20 +609,22 @@ begin
         variable need_check  : std_ulogic;
         variable msb         : std_ulogic;
         variable is_add      : std_ulogic;
-        variable qnan_result : std_ulogic;
         variable longmask    : std_ulogic;
         variable set_a       : std_ulogic;
         variable set_b       : std_ulogic;
         variable set_c       : std_ulogic;
-        variable px_nz       : std_ulogic;
-        variable maddend     : std_ulogic_vector(127 downto 0);
         variable set_y       : std_ulogic;
+        variable set_s       : std_ulogic;
+        variable qnan_result : std_ulogic;
+        variable px_nz       : std_ulogic;
         variable pcmpb_eq    : std_ulogic;
         variable pcmpb_lt    : std_ulogic;
         variable pshift      : std_ulogic;
         variable renorm_sqrt : std_ulogic;
         variable sqrt_exp    : signed(EXP_BITS-1 downto 0);
         variable shiftin     : std_ulogic;
+        variable mulexp      : signed(EXP_BITS-1 downto 0);
+        variable maddend     : std_ulogic_vector(127 downto 0);
     begin
         v := r;
         illegal := '0';
@@ -657,10 +671,15 @@ begin
             if adec.exponent > bdec.exponent then
                 v.exp_cmp := '1';
             end if;
+            v.madd_cmp := '0';
+            if (adec.exponent + cdec.exponent + 1) >= bdec.exponent then
+                v.madd_cmp := '1';
+            end if;
         end if;
 
         r_hi_nz <= or (r.r(55 downto 31));
         r_lo_nz <= or (r.r(30 downto 2));
+        s_nz <= or (r.s);
 
         if r.single_prec = '0' then
             if r.doing_ftdiv(1) = '0' then
@@ -711,6 +730,7 @@ begin
         opsel_b <= BIN_ZERO;
         opsel_binv <= '0';
         opsel_r <= RES_SUM;
+        opsel_s <= S_ZERO;
         carry_in <= '0';
         misc_sel <= "0000";
         fpscr_mask := (others => '1');
@@ -725,6 +745,7 @@ begin
         set_a := '0';
         set_b := '0';
         set_c := '0';
+        set_s := '0';
         f_to_multiply.is_32bit <= '0';
         f_to_multiply.valid <= '0';
         msel_1 <= MUL1_A;
@@ -802,12 +823,15 @@ begin
                         when "11010" =>
                             v.is_sqrt := '1';
                             v.state := DO_FRSQRTE;
+                        when "11100" | "11101" | "11110" | "11111" =>
+                            v.state := DO_FMADD;
                         when others =>
                             illegal := '1';
                     end case;
                 end if;
                 v.x := '0';
                 v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
+                set_s := '1';
 
             when DO_MCRFS =>
                 j := to_integer(unsigned(insn_bfa(r.insn)));
@@ -1416,6 +1440,99 @@ begin
                         arith_done := '1';
                 end case;
 
+            when DO_FMADD =>
+                -- fmadd, fmsub, fnmadd, fnmsub
+                opsel_a <= AIN_A;
+                v.result_sign := r.a.negative;
+                v.result_class := r.a.class;
+                v.result_exp := r.a.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                is_add := r.a.negative xor r.c.negative xor r.b.negative xor r.insn(1);
+                if r.a.class = FINITE and r.c.class = FINITE and
+                    (r.b.class = FINITE or r.b.class = ZERO) then
+                    v.is_subtract := not is_add;
+                    mulexp := r.a.exponent + r.c.exponent;
+                    v.result_exp := mulexp;
+                    opsel_a <= AIN_B;
+                    -- Make sure A and C are normalized
+                    if r.a.mantissa(54) = '0' then
+                        opsel_a <= AIN_A;
+                        v.state := RENORM_A;
+                    elsif r.c.mantissa(54) = '0' then
+                        opsel_a <= AIN_C;
+                        v.state := RENORM_C;
+                    elsif r.b.class = ZERO then
+                        -- no addend, degenerates to multiply
+                        v.result_sign := r.a.negative xor r.c.negative xor r.insn(2);
+                        f_to_multiply.valid <= '1';
+                        v.is_multiply := '1';
+                        v.state := MULT_1;
+                    elsif r.madd_cmp = '0' then
+                        -- addend is bigger, do multiply first
+                        v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
+                        f_to_multiply.valid <= '1';
+                        v.state := FMADD_1;
+                    else
+                        -- product is bigger, shift B right and use it as the
+                        -- addend to the multiplier
+                        v.shift := r.b.exponent - mulexp + to_signed(64, EXP_BITS);
+                        -- for subtract, multiplier does B - A * C
+                        v.result_sign := not (r.a.negative xor r.c.negative xor r.insn(2) xor is_add);
+                        v.result_exp := r.b.exponent;
+                        v.state := FMADD_2;
+                    end if;
+                else
+                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
+                        (r.b.class = NAN and r.b.mantissa(53) = '0') or
+                        (r.c.class = NAN and r.c.mantissa(53) = '0') then
+                        -- Signalling NAN
+                        v.fpscr(FPSCR_VXSNAN) := '1';
+                        invalid := '1';
+                    end if;
+                    if r.a.class = NAN then
+                        -- nothing to do, result is A
+                    elsif r.b.class = NAN then
+                        -- result is B
+                        v.result_class := NAN;
+                        v.result_sign := r.b.negative;
+                        opsel_a <= AIN_B;
+                    elsif r.c.class = NAN then
+                        -- result is C
+                        v.result_class := NAN;
+                        v.result_sign := r.c.negative;
+                        opsel_a <= AIN_C;
+                    elsif (r.a.class = ZERO and r.c.class = INFINITY) or
+                        (r.a.class = INFINITY and r.c.class = ZERO) then
+                        -- invalid operation, construct QNaN
+                        v.fpscr(FPSCR_VXIMZ) := '1';
+                        qnan_result := '1';
+                    elsif r.a.class = INFINITY or r.c.class = INFINITY then
+                        if r.b.class = INFINITY and is_add = '0' then
+                            -- invalid operation, construct QNaN
+                            v.fpscr(FPSCR_VXISI) := '1';
+                            qnan_result := '1';
+                        else
+                            -- result is infinity
+                            v.result_class := INFINITY;
+                            v.result_sign := r.a.negative xor r.c.negative xor r.insn(2);
+                        end if;
+                    else
+                        -- Here A is zero, C is zero, or B is infinity
+                        -- Result is +/-B in all of those cases
+                        v.result_class := r.b.class;
+                        v.result_exp := r.b.exponent;
+                        if v.result_class /= ZERO or is_add = '1' then
+                            v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
+                        else
+                            -- have to be careful about rule for 0 - 0 result sign
+                            v.result_sign := (r.round_mode(1) and r.round_mode(0)) xor r.insn(2);
+                        end if;
+                        opsel_a <= AIN_B;
+                    end if;
+                    arith_done := '1';
+                end if;
+
             when RENORM_A =>
                 renormalize := '1';
                 v.state := RENORM_A2;
@@ -1426,8 +1543,16 @@ begin
                 if r.insn(4) = '1' then
                     opsel_a <= AIN_C;
                     if r.c.mantissa(54) = '1' then
-                        v.first := '1';
-                        v.state := MULT_1;
+                        if r.insn(3) = '0' or r.b.class = ZERO then
+                            v.first := '1';
+                            v.state := MULT_1;
+                        else
+                            v.madd_cmp := '0';
+                            if new_exp + 1 >= r.b.exponent then
+                                v.madd_cmp := '1';
+                            end if;
+                            v.state := DO_FMADD;
+                        end if;
                     else
                         v.state := RENORM_C;
                     end if;
@@ -1462,11 +1587,20 @@ begin
             when RENORM_C2 =>
                 set_c := '1';
                 v.result_exp := new_exp;
-                v.first := '1';
-                v.state := MULT_1;
+                if r.insn(3) = '0' or r.b.class = ZERO then
+                    v.first := '1';
+                    v.state := MULT_1;
+                else
+                    v.madd_cmp := '0';
+                    if new_exp + 1 >= r.b.exponent then
+                        v.madd_cmp := '1';
+                    end if;
+                    v.state := DO_FMADD;
+                end if;
 
             when ADD_SHIFT =>
                 opsel_r <= RES_SHIFT;
+                v.x := s_nz;
                 set_x := '1';
                 longmask := '0';
                 v.state := ADD_2;
@@ -1545,6 +1679,78 @@ begin
                     v.state := FINISH;
                 end if;
 
+            when FMADD_1 =>
+                -- Addend is bigger here
+                v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
+                -- note v.shift is at most -2 here
+                v.shift := r.result_exp - r.b.exponent;
+                opsel_r <= RES_MULT;
+                opsel_s <= S_MULT;
+                set_s := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.state := ADD_SHIFT;
+                end if;
+
+            when FMADD_2 =>
+                -- Product is potentially bigger here
+                set_s := '1';
+                opsel_s <= S_SHIFT;
+                v.shift := r.shift - to_signed(64, EXP_BITS);
+                v.state := FMADD_3;
+
+            when FMADD_3 =>
+                opsel_r <= RES_SHIFT;
+                v.first := '1';
+                v.state := FMADD_4;
+
+            when FMADD_4 =>
+                msel_add <= MULADD_RS;
+                f_to_multiply.valid <= r.first;
+                msel_inv <= r.is_subtract;
+                opsel_r <= RES_MULT;
+                opsel_s <= S_MULT;
+                set_s := '1';
+                v.shift := to_signed(56, EXP_BITS);
+                if multiply_to_f.valid = '1' then
+                    if multiply_to_f.result(121) = '1' then
+                        v.state := FMADD_5;
+                    else
+                        v.state := FMADD_6;
+                    end if;
+                end if;
+
+            when FMADD_5 =>
+                -- negate R:S:X
+                v.result_sign := not r.result_sign;
+                opsel_ainv <= '1';
+                carry_in <= not (s_nz or r.x);
+                opsel_s <= S_NEG;
+                set_s := '1';
+                v.shift := to_signed(56, EXP_BITS);
+                v.state := FMADD_6;
+
+            when FMADD_6 =>
+                if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then
+                    if s_nz = '0' then
+                        -- must be a subtraction, and r.x must be zero
+                        v.result_class := ZERO;
+                        v.result_sign := r.round_mode(1) and r.round_mode(0);
+                        arith_done := '1';
+                    else
+                        -- R is all zeroes but there are non-zero bits in S
+                        -- so shift them into R and set S to 0
+                        opsel_r <= RES_SHIFT;
+                        set_s := '1';
+                        -- stay in state FMADD_6
+                    end if;
+                elsif r.r(56 downto 54) = "001" then
+                    v.state := FINISH;
+                else
+                    renormalize := '1';
+                    v.state := NORMALIZE;
+                end if;
+
             when LOOKUP =>
                 opsel_a <= AIN_B;
                 -- wait one cycle for inverse_table[B] lookup
@@ -2097,6 +2303,9 @@ begin
             when MULADD_A =>
                 -- addend is A in 16.112 format
                 maddend(121 downto 58) := r.a.mantissa;
+            when MULADD_RS =>
+                -- addend is concatenation of R and S in 16.112 format
+                maddend := "000000" & r.r & r.s & "00";
             when others =>
         end case;
         if msel_inv = '1' then
@@ -2167,7 +2376,7 @@ begin
         end if;
         in_b <= in_b0;
         if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
-            shift_res := shifter_64(r.r & shiftin & 55x"00000000000000",
+            shift_res := shifter_64(r.r & (shiftin or r.s(55)) & r.s(54 downto 0),
                                     std_ulogic_vector(r.shift(6 downto 0)));
         else
             shift_res := (others => '0');
@@ -2230,6 +2439,21 @@ begin
                 result <= misc;
         end case;
         v.r := result;
+        if set_s = '1' then
+            case opsel_s is
+                when S_NEG =>
+                    v.s := std_ulogic_vector(unsigned(not r.s) + (not r.x));
+                when S_MULT =>
+                    v.s := multiply_to_f.result(57 downto 2);
+                when S_SHIFT =>
+                    v.s := shift_res(63 downto 8);
+                    if shift_res(7 downto 0) /= x"00" then
+                        v.x := '1';
+                    end if;
+                when others =>
+                    v.s := (others => '0');
+            end case;
+        end if;
 
         if set_a = '1' then
             v.a.exponent := new_exp;
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index b72b01e..52f21d0 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1338,6 +1338,76 @@ int fpu_test_22(void)
 	return trapit(0, test22);
 }
 
+struct fmavals {	/* one fused multiply-add test vector; all fields are raw 64-bit double bit patterns */
+	unsigned long ra;	/* FRA operand (loaded into f6 by test23) */
+	unsigned long rc;	/* FRC operand (loaded into f7 by test23) */
+	unsigned long rb;	/* FRB operand, the addend (loaded into f8 by test23) */
+	unsigned long fma;	/* expected fmadd result */
+	unsigned long fms;	/* expected fmsub result */
+	unsigned long nfma;	/* expected fnmadd result */
+	unsigned long nfms;	/* expected fnmsub result */
+} fmavals[] = {	/* each entry below is { ra, rc, rb, then fma, fms, nfma, nfms on the second line } */
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,	/* all operands +0 */
+	  0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 },
+	{ 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000,	/* C is a quiet NaN: result is C */
+	  0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000 },
+	{ 0x0000000000000000, 0x7ffc000000000000, 0x7ffb000000000000,	/* B and C are NaNs: result is B */
+	  0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000 },
+	{ 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000,	/* A, B and C are NaNs: result is A */
+	  0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000 },
+	{ 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, /* A = 1.0, C = -0.0: result is +/-B */
+	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
+	{ 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, /* A = 1.0, C = -1.0: product is tiny next to B */
+	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
+	{ 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, /* A = +inf, C = -1.0: infinite product */
+	  0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, /* A = +inf, C = +0: inf * 0, expect default QNaN */
+	  0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, /* A = C = 1.0, B = 1 + 2^-23 */
+	  0x4000000010000000, 0xbe80000000000000, 0xc000000010000000, 0x3e80000000000000 },
+	{ 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000,	/* A = C = 1 + 2^-52, B = 1.0 */
+	  0x4000000000000001, 0x3cc0000000000000, 0xc000000000000001, 0xbcc0000000000000 },
+	{ 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000,	/* A, C a few ulps above 1.0, B = 1.0 */
+	  0x4000000000000002, 0x3cd4000000000002, 0xc000000000000002, 0xbcd4000000000002 },
+	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000,	/* A*C is approximately -B (B = -2^-253): heavy cancellation in fmadd */
+	  0xaca765753908cd20, 0x3030000000000000, 0x2ca765753908cd20, 0xb030000000000000 },
+	{ 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000,	/* as above with A one ulp larger */
+	  0x2cd3b3efbf5e2229, 0x3030000000000000, 0xacd3b3efbf5e2229, 0xb030000000000000 },
+	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000,	/* same A and C, B roughly 16x larger in magnitude than A*C */
+	  0xb05e0068a0000000, 0x3061003450000000, 0x305e0068a0000000, 0xb061003450000000 },
+};
+
+int test23(long arg)	/* run fmadd/fmsub/fnmadd/fnmsub over every fmavals[] entry; arg is not used */
+{
+	long i;
+	unsigned long results[4];	/* [0]=fmadd, [1]=fmsub, [2]=fnmadd, [3]=fnmsub, as raw bits */
+	struct fmavals *vp = fmavals;
+
+	set_fpscr(FPS_RN_NEAR);	/* presumably selects round-to-nearest; helper is defined outside this hunk */
+	for (i = 0; i < sizeof(fmavals) / sizeof(fmavals[0]); ++i, ++vp) {
+		asm("lfd 6,0(%0); lfd 7,8(%0); lfd 8,16(%0); fmadd 0,6,7,8; stfd 0,0(%1)"
+		    : : "b" (&vp->ra), "b" (results) : "memory");	/* f6/f7/f8 = ra/rc/rb; fmadd result stored to results[0] */
+		asm("fmsub 1,6,7,8; fnmadd 2,6,7,8; fnmsub 3,6,7,8; stfd 1,8(%0); stfd 2,16(%0); stfd 3,24(%0)"
+		    : : "b" (results) : "memory");	/* reuses f6-f8 left by the previous asm; no FPR clobbers are declared */
+		if (results[0] != vp->fma || results[1] != vp->fms ||
+		    results[2] != vp->nfma || results[3] != vp->nfms) {
+			print_hex(i, 2, " ");	/* on mismatch, print the vector index then the four actual results */
+			print_hex(results[0], 16, " ");
+			print_hex(results[1], 16, " ");
+			print_hex(results[2], 16, " ");
+			print_hex(results[3], 16, "\r\n");
+			return i + 1;	/* 1-based index of the first failing vector */
+		}
+	}
+	return 0;	/* every vector matched */
+}
+
+int fpu_test_23(void)	/* entry point for test 23 (FP multiply-add family); returns trapit()'s result */
+{
+	enable_fp();	/* presumably sets MSR[FP] so FP instructions do not trap; helper not visible here */
+	return trapit(0, test23);	/* passes 0 and test23 to trapit(); NOTE(review): trapit's trap handling is defined elsewhere */
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1385,6 +1455,7 @@ int main(void)
 	do_test(20, fpu_test_20);
 	do_test(21, fpu_test_21);
 	do_test(22, fpu_test_22);
+	do_test(23, fpu_test_23);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index e3783415a22e72bae0935c2959ab8c4a5bf5a316..50831cb20db40951fb7b0e508677cfddb62e46c6 100755
GIT binary patch
delta 3557
zcmai04Ny~87QXM2#HcY13gIut2Lw!_vZ;SOOn`(cA|N6~mvvhjbgZadhdQ=J7fII1
zcC@1uIo&RGsUlMiW37XYGm9Oy(b+oaYM0%rMFGXuuHY)AKWkB`Z_j;6)UrC>nalT`
z?|%2*^X|Ft5&w0r{T7!@h&`Q<+_IA+a@Qj+CvW7I5#mH`HEOF-TfN-<#`ffC++VjZ
znM$`#KUnk0Gp@qAgfv3>{*F$8C+-4*MB|mT&YPl`$KrsNl@{=xLhtF$_F-csxXyus
zMi(LbIuAW&tiQmGV{diV2gd??kT7So7fx%;bnpYXttmqA<%Snf+L{bI#!VdQ&BEk}
zv#<_Mk4w@1oJ4XCit&S@=(qOW59Frtf;wWRdH{zA$?OZdiz2c{mDDv^W%yJ}>R@NY
zGR_M$a%$vJ1IalqEzy)IikmVC@mm{UUZj=oH^7^bs}#iP5k9&B`=V07ppE8AAYHqF
z%ZF|F_ixaxT|f<qkP`K}+`hmg+=1_+QlTeG#W~?t)Li+%Z#}|FNvnuf(Y6<%Iyzh4
zT;vfhLU>GST}-iDS?m$!u>fjgRNPH?KW47HW06NV49~`<0w1g5YGF$3TzSr~J%UNn
zT4Gg{l)$mrY`JcsM|cm8qtPCRVi6R?Wy=qhc!ZIX*dC|i%Aq?BGm4ajpT8~U;RD)O
zU3E)5!g{zGm&$E}?Ybn$&=ql!$R=^`!%>}2R<9#D;56!>cD!!P3mBY_W~WJ<*Wh4c
z9K9F^7si*;h0!omZyq~8j^ymd6O_i2t1Gc^YxOC#awRnBv#Bf&Zs^Ul;$6^ADAKxD
zE)&LMp?Ff(D~e4v4RH%-te%ifXUD?H32SI;1I*-e$1<b*XO?l3O%^cK@n)JI1IPFc
zv~4HMikG%n6>p~NqT${64Q#`)31-?B1sMq$)L94B2{qKc64Z%WHkO>2O&heZCh-J|
zrW(wc%pT#@J7OM3T*9Fd%c=@VyigwrrAe`{)sRmscfln?7BlOU%#qCe$H8Bk_rt4(
zIH*j@rzL-ZlSxyV_imDzUK@ugOn_}ksgyK8b+R<@ak80C9tW3`-(t2^6EW|xlI>{X
zB<Pu_q9qN`KT&!pz0pkl;c(x$n3n8-CZi<&3q@5xY~2HkO_KPg$xKT$=$z$ZCE!W?
z9iCimt4SyK)x<8nDtp7`ZM475-rpW8PdLO=ZA#MDrXeFw-E}C22vK$nI+BO(JF<s{
z94`+kiLpaNayp_(IrhF@o;do5V6Lhln;fhfRI0`vq4M6>E<4#h_`4s(JVX~7%X(Qo
zUknJn#$Y@@7+ZpII>y2y^=Bt*LM>DVCzhe`8Ir@}=EmUWo5f9bBgU3sT#2y-V<8xq
zi=#K$IU-sGVoKwI;}fCB6fu7@-Xaa*2LFj?v+#Q{@4~(NE(=zB6cHz|il>bZT<gLC
zSCxe<lWsv$P@9GCNlcJiFr}^FFk88cu+i*u)ndvjvdPZQqaHbX>_y+P?~ipd)7Yaw
z&hspKGH{F#o8%krQQ|sdq;=AzaXcpXV?1Mpz5#Y#)l$NZnA|xyJ_#tNg2j_Rer$Fd
zzkPakzd_H_vYR2T^OWqu(e!9`L-5{|pSZ)YJ!KkxGe1k2MqBs#?xwsR=4uQSsCQ7j
z`k%dF%BuE0Hj|ewt|%_GHRCw(CTu^9Iy+c79#9J&IJ<neP}@s4R56_upcVCtlJ@%t
znjf{#Q2WzmG4HjX*hP&On4jG_JnV3GJ05J_fJU%H@m!iOe%cC7$G(<Pcnf6t9$`0r
zi}39d*hh|fm-g0ac{SrY=|3ZJD!p)Gx@+Wv$d^0gd2WpND}nI2Wjx7UkA{6St#iBy
zefwpkYTp8~QHh+MP_CEZm3>qT!xvZeK(k9e%`ajwr!I(;wlYj6#I%KwEx%&PD80U#
z)KbpnIWLOWP+(1i9ZjV8f+!X;0UOio*lST6KxB<+#n@l@QWTq+fQ@P8jC6=1&k!5a
zs<5k2GcW-g!&>Y*)Vc_P)oJmxy#s2}QfSvjIEGMs3HsCYwCWP5XX&ZC6H*YGJD~uf
zy%W|T40b{d0^bD(5OTWUbA*a6=trpU0<~37{auiP(AxzC2%2tKgCKN64T7^94j^pq
zhR+dNx}hJTqZ`yVJtdbR1;Kb3_luK<5mJcE7OQAQ+I$(-*kUs}kdiB+s6$|l$9@wv
zC2AE+m_5=#c5%<C?ct8<LN&>{qHwUs*2wVKuE0rKf<i#L?h5qV%!+cP?N>mbo~(EW
zX-*I1rY9(lAhlnGwMefaJ$Ds$AXUln@V<f*=oOHvzJ`8Ook(Y218s(0;Xx|<24*2`
zMtbHOC`Eb(sq^0w9YFeHfJP_?*?b-P(I6m=?v>~ZNSh^Uaw8S`L{Z={`MGK*KFuy}
z?I<;I-~){gl=1`g<im{2#Mm>)8gD!-{%oAogWCQZ@JVI{v}DTU#u4~2gTtB6O{qY3
z=GMa_WH-fw+BwwNMAq`L*P|u_lRZz~i_8h@?GtqoN<t#O7sVEq)*LmDY0;O|WVXVA
z*Qs65nQDI!#m#I5Y;0k6rDZ>gqK66CjM(ki*8%64G^H6wn8F%!yRjcZZ8BRP8`Ivw
ze(t}bSm`Ts+*QHbub6y=g}2r4C1-+vtPBT#vBEWkzfd1-q4-G{@`lFm1?N8v?*@It
z=DA2{;7K@uzgMN^KKtR86>}2*uwocq55^%}5wM5m4SYJxADS=u4Y=U&iNQ=CPK1G_
zQSj&Iqg^DlRA7t2pz}%iAO-daJRwrd3S-BP&3z32u-K0sn>!;=yn)OR2J}#z7@Fv}
z1*_~282m@ryUjMBaBz{<Wv5bj4u8o=%^!@9oUyuI-r1<IoDHBoFwD2`XO(HME%)vZ
z#1>01W>*lKr6uV9zj>yF(oetjeG&3m0&&(Cp@mu=vti+M;%vcH8_P*4LyXHT!I&Y7
LTYUDH+!6l+$lio8

delta 2528
zcmah}4Qx}_6~6DrHgW5@4vAyOG0wA-II&666e@U$^J6;@j6;K$W)!h5a-fBDP{t<4
zLYCPU@+PPRNqQ;`wk!?IR2gCu#caw@hms0KEt{%Lq8%l~2_;>VKr7RBG=!47bDtUg
z84_2z@B8j|zIWcae{b|-Vdx{lPKdjj5P$2)BL7~*=gDq=D<MJDwxhNkwe8PGcK6tq
z3uk&ZEu)&{uOB-5SlHcVs~}|N@93075@{qzbTL^QyTMokAEz{LMaug&dM}KPek<m~
zJb!BJ4-;}??92mVpEO1=_R`qld$H6vNLtVsg9|z*ojwY;bWI3xe?}vv#f9*j%p!Hn
zi^Y+ky9+L4mSw+KKzy$=>l|Z4u9<~YZMqm}vsP$>7$T&0=AOGL%lnA0>zGS{v+{RA
zU)B~O22{UHUz$&R!*YsawT$hnC1l9e3~Ti+YR-pW>3^t-v_z#p&cX>p85Cq21szmp
zuNM?>5TBhep1q#FVudop9_3tPRJsi}4J%>Npb?Dlv0;sJ*QTiSf~@T_YN+p72pQ{@
z!KSG6K4j#qd^x9CIrMZ?TFVE}ouk2$ewVXG+5Nq!^jCPov=T&<M({zYX^oOR6O|mY
zcFLrogU`S@Q@!$HASxY!VKlmPQT#46=GH5BY>Y~3SscyP2qKK<u2lXLC-^S3ntvwz
z0H1}hU{(nqf^tb*5y&Mzm@F{Er6p!n77h*?V19{>_L#t8X`yNZge=a*N)z!N#T<Mw
zr|hA2+?%tOGCI8tW-axKH*zpXx5WtS@^mx^(301bo!quXT7o+wlGbU)j#cSML_+&W
zUOoNN2wHI`9XJ9Z(Z84*m3LVKHz`u4%c4`QFcP0Ym@5i`B^QdV8|mP0VV_n0=4q>w
zmKY#m?cfz`vN>ra8+O|B;J7W12D)IxcF2_6c3f(|b1nS0`Twh7XMR0x*27=&-{h}%
z6gaWi<I<0Bvj%~Lr8A4RbOwl33S`0Af~~Zz7i@)IZroa!GX8Szo5trd(P)C9!gbVl
z7_|0f-0ZbGY42hzY6DFkhLMywkK$AY*o$7`meWNziwsB<ZJ@p*u%cL=+SX#H&<?j9
zE{HhH)c0$cP3b0wlb(MJ<{g!CVBi?^Ipi1K!GMkj${@Hj>!~?hO&t;D_=X?&j5WM>
zXCYjUvwDQEJU%nWqx@EoU?3UpEGJv!+3N3Um3TGO_joz3Kqm|@bqR07-KFs`X*(`y
z$UZmUrF7(cI}d)!FPy7{YsI+D5AwW#o*Et_RkH8fo^;H!TFz4~$0gDld4OlJWJQxe
zLUO_VSloH=lmfl^D7;sC`hnH`Ir;GFdeHN*>Ry!dJfym{X!?3}Hz4QnFN8lpMmhdl
zVRiX(I`I2=NBN$#u&zzspy@ew_4;dR>7D7<`J1BrQ)6s%l@a44X6v~*o4OK8Tp$q_
zoS)@F^u9(rI{$Z{ic!yywV&VD%2B(E@9HO6!%x3p4Q^a3?&n7w7dwNWj{6_Kh(;(k
zh=L=2^vP|4Mnp3|9W=Ii`sHj<%eh(p^^;(F44PJi)%Qp7%PXE1z7zXEBBH-lB>uf<
zsFYl*W44)-3etIEJ^4jCa{6ZaUIlSfd|gW;?h4I)%`N+M-x$MLX}w<lR6z+Ljst`o
zc$^oL9*bKlT7+=czZjcBfj1p?G?6LP0$jkxH8*zg5@Sh3-niC`{Y%saxPXmoZP>ft
zXG}!ojcc9Qdr>Rk0yd7_*iWK%g%G$?X{Eu-U~!ev9b?dlFf|4tmxXGsKp%o`9L^y~
z<B&iIj>7`N{&BEWS?H;8s6n_i4lM{I0U?Cq1oR;U5^xToJpl=X{sb%_yp;e;wS^`V
zP=jEcfEEPz1cVUUCZG?YdjifO3{Jp8wMlh24QDY4mKsydAkwx;#>{*c*hK9AKy4Rl
zyST7Q?IEuUby|0ZClt^U?}W<3C!0ji8<Vi7#-=hV2)Qx|eKk&%52<t&-l?&xb|XD?
z73R^<i*(=vu+`dB=aE)j13%ImNawD>PipNdqmq!_Q_zR18|ldpVWifgdJgH1f5R-&
z{Yanu2()gC>Lk*)QgjsQv(sEvevUK=!|ptlQAJ2(2Kvxig;Y0-)TA!Jl7l$x2el{P
zv>(~$vtRA#Dvb7`My|slPmP)nr*PN}2R(JAC5v!XKKW{e`1ZR|D@Khwd0U6Q8MTdY
z*VCYkAv*(4d5g@W$l5+-tY1LGYORQCNWZTot5hC5Bf(0u{^Dnh?dKV=@x=U``e4|b
zDGb89-lf6~5b82>$bT3s;VH0j*DKhKw-{@WKT&r}6L;5L65!PzIpTr9Ep2?m=8mlY
E0#G$xNB{r;

diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index 9b97cb5..ed759a5 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -20,3 +20,4 @@ test 19:PASS
 test 20:PASS
 test 21:PASS
 test 22:PASS
+test 23:PASS

From b0b3c0dc70855480fef3278925521bee7dd7de34 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 1 Sep 2020 11:13:17 +1000
Subject: [PATCH 27/30] FPU: Add comments specifying the expectation of r.shift
 for each state

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fpu.vhdl b/fpu.vhdl
index 5e30386..ec18953 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -1599,6 +1599,7 @@ begin
                 end if;
 
             when ADD_SHIFT =>
+                -- r.shift = - exponent difference
                 opsel_r <= RES_SHIFT;
                 v.x := s_nz;
                 set_x := '1';
@@ -1619,6 +1620,7 @@ begin
 
             when ADD_3 =>
                 -- check for overflow or negative result (can't get both)
+                -- r.shift = -1
                 if r.r(63) = '1' then
                     -- result is opposite sign to expected
                     v.result_sign := not r.result_sign;
@@ -1694,12 +1696,14 @@ begin
 
             when FMADD_2 =>
                 -- Product is potentially bigger here
+                -- r.shift = addend exp - product exp + 64
                 set_s := '1';
                 opsel_s <= S_SHIFT;
                 v.shift := r.shift - to_signed(64, EXP_BITS);
                 v.state := FMADD_3;
 
             when FMADD_3 =>
+                -- r.shift = addend exp - product exp
                 opsel_r <= RES_SHIFT;
                 v.first := '1';
                 v.state := FMADD_4;
@@ -1731,6 +1735,7 @@ begin
                 v.state := FMADD_6;
 
             when FMADD_6 =>
+                -- r.shift = 56 (or 0, but only if r is now nonzero)
                 if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then
                     if s_nz = '0' then
                         -- must be a subtraction, and r.x must be zero
@@ -1877,6 +1882,7 @@ begin
             when SQRT_2 =>
                 -- shift R right one place
                 -- not expecting multiplier result yet
+                -- r.shift = -1
                 opsel_r <= RES_SHIFT;
                 v.first := '1';
                 v.state := SQRT_3;
@@ -2012,12 +2018,14 @@ begin
                 v.state := FINISH;
 
             when INT_SHIFT =>
+                -- r.shift = b.exponent - 52
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
                 v.state := INT_ROUND;
                 v.shift := to_signed(-2, EXP_BITS);
 
             when INT_ROUND =>
+                -- r.shift = -2
                 opsel_r <= RES_SHIFT;
                 round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign);
                 v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
@@ -2030,6 +2038,7 @@ begin
                 end if;
 
             when INT_ISHIFT =>
+                -- r.shift = b.exponent - 54
                 opsel_r <= RES_SHIFT;
                 v.state := INT_FINAL;
 
@@ -2087,6 +2096,7 @@ begin
                 arith_done := '1';
 
             when FRI_1 =>
+                -- r.shift = b.exponent - 52
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
                 v.shift := to_signed(-2, EXP_BITS);
@@ -2114,6 +2124,7 @@ begin
 
             when NORMALIZE =>
                 -- Shift so we have 9 leading zeroes (we know R is non-zero)
+                -- r.shift = clz(r.r) - 9
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
                 if exp_tiny = '1' then
@@ -2127,6 +2138,7 @@ begin
                 end if;
 
             when ROUND_UFLOW =>
+                -- r.shift = - amount by which exponent underflows
                 v.tiny := '1';
                 if r.fpscr(FPSCR_UE) = '0' then
                     -- disabled underflow exception case
@@ -2204,6 +2216,7 @@ begin
 
             when ROUNDING_2 =>
                 -- Check for overflow during rounding
+                -- r.shift = -1
                 v.x := '0';
                 if r.r(55) = '1' then
                     opsel_r <= RES_SHIFT;
@@ -2221,6 +2234,7 @@ begin
                 end if;
 
             when ROUNDING_3 =>
+                -- r.shift = clz(r.r) - 9
                 mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec);
                 if mant_nz = '0' then
                     v.result_class := ZERO;
@@ -2242,6 +2256,7 @@ begin
                 end if;
 
             when DENORM =>
+                -- r.shift = result_exp - (-1022)
                 opsel_r <= RES_SHIFT;
                 arith_done := '1';
 

From fb5115c9445fe03142946b195028d90a00c6acee Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 1 Sep 2020 15:09:17 +1000
Subject: [PATCH 28/30] FPU: Decide on A input selection a cycle earlier

This moves opsel_a into the reg_type record, meaning that the A
multiplexer input now needs to be decided a cycle earlier.  This helps
timing by eliminating the combinatorial path from r.state and other
things to opsel_a and thence to in_a and result.

This means that some things now take an extra cycle, in particular
some of the exception cases such as when one or both operands are
NaNs.  The NaN handling has been moved out to its own state, which
simplifies the logic for exception cases in other places.  Additions
or subtractions where FRB's exponent is smaller than FRA's will
also take an extra cycle.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 330 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 177 insertions(+), 153 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index ec18953..9c18e47 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -44,7 +44,7 @@ architecture behaviour of fpu is
                      DO_FRE, DO_FRSQRTE,
                      DO_FSEL,
                      FRI_1,
-                     ADD_SHIFT, ADD_2, ADD_3,
+                     ADD_1, ADD_SHIFT, ADD_2, ADD_3,
                      CMP_1, CMP_2,
                      MULT_1,
                      FMADD_1, FMADD_2, FMADD_3,
@@ -65,7 +65,8 @@ architecture behaviour of fpu is
                      DENORM,
                      RENORM_A, RENORM_A2,
                      RENORM_B, RENORM_B2,
-                     RENORM_C, RENORM_C2);
+                     RENORM_C, RENORM_C2,
+                     NAN_RESULT, EXC_RESULT);
 
     type reg_type is record
         state        : state_t;
@@ -111,6 +112,12 @@ architecture behaviour of fpu is
         first        : std_ulogic;
         count        : unsigned(1 downto 0);
         doing_ftdiv  : std_ulogic_vector(1 downto 0);
+        opsel_a      : std_ulogic_vector(1 downto 0);
+        use_a        : std_ulogic;
+        use_b        : std_ulogic;
+        use_c        : std_ulogic;
+        invalid      : std_ulogic;
+        negate       : std_ulogic;
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -118,7 +125,6 @@ architecture behaviour of fpu is
     signal r, rin : reg_type;
 
     signal fp_result     : std_ulogic_vector(63 downto 0);
-    signal opsel_a       : std_ulogic_vector(1 downto 0);
     signal opsel_b       : std_ulogic_vector(1 downto 0);
     signal opsel_r       : std_ulogic_vector(1 downto 0);
     signal opsel_s       : std_ulogic_vector(1 downto 0);
@@ -724,7 +730,7 @@ begin
         v.update_fprf := '0';
         v.shift := to_signed(0, EXP_BITS);
         v.first := '0';
-        opsel_a <= AIN_R;
+        v.opsel_a := AIN_R;
         opsel_ainv <= '0';
         opsel_amask <= '0';
         opsel_b <= BIN_ZERO;
@@ -758,6 +764,11 @@ begin
         shiftin := '0';
         case r.state is
             when IDLE =>
+                v.use_a := '0';
+                v.use_b := '0';
+                v.use_c := '0';
+                v.invalid := '0';
+                v.negate := '0';
                 if e_in.valid = '1' then
                     case e_in.insn(5 downto 1) is
                         when "00000" =>
@@ -770,6 +781,7 @@ begin
                             elsif e_in.insn(7) = '1' then
                                 v.state := DO_MCRFS;
                             else
+                                v.opsel_a := AIN_B;
                                 v.state := DO_FCMP;
                             end if;
                         when "00110" =>
@@ -789,14 +801,17 @@ begin
                                 v.state := DO_MTFSF;
                             end if;
                         when "01000" =>
+                            v.opsel_a := AIN_B;
                             if e_in.insn(9 downto 8) /= "11" then
                                 v.state := DO_FMR;
                             else
                                 v.state := DO_FRI;
                             end if;
                         when "01100" =>
+                            v.opsel_a := AIN_B;
                             v.state := DO_FRSP;
                         when "01110" =>
+                            v.opsel_a := AIN_B;
                             if int_input = '1' then
                                 -- fcfid[u][s]
                                 v.state := DO_FCFID;
@@ -805,25 +820,45 @@ begin
                             end if;
                         when "01111" =>
                             v.round_mode := "001";
+                            v.opsel_a := AIN_B;
                             v.state := DO_FCTI;
                         when "10010" =>
+                            v.opsel_a := AIN_A;
+                            if v.b.mantissa(54) = '0' and v.a.mantissa(54) = '1' then
+                                v.opsel_a := AIN_B;
+                            end if;
                             v.state := DO_FDIV;
                         when "10100" | "10101" =>
+                            v.opsel_a := AIN_A;
                             v.state := DO_FADD;
                         when "10110" =>
                             v.is_sqrt := '1';
+                            v.opsel_a := AIN_B;
                             v.state := DO_FSQRT;
                         when "10111" =>
                             v.state := DO_FSEL;
                         when "11000" =>
+                            v.opsel_a := AIN_B;
                             v.state := DO_FRE;
                         when "11001" =>
                             v.is_multiply := '1';
+                            v.opsel_a := AIN_A;
+                            if v.c.mantissa(54) = '0' and v.a.mantissa(54) = '1' then
+                                v.opsel_a := AIN_C;
+                            end if;
                             v.state := DO_FMUL;
                         when "11010" =>
                             v.is_sqrt := '1';
+                            v.opsel_a := AIN_B;
                             v.state := DO_FRSQRTE;
                         when "11100" | "11101" | "11110" | "11111" =>
+                            if v.a.mantissa(54) = '0' then
+                                v.opsel_a := AIN_A;
+                            elsif v.c.mantissa(54) = '0' then
+                                v.opsel_a := AIN_C;
+                            else
+                                v.opsel_a := AIN_B;
+                            end if;
                             v.state := DO_FMADD;
                         when others =>
                             illegal := '1';
@@ -880,11 +915,10 @@ begin
 
             when DO_FCMP =>
                 -- fcmp[uo]
+                -- r.opsel_a = AIN_B
                 v.instr_done := '1';
                 v.state := IDLE;
                 update_fx := '1';
-                opsel_a <= AIN_B;
-                opsel_r <= RES_SUM;
                 v.result_exp := r.b.exponent;
                 if (r.a.class = NAN and r.a.mantissa(53) = '0') or
                     (r.b.class = NAN and r.b.mantissa(53) = '0') then
@@ -930,6 +964,7 @@ begin
                     -- Prepare to subtract mantissas, put B in R
                     v.cr_result := "0000";
                     v.instr_done := '0';
+                    v.opsel_a := AIN_A;
                     v.state := CMP_1;
                 end if;
                 v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;
@@ -1017,7 +1052,7 @@ begin
                 v.state := IDLE;
 
             when DO_FMR =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
                 v.result_exp := r.b.exponent;
                 v.quieten_nan := '0';
@@ -1037,7 +1072,7 @@ begin
                 v.state := IDLE;
 
             when DO_FRI =>    -- fri[nzpm]
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
                 v.result_sign := r.b.negative;
                 v.result_exp := r.b.exponent;
@@ -1062,7 +1097,7 @@ begin
                 end if;
 
             when DO_FRSP =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B, r.shift = 0
                 v.result_class := r.b.class;
                 v.result_sign := r.b.negative;
                 v.result_exp := r.b.exponent;
@@ -1092,7 +1127,7 @@ begin
                 -- instr bit 9: 1=dword 0=word
                 -- instr bit 8: 1=unsigned 0=signed
                 -- instr bit 1: 1=round to zero 0=use fpscr[RN]
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
                 v.result_sign := r.b.negative;
                 v.result_exp := r.b.exponent;
@@ -1130,8 +1165,8 @@ begin
                 end case;
 
             when DO_FCFID =>
+                -- r.opsel_a = AIN_B
                 v.result_sign := '0';
-                opsel_a <= AIN_B;
                 if r.insn(8) = '0' and r.b.negative = '1' then
                     -- fcfid[s] with negative operand, set R = -B
                     opsel_ainv <= '1';
@@ -1150,16 +1185,19 @@ begin
 
             when DO_FADD =>
                 -- fadd[s] and fsub[s]
-                opsel_a <= AIN_A;
+                -- r.opsel_a = AIN_A
                 v.result_sign := r.a.negative;
                 v.result_class := r.a.class;
                 v.result_exp := r.a.exponent;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_b := '1';
                 is_add := r.a.negative xor r.b.negative xor r.insn(1);
                 if r.a.class = FINITE and r.b.class = FINITE then
                     v.is_subtract := not is_add;
                     v.add_bsmall := r.exp_cmp;
+                    v.opsel_a := AIN_B;
                     if r.exp_cmp = '0' then
                         v.shift := r.a.exponent - r.b.exponent;
                         v.result_sign := r.b.negative xnor r.insn(1);
@@ -1169,77 +1207,55 @@ begin
                             v.state := ADD_SHIFT;
                         end if;
                     else
-                        opsel_a <= AIN_B;
-                        v.shift := r.b.exponent - r.a.exponent;
-                        v.result_exp := r.b.exponent;
-                        v.state := ADD_SHIFT;
+                        v.state := ADD_1;
                     end if;
                 else
-                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
-                        (r.b.class = NAN and r.b.mantissa(53) = '0') then
-                        -- Signalling NAN
-                        v.fpscr(FPSCR_VXSNAN) := '1';
-                        invalid := '1';
-                    end if;
-                    if r.a.class = NAN then
-                        -- nothing to do, result is A
-                    elsif r.b.class = NAN then
-                        v.result_class := NAN;
-                        v.result_sign := r.b.negative;
-                        opsel_a <= AIN_B;
+                    if r.a.class = NAN or r.b.class = NAN then
+                        v.state := NAN_RESULT;
                     elsif r.a.class = INFINITY and r.b.class = INFINITY and is_add = '0' then
                         -- invalid operation, construct QNaN
                         v.fpscr(FPSCR_VXISI) := '1';
                         qnan_result := '1';
+                        arith_done := '1';
                     elsif r.a.class = ZERO and r.b.class = ZERO and is_add = '0' then
                         -- return -0 for rounding to -infinity
                         v.result_sign := r.round_mode(1) and r.round_mode(0);
+                        arith_done := '1';
                     elsif r.a.class = INFINITY or r.b.class = ZERO then
-                        -- nothing to do, result is A
+                        -- result is A
+                        v.opsel_a := AIN_A;
+                        v.state := EXC_RESULT;
                     else
                         -- result is +/- B
-                        v.result_sign := r.b.negative xnor r.insn(1);
-                        v.result_class := r.b.class;
-                        v.result_exp := r.b.exponent;
-                        opsel_a <= AIN_B;
+                        v.opsel_a := AIN_B;
+                        v.negate := not r.insn(1);
+                        v.state := EXC_RESULT;
                     end if;
-                    arith_done := '1';
                 end if;
 
             when DO_FMUL =>
                 -- fmul[s]
-                opsel_a <= AIN_A;
-                v.result_sign := r.a.negative;
+                -- r.opsel_a = AIN_A unless C is denorm and A isn't
+                v.result_sign := r.a.negative xor r.c.negative;
                 v.result_class := r.a.class;
-                v.result_exp := r.a.exponent;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_c := '1';
                 if r.a.class = FINITE and r.c.class = FINITE then
-                    v.result_sign := r.a.negative xor r.c.negative;
                     v.result_exp := r.a.exponent + r.c.exponent;
                     -- Renormalize denorm operands
                     if r.a.mantissa(54) = '0' then
                         v.state := RENORM_A;
                     elsif r.c.mantissa(54) = '0' then
-                        opsel_a <= AIN_C;
                         v.state := RENORM_C;
                     else
                         f_to_multiply.valid <= '1';
                         v.state := MULT_1;
                     end if;
                 else
-                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
-                        (r.c.class = NAN and r.c.mantissa(53) = '0') then
-                        -- Signalling NAN
-                        v.fpscr(FPSCR_VXSNAN) := '1';
-                        invalid := '1';
-                    end if;
-                    if r.a.class = NAN then
-                    -- result is A
-                    elsif r.c.class = NAN then
-                        v.result_class := NAN;
-                        v.result_sign := r.c.negative;
-                        opsel_a <= AIN_C;
+                    if r.a.class = NAN or r.c.class = NAN then
+                        v.state := NAN_RESULT;
                     elsif (r.a.class = INFINITY and r.c.class = ZERO) or
                         (r.a.class = ZERO and r.c.class = INFINITY) then
                         -- invalid operation, construct QNaN
@@ -1247,22 +1263,22 @@ begin
                         qnan_result := '1';
                     elsif r.a.class = ZERO or r.a.class = INFINITY then
                         -- result is +/- A
-                        v.result_sign := r.a.negative xor r.c.negative;
+                        arith_done := '1';
                     else
                         -- r.c.class is ZERO or INFINITY
-                        v.result_class := r.c.class;
-                        v.result_sign := r.a.negative xor r.c.negative;
+                        v.opsel_a := AIN_C;
+                        v.negate := r.a.negative;
+                        v.state := EXC_RESULT;
                     end if;
-                    arith_done := '1';
                 end if;
 
             when DO_FDIV =>
-                opsel_a <= AIN_A;
-                v.result_sign := r.a.negative;
+                -- r.opsel_a = AIN_A unless B is denorm and A isn't
                 v.result_class := r.a.class;
-                v.result_exp := r.a.exponent;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_b := '1';
                 v.result_sign := r.a.negative xor r.b.negative;
                 v.result_exp := r.a.exponent - r.b.exponent;
                 v.count := "00";
@@ -1271,26 +1287,14 @@ begin
                     if r.a.mantissa(54) = '0' then
                         v.state := RENORM_A;
                     elsif r.b.mantissa(54) = '0' then
-                        opsel_a <= AIN_B;
                         v.state := RENORM_B;
                     else
                         v.first := '1';
                         v.state := DIV_2;
                     end if;
                 else
-                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
-                        (r.b.class = NAN and r.b.mantissa(53) = '0') then
-                        -- Signalling NAN
-                        v.fpscr(FPSCR_VXSNAN) := '1';
-                        invalid := '1';
-                    end if;
-                    if r.a.class = NAN then
-                        -- result is A
-                        v.result_sign := r.a.negative;
-                    elsif r.b.class = NAN then
-                        v.result_class := NAN;
-                        v.result_sign := r.b.negative;
-                        opsel_a <= AIN_B;
+                    if r.a.class = NAN or r.b.class = NAN then
+                        v.state := NAN_RESULT;
                     elsif r.b.class = INFINITY then
                         if r.a.class = INFINITY then
                             v.fpscr(FPSCR_VXIDI) := '1';
@@ -1298,6 +1302,7 @@ begin
                         else
                             v.result_class := ZERO;
                         end if;
+                        arith_done := '1';
                     elsif r.b.class = ZERO then
                         if r.a.class = ZERO then
                             v.fpscr(FPSCR_VXZDZ) := '1';
@@ -1308,46 +1313,36 @@ begin
                             end if;
                             v.result_class := INFINITY;
                         end if;
-                    -- else r.b.class = FINITE, result_class = r.a.class
+                        arith_done := '1';
+                    else -- r.b.class = FINITE, result_class = r.a.class
+                        arith_done := '1';
                     end if;
-                    arith_done := '1';
                 end if;
 
             when DO_FSEL =>
-                opsel_a <= AIN_A;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
                 if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then
-                    v.result_sign := r.c.negative;
-                    v.result_exp := r.c.exponent;
-                    v.result_class := r.c.class;
-                    opsel_a <= AIN_C;
+                    v.opsel_a := AIN_C;
                 else
-                    v.result_sign := r.b.negative;
-                    v.result_exp := r.b.exponent;
-                    v.result_class := r.b.class;
-                    opsel_a <= AIN_B;
+                    v.opsel_a := AIN_B;
                 end if;
                 v.quieten_nan := '0';
-                arith_done := '1';
+                v.state := EXC_RESULT;
 
             when DO_FSQRT =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
                 v.result_sign := r.b.negative;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                if r.b.class = NAN and r.b.mantissa(53) = '0' then
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
+                v.use_b := '1';
                 case r.b.class is
                     when FINITE =>
                         v.result_exp := r.b.exponent;
                         if r.b.negative = '1' then
                             v.fpscr(FPSCR_VXSQRT) := '1';
                             qnan_result := '1';
-                            arith_done := '1';
                         elsif r.b.mantissa(54) = '0' then
                             v.state := RENORM_B;
                         elsif r.b.exponent(0) = '0' then
@@ -1356,7 +1351,9 @@ begin
                             v.shift := to_signed(1, EXP_BITS);
                             v.state := RENORM_B2;
                         end if;
-                    when NAN | ZERO =>
+                    when NAN =>
+                        v.state := NAN_RESULT;
+                    when ZERO =>
                         -- result is B
                         arith_done := '1';
                     when INFINITY =>
@@ -1369,15 +1366,12 @@ begin
                 end case;
 
             when DO_FRE =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
                 v.result_sign := r.b.negative;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                if r.b.class = NAN and r.b.mantissa(53) = '0' then
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
+                v.use_b := '1';
                 case r.b.class is
                     when FINITE =>
                         v.result_exp := - r.b.exponent;
@@ -1387,8 +1381,7 @@ begin
                             v.state := FRE_1;
                         end if;
                     when NAN =>
-                        -- result is B
-                        arith_done := '1';
+                        v.state := NAN_RESULT;
                     when INFINITY =>
                         v.result_class := ZERO;
                         arith_done := '1';
@@ -1399,15 +1392,12 @@ begin
                 end case;
 
             when DO_FRSQRTE =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                 v.result_class := r.b.class;
                 v.result_sign := r.b.negative;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
-                if r.b.class = NAN and r.b.mantissa(53) = '0' then
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
+                v.use_b := '1';
                 v.shift := to_signed(1, EXP_BITS);
                 case r.b.class is
                     when FINITE =>
@@ -1415,7 +1405,6 @@ begin
                         if r.b.negative = '1' then
                             v.fpscr(FPSCR_VXSQRT) := '1';
                             qnan_result := '1';
-                            arith_done := '1';
                         elsif r.b.mantissa(54) = '0' then
                             v.state := RENORM_B;
                         elsif r.b.exponent(0) = '0' then
@@ -1424,8 +1413,7 @@ begin
                             v.state := RENORM_B2;
                         end if;
                     when NAN =>
-                        -- result is B
-                        arith_done := '1';
+                        v.state := NAN_RESULT;
                     when INFINITY =>
                         if r.b.negative = '1' then
                             v.fpscr(FPSCR_VXSQRT) := '1';
@@ -1442,25 +1430,26 @@ begin
 
             when DO_FMADD =>
                 -- fmadd, fmsub, fnmadd, fnmsub
-                opsel_a <= AIN_A;
+                -- r.opsel_a = AIN_A if A is denorm, else AIN_C if C is denorm,
+                -- else AIN_B
                 v.result_sign := r.a.negative;
                 v.result_class := r.a.class;
                 v.result_exp := r.a.exponent;
                 v.fpscr(FPSCR_FR) := '0';
                 v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_b := '1';
+                v.use_c := '1';
                 is_add := r.a.negative xor r.c.negative xor r.b.negative xor r.insn(1);
                 if r.a.class = FINITE and r.c.class = FINITE and
                     (r.b.class = FINITE or r.b.class = ZERO) then
                     v.is_subtract := not is_add;
                     mulexp := r.a.exponent + r.c.exponent;
                     v.result_exp := mulexp;
-                    opsel_a <= AIN_B;
                     -- Make sure A and C are normalized
                     if r.a.mantissa(54) = '0' then
-                        opsel_a <= AIN_A;
                         v.state := RENORM_A;
                     elsif r.c.mantissa(54) = '0' then
-                        opsel_a <= AIN_C;
                         v.state := RENORM_C;
                     elsif r.b.class = ZERO then
                         -- no addend, degenerates to multiply
@@ -1483,25 +1472,8 @@ begin
                         v.state := FMADD_2;
                     end if;
                 else
-                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
-                        (r.b.class = NAN and r.b.mantissa(53) = '0') or
-                        (r.c.class = NAN and r.c.mantissa(53) = '0') then
-                        -- Signalling NAN
-                        v.fpscr(FPSCR_VXSNAN) := '1';
-                        invalid := '1';
-                    end if;
-                    if r.a.class = NAN then
-                        -- nothing to do, result is A
-                    elsif r.b.class = NAN then
-                        -- result is B
-                        v.result_class := NAN;
-                        v.result_sign := r.b.negative;
-                        opsel_a <= AIN_B;
-                    elsif r.c.class = NAN then
-                        -- result is C
-                        v.result_class := NAN;
-                        v.result_sign := r.c.negative;
-                        opsel_a <= AIN_C;
+                    if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then
+                        v.state := NAN_RESULT;
                     elsif (r.a.class = ZERO and r.c.class = INFINITY) or
                         (r.a.class = INFINITY and r.c.class = ZERO) then
                         -- invalid operation, construct QNaN
@@ -1516,32 +1488,36 @@ begin
                             -- result is infinity
                             v.result_class := INFINITY;
                             v.result_sign := r.a.negative xor r.c.negative xor r.insn(2);
+                            arith_done := '1';
                         end if;
                     else
                         -- Here A is zero, C is zero, or B is infinity
                         -- Result is +/-B in all of those cases
-                        v.result_class := r.b.class;
-                        v.result_exp := r.b.exponent;
-                        if v.result_class /= ZERO or is_add = '1' then
-                            v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
+                        v.opsel_a := AIN_B;
+                        if r.b.class /= ZERO or is_add = '1' then
+                            v.negate := not (r.insn(1) xor r.insn(2));
                         else
                             -- have to be careful about rule for 0 - 0 result sign
-                            v.result_sign := (r.round_mode(1) and r.round_mode(0)) xor r.insn(2);
+                            v.negate := r.b.negative xor (r.round_mode(1) and r.round_mode(0)) xor r.insn(2);
                         end if;
-                        opsel_a <= AIN_B;
+                        v.state := EXC_RESULT;
                     end if;
-                    arith_done := '1';
                 end if;
 
             when RENORM_A =>
                 renormalize := '1';
                 v.state := RENORM_A2;
+                if r.insn(4) = '1' then
+                    v.opsel_a := AIN_C;
+                else
+                    v.opsel_a := AIN_B;
+                end if;
 
             when RENORM_A2 =>
+                -- r.opsel_a = AIN_C for fmul/fmadd, AIN_B for fdiv
                 set_a := '1';
                 v.result_exp := new_exp;
                 if r.insn(4) = '1' then
-                    opsel_a <= AIN_C;
                     if r.c.mantissa(54) = '1' then
                         if r.insn(3) = '0' or r.b.class = ZERO then
                             v.first := '1';
@@ -1551,18 +1527,18 @@ begin
                             if new_exp + 1 >= r.b.exponent then
                                 v.madd_cmp := '1';
                             end if;
+                            v.opsel_a := AIN_B;
                             v.state := DO_FMADD;
                         end if;
                     else
                         v.state := RENORM_C;
                     end if;
                 else
-                        opsel_a <= AIN_B;
-                        if r.b.mantissa(54) = '1' then
-                            v.first := '1';
-                            v.state := DIV_2;
-                        else
-                            v.state := RENORM_B;
+                    if r.b.mantissa(54) = '1' then
+                        v.first := '1';
+                        v.state := DIV_2;
+                    else
+                        v.state := RENORM_B;
                     end if;
                 end if;
 
@@ -1578,6 +1554,7 @@ begin
                 else
                     v.result_exp := new_exp;
                 end if;
+                v.opsel_a := AIN_B;
                 v.state := LOOKUP;
 
             when RENORM_C =>
@@ -1595,23 +1572,31 @@ begin
                     if new_exp + 1 >= r.b.exponent then
                         v.madd_cmp := '1';
                     end if;
+                    v.opsel_a := AIN_B;
                     v.state := DO_FMADD;
                 end if;
 
+            when ADD_1 =>
+                -- transferring B to R
+                v.shift := r.b.exponent - r.a.exponent;
+                v.result_exp := r.b.exponent;
+                v.state := ADD_SHIFT;
+
             when ADD_SHIFT =>
                 -- r.shift = - exponent difference
                 opsel_r <= RES_SHIFT;
                 v.x := s_nz;
                 set_x := '1';
                 longmask := '0';
-                v.state := ADD_2;
-
-            when ADD_2 =>
                 if r.add_bsmall = '1' then
-                    opsel_a <= AIN_A;
+                    v.opsel_a := AIN_A;
                 else
-                    opsel_a <= AIN_B;
+                    v.opsel_a := AIN_B;
                 end if;
+                v.state := ADD_2;
+
+            when ADD_2 =>
+                -- r.opsel_a = AIN_A if r.add_bsmall = 1 else AIN_B
                 opsel_b <= BIN_R;
                 opsel_binv <= r.is_subtract;
                 carry_in <= r.is_subtract and not r.x;
@@ -1655,7 +1640,7 @@ begin
                 end if;
 
             when CMP_1 =>
-                opsel_a <= AIN_A;
+                -- r.opsel_a = AIN_A
                 opsel_b <= BIN_R;
                 opsel_binv <= '1';
                 carry_in <= '1';
@@ -1696,7 +1681,7 @@ begin
 
             when FMADD_2 =>
                 -- Product is potentially bigger here
-                -- r.shift = addend exp - product exp + 64
+                -- r.shift = addend exp - product exp + 64, r.r = r.b.mantissa
                 set_s := '1';
                 opsel_s <= S_SHIFT;
                 v.shift := r.shift - to_signed(64, EXP_BITS);
@@ -1757,7 +1742,7 @@ begin
                 end if;
 
             when LOOKUP =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                 -- wait one cycle for inverse_table[B] lookup
                 v.first := '1';
                 if r.insn(4) = '0' then
@@ -2260,6 +2245,41 @@ begin
                 opsel_r <= RES_SHIFT;
                 arith_done := '1';
 
+            when NAN_RESULT =>
+                if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(53) = '0') or
+                    (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(53) = '0') or
+                    (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(53) = '0') then
+                    -- Signalling NAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                if r.use_a = '1' and r.a.class = NAN then
+                    v.opsel_a := AIN_A;
+                elsif r.use_b = '1' and r.b.class = NAN then
+                    v.opsel_a := AIN_B;
+                elsif r.use_c = '1' and r.c.class = NAN then
+                    v.opsel_a := AIN_C;
+                end if;
+                v.state := EXC_RESULT;
+
+            when EXC_RESULT =>
+                -- r.opsel_a = AIN_A, AIN_B or AIN_C according to which input is the result
+                case r.opsel_a is
+                    when AIN_B =>
+                        v.result_sign := r.b.negative xor r.negate;
+                        v.result_exp := r.b.exponent;
+                        v.result_class := r.b.class;
+                    when AIN_C =>
+                        v.result_sign := r.c.negative xor r.negate;
+                        v.result_exp := r.c.exponent;
+                        v.result_class := r.c.class;
+                    when others =>
+                        v.result_sign := r.a.negative xor r.negate;
+                        v.result_exp := r.a.exponent;
+                        v.result_class := r.a.class;
+                end case;
+                arith_done := '1';
+
         end case;
 
         if zero_divide = '1' then
@@ -2271,11 +2291,15 @@ begin
             v.result_sign := '0';
             misc_sel <= "0001";
             opsel_r <= RES_MISC;
+            arith_done := '1';
+        end if;
+        if invalid = '1' then
+            v.invalid := '1';
         end if;
         if arith_done = '1' then
             -- Enabled invalid exception doesn't write result or FPRF
             -- Neither does enabled zero-divide exception
-            if (invalid and r.fpscr(FPSCR_VE)) = '0' and
+            if (v.invalid and r.fpscr(FPSCR_VE)) = '0' and
                 (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then
                 v.writing_back := '1';
                 v.update_fprf := '1';
@@ -2355,7 +2379,7 @@ begin
         else
             mask := right_mask(unsigned(mshift(5 downto 0)));
         end if;
-        case opsel_a is
+        case r.opsel_a is
             when AIN_R =>
                 in_a0 := r.r;
             when AIN_A =>

From e1ca023bad2d11a9ae16da14b327a1329a6f50d9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 1 Sep 2020 15:28:19 +1000
Subject: [PATCH 29/30] FPU: Decide on mask length a cycle earlier

This moves longmask from a process variable into the reg_type record
in order to help timing.  Because it is now a registered value, it
needs to be decided a cycle earlier than the cycle in which it is used.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index 9c18e47..d79cec6 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -118,6 +118,7 @@ architecture behaviour of fpu is
         use_c        : std_ulogic;
         invalid      : std_ulogic;
         negate       : std_ulogic;
+        longmask     : std_ulogic;
     end record;
 
     type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -615,7 +616,6 @@ begin
         variable need_check  : std_ulogic;
         variable msb         : std_ulogic;
         variable is_add      : std_ulogic;
-        variable longmask    : std_ulogic;
         variable set_a       : std_ulogic;
         variable set_b       : std_ulogic;
         variable set_c       : std_ulogic;
@@ -644,6 +644,7 @@ begin
             v.fe_mode := or (e_in.fe_mode);
             v.dest_fpr := e_in.frt;
             v.single_prec := e_in.single;
+            v.longmask := e_in.single;
             v.int_result := '0';
             v.rc := e_in.rc;
             v.is_cmp := e_in.out_cr;
@@ -747,7 +748,6 @@ begin
         renormalize := '0';
         set_x := '0';
         qnan_result := '0';
-        longmask := r.single_prec;
         set_a := '0';
         set_b := '0';
         set_c := '0';
@@ -1204,6 +1204,7 @@ begin
                         if r.a.exponent = r.b.exponent then
                             v.state := ADD_2;
                         else
+                            v.longmask := '0';
                             v.state := ADD_SHIFT;
                         end if;
                     else
@@ -1580,14 +1581,15 @@ begin
                 -- transferring B to R
                 v.shift := r.b.exponent - r.a.exponent;
                 v.result_exp := r.b.exponent;
+                v.longmask := '0';
                 v.state := ADD_SHIFT;
 
             when ADD_SHIFT =>
-                -- r.shift = - exponent difference
+                -- r.shift = - exponent difference, r.longmask = 0
                 opsel_r <= RES_SHIFT;
                 v.x := s_nz;
                 set_x := '1';
-                longmask := '0';
+                v.longmask := r.single_prec;
                 if r.add_bsmall = '1' then
                     v.opsel_a := AIN_A;
                 else
@@ -1676,6 +1678,7 @@ begin
                 set_s := '1';
                 f_to_multiply.valid <= r.first;
                 if multiply_to_f.valid = '1' then
+                    v.longmask := '0';
                     v.state := ADD_SHIFT;
                 end if;
 
@@ -2367,7 +2370,7 @@ begin
         -- Data path.
         -- This has A and B input multiplexers, an adder, a shifter,
         -- count-leading-zeroes logic, and a result mux.
-        if longmask = '1' then
+        if r.longmask = '1' then
             mshift := r.shift + to_signed(-29, EXP_BITS);
         else
             mshift := r.shift;

From 73f819301ba25ddc3855bba8e2f3334ca70b5aef Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 12 Sep 2020 20:13:24 +1000
Subject: [PATCH 30/30] FPU: Do masking after adder rather than on A input

The masking enabled by opsel_amask is only used when rounding, to trim
the rounded result to the required precision.  We now do the masking
after the adder rather than before (on the A input).  This gives the
same result and helps timing.  The path from r.shift through the mask
generator and adder to v.r was showing up as a critical path.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 fpu.vhdl | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fpu.vhdl b/fpu.vhdl
index d79cec6..023dbf2 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -130,7 +130,7 @@ architecture behaviour of fpu is
     signal opsel_r       : std_ulogic_vector(1 downto 0);
     signal opsel_s       : std_ulogic_vector(1 downto 0);
     signal opsel_ainv    : std_ulogic;
-    signal opsel_amask   : std_ulogic;
+    signal opsel_mask    : std_ulogic;
     signal opsel_binv    : std_ulogic;
     signal in_a          : std_ulogic_vector(63 downto 0);
     signal in_b          : std_ulogic_vector(63 downto 0);
@@ -631,6 +631,7 @@ begin
         variable shiftin     : std_ulogic;
         variable mulexp      : signed(EXP_BITS-1 downto 0);
         variable maddend     : std_ulogic_vector(127 downto 0);
+        variable sum         : std_ulogic_vector(63 downto 0);
     begin
         v := r;
         illegal := '0';
@@ -733,7 +734,7 @@ begin
         v.first := '0';
         v.opsel_a := AIN_R;
         opsel_ainv <= '0';
-        opsel_amask <= '0';
+        opsel_mask <= '0';
         opsel_b <= BIN_ZERO;
         opsel_binv <= '0';
         opsel_r <= RES_SUM;
@@ -2176,7 +2177,7 @@ begin
                 end if;
 
             when ROUNDING =>
-                opsel_amask <= '1';
+                opsel_mask <= '1';
                 round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign);
                 v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
                 if round(1) = '1' then
@@ -2398,9 +2399,6 @@ begin
         if opsel_ainv = '1' then
             in_a0 := not in_a0;
         end if;
-        if opsel_amask = '1' then
-            in_a0 := in_a0 and not mask;
-        end if;
         in_a <= in_a0;
         case opsel_b is
             when BIN_ZERO =>
@@ -2423,9 +2421,13 @@ begin
         else
             shift_res := (others => '0');
         end if;
+        sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
+        if opsel_mask = '1' then
+            sum := sum and not mask;
+        end if;
         case opsel_r is
             when RES_SUM =>
-                result <= std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
+                result <= sum;
             when RES_SHIFT =>
                 result <= shift_res;
             when RES_MULT =>