diff --git a/Makefile b/Makefile
index b584895..9fe2106 100644
--- a/Makefile
+++ b/Makefile
@@ -48,7 +48,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
 	cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
 	logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
 	loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
-	core.vhdl
+	core.vhdl fpu.vhdl
 
 soc_files = $(core_files) wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
 	wishbone_debug_master.vhdl xics.vhdl syscon.vhdl soc.vhdl \
diff --git a/common.vhdl b/common.vhdl
index 1ca1178..f91ac18 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -13,8 +13,11 @@ package common is
     constant MSR_SF  : integer := (63 - 0);     -- Sixty-Four bit mode
     constant MSR_EE  : integer := (63 - 48);    -- External interrupt Enable
     constant MSR_PR  : integer := (63 - 49);    -- PRoblem state
+    constant MSR_FP  : integer := (63 - 50);    -- Floating-Point available
+    constant MSR_FE0 : integer := (63 - 52);    -- Floating-point Exception mode 0
     constant MSR_SE  : integer := (63 - 53);    -- Single-step bit of TE field
     constant MSR_BE  : integer := (63 - 54);    -- Branch trace bit of TE field
+    constant MSR_FE1 : integer := (63 - 55);    -- Floating-point Exception mode 1
     constant MSR_IR  : integer := (63 - 58);    -- Instruction Relocation
     constant MSR_DR  : integer := (63 - 59);    -- Data Relocation
     constant MSR_RI  : integer := (63 - 62);    -- Recoverable Interrupt
@@ -53,8 +56,11 @@ package common is
     -- GPR indices in the register file (GPR only)
     subtype gpr_index_t is std_ulogic_vector(4 downto 0);
 
-    -- Extended GPR indice (can hold an SPR)
-    subtype gspr_index_t is std_ulogic_vector(5 downto 0);
+    -- Extended GPR index (can hold a GPR, a fast SPR or an FPR)
+    subtype gspr_index_t is std_ulogic_vector(6 downto 0);  -- top two bits: "00" GPR, "01" fast SPR, "10" FPR
+
+    -- FPR indices (5 bits; widened to a gspr_index_t by fpr_to_gspr)
+    subtype fpr_index_t is std_ulogic_vector(4 downto 0);
 
     -- Some SPRs are stored in the register file, they use the magic
     -- GPR numbers above 31.
@@ -64,6 +70,9 @@ package common is
     -- indicates if this is indeed a fast SPR. If clear, then
     -- the SPR is not stored in the GPR file.
     --
+    -- FPRs are also stored in the register file, using GSPR
+    -- numbers from 64 to 95.
+    --
     function fast_spr_num(spr: spr_num_t) return gspr_index_t;
 
     -- Indices conversion functions
@@ -71,6 +80,7 @@ package common is
     function gpr_to_gspr(i: gpr_index_t) return gspr_index_t;
     function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t;
     function is_fast_spr(s: gspr_index_t) return std_ulogic;
+    function fpr_to_gspr(f: fpr_index_t) return gspr_index_t;
 
     -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are
     -- in the CR file as a kind of CR extension (with a separate write
@@ -84,6 +94,38 @@ package common is
     end record;
     constant xerc_init : xer_common_t := (others => '0');
 
+    -- FPSCR bit numbers, written in the same 63 - n form as the MSR_* constants
+    constant FPSCR_FX     : integer := 63 - 32;
+    constant FPSCR_FEX    : integer := 63 - 33;
+    constant FPSCR_VX     : integer := 63 - 34;
+    constant FPSCR_OX     : integer := 63 - 35;
+    constant FPSCR_UX     : integer := 63 - 36;
+    constant FPSCR_ZX     : integer := 63 - 37;
+    constant FPSCR_XX     : integer := 63 - 38;
+    constant FPSCR_VXSNAN : integer := 63 - 39;
+    constant FPSCR_VXISI  : integer := 63 - 40;
+    constant FPSCR_VXIDI  : integer := 63 - 41;
+    constant FPSCR_VXZDZ  : integer := 63 - 42;
+    constant FPSCR_VXIMZ  : integer := 63 - 43;
+    constant FPSCR_VXVC   : integer := 63 - 44;
+    constant FPSCR_FR     : integer := 63 - 45;
+    constant FPSCR_FI     : integer := 63 - 46;
+    constant FPSCR_C      : integer := 63 - 47;
+    constant FPSCR_FL     : integer := 63 - 48;
+    constant FPSCR_FG     : integer := 63 - 49;
+    constant FPSCR_FE     : integer := 63 - 50;
+    constant FPSCR_FU     : integer := 63 - 51;
+    constant FPSCR_VXSOFT : integer := 63 - 53;    -- note: no constant is defined for 63 - 52
+    constant FPSCR_VXSQRT : integer := 63 - 54;
+    constant FPSCR_VXCVI  : integer := 63 - 55;
+    constant FPSCR_VE     : integer := 63 - 56;
+    constant FPSCR_OE     : integer := 63 - 57;
+    constant FPSCR_UE     : integer := 63 - 58;
+    constant FPSCR_ZE     : integer := 63 - 59;
+    constant FPSCR_XE     : integer := 63 - 60;
+    constant FPSCR_NI     : integer := 63 - 61;
+    constant FPSCR_RN     : integer := 63 - 63;    -- NOTE(review): presumably the low bit of a 2-bit RN field (nothing is defined for 63 - 62); confirm
+
     type irq_state_t is (WRITE_SRR0, WRITE_SRR1);
 
     -- For now, fixed 16 sources, make this either a parametric
@@ -226,7 +268,7 @@ package common is
 	read2_enable : std_ulogic;
 	read2_reg : gspr_index_t;
 	read3_enable : std_ulogic;
-	read3_reg : gpr_index_t;
+	read3_reg : gspr_index_t;
     end record;
 
     type RegisterFileToDecode2Type is record
@@ -264,7 +306,7 @@ package common is
 	addr1 : std_ulogic_vector(63 downto 0);
 	addr2 : std_ulogic_vector(63 downto 0);
 	data : std_ulogic_vector(63 downto 0);		-- data to write, unused for read
-	write_reg : gpr_index_t;
+	write_reg : gspr_index_t;
 	length : std_ulogic_vector(3 downto 0);
         ci : std_ulogic;                                -- cache-inhibited load/store
 	byte_reverse : std_ulogic;
@@ -277,13 +319,15 @@ package common is
         virt_mode : std_ulogic;                         -- do translation through TLB
         priv_mode : std_ulogic;                         -- privileged mode (MSR[PR] = 0)
         mode_32bit : std_ulogic;                        -- trim addresses to 32 bits
+        is_32bit : std_ulogic;
     end record;
     constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
                                                                      sign_extend => '0', update => '0', xerc => xerc_init,
                                                                      reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0',
                                                                      nia => (others => '0'), insn => (others => '0'),
-                                                                     addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), length => (others => '0'),
-                                                                     mode_32bit => '0', others => (others => '0'));
+                                                                     addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
+                                                                     write_reg => (others => '0'), length => (others => '0'),
+                                                                     mode_32bit => '0', is_32bit => '0', others => (others => '0'));
 
     type Loadstore1ToExecute1Type is record
         busy : std_ulogic;
@@ -369,7 +413,7 @@ package common is
     type Loadstore1ToWritebackType is record
 	valid : std_ulogic;
 	write_enable: std_ulogic;
-	write_reg : gpr_index_t;
+	write_reg : gspr_index_t;
 	write_data : std_ulogic_vector(63 downto 0);
 	xerc : xer_common_t;
         rc : std_ulogic;
@@ -401,6 +445,43 @@ package common is
                                    write_cr_data => (others => '0'), write_reg => (others => '0'),
                                    exc_write_reg => (others => '0'), exc_write_data => (others => '0'));
 
+    type Execute1ToFPUType is record    -- carried from execute1's fp_out port to the FPU's e_in port (see core.vhdl)
+        valid   : std_ulogic;
+        op      : insn_type_t;
+        nia     : std_ulogic_vector(63 downto 0);
+        insn    : std_ulogic_vector(31 downto 0);
+        single  : std_ulogic;
+        fe_mode : std_ulogic_vector(1 downto 0);    -- presumably derived from MSR[FE0] and MSR[FE1]; confirm in execute1
+        fra     : std_ulogic_vector(63 downto 0);   -- fra/frb/frc: 64-bit values, assumed from the names to be the source operands
+        frb     : std_ulogic_vector(63 downto 0);
+        frc     : std_ulogic_vector(63 downto 0);
+        frt     : gspr_index_t;                     -- 7-bit register-file index (same encoding as other gspr_index_t fields)
+        rc      : std_ulogic;
+        out_cr  : std_ulogic;
+    end record;
+    constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'),
+                                                       insn  => (others => '0'), fe_mode => "00", rc => '0',
+                                                       fra => (others => '0'), frb => (others => '0'),
+                                                       frc => (others => '0'), frt => (others => '0'),
+                                                       single => '0', out_cr => '0');  -- all twelve fields set explicitly: valid = '0', op = OP_ILLEGAL, the rest zero
+
+    type FPUToExecute1Type is record    -- carried from the FPU's e_out port to execute1's fp_in port (see core.vhdl)
+        busy      : std_ulogic;
+        exception : std_ulogic;
+        interrupt : std_ulogic;
+        illegal   : std_ulogic;
+    end record;                         -- core.vhdl ties all four fields to '0' when HAS_FPU is false
+
+    type FPUToWritebackType is record   -- carried from the FPU's w_out port to writeback's fp_in port (see core.vhdl)
+        valid           : std_ulogic;
+        write_enable    : std_ulogic;
+        write_reg       : gspr_index_t;                     -- 7-bit register-file index
+        write_data      : std_ulogic_vector(63 downto 0);
+        write_cr_enable : std_ulogic;
+        write_cr_mask   : std_ulogic_vector(7 downto 0);
+        write_cr_data   : std_ulogic_vector(31 downto 0);
+    end record;                         -- core.vhdl drives only valid, write_enable and write_cr_enable when HAS_FPU is false
+
     type DividerToExecute1Type is record
 	valid: std_ulogic;
 	write_reg_data: std_ulogic_vector(63 downto 0);
@@ -473,10 +554,10 @@ package body common is
            n := 13;
        when others =>
            n := 0;
-           return "000000";
+           return "0000000";
        end case;
        tmp := std_ulogic_vector(to_unsigned(n, 5));
-       return "1" & tmp;
+       return "01" & tmp;
     end;
 
     function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is
@@ -486,7 +567,7 @@ package body common is
 
     function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is
     begin
-	return "0" & i;
+	return "00" & i;
     end;
 
     function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is
@@ -502,4 +583,9 @@ package body common is
     begin
 	return s(5);
     end;
+
+    function fpr_to_gspr(f: fpr_index_t) return gspr_index_t is  -- widen a 5-bit FPR number to a 7-bit register-file index
+    begin
+        return "10" & f;  -- "10" prefix = 64 + f, i.e. GSPR numbers 64 to 95
+    end;
 end common;
diff --git a/control.vhdl b/control.vhdl
index d04576a..4f67ad4 100644
--- a/control.vhdl
+++ b/control.vhdl
@@ -34,7 +34,7 @@ entity control is
         gpr_b_read_in       : in gspr_index_t;
 
         gpr_c_read_valid_in : in std_ulogic;
-        gpr_c_read_in       : in gpr_index_t;
+        gpr_c_read_in       : in gspr_index_t;
 
         cr_read_in          : in std_ulogic;
         cr_write_in         : in std_ulogic;
@@ -70,7 +70,6 @@ architecture rtl of control is
     signal gpr_write_valid : std_ulogic := '0';
     signal cr_write_valid  : std_ulogic := '0';
 
-    signal gpr_c_read_in_fmt : std_ulogic_vector(5 downto 0);
 begin
     gpr_hazard0: entity work.gpr_hazard
         generic map (
@@ -122,8 +121,6 @@ begin
             use_bypass         => gpr_bypass_b
             );
 
-    gpr_c_read_in_fmt <= "0" & gpr_c_read_in;
-
     gpr_hazard2: entity work.gpr_hazard
         generic map (
             PIPELINE_DEPTH => PIPELINE_DEPTH
@@ -140,7 +137,7 @@ begin
             gpr_write_in       => gpr_write_in,
             bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_c_read_valid_in,
-            gpr_read_in        => gpr_c_read_in_fmt,
+            gpr_read_in        => gpr_c_read_in,
 
             ugpr_write_valid   => update_gpr_write_valid,
             ugpr_write_reg     => update_gpr_write_reg,
diff --git a/core.vhdl b/core.vhdl
index c7dd3f6..b905297 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -11,6 +11,7 @@ entity core is
         SIM : boolean := false;
 	DISABLE_FLATTEN : boolean := false;
         EX1_BYPASS : boolean := true;
+        HAS_FPU : boolean := true;
 	ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
         LOG_LENGTH : natural := 512
         );
@@ -79,6 +80,11 @@ architecture behave of core is
     signal mmu_to_dcache: MmuToDcacheType;
     signal dcache_to_mmu: DcacheToMmuType;
 
+    -- FPU signals
+    signal execute1_to_fpu: Execute1ToFPUType;
+    signal fpu_to_execute1: FPUToExecute1Type;
+    signal fpu_to_writeback: FPUToWritebackType;
+
     -- local signals
     signal fetch1_stall_in : std_ulogic;
     signal icache_stall_out : std_ulogic;
@@ -108,6 +114,7 @@ architecture behave of core is
     signal rst_dec1    : std_ulogic := '1';
     signal rst_dec2    : std_ulogic := '1';
     signal rst_ex1     : std_ulogic := '1';
+    signal rst_fpu     : std_ulogic := '1';
     signal rst_ls1     : std_ulogic := '1';
     signal rst_dbg     : std_ulogic := '1';
     signal alt_reset_d : std_ulogic;
@@ -170,6 +177,7 @@ begin
             rst_dec1    <= core_rst;
             rst_dec2    <= core_rst;
             rst_ex1     <= core_rst;
+            rst_fpu     <= core_rst;
             rst_ls1     <= core_rst;
             rst_dbg     <= rst;
             alt_reset_d <= alt_reset;
@@ -224,6 +232,7 @@ begin
 
     decode1_0: entity work.decode1
         generic map(
+            HAS_FPU => HAS_FPU,
             LOG_LENGTH => LOG_LENGTH
             )
         port map (
@@ -244,6 +253,7 @@ begin
     decode2_0: entity work.decode2
         generic map (
             EX1_BYPASS => EX1_BYPASS,
+            HAS_FPU => HAS_FPU,
             LOG_LENGTH => LOG_LENGTH
             )
         port map (
@@ -267,6 +277,7 @@ begin
     register_file_0: entity work.register_file
         generic map (
             SIM => SIM,
+            HAS_FPU => HAS_FPU,
             LOG_LENGTH => LOG_LENGTH
             )
         port map (
@@ -280,7 +291,7 @@ begin
             dbg_gpr_data => dbg_gpr_data,
 	    sim_dump => terminate,
 	    sim_dump_done => sim_cr_dump,
-            log_out => log_data(255 downto 185)
+            log_out => log_data(255 downto 184)
 	    );
 
     cr_file_0: entity work.cr_file
@@ -294,12 +305,13 @@ begin
             d_out => cr_file_to_decode2,
             w_in => writeback_to_cr_file,
             sim_dump => sim_cr_dump,
-            log_out => log_data(184 downto 172)
+            log_out => log_data(183 downto 171)
             );
 
     execute1_0: entity work.execute1
         generic map (
             EX1_BYPASS => EX1_BYPASS,
+            HAS_FPU => HAS_FPU,
             LOG_LENGTH => LOG_LENGTH
             )
         port map (
@@ -309,9 +321,11 @@ begin
 	    busy_out => ex1_busy_out,
             e_in => decode2_to_execute1,
             l_in => loadstore1_to_execute1,
+            fp_in => fpu_to_execute1,
             ext_irq_in => ext_irq,
             l_out => execute1_to_loadstore1,
             f_out => execute1_to_fetch1,
+            fp_out => execute1_to_fpu,
             e_out => execute1_to_writeback,
 	    icache_inval => ex1_icache_inval,
             dbg_msr_out => msr,
@@ -322,8 +336,32 @@ begin
             log_wr_addr => log_wr_addr
             );
 
+    with_fpu: if HAS_FPU generate       -- instantiate the FPU only when the HAS_FPU generic is true
+    begin
+        fpu_0: entity work.fpu
+            port map (
+                clk => clk,
+                rst => rst_fpu,                 -- assigned from core_rst alongside the other rst_* signals
+                e_in => execute1_to_fpu,        -- Execute1ToFPUType, driven by execute1's fp_out
+                e_out => fpu_to_execute1,       -- FPUToExecute1Type, read by execute1's fp_in
+                w_out => fpu_to_writeback       -- FPUToWritebackType, read by writeback's fp_in
+                );
+    end generate;
+
+    no_fpu: if not HAS_FPU generate     -- no FPU instance: tie off the status/enable signals it would otherwise drive
+    begin
+        fpu_to_execute1.busy <= '0';
+        fpu_to_execute1.exception <= '0';
+        fpu_to_execute1.interrupt <= '0';
+        fpu_to_execute1.illegal <= '0';
+        fpu_to_writeback.valid <= '0';
+        fpu_to_writeback.write_enable <= '0';
+        fpu_to_writeback.write_cr_enable <= '0';    -- NOTE(review): write_reg, write_data, write_cr_mask and write_cr_data are left undriven here
+    end generate;
+
     loadstore1_0: entity work.loadstore1
         generic map (
+            HAS_FPU => HAS_FPU,
             LOG_LENGTH => LOG_LENGTH
             )
         port map (
@@ -368,7 +406,7 @@ begin
             stall_out => dcache_stall_out,
             wishbone_in => wishbone_data_in,
             wishbone_out => wishbone_data_out,
-            log_out => log_data(171 downto 152)
+            log_out => log_data(170 downto 151)
             );
 
     writeback_0: entity work.writeback
@@ -376,12 +414,13 @@ begin
             clk => clk,
             e_in => execute1_to_writeback,
             l_in => loadstore1_to_writeback,
+            fp_in => fpu_to_writeback,
             w_out => writeback_to_register_file,
             c_out => writeback_to_cr_file,
             complete_out => complete
             );
 
-    log_data(151 downto 150) <= "00";
+    log_data(150) <= '0';
     log_data(139 downto 135) <= "00000";
 
     debug_0: entity work.core_debug
diff --git a/countzero.vhdl b/countzero.vhdl
index 18aa043..b46f108 100644
--- a/countzero.vhdl
+++ b/countzero.vhdl
@@ -3,6 +3,7 @@ use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 
 library work;
+use work.helpers.all;
 
 entity zero_counter is
     port (
@@ -15,42 +16,6 @@ entity zero_counter is
 end entity zero_counter;
 
 architecture behaviour of zero_counter is
-    -- Reverse the order of bits in a word
-    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
-        variable ret: std_ulogic_vector(a'left downto a'right);
-    begin
-        for i in a'right to a'left loop
-            ret(a'left + a'right - i) := a(i);
-        end loop;
-        return ret;
-    end;
-
-    -- If there is only one bit set in a doubleword, return its bit number
-    -- (counting from the right).  Each bit of the result is obtained by
-    -- ORing together 32 bits of the input:
-    --  bit 0 = a[1] or a[3] or a[5] or ...
-    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
-    --  bit 2 = a[4..7] or a[12..15] or ...
-    --  bit 5 = a[32..63] ORed together
-    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
-        variable ret: std_ulogic_vector(5 downto 0);
-        variable stride: natural;
-        variable bit: std_ulogic;
-        variable k: natural;
-    begin
-        stride := 2;
-        for i in 0 to 5 loop
-            bit := '0';
-            for j in 0 to (64 / stride) - 1 loop
-                k := j * stride;
-                bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
-            end loop;
-            ret(i) := bit;
-            stride := stride * 2;
-        end loop;
-        return ret;
-    end;
-
     signal inp : std_ulogic_vector(63 downto 0);
     signal sum : std_ulogic_vector(64 downto 0);
     signal msb_r : std_ulogic;
diff --git a/decode1.vhdl b/decode1.vhdl
index a7d5910..5d6a557 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -8,6 +8,7 @@ use work.decode_types.all;
 
 entity decode1 is
     generic (
+        HAS_FPU : boolean := true;
         -- Non-zero to enable log data collection
         LOG_LENGTH : natural := 0
         );
@@ -54,7 +55,10 @@ architecture behaviour of decode1 is
     type op_19_subop_array_t is array(0 to 7) of decode_rom_t;
     type op_30_subop_array_t is array(0 to 15) of decode_rom_t;
     type op_31_subop_array_t is array(0 to 1023) of decode_rom_t;
+    type op_59_subop_array_t is array(0 to 31) of decode_rom_t;     -- 32 entries (5-bit index)
     type minor_rom_array_2_t is array(0 to 3) of decode_rom_t;
+    type op_63_subop_array_0_t is array(0 to 511) of decode_rom_t;  -- 512 entries (9-bit index)
+    type op_63_subop_array_1_t is array(0 to 16) of decode_rom_t;   -- NOTE(review): 0 to 16 is 17 entries; confirm that 0 to 15 was not intended
 
     constant major_decode_rom_array : major_rom_array_t := (
         --          unit     internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
@@ -72,6 +76,10 @@ architecture behaviour of decode1 is
         10 =>       (ALU,    OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
         34 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbz
         35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu
+        50 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd
+        51 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu
+        48 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs
+        49 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu
         42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha
         43 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau
         40 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz
@@ -87,6 +95,10 @@ architecture behaviour of decode1 is
         17 =>       (ALU,    OP_SC,        NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sc
         38 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb
         39 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu
+        54 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd
+        55 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu
+        52 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs
+        53 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu
         44 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth
         45 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu
         36 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw
@@ -272,6 +284,12 @@ architecture behaviour of decode1 is
         2#1101110101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldcix
         2#0000110101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux
         2#0000010101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx
+        2#1001010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfdx
+        2#1001110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux
+        2#1101010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax
+        2#1101110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx
+        2#1000010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx
+        2#1000110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux
         2#0001110100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx
         2#0101110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux
         2#0101010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax
@@ -350,6 +368,11 @@ architecture behaviour of decode1 is
         2#0011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- stdcx
         2#0010110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux
         2#0010010101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx
+        2#1011010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx
+        2#1011110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdux
+        2#1111010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx
+        2#1010010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx
+        2#1010110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux
         2#1110010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx
         2#1110110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix
         2#1011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- sthcx
@@ -389,6 +412,24 @@ architecture behaviour of decode1 is
         others   => decode_rom_init
         );
 
+    constant decode_op_59_array : op_59_subop_array_t := (  -- 32-entry table keyed by a 5-bit sub-opcode; every listed row uses unit FPU with the 32b column set
+        --             unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                          op                               in   out   A   out  in    out  len        ext                                pipe
+        2#01110#  =>  (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fcfid[u]s
+        2#10010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fdivs
+        2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
+        2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
+        2#10110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsqrts
+        2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
+        2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
+        2#11010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- frsqrtes
+        2#11100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmsubs
+        2#11101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmadds
+        2#11110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmsubs
+        2#11111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmadds
+        others => illegal_inst  -- every sub-opcode not listed above gets the illegal_inst entry
+        );
+
     constant decode_op_62_array : minor_rom_array_2_t := (
         --              unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
         --                            op                                           in   out   A   out  in    out  len        ext                                 pipe
@@ -397,6 +438,64 @@ architecture behaviour of decode1 is
         others   => decode_rom_init
         );
 
+    -- opcode 63 with insn(5) = '0'; index = insn(4 downto 1) & insn(10 downto 6) (bits 4..1 are the upper index bits); row comments read <bits 10..6>/<bits 4..1>=mnemonic
+    constant decode_op_63l_array : op_63_subop_array_0_t := (
+        --                unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                             op                               in   out   A   out  in    out  len        ext                                pipe
+        2#000000000#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  0/0=fcmpu
+        2#000000001#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  1/0=fcmpo
+        2#000000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  2/0=mcrfs
+        2#000000100#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  4/0=ftdiv
+        2#000000101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  5/0=ftsqrt
+        2#011000001#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/6=mtfsb1
+        2#011000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/6=mtfsb0
+        2#011000100#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/6=mtfsfi
+        2#011011010#  => (FPU,   OP_FPOP_I,     FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 26/6=fmrgow
+        2#011011110#  => (FPU,   OP_FPOP_I,     FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 30/6=fmrgew
+        2#011110010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 18/7=mffs family
+        2#011110110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 22/7=mtfsf
+        2#100000000#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/8=fcpsgn
+        2#100000001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/8=fneg
+        2#100000010#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/8=fmr
+        2#100000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/8=fnabs
+        2#100001000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  8/8=fabs
+        2#100001100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 12/8=frin
+        2#100001101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 13/8=friz
+        2#100001110#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 14/8=frip
+        2#100001111#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 15/8=frim
+        2#110000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), --  0/12=frsp
+        2#111000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/14=fctiw
+        2#111000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/14=fctiwu
+        2#111011001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 25/14=fctid
+        2#111011010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 26/14=fcfid
+        2#111011101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 29/14=fctidu
+        2#111011110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 30/14=fcfidu
+        2#111100000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/15=fctiwz
+        2#111100100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/15=fctiwuz
+        2#111111001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 25/15=fctidz
+        2#111111101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 29/15=fctiduz
+        others => illegal_inst
+        );
+
+    -- opcode 63 with insn(5) = '1'; indexed by bits 4..1 of instruction word (bits 10..6 are not part of the index)
+    constant decode_op_63h_array : op_63_subop_array_1_t := (
+        --            unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                         op                               in   out   A   out  in    out  len        ext                                pipe
+        2#0010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fdiv
+        2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
+        2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
+        2#0110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsqrt
+        2#0111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsel
+        2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
+        2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
+        2#1010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- frsqrte
+        2#1100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmsub
+        2#1101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmadd
+        2#1110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmsub
+        2#1111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmadd
+        others => illegal_inst
+        );
+
     --                                        unit   internal         in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
     --                                                     op                                              in   out   A   out  in    out  len        ext                                 pipe
     constant nop_instr      : decode_rom_t := (ALU,  OP_NOP,          NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
@@ -547,9 +646,28 @@ begin
         when 58 =>
             v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0))));
 
+        when 59 =>
+            if HAS_FPU then
+                -- floating point operations, mostly single-precision
+                v.decode := decode_op_59_array(to_integer(unsigned(f_in.insn(5 downto 1))));
+                if f_in.insn(5) = '0' and not std_match(f_in.insn(10 downto 1), "11-1001110") then
+                    vi.override := '1';
+                end if;
+            end if;
+
         when 62 =>
             v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0))));
 
+        when 63 =>
+            if HAS_FPU then
+                -- floating point operations, general and double-precision
+                if f_in.insn(5) = '0' then
+                    v.decode := decode_op_63l_array(to_integer(unsigned(f_in.insn(4 downto 1) & f_in.insn(10 downto 6))));
+                else
+                    v.decode := decode_op_63h_array(to_integer(unsigned(f_in.insn(4 downto 1))));
+                end if;
+            end if;
+
         when others =>
         end case;
 
diff --git a/decode2.vhdl b/decode2.vhdl
index a2a602c..9443212 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -11,6 +11,7 @@ use work.insn_helpers.all;
 entity decode2 is
     generic (
         EX1_BYPASS : boolean := true;
+        HAS_FPU : boolean := true;
         -- Non-zero to enable log data collection
         LOG_LENGTH : natural := 0
         );
@@ -73,12 +74,14 @@ architecture behaviour of decode2 is
             -- If it's all 0, we don't treat it as a dependency as slow SPRs
             -- operations are single issue.
             --
-            assert is_fast_spr(ispr) =  '1' or ispr = "000000"
+            assert is_fast_spr(ispr) =  '1' or ispr = "0000000"
                 report "Decode A says SPR but ISPR is invalid:" &
                 to_hstring(ispr) severity failure;
             return (is_fast_spr(ispr), ispr, reg_data);
         elsif t = CIA then
             return ('0', (others => '0'), instr_addr);
+        elsif HAS_FPU and t = FRA then
+            return ('1', fpr_to_gspr(insn_fra(insn_in)), reg_data);
         else
             return ('0', (others => '0'), (others => '0'));
         end if;
@@ -92,6 +95,12 @@ architecture behaviour of decode2 is
         case t is
             when RB =>
                 ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data);
+            when FRB =>
+                if HAS_FPU then
+                    ret := ('1', fpr_to_gspr(insn_frb(insn_in)), reg_data);
+                else
+                    ret := ('0', (others => '0'), (others => '0'));
+                end if;
             when CONST_UI =>
                 ret := ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64)));
             when CONST_SI =>
@@ -118,7 +127,7 @@ architecture behaviour of decode2 is
                 -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
                 -- If it's all 0, we don't treat it as a dependency as slow SPRs
                 -- operations are single issue.
-                assert is_fast_spr(ispr) = '1' or ispr = "000000"
+                assert is_fast_spr(ispr) = '1' or ispr = "0000000"
                     report "Decode B says SPR but ISPR is invalid:" &
                     to_hstring(ispr) severity failure;
                 ret := (is_fast_spr(ispr), ispr, reg_data);
@@ -137,6 +146,18 @@ architecture behaviour of decode2 is
                 return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data);
             when RCR =>
                 return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data);
+            when FRS =>
+                if HAS_FPU then
+                    return ('1', fpr_to_gspr(insn_frt(insn_in)), reg_data);
+                else
+                    return ('0', (others => '0'), (others => '0'));
+                end if;
+            when FRC =>
+                if HAS_FPU then
+                    return ('1', fpr_to_gspr(insn_frc(insn_in)), reg_data);
+                else
+                    return ('0', (others => '0'), (others => '0'));
+                end if;
             when NONE =>
                 return ('0', (others => '0'), (others => '0'));
         end case;
@@ -150,16 +171,22 @@ architecture behaviour of decode2 is
                 return ('1', gpr_to_gspr(insn_rt(insn_in)));
             when RA =>
                 return ('1', gpr_to_gspr(insn_ra(insn_in)));
+            when FRT =>
+                if HAS_FPU then
+                    return ('1', fpr_to_gspr(insn_frt(insn_in)));
+                else
+                    return ('0', "0000000");
+                end if;
             when SPR =>
                 -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
                 -- If it's all 0, we don't treat it as a dependency as slow SPRs
                 -- operations are single issue.
-                assert is_fast_spr(ispr) = '1' or ispr = "000000"
+                assert is_fast_spr(ispr) = '1' or ispr = "0000000"
                     report "Decode B says SPR but ISPR is invalid:" &
                     to_hstring(ispr) severity failure;
                 return (is_fast_spr(ispr), ispr);
             when NONE =>
-                return ('0', "000000");
+                return ('0', "0000000");
         end case;
     end;
 
@@ -212,7 +239,7 @@ architecture behaviour of decode2 is
     signal gpr_b_bypass : std_ulogic;
 
     signal gpr_c_read_valid : std_ulogic;
-    signal gpr_c_read : gpr_index_t;
+    signal gpr_c_read : gspr_index_t;
     signal gpr_c_bypass : std_ulogic;
 
     signal cr_write_valid  : std_ulogic;
@@ -281,11 +308,15 @@ begin
     end process;
 
     r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR
+                       else fpr_to_gspr(insn_fra(d_in.insn)) when d_in.decode.input_reg_a = FRA and HAS_FPU
                        else gpr_to_gspr(insn_ra(d_in.insn));
     r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR
+                       else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU
                        else gpr_to_gspr(insn_rb(d_in.insn));
-    r_out.read3_reg <= insn_rcreg(d_in.insn) when d_in.decode.input_reg_c = RCR
-                       else insn_rs(d_in.insn);
+    r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR
+                       else fpr_to_gspr(insn_frc(d_in.insn)) when d_in.decode.input_reg_c = FRC and HAS_FPU
+                       else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU
+                       else gpr_to_gspr(insn_rs(d_in.insn));
 
     c_out.read <= d_in.decode.input_cr;
 
@@ -307,7 +338,7 @@ begin
         mul_b := (others => '0');
 
         --v.e.input_cr := d_in.decode.input_cr;
-        --v.e.output_cr := d_in.decode.output_cr;
+        v.e.output_cr := d_in.decode.output_cr;
         
         decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1,
                                              d_in.nia);
@@ -394,11 +425,11 @@ begin
         gpr_b_read <= decoded_reg_b.reg;
 
         gpr_c_read_valid <= decoded_reg_c.reg_valid;
-        gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg);
+        gpr_c_read <= decoded_reg_c.reg;
 
         cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
         cr_bypass_avail <= '0';
-        if EX1_BYPASS then
+        if EX1_BYPASS and d_in.decode.unit = ALU then
             cr_bypass_avail <= d_in.decode.output_cr;
         end if;
 
diff --git a/decode_types.vhdl b/decode_types.vhdl
index ef654c3..72609bf 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -7,9 +7,11 @@ package decode_types is
 			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
 			 OP_CNTZ, OP_CROP,
 			 OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
-			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS,
-			 OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
+			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI,
+                         OP_FPOP, OP_FPOP_I,
+                         OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
 			 OP_LOAD, OP_STORE,
+                         OP_FPLOAD, OP_FPSTORE,
 			 OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD,
 			 OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64,
 			 OP_MUL_H64, OP_MUL_H32, OP_OR,
@@ -21,11 +23,11 @@ package decode_types is
                          OP_BCD, OP_ADDG6S,
                          OP_FETCH_FAILED
 			 );
-    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA);
+    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA);
     type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
-                           CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR);
-    type input_reg_c_t is (NONE, RS, RCR);
-    type output_reg_a_t is (NONE, RT, RA, SPR);
+                           CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB);
+    type input_reg_c_t is (NONE, RS, RCR, FRC, FRS);
+    type output_reg_a_t is (NONE, RT, RA, SPR, FRT);
     type rc_t is (NONE, ONE, RC);
     type carry_in_t is (ZERO, CA, OV, ONE);
 
@@ -47,7 +49,7 @@ package decode_types is
 
     constant TOO_OFFSET : integer := 0;
 
-    type unit_t is (NONE, ALU, LDST);
+    type unit_t is (NONE, ALU, LDST, FPU);
     type length_t is (NONE, is1B, is2B, is4B, is8B);
 
     type decode_rom_t is record
diff --git a/execute1.vhdl b/execute1.vhdl
index 04cc970..29713b2 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -13,6 +13,7 @@ use work.ppc_fx_insns.all;
 entity execute1 is
     generic (
         EX1_BYPASS : boolean := true;
+        HAS_FPU : boolean := true;
         -- Non-zero to enable log data collection
         LOG_LENGTH : natural := 0
         );
@@ -26,12 +27,14 @@ entity execute1 is
 
 	e_in  : in Decode2ToExecute1Type;
         l_in  : in Loadstore1ToExecute1Type;
+        fp_in : in FPUToExecute1Type;
 
 	ext_irq_in : std_ulogic;
 
 	-- asynchronous
         l_out : out Execute1ToLoadstore1Type;
 	f_out : out Execute1ToFetch1Type;
+        fp_out : out Execute1ToFPUType;
 
 	e_out : out Execute1ToWritebackType;
 
@@ -53,6 +56,7 @@ architecture behaviour of execute1 is
         f : Execute1ToFetch1Type;
         busy: std_ulogic;
         terminate: std_ulogic;
+        fp_exception_next : std_ulogic;
         trace_next : std_ulogic;
         prev_op : insn_type_t;
 	lr_update : std_ulogic;
@@ -71,7 +75,8 @@ architecture behaviour of execute1 is
     end record;
     constant reg_type_init : reg_type :=
         (e => Execute1ToWritebackInit, f => Execute1ToFetch1Init,
-         busy => '0', lr_update => '0', terminate => '0', trace_next => '0', prev_op => OP_ILLEGAL,
+         busy => '0', lr_update => '0', terminate => '0',
+         fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
          mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
          slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
          next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0'));
@@ -267,7 +272,7 @@ begin
     b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2;
     c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3;
 
-    busy_out <= l_in.busy or r.busy;
+    busy_out <= l_in.busy or r.busy or fp_in.busy;
     valid_in <= e_in.valid and not busy_out;
 
     terminate_out <= r.terminate;
@@ -333,6 +338,7 @@ begin
         variable spr_val : std_ulogic_vector(63 downto 0);
         variable addend : std_ulogic_vector(127 downto 0);
         variable do_trace : std_ulogic;
+        variable fv : Execute1ToFPUType;
     begin
 	result := (others => '0');
 	sum_with_carry := (others => '0');
@@ -346,6 +352,7 @@ begin
 	v.e := Execute1ToWritebackInit;
         lv := Execute1ToLoadstore1Init;
         v.f.redirect := '0';
+        fv := Execute1ToFPUInit;
 
 	-- XER forwarding. To avoid having to track XER hazards, we
 	-- use the previously latched value.
@@ -521,9 +528,11 @@ begin
         exception_nextpc := '0';
         v.e.exc_write_enable := '0';
         v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
-        v.e.exc_write_data := e_in.nia;
         if valid_in = '1' then
+            v.e.exc_write_data := e_in.nia;
             v.last_nia := e_in.nia;
+        else
+            v.e.exc_write_data := r.last_nia;
         end if;
 
         v.e.mode_32bit := not ctrl.msr(MSR_SF);
@@ -542,24 +551,36 @@ begin
             ctrl_tmp.msr(MSR_PR) <= '0';
             ctrl_tmp.msr(MSR_SE) <= '0';
             ctrl_tmp.msr(MSR_BE) <= '0';
+            ctrl_tmp.msr(MSR_FP) <= '0';
+            ctrl_tmp.msr(MSR_FE0) <= '0';
+            ctrl_tmp.msr(MSR_FE1) <= '0';
             ctrl_tmp.msr(MSR_IR) <= '0';
             ctrl_tmp.msr(MSR_DR) <= '0';
             ctrl_tmp.msr(MSR_RI) <= '0';
             ctrl_tmp.msr(MSR_LE) <= '1';
             v.e.valid := '1';
             v.trace_next := '0';
+            v.fp_exception_next := '0';
 	    report "Writing SRR1: " & to_hstring(ctrl.srr1);
 
-        elsif r.trace_next = '1' and valid_in = '1' then
-            -- Generate a trace interrupt rather than executing the next instruction
-            -- or taking any asynchronous interrupt
-            v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64));
-            ctrl_tmp.srr1(63 - 33) <= '1';
-            if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
-                r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
-                ctrl_tmp.srr1(63 - 35) <= '1';
-            elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
-                ctrl_tmp.srr1(63 - 36) <= '1';
+        elsif valid_in = '1' and ((HAS_FPU and r.fp_exception_next = '1') or r.trace_next = '1') then
+            if HAS_FPU and r.fp_exception_next = '1' then
+                -- This is used for FP-type program interrupts that
+                -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
+                v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+                ctrl_tmp.srr1(63 - 43) <= '1';
+                ctrl_tmp.srr1(63 - 47) <= '1';
+            else
+                -- Generate a trace interrupt rather than executing the next instruction
+                -- or taking any asynchronous interrupt
+                v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64));
+                ctrl_tmp.srr1(63 - 33) <= '1';
+                if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
+                    r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
+                    ctrl_tmp.srr1(63 - 35) <= '1';
+                elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
+                    ctrl_tmp.srr1(63 - 36) <= '1';
+                end if;
             end if;
             exception := '1';
 
@@ -578,7 +599,19 @@ begin
             -- set bit 45 to indicate privileged instruction type interrupt
             ctrl_tmp.srr1(63 - 45) <= '1';
             report "privileged instruction";
-            
+
+        elsif not HAS_FPU and valid_in = '1' and
+            (e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then
+            -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations
+            illegal := '1';
+
+        elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and
+            (e_in.unit = FPU or e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then
+            -- generate a floating-point unavailable interrupt
+            exception := '1';
+            v.f.redirect_nia := std_logic_vector(to_unsigned(16#800#, 64));
+            report "FP unavailable interrupt";
+
 	elsif valid_in = '1' and e_in.unit = ALU then
 
 	    report "execute nia " & to_hstring(e_in.nia);
@@ -793,6 +826,10 @@ begin
                 is_branch := '1';
                 taken_branch := '1';
                 abs_branch := '1';
+                if HAS_FPU then
+                    v.fp_exception_next := fp_in.exception and
+                                           (a_in(MSR_FE0) or a_in(MSR_FE1));
+                end if;
                 do_trace := '0';
 
             when OP_CNTZ =>
@@ -964,6 +1001,10 @@ begin
                         ctrl_tmp.msr(MSR_IR) <= '1';
                         ctrl_tmp.msr(MSR_DR) <= '1';
                     end if;
+                    if HAS_FPU then
+                        v.fp_exception_next := fp_in.exception and
+                                               (c_in(MSR_FE0) or c_in(MSR_FE1));
+                    end if;
                 end if;
 	    when OP_MTSPR =>
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
@@ -1080,6 +1121,8 @@ begin
                 lv.valid := '1';
             elsif e_in.unit = NONE then
                 illegal := '1';
+            elsif HAS_FPU and e_in.unit = FPU then
+                fv.valid := '1';
             end if;
 
         elsif r.f.redirect = '1' then
@@ -1154,7 +1197,17 @@ begin
             v.e.valid := '1';
 	end if;
 
-        if illegal = '1' then
+        -- Generate FP-type program interrupt.  fp_in.interrupt will only
+        -- be set during the execution of a FP instruction.
+        -- The case where MSR[FE0,FE1] goes from zero to non-zero is
+        -- handled above by mtmsrd and rfid setting v.fp_exception_next.
+        if HAS_FPU and fp_in.interrupt = '1' then
+            v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+            ctrl_tmp.srr1(63 - 43) <= '1';
+            exception := '1';
+        end if;
+
+        if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then
             exception := '1';
             v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
             -- Since we aren't doing Hypervisor emulation assist (0xe40) we
@@ -1200,7 +1253,6 @@ begin
             end if;
             v.e.exc_write_enable := '1';
             v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
-            v.e.exc_write_data := r.last_nia;
             report "ldst exception writing srr0=" & to_hstring(r.last_nia);
         end if;
 
@@ -1225,7 +1277,7 @@ begin
         lv.addr1 := a_in;
         lv.addr2 := b_in;
         lv.data := c_in;
-        lv.write_reg := gspr_to_gpr(e_in.write_reg);
+        lv.write_reg := e_in.write_reg;
         lv.length := e_in.data_len;
         lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE);
         lv.sign_extend := e_in.sign_extend;
@@ -1243,6 +1295,20 @@ begin
         lv.virt_mode := ctrl.msr(MSR_DR);
         lv.priv_mode := not ctrl.msr(MSR_PR);
         lv.mode_32bit := not ctrl.msr(MSR_SF);
+        lv.is_32bit := e_in.is_32bit;
+
+        -- Outputs to FPU
+        fv.op := e_in.insn_type;
+        fv.nia := e_in.nia;
+        fv.insn := e_in.insn;
+        fv.single := e_in.is_32bit;
+        fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1);
+        fv.fra := a_in;
+        fv.frb := b_in;
+        fv.frc := c_in;
+        fv.frt := e_in.write_reg;
+        fv.rc := e_in.rc;
+        fv.out_cr := e_in.output_cr;
 
 	-- Update registers
 	rin <= v;
@@ -1251,6 +1317,7 @@ begin
 	f_out <= r.f;
         l_out <= lv;
 	e_out <= r.e;
+        fp_out <= fv;
 	flush_out <= f_out.redirect;
 
         exception_log <= exception;
diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl
index a4d253d..8a3dc7a 100644
--- a/fpga/top-arty.vhdl
+++ b/fpga/top-arty.vhdl
@@ -14,6 +14,7 @@ entity toplevel is
         RAM_INIT_FILE      : string   := "firmware.hex";
         RESET_LOW          : boolean  := true;
         CLK_FREQUENCY      : positive := 100000000;
+        HAS_FPU            : boolean  := true;
         USE_LITEDRAM       : boolean  := false;
         NO_BRAM            : boolean  := false;
         DISABLE_FLATTEN_CORE : boolean := false;
@@ -168,6 +169,7 @@ begin
             RAM_INIT_FILE      => RAM_INIT_FILE,
             SIM                => false,
             CLK_FREQ           => CLK_FREQUENCY,
+            HAS_FPU            => HAS_FPU,
             HAS_DRAM           => USE_LITEDRAM,
             DRAM_SIZE          => 256 * 1024 * 1024,
             DRAM_INIT_SIZE     => PAYLOAD_SIZE,
diff --git a/fpga/top-generic.vhdl b/fpga/top-generic.vhdl
index 2300456..2ad0dd3 100644
--- a/fpga/top-generic.vhdl
+++ b/fpga/top-generic.vhdl
@@ -11,6 +11,7 @@ entity toplevel is
 	RESET_LOW     : boolean  := true;
 	CLK_INPUT     : positive := 100000000;
 	CLK_FREQUENCY : positive := 100000000;
+        HAS_FPU       : boolean  := true;
 	DISABLE_FLATTEN_CORE : boolean := false;
         UART_IS_16550 : boolean  := true
 	);
@@ -68,6 +69,7 @@ begin
 	    RAM_INIT_FILE => RAM_INIT_FILE,
 	    SIM           => false,
 	    CLK_FREQ      => CLK_FREQUENCY,
+            HAS_FPU       => HAS_FPU,
 	    DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
             UART0_IS_16550     => UART_IS_16550
 	    )
diff --git a/fpga/top-nexys-video.vhdl b/fpga/top-nexys-video.vhdl
index 745ef79..1942b10 100644
--- a/fpga/top-nexys-video.vhdl
+++ b/fpga/top-nexys-video.vhdl
@@ -14,6 +14,7 @@ entity toplevel is
 	RAM_INIT_FILE : string   := "firmware.hex";
 	RESET_LOW     : boolean  := true;
 	CLK_FREQUENCY : positive := 100000000;
+        HAS_FPU       : boolean  := true;
 	USE_LITEDRAM  : boolean  := false;
 	NO_BRAM       : boolean  := false;
 	DISABLE_FLATTEN_CORE : boolean := false;
@@ -120,6 +121,7 @@ begin
 	    RAM_INIT_FILE => RAM_INIT_FILE,
 	    SIM           => false,
 	    CLK_FREQ      => CLK_FREQUENCY,
+            HAS_FPU       => HAS_FPU,
 	    HAS_DRAM      => USE_LITEDRAM,
 	    DRAM_SIZE     => 512 * 1024 * 1024,
             DRAM_INIT_SIZE => PAYLOAD_SIZE,
diff --git a/fpu.vhdl b/fpu.vhdl
new file mode 100644
index 0000000..023dbf2
--- /dev/null
+++ b/fpu.vhdl
@@ -0,0 +1,2568 @@
+-- Floating-point unit for Microwatt
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.insn_helpers.all;
+use work.decode_types.all;
+use work.crhelpers.all;
+use work.helpers.all;
+use work.common.all;
+
+entity fpu is
+    port (
+        clk : in std_ulogic;                -- fpu_0 and lut_access update state on its rising edge
+        rst : in std_ulogic;                -- synchronous reset, sampled on the rising edge of clk
+
+        e_in  : in  Execute1toFPUType;      -- request input; fpu_0 asserts that e_in.valid is only set while r.state = IDLE
+        e_out : out FPUToExecute1Type;      -- busy, exception and interrupt status
+
+        w_out : out FPUToWritebackType      -- register and CR write-back: valid, write_* and write_cr_* fields
+        );
+end entity fpu;
+
+architecture behaviour of fpu is
+    type fp_number_class is (ZERO, FINITE, INFINITY, NAN);     -- FINITE covers normalized and denormalized values (see decode_dp)
+
+    constant EXP_BITS : natural := 13;                  -- width of the signed exponent fields used below
+
+    type fpu_reg_type is record                         -- unpacked operand, as produced by decode_dp
+        class    : fp_number_class;
+        negative : std_ulogic;                          -- copy of bit 63 of the register value
+        exponent : signed(EXP_BITS-1 downto 0);         -- unbiased (decode_dp subtracts 1023)
+        mantissa : std_ulogic_vector(63 downto 0);      -- 10.54 format; for FP operands bit 54 is the unit bit
+    end record;
+
+    type state_t is (IDLE,     -- reset state; fpu_0 asserts that e_in.valid is only set in IDLE
+                     DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF,  -- NOTE(review): DO_* presumably are per-instruction entry states; confirm in fpu_1
+                     DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT,
+                     DO_FCFID, DO_FCTI,
+                     DO_FRSP, DO_FRI,
+                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD,
+                     DO_FRE, DO_FRSQRTE,
+                     DO_FSEL,
+                     FRI_1,
+                     ADD_1, ADD_SHIFT, ADD_2, ADD_3,
+                     CMP_1, CMP_2,
+                     MULT_1,
+                     FMADD_1, FMADD_2, FMADD_3,
+                     FMADD_4, FMADD_5, FMADD_6,
+                     LOOKUP,
+                     DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
+                     FRE_1,
+                     RSQRT_1,
+                     FTDIV_1,
+                     SQRT_1, SQRT_2, SQRT_3, SQRT_4,
+                     SQRT_5, SQRT_6, SQRT_7, SQRT_8,
+                     SQRT_9, SQRT_10, SQRT_11, SQRT_12,
+                     INT_SHIFT, INT_ROUND, INT_ISHIFT,
+                     INT_FINAL, INT_CHECK, INT_OFLOW,
+                     FINISH, NORMALIZE,
+                     ROUND_UFLOW, ROUND_OFLOW,
+                     ROUNDING, ROUNDING_2, ROUNDING_3,
+                     DENORM,
+                     RENORM_A, RENORM_A2,
+                     RENORM_B, RENORM_B2,
+                     RENORM_C, RENORM_C2,
+                     NAN_RESULT, EXC_RESULT);
+
+    type reg_type is record                             -- all registered state of the FPU (signals r / rin)
+        state        : state_t;                         -- set to IDLE on reset
+        busy         : std_ulogic;                      -- drives e_out.busy
+        instr_done   : std_ulogic;                      -- drives w_out.valid unless do_intr is set
+        do_intr      : std_ulogic;                      -- drives e_out.interrupt and suppresses w_out.valid
+        op           : insn_type_t;
+        insn         : std_ulogic_vector(31 downto 0);
+        dest_fpr     : gspr_index_t;                    -- drives w_out.write_reg
+        fe_mode      : std_ulogic;
+        rc           : std_ulogic;                      -- with instr_done, enables w_out.write_cr_enable
+        is_cmp       : std_ulogic;                      -- with instr_done, enables w_out.write_cr_enable
+        single_prec  : std_ulogic;
+        fpscr        : std_ulogic_vector(31 downto 0);  -- cleared on reset; bit FPSCR_FEX drives e_out.exception
+        a            : fpu_reg_type;
+        b            : fpu_reg_type;                    -- b.mantissa supplies the lookup table address in lut_access
+        c            : fpu_reg_type;
+        r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
+        s            : std_ulogic_vector(55 downto 0);  -- extended fraction
+        x            : std_ulogic;
+        p            : std_ulogic_vector(63 downto 0);  -- 8.56 format
+        y            : std_ulogic_vector(63 downto 0);  -- 8.56 format
+        result_sign  : std_ulogic;
+        result_class : fp_number_class;
+        result_exp   : signed(EXP_BITS-1 downto 0);
+        shift        : signed(EXP_BITS-1 downto 0);
+        writing_back : std_ulogic;                      -- drives w_out.write_enable; cleared on reset
+        int_result   : std_ulogic;
+        cr_result    : std_ulogic_vector(3 downto 0);   -- replicated 8 times to form w_out.write_cr_data
+        cr_mask      : std_ulogic_vector(7 downto 0);   -- drives w_out.write_cr_mask
+        old_exc      : std_ulogic_vector(4 downto 0);
+        update_fprf  : std_ulogic;
+        quieten_nan  : std_ulogic;
+        tiny         : std_ulogic;
+        denorm       : std_ulogic;
+        round_mode   : std_ulogic_vector(2 downto 0);
+        is_subtract  : std_ulogic;
+        exp_cmp      : std_ulogic;
+        madd_cmp     : std_ulogic;
+        add_bsmall   : std_ulogic;
+        is_multiply  : std_ulogic;
+        is_sqrt      : std_ulogic;                      -- when set, lut_access uses b.mantissa(55 downto 54) as the top address bits
+        first        : std_ulogic;
+        count        : unsigned(1 downto 0);
+        doing_ftdiv  : std_ulogic_vector(1 downto 0);
+        opsel_a      : std_ulogic_vector(1 downto 0);
+        use_a        : std_ulogic;
+        use_b        : std_ulogic;
+        use_c        : std_ulogic;
+        invalid      : std_ulogic;
+        negate       : std_ulogic;
+        longmask     : std_ulogic;
+    end record;
+
+    type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);   -- 1024 entries of 18 bits
+
+    signal r, rin : reg_type;                           -- r is the registered state; fpu_0 loads it from rin each clock when not in reset
+
+    signal fp_result     : std_ulogic_vector(63 downto 0);     -- drives w_out.write_data
+    signal opsel_b       : std_ulogic_vector(1 downto 0);
+    signal opsel_r       : std_ulogic_vector(1 downto 0);
+    signal opsel_s       : std_ulogic_vector(1 downto 0);
+    signal opsel_ainv    : std_ulogic;
+    signal opsel_mask    : std_ulogic;
+    signal opsel_binv    : std_ulogic;
+    signal in_a          : std_ulogic_vector(63 downto 0);
+    signal in_b          : std_ulogic_vector(63 downto 0);
+    signal result        : std_ulogic_vector(63 downto 0);
+    signal carry_in      : std_ulogic;
+    signal lost_bits     : std_ulogic;
+    signal r_hi_nz       : std_ulogic;
+    signal r_lo_nz       : std_ulogic;
+    signal s_nz          : std_ulogic;
+    signal misc_sel      : std_ulogic_vector(3 downto 0);
+    signal f_to_multiply : MultiplyInputType;           -- m_in of the fpu_multiply_0 instance
+    signal multiply_to_f : MultiplyOutputType;          -- m_out of the fpu_multiply_0 instance
+    signal msel_1        : std_ulogic_vector(1 downto 0);
+    signal msel_2        : std_ulogic_vector(1 downto 0);
+    signal msel_add      : std_ulogic_vector(1 downto 0);
+    signal msel_inv      : std_ulogic;
+    signal inverse_est   : std_ulogic_vector(18 downto 0);     -- registered in lut_access: '1' & selected inverse_table entry
+
+    -- opsel values: 2-bit selector encodings.  NOTE(review): presumably AIN_* -> r.opsel_a, BIN_* -> opsel_b, RES_* -> opsel_r, S_* -> opsel_s; confirm in fpu_1
+    constant AIN_R    : std_ulogic_vector(1 downto 0) := "00";
+    constant AIN_A    : std_ulogic_vector(1 downto 0) := "01";
+    constant AIN_B    : std_ulogic_vector(1 downto 0) := "10";
+    constant AIN_C    : std_ulogic_vector(1 downto 0) := "11";
+
+    constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
+    constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
+    constant BIN_MASK : std_ulogic_vector(1 downto 0) := "10";
+    constant BIN_PS6  : std_ulogic_vector(1 downto 0) := "11";
+
+    constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
+    constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
+    constant RES_MULT  : std_ulogic_vector(1 downto 0) := "10";
+    constant RES_MISC  : std_ulogic_vector(1 downto 0) := "11";
+
+    constant S_ZERO  : std_ulogic_vector(1 downto 0) := "00";
+    constant S_NEG   : std_ulogic_vector(1 downto 0) := "01";
+    constant S_SHIFT : std_ulogic_vector(1 downto 0) := "10";
+    constant S_MULT  : std_ulogic_vector(1 downto 0) := "11";
+
+    -- msel values: 2-bit selector encodings.  NOTE(review): presumably MUL1_* -> msel_1, MUL2_* -> msel_2, MULADD_* -> msel_add; confirm in fpu_1
+    constant MUL1_A : std_ulogic_vector(1 downto 0) := "00";
+    constant MUL1_B : std_ulogic_vector(1 downto 0) := "01";
+    constant MUL1_Y : std_ulogic_vector(1 downto 0) := "10";
+    constant MUL1_R : std_ulogic_vector(1 downto 0) := "11";
+
+    constant MUL2_C   : std_ulogic_vector(1 downto 0) := "00";
+    constant MUL2_LUT : std_ulogic_vector(1 downto 0) := "01";
+    constant MUL2_P   : std_ulogic_vector(1 downto 0) := "10";
+    constant MUL2_R   : std_ulogic_vector(1 downto 0) := "11";
+
+    constant MULADD_ZERO  : std_ulogic_vector(1 downto 0) := "00";
+    constant MULADD_CONST : std_ulogic_vector(1 downto 0) := "01";
+    constant MULADD_A     : std_ulogic_vector(1 downto 0) := "10";
+    constant MULADD_RS    : std_ulogic_vector(1 downto 0) := "11";
+
+    -- Inverse lookup table, indexed by 2 block-select bits followed by the top 8 fraction bits.
+    -- The first 256 entries are the reciprocal (1/x) lookup table,
+    -- and the remaining 768 entries are the reciprocal square root table.
+    -- Output range is [0.5, 1) in 0.19 format, though the top
+    -- bit isn't stored since it is always 1.
+    -- Each output value is the inverse of the center of the input
+    -- range for the value, i.e. entry 0 is 1 / (1 + 1/512),
+    -- entry 1 is 1 / (1 + 3/512), etc.
+    signal inverse_table : lookup_table := (
+        -- 1/x lookup table
+        -- Unit bit is assumed to be 1, so input range is [1, 2)
+        18x"3fc01", 18x"3f411", 18x"3ec31", 18x"3e460", 18x"3dc9f", 18x"3d4ec", 18x"3cd49", 18x"3c5b5",
+        18x"3be2f", 18x"3b6b8", 18x"3af4f", 18x"3a7f4", 18x"3a0a7", 18x"39968", 18x"39237", 18x"38b14",
+        18x"383fe", 18x"37cf5", 18x"375f9", 18x"36f0a", 18x"36828", 18x"36153", 18x"35a8a", 18x"353ce",
+        18x"34d1e", 18x"3467a", 18x"33fe3", 18x"33957", 18x"332d7", 18x"32c62", 18x"325f9", 18x"31f9c",
+        18x"3194a", 18x"31303", 18x"30cc7", 18x"30696", 18x"30070", 18x"2fa54", 18x"2f443", 18x"2ee3d",
+        18x"2e841", 18x"2e250", 18x"2dc68", 18x"2d68b", 18x"2d0b8", 18x"2caee", 18x"2c52e", 18x"2bf79",
+        18x"2b9cc", 18x"2b429", 18x"2ae90", 18x"2a900", 18x"2a379", 18x"29dfb", 18x"29887", 18x"2931b",
+        18x"28db8", 18x"2885e", 18x"2830d", 18x"27dc4", 18x"27884", 18x"2734d", 18x"26e1d", 18x"268f6",
+        18x"263d8", 18x"25ec1", 18x"259b3", 18x"254ac", 18x"24fad", 18x"24ab7", 18x"245c8", 18x"240e1",
+        18x"23c01", 18x"23729", 18x"23259", 18x"22d90", 18x"228ce", 18x"22413", 18x"21f60", 18x"21ab4",
+        18x"2160f", 18x"21172", 18x"20cdb", 18x"2084b", 18x"203c2", 18x"1ff40", 18x"1fac4", 18x"1f64f",
+        18x"1f1e1", 18x"1ed79", 18x"1e918", 18x"1e4be", 18x"1e069", 18x"1dc1b", 18x"1d7d4", 18x"1d392",
+        18x"1cf57", 18x"1cb22", 18x"1c6f3", 18x"1c2ca", 18x"1bea7", 18x"1ba8a", 18x"1b672", 18x"1b261",
+        18x"1ae55", 18x"1aa50", 18x"1a64f", 18x"1a255", 18x"19e60", 18x"19a70", 18x"19686", 18x"192a2",
+        18x"18ec3", 18x"18ae9", 18x"18715", 18x"18345", 18x"17f7c", 18x"17bb7", 18x"177f7", 18x"1743d",
+        18x"17087", 18x"16cd7", 18x"1692c", 18x"16585", 18x"161e4", 18x"15e47", 18x"15ab0", 18x"1571d",
+        18x"1538e", 18x"15005", 18x"14c80", 18x"14900", 18x"14584", 18x"1420d", 18x"13e9b", 18x"13b2d",
+        18x"137c3", 18x"1345e", 18x"130fe", 18x"12da2", 18x"12a4a", 18x"126f6", 18x"123a7", 18x"1205c",
+        18x"11d15", 18x"119d2", 18x"11694", 18x"11359", 18x"11023", 18x"10cf1", 18x"109c2", 18x"10698",
+        18x"10372", 18x"10050", 18x"0fd31", 18x"0fa17", 18x"0f700", 18x"0f3ed", 18x"0f0de", 18x"0edd3",
+        18x"0eacb", 18x"0e7c7", 18x"0e4c7", 18x"0e1ca", 18x"0ded2", 18x"0dbdc", 18x"0d8eb", 18x"0d5fc",
+        18x"0d312", 18x"0d02b", 18x"0cd47", 18x"0ca67", 18x"0c78a", 18x"0c4b1", 18x"0c1db", 18x"0bf09",
+        18x"0bc3a", 18x"0b96e", 18x"0b6a5", 18x"0b3e0", 18x"0b11e", 18x"0ae5f", 18x"0aba3", 18x"0a8eb",
+        18x"0a636", 18x"0a383", 18x"0a0d4", 18x"09e28", 18x"09b80", 18x"098da", 18x"09637", 18x"09397",
+        18x"090fb", 18x"08e61", 18x"08bca", 18x"08936", 18x"086a5", 18x"08417", 18x"0818c", 18x"07f04",
+        18x"07c7e", 18x"079fc", 18x"0777c", 18x"074ff", 18x"07284", 18x"0700d", 18x"06d98", 18x"06b26",
+        18x"068b6", 18x"0664a", 18x"063e0", 18x"06178", 18x"05f13", 18x"05cb1", 18x"05a52", 18x"057f5",
+        18x"0559a", 18x"05342", 18x"050ed", 18x"04e9a", 18x"04c4a", 18x"049fc", 18x"047b0", 18x"04567",
+        18x"04321", 18x"040dd", 18x"03e9b", 18x"03c5c", 18x"03a1f", 18x"037e4", 18x"035ac", 18x"03376",
+        18x"03142", 18x"02f11", 18x"02ce2", 18x"02ab5", 18x"0288b", 18x"02663", 18x"0243d", 18x"02219",
+        18x"01ff7", 18x"01dd8", 18x"01bbb", 18x"019a0", 18x"01787", 18x"01570", 18x"0135b", 18x"01149",
+        18x"00f39", 18x"00d2a", 18x"00b1e", 18x"00914", 18x"0070c", 18x"00506", 18x"00302", 18x"00100",
+        -- 1/sqrt(x) lookup table
+        -- Input is in the range [1, 4), i.e. two bits to the left of the
+        -- binary point.  Those 2 bits index the following 3 blocks of 256 values.
+        -- 1.0 ... 1.9999
+        18x"3fe00", 18x"3fa06", 18x"3f612", 18x"3f224", 18x"3ee3a", 18x"3ea58", 18x"3e67c", 18x"3e2a4",
+        18x"3ded2", 18x"3db06", 18x"3d73e", 18x"3d37e", 18x"3cfc2", 18x"3cc0a", 18x"3c85a", 18x"3c4ae",
+        18x"3c106", 18x"3bd64", 18x"3b9c8", 18x"3b630", 18x"3b29e", 18x"3af10", 18x"3ab86", 18x"3a802",
+        18x"3a484", 18x"3a108", 18x"39d94", 18x"39a22", 18x"396b6", 18x"3934e", 18x"38fea", 18x"38c8c",
+        18x"38932", 18x"385dc", 18x"3828a", 18x"37f3e", 18x"37bf6", 18x"378b2", 18x"37572", 18x"37236",
+        18x"36efe", 18x"36bca", 18x"3689a", 18x"36570", 18x"36248", 18x"35f26", 18x"35c06", 18x"358ea",
+        18x"355d4", 18x"352c0", 18x"34fb0", 18x"34ca4", 18x"3499c", 18x"34698", 18x"34398", 18x"3409c",
+        18x"33da2", 18x"33aac", 18x"337bc", 18x"334cc", 18x"331e2", 18x"32efc", 18x"32c18", 18x"32938",
+        18x"3265a", 18x"32382", 18x"320ac", 18x"31dd8", 18x"31b0a", 18x"3183e", 18x"31576", 18x"312b0",
+        18x"30fee", 18x"30d2e", 18x"30a74", 18x"307ba", 18x"30506", 18x"30254", 18x"2ffa4", 18x"2fcf8",
+        18x"2fa4e", 18x"2f7a8", 18x"2f506", 18x"2f266", 18x"2efca", 18x"2ed2e", 18x"2ea98", 18x"2e804",
+        18x"2e572", 18x"2e2e4", 18x"2e058", 18x"2ddce", 18x"2db48", 18x"2d8c6", 18x"2d646", 18x"2d3c8",
+        18x"2d14c", 18x"2ced4", 18x"2cc5e", 18x"2c9ea", 18x"2c77a", 18x"2c50c", 18x"2c2a2", 18x"2c038",
+        18x"2bdd2", 18x"2bb70", 18x"2b90e", 18x"2b6b0", 18x"2b454", 18x"2b1fa", 18x"2afa4", 18x"2ad4e",
+        18x"2aafc", 18x"2a8ac", 18x"2a660", 18x"2a414", 18x"2a1cc", 18x"29f86", 18x"29d42", 18x"29b00",
+        18x"298c2", 18x"29684", 18x"2944a", 18x"29210", 18x"28fda", 18x"28da6", 18x"28b74", 18x"28946",
+        18x"28718", 18x"284ec", 18x"282c4", 18x"2809c", 18x"27e78", 18x"27c56", 18x"27a34", 18x"27816",
+        18x"275fa", 18x"273e0", 18x"271c8", 18x"26fb0", 18x"26d9c", 18x"26b8a", 18x"2697a", 18x"2676c",
+        18x"26560", 18x"26356", 18x"2614c", 18x"25f46", 18x"25d42", 18x"25b40", 18x"2593e", 18x"25740",
+        18x"25542", 18x"25348", 18x"2514e", 18x"24f58", 18x"24d62", 18x"24b6e", 18x"2497c", 18x"2478c",
+        18x"2459e", 18x"243b0", 18x"241c6", 18x"23fde", 18x"23df6", 18x"23c10", 18x"23a2c", 18x"2384a",
+        18x"2366a", 18x"2348c", 18x"232ae", 18x"230d2", 18x"22efa", 18x"22d20", 18x"22b4a", 18x"22976",
+        18x"227a2", 18x"225d2", 18x"22402", 18x"22234", 18x"22066", 18x"21e9c", 18x"21cd2", 18x"21b0a",
+        18x"21944", 18x"2177e", 18x"215ba", 18x"213fa", 18x"21238", 18x"2107a", 18x"20ebc", 18x"20d00",
+        18x"20b46", 18x"2098e", 18x"207d6", 18x"20620", 18x"2046c", 18x"202b8", 18x"20108", 18x"1ff58",
+        18x"1fda8", 18x"1fbfc", 18x"1fa50", 18x"1f8a4", 18x"1f6fc", 18x"1f554", 18x"1f3ae", 18x"1f208",
+        18x"1f064", 18x"1eec2", 18x"1ed22", 18x"1eb82", 18x"1e9e4", 18x"1e846", 18x"1e6aa", 18x"1e510",
+        18x"1e378", 18x"1e1e0", 18x"1e04a", 18x"1deb4", 18x"1dd20", 18x"1db8e", 18x"1d9fc", 18x"1d86c",
+        18x"1d6de", 18x"1d550", 18x"1d3c4", 18x"1d238", 18x"1d0ae", 18x"1cf26", 18x"1cd9e", 18x"1cc18",
+        18x"1ca94", 18x"1c910", 18x"1c78c", 18x"1c60a", 18x"1c48a", 18x"1c30c", 18x"1c18e", 18x"1c010",
+        18x"1be94", 18x"1bd1a", 18x"1bba0", 18x"1ba28", 18x"1b8b2", 18x"1b73c", 18x"1b5c6", 18x"1b452",
+        18x"1b2e0", 18x"1b16e", 18x"1affe", 18x"1ae8e", 18x"1ad20", 18x"1abb4", 18x"1aa46", 18x"1a8dc",
+        -- 2.0 ... 2.9999
+        18x"1a772", 18x"1a608", 18x"1a4a0", 18x"1a33a", 18x"1a1d4", 18x"1a070", 18x"19f0c", 18x"19da8",
+        18x"19c48", 18x"19ae6", 18x"19986", 18x"19828", 18x"196ca", 18x"1956e", 18x"19412", 18x"192b8",
+        18x"1915e", 18x"19004", 18x"18eae", 18x"18d56", 18x"18c00", 18x"18aac", 18x"18958", 18x"18804",
+        18x"186b2", 18x"18562", 18x"18412", 18x"182c2", 18x"18174", 18x"18026", 18x"17eda", 18x"17d8e",
+        18x"17c44", 18x"17afa", 18x"179b2", 18x"1786a", 18x"17724", 18x"175de", 18x"17498", 18x"17354",
+        18x"17210", 18x"170ce", 18x"16f8c", 18x"16e4c", 18x"16d0c", 18x"16bcc", 18x"16a8e", 18x"16950",
+        18x"16814", 18x"166d8", 18x"1659e", 18x"16464", 18x"1632a", 18x"161f2", 18x"160ba", 18x"15f84",
+        18x"15e4e", 18x"15d1a", 18x"15be6", 18x"15ab2", 18x"15980", 18x"1584e", 18x"1571c", 18x"155ec",
+        18x"154bc", 18x"1538e", 18x"15260", 18x"15134", 18x"15006", 18x"14edc", 18x"14db0", 18x"14c86",
+        18x"14b5e", 18x"14a36", 18x"1490e", 18x"147e6", 18x"146c0", 18x"1459a", 18x"14476", 18x"14352",
+        18x"14230", 18x"1410c", 18x"13fea", 18x"13eca", 18x"13daa", 18x"13c8a", 18x"13b6c", 18x"13a4e",
+        18x"13930", 18x"13814", 18x"136f8", 18x"135dc", 18x"134c2", 18x"133a8", 18x"1328e", 18x"13176",
+        18x"1305e", 18x"12f48", 18x"12e30", 18x"12d1a", 18x"12c06", 18x"12af2", 18x"129de", 18x"128ca",
+        18x"127b8", 18x"126a6", 18x"12596", 18x"12486", 18x"12376", 18x"12266", 18x"12158", 18x"1204a",
+        18x"11f3e", 18x"11e32", 18x"11d26", 18x"11c1a", 18x"11b10", 18x"11a06", 18x"118fc", 18x"117f4",
+        18x"116ec", 18x"115e4", 18x"114de", 18x"113d8", 18x"112d2", 18x"111ce", 18x"110ca", 18x"10fc6",
+        18x"10ec2", 18x"10dc0", 18x"10cbe", 18x"10bbc", 18x"10abc", 18x"109bc", 18x"108bc", 18x"107be",
+        18x"106c0", 18x"105c2", 18x"104c4", 18x"103c8", 18x"102cc", 18x"101d0", 18x"100d6", 18x"0ffdc",
+        18x"0fee2", 18x"0fdea", 18x"0fcf0", 18x"0fbf8", 18x"0fb02", 18x"0fa0a", 18x"0f914", 18x"0f81e",
+        18x"0f72a", 18x"0f636", 18x"0f542", 18x"0f44e", 18x"0f35a", 18x"0f268", 18x"0f176", 18x"0f086",
+        18x"0ef94", 18x"0eea4", 18x"0edb4", 18x"0ecc6", 18x"0ebd6", 18x"0eae8", 18x"0e9fa", 18x"0e90e",
+        18x"0e822", 18x"0e736", 18x"0e64a", 18x"0e55e", 18x"0e474", 18x"0e38a", 18x"0e2a0", 18x"0e1b8",
+        18x"0e0d0", 18x"0dfe8", 18x"0df00", 18x"0de1a", 18x"0dd32", 18x"0dc4c", 18x"0db68", 18x"0da82",
+        18x"0d99e", 18x"0d8ba", 18x"0d7d6", 18x"0d6f4", 18x"0d612", 18x"0d530", 18x"0d44e", 18x"0d36c",
+        18x"0d28c", 18x"0d1ac", 18x"0d0cc", 18x"0cfee", 18x"0cf0e", 18x"0ce30", 18x"0cd54", 18x"0cc76",
+        18x"0cb9a", 18x"0cabc", 18x"0c9e0", 18x"0c906", 18x"0c82a", 18x"0c750", 18x"0c676", 18x"0c59c",
+        18x"0c4c4", 18x"0c3ea", 18x"0c312", 18x"0c23a", 18x"0c164", 18x"0c08c", 18x"0bfb6", 18x"0bee0",
+        18x"0be0a", 18x"0bd36", 18x"0bc62", 18x"0bb8c", 18x"0baba", 18x"0b9e6", 18x"0b912", 18x"0b840",
+        18x"0b76e", 18x"0b69c", 18x"0b5cc", 18x"0b4fa", 18x"0b42a", 18x"0b35a", 18x"0b28a", 18x"0b1bc",
+        18x"0b0ee", 18x"0b01e", 18x"0af50", 18x"0ae84", 18x"0adb6", 18x"0acea", 18x"0ac1e", 18x"0ab52",
+        18x"0aa86", 18x"0a9bc", 18x"0a8f0", 18x"0a826", 18x"0a75c", 18x"0a694", 18x"0a5ca", 18x"0a502",
+        18x"0a43a", 18x"0a372", 18x"0a2aa", 18x"0a1e4", 18x"0a11c", 18x"0a056", 18x"09f90", 18x"09ecc",
+        -- 3.0 ... 3.9999
+        18x"09e06", 18x"09d42", 18x"09c7e", 18x"09bba", 18x"09af6", 18x"09a32", 18x"09970", 18x"098ae",
+        18x"097ec", 18x"0972a", 18x"09668", 18x"095a8", 18x"094e8", 18x"09426", 18x"09368", 18x"092a8",
+        18x"091e8", 18x"0912a", 18x"0906c", 18x"08fae", 18x"08ef0", 18x"08e32", 18x"08d76", 18x"08cba",
+        18x"08bfe", 18x"08b42", 18x"08a86", 18x"089ca", 18x"08910", 18x"08856", 18x"0879c", 18x"086e2",
+        18x"08628", 18x"08570", 18x"084b6", 18x"083fe", 18x"08346", 18x"0828e", 18x"081d8", 18x"08120",
+        18x"0806a", 18x"07fb4", 18x"07efe", 18x"07e48", 18x"07d92", 18x"07cde", 18x"07c2a", 18x"07b76",
+        18x"07ac2", 18x"07a0e", 18x"0795a", 18x"078a8", 18x"077f4", 18x"07742", 18x"07690", 18x"075de",
+        18x"0752e", 18x"0747c", 18x"073cc", 18x"0731c", 18x"0726c", 18x"071bc", 18x"0710c", 18x"0705e",
+        18x"06fae", 18x"06f00", 18x"06e52", 18x"06da4", 18x"06cf6", 18x"06c4a", 18x"06b9c", 18x"06af0",
+        18x"06a44", 18x"06998", 18x"068ec", 18x"06840", 18x"06796", 18x"066ea", 18x"06640", 18x"06596",
+        18x"064ec", 18x"06442", 18x"0639a", 18x"062f0", 18x"06248", 18x"061a0", 18x"060f8", 18x"06050",
+        18x"05fa8", 18x"05f00", 18x"05e5a", 18x"05db4", 18x"05d0e", 18x"05c68", 18x"05bc2", 18x"05b1c",
+        18x"05a76", 18x"059d2", 18x"0592e", 18x"05888", 18x"057e4", 18x"05742", 18x"0569e", 18x"055fa",
+        18x"05558", 18x"054b6", 18x"05412", 18x"05370", 18x"052ce", 18x"0522e", 18x"0518c", 18x"050ec",
+        18x"0504a", 18x"04faa", 18x"04f0a", 18x"04e6a", 18x"04dca", 18x"04d2c", 18x"04c8c", 18x"04bee",
+        18x"04b50", 18x"04ab0", 18x"04a12", 18x"04976", 18x"048d8", 18x"0483a", 18x"0479e", 18x"04700",
+        18x"04664", 18x"045c8", 18x"0452c", 18x"04490", 18x"043f6", 18x"0435a", 18x"042c0", 18x"04226",
+        18x"0418a", 18x"040f0", 18x"04056", 18x"03fbe", 18x"03f24", 18x"03e8c", 18x"03df2", 18x"03d5a",
+        18x"03cc2", 18x"03c2a", 18x"03b92", 18x"03afa", 18x"03a62", 18x"039cc", 18x"03934", 18x"0389e",
+        18x"03808", 18x"03772", 18x"036dc", 18x"03646", 18x"035b2", 18x"0351c", 18x"03488", 18x"033f2",
+        18x"0335e", 18x"032ca", 18x"03236", 18x"031a2", 18x"03110", 18x"0307c", 18x"02fea", 18x"02f56",
+        18x"02ec4", 18x"02e32", 18x"02da0", 18x"02d0e", 18x"02c7c", 18x"02bec", 18x"02b5a", 18x"02aca",
+        18x"02a38", 18x"029a8", 18x"02918", 18x"02888", 18x"027f8", 18x"0276a", 18x"026da", 18x"0264a",
+        18x"025bc", 18x"0252e", 18x"024a0", 18x"02410", 18x"02384", 18x"022f6", 18x"02268", 18x"021da",
+        18x"0214e", 18x"020c0", 18x"02034", 18x"01fa8", 18x"01f1c", 18x"01e90", 18x"01e04", 18x"01d78",
+        18x"01cee", 18x"01c62", 18x"01bd8", 18x"01b4c", 18x"01ac2", 18x"01a38", 18x"019ae", 18x"01924",
+        18x"0189c", 18x"01812", 18x"01788", 18x"01700", 18x"01676", 18x"015ee", 18x"01566", 18x"014de",
+        18x"01456", 18x"013ce", 18x"01346", 18x"012c0", 18x"01238", 18x"011b2", 18x"0112c", 18x"010a4",
+        18x"0101e", 18x"00f98", 18x"00f12", 18x"00e8c", 18x"00e08", 18x"00d82", 18x"00cfe", 18x"00c78",
+        18x"00bf4", 18x"00b70", 18x"00aec", 18x"00a68", 18x"009e4", 18x"00960", 18x"008dc", 18x"00858",
+        18x"007d6", 18x"00752", 18x"006d0", 18x"0064e", 18x"005cc", 18x"0054a", 18x"004c8", 18x"00446",
+        18x"003c4", 18x"00342", 18x"002c2", 18x"00240", 18x"001c0", 18x"00140", 18x"000c0", 18x"00040"
+        );
+
+    -- Left and right shifter with 120 bit input and 64 bit output.
+    -- Shifts inp left by shift bits and returns the upper 64 bits of
+    -- the result.  The shift parameter is interpreted as a signed
+    -- number in the range -64..63, with negative values indicating
+    -- right shifts.  Vacated bit positions are filled with zeros.
+    function shifter_64(inp: std_ulogic_vector(119 downto 0);
+                        shift: std_ulogic_vector(6 downto 0))
+        return std_ulogic_vector is
+        variable s1 : std_ulogic_vector(94 downto 0);       -- output of the coarse stage
+        variable s2 : std_ulogic_vector(70 downto 0);       -- output of the 8-bit-granular stage
+        variable result : std_ulogic_vector(63 downto 0);
+    begin
+        case shift(6 downto 5) is       -- coarse stage: offset of -64, -32, 0 or +32
+            when "00" =>                -- shift 0..31: no coarse shift
+                s1 := inp(119 downto 25);
+            when "01" =>                -- shift 32..63: left by 32
+                s1 := inp(87 downto 0) & "0000000";
+            when "10" =>                -- shift -64..-33: right by 64
+                s1 := x"0000000000000000" & inp(119 downto 89);
+            when others =>              -- "11", shift -32..-1: right by 32
+                s1 := x"00000000" & inp(119 downto 57);
+        end case;
+        case shift(4 downto 3) is       -- middle stage: further left shift of 0, 8, 16 or 24
+            when "00" =>
+                s2 := s1(94 downto 24);
+            when "01" =>
+                s2 := s1(86 downto 16);
+            when "10" =>
+                s2 := s1(78 downto 8);
+            when others =>
+                s2 := s1(70 downto 0);
+        end case;
+        case shift(2 downto 0) is       -- fine stage: further left shift of 0..7
+            when "000" =>
+                result := s2(70 downto 7);
+            when "001" =>
+                result := s2(69 downto 6);
+            when "010" =>
+                result := s2(68 downto 5);
+            when "011" =>
+                result := s2(67 downto 4);
+            when "100" =>
+                result := s2(66 downto 3);
+            when "101" =>
+                result := s2(65 downto 2);
+            when "110" =>
+                result := s2(64 downto 1);
+            when others =>
+                result := s2(63 downto 0);
+        end case;
+        return result;
+    end;
+
+    -- Generate a mask with 0-bits on the left and 1-bits on the right which
+    -- selects the bits that will be lost in doing a right shift.  The shift
+    -- parameter is the bottom 6 bits of a negative shift count, indicating a
+    -- right shift; the mask has 'shift' 0-bits followed by (64 - shift) 1-bits.
+    function right_mask(shift: unsigned(5 downto 0)) return std_ulogic_vector is
+        variable result: std_ulogic_vector(63 downto 0);
+    begin
+        result := (others => '0');
+        for i in 0 to 63 loop           -- i counts bit positions from the MSB (i = 0 is bit 63)
+            if i >= shift then
+                result(63 - i) := '1';
+            end if;
+        end loop;
+        return result;
+    end;
+
+    -- Split a DP floating-point number into components and work out its class.
+    -- If is_int = 1, the input is considered an integer (raw bits in mantissa, exponent 0)
+    function decode_dp(fpr: std_ulogic_vector(63 downto 0); is_int: std_ulogic) return fpu_reg_type is
+        variable r       : fpu_reg_type;
+        variable exp_nz  : std_ulogic;                  -- exponent field is non-zero
+        variable exp_ao  : std_ulogic;                  -- exponent field is all ones
+        variable frac_nz : std_ulogic;                  -- fraction field is non-zero
+        variable cls     : std_ulogic_vector(2 downto 0);
+    begin
+        r.negative := fpr(63);
+        exp_nz := or (fpr(62 downto 52));
+        exp_ao := and (fpr(62 downto 52));
+        frac_nz := or (fpr(51 downto 0));
+        if is_int = '0' then
+            r.exponent := signed(resize(unsigned(fpr(62 downto 52)), EXP_BITS)) - to_signed(1023, EXP_BITS);    -- remove the bias of 1023
+            if exp_nz = '0' then
+                r.exponent := to_signed(-1022, EXP_BITS);       -- zero exponent field (zero or denormal) is given exponent -1022
+            end if;
+            r.mantissa := "000000000" & exp_nz & fpr(51 downto 0) & "00";       -- unit bit (exp_nz) in bit 54, fraction in bits 53..2
+            cls := exp_ao & exp_nz & frac_nz;
+            case cls is
+                when "000"  => r.class := ZERO;
+                when "001"  => r.class := FINITE;    -- denormalized
+                when "010"  => r.class := FINITE;
+                when "011"  => r.class := FINITE;
+                when "110"  => r.class := INFINITY;
+                when others => r.class := NAN;       -- "111"; "10x" cannot occur because exp_ao implies exp_nz
+            end case;
+        else
+            r.mantissa := fpr;
+            r.exponent := (others => '0');
+            if (fpr(63) or exp_nz or frac_nz) = '1' then        -- i.e. any bit of fpr is set
+                r.class := FINITE;
+            else
+                r.class := ZERO;
+            end if;
+        end if;
+        return r;
+    end;
+
+    -- Construct a DP floating-point result (64 bits) from components; mantissa is read with the unit bit at bit 54
+    function pack_dp(sign: std_ulogic; class: fp_number_class; exp: signed(EXP_BITS-1 downto 0);
+                     mantissa: std_ulogic_vector; single_prec: std_ulogic; quieten_nan: std_ulogic)
+        return std_ulogic_vector is
+        variable result : std_ulogic_vector(63 downto 0);
+    begin
+        result := (others => '0');
+        result(63) := sign;
+        case class is
+            when ZERO =>                -- only the sign bit is set
+            when FINITE =>
+                if mantissa(54) = '1' then
+                    -- normalized number
+                    result(62 downto 52) := std_ulogic_vector(resize(exp, 11) + 1023);  -- add the bias of 1023
+                end if;                 -- otherwise the exponent field stays zero
+                result(51 downto 29) := mantissa(53 downto 31);         -- top 23 fraction bits, always copied
+                if single_prec = '0' then
+                    result(28 downto 0) := mantissa(30 downto 2);       -- low 29 fraction bits, left zero for single precision
+                end if;
+            when INFINITY =>
+                result(62 downto 52) := "11111111111";
+            when NAN =>
+                result(62 downto 52) := "11111111111";
+                result(51) := quieten_nan or mantissa(53);              -- quieten_nan forces the top fraction bit to 1
+                result(50 downto 29) := mantissa(52 downto 31);
+                if single_prec = '0' then
+                    result(28 downto 0) := mantissa(30 downto 2);
+                end if;
+        end case;
+        return result;
+    end;
+
+    -- Determine whether to increment when rounding
+    -- Returns rounding_inc & inexact, i.e. bit 1 = increment, bit 0 = inexact.
+    -- Assumes x includes the bottom 29 bits of the mantissa already
+    -- if single_prec = 1 (usually arranged by setting set_x = 1 earlier).
+    function fp_rounding(mantissa: std_ulogic_vector(63 downto 0); x: std_ulogic;
+                         single_prec: std_ulogic; rn: std_ulogic_vector(2 downto 0);
+                         sign: std_ulogic)
+        return std_ulogic_vector is
+        variable grx : std_ulogic_vector(2 downto 0);   -- the two mantissa bits below the LSB, followed by x
+        variable ret : std_ulogic_vector(1 downto 0);
+        variable lsb : std_ulogic;                      -- least significant mantissa bit that is kept
+    begin
+        if single_prec = '0' then
+            grx := mantissa(1 downto 0) & x;
+            lsb := mantissa(2);
+        else
+            grx := mantissa(30 downto 29) & x;
+            lsb := mantissa(31);
+        end if;
+        ret(1) := '0';
+        ret(0) := or (grx);             -- inexact if any bit below the LSB (or x) is set
+        case rn(1 downto 0) is
+            when "00" =>        -- round to nearest
+                if grx = "100" and rn(2) = '0' then     -- NOTE(review): rn(2) = '1' bypasses the tie-to-even rule; its source is outside this function
+                    ret(1) := lsb; -- tie, round to even
+                else
+                    ret(1) := grx(2);
+                end if;
+            when "01" =>        -- round towards zero
+            when others =>      -- round towards +/- inf
+                if rn(0) = sign then    -- "10" with sign = 0, or "11" with sign = 1
+                    -- round towards greater magnitude
+                    ret(1) := ret(0);
+                end if;
+        end case;
+        return ret;
+    end;
+
+    -- Determine result flags to write into the FPSCR (returns a 5-bit vector)
+    function result_flags(sign: std_ulogic; class: fp_number_class; unitbit: std_ulogic)
+        return std_ulogic_vector is     -- NOTE(review): bit order looks like the Power ISA FPRF field (C, FL, FG, FE, FU) -- confirm
+    begin
+        case class is
+            when ZERO =>
+                return sign & "0010";   -- top bit follows the sign
+            when FINITE =>
+                return (not unitbit) & sign & (not sign) & "00";        -- top bit set when the unit bit is clear
+            when INFINITY =>
+                return '0' & sign & (not sign) & "01";
+            when NAN =>
+                return "10001";         -- independent of sign
+        end case;
+    end;
+
+begin
+    fpu_multiply_0: entity work.multiply        -- instance of the multiply entity used by the FPU
+        port map (
+            clk => clk,
+            m_in => f_to_multiply,              -- operands and control from this unit
+            m_out => multiply_to_f              -- result returned to this unit
+            );
+
+    fpu_0: process(clk)                 -- state register: loads r from rin each clock, with synchronous reset
+    begin
+        if rising_edge(clk) then
+            if rst = '1' then           -- only the fields below are reset; other fields of r keep their value
+                r.state <= IDLE;
+                r.busy <= '0';
+                r.instr_done <= '0';
+                r.do_intr <= '0';
+                r.fpscr <= (others => '0');
+                r.writing_back <= '0';
+            else
+                assert not (r.state /= IDLE and e_in.valid = '1') severity failure;     -- a request must not arrive while not in IDLE
+                r <= rin;
+            end if;
+        end if;
+    end process;
+
+    -- synchronous reads from lookup table; inverse_est is registered on each rising clock edge
+    lut_access: process(clk)
+        variable addrhi : std_ulogic_vector(1 downto 0);        -- selects one of the four 256-entry blocks
+        variable addr   : std_ulogic_vector(9 downto 0);        -- full 10-bit table index
+    begin
+        if rising_edge(clk) then
+            if r.is_sqrt = '1' then
+                addrhi := r.b.mantissa(55 downto 54);   -- the two bits above the fraction of operand B
+            else
+                addrhi := "00";                         -- block 0: the 1/x table
+            end if;
+            addr := addrhi & r.b.mantissa(53 downto 46);        -- low 8 address bits are the top 8 fraction bits of B
+            inverse_est <= '1' & inverse_table(to_integer(unsigned(addr)));     -- prepend the top bit that the table does not store
+        end if;
+    end process;
+
+    e_out.busy <= r.busy;                       -- outputs are driven directly from the registered state r
+    e_out.exception <= r.fpscr(FPSCR_FEX);
+    e_out.interrupt <= r.do_intr;
+
+    w_out.valid <= r.instr_done and not r.do_intr;      -- no valid write-back when an interrupt is signalled
+    w_out.write_enable <= r.writing_back;
+    w_out.write_reg <= r.dest_fpr;
+    w_out.write_data <= fp_result;
+    w_out.write_cr_enable <= r.instr_done and (r.rc or r.is_cmp);      -- CR is written for rc or compare instructions
+    w_out.write_cr_mask <= r.cr_mask;
+    w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result &     -- 4-bit CR result replicated
+                           r.cr_result & r.cr_result & r.cr_result & r.cr_result;      -- 8 times to fill 32 bits
+
+    fpu_1: process(all)
+        variable v           : reg_type;
+        variable adec        : fpu_reg_type;
+        variable bdec        : fpu_reg_type;
+        variable cdec        : fpu_reg_type;
+        variable fpscr_mask  : std_ulogic_vector(31 downto 0);
+        variable illegal     : std_ulogic;
+        variable j, k        : integer;
+        variable flm         : std_ulogic_vector(7 downto 0);
+        variable int_input   : std_ulogic;
+        variable mask        : std_ulogic_vector(63 downto 0);
+        variable in_a0       : std_ulogic_vector(63 downto 0);
+        variable in_b0       : std_ulogic_vector(63 downto 0);
+        variable misc        : std_ulogic_vector(63 downto 0);
+        variable shift_res   : std_ulogic_vector(63 downto 0);
+        variable round       : std_ulogic_vector(1 downto 0);
+        variable update_fx   : std_ulogic;
+        variable arith_done  : std_ulogic;
+        variable invalid     : std_ulogic;
+        variable zero_divide : std_ulogic;
+        variable mant_nz     : std_ulogic;
+        variable min_exp     : signed(EXP_BITS-1 downto 0);
+        variable max_exp     : signed(EXP_BITS-1 downto 0);
+        variable bias_exp    : signed(EXP_BITS-1 downto 0);
+        variable new_exp     : signed(EXP_BITS-1 downto 0);
+        variable exp_tiny    : std_ulogic;
+        variable exp_huge    : std_ulogic;
+        variable renormalize : std_ulogic;
+        variable clz         : std_ulogic_vector(5 downto 0);
+        variable set_x       : std_ulogic;
+        variable mshift      : signed(EXP_BITS-1 downto 0);
+        variable need_check  : std_ulogic;
+        variable msb         : std_ulogic;
+        variable is_add      : std_ulogic;
+        variable set_a       : std_ulogic;
+        variable set_b       : std_ulogic;
+        variable set_c       : std_ulogic;
+        variable set_y       : std_ulogic;
+        variable set_s       : std_ulogic;
+        variable qnan_result : std_ulogic;
+        variable px_nz       : std_ulogic;
+        variable pcmpb_eq    : std_ulogic;
+        variable pcmpb_lt    : std_ulogic;
+        variable pshift      : std_ulogic;
+        variable renorm_sqrt : std_ulogic;
+        variable sqrt_exp    : signed(EXP_BITS-1 downto 0);
+        variable shiftin     : std_ulogic;
+        variable mulexp      : signed(EXP_BITS-1 downto 0);
+        variable maddend     : std_ulogic_vector(127 downto 0);
+        variable sum         : std_ulogic_vector(63 downto 0);
+    begin
+        v := r;
+        illegal := '0';
+        v.busy := '0';
+        int_input := '0';
+
+        -- capture incoming instruction
+        if e_in.valid = '1' then
+            v.insn := e_in.insn;
+            v.op := e_in.op;
+            v.fe_mode := or (e_in.fe_mode);
+            v.dest_fpr := e_in.frt;
+            v.single_prec := e_in.single;
+            v.longmask := e_in.single;
+            v.int_result := '0';
+            v.rc := e_in.rc;
+            v.is_cmp := e_in.out_cr;
+            if e_in.out_cr = '0' then
+                v.cr_mask := num_to_fxm(1);
+            else
+                v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn))));
+            end if;
+            int_input := '0';
+            if e_in.op = OP_FPOP_I then
+                int_input := '1';
+            end if;
+            v.quieten_nan := '1';
+            v.tiny := '0';
+            v.denorm := '0';
+            v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN);
+            v.is_subtract := '0';
+            v.is_multiply := '0';
+            v.is_sqrt := '0';
+            v.add_bsmall := '0';
+            v.doing_ftdiv := "00";
+
+            adec := decode_dp(e_in.fra, int_input);
+            bdec := decode_dp(e_in.frb, int_input);
+            cdec := decode_dp(e_in.frc, int_input);
+            v.a := adec;
+            v.b := bdec;
+            v.c := cdec;
+
+            v.exp_cmp := '0';
+            if adec.exponent > bdec.exponent then
+                v.exp_cmp := '1';
+            end if;
+            v.madd_cmp := '0';
+            if (adec.exponent + cdec.exponent + 1) >= bdec.exponent then
+                v.madd_cmp := '1';
+            end if;
+        end if;
+
+        r_hi_nz <= or (r.r(55 downto 31));
+        r_lo_nz <= or (r.r(30 downto 2));
+        s_nz <= or (r.s);
+
+        if r.single_prec = '0' then
+            if r.doing_ftdiv(1) = '0' then
+                max_exp := to_signed(1023, EXP_BITS);
+            else
+                max_exp := to_signed(1020, EXP_BITS);
+            end if;
+            if r.doing_ftdiv(0) = '0' then
+                min_exp := to_signed(-1022, EXP_BITS);
+            else
+                min_exp := to_signed(-1021, EXP_BITS);
+            end if;
+            bias_exp := to_signed(1536, EXP_BITS);
+        else
+            max_exp := to_signed(127, EXP_BITS);
+            min_exp := to_signed(-126, EXP_BITS);
+            bias_exp := to_signed(192, EXP_BITS);
+        end if;
+        new_exp := r.result_exp - r.shift;
+        exp_tiny := '0';
+        exp_huge := '0';
+        if new_exp < min_exp then
+            exp_tiny := '1';
+        end if;
+        if new_exp > max_exp then
+            exp_huge := '1';
+        end if;
+
+        -- Compare P with zero and with B
+        px_nz := or (r.p(57 downto 4));
+        pcmpb_eq := '0';
+        if r.p(59 downto 4) = r.b.mantissa(55 downto 0) then
+            pcmpb_eq := '1';
+        end if;
+        pcmpb_lt := '0';
+        if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(55 downto 0)) then
+            pcmpb_lt := '1';
+        end if;
+
+        v.writing_back := '0';
+        v.instr_done := '0';
+        v.update_fprf := '0';
+        v.shift := to_signed(0, EXP_BITS);
+        v.first := '0';
+        v.opsel_a := AIN_R;
+        opsel_ainv <= '0';
+        opsel_mask <= '0';
+        opsel_b <= BIN_ZERO;
+        opsel_binv <= '0';
+        opsel_r <= RES_SUM;
+        opsel_s <= S_ZERO;
+        carry_in <= '0';
+        misc_sel <= "0000";
+        fpscr_mask := (others => '1');
+        update_fx := '0';
+        arith_done := '0';
+        invalid := '0';
+        zero_divide := '0';
+        renormalize := '0';
+        set_x := '0';
+        qnan_result := '0';
+        set_a := '0';
+        set_b := '0';
+        set_c := '0';
+        set_s := '0';
+        f_to_multiply.is_32bit <= '0';
+        f_to_multiply.valid <= '0';
+        msel_1 <= MUL1_A;
+        msel_2 <= MUL2_C;
+        msel_add <= MULADD_ZERO;
+        msel_inv <= '0';
+        set_y := '0';
+        pshift := '0';
+        renorm_sqrt := '0';
+        shiftin := '0';
+        case r.state is
+            when IDLE =>
+                v.use_a := '0';
+                v.use_b := '0';
+                v.use_c := '0';
+                v.invalid := '0';
+                v.negate := '0';
+                if e_in.valid = '1' then
+                    case e_in.insn(5 downto 1) is
+                        when "00000" =>
+                            if e_in.insn(8) = '1' then
+                                if e_in.insn(6) = '0' then
+                                    v.state := DO_FTDIV;
+                                else
+                                    v.state := DO_FTSQRT;
+                                end if;
+                            elsif e_in.insn(7) = '1' then
+                                v.state := DO_MCRFS;
+                            else
+                                v.opsel_a := AIN_B;
+                                v.state := DO_FCMP;
+                            end if;
+                        when "00110" =>
+                            if e_in.insn(10) = '0' then
+                                if e_in.insn(8) = '0' then
+                                    v.state := DO_MTFSB;
+                                else
+                                    v.state := DO_MTFSFI;
+                                end if;
+                            else
+                                v.state := DO_FMRG;
+                            end if;
+                        when "00111" =>
+                            if e_in.insn(8) = '0' then
+                                v.state := DO_MFFS;
+                            else
+                                v.state := DO_MTFSF;
+                            end if;
+                        when "01000" =>
+                            v.opsel_a := AIN_B;
+                            if e_in.insn(9 downto 8) /= "11" then
+                                v.state := DO_FMR;
+                            else
+                                v.state := DO_FRI;
+                            end if;
+                        when "01100" =>
+                            v.opsel_a := AIN_B;
+                            v.state := DO_FRSP;
+                        when "01110" =>
+                            v.opsel_a := AIN_B;
+                            if int_input = '1' then
+                                -- fcfid[u][s]
+                                v.state := DO_FCFID;
+                            else
+                                v.state := DO_FCTI;
+                            end if;
+                        when "01111" =>
+                            v.round_mode := "001";
+                            v.opsel_a := AIN_B;
+                            v.state := DO_FCTI;
+                        when "10010" =>
+                            v.opsel_a := AIN_A;
+                            if v.b.mantissa(54) = '0' and v.a.mantissa(54) = '1' then
+                                v.opsel_a := AIN_B;
+                            end if;
+                            v.state := DO_FDIV;
+                        when "10100" | "10101" =>
+                            v.opsel_a := AIN_A;
+                            v.state := DO_FADD;
+                        when "10110" =>
+                            v.is_sqrt := '1';
+                            v.opsel_a := AIN_B;
+                            v.state := DO_FSQRT;
+                        when "10111" =>
+                            v.state := DO_FSEL;
+                        when "11000" =>
+                            v.opsel_a := AIN_B;
+                            v.state := DO_FRE;
+                        when "11001" =>
+                            v.is_multiply := '1';
+                            v.opsel_a := AIN_A;
+                            if v.c.mantissa(54) = '0' and v.a.mantissa(54) = '1' then
+                                v.opsel_a := AIN_C;
+                            end if;
+                            v.state := DO_FMUL;
+                        when "11010" =>
+                            v.is_sqrt := '1';
+                            v.opsel_a := AIN_B;
+                            v.state := DO_FRSQRTE;
+                        when "11100" | "11101" | "11110" | "11111" =>
+                            if v.a.mantissa(54) = '0' then
+                                v.opsel_a := AIN_A;
+                            elsif v.c.mantissa(54) = '0' then
+                                v.opsel_a := AIN_C;
+                            else
+                                v.opsel_a := AIN_B;
+                            end if;
+                            v.state := DO_FMADD;
+                        when others =>
+                            illegal := '1';
+                    end case;
+                end if;
+                v.x := '0';
+                v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX);
+                set_s := '1';
+
+            when DO_MCRFS =>
+                j := to_integer(unsigned(insn_bfa(r.insn)));
+                for i in 0 to 7 loop
+                    if i = j then
+                        k := (7 - i) * 4;
+                        v.cr_result := r.fpscr(k + 3 downto k);
+                        fpscr_mask(k + 3 downto k) := "0000";
+                    end if;
+                end loop;
+                v.fpscr := r.fpscr and (fpscr_mask or x"6007F8FF");
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when DO_FTDIV =>
+                v.instr_done := '1';
+                v.state := IDLE;
+                v.cr_result := "0000";
+                if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or
+                    (r.b.class = FINITE and r.b.mantissa(53) = '0') then
+                    v.cr_result(2) := '1';
+                end if;
+                if r.a.class = NAN or r.a.class = INFINITY or
+                    r.b.class = NAN or r.b.class = ZERO or r.b.class = INFINITY or
+                    (r.a.class = FINITE and r.a.exponent <= to_signed(-970, EXP_BITS)) then
+                    v.cr_result(1) := '1';
+                else
+                    v.doing_ftdiv := "11";
+                    v.first := '1';
+                    v.state := FTDIV_1;
+                    v.instr_done := '0';
+                end if;
+
+            when DO_FTSQRT =>
+                v.instr_done := '1';
+                v.state := IDLE;
+                v.cr_result := "0000";
+                if r.b.class = ZERO or r.b.class = INFINITY or
+                    (r.b.class = FINITE and r.b.mantissa(53) = '0') then
+                    v.cr_result(2) := '1';
+                end if;
+                if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO
+                    or r.b.negative = '1' or r.b.exponent <= to_signed(-970, EXP_BITS) then
+                    v.cr_result(1) := '0';
+                end if;
+
+            when DO_FCMP =>
+                -- fcmp[uo]
+                -- r.opsel_a = AIN_B
+                v.instr_done := '1';
+                v.state := IDLE;
+                update_fx := '1';
+                v.result_exp := r.b.exponent;
+                if (r.a.class = NAN and r.a.mantissa(53) = '0') or
+                    (r.b.class = NAN and r.b.mantissa(53) = '0') then
+                    -- Signalling NAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    if r.insn(6) = '1' and r.fpscr(FPSCR_VE) = '0' then
+                        v.fpscr(FPSCR_VXVC) := '1';
+                    end if;
+                    invalid := '1';
+                    v.cr_result := "0001";          -- unordered
+                elsif r.a.class = NAN or r.b.class = NAN then
+                    if r.insn(6) = '1' then
+                        -- fcmpo
+                        v.fpscr(FPSCR_VXVC) := '1';
+                        invalid := '1';
+                    end if;
+                    v.cr_result := "0001";          -- unordered
+                elsif r.a.class = ZERO and r.b.class = ZERO then
+                    v.cr_result := "0010";          -- equal
+                elsif r.a.negative /= r.b.negative then
+                    v.cr_result := r.a.negative & r.b.negative & "00";
+                elsif r.a.class = ZERO then
+                    -- A and B are the same sign from here down
+                    v.cr_result := not r.b.negative & r.b.negative & "00";
+                elsif r.a.class = INFINITY then
+                    if r.b.class = INFINITY then
+                        v.cr_result := "0010";
+                    else
+                        v.cr_result := r.a.negative & not r.a.negative & "00";
+                    end if;
+                elsif r.b.class = ZERO then
+                    -- A is finite from here down
+                    v.cr_result := r.a.negative & not r.a.negative & "00";
+                elsif r.b.class = INFINITY then
+                    v.cr_result := not r.b.negative & r.b.negative & "00";
+                elsif r.exp_cmp = '1' then
+                    -- A and B are both finite from here down
+                    v.cr_result := r.a.negative & not r.a.negative & "00";
+                elsif r.a.exponent /= r.b.exponent then
+                    -- A exponent is smaller than B
+                    v.cr_result := not r.a.negative & r.a.negative & "00";
+                else
+                    -- Prepare to subtract mantissas, put B in R
+                    v.cr_result := "0000";
+                    v.instr_done := '0';
+                    v.opsel_a := AIN_A;
+                    v.state := CMP_1;
+                end if;
+                v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;
+
+            when DO_MTFSB =>
+                -- mtfsb{0,1}
+                j := to_integer(unsigned(insn_bt(r.insn)));
+                for i in 0 to 31 loop
+                    if i = j then
+                        v.fpscr(31 - i) := r.insn(6);
+                    end if;
+                end loop;
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when DO_MTFSFI =>
+                -- mtfsfi
+                j := to_integer(unsigned(insn_bf(r.insn)));
+                if r.insn(16) = '0' then
+                    for i in 0 to 7 loop
+                        if i = j then
+                            k := (7 - i) * 4;
+                            v.fpscr(k + 3 downto k) := insn_u(r.insn);
+                        end if;
+                    end loop;
+                end if;
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when DO_FMRG =>
+                -- fmrgew, fmrgow
+                opsel_r <= RES_MISC;
+                misc_sel <= "01" & r.insn(8) & '0';
+                v.int_result := '1';
+                v.writing_back := '1';
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when DO_MFFS =>
+                v.int_result := '1';
+                v.writing_back := '1';
+                opsel_r <= RES_MISC;
+                case r.insn(20 downto 16) is
+                    when "00000" =>
+                        -- mffs
+                    when "00001" =>
+                        -- mffsce
+                        v.fpscr(FPSCR_VE downto FPSCR_XE) := "00000";
+                    when "10100" | "10101" =>
+                        -- mffscdrn[i] (but we don't implement DRN)
+                        fpscr_mask := x"000000FF";
+                    when "10110" =>
+                        -- mffscrn
+                        fpscr_mask := x"000000FF";
+                        v.fpscr(FPSCR_RN+1 downto FPSCR_RN) :=
+                            r.b.mantissa(FPSCR_RN+1 downto FPSCR_RN);
+                    when "10111" =>
+                        -- mffscrni
+                        fpscr_mask := x"000000FF";
+                        v.fpscr(FPSCR_RN+1 downto FPSCR_RN) := r.insn(12 downto 11);
+                    when "11000" =>
+                        -- mffsl
+                        fpscr_mask := x"0007F0FF";
+                    when others =>
+                        illegal := '1';
+                end case;
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when DO_MTFSF =>
+                if r.insn(25) = '1' then
+                    flm := x"FF";
+                elsif r.insn(16) = '1' then
+                    flm := x"00";
+                else
+                    flm := r.insn(24 downto 17);
+                end if;
+                for i in 0 to 7 loop
+                    k := i * 4;
+                    if flm(i) = '1' then
+                        v.fpscr(k + 3 downto k) := r.b.mantissa(k + 3 downto k);
+                    end if;
+                end loop;
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when DO_FMR =>
+                -- r.opsel_a = AIN_B
+                v.result_class := r.b.class;
+                v.result_exp := r.b.exponent;
+                v.quieten_nan := '0';
+                if r.insn(9) = '1' then
+                    v.result_sign := '0';              -- fabs
+                elsif r.insn(8) = '1' then
+                    v.result_sign := '1';              -- fnabs
+                elsif r.insn(7) = '1' then
+                    v.result_sign := r.b.negative;     -- fmr
+                elsif r.insn(6) = '1' then
+                    v.result_sign := not r.b.negative; -- fneg
+                else
+                    v.result_sign := r.a.negative;     -- fcpsgn
+                end if;
+                v.writing_back := '1';
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when DO_FRI =>    -- fri[nzpm]
+                -- r.opsel_a = AIN_B
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.result_exp := r.b.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                    -- Signalling NAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                if r.b.class = FINITE then
+                    if r.b.exponent >= to_signed(52, EXP_BITS) then
+                        -- integer already, no rounding required
+                        arith_done := '1';
+                    else
+                        v.shift := r.b.exponent - to_signed(52, EXP_BITS);
+                        v.state := FRI_1;
+                        v.round_mode := '1' & r.insn(7 downto 6);
+                    end if;
+                else
+                    arith_done := '1';
+                end if;
+
+            when DO_FRSP =>
+                -- r.opsel_a = AIN_B, r.shift = 0
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.result_exp := r.b.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                    -- Signalling NAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                set_x := '1';
+                if r.b.class = FINITE then
+                    if r.b.exponent < to_signed(-126, EXP_BITS) then
+                        v.shift := r.b.exponent - to_signed(-126, EXP_BITS);
+                        v.state := ROUND_UFLOW;
+                    elsif r.b.exponent > to_signed(127, EXP_BITS) then
+                        v.state := ROUND_OFLOW;
+                    else
+                        v.shift := to_signed(-2, EXP_BITS);
+                        v.state := ROUNDING;
+                    end if;
+                else
+                    arith_done := '1';
+                end if;
+
+            when DO_FCTI =>
+                -- instr bit 9: 1=dword 0=word
+                -- instr bit 8: 1=unsigned 0=signed
+                -- instr bit 1: 1=round to zero 0=use fpscr[RN]
+                -- r.opsel_a = AIN_B
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.result_exp := r.b.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                    -- Signalling NAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+
+                v.int_result := '1';
+                case r.b.class is
+                    when ZERO =>
+                        arith_done := '1';
+                    when FINITE =>
+                        if r.b.exponent >= to_signed(64, EXP_BITS) or
+                            (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then
+                            v.state := INT_OFLOW;
+                        elsif r.b.exponent >= to_signed(52, EXP_BITS) then
+                            -- integer already, no rounding required,
+                            -- shift into final position
+                            v.shift := r.b.exponent - to_signed(54, EXP_BITS);
+                            if r.insn(8) = '1' and r.b.negative = '1' then
+                                v.state := INT_OFLOW;
+                            else
+                                v.state := INT_ISHIFT;
+                            end if;
+                        else
+                            v.shift := r.b.exponent - to_signed(52, EXP_BITS);
+                            v.state := INT_SHIFT;
+                        end if;
+                    when INFINITY | NAN =>
+                        v.state := INT_OFLOW;
+                end case;
+
+            when DO_FCFID =>
+                -- r.opsel_a = AIN_B
+                v.result_sign := '0';
+                if r.insn(8) = '0' and r.b.negative = '1' then
+                    -- fcfid[s] with negative operand, set R = -B
+                    opsel_ainv <= '1';
+                    carry_in <= '1';
+                    v.result_sign := '1';
+                end if;
+                v.result_class := r.b.class;
+                v.result_exp := to_signed(54, EXP_BITS);
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = ZERO then
+                    arith_done := '1';
+                else
+                    v.state := FINISH;
+                end if;
+
+            when DO_FADD =>
+                -- fadd[s] and fsub[s]
+                -- r.opsel_a = AIN_A
+                v.result_sign := r.a.negative;
+                v.result_class := r.a.class;
+                v.result_exp := r.a.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_b := '1';
+                is_add := r.a.negative xor r.b.negative xor r.insn(1);
+                if r.a.class = FINITE and r.b.class = FINITE then
+                    v.is_subtract := not is_add;
+                    v.add_bsmall := r.exp_cmp;
+                    v.opsel_a := AIN_B;
+                    if r.exp_cmp = '0' then
+                        v.shift := r.a.exponent - r.b.exponent;
+                        v.result_sign := r.b.negative xnor r.insn(1);
+                        if r.a.exponent = r.b.exponent then
+                            v.state := ADD_2;
+                        else
+                            v.longmask := '0';
+                            v.state := ADD_SHIFT;
+                        end if;
+                    else
+                        v.state := ADD_1;
+                    end if;
+                else
+                    if r.a.class = NAN or r.b.class = NAN then
+                        v.state := NAN_RESULT;
+                    elsif r.a.class = INFINITY and r.b.class = INFINITY and is_add = '0' then
+                        -- invalid operation, construct QNaN
+                        v.fpscr(FPSCR_VXISI) := '1';
+                        qnan_result := '1';
+                        arith_done := '1';
+                    elsif r.a.class = ZERO and r.b.class = ZERO and is_add = '0' then
+                        -- return -0 for rounding to -infinity
+                        v.result_sign := r.round_mode(1) and r.round_mode(0);
+                        arith_done := '1';
+                    elsif r.a.class = INFINITY or r.b.class = ZERO then
+                        -- result is A
+                        v.opsel_a := AIN_A;
+                        v.state := EXC_RESULT;
+                    else
+                        -- result is +/- B
+                        v.opsel_a := AIN_B;
+                        v.negate := not r.insn(1);
+                        v.state := EXC_RESULT;
+                    end if;
+                end if;
+
+            when DO_FMUL =>
+                -- fmul[s]
+                -- r.opsel_a = AIN_A unless C is denorm and A isn't
+                v.result_sign := r.a.negative xor r.c.negative;
+                v.result_class := r.a.class;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_c := '1';
+                if r.a.class = FINITE and r.c.class = FINITE then
+                    v.result_exp := r.a.exponent + r.c.exponent;
+                    -- Renormalize denorm operands
+                    if r.a.mantissa(54) = '0' then
+                        v.state := RENORM_A;
+                    elsif r.c.mantissa(54) = '0' then
+                        v.state := RENORM_C;
+                    else
+                        f_to_multiply.valid <= '1';
+                        v.state := MULT_1;
+                    end if;
+                else
+                    if r.a.class = NAN or r.c.class = NAN then
+                        v.state := NAN_RESULT;
+                    elsif (r.a.class = INFINITY and r.c.class = ZERO) or
+                        (r.a.class = ZERO and r.c.class = INFINITY) then
+                        -- invalid operation, construct QNaN
+                        v.fpscr(FPSCR_VXIMZ) := '1';
+                        qnan_result := '1';
+                    elsif r.a.class = ZERO or r.a.class = INFINITY then
+                        -- result is +/- A
+                        arith_done := '1';
+                    else
+                        -- r.c.class is ZERO or INFINITY
+                        v.opsel_a := AIN_C;
+                        v.negate := r.a.negative;
+                        v.state := EXC_RESULT;
+                    end if;
+                end if;
+
+            when DO_FDIV =>
+                -- r.opsel_a = AIN_A unless B is denorm and A isn't
+                v.result_class := r.a.class;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_b := '1';
+                v.result_sign := r.a.negative xor r.b.negative;
+                v.result_exp := r.a.exponent - r.b.exponent;
+                v.count := "00";
+                if r.a.class = FINITE and r.b.class = FINITE then
+                    -- Renormalize denorm operands
+                    if r.a.mantissa(54) = '0' then
+                        v.state := RENORM_A;
+                    elsif r.b.mantissa(54) = '0' then
+                        v.state := RENORM_B;
+                    else
+                        v.first := '1';
+                        v.state := DIV_2;
+                    end if;
+                else
+                    if r.a.class = NAN or r.b.class = NAN then
+                        v.state := NAN_RESULT;
+                    elsif r.b.class = INFINITY then
+                        if r.a.class = INFINITY then
+                            v.fpscr(FPSCR_VXIDI) := '1';
+                            qnan_result := '1';
+                        else
+                            v.result_class := ZERO;
+                        end if;
+                        arith_done := '1';
+                    elsif r.b.class = ZERO then
+                        if r.a.class = ZERO then
+                            v.fpscr(FPSCR_VXZDZ) := '1';
+                            qnan_result := '1';
+                        else
+                            if r.a.class = FINITE then
+                                zero_divide := '1';
+                            end if;
+                            v.result_class := INFINITY;
+                        end if;
+                        arith_done := '1';
+                    else -- r.b.class = FINITE, result_class = r.a.class
+                        arith_done := '1';
+                    end if;
+                end if;
+
+            when DO_FSEL =>
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then
+                    v.opsel_a := AIN_C;
+                else
+                    v.opsel_a := AIN_B;
+                end if;
+                v.quieten_nan := '0';
+                v.state := EXC_RESULT;
+
+            when DO_FSQRT =>
+                -- Square-root dispatch on the class of B; on entry r.opsel_a = AIN_B
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                v.use_b := '1';
+                case r.b.class is
+                    when FINITE =>
+                        v.result_exp := r.b.exponent;
+                        if r.b.negative = '1' then  -- negative finite operand: flag VXSQRT and request a QNaN result
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';  -- NOTE(review): no arith_done here, unlike the INFINITY branch below; presumably the qnan_result handling outside this case completes the op -- confirm
+                        elsif r.b.mantissa(54) = '0' then  -- mantissa bit 54 clear: B must be renormalized first
+                            v.state := RENORM_B;
+                        elsif r.b.exponent(0) = '0' then  -- even exponent: go straight to SQRT_1
+                            v.state := SQRT_1;
+                        else  -- odd exponent: pass through RENORM_B2 with shift = 1
+                            v.shift := to_signed(1, EXP_BITS);
+                            v.state := RENORM_B2;
+                        end if;
+                    when NAN =>
+                        v.state := NAN_RESULT;
+                    when ZERO =>
+                        -- result is B
+                        arith_done := '1';
+                    when INFINITY =>
+                        if r.b.negative = '1' then  -- -infinity: flag VXSQRT and request a QNaN result
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                        -- else result is B
+                        end if;
+                        arith_done := '1';
+                end case;
+
+            when DO_FRE =>
+                -- Dispatch on the class of B; on entry r.opsel_a = AIN_B
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                v.use_b := '1';
+                case r.b.class is
+                    when FINITE =>
+                        v.result_exp := - r.b.exponent;  -- result exponent starts as the negated exponent of B
+                        if r.b.mantissa(54) = '0' then  -- mantissa bit 54 clear: renormalize B first
+                            v.state := RENORM_B;
+                        else
+                            v.state := FRE_1;
+                        end if;
+                    when NAN =>
+                        v.state := NAN_RESULT;
+                    when INFINITY =>  -- infinite B gives a zero result (sign of B kept above)
+                        v.result_class := ZERO;
+                        arith_done := '1';
+                    when ZERO =>  -- zero B gives an infinite result and raises zero_divide
+                        v.result_class := INFINITY;
+                        zero_divide := '1';
+                        arith_done := '1';
+                end case;
+
+            when DO_FRSQRTE =>
+                -- Dispatch on the class of B; on entry r.opsel_a = AIN_B
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                v.use_b := '1';
+                v.shift := to_signed(1, EXP_BITS);  -- shift = 1 is set on every path through this state
+                case r.b.class is
+                    when FINITE =>
+                        v.result_exp := r.b.exponent;
+                        if r.b.negative = '1' then  -- negative finite operand: flag VXSQRT and request a QNaN result
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                        elsif r.b.mantissa(54) = '0' then  -- mantissa bit 54 clear: renormalize B first
+                            v.state := RENORM_B;
+                        elsif r.b.exponent(0) = '0' then  -- even exponent: go straight to RSQRT_1
+                            v.state := RSQRT_1;
+                        else  -- odd exponent: pass through RENORM_B2 first
+                            v.state := RENORM_B2;
+                        end if;
+                    when NAN =>
+                        v.state := NAN_RESULT;
+                    when INFINITY =>
+                        if r.b.negative = '1' then  -- -infinity: flag VXSQRT and request a QNaN result
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                        else  -- +infinity gives a zero result
+                            v.result_class := ZERO;
+                        end if;
+                        arith_done := '1';
+                    when ZERO =>  -- zero B gives an infinite result and raises zero_divide
+                        v.result_class := INFINITY;
+                        zero_divide := '1';
+                        arith_done := '1';
+                end case;
+
+            when DO_FMADD =>
+                -- Multiply-add dispatch for fmadd, fmsub, fnmadd, fnmsub (operands A, C and B)
+                -- r.opsel_a = AIN_A if A is denorm, else AIN_C if C is denorm,
+                -- else AIN_B
+                v.result_sign := r.a.negative;
+                v.result_class := r.a.class;
+                v.result_exp := r.a.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_b := '1';
+                v.use_c := '1';
+                is_add := r.a.negative xor r.c.negative xor r.b.negative xor r.insn(1);  -- the three operand signs XORed with insn(1)
+                if r.a.class = FINITE and r.c.class = FINITE and
+                    (r.b.class = FINITE or r.b.class = ZERO) then
+                    v.is_subtract := not is_add;
+                    mulexp := r.a.exponent + r.c.exponent;  -- exponent of the product A * C
+                    v.result_exp := mulexp;
+                    -- Make sure A and C are normalized
+                    if r.a.mantissa(54) = '0' then
+                        v.state := RENORM_A;
+                    elsif r.c.mantissa(54) = '0' then
+                        v.state := RENORM_C;
+                    elsif r.b.class = ZERO then
+                        -- no addend, degenerates to multiply
+                        v.result_sign := r.a.negative xor r.c.negative xor r.insn(2);
+                        f_to_multiply.valid <= '1';
+                        v.is_multiply := '1';
+                        v.state := MULT_1;
+                    elsif r.madd_cmp = '0' then
+                        -- addend is bigger, do multiply first
+                        v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
+                        f_to_multiply.valid <= '1';
+                        v.state := FMADD_1;
+                    else
+                        -- product is bigger, shift B right and use it as the
+                        -- addend to the multiplier
+                        v.shift := r.b.exponent - mulexp + to_signed(64, EXP_BITS);
+                        -- for subtract, multiplier does B - A * C
+                        v.result_sign := not (r.a.negative xor r.c.negative xor r.insn(2) xor is_add);
+                        v.result_exp := r.b.exponent;
+                        v.state := FMADD_2;
+                    end if;
+                else
+                    if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then
+                        v.state := NAN_RESULT;
+                    elsif (r.a.class = ZERO and r.c.class = INFINITY) or
+                        (r.a.class = INFINITY and r.c.class = ZERO) then
+                        -- invalid operation, construct QNaN
+                        v.fpscr(FPSCR_VXIMZ) := '1';
+                        qnan_result := '1';  -- NOTE(review): arith_done is not set on the QNaN paths of this state; presumably implied by qnan_result outside this case -- confirm
+                    elsif r.a.class = INFINITY or r.c.class = INFINITY then
+                        if r.b.class = INFINITY and is_add = '0' then
+                            -- invalid operation, construct QNaN
+                            v.fpscr(FPSCR_VXISI) := '1';
+                            qnan_result := '1';
+                        else
+                            -- result is infinity
+                            v.result_class := INFINITY;
+                            v.result_sign := r.a.negative xor r.c.negative xor r.insn(2);
+                            arith_done := '1';
+                        end if;
+                    else
+                        -- Here A is zero, C is zero, or B is infinity
+                        -- Result is +/-B in all of those cases
+                        v.opsel_a := AIN_B;
+                        if r.b.class /= ZERO or is_add = '1' then
+                            v.negate := not (r.insn(1) xor r.insn(2));
+                        else
+                            -- have to be careful about rule for 0 - 0 result sign
+                            v.negate := r.b.negative xor (r.round_mode(1) and r.round_mode(0)) xor r.insn(2);
+                        end if;
+                        v.state := EXC_RESULT;  -- B is passed through EXC_RESULT with the negate flag chosen above
+                    end if;
+                end if;
+
+            when RENORM_A =>
+                -- Request renormalization; preselect operand C when insn(4) = 1, otherwise B
+                v.opsel_a := AIN_B;
+                if r.insn(4) = '1' then
+                    v.opsel_a := AIN_C;
+                end if;
+                renormalize := '1';
+                v.state := RENORM_A2;
+
+            when RENORM_A2 =>
+                -- Second renormalize cycle for A; r.opsel_a = AIN_C for fmul/fmadd, AIN_B for fdiv
+                set_a := '1';
+                v.result_exp := new_exp;
+                if r.insn(4) = '1' then  -- path where operand C is in use
+                    if r.c.mantissa(54) = '1' then  -- C needs no renormalization
+                        if r.insn(3) = '0' or r.b.class = ZERO then  -- no addend to align: go multiply
+                            v.first := '1';
+                            v.state := MULT_1;
+                        else
+                            v.madd_cmp := '0';
+                            if new_exp + 1 >= r.b.exponent then  -- recompute madd_cmp from the updated exponent
+                                v.madd_cmp := '1';
+                            end if;
+                            v.opsel_a := AIN_B;
+                            v.state := DO_FMADD;  -- re-enter DO_FMADD now that A is normalized
+                        end if;
+                    else
+                        v.state := RENORM_C;
+                    end if;
+                else  -- path where operand B is in use
+                    if r.b.mantissa(54) = '1' then  -- B needs no renormalization
+                        v.first := '1';
+                        v.state := DIV_2;
+                    else
+                        v.state := RENORM_B;
+                    end if;
+                end if;
+
+            when RENORM_B =>  -- first renormalize cycle for B; RENORM_B2 follows
+                renormalize := '1';
+                renorm_sqrt := r.is_sqrt;  -- renorm_sqrt mirrors r.is_sqrt for this request
+                v.state := RENORM_B2;
+
+            when RENORM_B2 =>
+                -- Update B (set_b), choose the result exponent, select B and move on to LOOKUP
+                v.opsel_a := AIN_B;
+                v.result_exp := new_exp;
+                if r.is_sqrt = '0' then
+                    v.result_exp := r.result_exp + r.shift;
+                end if;
+                set_b := '1';
+                v.state := LOOKUP;
+
+            when RENORM_C =>  -- first renormalize cycle for C; RENORM_C2 follows
+                renormalize := '1';
+                v.state := RENORM_C2;
+
+            when RENORM_C2 =>  -- second renormalize cycle for C
+                set_c := '1';
+                v.result_exp := new_exp;
+                if r.insn(3) = '0' or r.b.class = ZERO then  -- no addend to align: go multiply
+                    v.first := '1';
+                    v.state := MULT_1;
+                else
+                    v.madd_cmp := '0';
+                    if new_exp + 1 >= r.b.exponent then  -- recompute madd_cmp from the updated exponent
+                        v.madd_cmp := '1';
+                    end if;
+                    v.opsel_a := AIN_B;
+                    v.state := DO_FMADD;  -- re-enter DO_FMADD now that C is normalized
+                end if;
+
+            when ADD_1 =>
+                -- transferring B to R
+                v.shift := r.b.exponent - r.a.exponent;  -- exponent difference, consumed by ADD_SHIFT
+                v.result_exp := r.b.exponent;
+                v.longmask := '0';  -- ADD_SHIFT expects longmask = 0
+                v.state := ADD_SHIFT;
+
+            when ADD_SHIFT =>
+                -- On entry: r.shift = - exponent difference, r.longmask = 0
+                -- Operand for ADD_2 is A when add_bsmall is set, otherwise B
+                v.opsel_a := AIN_B;
+                if r.add_bsmall = '1' then
+                    v.opsel_a := AIN_A;
+                end if;
+                v.longmask := r.single_prec;
+                v.x := s_nz;
+                set_x := '1';
+                opsel_r <= RES_SHIFT;
+                v.state := ADD_2;
+
+            when ADD_2 =>
+                -- r.opsel_a = AIN_A if r.add_bsmall = 1 else AIN_B
+                opsel_b <= BIN_R;  -- second adder input is R
+                opsel_binv <= r.is_subtract;  -- invert that input when subtracting
+                carry_in <= r.is_subtract and not r.x;  -- carry in for a subtract only when X is clear
+                v.shift := to_signed(-1, EXP_BITS);  -- ADD_3 expects r.shift = -1
+                v.state := ADD_3;
+
+            when ADD_3 =>
+                -- check for overflow or negative result (can't get both)
+                -- r.shift = -1
+                if r.r(63) = '1' then
+                    -- result is opposite sign to expected
+                    v.result_sign := not r.result_sign;
+                    opsel_ainv <= '1';  -- inverted input plus carry_in below: negates the value
+                    carry_in <= '1';
+                    v.state := FINISH;
+                elsif r.r(55) = '1' then
+                    -- sum overflowed, shift right
+                    opsel_r <= RES_SHIFT;
+                    set_x := '1';
+                    v.shift := to_signed(-2, EXP_BITS);
+                    if exp_huge = '1' then
+                        v.state := ROUND_OFLOW;
+                    else
+                        v.state := ROUNDING;
+                    end if;
+                elsif r.r(54) = '1' then  -- bit 54 set and no overflow: go straight to rounding
+                    set_x := '1';
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                elsif (r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then  -- none of the tested R bits is set: result is zero
+                    -- r.x must be zero at this point
+                    v.result_class := ZERO;
+                    if r.is_subtract = '1' then
+                        -- set result sign depending on rounding mode
+                        v.result_sign := r.round_mode(1) and r.round_mode(0);
+                    end if;
+                    arith_done := '1';
+                else  -- non-zero with leading zeroes: normalize
+                    renormalize := '1';
+                    v.state := NORMALIZE;
+                end if;
+
+            when CMP_1 =>
+                -- r.opsel_a = AIN_A
+                opsel_b <= BIN_R;  -- second adder input is R
+                opsel_binv <= '1';  -- inverted, with carry_in = 1: subtracts R from the A input
+                carry_in <= '1';
+                v.state := CMP_2;  -- CMP_2 examines the difference
+
+            when CMP_2 =>  -- turn the difference in R into a 4-bit compare result
+                if r.r(63) = '1' then
+                    -- A is smaller in magnitude
+                    v.cr_result := not r.a.negative & r.a.negative & "00";
+                elsif (r_hi_nz or r_lo_nz) = '0' then  -- difference is zero
+                    v.cr_result := "0010";
+                else  -- remaining case: non-zero difference with bit 63 clear
+                    v.cr_result := r.a.negative & not r.a.negative & "00";
+                end if;
+                v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;  -- same four bits copied into FPSCR; must follow the assignments above
+                v.instr_done := '1';
+                v.state := IDLE;
+
+            when MULT_1 =>  -- wait here until the multiplier reports a valid result
+                f_to_multiply.valid <= r.first;  -- multiplier request follows r.first
+                opsel_r <= RES_MULT;
+                if multiply_to_f.valid = '1' then
+                    v.state := FINISH;
+                end if;
+
+            when FMADD_1 =>
+                -- Addend is bigger here
+                v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
+                -- note v.shift is at most -2 here
+                v.shift := r.result_exp - r.b.exponent;
+                opsel_r <= RES_MULT;
+                opsel_s <= S_MULT;
+                set_s := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then  -- product available: continue through the shared add path
+                    v.longmask := '0';  -- ADD_SHIFT expects longmask = 0
+                    v.state := ADD_SHIFT;
+                end if;
+
+            when FMADD_2 =>
+                -- Product is potentially bigger here
+                -- r.shift = addend exp - product exp + 64, r.r = r.b.mantissa
+                set_s := '1';
+                opsel_s <= S_SHIFT;
+                v.shift := r.shift - to_signed(64, EXP_BITS);  -- remove the +64 offset for FMADD_3
+                v.state := FMADD_3;
+
+            when FMADD_3 =>
+                -- r.shift = addend exp - product exp
+                opsel_r <= RES_SHIFT;
+                v.first := '1';  -- r.first drives the multiplier request in FMADD_4
+                v.state := FMADD_4;
+
+            when FMADD_4 =>  -- multiply with the MULADD_RS addend selected; wait for the result
+                msel_add <= MULADD_RS;
+                f_to_multiply.valid <= r.first;
+                msel_inv <= r.is_subtract;
+                opsel_r <= RES_MULT;
+                opsel_s <= S_MULT;
+                set_s := '1';
+                v.shift := to_signed(56, EXP_BITS);  -- FMADD_6 expects r.shift = 56
+                if multiply_to_f.valid = '1' then
+                    if multiply_to_f.result(121) = '1' then  -- result bit 121 set: go via FMADD_5, which negates R:S:X
+                        v.state := FMADD_5;
+                    else
+                        v.state := FMADD_6;
+                    end if;
+                end if;
+
+            when FMADD_5 =>
+                -- negate R:S:X
+                v.result_sign := not r.result_sign;  -- flip the result sign to match the negation
+                opsel_ainv <= '1';
+                carry_in <= not (s_nz or r.x);  -- carry into R only when S and X are both zero
+                opsel_s <= S_NEG;
+                set_s := '1';
+                v.shift := to_signed(56, EXP_BITS);  -- FMADD_6 expects r.shift = 56
+                v.state := FMADD_6;
+
+            when FMADD_6 =>
+                -- r.shift = 56 (or 0, but only if r is now nonzero)
+                if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then  -- none of the tested R bits is set
+                    if s_nz = '0' then
+                        -- must be a subtraction, and r.x must be zero
+                        v.result_class := ZERO;
+                        v.result_sign := r.round_mode(1) and r.round_mode(0);
+                        arith_done := '1';
+                    else
+                        -- R is all zeroes but there are non-zero bits in S
+                        -- so shift them into R and set S to 0
+                        opsel_r <= RES_SHIFT;
+                        set_s := '1';
+                        -- stay in state FMADD_6
+                    end if;
+                elsif r.r(56 downto 54) = "001" then  -- bit 54 is the top set bit of this field: no normalization needed
+                    v.state := FINISH;
+                else
+                    renormalize := '1';
+                    v.state := NORMALIZE;
+                end if;
+
+            when LOOKUP =>
+                -- One wait cycle for the inverse_table[B] lookup
+                -- (r.opsel_a = AIN_B on entry), then continue in the
+                -- operation-specific state chosen by insn(4), insn(3)
+                -- and insn(2).
+                if r.insn(4) = '0' and r.insn(3) = '0' then
+                    v.state := DIV_2;
+                elsif r.insn(4) = '0' then
+                    v.state := SQRT_1;
+                elsif r.insn(2) = '0' then
+                    v.state := FRE_1;
+                else
+                    v.state := RSQRT_1;
+                end if;
+                v.first := '1';
+
+            when DIV_2 =>
+                -- compute Y = inverse_table[B] (when count=0); P = 2 - B * Y
+                msel_1 <= MUL1_B;
+                msel_add <= MULADD_CONST;
+                msel_inv <= '1';
+                if r.count = 0 then  -- first pass uses the lookup table value, later passes use P
+                    msel_2 <= MUL2_LUT;
+                else
+                    msel_2 <= MUL2_P;
+                end if;
+                set_y := r.first;
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.count := r.count + 1;  -- iteration counter, tested in DIV_3
+                    v.state := DIV_3;
+                end if;
+
+            when DIV_3 =>
+                -- multiply step: Y = P = P * Y
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                msel_2 <= MUL2_P;
+                msel_1 <= MUL1_Y;
+                if multiply_to_f.valid = '1' then
+                    -- loop back to DIV_2 unless r.count has reached 3
+                    v.state := DIV_2;
+                    if r.count = 3 then
+                        v.state := DIV_4;
+                    end if;
+                    v.first := '1';
+                end if;
+
+            when DIV_4 =>
+                -- compute R = P = A * Y (quotient)
+                msel_1 <= MUL1_A;
+                msel_2 <= MUL2_P;
+                set_y := r.first;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    opsel_r <= RES_MULT;  -- take the product into R only once it is valid
+                    v.first := '1';
+                    v.state := DIV_5;
+                end if;
+
+            when DIV_5 =>
+                -- compute P = A - B * R (remainder)
+                msel_1 <= MUL1_B;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then  -- remainder available: DIV_6 inspects it
+                    v.state := DIV_6;
+                end if;
+
+            when DIV_6 =>
+                -- test if remainder is 0 or >= B
+                if pcmpb_lt = '1' then
+                    -- quotient is correct, set X if remainder non-zero
+                    v.x := r.p(58) or px_nz;
+                else
+                    -- quotient needs to be incremented by 1
+                    carry_in <= '1';
+                    v.x := not pcmpb_eq;  -- X is set unless pcmpb_eq is asserted (presumably remainder = B -- confirm)
+                end if;
+                v.state := FINISH;
+
+            when FRE_1 =>  -- load R from the RES_MISC source selected by misc_sel "0111", then normalize
+                opsel_r <= RES_MISC;
+                misc_sel <= "0111";
+                v.shift := to_signed(1, EXP_BITS);
+                v.state := NORMALIZE;
+
+            when FTDIV_1 =>
+                v.cr_result(1) := exp_tiny or exp_huge;  -- CR result bit 1 reports an exponent flagged tiny or huge
+                if exp_tiny = '1' or exp_huge = '1' or r.a.class = ZERO or r.first = '0' then  -- finish on a flagged exponent, zero A, or when r.first is clear
+                    v.instr_done := '1';
+                    v.state := IDLE;
+                else  -- v.state is not assigned here, so the state is presumably re-entered next cycle -- confirm
+                    v.shift := r.a.exponent;
+                    v.doing_ftdiv := "10";
+                end if;
+
+            when RSQRT_1 =>  -- load R from the RES_MISC source selected by misc_sel "0111", then normalize
+                opsel_r <= RES_MISC;
+                misc_sel <= "0111";
+                sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1);  -- exponent of B arithmetically shifted right by one (halved)
+                v.result_exp := - sqrt_exp;  -- result exponent is the negated half exponent
+                v.shift := to_signed(1, EXP_BITS);
+                v.state := NORMALIZE;
+
+            when SQRT_1 =>
+                -- put invsqr[B] in R and compute P = invsqr[B] * B
+                -- also transfer B (in R) to A
+                set_a := '1';
+                opsel_r <= RES_MISC;
+                misc_sel <= "0111";
+                msel_1 <= MUL1_B;
+                msel_2 <= MUL2_LUT;
+                f_to_multiply.valid <= '1';
+                v.shift := to_signed(-1, EXP_BITS);  -- SQRT_2 expects r.shift = -1
+                v.count := "00";  -- reset the iteration counter tested in SQRT_7
+                v.state := SQRT_2;
+
+            when SQRT_2 =>
+                -- shift R right one place
+                -- not expecting multiplier result yet
+                -- r.shift = -1
+                opsel_r <= RES_SHIFT;
+                v.first := '1';  -- r.first gates set_y in SQRT_3
+                v.state := SQRT_3;
+
+            when SQRT_3 =>
+                -- put R into Y, wait for product from multiplier
+                msel_2 <= MUL2_R;
+                set_y := r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    -- put result into R
+                    opsel_r <= RES_MULT;
+                    v.first := '1';  -- r.first drives the multiplier request in SQRT_4
+                    v.state := SQRT_4;
+                end if;
+
+            when SQRT_4 =>
+                -- compute 1.5 - Y * P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                msel_add <= MULADD_CONST;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then  -- result available: continue with SQRT_5
+                    v.state := SQRT_5;
+                end if;
+
+            when SQRT_5 =>
+                -- compute Y = Y * P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= '1';  -- request issued unconditionally; this state lasts one cycle
+                v.first := '1';  -- r.first drives the next multiplier request in SQRT_6
+                v.state := SQRT_6;
+
+            when SQRT_6 =>
+                -- pipeline in R = R * P
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';  -- r.first gates set_y in SQRT_7
+                    v.state := SQRT_7;
+                end if;
+
+            when SQRT_7 =>
+                -- The first product is complete and goes into Y (on the
+                -- r.first cycle); the second product is expected to have
+                -- arrived already.
+                set_y := r.first;
+                msel_2 <= MUL2_P;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    -- second product goes into R; v.first is set on both exits
+                    v.first := '1';
+                    opsel_r <= RES_MULT;
+                    v.count := r.count + 1;
+                    -- iterate via SQRT_4 while r.count < 2, then move on to SQRT_8
+                    v.state := SQRT_8;
+                    if r.count < 2 then
+                        v.state := SQRT_4;
+                    end if;
+                end if;
+
+            when SQRT_8 =>
+                -- compute P = A - R * R, which can be +ve or -ve
+                -- we arranged for B to be put into A earlier
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';  -- r.first drives the multiplier request in SQRT_9
+                    v.state := SQRT_9;
+                end if;
+
+            when SQRT_9 =>
+                -- compute P = P * Y
+                -- since Y is an estimate of 1/sqrt(B), this makes P an
+                -- estimate of the adjustment needed to R.  Since the error
+                -- could be negative and we have an unsigned multiplier, the
+                -- upper bits can be wrong, but it turns out the lowest 8 bits
+                -- are correct and are all we need (given 3 iterations through
+                -- SQRT_4 to SQRT_7).
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then  -- adjustment available: SQRT_10 applies it to R
+                    v.state := SQRT_10;
+                end if;
+
+            when SQRT_10 =>
+                -- Add the bottom 8 bits of P, sign-extended,
+                -- divided by 4, onto R.
+                -- The division by 4 is because R is 10.54 format
+                -- whereas P is 8.56 format.
+                opsel_b <= BIN_PS6;
+                sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1);  -- exponent of B arithmetically shifted right by one (halved)
+                v.result_exp := sqrt_exp;
+                v.shift := to_signed(1, EXP_BITS);
+                v.first := '1';  -- r.first drives the multiplier request and set_b in SQRT_11
+                v.state := SQRT_11;
+
+            when SQRT_11 =>
+                -- compute P = A - R * R (remainder)
+                -- also put 2 * R + 1 into B for comparison with P
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                shiftin := '1';
+                set_b := r.first;  -- B is written only on the r.first cycle
+                if multiply_to_f.valid = '1' then  -- remainder available: SQRT_12 inspects it
+                    v.state := SQRT_12;
+                end if;
+
+            when SQRT_12 =>
+                -- test if remainder is 0 or >= B = 2*R + 1
+                if pcmpb_lt = '1' then
+                    -- square root is correct, set X if remainder non-zero
+                    v.x := r.p(58) or px_nz;
+                else
+                    -- square root needs to be incremented by 1
+                    carry_in <= '1';
+                    v.x := not pcmpb_eq;  -- X is set unless pcmpb_eq is asserted (presumably remainder = B -- confirm)
+                end if;
+                v.state := FINISH;
+
+            when INT_SHIFT =>
+                -- r.shift = b.exponent - 52
+                opsel_r <= RES_SHIFT;
+                set_x := '1';
+                v.state := INT_ROUND;
+                v.shift := to_signed(-2, EXP_BITS);  -- INT_ROUND expects r.shift = -2
+
+            when INT_ROUND =>
+                -- r.shift = -2
+                opsel_r <= RES_SHIFT;
+                round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign);
+                v.fpscr(FPSCR_FR downto FPSCR_FI) := round;  -- the two rounding result bits go to FPSCR FR and FI
+                -- Check for negative values that don't round to 0 for fcti*u*
+                if r.insn(8) = '1' and r.result_sign = '1' and
+                    (r_hi_nz or r_lo_nz or v.fpscr(FPSCR_FR)) = '1' then
+                    v.state := INT_OFLOW;  -- uses the FR value just written above
+                else
+                    v.state := INT_FINAL;
+                end if;
+
+            when INT_ISHIFT =>
+                -- r.shift = b.exponent - 54;
+                opsel_r <= RES_SHIFT;
+                v.state := INT_FINAL;  -- goes straight to INT_FINAL, skipping INT_ROUND
+
+            when INT_FINAL =>
+                -- Negate if necessary, and increment for rounding if needed
+                opsel_ainv <= r.result_sign;
+                carry_in <= r.fpscr(FPSCR_FR) xor r.result_sign;
+                -- Check for possible overflows
+                case r.insn(9 downto 8) is
+                    when "00" =>        -- fctiw[z]
+                        need_check := r.r(31) or (r.r(30) and not r.result_sign);
+                    when "01" =>        -- fctiwu[z]
+                        need_check := r.r(31);
+                    when "10" =>        -- fctid[z]
+                        need_check := r.r(63) or (r.r(62) and not r.result_sign);
+                    when others =>      -- fctidu[z]
+                        need_check := r.r(63);
+                end case;
+                if need_check = '1' then  -- top bits set: INT_CHECK makes the final overflow decision
+                    v.state := INT_CHECK;
+                else
+                    if r.fpscr(FPSCR_FI) = '1' then  -- inexact conversion also sets XX
+                        v.fpscr(FPSCR_XX) := '1';
+                    end if;
+                    arith_done := '1';
+                end if;
+
+            when INT_CHECK =>
+                -- msb is bit 31 of R when insn(9) = 0, otherwise bit 63
+                msb := r.r(63);
+                if r.insn(9) = '0' then
+                    msb := r.r(31);
+                end if;
+                misc_sel <= '1' & r.insn(9 downto 8) & r.result_sign;
+                arith_done := '1';
+                if (r.insn(8) = '0' and msb /= r.result_sign) or
+                    (r.insn(8) = '1' and msb /= '1') then
+                    -- overflow: take the RES_MISC value instead and flag VXCVI
+                    invalid := '1';
+                    v.fpscr(FPSCR_VXCVI) := '1';
+                    opsel_r <= RES_MISC;
+                elsif r.fpscr(FPSCR_FI) = '1' then
+                    -- no overflow, but the conversion was inexact
+                    v.fpscr(FPSCR_XX) := '1';
+                end if;
+
+            when INT_OFLOW =>  -- conversion overflow: result comes from RES_MISC and VXCVI is flagged
+                opsel_r <= RES_MISC;
+                misc_sel <= '1' & r.insn(9 downto 8) & r.result_sign;
+                if r.b.class = NAN then
+                    misc_sel(0) <= '1';  -- for a NaN operand, force the low select bit regardless of sign
+                end if;
+                v.fpscr(FPSCR_VXCVI) := '1';
+                invalid := '1';
+                arith_done := '1';
+
+            when FRI_1 =>
+                -- r.shift = b.exponent - 52
+                opsel_r <= RES_SHIFT;
+                set_x := '1';
+                v.shift := to_signed(-2, EXP_BITS);  -- same shift value the other paths set before ROUNDING
+                v.state := ROUNDING;
+
+            when FINISH =>  -- decide between normalizing and the rounding / underflow / overflow paths
+                if r.is_multiply = '1' and px_nz = '1' then  -- for a multiply, px_nz also forces X
+                    v.x := '1';
+                end if;
+                if r.r(63 downto 54) /= "0000000001" then  -- leading one is not at bit 54: normalize first
+                    renormalize := '1';
+                    v.state := NORMALIZE;
+                else
+                    set_x := '1';
+                    if exp_tiny = '1' then
+                        v.shift := new_exp - min_exp;  -- amount handed to ROUND_UFLOW
+                        v.state := ROUND_UFLOW;
+                    elsif exp_huge = '1' then
+                        v.state := ROUND_OFLOW;
+                    else
+                        v.shift := to_signed(-2, EXP_BITS);
+                        v.state := ROUNDING;
+                    end if;
+                end if;
+
+            when NORMALIZE =>
+                -- Shift so we have 9 leading zeroes (we know R is non-zero)
+                -- r.shift = clz(r.r) - 9
+                opsel_r <= RES_SHIFT;
+                set_x := '1';
+                if exp_tiny = '1' then
+                    v.shift := new_exp - min_exp;  -- amount handed to ROUND_UFLOW
+                    v.state := ROUND_UFLOW;
+                elsif exp_huge = '1' then
+                    v.state := ROUND_OFLOW;
+                else  -- exponent in range: round directly
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                end if;
+
+            when ROUND_UFLOW =>
+                -- r.shift = - amount by which exponent underflows
+                v.tiny := '1';
+                if r.fpscr(FPSCR_UE) = '0' then
+                    -- disabled underflow exception case
+                    -- have to denormalize before rounding
+                    opsel_r <= RES_SHIFT;
+                    set_x := '1';
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                else
+                    -- enabled underflow exception case
+                    -- if denormalized, have to normalize before rounding
+                    v.fpscr(FPSCR_UX) := '1';
+                    v.result_exp := r.result_exp + bias_exp;
+                    if r.r(54) = '0' then
+                        renormalize := '1';
+                        v.state := NORMALIZE;
+                    else
+                        v.shift := to_signed(-2, EXP_BITS);
+                        v.state := ROUNDING;
+                    end if;
+                end if;
+
+            when ROUND_OFLOW =>
+                v.fpscr(FPSCR_OX) := '1';
+                if r.fpscr(FPSCR_OE) = '0' then
+                    -- disabled overflow exception
+                    -- result depends on rounding mode
+                    v.fpscr(FPSCR_XX) := '1';
+                    v.fpscr(FPSCR_FI) := '1';
+                    if r.round_mode(1 downto 0) = "00" or
+                        (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then
+                        v.result_class := INFINITY;
+                        v.fpscr(FPSCR_FR) := '1';
+                    else
+                        v.fpscr(FPSCR_FR) := '0';
+                    end if;
+                    -- construct largest representable number
+                    v.result_exp := max_exp;
+                    opsel_r <= RES_MISC;
+                    misc_sel <= "001" & r.single_prec;
+                    arith_done := '1';
+                else
+                    -- enabled overflow exception
+                    v.result_exp := r.result_exp - bias_exp;
+                    v.shift := to_signed(-2, EXP_BITS);
+                    v.state := ROUNDING;
+                end if;
+
+            when ROUNDING =>
+                opsel_mask <= '1';
+                round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign);
+                v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
+                if round(1) = '1' then
+                    -- set mask to increment the LSB for the precision
+                    opsel_b <= BIN_MASK;
+                    carry_in <= '1';
+                    v.shift := to_signed(-1, EXP_BITS);
+                    v.state := ROUNDING_2;
+                else
+                    if r.r(54) = '0' then
+                        -- result after masking could be zero, or could be a
+                        -- denormalized result that needs to be renormalized
+                        renormalize := '1';
+                        v.state := ROUNDING_3;
+                    else
+                        arith_done := '1';
+                    end if;
+                end if;
+                if round(0) = '1' then
+                    v.fpscr(FPSCR_XX) := '1';
+                    if r.tiny = '1' then
+                        v.fpscr(FPSCR_UX) := '1';
+                    end if;
+                end if;
+
+            when ROUNDING_2 =>
+                -- Check for overflow during rounding
+                -- r.shift = -1
+                v.x := '0';
+                if r.r(55) = '1' then
+                    opsel_r <= RES_SHIFT;
+                    if exp_huge = '1' then
+                        v.state := ROUND_OFLOW;
+                    else
+                        arith_done := '1';
+                    end if;
+                elsif r.r(54) = '0' then
+                    -- Do CLZ so we can renormalize the result
+                    renormalize := '1';
+                    v.state := ROUNDING_3;
+                else
+                    arith_done := '1';
+                end if;
+
+            when ROUNDING_3 =>
+                -- r.shift = clz(r.r) - 9
+                mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec);
+                if mant_nz = '0' then
+                    v.result_class := ZERO;
+                    if r.is_subtract = '1' then
+                        -- set result sign depending on rounding mode
+                        v.result_sign := r.round_mode(1) and r.round_mode(0);
+                    end if;
+                    arith_done := '1';
+                else
+                    -- Renormalize result after rounding
+                    opsel_r <= RES_SHIFT;
+                    v.denorm := exp_tiny;
+                    v.shift := new_exp - to_signed(-1022, EXP_BITS);
+                    if new_exp < to_signed(-1022, EXP_BITS) then
+                        v.state := DENORM;
+                    else
+                        arith_done := '1';
+                    end if;
+                end if;
+
+            when DENORM =>
+                -- r.shift = result_exp - -1022
+                opsel_r <= RES_SHIFT;
+                arith_done := '1';
+
+            when NAN_RESULT =>
+                if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(53) = '0') or
+                    (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(53) = '0') or
+                    (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(53) = '0') then
+                    -- Signalling NAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                if r.use_a = '1' and r.a.class = NAN then
+                    v.opsel_a := AIN_A;
+                elsif r.use_b = '1' and r.b.class = NAN then
+                    v.opsel_a := AIN_B;
+                elsif r.use_c = '1' and r.c.class = NAN then
+                    v.opsel_a := AIN_C;
+                end if;
+                v.state := EXC_RESULT;
+
+            when EXC_RESULT =>
+                -- r.opsel_a = AIN_A, AIN_B or AIN_C according to which input is the result
+                case r.opsel_a is
+                    when AIN_B =>
+                        v.result_sign := r.b.negative xor r.negate;
+                        v.result_exp := r.b.exponent;
+                        v.result_class := r.b.class;
+                    when AIN_C =>
+                        v.result_sign := r.c.negative xor r.negate;
+                        v.result_exp := r.c.exponent;
+                        v.result_class := r.c.class;
+                    when others =>
+                        v.result_sign := r.a.negative xor r.negate;
+                        v.result_exp := r.a.exponent;
+                        v.result_class := r.a.class;
+                end case;
+                arith_done := '1';
+
+        end case;
+
+        if zero_divide = '1' then
+            v.fpscr(FPSCR_ZX) := '1';
+        end if;
+        if qnan_result = '1' then
+            invalid := '1';
+            v.result_class := NAN;
+            v.result_sign := '0';
+            misc_sel <= "0001";
+            opsel_r <= RES_MISC;
+            arith_done := '1';
+        end if;
+        if invalid = '1' then
+            v.invalid := '1';
+        end if;
+        if arith_done = '1' then
+            -- Enabled invalid exception doesn't write result or FPRF
+            -- Neither does enabled zero-divide exception
+            if (v.invalid and r.fpscr(FPSCR_VE)) = '0' and
+                (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then
+                v.writing_back := '1';
+                v.update_fprf := '1';
+            end if;
+            v.instr_done := '1';
+            v.state := IDLE;
+            update_fx := '1';
+        end if;
+
+        -- Multiplier and divide/square root data path
+        case msel_1 is
+            when MUL1_A =>
+                f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00";
+            when MUL1_B =>
+                f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00";
+            when MUL1_Y =>
+                f_to_multiply.data1 <= r.y;
+            when others =>
+                f_to_multiply.data1 <= r.r(61 downto 0) & "00";
+        end case;
+        case msel_2 is
+            when MUL2_C =>
+                f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00";
+            when MUL2_LUT =>
+                f_to_multiply.data2 <= x"00" & inverse_est & '0' & x"000000000";
+            when MUL2_P =>
+                f_to_multiply.data2 <= r.p;
+            when others =>
+                f_to_multiply.data2 <= r.r(61 downto 0) & "00";
+        end case;
+        maddend := (others => '0');
+        case msel_add is
+            when MULADD_CONST =>
+                -- addend is 2.0 or 1.5 in 16.112 format
+                if r.is_sqrt = '0' then
+                    maddend(113) := '1';                -- 2.0
+                else
+                    maddend(112 downto 111) := "11";    -- 1.5
+                end if;
+            when MULADD_A =>
+                -- addend is A in 16.112 format
+                maddend(121 downto 58) := r.a.mantissa;
+            when MULADD_RS =>
+                -- addend is concatenation of R and S in 16.112 format
+                maddend := "000000" & r.r & r.s & "00";
+            when others =>
+        end case;
+        if msel_inv = '1' then
+            f_to_multiply.addend <= not maddend;
+        else
+            f_to_multiply.addend <= maddend;
+        end if;
+        f_to_multiply.not_result <= msel_inv;
+        if set_y = '1' then
+            v.y := f_to_multiply.data2;
+        end if;
+        if multiply_to_f.valid = '1' then
+            if pshift = '0' then
+                v.p := multiply_to_f.result(63 downto 0);
+            else
+                v.p := multiply_to_f.result(119 downto 56);
+            end if;
+        end if;
+
+        -- Data path.
+        -- This has A and B input multiplexers, an adder, a shifter,
+        -- count-leading-zeroes logic, and a result mux.
+        if r.longmask = '1' then
+            mshift := r.shift + to_signed(-29, EXP_BITS);
+        else
+            mshift := r.shift;
+        end if;
+        if mshift < to_signed(-64, EXP_BITS) then
+            mask := (others => '1');
+        elsif mshift >= to_signed(0, EXP_BITS) then
+            mask := (others => '0');
+        else
+            mask := right_mask(unsigned(mshift(5 downto 0)));
+        end if;
+        case r.opsel_a is
+            when AIN_R =>
+                in_a0 := r.r;
+            when AIN_A =>
+                in_a0 := r.a.mantissa;
+            when AIN_B =>
+                in_a0 := r.b.mantissa;
+            when others =>
+                in_a0 := r.c.mantissa;
+        end case;
+        if (or (mask and in_a0)) = '1' and set_x = '1' then
+            v.x := '1';
+        end if;
+        if opsel_ainv = '1' then
+            in_a0 := not in_a0;
+        end if;
+        in_a <= in_a0;
+        case opsel_b is
+            when BIN_ZERO =>
+                in_b0 := (others => '0');
+            when BIN_R =>
+                in_b0 := r.r;
+            when BIN_MASK =>
+                in_b0 := mask;
+            when others =>
+                -- BIN_PS6, 6 LSBs of P/4 sign-extended to 64
+                in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 2)), 64));
+        end case;
+        if opsel_binv = '1' then
+            in_b0 := not in_b0;
+        end if;
+        in_b <= in_b0;
+        if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
+            shift_res := shifter_64(r.r & (shiftin or r.s(55)) & r.s(54 downto 0),
+                                    std_ulogic_vector(r.shift(6 downto 0)));
+        else
+            shift_res := (others => '0');
+        end if;
+        sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
+        if opsel_mask = '1' then
+            sum := sum and not mask;
+        end if;
+        case opsel_r is
+            when RES_SUM =>
+                result <= sum;
+            when RES_SHIFT =>
+                result <= shift_res;
+            when RES_MULT =>
+                result <= multiply_to_f.result(121 downto 58);
+            when others =>
+                case misc_sel is
+                    when "0000" =>
+                        misc := x"00000000" & (r.fpscr and fpscr_mask);
+                    when "0001" =>
+                        -- generated QNaN mantissa
+                        misc := x"0020000000000000";
+                    when "0010" =>
+                        -- mantissa of max representable DP number
+                        misc := x"007ffffffffffffc";
+                    when "0011" =>
+                        -- mantissa of max representable SP number
+                        misc := x"007fffff80000000";
+                    when "0100" =>
+                        -- fmrgow result
+                        misc := r.a.mantissa(31 downto 0) & r.b.mantissa(31 downto 0);
+                    when "0110" =>
+                        -- fmrgew result
+                        misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32);
+                    when "0111" =>
+                        misc := 10x"000" & inverse_est & 35x"000000000";
+                    when "1000" =>
+                        -- max positive result for fctiw[z]
+                        misc := x"000000007fffffff";
+                    when "1001" =>
+                        -- max negative result for fctiw[z]
+                        misc := x"ffffffff80000000";
+                    when "1010" =>
+                        -- max positive result for fctiwu[z]
+                        misc := x"00000000ffffffff";
+                    when "1011" =>
+                        -- max negative result for fctiwu[z]
+                        misc := x"0000000000000000";
+                    when "1100" =>
+                        -- max positive result for fctid[z]
+                        misc := x"7fffffffffffffff";
+                    when "1101" =>
+                        -- max negative result for fctid[z]
+                        misc := x"8000000000000000";
+                    when "1110" =>
+                        -- max positive result for fctidu[z]
+                        misc := x"ffffffffffffffff";
+                    when "1111" =>
+                        -- max negative result for fctidu[z]
+                        misc := x"0000000000000000";
+                    when others =>
+                        misc := x"0000000000000000";
+                end case;
+                result <= misc;
+        end case;
+        v.r := result;
+        if set_s = '1' then
+            case opsel_s is
+                when S_NEG =>
+                    v.s := std_ulogic_vector(unsigned(not r.s) + (not r.x));
+                when S_MULT =>
+                    v.s := multiply_to_f.result(57 downto 2);
+                when S_SHIFT =>
+                    v.s := shift_res(63 downto 8);
+                    if shift_res(7 downto 0) /= x"00" then
+                        v.x := '1';
+                    end if;
+                when others =>
+                    v.s := (others => '0');
+            end case;
+        end if;
+
+        if set_a = '1' then
+            v.a.exponent := new_exp;
+            v.a.mantissa := shift_res;
+        end if;
+        if set_b = '1' then
+            v.b.exponent := new_exp;
+            v.b.mantissa := shift_res;
+        end if;
+        if set_c = '1' then
+            v.c.exponent := new_exp;
+            v.c.mantissa := shift_res;
+        end if;
+
+        if opsel_r = RES_SHIFT then
+            v.result_exp := new_exp;
+        end if;
+
+        if renormalize = '1' then
+            clz := count_left_zeroes(r.r);
+            if renorm_sqrt = '1' then
+                -- make denormalized value end up with even exponent
+                clz(0) := '1';
+            end if;
+            v.shift := resize(signed('0' & clz) - 9, EXP_BITS);
+        end if;
+
+        if r.int_result = '1' then
+            fp_result <= r.r;
+        else
+            fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r,
+                                 r.single_prec, r.quieten_nan);
+        end if;
+        if r.update_fprf = '1' then
+            v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class,
+                                                             r.r(54) and not r.denorm);
+        end if;
+
+        v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or
+                             (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI)));
+        v.fpscr(FPSCR_FEX) := or (v.fpscr(FPSCR_VX downto FPSCR_XX) and
+                                  v.fpscr(FPSCR_VE downto FPSCR_XE));
+        if update_fx = '1' and
+            (v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then
+            v.fpscr(FPSCR_FX) := '1';
+        end if;
+        if r.rc = '1' then
+            v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
+        end if;
+
+        if illegal = '1' then
+            v.instr_done := '0';
+            v.do_intr := '0';
+            v.writing_back := '0';
+            v.busy := '0';
+            v.state := IDLE;
+        else
+            v.do_intr := v.instr_done and v.fpscr(FPSCR_FEX) and r.fe_mode;
+            if v.state /= IDLE or v.do_intr = '1' then
+                v.busy := '1';
+            end if;
+        end if;
+
+        rin <= v;
+        e_out.illegal <= illegal;
+    end process;
+
+end architecture behaviour;
diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl
index 0fa66c5..fec03c7 100644
--- a/gpr_hazard.vhdl
+++ b/gpr_hazard.vhdl
@@ -2,6 +2,9 @@ library ieee;
 use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 
+library work;
+use work.common.all;
+
 entity gpr_hazard is
     generic (
         PIPELINE_DEPTH : natural := 1
@@ -15,13 +18,13 @@ entity gpr_hazard is
         issuing            : in std_ulogic;
 
         gpr_write_valid_in : in std_ulogic;
-        gpr_write_in       : in std_ulogic_vector(5 downto 0);
+        gpr_write_in       : in gspr_index_t;
         bypass_avail       : in std_ulogic;
         gpr_read_valid_in  : in std_ulogic;
-        gpr_read_in        : in std_ulogic_vector(5 downto 0);
+        gpr_read_in        : in gspr_index_t;
 
         ugpr_write_valid   : in std_ulogic;
-        ugpr_write_reg     : in std_ulogic_vector(5 downto 0);
+        ugpr_write_reg     : in gspr_index_t;
 
         stall_out          : out std_ulogic;
         use_bypass         : out std_ulogic
@@ -31,9 +34,9 @@ architecture behaviour of gpr_hazard is
     type pipeline_entry_type is record
         valid  : std_ulogic;
         bypass : std_ulogic;
-        gpr    : std_ulogic_vector(5 downto 0);
+        gpr    : gspr_index_t;
         ugpr_valid : std_ulogic;
-        ugpr   : std_ulogic_vector(5 downto 0);
+        ugpr   : gspr_index_t;
     end record;
     constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'),
                                                            ugpr_valid => '0', ugpr => (others => '0'));
diff --git a/helpers.vhdl b/helpers.vhdl
index fe91938..834e386 100644
--- a/helpers.vhdl
+++ b/helpers.vhdl
@@ -25,6 +25,10 @@ package helpers is
     function byte_reverse(val: std_ulogic_vector(63 downto 0); size: integer) return std_ulogic_vector;
 
     function sign_extend(val: std_ulogic_vector(63 downto 0); size: natural) return std_ulogic_vector;
+
+    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector;
+    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
+    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector;
 end package helpers;
 
 package body helpers is
@@ -206,4 +210,53 @@ package body helpers is
         return std_ulogic_vector(ret);
 
     end;
+
+    -- Reverse the order of bits in a word; a may have any index range or direction
+    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(a'length - 1 downto 0);  -- result is always (N-1 downto 0)
+    begin
+        for i in a'range loop
+            ret(abs(i - a'left)) := a(i);  -- k-th element from the left lands k-th from the right
+        end loop;
+        return ret;
+    end;
+
+    -- If exactly one bit is set in a doubleword, return the number of that
+    -- bit (counting from the right, i.e. from a(0)).  Bit i of the result
+    -- is the OR of every input bit whose own index has bit i set, which is
+    -- 32 input bits per result bit:
+    --  bit 0 = a[1] or a[3] or a[5] or ...
+    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
+    --  bit 2 = a[4..7] or a[12..15] or ...
+    --  bit 5 = a[32..63] ORed together
+    -- With more than one input bit set the result is the OR of their indices.
+    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(5 downto 0);
+        variable acc: std_ulogic;
+    begin
+        for i in ret'range loop
+            acc := '0';
+            for k in a'range loop
+                -- a(k) contributes to result bit i when bit i of k is set
+                if (k / 2**i) mod 2 = 1 then
+                    acc := acc or a(k);
+                end if;
+            end loop;
+            ret(i) := acc;
+        end loop;
+        return ret;
+    end;
+
+    -- Count leading zeroes operation.
+    -- The input is bit-reversed, its lowest set bit is isolated with
+    -- (x and -x), and the position of that one-hot bit is the answer.
+    -- Assumes the value passed in is not zero (if it is, zero is returned)
+    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is
+        variable flipped : std_ulogic_vector(val'left downto val'right);
+        variable lowest1 : std_ulogic_vector(val'left downto val'right);
+    begin
+        flipped := bit_reverse(val);
+        lowest1 := flipped and std_ulogic_vector(- signed(flipped));
+        return bit_number(std_ulogic_vector(resize(unsigned(lowest1), 64)));
+    end;
 end package body helpers;
diff --git a/insn_helpers.vhdl b/insn_helpers.vhdl
index 592acb0..519aa76 100644
--- a/insn_helpers.vhdl
+++ b/insn_helpers.vhdl
@@ -37,6 +37,11 @@ package insn_helpers is
     function insn_sh (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_me (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_mb (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_frt (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_fra (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_frb (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_frc (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_u (insn_in : std_ulogic_vector) return std_ulogic_vector;
 end package insn_helpers;
 
 package body insn_helpers is
@@ -214,4 +219,29 @@ package body insn_helpers is
     begin
         return insn_in(5) & insn_in(10 downto 6);
     end;
+
+    function insn_frt(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(25 downto 21);   -- 5-bit field, instruction bits 25..21
+    end function insn_frt;
+
+    function insn_fra(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(20 downto 16);   -- 5-bit field, instruction bits 20..16
+    end function insn_fra;
+
+    function insn_frb(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(15 downto 11);   -- 5-bit field, instruction bits 15..11
+    end function insn_frb;
+
+    function insn_frc(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(10 downto 6);    -- 5-bit field, instruction bits 10..6
+    end function insn_frc;
+
+    function insn_u(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(15 downto 12);   -- 4-bit field, instruction bits 15..12
+    end function insn_u;
 end package body insn_helpers;
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index e36025c..919ba0e 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -5,12 +5,15 @@ use ieee.numeric_std.all;
 library work;
 use work.decode_types.all;
 use work.common.all;
+use work.insn_helpers.all;
+use work.helpers.all;
 
 -- 2 cycle LSU
 -- We calculate the address in the first cycle
 
 entity loadstore1 is
     generic (
+        HAS_FPU : boolean := true;
         -- Non-zero to enable log data collection
         LOG_LENGTH : natural := 0
         );
@@ -42,10 +45,12 @@ architecture behave of loadstore1 is
 
     -- State machine for unaligned loads/stores
     type state_t is (IDLE,              -- ready for instruction
+                     FPR_CONV,          -- converting double to float for store
                      SECOND_REQ,        -- send 2nd request of unaligned xfer
                      ACK_WAIT,          -- waiting for ack from dcache
                      MMU_LOOKUP,        -- waiting for MMU to look up translation
                      TLBIE_WAIT,        -- waiting for MMU to finish doing a tlbie
+                     FINISH_LFS,        -- write back converted SP data for lfs*
                      COMPLETE           -- extra cycle to complete an operation
                      );
 
@@ -58,7 +63,7 @@ architecture behave of loadstore1 is
 	addr         : std_ulogic_vector(63 downto 0);
 	store_data   : std_ulogic_vector(63 downto 0);
 	load_data    : std_ulogic_vector(63 downto 0);
-	write_reg    : gpr_index_t;
+	write_reg    : gspr_index_t;
 	length       : std_ulogic_vector(3 downto 0);
 	byte_reverse : std_ulogic;
 	sign_extend  : std_ulogic;
@@ -86,6 +91,11 @@ architecture behave of loadstore1 is
         do_update    : std_ulogic;
         extra_cycle  : std_ulogic;
         mode_32bit   : std_ulogic;
+        load_sp      : std_ulogic;
+        ld_sp_data   : std_ulogic_vector(31 downto 0);
+        ld_sp_nz     : std_ulogic;
+        ld_sp_lz     : std_ulogic_vector(5 downto 0);
+        st_sp_data   : std_ulogic_vector(31 downto 0);
     end record;
 
     type byte_sel_t is array(0 to 7) of std_ulogic;
@@ -95,6 +105,9 @@ architecture behave of loadstore1 is
     signal r, rin : reg_stage_t;
     signal lsu_sum : std_ulogic_vector(63 downto 0);
 
+    signal store_sp_data : std_ulogic_vector(31 downto 0);
+    signal load_dp_data  : std_ulogic_vector(63 downto 0);
+
     -- Generate byte enables from sizes
     function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
     begin
@@ -125,6 +138,72 @@ architecture behave of loadstore1 is
 					    to_integer(unsigned(address))));
     end function xfer_data_sel;
 
+    -- 23-bit logical right shifter for DP -> SP float conversions.
+    -- The shift is applied in two stages: shift(1 downto 0) selects 0 to 3
+    -- places, then shift(4 downto 2) selects a further 0, 4, 8, 12, 16 or 20
+    -- places (codes above "100" all give 20).  Vacated MSBs become zero.
+    function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+        return std_ulogic_vector is
+        variable src    : unsigned(22 downto 0);
+        variable fine   : unsigned(22 downto 0);
+        variable coarse : unsigned(22 downto 0);
+    begin
+        src := unsigned(frac);
+
+        -- fine stage
+        case shift(1 downto 0) is
+            when "00"   => fine := src;
+            when "01"   => fine := shift_right(src, 1);
+            when "10"   => fine := shift_right(src, 2);
+            when others => fine := shift_right(src, 3);
+        end case;
+
+        -- coarse stage
+        case shift(4 downto 2) is
+            when "000"  => coarse := fine;
+            when "001"  => coarse := shift_right(fine, 4);
+            when "010"  => coarse := shift_right(fine, 8);
+            when "011"  => coarse := shift_right(fine, 12);
+            when "100"  => coarse := shift_right(fine, 16);
+            when others => coarse := shift_right(fine, 20);
+        end case;
+
+        return std_ulogic_vector(coarse);
+    end;
+
+    -- 23-bit logical left shifter for SP -> DP float conversions.
+    -- The shift is applied in two stages: shift(1 downto 0) selects 0 to 3
+    -- places, then shift(4 downto 2) selects a further 0, 4, 8, 12, 16 or 20
+    -- places (codes above "100" all give 20).  Vacated LSBs become zero.
+    function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+        return std_ulogic_vector is
+        variable src    : unsigned(22 downto 0);
+        variable fine   : unsigned(22 downto 0);
+        variable coarse : unsigned(22 downto 0);
+    begin
+        src := unsigned(frac);
+
+        -- fine stage
+        case shift(1 downto 0) is
+            when "00"   => fine := src;
+            when "01"   => fine := shift_left(src, 1);
+            when "10"   => fine := shift_left(src, 2);
+            when others => fine := shift_left(src, 3);
+        end case;
+
+        -- coarse stage
+        case shift(4 downto 2) is
+            when "000"  => coarse := fine;
+            when "001"  => coarse := shift_left(fine, 4);
+            when "010"  => coarse := shift_left(fine, 8);
+            when "011"  => coarse := shift_left(fine, 12);
+            when "100"  => coarse := shift_left(fine, 16);
+            when others => coarse := shift_left(fine, 20);
+        end case;
+
+        return std_ulogic_vector(coarse);
+    end;
+
 begin
     -- Calculate the address in the first cycle
     lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
@@ -142,6 +221,59 @@ begin
         end if;
     end process;
 
+    ls_fp_conv: if HAS_FPU generate
+        -- stfs data path: convert the DP value in l_in.data to SP format
+        dp_to_sp: process(all)
+            variable dp_exp : unsigned(10 downto 0);
+            variable sp     : std_ulogic_vector(31 downto 0);
+        begin
+            dp_exp := unsigned(l_in.data(62 downto 52));
+            -- default: signed zero, which is what is left when dp_exp < 874
+            sp := (others => '0');
+            sp(31) := l_in.data(63);
+            if dp_exp > 896 then
+                -- keep the exponent MSB, drop exponent bits 61..59
+                sp(30 downto 0) := l_in.data(62) & l_in.data(58 downto 29);
+            elsif dp_exp >= 874 then
+                -- denormalization required: shift right with the leading 1 made explicit
+                sp(22 downto 0) := shifter_23r('1' & l_in.data(51 downto 30),
+                                               0 - dp_exp(4 downto 0));
+            end if;
+            store_sp_data <= sp;
+        end process;
+
+        -- lfs data path: expand the SP value held in r.ld_sp_data to DP format
+        sp_to_dp: process(all)
+            variable sp_exp : std_ulogic_vector(7 downto 0);
+            variable dp_exp : unsigned(10 downto 0);
+            variable lshift : unsigned(4 downto 0);
+            variable dp     : std_ulogic_vector(63 downto 0);
+        begin
+            sp_exp := r.ld_sp_data(30 downto 23);
+            lshift := to_unsigned(0, 5);
+            if (and sp_exp) = '1' then
+                -- infinity or NaN
+                dp_exp := (others => '1');
+            elsif (or sp_exp) = '1' then
+                -- finite normalized value: rebias the exponent
+                dp_exp := resize(unsigned(sp_exp), 11) + 896;
+            elsif r.ld_sp_nz = '0' then
+                -- zero
+                dp_exp := (others => '0');
+            else
+                -- denormalized SP operand: normalize it using the leading-zero
+                -- count of the fraction held in r.ld_sp_lz
+                dp_exp := 896 - resize(unsigned(r.ld_sp_lz), 11);
+                lshift := unsigned(r.ld_sp_lz(4 downto 0)) + 1;
+            end if;
+            dp := (others => '0');
+            dp(63) := r.ld_sp_data(31);
+            dp(62 downto 52) := std_ulogic_vector(dp_exp);
+            dp(51 downto 29) := shifter_23l(r.ld_sp_data(22 downto 0), lshift);
+            load_dp_data <= dp;
+        end process;
+    end generate;
+
     loadstore1_1: process(all)
         variable v : reg_stage_t;
         variable brev_lenm1 : unsigned(2 downto 0);
@@ -162,6 +294,9 @@ begin
         variable data_permuted : std_ulogic_vector(63 downto 0);
         variable data_trimmed : std_ulogic_vector(63 downto 0);
         variable store_data : std_ulogic_vector(63 downto 0);
+        variable data_in : std_ulogic_vector(63 downto 0);
+        variable byte_rev : std_ulogic;
+        variable length : std_ulogic_vector(3 downto 0);
         variable use_second : byte_sel_t;
         variable trim_ctl : trim_ctl_t;
         variable negative : std_ulogic;
@@ -173,6 +308,8 @@ begin
         variable mmu_mtspr : std_ulogic;
         variable itlb_fault : std_ulogic;
         variable misaligned : std_ulogic;
+        variable fp_reg_conv : std_ulogic;
+        variable lfs_done : std_ulogic;
     begin
         v := r;
         req := '0';
@@ -182,8 +319,10 @@ begin
         sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
         dsisr := (others => '0');
         mmureq := '0';
+        fp_reg_conv := '0';
 
         write_enable := '0';
+        lfs_done := '0';
 
         do_update := r.do_update;
         v.do_update := '0';
@@ -242,19 +381,38 @@ begin
             end case;
         end loop;
 
-        -- Byte reversing and rotating for stores
-        -- Done in the first cycle (when l_in.valid = 1)
+        if HAS_FPU then
+            -- Single-precision FP conversion
+            v.st_sp_data := store_sp_data;
+            v.ld_sp_data := data_trimmed(31 downto 0);
+            v.ld_sp_nz := or (data_trimmed(22 downto 0));
+            v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
+        end if;
+
+        -- Byte reversing and rotating for stores.
+        -- Done in the first cycle (when l_in.valid = 1) for integer stores
+        -- and DP float stores, and in the second cycle for SP float stores.
         store_data := r.store_data;
-        if l_in.valid = '1' then
-            byte_offset := unsigned(lsu_sum(2 downto 0));
+        if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then
+            if HAS_FPU and r.state = FPR_CONV then
+                data_in := x"00000000" & r.st_sp_data;
+                byte_offset := unsigned(r.addr(2 downto 0));
+                byte_rev := r.byte_reverse;
+                length := r.length;
+            else
+                data_in := l_in.data;
+                byte_offset := unsigned(lsu_sum(2 downto 0));
+                byte_rev := l_in.byte_reverse;
+                length := l_in.length;
+            end if;
             brev_lenm1 := "000";
-            if l_in.byte_reverse = '1' then
-                brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
+            if byte_rev = '1' then
+                brev_lenm1 := unsigned(length(2 downto 0)) - 1;
             end if;
             for i in 0 to 7 loop
                 k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1;
                 j := to_integer(k) * 8;
-                store_data(i * 8 + 7 downto i * 8) := l_in.data(j + 7 downto j);
+                store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j);
             end loop;
         end if;
         v.store_data := store_data;
@@ -289,6 +447,14 @@ begin
         case r.state is
         when IDLE =>
 
+        when FPR_CONV =>
+            req := '1';
+            if r.second_bytes /= "00000000" then
+                v.state := SECOND_REQ;
+            else
+                v.state := ACK_WAIT;
+            end if;
+
         when SECOND_REQ =>
             req := '1';
             v.state := ACK_WAIT;
@@ -320,8 +486,13 @@ begin
                         v.load_data := data_permuted;
                     end if;
                 else
-                    write_enable := r.load;
-                    if r.extra_cycle = '1' then
+                    write_enable := r.load and not r.load_sp;
+                    if HAS_FPU and r.load_sp = '1' then
+                        -- SP to DP conversion takes a cycle
+                        -- Write back rA update in this cycle if needed
+                        do_update := r.update;
+                        v.state := FINISH_LFS;
+                    elsif r.extra_cycle = '1' then
                         -- loads with rA update need an extra cycle
                         v.state := COMPLETE;
                         v.do_update := r.update;
@@ -359,6 +530,9 @@ begin
 
         when TLBIE_WAIT =>
 
+        when FINISH_LFS =>
+            lfs_done := '1';
+
         when COMPLETE =>
             exception := r.align_intr;
 
@@ -392,6 +566,7 @@ begin
             v.nc := l_in.ci;
             v.virt_mode := l_in.virt_mode;
             v.priv_mode := l_in.priv_mode;
+            v.load_sp := '0';
             v.wait_dcache := '0';
             v.wait_mmu := '0';
             v.do_update := '0';
@@ -431,6 +606,27 @@ begin
                     v.align_intr := v.nc;
                     req := '1';
                     v.dcbz := '1';
+                when OP_FPSTORE =>
+                    if HAS_FPU then
+                        if l_in.is_32bit = '1' then
+                            v.state := FPR_CONV;
+                            fp_reg_conv := '1';
+                        else
+                            req := '1';
+                        end if;
+                    end if;
+                when OP_FPLOAD =>
+                    if HAS_FPU then
+                        v.load := '1';
+                        req := '1';
+                        -- Allow an extra cycle for SP->DP precision conversion
+                        -- or RA update
+                        v.extra_cycle := l_in.update;
+                        if l_in.is_32bit = '1' then
+                            v.load_sp := '1';
+                            v.extra_cycle := '1';
+                        end if;
+                    end if;
                 when OP_TLBIE =>
                     mmureq := '1';
                     v.tlbie := '1';
@@ -486,7 +682,7 @@ begin
                 end if;
             end if;
 
-            v.busy := req or mmureq or mmu_mtspr;
+            v.busy := req or mmureq or mmu_mtspr or fp_reg_conv;
         end if;
 
         -- Update outputs to dcache
@@ -523,8 +719,12 @@ begin
             l_out.write_data <= r.sprval;
         elsif do_update = '1' then
             l_out.write_enable <= '1';
-            l_out.write_reg <= r.update_reg;
+            l_out.write_reg <= gpr_to_gspr(r.update_reg);
             l_out.write_data <= r.addr;
+        elsif lfs_done = '1' then
+            l_out.write_enable <= '1';
+            l_out.write_reg <= r.write_reg;
+            l_out.write_data <= load_dp_data;
         else
             l_out.write_enable <= write_enable;
             l_out.write_reg <= r.write_reg;
diff --git a/microwatt.core b/microwatt.core
index cd24a06..7f2068d 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -23,6 +23,7 @@ filesets:
       - cr_hazard.vhdl
       - control.vhdl
       - execute1.vhdl
+      - fpu.vhdl
       - loadstore1.vhdl
       - mmu.vhdl
       - dcache.vhdl
@@ -132,6 +133,7 @@ targets:
       - disable_flatten_core
       - log_length=2048
       - uart_is_16550
+      - has_fpu
     tools:
       vivado: {part : xc7a100tcsg324-1}
     toplevel : toplevel
@@ -215,6 +217,7 @@ targets:
       - spi_flash_offset=10485760
       - log_length=2048
       - uart_is_16550
+      - has_fpu
     tools:
       vivado: {part : xc7a200tsbg484-1}
     toplevel : toplevel
@@ -231,6 +234,7 @@ targets:
       - spi_flash_offset=10485760
       - log_length=2048
       - uart_is_16550
+      - has_fpu
     generate: [litedram_nexys_video]
     tools:
       vivado: {part : xc7a200tsbg484-1}
@@ -249,6 +253,7 @@ targets:
       - log_length=512
       - uart_is_16550
       - has_uart1
+      - has_fpu=false
     tools:
       vivado: {part : xc7a35ticsg324-1L}
     toplevel : toplevel
@@ -267,6 +272,7 @@ targets:
       - log_length=512
       - uart_is_16550
       - has_uart1
+      - has_fpu=false
     generate: [litedram_arty, liteeth_arty]
     tools:
       vivado: {part : xc7a35ticsg324-1L}
@@ -285,6 +291,7 @@ targets:
       - log_length=2048
       - uart_is_16550
       - has_uart1
+      - has_fpu
     tools:
       vivado: {part : xc7a100ticsg324-1L}
     toplevel : toplevel
@@ -303,6 +310,7 @@ targets:
       - log_length=2048
       - uart_is_16550
       - has_uart1
+      - has_fpu
     generate: [litedram_arty, liteeth_arty]
     tools:
       vivado: {part : xc7a100ticsg324-1L}
@@ -320,6 +328,7 @@ targets:
       - disable_flatten_core
       - log_length=512
       - uart_is_16550
+      - has_fpu=false
     tools:
       vivado: {part : xc7a35tcpg236-1}
     toplevel : toplevel
@@ -380,6 +389,12 @@ parameters:
     paramtype   : generic
     default     : 100000000
 
+  has_fpu:
+    datatype    : bool
+    description : Include a floating-point unit in the core
+    paramtype   : generic
+    default     : true
+
   disable_flatten_core:
     datatype    : bool
     description : Prevent Vivado from flattening the main core components
diff --git a/register_file.vhdl b/register_file.vhdl
index 10f28a4..32c8490 100644
--- a/register_file.vhdl
+++ b/register_file.vhdl
@@ -8,6 +8,7 @@ use work.common.all;
 entity register_file is
     generic (
         SIM : boolean := false;
+        HAS_FPU : boolean := true;
         -- Non-zero to enable log data collection
         LOG_LENGTH : natural := 0
         );
@@ -28,12 +29,12 @@ entity register_file is
         sim_dump      : in std_ulogic;
         sim_dump_done : out std_ulogic;
 
-        log_out       : out std_ulogic_vector(70 downto 0)
+        log_out       : out std_ulogic_vector(71 downto 0)
         );
 end entity register_file;
 
 architecture behaviour of register_file is
-    type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0);
+    type regfile is array(0 to 127) of std_ulogic_vector(63 downto 0);
     signal registers : regfile := (others => (others => '0'));
     signal rd_port_b : std_ulogic_vector(63 downto 0);
     signal dbg_data : std_ulogic_vector(63 downto 0);
@@ -41,53 +42,73 @@ architecture behaviour of register_file is
 begin
     -- synchronous writes
     register_write_0: process(clk)
+        variable w_addr : gspr_index_t;
     begin
         if rising_edge(clk) then
             if w_in.write_enable = '1' then
-		if w_in.write_reg(5) = '0' then
-		    report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
-		else
-		    report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
-		end if;
+                w_addr := w_in.write_reg;
+                if HAS_FPU and w_addr(6) = '1' then
+                    report "Writing FPR " & to_hstring(w_addr(4 downto 0)) & " " & to_hstring(w_in.write_data);
+                else
+                    w_addr(6) := '0';
+                    if w_addr(5) = '0' then
+                        report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data);
+                    else
+                        report "Writing GSPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data);
+                    end if;
+                end if;
                 assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
-                registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data;
+                registers(to_integer(unsigned(w_addr))) <= w_in.write_data;
             end if;
         end if;
     end process register_write_0;
 
     -- asynchronous reads
     register_read_0: process(all)
-        variable b_addr : gspr_index_t;
+        variable a_addr, b_addr, c_addr : gspr_index_t;
+        variable w_addr : gspr_index_t;
     begin
+        a_addr := d_in.read1_reg;
+        b_addr := d_in.read2_reg;
+        c_addr := d_in.read3_reg;
+        w_addr := w_in.write_reg;
+        if not HAS_FPU then
+            -- Make it obvious that we only want 64 GSPRs for a no-FPU implementation
+            a_addr(6) := '0';
+            b_addr(6) := '0';
+            c_addr(6) := '0';
+            w_addr(6) := '0';
+        end if;
         if d_in.read1_enable = '1' then
-            report "Reading GPR " & to_hstring(d_in.read1_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read1_reg))));
+            report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr))));
         end if;
         if d_in.read2_enable = '1' then
-            report "Reading GPR " & to_hstring(d_in.read2_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read2_reg))));
+            report "Reading GPR " & to_hstring(b_addr) & " " & to_hstring(registers(to_integer(unsigned(b_addr))));
         end if;
         if d_in.read3_enable = '1' then
-            report "Reading GPR " & to_hstring(d_in.read3_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read3_reg))));
+            report "Reading GPR " & to_hstring(c_addr) & " " & to_hstring(registers(to_integer(unsigned(c_addr))));
         end if;
-        d_out.read1_data <= registers(to_integer(unsigned(d_in.read1_reg)));
+        d_out.read1_data <= registers(to_integer(unsigned(a_addr)));
         -- B read port is multiplexed with reads from the debug circuitry
         if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then
             b_addr := dbg_gpr_addr;
-        else
-            b_addr := d_in.read2_reg;
+            if not HAS_FPU then
+                b_addr(6) := '0';
+            end if;
         end if;
         rd_port_b <= registers(to_integer(unsigned(b_addr)));
         d_out.read2_data <= rd_port_b;
-        d_out.read3_data <= registers(to_integer(unsigned(gpr_to_gspr(d_in.read3_reg))));
+        d_out.read3_data <= registers(to_integer(unsigned(c_addr)));
 
         -- Forward any written data
         if w_in.write_enable = '1' then
-            if d_in.read1_reg = w_in.write_reg then
+            if a_addr = w_addr then
                 d_out.read1_data <= w_in.write_data;
             end if;
-            if d_in.read2_reg = w_in.write_reg then
+            if b_addr = w_addr then
                 d_out.read2_data <= w_in.write_data;
             end if;
-            if gpr_to_gspr(d_in.read3_reg) = w_in.write_reg then
+            if c_addr = w_addr then
                 d_out.read3_data <= w_in.write_data;
             end if;
         end if;
@@ -136,7 +157,7 @@ begin
     end generate;
 
     rf_log: if LOG_LENGTH > 0 generate
-        signal log_data : std_ulogic_vector(70 downto 0);
+        signal log_data : std_ulogic_vector(71 downto 0);
     begin
         reg_log: process(clk)
         begin
diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c
index 146346d..c61c8a5 100644
--- a/scripts/fmt_log/fmt_log.c
+++ b/scripts/fmt_log/fmt_log.c
@@ -58,7 +58,7 @@ struct log_entry {
 	u64	ls_lo_valid: 1;
 	u64	ls_eo_except: 1;
 	u64	ls_stall_out: 1;
-	u64	pad2: 2;
+	u64	pad2: 1;
 	u64	dc_state: 3;
 	u64	dc_ra_valid: 1;
 	u64	dc_tlb_way: 3;
@@ -74,7 +74,7 @@ struct log_entry {
 	u64	cr_wr_mask: 8;
 	u64	cr_wr_data: 4;
 	u64	cr_wr_enable: 1;
-	u64	reg_wr_reg: 6;
+	u64	reg_wr_reg: 7;
 	u64	reg_wr_enable: 1;
 
 	u64	reg_wr_data;
@@ -84,17 +84,17 @@ struct log_entry {
 #define FLGA(i, y, z)	(log.i? y: z)
 #define PNIA(f)		(full_nia[log.f] & 0xff)
 
-const char *units[4] = { "--", "al", "ls", "?3" };
+const char *units[4] = { "--", "al", "ls", "fp" };
 const char *ops[64] =
 {
 	"illegal", "nop    ", "add    ", "and    ", "attn   ", "b      ", "bc     ", "bcreg  ",
 	"bperm  ", "cmp    ", "cmpb   ", "cmpeqb ", "cmprb  ", "cntz   ", "crop   ", "darn   ",
 	"dcbf   ", "dcbst  ", "dcbt   ", "dcbtst ", "dcbz   ", "div    ", "dive   ", "exts   ",
-	"extswsl", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ", "st     ", "mcrxrx ",
-	"mfcr   ", "mfmsr  ", "mfspr  ", "mod    ", "mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ",
-	"mulh64 ", "mulh32 ", "or     ", "popcnt ", "prty   ", "rfid   ", "rlc    ", "rlcl   ",
-	"rlcr   ", "sc     ", "setb   ", "shl    ", "shr    ", "sync   ", "tlbie  ", "trap   ",
-	"xor    ", "bcd    ", "addg6s ", "ffail  ", "?60    ", "?61    ", "?62    ", "?63    "
+	"extswsl", "fpop   ", "fpopi  ", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ",
+	"st     ", "fpload ", "fpstore", "mcrxrx ", "mfcr   ", "mfmsr  ", "mfspr  ", "mod    ",
+	"mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ", "mulh64 ", "mulh32 ", "or     ", "popcnt ",
+	"prty   ", "rfid   ", "rlc    ", "rlcl   ", "rlcr   ", "sc     ", "setb   ", "shl    ",
+	"shr    ", "sync   ", "tlbie  ", "trap   ", "xor    ", "bcd    ", "addg6s ", "ffail  ",
 };
 
 const char *spr_names[13] =
diff --git a/soc.vhdl b/soc.vhdl
index 0a70026..7ab146f 100644
--- a/soc.vhdl
+++ b/soc.vhdl
@@ -52,6 +52,7 @@ entity soc is
 	RAM_INIT_FILE      : string;
 	CLK_FREQ           : positive;
 	SIM                : boolean;
+        HAS_FPU            : boolean := true;
 	DISABLE_FLATTEN_CORE : boolean := false;
 	HAS_DRAM           : boolean  := false;
 	DRAM_SIZE          : integer := 0;
@@ -253,6 +254,7 @@ begin
     processor: entity work.core
 	generic map(
 	    SIM => SIM,
+            HAS_FPU => HAS_FPU,
 	    DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
 	    ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'),
             LOG_LENGTH => LOG_LENGTH
diff --git a/tests/fpu/Makefile b/tests/fpu/Makefile
new file mode 100644
index 0000000..fd8344e
--- /dev/null
+++ b/tests/fpu/Makefile
@@ -0,0 +1,3 @@
+TEST=fpu
+
+include ../Makefile.test
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
new file mode 100644
index 0000000..52f21d0
--- /dev/null
+++ b/tests/fpu/fpu.c
@@ -0,0 +1,1461 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "console.h"
+
+#define asm	__asm__ volatile
+
+#define MSR_FP	0x2000
+#define MSR_FE0	0x800
+#define MSR_FE1	0x100
+
+#define FPS_RN_NEAR	0
+#define FPS_RN_ZERO	1
+#define FPS_RN_CEIL	2
+#define FPS_RN_FLOOR	3
+#define FPS_XE		0x8
+#define FPS_ZE		0x10
+#define FPS_UE		0x20
+#define FPS_OE		0x40
+#define FPS_VE		0x80
+#define FPS_VXCVI	0x100
+#define FPS_VXSOFT	0x400
+
+extern int trapit(long arg, int (*func)(long));
+extern void do_rfid(unsigned long msr);
+extern void do_blr(void);
+
+#define SRR0	26
+#define SRR1	27
+
+static inline unsigned long mfspr(int sprnum)
+{
+	long val;
+	/* sprnum is an "i" (immediate) operand, so it must be a compile-time constant */
+	asm("mfspr %0,%1" : "=r" (val) : "i" (sprnum));
+	return val;	/* SPR contents, converted to unsigned long */
+}
+
+static inline void mtspr(int sprnum, unsigned long val)
+{
+	asm("mtspr %0,%1" : : "i" (sprnum), "r" (val));	/* sprnum must be a constant ("i"); val is passed in a GPR */
+}
+
+void disable_fp(void)
+{
+	unsigned long msr;
+	/* Clear MSR[FP], MSR[FE0] and MSR[FE1] with a read-modify-write of the MSR. */
+	asm("mfmsr %0" : "=r" (msr));
+	msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1);
+	asm("mtmsrd %0" : : "r" (msr));
+}
+
+void enable_fp(void)
+{
+	unsigned long msr;
+	/* Set MSR[FP] but leave FP exception interrupts off (FE0 = FE1 = 0). */
+	asm("mfmsr %0" : "=r" (msr));
+	msr |= MSR_FP;
+	msr &= ~(MSR_FE0 | MSR_FE1);
+	asm("mtmsrd %0" : : "r" (msr));
+}
+
+void enable_fp_interrupts(void)
+{
+	unsigned long msr;
+	/* Set both MSR[FE0] and MSR[FE1]; MSR[FP] and all other bits are left as they are. */
+	asm("mfmsr %0" : "=r" (msr));
+	msr |= MSR_FE0 | MSR_FE1;
+	asm("mtmsrd %0" : : "r" (msr));
+}
+
+void print_string(const char *str)
+{
+	while (*str != '\0')	/* emit every byte up to the NUL terminator */
+		putchar(*str++);
+}
+
+void print_hex(unsigned long val, int ndigits, const char *str)
+{
+	int shift, nibble;
+
+	/* Low `ndigits` nibbles of val, most significant first, lowercase hex. */
+	shift = (ndigits - 1) * 4;
+	while (shift >= 0) {
+		nibble = (val >> shift) & 0xf;
+		putchar(nibble < 10 ? nibble + '0' : nibble + 'a' - 10);
+		shift -= 4;
+	}
+	print_string(str);	/* caller-supplied suffix */
+}
+
+// Print "test NN:" with NN as two decimal digits; only correct for 0 <= i < 100
+void print_test_number(int i)
+{
+	print_string("test ");
+	putchar('0' + i / 10);		/* tens digit */
+	putchar('0' + i % 10);		/* units digit */
+	putchar(':');
+}
+
+unsigned long foo = 0x3ff8000000000000ul;	/* source operand for do_fp_op(): DP bit pattern of 1.5 */
+unsigned long foow;				/* doubleword written by the FP stores in do_fp_op() */
+int fooi = -76543;				/* negative word loaded by lfiwax/lfiwzx in do_fp_op() */
+int fooiw;					/* word written by stfiwx in do_fp_op() */
+
+int do_fp_op(long arg)	/* run FP load/store sequence number `arg`; always returns 0 */
+{
+	switch (arg) {
+	case 0:		/* lfd: foo -> f31 */
+		asm("lfd 31,0(%0)" : : "b" (&foo));
+		break;
+	case 1:		/* stfd: f31 -> foow (expects case 0 to have loaded f31) */
+		asm("stfd 31,0(%0)" : : "b" (&foow) : "memory");
+		break;
+	case 2:		/* lfd + stfd round trip: foo -> f30 -> foow */
+		asm("lfd 30,0(%0); stfd 30,0(%1)"
+		    : : "b" (&foo), "b" (&foow) : "memory");
+		break;
+	case 3:		/* lfiwax: word fooi -> f29, stored to foow; caller expects sign extension */
+		asm("lfiwax 29,0,%0; stfd 29,0(%1)"
+		    : : "r" (&fooi), "b" (&foow) : "memory");
+		break;
+	case 4:		/* lfiwzx: word fooi -> f28, stored to foow; caller expects zero extension */
+		asm("lfiwzx 28,0,%0; stfd 28,0(%1)"
+		    : : "r" (&fooi), "b" (&foow) : "memory");
+		break;
+	case 5:		/* lfdx foow -> f27, then stfiwx stores a word of it to fooiw */
+		asm("lfdx 27,0,%0; stfiwx 27,0,%1"
+		    : : "r" (&foow), "r" (&fooiw) : "memory");
+		break;
+	}
+	return 0;
+}
+
+
+int fpu_test_1(void)
+{
+	int ret;
+	/* Checks lfd/stfd trap with FP disabled and work with FP enabled; returns 0 on pass. */
+	disable_fp();
+	/* these should give a FP unavailable exception */
+	ret = trapit(0, do_fp_op);	/* trapit() presumably returns the interrupt vector if func traps -- confirm */
+	if (ret != 0x800)
+		return 1;
+	ret = trapit(1, do_fp_op);
+	if (ret != 0x800)
+		return 2;
+	enable_fp();
+	/* these should succeed */
+	ret = trapit(0, do_fp_op);
+	if (ret)
+		return ret | 3;
+	ret = trapit(1, do_fp_op);
+	if (ret)
+		return ret | 4;
+	if (foow != foo)		/* value stored by case 1 must equal value loaded by case 0 */
+		return 5;
+	return 0;
+}
+
+int fpu_test_2(void)
+{
+	int ret;
+	/* Checks lfd/stfd, lfiwax, lfiwzx and stfiwx results; returns 0 on pass. */
+	enable_fp();
+	foow = ~0;			/* poison the destination before each store test */
+	ret = trapit(2, do_fp_op);
+	if (ret)
+		return ret | 1;
+	if (foow != foo)
+		return 2;
+	foow = ~0;
+	ret = trapit(3, do_fp_op);
+	if (ret)
+		return ret | 3;
+	if (foow != fooi)		/* fooi is sign-extended to 64 bits by this comparison */
+		return 4;
+	foow = ~0;
+	ret = trapit(4, do_fp_op);
+	if (ret)
+		return ret | 5;
+	if (foow != (unsigned int)fooi)	/* zero-extended 32-bit value expected */
+		return 6;
+	ret = trapit(5, do_fp_op);	/* foow still holds the zero-extended fooi here */
+	if (ret)
+		return ret | 7;
+	if (fooiw != fooi)
+		return 8;
+	return 0;
+}
+
+struct sp_dp_equiv {
+	unsigned int sp;	/* single-precision bit pattern */
+	unsigned long dp;	/* double-precision bit pattern of the same value */
+} sp_dp_equiv[] = {
+	{ 0, 0 },					/* +0 */
+	{ 0x80000000, 0x8000000000000000 },		/* -0 */
+	{ 0x7f800000, 0x7ff0000000000000 },		/* +infinity */
+	{ 0xff800000, 0xfff0000000000000 },		/* -infinity */
+	{ 0x7f812345, 0x7ff02468a0000000 },		/* NaN; fraction bits carried over unchanged */
+	{ 0x456789ab, 0x40acf13560000000 },		/* normalized value */
+	{ 0x12345678, 0x3a468acf00000000 },		/* small normalized value */
+	{ 0x00400000, 0x3800000000000000 },		/* SP denormal, normalized in DP */
+	{ 0x00200000, 0x37f0000000000000 },		/* SP denormal */
+	{ 0x00000002, 0x36b0000000000000 },		/* SP denormal */
+	{ 0x00000001, 0x36a0000000000000 },		/* smallest SP denormal */
+	{ 0x7f7fffff, 0x47efffffe0000000 },		/* largest finite SP value */
+};
+
+int sp_to_dp(long arg)
+{
+	unsigned long dp;
+	/* lfs entry `arg` of sp_dp_equiv, stfd it back; returns 1 on mismatch with .dp, else 0. */
+	asm("lfs 20,0(%0); stfd 20,0(%1)"
+	    : : "b" (&sp_dp_equiv[arg].sp), "b" (&dp) : "memory");
+	if (dp != sp_dp_equiv[arg].dp) {
+		print_hex(sp_dp_equiv[arg].sp, 8, " ");		/* input */
+		print_hex(dp, 16, " ");				/* actual result */
+		print_hex(sp_dp_equiv[arg].dp, 16, " ");	/* expected result */
+	}
+	return dp != sp_dp_equiv[arg].dp;
+}
+
+int dp_to_sp(long arg)
+{
+	unsigned int sp;
+	/* lfd entry `arg` of sp_dp_equiv, stfs it back; returns 1 on mismatch with .sp, else 0. */
+	asm("lfd 21,0(%0); stfs 21,0(%1)"
+	    : : "b" (&sp_dp_equiv[arg].dp), "b" (&sp) : "memory");
+	return sp != sp_dp_equiv[arg].sp;
+}
+
+int fpu_test_3(void)
+{
+	int i, n, ret;
+	/* Runs sp_to_dp() and dp_to_sp() on every sp_dp_equiv entry; returns 0 on pass. */
+	n = sizeof(sp_dp_equiv) / sizeof(sp_dp_equiv[0]);
+	enable_fp();
+	for (i = 0; i < n; ++i) {
+		ret = trapit(i, sp_to_dp);
+		if (ret != 0) {
+			if (ret == 1)		/* value mismatch: report 1 + entry index */
+				ret += i;
+			return ret;		/* any other code is returned unchanged */
+		}
+		ret = trapit(i, dp_to_sp);
+		if (ret != 0) {
+			if (ret == 1)		/* value mismatch: report 0x10001 + entry index */
+				ret += i + 0x10000;
+			return ret;
+		}
+	}
+	return 0;
+}
+
+unsigned long get_fpscr(void)
+{
+	unsigned long ret;
+	/* mffs copies the FPSCR into f10, which is then stored to `ret` via memory. */
+	asm("mffs 10; stfd 10,0(%0)" : : "b" (&ret) : "memory");
+	return ret;
+}
+
+void set_fpscr(unsigned long fpscr)
+{
+	asm("lfd%U0%X0 7,%0; mtfsf 0,7,1,0" : : "m" (fpscr));	/* load fpscr into f7, then mtfsf with L=1 (whole FPSCR) */
+}
+
+unsigned long fpscr_eval(unsigned long val)	/* recompute the summary bits VX and FEX from the rest of val */
+{
+	val &= ~0x60000000;	/* clear FEX and VX */
+	if (val & 0x1f80700)	/* test all VX* bits */
+		val |= 0x20000000;	/* VX */
+	if ((val >> 25) & (val >> 3) & 0x1f)	/* any of bits 29..25 set with its enable in bits 7..3 */
+		val |= 0x40000000;	/* FEX */
+	return val;
+}
+
+unsigned int test4vals[] = {	/* 32-bit patterns that test4() writes to the FPSCR */
+	0xdeadbeef, 0x1324679a, 0, 0xffffffff, 0xabcd
+};
+
+int test4(long arg)
+{
+	unsigned long fsi, fso, fpscr;
+	long i;
+	unsigned long cr, mask;
+	/* Exercises the FPSCR move instructions; returns 0, or a code naming the failed check. */
+	/* check we can do basic mtfsf and mffs */
+	i = 1;
+	for (fsi = 1; fsi < 0x100; fsi <<= 1) {
+		asm("lfd 7,0(%0); mtfsf 0,7,1,0" : : "b" (&fsi));
+		if (get_fpscr() != fsi)
+			return i;
+		++i;
+		fpscr = fsi;		/* model of the expected FPSCR contents */
+	}
+	for (i = 0; i < sizeof(test4vals) / sizeof(test4vals[0]); ++i) {
+		fsi = test4vals[i];
+		asm("lfd 7,0(%0); mtfsf 0x55,7,0,0" : : "b" (&fsi));
+		fpscr = fpscr_eval((fpscr & 0xf0f0f0f0) | (fsi & 0x0f0f0f0f));
+		if (get_fpscr() != fpscr)
+			return 16 * i + 16;
+		asm("mtfsf 0xaa,7,0,0");	/* relies on f7 still holding fsi */
+		fpscr = fpscr_eval((fpscr & 0x0f0f0f0f) | (fsi & 0xf0f0f0f0));
+		if (get_fpscr() != fpscr)
+			return 16 * i + 17;
+		asm("mffs. 6; mfcr %0" : "=r" (cr) : : "cr1");
+		if (((cr >> 24) & 0xf) != ((fpscr >> 28) & 0xf))	/* CR1 is 4 bits: FPSCR bits 31..28 */
+			return 16 * i + 18;
+		asm("mffsce 12; stfd 12,0(%0)" : : "b" (&fso) : "memory");
+		if (fso != fpscr)
+			return 16 * i + 19;
+		fpscr = fpscr_eval(fpscr & ~0xf8);	/* mffsce is expected to clear the enable bits */
+		if (get_fpscr() != fpscr)
+			return 16 * i + 20;
+		asm("lfd 7,0(%0); mtfsf 0xff,7,0,0" : : "b" (&fsi));
+		fpscr = fpscr_eval(fsi);
+		fsi = ~fsi;
+		asm("lfd 14,0(%0); mffscrn 15,14; stfd 15,0(%1)"
+		    : : "b" (&fsi), "b" (&fso) : "memory");
+		if (fso != (fpscr & 0xff))
+			return 16 * i + 21;
+		fpscr = (fpscr & ~3) | (fsi & 3);	/* only the rounding mode should change */
+		if (get_fpscr() != fpscr)
+			return 16 * i + 22;
+		fso = ~fso;
+		asm("mffscrni 16,1; stfd 16,0(%0)" : : "b" (&fso) : "memory");
+		if (fso != (fpscr & 0xff))
+			return 16 * i + 23;
+		fpscr = (fpscr & ~3) | 1;
+		if (get_fpscr() != fpscr)
+			return 16 * i + 24;
+		asm("mffsl 17; stfd 17,0(%0)" : : "b" (&fso) : "memory");
+		mask = ((1 << (63-45+1)) - (1 << (63-51))) | ((1 << (63-56+1)) - (1 << (63-63)));
+		if (fso != (fpscr & mask))	/* mask = FPSCR bits 45..51 and 56..63 (bit 0 = MSB) */
+			return 16 * i + 25;
+		asm("mcrfs 0,3; mcrfs 7,0; mfcr %0" : "=r" (cr) : : "cr0", "cr7");
+		fso = fpscr_eval(fpscr & ~0x80000);
+		if (((cr >> 28) & 0xf) != ((fpscr >> 16) & 0xf) ||
+		    ((cr >> 0) & 0xf) != ((fso >> 28) & 0xf))
+			return 16 * i + 26;
+		fpscr = fso & 0x6fffffff;
+		asm("mtfsfi 0,7,0");
+		fpscr = fpscr_eval((fpscr & 0x0fffffff) | 0x70000000);
+		if (get_fpscr() != fpscr)
+			return 16 * i + 27;
+		asm("mtfsb0 21");
+		fpscr = fpscr_eval(fpscr & ~(1 << (31-21)));
+		if (get_fpscr() != fpscr)
+			return 16 * i + 28;
+		asm("mtfsb1 21");
+		fpscr = fpscr_eval(fpscr | (1 << (31-21)));
+		if (get_fpscr() != fpscr)
+			return 16 * i + 29;
+		asm("mtfsb0 24");
+		fpscr = fpscr_eval(fpscr & ~(1 << (31-24)));
+		if (get_fpscr() != fpscr)
+			return 16 * i + 30;
+		asm("mtfsb1. 24; mfcr %0" : "=r" (cr) : : "cr1");	/* record form writes CR1 */
+		fpscr = fpscr_eval(fpscr | (1 << (31-24)));
+		if (get_fpscr() != fpscr || ((cr >> 24) & 0xf) != ((fpscr >> 28) & 0xf))
+			return 16 * i + 31;
+	}
+	return 0;
+}
+
+int fpu_test_4(void)
+{
+	enable_fp();
+	return trapit(0, test4);	/* argument is unused by test4() */
+}
+
+int test5a(long arg)	/* an enabled exception is raised by mtfsf while FE0/FE1 are already set */
+{
+	set_fpscr(0);
+	enable_fp_interrupts();
+	set_fpscr(FPS_VE);		/* set VE */
+	set_fpscr(FPS_VXSOFT | FPS_VE);	/* set VXSOFT */
+	set_fpscr(0);
+	return 1;		/* not supposed to get here */
+}
+
+int test5b(long arg)	/* the exception is already pending when mtmsrd sets FE0/FE1 */
+{
+	unsigned long msr;
+
+	enable_fp();
+	set_fpscr(FPS_VE);		/* set VE */
+	set_fpscr(FPS_VXSOFT | FPS_VE);	/* set VXSOFT */
+	asm("mfmsr %0" : "=r" (msr));
+	msr |= MSR_FE0 | MSR_FE1;
+	asm("mtmsrd %0; xori 4,4,0" : : "r" (msr));	/* fpu_test_5() expects SRR0 to point at this xori */
+	set_fpscr(0);
+	return 1;		/* not supposed to get here */
+}
+
+int test5c(long arg)	/* the exception is already pending when FE0/FE1 are set via do_rfid() */
+{
+	unsigned long msr;
+
+	enable_fp();
+	set_fpscr(FPS_VE);		/* set VE */
+	set_fpscr(FPS_VXSOFT | FPS_VE);	/* set VXSOFT */
+	asm("mfmsr %0" : "=r" (msr));
+	msr |= MSR_FE0 | MSR_FE1;
+	do_rfid(msr);			/* fpu_test_5() expects SRR0 == &do_blr afterwards */
+	set_fpscr(0);
+	return 1;		/* not supposed to get here */
+}
+
+int fpu_test_5(void)
+{
+	int ret;
+	unsigned int *ip;
+	/* Each of test5a/b/c must end in a 0x700 interrupt with the expected SRR0 and SRR1. */
+	enable_fp();
+	ret = trapit(0, test5a);
+	if (ret != 0x700)
+		return 1;
+	ip = (unsigned int *)mfspr(SRR0);
+	/* check it's a mtfsf 0,7,1,0 instruction */
+	if (*ip != (63u << 26) + (1 << 25) + (7 << 11) + (711 << 1))
+		return 2;
+	if ((mfspr(SRR1) & 0x783f0000) != (1 << (63 - 43)))	/* only SRR1 bit 43 (bit 0 = MSB) may be set */
+		return 3;
+
+	ret = trapit(0, test5b);
+	if (ret != 0x700)
+		return 4;
+	ip = (unsigned int *)mfspr(SRR0);
+	/* check it's an xori 4,4,0 instruction */
+	if (*ip != 0x68840000)
+		return 5;
+	if ((mfspr(SRR1) & 0x783f0000) != (1 << (63 - 43)) + (1 << (63 - 47)))	/* SRR1 bits 43 and 47 */
+		return 6;
+
+	ret = trapit(0, test5c);
+	if (ret != 0x700)
+		return 7;
+	ip = (unsigned int *)mfspr(SRR0);
+	/* check it's the destination of the rfid */
+	if (ip != (void *)&do_blr)
+		return 8;
+	if ((mfspr(SRR1) & 0x783f0000) != (1 << (63 - 43)) + (1 << (63 - 47)))	/* SRR1 bits 43 and 47 */
+		return 9;
+
+	return 0;
+}
+
+#define SIGN	0x8000000000000000ul
+
+int test6(long arg)
+{
+	long i;
+	unsigned long results[6];
+	unsigned long v;
+	/* Checks fmr/fneg/fabs/fnabs/fcpsgn on every sp_dp_equiv[].dp; returns 0 or 1 + failing index. */
+	for (i = 0; i < sizeof(sp_dp_equiv) / sizeof(sp_dp_equiv[0]); ++i) {
+		v = sp_dp_equiv[i].dp;
+		asm("lfd%U0%X0 3,%0; fmr 6,3; fneg 7,3; stfd 6,0(%1); stfd 7,8(%1)"
+		    : : "m" (sp_dp_equiv[i].dp), "b" (results) : "memory");
+		asm("fabs 9,6; fnabs 10,6; stfd 9,16(%0); stfd 10,24(%0)"
+		    : : "b" (results) : "memory");
+		asm("fcpsgn 4,9,3; stfd 4,32(%0); fcpsgn 5,10,3; stfd 5,40(%0)"
+		    : : "b" (results) : "memory");
+		if (results[0] != v ||			/* fmr: unchanged */
+		    results[1] != (v ^ SIGN) ||		/* fneg: sign flipped */
+		    results[2] != (v & ~SIGN) ||	/* fabs: sign cleared */
+		    results[3] != (v | SIGN) ||		/* fnabs: sign set */
+		    results[4] != (v & ~SIGN) ||	/* fcpsgn with sign of f9 (positive) */
+		    results[5] != (v | SIGN))		/* fcpsgn with sign of f10 (negative) */
+			return i + 1;
+	}
+	return 0;
+}
+
+int fpu_test_6(void)
+{
+	enable_fp();
+	return trapit(0, test6);	/* argument is unused by test6() */
+}
+
+struct int_fp_equiv {
+	long		ival;	/* 64-bit integer input */
+	unsigned long	fp;	/* expected fcfid result */
+	unsigned long	fp_u;	/* expected fcfidu result */
+	unsigned long	fp_s;	/* expected fcfids result */
+	unsigned long	fp_us;	/* expected fcfidus result */
+} intvals[] = {
+	{ 0,  0, 0, 0, 0 },
+	{ 1,  0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },
+	{ -1, 0xbff0000000000000, 0x43f0000000000000, 0xbff0000000000000, 0x43f0000000000000 },
+	{ 2,  0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000 },
+	{ -2, 0xc000000000000000, 0x43f0000000000000, 0xc000000000000000, 0x43f0000000000000 },
+	{ 0x12345678, 0x41b2345678000000, 0x41b2345678000000, 0x41b2345680000000, 0x41b2345680000000 },
+	{ 0x0008000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000 },
+	{ 0x0010000000000000, 0x4330000000000000, 0x4330000000000000, 0x4330000000000000, 0x4330000000000000 },
+	{ 0x0020000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000000000001, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000000000002, 0x4340000000000001, 0x4340000000000001, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000000000003, 0x4340000000000002, 0x4340000000000002, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000010000000, 0x4340000008000000, 0x4340000008000000, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000020000000, 0x4340000010000000, 0x4340000010000000, 0x4340000000000000, 0x4340000000000000 },
+	{ 0x0020000030000000, 0x4340000018000000, 0x4340000018000000, 0x4340000020000000, 0x4340000020000000 },
+	{ 0x0020000040000000, 0x4340000020000000, 0x4340000020000000, 0x4340000020000000, 0x4340000020000000 },
+	{ 0x0020000080000000, 0x4340000040000000, 0x4340000040000000, 0x4340000040000000, 0x4340000040000000 },
+	{ 0x0040000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000001, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000002, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000003, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000004, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000005, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000006, 0x4350000000000002, 0x4350000000000002, 0x4350000000000000, 0x4350000000000000 },
+	{ 0x0040000000000007, 0x4350000000000002, 0x4350000000000002, 0x4350000000000000, 0x4350000000000000 },
+};
+
+int test7(long arg)
+{
+	long i;
+	unsigned long results[4];
+	/* Checks fcfid/fcfidu/fcfids/fcfidus against intvals[]; returns 0 or 1 + failing index. */
+	for (i = 0; i < sizeof(intvals) / sizeof(intvals[0]); ++i) {
+		asm("lfd%U0%X0 3,%0; fcfid 6,3; fcfidu 7,3; stfd 6,0(%1); stfd 7,8(%1)"
+		    : : "m" (intvals[i].ival), "b" (results) : "memory");
+		asm("fcfids 9,3; stfd 9,16(%0); fcfidus 10,3; stfd 10,24(%0)"
+		    : : "b" (results) : "memory");	/* relies on f3 still holding ival */
+		if (results[0] != intvals[i].fp ||
+		    results[1] != intvals[i].fp_u ||
+		    results[2] != intvals[i].fp_s ||
+		    results[3] != intvals[i].fp_us) {
+			print_string("\r\n");
+			print_hex(results[0], 16, " ");		/* dump all four actual results */
+			print_hex(results[1], 16, " ");
+			print_hex(results[2], 16, " ");
+			print_hex(results[3], 16, " ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_7(void)
+{
+	enable_fp();
+	return trapit(0, test7);
+}
+
+struct roundvals {	/* test8 vectors: round a double to single precision under a given FPSCR setting */
+	unsigned long fpscr;	/* loaded into the FPSCR by mtfsf before the conversion (an FPS_RN_* constant) */
+	unsigned long dpval;	/* input, as a double-precision bit pattern */
+	unsigned long spval;	/* expected frsp result, stored back in double-precision format */
+} roundvals[] = {
+	{ FPS_RN_NEAR,  0, 0 },
+	{ FPS_RN_CEIL,  0x8000000000000000, 0x8000000000000000 },
+	{ FPS_RN_NEAR,  0x402123456789abcd, 0x4021234560000000 },
+	{ FPS_RN_ZERO,  0x402123456789abcd, 0x4021234560000000 },
+	{ FPS_RN_CEIL,  0x402123456789abcd, 0x4021234580000000 },
+	{ FPS_RN_FLOOR, 0x402123456789abcd, 0x4021234560000000 },
+	{ FPS_RN_NEAR,  0x402123457689abcd, 0x4021234580000000 },
+	{ FPS_RN_ZERO,  0x402123457689abcd, 0x4021234560000000 },
+	{ FPS_RN_CEIL,  0x402123457689abcd, 0x4021234580000000 },
+	{ FPS_RN_FLOOR, 0x402123457689abcd, 0x4021234560000000 },
+	{ FPS_RN_NEAR,  0x4021234570000000, 0x4021234580000000 },
+	{ FPS_RN_NEAR,  0x4021234550000000, 0x4021234540000000 },
+	{ FPS_RN_NEAR,  0x7ff123456789abcd, 0x7ff9234560000000 },
+	{ FPS_RN_ZERO,  0x7ffa3456789abcde, 0x7ffa345660000000 },
+	{ FPS_RN_FLOOR, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ FPS_RN_NEAR,  0x47e1234550000000, 0x47e1234540000000 },
+	{ FPS_RN_NEAR,  0x47f1234550000000, 0x7ff0000000000000 },
+	{ FPS_RN_ZERO,  0x47f1234550000000, 0x47efffffe0000000 },
+	{ FPS_RN_CEIL,  0x47f1234550000000, 0x7ff0000000000000 },
+	{ FPS_RN_FLOOR, 0x47f1234550000000, 0x47efffffe0000000 },
+	{ FPS_RN_NEAR,  0x38012345b0000000, 0x38012345c0000000 },
+	{ FPS_RN_NEAR,  0x37c12345b0000000, 0x37c1234400000000 },
+};
+
+int test8(long arg)	/* frsp check; 0 = pass, else 1 + index of the failing roundvals[] entry; arg is unused */
+{
+	struct roundvals *rv;	/* entry under test */
+	unsigned long rounded;	/* raw bits stored after frsp */
+
+	for (rv = roundvals; rv < roundvals + sizeof(roundvals) / sizeof(roundvals[0]); ++rv) {
+		asm("lfd 3,0(%0); lfd 4,8(%0); mtfsf 0,3,1,0; frsp 6,4; stfd 6,0(%1)"
+		    : : "b" (rv), "b" (&rounded) : "memory");
+		if (rounded == rv->spval)
+			continue;
+		print_string("\r\n");	/* mismatch: report entry index and observed value */
+		print_hex(rv - roundvals, 4, " ");
+		print_hex(rounded, 16, " ");
+		return rv - roundvals + 1;
+	}
+	return 0;
+}
+
+int fpu_test_8(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();
+	return trapit(0, test8);	/* trapit (head.S) calls test8(0) and passes its return value back */
+}
+
+struct cvtivals {	/* FP-to-integer conversion vectors; the layout is shared by cvtivals[] (test9) and cvtizvals[] (test10) */
+	unsigned long dval;	/* input, as a double-precision bit pattern */
+	long lval;	/* expected signed 64-bit result (fctid / fctidz) */
+	unsigned long ulval;	/* expected unsigned 64-bit result (fctidu / fctiduz) */
+	int ival;	/* expected signed 32-bit result (fctiw / fctiwz) */
+	unsigned int uival;	/* expected unsigned 32-bit result (fctiwu / fctiwuz) */
+	unsigned char invalids[4];	/* expected VXCVI flag after each of the four conversions, in the order above */
+} cvtivals[] = {
+	{ 0x0000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} },
+	{ 0x8000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} },
+	{ 0x3fdfffffffffffff, 0, 0, 0, 0, {0, 0, 0, 0} },
+	{ 0x3ff0000000000000, 1, 1, 1, 1, {0, 0, 0, 0} },
+	{ 0xbff0000000000000, -1, 0, -1, 0, {0, 1, 0, 1} },
+	{ 0x402123456789abcd, 9, 9, 9, 9, {0, 0, 0, 0} },
+	{ 0x406123456789abcd, 137, 137, 137, 137, {0, 0, 0, 0} },
+	{ 0x409123456789abcd, 1097, 1097, 1097, 1097, {0, 0, 0, 0} },
+	{ 0x41c123456789abcd, 0x22468acf, 0x22468acf, 0x22468acf, 0x22468acf, {0, 0, 0, 0} },
+	{ 0x41d123456789abcd, 0x448d159e, 0x448d159e, 0x448d159e, 0x448d159e, {0, 0, 0, 0} },
+	{ 0x41e123456789abcd, 0x891a2b3c, 0x891a2b3c, 0x7fffffff, 0x891a2b3c, {0, 0, 1, 0} },
+	{ 0x41f123456789abcd, 0x112345679, 0x112345679, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0xc1f123456789abcd, -0x112345679, 0, 0x80000000, 0, {0, 1, 1, 1} },
+	{ 0x432123456789abcd, 0x891a2b3c4d5e6, 0x891a2b3c4d5e6, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x433123456789abcd, 0x1123456789abcd, 0x1123456789abcd, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x434123456789abcd, 0x22468acf13579a, 0x22468acf13579a, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43c123456789abcd, 0x22468acf13579a00, 0x22468acf13579a00, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43d123456789abcd, 0x448d159e26af3400, 0x448d159e26af3400, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43e123456789abcd, 0x7fffffffffffffff, 0x891a2b3c4d5e6800, 0x7fffffff, 0xffffffff, {1, 0, 1, 1} },
+	{ 0x43f123456789abcd, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} },
+	{ 0xc3f123456789abcd, 0x8000000000000000, 0, 0x80000000, 0, {1, 1, 1, 1} },
+	{ 0x7ff0000000000000, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} },
+	{ 0xfff0000000000000, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
+	{ 0x7ff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
+	{ 0xfff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
+	{ 0xbfd123456789abcd, 0, 0, 0, 0, {0, 0, 0, 0} },
+};
+
+#define GET_VXCVI()	((get_fpscr() >> 8) & 1)	/* bit 8 (counting the LSB as 0) of get_fpscr(): the VXCVI invalid-integer-conversion flag */
+
+int test9(long arg)	/* fctid/fctidu/fctiw/fctiwu check; 0 = pass, else 1 + index of the failing cvtivals[] entry; arg is unused */
+{
+	long i;
+	int ires;	/* fctiw result, written by a 32-bit stfiwx */
+	unsigned int ures;	/* fctiwu result, written by a 32-bit stfiwx */
+	long lres;	/* fctid result */
+	unsigned long ulres;	/* fctidu result */
+	unsigned char inv[4];	/* VXCVI observed after fctid, fctidu, fctiw, fctiwu */
+	struct cvtivals *vp = cvtivals;
+
+	for (i = 0; i < sizeof(cvtivals) / sizeof(cvtivals[0]); ++i, ++vp) {
+		set_fpscr(FPS_RN_NEAR);	/* repeated before every conversion; presumably also clears the VXCVI left by the previous one -- set_fpscr is defined outside this section */
+		asm("lfd 3,0(%0); fctid 4,3; stfd 4,0(%1)"	/* f3 = dval; the three asm statements below reuse f3 without reloading it */
+		    : : "b" (&vp->dval), "b" (&lres) : "memory");
+		inv[0] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctidu 5,3; stfd 5,0(%0)" : : "b" (&ulres) : "memory");
+		inv[1] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctiw 6,3; stfiwx 6,0,%0" : : "b" (&ires) : "memory");
+		inv[2] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctiwu 7,3; stfiwx 7,0,%0" : : "b" (&ures) : "memory");
+		inv[3] = GET_VXCVI();
+
+		if (lres != vp->lval || ulres != vp->ulval || ires != vp->ival || ures != vp->uival ||
+		    inv[0] != vp->invalids[0] || inv[1] != vp->invalids[1] ||
+		    inv[2] != vp->invalids[2] || inv[3] != vp->invalids[3]) {
+			print_hex(lres, 16, inv[0]? "V ": "  ");	/* each value is followed by "V " when VXCVI was observed for it */
+			print_hex(ulres, 16, inv[1]? "V ": "  ");
+			print_hex(ires, 8, inv[2]? "V ": "  ");
+			print_hex(ures, 8, inv[3]? "V ": "  ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_9(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();	/* defined outside this section; presumably sets MSR[FP] so FP instructions do not trap */
+	return trapit(0, test9);	/* trapit (head.S) calls test9(0) and passes its return value back */
+}
+
+struct cvtivals cvtizvals[] = {	/* test10 vectors: expected results of fctidz, fctiduz, fctiwz, fctiwuz for each dval */
+	{ 0x0000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} },
+	{ 0x8000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} },
+	{ 0x3fdfffffffffffff, 0, 0, 0, 0, {0, 0, 0, 0} },
+	{ 0x3ff0000000000000, 1, 1, 1, 1, {0, 0, 0, 0} },
+	{ 0xbff0000000000000, -1, 0, -1, 0, {0, 1, 0, 1} },
+	{ 0x402123456789abcd, 8, 8, 8, 8, {0, 0, 0, 0} },
+	{ 0x406123456789abcd, 137, 137, 137, 137, {0, 0, 0, 0} },
+	{ 0x409123456789abcd, 1096, 1096, 1096, 1096, {0, 0, 0, 0} },
+	{ 0x41c123456789abcd, 0x22468acf, 0x22468acf, 0x22468acf, 0x22468acf, {0, 0, 0, 0} },
+	{ 0x41d123456789abcd, 0x448d159e, 0x448d159e, 0x448d159e, 0x448d159e, {0, 0, 0, 0} },
+	{ 0x41e123456789abcd, 0x891a2b3c, 0x891a2b3c, 0x7fffffff, 0x891a2b3c, {0, 0, 1, 0} },
+	{ 0x41f123456789abcd, 0x112345678, 0x112345678, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0xc1f123456789abcd, -0x112345678, 0, 0x80000000, 0, {0, 1, 1, 1} },
+	{ 0x432123456789abcd, 0x891a2b3c4d5e6, 0x891a2b3c4d5e6, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x433123456789abcd, 0x1123456789abcd, 0x1123456789abcd, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x434123456789abcd, 0x22468acf13579a, 0x22468acf13579a, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43c123456789abcd, 0x22468acf13579a00, 0x22468acf13579a00, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43d123456789abcd, 0x448d159e26af3400, 0x448d159e26af3400, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} },
+	{ 0x43e123456789abcd, 0x7fffffffffffffff, 0x891a2b3c4d5e6800, 0x7fffffff, 0xffffffff, {1, 0, 1, 1} },
+	{ 0x43f123456789abcd, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} },
+	{ 0xc3f123456789abcd, 0x8000000000000000, 0, 0x80000000, 0, {1, 1, 1, 1} },
+	{ 0x7ff0000000000000, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} },
+	{ 0xfff0000000000000, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
+	{ 0x7ff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
+	{ 0xfff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } },
+};
+
+int test10(long arg)	/* fctidz/fctiduz/fctiwz/fctiwuz check; 0 = pass, else 1 + index of the failing cvtizvals[] entry; arg is unused */
+{
+	long i;
+	int ires;	/* fctiwz result, written by a 32-bit stfiwx */
+	unsigned int ures;	/* fctiwuz result, written by a 32-bit stfiwx */
+	long lres;	/* fctidz result */
+	unsigned long ulres;	/* fctiduz result */
+	unsigned char inv[4];	/* VXCVI observed after fctidz, fctiduz, fctiwz, fctiwuz */
+	struct cvtivals *vp = cvtizvals;
+
+	for (i = 0; i < sizeof(cvtizvals) / sizeof(cvtizvals[0]); ++i, ++vp) {
+		set_fpscr(FPS_RN_NEAR);	/* repeated before every conversion; presumably also clears the VXCVI left by the previous one -- set_fpscr is defined outside this section */
+		asm("lfd 3,0(%0); fctidz 4,3; stfd 4,0(%1)"	/* f3 = dval; the three asm statements below reuse f3 without reloading it */
+		    : : "b" (&vp->dval), "b" (&lres) : "memory");
+		inv[0] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctiduz 5,3; stfd 5,0(%0)" : : "b" (&ulres) : "memory");
+		inv[1] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctiwz 6,3; stfiwx 6,0,%0" : : "b" (&ires) : "memory");
+		inv[2] = GET_VXCVI();
+		set_fpscr(FPS_RN_NEAR);
+		asm("fctiwuz 7,3; stfiwx 7,0,%0" : : "b" (&ures) : "memory");
+		inv[3] = GET_VXCVI();
+
+		if (lres != vp->lval || ulres != vp->ulval || ires != vp->ival || ures != vp->uival ||
+		    inv[0] != vp->invalids[0] || inv[1] != vp->invalids[1] ||
+		    inv[2] != vp->invalids[2] || inv[3] != vp->invalids[3]) {
+			print_hex(lres, 16, inv[0]? "V ": "  ");	/* each value is followed by "V " when VXCVI was observed for it */
+			print_hex(ulres, 16, inv[1]? "V ": "  ");
+			print_hex(ires, 8, inv[2]? "V ": "  ");
+			print_hex(ures, 8, inv[3]? "V ": "  ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_10(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();	/* defined outside this section; presumably sets MSR[FP] so FP instructions do not trap */
+	return trapit(0, test10);	/* trapit (head.S) calls test10(0) and passes its return value back */
+}
+
+struct frivals {	/* test11 vectors: round a double to an integral value, result kept in FP format */
+	unsigned long val;	/* input, as a double-precision bit pattern */
+	unsigned long nval;	/* expected frin result */
+	unsigned long zval;	/* expected friz result */
+	unsigned long pval;	/* expected frip result */
+	unsigned long mval;	/* expected frim result */
+} frivals[] = {
+	{ 0x0000000000000000, 0, 0, 0, 0 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 },
+	{ 0x3fdfffffffffffff, 0, 0, 0x3ff0000000000000, 0 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },
+	{ 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000 },
+	{ 0x402123456789abcd, 0x4022000000000000, 0x4020000000000000, 0x4022000000000000, 0x4020000000000000 },
+	{ 0x406123456789abcd, 0x4061200000000000, 0x4061200000000000, 0x4061400000000000, 0x4061200000000000 },
+	{ 0x409123456789abcd, 0x4091240000000000, 0x4091200000000000, 0x4091240000000000, 0x4091200000000000 },
+	{ 0x41c123456789abcd, 0x41c1234567800000, 0x41c1234567800000, 0x41c1234568000000, 0x41c1234567800000 },
+	{ 0x41d123456789abcd, 0x41d1234567800000, 0x41d1234567800000, 0x41d1234567c00000, 0x41d1234567800000 },
+	{ 0x41e123456789abcd, 0x41e1234567800000, 0x41e1234567800000, 0x41e1234567a00000, 0x41e1234567800000 },
+	{ 0x41f123456789abcd, 0x41f1234567900000, 0x41f1234567800000, 0x41f1234567900000, 0x41f1234567800000 },
+	{ 0xc1f123456789abcd, 0xc1f1234567900000, 0xc1f1234567800000, 0xc1f1234567800000, 0xc1f1234567900000 },
+	{ 0xc1f1234567880000, 0xc1f1234567900000, 0xc1f1234567800000, 0xc1f1234567800000, 0xc1f1234567900000 },
+	{ 0x432123456789abcd, 0x432123456789abce, 0x432123456789abcc, 0x432123456789abce, 0x432123456789abcc },
+	{ 0x433123456789abcd, 0x433123456789abcd, 0x433123456789abcd, 0x433123456789abcd, 0x433123456789abcd },
+	{ 0x434123456789abcd, 0x434123456789abcd, 0x434123456789abcd, 0x434123456789abcd, 0x434123456789abcd },
+	{ 0x43c123456789abcd, 0x43c123456789abcd, 0x43c123456789abcd, 0x43c123456789abcd, 0x43c123456789abcd },
+	{ 0x43d123456789abcd, 0x43d123456789abcd, 0x43d123456789abcd, 0x43d123456789abcd, 0x43d123456789abcd },
+	{ 0x43e123456789abcd, 0x43e123456789abcd, 0x43e123456789abcd, 0x43e123456789abcd, 0x43e123456789abcd },
+	{ 0x43f123456789abcd, 0x43f123456789abcd, 0x43f123456789abcd, 0x43f123456789abcd, 0x43f123456789abcd },
+	{ 0xc3f123456789abcd, 0xc3f123456789abcd, 0xc3f123456789abcd, 0xc3f123456789abcd, 0xc3f123456789abcd },
+	{ 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000 },
+	{ 0x7ff123456789abcd, 0x7ff923456789abcd, 0x7ff923456789abcd, 0x7ff923456789abcd, 0x7ff923456789abcd },
+	{ 0xfff923456789abcd, 0xfff923456789abcd, 0xfff923456789abcd, 0xfff923456789abcd, 0xfff923456789abcd },
+};
+
+int test11(long arg)	/* frin/friz/frip/frim check; 0 = pass, else 1 + index of the failing frivals[] entry; arg is unused */
+{
+	long i;
+	unsigned long results[4];	/* raw result bits, in order: frin, friz, frip, frim */
+	struct frivals *vp = frivals;
+
+	for (i = 0; i < sizeof(frivals) / sizeof(frivals[0]); ++i, ++vp) {
+		set_fpscr(FPS_RN_FLOOR);	/* each instruction runs with the FPSCR set to a different FPS_RN_* mode than its own direction; presumably to show these instructions ignore it */
+		asm("lfd 3,0(%0); frin 4,3; stfd 4,0(%1)"	/* f3 = val; the three asm statements below reuse f3 without reloading it */
+		    : : "b" (&vp->val), "b" (results) : "memory");
+		set_fpscr(FPS_RN_NEAR);
+		asm("friz 5,3; stfd 5,8(%0)" : : "b" (results) : "memory");
+		set_fpscr(FPS_RN_ZERO);
+		asm("frip 5,3; stfd 5,16(%0)" : : "b" (results) : "memory");
+		set_fpscr(FPS_RN_CEIL);
+		asm("frim 5,3; stfd 5,24(%0)" : : "b" (results) : "memory");
+		if (results[0] != vp->nval || results[1] != vp->zval ||
+		    results[2] != vp->pval || results[3] != vp->mval) {
+			print_hex(i, 2, "\r\n");	/* mismatch: entry index, then the four observed values */
+			print_hex(results[0], 16, " ");
+			print_hex(results[1], 16, " ");
+			print_hex(results[2], 16, " ");
+			print_hex(results[3], 16, " ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_11(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();	/* defined outside this section; presumably sets MSR[FP] so FP instructions do not trap */
+	return trapit(0, test11);	/* trapit (head.S) calls test11(0) and passes its return value back */
+}
+
+int test12(long arg)	/* fmrgew/fmrgow check on one fixed input pair; 0 = pass, 1 = fail; arg is unused */
+{
+	unsigned long src[2] = { 0xf0f0f0f05a5a5a5aul, 0x0123456789abcdeful };
+	unsigned long merged[2];	/* [0] = fmrgew result, [1] = fmrgow result */
+
+	/* f5 = src[0], f6 = src[1]; merge their even words into f7 and their odd words into f8 */
+	asm("lfd 5,0(%0); lfd 6,8(%0); fmrgew 7,5,6; fmrgow 8,5,6; stfd 7,0(%1); stfd 8,8(%1)"
+	    : : "b" (src), "b" (merged) : "memory");
+	/* expected: upper halves of both inputs, then lower halves of both inputs */
+	if (merged[0] == 0xf0f0f0f001234567ul && merged[1] == 0x5a5a5a5a89abcdeful)
+		return 0;
+	return 1;
+}
+
+int fpu_test_12(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();
+	return trapit(0, test12);	/* trapit (head.S) calls test12(0) and passes its return value back */
+}
+
+struct addvals {	/* add/subtract vectors; the layout is shared by addvals[] (test13) and sp_addvals[] (test14) */
+	unsigned long val_a;	/* first operand, as a double-precision bit pattern */
+	unsigned long val_b;	/* second operand */
+	unsigned long sum;	/* expected val_a + val_b */
+	unsigned long diff;	/* expected val_a - val_b */
+} addvals[] = {
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x3fdfffffffffffff, 0x0000000000000000, 0x3fdfffffffffffff, 0x3fdfffffffffffff },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000, 0x0000000000000000 },
+	{ 0xbff0000000000000, 0xbff0000000000000, 0xc000000000000000, 0x0000000000000000 },
+	{ 0x402123456789abcd, 0x4021000000000000, 0x403111a2b3c4d5e6, 0x3fb1a2b3c4d5e680 },
+	{ 0x4061200000000000, 0x406123456789abcd, 0x407121a2b3c4d5e6, 0xbfba2b3c4d5e6800 },
+	{ 0x4061230000000000, 0x3fa4560000000000, 0x4061244560000000, 0x406121baa0000000 },
+	{ 0xc061230000000000, 0x3fa4560000000000, 0xc06121baa0000000, 0xc061244560000000 },
+	{ 0x4061230000000000, 0xbfa4560000000000, 0x406121baa0000000, 0x4061244560000000 },
+	{ 0xc061230000000000, 0xbfa4560000000000, 0xc061244560000000, 0xc06121baa0000000 },
+	{ 0x3fa1230000000000, 0x4064560000000000, 0x4064571230000000, 0xc06454edd0000000 },
+	{ 0xbfa1230000000000, 0x4064560000000000, 0x406454edd0000000, 0xc064571230000000 },
+	{ 0x3fa1230000000000, 0xc064560000000000, 0xc06454edd0000000, 0x4064571230000000 },
+	{ 0xbfa1230000000000, 0xc064560000000000, 0xc064571230000000, 0x406454edd0000000 },
+	{ 0x6780000000000001, 0x6470000000000000, 0x6780000000000009, 0x677ffffffffffff2 },
+	{ 0x6780000000000001, 0x6460000000000000, 0x6780000000000005, 0x677ffffffffffffa },
+	{ 0x6780000000000001, 0x6450000000000000, 0x6780000000000003, 0x677ffffffffffffe },
+	{ 0x6780000000000001, 0x6440000000000000, 0x6780000000000002, 0x6780000000000000 },
+	{ 0x7ff8888888888888, 0x7ff9999999999999, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0xfff8888888888888, 0x7ff9999999999999, 0xfff8888888888888, 0xfff8888888888888 },
+	{ 0x7ff8888888888888, 0x7ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x0000000000000000, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x0001111111111111, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff8888888888888, 0x3ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888 },
+	{ 0x7ff0000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999 },
+	{ 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x8002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0xc002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x0000000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999 },
+	{ 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 },
+	{ 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8002222222222222, 0x0001111111111111, 0x8001111111111111, 0x8003333333333333 },
+	{ 0x0000022222222222, 0x0000111111111111, 0x0000133333333333, 0x80000eeeeeeeeeef },
+	{ 0x401ffffffbfffefe, 0x406b8265196bd89e, 0x406c8265194bd896, 0xc06a8265198bd8a6 },
+	{ 0x4030020000000004, 0xbf110001ffffffff, 0x403001fbbfff8004, 0x4030020440008004 },
+	{ 0x3fdfffffffffffff, 0x3fe0000000000000, 0x3ff0000000000000, 0xbc90000000000000 },
+};
+
+int test13(long arg)	/* fadd/fsub check; 0 = pass, else 1 + index of the failing addvals[] entry; arg is unused */
+{
+	unsigned long out[2];	/* [0] = fadd result bits, [1] = fsub result bits */
+	long idx;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (idx = 0; idx < sizeof(addvals) / sizeof(addvals[0]); ++idx) {
+		struct addvals *cur = &addvals[idx];
+		asm("lfd 5,0(%0); lfd 6,8(%0); fadd 7,5,6; fsub 8,5,6; stfd 7,0(%1); stfd 8,8(%1)"
+		    : : "b" (&cur->val_a), "b" (out) : "memory");
+		if (out[0] == cur->sum && out[1] == cur->diff)
+			continue;
+		print_hex(idx, 2, " ");	/* mismatch: entry index, then observed sum and difference */
+		print_hex(out[0], 16, " ");
+		print_hex(out[1], 16, "\r\n");
+		return idx + 1;
+	}
+	return 0;
+}
+
+int fpu_test_13(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();
+	return trapit(0, test13);	/* trapit (head.S) calls test13(0) and passes its return value back */
+}
+
+struct addvals sp_addvals[] = {	/* test14 vectors: operands are rounded with frsp first, then combined with fadds/fsubs */
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x3fdfffffffffffff, 0x0000000000000000, 0x3fe0000000000000, 0x3fe0000000000000 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000, 0x0000000000000000 },
+	{ 0xbff0000000000000, 0xbff0000000000000, 0xc000000000000000, 0x0000000000000000 },
+	{ 0x402123456789abcd, 0x4021000000000000, 0x403111a2c0000000, 0x3fb1a2b000000000 },
+	{ 0x4061200000000000, 0x406123456789abcd, 0x407121a2c0000000, 0xbfba2b0000000000 },
+	{ 0x4061230000000000, 0x3fa4560000000000, 0x4061244560000000, 0x406121baa0000000 },
+	{ 0xc061230000000000, 0x3fa4560000000000, 0xc06121baa0000000, 0xc061244560000000 },
+	{ 0x4061230000000000, 0xbfa4560000000000, 0x406121baa0000000, 0x4061244560000000 },
+	{ 0xc061230000000000, 0xbfa4560000000000, 0xc061244560000000, 0xc06121baa0000000 },
+	{ 0x3fa1230000000000, 0x4064560000000000, 0x4064571240000000, 0xc06454edc0000000 },
+	{ 0xbfa1230000000000, 0x4064560000000000, 0x406454edc0000000, 0xc064571240000000 },
+	{ 0x3fa1230000000000, 0xc064560000000000, 0xc06454edc0000000, 0x4064571240000000 },
+	{ 0xbfa1230000000000, 0xc064560000000000, 0xc064571240000000, 0x406454edc0000000 },
+	{ 0x6780000000000001, 0x6470000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x6780000000000001, 0x6460000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x6780000000000001, 0x6450000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x6780000000000001, 0x6440000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x7ff8888888888888, 0x7ff9999999999999, 0x7ff8888880000000, 0x7ff8888880000000 },
+	{ 0xfff8888888888888, 0x7ff9999999999999, 0xfff8888880000000, 0xfff8888880000000 },
+	{ 0x7ff8888888888888, 0x7ff0000000000000, 0x7ff8888880000000, 0x7ff8888880000000 },
+	{ 0x7ff8888888888888, 0x0000000000000000, 0x7ff8888880000000, 0x7ff8888880000000 },
+	{ 0x7ff8888888888888, 0x0001111111111111, 0x7ff8888880000000, 0x7ff8888880000000 },
+	{ 0x7ff8888888888888, 0x3ff0000000000000, 0x7ff8888880000000, 0x7ff8888880000000 },
+	{ 0x7ff0000000000000, 0x7ff9999999999999, 0x7ff9999980000000, 0x7ff9999980000000 },
+	{ 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000 },
+	{ 0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x8002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0xc002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x0000000000000000, 0x7ff9999999999999, 0x7ff9999980000000, 0x7ff9999980000000 },
+	{ 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 },
+	{ 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x8002222222222222, 0x0001111111111111, 0x0000000000000000, 0x8000000000000000 },
+	{ 0x0000022222222222, 0x0000111111111111, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x47dc000020000000, 0x47ec03ffe0000000, 0x7ff0000000000000, 0xc7dc07ffa0000000 },
+	{ 0x47dbffffe0000000, 0x47eff7ffe0000000, 0x7ff0000000000000, 0xc7e1f80000000000 },
+	{ 0x47efffffc0000000, 0xc7efffffc0000000, 0x0000000000000000, 0x7ff0000000000000 },
+};
+
+int test14(long arg)	/* fadds/fsubs check; 0 = pass, else 1 + index of the failing sp_addvals[] entry; arg is unused */
+{
+	struct addvals *cur = sp_addvals;	/* entry under test */
+	unsigned long out[2];	/* [0] = fadds result bits, [1] = fsubs result bits */
+	long idx;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (idx = 0; idx < sizeof(sp_addvals) / sizeof(sp_addvals[0]); ++cur, ++idx) {
+		asm("lfd 5,0(%0); frsp 5,5; lfd 6,8(%0); frsp 6,6; "
+		    "fadds 7,5,6; fsubs 8,5,6; stfd 7,0(%1); stfd 8,8(%1)"
+		    : : "b" (&cur->val_a), "b" (out) : "memory");
+		if (out[0] == cur->sum && out[1] == cur->diff)
+			continue;
+		print_hex(idx, 2, " ");	/* mismatch: entry index, then observed sum and difference */
+		print_hex(out[0], 16, " ");
+		print_hex(out[1], 16, "\r\n");
+		return idx + 1;
+	}
+	return 0;
+}
+
+int fpu_test_14(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();
+	return trapit(0, test14);	/* trapit (head.S) calls test14(0) and passes its return value back */
+}
+
+struct mulvals {	/* test15 vectors: double-precision multiply */
+	unsigned long val_a;	/* first factor, as a double-precision bit pattern */
+	unsigned long val_b;	/* second factor */
+	unsigned long prod;	/* expected fmul result */
+} mulvals[] = {
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },
+	{ 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 },
+	{ 0xbf4fff801fffffff, 0x6d7fffff8000007f, 0xecdfff7fa001fffe },
+	{ 0x3fbd50275a65ed80, 0x0010000000000000, 0x0001d50275a65ed8 },
+	{ 0x3fe95d8937acf1ce, 0x0000000000000001, 0x0000000000000001 },
+};
+
+int test15(long arg)	/* fmul check; 0 = pass, else 1 + index of the failing mulvals[] entry; arg is unused */
+{
+	struct mulvals *cur;	/* entry under test */
+	unsigned long got;	/* raw bits of the product stored by stfd */
+	long idx = 0;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (cur = mulvals; idx < sizeof(mulvals) / sizeof(mulvals[0]); ++cur, ++idx) {
+		asm("lfd 5,0(%0); lfd 6,8(%0); fmul 7,5,6; stfd 7,0(%1)"
+		    : : "b" (&cur->val_a), "b" (&got) : "memory");
+		if (got == cur->prod)
+			continue;
+		print_hex(idx, 2, " ");	/* mismatch: entry index and observed product */
+		print_hex(got, 16, " ");
+		return idx + 1;
+	}
+	return 0;
+}
+
+int fpu_test_15(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();
+	return trapit(0, test15);	/* trapit (head.S) calls test15(0) and passes its return value back */
+}
+
+struct mulvals_sp {	/* test16 vectors: single-precision multiply, 32-bit patterns accessed with lfs/stfs */
+	unsigned int val_a;	/* first factor, as a single-precision bit pattern */
+	unsigned int val_b;	/* second factor */
+	unsigned int prod;	/* expected fmuls result */
+} mulvals_sp[] = {
+	{ 0x00000000, 0x00000000, 0x00000000 },
+	{ 0x80000000, 0x80000000, 0x00000000 },
+	{ 0x3f800000, 0x3f800000, 0x3f800000 },
+	{ 0xbf800000, 0x3f800000, 0xbf800000 },
+	{ 0xbe7ff801, 0x6d7fffff, 0xec7ff800 },
+	{ 0xc100003d, 0xfe803ff8, 0x7f800000 },
+	{ 0x4f780080, 0x389003ff, 0x488b8427 },
+};
+
+int test16(long arg)	/* fmuls check; 0 = pass, else 1 + index of the failing mulvals_sp[] entry; arg is unused */
+{
+	struct mulvals_sp *cur = mulvals_sp;	/* entry under test */
+	unsigned int got;	/* raw single-precision bits stored by stfs */
+	long idx;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (idx = 0; idx < sizeof(mulvals_sp) / sizeof(mulvals_sp[0]); ++cur, ++idx) {
+		asm("lfs 5,0(%0); lfs 6,4(%0); fmuls 7,5,6; stfs 7,0(%1)"
+		    : : "b" (&cur->val_a), "b" (&got) : "memory");
+		if (got == cur->prod)
+			continue;
+		print_hex(idx, 2, " ");	/* mismatch: entry index and observed product */
+		print_hex(got, 8, " ");
+		return idx + 1;
+	}
+	return 0;
+}
+
+int fpu_test_16(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();
+	return trapit(0, test16);	/* trapit (head.S) calls test16(0) and passes its return value back */
+}
+
+struct divvals {	/* test17 vectors: double-precision divide */
+	unsigned long val_a;	/* dividend, as a double-precision bit pattern */
+	unsigned long val_b;	/* divisor */
+	unsigned long quot;	/* expected fdiv result val_a / val_b (was misnamed "prod") */
+} divvals[] = {
+	{ 0x3ff0000000000000, 0x0000000000000000, 0x7ff0000000000000 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },
+	{ 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 },
+	{ 0x4000000000000000, 0x4008000000000000, 0x3fe5555555555555 },
+	{ 0xc01fff0007ffffff, 0xc03ffffffdffffbf, 0x3fcfff0009fff041 },
+};
+
+int test17(long arg)	/* fdiv check; 0 = pass, else 1 + index of the failing divvals[] entry; arg is unused */
+{
+	long i;
+	unsigned long result;	/* raw bits of the quotient stored by stfd */
+	struct divvals *vp = divvals;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(divvals) / sizeof(divvals[0]); ++i, ++vp) {
+		asm("lfd 5,0(%0); lfd 6,8(%0); fdiv 7,5,6; stfd 7,0(%1)"	/* f7 = val_a / val_b */
+		    : : "b" (&vp->val_a), "b" (&result) : "memory");
+		if (result != vp->quot) {
+			print_hex(i, 2, " ");	/* mismatch: entry index and observed quotient */
+			print_hex(result, 16, " ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_17(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();	/* defined outside this section; presumably sets MSR[FP] so FP instructions do not trap */
+	return trapit(0, test17);	/* trapit (head.S) calls test17(0) and passes its return value back */
+}
+
+struct recipvals {	/* test18 vectors: reciprocal estimate */
+	unsigned long val;	/* input, as a double-precision bit pattern */
+	unsigned long inv;	/* expected fre result, compared bit for bit */
+} recipvals[] = {	/* NOTE(review): entries such as 1.0 -> 0x3feff004... are not exact reciprocals, so they look specific to this FPU's estimate -- confirm */
+	{ 0x0000000000000000, 0x7ff0000000000000 },
+	{ 0xfff0000000000000, 0x8000000000000000 },
+	{ 0x3ff0000000000000, 0x3feff00400000000 },
+	{ 0xbff0000000000000, 0xbfeff00400000000 },
+	{ 0x4008000000000000, 0x3fd54e3800000000 },
+	{ 0xc03ffffffdffffbf, 0xbfa0040000000000 },
+};
+
+int test18(long arg)	/* fre check; 0 = pass, else 1 + index of the failing recipvals[] entry; arg is unused */
+{
+	struct recipvals *cur;	/* entry under test */
+	unsigned long got;	/* raw bits of the estimate stored by stfd */
+	long idx = 0;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (cur = recipvals; idx < sizeof(recipvals) / sizeof(recipvals[0]); ++cur, ++idx) {
+		asm("lfd 6,0(%0); fre 7,6; stfd 7,0(%1)"
+		    : : "b" (&cur->val), "b" (&got) : "memory");
+		if (got == cur->inv)
+			continue;
+		print_hex(idx, 2, " ");	/* mismatch: entry index and observed estimate */
+		print_hex(got, 16, " ");
+		return idx + 1;
+	}
+	return 0;
+}
+
+int fpu_test_18(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();
+	return trapit(0, test18);	/* trapit (head.S) calls test18(0) and passes its return value back */
+}
+
+#define RES_B	0x7ffaaaaaaaaaaaaa	/* marker pattern loaded as the FRB operand of fsel in test19 */
+#define RES_C	0x000bbbbbbbbbbbbb	/* marker pattern loaded as the FRC operand of fsel in test19 */
+
+struct selvals {	/* test19 vectors: which marker fsel picks for a given selector value */
+	unsigned long val;	/* selector (FRA operand), as a double-precision bit pattern */
+	unsigned long result;	/* expected output: RES_C or RES_B */
+} selvals[] = {
+	{ 0x0000000000000000, RES_C },
+	{ 0x8000000000000000, RES_C },
+	{ 0x3ff0000000000000, RES_C },
+	{ 0xbff0000000000000, RES_B },
+	{ 0x7ff0000000000000, RES_C },
+	{ 0xfff0000000000000, RES_B },
+	{ 0x7ff8000000000000, RES_B },
+	{ 0xfff8000000000000, RES_B },
+	{ 0x0000000000000001, RES_C },
+	{ 0x8000000000000001, RES_B },
+	{ 0xffffffffffffffff, RES_B },
+};
+
+int test19(long arg)	/* fsel check; 0 = pass, else 1 + index of the failing selvals[] entry; arg is unused */
+{
+	struct selvals *cur = selvals;	/* entry under test */
+	unsigned long pick_b = RES_B;	/* loaded into f10, the FRB operand */
+	unsigned long pick_c = RES_C;	/* loaded into f22, the FRC operand */
+	unsigned long chosen;	/* raw bits of the fsel result */
+	long idx;
+
+	for (idx = 0; idx < sizeof(selvals) / sizeof(selvals[0]); ++cur, ++idx) {
+		asm("lfd 6,0(%0); lfd 10,0(%1); lfd 22,0(%2); fsel 0,6,22,10; stfd 0,0(%3)"
+		    : : "b" (&cur->val), "b" (&pick_b), "b" (&pick_c), "b" (&chosen) : "memory");
+		if (chosen == cur->result)
+			continue;
+		print_hex(idx, 2, " ");	/* mismatch: entry index and observed selection */
+		print_hex(chosen, 16, " ");
+		return idx + 1;
+	}
+	return 0;
+}
+
+int fpu_test_19(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();
+	return trapit(0, test19);	/* trapit (head.S) calls test19(0) and passes its return value back */
+}
+
+#define LT	8	/* bit values within the 4-bit CR field that test20 extracts from mfcr */
+#define GT	4
+#define EQ	2
+#define UN	1	/* unordered: set when an operand is a NaN */
+
+struct cmpvals {	/* test20 vectors: unordered floating-point compare */
+	unsigned long vala, valb;	/* operands of fcmpu, as double-precision bit patterns */
+	unsigned long result;	/* expected CR field value: one of LT, GT, EQ, UN */
+} cmpvals[] = {
+	{ 0x0000000000000000, 0x0000000000000000, EQ },
+	{ 0x8000000000000000, 0x0000000000000000, EQ },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, EQ },
+	{ 0x3ff0000000000001, 0x3ff0000000000000, GT },
+	{ 0x3ff0000000000000, 0x3ff0000000000001, LT },
+	{ 0xbff0000000000000, 0x3ff0000000000000, LT },
+	{ 0x7ff0000000000000, 0x7ff0000000000000, EQ },
+	{ 0xfff0000000000000, 0x7ff0000000000000, LT },
+	{ 0x7ff8000000000000, 0x7ff0000000000000, UN },
+	{ 0xfff8000000000000, 0x7ff0000000000000, UN },
+	{ 0x0000000000000001, 0x0000000000000001, EQ },
+	{ 0x8000000000000001, 0x7ff0000000000000, LT },
+	{ 0xffffffffffffffff, 0x7ff0000000000000, UN },
+	{ 0xffffffffffffffff, 0xffffffffffffffff, UN },
+};
+
+int test20(long arg)	/* fcmpu check; 0 = pass, else 1 + index of the failing cmpvals[] entry; arg is unused */
+{
+	long i;
+	unsigned long cr;	/* mfcr image; its low 4 bits are CR field 7 */
+	struct cmpvals *vp = cmpvals;
+
+	for (i = 0; i < sizeof(cmpvals) / sizeof(cmpvals[0]); ++i, ++vp) {
+		asm volatile("lfd 6,0(%1); lfd 10,8(%1); fcmpu 7,6,10; mfcr %0"	/* volatile: the compare also updates the FPSCR, which the operand list does not express */
+		    : "=r" (cr) : "b" (&vp->vala) : "memory", "cr7");	/* fcmpu overwrites CR field 7, so tell the compiler */
+		cr &= 0xf;	/* keep only CR field 7 */
+		if (cr != vp->result) {
+			print_hex(i, 2, " ");	/* mismatch: entry index and observed CR field */
+			print_hex(cr, 1, " ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_20(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();	/* defined outside this section; presumably sets MSR[FP] so FP instructions do not trap */
+	return trapit(0, test20);	/* trapit (head.S) calls test20(0) and passes its return value back */
+}
+
+struct isqrtvals {	/* test21 vectors: reciprocal square root estimate */
+	unsigned long val;	/* input, as a double-precision bit pattern */
+	unsigned long inv;	/* expected frsqrte result, compared bit for bit */
+} isqrtvals[] = {	/* NOTE(review): entries such as 1.0 -> 0x3feff800... are not exact, so they look specific to this FPU's estimate -- confirm */
+	{ 0x0000000000000000, 0x7ff0000000000000 },
+	{ 0x8000000000000000, 0xfff0000000000000 },
+	{ 0xfff0000000000000, 0x7ff8000000000000 },
+	{ 0x7ff0000000000000, 0x0000000000000000 },
+	{ 0xfff123456789abcd, 0xfff923456789abcd },
+	{ 0x3ff0000000000000, 0x3feff80000000000 },
+	{ 0x4000000000000000, 0x3fe69dc800000000 },
+	{ 0x4010000000000000, 0x3fdff80000000000 },
+	{ 0xbff0000000000000, 0x7ff8000000000000 },
+	{ 0x4008000000000000, 0x3fe2781800000000 },
+	{ 0x7fd0000000000000, 0x1ffff80000000000 },
+	{ 0x0008000000000000, 0x5fe69dc800000000 },
+	{ 0x0004000000000000, 0x5feff80000000000 },
+	{ 0x0002000000000000, 0x5ff69dc800000000 },
+	{ 0x0000000000000002, 0x61769dc800000000 },
+	{ 0x0000000000000001, 0x617ff80000000000 },
+};
+
+int test21(long arg)	/* frsqrte check; 0 = pass, else 1 + index of the failing isqrtvals[] entry; arg is unused */
+{
+	struct isqrtvals *cur = isqrtvals;	/* entry under test */
+	unsigned long got;	/* raw bits of the estimate stored by stfd */
+	long idx;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (idx = 0; idx < sizeof(isqrtvals) / sizeof(isqrtvals[0]); ++cur, ++idx) {
+		asm("lfd 6,0(%0); frsqrte 7,6; stfd 7,0(%1)"
+		    : : "b" (&cur->val), "b" (&got) : "memory");
+		if (got == cur->inv)
+			continue;
+		print_hex(idx, 2, " ");	/* mismatch: entry index and observed estimate */
+		print_hex(got, 16, " ");
+		return idx + 1;
+	}
+	return 0;
+}
+
+int fpu_test_21(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();
+	return trapit(0, test21);	/* trapit (head.S) calls test21(0) and passes its return value back */
+}
+
+struct sqrtvals {	/* test22 vectors: double-precision square root */
+	unsigned long val;	/* input, as a double-precision bit pattern */
+	unsigned long root;	/* expected fsqrt result (was misnamed "inv") */
+} sqrtvals[] = {
+	{ 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000 },
+	{ 0xfff0000000000000, 0x7ff8000000000000 },
+	{ 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0xfff123456789abcd, 0xfff923456789abcd },
+	{ 0x3ff0000000000000, 0x3ff0000000000000 },
+	{ 0x4000000000000000, 0x3ff6a09e667f3bcd },
+	{ 0x4010000000000000, 0x4000000000000000 },
+	{ 0xbff0000000000000, 0x7ff8000000000000 },
+	{ 0x4008000000000000, 0x3ffbb67ae8584caa },
+	{ 0x7fd0000000000000, 0x5fe0000000000000 },
+	{ 0x0008000000000000, 0x1ff6a09e667f3bcd },
+	{ 0x0004000000000000, 0x1ff0000000000000 },
+	{ 0x0002000000000000, 0x1fe6a09e667f3bcd },
+	{ 0x0000000000000002, 0x1e66a09e667f3bcd },
+	{ 0x0000000000000001, 0x1e60000000000000 },
+};
+
+int test22(long arg)	/* fsqrt check; 0 = pass, else 1 + index of the failing sqrtvals[] entry; arg is unused */
+{
+	long i;
+	unsigned long result;	/* raw bits of the square root stored by stfd */
+	struct sqrtvals *vp = sqrtvals;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(sqrtvals) / sizeof(sqrtvals[0]); ++i, ++vp) {
+		asm("lfd 6,0(%0); fsqrt 7,6; stfd 7,0(%1)"	/* f7 = sqrt(val) */
+		    : : "b" (&vp->val), "b" (&result) : "memory");
+		if (result != vp->root) {
+			print_hex(i, 2, " ");	/* mismatch: entry index and observed root */
+			print_hex(result, 16, " ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_22(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();	/* defined outside this section; presumably sets MSR[FP] so FP instructions do not trap */
+	return trapit(0, test22);	/* trapit (head.S) calls test22(0) and passes its return value back */
+}
+
+struct fmavals {	/* test23 vectors: fused multiply-add family; field order matches the lfd offsets 0, 8, 16 used by test23 */
+	unsigned long ra;	/* first multiplicand (loaded into f6) */
+	unsigned long rc;	/* second multiplicand (loaded into f7) */
+	unsigned long rb;	/* addend (loaded into f8) */
+	unsigned long fma;	/* expected fmadd result */
+	unsigned long fms;	/* expected fmsub result */
+	unsigned long nfma;	/* expected fnmadd result */
+	unsigned long nfms;	/* expected fnmsub result */
+} fmavals[] = {
+	{ 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+	  0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 },
+	{ 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000,
+	  0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000 },
+	{ 0x0000000000000000, 0x7ffc000000000000, 0x7ffb000000000000,
+	  0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000 },
+	{ 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000,
+	  0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000 },
+	{ 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, 
+	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
+	{ 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, 
+	  0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd },
+	{ 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, 
+	  0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, 
+	  0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000 },
+	{ 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, 
+	  0x4000000010000000, 0xbe80000000000000, 0xc000000010000000, 0x3e80000000000000 },
+	{ 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000,
+	  0x4000000000000001, 0x3cc0000000000000, 0xc000000000000001, 0xbcc0000000000000 },
+	{ 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000,
+	  0x4000000000000002, 0x3cd4000000000002, 0xc000000000000002, 0xbcd4000000000002 },
+	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000,
+	  0xaca765753908cd20, 0x3030000000000000, 0x2ca765753908cd20, 0xb030000000000000 },
+	{ 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000,
+	  0x2cd3b3efbf5e2229, 0x3030000000000000, 0xacd3b3efbf5e2229, 0xb030000000000000 },
+	{ 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000,
+	  0xb05e0068a0000000, 0x3061003450000000, 0x305e0068a0000000, 0xb061003450000000 },
+};
+
+int test23(long arg)	/* fmadd/fmsub/fnmadd/fnmsub check; 0 = pass, else 1 + index of the failing fmavals[] entry; arg is unused */
+{
+	long i;
+	unsigned long results[4];	/* raw result bits, in order: fmadd, fmsub, fnmadd, fnmsub */
+	struct fmavals *vp = fmavals;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(fmavals) / sizeof(fmavals[0]); ++i, ++vp) {
+		asm("lfd 6,0(%0); lfd 7,8(%0); lfd 8,16(%0); fmadd 0,6,7,8; stfd 0,0(%1)"	/* f6 = ra, f7 = rc, f8 = rb; f0 = f6 * f7 + f8 */
+		    : : "b" (&vp->ra), "b" (results) : "memory");
+		asm("fmsub 1,6,7,8; fnmadd 2,6,7,8; fnmsub 3,6,7,8; stfd 1,8(%0); stfd 2,16(%0); stfd 3,24(%0)"	/* relies on f6, f7, f8 still holding the operands from the asm above */
+		    : : "b" (results) : "memory");
+		if (results[0] != vp->fma || results[1] != vp->fms ||
+		    results[2] != vp->nfma || results[3] != vp->nfms) {
+			print_hex(i, 2, " ");	/* mismatch: entry index, then the four observed values */
+			print_hex(results[0], 16, " ");
+			print_hex(results[1], 16, " ");
+			print_hex(results[2], 16, " ");
+			print_hex(results[3], 16, "\r\n");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_23(void)	/* entry point passed to do_test() from main() */
+{
+	enable_fp();	/* defined outside this section; presumably sets MSR[FP] so FP instructions do not trap */
+	return trapit(0, test23);	/* trapit (head.S) calls test23(0) and passes its return value back */
+}
+
+int fail = 0;	/* becomes 1 once any test fails; returned by main() */
+
+void do_test(int num, int (*test)(void))	/* run one test and print its number followed by PASS or a FAIL report */
+{
+	int status;	/* 0 = pass, anything else is the failure code from the test */
+
+	print_test_number(num);
+	status = test();
+	if (!status) {
+		print_string("PASS\r\n");
+		return;
+	}
+	fail = 1;
+	print_string("FAIL ");
+	print_hex(status, 5, " SRR0=");
+	print_hex(mfspr(SRR0), 16, " SRR1=");
+	print_hex(mfspr(SRR1), 16, " FPSCR=");
+	enable_fp();	/* called before get_fpscr(); presumably needed so that reading the FPSCR does not trap */
+	print_hex(get_fpscr(), 8, "\r\n");
+}
+
+int main(void)
+{
+	/*
+	 * Test entry points in the order they run.  Table slot k is
+	 * reported on the console as test number k + 1.
+	 */
+	static int (*fpu_tests[])(void) = {
+		fpu_test_1, fpu_test_2,
+		fpu_test_3, fpu_test_4,
+		fpu_test_5, fpu_test_6,
+		fpu_test_7, fpu_test_8,
+		fpu_test_9, fpu_test_10,
+		fpu_test_11, fpu_test_12,
+		fpu_test_13, fpu_test_14,
+		fpu_test_15, fpu_test_16,
+		fpu_test_17, fpu_test_18,
+		fpu_test_19, fpu_test_20,
+		fpu_test_21, fpu_test_22,
+		fpu_test_23,
+	};
+	int k;
+
+	console_init();
+
+	for (k = 0; k < (int)(sizeof(fpu_tests) / sizeof(fpu_tests[0])); ++k)
+		do_test(k + 1, fpu_tests[k]);
+
+	/* Nonzero when at least one test failed. */
+	return fail;
+}
diff --git a/tests/fpu/head.S b/tests/fpu/head.S
new file mode 100644
index 0000000..938fca0
--- /dev/null
+++ b/tests/fpu/head.S
@@ -0,0 +1,132 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Load the 64-bit immediate e into register r: build the upper 32 bits, shift them up by 32, then OR in the lower 32 bits */
+#define LOAD_IMM64(r, e)			\
+	lis     r,(e)@highest;			\
+	ori     r,r,(e)@higher;			\
+	rldicr  r,r, 32, 31;			\
+	oris    r,r, (e)@h;			\
+	ori     r,r, (e)@l;
+
+	.section ".head","ax"
+
+	/*
+	 * Microwatt currently enters in LE mode at 0x0, so we don't need to
+	 * do any endian fix ups
+	 */
+	. = 0
+.global _start	/* entry point: zero .bss, set up the stack, call main(), then stop at die */
+_start:
+	LOAD_IMM64(%r10,__bss_start)	/* r10 = cursor, starting at the first byte of .bss */
+	LOAD_IMM64(%r11,__bss_end)
+	subf	%r11,%r10,%r11	/* r11 = __bss_end - __bss_start, the length in bytes */
+	addi	%r11,%r11,63	/* round up ... */
+	srdi.	%r11,%r11,6	/* ... to a count of 64-byte blocks; the dot form sets CR0 for beq */
+	beq	2f	/* nothing to clear when the count is zero */
+	mtctr	%r11	/* CTR = number of blocks to clear */
+1:	dcbz	0,%r10	/* zero the block at r10 (the loop assumes dcbz clears 64 bytes; confirm) */
+	addi	%r10,%r10,64
+	bdnz	1b
+
+2:	LOAD_IMM64(%r1,__stack_top)	/* r1 = top of the stack area reserved by the linker script */
+	li	%r0,0
+	stdu	%r0,-16(%r1)	/* push a 16-byte frame whose first doubleword is 0 */
+	LOAD_IMM64(%r10, die)
+	mtsprg0	%r10	/* the EXCEPTION vectors jump to the address in SPRG0: default is die */
+	LOAD_IMM64(%r12, main)	/* r12 holds main's address (presumably for the ELFv2 global entry; confirm) */
+	mtctr	%r12
+	bctrl	/* call main(); it returns to die below */
+die:	attn // terminate on exit
+	b .	/* spin here in case execution continues past attn */
+
+.global trapit	/* trapit(arg, fn): call fn with arg still in r3; if an exception vector fires, return its number instead */
+trapit:
+	mflr	%r0
+	std	%r0,16(%r1)	/* save the return address at 16(r1) in the caller's frame */
+	stdu	%r1,-256(%r1)	/* allocate a 256-byte frame, storing the old r1 at its base */
+	mtsprg1	%r1	/* SPRG1 = frame address, so ret can recover r1 after an exception */
+	r = 14	/* save r14..r31 at frame offsets 14*8 .. 31*8 */
+	.rept	18
+	std	r,r*8(%r1)
+	r = r + 1
+	.endr
+	mfcr	%r0
+	stw	%r0,13*8(%r1)	/* CR goes in the slot just below r14's */
+	LOAD_IMM64(%r10, ret)
+	mtsprg0	%r10	/* the EXCEPTION vectors jump to the address in SPRG0: aim them at ret */
+	mr	%r12,%r4	/* r12 = fn address too (presumably for the ELFv2 global entry; confirm) */
+	mtctr	%r4
+	bctrl	/* call fn; a normal return lands on ret */
+ret:	/* reached when fn returns, or from a vector with r3 = vector number */
+	mfsprg1	%r1	/* recover the frame address saved on entry */
+	LOAD_IMM64(%r10, die)
+	mtsprg0	%r10	/* from here on, exceptions go to die again */
+	r = 14	/* restore r14..r31 from the same slots */
+	.rept	18
+	ld	r,r*8(%r1)
+	r = r + 1
+	.endr
+	lwz	%r0,13*8(%r1)
+	mtcr	%r0	/* restore CR */
+	ld	%r0,256+16(%r1)	/* reload the return address saved on entry */
+	addi	%r1,%r1,256	/* pop the frame */
+	mtlr	%r0
+	blr	/* return with r3 left as fn or the vector set it */
+
+	.global do_rfid	/* do_rfid(msr): rfid to do_blr with SRR1 taken from the first argument (r3) */
+do_rfid:
+	mtsrr1	%r3	/* SRR1 = first argument; rfid loads the MSR from SRR1 */
+	LOAD_IMM64(%r4, do_blr)
+	mtsrr0	%r4	/* SRR0 = do_blr, the address rfid resumes at */
+	rfid
+	blr	/* NOTE(review): looks unreachable, since rfid transfers control to do_blr; confirm */
+
+	.global do_blr	/* landing point for the rfid in do_rfid */
+do_blr:
+	blr	/* do_rfid never changes LR, so this returns to do_rfid's caller */
+
+#define EXCEPTION(nr)		/* stub placed at offset nr: sets r3 = nr and jumps to the address in SPRG0; overwrites r0 and CTR */	\
+	.= nr			;\
+	mfsprg0	%r0		;\
+	mtctr	%r0		;\
+	li	%r3,nr		;\
+	bctr
+
+	EXCEPTION(0x300)
+	EXCEPTION(0x380)
+	EXCEPTION(0x400)
+	EXCEPTION(0x480)
+	EXCEPTION(0x500)
+	EXCEPTION(0x600)
+	EXCEPTION(0x700)
+	EXCEPTION(0x800)
+	EXCEPTION(0x900)
+	EXCEPTION(0x980)
+	EXCEPTION(0xa00)
+	EXCEPTION(0xb00)
+	EXCEPTION(0xc00)
+	EXCEPTION(0xd00)
+	EXCEPTION(0xe00)
+	EXCEPTION(0xe20)
+	EXCEPTION(0xe40)
+	EXCEPTION(0xe60)
+	EXCEPTION(0xe80)
+	EXCEPTION(0xf00)
+	EXCEPTION(0xf20)
+	EXCEPTION(0xf40)
+	EXCEPTION(0xf60)
+	EXCEPTION(0xf80)
diff --git a/tests/fpu/powerpc.lds b/tests/fpu/powerpc.lds
new file mode 100644
index 0000000..99611ab
--- /dev/null
+++ b/tests/fpu/powerpc.lds
@@ -0,0 +1,27 @@
+SECTIONS	/* layout: .head at 0, then .text/.rodata, .data, .bss, and a 0x4000-byte stack area */
+{
+	. = 0;
+	_start = .;
+	.head : {
+		KEEP(*(.head))	/* entry code and exception stubs; KEEP protects it from --gc-sections */
+	}
+	. = ALIGN(0x1000);
+	.text : { *(.text) *(.text.*) *(.rodata) *(.rodata.*) }
+	. = ALIGN(0x1000);
+	.data : { *(.data) *(.data.*) *(.got) *(.toc) }
+	. = ALIGN(0x80);	/* head.S clears .bss in 64-byte blocks, so both ends are aligned */
+	__bss_start = .;
+	.bss : {
+		*(.dynsbss)
+		*(.sbss)
+		*(.scommon)
+		*(.dynbss)
+		*(.bss)
+		*(.common) *(COMMON)	/* ld files common symbols under COMMON, which .common does not match */
+		*(.bss.*)
+	}
+	. = ALIGN(0x80);
+	__bss_end = .;
+	. = . + 0x4000;	/* stack area; the stack grows down from __stack_top */
+	__stack_top = .;
+}
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
new file mode 100755
index 0000000..50831cb
Binary files /dev/null and b/tests/test_fpu.bin differ
diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
new file mode 100644
index 0000000..ed759a5
--- /dev/null
+++ b/tests/test_fpu.console_out
@@ -0,0 +1,23 @@
+test 01:PASS
+test 02:PASS
+test 03:PASS
+test 04:PASS
+test 05:PASS
+test 06:PASS
+test 07:PASS
+test 08:PASS
+test 09:PASS
+test 10:PASS
+test 11:PASS
+test 12:PASS
+test 13:PASS
+test 14:PASS
+test 15:PASS
+test 16:PASS
+test 17:PASS
+test 18:PASS
+test 19:PASS
+test 20:PASS
+test 21:PASS
+test 22:PASS
+test 23:PASS
diff --git a/tests/update_console_tests b/tests/update_console_tests
index 906b0cc..a5e6ffc 100755
--- a/tests/update_console_tests
+++ b/tests/update_console_tests
@@ -3,7 +3,7 @@
 # Script to update console related tests from source
 #
 
-for i in sc illegal decrementer xics privileged mmu misc modes reservation trace ; do
+for i in sc illegal decrementer xics privileged mmu misc modes reservation trace fpu ; do
     cd $i
     make
     cd -
diff --git a/writeback.vhdl b/writeback.vhdl
index 053a8ba..95de0ec 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -12,6 +12,7 @@ entity writeback is
 
         e_in         : in Execute1ToWritebackType;
         l_in         : in Loadstore1ToWritebackType;
+        fp_in        : in FPUToWritebackType;
 
         w_out        : out WritebackToRegisterFileType;
         c_out        : out WritebackToCrFileType;
@@ -31,15 +32,21 @@ begin
             -- Do consistency checks only on the clock edge
             x(0) := e_in.valid;
             y(0) := l_in.valid;
-            assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
+            w(0) := fp_in.valid;
+            assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) +
+                    to_integer(unsigned(w))) <= 1 severity failure;
 
             x(0) := e_in.write_enable or e_in.exc_write_enable;
             y(0) := l_in.write_enable;
-            assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
+            w(0) := fp_in.write_enable;
+            assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) +
+                    to_integer(unsigned(w))) <= 1 severity failure;
 
             w(0) := e_in.write_cr_enable;
             x(0) := (e_in.write_enable and e_in.rc);
-            assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure;
+            y(0) := fp_in.write_cr_enable;
+            assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) +
+                    to_integer(unsigned(y))) <= 1 severity failure;
         end if;
     end process;
 
@@ -53,7 +60,7 @@ begin
         c_out <= WritebackToCrFileInit;
 
         complete_out <= '0';
-        if e_in.valid = '1' or l_in.valid = '1' then
+        if e_in.valid = '1' or l_in.valid = '1' or fp_in.valid = '1' then
             complete_out <= '1';
         end if;
 
@@ -79,8 +86,20 @@ begin
                 c_out.write_xerc_data <= e_in.xerc;
             end if;
 
+            if fp_in.write_enable = '1' then
+                w_out.write_reg <= fp_in.write_reg;
+                w_out.write_data <= fp_in.write_data;
+                w_out.write_enable <= '1';
+            end if;
+
+            if fp_in.write_cr_enable = '1' then
+                c_out.write_cr_enable <= '1';
+                c_out.write_cr_mask <= fp_in.write_cr_mask;
+                c_out.write_cr_data <= fp_in.write_cr_data;
+            end if;
+
             if l_in.write_enable = '1' then
-                w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
+                w_out.write_reg <= l_in.write_reg;
                 w_out.write_data <= l_in.write_data;
                 w_out.write_enable <= '1';
             end if;