Merge pull request #245 from paulusmack/fpu

Add a simple FPU
6 years ago · e40e752b9a
parent 168d30c07a 73f819301b
commit e40e752b9a
29 changed files with 5009 additions and 141 deletions
--- a/2
+++ b/2
@ -48,7 +48,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
 	cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
 	logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
 	loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
-	core.vhdl
+	core.vhdl fpu.vhdl

 soc_files = $(core_files) wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
 	wishbone_debug_master.vhdl xics.vhdl syscon.vhdl soc.vhdl \
--- a/common.vhdl
+++ b/common.vhdl
@ -13,8 +13,11 @@ package common is
    constant MSR_SF  : integer := (63 - 0);     -- Sixty-Four bit mode
    constant MSR_EE  : integer := (63 - 48);    -- External interrupt Enable
    constant MSR_PR  : integer := (63 - 49);    -- PRoblem state
+    constant MSR_FP  : integer := (63 - 50);    -- Floating Point available
+    constant MSR_FE0 : integer := (63 - 52);    -- Floating Exception mode
    constant MSR_SE  : integer := (63 - 53);    -- Single-step bit of TE field
    constant MSR_BE  : integer := (63 - 54);    -- Branch trace bit of TE field
+    constant MSR_FE1 : integer := (63 - 55);    -- Floating Exception mode
    constant MSR_IR  : integer := (63 - 58);    -- Instruction Relocation
    constant MSR_DR  : integer := (63 - 59);    -- Data Relocation
    constant MSR_RI  : integer := (63 - 62);    -- Recoverable Interrupt
@ -53,8 +56,11 @@ package common is
    -- GPR indices in the register file (GPR only)
    subtype gpr_index_t is std_ulogic_vector(4 downto 0);

-    -- Extended GPR indice (can hold an SPR)
-    subtype gspr_index_t is std_ulogic_vector(5 downto 0);
+    -- Extended GPR index (can hold an SPR or a FPR)
+    subtype gspr_index_t is std_ulogic_vector(6 downto 0);
+
+    -- FPR indices
+    subtype fpr_index_t is std_ulogic_vector(4 downto 0);

    -- Some SPRs are stored in the register file, they use the magic
    -- GPR numbers above 31.
@ -64,6 +70,9 @@ package common is
    -- indicates if this is indeed a fast SPR. If clear, then
    -- the SPR is not stored in the GPR file.
    --
+    -- FPRs are also stored in the register file, using GSPR
+    -- numbers from 64 to 95.
+    --
    function fast_spr_num(spr: spr_num_t) return gspr_index_t;

    -- Indices conversion functions
@ -71,6 +80,7 @@ package common is
    function gpr_to_gspr(i: gpr_index_t) return gspr_index_t;
    function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t;
    function is_fast_spr(s: gspr_index_t) return std_ulogic;
+    function fpr_to_gspr(f: fpr_index_t) return gspr_index_t;

    -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are
    -- in the CR file as a kind of CR extension (with a separate write
@ -84,6 +94,38 @@ package common is
    end record;
    constant xerc_init : xer_common_t := (others => '0');

+    -- FPSCR bit numbers
+    constant FPSCR_FX     : integer := 63 - 32;
+    constant FPSCR_FEX    : integer := 63 - 33;
+    constant FPSCR_VX     : integer := 63 - 34;
+    constant FPSCR_OX     : integer := 63 - 35;
+    constant FPSCR_UX     : integer := 63 - 36;
+    constant FPSCR_ZX     : integer := 63 - 37;
+    constant FPSCR_XX     : integer := 63 - 38;
+    constant FPSCR_VXSNAN : integer := 63 - 39;
+    constant FPSCR_VXISI  : integer := 63 - 40;
+    constant FPSCR_VXIDI  : integer := 63 - 41;
+    constant FPSCR_VXZDZ  : integer := 63 - 42;
+    constant FPSCR_VXIMZ  : integer := 63 - 43;
+    constant FPSCR_VXVC   : integer := 63 - 44;
+    constant FPSCR_FR     : integer := 63 - 45;
+    constant FPSCR_FI     : integer := 63 - 46;
+    constant FPSCR_C      : integer := 63 - 47;
+    constant FPSCR_FL     : integer := 63 - 48;
+    constant FPSCR_FG     : integer := 63 - 49;
+    constant FPSCR_FE     : integer := 63 - 50;
+    constant FPSCR_FU     : integer := 63 - 51;
+    constant FPSCR_VXSOFT : integer := 63 - 53;
+    constant FPSCR_VXSQRT : integer := 63 - 54;
+    constant FPSCR_VXCVI  : integer := 63 - 55;
+    constant FPSCR_VE     : integer := 63 - 56;
+    constant FPSCR_OE     : integer := 63 - 57;
+    constant FPSCR_UE     : integer := 63 - 58;
+    constant FPSCR_ZE     : integer := 63 - 59;
+    constant FPSCR_XE     : integer := 63 - 60;
+    constant FPSCR_NI     : integer := 63 - 61;
+    constant FPSCR_RN     : integer := 63 - 63;
+
    type irq_state_t is (WRITE_SRR0, WRITE_SRR1);

    -- For now, fixed 16 sources, make this either a parametric
@ -226,7 +268,7 @@ package common is
 	read2_enable : std_ulogic;
 	read2_reg : gspr_index_t;
 	read3_enable : std_ulogic;
-	read3_reg : gpr_index_t;
+	read3_reg : gspr_index_t;
    end record;

    type RegisterFileToDecode2Type is record
@ -264,7 +306,7 @@ package common is
 	addr1 : std_ulogic_vector(63 downto 0);
 	addr2 : std_ulogic_vector(63 downto 0);
 	data : std_ulogic_vector(63 downto 0);		-- data to write, unused for read
-	write_reg : gpr_index_t;
+	write_reg : gspr_index_t;
 	length : std_ulogic_vector(3 downto 0);
        ci : std_ulogic;                                -- cache-inhibited load/store
 	byte_reverse : std_ulogic;
@ -277,13 +319,15 @@ package common is
        virt_mode : std_ulogic;                         -- do translation through TLB
        priv_mode : std_ulogic;                         -- privileged mode (MSR[PR] = 0)
        mode_32bit : std_ulogic;                        -- trim addresses to 32 bits
+        is_32bit : std_ulogic;
    end record;
    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
                                                                     sign_extend => '0', update => '0', xerc => xerc_init,
                                                                     reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0',
                                                                     nia => (others => '0'), insn => (others => '0'),
-                                                                     addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), length => (others => '0'),
-                                                                     mode_32bit => '0', others => (others => '0'));
+                                                                     addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
+                                                                     write_reg => (others => '0'), length => (others => '0'),
+                                                                     mode_32bit => '0', is_32bit => '0', others => (others => '0'));

    type Loadstore1ToExecute1Type is record
        busy : std_ulogic;
@ -369,7 +413,7 @@ package common is
    type Loadstore1ToWritebackType is record
 	valid : std_ulogic;
 	write_enable: std_ulogic;
-	write_reg : gpr_index_t;
+	write_reg : gspr_index_t;
 	write_data : std_ulogic_vector(63 downto 0);
 	xerc : xer_common_t;
        rc : std_ulogic;
@ -401,6 +445,43 @@ package common is
                                   write_cr_data => (others => '0'), write_reg => (others => '0'),
                                   exc_write_reg => (others => '0'), exc_write_data => (others => '0'));

+    type Execute1ToFPUType is record
+        valid   : std_ulogic;
+        op      : insn_type_t;
+        nia     : std_ulogic_vector(63 downto 0);
+        insn    : std_ulogic_vector(31 downto 0);
+        single  : std_ulogic;
+        fe_mode : std_ulogic_vector(1 downto 0);
+        fra     : std_ulogic_vector(63 downto 0);
+        frb     : std_ulogic_vector(63 downto 0);
+        frc     : std_ulogic_vector(63 downto 0);
+        frt     : gspr_index_t;
+        rc      : std_ulogic;
+        out_cr  : std_ulogic;
+    end record;
+    constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'),
+                                                       insn  => (others => '0'), fe_mode => "00", rc => '0',
+                                                       fra => (others => '0'), frb => (others => '0'),
+                                                       frc => (others => '0'), frt => (others => '0'),
+                                                       single => '0', out_cr => '0');
+
+    type FPUToExecute1Type is record
+        busy      : std_ulogic;
+        exception : std_ulogic;
+        interrupt : std_ulogic;
+        illegal   : std_ulogic;
+    end record;
+
+    type FPUToWritebackType is record
+        valid           : std_ulogic;
+        write_enable    : std_ulogic;
+        write_reg       : gspr_index_t;
+        write_data      : std_ulogic_vector(63 downto 0);
+        write_cr_enable : std_ulogic;
+        write_cr_mask   : std_ulogic_vector(7 downto 0);
+        write_cr_data   : std_ulogic_vector(31 downto 0);
+    end record;
+
    type DividerToExecute1Type is record
 	valid: std_ulogic;
 	write_reg_data: std_ulogic_vector(63 downto 0);
@ -473,10 +554,10 @@ package body common is
           n := 13;
       when others =>
           n := 0;
-           return "000000";
+           return "0000000";
       end case;
       tmp := std_ulogic_vector(to_unsigned(n, 5));
-       return "1" & tmp;
+       return "01" & tmp;
    end;

    function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is
@ -486,7 +567,7 @@ package body common is

    function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is
    begin
-	return "0" & i;
+	return "00" & i;
    end;

    function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is
@ -502,4 +583,9 @@ package body common is
    begin
 	return s(5);
    end;
+
+    function fpr_to_gspr(f: fpr_index_t) return gspr_index_t is
+    begin
+        return "10" & f;
+    end;
 end common;
--- a/control.vhdl
+++ b/control.vhdl
@ -34,7 +34,7 @@ entity control is
        gpr_b_read_in       : in gspr_index_t;

        gpr_c_read_valid_in : in std_ulogic;
-        gpr_c_read_in       : in gpr_index_t;
+        gpr_c_read_in       : in gspr_index_t;

        cr_read_in          : in std_ulogic;
        cr_write_in         : in std_ulogic;
@ -70,7 +70,6 @@ architecture rtl of control is
    signal gpr_write_valid : std_ulogic := '0';
    signal cr_write_valid  : std_ulogic := '0';

-    signal gpr_c_read_in_fmt : std_ulogic_vector(5 downto 0);
 begin
    gpr_hazard0: entity work.gpr_hazard
        generic map (
@ -122,8 +121,6 @@ begin
            use_bypass         => gpr_bypass_b
            );

-    gpr_c_read_in_fmt <= "0" & gpr_c_read_in;
-
    gpr_hazard2: entity work.gpr_hazard
        generic map (
            PIPELINE_DEPTH => PIPELINE_DEPTH
@ -140,7 +137,7 @@ begin
            gpr_write_in       => gpr_write_in,
            bypass_avail       => gpr_bypassable,
            gpr_read_valid_in  => gpr_c_read_valid_in,
-            gpr_read_in        => gpr_c_read_in_fmt,
+            gpr_read_in        => gpr_c_read_in,

            ugpr_write_valid   => update_gpr_write_valid,
            ugpr_write_reg     => update_gpr_write_reg,
--- a/core.vhdl
+++ b/core.vhdl
@ -11,6 +11,7 @@ entity core is
        SIM : boolean := false;
 	DISABLE_FLATTEN : boolean := false;
        EX1_BYPASS : boolean := true;
+        HAS_FPU : boolean := true;
 	ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
        LOG_LENGTH : natural := 512
        );
@ -79,6 +80,11 @@ architecture behave of core is
    signal mmu_to_dcache: MmuToDcacheType;
    signal dcache_to_mmu: DcacheToMmuType;

+    -- FPU signals
+    signal execute1_to_fpu: Execute1ToFPUType;
+    signal fpu_to_execute1: FPUToExecute1Type;
+    signal fpu_to_writeback: FPUToWritebackType;
+
    -- local signals
    signal fetch1_stall_in : std_ulogic;
    signal icache_stall_out : std_ulogic;
@ -108,6 +114,7 @@ architecture behave of core is
    signal rst_dec1    : std_ulogic := '1';
    signal rst_dec2    : std_ulogic := '1';
    signal rst_ex1     : std_ulogic := '1';
+    signal rst_fpu     : std_ulogic := '1';
    signal rst_ls1     : std_ulogic := '1';
    signal rst_dbg     : std_ulogic := '1';
    signal alt_reset_d : std_ulogic;
@ -170,6 +177,7 @@ begin
            rst_dec1    <= core_rst;
            rst_dec2    <= core_rst;
            rst_ex1     <= core_rst;
+            rst_fpu     <= core_rst;
            rst_ls1     <= core_rst;
            rst_dbg     <= rst;
            alt_reset_d <= alt_reset;
@ -224,6 +232,7 @@ begin

    decode1_0: entity work.decode1
        generic map(
+            HAS_FPU => HAS_FPU,
            LOG_LENGTH => LOG_LENGTH
            )
        port map (
@ -244,6 +253,7 @@ begin
    decode2_0: entity work.decode2
        generic map (
            EX1_BYPASS => EX1_BYPASS,
+            HAS_FPU => HAS_FPU,
            LOG_LENGTH => LOG_LENGTH
            )
        port map (
@ -267,6 +277,7 @@ begin
    register_file_0: entity work.register_file
        generic map (
            SIM => SIM,
+            HAS_FPU => HAS_FPU,
            LOG_LENGTH => LOG_LENGTH
            )
        port map (
@ -280,7 +291,7 @@ begin
            dbg_gpr_data => dbg_gpr_data,
 	    sim_dump => terminate,
 	    sim_dump_done => sim_cr_dump,
-            log_out => log_data(255 downto 185)
+            log_out => log_data(255 downto 184)
 	    );

    cr_file_0: entity work.cr_file
@ -294,12 +305,13 @@ begin
            d_out => cr_file_to_decode2,
            w_in => writeback_to_cr_file,
            sim_dump => sim_cr_dump,
-            log_out => log_data(184 downto 172)
+            log_out => log_data(183 downto 171)
            );

    execute1_0: entity work.execute1
        generic map (
            EX1_BYPASS => EX1_BYPASS,
+            HAS_FPU => HAS_FPU,
            LOG_LENGTH => LOG_LENGTH
            )
        port map (
@ -309,9 +321,11 @@ begin
 	    busy_out => ex1_busy_out,
            e_in => decode2_to_execute1,
            l_in => loadstore1_to_execute1,
+            fp_in => fpu_to_execute1,
            ext_irq_in => ext_irq,
            l_out => execute1_to_loadstore1,
            f_out => execute1_to_fetch1,
+            fp_out => execute1_to_fpu,
            e_out => execute1_to_writeback,
 	    icache_inval => ex1_icache_inval,
            dbg_msr_out => msr,
@ -322,8 +336,32 @@ begin
            log_wr_addr => log_wr_addr
            );

+    with_fpu: if HAS_FPU generate
+    begin
+        fpu_0: entity work.fpu
+            port map (
+                clk => clk,
+                rst => rst_fpu,
+                e_in => execute1_to_fpu,
+                e_out => fpu_to_execute1,
+                w_out => fpu_to_writeback
+                );
+    end generate;
+
+    no_fpu: if not HAS_FPU generate
+    begin
+        fpu_to_execute1.busy <= '0';
+        fpu_to_execute1.exception <= '0';
+        fpu_to_execute1.interrupt <= '0';
+        fpu_to_execute1.illegal <= '0';
+        fpu_to_writeback.valid <= '0';
+        fpu_to_writeback.write_enable <= '0';
+        fpu_to_writeback.write_cr_enable <= '0';
+    end generate;
+
    loadstore1_0: entity work.loadstore1
        generic map (
+            HAS_FPU => HAS_FPU,
            LOG_LENGTH => LOG_LENGTH
            )
        port map (
@ -368,7 +406,7 @@ begin
            stall_out => dcache_stall_out,
            wishbone_in => wishbone_data_in,
            wishbone_out => wishbone_data_out,
-            log_out => log_data(171 downto 152)
+            log_out => log_data(170 downto 151)
            );

    writeback_0: entity work.writeback
@ -376,12 +414,13 @@ begin
            clk => clk,
            e_in => execute1_to_writeback,
            l_in => loadstore1_to_writeback,
+            fp_in => fpu_to_writeback,
            w_out => writeback_to_register_file,
            c_out => writeback_to_cr_file,
            complete_out => complete
            );

-    log_data(151 downto 150) <= "00";
+    log_data(150) <= '0';
    log_data(139 downto 135) <= "00000";

    debug_0: entity work.core_debug
--- a/countzero.vhdl
+++ b/countzero.vhdl
@ -3,6 +3,7 @@ use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;

 library work;
+use work.helpers.all;

 entity zero_counter is
    port (
@ -15,42 +16,6 @@ entity zero_counter is
 end entity zero_counter;

 architecture behaviour of zero_counter is
-    -- Reverse the order of bits in a word
-    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
-        variable ret: std_ulogic_vector(a'left downto a'right);
-    begin
-        for i in a'right to a'left loop
-            ret(a'left + a'right - i) := a(i);
-        end loop;
-        return ret;
-    end;
-
-    -- If there is only one bit set in a doubleword, return its bit number
-    -- (counting from the right).  Each bit of the result is obtained by
-    -- ORing together 32 bits of the input:
-    --  bit 0 = a[1] or a[3] or a[5] or ...
-    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
-    --  bit 2 = a[4..7] or a[12..15] or ...
-    --  bit 5 = a[32..63] ORed together
-    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
-        variable ret: std_ulogic_vector(5 downto 0);
-        variable stride: natural;
-        variable bit: std_ulogic;
-        variable k: natural;
-    begin
-        stride := 2;
-        for i in 0 to 5 loop
-            bit := '0';
-            for j in 0 to (64 / stride) - 1 loop
-                k := j * stride;
-                bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
-            end loop;
-            ret(i) := bit;
-            stride := stride * 2;
-        end loop;
-        return ret;
-    end;
-
    signal inp : std_ulogic_vector(63 downto 0);
    signal sum : std_ulogic_vector(64 downto 0);
    signal msb_r : std_ulogic;
--- a/decode1.vhdl
+++ b/decode1.vhdl
@ -8,6 +8,7 @@ use work.decode_types.all;

 entity decode1 is
    generic (
+        HAS_FPU : boolean := true;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
@ -54,7 +55,10 @@ architecture behaviour of decode1 is
    type op_19_subop_array_t is array(0 to 7) of decode_rom_t;
    type op_30_subop_array_t is array(0 to 15) of decode_rom_t;
    type op_31_subop_array_t is array(0 to 1023) of decode_rom_t;
+    type op_59_subop_array_t is array(0 to 31) of decode_rom_t;
    type minor_rom_array_2_t is array(0 to 3) of decode_rom_t;
+    type op_63_subop_array_0_t is array(0 to 511) of decode_rom_t;
+    type op_63_subop_array_1_t is array(0 to 16) of decode_rom_t;

    constant major_decode_rom_array : major_rom_array_t := (
        --          unit     internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
@ -72,6 +76,10 @@ architecture behaviour of decode1 is
        10 =>       (ALU,    OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
        34 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbz
        35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu
+        50 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd
+        51 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu
+        48 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs
+        49 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu
        42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha
        43 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau
        40 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz
@ -87,6 +95,10 @@ architecture behaviour of decode1 is
        17 =>       (ALU,    OP_SC,        NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sc
        38 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb
        39 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu
+        54 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd
+        55 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu
+        52 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs
+        53 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu
        44 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth
        45 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu
        36 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw
@ -272,6 +284,12 @@ architecture behaviour of decode1 is
        2#1101110101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldcix
        2#0000110101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux
        2#0000010101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx
+        2#1001010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfdx
+        2#1001110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux
+        2#1101010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax
+        2#1101110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx
+        2#1000010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx
+        2#1000110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux
        2#0001110100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx
        2#0101110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux
        2#0101010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax
@ -350,6 +368,11 @@ architecture behaviour of decode1 is
        2#0011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- stdcx
        2#0010110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux
        2#0010010101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx
+        2#1011010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx
+        2#1011110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdux
+        2#1111010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx
+        2#1010010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx
+        2#1010110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux
        2#1110010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx
        2#1110110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix
        2#1011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- sthcx
@ -389,6 +412,24 @@ architecture behaviour of decode1 is
        others   => decode_rom_init
        );

+    constant decode_op_59_array : op_59_subop_array_t := (
+        --             unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                          op                               in   out   A   out  in    out  len        ext                                pipe
+        2#01110#  =>  (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fcfid[u]s
+        2#10010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fdivs
+        2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
+        2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
+        2#10110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsqrts
+        2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
+        2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
+        2#11010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- frsqrtes
+        2#11100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmsubs
+        2#11101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmadds
+        2#11110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmsubs
+        2#11111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmadds
+        others => illegal_inst
+        );
+
    constant decode_op_62_array : minor_rom_array_2_t := (
        --              unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
        --                            op                                           in   out   A   out  in    out  len        ext                                 pipe
@ -397,6 +438,64 @@ architecture behaviour of decode1 is
        others   => decode_rom_init
        );

+    -- indexed by bits 4..1 and 10..6 of instruction word
+    constant decode_op_63l_array : op_63_subop_array_0_t := (
+        --                unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                             op                               in   out   A   out  in    out  len        ext                                pipe
+        2#000000000#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  0/0=fcmpu
+        2#000000001#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  1/0=fcmpo
+        2#000000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  2/0=mcrfs
+        2#000000100#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  4/0=ftdiv
+        2#000000101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  5/0=ftsqrt
+        2#011000001#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/6=mtfsb1
+        2#011000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/6=mtfsb0
+        2#011000100#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/6=mtfsfi
+        2#011011010#  => (FPU,   OP_FPOP_I,     FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 26/6=fmrgow
+        2#011011110#  => (FPU,   OP_FPOP_I,     FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 30/6=fmrgew
+        2#011110010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 18/7=mffs family
+        2#011110110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 22/7=mtfsf
+        2#100000000#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/8=fcpsgn
+        2#100000001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/8=fneg
+        2#100000010#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/8=fmr
+        2#100000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/8=fnabs
+        2#100001000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  8/8=fabs
+        2#100001100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 12/8=frin
+        2#100001101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 13/8=friz
+        2#100001110#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 14/8=frip
+        2#100001111#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 15/8=frim
+        2#110000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), --  0/12=frsp
+        2#111000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/14=fctiw
+        2#111000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/14=fctiwu
+        2#111011001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 25/14=fctid
+        2#111011010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 26/14=fcfid
+        2#111011101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 29/14=fctidu
+        2#111011110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 30/14=fcfidu
+        2#111100000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/15=fctiwz
+        2#111100100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/15=fctiwuz
+        2#111111001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 25/15=fctidz
+        2#111111101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 29/15=fctiduz
+        others => illegal_inst
+        );
+
+    -- indexed by bits 4..1 of instruction word
+    constant decode_op_63h_array : op_63_subop_array_1_t := (
+        --            unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                         op                               in   out   A   out  in    out  len        ext                                pipe
+        2#0010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fdiv
+        2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
+        2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
+        2#0110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsqrt
+        2#0111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsel
+        2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
+        2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
+        2#1010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- frsqrte
+        2#1100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmsub
+        2#1101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmadd
+        2#1110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmsub
+        2#1111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmadd
+        others => illegal_inst
+        );
+
    --                                        unit   internal         in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
    --                                                     op                                              in   out   A   out  in    out  len        ext                                 pipe
    constant nop_instr      : decode_rom_t := (ALU,  OP_NOP,          NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
@ -547,9 +646,28 @@ begin
        when 58 =>
            v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0))));

+        when 59 =>
+            if HAS_FPU then
+                -- floating point operations, mostly single-precision
+                v.decode := decode_op_59_array(to_integer(unsigned(f_in.insn(5 downto 1))));
+                if f_in.insn(5) = '0' and not std_match(f_in.insn(10 downto 1), "11-1001110") then
+                    vi.override := '1';
+                end if;
+            end if;
+
        when 62 =>
            v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0))));

+        when 63 =>
+            if HAS_FPU then
+                -- floating point operations, general and double-precision
+                if f_in.insn(5) = '0' then
+                    v.decode := decode_op_63l_array(to_integer(unsigned(f_in.insn(4 downto 1) & f_in.insn(10 downto 6))));
+                else
+                    v.decode := decode_op_63h_array(to_integer(unsigned(f_in.insn(4 downto 1))));
+                end if;
+            end if;
+
        when others =>
        end case;

--- a/decode2.vhdl
+++ b/decode2.vhdl
@ -11,6 +11,7 @@ use work.insn_helpers.all;
 entity decode2 is
    generic (
        EX1_BYPASS : boolean := true;
+        HAS_FPU : boolean := true;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
@ -73,12 +74,14 @@ architecture behaviour of decode2 is
            -- If it's all 0, we don't treat it as a dependency as slow SPRs
            -- operations are single issue.
            --
-            assert is_fast_spr(ispr) =  '1' or ispr = "000000"
+            assert is_fast_spr(ispr) =  '1' or ispr = "0000000"
                report "Decode A says SPR but ISPR is invalid:" &
                to_hstring(ispr) severity failure;
            return (is_fast_spr(ispr), ispr, reg_data);
        elsif t = CIA then
            return ('0', (others => '0'), instr_addr);
+        elsif HAS_FPU and t = FRA then
+            return ('1', fpr_to_gspr(insn_fra(insn_in)), reg_data);
        else
            return ('0', (others => '0'), (others => '0'));
        end if;
@ -92,6 +95,12 @@ architecture behaviour of decode2 is
        case t is
            when RB =>
                ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data);
+            when FRB =>
+                if HAS_FPU then
+                    ret := ('1', fpr_to_gspr(insn_frb(insn_in)), reg_data);
+                else
+                    ret := ('0', (others => '0'), (others => '0'));
+                end if;
            when CONST_UI =>
                ret := ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64)));
            when CONST_SI =>
@ -118,7 +127,7 @@ architecture behaviour of decode2 is
                -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
                -- If it's all 0, we don't treat it as a dependency as slow SPRs
                -- operations are single issue.
-                assert is_fast_spr(ispr) = '1' or ispr = "000000"
+                assert is_fast_spr(ispr) = '1' or ispr = "0000000"
                    report "Decode B says SPR but ISPR is invalid:" &
                    to_hstring(ispr) severity failure;
                ret := (is_fast_spr(ispr), ispr, reg_data);
@ -137,6 +146,18 @@ architecture behaviour of decode2 is
                return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data);
            when RCR =>
                return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data);
+            when FRS =>
+                if HAS_FPU then
+                    return ('1', fpr_to_gspr(insn_frt(insn_in)), reg_data);
+                else
+                    return ('0', (others => '0'), (others => '0'));
+                end if;
+            when FRC =>
+                if HAS_FPU then
+                    return ('1', fpr_to_gspr(insn_frc(insn_in)), reg_data);
+                else
+                    return ('0', (others => '0'), (others => '0'));
+                end if;
            when NONE =>
                return ('0', (others => '0'), (others => '0'));
        end case;
@ -150,16 +171,22 @@ architecture behaviour of decode2 is
                return ('1', gpr_to_gspr(insn_rt(insn_in)));
            when RA =>
                return ('1', gpr_to_gspr(insn_ra(insn_in)));
+            when FRT =>
+                if HAS_FPU then
+                    return ('1', fpr_to_gspr(insn_frt(insn_in)));
+                else
+                    return ('0', "0000000");
+                end if;
            when SPR =>
                -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
                -- If it's all 0, we don't treat it as a dependency as slow SPRs
                -- operations are single issue.
-                assert is_fast_spr(ispr) = '1' or ispr = "000000"
+                assert is_fast_spr(ispr) = '1' or ispr = "0000000"
                    report "Decode B says SPR but ISPR is invalid:" &
                    to_hstring(ispr) severity failure;
                return (is_fast_spr(ispr), ispr);
            when NONE =>
-                return ('0', "000000");
+                return ('0', "0000000");
        end case;
    end;

@ -212,7 +239,7 @@ architecture behaviour of decode2 is
    signal gpr_b_bypass : std_ulogic;

    signal gpr_c_read_valid : std_ulogic;
-    signal gpr_c_read : gpr_index_t;
+    signal gpr_c_read : gspr_index_t;
    signal gpr_c_bypass : std_ulogic;

    signal cr_write_valid  : std_ulogic;
@ -281,11 +308,15 @@ begin
    end process;

    r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR
+                       else fpr_to_gspr(insn_fra(d_in.insn)) when d_in.decode.input_reg_a = FRA and HAS_FPU
                       else gpr_to_gspr(insn_ra(d_in.insn));
    r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR
+                       else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU
                       else gpr_to_gspr(insn_rb(d_in.insn));
-    r_out.read3_reg <= insn_rcreg(d_in.insn) when d_in.decode.input_reg_c = RCR
-                       else insn_rs(d_in.insn);
+    r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR
+                       else fpr_to_gspr(insn_frc(d_in.insn)) when d_in.decode.input_reg_c = FRC and HAS_FPU
+                       else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU
+                       else gpr_to_gspr(insn_rs(d_in.insn));

    c_out.read <= d_in.decode.input_cr;

@ -307,7 +338,7 @@ begin
        mul_b := (others => '0');

        --v.e.input_cr := d_in.decode.input_cr;
-        --v.e.output_cr := d_in.decode.output_cr;
+        v.e.output_cr := d_in.decode.output_cr;
        
        decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1,
                                             d_in.nia);
@ -394,11 +425,11 @@ begin
        gpr_b_read <= decoded_reg_b.reg;

        gpr_c_read_valid <= decoded_reg_c.reg_valid;
-        gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg);
+        gpr_c_read <= decoded_reg_c.reg;

        cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
        cr_bypass_avail <= '0';
-        if EX1_BYPASS then
+        if EX1_BYPASS and d_in.decode.unit = ALU then
            cr_bypass_avail <= d_in.decode.output_cr;
        end if;

--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@ -7,9 +7,11 @@ package decode_types is
 			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
 			 OP_CNTZ, OP_CROP,
 			 OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
-			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS,
-			 OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
+			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI,
+                         OP_FPOP, OP_FPOP_I,
+                         OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
 			 OP_LOAD, OP_STORE,
+                         OP_FPLOAD, OP_FPSTORE,
 			 OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD,
 			 OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64,
 			 OP_MUL_H64, OP_MUL_H32, OP_OR,
@ -21,11 +23,11 @@ package decode_types is
                         OP_BCD, OP_ADDG6S,
                         OP_FETCH_FAILED
 			 );
-    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA);
+    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA);
    type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
-                           CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR);
-    type input_reg_c_t is (NONE, RS, RCR);
-    type output_reg_a_t is (NONE, RT, RA, SPR);
+                           CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB);
+    type input_reg_c_t is (NONE, RS, RCR, FRC, FRS);
+    type output_reg_a_t is (NONE, RT, RA, SPR, FRT);
    type rc_t is (NONE, ONE, RC);
    type carry_in_t is (ZERO, CA, OV, ONE);

@ -47,7 +49,7 @@ package decode_types is

    constant TOO_OFFSET : integer := 0;

-    type unit_t is (NONE, ALU, LDST);
+    type unit_t is (NONE, ALU, LDST, FPU);
    type length_t is (NONE, is1B, is2B, is4B, is8B);

    type decode_rom_t is record
--- a/execute1.vhdl
+++ b/execute1.vhdl
@ -13,6 +13,7 @@ use work.ppc_fx_insns.all;
 entity execute1 is
    generic (
        EX1_BYPASS : boolean := true;
+        HAS_FPU : boolean := true;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
@ -26,12 +27,14 @@ entity execute1 is

 	e_in  : in Decode2ToExecute1Type;
        l_in  : in Loadstore1ToExecute1Type;
+        fp_in : in FPUToExecute1Type;

 	ext_irq_in : std_ulogic;

 	-- asynchronous
        l_out : out Execute1ToLoadstore1Type;
 	f_out : out Execute1ToFetch1Type;
+        fp_out : out Execute1ToFPUType;

 	e_out : out Execute1ToWritebackType;

@ -53,6 +56,7 @@ architecture behaviour of execute1 is
        f : Execute1ToFetch1Type;
        busy: std_ulogic;
        terminate: std_ulogic;
+        fp_exception_next : std_ulogic;
        trace_next : std_ulogic;
        prev_op : insn_type_t;
 	lr_update : std_ulogic;
@ -71,7 +75,8 @@ architecture behaviour of execute1 is
    end record;
    constant reg_type_init : reg_type :=
        (e => Execute1ToWritebackInit, f => Execute1ToFetch1Init,
-         busy => '0', lr_update => '0', terminate => '0', trace_next => '0', prev_op => OP_ILLEGAL,
+         busy => '0', lr_update => '0', terminate => '0',
+         fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
         mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
         slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
         next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0'));
@ -267,7 +272,7 @@ begin
    b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2;
    c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3;

-    busy_out <= l_in.busy or r.busy;
+    busy_out <= l_in.busy or r.busy or fp_in.busy;
    valid_in <= e_in.valid and not busy_out;

    terminate_out <= r.terminate;
@ -333,6 +338,7 @@ begin
        variable spr_val : std_ulogic_vector(63 downto 0);
        variable addend : std_ulogic_vector(127 downto 0);
        variable do_trace : std_ulogic;
+        variable fv : Execute1ToFPUType;
    begin
 	result := (others => '0');
 	sum_with_carry := (others => '0');
@ -346,6 +352,7 @@ begin
 	v.e := Execute1ToWritebackInit;
        lv := Execute1ToLoadstore1Init;
        v.f.redirect := '0';
+        fv := Execute1ToFPUInit;

 	-- XER forwarding. To avoid having to track XER hazards, we
 	-- use the previously latched value.
@ -521,9 +528,11 @@ begin
        exception_nextpc := '0';
        v.e.exc_write_enable := '0';
        v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
-        v.e.exc_write_data := e_in.nia;
        if valid_in = '1' then
+            v.e.exc_write_data := e_in.nia;
            v.last_nia := e_in.nia;
+        else
+            v.e.exc_write_data := r.last_nia;
        end if;

        v.e.mode_32bit := not ctrl.msr(MSR_SF);
@ -542,24 +551,36 @@ begin
            ctrl_tmp.msr(MSR_PR) <= '0';
            ctrl_tmp.msr(MSR_SE) <= '0';
            ctrl_tmp.msr(MSR_BE) <= '0';
+            ctrl_tmp.msr(MSR_FP) <= '0';
+            ctrl_tmp.msr(MSR_FE0) <= '0';
+            ctrl_tmp.msr(MSR_FE1) <= '0';
            ctrl_tmp.msr(MSR_IR) <= '0';
            ctrl_tmp.msr(MSR_DR) <= '0';
            ctrl_tmp.msr(MSR_RI) <= '0';
            ctrl_tmp.msr(MSR_LE) <= '1';
            v.e.valid := '1';
            v.trace_next := '0';
+            v.fp_exception_next := '0';
 	    report "Writing SRR1: " & to_hstring(ctrl.srr1);

-        elsif r.trace_next = '1' and valid_in = '1' then
-            -- Generate a trace interrupt rather than executing the next instruction
-            -- or taking any asynchronous interrupt
-            v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64));
-            ctrl_tmp.srr1(63 - 33) <= '1';
-            if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
-                r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
-                ctrl_tmp.srr1(63 - 35) <= '1';
-            elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
-                ctrl_tmp.srr1(63 - 36) <= '1';
+        elsif valid_in = '1' and ((HAS_FPU and r.fp_exception_next = '1') or r.trace_next = '1') then
+            if HAS_FPU and r.fp_exception_next = '1' then
+                -- This is used for FP-type program interrupts that
+                -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
+                v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+                ctrl_tmp.srr1(63 - 43) <= '1';
+                ctrl_tmp.srr1(63 - 47) <= '1';
+            else
+                -- Generate a trace interrupt rather than executing the next instruction
+                -- or taking any asynchronous interrupt
+                v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64));
+                ctrl_tmp.srr1(63 - 33) <= '1';
+                if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
+                    r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
+                    ctrl_tmp.srr1(63 - 35) <= '1';
+                elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
+                    ctrl_tmp.srr1(63 - 36) <= '1';
+                end if;
            end if;
            exception := '1';

@ -579,6 +600,18 @@ begin
            ctrl_tmp.srr1(63 - 45) <= '1';
            report "privileged instruction";

+        elsif not HAS_FPU and valid_in = '1' and
+            (e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then
+            -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations
+            illegal := '1';
+
+        elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and
+            (e_in.unit = FPU or e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then
+            -- generate a floating-point unavailable interrupt
+            exception := '1';
+            v.f.redirect_nia := std_logic_vector(to_unsigned(16#800#, 64));
+            report "FP unavailable interrupt";
+
 	elsif valid_in = '1' and e_in.unit = ALU then

 	    report "execute nia " & to_hstring(e_in.nia);
@ -793,6 +826,10 @@ begin
                is_branch := '1';
                taken_branch := '1';
                abs_branch := '1';
+                if HAS_FPU then
+                    v.fp_exception_next := fp_in.exception and
+                                           (a_in(MSR_FE0) or a_in(MSR_FE1));
+                end if;
                do_trace := '0';

            when OP_CNTZ =>
@ -964,6 +1001,10 @@ begin
                        ctrl_tmp.msr(MSR_IR) <= '1';
                        ctrl_tmp.msr(MSR_DR) <= '1';
                    end if;
+                    if HAS_FPU then
+                        v.fp_exception_next := fp_in.exception and
+                                               (c_in(MSR_FE0) or c_in(MSR_FE1));
+                    end if;
                end if;
 	    when OP_MTSPR =>
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
@ -1080,6 +1121,8 @@ begin
                lv.valid := '1';
            elsif e_in.unit = NONE then
                illegal := '1';
+            elsif HAS_FPU and e_in.unit = FPU then
+                fv.valid := '1';
            end if;

        elsif r.f.redirect = '1' then
@ -1154,7 +1197,17 @@ begin
            v.e.valid := '1';
 	end if;

-        if illegal = '1' then
+        -- Generate FP-type program interrupt.  fp_in.interrupt will only
+        -- be set during the execution of a FP instruction.
+        -- The case where MSR[FE0,FE1] goes from zero to non-zero is
+        -- handled above by mtmsrd and rfid setting v.fp_exception_next.
+        if HAS_FPU and fp_in.interrupt = '1' then
+            v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+            ctrl_tmp.srr1(63 - 43) <= '1';
+            exception := '1';
+        end if;
+
+        if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then
            exception := '1';
            v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
            -- Since we aren't doing Hypervisor emulation assist (0xe40) we
@ -1200,7 +1253,6 @@ begin
            end if;
            v.e.exc_write_enable := '1';
            v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
-            v.e.exc_write_data := r.last_nia;
            report "ldst exception writing srr0=" & to_hstring(r.last_nia);
        end if;

@ -1225,7 +1277,7 @@ begin
        lv.addr1 := a_in;
        lv.addr2 := b_in;
        lv.data := c_in;
-        lv.write_reg := gspr_to_gpr(e_in.write_reg);
+        lv.write_reg := e_in.write_reg;
        lv.length := e_in.data_len;
        lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE);
        lv.sign_extend := e_in.sign_extend;
@ -1243,6 +1295,20 @@ begin
        lv.virt_mode := ctrl.msr(MSR_DR);
        lv.priv_mode := not ctrl.msr(MSR_PR);
        lv.mode_32bit := not ctrl.msr(MSR_SF);
+        lv.is_32bit := e_in.is_32bit;
+
+        -- Outputs to FPU
+        fv.op := e_in.insn_type;
+        fv.nia := e_in.nia;
+        fv.insn := e_in.insn;
+        fv.single := e_in.is_32bit;
+        fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1);
+        fv.fra := a_in;
+        fv.frb := b_in;
+        fv.frc := c_in;
+        fv.frt := e_in.write_reg;
+        fv.rc := e_in.rc;
+        fv.out_cr := e_in.output_cr;

 	-- Update registers
 	rin <= v;
@ -1251,6 +1317,7 @@ begin
 	f_out <= r.f;
        l_out <= lv;
 	e_out <= r.e;
+        fp_out <= fv;
 	flush_out <= f_out.redirect;

        exception_log <= exception;
--- a/fpga/top-arty.vhdl
+++ b/fpga/top-arty.vhdl
@ -14,6 +14,7 @@ entity toplevel is
        RAM_INIT_FILE      : string   := "firmware.hex";
        RESET_LOW          : boolean  := true;
        CLK_FREQUENCY      : positive := 100000000;
+        HAS_FPU            : boolean  := true;
        USE_LITEDRAM       : boolean  := false;
        NO_BRAM            : boolean  := false;
        DISABLE_FLATTEN_CORE : boolean := false;
@ -168,6 +169,7 @@ begin
            RAM_INIT_FILE      => RAM_INIT_FILE,
            SIM                => false,
            CLK_FREQ           => CLK_FREQUENCY,
+            HAS_FPU            => HAS_FPU,
            HAS_DRAM           => USE_LITEDRAM,
            DRAM_SIZE          => 256 * 1024 * 1024,
            DRAM_INIT_SIZE     => PAYLOAD_SIZE,
--- a/fpga/top-generic.vhdl
+++ b/fpga/top-generic.vhdl
@ -11,6 +11,7 @@ entity toplevel is
 	RESET_LOW     : boolean  := true;
 	CLK_INPUT     : positive := 100000000;
 	CLK_FREQUENCY : positive := 100000000;
+        HAS_FPU       : boolean  := true;
 	DISABLE_FLATTEN_CORE : boolean := false;
        UART_IS_16550 : boolean  := true
 	);
@ -68,6 +69,7 @@ begin
 	    RAM_INIT_FILE => RAM_INIT_FILE,
 	    SIM           => false,
 	    CLK_FREQ      => CLK_FREQUENCY,
+            HAS_FPU       => HAS_FPU,
 	    DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
            UART0_IS_16550     => UART_IS_16550
 	    )
--- a/fpga/top-nexys-video.vhdl
+++ b/fpga/top-nexys-video.vhdl
@ -14,6 +14,7 @@ entity toplevel is
 	RAM_INIT_FILE : string   := "firmware.hex";
 	RESET_LOW     : boolean  := true;
 	CLK_FREQUENCY : positive := 100000000;
+        HAS_FPU       : boolean  := true;
 	USE_LITEDRAM  : boolean  := false;
 	NO_BRAM       : boolean  := false;
 	DISABLE_FLATTEN_CORE : boolean := false;
@ -120,6 +121,7 @@ begin
 	    RAM_INIT_FILE => RAM_INIT_FILE,
 	    SIM           => false,
 	    CLK_FREQ      => CLK_FREQUENCY,
+            HAS_FPU       => HAS_FPU,
 	    HAS_DRAM      => USE_LITEDRAM,
 	    DRAM_SIZE     => 512 * 1024 * 1024,
            DRAM_INIT_SIZE => PAYLOAD_SIZE,
--- a/fpu.vhdl
+++ b/fpu.vhdl
--- a/gpr_hazard.vhdl
+++ b/gpr_hazard.vhdl
@ -2,6 +2,9 @@ library ieee;
 use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;

+library work;
+use work.common.all;
+
 entity gpr_hazard is
    generic (
        PIPELINE_DEPTH : natural := 1
@ -15,13 +18,13 @@ entity gpr_hazard is
        issuing            : in std_ulogic;

        gpr_write_valid_in : in std_ulogic;
-        gpr_write_in       : in std_ulogic_vector(5 downto 0);
+        gpr_write_in       : in gspr_index_t;
        bypass_avail       : in std_ulogic;
        gpr_read_valid_in  : in std_ulogic;
-        gpr_read_in        : in std_ulogic_vector(5 downto 0);
+        gpr_read_in        : in gspr_index_t;

        ugpr_write_valid   : in std_ulogic;
-        ugpr_write_reg     : in std_ulogic_vector(5 downto 0);
+        ugpr_write_reg     : in gspr_index_t;

        stall_out          : out std_ulogic;
        use_bypass         : out std_ulogic
@ -31,9 +34,9 @@ architecture behaviour of gpr_hazard is
    type pipeline_entry_type is record
        valid  : std_ulogic;
        bypass : std_ulogic;
-        gpr    : std_ulogic_vector(5 downto 0);
+        gpr    : gspr_index_t;
        ugpr_valid : std_ulogic;
-        ugpr   : std_ulogic_vector(5 downto 0);
+        ugpr   : gspr_index_t;
    end record;
    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'),
                                                           ugpr_valid => '0', ugpr => (others => '0'));
--- a/helpers.vhdl
+++ b/helpers.vhdl
@ -25,6 +25,10 @@ package helpers is
    function byte_reverse(val: std_ulogic_vector(63 downto 0); size: integer) return std_ulogic_vector;

    function sign_extend(val: std_ulogic_vector(63 downto 0); size: natural) return std_ulogic_vector;
+
+    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector;
+    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
+    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector;
 end package helpers;

 package body helpers is
@ -206,4 +210,53 @@ package body helpers is
        return std_ulogic_vector(ret);

    end;
+
+    -- Reverse the order of bits in a word
+    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(a'left downto a'right);
+    begin
+        for i in a'right to a'left loop
+            ret(a'left + a'right - i) := a(i);
+        end loop;
+        return ret;
+    end;
+
+    -- If there is only one bit set in a doubleword, return its bit number
+    -- (counting from the right).  Each bit of the result is obtained by
+    -- ORing together 32 bits of the input:
+    --  bit 0 = a[1] or a[3] or a[5] or ...
+    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
+    --  bit 2 = a[4..7] or a[12..15] or ...
+    --  bit 5 = a[32..63] ORed together
+    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(5 downto 0);
+        variable stride: natural;
+        variable bit: std_ulogic;
+        variable k: natural;
+    begin
+        stride := 2;
+        for i in 0 to 5 loop
+            bit := '0';
+            for j in 0 to (64 / stride) - 1 loop
+                k := j * stride;
+                bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
+            end loop;
+            ret(i) := bit;
+            stride := stride * 2;
+        end loop;
+        return ret;
+    end;
+
+    -- Count leading zeroes operation
+    -- Assumes the value passed in is not zero (if it is, zero is returned)
+    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is
+        variable rev: std_ulogic_vector(val'left downto val'right);
+        variable sum: std_ulogic_vector(val'left downto val'right);
+        variable onehot: std_ulogic_vector(val'left downto val'right);
+    begin
+        rev := bit_reverse(val);
+        sum := std_ulogic_vector(- signed(rev));
+        onehot := sum and rev;
+        return bit_number(std_ulogic_vector(resize(unsigned(onehot), 64)));
+    end;
 end package body helpers;
--- a/insn_helpers.vhdl
+++ b/insn_helpers.vhdl
@ -37,6 +37,11 @@ package insn_helpers is
    function insn_sh (insn_in : std_ulogic_vector) return std_ulogic_vector;
    function insn_me (insn_in : std_ulogic_vector) return std_ulogic_vector;
    function insn_mb (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_frt (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_fra (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_frb (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_frc (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_u (insn_in : std_ulogic_vector) return std_ulogic_vector;
 end package insn_helpers;

 package body insn_helpers is
@ -214,4 +219,29 @@ package body insn_helpers is
    begin
        return insn_in(5) & insn_in(10 downto 6);
    end;
+
+    function insn_frt(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(25 downto 21);
+    end;
+
+    function insn_fra(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(20 downto 16);
+    end;
+
+    function insn_frb(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(15 downto 11);
+    end;
+
+    function insn_frc(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(10 downto 6);
+    end;
+
+    function insn_u(insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(15 downto 12);
+    end;
 end package body insn_helpers;
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@ -5,12 +5,15 @@ use ieee.numeric_std.all;
 library work;
 use work.decode_types.all;
 use work.common.all;
+use work.insn_helpers.all;
+use work.helpers.all;

 -- 2 cycle LSU
 -- We calculate the address in the first cycle

 entity loadstore1 is
    generic (
+        HAS_FPU : boolean := true;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
@ -42,10 +45,12 @@ architecture behave of loadstore1 is

    -- State machine for unaligned loads/stores
    type state_t is (IDLE,              -- ready for instruction
+                     FPR_CONV,          -- converting double to float for store
                     SECOND_REQ,        -- send 2nd request of unaligned xfer
                     ACK_WAIT,          -- waiting for ack from dcache
                     MMU_LOOKUP,        -- waiting for MMU to look up translation
                     TLBIE_WAIT,        -- waiting for MMU to finish doing a tlbie
+                     FINISH_LFS,        -- write back converted SP data for lfs*
                     COMPLETE           -- extra cycle to complete an operation
                     );

@ -58,7 +63,7 @@ architecture behave of loadstore1 is
 	addr         : std_ulogic_vector(63 downto 0);
 	store_data   : std_ulogic_vector(63 downto 0);
 	load_data    : std_ulogic_vector(63 downto 0);
-	write_reg    : gpr_index_t;
+	write_reg    : gspr_index_t;
 	length       : std_ulogic_vector(3 downto 0);
 	byte_reverse : std_ulogic;
 	sign_extend  : std_ulogic;
@ -86,6 +91,11 @@ architecture behave of loadstore1 is
        do_update    : std_ulogic;
        extra_cycle  : std_ulogic;
        mode_32bit   : std_ulogic;
+        load_sp      : std_ulogic;
+        ld_sp_data   : std_ulogic_vector(31 downto 0);
+        ld_sp_nz     : std_ulogic;
+        ld_sp_lz     : std_ulogic_vector(5 downto 0);
+        st_sp_data   : std_ulogic_vector(31 downto 0);
    end record;

    type byte_sel_t is array(0 to 7) of std_ulogic;
@ -95,6 +105,9 @@ architecture behave of loadstore1 is
    signal r, rin : reg_stage_t;
    signal lsu_sum : std_ulogic_vector(63 downto 0);

+    signal store_sp_data : std_ulogic_vector(31 downto 0);
+    signal load_dp_data  : std_ulogic_vector(63 downto 0);
+
    -- Generate byte enables from sizes
    function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
    begin
@ -125,6 +138,72 @@ architecture behave of loadstore1 is
 					    to_integer(unsigned(address))));
    end function xfer_data_sel;

+    -- 23-bit right shifter for DP -> SP float conversions
+    function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+        return std_ulogic_vector is
+        variable fs1   : std_ulogic_vector(22 downto 0);
+        variable fs2   : std_ulogic_vector(22 downto 0);
+    begin
+        case shift(1 downto 0) is
+            when "00" =>
+                fs1 := frac;
+            when "01" =>
+                fs1 := '0' & frac(22 downto 1);
+            when "10" =>
+                fs1 := "00" & frac(22 downto 2);
+            when others =>
+                fs1 := "000" & frac(22 downto 3);
+        end case;
+        case shift(4 downto 2) is
+            when "000" =>
+                fs2 := fs1;
+            when "001" =>
+                fs2 := x"0" & fs1(22 downto 4);
+            when "010" =>
+                fs2 := x"00" & fs1(22 downto 8);
+            when "011" =>
+                fs2 := x"000" & fs1(22 downto 12);
+            when "100" =>
+                fs2 := x"0000" & fs1(22 downto 16);
+            when others =>
+                fs2 := x"00000" & fs1(22 downto 20);
+        end case;
+        return fs2;
+    end;
+
+    -- 23-bit left shifter for SP -> DP float conversions
+    function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+        return std_ulogic_vector is
+        variable fs1   : std_ulogic_vector(22 downto 0);
+        variable fs2   : std_ulogic_vector(22 downto 0);
+    begin
+        case shift(1 downto 0) is
+            when "00" =>
+                fs1 := frac;
+            when "01" =>
+                fs1 := frac(21 downto 0) & '0';
+            when "10" =>
+                fs1 := frac(20 downto 0) & "00";
+            when others =>
+                fs1 := frac(19 downto 0) & "000";
+        end case;
+        case shift(4 downto 2) is
+            when "000" =>
+                fs2 := fs1;
+            when "001" =>
+                fs2 := fs1(18 downto 0) & x"0" ;
+            when "010" =>
+                fs2 := fs1(14 downto 0) & x"00";
+            when "011" =>
+                fs2 := fs1(10 downto 0) & x"000";
+            when "100" =>
+                fs2 := fs1(6 downto 0) & x"0000";
+            when others =>
+                fs2 := fs1(2 downto 0) & x"00000";
+        end case;
+        return fs2;
+    end;
+
 begin
    -- Calculate the address in the first cycle
    lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
@ -142,6 +221,59 @@ begin
        end if;
    end process;

+    ls_fp_conv: if HAS_FPU generate
+        -- Convert DP data to SP for stfs
+        dp_to_sp: process(all)
+            variable exp   : unsigned(10 downto 0);
+            variable frac  : std_ulogic_vector(22 downto 0);
+            variable shift : unsigned(4 downto 0);
+        begin
+            store_sp_data(31) <= l_in.data(63);
+            store_sp_data(30 downto 0) <= (others => '0');
+            exp := unsigned(l_in.data(62 downto 52));
+            if exp > 896 then
+                store_sp_data(30) <= l_in.data(62);
+                store_sp_data(29 downto 0) <= l_in.data(58 downto 29);
+            elsif exp >= 874 then
+                -- denormalization required
+                frac := '1' & l_in.data(51 downto 30);
+                shift := 0 - exp(4 downto 0);
+                store_sp_data(22 downto 0) <= shifter_23r(frac, shift);
+            end if;
+        end process;
+
+        -- Convert SP data to DP for lfs
+        sp_to_dp: process(all)
+            variable exp     : unsigned(7 downto 0);
+            variable exp_dp  : unsigned(10 downto 0);
+            variable exp_nz  : std_ulogic;
+            variable exp_ao  : std_ulogic;
+            variable frac    : std_ulogic_vector(22 downto 0);
+            variable frac_shift : unsigned(4 downto 0);
+        begin
+            frac := r.ld_sp_data(22 downto 0);
+            exp := unsigned(r.ld_sp_data(30 downto 23));
+            exp_nz := or (r.ld_sp_data(30 downto 23));
+            exp_ao := and (r.ld_sp_data(30 downto 23));
+            frac_shift := (others => '0');
+            if exp_ao = '1' then
+                exp_dp := to_unsigned(2047, 11);    -- infinity or NaN
+            elsif exp_nz = '1' then
+                exp_dp := 896 + resize(exp, 11);    -- finite normalized value
+            elsif r.ld_sp_nz = '0' then
+                exp_dp := to_unsigned(0, 11);       -- zero
+            else
+                -- denormalized SP operand, need to normalize
+                exp_dp := 896 - resize(unsigned(r.ld_sp_lz), 11);
+                frac_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1;
+            end if;
+            load_dp_data(63) <= r.ld_sp_data(31);
+            load_dp_data(62 downto 52) <= std_ulogic_vector(exp_dp);
+            load_dp_data(51 downto 29) <= shifter_23l(frac, frac_shift);
+            load_dp_data(28 downto 0) <= (others => '0');
+        end process;
+    end generate;
+
    loadstore1_1: process(all)
        variable v : reg_stage_t;
        variable brev_lenm1 : unsigned(2 downto 0);
@ -162,6 +294,9 @@ begin
        variable data_permuted : std_ulogic_vector(63 downto 0);
        variable data_trimmed : std_ulogic_vector(63 downto 0);
        variable store_data : std_ulogic_vector(63 downto 0);
+        variable data_in : std_ulogic_vector(63 downto 0);
+        variable byte_rev : std_ulogic;
+        variable length : std_ulogic_vector(3 downto 0);
        variable use_second : byte_sel_t;
        variable trim_ctl : trim_ctl_t;
        variable negative : std_ulogic;
@ -173,6 +308,8 @@ begin
        variable mmu_mtspr : std_ulogic;
        variable itlb_fault : std_ulogic;
        variable misaligned : std_ulogic;
+        variable fp_reg_conv : std_ulogic;
+        variable lfs_done : std_ulogic;
    begin
        v := r;
        req := '0';
@ -182,8 +319,10 @@ begin
        sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
        dsisr := (others => '0');
        mmureq := '0';
+        fp_reg_conv := '0';

        write_enable := '0';
+        lfs_done := '0';

        do_update := r.do_update;
        v.do_update := '0';
@ -242,19 +381,38 @@ begin
            end case;
        end loop;

-        -- Byte reversing and rotating for stores
-        -- Done in the first cycle (when l_in.valid = 1)
+        if HAS_FPU then
+            -- Single-precision FP conversion
+            v.st_sp_data := store_sp_data;
+            v.ld_sp_data := data_trimmed(31 downto 0);
+            v.ld_sp_nz := or (data_trimmed(22 downto 0));
+            v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
+        end if;
+
+        -- Byte reversing and rotating for stores.
+        -- Done in the first cycle (when l_in.valid = 1) for integer stores
+        -- and DP float stores, and in the second cycle for SP float stores.
        store_data := r.store_data;
-        if l_in.valid = '1' then
-            byte_offset := unsigned(lsu_sum(2 downto 0));
+        if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then
+            if HAS_FPU and r.state = FPR_CONV then
+                data_in := x"00000000" & r.st_sp_data;
+                byte_offset := unsigned(r.addr(2 downto 0));
+                byte_rev := r.byte_reverse;
+                length := r.length;
+            else
+                data_in := l_in.data;
+                byte_offset := unsigned(lsu_sum(2 downto 0));
+                byte_rev := l_in.byte_reverse;
+                length := l_in.length;
+            end if;
            brev_lenm1 := "000";
-            if l_in.byte_reverse = '1' then
-                brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
+            if byte_rev = '1' then
+                brev_lenm1 := unsigned(length(2 downto 0)) - 1;
            end if;
            for i in 0 to 7 loop
                k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1;
                j := to_integer(k) * 8;
-                store_data(i * 8 + 7 downto i * 8) := l_in.data(j + 7 downto j);
+                store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j);
            end loop;
        end if;
        v.store_data := store_data;
@ -289,6 +447,14 @@ begin
        case r.state is
        when IDLE =>

+        when FPR_CONV =>
+            req := '1';
+            if r.second_bytes /= "00000000" then
+                v.state := SECOND_REQ;
+            else
+                v.state := ACK_WAIT;
+            end if;
+
        when SECOND_REQ =>
            req := '1';
            v.state := ACK_WAIT;
@ -320,8 +486,13 @@ begin
                        v.load_data := data_permuted;
                    end if;
                else
-                    write_enable := r.load;
-                    if r.extra_cycle = '1' then
+                    write_enable := r.load and not r.load_sp;
+                    if HAS_FPU and r.load_sp = '1' then
+                        -- SP to DP conversion takes a cycle
+                        -- Write back rA update in this cycle if needed
+                        do_update := r.update;
+                        v.state := FINISH_LFS;
+                    elsif r.extra_cycle = '1' then
                        -- loads with rA update need an extra cycle
                        v.state := COMPLETE;
                        v.do_update := r.update;
@ -359,6 +530,9 @@ begin

        when TLBIE_WAIT =>

+        when FINISH_LFS =>
+            lfs_done := '1';
+
        when COMPLETE =>
            exception := r.align_intr;

@ -392,6 +566,7 @@ begin
            v.nc := l_in.ci;
            v.virt_mode := l_in.virt_mode;
            v.priv_mode := l_in.priv_mode;
+            v.load_sp := '0';
            v.wait_dcache := '0';
            v.wait_mmu := '0';
            v.do_update := '0';
@ -431,6 +606,27 @@ begin
                    v.align_intr := v.nc;
                    req := '1';
                    v.dcbz := '1';
+                when OP_FPSTORE =>
+                    if HAS_FPU then
+                        if l_in.is_32bit = '1' then
+                            v.state := FPR_CONV;
+                            fp_reg_conv := '1';
+                        else
+                            req := '1';
+                        end if;
+                    end if;
+                when OP_FPLOAD =>
+                    if HAS_FPU then
+                        v.load := '1';
+                        req := '1';
+                        -- Allow an extra cycle for SP->DP precision conversion
+                        -- or RA update
+                        v.extra_cycle := l_in.update;
+                        if l_in.is_32bit = '1' then
+                            v.load_sp := '1';
+                            v.extra_cycle := '1';
+                        end if;
+                    end if;
                when OP_TLBIE =>
                    mmureq := '1';
                    v.tlbie := '1';
@ -486,7 +682,7 @@ begin
                end if;
            end if;

-            v.busy := req or mmureq or mmu_mtspr;
+            v.busy := req or mmureq or mmu_mtspr or fp_reg_conv;
        end if;

        -- Update outputs to dcache
@ -523,8 +719,12 @@ begin
            l_out.write_data <= r.sprval;
        elsif do_update = '1' then
            l_out.write_enable <= '1';
-            l_out.write_reg <= r.update_reg;
+            l_out.write_reg <= gpr_to_gspr(r.update_reg);
            l_out.write_data <= r.addr;
+        elsif lfs_done = '1' then
+            l_out.write_enable <= '1';
+            l_out.write_reg <= r.write_reg;
+            l_out.write_data <= load_dp_data;
        else
            l_out.write_enable <= write_enable;
            l_out.write_reg <= r.write_reg;
--- a/microwatt.core
+++ b/microwatt.core
@ -23,6 +23,7 @@ filesets:
      - cr_hazard.vhdl
      - control.vhdl
      - execute1.vhdl
+      - fpu.vhdl
      - loadstore1.vhdl
      - mmu.vhdl
      - dcache.vhdl
@ -132,6 +133,7 @@ targets:
      - disable_flatten_core
      - log_length=2048
      - uart_is_16550
+      - has_fpu
    tools:
      vivado: {part : xc7a100tcsg324-1}
    toplevel : toplevel
@ -215,6 +217,7 @@ targets:
      - spi_flash_offset=10485760
      - log_length=2048
      - uart_is_16550
+      - has_fpu
    tools:
      vivado: {part : xc7a200tsbg484-1}
    toplevel : toplevel
@ -231,6 +234,7 @@ targets:
      - spi_flash_offset=10485760
      - log_length=2048
      - uart_is_16550
+      - has_fpu
    generate: [litedram_nexys_video]
    tools:
      vivado: {part : xc7a200tsbg484-1}
@ -249,6 +253,7 @@ targets:
      - log_length=512
      - uart_is_16550
      - has_uart1
+      - has_fpu=false
    tools:
      vivado: {part : xc7a35ticsg324-1L}
    toplevel : toplevel
@ -267,6 +272,7 @@ targets:
      - log_length=512
      - uart_is_16550
      - has_uart1
+      - has_fpu=false
    generate: [litedram_arty, liteeth_arty]
    tools:
      vivado: {part : xc7a35ticsg324-1L}
@ -285,6 +291,7 @@ targets:
      - log_length=2048
      - uart_is_16550
      - has_uart1
+      - has_fpu
    tools:
      vivado: {part : xc7a100ticsg324-1L}
    toplevel : toplevel
@ -303,6 +310,7 @@ targets:
      - log_length=2048
      - uart_is_16550
      - has_uart1
+      - has_fpu
    generate: [litedram_arty, liteeth_arty]
    tools:
      vivado: {part : xc7a100ticsg324-1L}
@ -320,6 +328,7 @@ targets:
      - disable_flatten_core
      - log_length=512
      - uart_is_16550
+      - has_fpu=false
    tools:
      vivado: {part : xc7a35tcpg236-1}
    toplevel : toplevel
@ -380,6 +389,12 @@ parameters:
    paramtype   : generic
    default     : 100000000

+  has_fpu:
+    datatype    : bool
+    description : Include a floating-point unit in the core
+    paramtype   : generic
+    default     : true
+
  disable_flatten_core:
    datatype    : bool
    description : Prevent Vivado from flattening the main core components
--- a/register_file.vhdl
+++ b/register_file.vhdl
@ -8,6 +8,7 @@ use work.common.all;
 entity register_file is
    generic (
        SIM : boolean := false;
+        HAS_FPU : boolean := true;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
@ -28,12 +29,12 @@ entity register_file is
        sim_dump      : in std_ulogic;
        sim_dump_done : out std_ulogic;

-        log_out       : out std_ulogic_vector(70 downto 0)
+        log_out       : out std_ulogic_vector(71 downto 0)
        );
 end entity register_file;

 architecture behaviour of register_file is
-    type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0);
+    type regfile is array(0 to 127) of std_ulogic_vector(63 downto 0);
    signal registers : regfile := (others => (others => '0'));
    signal rd_port_b : std_ulogic_vector(63 downto 0);
    signal dbg_data : std_ulogic_vector(63 downto 0);
@ -41,53 +42,73 @@ architecture behaviour of register_file is
 begin
    -- synchronous writes
    register_write_0: process(clk)
+        variable w_addr : gspr_index_t;
    begin
        if rising_edge(clk) then
            if w_in.write_enable = '1' then
-		if w_in.write_reg(5) = '0' then
-		    report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
-		else
-		    report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
-		end if;
+                w_addr := w_in.write_reg;
+                if HAS_FPU and w_addr(6) = '1' then
+                    report "Writing FPR " & to_hstring(w_addr(4 downto 0)) & " " & to_hstring(w_in.write_data);
+                else
+                    w_addr(6) := '0';
+                    if w_addr(5) = '0' then
+                        report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data);
+                    else
+                        report "Writing GSPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data);
+                    end if;
+                end if;
                assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
-                registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data;
+                registers(to_integer(unsigned(w_addr))) <= w_in.write_data;
            end if;
        end if;
    end process register_write_0;

    -- asynchronous reads
    register_read_0: process(all)
-        variable b_addr : gspr_index_t;
+        variable a_addr, b_addr, c_addr : gspr_index_t;
+        variable w_addr : gspr_index_t;
    begin
+        a_addr := d_in.read1_reg;
+        b_addr := d_in.read2_reg;
+        c_addr := d_in.read3_reg;
+        w_addr := w_in.write_reg;
+        if not HAS_FPU then
+            -- Make it obvious that we only want 64 GSPRs for a no-FPU implementation
+            a_addr(6) := '0';
+            b_addr(6) := '0';
+            c_addr(6) := '0';
+            w_addr(6) := '0';
+        end if;
        if d_in.read1_enable = '1' then
-            report "Reading GPR " & to_hstring(d_in.read1_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read1_reg))));
+            report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr))));
        end if;
        if d_in.read2_enable = '1' then
-            report "Reading GPR " & to_hstring(d_in.read2_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read2_reg))));
+            report "Reading GPR " & to_hstring(b_addr) & " " & to_hstring(registers(to_integer(unsigned(b_addr))));
        end if;
        if d_in.read3_enable = '1' then
-            report "Reading GPR " & to_hstring(d_in.read3_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read3_reg))));
+            report "Reading GPR " & to_hstring(c_addr) & " " & to_hstring(registers(to_integer(unsigned(c_addr))));
        end if;
-        d_out.read1_data <= registers(to_integer(unsigned(d_in.read1_reg)));
+        d_out.read1_data <= registers(to_integer(unsigned(a_addr)));
        -- B read port is multiplexed with reads from the debug circuitry
        if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then
            b_addr := dbg_gpr_addr;
-        else
-            b_addr := d_in.read2_reg;
+            if not HAS_FPU then
+                b_addr(6) := '0';
+            end if;
        end if;
        rd_port_b <= registers(to_integer(unsigned(b_addr)));
        d_out.read2_data <= rd_port_b;
-        d_out.read3_data <= registers(to_integer(unsigned(gpr_to_gspr(d_in.read3_reg))));
+        d_out.read3_data <= registers(to_integer(unsigned(c_addr)));

        -- Forward any written data
        if w_in.write_enable = '1' then
-            if d_in.read1_reg = w_in.write_reg then
+            if a_addr = w_addr then
                d_out.read1_data <= w_in.write_data;
            end if;
-            if d_in.read2_reg = w_in.write_reg then
+            if b_addr = w_addr then
                d_out.read2_data <= w_in.write_data;
            end if;
-            if gpr_to_gspr(d_in.read3_reg) = w_in.write_reg then
+            if c_addr = w_addr then
                d_out.read3_data <= w_in.write_data;
            end if;
        end if;
@ -136,7 +157,7 @@ begin
    end generate;

    rf_log: if LOG_LENGTH > 0 generate
-        signal log_data : std_ulogic_vector(70 downto 0);
+        signal log_data : std_ulogic_vector(71 downto 0);
    begin
        reg_log: process(clk)
        begin
--- a/scripts/fmt_log/fmt_log.c
+++ b/scripts/fmt_log/fmt_log.c
@ -58,7 +58,7 @@ struct log_entry {
 	u64	ls_lo_valid: 1;
 	u64	ls_eo_except: 1;
 	u64	ls_stall_out: 1;
-	u64	pad2: 2;
+	u64	pad2: 1;
 	u64	dc_state: 3;
 	u64	dc_ra_valid: 1;
 	u64	dc_tlb_way: 3;
@ -74,7 +74,7 @@ struct log_entry {
 	u64	cr_wr_mask: 8;
 	u64	cr_wr_data: 4;
 	u64	cr_wr_enable: 1;
-	u64	reg_wr_reg: 6;
+	u64	reg_wr_reg: 7;
 	u64	reg_wr_enable: 1;

 	u64	reg_wr_data;
@ -84,17 +84,17 @@ struct log_entry {
 #define FLGA(i, y, z)	(log.i? y: z)
 #define PNIA(f)		(full_nia[log.f] & 0xff)

-const char *units[4] = { "--", "al", "ls", "?3" };
+const char *units[4] = { "--", "al", "ls", "fp" };
 const char *ops[64] =
 {
 	"illegal", "nop    ", "add    ", "and    ", "attn   ", "b      ", "bc     ", "bcreg  ",
 	"bperm  ", "cmp    ", "cmpb   ", "cmpeqb ", "cmprb  ", "cntz   ", "crop   ", "darn   ",
 	"dcbf   ", "dcbst  ", "dcbt   ", "dcbtst ", "dcbz   ", "div    ", "dive   ", "exts   ",
-	"extswsl", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ", "st     ", "mcrxrx ",
-	"mfcr   ", "mfmsr  ", "mfspr  ", "mod    ", "mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ",
-	"mulh64 ", "mulh32 ", "or     ", "popcnt ", "prty   ", "rfid   ", "rlc    ", "rlcl   ",
-	"rlcr   ", "sc     ", "setb   ", "shl    ", "shr    ", "sync   ", "tlbie  ", "trap   ",
-	"xor    ", "bcd    ", "addg6s ", "ffail  ", "?60    ", "?61    ", "?62    ", "?63    "
+	"extswsl", "fpop   ", "fpopi  ", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ",
+	"st     ", "fpload ", "fpstore", "mcrxrx ", "mfcr   ", "mfmsr  ", "mfspr  ", "mod    ",
+	"mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ", "mulh64 ", "mulh32 ", "or     ", "popcnt ",
+	"prty   ", "rfid   ", "rlc    ", "rlcl   ", "rlcr   ", "sc     ", "setb   ", "shl    ",
+	"shr    ", "sync   ", "tlbie  ", "trap   ", "xor    ", "bcd    ", "addg6s ", "ffail  ",
 };

 const char *spr_names[13] =
--- a/soc.vhdl
+++ b/soc.vhdl
@ -52,6 +52,7 @@ entity soc is
 	RAM_INIT_FILE      : string;
 	CLK_FREQ           : positive;
 	SIM                : boolean;
+        HAS_FPU            : boolean := true;
 	DISABLE_FLATTEN_CORE : boolean := false;
 	HAS_DRAM           : boolean  := false;
 	DRAM_SIZE          : integer := 0;
@ -253,6 +254,7 @@ begin
    processor: entity work.core
 	generic map(
 	    SIM => SIM,
+            HAS_FPU => HAS_FPU,
 	    DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
 	    ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'),
            LOG_LENGTH => LOG_LENGTH
--- a/tests/fpu/Makefile
+++ b/tests/fpu/Makefile
@ -0,0 +1,3 @@
+TEST=fpu
+
+include ../Makefile.test
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
--- a/tests/fpu/head.S
+++ b/tests/fpu/head.S
@ -0,0 +1,132 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Load an immediate 64-bit value into a register */
+#define LOAD_IMM64(r, e)			\
+	lis     r,(e)@highest;			\
+	ori     r,r,(e)@higher;			\
+	rldicr  r,r, 32, 31;			\
+	oris    r,r, (e)@h;			\
+	ori     r,r, (e)@l;
+
+	.section ".head","ax"
+
+	/*
+	 * Microwatt currently enters in LE mode at 0x0, so we don't need to
+	 * do any endian fix ups
+	 */
+	. = 0
+.global _start
+_start:
+	LOAD_IMM64(%r10,__bss_start)
+	LOAD_IMM64(%r11,__bss_end)
+	subf	%r11,%r10,%r11
+	addi	%r11,%r11,63
+	srdi.	%r11,%r11,6
+	beq	2f
+	mtctr	%r11
+1:	dcbz	0,%r10
+	addi	%r10,%r10,64
+	bdnz	1b
+
+2:	LOAD_IMM64(%r1,__stack_top)
+	li	%r0,0
+	stdu	%r0,-16(%r1)
+	LOAD_IMM64(%r10, die)
+	mtsprg0	%r10
+	LOAD_IMM64(%r12, main)
+	mtctr	%r12
+	bctrl
+die:	attn // terminate on exit
+	b .
+
+.global trapit
+trapit:
+	mflr	%r0
+	std	%r0,16(%r1)
+	stdu	%r1,-256(%r1)
+	mtsprg1	%r1
+	r = 14
+	.rept	18
+	std	r,r*8(%r1)
+	r = r + 1
+	.endr
+	mfcr	%r0
+	stw	%r0,13*8(%r1)
+	LOAD_IMM64(%r10, ret)
+	mtsprg0	%r10
+	mr	%r12,%r4
+	mtctr	%r4
+	bctrl
+ret:
+	mfsprg1	%r1
+	LOAD_IMM64(%r10, die)
+	mtsprg0	%r10
+	r = 14
+	.rept	18
+	ld	r,r*8(%r1)
+	r = r + 1
+	.endr
+	lwz	%r0,13*8(%r1)
+	mtcr	%r0
+	ld	%r0,256+16(%r1)
+	addi	%r1,%r1,256
+	mtlr	%r0
+	blr
+
+	.global do_rfid
+do_rfid:
+	mtsrr1	%r3
+	LOAD_IMM64(%r4, do_blr)
+	mtsrr0	%r4
+	rfid
+	blr
+
+	.global do_blr
+do_blr:
+	blr
+
+#define EXCEPTION(nr)		\
+	.= nr			;\
+	mfsprg0	%r0		;\
+	mtctr	%r0		;\
+	li	%r3,nr		;\
+	bctr
+
+	EXCEPTION(0x300)
+	EXCEPTION(0x380)
+	EXCEPTION(0x400)
+	EXCEPTION(0x480)
+	EXCEPTION(0x500)
+	EXCEPTION(0x600)
+	EXCEPTION(0x700)
+	EXCEPTION(0x800)
+	EXCEPTION(0x900)
+	EXCEPTION(0x980)
+	EXCEPTION(0xa00)
+	EXCEPTION(0xb00)
+	EXCEPTION(0xc00)
+	EXCEPTION(0xd00)
+	EXCEPTION(0xe00)
+	EXCEPTION(0xe20)
+	EXCEPTION(0xe40)
+	EXCEPTION(0xe60)
+	EXCEPTION(0xe80)
+	EXCEPTION(0xf00)
+	EXCEPTION(0xf20)
+	EXCEPTION(0xf40)
+	EXCEPTION(0xf60)
+	EXCEPTION(0xf80)
--- a/tests/fpu/powerpc.lds
+++ b/tests/fpu/powerpc.lds
@ -0,0 +1,27 @@
+SECTIONS
+{
+	. = 0;
+	_start = .;
+	.head : {
+		KEEP(*(.head))
+	}
+	. = ALIGN(0x1000);
+	.text : { *(.text) *(.text.*) *(.rodata) *(.rodata.*) }
+	. = ALIGN(0x1000);
+	.data : { *(.data) *(.data.*) *(.got) *(.toc) }
+	. = ALIGN(0x80);
+	__bss_start = .;
+	.bss : {
+		*(.dynsbss)
+		*(.sbss)
+		*(.scommon)
+		*(.dynbss)
+		*(.bss)
+		*(.common)
+		*(.bss.*)
+	}
+	. = ALIGN(0x80);
+	__bss_end = .;
+	. = . + 0x4000;
+	__stack_top = .;
+}
--- a/tests/test_fpu.bin
+++ b/tests/test_fpu.bin
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@ -0,0 +1,23 @@
+test 01:PASS
+test 02:PASS
+test 03:PASS
+test 04:PASS
+test 05:PASS
+test 06:PASS
+test 07:PASS
+test 08:PASS
+test 09:PASS
+test 10:PASS
+test 11:PASS
+test 12:PASS
+test 13:PASS
+test 14:PASS
+test 15:PASS
+test 16:PASS
+test 17:PASS
+test 18:PASS
+test 19:PASS
+test 20:PASS
+test 21:PASS
+test 22:PASS
+test 23:PASS
--- a/tests/update_console_tests
+++ b/tests/update_console_tests
@ -3,7 +3,7 @@
 # Script to update console related tests from source
 #

-for i in sc illegal decrementer xics privileged mmu misc modes reservation trace ; do
+for i in sc illegal decrementer xics privileged mmu misc modes reservation trace fpu ; do
    cd $i
    make
    cd -
--- a/writeback.vhdl
+++ b/writeback.vhdl
@ -12,6 +12,7 @@ entity writeback is

        e_in         : in Execute1ToWritebackType;
        l_in         : in Loadstore1ToWritebackType;
+        fp_in        : in FPUToWritebackType;

        w_out        : out WritebackToRegisterFileType;
        c_out        : out WritebackToCrFileType;
@ -31,15 +32,21 @@ begin
            -- Do consistency checks only on the clock edge
            x(0) := e_in.valid;
            y(0) := l_in.valid;
-            assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
+            w(0) := fp_in.valid;
+            assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) +
+                    to_integer(unsigned(w))) <= 1 severity failure;

            x(0) := e_in.write_enable or e_in.exc_write_enable;
            y(0) := l_in.write_enable;
-            assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
+            w(0) := fp_in.write_enable;
+            assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) +
+                    to_integer(unsigned(w))) <= 1 severity failure;

            w(0) := e_in.write_cr_enable;
            x(0) := (e_in.write_enable and e_in.rc);
-            assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure;
+            y(0) := fp_in.write_cr_enable;
+            assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) +
+                    to_integer(unsigned(y))) <= 1 severity failure;
        end if;
    end process;

@ -53,7 +60,7 @@ begin
        c_out <= WritebackToCrFileInit;

        complete_out <= '0';
-        if e_in.valid = '1' or l_in.valid = '1' then
+        if e_in.valid = '1' or l_in.valid = '1' or fp_in.valid = '1' then
            complete_out <= '1';
        end if;

@ -79,8 +86,20 @@ begin
                c_out.write_xerc_data <= e_in.xerc;
            end if;

+            if fp_in.write_enable = '1' then
+                w_out.write_reg <= fp_in.write_reg;
+                w_out.write_data <= fp_in.write_data;
+                w_out.write_enable <= '1';
+            end if;
+
+            if fp_in.write_cr_enable = '1' then
+                c_out.write_cr_enable <= '1';
+                c_out.write_cr_mask <= fp_in.write_cr_mask;
+                c_out.write_cr_data <= fp_in.write_cr_data;
+            end if;
+
            if l_in.write_enable = '1' then
-                w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
+                w_out.write_reg <= l_in.write_reg;
                w_out.write_data <= l_in.write_data;
                w_out.write_enable <= '1';
            end if;