core: Add support for single-precision FP loads and stores

This adds code to loadstore1 to convert between single-precision and double-precision formats, and implements the lfs* and stfs* instructions. The conversion processes are described in Power ISA v3.1 Book 1 sections 4.6.2 and 4.6.3. These conversions take one cycle, so lfs* and stfs* are one cycle slower than lfd* and stfd*. Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago · 9d285a265c
parent bcac4b9b2f
commit 9d285a265c
6 changed files with 263 additions and 57 deletions
--- a/common.vhdl
+++ b/common.vhdl
@ -287,6 +287,7 @@ package common is
        virt_mode : std_ulogic;                         -- do translation through TLB
        priv_mode : std_ulogic;                         -- privileged mode (MSR[PR] = 0)
        mode_32bit : std_ulogic;                        -- trim addresses to 32 bits
        is_32bit : std_ulogic;
    end record;
    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
                                                                     sign_extend => '0', update => '0', xerc => xerc_init,
@ -294,7 +295,7 @@ package common is
                                                                     nia => (others => '0'), insn => (others => '0'),
                                                                     addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
                                                                     write_reg => (others => '0'), length => (others => '0'),
-                                                                     mode_32bit => '0', others => (others => '0'));
+                                                                     mode_32bit => '0', is_32bit => '0', others => (others => '0'));
    type Loadstore1ToExecute1Type is record
        busy : std_ulogic;
--- a/countzero.vhdl
+++ b/countzero.vhdl
@ -3,6 +3,7 @@ use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 library work;
 use work.helpers.all;
 entity zero_counter is
    port (
@ -15,42 +16,6 @@ entity zero_counter is
 end entity zero_counter;
 architecture behaviour of zero_counter is
    -- Reverse the order of bits in a word
    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
        variable ret: std_ulogic_vector(a'left downto a'right);
    begin
        for i in a'right to a'left loop
            ret(a'left + a'right - i) := a(i);
        end loop;
        return ret;
    end;
    -- If there is only one bit set in a doubleword, return its bit number
    -- (counting from the right).  Each bit of the result is obtained by
    -- ORing together 32 bits of the input:
    --  bit 0 = a[1] or a[3] or a[5] or ...
    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
    --  bit 2 = a[4..7] or a[12..15] or ...
    --  bit 5 = a[32..63] ORed together
    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
        variable ret: std_ulogic_vector(5 downto 0);
        variable stride: natural;
        variable bit: std_ulogic;
        variable k: natural;
    begin
        stride := 2;
        for i in 0 to 5 loop
            bit := '0';
            for j in 0 to (64 / stride) - 1 loop
                k := j * stride;
                bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
            end loop;
            ret(i) := bit;
            stride := stride * 2;
        end loop;
        return ret;
    end;
    signal inp : std_ulogic_vector(63 downto 0);
    signal sum : std_ulogic_vector(64 downto 0);
    signal msb_r : std_ulogic;
--- a/decode1.vhdl
+++ b/decode1.vhdl
@ -74,8 +74,8 @@ architecture behaviour of decode1 is
        35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu
        50 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd
        51 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu
--      48 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs
+        48 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs
--      49 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu
+        49 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu
        42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha
        43 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau
        40 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz
@ -93,8 +93,8 @@ architecture behaviour of decode1 is
        39 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu
        54 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd
        55 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu
--      52 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs
+        52 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs
--      53 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu
+        53 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu
        44 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth
        45 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu
        36 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw
@ -284,8 +284,8 @@ architecture behaviour of decode1 is
        2#1001110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux
        2#1101010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax
        2#1101110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx
--      2#1000010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx
+        2#1000010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx
--      2#1000110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux
+        2#1000110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux
        2#0001110100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx
        2#0101110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux
        2#0101010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax
@ -367,8 +367,8 @@ architecture behaviour of decode1 is
        2#1011010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx
        2#1011110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdux
        2#1111010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx
--      2#1010010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx
+        2#1010010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx
--      2#1010110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux
+        2#1010110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux
        2#1110010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx
        2#1110110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix
        2#1011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- sthcx
--- a/execute1.vhdl
+++ b/execute1.vhdl
@ -1259,6 +1259,7 @@ begin
        lv.virt_mode := ctrl.msr(MSR_DR);
        lv.priv_mode := not ctrl.msr(MSR_PR);
        lv.mode_32bit := not ctrl.msr(MSR_SF);
        lv.is_32bit := e_in.is_32bit;
 	-- Update registers
 	rin <= v;
--- a/helpers.vhdl
+++ b/helpers.vhdl
@ -25,6 +25,10 @@ package helpers is
    function byte_reverse(val: std_ulogic_vector(63 downto 0); size: integer) return std_ulogic_vector;
    function sign_extend(val: std_ulogic_vector(63 downto 0); size: natural) return std_ulogic_vector;
    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector;
    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector;
 end package helpers;
 package body helpers is
@ -206,4 +210,53 @@ package body helpers is
        return std_ulogic_vector(ret);
    end;
    -- Reverse the order of bits in a word
    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
        variable ret: std_ulogic_vector(a'left downto a'right);
    begin
        for i in a'right to a'left loop
            ret(a'left + a'right - i) := a(i);
        end loop;
        return ret;
    end;
    -- If there is only one bit set in a doubleword, return its bit number
    -- (counting from the right).  Each bit of the result is obtained by
    -- ORing together 32 bits of the input:
    --  bit 0 = a[1] or a[3] or a[5] or ...
    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
    --  bit 2 = a[4..7] or a[12..15] or ...
    --  bit 5 = a[32..63] ORed together
    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
        variable ret: std_ulogic_vector(5 downto 0);
        variable stride: natural;
        variable bit: std_ulogic;
        variable k: natural;
    begin
        stride := 2;
        for i in 0 to 5 loop
            bit := '0';
            for j in 0 to (64 / stride) - 1 loop
                k := j * stride;
                bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
            end loop;
            ret(i) := bit;
            stride := stride * 2;
        end loop;
        return ret;
    end;
    -- Count leading zeroes operation
    -- Assumes the value passed in is not zero (if it is, zero is returned)
    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is
        variable rev: std_ulogic_vector(val'left downto val'right);
        variable sum: std_ulogic_vector(val'left downto val'right);
        variable onehot: std_ulogic_vector(val'left downto val'right);
    begin
        rev := bit_reverse(val);
        sum := std_ulogic_vector(- signed(rev));
        onehot := sum and rev;
        return bit_number(std_ulogic_vector(resize(unsigned(onehot), 64)));
    end;
 end package body helpers;
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@ -45,10 +45,12 @@ architecture behave of loadstore1 is
    -- State machine for unaligned loads/stores
    type state_t is (IDLE,              -- ready for instruction
                     FPR_CONV,          -- converting double to float for store
                     SECOND_REQ,        -- send 2nd request of unaligned xfer
                     ACK_WAIT,          -- waiting for ack from dcache
                     MMU_LOOKUP,        -- waiting for MMU to look up translation
                     TLBIE_WAIT,        -- waiting for MMU to finish doing a tlbie
                     FINISH_LFS,        -- write back converted SP data for lfs*
                     COMPLETE           -- extra cycle to complete an operation
                     );
@ -89,6 +91,11 @@ architecture behave of loadstore1 is
        do_update    : std_ulogic;
        extra_cycle  : std_ulogic;
        mode_32bit   : std_ulogic;
        load_sp      : std_ulogic;
        ld_sp_data   : std_ulogic_vector(31 downto 0);
        ld_sp_nz     : std_ulogic;
        ld_sp_lz     : std_ulogic_vector(5 downto 0);
        st_sp_data   : std_ulogic_vector(31 downto 0);
    end record;
    type byte_sel_t is array(0 to 7) of std_ulogic;
@ -98,6 +105,9 @@ architecture behave of loadstore1 is
    signal r, rin : reg_stage_t;
    signal lsu_sum : std_ulogic_vector(63 downto 0);
    signal store_sp_data : std_ulogic_vector(31 downto 0);
    signal load_dp_data  : std_ulogic_vector(63 downto 0);
    -- Generate byte enables from sizes
    function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
    begin
@ -128,6 +138,72 @@ architecture behave of loadstore1 is
 					    to_integer(unsigned(address))));
    end function xfer_data_sel;
    -- 23-bit right shifter for DP -> SP float conversions
    function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
        return std_ulogic_vector is
        variable fs1   : std_ulogic_vector(22 downto 0);
        variable fs2   : std_ulogic_vector(22 downto 0);
    begin
        case shift(1 downto 0) is
            when "00" =>
                fs1 := frac;
            when "01" =>
                fs1 := '0' & frac(22 downto 1);
            when "10" =>
                fs1 := "00" & frac(22 downto 2);
            when others =>
                fs1 := "000" & frac(22 downto 3);
        end case;
        case shift(4 downto 2) is
            when "000" =>
                fs2 := fs1;
            when "001" =>
                fs2 := x"0" & fs1(22 downto 4);
            when "010" =>
                fs2 := x"00" & fs1(22 downto 8);
            when "011" =>
                fs2 := x"000" & fs1(22 downto 12);
            when "100" =>
                fs2 := x"0000" & fs1(22 downto 16);
            when others =>
                fs2 := x"00000" & fs1(22 downto 20);
        end case;
        return fs2;
    end;
    -- 23-bit left shifter for SP -> DP float conversions
    function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
        return std_ulogic_vector is
        variable fs1   : std_ulogic_vector(22 downto 0);
        variable fs2   : std_ulogic_vector(22 downto 0);
    begin
        case shift(1 downto 0) is
            when "00" =>
                fs1 := frac;
            when "01" =>
                fs1 := frac(21 downto 0) & '0';
            when "10" =>
                fs1 := frac(20 downto 0) & "00";
            when others =>
                fs1 := frac(19 downto 0) & "000";
        end case;
        case shift(4 downto 2) is
            when "000" =>
                fs2 := fs1;
            when "001" =>
                fs2 := fs1(18 downto 0) & x"0" ;
            when "010" =>
                fs2 := fs1(14 downto 0) & x"00";
            when "011" =>
                fs2 := fs1(10 downto 0) & x"000";
            when "100" =>
                fs2 := fs1(6 downto 0) & x"0000";
            when others =>
                fs2 := fs1(2 downto 0) & x"00000";
        end case;
        return fs2;
    end;
 begin
    -- Calculate the address in the first cycle
    lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
@ -145,6 +221,59 @@ begin
        end if;
    end process;
    ls_fp_conv: if HAS_FPU generate
        -- Convert DP data to SP for stfs
        dp_to_sp: process(all)
            variable exp   : unsigned(10 downto 0);
            variable frac  : std_ulogic_vector(22 downto 0);
            variable shift : unsigned(4 downto 0);
        begin
            store_sp_data(31) <= l_in.data(63);
            store_sp_data(30 downto 0) <= (others => '0');
            exp := unsigned(l_in.data(62 downto 52));
            if exp > 896 then
                store_sp_data(30) <= l_in.data(62);
                store_sp_data(29 downto 0) <= l_in.data(58 downto 29);
            elsif exp >= 874 then
                -- denormalization required
                frac := '1' & l_in.data(51 downto 30);
                shift := 0 - exp(4 downto 0);
                store_sp_data(22 downto 0) <= shifter_23r(frac, shift);
            end if;
        end process;
        -- Convert SP data to DP for lfs
        sp_to_dp: process(all)
            variable exp     : unsigned(7 downto 0);
            variable exp_dp  : unsigned(10 downto 0);
            variable exp_nz  : std_ulogic;
            variable exp_ao  : std_ulogic;
            variable frac    : std_ulogic_vector(22 downto 0);
            variable frac_shift : unsigned(4 downto 0);
        begin
            frac := r.ld_sp_data(22 downto 0);
            exp := unsigned(r.ld_sp_data(30 downto 23));
            exp_nz := or (r.ld_sp_data(30 downto 23));
            exp_ao := and (r.ld_sp_data(30 downto 23));
            frac_shift := (others => '0');
            if exp_ao = '1' then
                exp_dp := to_unsigned(2047, 11);    -- infinity or NaN
            elsif exp_nz = '1' then
                exp_dp := 896 + resize(exp, 11);    -- finite normalized value
            elsif r.ld_sp_nz = '0' then
                exp_dp := to_unsigned(0, 11);       -- zero
            else
                -- denormalized SP operand, need to normalize
                exp_dp := 896 - resize(unsigned(r.ld_sp_lz), 11);
                frac_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1;
            end if;
            load_dp_data(63) <= r.ld_sp_data(31);
            load_dp_data(62 downto 52) <= std_ulogic_vector(exp_dp);
            load_dp_data(51 downto 29) <= shifter_23l(frac, frac_shift);
            load_dp_data(28 downto 0) <= (others => '0');
        end process;
    end generate;
    loadstore1_1: process(all)
        variable v : reg_stage_t;
        variable brev_lenm1 : unsigned(2 downto 0);
@ -165,6 +294,9 @@ begin
        variable data_permuted : std_ulogic_vector(63 downto 0);
        variable data_trimmed : std_ulogic_vector(63 downto 0);
        variable store_data : std_ulogic_vector(63 downto 0);
        variable data_in : std_ulogic_vector(63 downto 0);
        variable byte_rev : std_ulogic;
        variable length : std_ulogic_vector(3 downto 0);
        variable use_second : byte_sel_t;
        variable trim_ctl : trim_ctl_t;
        variable negative : std_ulogic;
@ -176,6 +308,8 @@ begin
        variable mmu_mtspr : std_ulogic;
        variable itlb_fault : std_ulogic;
        variable misaligned : std_ulogic;
        variable fp_reg_conv : std_ulogic;
        variable lfs_done : std_ulogic;
    begin
        v := r;
        req := '0';
@ -185,8 +319,10 @@ begin
        sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
        dsisr := (others => '0');
        mmureq := '0';
        fp_reg_conv := '0';
        write_enable := '0';
        lfs_done := '0';
        do_update := r.do_update;
        v.do_update := '0';
@ -245,19 +381,38 @@ begin
            end case;
        end loop;
-        -- Byte reversing and rotating for stores
+        if HAS_FPU then
-        -- Done in the first cycle (when l_in.valid = 1)
+            -- Single-precision FP conversion
            v.st_sp_data := store_sp_data;
            v.ld_sp_data := data_trimmed(31 downto 0);
            v.ld_sp_nz := or (data_trimmed(22 downto 0));
            v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
        end if;
        -- Byte reversing and rotating for stores.
        -- Done in the first cycle (when l_in.valid = 1) for integer stores
        -- and DP float stores, and in the second cycle for SP float stores.
        store_data := r.store_data;
-        if l_in.valid = '1' then
+        if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then
            if HAS_FPU and r.state = FPR_CONV then
                data_in := x"00000000" & r.st_sp_data;
                byte_offset := unsigned(r.addr(2 downto 0));
                byte_rev := r.byte_reverse;
                length := r.length;
            else
                data_in := l_in.data;
                byte_offset := unsigned(lsu_sum(2 downto 0));
                byte_rev := l_in.byte_reverse;
                length := l_in.length;
            end if;
            brev_lenm1 := "000";
-            if l_in.byte_reverse = '1' then
+            if byte_rev = '1' then
-                brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
+                brev_lenm1 := unsigned(length(2 downto 0)) - 1;
            end if;
            for i in 0 to 7 loop
                k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1;
                j := to_integer(k) * 8;
-                store_data(i * 8 + 7 downto i * 8) := l_in.data(j + 7 downto j);
+                store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j);
            end loop;
        end if;
        v.store_data := store_data;
@ -292,6 +447,14 @@ begin
        case r.state is
        when IDLE =>
        when FPR_CONV =>
            req := '1';
            if r.second_bytes /= "00000000" then
                v.state := SECOND_REQ;
            else
                v.state := ACK_WAIT;
            end if;
        when SECOND_REQ =>
            req := '1';
            v.state := ACK_WAIT;
@ -323,8 +486,13 @@ begin
                        v.load_data := data_permuted;
                    end if;
                else
-                    write_enable := r.load;
+                    write_enable := r.load and not r.load_sp;
-                    if r.extra_cycle = '1' then
+                    if HAS_FPU and r.load_sp = '1' then
                        -- SP to DP conversion takes a cycle
                        -- Write back rA update in this cycle if needed
                        do_update := r.update;
                        v.state := FINISH_LFS;
                    elsif r.extra_cycle = '1' then
                        -- loads with rA update need an extra cycle
                        v.state := COMPLETE;
                        v.do_update := r.update;
@ -362,6 +530,9 @@ begin
        when TLBIE_WAIT =>
        when FINISH_LFS =>
            lfs_done := '1';
        when COMPLETE =>
            exception := r.align_intr;
@ -395,6 +566,7 @@ begin
            v.nc := l_in.ci;
            v.virt_mode := l_in.virt_mode;
            v.priv_mode := l_in.priv_mode;
            v.load_sp := '0';
            v.wait_dcache := '0';
            v.wait_mmu := '0';
            v.do_update := '0';
@ -436,14 +608,24 @@ begin
                    v.dcbz := '1';
                when OP_FPSTORE =>
                    if HAS_FPU then
                        if l_in.is_32bit = '1' then
                            v.state := FPR_CONV;
                            fp_reg_conv := '1';
                        else
                            req := '1';
                        end if;
                    end if;
                when OP_FPLOAD =>
                    if HAS_FPU then
                        v.load := '1';
                        req := '1';
-                        -- Allow an extra cycle for RA update
+                        -- Allow an extra cycle for SP->DP precision conversion
                        -- or RA update
                        v.extra_cycle := l_in.update;
                        if l_in.is_32bit = '1' then
                            v.load_sp := '1';
                            v.extra_cycle := '1';
                        end if;
                    end if;
                when OP_TLBIE =>
                    mmureq := '1';
@ -500,7 +682,7 @@ begin
                end if;
            end if;
-            v.busy := req or mmureq or mmu_mtspr;
+            v.busy := req or mmureq or mmu_mtspr or fp_reg_conv;
        end if;
        -- Update outputs to dcache
@ -539,6 +721,10 @@ begin
            l_out.write_enable <= '1';
            l_out.write_reg <= gpr_to_gspr(r.update_reg);
            l_out.write_data <= r.addr;
        elsif lfs_done = '1' then
            l_out.write_enable <= '1';
            l_out.write_reg <= r.write_reg;
            l_out.write_data <= load_dp_data;
        else
            l_out.write_enable <= write_enable;
            l_out.write_reg <= r.write_reg;