From 9d285a265cf9fab8f5f17d6d4588d9545e555e68 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 28 Aug 2020 13:35:05 +1000
Subject: [PATCH] core: Add support for single-precision FP loads and stores

This adds code to loadstore1 to convert between single-precision and
double-precision formats, and implements the lfs* and stfs*
instructions.  The conversion processes are described in Power ISA
v3.1 Book 1 sections 4.6.2 and 4.6.3.

These conversions take one cycle, so lfs* and stfs* are one cycle
slower than lfd* and stfd*.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl     |   3 +-
 countzero.vhdl  |  37 +--------
 decode1.vhdl    |  16 ++--
 execute1.vhdl   |   1 +
 helpers.vhdl    |  53 ++++++++++++
 loadstore1.vhdl | 210 +++++++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 263 insertions(+), 57 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 14bdcf7..e1ba844 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -287,6 +287,7 @@ package common is
         virt_mode : std_ulogic;                         -- do translation through TLB
         priv_mode : std_ulogic;                         -- privileged mode (MSR[PR] = 0)
         mode_32bit : std_ulogic;                        -- trim addresses to 32 bits
+        is_32bit : std_ulogic;
     end record;
     constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
                                                                      sign_extend => '0', update => '0', xerc => xerc_init,
@@ -294,7 +295,7 @@ package common is
                                                                      nia => (others => '0'), insn => (others => '0'),
                                                                      addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
                                                                      write_reg => (others => '0'), length => (others => '0'),
-                                                                     mode_32bit => '0', others => (others => '0'));
+                                                                     mode_32bit => '0', is_32bit => '0', others => (others => '0'));
 
     type Loadstore1ToExecute1Type is record
         busy : std_ulogic;
diff --git a/countzero.vhdl b/countzero.vhdl
index 18aa043..b46f108 100644
--- a/countzero.vhdl
+++ b/countzero.vhdl
@@ -3,6 +3,7 @@ use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 
 library work;
+use work.helpers.all;
 
 entity zero_counter is
     port (
@@ -15,42 +16,6 @@ entity zero_counter is
 end entity zero_counter;
 
 architecture behaviour of zero_counter is
-    -- Reverse the order of bits in a word
-    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
-        variable ret: std_ulogic_vector(a'left downto a'right);
-    begin
-        for i in a'right to a'left loop
-            ret(a'left + a'right - i) := a(i);
-        end loop;
-        return ret;
-    end;
-
-    -- If there is only one bit set in a doubleword, return its bit number
-    -- (counting from the right).  Each bit of the result is obtained by
-    -- ORing together 32 bits of the input:
-    --  bit 0 = a[1] or a[3] or a[5] or ...
-    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
-    --  bit 2 = a[4..7] or a[12..15] or ...
-    --  bit 5 = a[32..63] ORed together
-    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
-        variable ret: std_ulogic_vector(5 downto 0);
-        variable stride: natural;
-        variable bit: std_ulogic;
-        variable k: natural;
-    begin
-        stride := 2;
-        for i in 0 to 5 loop
-            bit := '0';
-            for j in 0 to (64 / stride) - 1 loop
-                k := j * stride;
-                bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
-            end loop;
-            ret(i) := bit;
-            stride := stride * 2;
-        end loop;
-        return ret;
-    end;
-
     signal inp : std_ulogic_vector(63 downto 0);
     signal sum : std_ulogic_vector(64 downto 0);
     signal msb_r : std_ulogic;
diff --git a/decode1.vhdl b/decode1.vhdl
index 75da175..29f0e50 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -74,8 +74,8 @@ architecture behaviour of decode1 is
         35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu
         50 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd
         51 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu
---      48 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs
---      49 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu
+        48 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs
+        49 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu
         42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha
         43 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau
         40 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz
@@ -93,8 +93,8 @@ architecture behaviour of decode1 is
         39 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu
         54 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd
         55 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu
---      52 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs
---      53 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu
+        52 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs
+        53 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu
         44 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth
         45 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu
         36 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw
@@ -284,8 +284,8 @@ architecture behaviour of decode1 is
         2#1001110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux
         2#1101010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax
         2#1101110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx
---      2#1000010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx
---      2#1000110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux
+        2#1000010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx
+        2#1000110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux
         2#0001110100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx
         2#0101110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux
         2#0101010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax
@@ -367,8 +367,8 @@ architecture behaviour of decode1 is
         2#1011010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx
         2#1011110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdux
         2#1111010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx
---      2#1010010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx
---      2#1010110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux
+        2#1010010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx
+        2#1010110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux
         2#1110010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx
         2#1110110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix
         2#1011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- sthcx
diff --git a/execute1.vhdl b/execute1.vhdl
index 4d6a9cc..9d9b711 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -1259,6 +1259,7 @@ begin
         lv.virt_mode := ctrl.msr(MSR_DR);
         lv.priv_mode := not ctrl.msr(MSR_PR);
         lv.mode_32bit := not ctrl.msr(MSR_SF);
+        lv.is_32bit := e_in.is_32bit;
 
 	-- Update registers
 	rin <= v;
diff --git a/helpers.vhdl b/helpers.vhdl
index fe91938..834e386 100644
--- a/helpers.vhdl
+++ b/helpers.vhdl
@@ -25,6 +25,10 @@ package helpers is
     function byte_reverse(val: std_ulogic_vector(63 downto 0); size: integer) return std_ulogic_vector;
 
     function sign_extend(val: std_ulogic_vector(63 downto 0); size: natural) return std_ulogic_vector;
+
+    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector;
+    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
+    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector;
 end package helpers;
 
 package body helpers is
@@ -206,4 +210,53 @@ package body helpers is
         return std_ulogic_vector(ret);
 
     end;
+
+    -- Reverse the order of bits in a word (expects a descending "downto" range)
+    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(a'left downto a'right);  -- same index range as a
+    begin
+        for i in a'right to a'left loop
+            ret(a'left + a'right - i) := a(i);  -- mirror index i within the range
+        end loop;
+        return ret;
+    end;
+
+    -- If there is only one bit set in a doubleword, return its bit number
+    -- (counting from the right) as a 6-bit vector.  Each bit of the result
+    -- is obtained by ORing together 32 bits of the input:
+    --  bit 0 = a[1] or a[3] or a[5] or ...
+    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
+    --  bit 2 = a[4..7] or a[12..15] or ...
+    --  bit 5 = a[32..63] ORed together
+    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(5 downto 0);
+        variable stride: natural;  -- group size for the current result bit: 2, 4, ..., 64
+        variable bit: std_ulogic;
+        variable k: natural;  -- lowest input index of the current group
+    begin
+        stride := 2;
+        for i in 0 to 5 loop
+            bit := '0';  -- an all-zero input therefore yields a zero result
+            for j in 0 to (64 / stride) - 1 loop
+                k := j * stride;
+                bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));  -- upper half of the group
+            end loop;
+            ret(i) := bit;
+            stride := stride * 2;
+        end loop;
+        return ret;
+    end;
+
+    -- Count leading zeroes operation; the result is the 6-bit value from bit_number
+    -- Assumes the value passed in is not zero (if it is, zero is returned)
+    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is
+        variable rev: std_ulogic_vector(val'left downto val'right);
+        variable sum: std_ulogic_vector(val'left downto val'right);
+        variable onehot: std_ulogic_vector(val'left downto val'right);
+    begin
+        rev := bit_reverse(val);  -- leading zeroes of val become trailing zeroes of rev
+        sum := std_ulogic_vector(- signed(rev));  -- two's complement negation of rev
+        onehot := sum and rev;  -- (-x and x) keeps only the lowest set bit of rev
+        return bit_number(std_ulogic_vector(resize(unsigned(onehot), 64)));  -- zero-extend to bit_number's 64-bit input
+    end;
 end package body helpers;
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index ec20319..919ba0e 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -45,10 +45,12 @@ architecture behave of loadstore1 is
 
     -- State machine for unaligned loads/stores
     type state_t is (IDLE,              -- ready for instruction
+                     FPR_CONV,          -- converting double to float for store
                      SECOND_REQ,        -- send 2nd request of unaligned xfer
                      ACK_WAIT,          -- waiting for ack from dcache
                      MMU_LOOKUP,        -- waiting for MMU to look up translation
                      TLBIE_WAIT,        -- waiting for MMU to finish doing a tlbie
+                     FINISH_LFS,        -- write back converted SP data for lfs*
                      COMPLETE           -- extra cycle to complete an operation
                      );
 
@@ -89,6 +91,11 @@ architecture behave of loadstore1 is
         do_update    : std_ulogic;
         extra_cycle  : std_ulogic;
         mode_32bit   : std_ulogic;
+        load_sp      : std_ulogic;
+        ld_sp_data   : std_ulogic_vector(31 downto 0);
+        ld_sp_nz     : std_ulogic;
+        ld_sp_lz     : std_ulogic_vector(5 downto 0);
+        st_sp_data   : std_ulogic_vector(31 downto 0);
     end record;
 
     type byte_sel_t is array(0 to 7) of std_ulogic;
@@ -98,6 +105,9 @@ architecture behave of loadstore1 is
     signal r, rin : reg_stage_t;
     signal lsu_sum : std_ulogic_vector(63 downto 0);
 
+    signal store_sp_data : std_ulogic_vector(31 downto 0);
+    signal load_dp_data  : std_ulogic_vector(63 downto 0);
+
     -- Generate byte enables from sizes
     function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
     begin
@@ -128,6 +138,72 @@ architecture behave of loadstore1 is
 					    to_integer(unsigned(address))));
     end function xfer_data_sel;
 
+    -- 23-bit right shifter for DP -> SP float conversions (zero fill, two stages)
+    function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+        return std_ulogic_vector is
+        variable fs1   : std_ulogic_vector(22 downto 0);  -- after the 0..3 bit stage
+        variable fs2   : std_ulogic_vector(22 downto 0);  -- after the multiple-of-4 stage
+    begin
+        case shift(1 downto 0) is  -- first stage: shift right by 0, 1, 2 or 3
+            when "00" =>
+                fs1 := frac;
+            when "01" =>
+                fs1 := '0' & frac(22 downto 1);
+            when "10" =>
+                fs1 := "00" & frac(22 downto 2);
+            when others =>
+                fs1 := "000" & frac(22 downto 3);
+        end case;
+        case shift(4 downto 2) is  -- second stage: shift right by 0, 4, 8, 12, 16 or 20
+            when "000" =>
+                fs2 := fs1;
+            when "001" =>
+                fs2 := x"0" & fs1(22 downto 4);
+            when "010" =>
+                fs2 := x"00" & fs1(22 downto 8);
+            when "011" =>
+                fs2 := x"000" & fs1(22 downto 12);
+            when "100" =>
+                fs2 := x"0000" & fs1(22 downto 16);
+            when others =>  -- "101" and every remaining encoding shift by 20, so 24..31 act like 20..23
+                fs2 := x"00000" & fs1(22 downto 20);
+        end case;
+        return fs2;
+    end;
+
+    -- 23-bit left shifter for SP -> DP float conversions (zero fill, two stages)
+    function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+        return std_ulogic_vector is
+        variable fs1   : std_ulogic_vector(22 downto 0);  -- after the 0..3 bit stage
+        variable fs2   : std_ulogic_vector(22 downto 0);  -- after the multiple-of-4 stage
+    begin
+        case shift(1 downto 0) is  -- first stage: shift left by 0, 1, 2 or 3
+            when "00" =>
+                fs1 := frac;
+            when "01" =>
+                fs1 := frac(21 downto 0) & '0';
+            when "10" =>
+                fs1 := frac(20 downto 0) & "00";
+            when others =>
+                fs1 := frac(19 downto 0) & "000";
+        end case;
+        case shift(4 downto 2) is  -- second stage: shift left by 0, 4, 8, 12, 16 or 20
+            when "000" =>
+                fs2 := fs1;
+            when "001" =>
+                fs2 := fs1(18 downto 0) & x"0" ;
+            when "010" =>
+                fs2 := fs1(14 downto 0) & x"00";
+            when "011" =>
+                fs2 := fs1(10 downto 0) & x"000";
+            when "100" =>
+                fs2 := fs1(6 downto 0) & x"0000";
+            when others =>  -- "101" and every remaining encoding shift by 20, so 24..31 act like 20..23
+                fs2 := fs1(2 downto 0) & x"00000";
+        end case;
+        return fs2;
+    end;
+
 begin
     -- Calculate the address in the first cycle
     lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
@@ -145,6 +221,59 @@ begin
         end if;
     end process;
 
+    ls_fp_conv: if HAS_FPU generate
+        -- Convert DP data to SP for stfs (combinational: l_in.data -> store_sp_data)
+        dp_to_sp: process(all)
+            variable exp   : unsigned(10 downto 0);  -- DP exponent field, bits 62..52
+            variable frac  : std_ulogic_vector(22 downto 0);
+            variable shift : unsigned(4 downto 0);
+        begin
+            store_sp_data(31) <= l_in.data(63);  -- sign bit is copied unchanged
+            store_sp_data(30 downto 0) <= (others => '0');  -- default, kept when exp < 874
+            exp := unsigned(l_in.data(62 downto 52));
+            if exp > 896 then  -- no denormalization: repack exponent and fraction bits
+                store_sp_data(30) <= l_in.data(62);  -- top exponent bit
+                store_sp_data(29 downto 0) <= l_in.data(58 downto 29);  -- low 7 exponent bits, top 23 fraction bits
+            elsif exp >= 874 then
+                -- denormalization required
+                frac := '1' & l_in.data(51 downto 30);  -- leading 1 made explicit above the top 22 fraction bits
+                shift := 0 - exp(4 downto 0);  -- (896 - exp) mod 32, i.e. 0..22 in this branch
+                store_sp_data(22 downto 0) <= shifter_23r(frac, shift);
+            end if;
+        end process;
+
+        -- Convert SP data to DP for lfs (combinational: r.ld_sp_* -> load_dp_data)
+        sp_to_dp: process(all)
+            variable exp     : unsigned(7 downto 0);  -- SP exponent field, bits 30..23
+            variable exp_dp  : unsigned(10 downto 0);  -- resulting DP exponent field
+            variable exp_nz  : std_ulogic;  -- SP exponent field is non-zero
+            variable exp_ao  : std_ulogic;  -- SP exponent field is all ones
+            variable frac    : std_ulogic_vector(22 downto 0);
+            variable frac_shift : unsigned(4 downto 0);
+        begin
+            frac := r.ld_sp_data(22 downto 0);
+            exp := unsigned(r.ld_sp_data(30 downto 23));
+            exp_nz := or (r.ld_sp_data(30 downto 23));
+            exp_ao := and (r.ld_sp_data(30 downto 23));
+            frac_shift := (others => '0');  -- fraction is only shifted for denormalized operands
+            if exp_ao = '1' then
+                exp_dp := to_unsigned(2047, 11);    -- infinity or NaN
+            elsif exp_nz = '1' then
+                exp_dp := 896 + resize(exp, 11);    -- finite normalized value
+            elsif r.ld_sp_nz = '0' then
+                exp_dp := to_unsigned(0, 11);       -- zero
+            else
+                -- denormalized SP operand, need to normalize
+                exp_dp := 896 - resize(unsigned(r.ld_sp_lz), 11);  -- ld_sp_lz = leading zeroes of the fraction
+                frac_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1;  -- the +1 also shifts out the leading 1
+            end if;
+            load_dp_data(63) <= r.ld_sp_data(31);  -- sign bit is copied unchanged
+            load_dp_data(62 downto 52) <= std_ulogic_vector(exp_dp);
+            load_dp_data(51 downto 29) <= shifter_23l(frac, frac_shift);  -- 23 SP fraction bits fill the top of the DP fraction
+            load_dp_data(28 downto 0) <= (others => '0');  -- remaining low fraction bits are zero
+        end process;
+    end generate;
+
     loadstore1_1: process(all)
         variable v : reg_stage_t;
         variable brev_lenm1 : unsigned(2 downto 0);
@@ -165,6 +294,9 @@ begin
         variable data_permuted : std_ulogic_vector(63 downto 0);
         variable data_trimmed : std_ulogic_vector(63 downto 0);
         variable store_data : std_ulogic_vector(63 downto 0);
+        variable data_in : std_ulogic_vector(63 downto 0);
+        variable byte_rev : std_ulogic;
+        variable length : std_ulogic_vector(3 downto 0);
         variable use_second : byte_sel_t;
         variable trim_ctl : trim_ctl_t;
         variable negative : std_ulogic;
@@ -176,6 +308,8 @@ begin
         variable mmu_mtspr : std_ulogic;
         variable itlb_fault : std_ulogic;
         variable misaligned : std_ulogic;
+        variable fp_reg_conv : std_ulogic;
+        variable lfs_done : std_ulogic;
     begin
         v := r;
         req := '0';
@@ -185,8 +319,10 @@ begin
         sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
         dsisr := (others => '0');
         mmureq := '0';
+        fp_reg_conv := '0';
 
         write_enable := '0';
+        lfs_done := '0';
 
         do_update := r.do_update;
         v.do_update := '0';
@@ -245,19 +381,38 @@ begin
             end case;
         end loop;
 
-        -- Byte reversing and rotating for stores
-        -- Done in the first cycle (when l_in.valid = 1)
+        if HAS_FPU then
+            -- Single-precision FP conversion
+            v.st_sp_data := store_sp_data;
+            v.ld_sp_data := data_trimmed(31 downto 0);
+            v.ld_sp_nz := or (data_trimmed(22 downto 0));
+            v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
+        end if;
+
+        -- Byte reversing and rotating for stores.
+        -- Done in the first cycle (when l_in.valid = 1) for integer stores
+        -- and DP float stores, and in the second cycle for SP float stores.
         store_data := r.store_data;
-        if l_in.valid = '1' then
-            byte_offset := unsigned(lsu_sum(2 downto 0));
+        if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then
+            if HAS_FPU and r.state = FPR_CONV then
+                data_in := x"00000000" & r.st_sp_data;
+                byte_offset := unsigned(r.addr(2 downto 0));
+                byte_rev := r.byte_reverse;
+                length := r.length;
+            else
+                data_in := l_in.data;
+                byte_offset := unsigned(lsu_sum(2 downto 0));
+                byte_rev := l_in.byte_reverse;
+                length := l_in.length;
+            end if;
             brev_lenm1 := "000";
-            if l_in.byte_reverse = '1' then
-                brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
+            if byte_rev = '1' then
+                brev_lenm1 := unsigned(length(2 downto 0)) - 1;
             end if;
             for i in 0 to 7 loop
                 k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1;
                 j := to_integer(k) * 8;
-                store_data(i * 8 + 7 downto i * 8) := l_in.data(j + 7 downto j);
+                store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j);
             end loop;
         end if;
         v.store_data := store_data;
@@ -292,6 +447,14 @@ begin
         case r.state is
         when IDLE =>
 
+        when FPR_CONV =>
+            req := '1';
+            if r.second_bytes /= "00000000" then
+                v.state := SECOND_REQ;
+            else
+                v.state := ACK_WAIT;
+            end if;
+
         when SECOND_REQ =>
             req := '1';
             v.state := ACK_WAIT;
@@ -323,8 +486,13 @@ begin
                         v.load_data := data_permuted;
                     end if;
                 else
-                    write_enable := r.load;
-                    if r.extra_cycle = '1' then
+                    write_enable := r.load and not r.load_sp;
+                    if HAS_FPU and r.load_sp = '1' then
+                        -- SP to DP conversion takes a cycle
+                        -- Write back rA update in this cycle if needed
+                        do_update := r.update;
+                        v.state := FINISH_LFS;
+                    elsif r.extra_cycle = '1' then
                         -- loads with rA update need an extra cycle
                         v.state := COMPLETE;
                         v.do_update := r.update;
@@ -362,6 +530,9 @@ begin
 
         when TLBIE_WAIT =>
 
+        when FINISH_LFS =>
+            lfs_done := '1';
+
         when COMPLETE =>
             exception := r.align_intr;
 
@@ -395,6 +566,7 @@ begin
             v.nc := l_in.ci;
             v.virt_mode := l_in.virt_mode;
             v.priv_mode := l_in.priv_mode;
+            v.load_sp := '0';
             v.wait_dcache := '0';
             v.wait_mmu := '0';
             v.do_update := '0';
@@ -436,14 +608,24 @@ begin
                     v.dcbz := '1';
                 when OP_FPSTORE =>
                     if HAS_FPU then
-                        req := '1';
+                        if l_in.is_32bit = '1' then
+                            v.state := FPR_CONV;
+                            fp_reg_conv := '1';
+                        else
+                            req := '1';
+                        end if;
                     end if;
                 when OP_FPLOAD =>
                     if HAS_FPU then
                         v.load := '1';
                         req := '1';
-                        -- Allow an extra cycle for RA update
+                        -- Allow an extra cycle for SP->DP precision conversion
+                        -- or RA update
                         v.extra_cycle := l_in.update;
+                        if l_in.is_32bit = '1' then
+                            v.load_sp := '1';
+                            v.extra_cycle := '1';
+                        end if;
                     end if;
                 when OP_TLBIE =>
                     mmureq := '1';
@@ -500,7 +682,7 @@ begin
                 end if;
             end if;
 
-            v.busy := req or mmureq or mmu_mtspr;
+            v.busy := req or mmureq or mmu_mtspr or fp_reg_conv;
         end if;
 
         -- Update outputs to dcache
@@ -539,6 +721,10 @@ begin
             l_out.write_enable <= '1';
             l_out.write_reg <= gpr_to_gspr(r.update_reg);
             l_out.write_data <= r.addr;
+        elsif lfs_done = '1' then
+            l_out.write_enable <= '1';
+            l_out.write_reg <= r.write_reg;
+            l_out.write_data <= load_dp_data;
         else
             l_out.write_enable <= write_enable;
             l_out.write_reg <= r.write_reg;