From 45cd8f4fc375185544309ffd16d73a7dc5ce1dce Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 28 Aug 2020 12:49:48 +1000 Subject: [PATCH 01/30] core: Add support for floating-point loads and stores This extends the register file so it can hold FPR values, and implements the FP loads and stores that do not require conversion between single and double precision. We now have the FP, FE0 and FE1 bits in MSR. FP loads and stores cause a FP unavailable interrupt if MSR[FP] = 0. The FPU facilities are optional and their presence is controlled by the HAS_FPU generic passed down from the top-level board file. It defaults to true for all except the A7-35 boards. Signed-off-by: Paul Mackerras --- common.vhdl | 34 ++++++++++++++++------ control.vhdl | 7 ++--- core.vhdl | 13 ++++++--- decode1.vhdl | 19 ++++++++++++ decode2.vhdl | 30 ++++++++++++++----- decode_types.vhdl | 5 ++-- execute1.vhdl | 20 +++++++++++-- fpga/top-arty.vhdl | 2 ++ fpga/top-generic.vhdl | 2 ++ fpga/top-nexys-video.vhdl | 2 ++ gpr_hazard.vhdl | 13 +++++---- insn_helpers.vhdl | 24 +++++++++++++++ loadstore1.vhdl | 18 ++++++++++-- microwatt.core | 14 +++++++++ register_file.vhdl | 61 ++++++++++++++++++++++++++------------- scripts/fmt_log/fmt_log.c | 14 ++++----- soc.vhdl | 2 ++ writeback.vhdl | 2 +- 18 files changed, 217 insertions(+), 65 deletions(-) diff --git a/common.vhdl b/common.vhdl index 1ca1178..14bdcf7 100644 --- a/common.vhdl +++ b/common.vhdl @@ -13,8 +13,11 @@ package common is constant MSR_SF : integer := (63 - 0); -- Sixty-Four bit mode constant MSR_EE : integer := (63 - 48); -- External interrupt Enable constant MSR_PR : integer := (63 - 49); -- PRoblem state + constant MSR_FP : integer := (63 - 50); -- Floating Point available + constant MSR_FE0 : integer := (63 - 52); -- Floating Exception mode constant MSR_SE : integer := (63 - 53); -- Single-step bit of TE field constant MSR_BE : integer := (63 - 54); -- Branch trace bit of TE field + constant MSR_FE1 : integer := (63 - 55); -- 
Floating Exception mode constant MSR_IR : integer := (63 - 58); -- Instruction Relocation constant MSR_DR : integer := (63 - 59); -- Data Relocation constant MSR_RI : integer := (63 - 62); -- Recoverable Interrupt @@ -53,8 +56,11 @@ package common is -- GPR indices in the register file (GPR only) subtype gpr_index_t is std_ulogic_vector(4 downto 0); - -- Extended GPR indice (can hold an SPR) - subtype gspr_index_t is std_ulogic_vector(5 downto 0); + -- Extended GPR index (can hold an SPR or a FPR) + subtype gspr_index_t is std_ulogic_vector(6 downto 0); + + -- FPR indices + subtype fpr_index_t is std_ulogic_vector(4 downto 0); -- Some SPRs are stored in the register file, they use the magic -- GPR numbers above 31. @@ -64,6 +70,9 @@ package common is -- indicates if this is indeed a fast SPR. If clear, then -- the SPR is not stored in the GPR file. -- + -- FPRs are also stored in the register file, using GSPR + -- numbers from 64 to 95. + -- function fast_spr_num(spr: spr_num_t) return gspr_index_t; -- Indices conversion functions @@ -71,6 +80,7 @@ package common is function gpr_to_gspr(i: gpr_index_t) return gspr_index_t; function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t; function is_fast_spr(s: gspr_index_t) return std_ulogic; + function fpr_to_gspr(f: fpr_index_t) return gspr_index_t; -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are -- in the CR file as a kind of CR extension (with a separate write @@ -226,7 +236,7 @@ package common is read2_enable : std_ulogic; read2_reg : gspr_index_t; read3_enable : std_ulogic; - read3_reg : gpr_index_t; + read3_reg : gspr_index_t; end record; type RegisterFileToDecode2Type is record @@ -264,7 +274,7 @@ package common is addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read - write_reg : gpr_index_t; + write_reg : gspr_index_t; length : std_ulogic_vector(3 downto 0); ci : 
std_ulogic; -- cache-inhibited load/store byte_reverse : std_ulogic; @@ -282,7 +292,8 @@ package common is sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0', nia => (others => '0'), insn => (others => '0'), - addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), length => (others => '0'), + addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), + write_reg => (others => '0'), length => (others => '0'), mode_32bit => '0', others => (others => '0')); type Loadstore1ToExecute1Type is record @@ -369,7 +380,7 @@ package common is type Loadstore1ToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; - write_reg : gpr_index_t; + write_reg : gspr_index_t; write_data : std_ulogic_vector(63 downto 0); xerc : xer_common_t; rc : std_ulogic; @@ -473,10 +484,10 @@ package body common is n := 13; when others => n := 0; - return "000000"; + return "0000000"; end case; tmp := std_ulogic_vector(to_unsigned(n, 5)); - return "1" & tmp; + return "01" & tmp; end; function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is @@ -486,7 +497,7 @@ package body common is function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is begin - return "0" & i; + return "00" & i; end; function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is @@ -502,4 +513,9 @@ package body common is begin return s(5); end; + + function fpr_to_gspr(f: fpr_index_t) return gspr_index_t is + begin + return "10" & f; + end; end common; diff --git a/control.vhdl b/control.vhdl index d04576a..4f67ad4 100644 --- a/control.vhdl +++ b/control.vhdl @@ -34,7 +34,7 @@ entity control is gpr_b_read_in : in gspr_index_t; gpr_c_read_valid_in : in std_ulogic; - gpr_c_read_in : in gpr_index_t; + gpr_c_read_in : in gspr_index_t; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; @@ -70,7 +70,6 @@ architecture rtl of control is signal gpr_write_valid : std_ulogic := '0'; signal 
cr_write_valid : std_ulogic := '0'; - signal gpr_c_read_in_fmt : std_ulogic_vector(5 downto 0); begin gpr_hazard0: entity work.gpr_hazard generic map ( @@ -122,8 +121,6 @@ begin use_bypass => gpr_bypass_b ); - gpr_c_read_in_fmt <= "0" & gpr_c_read_in; - gpr_hazard2: entity work.gpr_hazard generic map ( PIPELINE_DEPTH => PIPELINE_DEPTH @@ -140,7 +137,7 @@ begin gpr_write_in => gpr_write_in, bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_c_read_valid_in, - gpr_read_in => gpr_c_read_in_fmt, + gpr_read_in => gpr_c_read_in, ugpr_write_valid => update_gpr_write_valid, ugpr_write_reg => update_gpr_write_reg, diff --git a/core.vhdl b/core.vhdl index c7dd3f6..81e11c8 100644 --- a/core.vhdl +++ b/core.vhdl @@ -11,6 +11,7 @@ entity core is SIM : boolean := false; DISABLE_FLATTEN : boolean := false; EX1_BYPASS : boolean := true; + HAS_FPU : boolean := true; ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0'); LOG_LENGTH : natural := 512 ); @@ -244,6 +245,7 @@ begin decode2_0: entity work.decode2 generic map ( EX1_BYPASS => EX1_BYPASS, + HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH ) port map ( @@ -267,6 +269,7 @@ begin register_file_0: entity work.register_file generic map ( SIM => SIM, + HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH ) port map ( @@ -280,7 +283,7 @@ begin dbg_gpr_data => dbg_gpr_data, sim_dump => terminate, sim_dump_done => sim_cr_dump, - log_out => log_data(255 downto 185) + log_out => log_data(255 downto 184) ); cr_file_0: entity work.cr_file @@ -294,12 +297,13 @@ begin d_out => cr_file_to_decode2, w_in => writeback_to_cr_file, sim_dump => sim_cr_dump, - log_out => log_data(184 downto 172) + log_out => log_data(183 downto 171) ); execute1_0: entity work.execute1 generic map ( EX1_BYPASS => EX1_BYPASS, + HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH ) port map ( @@ -324,6 +328,7 @@ begin loadstore1_0: entity work.loadstore1 generic map ( + HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH ) port map ( @@ -368,7 +373,7 @@ begin stall_out 
=> dcache_stall_out, wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out, - log_out => log_data(171 downto 152) + log_out => log_data(170 downto 151) ); writeback_0: entity work.writeback @@ -381,7 +386,7 @@ begin complete_out => complete ); - log_data(151 downto 150) <= "00"; + log_data(150) <= '0'; log_data(139 downto 135) <= "00000"; debug_0: entity work.core_debug diff --git a/decode1.vhdl b/decode1.vhdl index a7d5910..75da175 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -72,6 +72,10 @@ architecture behaviour of decode1 is 10 => (ALU, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbz 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu + 50 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd + 51 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu +-- 48 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs +-- 49 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha 43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau 40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', 
NONE, '0', '0'), -- lhz @@ -87,6 +91,10 @@ architecture behaviour of decode1 is 17 => (ALU, OP_SC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sc 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu + 54 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd + 55 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu +-- 52 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs +-- 53 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw @@ -272,6 +280,12 @@ architecture behaviour of decode1 is 2#1101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldcix 2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux 2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', 
ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx + 2#1001010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfdx + 2#1001110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux + 2#1101010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax + 2#1101110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx +-- 2#1000010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx +-- 2#1000110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux 2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx 2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux 2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax @@ -350,6 +364,11 @@ architecture behaviour of decode1 is 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stdcx 2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux 2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx + 
2#1011010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx + 2#1011110111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdux + 2#1111010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx +-- 2#1010010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx +-- 2#1010110111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx 2#1110110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- sthcx diff --git a/decode2.vhdl b/decode2.vhdl index a2a602c..6cc74c7 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -11,6 +11,7 @@ use work.insn_helpers.all; entity decode2 is generic ( EX1_BYPASS : boolean := true; + HAS_FPU : boolean := true; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -73,7 +74,7 @@ architecture behaviour of decode2 is -- If it's all 0, we don't treat it as a dependency as slow SPRs -- operations are single issue. 
-- - assert is_fast_spr(ispr) = '1' or ispr = "000000" + assert is_fast_spr(ispr) = '1' or ispr = "0000000" report "Decode A says SPR but ISPR is invalid:" & to_hstring(ispr) severity failure; return (is_fast_spr(ispr), ispr, reg_data); @@ -118,7 +119,7 @@ architecture behaviour of decode2 is -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. -- If it's all 0, we don't treat it as a dependency as slow SPRs -- operations are single issue. - assert is_fast_spr(ispr) = '1' or ispr = "000000" + assert is_fast_spr(ispr) = '1' or ispr = "0000000" report "Decode B says SPR but ISPR is invalid:" & to_hstring(ispr) severity failure; ret := (is_fast_spr(ispr), ispr, reg_data); @@ -137,6 +138,12 @@ architecture behaviour of decode2 is return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data); when RCR => return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data); + when FRS => + if HAS_FPU then + return ('1', fpr_to_gspr(insn_frt(insn_in)), reg_data); + else + return ('0', (others => '0'), (others => '0')); + end if; when NONE => return ('0', (others => '0'), (others => '0')); end case; @@ -150,16 +157,22 @@ architecture behaviour of decode2 is return ('1', gpr_to_gspr(insn_rt(insn_in))); when RA => return ('1', gpr_to_gspr(insn_ra(insn_in))); + when FRT => + if HAS_FPU then + return ('1', fpr_to_gspr(insn_frt(insn_in))); + else + return ('0', "0000000"); + end if; when SPR => -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. -- If it's all 0, we don't treat it as a dependency as slow SPRs -- operations are single issue. 
- assert is_fast_spr(ispr) = '1' or ispr = "000000" + assert is_fast_spr(ispr) = '1' or ispr = "0000000" report "Decode B says SPR but ISPR is invalid:" & to_hstring(ispr) severity failure; return (is_fast_spr(ispr), ispr); when NONE => - return ('0', "000000"); + return ('0', "0000000"); end case; end; @@ -212,7 +225,7 @@ architecture behaviour of decode2 is signal gpr_b_bypass : std_ulogic; signal gpr_c_read_valid : std_ulogic; - signal gpr_c_read : gpr_index_t; + signal gpr_c_read : gspr_index_t; signal gpr_c_bypass : std_ulogic; signal cr_write_valid : std_ulogic; @@ -284,8 +297,9 @@ begin else gpr_to_gspr(insn_ra(d_in.insn)); r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR else gpr_to_gspr(insn_rb(d_in.insn)); - r_out.read3_reg <= insn_rcreg(d_in.insn) when d_in.decode.input_reg_c = RCR - else insn_rs(d_in.insn); + r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR + else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU + else gpr_to_gspr(insn_rs(d_in.insn)); c_out.read <= d_in.decode.input_cr; @@ -394,7 +408,7 @@ begin gpr_b_read <= decoded_reg_b.reg; gpr_c_read_valid <= decoded_reg_c.reg_valid; - gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg); + gpr_c_read <= decoded_reg_c.reg; cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); cr_bypass_avail <= '0'; diff --git a/decode_types.vhdl b/decode_types.vhdl index ef654c3..8c20441 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -10,6 +10,7 @@ package decode_types is OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, OP_LOAD, OP_STORE, + OP_FPLOAD, OP_FPSTORE, OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD, OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64, OP_MUL_H64, OP_MUL_H32, OP_OR, @@ -24,8 +25,8 @@ package decode_types is type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, 
CONST_LI, CONST_BD, CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR); - type input_reg_c_t is (NONE, RS, RCR); - type output_reg_a_t is (NONE, RT, RA, SPR); + type input_reg_c_t is (NONE, RS, RCR, FRS); + type output_reg_a_t is (NONE, RT, RA, SPR, FRT); type rc_t is (NONE, ONE, RC); type carry_in_t is (ZERO, CA, OV, ONE); diff --git a/execute1.vhdl b/execute1.vhdl index 04cc970..4d6a9cc 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -13,6 +13,7 @@ use work.ppc_fx_insns.all; entity execute1 is generic ( EX1_BYPASS : boolean := true; + HAS_FPU : boolean := true; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -542,6 +543,9 @@ begin ctrl_tmp.msr(MSR_PR) <= '0'; ctrl_tmp.msr(MSR_SE) <= '0'; ctrl_tmp.msr(MSR_BE) <= '0'; + ctrl_tmp.msr(MSR_FP) <= '0'; + ctrl_tmp.msr(MSR_FE0) <= '0'; + ctrl_tmp.msr(MSR_FE1) <= '0'; ctrl_tmp.msr(MSR_IR) <= '0'; ctrl_tmp.msr(MSR_DR) <= '0'; ctrl_tmp.msr(MSR_RI) <= '0'; @@ -578,7 +582,19 @@ begin -- set bit 45 to indicate privileged instruction type interrupt ctrl_tmp.srr1(63 - 45) <= '1'; report "privileged instruction"; - + + elsif not HAS_FPU and valid_in = '1' and + (e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then + -- make lfd/stfd/lfs/stfs etc. 
illegal in no-FPU implementations + illegal := '1'; + + elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and + (e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then + -- generate a floating-point unavailable interrupt + exception := '1'; + v.f.redirect_nia := std_logic_vector(to_unsigned(16#800#, 64)); + report "FP unavailable interrupt"; + elsif valid_in = '1' and e_in.unit = ALU then report "execute nia " & to_hstring(e_in.nia); @@ -1225,7 +1241,7 @@ begin lv.addr1 := a_in; lv.addr2 := b_in; lv.data := c_in; - lv.write_reg := gspr_to_gpr(e_in.write_reg); + lv.write_reg := e_in.write_reg; lv.length := e_in.data_len; lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE); lv.sign_extend := e_in.sign_extend; diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index a4d253d..8a3dc7a 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -14,6 +14,7 @@ entity toplevel is RAM_INIT_FILE : string := "firmware.hex"; RESET_LOW : boolean := true; CLK_FREQUENCY : positive := 100000000; + HAS_FPU : boolean := true; USE_LITEDRAM : boolean := false; NO_BRAM : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; @@ -168,6 +169,7 @@ begin RAM_INIT_FILE => RAM_INIT_FILE, SIM => false, CLK_FREQ => CLK_FREQUENCY, + HAS_FPU => HAS_FPU, HAS_DRAM => USE_LITEDRAM, DRAM_SIZE => 256 * 1024 * 1024, DRAM_INIT_SIZE => PAYLOAD_SIZE, diff --git a/fpga/top-generic.vhdl b/fpga/top-generic.vhdl index 2300456..2ad0dd3 100644 --- a/fpga/top-generic.vhdl +++ b/fpga/top-generic.vhdl @@ -11,6 +11,7 @@ entity toplevel is RESET_LOW : boolean := true; CLK_INPUT : positive := 100000000; CLK_FREQUENCY : positive := 100000000; + HAS_FPU : boolean := true; DISABLE_FLATTEN_CORE : boolean := false; UART_IS_16550 : boolean := true ); @@ -68,6 +69,7 @@ begin RAM_INIT_FILE => RAM_INIT_FILE, SIM => false, CLK_FREQ => CLK_FREQUENCY, + HAS_FPU => HAS_FPU, DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE, UART0_IS_16550 => UART_IS_16550 ) diff --git a/fpga/top-nexys-video.vhdl 
b/fpga/top-nexys-video.vhdl index 745ef79..1942b10 100644 --- a/fpga/top-nexys-video.vhdl +++ b/fpga/top-nexys-video.vhdl @@ -14,6 +14,7 @@ entity toplevel is RAM_INIT_FILE : string := "firmware.hex"; RESET_LOW : boolean := true; CLK_FREQUENCY : positive := 100000000; + HAS_FPU : boolean := true; USE_LITEDRAM : boolean := false; NO_BRAM : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; @@ -120,6 +121,7 @@ begin RAM_INIT_FILE => RAM_INIT_FILE, SIM => false, CLK_FREQ => CLK_FREQUENCY, + HAS_FPU => HAS_FPU, HAS_DRAM => USE_LITEDRAM, DRAM_SIZE => 512 * 1024 * 1024, DRAM_INIT_SIZE => PAYLOAD_SIZE, diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl index 0fa66c5..fec03c7 100644 --- a/gpr_hazard.vhdl +++ b/gpr_hazard.vhdl @@ -2,6 +2,9 @@ library ieee; use ieee.std_logic_1164.all; use ieee.numeric_std.all; +library work; +use work.common.all; + entity gpr_hazard is generic ( PIPELINE_DEPTH : natural := 1 @@ -15,13 +18,13 @@ entity gpr_hazard is issuing : in std_ulogic; gpr_write_valid_in : in std_ulogic; - gpr_write_in : in std_ulogic_vector(5 downto 0); + gpr_write_in : in gspr_index_t; bypass_avail : in std_ulogic; gpr_read_valid_in : in std_ulogic; - gpr_read_in : in std_ulogic_vector(5 downto 0); + gpr_read_in : in gspr_index_t; ugpr_write_valid : in std_ulogic; - ugpr_write_reg : in std_ulogic_vector(5 downto 0); + ugpr_write_reg : in gspr_index_t; stall_out : out std_ulogic; use_bypass : out std_ulogic @@ -31,9 +34,9 @@ architecture behaviour of gpr_hazard is type pipeline_entry_type is record valid : std_ulogic; bypass : std_ulogic; - gpr : std_ulogic_vector(5 downto 0); + gpr : gspr_index_t; ugpr_valid : std_ulogic; - ugpr : std_ulogic_vector(5 downto 0); + ugpr : gspr_index_t; end record; constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'), ugpr_valid => '0', ugpr => (others => '0')); diff --git a/insn_helpers.vhdl b/insn_helpers.vhdl index 592acb0..be3892a 100644 --- a/insn_helpers.vhdl +++ 
b/insn_helpers.vhdl @@ -37,6 +37,10 @@ package insn_helpers is function insn_sh (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_me (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_mb (insn_in : std_ulogic_vector) return std_ulogic_vector; + function insn_frt (insn_in : std_ulogic_vector) return std_ulogic_vector; + function insn_fra (insn_in : std_ulogic_vector) return std_ulogic_vector; + function insn_frb (insn_in : std_ulogic_vector) return std_ulogic_vector; + function insn_frc (insn_in : std_ulogic_vector) return std_ulogic_vector; end package insn_helpers; package body insn_helpers is @@ -214,4 +218,24 @@ package body insn_helpers is begin return insn_in(5) & insn_in(10 downto 6); end; + + function insn_frt(insn_in : std_ulogic_vector) return std_ulogic_vector is + begin + return insn_in(25 downto 21); + end; + + function insn_fra(insn_in : std_ulogic_vector) return std_ulogic_vector is + begin + return insn_in(20 downto 16); + end; + + function insn_frb(insn_in : std_ulogic_vector) return std_ulogic_vector is + begin + return insn_in(15 downto 11); + end; + + function insn_frc(insn_in : std_ulogic_vector) return std_ulogic_vector is + begin + return insn_in(10 downto 6); + end; end package body insn_helpers; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index e36025c..ec20319 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -5,12 +5,15 @@ use ieee.numeric_std.all; library work; use work.decode_types.all; use work.common.all; +use work.insn_helpers.all; +use work.helpers.all; -- 2 cycle LSU -- We calculate the address in the first cycle entity loadstore1 is generic ( + HAS_FPU : boolean := true; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -58,7 +61,7 @@ architecture behave of loadstore1 is addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); load_data : std_ulogic_vector(63 downto 0); - write_reg : gpr_index_t; + write_reg : gspr_index_t; length : 
std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; sign_extend : std_ulogic; @@ -431,6 +434,17 @@ begin v.align_intr := v.nc; req := '1'; v.dcbz := '1'; + when OP_FPSTORE => + if HAS_FPU then + req := '1'; + end if; + when OP_FPLOAD => + if HAS_FPU then + v.load := '1'; + req := '1'; + -- Allow an extra cycle for RA update + v.extra_cycle := l_in.update; + end if; when OP_TLBIE => mmureq := '1'; v.tlbie := '1'; @@ -523,7 +537,7 @@ begin l_out.write_data <= r.sprval; elsif do_update = '1' then l_out.write_enable <= '1'; - l_out.write_reg <= r.update_reg; + l_out.write_reg <= gpr_to_gspr(r.update_reg); l_out.write_data <= r.addr; else l_out.write_enable <= write_enable; diff --git a/microwatt.core b/microwatt.core index cd24a06..3b47339 100644 --- a/microwatt.core +++ b/microwatt.core @@ -132,6 +132,7 @@ targets: - disable_flatten_core - log_length=2048 - uart_is_16550 + - has_fpu tools: vivado: {part : xc7a100tcsg324-1} toplevel : toplevel @@ -215,6 +216,7 @@ targets: - spi_flash_offset=10485760 - log_length=2048 - uart_is_16550 + - has_fpu tools: vivado: {part : xc7a200tsbg484-1} toplevel : toplevel @@ -231,6 +233,7 @@ targets: - spi_flash_offset=10485760 - log_length=2048 - uart_is_16550 + - has_fpu generate: [litedram_nexys_video] tools: vivado: {part : xc7a200tsbg484-1} @@ -249,6 +252,7 @@ targets: - log_length=512 - uart_is_16550 - has_uart1 + - has_fpu=false tools: vivado: {part : xc7a35ticsg324-1L} toplevel : toplevel @@ -267,6 +271,7 @@ targets: - log_length=512 - uart_is_16550 - has_uart1 + - has_fpu=false generate: [litedram_arty, liteeth_arty] tools: vivado: {part : xc7a35ticsg324-1L} @@ -285,6 +290,7 @@ targets: - log_length=2048 - uart_is_16550 - has_uart1 + - has_fpu tools: vivado: {part : xc7a100ticsg324-1L} toplevel : toplevel @@ -303,6 +309,7 @@ targets: - log_length=2048 - uart_is_16550 - has_uart1 + - has_fpu generate: [litedram_arty, liteeth_arty] tools: vivado: {part : xc7a100ticsg324-1L} @@ -320,6 +327,7 @@ targets: - 
disable_flatten_core - log_length=512 - uart_is_16550 + - has_fpu=false tools: vivado: {part : xc7a35tcpg236-1} toplevel : toplevel @@ -380,6 +388,12 @@ parameters: paramtype : generic default : 100000000 + has_fpu: + datatype : bool + description : Include a floating-point unit in the core + paramtype : generic + default : true + disable_flatten_core: datatype : bool description : Prevent Vivado from flattening the main core components diff --git a/register_file.vhdl b/register_file.vhdl index 10f28a4..32c8490 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -8,6 +8,7 @@ use work.common.all; entity register_file is generic ( SIM : boolean := false; + HAS_FPU : boolean := true; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -28,12 +29,12 @@ entity register_file is sim_dump : in std_ulogic; sim_dump_done : out std_ulogic; - log_out : out std_ulogic_vector(70 downto 0) + log_out : out std_ulogic_vector(71 downto 0) ); end entity register_file; architecture behaviour of register_file is - type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0); + type regfile is array(0 to 127) of std_ulogic_vector(63 downto 0); signal registers : regfile := (others => (others => '0')); signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); @@ -41,53 +42,73 @@ architecture behaviour of register_file is begin -- synchronous writes register_write_0: process(clk) + variable w_addr : gspr_index_t; begin if rising_edge(clk) then if w_in.write_enable = '1' then - if w_in.write_reg(5) = '0' then - report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); - else - report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); - end if; + w_addr := w_in.write_reg; + if HAS_FPU and w_addr(6) = '1' then + report "Writing FPR " & to_hstring(w_addr(4 downto 0)) & " " & to_hstring(w_in.write_data); + else + w_addr(6) := '0'; + if w_addr(5) = '0' then + 
report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data); + else + report "Writing GSPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data); + end if; + end if; assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; - registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data; + registers(to_integer(unsigned(w_addr))) <= w_in.write_data; end if; end if; end process register_write_0; -- asynchronous reads register_read_0: process(all) - variable b_addr : gspr_index_t; + variable a_addr, b_addr, c_addr : gspr_index_t; + variable w_addr : gspr_index_t; begin + a_addr := d_in.read1_reg; + b_addr := d_in.read2_reg; + c_addr := d_in.read3_reg; + w_addr := w_in.write_reg; + if not HAS_FPU then + -- Make it obvious that we only want 64 GSPRs for a no-FPU implementation + a_addr(6) := '0'; + b_addr(6) := '0'; + c_addr(6) := '0'; + w_addr(6) := '0'; + end if; if d_in.read1_enable = '1' then - report "Reading GPR " & to_hstring(d_in.read1_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read1_reg)))); + report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr)))); end if; if d_in.read2_enable = '1' then - report "Reading GPR " & to_hstring(d_in.read2_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read2_reg)))); + report "Reading GPR " & to_hstring(b_addr) & " " & to_hstring(registers(to_integer(unsigned(b_addr)))); end if; if d_in.read3_enable = '1' then - report "Reading GPR " & to_hstring(d_in.read3_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read3_reg)))); + report "Reading GPR " & to_hstring(c_addr) & " " & to_hstring(registers(to_integer(unsigned(c_addr)))); end if; - d_out.read1_data <= registers(to_integer(unsigned(d_in.read1_reg))); + d_out.read1_data <= registers(to_integer(unsigned(a_addr))); -- B read port is multiplexed with reads from the debug circuitry if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' 
then b_addr := dbg_gpr_addr; - else - b_addr := d_in.read2_reg; + if not HAS_FPU then + b_addr(6) := '0'; + end if; end if; rd_port_b <= registers(to_integer(unsigned(b_addr))); d_out.read2_data <= rd_port_b; - d_out.read3_data <= registers(to_integer(unsigned(gpr_to_gspr(d_in.read3_reg)))); + d_out.read3_data <= registers(to_integer(unsigned(c_addr))); -- Forward any written data if w_in.write_enable = '1' then - if d_in.read1_reg = w_in.write_reg then + if a_addr = w_addr then d_out.read1_data <= w_in.write_data; end if; - if d_in.read2_reg = w_in.write_reg then + if b_addr = w_addr then d_out.read2_data <= w_in.write_data; end if; - if gpr_to_gspr(d_in.read3_reg) = w_in.write_reg then + if c_addr = w_addr then d_out.read3_data <= w_in.write_data; end if; end if; @@ -136,7 +157,7 @@ begin end generate; rf_log: if LOG_LENGTH > 0 generate - signal log_data : std_ulogic_vector(70 downto 0); + signal log_data : std_ulogic_vector(71 downto 0); begin reg_log: process(clk) begin diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c index 146346d..eca4bf0 100644 --- a/scripts/fmt_log/fmt_log.c +++ b/scripts/fmt_log/fmt_log.c @@ -58,7 +58,7 @@ struct log_entry { u64 ls_lo_valid: 1; u64 ls_eo_except: 1; u64 ls_stall_out: 1; - u64 pad2: 2; + u64 pad2: 1; u64 dc_state: 3; u64 dc_ra_valid: 1; u64 dc_tlb_way: 3; @@ -74,7 +74,7 @@ struct log_entry { u64 cr_wr_mask: 8; u64 cr_wr_data: 4; u64 cr_wr_enable: 1; - u64 reg_wr_reg: 6; + u64 reg_wr_reg: 7; u64 reg_wr_enable: 1; u64 reg_wr_data; @@ -90,11 +90,11 @@ const char *ops[64] = "illegal", "nop ", "add ", "and ", "attn ", "b ", "bc ", "bcreg ", "bperm ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", "darn ", "dcbf ", "dcbst ", "dcbt ", "dcbtst ", "dcbz ", "div ", "dive ", "exts ", - "extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "mcrxrx ", - "mfcr ", "mfmsr ", "mfspr ", "mod ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", - "mulh64 ", "mulh32 ", "or ", "popcnt ", "prty ", "rfid ", "rlc ", "rlcl 
", - "rlcr ", "sc ", "setb ", "shl ", "shr ", "sync ", "tlbie ", "trap ", - "xor ", "bcd ", "addg6s ", "ffail ", "?60 ", "?61 ", "?62 ", "?63 " + "extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "fpload ", + "fpstore", "mcrxrx ", "mfcr ", "mfmsr ", "mfspr ", "mod ", "mtcrf ", "mtmsr ", + "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ", "prty ", "rfid ", + "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", "shr ", "sync ", + "tlbie ", "trap ", "xor ", "bcd ", "addg6s ", "ffail ", "?62 ", "?63 " }; const char *spr_names[13] = diff --git a/soc.vhdl b/soc.vhdl index 0a70026..7ab146f 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -52,6 +52,7 @@ entity soc is RAM_INIT_FILE : string; CLK_FREQ : positive; SIM : boolean; + HAS_FPU : boolean := true; DISABLE_FLATTEN_CORE : boolean := false; HAS_DRAM : boolean := false; DRAM_SIZE : integer := 0; @@ -253,6 +254,7 @@ begin processor: entity work.core generic map( SIM => SIM, + HAS_FPU => HAS_FPU, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE, ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'), LOG_LENGTH => LOG_LENGTH diff --git a/writeback.vhdl b/writeback.vhdl index 053a8ba..d0230d8 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -80,7 +80,7 @@ begin end if; if l_in.write_enable = '1' then - w_out.write_reg <= gpr_to_gspr(l_in.write_reg); + w_out.write_reg <= l_in.write_reg; w_out.write_data <= l_in.write_data; w_out.write_enable <= '1'; end if; From bcac4b9b2fafe976eb4d2ce2d022cc0cbb33c5de Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 1 Jul 2020 18:03:19 +1000 Subject: [PATCH 02/30] tests: Add a test for FP loads and stores This tests that floating-point unavailable exceptions occur as expected on FP loads and stores, and that the simple FP loads and stores appear to give reasonable results. 
Signed-off-by: Paul Mackerras --- tests/fpu/Makefile | 3 + tests/fpu/fpu.c | 196 +++++++++++++++++++++++++++++++++++++ tests/fpu/head.S | 120 +++++++++++++++++++++++ tests/fpu/powerpc.lds | 27 +++++ tests/test_fpu.bin | Bin 0 -> 8208 bytes tests/test_fpu.console_out | 2 + tests/update_console_tests | 2 +- 7 files changed, 349 insertions(+), 1 deletion(-) create mode 100644 tests/fpu/Makefile create mode 100644 tests/fpu/fpu.c create mode 100644 tests/fpu/head.S create mode 100644 tests/fpu/powerpc.lds create mode 100755 tests/test_fpu.bin create mode 100644 tests/test_fpu.console_out diff --git a/tests/fpu/Makefile b/tests/fpu/Makefile new file mode 100644 index 0000000..fd8344e --- /dev/null +++ b/tests/fpu/Makefile @@ -0,0 +1,3 @@ +TEST=fpu + +include ../Makefile.test diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c new file mode 100644 index 0000000..d61b36e --- /dev/null +++ b/tests/fpu/fpu.c @@ -0,0 +1,196 @@ +#include <stddef.h> +#include <stdint.h> +#include <stdbool.h> + +#include "console.h" + +#define MSR_FP 0x2000 +#define MSR_FE0 0x800 +#define MSR_FE1 0x100 + +extern int trapit(long arg, int (*func)(long)); + +#define SRR0 26 +#define SRR1 27 + +static inline unsigned long mfspr(int sprnum) +{ + long val; + + __asm__ volatile("mfspr %0,%1" : "=r" (val) : "i" (sprnum)); + return val; +} + +static inline void mtspr(int sprnum, unsigned long val) +{ + __asm__ volatile("mtspr %0,%1" : : "i" (sprnum), "r" (val)); +} + +void disable_fp(void) +{ + unsigned long msr; + + __asm__("mfmsr %0" : "=r" (msr)); + msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1); + __asm__("mtmsrd %0" : : "r" (msr)); +} + +void enable_fp(void) +{ + unsigned long msr; + + __asm__("mfmsr %0" : "=r" (msr)); + msr |= MSR_FP; + __asm__("mtmsrd %0" : : "r" (msr)); +} + +void print_string(const char *str) +{ + for (; *str; ++str) + putchar(*str); +} + +void print_hex(unsigned long val, int ndigits) +{ + int i, x; + + for (i = (ndigits - 1) * 4; i >= 0; i -= 4) { + x = (val >> i) & 0xf; + if (x >= 10) + putchar(x + 'a' - 10); + else +
putchar(x + '0'); + } +} + +// i < 100 +void print_test_number(int i) +{ + print_string("test "); + putchar(48 + i/10); + putchar(48 + i%10); + putchar(':'); +} + +unsigned long foo = 0x3ff8000000000000ul; +unsigned long foow; +int fooi = -76543; +int fooiw; + +int do_fp_op(long arg) +{ + switch (arg) { + case 0: + __asm__("lfd 31,0(%0)" : : "b" (&foo)); + break; + case 1: + __asm__("stfd 31,0(%0)" : : "b" (&foow) : "memory"); + break; + case 2: + __asm__("lfd 30,0(%0); stfd 30,0(%1)" + : : "b" (&foo), "b" (&foow) : "memory"); + break; + case 3: + __asm__("lfiwax 29,0,%0; stfd 29,0(%1)" + : : "r" (&fooi), "b" (&foow) : "memory"); + break; + case 4: + __asm__("lfiwzx 28,0,%0; stfd 28,0(%1)" + : : "r" (&fooi), "b" (&foow) : "memory"); + break; + case 5: + __asm__("lfdx 27,0,%0; stfiwx 27,0,%1" + : : "r" (&foow), "r" (&fooiw) : "memory"); + break; + } + return 0; +} + + +int fpu_test_1(void) +{ + int ret; + + disable_fp(); + /* these should give a FP unavailable exception */ + ret = trapit(0, do_fp_op); + if (ret != 0x800) + return 1; + ret = trapit(1, do_fp_op); + if (ret != 0x800) + return 2; + enable_fp(); + /* these should succeed */ + ret = trapit(0, do_fp_op); + if (ret) + return ret | 3; + ret = trapit(1, do_fp_op); + if (ret) + return ret | 4; + if (foow != foo) + return 5; + return 0; +} + +int fpu_test_2(void) +{ + int ret; + + enable_fp(); + foow = ~0; + ret = trapit(2, do_fp_op); + if (ret) + return ret | 1; + if (foow != foo) + return 2; + foow = ~0; + ret = trapit(3, do_fp_op); + if (ret) + return ret | 3; + if (foow != fooi) + return 4; + foow = ~0; + ret = trapit(4, do_fp_op); + if (ret) + return ret | 5; + if (foow != (unsigned int)fooi) + return 6; + ret = trapit(5, do_fp_op); + if (ret) + return ret | 7; + if (fooiw != fooi) + return 8; + return 0; +} + +int fail = 0; + +void do_test(int num, int (*test)(void)) +{ + int ret; + + print_test_number(num); + ret = test(); + if (ret == 0) { + print_string("PASS\r\n"); + } else { + fail = 1; + 
print_string("FAIL "); + print_hex(ret, 4); + print_string(" SRR0="); + print_hex(mfspr(SRR0), 16); + print_string(" SRR1="); + print_hex(mfspr(SRR1), 16); + print_string("\r\n"); + } +} + +int main(void) +{ + console_init(); + + do_test(1, fpu_test_1); + do_test(2, fpu_test_2); + + return fail; +} diff --git a/tests/fpu/head.S b/tests/fpu/head.S new file mode 100644 index 0000000..498606b --- /dev/null +++ b/tests/fpu/head.S @@ -0,0 +1,120 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Load an immediate 64-bit value into a register */ +#define LOAD_IMM64(r, e) \ + lis r,(e)@highest; \ + ori r,r,(e)@higher; \ + rldicr r,r, 32, 31; \ + oris r,r, (e)@h; \ + ori r,r, (e)@l; + + .section ".head","ax" + + /* + * Microwatt currently enters in LE mode at 0x0, so we don't need to + * do any endian fix ups + */ + . = 0 +.global _start +_start: + LOAD_IMM64(%r10,__bss_start) + LOAD_IMM64(%r11,__bss_end) + subf %r11,%r10,%r11 + addi %r11,%r11,63 + srdi. %r11,%r11,6 + beq 2f + mtctr %r11 +1: dcbz 0,%r10 + addi %r10,%r10,64 + bdnz 1b + +2: LOAD_IMM64(%r1,__stack_top) + li %r0,0 + stdu %r0,-16(%r1) + LOAD_IMM64(%r10, die) + mtsprg0 %r10 + LOAD_IMM64(%r12, main) + mtctr %r12 + bctrl +die: attn // terminate on exit + b . 
+ +.global trapit +trapit: + mflr %r0 + std %r0,16(%r1) + stdu %r1,-256(%r1) + mtsprg1 %r1 + r = 14 + .rept 18 + std r,r*8(%r1) + r = r + 1 + .endr + mfcr %r0 + stw %r0,13*8(%r1) + LOAD_IMM64(%r10, ret) + mtsprg0 %r10 + mr %r12,%r4 + mtctr %r4 + bctrl +ret: + mfsprg1 %r1 + LOAD_IMM64(%r10, die) + mtsprg0 %r10 + r = 14 + .rept 18 + ld r,r*8(%r1) + r = r + 1 + .endr + lwz %r0,13*8(%r1) + mtcr %r0 + ld %r0,256+16(%r1) + addi %r1,%r1,256 + mtlr %r0 + blr + +#define EXCEPTION(nr) \ + .= nr ;\ + mfsprg0 %r0 ;\ + mtctr %r0 ;\ + li %r3,nr ;\ + bctr + + EXCEPTION(0x300) + EXCEPTION(0x380) + EXCEPTION(0x400) + EXCEPTION(0x480) + EXCEPTION(0x500) + EXCEPTION(0x600) + EXCEPTION(0x700) + EXCEPTION(0x800) + EXCEPTION(0x900) + EXCEPTION(0x980) + EXCEPTION(0xa00) + EXCEPTION(0xb00) + EXCEPTION(0xc00) + EXCEPTION(0xd00) + EXCEPTION(0xe00) + EXCEPTION(0xe20) + EXCEPTION(0xe40) + EXCEPTION(0xe60) + EXCEPTION(0xe80) + EXCEPTION(0xf00) + EXCEPTION(0xf20) + EXCEPTION(0xf40) + EXCEPTION(0xf60) + EXCEPTION(0xf80) diff --git a/tests/fpu/powerpc.lds b/tests/fpu/powerpc.lds new file mode 100644 index 0000000..99611ab --- /dev/null +++ b/tests/fpu/powerpc.lds @@ -0,0 +1,27 @@ +SECTIONS +{ + . = 0; + _start = .; + .head : { + KEEP(*(.head)) + } + . = ALIGN(0x1000); + .text : { *(.text) *(.text.*) *(.rodata) *(.rodata.*) } + . = ALIGN(0x1000); + .data : { *(.data) *(.data.*) *(.got) *(.toc) } + . = ALIGN(0x80); + __bss_start = .; + .bss : { + *(.dynsbss) + *(.sbss) + *(.scommon) + *(.dynbss) + *(.bss) + *(.common) + *(.bss.*) + } + . = ALIGN(0x80); + __bss_end = .; + . = . 
+ 0x4000; + __stack_top = .; +} diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin new file mode 100755 index 0000000000000000000000000000000000000000..885368a70cc97cc929c62934658553ed4b131143 GIT binary patch literal 8208 zcmeHMU2GKB6+W}O_O2ac9EqxKp~)D<)xa9G0dzDCB}sfBy#!?Wi|$>;6ySi z54-7XcI>(mRF>*ieQ=}%NTseKL{Tg8zyl9<6Gfzb0V#=)XktwKus?>W*%sJmZ=R4<~duQjGNbe;YNPZ9;NaK3MRtJ(qCS)U!jX*YXgu)}q z#EZu=yJ#f#i)$n4O*C%g{6{mlhse@tAXXL>CsKD1h0PCw;S{ciDH%42NV_AHGS&H* zhQC~2B12@u&m{az!q4Op?HiNHoc~lN9I*G352K0p=6oTmjg+z?!sT4wc59So@1|(} zu1!kiB!w$eWK^apSvgB%m3Qf#%AaVqa*5_EGo)0nP`G-PjOuktR&UZ+HBawU7iqRy zrupjkh^(KIGF8uQo8oXwbxc~|VlkIoe7jYT^I`y>0{9fbrwBep@F{{%5qyf^Qv{zP z_!PmX1U@D3DS=N3d`jR`0-qB2l)%MS?`*W%6eUkeSj+iCmP*LOdV8c9$i>?*&rKHj zO-<#xd(6%AZ?Cd{3aoPctL&c|Rww?)Ty77!^XiWp*LeKFH6DLxjmO`#I`Mn!&t7H! z)Vwo-?GNz->}BxZ(TlqZ+)mM==Bx!<_dcIYxIwLJMiM8jVr{z zf?i)iZ?2%XzefM4<5x)EL1*Jy19f!v)0iPdytjLFAa?kh!>M!M?#gUB7|G=Pk&KFO zqjr$#M#qOfy@-AS9Usu#OtQDehzftht>%8Wnp2MK#%&??Neq-zc0A@_KLqx#H?G#{ zLKt&l|Jf|jxf>Uk(S5n!hVEB4rd^%mgUo5zy)bBIFASMk%h5?<{6~DE$8w}*v_vFx zSWz?e-_@)mHIiIT=jDgbAn(jsp>lEl>uFmJ`I-xsNGxl4qKE3&<)sFKOKSh!kgON;>!dhxyM`j{Odp%oUeChaM zYzwdfU_V;Q2l6ZD!&kt0PJ#3;>#`U(8sOK>C#OWQ75s7Mo@4u0Bklb&;t^t|=gxh{ z{*-;IM~LwqKJcf}35{s3s?u2bHP=7x^UQ(IgB0oj-(f2OzL#N(Xczo*UBnA=d?I2k zh;iL7>thUOLUYyc*4g1${IGM$jfLoCX(MF+z9)cj&Gyy;!OmN-!#vo={1*;aj|`(PTxDyAZ~7I(jE} z{;%fFsf*8-YC@sxq-$H}`Mc6UduM%|6EYWS?bOY|qQKddtg(fj)aEy=XWGP&S7#T4 z0yXwD_}bFuC~W@l-iO(p!N?0u)YQF&#`GbI#4ZLqGWUe&dCv8B1^%Lt=WG4qY^`?g zJ-_Tzh*x$r)6Jf%sbS6Y{T+<`q~r>^W7kXR>ob-p=X_A`sx<2gMJU$_?9$tZhgfw)JVWpN%ht~0;u;HJYa#^o`-#AjtG z)T~^0$T4GX0%O}-f0KkfK(>~uex7|B_pHl9_bl#nzSnWKXluW8<%^JSm-3%W`D?Dc z4Ea=T{_5t9`OkxY!Ht>M46k|h_=Lu}^VZNkJa3Tkg|U{%9JgP|a}0H7eTogGH$RMf zOV%WrCwcFr$SJ~nM^U##U4>1U+w%dk&h1w-3Pq0JM!gap%S+i)rnR$i_<3i)>ngOl z*y01M&e#y?qztWOFX&?lE(qlJl^Tl7u&wnKi|v7aIJ|^u~Cny z&Bt)QYibwTyE2?x)Y*>}>suW71rzPh-l%W~5ml{K9!~}W7kt5HX z3*OIJuh;2lhR@!yN6IQb&`Z2a3`8-6-)?VkDp$ICpUblTX4n0)ZX`(;SRH;{b5 
z*YykiP0R)Beb}p!Q^~8iR9CR*wi8Ewc0$E{!Z>tjLo?AHVfv&{+3(NVQb>4pCU||?ddHo;-Qiig*MWHr4x1+!NZy^>~ ziN-Si-IV%|5GtfZU1TC83jnAHSrw4#@uJ0O1QLp9~2+o+1E<_u;Ap1#mnXSI>wx2C7PG#LF^6_j)*uW9sM$7Ca#j_ z8RfL^LGqojb*<>-|}BKPb^$->ls+jz`Zx literal 0 HcmV?d00001 diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out new file mode 100644 index 0000000..0c39ae3 --- /dev/null +++ b/tests/test_fpu.console_out @@ -0,0 +1,2 @@ +test 01:PASS +test 02:PASS diff --git a/tests/update_console_tests b/tests/update_console_tests index 906b0cc..a5e6ffc 100755 --- a/tests/update_console_tests +++ b/tests/update_console_tests @@ -3,7 +3,7 @@ # Script to update console related tests from source # -for i in sc illegal decrementer xics privileged mmu misc modes reservation trace ; do +for i in sc illegal decrementer xics privileged mmu misc modes reservation trace fpu ; do cd $i make cd - From 9d285a265cf9fab8f5f17d6d4588d9545e555e68 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 28 Aug 2020 13:35:05 +1000 Subject: [PATCH 03/30] core: Add support for single-precision FP loads and stores This adds code to loadstore1 to convert between single-precision and double-precision formats, and implements the lfs* and stfs* instructions. The conversion processes are described in Power ISA v3.1 Book 1 sections 4.6.2 and 4.6.3. These conversions take one cycle, so lfs* and stfs* are one cycle slower than lfd* and stfd*. 
Signed-off-by: Paul Mackerras --- common.vhdl | 3 +- countzero.vhdl | 37 +-------- decode1.vhdl | 16 ++-- execute1.vhdl | 1 + helpers.vhdl | 53 ++++++++++++ loadstore1.vhdl | 210 +++++++++++++++++++++++++++++++++++++++++++++--- 6 files changed, 263 insertions(+), 57 deletions(-) diff --git a/common.vhdl b/common.vhdl index 14bdcf7..e1ba844 100644 --- a/common.vhdl +++ b/common.vhdl @@ -287,6 +287,7 @@ package common is virt_mode : std_ulogic; -- do translation through TLB priv_mode : std_ulogic; -- privileged mode (MSR[PR] = 0) mode_32bit : std_ulogic; -- trim addresses to 32 bits + is_32bit : std_ulogic; end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, @@ -294,7 +295,7 @@ package common is nia => (others => '0'), insn => (others => '0'), addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), write_reg => (others => '0'), length => (others => '0'), - mode_32bit => '0', others => (others => '0')); + mode_32bit => '0', is_32bit => '0', others => (others => '0')); type Loadstore1ToExecute1Type is record busy : std_ulogic; diff --git a/countzero.vhdl b/countzero.vhdl index 18aa043..b46f108 100644 --- a/countzero.vhdl +++ b/countzero.vhdl @@ -3,6 +3,7 @@ use ieee.std_logic_1164.all; use ieee.numeric_std.all; library work; +use work.helpers.all; entity zero_counter is port ( @@ -15,42 +16,6 @@ entity zero_counter is end entity zero_counter; architecture behaviour of zero_counter is - -- Reverse the order of bits in a word - function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is - variable ret: std_ulogic_vector(a'left downto a'right); - begin - for i in a'right to a'left loop - ret(a'left + a'right - i) := a(i); - end loop; - return ret; - end; - - -- If there is only one bit set in a doubleword, return its bit number - -- (counting from the right). 
Each bit of the result is obtained by - -- ORing together 32 bits of the input: - -- bit 0 = a[1] or a[3] or a[5] or ... - -- bit 1 = a[2] or a[3] or a[6] or a[7] or ... - -- bit 2 = a[4..7] or a[12..15] or ... - -- bit 5 = a[32..63] ORed together - function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is - variable ret: std_ulogic_vector(5 downto 0); - variable stride: natural; - variable bit: std_ulogic; - variable k: natural; - begin - stride := 2; - for i in 0 to 5 loop - bit := '0'; - for j in 0 to (64 / stride) - 1 loop - k := j * stride; - bit := bit or (or a(k + stride - 1 downto k + (stride / 2))); - end loop; - ret(i) := bit; - stride := stride * 2; - end loop; - return ret; - end; - signal inp : std_ulogic_vector(63 downto 0); signal sum : std_ulogic_vector(64 downto 0); signal msb_r : std_ulogic; diff --git a/decode1.vhdl b/decode1.vhdl index 75da175..29f0e50 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -74,8 +74,8 @@ architecture behaviour of decode1 is 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu 50 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd 51 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu --- 48 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs --- 49 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu + 48 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs + 49 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', 
is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha 43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau 40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz @@ -93,8 +93,8 @@ architecture behaviour of decode1 is 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu 54 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd 55 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu --- 52 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs --- 53 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu + 52 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs + 53 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, 
'0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw @@ -284,8 +284,8 @@ architecture behaviour of decode1 is 2#1001110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux 2#1101010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax 2#1101110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx --- 2#1000010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx --- 2#1000110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux + 2#1000010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx + 2#1000110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux 2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx 2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux 2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax @@ -367,8 +367,8 @@ architecture behaviour of decode1 is 2#1011010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx 2#1011110111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', 
'0', '0', NONE, '0', '0'), -- stfdux 2#1111010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx --- 2#1010010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx --- 2#1010110111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux + 2#1010010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx + 2#1010110111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx 2#1110110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- sthcx diff --git a/execute1.vhdl b/execute1.vhdl index 4d6a9cc..9d9b711 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1259,6 +1259,7 @@ begin lv.virt_mode := ctrl.msr(MSR_DR); lv.priv_mode := not ctrl.msr(MSR_PR); lv.mode_32bit := not ctrl.msr(MSR_SF); + lv.is_32bit := e_in.is_32bit; -- Update registers rin <= v; diff --git a/helpers.vhdl b/helpers.vhdl index fe91938..834e386 100644 --- a/helpers.vhdl +++ b/helpers.vhdl @@ -25,6 +25,10 @@ package helpers is function byte_reverse(val: std_ulogic_vector(63 downto 0); size: integer) return std_ulogic_vector; function sign_extend(val: std_ulogic_vector(63 downto 0); size: natural) return std_ulogic_vector; + + function bit_reverse(a: std_ulogic_vector) 
return std_ulogic_vector; + function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; + function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector; end package helpers; package body helpers is @@ -206,4 +210,53 @@ package body helpers is return std_ulogic_vector(ret); end; + + -- Reverse the order of bits in a word + function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is + variable ret: std_ulogic_vector(a'left downto a'right); + begin + for i in a'right to a'left loop + ret(a'left + a'right - i) := a(i); + end loop; + return ret; + end; + + -- If there is only one bit set in a doubleword, return its bit number + -- (counting from the right). Each bit of the result is obtained by + -- ORing together 32 bits of the input: + -- bit 0 = a[1] or a[3] or a[5] or ... + -- bit 1 = a[2] or a[3] or a[6] or a[7] or ... + -- bit 2 = a[4..7] or a[12..15] or ... + -- bit 5 = a[32..63] ORed together + function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable ret: std_ulogic_vector(5 downto 0); + variable stride: natural; + variable bit: std_ulogic; + variable k: natural; + begin + stride := 2; + for i in 0 to 5 loop + bit := '0'; + for j in 0 to (64 / stride) - 1 loop + k := j * stride; + bit := bit or (or a(k + stride - 1 downto k + (stride / 2))); + end loop; + ret(i) := bit; + stride := stride * 2; + end loop; + return ret; + end; + + -- Count leading zeroes operation + -- Assumes the value passed in is not zero (if it is, zero is returned) + function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is + variable rev: std_ulogic_vector(val'left downto val'right); + variable sum: std_ulogic_vector(val'left downto val'right); + variable onehot: std_ulogic_vector(val'left downto val'right); + begin + rev := bit_reverse(val); + sum := std_ulogic_vector(- signed(rev)); + onehot := sum and rev; + return bit_number(std_ulogic_vector(resize(unsigned(onehot), 64))); + end; end 
package body helpers; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index ec20319..919ba0e 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -45,10 +45,12 @@ architecture behave of loadstore1 is -- State machine for unaligned loads/stores type state_t is (IDLE, -- ready for instruction + FPR_CONV, -- converting double to float for store SECOND_REQ, -- send 2nd request of unaligned xfer ACK_WAIT, -- waiting for ack from dcache MMU_LOOKUP, -- waiting for MMU to look up translation TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie + FINISH_LFS, -- write back converted SP data for lfs* COMPLETE -- extra cycle to complete an operation ); @@ -89,6 +91,11 @@ architecture behave of loadstore1 is do_update : std_ulogic; extra_cycle : std_ulogic; mode_32bit : std_ulogic; + load_sp : std_ulogic; + ld_sp_data : std_ulogic_vector(31 downto 0); + ld_sp_nz : std_ulogic; + ld_sp_lz : std_ulogic_vector(5 downto 0); + st_sp_data : std_ulogic_vector(31 downto 0); end record; type byte_sel_t is array(0 to 7) of std_ulogic; @@ -98,6 +105,9 @@ architecture behave of loadstore1 is signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); + signal store_sp_data : std_ulogic_vector(31 downto 0); + signal load_dp_data : std_ulogic_vector(63 downto 0); + -- Generate byte enables from sizes function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is begin @@ -128,6 +138,72 @@ architecture behave of loadstore1 is to_integer(unsigned(address)))); end function xfer_data_sel; + -- 23-bit right shifter for DP -> SP float conversions + function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0)) + return std_ulogic_vector is + variable fs1 : std_ulogic_vector(22 downto 0); + variable fs2 : std_ulogic_vector(22 downto 0); + begin + case shift(1 downto 0) is + when "00" => + fs1 := frac; + when "01" => + fs1 := '0' & frac(22 downto 1); + when "10" => + fs1 := "00" & frac(22 downto 2); + when others => + fs1 := 
"000" & frac(22 downto 3); + end case; + case shift(4 downto 2) is + when "000" => + fs2 := fs1; + when "001" => + fs2 := x"0" & fs1(22 downto 4); + when "010" => + fs2 := x"00" & fs1(22 downto 8); + when "011" => + fs2 := x"000" & fs1(22 downto 12); + when "100" => + fs2 := x"0000" & fs1(22 downto 16); + when others => + fs2 := x"00000" & fs1(22 downto 20); + end case; + return fs2; + end; + + -- 23-bit left shifter for SP -> DP float conversions + function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0)) + return std_ulogic_vector is + variable fs1 : std_ulogic_vector(22 downto 0); + variable fs2 : std_ulogic_vector(22 downto 0); + begin + case shift(1 downto 0) is + when "00" => + fs1 := frac; + when "01" => + fs1 := frac(21 downto 0) & '0'; + when "10" => + fs1 := frac(20 downto 0) & "00"; + when others => + fs1 := frac(19 downto 0) & "000"; + end case; + case shift(4 downto 2) is + when "000" => + fs2 := fs1; + when "001" => + fs2 := fs1(18 downto 0) & x"0" ; + when "010" => + fs2 := fs1(14 downto 0) & x"00"; + when "011" => + fs2 := fs1(10 downto 0) & x"000"; + when "100" => + fs2 := fs1(6 downto 0) & x"0000"; + when others => + fs2 := fs1(2 downto 0) & x"00000"; + end case; + return fs2; + end; + begin -- Calculate the address in the first cycle lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0'); @@ -145,6 +221,59 @@ begin end if; end process; + ls_fp_conv: if HAS_FPU generate + -- Convert DP data to SP for stfs + dp_to_sp: process(all) + variable exp : unsigned(10 downto 0); + variable frac : std_ulogic_vector(22 downto 0); + variable shift : unsigned(4 downto 0); + begin + store_sp_data(31) <= l_in.data(63); + store_sp_data(30 downto 0) <= (others => '0'); + exp := unsigned(l_in.data(62 downto 52)); + if exp > 896 then + store_sp_data(30) <= l_in.data(62); + store_sp_data(29 downto 0) <= l_in.data(58 downto 29); + elsif exp >= 874 then + -- denormalization 
required + frac := '1' & l_in.data(51 downto 30); + shift := 0 - exp(4 downto 0); + store_sp_data(22 downto 0) <= shifter_23r(frac, shift); + end if; + end process; + + -- Convert SP data to DP for lfs + sp_to_dp: process(all) + variable exp : unsigned(7 downto 0); + variable exp_dp : unsigned(10 downto 0); + variable exp_nz : std_ulogic; + variable exp_ao : std_ulogic; + variable frac : std_ulogic_vector(22 downto 0); + variable frac_shift : unsigned(4 downto 0); + begin + frac := r.ld_sp_data(22 downto 0); + exp := unsigned(r.ld_sp_data(30 downto 23)); + exp_nz := or (r.ld_sp_data(30 downto 23)); + exp_ao := and (r.ld_sp_data(30 downto 23)); + frac_shift := (others => '0'); + if exp_ao = '1' then + exp_dp := to_unsigned(2047, 11); -- infinity or NaN + elsif exp_nz = '1' then + exp_dp := 896 + resize(exp, 11); -- finite normalized value + elsif r.ld_sp_nz = '0' then + exp_dp := to_unsigned(0, 11); -- zero + else + -- denormalized SP operand, need to normalize + exp_dp := 896 - resize(unsigned(r.ld_sp_lz), 11); + frac_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1; + end if; + load_dp_data(63) <= r.ld_sp_data(31); + load_dp_data(62 downto 52) <= std_ulogic_vector(exp_dp); + load_dp_data(51 downto 29) <= shifter_23l(frac, frac_shift); + load_dp_data(28 downto 0) <= (others => '0'); + end process; + end generate; + loadstore1_1: process(all) variable v : reg_stage_t; variable brev_lenm1 : unsigned(2 downto 0); @@ -165,6 +294,9 @@ begin variable data_permuted : std_ulogic_vector(63 downto 0); variable data_trimmed : std_ulogic_vector(63 downto 0); variable store_data : std_ulogic_vector(63 downto 0); + variable data_in : std_ulogic_vector(63 downto 0); + variable byte_rev : std_ulogic; + variable length : std_ulogic_vector(3 downto 0); variable use_second : byte_sel_t; variable trim_ctl : trim_ctl_t; variable negative : std_ulogic; @@ -176,6 +308,8 @@ begin variable mmu_mtspr : std_ulogic; variable itlb_fault : std_ulogic; variable misaligned : std_ulogic; + variable 
fp_reg_conv : std_ulogic; + variable lfs_done : std_ulogic; begin v := r; req := '0'; @@ -185,8 +319,10 @@ begin sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); dsisr := (others => '0'); mmureq := '0'; + fp_reg_conv := '0'; write_enable := '0'; + lfs_done := '0'; do_update := r.do_update; v.do_update := '0'; @@ -245,19 +381,38 @@ begin end case; end loop; - -- Byte reversing and rotating for stores - -- Done in the first cycle (when l_in.valid = 1) + if HAS_FPU then + -- Single-precision FP conversion + v.st_sp_data := store_sp_data; + v.ld_sp_data := data_trimmed(31 downto 0); + v.ld_sp_nz := or (data_trimmed(22 downto 0)); + v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0)); + end if; + + -- Byte reversing and rotating for stores. + -- Done in the first cycle (when l_in.valid = 1) for integer stores + -- and DP float stores, and in the second cycle for SP float stores. store_data := r.store_data; - if l_in.valid = '1' then - byte_offset := unsigned(lsu_sum(2 downto 0)); + if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then + if HAS_FPU and r.state = FPR_CONV then + data_in := x"00000000" & r.st_sp_data; + byte_offset := unsigned(r.addr(2 downto 0)); + byte_rev := r.byte_reverse; + length := r.length; + else + data_in := l_in.data; + byte_offset := unsigned(lsu_sum(2 downto 0)); + byte_rev := l_in.byte_reverse; + length := l_in.length; + end if; brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + if byte_rev = '1' then + brev_lenm1 := unsigned(length(2 downto 0)) - 1; end if; for i in 0 to 7 loop k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1; j := to_integer(k) * 8; - store_data(i * 8 + 7 downto i * 8) := l_in.data(j + 7 downto j); + store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j); end loop; end if; v.store_data := store_data; @@ -292,6 +447,14 @@ begin case r.state is when IDLE => + when FPR_CONV => + req := '1'; + if r.second_bytes /= 
"00000000" then + v.state := SECOND_REQ; + else + v.state := ACK_WAIT; + end if; + when SECOND_REQ => req := '1'; v.state := ACK_WAIT; @@ -323,8 +486,13 @@ begin v.load_data := data_permuted; end if; else - write_enable := r.load; - if r.extra_cycle = '1' then + write_enable := r.load and not r.load_sp; + if HAS_FPU and r.load_sp = '1' then + -- SP to DP conversion takes a cycle + -- Write back rA update in this cycle if needed + do_update := r.update; + v.state := FINISH_LFS; + elsif r.extra_cycle = '1' then -- loads with rA update need an extra cycle v.state := COMPLETE; v.do_update := r.update; @@ -362,6 +530,9 @@ begin when TLBIE_WAIT => + when FINISH_LFS => + lfs_done := '1'; + when COMPLETE => exception := r.align_intr; @@ -395,6 +566,7 @@ begin v.nc := l_in.ci; v.virt_mode := l_in.virt_mode; v.priv_mode := l_in.priv_mode; + v.load_sp := '0'; v.wait_dcache := '0'; v.wait_mmu := '0'; v.do_update := '0'; @@ -436,14 +608,24 @@ begin v.dcbz := '1'; when OP_FPSTORE => if HAS_FPU then - req := '1'; + if l_in.is_32bit = '1' then + v.state := FPR_CONV; + fp_reg_conv := '1'; + else + req := '1'; + end if; end if; when OP_FPLOAD => if HAS_FPU then v.load := '1'; req := '1'; - -- Allow an extra cycle for RA update + -- Allow an extra cycle for SP->DP precision conversion + -- or RA update v.extra_cycle := l_in.update; + if l_in.is_32bit = '1' then + v.load_sp := '1'; + v.extra_cycle := '1'; + end if; end if; when OP_TLBIE => mmureq := '1'; @@ -500,7 +682,7 @@ begin end if; end if; - v.busy := req or mmureq or mmu_mtspr; + v.busy := req or mmureq or mmu_mtspr or fp_reg_conv; end if; -- Update outputs to dcache @@ -539,6 +721,10 @@ begin l_out.write_enable <= '1'; l_out.write_reg <= gpr_to_gspr(r.update_reg); l_out.write_data <= r.addr; + elsif lfs_done = '1' then + l_out.write_enable <= '1'; + l_out.write_reg <= r.write_reg; + l_out.write_data <= load_dp_data; else l_out.write_enable <= write_enable; l_out.write_reg <= r.write_reg; From 
76ec1a2f0aba7863d5704cf56f9bf07e1435cdaf Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 2 Jul 2020 19:55:30 +1000 Subject: [PATCH 04/30] tests/fpu: Add tests for lfs and stfs instructions This exercises the single-to-double and double-to-single conversions, including denormalized cases. Signed-off-by: Paul Mackerras --- tests/fpu/fpu.c | 69 ++++++++++++++++++++++++++++++++++++- tests/test_fpu.bin | Bin 8208 -> 8384 bytes tests/test_fpu.console_out | 1 + 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index d61b36e..86636b6 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -163,6 +163,72 @@ int fpu_test_2(void) return 0; } +struct sp_dp_equiv { + unsigned int sp; + unsigned long dp; +} sp_dp_equiv[] = { + { 0, 0 }, + { 0x80000000, 0x8000000000000000 }, + { 0x7f800000, 0x7ff0000000000000 }, + { 0xff800000, 0xfff0000000000000 }, + { 0x7f812345, 0x7ff02468a0000000 }, + { 0x456789ab, 0x40acf13560000000 }, + { 0x12345678, 0x3a468acf00000000 }, + { 0x00400000, 0x3800000000000000 }, + { 0x00200000, 0x37f0000000000000 }, + { 0x00000002, 0x36b0000000000000 }, + { 0x00000001, 0x36a0000000000000 }, +}; + +int sp_to_dp(long arg) +{ + unsigned long dp; + + __asm__("lfs 20,0(%0); stfd 20,0(%1)" + : : "b" (&sp_dp_equiv[arg].sp), "b" (&dp) : "memory"); + if (dp != sp_dp_equiv[arg].dp) { + print_hex(sp_dp_equiv[arg].sp, 8); + print_string(" "); + print_hex(dp, 16); + print_string(" "); + print_hex(sp_dp_equiv[arg].dp, 16); + print_string(" "); + } + return dp != sp_dp_equiv[arg].dp; +} + +int dp_to_sp(long arg) +{ + unsigned int sp; + + __asm__("lfd 21,0(%0); stfs 21,0(%1)" + : : "b" (&sp_dp_equiv[arg].dp), "b" (&sp) : "memory"); + return sp != sp_dp_equiv[arg].sp; +} + +int fpu_test_3(void) +{ + int i, n, ret; + + n = sizeof(sp_dp_equiv) / sizeof(sp_dp_equiv[0]); + enable_fp(); + for (i = 0; i < n; ++i) { + ret = trapit(i, sp_to_dp); + if (ret != 0) { + if (ret == 1) + ret += i; + return ret; + } + ret = trapit(i, 
dp_to_sp); + if (ret != 0) { + if (ret == 1) + ret += i + 0x10000; + return ret; + } + } + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -176,7 +242,7 @@ void do_test(int num, int (*test)(void)) } else { fail = 1; print_string("FAIL "); - print_hex(ret, 4); + print_hex(ret, 5); print_string(" SRR0="); print_hex(mfspr(SRR0), 16); print_string(" SRR1="); @@ -191,6 +257,7 @@ int main(void) do_test(1, fpu_test_1); do_test(2, fpu_test_2); + do_test(3, fpu_test_3); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 885368a70cc97cc929c62934658553ed4b131143..fb2de320a0017dc482942a6d0b32d6b5b480a309 100755 GIT binary patch delta 1747 zcmZuxZERCz6h8NMbQR{jZAa<4QMwNnHYYA@%)1p?tBlM}$3h6vAI8px2tyN8esrVk z-b?lah#^~ELuN8yAd;y>x5y7;g4r-J42>T|jC_C1WX;A>P_bCx^?BP{gv67a+;g7y zd7pdEdEa~c0%O6&M1g0CLLJ}chPv?FKqa9LqA;+{z%~Qhyg~0@Tr27Quhhy{T%&t_ zcw(7ilr|8>&(Cn)N?;3DjOQ9i7x=_?M5oUG^vD|9qj*XM#Rkf3o30V6Z#GkQSJ9?e9_mRFG1-q#&b6_S!aW6*Cwq#1*+} z#x;25WDd2k8K;}+1$Ej0gvHb$K9%HGr;k`{v6DD8iF_CWg*ktH#Sl1Tl=hDTpMwUY zsq5Me#A0n2Yabv!B6r6`^Jh$a1h|pLx25spCjKSxzW)x1ZFAZ~qOq%J?2cI^ZV^s5 ztZsR`S6v9M$Yta&zSFBdZsr!`JKyb9@1+M{qKaB&gZ0d>m|L9wnkj5E*k|*59!~3B zhomq&>-s#u*tn*WFVx|qP#1W67c{Q68Kv=~ITZV34Q&^(e8Sd+FR9^S%tC>Nya!ky z%~Kx9nakbg%2ZHZO;ouHx9Dk69NyCHUN8CBguAx*3O7i}Q8?zyub6&j9n`DNJDN;7MS`0)UmnH;Ujk+gn&_?L&0z zHv77;qERT03ICh(VLa*i$AT0b%-LtzFCUQX!Jc4#VC~#|3SPc+@q#rtPbekeYLd-D z$t@ARoMbLfX@gK&h4e7~E+dHlN2&d_d}_`JWop_AemM(z%B+KkqIX!Q$6GLl(EShR zTZBk+z7IvLVS}D>X&oE)IOm_lQmxWxj=*|a9udCY6uDw*!%E- zyI-spYvGd){1TWY>o_mM64NcfHYUu7{W3 pJQ|3uuXAMiKki=Ln5i*~5H6|jK?1#JWb!_g(@4EpuL%y-{|jszH2nYo delta 1074 zcmY+CUr1AN6vxlKnp08R=9=xV^XjD4Ak&b$vbsvenq~!A^tL&Q!iwr?)@(OXu!z_= zJxJ(F4T4qpJkPppSn53!zu5OX#O5IfK$$}4VNnpoZ#RV5F=Z1OW>?Lcbo zQMJtjS^o4nz_a9w|7v(iYnHgAm?WuAH3D!UIh>c|+JS#q#b;VGuGbwFx^Pry5j|h= zp3W|szT#V*ef59_yrIl=L;%11C6(K`*K#CW2K}vt;0nC@+YTjtv+-LIcyFZ`^3MHtuDi_R2I=-Y%T)KxawZ+!W4qFOiatAU_@YrZdy*In6& zEe1#VlV#w2AcB`4d>SW>793kqj28`c(g~_GFFuWI*0{P#pu}y3bIN{jdG2Ux`xY77 
zxJSg7hSelD?0gXl%32W8lQg7__IQ(7iZnn(E(KP7M-vRs3%Lz;&qcxw5R^H4muDO3 z4kq()1JuZmu-v#SlAw+XWz=;h&89xZ3yQ-bI=TQ3`d{E@th?nsX;f8;1&9}dDu*&3 zBcl}a+6>ipP#hOehqY82%=0$3bzmLJRWH1TNecBnmI(r9X&%}>Qxe+&Opj*|m2 zKON6S$`>gwr`$nv@|pToY_6&=9mzf&@DHhYsOs#$X{zd~up475Z&r+`A7r5S=>T6Z z={ijtwhyV&>{ua$ukV6B9*Yl)d+9BB{$&NAc*_1VC9Bi^m*a$~)0mr}&JEx?4{&}h zAC7iuH1ETx*^T37118Mg&DXv&Hcd5dT5{4u=m%pCTDftikPi^E(#oxld@uRMA9%_l z6@@8O(s;#EU&pFC`Tqd{v YVk~s~oNZ9N9knT9X?lzBdWmtxZwLKkHUIzs diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index 0c39ae3..623335d 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -1,2 +1,3 @@ test 01:PASS test 02:PASS +test 03:PASS From 856e9e955f0e5ddcd64c6d328f279e12a5973574 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 28 Aug 2020 20:01:00 +1000 Subject: [PATCH 05/30] core: Add framework for an FPU This adds the skeleton of a floating-point unit and implements the mffs and mtfsf instructions. Execute1 sends FP instructions to the FPU and receives busy, exception, FP interrupt and illegal interrupt signals from it. 
Signed-off-by: Paul Mackerras --- Makefile | 2 +- common.vhdl | 69 ++++++++++++++ core.vhdl | 34 +++++++ decode1.vhdl | 18 ++++ decode2.vhdl | 11 ++- decode_types.vhdl | 9 +- execute1.vhdl | 82 +++++++++++++---- fpu.vhdl | 185 ++++++++++++++++++++++++++++++++++++++ microwatt.core | 1 + scripts/fmt_log/fmt_log.c | 12 +-- writeback.vhdl | 27 +++++- 11 files changed, 417 insertions(+), 33 deletions(-) create mode 100644 fpu.vhdl diff --git a/Makefile b/Makefile index b584895..9fe2106 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \ loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \ - core.vhdl + core.vhdl fpu.vhdl soc_files = $(core_files) wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \ wishbone_debug_master.vhdl xics.vhdl syscon.vhdl soc.vhdl \ diff --git a/common.vhdl b/common.vhdl index e1ba844..f91ac18 100644 --- a/common.vhdl +++ b/common.vhdl @@ -94,6 +94,38 @@ package common is end record; constant xerc_init : xer_common_t := (others => '0'); + -- FPSCR bit numbers + constant FPSCR_FX : integer := 63 - 32; + constant FPSCR_FEX : integer := 63 - 33; + constant FPSCR_VX : integer := 63 - 34; + constant FPSCR_OX : integer := 63 - 35; + constant FPSCR_UX : integer := 63 - 36; + constant FPSCR_ZX : integer := 63 - 37; + constant FPSCR_XX : integer := 63 - 38; + constant FPSCR_VXSNAN : integer := 63 - 39; + constant FPSCR_VXISI : integer := 63 - 40; + constant FPSCR_VXIDI : integer := 63 - 41; + constant FPSCR_VXZDZ : integer := 63 - 42; + constant FPSCR_VXIMZ : integer := 63 - 43; + constant FPSCR_VXVC : integer := 63 - 44; + constant FPSCR_FR : integer := 63 - 45; + constant FPSCR_FI : integer := 63 - 46; + constant FPSCR_C : integer := 63 - 47; + constant FPSCR_FL : integer := 63 - 48; + constant FPSCR_FG : integer 
:= 63 - 49; + constant FPSCR_FE : integer := 63 - 50; + constant FPSCR_FU : integer := 63 - 51; + constant FPSCR_VXSOFT : integer := 63 - 53; + constant FPSCR_VXSQRT : integer := 63 - 54; + constant FPSCR_VXCVI : integer := 63 - 55; + constant FPSCR_VE : integer := 63 - 56; + constant FPSCR_OE : integer := 63 - 57; + constant FPSCR_UE : integer := 63 - 58; + constant FPSCR_ZE : integer := 63 - 59; + constant FPSCR_XE : integer := 63 - 60; + constant FPSCR_NI : integer := 63 - 61; + constant FPSCR_RN : integer := 63 - 63; + type irq_state_t is (WRITE_SRR0, WRITE_SRR1); -- For now, fixed 16 sources, make this either a parametric @@ -413,6 +445,43 @@ package common is write_cr_data => (others => '0'), write_reg => (others => '0'), exc_write_reg => (others => '0'), exc_write_data => (others => '0')); + type Execute1ToFPUType is record + valid : std_ulogic; + op : insn_type_t; + nia : std_ulogic_vector(63 downto 0); + insn : std_ulogic_vector(31 downto 0); + single : std_ulogic; + fe_mode : std_ulogic_vector(1 downto 0); + fra : std_ulogic_vector(63 downto 0); + frb : std_ulogic_vector(63 downto 0); + frc : std_ulogic_vector(63 downto 0); + frt : gspr_index_t; + rc : std_ulogic; + out_cr : std_ulogic; + end record; + constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'), + insn => (others => '0'), fe_mode => "00", rc => '0', + fra => (others => '0'), frb => (others => '0'), + frc => (others => '0'), frt => (others => '0'), + single => '0', out_cr => '0'); + + type FPUToExecute1Type is record + busy : std_ulogic; + exception : std_ulogic; + interrupt : std_ulogic; + illegal : std_ulogic; + end record; + + type FPUToWritebackType is record + valid : std_ulogic; + write_enable : std_ulogic; + write_reg : gspr_index_t; + write_data : std_ulogic_vector(63 downto 0); + write_cr_enable : std_ulogic; + write_cr_mask : std_ulogic_vector(7 downto 0); + write_cr_data : std_ulogic_vector(31 downto 0); + end record; + type 
DividerToExecute1Type is record valid: std_ulogic; write_reg_data: std_ulogic_vector(63 downto 0); diff --git a/core.vhdl b/core.vhdl index 81e11c8..b905297 100644 --- a/core.vhdl +++ b/core.vhdl @@ -80,6 +80,11 @@ architecture behave of core is signal mmu_to_dcache: MmuToDcacheType; signal dcache_to_mmu: DcacheToMmuType; + -- FPU signals + signal execute1_to_fpu: Execute1ToFPUType; + signal fpu_to_execute1: FPUToExecute1Type; + signal fpu_to_writeback: FPUToWritebackType; + -- local signals signal fetch1_stall_in : std_ulogic; signal icache_stall_out : std_ulogic; @@ -109,6 +114,7 @@ architecture behave of core is signal rst_dec1 : std_ulogic := '1'; signal rst_dec2 : std_ulogic := '1'; signal rst_ex1 : std_ulogic := '1'; + signal rst_fpu : std_ulogic := '1'; signal rst_ls1 : std_ulogic := '1'; signal rst_dbg : std_ulogic := '1'; signal alt_reset_d : std_ulogic; @@ -171,6 +177,7 @@ begin rst_dec1 <= core_rst; rst_dec2 <= core_rst; rst_ex1 <= core_rst; + rst_fpu <= core_rst; rst_ls1 <= core_rst; rst_dbg <= rst; alt_reset_d <= alt_reset; @@ -225,6 +232,7 @@ begin decode1_0: entity work.decode1 generic map( + HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH ) port map ( @@ -313,9 +321,11 @@ begin busy_out => ex1_busy_out, e_in => decode2_to_execute1, l_in => loadstore1_to_execute1, + fp_in => fpu_to_execute1, ext_irq_in => ext_irq, l_out => execute1_to_loadstore1, f_out => execute1_to_fetch1, + fp_out => execute1_to_fpu, e_out => execute1_to_writeback, icache_inval => ex1_icache_inval, dbg_msr_out => msr, @@ -326,6 +336,29 @@ begin log_wr_addr => log_wr_addr ); + with_fpu: if HAS_FPU generate + begin + fpu_0: entity work.fpu + port map ( + clk => clk, + rst => rst_fpu, + e_in => execute1_to_fpu, + e_out => fpu_to_execute1, + w_out => fpu_to_writeback + ); + end generate; + + no_fpu: if not HAS_FPU generate + begin + fpu_to_execute1.busy <= '0'; + fpu_to_execute1.exception <= '0'; + fpu_to_execute1.interrupt <= '0'; + fpu_to_execute1.illegal <= '0'; + 
fpu_to_writeback.valid <= '0'; + fpu_to_writeback.write_enable <= '0'; + fpu_to_writeback.write_cr_enable <= '0'; + end generate; + loadstore1_0: entity work.loadstore1 generic map ( HAS_FPU => HAS_FPU, @@ -381,6 +414,7 @@ begin clk => clk, e_in => execute1_to_writeback, l_in => loadstore1_to_writeback, + fp_in => fpu_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, complete_out => complete diff --git a/decode1.vhdl b/decode1.vhdl index 29f0e50..afd37ef 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -8,6 +8,7 @@ use work.decode_types.all; entity decode1 is generic ( + HAS_FPU : boolean := true; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -55,6 +56,7 @@ architecture behaviour of decode1 is type op_30_subop_array_t is array(0 to 15) of decode_rom_t; type op_31_subop_array_t is array(0 to 1023) of decode_rom_t; type minor_rom_array_2_t is array(0 to 3) of decode_rom_t; + type op_63_subop_array_0_t is array(0 to 511) of decode_rom_t; constant major_decode_rom_array : major_rom_array_t := ( -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl @@ -416,6 +418,15 @@ architecture behaviour of decode1 is others => decode_rom_init ); + -- indexed by bits 4..1 and 10..6 of instruction word + constant decode_op_63l_array : op_63_subop_array_0_t := ( + -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl + -- op in out A out in out len ext pipe + 2#011110010# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 18/7=mffs family + 2#011110110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 22/7=mtfsf + others => illegal_inst + ); + -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe constant nop_instr : 
decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); @@ -569,6 +580,13 @@ begin when 62 => v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0)))); + when 63 => + if HAS_FPU then + -- floating point operations, general and double-precision + v.decode := decode_op_63l_array(to_integer(unsigned(f_in.insn(4 downto 1) & f_in.insn(10 downto 6)))); + vi.override := f_in.insn(5); + end if; + when others => end case; diff --git a/decode2.vhdl b/decode2.vhdl index 6cc74c7..8b2ab8c 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -93,6 +93,12 @@ architecture behaviour of decode2 is case t is when RB => ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data); + when FRB => + if HAS_FPU then + ret := ('1', fpr_to_gspr(insn_frb(insn_in)), reg_data); + else + ret := ('0', (others => '0'), (others => '0')); + end if; when CONST_UI => ret := ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64))); when CONST_SI => @@ -296,6 +302,7 @@ begin r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR else gpr_to_gspr(insn_ra(d_in.insn)); r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR + else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU else gpr_to_gspr(insn_rb(d_in.insn)); r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU @@ -321,7 +328,7 @@ begin mul_b := (others => '0'); --v.e.input_cr := d_in.decode.input_cr; - --v.e.output_cr := d_in.decode.output_cr; + v.e.output_cr := d_in.decode.output_cr; decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1, d_in.nia); @@ -412,7 +419,7 @@ begin cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); cr_bypass_avail <= '0'; - if EX1_BYPASS then + if EX1_BYPASS and 
d_in.decode.unit = ALU then cr_bypass_avail <= d_in.decode.output_cr; end if; diff --git a/decode_types.vhdl b/decode_types.vhdl index 8c20441..5eaef50 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -7,8 +7,9 @@ package decode_types is OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB, OP_CNTZ, OP_CROP, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, - OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, - OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, + OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI, + OP_FPOP, OP_FPOP_I, + OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, OP_LOAD, OP_STORE, OP_FPLOAD, OP_FPSTORE, OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD, @@ -24,7 +25,7 @@ package decode_types is ); type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, - CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR); + CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB); type input_reg_c_t is (NONE, RS, RCR, FRS); type output_reg_a_t is (NONE, RT, RA, SPR, FRT); type rc_t is (NONE, ONE, RC); @@ -48,7 +49,7 @@ package decode_types is constant TOO_OFFSET : integer := 0; - type unit_t is (NONE, ALU, LDST); + type unit_t is (NONE, ALU, LDST, FPU); type length_t is (NONE, is1B, is2B, is4B, is8B); type decode_rom_t is record diff --git a/execute1.vhdl b/execute1.vhdl index 9d9b711..29713b2 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -27,12 +27,14 @@ entity execute1 is e_in : in Decode2ToExecute1Type; l_in : in Loadstore1ToExecute1Type; + fp_in : in FPUToExecute1Type; ext_irq_in : std_ulogic; -- asynchronous l_out : out Execute1ToLoadstore1Type; f_out : out Execute1ToFetch1Type; + fp_out : out Execute1ToFPUType; e_out : out Execute1ToWritebackType; @@ -54,6 +56,7 @@ architecture behaviour of execute1 is f : Execute1ToFetch1Type; busy: std_ulogic; terminate: std_ulogic; + fp_exception_next : std_ulogic; trace_next : std_ulogic; prev_op : insn_type_t; lr_update : 
std_ulogic; @@ -72,7 +75,8 @@ architecture behaviour of execute1 is end record; constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, f => Execute1ToFetch1Init, - busy => '0', lr_update => '0', terminate => '0', trace_next => '0', prev_op => OP_ILLEGAL, + busy => '0', lr_update => '0', terminate => '0', + fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0')); @@ -268,7 +272,7 @@ begin b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3; - busy_out <= l_in.busy or r.busy; + busy_out <= l_in.busy or r.busy or fp_in.busy; valid_in <= e_in.valid and not busy_out; terminate_out <= r.terminate; @@ -334,6 +338,7 @@ begin variable spr_val : std_ulogic_vector(63 downto 0); variable addend : std_ulogic_vector(127 downto 0); variable do_trace : std_ulogic; + variable fv : Execute1ToFPUType; begin result := (others => '0'); sum_with_carry := (others => '0'); @@ -347,6 +352,7 @@ begin v.e := Execute1ToWritebackInit; lv := Execute1ToLoadstore1Init; v.f.redirect := '0'; + fv := Execute1ToFPUInit; -- XER forwarding. To avoid having to track XER hazards, we -- use the previously latched value. 
@@ -522,9 +528,11 @@ begin exception_nextpc := '0'; v.e.exc_write_enable := '0'; v.e.exc_write_reg := fast_spr_num(SPR_SRR0); - v.e.exc_write_data := e_in.nia; if valid_in = '1' then + v.e.exc_write_data := e_in.nia; v.last_nia := e_in.nia; + else + v.e.exc_write_data := r.last_nia; end if; v.e.mode_32bit := not ctrl.msr(MSR_SF); @@ -552,18 +560,27 @@ begin ctrl_tmp.msr(MSR_LE) <= '1'; v.e.valid := '1'; v.trace_next := '0'; + v.fp_exception_next := '0'; report "Writing SRR1: " & to_hstring(ctrl.srr1); - elsif r.trace_next = '1' and valid_in = '1' then - -- Generate a trace interrupt rather than executing the next instruction - -- or taking any asynchronous interrupt - v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64)); - ctrl_tmp.srr1(63 - 33) <= '1'; - if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or - r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then - ctrl_tmp.srr1(63 - 35) <= '1'; - elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then - ctrl_tmp.srr1(63 - 36) <= '1'; + elsif valid_in = '1' and ((HAS_FPU and r.fp_exception_next = '1') or r.trace_next = '1') then + if HAS_FPU and r.fp_exception_next = '1' then + -- This is used for FP-type program interrupts that + -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. 
+ v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + ctrl_tmp.srr1(63 - 43) <= '1'; + ctrl_tmp.srr1(63 - 47) <= '1'; + else + -- Generate a trace interrupt rather than executing the next instruction + -- or taking any asynchronous interrupt + v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64)); + ctrl_tmp.srr1(63 - 33) <= '1'; + if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or + r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then + ctrl_tmp.srr1(63 - 35) <= '1'; + elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then + ctrl_tmp.srr1(63 - 36) <= '1'; + end if; end if; exception := '1'; @@ -589,7 +606,7 @@ begin illegal := '1'; elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and - (e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then + (e_in.unit = FPU or e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then -- generate a floating-point unavailable interrupt exception := '1'; v.f.redirect_nia := std_logic_vector(to_unsigned(16#800#, 64)); @@ -809,6 +826,10 @@ begin is_branch := '1'; taken_branch := '1'; abs_branch := '1'; + if HAS_FPU then + v.fp_exception_next := fp_in.exception and + (a_in(MSR_FE0) or a_in(MSR_FE1)); + end if; do_trace := '0'; when OP_CNTZ => @@ -980,6 +1001,10 @@ begin ctrl_tmp.msr(MSR_IR) <= '1'; ctrl_tmp.msr(MSR_DR) <= '1'; end if; + if HAS_FPU then + v.fp_exception_next := fp_in.exception and + (c_in(MSR_FE0) or c_in(MSR_FE1)); + end if; end if; when OP_MTSPR => report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & @@ -1096,6 +1121,8 @@ begin lv.valid := '1'; elsif e_in.unit = NONE then illegal := '1'; + elsif HAS_FPU and e_in.unit = FPU then + fv.valid := '1'; end if; elsif r.f.redirect = '1' then @@ -1170,7 +1197,17 @@ begin v.e.valid := '1'; end if; - if illegal = '1' then + -- Generate FP-type program interrupt. fp_in.interrupt will only + -- be set during the execution of a FP instruction. 
+ -- The case where MSR[FE0,FE1] goes from zero to non-zero is + -- handled above by mtmsrd and rfid setting v.fp_exception_next. + if HAS_FPU and fp_in.interrupt = '1' then + v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + ctrl_tmp.srr1(63 - 43) <= '1'; + exception := '1'; + end if; + + if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then exception := '1'; v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); -- Since we aren't doing Hypervisor emulation assist (0xe40) we @@ -1216,7 +1253,6 @@ begin end if; v.e.exc_write_enable := '1'; v.e.exc_write_reg := fast_spr_num(SPR_SRR0); - v.e.exc_write_data := r.last_nia; report "ldst exception writing srr0=" & to_hstring(r.last_nia); end if; @@ -1261,6 +1297,19 @@ begin lv.mode_32bit := not ctrl.msr(MSR_SF); lv.is_32bit := e_in.is_32bit; + -- Outputs to FPU + fv.op := e_in.insn_type; + fv.nia := e_in.nia; + fv.insn := e_in.insn; + fv.single := e_in.is_32bit; + fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1); + fv.fra := a_in; + fv.frb := b_in; + fv.frc := c_in; + fv.frt := e_in.write_reg; + fv.rc := e_in.rc; + fv.out_cr := e_in.output_cr; + -- Update registers rin <= v; @@ -1268,6 +1317,7 @@ begin f_out <= r.f; l_out <= lv; e_out <= r.e; + fp_out <= fv; flush_out <= f_out.redirect; exception_log <= exception; diff --git a/fpu.vhdl b/fpu.vhdl new file mode 100644 index 0000000..b05ec9d --- /dev/null +++ b/fpu.vhdl @@ -0,0 +1,185 @@ +-- Floating-point unit for Microwatt + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.insn_helpers.all; +use work.decode_types.all; +use work.crhelpers.all; +use work.helpers.all; +use work.common.all; + +entity fpu is + port ( + clk : in std_ulogic; + rst : in std_ulogic; + + e_in : in Execute1toFPUType; + e_out : out FPUToExecute1Type; + + w_out : out FPUToWritebackType + ); +end entity fpu; + +architecture behaviour of fpu is + + type state_t is (IDLE, + DO_MFFS, DO_MTFSF); + + type reg_type is record 
+ state : state_t; + busy : std_ulogic; + instr_done : std_ulogic; + do_intr : std_ulogic; + op : insn_type_t; + insn : std_ulogic_vector(31 downto 0); + dest_fpr : gspr_index_t; + fe_mode : std_ulogic; + rc : std_ulogic; + is_cmp : std_ulogic; + single_prec : std_ulogic; + fpscr : std_ulogic_vector(31 downto 0); + b : std_ulogic_vector(63 downto 0); + writing_back : std_ulogic; + cr_result : std_ulogic_vector(3 downto 0); + cr_mask : std_ulogic_vector(7 downto 0); + end record; + + signal r, rin : reg_type; + + signal fp_result : std_ulogic_vector(63 downto 0); + +begin + fpu_0: process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r.state <= IDLE; + r.busy <= '0'; + r.instr_done <= '0'; + r.do_intr <= '0'; + r.fpscr <= (others => '0'); + r.writing_back <= '0'; + else + assert not (r.state /= IDLE and e_in.valid = '1') severity failure; + r <= rin; + end if; + end if; + end process; + + e_out.busy <= r.busy; + e_out.exception <= r.fpscr(FPSCR_FEX); + e_out.interrupt <= r.do_intr; + + w_out.valid <= r.instr_done and not r.do_intr; + w_out.write_enable <= r.writing_back; + w_out.write_reg <= r.dest_fpr; + w_out.write_data <= fp_result; + w_out.write_cr_enable <= r.instr_done and r.rc; + w_out.write_cr_mask <= r.cr_mask; + w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result & + r.cr_result & r.cr_result & r.cr_result & r.cr_result; + + fpu_1: process(all) + variable v : reg_type; + variable illegal : std_ulogic; + variable j, k : integer; + variable flm : std_ulogic_vector(7 downto 0); + begin + v := r; + illegal := '0'; + v.busy := '0'; + + -- capture incoming instruction + if e_in.valid = '1' then + v.insn := e_in.insn; + v.op := e_in.op; + v.fe_mode := or (e_in.fe_mode); + v.dest_fpr := e_in.frt; + v.single_prec := e_in.single; + v.rc := e_in.rc; + v.is_cmp := e_in.out_cr; + v.cr_mask := num_to_fxm(1); + v.b := e_in.frb; + end if; + + v.writing_back := '0'; + v.instr_done := '0'; + + case r.state is + when IDLE => + if 
e_in.valid = '1' then + case e_in.insn(5 downto 1) is + when "00111" => + if e_in.insn(8) = '0' then + v.state := DO_MFFS; + else + v.state := DO_MTFSF; + end if; + when others => + illegal := '1'; + end case; + end if; + + when DO_MFFS => + v.writing_back := '1'; + case r.insn(20 downto 16) is + when "00000" => + -- mffs + when others => + illegal := '1'; + end case; + v.instr_done := '1'; + v.state := IDLE; + + when DO_MTFSF => + if r.insn(25) = '1' then + flm := x"FF"; + elsif r.insn(16) = '1' then + flm := x"00"; + else + flm := r.insn(24 downto 17); + end if; + for i in 0 to 7 loop + k := i * 4; + if flm(i) = '1' then + v.fpscr(k + 3 downto k) := r.b(k + 3 downto k); + end if; + end loop; + v.instr_done := '1'; + v.state := IDLE; + + end case; + + -- Data path. + -- Just enough to read FPSCR for now. + fp_result <= x"00000000" & r.fpscr; + + v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or + (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI))); + v.fpscr(FPSCR_FEX) := or (v.fpscr(FPSCR_VX downto FPSCR_XX) and + v.fpscr(FPSCR_VE downto FPSCR_XE)); + if r.rc = '1' then + v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); + end if; + + if illegal = '1' then + v.instr_done := '0'; + v.do_intr := '0'; + v.writing_back := '0'; + v.busy := '0'; + v.state := IDLE; + else + v.do_intr := v.instr_done and v.fpscr(FPSCR_FEX) and r.fe_mode; + if v.state /= IDLE or v.do_intr = '1' then + v.busy := '1'; + end if; + end if; + + rin <= v; + e_out.illegal <= illegal; + end process; + +end architecture behaviour; diff --git a/microwatt.core b/microwatt.core index 3b47339..7f2068d 100644 --- a/microwatt.core +++ b/microwatt.core @@ -23,6 +23,7 @@ filesets: - cr_hazard.vhdl - control.vhdl - execute1.vhdl + - fpu.vhdl - loadstore1.vhdl - mmu.vhdl - dcache.vhdl diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c index eca4bf0..c61c8a5 100644 --- a/scripts/fmt_log/fmt_log.c +++ b/scripts/fmt_log/fmt_log.c @@ -84,17 +84,17 @@ struct log_entry { #define 
FLGA(i, y, z) (log.i? y: z) #define PNIA(f) (full_nia[log.f] & 0xff) -const char *units[4] = { "--", "al", "ls", "?3" }; +const char *units[4] = { "--", "al", "ls", "fp" }; const char *ops[64] = { "illegal", "nop ", "add ", "and ", "attn ", "b ", "bc ", "bcreg ", "bperm ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", "darn ", "dcbf ", "dcbst ", "dcbt ", "dcbtst ", "dcbz ", "div ", "dive ", "exts ", - "extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "fpload ", - "fpstore", "mcrxrx ", "mfcr ", "mfmsr ", "mfspr ", "mod ", "mtcrf ", "mtmsr ", - "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ", "prty ", "rfid ", - "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", "shr ", "sync ", - "tlbie ", "trap ", "xor ", "bcd ", "addg6s ", "ffail ", "?62 ", "?63 " + "extswsl", "fpop ", "fpopi ", "icbi ", "icbt ", "isel ", "isync ", "ld ", + "st ", "fpload ", "fpstore", "mcrxrx ", "mfcr ", "mfmsr ", "mfspr ", "mod ", + "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ", + "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", + "shr ", "sync ", "tlbie ", "trap ", "xor ", "bcd ", "addg6s ", "ffail ", }; const char *spr_names[13] = diff --git a/writeback.vhdl b/writeback.vhdl index d0230d8..95de0ec 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -12,6 +12,7 @@ entity writeback is e_in : in Execute1ToWritebackType; l_in : in Loadstore1ToWritebackType; + fp_in : in FPUToWritebackType; w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType; @@ -31,15 +32,21 @@ begin -- Do consistency checks only on the clock edge x(0) := e_in.valid; y(0) := l_in.valid; - assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; + w(0) := fp_in.valid; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + + to_integer(unsigned(w))) <= 1 severity failure; x(0) := e_in.write_enable or e_in.exc_write_enable; y(0) := l_in.write_enable; - assert (to_integer(unsigned(x)) + 
to_integer(unsigned(y))) <= 1 severity failure; + w(0) := fp_in.write_enable; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + + to_integer(unsigned(w))) <= 1 severity failure; w(0) := e_in.write_cr_enable; x(0) := (e_in.write_enable and e_in.rc); - assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure; + y(0) := fp_in.write_cr_enable; + assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + + to_integer(unsigned(y))) <= 1 severity failure; end if; end process; @@ -53,7 +60,7 @@ begin c_out <= WritebackToCrFileInit; complete_out <= '0'; - if e_in.valid = '1' or l_in.valid = '1' then + if e_in.valid = '1' or l_in.valid = '1' or fp_in.valid = '1' then complete_out <= '1'; end if; @@ -79,6 +86,18 @@ begin c_out.write_xerc_data <= e_in.xerc; end if; + if fp_in.write_enable = '1' then + w_out.write_reg <= fp_in.write_reg; + w_out.write_data <= fp_in.write_data; + w_out.write_enable <= '1'; + end if; + + if fp_in.write_cr_enable = '1' then + c_out.write_cr_enable <= '1'; + c_out.write_cr_mask <= fp_in.write_cr_mask; + c_out.write_cr_data <= fp_in.write_cr_data; + end if; + if l_in.write_enable = '1' then w_out.write_reg <= l_in.write_reg; w_out.write_data <= l_in.write_data; From bf1d9e9531aea859d6ba1218a42f3c125845b320 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 15 Jul 2020 12:46:18 +1000 Subject: [PATCH 06/30] tests/fpu: Add tests for basic FPSCR function and interrupt generation This tests mffs, mtfsf and the generation of floating-point type program interrupts that occur as a result of mtfsf. 
Signed-off-by: Paul Mackerras --- tests/fpu/fpu.c | 198 +++++++++++++++++++++++++++++++++---- tests/fpu/head.S | 12 +++ tests/test_fpu.bin | Bin 8384 -> 12504 bytes tests/test_fpu.console_out | 2 + 4 files changed, 192 insertions(+), 20 deletions(-) diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 86636b6..54811ed 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -4,11 +4,15 @@ #include "console.h" +#define asm __asm__ volatile + #define MSR_FP 0x2000 #define MSR_FE0 0x800 #define MSR_FE1 0x100 extern int trapit(long arg, int (*func)(long)); +extern void do_rfid(unsigned long msr); +extern void do_blr(void); #define SRR0 26 #define SRR1 27 @@ -17,31 +21,41 @@ static inline unsigned long mfspr(int sprnum) { long val; - __asm__ volatile("mfspr %0,%1" : "=r" (val) : "i" (sprnum)); + asm("mfspr %0,%1" : "=r" (val) : "i" (sprnum)); return val; } static inline void mtspr(int sprnum, unsigned long val) { - __asm__ volatile("mtspr %0,%1" : : "i" (sprnum), "r" (val)); + asm("mtspr %0,%1" : : "i" (sprnum), "r" (val)); } void disable_fp(void) { unsigned long msr; - __asm__("mfmsr %0" : "=r" (msr)); + asm("mfmsr %0" : "=r" (msr)); msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1); - __asm__("mtmsrd %0" : : "r" (msr)); + asm("mtmsrd %0" : : "r" (msr)); } void enable_fp(void) { unsigned long msr; - __asm__("mfmsr %0" : "=r" (msr)); + asm("mfmsr %0" : "=r" (msr)); msr |= MSR_FP; - __asm__("mtmsrd %0" : : "r" (msr)); + msr &= ~(MSR_FE0 | MSR_FE1); + asm("mtmsrd %0" : : "r" (msr)); +} + +void enable_fp_interrupts(void) +{ + unsigned long msr; + + asm("mfmsr %0" : "=r" (msr)); + msr |= MSR_FE0 | MSR_FE1; + asm("mtmsrd %0" : : "r" (msr)); } void print_string(const char *str) @@ -81,26 +95,26 @@ int do_fp_op(long arg) { switch (arg) { case 0: - __asm__("lfd 31,0(%0)" : : "b" (&foo)); + asm("lfd 31,0(%0)" : : "b" (&foo)); break; case 1: - __asm__("stfd 31,0(%0)" : : "b" (&foow) : "memory"); + asm("stfd 31,0(%0)" : : "b" (&foow) : "memory"); break; case 2: - __asm__("lfd 30,0(%0); stfd 
30,0(%1)" - : : "b" (&foo), "b" (&foow) : "memory"); + asm("lfd 30,0(%0); stfd 30,0(%1)" + : : "b" (&foo), "b" (&foow) : "memory"); break; case 3: - __asm__("lfiwax 29,0,%0; stfd 29,0(%1)" - : : "r" (&fooi), "b" (&foow) : "memory"); + asm("lfiwax 29,0,%0; stfd 29,0(%1)" + : : "r" (&fooi), "b" (&foow) : "memory"); break; case 4: - __asm__("lfiwzx 28,0,%0; stfd 28,0(%1)" - : : "r" (&fooi), "b" (&foow) : "memory"); + asm("lfiwzx 28,0,%0; stfd 28,0(%1)" + : : "r" (&fooi), "b" (&foow) : "memory"); break; case 5: - __asm__("lfdx 27,0,%0; stfiwx 27,0,%1" - : : "r" (&foow), "r" (&fooiw) : "memory"); + asm("lfdx 27,0,%0; stfiwx 27,0,%1" + : : "r" (&foow), "r" (&fooiw) : "memory"); break; } return 0; @@ -184,8 +198,8 @@ int sp_to_dp(long arg) { unsigned long dp; - __asm__("lfs 20,0(%0); stfd 20,0(%1)" - : : "b" (&sp_dp_equiv[arg].sp), "b" (&dp) : "memory"); + asm("lfs 20,0(%0); stfd 20,0(%1)" + : : "b" (&sp_dp_equiv[arg].sp), "b" (&dp) : "memory"); if (dp != sp_dp_equiv[arg].dp) { print_hex(sp_dp_equiv[arg].sp, 8); print_string(" "); @@ -201,8 +215,8 @@ int dp_to_sp(long arg) { unsigned int sp; - __asm__("lfd 21,0(%0); stfs 21,0(%1)" - : : "b" (&sp_dp_equiv[arg].dp), "b" (&sp) : "memory"); + asm("lfd 21,0(%0); stfs 21,0(%1)" + : : "b" (&sp_dp_equiv[arg].dp), "b" (&sp) : "memory"); return sp != sp_dp_equiv[arg].sp; } @@ -229,6 +243,148 @@ int fpu_test_3(void) return 0; } +unsigned long get_fpscr(void) +{ + unsigned long ret; + + asm("mffs 10; stfd 10,0(%0)" : : "b" (&ret) : "memory"); + return ret; +} + +void set_fpscr(unsigned long fpscr) +{ + asm("lfd%U0%X0 7,%0; mtfsf 0,7,1,0" : : "m" (fpscr)); +} + +unsigned long fpscr_eval(unsigned long val) +{ + val &= ~0x60000000; /* clear FEX and VX */ + if (val & 0x1f80700) /* test all VX* bits */ + val |= 0x20000000; + if ((val >> 25) & (val >> 3) & 0x1f) + val |= 0x40000000; + return val; +} + +unsigned int test4vals[] = { + 0xdeadbeef, 0x1324679a, 0, 0xffffffff, 0xabcd +}; + +int test4(long arg) +{ + unsigned long fsi, fpscr; + 
long i; + unsigned long cr; + + /* check we can do basic mtfsf and mffs */ + i = 1; + for (fsi = 1; fsi < 0x100; fsi <<= 1) { + asm("lfd 7,0(%0); mtfsf 0,7,1,0" : : "b" (&fsi)); + if (get_fpscr() != fsi) + return i; + ++i; + fpscr = fsi; + } + for (i = 0; i < sizeof(test4vals) / sizeof(test4vals[0]); ++i) { + fsi = test4vals[i]; + asm("lfd 7,0(%0); mtfsf 0x55,7,0,0" : : "b" (&fsi)); + fpscr = fpscr_eval((fpscr & 0xf0f0f0f0) | (fsi & 0x0f0f0f0f)); + if (get_fpscr() != fpscr) + return 16 * i + 16; + asm("mtfsf 0xaa,7,0,0"); + fpscr = fpscr_eval((fpscr & 0x0f0f0f0f) | (fsi & 0xf0f0f0f0)); + if (get_fpscr() != fpscr) + return 16 * i + 17; + asm("mffs. 6; mfcr %0" : "=r" (cr) : : "cr1"); + if (((cr >> 24) & 0xf) != ((fpscr >> 28) & 0x1f)) + return 16 * i + 18; + } + return 0; +} + +int fpu_test_4(void) +{ + enable_fp(); + return trapit(0, test4); +} + +int test5a(long arg) +{ + set_fpscr(0); + enable_fp_interrupts(); + set_fpscr(0x80); /* set VE */ + set_fpscr(0x480); /* set VXSOFT */ + set_fpscr(0); + return 1; /* not supposed to get here */ +} + +int test5b(long arg) +{ + unsigned long msr; + + enable_fp(); + set_fpscr(0x80); /* set VE */ + set_fpscr(0x480); /* set VXSOFT */ + asm("mfmsr %0" : "=r" (msr)); + msr |= MSR_FE0 | MSR_FE1; + asm("mtmsrd %0; xori 4,4,0" : : "r" (msr)); + set_fpscr(0); + return 1; /* not supposed to get here */ +} + +int test5c(long arg) +{ + unsigned long msr; + + enable_fp(); + set_fpscr(0x80); /* set VE */ + set_fpscr(0x480); /* set VXSOFT */ + asm("mfmsr %0" : "=r" (msr)); + msr |= MSR_FE0 | MSR_FE1; + do_rfid(msr); + set_fpscr(0); + return 1; /* not supposed to get here */ +} + +int fpu_test_5(void) +{ + int ret; + unsigned int *ip; + + enable_fp(); + ret = trapit(0, test5a); + if (ret != 0x700) + return 1; + ip = (unsigned int *)mfspr(SRR0); + /* check it's a mtfsf 0,7,1,0 instruction */ + if (*ip != (63u << 26) + (1 << 25) + (7 << 11) + (711 << 1)) + return 2; + if ((mfspr(SRR1) & 0x783f0000) != (1 << (63 - 43))) + return 3; + + ret = 
trapit(0, test5b); + if (ret != 0x700) + return 4; + ip = (unsigned int *)mfspr(SRR0); + /* check it's an xori 4,4,0 instruction */ + if (*ip != 0x68840000) + return 5; + if ((mfspr(SRR1) & 0x783f0000) != (1 << (63 - 43)) + (1 << (63 - 47))) + return 6; + + ret = trapit(0, test5c); + if (ret != 0x700) + return 7; + ip = (unsigned int *)mfspr(SRR0); + /* check it's the destination of the rfid */ + if (ip != (void *)&do_blr) + return 8; + if ((mfspr(SRR1) & 0x783f0000) != (1 << (63 - 43)) + (1 << (63 - 47))) + return 9; + + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -258,6 +414,8 @@ int main(void) do_test(1, fpu_test_1); do_test(2, fpu_test_2); do_test(3, fpu_test_3); + do_test(4, fpu_test_4); + do_test(5, fpu_test_5); return fail; } diff --git a/tests/fpu/head.S b/tests/fpu/head.S index 498606b..938fca0 100644 --- a/tests/fpu/head.S +++ b/tests/fpu/head.S @@ -87,6 +87,18 @@ ret: mtlr %r0 blr + .global do_rfid +do_rfid: + mtsrr1 %r3 + LOAD_IMM64(%r4, do_blr) + mtsrr0 %r4 + rfid + blr + + .global do_blr +do_blr: + blr + #define EXCEPTION(nr) \ .= nr ;\ mfsprg0 %r0 ;\ diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index fb2de320a0017dc482942a6d0b32d6b5b480a309..6bac86156f19b99c44efc42659d3d5f6219d6486 100755 GIT binary patch delta 3504 zcmaJ^e@v9;9sj;}z#Rx22M81nJ-{Gm(a_eI7f#^LLW_uoI*6Uyk~{8P{h3H-x7TvV zd&kbCOALj+c5_xMT?o^yOK{r6EwaGu8cb?pTj#K?3d$b}t6po<9vtxO^W43she`J) zFYohwzMt>s`F@}8^Wzw{k2y<;Y}G`K&X07B2l3rO&onxT9KiMf+XHOR4&O7qrOyfN zz1yGFOctK$&J@Z9v>hHI-+H2e<0D-_!q@$w5O54=k9k%QHJQn`g}-Ot6GY*!+Qc;D z;07^~O3LcUZ)a>nb-6>y$@okdRHiZ-^#f$~vTL?VWxdgq%D%Hzip>@;nJrGmX3SRh z8xt+?9?~<|Vm^Fih&?l@Ybb_%>Qg~8E(Bk}kJ%QgK5WgxRuxU3knvXt@63gI_sJtwp(^;r!Ss%m>WHH<`5|4<91 zTB2d==qp3kNyl;PiFxEO-7iZIFUmKRp{$i@4zQ&=fe3P1`8uoaX;p@&Jpq|yxbT2@ z+6Q(8#QOh612t2TsEE0Ra@>@~rfO*Ju}|O61LDkd)JNd+q|=sP#XJpYs=P!S{e15F z78>?7k-q#w1pkS*R%(1LgQ!}|`L@pV�e&ILp-OHG!`oov>m3odN5K{z?21pzWt&%-@ z>ar)JQ-K+gHkEkCQmFx}f-ET%BqJASjyBw`*$~=wuC>E>hJ$!dX 
za^cya5g%R5rZgG#eEY=W;M<32JhFB8QbdMW#L+B=us?n%^LU<3&GNvKAD+$VHofy` ziz-do8EGOMF6$yrj!ET{oNR3V&g^H4$5W_b-P|R)MrUbFrsQ%H`EA?CQh82S;+vB`&xKGt{_=VA-4RS&gI#h6pgPrD0?c%C337%#i~%+)oPm;Balq;q?n4@T;nQ!p)FBYFGP0C$zmJ*U77W^OM3EyH-0m_MsMcT@uhCIo4PQUQ zzGADHws#l|c`n&t_`c0dPOFX$SIq@KXk%etmCPJf?%aDFYesi@%E%dt2Jxpk@ybnE z9Cmr6f{U`i4`kG42z9U_sB8o!pcd=Qt`d!$x?7@V%Og%EIQ?8W!H+t6* zd*-BFivn2+5femscJJ#-4)TmXz>&^pf|rL6lmku(dOXNbgT;&QOm}K74x@t4A8E>Q z+2z7v6b^2kOA^TI*a1Ph`&cj!_EWHTCx{TSMnuF@mv94Gobj{t6nH-He3UxpBOfaz zTjvJH#STqisKfn7uI+NvLtC?QbWFZMlBF%f_$femJWKfZ$7Cd_wWliJwN%qu3v+4=YtU#qux? zDI0bszehls9?R769eszly`19LgK{Ys-3=efB7G&h|0?>?!u%>>wMNJB5e_5oX=V*I-kZ_j<@)q+w@-2>%V!J5S8AY-SLmrOabZEH$8*>Byv z@nw>Ii3!9Y-cgt%*GYZ4*=X;4F`YIf*@1I=27avsVPAnSy^vl3`$?e_j9W=|Zm$6T z?bnR$N1N&+_U3eY{*iw(>~-J7?9Gb_Jk$Lk907rg@yCoe0PHw0o(a_@+v%k6e7X%o zZ;1}fvpfaP_&3UIL2=R*H0hyIY${EfL$m3jvc#}AcG(=wTl(}J$~+r_iD z6IeO0*w~!s0k#%c9oqb9!rKGP4eWg_Iy@fxTj@&Lk<_=B{5Iux$w&3S*BwiGC-Hc~ z3GKU@lLF$jr}qh)Vl(|k?&sIX{&Vf$S3mm{lTK!6j6-nvdu711EIR?34*cHl0gMUc z1=yyHnO3a20PqbC;^BeuB~dZMSbzg~%JACo_RT1zg{xLyg^5G_;dCF~yTHo%s+hR%CG^!*lu1yhZdsV!!GV delta 1150 zcmYk6e`s4(6vxkfSzco`AwM9Av(2x@IBOhb>mP4Tx4vvqN}J&*L&ii%EdxjWBb)mp z#@I9?XeTHu$3Ln}46>EsAEVtSh%iX;2lG#n4*%QQRnX00MV+bHynb)`g1vCxJ@<3I z@0@$?dC8CC^>l*B+)Fev{%yy|8(5D~bYz?;2Q~>d2{w6bIy#%!Eyib$?$U1g7vJ}a zE9pc|T(;Srp4xuHPCYv^R7p#@e{V9+e)kWe#&_H!4tc}eGf$DXc+k_uIZt;6;(5(c zYRHwQl&eFOmnk=i#`iMXxFF?dp}AH@YQz4U93>cnB74YTOJ?Y_seA&yUwFymbDr6v z{*G5XVdnz}4YxHPv(uaHw~`%)V-X3YtfzkecQdfNvYL|e^q_`-l&=~@H=7ZvG`jgc zZ&>Sh@Fj0TEc3GWfSBdSeY?fWe9$-CXGMIvgNBz{5kDP5lp&&eYW>u;)LQ;r>cT^m zarb<=@W)r>pZFW!Q_giA2fJNukwxC{WpFvJKti7buh@7X@C3k%fnA!Z^SgnN=CSc- zfu2XKTuXMo5!k2A6xkmf^JnZdjEhl2OmD-3%t5O0RB*d?w#Xj^Lq~APURz<3x>I2E2s2bgPrQP4`$*SSB_BQxuUauMt`CurC zYcGc0X+5`dSNJ%rx5GO{jz13{5w$C>$TI?HAbOG)qW%0>EXdhdcI?)lQm$FYabrzl zF8n3sp)JgW{VP~x3p)(E0oG-)IM!E{eV^~d657@K?CsfUum3IOMXtpC{8EpzQ)$*Q zjRyC|V|q{!9cl1bJfUY`o^S9}yjMR3Grqx>;-O3g+H=?zHl+L-Ha+h$J?W`WM7GmK 
z2lVvK^rNKm*P#jiBV}G`o^wT@dBH^)J&i4wVqLw5;ha%UjCPUN!K#~xc3?FzjV7-? l+57Bk_%$n}hLTUK36?}2H5B^*bMAqZ1^jiK(OB>M@;^jvhnoNZ diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index 623335d..99d32e6 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -1,3 +1,5 @@ test 01:PASS test 02:PASS test 03:PASS +test 04:PASS +test 05:PASS From fc2968f13279524cfc44581ebf4c308bd78611c9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 29 Aug 2020 20:34:55 +1000 Subject: [PATCH 07/30] FPU: Implement remaining FPSCR-related instructions This implements mcrfs, mtfsfi, mtfsb0/1, mffscr, mffscrn, mffscrni and mffsl. Signed-off-by: Paul Mackerras --- decode1.vhdl | 4 +++ fpu.vhdl | 81 ++++++++++++++++++++++++++++++++++++++++++++--- insn_helpers.vhdl | 6 ++++ 3 files changed, 87 insertions(+), 4 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index afd37ef..343c0c3 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -422,6 +422,10 @@ architecture behaviour of decode1 is constant decode_op_63l_array : op_63_subop_array_0_t := ( -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe + 2#000000010# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 2/0=mcrfs + 2#011000001# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 1/6=mtfsb1 + 2#011000010# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 2/6=mtfsb0 + 2#011000100# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/6=mtfsfi 2#011110010# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 18/7=mffs family 2#011110110# => (FPU, 
OP_FPOP_I, NONE, FRB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 22/7=mtfsf others => illegal_inst diff --git a/fpu.vhdl b/fpu.vhdl index b05ec9d..047bf2d 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -26,7 +26,7 @@ end entity fpu; architecture behaviour of fpu is type state_t is (IDLE, - DO_MFFS, DO_MTFSF); + DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF); type reg_type is record state : state_t; @@ -42,6 +42,7 @@ architecture behaviour of fpu is single_prec : std_ulogic; fpscr : std_ulogic_vector(31 downto 0); b : std_ulogic_vector(63 downto 0); + r : std_ulogic_vector(63 downto 0); writing_back : std_ulogic; cr_result : std_ulogic_vector(3 downto 0); cr_mask : std_ulogic_vector(7 downto 0); @@ -77,13 +78,14 @@ begin w_out.write_enable <= r.writing_back; w_out.write_reg <= r.dest_fpr; w_out.write_data <= fp_result; - w_out.write_cr_enable <= r.instr_done and r.rc; + w_out.write_cr_enable <= r.instr_done and (r.rc or r.is_cmp); w_out.write_cr_mask <= r.cr_mask; w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result; fpu_1: process(all) variable v : reg_type; + variable fpscr_mask : std_ulogic_vector(31 downto 0); variable illegal : std_ulogic; variable j, k : integer; variable flm : std_ulogic_vector(7 downto 0); @@ -101,17 +103,30 @@ begin v.single_prec := e_in.single; v.rc := e_in.rc; v.is_cmp := e_in.out_cr; - v.cr_mask := num_to_fxm(1); + if e_in.out_cr = '0' then + v.cr_mask := num_to_fxm(1); + else + v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn)))); + end if; v.b := e_in.frb; end if; v.writing_back := '0'; v.instr_done := '0'; + fpscr_mask := (others => '1'); case r.state is when IDLE => if e_in.valid = '1' then case e_in.insn(5 downto 1) is + when "00000" => + v.state := DO_MCRFS; + when "00110" => + if e_in.insn(8) = '0' then + v.state := DO_MTFSB; + else + v.state := DO_MTFSFI; + end if; when "00111" => if 
e_in.insn(8) = '0' then v.state := DO_MFFS; @@ -123,11 +138,67 @@ begin end case; end if; + when DO_MCRFS => + j := to_integer(unsigned(insn_bfa(r.insn))); + for i in 0 to 7 loop + if i = j then + k := (7 - i) * 4; + v.cr_result := r.fpscr(k + 3 downto k); + fpscr_mask(k + 3 downto k) := "0000"; + end if; + end loop; + v.fpscr := r.fpscr and (fpscr_mask or x"6007F8FF"); + v.instr_done := '1'; + v.state := IDLE; + + when DO_MTFSB => + -- mtfsb{0,1} + j := to_integer(unsigned(insn_bt(r.insn))); + for i in 0 to 31 loop + if i = j then + v.fpscr(31 - i) := r.insn(6); + end if; + end loop; + v.instr_done := '1'; + v.state := IDLE; + + when DO_MTFSFI => + -- mtfsfi + j := to_integer(unsigned(insn_bf(r.insn))); + if r.insn(16) = '0' then + for i in 0 to 7 loop + if i = j then + k := (7 - i) * 4; + v.fpscr(k + 3 downto k) := insn_u(r.insn); + end if; + end loop; + end if; + v.instr_done := '1'; + v.state := IDLE; + when DO_MFFS => v.writing_back := '1'; case r.insn(20 downto 16) is when "00000" => -- mffs + when "00001" => + -- mffsce + v.fpscr(FPSCR_VE downto FPSCR_XE) := "00000"; + when "10100" | "10101" => + -- mffscdrn[i] (but we don't implement DRN) + fpscr_mask := x"000000FF"; + when "10110" => + -- mffscrn + fpscr_mask := x"000000FF"; + v.fpscr(FPSCR_RN+1 downto FPSCR_RN) := + r.b(FPSCR_RN+1 downto FPSCR_RN); + when "10111" => + -- mffscrni + fpscr_mask := x"000000FF"; + v.fpscr(FPSCR_RN+1 downto FPSCR_RN) := r.insn(12 downto 11); + when "11000" => + -- mffsl + fpscr_mask := x"0007F0FF"; when others => illegal := '1'; end case; @@ -155,7 +226,9 @@ begin -- Data path. -- Just enough to read FPSCR for now. 
- fp_result <= x"00000000" & r.fpscr; + v.r := x"00000000" & (r.fpscr and fpscr_mask); + + fp_result <= r.r; v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI))); diff --git a/insn_helpers.vhdl b/insn_helpers.vhdl index be3892a..519aa76 100644 --- a/insn_helpers.vhdl +++ b/insn_helpers.vhdl @@ -41,6 +41,7 @@ package insn_helpers is function insn_fra (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_frb (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_frc (insn_in : std_ulogic_vector) return std_ulogic_vector; + function insn_u (insn_in : std_ulogic_vector) return std_ulogic_vector; end package insn_helpers; package body insn_helpers is @@ -238,4 +239,9 @@ package body insn_helpers is begin return insn_in(10 downto 6); end; + + function insn_u(insn_in : std_ulogic_vector) return std_ulogic_vector is + begin + return insn_in(15 downto 12); + end; end package body insn_helpers; From cb27353f37331e5eb34c19d283334ce36bcea27b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 31 Aug 2020 13:10:51 +1000 Subject: [PATCH 08/30] tests/fpu: Test remaining FPSCR-related instructions This adds tests for mffsce, mffscrn, mffscrni, mffsl, mcrfs, mtfsfi, mtfsb0 and mtfsb1. Signed-off-by: Paul Mackerras --- tests/fpu/fpu.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 54811ed..f9c4245 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -272,9 +272,9 @@ unsigned int test4vals[] = { int test4(long arg) { - unsigned long fsi, fpscr; + unsigned long fsi, fso, fpscr; long i; - unsigned long cr; + unsigned long cr, mask; /* check we can do basic mtfsf and mffs */ i = 1; @@ -298,6 +298,59 @@ int test4(long arg) asm("mffs. 
6; mfcr %0" : "=r" (cr) : : "cr1"); if (((cr >> 24) & 0xf) != ((fpscr >> 28) & 0x1f)) return 16 * i + 18; + asm("mffsce 12; stfd 12,0(%0)" : : "b" (&fso) : "memory"); + if (fso != fpscr) + return 16 * i + 19; + fpscr = fpscr_eval(fpscr & ~0xf8); + if (get_fpscr() != fpscr) + return 16 * i + 20; + asm("lfd 7,0(%0); mtfsf 0xff,7,0,0" : : "b" (&fsi)); + fpscr = fpscr_eval(fsi); + fsi = ~fsi; + asm("lfd 14,0(%0); mffscrn 15,14; stfd 15,0(%1)" + : : "b" (&fsi), "b" (&fso) : "memory"); + if (fso != (fpscr & 0xff)) + return 16 * i + 21; + fpscr = (fpscr & ~3) | (fsi & 3); + if (get_fpscr() != fpscr) + return 16 * i + 22; + fso = ~fso; + asm("mffscrni 16,1; stfd 16,0(%0)" : : "b" (&fso) : "memory"); + if (fso != (fpscr & 0xff)) + return 16 * i + 23; + fpscr = (fpscr & ~3) | 1; + if (get_fpscr() != fpscr) + return 16 * i + 24; + asm("mffsl 17; stfd 17,0(%0)" : : "b" (&fso) : "memory"); + mask = ((1 << (63-45+1)) - (1 << (63-51))) | ((1 << (63-56+1)) - (1 << (63-63))); + if (fso != (fpscr & mask)) + return 16 * i + 25; + asm("mcrfs 0,3; mcrfs 7,0; mfcr %0" : "=r" (cr) : : "cr0", "cr7"); + fso = fpscr_eval(fpscr & ~0x80000); + if (((cr >> 28) & 0xf) != ((fpscr >> 16) & 0xf) || + ((cr >> 0) & 0xf) != ((fso >> 28) & 0xf)) + return 16 * i + 26; + fpscr = fso & 0x6fffffff; + asm("mtfsfi 0,7,0"); + fpscr = fpscr_eval((fpscr & 0x0fffffff) | 0x70000000); + if (get_fpscr() != fpscr) + return 16 * i + 27; + asm("mtfsb0 21"); + fpscr = fpscr_eval(fpscr & ~(1 << (31-21))); + if (get_fpscr() != fpscr) + return 16 * i + 28; + asm("mtfsb1 21"); + fpscr = fpscr_eval(fpscr | (1 << (31-21))); + if (get_fpscr() != fpscr) + return 16 * i + 29; + asm("mtfsb0 24"); + fpscr = fpscr_eval(fpscr & ~(1 << (31-24))); + if (get_fpscr() != fpscr) + return 16 * i + 30; + asm("mtfsb1. 
24; mfcr %0" : "=r" (cr)); + fpscr = fpscr_eval(fpscr | (1 << (31-24))); + if (get_fpscr() != fpscr || ((cr >> 24) & 0xf) != ((fpscr >> 28) & 0xf)) + return 16 * i + 31; } return 0; } From b628af6176bd0bfa0289fa823ec205f48988ec53 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 15 Jul 2020 14:28:06 +1000 Subject: [PATCH 09/30] FPU: Implement fmr and related instructions This implements fmr, fneg, fabs, fnabs and fcpsgn and adds tests for them. This adds logic to unpack and repack floating-point data from the 64-bit packed form (as stored in memory and the register file) into the unpacked form in the fpr_reg_type record. This is not strictly necessary for fmr et al., but will be useful for when we do actual arithmetic. Signed-off-by: Paul Mackerras --- decode1.vhdl | 5 ++ decode2.vhdl | 3 + decode_types.vhdl | 2 +- fpu.vhdl | 144 ++++++++++++++++++++++++++++++++++--- tests/fpu/fpu.c | 34 +++++++++ tests/test_fpu.bin | Bin 12504 -> 12504 bytes tests/test_fpu.console_out | 1 + 7 files changed, 180 insertions(+), 9 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 343c0c3..5f5fb80 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -428,6 +428,11 @@ architecture behaviour of decode1 is 2#011000100# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/6=mtfsfi 2#011110010# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 18/7=mffs family 2#011110110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 22/7=mtfsf + 2#100000000# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 0/8=fcpsgn + 2#100000001# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 1/8=fneg + 2#100000010# => (FPU, 
OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 2/8=fmr + 2#100000100# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/8=fnabs + 2#100001000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 8/8=fabs others => illegal_inst ); diff --git a/decode2.vhdl b/decode2.vhdl index 8b2ab8c..ec8232f 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -80,6 +80,8 @@ architecture behaviour of decode2 is return (is_fast_spr(ispr), ispr, reg_data); elsif t = CIA then return ('0', (others => '0'), instr_addr); + elsif HAS_FPU and t = FRA then + return ('1', fpr_to_gspr(insn_fra(insn_in)), reg_data); else return ('0', (others => '0'), (others => '0')); end if; @@ -300,6 +302,7 @@ begin end process; r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR + else fpr_to_gspr(insn_fra(d_in.insn)) when d_in.decode.input_reg_a = FRA and HAS_FPU else gpr_to_gspr(insn_ra(d_in.insn)); r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU diff --git a/decode_types.vhdl b/decode_types.vhdl index 5eaef50..08fdc4a 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -23,7 +23,7 @@ package decode_types is OP_BCD, OP_ADDG6S, OP_FETCH_FAILED ); - type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA); + type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB); type input_reg_c_t is (NONE, RS, RCR, FRS); diff --git a/fpu.vhdl b/fpu.vhdl index 047bf2d..3711b35 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -24,9 +24,20 @@ entity fpu is end entity fpu; architecture behaviour of fpu is + type fp_number_class is (ZERO, 
FINITE, INFINITY, NAN); + + constant EXP_BITS : natural := 13; + + type fpu_reg_type is record + class : fp_number_class; + negative : std_ulogic; + exponent : signed(EXP_BITS-1 downto 0); -- unbiased + mantissa : std_ulogic_vector(63 downto 0); -- 10.54 format + end record; type state_t is (IDLE, - DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF); + DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, + DO_FMR); type reg_type is record state : state_t; @@ -41,9 +52,14 @@ architecture behaviour of fpu is is_cmp : std_ulogic; single_prec : std_ulogic; fpscr : std_ulogic_vector(31 downto 0); - b : std_ulogic_vector(63 downto 0); + a : fpu_reg_type; + b : fpu_reg_type; r : std_ulogic_vector(63 downto 0); + result_sign : std_ulogic; + result_class : fp_number_class; + result_exp : signed(EXP_BITS-1 downto 0); writing_back : std_ulogic; + int_result : std_ulogic; cr_result : std_ulogic_vector(3 downto 0); cr_mask : std_ulogic_vector(7 downto 0); end record; @@ -51,6 +67,72 @@ architecture behaviour of fpu is signal r, rin : reg_type; signal fp_result : std_ulogic_vector(63 downto 0); + signal opsel_r : std_ulogic_vector(1 downto 0); + signal result : std_ulogic_vector(63 downto 0); + + -- Split a DP floating-point number into components and work out its class. 
+ -- If is_int = 1, the input is considered an integer + function decode_dp(fpr: std_ulogic_vector(63 downto 0); is_int: std_ulogic) return fpu_reg_type is + variable r : fpu_reg_type; + variable exp_nz : std_ulogic; + variable exp_ao : std_ulogic; + variable frac_nz : std_ulogic; + variable cls : std_ulogic_vector(2 downto 0); + begin + r.negative := fpr(63); + exp_nz := or (fpr(62 downto 52)); + exp_ao := and (fpr(62 downto 52)); + frac_nz := or (fpr(51 downto 0)); + if is_int = '0' then + r.exponent := signed(resize(unsigned(fpr(62 downto 52)), EXP_BITS)) - to_signed(1023, EXP_BITS); + if exp_nz = '0' then + r.exponent := to_signed(-1022, EXP_BITS); + end if; + r.mantissa := "000000000" & exp_nz & fpr(51 downto 0) & "00"; + cls := exp_ao & exp_nz & frac_nz; + case cls is + when "000" => r.class := ZERO; + when "001" => r.class := FINITE; -- denormalized + when "010" => r.class := FINITE; + when "011" => r.class := FINITE; + when "110" => r.class := INFINITY; + when others => r.class := NAN; + end case; + else + r.mantissa := fpr; + r.exponent := (others => '0'); + if (fpr(63) or exp_nz or frac_nz) = '1' then + r.class := FINITE; + else + r.class := ZERO; + end if; + end if; + return r; + end; + + -- Construct a DP floating-point result from components + function pack_dp(sign: std_ulogic; class: fp_number_class; exp: signed(EXP_BITS-1 downto 0); + mantissa: std_ulogic_vector) return std_ulogic_vector is + variable result : std_ulogic_vector(63 downto 0); + begin + result := (others => '0'); + result(63) := sign; + case class is + when ZERO => + when FINITE => + if mantissa(54) = '1' then + -- normalized number + result(62 downto 52) := std_ulogic_vector(resize(exp, 11) + 1023); + end if; + result(51 downto 0) := mantissa(53 downto 2); + when INFINITY => + result(62 downto 52) := "11111111111"; + when NAN => + result(62 downto 52) := "11111111111"; + result(51 downto 0) := mantissa(53 downto 2); + end case; + return result; + end; begin fpu_0: process(clk) @@ 
-85,14 +167,18 @@ begin fpu_1: process(all) variable v : reg_type; + variable adec : fpu_reg_type; + variable bdec : fpu_reg_type; variable fpscr_mask : std_ulogic_vector(31 downto 0); variable illegal : std_ulogic; variable j, k : integer; variable flm : std_ulogic_vector(7 downto 0); + variable int_input : std_ulogic; begin v := r; illegal := '0'; v.busy := '0'; + int_input := '0'; -- capture incoming instruction if e_in.valid = '1' then @@ -101,6 +187,7 @@ begin v.fe_mode := or (e_in.fe_mode); v.dest_fpr := e_in.frt; v.single_prec := e_in.single; + v.int_result := '0'; v.rc := e_in.rc; v.is_cmp := e_in.out_cr; if e_in.out_cr = '0' then @@ -108,11 +195,19 @@ begin else v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn)))); end if; - v.b := e_in.frb; + int_input := '0'; + if e_in.op = OP_FPOP_I then + int_input := '1'; + end if; + adec := decode_dp(e_in.fra, int_input); + bdec := decode_dp(e_in.frb, int_input); + v.a := adec; + v.b := bdec; end if; v.writing_back := '0'; v.instr_done := '0'; + opsel_r <= "00"; fpscr_mask := (others => '1'); case r.state is @@ -133,6 +228,8 @@ begin else v.state := DO_MTFSF; end if; + when "01000" => + v.state := DO_FMR; when others => illegal := '1'; end case; @@ -177,7 +274,9 @@ begin v.state := IDLE; when DO_MFFS => + v.int_result := '1'; v.writing_back := '1'; + opsel_r <= "10"; case r.insn(20 downto 16) is when "00000" => -- mffs @@ -191,7 +290,7 @@ begin -- mffscrn fpscr_mask := x"000000FF"; v.fpscr(FPSCR_RN+1 downto FPSCR_RN) := - r.b(FPSCR_RN+1 downto FPSCR_RN); + r.b.mantissa(FPSCR_RN+1 downto FPSCR_RN); when "10111" => -- mffscrni fpscr_mask := x"000000FF"; @@ -216,19 +315,48 @@ begin for i in 0 to 7 loop k := i * 4; if flm(i) = '1' then - v.fpscr(k + 3 downto k) := r.b(k + 3 downto k); + v.fpscr(k + 3 downto k) := r.b.mantissa(k + 3 downto k); end if; end loop; v.instr_done := '1'; v.state := IDLE; + when DO_FMR => + v.result_class := r.b.class; + v.result_exp := r.b.exponent; + if r.insn(9) = '1' then + 
v.result_sign := '0'; -- fabs + elsif r.insn(8) = '1' then + v.result_sign := '1'; -- fnabs + elsif r.insn(7) = '1' then + v.result_sign := r.b.negative; -- fmr + elsif r.insn(6) = '1' then + v.result_sign := not r.b.negative; -- fneg + else + v.result_sign := r.a.negative; -- fcpsgn + end if; + v.writing_back := '1'; + v.instr_done := '1'; + v.state := IDLE; + end case; -- Data path. - -- Just enough to read FPSCR for now. - v.r := x"00000000" & (r.fpscr and fpscr_mask); + case opsel_r is + when "00" => + result <= r.b.mantissa; + when "10" => + result <= x"00000000" & (r.fpscr and fpscr_mask); + when others => + result <= (others => '0'); + end case; + v.r := result; - fp_result <= r.r; + if r.int_result = '1' then + fp_result <= r.r; + else + fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r); + end if; v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI))); diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index f9c4245..46668f8 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -438,6 +438,39 @@ int fpu_test_5(void) return 0; } +#define SIGN 0x8000000000000000ul + +int test6(long arg) +{ + long i; + unsigned long results[6]; + unsigned long v; + + for (i = 0; i < sizeof(sp_dp_equiv) / sizeof(sp_dp_equiv[0]); ++i) { + v = sp_dp_equiv[i].dp; + asm("lfd%U0%X0 3,%0; fmr 6,3; fneg 7,3; stfd 6,0(%1); stfd 7,8(%1)" + : : "m" (sp_dp_equiv[i].dp), "b" (results) : "memory"); + asm("fabs 9,6; fnabs 10,6; stfd 9,16(%0); stfd 10,24(%0)" + : : "b" (results) : "memory"); + asm("fcpsgn 4,9,3; stfd 4,32(%0); fcpsgn 5,10,3; stfd 5,40(%0)" + : : "b" (results) : "memory"); + if (results[0] != v || + results[1] != (v ^ SIGN) || + results[2] != (v & ~SIGN) || + results[3] != (v | SIGN) || + results[4] != (v & ~SIGN) || + results[5] != (v | SIGN)) + return i + 1; + } + return 0; +} + +int fpu_test_6(void) +{ + enable_fp(); + return trapit(0, test6); +} + int fail = 0; void do_test(int num, int 
(*test)(void)) @@ -469,6 +502,7 @@ int main(void) do_test(3, fpu_test_3); do_test(4, fpu_test_4); do_test(5, fpu_test_5); + do_test(6, fpu_test_6); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 6bac86156f19b99c44efc42659d3d5f6219d6486..4fb260e1d5a3e4f37deea64098b82e55151aceb2 100755 GIT binary patch delta 1357 zcmY+DZA@Eb6vxlK<#v}WG?!9J8E>U;(1ME74;PuCMVy6BWMm|o>B7s^=@Lk`j2Q|PyUh>06d%t6-3L!{^8Ek5 z^Yq+%p652vo#-CQg5B7Qh=$bHiqe@(?XDqIgRU@xR(c5fJ%>g~K zh!)|%?|Ol1Pi>Cj5w;Ip!&3BHsrs@?l1enHvsKbT7o=d z{(93!%fK=VBq8Y2=82_wvtX4rLwaLZ`?Ra1d1D3S`eO{5_yFk2kDtz4XCXeFz^9ZA zwaqZ(k8Cf+1lsT@wO13fwQ8yjG8W=>-r!EQK2Q?YZ1m;^@ODq{^G55SQjLV1(Rl@e zPkVinLX){e%vG?*>=WH7ykIUDGI-5gB`&VvpXMgyn0$! zJ6G|9<*<}^Pgiqs(h@Q*ysN9TE4XE8H69@tUcuJFRCO(3Jq7r@xeO__z{rExf58+ggu9nfob1#x_?$y=2tT&L4 zpVZZzfA{J+9w*iQm@!`GVW*rS_9jZM3Tc7VCG@(S#vBpg`^VVf@(P;-=ehp9Nsixj*uzlIDkodb$$NIOQ}}dJ3dpwit^{ M1w2P(G}it0AGf;B!2kdN delta 1040 zcmY+COH30{6o&6inE_F-&;l*57U_dWi4|Q8j|doQ8VZO6Vq9P-pn?IV35%*gflLf0 zYHAKFG@@V-#T5i%T!?{~7#3pC7{i7HViyJ|g0O%fT+hX_@K4Unf4+0)KF*yv=bY1f zkM|{vJ{VGhA+q_}?kaT26v|a;eAFtms}i|E=u?^b z89#hfo#F5LAyl2oSNh?!x-`udPV7@OX^Yb5d6ahL5i*%GeRU$oSN$ZXZx{ElJm_(6 zeDKV#z#FwGyc&nQf5`i2cmhL>R}sz_8L zytoY6(M3UXM~EGix_IFWuGk0b(KomZd?rA9jL26m!=0GdTnn7g&B_-ju*o;kk)bV|1&toMXDU?v%cyMiN z39M^X+ANXSS@*NCn;8Mmx+g@&^@6R{_`#SW0u$OX(hFrOdm)j5LB*ZV2;;aQlJ7#0j^mC`Z>P>khc zv>?;tn~AXsj1FW522ku>kKqNIAzAdIEBpo54Qb*e+OjoxYDg2-&`!gQAx=hSYZmyBS2W30W_#+my--pi(gMG5Vp(oSyLw lD~vF)(H$5+ky+TOP>l63vqPzSY|3m48JjhCb0Av3{sp(8UgZD) diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index 99d32e6..a49bb9b 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -3,3 +3,4 @@ test 02:PASS test 03:PASS test 04:PASS test 05:PASS +test 06:PASS From 9e8fb293edd59f355cc1fd020f96dafee0af867c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 16 Jul 2020 15:51:57 +1000 Subject: [PATCH 10/30] FPU: Implement floating convert from integer instructions This implements fcfid, fcfidu, 
fcfids and fcfidus, which convert 64-bit integer values in an FPR into a floating-point value. This brings in a lot of the datapath that will be needed in future, including the shifter, adder, mask generator and count-leading-zeroes logic, along with the machinery for rounding to single-precision or double-precision, detecting inexact results, signalling inexact-result exceptions, and updating result flags in the FPSCR. Signed-off-by: Paul Mackerras --- decode1.vhdl | 19 ++ fpu.vhdl | 506 ++++++++++++++++++++++++++++++++++++- tests/fpu/fpu.c | 87 ++++++- tests/test_fpu.bin | Bin 12504 -> 13504 bytes tests/test_fpu.console_out | 1 + 5 files changed, 587 insertions(+), 26 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 5f5fb80..83444cf 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -55,6 +55,7 @@ architecture behaviour of decode1 is type op_19_subop_array_t is array(0 to 7) of decode_rom_t; type op_30_subop_array_t is array(0 to 15) of decode_rom_t; type op_31_subop_array_t is array(0 to 1023) of decode_rom_t; + type op_59_subop_array_t is array(0 to 31) of decode_rom_t; type minor_rom_array_2_t is array(0 to 3) of decode_rom_t; type op_63_subop_array_0_t is array(0 to 511) of decode_rom_t; @@ -410,6 +411,13 @@ architecture behaviour of decode1 is others => decode_rom_init ); + constant decode_op_59_array : op_59_subop_array_t := ( + -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl + -- op in out A out in out len ext pipe + 2#01110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fcfid[u]s + others => illegal_inst + ); + constant decode_op_62_array : minor_rom_array_2_t := ( -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe @@ -433,6 +441,8 @@ architecture behaviour of decode1 is 2#100000010# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', 
ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 2/8=fmr 2#100000100# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/8=fnabs 2#100001000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 8/8=fabs + 2#111011010# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 26/14=fcfid + 2#111011110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 30/14=fcfidu others => illegal_inst ); @@ -586,6 +596,15 @@ begin when 58 => v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0)))); + when 59 => + if HAS_FPU then + -- floating point operations, mostly single-precision + v.decode := decode_op_59_array(to_integer(unsigned(f_in.insn(5 downto 1)))); + if f_in.insn(5) = '0' and not std_match(f_in.insn(10 downto 1), "11-1001110") then + vi.override := '1'; + end if; + end if; + when 62 => v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0)))); diff --git a/fpu.vhdl b/fpu.vhdl index 3711b35..fecb7bb 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -37,7 +37,12 @@ architecture behaviour of fpu is type state_t is (IDLE, DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, - DO_FMR); + DO_FMR, + DO_FCFID, + FINISH, NORMALIZE, + ROUND_UFLOW, ROUND_OFLOW, + ROUNDING, ROUNDING_2, ROUNDING_3, + DENORM); type reg_type is record state : state_t; @@ -54,21 +59,121 @@ architecture behaviour of fpu is fpscr : std_ulogic_vector(31 downto 0); a : fpu_reg_type; b : fpu_reg_type; - r : std_ulogic_vector(63 downto 0); + r : std_ulogic_vector(63 downto 0); -- 10.54 format + x : std_ulogic; result_sign : std_ulogic; result_class : fp_number_class; result_exp : signed(EXP_BITS-1 downto 0); + shift : signed(EXP_BITS-1 downto 0); writing_back : std_ulogic; 
int_result : std_ulogic; cr_result : std_ulogic_vector(3 downto 0); cr_mask : std_ulogic_vector(7 downto 0); + old_exc : std_ulogic_vector(4 downto 0); + update_fprf : std_ulogic; + tiny : std_ulogic; + denorm : std_ulogic; + round_mode : std_ulogic_vector(2 downto 0); end record; signal r, rin : reg_type; signal fp_result : std_ulogic_vector(63 downto 0); + signal opsel_a : std_ulogic_vector(1 downto 0); + signal opsel_b : std_ulogic_vector(1 downto 0); signal opsel_r : std_ulogic_vector(1 downto 0); + signal opsel_ainv : std_ulogic; + signal opsel_amask : std_ulogic; + signal in_a : std_ulogic_vector(63 downto 0); + signal in_b : std_ulogic_vector(63 downto 0); signal result : std_ulogic_vector(63 downto 0); + signal carry_in : std_ulogic; + signal lost_bits : std_ulogic; + signal r_hi_nz : std_ulogic; + signal r_lo_nz : std_ulogic; + signal misc_sel : std_ulogic_vector(3 downto 0); + + -- opsel values + constant AIN_R : std_ulogic_vector(1 downto 0) := "00"; + constant AIN_A : std_ulogic_vector(1 downto 0) := "01"; + constant AIN_B : std_ulogic_vector(1 downto 0) := "10"; + + constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00"; + constant BIN_R : std_ulogic_vector(1 downto 0) := "01"; + constant BIN_MASK : std_ulogic_vector(1 downto 0) := "10"; + + constant RES_SUM : std_ulogic_vector(1 downto 0) := "00"; + constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01"; + constant RES_MISC : std_ulogic_vector(1 downto 0) := "11"; + + -- Left and right shifter with 120 bit input and 64 bit output. + -- Shifts inp left by shift bits and returns the upper 64 bits of + -- the result. The shift parameter is interpreted as a signed + -- number in the range -64..63, with negative values indicating + -- right shifts. 
+ function shifter_64(inp: std_ulogic_vector(119 downto 0); + shift: std_ulogic_vector(6 downto 0)) + return std_ulogic_vector is + variable s1 : std_ulogic_vector(94 downto 0); + variable s2 : std_ulogic_vector(70 downto 0); + variable result : std_ulogic_vector(63 downto 0); + begin + case shift(6 downto 5) is + when "00" => + s1 := inp(119 downto 25); + when "01" => + s1 := inp(87 downto 0) & "0000000"; + when "10" => + s1 := x"0000000000000000" & inp(119 downto 89); + when others => + s1 := x"00000000" & inp(119 downto 57); + end case; + case shift(4 downto 3) is + when "00" => + s2 := s1(94 downto 24); + when "01" => + s2 := s1(86 downto 16); + when "10" => + s2 := s1(78 downto 8); + when others => + s2 := s1(70 downto 0); + end case; + case shift(2 downto 0) is + when "000" => + result := s2(70 downto 7); + when "001" => + result := s2(69 downto 6); + when "010" => + result := s2(68 downto 5); + when "011" => + result := s2(67 downto 4); + when "100" => + result := s2(66 downto 3); + when "101" => + result := s2(65 downto 2); + when "110" => + result := s2(64 downto 1); + when others => + result := s2(63 downto 0); + end case; + return result; + end; + + -- Generate a mask with 0-bits on the left and 1-bits on the right which + -- selects the bits that will be lost in doing a right shift. The shift + -- parameter is the bottom 6 bits of a negative shift count, + -- indicating a right shift. + function right_mask(shift: unsigned(5 downto 0)) return std_ulogic_vector is + variable result: std_ulogic_vector(63 downto 0); + begin + result := (others => '0'); + for i in 0 to 63 loop + if i >= shift then + result(63 - i) := '1'; + end if; + end loop; + return result; + end; -- Split a DP floating-point number into components and work out its class. 
-- If is_int = 1, the input is considered an integer @@ -112,7 +217,8 @@ architecture behaviour of fpu is -- Construct a DP floating-point result from components function pack_dp(sign: std_ulogic; class: fp_number_class; exp: signed(EXP_BITS-1 downto 0); - mantissa: std_ulogic_vector) return std_ulogic_vector is + mantissa: std_ulogic_vector; single_prec: std_ulogic) + return std_ulogic_vector is variable result : std_ulogic_vector(63 downto 0); begin result := (others => '0'); @@ -124,16 +230,76 @@ architecture behaviour of fpu is -- normalized number result(62 downto 52) := std_ulogic_vector(resize(exp, 11) + 1023); end if; - result(51 downto 0) := mantissa(53 downto 2); + result(51 downto 29) := mantissa(53 downto 31); + if single_prec = '0' then + result(28 downto 0) := mantissa(30 downto 2); + end if; when INFINITY => result(62 downto 52) := "11111111111"; when NAN => result(62 downto 52) := "11111111111"; - result(51 downto 0) := mantissa(53 downto 2); + result(51 downto 29) := mantissa(53 downto 31); + if single_prec = '0' then + result(28 downto 0) := mantissa(30 downto 2); + end if; end case; return result; end; + -- Determine whether to increment when rounding + -- Returns rounding_inc & inexact + -- Assumes x includes the bottom 29 bits of the mantissa already + -- if single_prec = 1 (usually arranged by setting set_x = 1 earlier). 
+ function fp_rounding(mantissa: std_ulogic_vector(63 downto 0); x: std_ulogic; + single_prec: std_ulogic; rn: std_ulogic_vector(2 downto 0); + sign: std_ulogic) + return std_ulogic_vector is + variable grx : std_ulogic_vector(2 downto 0); + variable ret : std_ulogic_vector(1 downto 0); + variable lsb : std_ulogic; + begin + if single_prec = '0' then + grx := mantissa(1 downto 0) & x; + lsb := mantissa(2); + else + grx := mantissa(30 downto 29) & x; + lsb := mantissa(31); + end if; + ret(1) := '0'; + ret(0) := or (grx); + case rn(1 downto 0) is + when "00" => -- round to nearest + if grx = "100" and rn(2) = '0' then + ret(1) := lsb; -- tie, round to even + else + ret(1) := grx(2); + end if; + when "01" => -- round towards zero + when others => -- round towards +/- inf + if rn(0) = sign then + -- round towards greater magnitude + ret(1) := ret(0); + end if; + end case; + return ret; + end; + + -- Determine result flags to write into the FPSCR + function result_flags(sign: std_ulogic; class: fp_number_class; unitbit: std_ulogic) + return std_ulogic_vector is + begin + case class is + when ZERO => + return sign & "0010"; + when FINITE => + return (not unitbit) & sign & (not sign) & "00"; + when INFINITY => + return '0' & sign & (not sign) & "01"; + when NAN => + return "10001"; + end case; + end; + begin fpu_0: process(clk) begin @@ -174,6 +340,25 @@ begin variable j, k : integer; variable flm : std_ulogic_vector(7 downto 0); variable int_input : std_ulogic; + variable mask : std_ulogic_vector(63 downto 0); + variable in_a0 : std_ulogic_vector(63 downto 0); + variable in_b0 : std_ulogic_vector(63 downto 0); + variable misc : std_ulogic_vector(63 downto 0); + variable shift_res : std_ulogic_vector(63 downto 0); + variable round : std_ulogic_vector(1 downto 0); + variable update_fx : std_ulogic; + variable arith_done : std_ulogic; + variable mant_nz : std_ulogic; + variable min_exp : signed(EXP_BITS-1 downto 0); + variable max_exp : signed(EXP_BITS-1 downto 0); + 
variable bias_exp : signed(EXP_BITS-1 downto 0); + variable new_exp : signed(EXP_BITS-1 downto 0); + variable exp_tiny : std_ulogic; + variable exp_huge : std_ulogic; + variable renormalize : std_ulogic; + variable clz : std_ulogic_vector(5 downto 0); + variable set_x : std_ulogic; + variable mshift : signed(EXP_BITS-1 downto 0); begin v := r; illegal := '0'; @@ -199,16 +384,53 @@ begin if e_in.op = OP_FPOP_I then int_input := '1'; end if; + v.tiny := '0'; + v.denorm := '0'; + v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN); adec := decode_dp(e_in.fra, int_input); bdec := decode_dp(e_in.frb, int_input); v.a := adec; v.b := bdec; end if; + r_hi_nz <= or (r.r(55 downto 31)); + r_lo_nz <= or (r.r(30 downto 2)); + + if r.single_prec = '0' then + max_exp := to_signed(1023, EXP_BITS); + min_exp := to_signed(-1022, EXP_BITS); + bias_exp := to_signed(1536, EXP_BITS); + else + max_exp := to_signed(127, EXP_BITS); + min_exp := to_signed(-126, EXP_BITS); + bias_exp := to_signed(192, EXP_BITS); + end if; + new_exp := r.result_exp - r.shift; + exp_tiny := '0'; + exp_huge := '0'; + if new_exp < min_exp then + exp_tiny := '1'; + end if; + if new_exp > max_exp then + exp_huge := '1'; + end if; + v.writing_back := '0'; v.instr_done := '0'; - opsel_r <= "00"; + v.update_fprf := '0'; + v.shift := to_signed(0, EXP_BITS); + opsel_a <= AIN_R; + opsel_ainv <= '0'; + opsel_amask <= '0'; + opsel_b <= BIN_ZERO; + opsel_r <= RES_SUM; + carry_in <= '0'; + misc_sel <= "0000"; fpscr_mask := (others => '1'); + update_fx := '0'; + arith_done := '0'; + renormalize := '0'; + set_x := '0'; case r.state is when IDLE => @@ -230,10 +452,15 @@ begin end if; when "01000" => v.state := DO_FMR; + when "01110" => + -- fcfid[u][s] + v.state := DO_FCFID; when others => illegal := '1'; end case; end if; + v.x := '0'; + v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX); when DO_MCRFS => j := to_integer(unsigned(insn_bfa(r.insn))); @@ -276,7 +503,7 @@ begin when DO_MFFS => v.int_result := '1'; 
v.writing_back := '1'; - opsel_r <= "10"; + opsel_r <= RES_MISC; case r.insn(20 downto 16) is when "00000" => -- mffs @@ -322,6 +549,7 @@ begin v.state := IDLE; when DO_FMR => + opsel_a <= AIN_B; v.result_class := r.b.class; v.result_exp := r.b.exponent; if r.insn(9) = '1' then @@ -339,29 +567,281 @@ begin v.instr_done := '1'; v.state := IDLE; + when DO_FCFID => + v.result_sign := '0'; + opsel_a <= AIN_B; + if r.insn(8) = '0' and r.b.negative = '1' then + -- fcfid[s] with negative operand, set R = -B + opsel_ainv <= '1'; + carry_in <= '1'; + v.result_sign := '1'; + end if; + v.result_class := r.b.class; + v.result_exp := to_signed(54, EXP_BITS); + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + if r.b.class = ZERO then + arith_done := '1'; + else + v.state := FINISH; + end if; + + when FINISH => + if r.r(63 downto 54) /= "0000000001" then + renormalize := '1'; + v.state := NORMALIZE; + else + set_x := '1'; + if exp_tiny = '1' then + v.shift := new_exp - min_exp; + v.state := ROUND_UFLOW; + elsif exp_huge = '1' then + v.state := ROUND_OFLOW; + else + v.shift := to_signed(-2, EXP_BITS); + v.state := ROUNDING; + end if; + end if; + + when NORMALIZE => + -- Shift so we have 9 leading zeroes (we know R is non-zero) + opsel_r <= RES_SHIFT; + set_x := '1'; + if exp_tiny = '1' then + v.shift := new_exp - min_exp; + v.state := ROUND_UFLOW; + elsif exp_huge = '1' then + v.state := ROUND_OFLOW; + else + v.shift := to_signed(-2, EXP_BITS); + v.state := ROUNDING; + end if; + + when ROUND_UFLOW => + v.tiny := '1'; + if r.fpscr(FPSCR_UE) = '0' then + -- disabled underflow exception case + -- have to denormalize before rounding + opsel_r <= RES_SHIFT; + set_x := '1'; + v.shift := to_signed(-2, EXP_BITS); + v.state := ROUNDING; + else + -- enabled underflow exception case + -- if denormalized, have to normalize before rounding + v.fpscr(FPSCR_UX) := '1'; + v.result_exp := r.result_exp + bias_exp; + if r.r(54) = '0' then + renormalize := '1'; + v.state := NORMALIZE; + else + 
v.shift := to_signed(-2, EXP_BITS); + v.state := ROUNDING; + end if; + end if; + + when ROUND_OFLOW => + v.fpscr(FPSCR_OX) := '1'; + if r.fpscr(FPSCR_OE) = '0' then + -- disabled overflow exception + -- result depends on rounding mode + v.fpscr(FPSCR_XX) := '1'; + v.fpscr(FPSCR_FI) := '1'; + if r.round_mode(1 downto 0) = "00" or + (r.round_mode(1) = '1' and r.round_mode(0) = r.result_sign) then + v.result_class := INFINITY; + v.fpscr(FPSCR_FR) := '1'; + else + v.fpscr(FPSCR_FR) := '0'; + end if; + -- construct largest representable number + v.result_exp := max_exp; + opsel_r <= RES_MISC; + misc_sel <= "001" & r.single_prec; + arith_done := '1'; + else + -- enabled overflow exception + v.result_exp := r.result_exp - bias_exp; + v.shift := to_signed(-2, EXP_BITS); + v.state := ROUNDING; + end if; + + when ROUNDING => + opsel_amask <= '1'; + round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign); + v.fpscr(FPSCR_FR downto FPSCR_FI) := round; + if round(1) = '1' then + -- set mask to increment the LSB for the precision + opsel_b <= BIN_MASK; + carry_in <= '1'; + v.shift := to_signed(-1, EXP_BITS); + v.state := ROUNDING_2; + else + if r.r(54) = '0' then + -- result after masking could be zero, or could be a + -- denormalized result that needs to be renormalized + renormalize := '1'; + v.state := ROUNDING_3; + else + arith_done := '1'; + end if; + end if; + if round(0) = '1' then + v.fpscr(FPSCR_XX) := '1'; + if r.tiny = '1' then + v.fpscr(FPSCR_UX) := '1'; + end if; + end if; + + when ROUNDING_2 => + -- Check for overflow during rounding + v.x := '0'; + if r.r(55) = '1' then + opsel_r <= RES_SHIFT; + if exp_huge = '1' then + v.state := ROUND_OFLOW; + else + arith_done := '1'; + end if; + elsif r.r(54) = '0' then + -- Do CLZ so we can renormalize the result + renormalize := '1'; + v.state := ROUNDING_3; + else + arith_done := '1'; + end if; + + when ROUNDING_3 => + mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec); + if mant_nz = '0' then + 
v.result_class := ZERO; + arith_done := '1'; + else + -- Renormalize result after rounding + opsel_r <= RES_SHIFT; + v.denorm := exp_tiny; + v.shift := new_exp - to_signed(-1022, EXP_BITS); + if new_exp < to_signed(-1022, EXP_BITS) then + v.state := DENORM; + else + arith_done := '1'; + end if; + end if; + + when DENORM => + opsel_r <= RES_SHIFT; + arith_done := '1'; + end case; + if arith_done = '1' then + v.writing_back := '1'; + v.update_fprf := '1'; + v.instr_done := '1'; + v.state := IDLE; + update_fx := '1'; + end if; + -- Data path. + -- This has A and B input multiplexers, an adder, a shifter, + -- count-leading-zeroes logic, and a result mux. + if r.single_prec = '1' then + mshift := r.shift + to_signed(-29, EXP_BITS); + else + mshift := r.shift; + end if; + if mshift < to_signed(-64, EXP_BITS) then + mask := (others => '1'); + elsif mshift >= to_signed(0, EXP_BITS) then + mask := (others => '0'); + else + mask := right_mask(unsigned(mshift(5 downto 0))); + end if; + case opsel_a is + when AIN_R => + in_a0 := r.r; + when AIN_A => + in_a0 := r.a.mantissa; + when others => + in_a0 := r.b.mantissa; + end case; + if (or (mask and in_a0)) = '1' and set_x = '1' then + v.x := '1'; + end if; + if opsel_ainv = '1' then + in_a0 := not in_a0; + end if; + if opsel_amask = '1' then + in_a0 := in_a0 and not mask; + end if; + in_a <= in_a0; + case opsel_b is + when BIN_ZERO => + in_b0 := (others => '0'); + when BIN_R => + in_b0 := r.r; + when BIN_MASK => + in_b0 := mask; + when others => + in_b0 := (others => '0'); + end case; + in_b <= in_b0; + if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then + shift_res := shifter_64(r.r & x"00000000000000", + std_ulogic_vector(r.shift(6 downto 0))); + else + shift_res := (others => '0'); + end if; case opsel_r is - when "00" => - result <= r.b.mantissa; - when "10" => - result <= x"00000000" & (r.fpscr and fpscr_mask); + when RES_SUM => + result <= std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + 
carry_in); + when RES_SHIFT => + result <= shift_res; when others => - result <= (others => '0'); + case misc_sel is + when "0000" => + misc := x"00000000" & (r.fpscr and fpscr_mask); + when "0010" => + -- mantissa of max representable DP number + misc := x"007ffffffffffffc"; + when "0011" => + -- mantissa of max representable SP number + misc := x"007fffff80000000"; + when others => + misc := x"0000000000000000"; + end case; + result <= misc; end case; v.r := result; + if opsel_r = RES_SHIFT then + v.result_exp := new_exp; + end if; + + if renormalize = '1' then + clz := count_left_zeroes(r.r); + v.shift := resize(signed('0' & clz) - 9, EXP_BITS); + end if; + if r.int_result = '1' then fp_result <= r.r; else - fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r); + fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r, + r.single_prec); + end if; + if r.update_fprf = '1' then + v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class, + r.r(54) and not r.denorm); end if; v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI))); v.fpscr(FPSCR_FEX) := or (v.fpscr(FPSCR_VX downto FPSCR_XX) and v.fpscr(FPSCR_VE downto FPSCR_XE)); + if update_fx = '1' and + (v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then + v.fpscr(FPSCR_FX) := '1'; + end if; if r.rc = '1' then v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); end if; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 46668f8..80751d1 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -64,7 +64,7 @@ void print_string(const char *str) putchar(*str); } -void print_hex(unsigned long val, int ndigits) +void print_hex(unsigned long val, int ndigits, const char *str) { int i, x; @@ -75,6 +75,7 @@ void print_hex(unsigned long val, int ndigits) else putchar(x + '0'); } + print_string(str); } // i < 100 @@ -201,12 +202,9 @@ int sp_to_dp(long arg) asm("lfs 20,0(%0); stfd 20,0(%1)" : : 
"b" (&sp_dp_equiv[arg].sp), "b" (&dp) : "memory"); if (dp != sp_dp_equiv[arg].dp) { - print_hex(sp_dp_equiv[arg].sp, 8); - print_string(" "); - print_hex(dp, 16); - print_string(" "); - print_hex(sp_dp_equiv[arg].dp, 16); - print_string(" "); + print_hex(sp_dp_equiv[arg].sp, 8, " "); + print_hex(dp, 16, " "); + print_hex(sp_dp_equiv[arg].dp, 16, " "); } return dp != sp_dp_equiv[arg].dp; } @@ -465,12 +463,77 @@ int test6(long arg) return 0; } +struct int_fp_equiv { + long ival; + unsigned long fp; + unsigned long fp_u; + unsigned long fp_s; + unsigned long fp_us; +} intvals[] = { + { 0, 0, 0, 0, 0 }, + { 1, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 }, + { -1, 0xbff0000000000000, 0x43f0000000000000, 0xbff0000000000000, 0x43f0000000000000 }, + { 2, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000 }, + { -2, 0xc000000000000000, 0x43f0000000000000, 0xc000000000000000, 0x43f0000000000000 }, + { 0x12345678, 0x41b2345678000000, 0x41b2345678000000, 0x41b2345680000000, 0x41b2345680000000 }, + { 0x0008000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000 }, + { 0x0010000000000000, 0x4330000000000000, 0x4330000000000000, 0x4330000000000000, 0x4330000000000000 }, + { 0x0020000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000 }, + { 0x0020000000000001, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000, 0x4340000000000000 }, + { 0x0020000000000002, 0x4340000000000001, 0x4340000000000001, 0x4340000000000000, 0x4340000000000000 }, + { 0x0020000000000003, 0x4340000000000002, 0x4340000000000002, 0x4340000000000000, 0x4340000000000000 }, + { 0x0020000010000000, 0x4340000008000000, 0x4340000008000000, 0x4340000000000000, 0x4340000000000000 }, + { 0x0020000020000000, 0x4340000010000000, 0x4340000010000000, 0x4340000000000000, 0x4340000000000000 }, + { 0x0020000030000000, 0x4340000018000000, 0x4340000018000000, 0x4340000020000000, 
0x4340000020000000 }, + { 0x0020000040000000, 0x4340000020000000, 0x4340000020000000, 0x4340000020000000, 0x4340000020000000 }, + { 0x0020000080000000, 0x4340000040000000, 0x4340000040000000, 0x4340000040000000, 0x4340000040000000 }, + { 0x0040000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 }, + { 0x0040000000000001, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 }, + { 0x0040000000000002, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000, 0x4350000000000000 }, + { 0x0040000000000003, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 }, + { 0x0040000000000004, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 }, + { 0x0040000000000005, 0x4350000000000001, 0x4350000000000001, 0x4350000000000000, 0x4350000000000000 }, + { 0x0040000000000006, 0x4350000000000002, 0x4350000000000002, 0x4350000000000000, 0x4350000000000000 }, + { 0x0040000000000007, 0x4350000000000002, 0x4350000000000002, 0x4350000000000000, 0x4350000000000000 }, +}; + +int test7(long arg) +{ + long i; + unsigned long results[4]; + + for (i = 0; i < sizeof(intvals) / sizeof(intvals[0]); ++i) { + asm("lfd%U0%X0 3,%0; fcfid 6,3; fcfidu 7,3; stfd 6,0(%1); stfd 7,8(%1)" + : : "m" (intvals[i].ival), "b" (results) : "memory"); + asm("fcfids 9,3; stfd 9,16(%0); fcfidus 10,3; stfd 10,24(%0)" + : : "b" (results) : "memory"); + if (results[0] != intvals[i].fp || + results[1] != intvals[i].fp_u || + results[2] != intvals[i].fp_s || + results[3] != intvals[i].fp_us) { + print_string("\r\n"); + print_hex(results[0], 16, " "); + print_hex(results[1], 16, " "); + print_hex(results[2], 16, " "); + print_hex(results[3], 16, " "); + return i + 1; + } + } + return 0; +} + int fpu_test_6(void) { enable_fp(); return trapit(0, test6); } +int fpu_test_7(void) +{ + enable_fp(); + return trapit(0, test7); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -484,12 +547,9 @@ 
void do_test(int num, int (*test)(void)) } else { fail = 1; print_string("FAIL "); - print_hex(ret, 5); - print_string(" SRR0="); - print_hex(mfspr(SRR0), 16); - print_string(" SRR1="); - print_hex(mfspr(SRR1), 16); - print_string("\r\n"); + print_hex(ret, 5, " SRR0="); + print_hex(mfspr(SRR0), 16, " SRR1="); + print_hex(mfspr(SRR1), 16, "\r\n"); } } @@ -503,6 +563,7 @@ int main(void) do_test(4, fpu_test_4); do_test(5, fpu_test_5); do_test(6, fpu_test_6); + do_test(7, fpu_test_7); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 4fb260e1d5a3e4f37deea64098b82e55151aceb2..25d50c77a0d990a40320f8baa84752efe3a6d996 100755 GIT binary patch delta 3104 zcma)7du&tJ9sb?x;5dP}H{{X0h;I^;#MH~&Qjmi~xz2;RrVUw}wi^PGlu~FxT_;h= zY)-FB?TJnu3Oa1+9|0OfqpYeX7@4#OZPSOQ(rH^eX;TF_;6>=8lZB@7#Nw zB?Ocm>FRgB?|kR?I_KUy(LerFBayL|DBAza{ODfT?X)V|Pt*r)7r0&Ec5P3rI@q{U zdG6q*d(=VwV-PwYO42@MnHS^x?UW=B%*;T6G;1E-0>a zCrqO6o_j?qG|LMB|~&Y`8R}wrT9s(x=s$ z-7H!bRrQ_h2W8!=cL)1RS+{bCtt#J`H|C}Y(>yvGE$=M<3{D=o&NskG8;T6l)5+lQ zBT>tujh>~v0PNJ2P*WUBtT^WMMRS>vLa8l zserMwyBLa955!}4y%k@h@yEvUIU3{fdnsWUR2Ph0d(T9abea!e%J6qCW%*~9MjWpA z9yD(C^bgJZ@_mmKP=UXOUN9abU+Y+Y!@xA>q3^rwXeXcxfKS_vMgpWH9K_7hBJ;V$ z6d6YLR#uJA$FeVe9xA>1wy9Gcj_~liL@)i(MQbN@3QtmxFWyG`!Yvsy&iS!67bSwR zJ+B1=G@SKWF^1DR^`|xPl@hyXz`_NGYyCXhS zJ3c++xiC%i!Gw#_6GOH7WsRE84tb2r6%;w^qBBSo1qI9%@tvrM^T*o~d$t9a;Vy{` zMRlSIQJsP69t01YZkE-pl{g^rmwLb^DQO?%H+zhi~}9^&4gY;R=a+ zM96QlsmhYdEl&IR*I@5gS*#*WA!r*Q(QLDib30d zdgh{KML9Yy1oesaSFld0`h#1*i&iZ{$*-bSvFm>}4}u#}zhzwz(TY*<0~Yt~xb6QK zTKyXGPly_3Vtg&j*&UubrIp?7DKW+M5$~Fc+&i}=gW~!K>;O(Ykqmx;1`C`A+`K&* z6dwVB7Xd#Dd{$y7@S!J@!7~!)1AhYiGhiLnii*_K<2W=WEnX>|65Zg~j&{2f=wICu|(gtKdEnTxUU;jw+q*Kymo_ zuNS(h{XE<1EmC_gvuC_5YTp%h1ZLt2`;E6&rN6T)FnJl~US6yC*{{9T&ZI)rn_(@> ztDMI%J)U9R%d75tAJcziIIj^QFmyZ<;HJTelw{Zrw}G(TBC0biQk#axm|`Q#YfVwL zS4E?yIDc7e=|HC8!;T~BQ3adAZX<$p*E_@KKmAj9F;O!lz0jYA{`jr*eVg6X1+g-{Ugj0*XB0(BDj~qjF+^O^vNymb1%V*~8F*^gmZrOWJzsDx7%_x8f#(Tue$fC7 
z!5zXAKXtO{1t)U`maz*;nJHI=0waUJ?yW7CL$1FqZO6?Ry_r1>3Fg@wwm*|g!MJTR z(jm>@*rWF@l^RTc9L79n6B6evmRwS*OFy@>q8nnE0V%wl)h!tsEp_v)-NA)*c4zSG zvpgCdmNjqT*kK8DvNsqVIcsM!CjpdU1xDExpRKng1)pR?F) zZOmCi>T>vYHX(J;Tl;Ue_>KK9Q?f^oZ1sFATeqN2c5FeNL)4E1$adUgp9#$moM*XO iShwi^)H$V2{faf|k!L(VaDGm>fR25vl(2@tPyP$jogY;I delta 1800 zcmZvce{54#6vxkd+q&0{br09FK?}TYrK~z;2TI=PVC~qjlx15~z@WyJ!Q_W1XiNr{ zy_VEum`GeMA^tJhM6(1!%*du8@ka(B>hSkJhRE0=PP2*3QHc(jT)($1izJ@p_TKY3 z-+S&o=e^g#otMIHBL8ZlQ25jA(0h#Ew%U4ApIP?OoH zxhTdis`txkl>Mnbim5*|w^x!cs4Q{cQYpq36s(+QRw;OfJEu7x^iAAJ9j?Y$Yr#r+ zCZ8QI_z=1+*5$S#3kA<5btkGhKU7UL=p**F)hC~|uo3Gn`SdOpvW4VJ&$45-26?!Z z{bFm7>e<8d8f9-DJ5aQM^~|f!`VJ{JvdY32q@UTz!V-Dmuk7o>Q*t~PidA{RTnb*v zrl6>PjLo%|7@Uv!+U)I-RVv@Ck~A&$4`g#xySQu)Mg3lK>QgxsTsbpjtjYFlG?Tf^ zK{5Yk@>C9FmmHko+_xv4@pE*?3qGFe5)L6LD&vRzRn%9b_sU9SJ#vY5ByZquH`>(b zRi8D{ZD{1ggXlU>bod8_swOlFOi+_25~M?c@+l+2`H5LH2?8hZ-x@-y2Uq0q@j9h78(^AiWPUh3Z8|^ z+HU`3oWokgiV)j_b+3FNBC0-_<{J3MB!4BzJJY-r{EgJ(V4oG)@QNsgM^4=wZ&{Uu z+wa&K*B3$~d>MXiOI*)O`x)?~+v55^=!1x7!Y|z(*RQAjEckoji>`~f3I5U6xc(J< zJW*ZLB*(VmX-MOuR*j;pu+U}^+>!o>lo;FOG|aPJ|A_N3DKBIHvJCQ9&Y5NYq)xrZXP=Zb+LA}P%7{FN@wz=IE`u|Q zU7Ji|sS!KmDwp$e*n6%IS~dJ*i~$Q%F|doKz?uYr3FgNryUzI(W?_Ooj-gC)u3;7? 
z*bWRUm`ebdm?L7IWEYpXBEAjrKE(H?;$nZ3h1Sl-<05b3|Kqr_o!*6@ z2?CK!n1*o{?1=<(FI{9B#kHHms@69H8TJw^WzBc5RM@0;40Q zKrTX}i(Y7zKRE9b+O^OIx-#F&5F2pGfTigf3)PK|Tp1R36`g8^z5{yEvDw_ImY}2n z%P%$8bU-r z{mIVPqX;x_6h(-1Iy4Uawf6{3s{2o}u~L_(1F%Ctm^e?=I|CLM0493n$9Rracpk9z o+~QmiVL~0n7-0K61zBJIjb8S5dR{j5_jraSX7Vm%xevYnFKc-|0{{R3 diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index a49bb9b..340756c 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -4,3 +4,4 @@ test 03:PASS test 04:PASS test 05:PASS test 06:PASS +test 07:PASS From 34b5d4a7b51bdae4e9994d11225b8216312eb793 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 31 Aug 2020 17:26:33 +1000 Subject: [PATCH 11/30] FPU: Implement the frsp instruction This brings in the invalid exception for the case of frsp with a signalling NaN as input, and the need to be able to convert a signalling NaN to a quiet NaN. Signed-off-by: Paul Mackerras --- decode1.vhdl | 1 + fpu.vhdl | 49 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 83444cf..284fb08 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -441,6 +441,7 @@ architecture behaviour of decode1 is 2#100000010# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 2/8=fmr 2#100000100# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/8=fnabs 2#100001000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 8/8=fabs + 2#110000000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- 0/12=frsp 2#111011010# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 26/14=fcfid 2#111011110# => 
(FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 30/14=fcfidu others => illegal_inst diff --git a/fpu.vhdl b/fpu.vhdl index fecb7bb..7576562 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -39,6 +39,7 @@ architecture behaviour of fpu is DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, DO_FMR, DO_FCFID, + DO_FRSP, FINISH, NORMALIZE, ROUND_UFLOW, ROUND_OFLOW, ROUNDING, ROUNDING_2, ROUNDING_3, @@ -71,6 +72,7 @@ architecture behaviour of fpu is cr_mask : std_ulogic_vector(7 downto 0); old_exc : std_ulogic_vector(4 downto 0); update_fprf : std_ulogic; + quieten_nan : std_ulogic; tiny : std_ulogic; denorm : std_ulogic; round_mode : std_ulogic_vector(2 downto 0); @@ -217,7 +219,7 @@ architecture behaviour of fpu is -- Construct a DP floating-point result from components function pack_dp(sign: std_ulogic; class: fp_number_class; exp: signed(EXP_BITS-1 downto 0); - mantissa: std_ulogic_vector; single_prec: std_ulogic) + mantissa: std_ulogic_vector; single_prec: std_ulogic; quieten_nan: std_ulogic) return std_ulogic_vector is variable result : std_ulogic_vector(63 downto 0); begin @@ -238,7 +240,8 @@ architecture behaviour of fpu is result(62 downto 52) := "11111111111"; when NAN => result(62 downto 52) := "11111111111"; - result(51 downto 29) := mantissa(53 downto 31); + result(51) := quieten_nan or mantissa(53); + result(50 downto 29) := mantissa(52 downto 31); if single_prec = '0' then result(28 downto 0) := mantissa(30 downto 2); end if; @@ -348,6 +351,7 @@ begin variable round : std_ulogic_vector(1 downto 0); variable update_fx : std_ulogic; variable arith_done : std_ulogic; + variable invalid : std_ulogic; variable mant_nz : std_ulogic; variable min_exp : signed(EXP_BITS-1 downto 0); variable max_exp : signed(EXP_BITS-1 downto 0); @@ -384,6 +388,7 @@ begin if e_in.op = OP_FPOP_I then int_input := '1'; end if; + v.quieten_nan := '1'; v.tiny := '0'; v.denorm := '0'; v.round_mode := '0' & 
r.fpscr(FPSCR_RN+1 downto FPSCR_RN); @@ -429,6 +434,7 @@ begin fpscr_mask := (others => '1'); update_fx := '0'; arith_done := '0'; + invalid := '0'; renormalize := '0'; set_x := '0'; @@ -452,6 +458,8 @@ begin end if; when "01000" => v.state := DO_FMR; + when "01100" => + v.state := DO_FRSP; when "01110" => -- fcfid[u][s] v.state := DO_FCFID; @@ -552,6 +560,7 @@ begin opsel_a <= AIN_B; v.result_class := r.b.class; v.result_exp := r.b.exponent; + v.quieten_nan := '0'; if r.insn(9) = '1' then v.result_sign := '0'; -- fabs elsif r.insn(8) = '1' then @@ -567,6 +576,33 @@ begin v.instr_done := '1'; v.state := IDLE; + when DO_FRSP => + opsel_a <= AIN_B; + v.result_class := r.b.class; + v.result_sign := r.b.negative; + v.result_exp := r.b.exponent; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + if r.b.class = NAN and r.b.mantissa(53) = '0' then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + set_x := '1'; + if r.b.class = FINITE then + if r.b.exponent < to_signed(-126, EXP_BITS) then + v.shift := r.b.exponent - to_signed(-126, EXP_BITS); + v.state := ROUND_UFLOW; + elsif r.b.exponent > to_signed(127, EXP_BITS) then + v.state := ROUND_OFLOW; + else + v.shift := to_signed(-2, EXP_BITS); + v.state := ROUNDING; + end if; + else + arith_done := '1'; + end if; + when DO_FCFID => v.result_sign := '0'; opsel_a <= AIN_B; @@ -735,8 +771,11 @@ begin end case; if arith_done = '1' then - v.writing_back := '1'; - v.update_fprf := '1'; + -- Enabled invalid exception doesn't write result or FPRF + if (invalid and r.fpscr(FPSCR_VE)) = '0' then + v.writing_back := '1'; + v.update_fprf := '1'; + end if; v.instr_done := '1'; v.state := IDLE; update_fx := '1'; @@ -827,7 +866,7 @@ begin fp_result <= r.r; else fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r, - r.single_prec); + r.single_prec, r.quieten_nan); end if; if r.update_fprf = '1' then v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class, From 
36130f1db351eec8bf19b00f4e2f9dd92276810b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 19 Jul 2020 11:53:01 +1000 Subject: [PATCH 12/30] tests/fpu: Add tests for frsp Signed-off-by: Paul Mackerras --- tests/fpu/fpu.c | 89 ++++++++++++++++++++++++++++++++----- tests/test_fpu.bin | Bin 13504 -> 14032 bytes tests/test_fpu.console_out | 1 + 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 80751d1..aff6d6c 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -10,6 +10,17 @@ #define MSR_FE0 0x800 #define MSR_FE1 0x100 +#define FPS_RN_NEAR 0 +#define FPS_RN_ZERO 1 +#define FPS_RN_CEIL 2 +#define FPS_RN_FLOOR 3 +#define FPS_XE 0x8 +#define FPS_ZE 0x10 +#define FPS_UE 0x20 +#define FPS_OE 0x40 +#define FPS_VE 0x80 +#define FPS_VXSOFT 0x400 + extern int trapit(long arg, int (*func)(long)); extern void do_rfid(unsigned long msr); extern void do_blr(void); @@ -363,8 +374,8 @@ int test5a(long arg) { set_fpscr(0); enable_fp_interrupts(); - set_fpscr(0x80); /* set VE */ - set_fpscr(0x480); /* set VXSOFT */ + set_fpscr(FPS_VE); /* set VE */ + set_fpscr(FPS_VXSOFT | FPS_VE); /* set VXSOFT */ set_fpscr(0); return 1; /* not supposed to get here */ } @@ -374,8 +385,8 @@ int test5b(long arg) unsigned long msr; enable_fp(); - set_fpscr(0x80); /* set VE */ - set_fpscr(0x480); /* set VXSOFT */ + set_fpscr(FPS_VE); /* set VE */ + set_fpscr(FPS_VXSOFT | FPS_VE); /* set VXSOFT */ asm("mfmsr %0" : "=r" (msr)); msr |= MSR_FE0 | MSR_FE1; asm("mtmsrd %0; xori 4,4,0" : : "r" (msr)); @@ -388,8 +399,8 @@ int test5c(long arg) unsigned long msr; enable_fp(); - set_fpscr(0x80); /* set VE */ - set_fpscr(0x480); /* set VXSOFT */ + set_fpscr(FPS_VE); /* set VE */ + set_fpscr(FPS_VXSOFT | FPS_VE); /* set VXSOFT */ asm("mfmsr %0" : "=r" (msr)); msr |= MSR_FE0 | MSR_FE1; do_rfid(msr); @@ -463,6 +474,12 @@ int test6(long arg) return 0; } +int fpu_test_6(void) +{ + enable_fp(); + return trapit(0, test6); +} + struct int_fp_equiv { long ival; 
unsigned long fp; @@ -522,16 +539,63 @@ int test7(long arg) return 0; } -int fpu_test_6(void) +int fpu_test_7(void) { enable_fp(); - return trapit(0, test6); + return trapit(0, test7); } -int fpu_test_7(void) +struct roundvals { + unsigned long fpscr; + unsigned long dpval; + unsigned long spval; +} roundvals[] = { + { FPS_RN_NEAR, 0, 0 }, + { FPS_RN_CEIL, 0x8000000000000000, 0x8000000000000000 }, + { FPS_RN_NEAR, 0x402123456789abcd, 0x4021234560000000 }, + { FPS_RN_ZERO, 0x402123456789abcd, 0x4021234560000000 }, + { FPS_RN_CEIL, 0x402123456789abcd, 0x4021234580000000 }, + { FPS_RN_FLOOR, 0x402123456789abcd, 0x4021234560000000 }, + { FPS_RN_NEAR, 0x402123457689abcd, 0x4021234580000000 }, + { FPS_RN_ZERO, 0x402123457689abcd, 0x4021234560000000 }, + { FPS_RN_CEIL, 0x402123457689abcd, 0x4021234580000000 }, + { FPS_RN_FLOOR, 0x402123457689abcd, 0x4021234560000000 }, + { FPS_RN_NEAR, 0x4021234570000000, 0x4021234580000000 }, + { FPS_RN_NEAR, 0x4021234550000000, 0x4021234540000000 }, + { FPS_RN_NEAR, 0x7ff123456789abcd, 0x7ff9234560000000 }, + { FPS_RN_ZERO, 0x7ffa3456789abcde, 0x7ffa345660000000 }, + { FPS_RN_FLOOR, 0x7ff0000000000000, 0x7ff0000000000000 }, + { FPS_RN_NEAR, 0x47e1234550000000, 0x47e1234540000000 }, + { FPS_RN_NEAR, 0x47f1234550000000, 0x7ff0000000000000 }, + { FPS_RN_ZERO, 0x47f1234550000000, 0x47efffffe0000000 }, + { FPS_RN_CEIL, 0x47f1234550000000, 0x7ff0000000000000 }, + { FPS_RN_FLOOR, 0x47f1234550000000, 0x47efffffe0000000 }, + { FPS_RN_NEAR, 0x38012345b0000000, 0x38012345c0000000 }, + { FPS_RN_NEAR, 0x37c12345b0000000, 0x37c1234400000000 }, +}; + +int test8(long arg) +{ + long i; + unsigned long result; + + for (i = 0; i < sizeof(roundvals) / sizeof(roundvals[0]); ++i) { + asm("lfd 3,0(%0); lfd 4,8(%0); mtfsf 0,3,1,0; frsp 6,4; stfd 6,0(%1)" + : : "b" (&roundvals[i]), "b" (&result) : "memory"); + if (result != roundvals[i].spval) { + print_string("\r\n"); + print_hex(i, 4, " "); + print_hex(result, 16, " "); + return i + 1; + } + } + return 0; +} 
+ +int fpu_test_8(void) { enable_fp(); - return trapit(0, test7); + return trapit(0, test8); } int fail = 0; @@ -549,7 +613,9 @@ void do_test(int num, int (*test)(void)) print_string("FAIL "); print_hex(ret, 5, " SRR0="); print_hex(mfspr(SRR0), 16, " SRR1="); - print_hex(mfspr(SRR1), 16, "\r\n"); + print_hex(mfspr(SRR1), 16, " FPSCR="); + enable_fp(); + print_hex(get_fpscr(), 8, "\r\n"); } } @@ -564,6 +630,7 @@ int main(void) do_test(5, fpu_test_5); do_test(6, fpu_test_6); do_test(7, fpu_test_7); + do_test(8, fpu_test_8); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 25d50c77a0d990a40320f8baa84752efe3a6d996..81d18542064550fb7064a3683cbcdc7f7048c285 100755 GIT binary patch delta 2539 zcma)8du)?c6hGg$wsbH?$Jo|wCA3}X9^o;0?Hgm*!Zu2WQjrie20J!hA_*G64GXPU zDKk2=Ts3H-um$%AVjKYyjf4z?W@5zg2Zn^#(2;Bc+F^o&!9IWIYm0?Q;z{n#`Tfqh z=W*|CyRHpAo18?hN+NIF_oKW`&}*p7TSw#vw-MY%a2wZz%37T>rPo@YpCO;J9d7w~ z+!GOdInls{8>(7Mh~{OAEGVZC^FRDZboxTiLq5@B%C{lHh=@O)4h4zczOXkQVbZdZ z-y9LAO(n8%x43Ou26e=n(ILruGsUg(g;GT1WzNaIG@d+%RojrNc2x|F#EP3DVr}Le zxo5oiH1iw8JwIVa)+ZU{IjP09KdGv_o+Rq3I4;^IRLJjSh|3dRl9f%uo8^@n#Rpjh z1{(2fYN-?_vYgV(;#$_EwD5YW)RVQGT%!ZRl=z`zxSw4n z_FHusV~5pRl$eJjP(mBS%5y@>DVz1(U9~oZ3|6w;vC`G-HY(#X?>)8cFFbM9=!4Q$VyCU_k5f&Nw(``LS#Vwsh zOU?Jx+Fyo8kS3^%CUp|;_(;^{XXd_T;0DP*7tJ=s?Na2rDBjKACw(oZT4qY8M4e@( z{MGIwA6mAh1kcCv_Tw0?ccd5tU;WKp;3**skI)N0MeMC@;CT!^I_=*Y8{-_f4taYZ zaj;SFCpGT%2iy?2FTfp(sQ$2#DmHDTp!9P5EM1D}jz1MvH6!^+_}9tC_6 zxGj#2%vMa2z(|KJ`Opux?iN_SkoC;;$H&Bh*)6@rv8gA+uEfr@rJ!~Y~)vk ztll~+d7BV5zo4QzVjno9qrkzXw8IEn{?^!}!;?woQt%Yh9&j!#&h!u#CkmEkY4sda zh>Dwunx}I|s9|@k(6K2R9m*(cjqllo%PPf}j$><=45(@!JS-+GY$VI?s_J6`!ki0h zCAcUw3vMsp_z;i_oP!An>j0MTx;Q=ADeG>CTa%~BrkldM;T3jaQ7CnByG-0ZIce^fl<%-1)-y2w^A3`_Ut z<8vN>e+vAK*pHf=b6GkqG`Imd=PCHZkMZ+xeehp|pJQ39SVQ3Yz-=a)gsxKEv`<=? 
z?m}u`tXVz*i%e7+72ZO-p$euiDqbxtF>HZ(C@R_uore7|9fRUbp?z96Ov^1*J&t|k ziRben->T8+kC9aAt5-oEowOYdJWN zc&@0{bpY0Z+p5~jt{QEGeUcYNs}1fe?lCLVsRl;uqP26Xqofby z9ws4NlHbxw{KAEqfD~c5u)2rDjN+24dvGzu!r>v-;+5h|ebHUq(~;KVYq}$bQoT;} zO8OusPL@y{BsBf>dGosJ0-FoGpEa&VlSlbPo+LiR@fhkq4(E8vi0 a_$#z4u9NtaWelncVp^P&#)&brj{O5VEAAEm delta 1740 zcmaKse{54#6vxkd>q<99ef!a|jxM})UALBIR(`&@AuVjPb^|6^j3FQ$8w;|Lpcw|V zqZQi(435n+5@M7Mg3C0RfZIPRG1I64{6qXhgJL8g8YYMv5ZyA^^Ly70;SW8@$-U=u zzW3g9PjBD(?ZeMIiG0h6f{kCz3hu_Zg_Z;xi9%qlV69-STcS(4or|QM-Rl?1UFxaM zZytFpZZ9Vqxp+g@YYA%MN_=HGMFl_mHPN|?=O4;(r!ub)3KJAJ*F+;kuU|ZzR4CKC zQ7A9Y=M=X*{u=+Ktiv!E%Z3wUqL4P6k*woRo|+m%U}dYdNRoKDb(4I^#7C^|tevwd za1_OfBCb03>_S(6w3f-c+BrSH#^6DgPg?W1Ex$Q8+TNrUpqVN)Ueonsl?p{Q=zhzu zk=xR_Y~FJmu;r}Y2bDt7xVUlkJ-zn($*G8!B3e4>^M__Ng$MVk zD|XiP&6S6dr?|a)Be!4g z#kq07s z(J^U=zbraACw@gEHQ1<9a5q$Y9TnAa`^X6ch2LLG9VX1>&rQ1xY<396E z>y(FLmW$U~rcZZxh+O-L_CJckLO7$P_!%B9*4}guwY>eiu3rTfrobVkw(GiHF91w1 zAAA686r(V~8sIT}JyeXs1ltPl0&@re6Jr?OcZ2gwoN~h-yt1TB4&CGzOUk4Re7R&^ z#33R7Czgn@#_Y}X@9CBHm}%uDqu&Np1#TGCb@4E%+N>h~B(g?7LXmGq-t#~C81fCs z??|i{``4Rk)y(=$bR2@?{~-`}`U3eo$cw{-sqhP6cfrJcsVdD+z0xYP4_mKE6fW+4 z7aHFfe_U!eI%ITxjIWfsjSZNd9^)*R(-_5c@hyJBWj79B>bcFGF1!CcrqSCExAiL6 z0kD}8L|NltJz!O$1XvwBBL(I1n5)*e0WZ?R;$@rwiwOWLh5O*P2|nj`yXq%&T_9mX9)&l9 jdC&-9g7v_|g9Qsl41?YUhowQIH`T!N7n`_f=?DJ+=ocTF diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index 340756c..25e791c 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -5,3 +5,4 @@ test 04:PASS test 05:PASS test 06:PASS test 07:PASS +test 08:PASS From 03d1aa968a76f338c4caf9c742e9e59d8a8d13e0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Jul 2020 12:19:12 +1000 Subject: [PATCH 13/30] FPU: Implement floating convert to integer instructions This implements fctiw, fctiwz, fctiwu, fctiwuz, fctid, fctidz, fctidu and fctiduz, and adds tests for them. There are some subtleties around the setting of the inexact (XX) and invalid conversion (VXCVI) flags in the FPSCR. 
If the rounded value ends up being out of range, we need to set VXCVI and not XX. For a conversion to unsigned word or doubleword of a negative value that rounds to zero, we need to set XX and not VXCVI. Signed-off-by: Paul Mackerras --- decode1.vhdl | 8 ++ fpu.vhdl | 157 ++++++++++++++++++++++++++++++++++++- tests/fpu/fpu.c | 157 +++++++++++++++++++++++++++++++++++++ tests/test_fpu.console_out | 2 + 4 files changed, 321 insertions(+), 3 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 284fb08..c659e3e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -442,8 +442,16 @@ architecture behaviour of decode1 is 2#100000100# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/8=fnabs 2#100001000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 8/8=fabs 2#110000000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- 0/12=frsp + 2#111000000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 0/14=fctiw + 2#111000100# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/14=fctiwu + 2#111011001# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 25/14=fctid 2#111011010# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 26/14=fcfid + 2#111011101# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 29/14=fctidu 2#111011110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 30/14=fcfidu + 2#111100000# => 
(FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 0/15=fctiwz + 2#111100100# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/15=fctiwuz + 2#111111001# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 25/15=fctidz + 2#111111101# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 29/15=fctiduz others => illegal_inst ); diff --git a/fpu.vhdl b/fpu.vhdl index 7576562..6301fa7 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -38,8 +38,10 @@ architecture behaviour of fpu is type state_t is (IDLE, DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, DO_FMR, - DO_FCFID, + DO_FCFID, DO_FCTI, DO_FRSP, + INT_SHIFT, INT_ROUND, INT_ISHIFT, + INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, ROUND_UFLOW, ROUND_OFLOW, ROUNDING, ROUNDING_2, ROUNDING_3, @@ -363,6 +365,8 @@ begin variable clz : std_ulogic_vector(5 downto 0); variable set_x : std_ulogic; variable mshift : signed(EXP_BITS-1 downto 0); + variable need_check : std_ulogic; + variable msb : std_ulogic; begin v := r; illegal := '0'; @@ -461,8 +465,15 @@ begin when "01100" => v.state := DO_FRSP; when "01110" => - -- fcfid[u][s] - v.state := DO_FCFID; + if int_input = '1' then + -- fcfid[u][s] + v.state := DO_FCFID; + else + v.state := DO_FCTI; + end if; + when "01111" => + v.round_mode := "001"; + v.state := DO_FCTI; when others => illegal := '1'; end case; @@ -603,6 +614,47 @@ begin arith_done := '1'; end if; + when DO_FCTI => + -- instr bit 9: 1=dword 0=word + -- instr bit 8: 1=unsigned 0=signed + -- instr bit 1: 1=round to zero 0=use fpscr[RN] + opsel_a <= AIN_B; + v.result_class := r.b.class; + v.result_sign := r.b.negative; + v.result_exp := r.b.exponent; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + if r.b.class = NAN 
and r.b.mantissa(53) = '0' then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + + v.int_result := '1'; + case r.b.class is + when ZERO => + arith_done := '1'; + when FINITE => + if r.b.exponent >= to_signed(64, EXP_BITS) or + (r.insn(9) = '0' and r.b.exponent >= to_signed(32, EXP_BITS)) then + v.state := INT_OFLOW; + elsif r.b.exponent >= to_signed(52, EXP_BITS) then + -- integer already, no rounding required, + -- shift into final position + v.shift := r.b.exponent - to_signed(54, EXP_BITS); + if r.insn(8) = '1' and r.b.negative = '1' then + v.state := INT_OFLOW; + else + v.state := INT_ISHIFT; + end if; + else + v.shift := r.b.exponent - to_signed(52, EXP_BITS); + v.state := INT_SHIFT; + end if; + when INFINITY | NAN => + v.state := INT_OFLOW; + end case; + when DO_FCFID => v.result_sign := '0'; opsel_a <= AIN_B; @@ -622,6 +674,81 @@ begin v.state := FINISH; end if; + when INT_SHIFT => + opsel_r <= RES_SHIFT; + set_x := '1'; + v.state := INT_ROUND; + v.shift := to_signed(-2, EXP_BITS); + + when INT_ROUND => + opsel_r <= RES_SHIFT; + round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign); + v.fpscr(FPSCR_FR downto FPSCR_FI) := round; + -- Check for negative values that don't round to 0 for fcti*u* + if r.insn(8) = '1' and r.result_sign = '1' and + (r_hi_nz or r_lo_nz or v.fpscr(FPSCR_FR)) = '1' then + v.state := INT_OFLOW; + else + v.state := INT_FINAL; + end if; + + when INT_ISHIFT => + opsel_r <= RES_SHIFT; + v.state := INT_FINAL; + + when INT_FINAL => + -- Negate if necessary, and increment for rounding if needed + opsel_ainv <= r.result_sign; + carry_in <= r.fpscr(FPSCR_FR) xor r.result_sign; + -- Check for possible overflows + case r.insn(9 downto 8) is + when "00" => -- fctiw[z] + need_check := r.r(31) or (r.r(30) and not r.result_sign); + when "01" => -- fctiwu[z] + need_check := r.r(31); + when "10" => -- fctid[z] + need_check := r.r(63) or (r.r(62) and not r.result_sign); + when others => -- fctidu[z] + 
need_check := r.r(63); + end case; + if need_check = '1' then + v.state := INT_CHECK; + else + if r.fpscr(FPSCR_FI) = '1' then + v.fpscr(FPSCR_XX) := '1'; + end if; + arith_done := '1'; + end if; + + when INT_CHECK => + if r.insn(9) = '0' then + msb := r.r(31); + else + msb := r.r(63); + end if; + misc_sel <= '1' & r.insn(9 downto 8) & r.result_sign; + if (r.insn(8) = '0' and msb /= r.result_sign) or + (r.insn(8) = '1' and msb /= '1') then + opsel_r <= RES_MISC; + v.fpscr(FPSCR_VXCVI) := '1'; + invalid := '1'; + else + if r.fpscr(FPSCR_FI) = '1' then + v.fpscr(FPSCR_XX) := '1'; + end if; + end if; + arith_done := '1'; + + when INT_OFLOW => + opsel_r <= RES_MISC; + misc_sel <= '1' & r.insn(9 downto 8) & r.result_sign; + if r.b.class = NAN then + misc_sel(0) <= '1'; + end if; + v.fpscr(FPSCR_VXCVI) := '1'; + invalid := '1'; + arith_done := '1'; + when FINISH => if r.r(63 downto 54) /= "0000000001" then renormalize := '1'; @@ -846,6 +973,30 @@ begin when "0011" => -- mantissa of max representable SP number misc := x"007fffff80000000"; + when "1000" => + -- max positive result for fctiw[z] + misc := x"000000007fffffff"; + when "1001" => + -- max negative result for fctiw[z] + misc := x"ffffffff80000000"; + when "1010" => + -- max positive result for fctiwu[z] + misc := x"00000000ffffffff"; + when "1011" => + -- max negative result for fctiwu[z] + misc := x"0000000000000000"; + when "1100" => + -- max positive result for fctid[z] + misc := x"7fffffffffffffff"; + when "1101" => + -- max negative result for fctid[z] + misc := x"8000000000000000"; + when "1110" => + -- max positive result for fctidu[z] + misc := x"ffffffffffffffff"; + when "1111" => + -- max negative result for fctidu[z] + misc := x"0000000000000000"; when others => misc := x"0000000000000000"; end case; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index aff6d6c..3c6a9bd 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -19,6 +19,7 @@ #define FPS_UE 0x20 #define FPS_OE 0x40 #define FPS_VE 0x80 
+#define FPS_VXCVI 0x100 #define FPS_VXSOFT 0x400 extern int trapit(long arg, int (*func)(long)); @@ -598,6 +599,160 @@ int fpu_test_8(void) return trapit(0, test8); } +struct cvtivals { + unsigned long dval; + long lval; + unsigned long ulval; + int ival; + unsigned int uival; + unsigned char invalids[4]; +} cvtivals[] = { + { 0x0000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} }, + { 0x8000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} }, + { 0x3fdfffffffffffff, 0, 0, 0, 0, {0, 0, 0, 0} }, + { 0x3ff0000000000000, 1, 1, 1, 1, {0, 0, 0, 0} }, + { 0xbff0000000000000, -1, 0, -1, 0, {0, 1, 0, 1} }, + { 0x402123456789abcd, 9, 9, 9, 9, {0, 0, 0, 0} }, + { 0x406123456789abcd, 137, 137, 137, 137, {0, 0, 0, 0} }, + { 0x409123456789abcd, 1097, 1097, 1097, 1097, {0, 0, 0, 0} }, + { 0x41c123456789abcd, 0x22468acf, 0x22468acf, 0x22468acf, 0x22468acf, {0, 0, 0, 0} }, + { 0x41d123456789abcd, 0x448d159e, 0x448d159e, 0x448d159e, 0x448d159e, {0, 0, 0, 0} }, + { 0x41e123456789abcd, 0x891a2b3c, 0x891a2b3c, 0x7fffffff, 0x891a2b3c, {0, 0, 1, 0} }, + { 0x41f123456789abcd, 0x112345679, 0x112345679, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0xc1f123456789abcd, -0x112345679, 0, 0x80000000, 0, {0, 1, 1, 1} }, + { 0x432123456789abcd, 0x891a2b3c4d5e6, 0x891a2b3c4d5e6, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0x433123456789abcd, 0x1123456789abcd, 0x1123456789abcd, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0x434123456789abcd, 0x22468acf13579a, 0x22468acf13579a, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0x43c123456789abcd, 0x22468acf13579a00, 0x22468acf13579a00, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0x43d123456789abcd, 0x448d159e26af3400, 0x448d159e26af3400, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0x43e123456789abcd, 0x7fffffffffffffff, 0x891a2b3c4d5e6800, 0x7fffffff, 0xffffffff, {1, 0, 1, 1} }, + { 0x43f123456789abcd, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} }, + { 0xc3f123456789abcd, 0x8000000000000000, 0, 0x80000000, 0, {1, 1, 1, 1} }, + { 
0x7ff0000000000000, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} }, + { 0xfff0000000000000, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, + { 0x7ff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, + { 0xfff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, + { 0xbfd123456789abcd, 0, 0, 0, 0, {0, 0, 0, 0} }, +}; + +#define GET_VXCVI() ((get_fpscr() >> 8) & 1) + +int test9(long arg) +{ + long i; + int ires; + unsigned int ures; + long lres; + unsigned long ulres; + unsigned char inv[4]; + struct cvtivals *vp = cvtivals; + + for (i = 0; i < sizeof(cvtivals) / sizeof(cvtivals[0]); ++i, ++vp) { + set_fpscr(FPS_RN_NEAR); + asm("lfd 3,0(%0); fctid 4,3; stfd 4,0(%1)" + : : "b" (&vp->dval), "b" (&lres) : "memory"); + inv[0] = GET_VXCVI(); + set_fpscr(FPS_RN_NEAR); + asm("fctidu 5,3; stfd 5,0(%0)" : : "b" (&ulres) : "memory"); + inv[1] = GET_VXCVI(); + set_fpscr(FPS_RN_NEAR); + asm("fctiw 6,3; stfiwx 6,0,%0" : : "b" (&ires) : "memory"); + inv[2] = GET_VXCVI(); + set_fpscr(FPS_RN_NEAR); + asm("fctiwu 7,3; stfiwx 7,0,%0" : : "b" (&ures) : "memory"); + inv[3] = GET_VXCVI(); + + if (lres != vp->lval || ulres != vp->ulval || ires != vp->ival || ures != vp->uival || + inv[0] != vp->invalids[0] || inv[1] != vp->invalids[1] || + inv[2] != vp->invalids[2] || inv[3] != vp->invalids[3]) { + print_hex(lres, 16, inv[0]? "V ": " "); + print_hex(ulres, 16, inv[1]? "V ": " "); + print_hex(ires, 8, inv[2]? "V ": " "); + print_hex(ures, 8, inv[3]? 
"V ": " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_9(void) +{ + enable_fp(); + return trapit(0, test9); +} + +struct cvtivals cvtizvals[] = { + { 0x0000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} }, + { 0x8000000000000000, 0, 0, 0, 0, {0, 0, 0, 0} }, + { 0x3fdfffffffffffff, 0, 0, 0, 0, {0, 0, 0, 0} }, + { 0x3ff0000000000000, 1, 1, 1, 1, {0, 0, 0, 0} }, + { 0xbff0000000000000, -1, 0, -1, 0, {0, 1, 0, 1} }, + { 0x402123456789abcd, 8, 8, 8, 8, {0, 0, 0, 0} }, + { 0x406123456789abcd, 137, 137, 137, 137, {0, 0, 0, 0} }, + { 0x409123456789abcd, 1096, 1096, 1096, 1096, {0, 0, 0, 0} }, + { 0x41c123456789abcd, 0x22468acf, 0x22468acf, 0x22468acf, 0x22468acf, {0, 0, 0, 0} }, + { 0x41d123456789abcd, 0x448d159e, 0x448d159e, 0x448d159e, 0x448d159e, {0, 0, 0, 0} }, + { 0x41e123456789abcd, 0x891a2b3c, 0x891a2b3c, 0x7fffffff, 0x891a2b3c, {0, 0, 1, 0} }, + { 0x41f123456789abcd, 0x112345678, 0x112345678, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0xc1f123456789abcd, -0x112345678, 0, 0x80000000, 0, {0, 1, 1, 1} }, + { 0x432123456789abcd, 0x891a2b3c4d5e6, 0x891a2b3c4d5e6, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0x433123456789abcd, 0x1123456789abcd, 0x1123456789abcd, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0x434123456789abcd, 0x22468acf13579a, 0x22468acf13579a, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0x43c123456789abcd, 0x22468acf13579a00, 0x22468acf13579a00, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0x43d123456789abcd, 0x448d159e26af3400, 0x448d159e26af3400, 0x7fffffff, 0xffffffff, {0, 0, 1, 1} }, + { 0x43e123456789abcd, 0x7fffffffffffffff, 0x891a2b3c4d5e6800, 0x7fffffff, 0xffffffff, {1, 0, 1, 1} }, + { 0x43f123456789abcd, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} }, + { 0xc3f123456789abcd, 0x8000000000000000, 0, 0x80000000, 0, {1, 1, 1, 1} }, + { 0x7ff0000000000000, 0x7fffffffffffffff, 0xffffffffffffffff, 0x7fffffff, 0xffffffff, {1, 1, 1, 1} }, + { 0xfff0000000000000, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, + 
{ 0x7ff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, + { 0xfff923456789abcd, 0x8000000000000000, 0, 0x80000000, 0, { 1, 1, 1, 1 } }, +}; + +int test10(long arg) +{ + long i; + int ires; + unsigned int ures; + long lres; + unsigned long ulres; + unsigned char inv[4]; + struct cvtivals *vp = cvtizvals; + + for (i = 0; i < sizeof(cvtizvals) / sizeof(cvtizvals[0]); ++i, ++vp) { + set_fpscr(FPS_RN_NEAR); + asm("lfd 3,0(%0); fctidz 4,3; stfd 4,0(%1)" + : : "b" (&vp->dval), "b" (&lres) : "memory"); + inv[0] = GET_VXCVI(); + set_fpscr(FPS_RN_NEAR); + asm("fctiduz 5,3; stfd 5,0(%0)" : : "b" (&ulres) : "memory"); + inv[1] = GET_VXCVI(); + set_fpscr(FPS_RN_NEAR); + asm("fctiwz 6,3; stfiwx 6,0,%0" : : "b" (&ires) : "memory"); + inv[2] = GET_VXCVI(); + set_fpscr(FPS_RN_NEAR); + asm("fctiwuz 7,3; stfiwx 7,0,%0" : : "b" (&ures) : "memory"); + inv[3] = GET_VXCVI(); + + if (lres != vp->lval || ulres != vp->ulval || ires != vp->ival || ures != vp->uival || + inv[0] != vp->invalids[0] || inv[1] != vp->invalids[1] || + inv[2] != vp->invalids[2] || inv[3] != vp->invalids[3]) { + print_hex(lres, 16, inv[0]? "V ": " "); + print_hex(ulres, 16, inv[1]? "V ": " "); + print_hex(ires, 8, inv[2]? "V ": " "); + print_hex(ures, 8, inv[3]? 
"V ": " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_10(void) +{ + enable_fp(); + return trapit(0, test10); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -631,6 +786,8 @@ int main(void) do_test(6, fpu_test_6); do_test(7, fpu_test_7); do_test(8, fpu_test_8); + do_test(9, fpu_test_9); + do_test(10, fpu_test_10); return fail; } diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index 25e791c..3e84260 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -6,3 +6,5 @@ test 05:PASS test 06:PASS test 07:PASS test 08:PASS +test 09:PASS +test 10:PASS From 0ad2aa30149d0a6e2d3082e841f6fe5079209067 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Jul 2020 16:13:12 +1000 Subject: [PATCH 14/30] FPU: Implement floating round-to-integer instructions This implements frin, friz, frip and frim, and adds tests for them. Signed-off-by: Paul Mackerras --- decode1.vhdl | 4 +++ fpu.vhdl | 40 +++++++++++++++++++-- tests/fpu/fpu.c | 71 +++++++++++++++++++++++++++++++++++++ tests/test_fpu.bin | Bin 14032 -> 21208 bytes tests/test_fpu.console_out | 1 + 5 files changed, 114 insertions(+), 2 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index c659e3e..a42899d 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -441,6 +441,10 @@ architecture behaviour of decode1 is 2#100000010# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 2/8=fmr 2#100000100# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/8=fnabs 2#100001000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 8/8=fabs + 2#100001100# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 12/8=frin + 2#100001101# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', 
'0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 13/8=friz + 2#100001110# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 14/8=frip + 2#100001111# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 15/8=frim 2#110000000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- 0/12=frsp 2#111000000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 0/14=fctiw 2#111000100# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/14=fctiwu diff --git a/fpu.vhdl b/fpu.vhdl index 6301fa7..371fdc5 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -39,7 +39,8 @@ architecture behaviour of fpu is DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, DO_FMR, DO_FCFID, DO_FCTI, - DO_FRSP, + DO_FRSP, DO_FRI, + FRI_1, INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, @@ -461,7 +462,11 @@ begin v.state := DO_MTFSF; end if; when "01000" => - v.state := DO_FMR; + if e_in.insn(9 downto 8) /= "11" then + v.state := DO_FMR; + else + v.state := DO_FRI; + end if; when "01100" => v.state := DO_FRSP; when "01110" => @@ -587,6 +592,31 @@ begin v.instr_done := '1'; v.state := IDLE; + when DO_FRI => -- fri[nzpm] + opsel_a <= AIN_B; + v.result_class := r.b.class; + v.result_sign := r.b.negative; + v.result_exp := r.b.exponent; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + if r.b.class = NAN and r.b.mantissa(53) = '0' then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + if r.b.class = FINITE then + if r.b.exponent >= to_signed(52, EXP_BITS) then + -- integer already, no rounding required + arith_done := '1'; + else + v.shift := r.b.exponent - 
to_signed(52, EXP_BITS); + v.state := FRI_1; + v.round_mode := '1' & r.insn(7 downto 6); + end if; + else + arith_done := '1'; + end if; + when DO_FRSP => opsel_a <= AIN_B; v.result_class := r.b.class; @@ -749,6 +779,12 @@ begin invalid := '1'; arith_done := '1'; + when FRI_1 => + opsel_r <= RES_SHIFT; + set_x := '1'; + v.shift := to_signed(-2, EXP_BITS); + v.state := ROUNDING; + when FINISH => if r.r(63 downto 54) /= "0000000001" then renormalize := '1'; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 3c6a9bd..d24fe14 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -753,6 +753,76 @@ int fpu_test_10(void) return trapit(0, test10); } +struct frivals { + unsigned long val; + unsigned long nval; + unsigned long zval; + unsigned long pval; + unsigned long mval; +} frivals[] = { + { 0x0000000000000000, 0, 0, 0, 0 }, + { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 }, + { 0x3fdfffffffffffff, 0, 0, 0x3ff0000000000000, 0 }, + { 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 }, + { 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000 }, + { 0x402123456789abcd, 0x4022000000000000, 0x4020000000000000, 0x4022000000000000, 0x4020000000000000 }, + { 0x406123456789abcd, 0x4061200000000000, 0x4061200000000000, 0x4061400000000000, 0x4061200000000000 }, + { 0x409123456789abcd, 0x4091240000000000, 0x4091200000000000, 0x4091240000000000, 0x4091200000000000 }, + { 0x41c123456789abcd, 0x41c1234567800000, 0x41c1234567800000, 0x41c1234568000000, 0x41c1234567800000 }, + { 0x41d123456789abcd, 0x41d1234567800000, 0x41d1234567800000, 0x41d1234567c00000, 0x41d1234567800000 }, + { 0x41e123456789abcd, 0x41e1234567800000, 0x41e1234567800000, 0x41e1234567a00000, 0x41e1234567800000 }, + { 0x41f123456789abcd, 0x41f1234567900000, 0x41f1234567800000, 0x41f1234567900000, 0x41f1234567800000 }, + { 0xc1f123456789abcd, 0xc1f1234567900000, 
0xc1f1234567800000, 0xc1f1234567800000, 0xc1f1234567900000 }, + { 0xc1f1234567880000, 0xc1f1234567900000, 0xc1f1234567800000, 0xc1f1234567800000, 0xc1f1234567900000 }, + { 0x432123456789abcd, 0x432123456789abce, 0x432123456789abcc, 0x432123456789abce, 0x432123456789abcc }, + { 0x433123456789abcd, 0x433123456789abcd, 0x433123456789abcd, 0x433123456789abcd, 0x433123456789abcd }, + { 0x434123456789abcd, 0x434123456789abcd, 0x434123456789abcd, 0x434123456789abcd, 0x434123456789abcd }, + { 0x43c123456789abcd, 0x43c123456789abcd, 0x43c123456789abcd, 0x43c123456789abcd, 0x43c123456789abcd }, + { 0x43d123456789abcd, 0x43d123456789abcd, 0x43d123456789abcd, 0x43d123456789abcd, 0x43d123456789abcd }, + { 0x43e123456789abcd, 0x43e123456789abcd, 0x43e123456789abcd, 0x43e123456789abcd, 0x43e123456789abcd }, + { 0x43f123456789abcd, 0x43f123456789abcd, 0x43f123456789abcd, 0x43f123456789abcd, 0x43f123456789abcd }, + { 0xc3f123456789abcd, 0xc3f123456789abcd, 0xc3f123456789abcd, 0xc3f123456789abcd, 0xc3f123456789abcd }, + { 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000, 0xfff0000000000000 }, + { 0x7ff123456789abcd, 0x7ff923456789abcd, 0x7ff923456789abcd, 0x7ff923456789abcd, 0x7ff923456789abcd }, + { 0xfff923456789abcd, 0xfff923456789abcd, 0xfff923456789abcd, 0xfff923456789abcd, 0xfff923456789abcd }, +}; + +int test11(long arg) +{ + long i; + unsigned long results[4]; + struct frivals *vp = frivals; + + for (i = 0; i < sizeof(frivals) / sizeof(frivals[0]); ++i, ++vp) { + set_fpscr(FPS_RN_FLOOR); + asm("lfd 3,0(%0); frin 4,3; stfd 4,0(%1)" + : : "b" (&vp->val), "b" (results) : "memory"); + set_fpscr(FPS_RN_NEAR); + asm("friz 5,3; stfd 5,8(%0)" : : "b" (results) : "memory"); + set_fpscr(FPS_RN_ZERO); + asm("frip 5,3; stfd 5,16(%0)" : : "b" (results) : "memory"); + set_fpscr(FPS_RN_CEIL); + asm("frim 5,3; stfd 5,24(%0)" : : "b" (results) : "memory"); + 
if (results[0] != vp->nval || results[1] != vp->zval || + results[2] != vp->pval || results[3] != vp->mval) { + print_hex(i, 2, "\r\n"); + print_hex(results[0], 16, " "); + print_hex(results[1], 16, " "); + print_hex(results[2], 16, " "); + print_hex(results[3], 16, " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_11(void) +{ + enable_fp(); + return trapit(0, test11); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -788,6 +858,7 @@ int main(void) do_test(8, fpu_test_8); do_test(9, fpu_test_9); do_test(10, fpu_test_10); + do_test(11, fpu_test_11); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 81d18542064550fb7064a3683cbcdc7f7048c285..d2320cd960e8c39417367d3b8d7fc730494c9eca 100755 GIT binary patch literal 21208 zcmeHP4RBP~bv~=#6+hAvlUV#1o)7^^QBu(%&J&W*TS;Ic{IlUFV8?M*UNJ^Mrg%cy zm3hmq$hL+IC1C6pJ75bBR&7G5(~_w@1F;m_2_#MnDNaR9u!KpfACoL;n=DA?^*i_8 zCq1nsMBK(q2fdl6bMHCdJ@?#m&pq$HyAOzrGNPK=AEehbfUcqCHMK;3@YaL39=!Ey zBFop;`c|(CmDBp-ub*DuFqyV`o0IE9@2?`$pqk>iMHNI^DN&C92kALM(7MQ;96u51 zmn~PE68hMS{+cU&A)*NS*@1p`pr0LU4$s|D+nl^Tl#>#d{Qa$VgLS6`$9S6`;qtFKVU)fRH~o}irGQ{?S! 
zquSnfYV7T#1HCb7?Tu4M?;Ij?ExC3j+BVxIZ42h^Fky?ywlr7FGZS(47+_Bq?CF9% z-LR({_H@IZZrIZed%9syH|*(#Jw33e2ln*9o*vlK1ABU4PY>+rfh`g3a+c|FxjM;} zqcTgbd#+*%bH)Hq~-WvP!2YaBo!h?CvFe%xu zYaDkU@n!cPxyk)g%1s{sP41skZ_fBX;&2<%ogIIac8iZc{T3g;`xYO6#?2YOz5XIM zxqljWbH+a;A$`OQ+y6BF79anFTYUTzZ}IVG4nKZ-eQ2Y|jZx(OQRI;kNv&+<1ZJ%O8JyRxwf6k5Sas`>5ZuNATWx-?El6hHuZFieia9 z3G9#FIh7!zZL^Ax+}ji;I`q!bVPwC)Hv-u=-r1{UvJdiSL+_Eh{oy04{9#kd(4D;O zF{0+bI42PFJW)~XrrhGmH3gxV<$~QW{fOwrpMEBLof)%o8iO?fb3O&kEoTEOTohQF z_z2 zy|6_;njR460&SMJD>!Rzi1Y%p)7omZTOFMS6*Q1a$U_-yAi&UW@=&Vs!cF}Hqw;;&`AxY)n$uaPY4v6{e3nf4wMWzQLq(LM2bboCTBcC7t?LB`b^@>|z@F)ks|qo< z@yC<8a4yzGMuBj$4f5y_Lc1a0P)dtFdY2P*Ta{3VNsZ5YpS;iBZib0YS`r5yin!7dk_FBwk@^Y((@;&U9k4S4Hu1c2 z%xZzCzEbs}p>JS1?K3BWuZCM_;?4e*Gq81pF_k zvHhX_I_*w;7ACH5qIU6&59==!-oE9>JpZcD;z+G>-&CZU4EO%DObdJ_q}xQskD;#CQw% ze5sovdfWOJ^ddSC&li0i=+vmT(*yn#Rd-WJ)H6_@{96lpYdifIn;XyF;M9ENv-%>> zl7P7ZcJ$|{1RL4!Q^+W8PF{lijJ@a%-uJeb1eeYYaa^+f8o7$|=*xE;S@DRuZ$?Ys zzL~rSJb^u6XWPCR+9eO=o!vLnxP(XkSvQ@8ofuD10e+*}iaq(mrS0nj`W@I$uzfr> z9s`ffz}W5qAG9-1jjc@8i&Xv7sOu;*hTE|o`#H&DJ?x=s5BkhgeeS~ZFB|nDRga*q zp{}d?M)BOVO7bvwg=){J_8RP>Avv7=bGySov`hMjeZzYt_RR$1-w^#ek^uZ%f5J`M zPwc#du4QX2!cQJrf;sN!%toKQ=dE`5B~?6!pD#^OJ_+`|7v?$tEcom%l*PC|iMl;* z-d`QXT5yq75_QI3!dhV4rxIaqGsN7w@qWT{n@sIG_9w_sO?ndg<+1D*WAPRn)Dg!m zavqPxGcAHKe51{v)_%TkT2r5!4uN-|pSKGiF)Ij7f)4djhp_Hl697OdHaOUJ>torrbzCqGB>@tplx z?={S4SRY5$Ox_zejFb7phjm`0-ig*I-U7tX{i@74@2!cBb&KDW@eWkHGsXA-`>3-q zs}?>T{F|RMe@{W%TG+sAn)jjZ#E$wh!zxbOvDfmybtDb9A%!LikW+BQsTyD!SHI!#@xjn<2jB+lQJ2Ol-%DivS!#bWy3kx@0+iGwg8MLOy zBPlud-QMwo=hYLvv3Y$+&FgWIR|e1PLuy_R^vee3b%vVPzwDO{%l z#uB1FvG!j;uFAvr92CnH=QT&u-wX2^NXek;gIOmx^RvdH{5)>=mepv)Vg}xE$sdb7 z5Xj1+@~o`(h3V9^ke@FtPs}RO=z+~yS=o^N{$b?yl60$OL0{~<4_m#F1!2p-pzijr z&7V0@Z(>a&N3jiu`t(!h!+J}=3{HDJ>_V>Kyvpw?25ge&3t|0EEwF;~XdUD`%T{~< zd%E;j0;XQQBpjTU8#;;o0k?Y_If2{p+0&icd#ffP*PcL|Z>F^L&B5M%)?N7x7v`)B z+UHY@`4}(F&#R1Y=_fx8+bga3FG+$FmdQ( z8{^eD-LPUr2wp8&otJrHzJ$p2l=hn)ld$%gzz$akjb&`~0E=r5kc 
zxmA9x(??T)ivSm)34IZYSaZnOQsIAnb5iv9=DNRQ-2wk1XIgh#1Qa|MF5F;K_n5<((WrZ)x5 z4m|4_cM0p?iJJ~wJ9OL)9Hj*OuPWS|3Aj6edj&YWWLWy%fVo@Y_9x(G0JjUcdw`om zP2mW}w?<(fRal(K7)gtdZ%&#U2nMGf4`*0MIKR;n}f8U#c6QkqtCG1OiY#kU|Lz3JJ@mUYQE3aJ^)p^}>ozDxa zmPhr=m~*c4nPFr_RPRuAJ`2oS8P!j#I-lunM7>qj`MmGRFGTh4sCqi;=TJYO>U^Fz zdsS55t?C)5Z$Q0K)yJWJ6!k|{eLU)YsMn&7wWhOAllDA}edqPh${06YoF|gsjdOmS zDQ+jcR~CtVPeIpYyb~NwA@k6^S_o_H(f4uINVKj~@MieUg=v8mGOrhOc@(yL0BbPT zjTmEmH}L8=3FZ2MxEjfh!eQ*t`t2y_5_a6@N1X9qbFd9=i8B_a)vw@lW?{psp=a#Q z=Mcv>@0%^l`L6%&Hwn*4tQqmH50~ku5F6uYMjr16C!)@KN6w@8p<=0HjIIdw#ldOo zOm1@|yCB5A<>w)vi^hQMcLWNy9n?Z)8o6t6Hwc>-Q;1r$GD>qdU`*iWOFsO}zTh)g zmKzB&YB9>7*3b9!Jr{J+YS6Z(0rPxiI)68c`V-6Dp_F(*!|Pc!4H4)O>k;iHKt?q8 zX~st^O~wFx=iW0*`J6;K>HF>mob5~beUg`q_t6RcJVW5@*f?~btfvEmv8Z+i#)0|c zG16U6Tb8>xTXT#Pdgc6LB|O53;YL{rU^nm@ABo0 zS?pbcZ%M&lzHC_$M#yv?)5X5!mCMR#mxb?aEqrHd;X7Lk-`QID&epwC_Rt9<=Y_e86W) ze3k?~;#`S!aG7OxZe||Nyv5~uY#F5ly{>5!T*SO=+_T@atcy}CcyK=b7kEAg&p^58 zqGiP#JS;?-@Jq|m96X#$P&Rn86_2TjI}f~b%&W@q(sQn=%%Vx&E%%Shq>6Xb@SY)l zFFgqHIpC0>i7J8f?oWO;={XmW-b~-;0rtLo{ucr+@Mi$){FPBsG*Fk>t zI`Syw=dL3^2zg4^$ok)ayzn~mi;%CljywhV%Y;0Lv0(f>1Ise0GR=#s7(e7^Mv!x? zR6$r{ zqZ;7FX&eC@FL{8R0lp7-j+-Iz;(suc7Kv~nwt8GX&b8L#?hGkiLNA`Q!+wg_?A7hS98HXFbaS5QpwLbpRk1t&l zeC|bD5%JgM#N|}}>g_lF>G|FGmkElZNf>v6zIl~W?<6(;Ni;mpTlvhY+k_izIy|nm zl5BvY!{bg@()@pzBmAZ4@VJgj@c9bU+K298zj2uv2p6Yc`f)5%SC}c>;>5i!T_%n? 
zR8f{~oMgsCTQs*RimG3wPvPu1j@ZgTMcLLT{d*;>S-|5$sj{hs-v};(M z>;TKUdHKMtta4Jj%p&RM8_x&TDN>=6+T{U#H!mLqRX(sOoFC>HNX!S@#`1yi%dMIZ zWE98CU2k8rbBAQz*x!hPjYv7>Mqc-^^1x8j6iuv4oV{Ndje6C-Vl>*SaHG-F z3O5?)#B&w;d4TT#@rT<39Nxx|7J;|t)?S|@07CmYt@rLr}P#jl~4pP zp1aJ)73>B^;nnk&dw!CTW>{yt$1xoJ!?B&CTBOmiw$%28kD^Rdc8&F5F&pE~m g{>yjTewU!8JAd=Nzd!%XBeQSgIzJD7@@*;kAE#Kl9{>OV delta 2116 zcmZvde@xRy6u|F$lv)s2`Bkx$TH0D^!SZ7Zei4+~p|YZtEsDq%Q>=)SnJi9JT&2}j zs>`y3J+n9xHzmp+#u@yTxI`CR)G+oBF4qKGxp#N( zy}NIF`;^eX-9U(C2_d%no3m^!s2j*4TRkBTWKGDLkTo@U7PTAR7IwGS718V2=f2+k zR&uvhj3v693bKD0W7Z{k4p7L8+{B|TA!Ot?TrX;f$4=0Y`@(GCA!AkCk1%UVB=%Q~ z?HlNSZBB;!oV6c!6LNas;v441>>ix^a^PrmE^-Emab)I+m3aLEDvV)BEl#YZVCCPn5rHmaYqgXa>u_M)#)Q@3=l#fI=IGt*Zmw7r%B^VL! zfoG{s!3T9|x!KWOk0q1JYvElx%WnSjtE@bSN?<^z( zbE=m)TqMrv*f#>9jPxxcKMH74)OoAXV}FH>wmg_yz1z*;06T>=bWSXPG1t`n$?R zH^)Orm5HHy)k=Fm#!R>A@Fc7Hb0Xv}AZ{_9uz43}IbGjLBehls+^z!R6gfK>WsC6w zTB3FV$rcCWU`TC-9&JNHv|s^W^G{Hy%cScV*mVkeT?SipMf7qTbm(mKY6$M@is``+ zsP$WDRtVZqU7CV1y`45p!P5EP(;Jg;cfNrhn*^!BK$|DQjB52H)EG=OdlC*9)=W2- ze;&LN{z5fme{lH(j2aB?+mQl?ar!}5jLdcUIjiDn@jGB_Q`z4*Pkh?;6Ucd^VzE)f z&8NgY6aTpXK-NX~x~6*_Kz=REKA&MhWIrOi6kt{J@nu@D5j?&Zxr}(+yt^}pdogh4 zif3MXQVck<#VekR`m@jd|hXaW}ye}7;DikHmy(e4T(f!tJe@)3$E-il`i*bN6tuy z>+~A3Plh)AMb;uA#*&v=46&3X%rHwhPX3Ay%G{Ow7vlfS^m&9B4-;}YpEEqGHFw81 zd4uj%BaHbGagpP|Frpe|teOK9&MY`;kp)n5;cP9AcOEdNLCuA;tvD>m^cD)~sq~{cIL?@gAA_R75k%%62X($$s-lDx1i+fF zl~$lx9e{25Ch1Nz&jg?&-yl7Xrv5S9#DHEjRZn0H%>bI+Phe?*)|!F8J(8!d)|-b+ zj%@nSaIcVMAgkb6psd2N7Fp#}C@Qo`PojD985}4yr1hg&`yBs_c-ZnJ4fezdR|@qe zKf*&CqVO&GmDJ+DmX`yR92^!Ly+MdCGMOfEiX$#O+{5+1jOAhZTsS*|Bj?41jYY!} N2(2u-U|Drk`X5<{!7%^; diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index 3e84260..3a5a601 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -8,3 +8,4 @@ test 07:PASS test 08:PASS test 09:PASS test 10:PASS +test 11:PASS From 4807d0bdb6bda1154cc39a619d0432de5ec14571 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Jul 2020 20:51:31 +1000 Subject: 
[PATCH 15/30] FPU: Implement fmrgew and fmrgow and add tests for them Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 ++ fpu.vhdl | 27 +++++++++++++++++++++++---- tests/fpu/fpu.c | 21 +++++++++++++++++++++ tests/test_fpu.bin | Bin 21208 -> 21208 bytes tests/test_fpu.console_out | 1 + 5 files changed, 47 insertions(+), 4 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index a42899d..34170dd 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -434,6 +434,8 @@ architecture behaviour of decode1 is 2#011000001# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 1/6=mtfsb1 2#011000010# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 2/6=mtfsb0 2#011000100# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/6=mtfsfi + 2#011011010# => (FPU, OP_FPOP_I, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 26/6=fmrgow + 2#011011110# => (FPU, OP_FPOP_I, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 30/6=fmrgew 2#011110010# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 18/7=mffs family 2#011110110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 22/7=mtfsf 2#100000000# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 0/8=fcpsgn diff --git a/fpu.vhdl b/fpu.vhdl index 371fdc5..e97461c 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -37,7 +37,7 @@ architecture behaviour of fpu is type state_t is (IDLE, DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, - DO_FMR, + DO_FMR, DO_FMRG, DO_FCFID, DO_FCTI, DO_FRSP, 
DO_FRI, FRI_1, @@ -450,10 +450,14 @@ begin when "00000" => v.state := DO_MCRFS; when "00110" => - if e_in.insn(8) = '0' then - v.state := DO_MTFSB; + if e_in.insn(10) = '0' then + if e_in.insn(8) = '0' then + v.state := DO_MTFSB; + else + v.state := DO_MTFSFI; + end if; else - v.state := DO_MTFSFI; + v.state := DO_FMRG; end if; when "00111" => if e_in.insn(8) = '0' then @@ -524,6 +528,15 @@ begin v.instr_done := '1'; v.state := IDLE; + when DO_FMRG => + -- fmrgew, fmrgow + opsel_r <= RES_MISC; + misc_sel <= "01" & r.insn(8) & '0'; + v.int_result := '1'; + v.writing_back := '1'; + v.instr_done := '1'; + v.state := IDLE; + when DO_MFFS => v.int_result := '1'; v.writing_back := '1'; @@ -1009,6 +1022,12 @@ begin when "0011" => -- mantissa of max representable SP number misc := x"007fffff80000000"; + when "0100" => + -- fmrgow result + misc := r.a.mantissa(31 downto 0) & r.b.mantissa(31 downto 0); + when "0110" => + -- fmrgew result + misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32); when "1000" => -- max positive result for fctiw[z] misc := x"000000007fffffff"; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index d24fe14..e7a1334 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -823,6 +823,26 @@ int fpu_test_11(void) return trapit(0, test11); } +int test12(long arg) +{ + unsigned long vals[2]; + unsigned long results[2]; + + vals[0] = 0xf0f0f0f05a5a5a5aul; + vals[1] = 0x0123456789abcdeful; + asm("lfd 5,0(%0); lfd 6,8(%0); fmrgew 7,5,6; fmrgow 8,5,6; stfd 7,0(%1); stfd 8,8(%1)" + : : "b" (vals), "b" (results) : "memory"); + if (results[0] != 0xf0f0f0f001234567ul || results[1] != 0x5a5a5a5a89abcdeful) + return 1; + return 0; +} + +int fpu_test_12(void) +{ + enable_fp(); + return trapit(0, test12); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -859,6 +879,7 @@ int main(void) do_test(9, fpu_test_9); do_test(10, fpu_test_10); do_test(11, fpu_test_11); + do_test(12, fpu_test_12); return fail; } diff --git a/tests/test_fpu.bin 
b/tests/test_fpu.bin index d2320cd960e8c39417367d3b8d7fc730494c9eca..668ff65367cbf02294b638e77fd93f9173db9532 100755 GIT binary patch delta 1932 zcmZ`&ZA?>V6n<~v*22&fC{Vsz$_5oGEpziCAC@SjQfJWV<`^m+BT9ma%Ot6;qiF7u znUDo{7Nd(UxQNk=C1YUX4_q{pMHiRM#UHqsMHFYt$1IypoWi^3ZR?C|JITp?p68r< z&bjBl??|sS(kpd$X^s^3e{?X>9}1Z&Z}94}P{``}Zh>`oLxZ&i?g?@8LyKgp+*T9O zzu0eWx%-3FGXov5GR96_)Uit!yO!O4-1V|N%h>pcj>TUKg`B8o4oa*^35DuVvu?LV zs%zO+Q|DpzR-cLOThq{Bstj;X0DaQKO`jC^);0S$w>b3atv)g&b!N9raW`YENn)ll zri&?F#r8S9kH-9>`fBP=%9X5iB`J52F@w3D=(CeFm$Y8mOzV=1H1AJRM{G zenEPPhLcxnrY9)gP^vjQK~;ve)@&nlo#*Bm&Ocwl`Mwp5ohw)0JX=0if292AEO&b+ zj{CiTeCw~F(}seD3CQ-|Qo~9j1r1xZ`nOf(%W+zpQmb_~sETKtK252Um%gDYh^M%< zeL&}fv^=#gR!wCt>M1bM$yB{mO;Xx23Z*85`@%ayLGp0_#%p*Q?5bNT}YZlX~fj@pU|%ku*%lGMrKy-OQ+!d}Ph+ zOlfOYvgf$l#5@7)szT2?m6103CbL!=A~|a)5pfy&60z7w?5C{7(gm8$x{%)1t9T#X z@~?jUulIk>u9bSooU;qP4(8<8_Qx}q_~Z29W)JRU8IAH-@E^giL^sV8*av8#bs1Sz)!zNYP8|Crfjo|M@`E%gM!97u)06q!6ZlJwz zd#wK^=VMr&Fg?5&eh*r`5MV-c!tJ*>58)|HXdB=OW1OE90!(Pz;XTkycnTAm58ekY zUkEVq?1m3PyN)NhY+3RdfSuH8E0S@RnU40LDeM|c?77DT1$##T?}Jq=x|AaycEbz!-zHl71C6RMXLf0O;V~o zPumB0Z;GnydFfXHUB5lu)GD+IK-)mO?JKqRSjNWg(jI%&$BtN=7i`z`(~-nhyP)}? 
ziT8x9f%ib`BZtE+hk$gn*HK`|lF|J>=L`#AHH%D%)gABBkRxCEkVYNJDUbi;Tov!b t3gJ$88^x6tE$(^1xln|OhMn+1XvLxsOlY6LFV75YDLtAnP~tq5^fx0>#!mnM delta 1739 zcmZ{jZ%mt26vpo@eLI*yV04sH7zMgfgs~QkUIn%?D#3LzI6*;O1|rD}Rnd&3Q(U{{ zjoF7~89B?MiHl%hQL`;>bcsta0o^`mHWLgV_5pPW({2m`{sSEI`rNj08R|(+-skz< z^WJ;jbK7f!{MsNtcEoVH=-km0>2zgXmmy`2wq|7W1!~F2<72cxBQJHt%IX7mrh}Zd ziosbrm9dL!w4705*f>j@%o_|#SyX4<#J{6g%(aI1Zqs>lxuN4W{bYW!__Br7kIMWd zDL>pM<)LkijZ_|g?(@p&hW9H^-j%+<%s?>k>zBb<+P%IgC5-C8lpZA)sdxPzlf7LR zO*8a~rHmYzS*C^-T_nxW_RJlcw^!Kg( z5xQ`Lv@I#ypzli7v5hBVtAevvwCCHw;Q+C03IzO~hHKUs4^>pR;; z?qAYpXKfJ-p(>4q#I;U3n^VV6)9*RM=~&k=2(@O6t$mUEAP>`Q?j>vIpa|TV3O=#& zulK*+P{)r^dfq;q^-kV~&HLA~dd665!+fyE#o9%9O?yvAoIQqXYl&+Y%NJK@EU%JV zDa9U6!3oLikVl=q)JfmiE<+piuoL{V9E0(LZBVrK)m%RZSvlZY)b5W7W}h!P-Lq zRWphj&JIMI>q8@EdMckVz8m@Brd3<2W1E)LmX*;hoj88=ma4{$n*VQ&<_av1RXeL@ zZ!X^7NEH>u6WCNFk>y zxZe=FRs3_R)SqwfE?sb`4m?z4fcHMFiv|3+luv*UHR@tA?v3!yCS8ohy$0U}KN|NY z_~Xy$;#}OXfuDfykNZT{`mk_3tBc+^NJ4=h{$Sj%g%88O9`_HxufVqrbr&Ce^SH zxHr|6UUQ&jZ7SRTgNeGz614%WP0-tACE7StgwB+en2a1Z_%nT9maRD;57XT;o5>4x zB%+|{CCG40>W~vLIRI(A87;U3SsIhmkd0BP84c)(?ryPZ4ybX;b=j)DP^Gt|%uzie z?8s)YhrqfNi1l5Y>OtQP(Y9-%--enHb-N0+0jNGY>B0v!(CtsK#Dnf!(CwQgFG^KZ|#;iiI<4P<`85ruXIGQ~4X@!RmZSj1( F?r+oEbrS#p diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index 3a5a601..d926abc 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -9,3 +9,4 @@ test 08:PASS test 09:PASS test 10:PASS test 11:PASS +test 12:PASS From 86b826cd7e4cc8ffde6a324a90d4481cbc910ebd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 23 Jul 2020 17:56:15 +1000 Subject: [PATCH 16/30] FPU: Implement fadd[s] and fsub[s] and add tests for them Signed-off-by: Paul Mackerras --- decode1.vhdl | 19 ++++- fpu.vhdl | 150 +++++++++++++++++++++++++++++++++++- tests/fpu/fpu.c | 154 +++++++++++++++++++++++++++++++++++++ tests/test_fpu.bin | Bin 21208 -> 24024 bytes tests/test_fpu.console_out | 2 + 5 files changed, 322 insertions(+), 3 deletions(-) diff --git 
a/decode1.vhdl b/decode1.vhdl index 34170dd..737d83c 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -58,6 +58,7 @@ architecture behaviour of decode1 is type op_59_subop_array_t is array(0 to 31) of decode_rom_t; type minor_rom_array_2_t is array(0 to 3) of decode_rom_t; type op_63_subop_array_0_t is array(0 to 511) of decode_rom_t; + type op_63_subop_array_1_t is array(0 to 15) of decode_rom_t; constant major_decode_rom_array : major_rom_array_t := ( -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl @@ -415,6 +416,8 @@ architecture behaviour of decode1 is -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe 2#01110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fcfid[u]s + 2#10100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fsubs + 2#10101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fadds others => illegal_inst ); @@ -461,6 +464,15 @@ architecture behaviour of decode1 is others => illegal_inst ); + -- indexed by bits 4..1 of instruction word (4 bits, hence 16 entries) + constant decode_op_63h_array : op_63_subop_array_1_t := ( + -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl + -- op in out A out in out len ext pipe + 2#0100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fsub + 2#0101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fadd + others => illegal_inst + ); + -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe constant nop_instr : decode_rom_t
:= (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); @@ -626,8 +638,11 @@ begin when 63 => if HAS_FPU then -- floating point operations, general and double-precision - v.decode := decode_op_63l_array(to_integer(unsigned(f_in.insn(4 downto 1) & f_in.insn(10 downto 6)))); - vi.override := f_in.insn(5); + if f_in.insn(5) = '0' then + v.decode := decode_op_63l_array(to_integer(unsigned(f_in.insn(4 downto 1) & f_in.insn(10 downto 6)))); + else + v.decode := decode_op_63h_array(to_integer(unsigned(f_in.insn(4 downto 1)))); + end if; end if; when others => diff --git a/fpu.vhdl b/fpu.vhdl index e97461c..e9edfb4 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -40,7 +40,9 @@ architecture behaviour of fpu is DO_FMR, DO_FMRG, DO_FCFID, DO_FCTI, DO_FRSP, DO_FRI, + DO_FADD, FRI_1, + ADD_SHIFT, ADD_2, ADD_3, INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, @@ -79,6 +81,9 @@ architecture behaviour of fpu is tiny : std_ulogic; denorm : std_ulogic; round_mode : std_ulogic_vector(2 downto 0); + is_subtract : std_ulogic; + exp_cmp : std_ulogic; + add_bsmall : std_ulogic; end record; signal r, rin : reg_type; @@ -89,6 +94,7 @@ architecture behaviour of fpu is signal opsel_r : std_ulogic_vector(1 downto 0); signal opsel_ainv : std_ulogic; signal opsel_amask : std_ulogic; + signal opsel_binv : std_ulogic; signal in_a : std_ulogic_vector(63 downto 0); signal in_b : std_ulogic_vector(63 downto 0); signal result : std_ulogic_vector(63 downto 0); @@ -368,6 +374,9 @@ begin variable mshift : signed(EXP_BITS-1 downto 0); variable need_check : std_ulogic; variable msb : std_ulogic; + variable is_add : std_ulogic; + variable qnan_result : std_ulogic; + variable longmask : std_ulogic; begin v := r; illegal := '0'; @@ -397,10 +406,16 @@ begin v.tiny := '0'; v.denorm := '0'; v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN); + v.is_subtract := '0'; + v.add_bsmall := '0'; adec := 
decode_dp(e_in.fra, int_input); bdec := decode_dp(e_in.frb, int_input); v.a := adec; v.b := bdec; + v.exp_cmp := '0'; + if adec.exponent > bdec.exponent then + v.exp_cmp := '1'; + end if; end if; r_hi_nz <= or (r.r(55 downto 31)); @@ -433,6 +448,7 @@ begin opsel_ainv <= '0'; opsel_amask <= '0'; opsel_b <= BIN_ZERO; + opsel_binv <= '0'; opsel_r <= RES_SUM; carry_in <= '0'; misc_sel <= "0000"; @@ -442,6 +458,8 @@ begin invalid := '0'; renormalize := '0'; set_x := '0'; + qnan_result := '0'; + longmask := r.single_prec; case r.state is when IDLE => @@ -483,6 +501,8 @@ begin when "01111" => v.round_mode := "001"; v.state := DO_FCTI; + when "10100" | "10101" => + v.state := DO_FADD; when others => illegal := '1'; end case; @@ -717,6 +737,117 @@ begin v.state := FINISH; end if; + when DO_FADD => + -- fadd[s] and fsub[s] + opsel_a <= AIN_A; + v.result_sign := r.a.negative; + v.result_class := r.a.class; + v.result_exp := r.a.exponent; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + is_add := r.a.negative xor r.b.negative xor r.insn(1); + if r.a.class = FINITE and r.b.class = FINITE then + v.is_subtract := not is_add; + v.add_bsmall := r.exp_cmp; + if r.exp_cmp = '0' then + v.shift := r.a.exponent - r.b.exponent; + v.result_sign := r.b.negative xnor r.insn(1); + if r.a.exponent = r.b.exponent then + v.state := ADD_2; + else + v.state := ADD_SHIFT; + end if; + else + opsel_a <= AIN_B; + v.shift := r.b.exponent - r.a.exponent; + v.result_exp := r.b.exponent; + v.state := ADD_SHIFT; + end if; + else + if (r.a.class = NAN and r.a.mantissa(53) = '0') or + (r.b.class = NAN and r.b.mantissa(53) = '0') then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + if r.a.class = NAN then + -- nothing to do, result is A + elsif r.b.class = NAN then + v.result_class := NAN; + v.result_sign := r.b.negative; + opsel_a <= AIN_B; + elsif r.a.class = INFINITY and r.b.class = INFINITY and is_add = '0' then + -- invalid operation, construct QNaN + 
v.fpscr(FPSCR_VXISI) := '1'; + qnan_result := '1'; + elsif r.a.class = ZERO and r.b.class = ZERO and is_add = '0' then + -- return -0 for rounding to -infinity + v.result_sign := r.round_mode(1) and r.round_mode(0); + elsif r.a.class = INFINITY or r.b.class = ZERO then + -- nothing to do, result is A + else + -- result is +/- B + v.result_sign := r.b.negative xnor r.insn(1); + v.result_class := r.b.class; + v.result_exp := r.b.exponent; + opsel_a <= AIN_B; + end if; + arith_done := '1'; + end if; + + when ADD_SHIFT => + opsel_r <= RES_SHIFT; + set_x := '1'; + longmask := '0'; + v.state := ADD_2; + + when ADD_2 => + if r.add_bsmall = '1' then + opsel_a <= AIN_A; + else + opsel_a <= AIN_B; + end if; + opsel_b <= BIN_R; + opsel_binv <= r.is_subtract; + carry_in <= r.is_subtract and not r.x; + v.shift := to_signed(-1, EXP_BITS); + v.state := ADD_3; + + when ADD_3 => + -- check for overflow or negative result (can't get both) + if r.r(63) = '1' then + -- result is opposite sign to expected + v.result_sign := not r.result_sign; + opsel_ainv <= '1'; + carry_in <= '1'; + v.state := FINISH; + elsif r.r(55) = '1' then + -- sum overflowed, shift right + opsel_r <= RES_SHIFT; + set_x := '1'; + v.shift := to_signed(-2, EXP_BITS); + if exp_huge = '1' then + v.state := ROUND_OFLOW; + else + v.state := ROUNDING; + end if; + elsif r.r(54) = '1' then + set_x := '1'; + v.shift := to_signed(-2, EXP_BITS); + v.state := ROUNDING; + elsif (r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then + -- r.x must be zero at this point + v.result_class := ZERO; + if r.is_subtract = '1' then + -- set result sign depending on rounding mode + v.result_sign := r.round_mode(1) and r.round_mode(0); + end if; + arith_done := '1'; + else + renormalize := '1'; + v.state := NORMALIZE; + end if; + when INT_SHIFT => opsel_r <= RES_SHIFT; set_x := '1'; @@ -927,6 +1058,10 @@ begin mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec); if mant_nz = '0' then v.result_class := ZERO; + if r.is_subtract = '1' then + 
-- set result sign depending on rounding mode + v.result_sign := r.round_mode(1) and r.round_mode(0); + end if; arith_done := '1'; else -- Renormalize result after rounding @@ -946,6 +1081,13 @@ begin end case; + if qnan_result = '1' then + invalid := '1'; + v.result_class := NAN; + v.result_sign := '0'; + misc_sel <= "0001"; + opsel_r <= RES_MISC; + end if; if arith_done = '1' then -- Enabled invalid exception doesn't write result or FPRF if (invalid and r.fpscr(FPSCR_VE)) = '0' then @@ -960,7 +1102,7 @@ begin -- Data path. -- This has A and B input multiplexers, an adder, a shifter, -- count-leading-zeroes logic, and a result mux. - if r.single_prec = '1' then + if longmask = '1' then mshift := r.shift + to_signed(-29, EXP_BITS); else mshift := r.shift; @@ -1000,6 +1142,9 @@ begin when others => in_b0 := (others => '0'); end case; + if opsel_binv = '1' then + in_b0 := not in_b0; + end if; in_b <= in_b0; if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then shift_res := shifter_64(r.r & x"00000000000000", @@ -1016,6 +1161,9 @@ begin case misc_sel is when "0000" => misc := x"00000000" & (r.fpscr and fpscr_mask); + when "0001" => + -- generated QNaN mantissa + misc := x"0020000000000000"; when "0010" => -- mantissa of max representable DP number misc := x"007ffffffffffffc"; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index e7a1334..8f7407a 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -843,6 +843,158 @@ int fpu_test_12(void) return trapit(0, test12); } +struct addvals { + unsigned long val_a; + unsigned long val_b; + unsigned long sum; + unsigned long diff; +} addvals[] = { + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, + { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 }, + { 0x3fdfffffffffffff, 0x0000000000000000, 0x3fdfffffffffffff, 0x3fdfffffffffffff }, + { 0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000, 0x0000000000000000 }, + { 
0xbff0000000000000, 0xbff0000000000000, 0xc000000000000000, 0x0000000000000000 }, + { 0x402123456789abcd, 0x4021000000000000, 0x403111a2b3c4d5e6, 0x3fb1a2b3c4d5e680 }, + { 0x4061200000000000, 0x406123456789abcd, 0x407121a2b3c4d5e6, 0xbfba2b3c4d5e6800 }, + { 0x4061230000000000, 0x3fa4560000000000, 0x4061244560000000, 0x406121baa0000000 }, + { 0xc061230000000000, 0x3fa4560000000000, 0xc06121baa0000000, 0xc061244560000000 }, + { 0x4061230000000000, 0xbfa4560000000000, 0x406121baa0000000, 0x4061244560000000 }, + { 0xc061230000000000, 0xbfa4560000000000, 0xc061244560000000, 0xc06121baa0000000 }, + { 0x3fa1230000000000, 0x4064560000000000, 0x4064571230000000, 0xc06454edd0000000 }, + { 0xbfa1230000000000, 0x4064560000000000, 0x406454edd0000000, 0xc064571230000000 }, + { 0x3fa1230000000000, 0xc064560000000000, 0xc06454edd0000000, 0x4064571230000000 }, + { 0xbfa1230000000000, 0xc064560000000000, 0xc064571230000000, 0x406454edd0000000 }, + { 0x6780000000000001, 0x6470000000000000, 0x6780000000000009, 0x677ffffffffffff2 }, + { 0x6780000000000001, 0x6460000000000000, 0x6780000000000005, 0x677ffffffffffffa }, + { 0x6780000000000001, 0x6450000000000000, 0x6780000000000003, 0x677ffffffffffffe }, + { 0x6780000000000001, 0x6440000000000000, 0x6780000000000002, 0x6780000000000000 }, + { 0x7ff8888888888888, 0x7ff9999999999999, 0x7ff8888888888888, 0x7ff8888888888888 }, + { 0xfff8888888888888, 0x7ff9999999999999, 0xfff8888888888888, 0xfff8888888888888 }, + { 0x7ff8888888888888, 0x7ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888 }, + { 0x7ff8888888888888, 0x0000000000000000, 0x7ff8888888888888, 0x7ff8888888888888 }, + { 0x7ff8888888888888, 0x0001111111111111, 0x7ff8888888888888, 0x7ff8888888888888 }, + { 0x7ff8888888888888, 0x3ff0000000000000, 0x7ff8888888888888, 0x7ff8888888888888 }, + { 0x7ff0000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999 }, + { 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000 }, + { 0x7ff0000000000000, 
0xfff0000000000000, 0x7ff8000000000000, 0x7ff0000000000000 }, + { 0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0x7ff0000000000000, 0x8002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0x7ff0000000000000, 0xc002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0x0000000000000000, 0x7ff9999999999999, 0x7ff9999999999999, 0x7ff9999999999999 }, + { 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 }, + { 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 }, + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, + { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 }, + { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 }, + { 0x8002222222222222, 0x0001111111111111, 0x8001111111111111, 0x8003333333333333 }, + { 0x0000022222222222, 0x0000111111111111, 0x0000133333333333, 0x80000eeeeeeeeeef }, + { 0x401ffffffbfffefe, 0x406b8265196bd89e, 0x406c8265194bd896, 0xc06a8265198bd8a6 }, + { 0x4030020000000004, 0xbf110001ffffffff, 0x403001fbbfff8004, 0x4030020440008004 }, + { 0x3fdfffffffffffff, 0x3fe0000000000000, 0x3ff0000000000000, 0xbc90000000000000 }, +}; + +int test13(long arg) +{ + long i; + unsigned long results[2]; + struct addvals *vp = addvals; + + set_fpscr(FPS_RN_NEAR); + for (i = 0; i < sizeof(addvals) / sizeof(addvals[0]); ++i, ++vp) { + asm("lfd 5,0(%0); lfd 6,8(%0); fadd 7,5,6; fsub 8,5,6; stfd 7,0(%1); stfd 8,8(%1)" + : : "b" (&vp->val_a), "b" (results) : "memory"); + if (results[0] != vp->sum || results[1] != vp->diff) { + print_hex(i, 2, " "); + print_hex(results[0], 16, " "); + print_hex(results[1], 16, "\r\n"); + return i + 1; + } + } + return 0; +} + +int fpu_test_13(void) +{ + 
enable_fp(); + return trapit(0, test13); +} + +struct addvals sp_addvals[] = { + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, + { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 }, + { 0x3fdfffffffffffff, 0x0000000000000000, 0x3fe0000000000000, 0x3fe0000000000000 }, + { 0x3ff0000000000000, 0x3ff0000000000000, 0x4000000000000000, 0x0000000000000000 }, + { 0xbff0000000000000, 0xbff0000000000000, 0xc000000000000000, 0x0000000000000000 }, + { 0x402123456789abcd, 0x4021000000000000, 0x403111a2c0000000, 0x3fb1a2b000000000 }, + { 0x4061200000000000, 0x406123456789abcd, 0x407121a2c0000000, 0xbfba2b0000000000 }, + { 0x4061230000000000, 0x3fa4560000000000, 0x4061244560000000, 0x406121baa0000000 }, + { 0xc061230000000000, 0x3fa4560000000000, 0xc06121baa0000000, 0xc061244560000000 }, + { 0x4061230000000000, 0xbfa4560000000000, 0x406121baa0000000, 0x4061244560000000 }, + { 0xc061230000000000, 0xbfa4560000000000, 0xc061244560000000, 0xc06121baa0000000 }, + { 0x3fa1230000000000, 0x4064560000000000, 0x4064571240000000, 0xc06454edc0000000 }, + { 0xbfa1230000000000, 0x4064560000000000, 0x406454edc0000000, 0xc064571240000000 }, + { 0x3fa1230000000000, 0xc064560000000000, 0xc06454edc0000000, 0x4064571240000000 }, + { 0xbfa1230000000000, 0xc064560000000000, 0xc064571240000000, 0x406454edc0000000 }, + { 0x6780000000000001, 0x6470000000000000, 0x7ff0000000000000, 0x7ff8000000000000 }, + { 0x6780000000000001, 0x6460000000000000, 0x7ff0000000000000, 0x7ff8000000000000 }, + { 0x6780000000000001, 0x6450000000000000, 0x7ff0000000000000, 0x7ff8000000000000 }, + { 0x6780000000000001, 0x6440000000000000, 0x7ff0000000000000, 0x7ff8000000000000 }, + { 0x7ff8888888888888, 0x7ff9999999999999, 0x7ff8888880000000, 0x7ff8888880000000 }, + { 0xfff8888888888888, 0x7ff9999999999999, 0xfff8888880000000, 0xfff8888880000000 }, + { 0x7ff8888888888888, 0x7ff0000000000000, 0x7ff8888880000000, 0x7ff8888880000000 }, + { 0x7ff8888888888888, 
0x0000000000000000, 0x7ff8888880000000, 0x7ff8888880000000 }, + { 0x7ff8888888888888, 0x0001111111111111, 0x7ff8888880000000, 0x7ff8888880000000 }, + { 0x7ff8888888888888, 0x3ff0000000000000, 0x7ff8888880000000, 0x7ff8888880000000 }, + { 0x7ff0000000000000, 0x7ff9999999999999, 0x7ff9999980000000, 0x7ff9999980000000 }, + { 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000 }, + { 0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000, 0x7ff0000000000000 }, + { 0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0x7ff0000000000000, 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0x7ff0000000000000, 0x8002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0x7ff0000000000000, 0xc002222222222222, 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0x0000000000000000, 0x7ff9999999999999, 0x7ff9999980000000, 0x7ff9999980000000 }, + { 0x0000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 }, + { 0x8000000000000000, 0x7ff0000000000000, 0x7ff0000000000000, 0xfff0000000000000 }, + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, + { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 }, + { 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 }, + { 0x8002222222222222, 0x0001111111111111, 0x0000000000000000, 0x8000000000000000 }, + { 0x0000022222222222, 0x0000111111111111, 0x0000000000000000, 0x0000000000000000 }, + { 0x47dc000020000000, 0x47ec03ffe0000000, 0x7ff0000000000000, 0xc7dc07ffa0000000 }, + { 0x47dbffffe0000000, 0x47eff7ffe0000000, 0x7ff0000000000000, 0xc7e1f80000000000 }, + { 0x47efffffc0000000, 0xc7efffffc0000000, 0x0000000000000000, 0x7ff0000000000000 }, +}; + +int test14(long arg) +{ + long i; + unsigned long results[2]; + struct addvals *vp = sp_addvals; + + set_fpscr(FPS_RN_NEAR); 
+ for (i = 0; i < sizeof(sp_addvals) / sizeof(sp_addvals[0]); ++i, ++vp) { + asm("lfd 5,0(%0); frsp 5,5; lfd 6,8(%0); frsp 6,6; " + "fadds 7,5,6; fsubs 8,5,6; stfd 7,0(%1); stfd 8,8(%1)" + : : "b" (&vp->val_a), "b" (results) : "memory"); + if (results[0] != vp->sum || results[1] != vp->diff) { + print_hex(i, 2, " "); + print_hex(results[0], 16, " "); + print_hex(results[1], 16, "\r\n"); + return i + 1; + } + } + return 0; +} + +int fpu_test_14(void) +{ + enable_fp(); + return trapit(0, test14); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -880,6 +1032,8 @@ int main(void) do_test(10, fpu_test_10); do_test(11, fpu_test_11); do_test(12, fpu_test_12); + do_test(13, fpu_test_13); + do_test(14, fpu_test_14); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 668ff65367cbf02294b638e77fd93f9173db9532..623db3f690b0aad69472a9a8740964239799983c 100755 GIT binary patch delta 4711 zcmd5;eNa@_6+iE>;9`{Q^0|BnyNCpkfdy={3d(~hA|!|=wMlB|q9`^x5ofHU4PoCf zB!j6qn3J@ci5q0piPkz=Y{%M8T{BKKw&`f6A7uPOqX-Eo=@?xFxjpy3_mD4R@<(TS zX6~MQf9H44J?DO$eeHXs_CH9;0LpBDlFC0%DA`1M1yqz&0=NlVP1tI}R0&zLW_dr*ta;>D2g?}wZv6F`2q&R!f|I8kkC%{5U@^-bsv zUoLr3j>wMO83V;9gp1aE#_IC{e7WIEu_R)ayd?(Tig-1sW@Urx*I(fNs0>Vsv`J@i zcH~0oBsN7Zl=EK3OOa)=DH=1PHUu7B+Tgl}UqwBQ9Z{2{cI=IsALw4z;QEQcYHX8a zR~6RTiUQxQXmEXo*61AUh%mhqJ(nx%o_nm60?6aLe*4D_t_23c`pPg%ySu?9k>zPA z0#8Iw!`Gr0%RBv85}h=`T?@g>-P5o)y3GU;P>da6HnhZ>&=(Uo`D8d0&$jX{0rxe$ z7#1rRgkw-_h5UO9u8d6!{cRW&@1kuG_T;*amYvudn<2lr7XKbwB(Dj>n7A}~`!+0! 
zTO8?LyWBN}cH0h>U5vHnSwM9W@nBq$d>|C>#Jwh~@8QZRC83-c_<*O?nrG5TA5Te> zk67{Mlx(@N2@~VD#<N9E zt1XB60v?_$!qXyg!Y;p~pOum(US${0&)Uz5N>hd<%)@?gpOs4BaUGekoE$k^Tb_hs z8#zzWoqxpY6$uyxlR9%fgF=3^?R@jUbvn(z9GQ+SrDcGpWOMokD6Jk!tM|L4^*xGM zPpeGiHI0DLX<0|5)e`dPwAPGBt0p1*xm9%N{FKhoDZNBD_h0>k9w%+mL|_SgqxMbI zZee+!r`8F}#pb=IW3y<3HtT&SZRh}=OUMmfFd2#~^}ETRr}`$w+gm*~bOMv0Ug5{L zk^20YUh(rHkekjip4NDrnZ%hxHh?_A8_hA3p9tY+wy?XPQXPjA>iX&~KmIzAK?_AY z7Z*7&blnppqixks*B z!uHBPtK~R9lkJ3G5!mZPSPNmF5;m=imHw2rQK9YQeEzCv|E>H64K2Q#i0F*6OZ)I} z$|*XN`n<_>QD(f<;3}Xp#K$2tuBCC7C`%3(FJrQIS=T#}_7L8W6MtXeHhb|msh;4W zQQNGv=cUQst1hsYRNA3r6A|8@o;%N**nPkRwfh&s8)oXuADcIs=n)Uc0;uXt8p0I8 zvy2#HO!~qIu~bqzKV*g)!;Z{T2NOwe}fx^G~-GiJ_F>K?tpSR9Xu#7^z6g!u@o;lSMB z0{BpRCPc9o)Rl)o;pLzL9(o^fCUxNkXJU{ofHLbsuQM&En7WN!_%~;A(0b}ZH{K`0 z2h{a#iz z4+f|DA%{mZgE-IPx~!=&OR3k=`*7>$lC_4g1HE`Kt1##&b@OgfcTN^R6bPhj`yOV` zv2eovK^ULObY;@@3ND_cT+d~Yg$So#=HP8L#X`c4hl(RT;M2+Q>900d&ztH{w6R0!{O8HGR%@1`iq2up ztG!auc$J#nu}*B?z=@jS zp#$#Fm}ocxc<6xJjd`4>&)>xt23S(1(62M}heDt2eWA)nW17E<&ok_2J!DX5$caAE zoM1TMV`HM>Xt)DzH|BAkq8*P~O*lzSRj|-%^?$?xc&bO*-Bom5JVa5e?~h9;uL{r| z432A8s$|_R6!~cs;`i(ja?fLoM``xZE+~Gc6AaCxBI$dS1KmzfTZn%Q&niTEq+R!) 
z-Sez=%}W=qXKz_78=pvzey-WR1cli)^^H)J$KDsFvEyi9+NN10Pu;V*y4v zA6gV}(V{81;Ta3QRce(Oo|EPqj|`tuEc~%~z;JA&s%tezNPw!X3!vcu0uBSXb(16n z)WJd4&sc)uZC@R|qJFjVi}q5|Ym~k1KPUaFkLnVD?4<7}#w4&FH>vD+zvNU*Dkvs7 zQ8O@Ry!MWK&Zjfeb@Lx7?98@I<+c?&r0}8m7b;N`4Q@yNcdH2G-p4BfU*}Nk*oq+R zydrSvVkqB?R1w6(x`R4g_doxBw zC|uHx49{T4j(dDTmGu9nBIwSIMLgf4B4m!K2wLPrzoRMwJj%2CUCA#@gO7c-_m4y?2|Ba@xK28+c^CS delta 2035 zcmai#e@s(X6vxkdl-jaVU-?lAlv)rwK&+)8J_NBKu&yYexMXumip*qATv((XW#K@ za(By`D7UL`-(+rL4G3`?5HCn`g*F_KDuuQlm#$0{4RN?X@A7# zp}*4Nhww`5d%`^WoHN7i0dA-oA3I_9*#(Jp@?MAEETZ`+AC89@%oyIiLup2KMSL*v~ zYvcqvG&5BD8SBWCph+fSenN?OO2QopZ;P-OM-p5STp0S8m2#0{LbTG9CjKxJ8%=gm z)r{Ym_Qdtq9+K-H-tfNs=zkSg&ngkywb(JM3NI!`E%#G734}4e@zjT`^zy@TuMNC% z7_eE#XViFF!Y1}tG352wpho6w%OqPw*SBFZwn3tN6*H1@BT~b`Sv|RHet*z}jY%Ee zA!X`a_{c(BfMIu49J-SIMP znKLS=@k9EFh{6+=EP5RWEn7ppuqk+n{xSB)?ZY7|&ym0R7tO(!ld*MqD#`;UwYC!H zXB|psCT*H{M9vXAEWYpIoZpFocz2L}FvZNoJ}36kAS>NMO*!cJxSl^|bpAU21Whfz znS^MIF$*oIoBNZ-Oj`Z|h`7$*rZ;POYz+Q>fm?-WPxq>)W<4jaaps_5yTRBHd2XDoqM;dJ><9;BoV`h-otT+C zH_kjXx`-unK*s%E8mEX|1;9RAqF8qm$8A>8a|^8*R&n?iR%Mt(>+8(ETRit$Sc+DP{Ur(Vwz?hM*PiCgE zo7fPsIu44|1@MKiRO`?cG*)P#@ONbaZ}S*w@&++GGfCM?;hI6L%1l$LLIA!U#66i7 zWfq0D+jx=$YbZ>)gV!naQF!VOhGm)5yh;~Mhp{1RcHAY3oOhobPZ4GP#8%zK@vM9` zpV|}v1Nb;=>AZT1Hs5>FG(XGT#5}~flG`#G?Zmop(}Ggv5Jjr{czS`wXbhzm?lT4g zDT=jb+Sj{UC{h;C6dg8Wbn^&f%{1Z0EAv}_gc;e9V(loFWTy%3_)d0Y?D0{?(s>Os kF8hSWFZ54W-*Yf~hq~ujc2r2u(c)np9xc}3{Iy^H3;nCTwg3PC diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index d926abc..440cd77 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -10,3 +10,5 @@ test 09:PASS test 10:PASS test 11:PASS test 12:PASS +test 13:PASS +test 14:PASS From e6a5f237bc02de146e2416cf3d8bec7473e33483 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 27 Jul 2020 18:27:50 +1000 Subject: [PATCH 17/30] FPU: Implement fmul[s] This implements the fmul and fmuls instructions. 
For fmul[s] with denormalized operands we normalize the inputs before doing the multiplication, to eliminate the need for doing count-leading-zeroes on P. This adds 3 or 5 cycles to the execution time when one or both operands are denormalized. Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 + decode2.vhdl | 7 ++ decode_types.vhdl | 2 +- fpu.vhdl | 182 ++++++++++++++++++++++++++++++++++++- tests/fpu/fpu.c | 80 ++++++++++++++++ tests/test_fpu.bin | Bin 24024 -> 24272 bytes tests/test_fpu.console_out | 2 + 7 files changed, 271 insertions(+), 4 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 737d83c..721c478 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -418,6 +418,7 @@ architecture behaviour of decode1 is 2#01110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fcfid[u]s 2#10100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fsubs 2#10101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fadds + 2#11001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fmuls others => illegal_inst ); @@ -470,6 +471,7 @@ architecture behaviour of decode1 is -- op in out A out in out len ext pipe 2#0100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fsub 2#0101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fadd + 2#1001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fmul others => illegal_inst ); diff --git a/decode2.vhdl b/decode2.vhdl index ec8232f..9443212 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -152,6 +152,12 @@ architecture 
behaviour of decode2 is else return ('0', (others => '0'), (others => '0')); end if; + when FRC => + if HAS_FPU then + return ('1', fpr_to_gspr(insn_frc(insn_in)), reg_data); + else + return ('0', (others => '0'), (others => '0')); + end if; when NONE => return ('0', (others => '0'), (others => '0')); end case; @@ -308,6 +314,7 @@ begin else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU else gpr_to_gspr(insn_rb(d_in.insn)); r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR + else fpr_to_gspr(insn_frc(d_in.insn)) when d_in.decode.input_reg_c = FRC and HAS_FPU else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU else gpr_to_gspr(insn_rs(d_in.insn)); diff --git a/decode_types.vhdl b/decode_types.vhdl index 08fdc4a..72609bf 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -26,7 +26,7 @@ package decode_types is type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB); - type input_reg_c_t is (NONE, RS, RCR, FRS); + type input_reg_c_t is (NONE, RS, RCR, FRC, FRS); type output_reg_a_t is (NONE, RT, RA, SPR, FRT); type rc_t is (NONE, ONE, RC); type carry_in_t is (ZERO, CA, OV, ONE); diff --git a/fpu.vhdl b/fpu.vhdl index e9edfb4..209daa0 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -40,15 +40,18 @@ architecture behaviour of fpu is DO_FMR, DO_FMRG, DO_FCFID, DO_FCTI, DO_FRSP, DO_FRI, - DO_FADD, + DO_FADD, DO_FMUL, FRI_1, ADD_SHIFT, ADD_2, ADD_3, + MULT_1, INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, ROUND_UFLOW, ROUND_OFLOW, ROUNDING, ROUNDING_2, ROUNDING_3, - DENORM); + DENORM, + RENORM_A, RENORM_A2, + RENORM_C, RENORM_C2); type reg_type is record state : state_t; @@ -65,8 +68,10 @@ architecture behaviour of fpu is fpscr : std_ulogic_vector(31 downto 0); a : 
fpu_reg_type; b : fpu_reg_type; + c : fpu_reg_type; r : std_ulogic_vector(63 downto 0); -- 10.54 format x : std_ulogic; + p : std_ulogic_vector(63 downto 0); -- 8.56 format result_sign : std_ulogic; result_class : fp_number_class; result_exp : signed(EXP_BITS-1 downto 0); @@ -84,6 +89,8 @@ architecture behaviour of fpu is is_subtract : std_ulogic; exp_cmp : std_ulogic; add_bsmall : std_ulogic; + is_multiply : std_ulogic; + first : std_ulogic; end record; signal r, rin : reg_type; @@ -103,11 +110,17 @@ architecture behaviour of fpu is signal r_hi_nz : std_ulogic; signal r_lo_nz : std_ulogic; signal misc_sel : std_ulogic_vector(3 downto 0); + signal f_to_multiply : MultiplyInputType; + signal multiply_to_f : MultiplyOutputType; + signal msel_1 : std_ulogic_vector(1 downto 0); + signal msel_2 : std_ulogic_vector(1 downto 0); + signal msel_inv : std_ulogic; -- opsel values constant AIN_R : std_ulogic_vector(1 downto 0) := "00"; constant AIN_A : std_ulogic_vector(1 downto 0) := "01"; constant AIN_B : std_ulogic_vector(1 downto 0) := "10"; + constant AIN_C : std_ulogic_vector(1 downto 0) := "11"; constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00"; constant BIN_R : std_ulogic_vector(1 downto 0) := "01"; @@ -115,8 +128,17 @@ architecture behaviour of fpu is constant RES_SUM : std_ulogic_vector(1 downto 0) := "00"; constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01"; + constant RES_MULT : std_ulogic_vector(1 downto 0) := "10"; constant RES_MISC : std_ulogic_vector(1 downto 0) := "11"; + -- msel values + constant MUL1_A : std_ulogic_vector(1 downto 0) := "00"; + constant MUL1_B : std_ulogic_vector(1 downto 0) := "01"; + constant MUL1_R : std_ulogic_vector(1 downto 0) := "11"; + + constant MUL2_C : std_ulogic_vector(1 downto 0) := "00"; + constant MUL2_R : std_ulogic_vector(1 downto 0) := "11"; + -- Left and right shifter with 120 bit input and 64 bit output. -- Shifts inp left by shift bits and returns the upper 64 bits of -- the result. 
The shift parameter is interpreted as a signed @@ -313,6 +335,13 @@ architecture behaviour of fpu is end; begin + fpu_multiply_0: entity work.multiply + port map ( + clk => clk, + m_in => f_to_multiply, + m_out => multiply_to_f + ); + fpu_0: process(clk) begin if rising_edge(clk) then @@ -347,6 +376,7 @@ begin variable v : reg_type; variable adec : fpu_reg_type; variable bdec : fpu_reg_type; + variable cdec : fpu_reg_type; variable fpscr_mask : std_ulogic_vector(31 downto 0); variable illegal : std_ulogic; variable j, k : integer; @@ -377,6 +407,10 @@ begin variable is_add : std_ulogic; variable qnan_result : std_ulogic; variable longmask : std_ulogic; + variable set_a : std_ulogic; + variable set_c : std_ulogic; + variable px_nz : std_ulogic; + variable maddend : std_ulogic_vector(127 downto 0); begin v := r; illegal := '0'; @@ -407,11 +441,15 @@ begin v.denorm := '0'; v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN); v.is_subtract := '0'; + v.is_multiply := '0'; v.add_bsmall := '0'; adec := decode_dp(e_in.fra, int_input); bdec := decode_dp(e_in.frb, int_input); + cdec := decode_dp(e_in.frc, int_input); v.a := adec; v.b := bdec; + v.c := cdec; + v.exp_cmp := '0'; if adec.exponent > bdec.exponent then v.exp_cmp := '1'; @@ -440,10 +478,14 @@ begin exp_huge := '1'; end if; + -- Compare P with zero + px_nz := or (r.p(57 downto 4)); + v.writing_back := '0'; v.instr_done := '0'; v.update_fprf := '0'; v.shift := to_signed(0, EXP_BITS); + v.first := '0'; opsel_a <= AIN_R; opsel_ainv <= '0'; opsel_amask <= '0'; @@ -460,6 +502,13 @@ begin set_x := '0'; qnan_result := '0'; longmask := r.single_prec; + set_a := '0'; + set_c := '0'; + f_to_multiply.is_32bit <= '0'; + f_to_multiply.valid <= '0'; + msel_1 <= MUL1_A; + msel_2 <= MUL2_C; + msel_inv <= '0'; case r.state is when IDLE => @@ -503,6 +552,9 @@ begin v.state := DO_FCTI; when "10100" | "10101" => v.state := DO_FADD; + when "11001" => + v.is_multiply := '1'; + v.state := DO_FMUL; when others => illegal := '1'; end 
case; @@ -795,6 +847,81 @@ begin arith_done := '1'; end if; + when DO_FMUL => + -- fmul[s] + opsel_a <= AIN_A; + v.result_sign := r.a.negative; + v.result_class := r.a.class; + v.result_exp := r.a.exponent; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + if r.a.class = FINITE and r.c.class = FINITE then + v.result_sign := r.a.negative xor r.c.negative; + v.result_exp := r.a.exponent + r.c.exponent; + -- Renormalize denorm operands + if r.a.mantissa(54) = '0' then + v.state := RENORM_A; + elsif r.c.mantissa(54) = '0' then + opsel_a <= AIN_C; + v.state := RENORM_C; + else + f_to_multiply.valid <= '1'; + v.state := MULT_1; + end if; + else + if (r.a.class = NAN and r.a.mantissa(53) = '0') or + (r.c.class = NAN and r.c.mantissa(53) = '0') then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + if r.a.class = NAN then + -- result is A + elsif r.c.class = NAN then + v.result_class := NAN; + v.result_sign := r.c.negative; + opsel_a <= AIN_C; + elsif (r.a.class = INFINITY and r.c.class = ZERO) or + (r.a.class = ZERO and r.c.class = INFINITY) then + -- invalid operation, construct QNaN + v.fpscr(FPSCR_VXIMZ) := '1'; + qnan_result := '1'; + elsif r.a.class = ZERO or r.a.class = INFINITY then + -- result is +/- A + v.result_sign := r.a.negative xor r.c.negative; + else + -- r.c.class is ZERO or INFINITY + v.result_class := r.c.class; + v.result_sign := r.a.negative xor r.c.negative; + end if; + arith_done := '1'; + end if; + + when RENORM_A => + renormalize := '1'; + v.state := RENORM_A2; + + when RENORM_A2 => + set_a := '1'; + v.result_exp := new_exp; + opsel_a <= AIN_C; + if r.c.mantissa(54) = '1' then + v.first := '1'; + v.state := MULT_1; + else + v.state := RENORM_C; + end if; + + when RENORM_C => + renormalize := '1'; + v.state := RENORM_C2; + + when RENORM_C2 => + set_c := '1'; + v.result_exp := new_exp; + v.first := '1'; + v.state := MULT_1; + when ADD_SHIFT => opsel_r <= RES_SHIFT; set_x := '1'; @@ -848,6 +975,13 @@ begin 
v.state := NORMALIZE; end if; + when MULT_1 => + f_to_multiply.valid <= r.first; + opsel_r <= RES_MULT; + if multiply_to_f.valid = '1' then + v.state := FINISH; + end if; + when INT_SHIFT => opsel_r <= RES_SHIFT; set_x := '1'; @@ -930,6 +1064,9 @@ begin v.state := ROUNDING; when FINISH => + if r.is_multiply = '1' and px_nz = '1' then + v.x := '1'; + end if; if r.r(63 downto 54) /= "0000000001" then renormalize := '1'; v.state := NORMALIZE; @@ -1099,6 +1236,32 @@ begin update_fx := '1'; end if; + -- Multiplier data path + case msel_1 is + when MUL1_A => + f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00"; + when MUL1_B => + f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00"; + when others => + f_to_multiply.data1 <= r.r(61 downto 0) & "00"; + end case; + case msel_2 is + when MUL2_C => + f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00"; + when others => + f_to_multiply.data2 <= r.r(61 downto 0) & "00"; + end case; + maddend := (others => '0'); + if msel_inv = '1' then + f_to_multiply.addend <= not maddend; + else + f_to_multiply.addend <= maddend; + end if; + f_to_multiply.not_result <= msel_inv; + if multiply_to_f.valid = '1' then + v.p := multiply_to_f.result(63 downto 0); + end if; + -- Data path. -- This has A and B input multiplexers, an adder, a shifter, -- count-leading-zeroes logic, and a result mux. 
@@ -1119,8 +1282,10 @@ begin in_a0 := r.r; when AIN_A => in_a0 := r.a.mantissa; - when others => + when AIN_B => in_a0 := r.b.mantissa; + when others => + in_a0 := r.c.mantissa; end case; if (or (mask and in_a0)) = '1' and set_x = '1' then v.x := '1'; @@ -1157,6 +1322,8 @@ begin result <= std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in); when RES_SHIFT => result <= shift_res; + when RES_MULT => + result <= multiply_to_f.result(121 downto 58); when others => case misc_sel is when "0000" => @@ -1207,6 +1374,15 @@ begin end case; v.r := result; + if set_a = '1' then + v.a.exponent := new_exp; + v.a.mantissa := shift_res; + end if; + if set_c = '1' then + v.c.exponent := new_exp; + v.c.mantissa := shift_res; + end if; + if opsel_r = RES_SHIFT then v.result_exp := new_exp; end if; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 8f7407a..305359a 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -205,6 +205,7 @@ struct sp_dp_equiv { { 0x00200000, 0x37f0000000000000 }, { 0x00000002, 0x36b0000000000000 }, { 0x00000001, 0x36a0000000000000 }, + { 0x7f7fffff, 0x47efffffe0000000 }, }; int sp_to_dp(long arg) @@ -995,6 +996,83 @@ int fpu_test_14(void) return trapit(0, test14); } +struct mulvals { + unsigned long val_a; + unsigned long val_b; + unsigned long prod; +} mulvals[] = { + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }, + { 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 }, + { 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 }, + { 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 }, + { 0xbf4fff801fffffff, 0x6d7fffff8000007f, 0xecdfff7fa001fffe }, + { 0x3fbd50275a65ed80, 0x0010000000000000, 0x0001d50275a65ed8 }, +}; + +int test15(long arg) +{ + long i; + unsigned long result; + struct mulvals *vp = mulvals; + + set_fpscr(FPS_RN_NEAR); + for (i = 0; i < sizeof(mulvals) / sizeof(mulvals[0]); ++i, ++vp) { + asm("lfd 5,0(%0); lfd 6,8(%0); fmul 7,5,6; stfd 7,0(%1)" + : : "b" (&vp->val_a), "b" (&result) : 
"memory"); + if (result != vp->prod) { + print_hex(i, 2, " "); + print_hex(result, 16, " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_15(void) +{ + enable_fp(); + return trapit(0, test15); +} + +struct mulvals_sp { + unsigned int val_a; + unsigned int val_b; + unsigned int prod; +} mulvals_sp[] = { + { 0x00000000, 0x00000000, 0x00000000 }, + { 0x80000000, 0x80000000, 0x00000000 }, + { 0x3f800000, 0x3f800000, 0x3f800000 }, + { 0xbf800000, 0x3f800000, 0xbf800000 }, + { 0xbe7ff801, 0x6d7fffff, 0xec7ff800 }, + { 0xc100003d, 0xfe803ff8, 0x7f800000 }, + { 0x4f780080, 0x389003ff, 0x488b8427 }, +}; + +int test16(long arg) +{ + long i; + unsigned int result; + struct mulvals_sp *vp = mulvals_sp; + + set_fpscr(FPS_RN_NEAR); + for (i = 0; i < sizeof(mulvals_sp) / sizeof(mulvals_sp[0]); ++i, ++vp) { + asm("lfs 5,0(%0); lfs 6,4(%0); fmuls 7,5,6; stfs 7,0(%1)" + : : "b" (&vp->val_a), "b" (&result) : "memory"); + if (result != vp->prod) { + print_hex(i, 2, " "); + print_hex(result, 8, " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_16(void) +{ + enable_fp(); + return trapit(0, test16); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -1034,6 +1112,8 @@ int main(void) do_test(12, fpu_test_12); do_test(13, fpu_test_13); do_test(14, fpu_test_14); + do_test(15, fpu_test_15); + do_test(16, fpu_test_16); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 623db3f690b0aad69472a9a8740964239799983c..1e0e29e0c174fe7fcc993071b35fef6086d1a0c0 100755 GIT binary patch delta 2764 zcmai0du)@}6+hn>+nCHT&ck^+!OlW}kQpbT=||%RCryH#R7qf|v}OlyAbWLb#eCC2S~ zuLt0_mwICiLu58+P50v$h5{*scMU5ObYP~bptV14@%GSwE3rYA2XpY<<6jQJ+#OPiJFI%BRGWjm5^(CUAZh;T>Ed#Lg7}LuH51Wh|55 zGvQmtVo8S|8x?iCPkH|)9!s-gUTV7B`b#WNt(TjcaZhT!^jEx++92m;p*8JwwR5dc z`4+!Pdm2a6baHzO-cDPodda73j$m8U6QoV(POnktt@A1W!o-ZH_hvMyeXTyl#RKfl z(5c(oe2T=QB;&WDjzrmMpGwnupYrhpNhnP^ANn5FZSW}_3>T{}&^F%0RhbJgoUvL; zplpf!&IGzMKU8(6f)huR)3Gl{g^gLJgeC&cCxMeo7{DvZS@H!V?#pVCT}gN^t01x3 
z2+jjEU*vhdyEBAErloRc2d+2O$PZHRn8}^$?bx7XQ)$i678K&3-2h&N80FbDvONXg z%6?UD--q|IYm=C$e#n{++Er1~>YM_(DjD55=Q6z=hn3Fn#yzeFKbop}evNES!h`eA z@zR^j1+?a2<>7xsodh1`-l=RSerjq0Os1n_el;DUZrG6J3lL4d<_tE5iQTNa9 z!X6vdIUdA(uJv$yv|k#hJzX}~5UKx1G3*)Y(4(>3#%H3(;@oGX&G<_0fTx=dO$R$2 zT)ke(xk|V0@i$pk$k-yP-s3pK#3`p|0QLy)haBGH;mADCMxHCQ_s4LOlpREpz#UQC zN9ta}6F)_%2dncQFV1-P$>P4b^JH=7i39KiaYuvgAED>q&+S=7IM-E8^*J8wN%c}jklkcgZ>sSM9*HSK@({58yE6>E@=yT%sj$)1o&oFC@?^+v- zk)`qq0T%BD*u9j+X#E3WYm4OZT@fNc0GolFNYHduh(-p;7;}(!6E;nejj<;3m#+zN zgaIFWD^e$S8(wR{8REv=&+ArZH?O%8|Rq z&`rTRj{7Mb8OJjeM#k|fh0r*LDVPJ8Q=B6^16WRBYXDm)^ajvPVJLw6DFg%bUY1HG zF?glOq^3lABeBBW zsE5jHnhM^BjX3rREG@~^{DRUo6WCHxpxH&~-~{d|$=95u)b=HQLWIkdE(+opr79J` zsUXHL%k>medS^0r0xR{6G{0Zk7_Wy4TH+^BC5^;6c~c0w7Xkjq`PFRBuZh$3l@Nn7 zIW4m}y~Ht0&Ell_r5U6=difNQ4o%@3%W^XVl!l0J<~hl#$U}swZee&?c{~ptYJj(I z)3DOIi_*O5?`z0s|3|_K31ceT^W?_~TQiNXK2@s$4HfF&c;=~mQz4}<-x0zg5v5vh zraH*a>7hzfK`r92nHG266=E09LB>;an@$ntV}NWCxr2NSR+ldMS@%657-3_!m;4Z6 zg*-hOV{en6x-UfMz=6^+4SrCq9a!qvY{XG%4iE5y7v|znl?MN|Dg&3+Yd!Q=iEa9< z5MsVpG@PYBZC)WfuY`rv=MmSYytg(sFKj#=;~C&$OaE)~KVmCJU*=zL%sCMEk)Ms_ zXQhzm49^Q8&lLUb{4BJ>8wSvOJznT+5OLkgLf`M4ctJ7^^uDl2Yal8qi5PuSU$4ZW Hwdwx_YSOlq delta 2147 zcmai#3rv$&6vxl^(blR^ze-z5skKx@Ktu$AkB3+gv48=#nMIAVqVq9zlO@jJMB16o z$ifVIbR$M9Gis*OWUiXv%m@;)Y;JBWSqz&~0g;z4Tzues+?`t(K4#)cPR_Z%`~S{8 z-|21J@}1CnT8IR&&j83T{eDFL28zp|B)=5ENo+N-)x=gWbC+z7OccD^7bXfZ$G81D zW}4TQ3ebDuI%5SuV(yYRJGJ~%&VT#~;OvE#=k{=$6c$bjHz{rwx?KRDT&NvVNd9Wz zG0-4Bx9Agu(RfNPE0=rZ zV|{oeBn~4CMzLifW*Q2`<^{OTP$*>JMMIIO2*tRN^~(B19{B;@4oShbkU+6@G2RTB zqgdjR-|@3DW03GRRvL4aJ4-zBAE*sY*%3NVdFL&UoW~bf6B?*&b$VohR*9?8SD|WN~6QYQnxzfF2w>CYZ4;%pBCL1IG+4IQTF+@nUe8n5Dx=bBTCL!Y$?) 
z?a5$pd_i^m`;nF95}q+n73Z(Tzs-+zGsc0GY`s-|6v6Jxa1QfmR1x)xLiF3lf##bHJ_t5JevzU!Uu)g7ld^hwAnJT zWrD77CAB1hFu-TE-~lT*|L}<`o@$N)HlEi#JZf+@#r>{|cv$K`Yu6A<1E*Xyl*iNg zkAg5EB2CD{cO!hRGHNgoHrx5TmQ&7$vUq^sr1V(MqG(tz)1;CnllA~I{JIx4YO166 z*O}qpm!Z`C63tlo_I*B`N`J?7)Ljl9pNPWyI6Ly?&3%0D)ytxHp6#Z-)%l z{8B6AZy-TheEMv!t@o$`DvlJvCNbZk$z+#}bR+fQ-i)@(f~s0pP=gaIwDwEs1gBqbpcQVO=LyCR)Tjo#>@d zcNH5b{BRX7QRu&lPbe6>&@$a3+Pg55!pbfzp-|I>l@#i`&`Y7M3mYh?yU~(lQq_v| z%H5cmWXi0kw7#1$Gw+7XO8zskMq*_g_k!kc*%)NoO6kF&bO6zVsop=symk~RP zi&JK)>M6^(g(WGO3H+k~CB13?(>Lc#N9=cE{6TJW$)kuVZlgiYSCv!hyo1Z-Nb>Io6|&5!mK+Z}qJ`^_k|@m9mJOS@M@1oI~t K*}1QH2JkPg+}GOx diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index 440cd77..04c6c08 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -12,3 +12,5 @@ test 11:PASS test 12:PASS test 13:PASS test 14:PASS +test 15:PASS +test 16:PASS From 9cce9362519b69106936ce437bd1dfe4f27dee4b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 28 Jul 2020 16:07:25 +1000 Subject: [PATCH 18/30] FPU: Implement fdiv[s] This implements floating-point division A/B by a process that starts with normalizing both inputs if necessary. Then an estimate of 1/B from a lookup table is refined by 3 Newton-Raphson iterations and then multiplied by A to get a quotient. The remainder is calculated as A - R * B (where R is the result, i.e. the quotient) and the remainder is compared to 0 and to B to see whether the quotient needs to be incremented by 1. The calculations of 1 / B are done with 56 fraction bits and intermediate results are truncated rather than rounded, meaning that the final estimate of 1 / B is always correct or a little bit low, never too high, and thus the calculated quotient is correct or 1 unit too low. Doing the estimate of 1 / B with sufficient precision that the quotient is always correct to the last bit without needing any adjustment would require many more bits of precision. 
This implements fdivs by computing a double-precision quotient and then rounding it to single precision. It would be possible to optimize this by e.g. doing only 2 iterations of Newton-Raphson and then doing the remainder calculation and adjustment at single precision rather than double precision. Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 + fpu.vhdl | 292 +++++++++++++++++++++++++++++++++++-- tests/fpu/fpu.c | 39 +++++ tests/test_fpu.bin | Bin 24272 -> 24416 bytes tests/test_fpu.console_out | 1 + 5 files changed, 323 insertions(+), 11 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 721c478..ddcbb3c 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -416,6 +416,7 @@ architecture behaviour of decode1 is -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe 2#01110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fcfid[u]s + 2#10010# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fdivs 2#10100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fsubs 2#10101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fadds 2#11001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fmuls @@ -469,6 +470,7 @@ architecture behaviour of decode1 is constant decode_op_63h_array : op_63_subop_array_1_t := ( -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe + 2#0010# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fdiv 2#0100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, 
'0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fsub 2#0101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fadd 2#1001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fmul diff --git a/fpu.vhdl b/fpu.vhdl index 209daa0..2584e1c 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -40,10 +40,12 @@ architecture behaviour of fpu is DO_FMR, DO_FMRG, DO_FCFID, DO_FCTI, DO_FRSP, DO_FRI, - DO_FADD, DO_FMUL, + DO_FADD, DO_FMUL, DO_FDIV, FRI_1, ADD_SHIFT, ADD_2, ADD_3, MULT_1, + LOOKUP, + DIV_2, DIV_3, DIV_4, DIV_5, DIV_6, INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, @@ -51,6 +53,7 @@ architecture behaviour of fpu is ROUNDING, ROUNDING_2, ROUNDING_3, DENORM, RENORM_A, RENORM_A2, + RENORM_B, RENORM_B2, RENORM_C, RENORM_C2); type reg_type is record @@ -72,6 +75,7 @@ architecture behaviour of fpu is r : std_ulogic_vector(63 downto 0); -- 10.54 format x : std_ulogic; p : std_ulogic_vector(63 downto 0); -- 8.56 format + y : std_ulogic_vector(63 downto 0); -- 8.56 format result_sign : std_ulogic; result_class : fp_number_class; result_exp : signed(EXP_BITS-1 downto 0); @@ -91,8 +95,11 @@ architecture behaviour of fpu is add_bsmall : std_ulogic; is_multiply : std_ulogic; first : std_ulogic; + count : unsigned(1 downto 0); end record; + type lookup_table is array(0 to 255) of std_ulogic_vector(17 downto 0); + signal r, rin : reg_type; signal fp_result : std_ulogic_vector(63 downto 0); @@ -114,7 +121,9 @@ architecture behaviour of fpu is signal multiply_to_f : MultiplyOutputType; signal msel_1 : std_ulogic_vector(1 downto 0); signal msel_2 : std_ulogic_vector(1 downto 0); + signal msel_add : std_ulogic_vector(1 downto 0); signal msel_inv : std_ulogic; + signal inverse_est : std_ulogic_vector(18 downto 0); -- opsel values constant AIN_R : std_ulogic_vector(1 downto 0) 
:= "00"; @@ -134,11 +143,61 @@ architecture behaviour of fpu is -- msel values constant MUL1_A : std_ulogic_vector(1 downto 0) := "00"; constant MUL1_B : std_ulogic_vector(1 downto 0) := "01"; + constant MUL1_Y : std_ulogic_vector(1 downto 0) := "10"; constant MUL1_R : std_ulogic_vector(1 downto 0) := "11"; constant MUL2_C : std_ulogic_vector(1 downto 0) := "00"; + constant MUL2_LUT : std_ulogic_vector(1 downto 0) := "01"; + constant MUL2_P : std_ulogic_vector(1 downto 0) := "10"; constant MUL2_R : std_ulogic_vector(1 downto 0) := "11"; + constant MULADD_ZERO : std_ulogic_vector(1 downto 0) := "00"; + constant MULADD_CONST : std_ulogic_vector(1 downto 0) := "01"; + constant MULADD_A : std_ulogic_vector(1 downto 0) := "10"; + + -- Inverse lookup table, indexed by the top 8 fraction bits + -- Output range is [0.5, 1) in 0.19 format, though the top + -- bit isn't stored since it is always 1. + -- Each output value is the inverse of the center of the input + -- range for the value, i.e. entry 0 is 1 / (1 + 1/512), + -- entry 1 is 1 / (1 + 3/512), etc. 
+ signal inverse_table : lookup_table := ( + -- 1/x lookup table + -- Unit bit is assumed to be 1, so input range is [1, 2) + 18x"3fc01", 18x"3f411", 18x"3ec31", 18x"3e460", 18x"3dc9f", 18x"3d4ec", 18x"3cd49", 18x"3c5b5", + 18x"3be2f", 18x"3b6b8", 18x"3af4f", 18x"3a7f4", 18x"3a0a7", 18x"39968", 18x"39237", 18x"38b14", + 18x"383fe", 18x"37cf5", 18x"375f9", 18x"36f0a", 18x"36828", 18x"36153", 18x"35a8a", 18x"353ce", + 18x"34d1e", 18x"3467a", 18x"33fe3", 18x"33957", 18x"332d7", 18x"32c62", 18x"325f9", 18x"31f9c", + 18x"3194a", 18x"31303", 18x"30cc7", 18x"30696", 18x"30070", 18x"2fa54", 18x"2f443", 18x"2ee3d", + 18x"2e841", 18x"2e250", 18x"2dc68", 18x"2d68b", 18x"2d0b8", 18x"2caee", 18x"2c52e", 18x"2bf79", + 18x"2b9cc", 18x"2b429", 18x"2ae90", 18x"2a900", 18x"2a379", 18x"29dfb", 18x"29887", 18x"2931b", + 18x"28db8", 18x"2885e", 18x"2830d", 18x"27dc4", 18x"27884", 18x"2734d", 18x"26e1d", 18x"268f6", + 18x"263d8", 18x"25ec1", 18x"259b3", 18x"254ac", 18x"24fad", 18x"24ab7", 18x"245c8", 18x"240e1", + 18x"23c01", 18x"23729", 18x"23259", 18x"22d90", 18x"228ce", 18x"22413", 18x"21f60", 18x"21ab4", + 18x"2160f", 18x"21172", 18x"20cdb", 18x"2084b", 18x"203c2", 18x"1ff40", 18x"1fac4", 18x"1f64f", + 18x"1f1e1", 18x"1ed79", 18x"1e918", 18x"1e4be", 18x"1e069", 18x"1dc1b", 18x"1d7d4", 18x"1d392", + 18x"1cf57", 18x"1cb22", 18x"1c6f3", 18x"1c2ca", 18x"1bea7", 18x"1ba8a", 18x"1b672", 18x"1b261", + 18x"1ae55", 18x"1aa50", 18x"1a64f", 18x"1a255", 18x"19e60", 18x"19a70", 18x"19686", 18x"192a2", + 18x"18ec3", 18x"18ae9", 18x"18715", 18x"18345", 18x"17f7c", 18x"17bb7", 18x"177f7", 18x"1743d", + 18x"17087", 18x"16cd7", 18x"1692c", 18x"16585", 18x"161e4", 18x"15e47", 18x"15ab0", 18x"1571d", + 18x"1538e", 18x"15005", 18x"14c80", 18x"14900", 18x"14584", 18x"1420d", 18x"13e9b", 18x"13b2d", + 18x"137c3", 18x"1345e", 18x"130fe", 18x"12da2", 18x"12a4a", 18x"126f6", 18x"123a7", 18x"1205c", + 18x"11d15", 18x"119d2", 18x"11694", 18x"11359", 18x"11023", 18x"10cf1", 18x"109c2", 18x"10698", + 18x"10372", 
18x"10050", 18x"0fd31", 18x"0fa17", 18x"0f700", 18x"0f3ed", 18x"0f0de", 18x"0edd3", + 18x"0eacb", 18x"0e7c7", 18x"0e4c7", 18x"0e1ca", 18x"0ded2", 18x"0dbdc", 18x"0d8eb", 18x"0d5fc", + 18x"0d312", 18x"0d02b", 18x"0cd47", 18x"0ca67", 18x"0c78a", 18x"0c4b1", 18x"0c1db", 18x"0bf09", + 18x"0bc3a", 18x"0b96e", 18x"0b6a5", 18x"0b3e0", 18x"0b11e", 18x"0ae5f", 18x"0aba3", 18x"0a8eb", + 18x"0a636", 18x"0a383", 18x"0a0d4", 18x"09e28", 18x"09b80", 18x"098da", 18x"09637", 18x"09397", + 18x"090fb", 18x"08e61", 18x"08bca", 18x"08936", 18x"086a5", 18x"08417", 18x"0818c", 18x"07f04", + 18x"07c7e", 18x"079fc", 18x"0777c", 18x"074ff", 18x"07284", 18x"0700d", 18x"06d98", 18x"06b26", + 18x"068b6", 18x"0664a", 18x"063e0", 18x"06178", 18x"05f13", 18x"05cb1", 18x"05a52", 18x"057f5", + 18x"0559a", 18x"05342", 18x"050ed", 18x"04e9a", 18x"04c4a", 18x"049fc", 18x"047b0", 18x"04567", + 18x"04321", 18x"040dd", 18x"03e9b", 18x"03c5c", 18x"03a1f", 18x"037e4", 18x"035ac", 18x"03376", + 18x"03142", 18x"02f11", 18x"02ce2", 18x"02ab5", 18x"0288b", 18x"02663", 18x"0243d", 18x"02219", + 18x"01ff7", 18x"01dd8", 18x"01bbb", 18x"019a0", 18x"01787", 18x"01570", 18x"0135b", 18x"01149", + 18x"00f39", 18x"00d2a", 18x"00b1e", 18x"00914", 18x"0070c", 18x"00506", 18x"00302", 18x"00100" + ); + -- Left and right shifter with 120 bit input and 64 bit output. -- Shifts inp left by shift bits and returns the upper 64 bits of -- the result. 
The shift parameter is interpreted as a signed @@ -359,6 +418,14 @@ begin end if; end process; + -- synchronous reads from lookup table + lut_access: process(clk) + begin + if rising_edge(clk) then + inverse_est <= '1' & inverse_table(to_integer(unsigned(r.b.mantissa(53 downto 46)))); + end if; + end process; + e_out.busy <= r.busy; e_out.exception <= r.fpscr(FPSCR_FEX); e_out.interrupt <= r.do_intr; @@ -391,6 +458,7 @@ begin variable update_fx : std_ulogic; variable arith_done : std_ulogic; variable invalid : std_ulogic; + variable zero_divide : std_ulogic; variable mant_nz : std_ulogic; variable min_exp : signed(EXP_BITS-1 downto 0); variable max_exp : signed(EXP_BITS-1 downto 0); @@ -408,9 +476,14 @@ begin variable qnan_result : std_ulogic; variable longmask : std_ulogic; variable set_a : std_ulogic; + variable set_b : std_ulogic; variable set_c : std_ulogic; variable px_nz : std_ulogic; variable maddend : std_ulogic_vector(127 downto 0); + variable set_y : std_ulogic; + variable pcmpb_eq : std_ulogic; + variable pcmpb_lt : std_ulogic; + variable pshift : std_ulogic; begin v := r; illegal := '0'; @@ -478,8 +551,16 @@ begin exp_huge := '1'; end if; - -- Compare P with zero + -- Compare P with zero and with B px_nz := or (r.p(57 downto 4)); + pcmpb_eq := '0'; + if r.p(59 downto 4) = r.b.mantissa(55 downto 0) then + pcmpb_eq := '1'; + end if; + pcmpb_lt := '0'; + if unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(55 downto 0)) then + pcmpb_lt := '1'; + end if; v.writing_back := '0'; v.instr_done := '0'; @@ -498,18 +579,22 @@ begin update_fx := '0'; arith_done := '0'; invalid := '0'; + zero_divide := '0'; renormalize := '0'; set_x := '0'; qnan_result := '0'; longmask := r.single_prec; set_a := '0'; + set_b := '0'; set_c := '0'; f_to_multiply.is_32bit <= '0'; f_to_multiply.valid <= '0'; msel_1 <= MUL1_A; msel_2 <= MUL2_C; + msel_add <= MULADD_ZERO; msel_inv <= '0'; - + set_y := '0'; + pshift := '0'; case r.state is when IDLE => if e_in.valid = '1' then @@ -550,6 
+635,8 @@ begin when "01111" => v.round_mode := "001"; v.state := DO_FCTI; + when "10010" => + v.state := DO_FDIV; when "10100" | "10101" => v.state := DO_FADD; when "11001" => @@ -897,6 +984,63 @@ begin arith_done := '1'; end if; + when DO_FDIV => + opsel_a <= AIN_A; + v.result_sign := r.a.negative; + v.result_class := r.a.class; + v.result_exp := r.a.exponent; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + v.result_sign := r.a.negative xor r.b.negative; + v.result_exp := r.a.exponent - r.b.exponent; + v.count := "00"; + if r.a.class = FINITE and r.b.class = FINITE then + -- Renormalize denorm operands + if r.a.mantissa(54) = '0' then + v.state := RENORM_A; + elsif r.b.mantissa(54) = '0' then + opsel_a <= AIN_B; + v.state := RENORM_B; + else + v.first := '1'; + v.state := DIV_2; + end if; + else + if (r.a.class = NAN and r.a.mantissa(53) = '0') or + (r.b.class = NAN and r.b.mantissa(53) = '0') then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + if r.a.class = NAN then + -- result is A + v.result_sign := r.a.negative; + elsif r.b.class = NAN then + v.result_class := NAN; + v.result_sign := r.b.negative; + opsel_a <= AIN_B; + elsif r.b.class = INFINITY then + if r.a.class = INFINITY then + v.fpscr(FPSCR_VXIDI) := '1'; + qnan_result := '1'; + else + v.result_class := ZERO; + end if; + elsif r.b.class = ZERO then + if r.a.class = ZERO then + v.fpscr(FPSCR_VXZDZ) := '1'; + qnan_result := '1'; + else + if r.a.class = FINITE then + zero_divide := '1'; + end if; + v.result_class := INFINITY; + end if; + -- else r.b.class = FINITE, result_class = r.a.class + end if; + arith_done := '1'; + end if; + when RENORM_A => renormalize := '1'; v.state := RENORM_A2; @@ -904,14 +1048,33 @@ begin when RENORM_A2 => set_a := '1'; v.result_exp := new_exp; - opsel_a <= AIN_C; - if r.c.mantissa(54) = '1' then - v.first := '1'; - v.state := MULT_1; + if r.insn(4) = '1' then + opsel_a <= AIN_C; + if r.c.mantissa(54) = '1' then + v.first := '1'; + 
v.state := MULT_1; + else + v.state := RENORM_C; + end if; else - v.state := RENORM_C; + opsel_a <= AIN_B; + if r.b.mantissa(54) = '1' then + v.first := '1'; + v.state := DIV_2; + else + v.state := RENORM_B; + end if; end if; + when RENORM_B => + renormalize := '1'; + v.state := RENORM_B2; + + when RENORM_B2 => + set_b := '1'; + v.result_exp := r.result_exp + r.shift; + v.state := LOOKUP; + when RENORM_C => renormalize := '1'; v.state := RENORM_C2; @@ -982,6 +1145,82 @@ begin v.state := FINISH; end if; + when LOOKUP => + opsel_a <= AIN_B; + -- wait one cycle for inverse_table[B] lookup + v.first := '1'; + v.state := DIV_2; + + when DIV_2 => + -- compute Y = inverse_table[B] (when count=0); P = 2 - B * Y + msel_1 <= MUL1_B; + msel_add <= MULADD_CONST; + msel_inv <= '1'; + if r.count = 0 then + msel_2 <= MUL2_LUT; + else + msel_2 <= MUL2_P; + end if; + set_y := r.first; + pshift := '1'; + f_to_multiply.valid <= r.first; + if multiply_to_f.valid = '1' then + v.first := '1'; + v.count := r.count + 1; + v.state := DIV_3; + end if; + + when DIV_3 => + -- compute Y = P = P * Y + msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + f_to_multiply.valid <= r.first; + pshift := '1'; + if multiply_to_f.valid = '1' then + v.first := '1'; + if r.count = 3 then + v.state := DIV_4; + else + v.state := DIV_2; + end if; + end if; + + when DIV_4 => + -- compute R = P = A * Y (quotient) + msel_1 <= MUL1_A; + msel_2 <= MUL2_P; + set_y := r.first; + f_to_multiply.valid <= r.first; + pshift := '1'; + if multiply_to_f.valid = '1' then + opsel_r <= RES_MULT; + v.first := '1'; + v.state := DIV_5; + end if; + + when DIV_5 => + -- compute P = A - B * R (remainder) + msel_1 <= MUL1_B; + msel_2 <= MUL2_R; + msel_add <= MULADD_A; + msel_inv <= '1'; + f_to_multiply.valid <= r.first; + if multiply_to_f.valid = '1' then + v.state := DIV_6; + end if; + + when DIV_6 => + -- test if remainder is 0 or >= B + if pcmpb_lt = '1' then + -- quotient is correct, set X if remainder non-zero + v.x := r.p(58) or px_nz; + 
else + -- quotient needs to be incremented by 1 + carry_in <= '1'; + v.x := not pcmpb_eq; + end if; + v.state := FINISH; + when INT_SHIFT => opsel_r <= RES_SHIFT; set_x := '1'; @@ -1218,6 +1457,9 @@ begin end case; + if zero_divide = '1' then + v.fpscr(FPSCR_ZX) := '1'; + end if; if qnan_result = '1' then invalid := '1'; v.result_class := NAN; @@ -1227,7 +1469,9 @@ begin end if; if arith_done = '1' then -- Enabled invalid exception doesn't write result or FPRF - if (invalid and r.fpscr(FPSCR_VE)) = '0' then + -- Neither does enabled zero-divide exception + if (invalid and r.fpscr(FPSCR_VE)) = '0' and + (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then v.writing_back := '1'; v.update_fprf := '1'; end if; @@ -1236,30 +1480,52 @@ begin update_fx := '1'; end if; - -- Multiplier data path + -- Multiplier and divide/square root data path case msel_1 is when MUL1_A => f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00"; when MUL1_B => f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00"; + when MUL1_Y => + f_to_multiply.data1 <= r.y; when others => f_to_multiply.data1 <= r.r(61 downto 0) & "00"; end case; case msel_2 is when MUL2_C => f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00"; + when MUL2_LUT => + f_to_multiply.data2 <= x"00" & inverse_est & '0' & x"000000000"; + when MUL2_P => + f_to_multiply.data2 <= r.p; when others => f_to_multiply.data2 <= r.r(61 downto 0) & "00"; end case; maddend := (others => '0'); + case msel_add is + when MULADD_CONST => + -- addend is 2.0 in 16.112 format + maddend(113) := '1'; -- 2.0 + when MULADD_A => + -- addend is A in 16.112 format + maddend(121 downto 58) := r.a.mantissa; + when others => + end case; if msel_inv = '1' then f_to_multiply.addend <= not maddend; else f_to_multiply.addend <= maddend; end if; f_to_multiply.not_result <= msel_inv; + if set_y = '1' then + v.y := f_to_multiply.data2; + end if; if multiply_to_f.valid = '1' then - v.p := multiply_to_f.result(63 downto 0); + if pshift = '0' then + v.p := 
multiply_to_f.result(63 downto 0); + else + v.p := multiply_to_f.result(119 downto 56); + end if; end if; -- Data path. @@ -1378,6 +1644,10 @@ begin v.a.exponent := new_exp; v.a.mantissa := shift_res; end if; + if set_b = '1' then + v.b.exponent := new_exp; + v.b.mantissa := shift_res; + end if; if set_c = '1' then v.c.exponent := new_exp; v.c.mantissa := shift_res; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 305359a..cbb0ee2 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1007,6 +1007,7 @@ struct mulvals { { 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 }, { 0xbf4fff801fffffff, 0x6d7fffff8000007f, 0xecdfff7fa001fffe }, { 0x3fbd50275a65ed80, 0x0010000000000000, 0x0001d50275a65ed8 }, + { 0x3fe95d8937acf1ce, 0x0000000000000001, 0x0000000000000001 }, }; int test15(long arg) @@ -1073,6 +1074,43 @@ int fpu_test_16(void) return trapit(0, test16); } +struct divvals { + unsigned long val_a; + unsigned long val_b; + unsigned long prod; +} divvals[] = { + { 0x3ff0000000000000, 0x0000000000000000, 0x7ff0000000000000 }, + { 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 }, + { 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 }, + { 0x4000000000000000, 0x4008000000000000, 0x3fe5555555555555 }, + { 0xc01fff0007ffffff, 0xc03ffffffdffffbf, 0x3fcfff0009fff041 }, +}; + +int test17(long arg) +{ + long i; + unsigned long result; + struct divvals *vp = divvals; + + set_fpscr(FPS_RN_NEAR); + for (i = 0; i < sizeof(divvals) / sizeof(divvals[0]); ++i, ++vp) { + asm("lfd 5,0(%0); lfd 6,8(%0); fdiv 7,5,6; stfd 7,0(%1)" + : : "b" (&vp->val_a), "b" (&result) : "memory"); + if (result != vp->prod) { + print_hex(i, 2, " "); + print_hex(result, 16, " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_17(void) +{ + enable_fp(); + return trapit(0, test17); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -1114,6 +1152,7 @@ int main(void) do_test(14, fpu_test_14); do_test(15, fpu_test_15); do_test(16, fpu_test_16); + 
do_test(17, fpu_test_17); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 1e0e29e0c174fe7fcc993071b35fef6086d1a0c0..dc5af293a27cd0d08eda8272695afbe7e9ba89c5 100755 GIT binary patch delta 2394 zcmai0drVVj6hHUUmSS3Mh4QGSFAxE-77@6tC>0swp~xH>r&FuwoO{I?XK+Jcrkb)u z10J{F!a&Vt*s>*Y8Qj%oWCpc^t%Yvx3|X;{JT5^P#l>*6X5db8rs_1LNQQbl^{vv&eC?2!md*KOTd@ny~7(2qJ8?+WEIK*rL#@bzjcY}4* zp*A8@3$y}|GZ^GP9Wgg7u=9XRLG|Cl_an-M1{4fwh7+++@SVK>Kn`Quasb+0SFq6F z5|6~<4#P{z^iLJQ7{Tp%>!&d3E~#4SL#B2Oy=&GGUaiPS1RNI*2VBW@5el) zY+UV?4r8h@4Z95rv>G)+8BQ}&R)a#nJ-~{o$ZUQInl)Z2iFld9EWBu(jJu2r#iKXz zMPq!ZrxL=-J@ZjJ$*))v1qHj~OxQiiq&1QMPB`xhc>3^M_#|;}Bqo?vi288cVM?B; zjf8^z)C+k%p{j!V|Gmi}`ZnU9rhM^713nR(Eb4aS^RWvf>o%54vD9xXtQuhKK(-EQ zBqE-S%@-90j5NP01aP~#cp~R1@9?q@WGjN49&@s&iNNs|ny_7%YuRP2+xVeW@lS7c z;qCt=9J1sK1DI<4ftN9AO*U}uom;F}fNJTp(DKSsut|NGZd)g+_hOSRm-BjTL7x60 zUI1Aq7+8UBVW8CEj1X zj|$MhHaG8j3;F!u^ilqHxKcQlNVoPVj-EKPX$>G-<~__&Q=Y3gc%E7Ex&*6g9>JL? zbr4B_D!Jf&RNN*!G#Q2c_+-MPi(7vE@x^5k=W)dujqkL|0kY7cc#9!TQXmd;nYm`6E;PK(pj_Ui7zsvEAplxjKk-%EC z$_hnWe3<2QX?@N1!A}%W`SAkS>ch93;gdNMLi4VHq^#hIdn?Cu55{=dxxf%SDn11u zsTQDiIxoWKk4;@ERBt-VSRch)bfm7S90^ZtF-j7Z^JWfd9;}iR~BAoaPXZcVh{eyo*?sW)@$*h)rZR z^k5sAh8{dmro9L6k{Reh^Gvg->&4k*q+VP>#?y;cWPH8YM5eVD+sK^m#q(sK5ATvm z?4x~?lp;VWdFe(~fZUcooSklri4LKv`WZ8k;bJ9CAxw?0rCU^`Mr!L`(8FuA7ay3|=VNq=$@Ax2o zkzrQtA$QLZ-pw$M8Jk#EEJff v{aeQFFqYFkXPAWn`!&a3F~32rJvs78X6^F99Qhd3=NqtFQ2H&452^nJR9`?N delta 2173 zcmah~e@s(X6u$4#*5X=hf%2o2(w2fyp@1;@pr}=pwXgv?wrmC!#BG>4BFo(BR%kWV zfEk2(wuBfRV$`T{22xFMQG;1x!Zr>5w?*er3InDL62llrg+#gD-tC@9FWa&F8ZEJAYWe$jqPsbsv2=(uT1F;QU0&uE))p-rEUku$pw2ac7%Lo~?9tMZX z%AeH0Zq>))Mz8Sw6r6}RgE20FtAizRRa^~p#8vUXtc4qKmAo+t%<-R#?d!b41DK7s zz(~A|YlJ)TcF`uUusH;6O^|aRLTf^q*to$f{07QI%ihEqvA51EI9LH)iR{jS#5cq> z8@<9=$WvQDr29qT!0h6unAC?7i57jUtfV>cqj$Pd4+;W++ZB6FrK>9wV9r*q^N6 zyM1sfc}<+Bxj{(Aaq39@1f_>c6vQJSJx*K3m&C$)Z4;`!tgVP^%!8oEY$CP4< zTp2^`9HCY(^lDSUl_KNYI$%#qr`pqeL~#By>UKQ(U+=}bGEN8Ebw4s)OZ6rd<34^! 
zD>&j7PS0y_RTI6SffoHoyt5m|^(z?9WH6!DBZBe;UakUK^=j}MY;p;)h4$rWgrmg+ zR}J~>o+Q;2*|;DqG67~&OTm>Y;}p=I8alp^YU2B%U@FxN!^T*78@{E2a6$GRCSleX zlRY$_Ef=Y)o^iW!Nkd5AnI~8=uGsM>d2UE^UYeEL44GS&C3mt)#7IFzFOR6x#-fE}6n_&awdGoK&$&Oep;d4&65 z_=`*g>CL~6Mo3+af3usq(1)gUW63c*Cp`w=YQ))G#W>n(~52PX@w0(#rSO3aMnt;4o@FmfJDfR*KtFBrFk9~sY5bb4j%Gy z7Dj49e#yN1jzK?J-QA6cN{nD$*haUtUZa zLn3gAWhApGd}3>1nBw^whVrUZ;YS&>2+3?Gq&q9yN?XQ=vLBi0Q-_r5dIUO@}Uv zAY=x53sWSyn3asfr9y+`JxpuIVY<*HX~T429JH2n$!Sb;|AcajL2@0_j0yM@Q;~>} za}%)7Vi0FxdKWHQQY6)wot}bmi&p#@rY=}5B+X}!>&8lP8_a?_(un0bEDysrAypl~ zEQtR@8Hw3S^dMrQJMd6g3NwO8Y!egm4LlG^v)eE;&OCb&Hv9dEWg*5cGJ6xf53yR< zTvQ<@5_}CfTeMJ Date: Wed, 29 Jul 2020 17:34:03 +1000 Subject: [PATCH 19/30] FPU: Implement fre[s] This just returns the value from the inverse lookup table. The result is accurate to better than one part in 512 (the architecture requires 1/256). This also adds a simple test, which relies on the particular values in the inverse lookup table, so it is not a general test. Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 ++ fpu.vhdl | 48 ++++++++++++++++++++++++++++++++++++- tests/fpu/fpu.c | 38 +++++++++++++++++++++++++++++ tests/test_fpu.bin | Bin 24416 -> 24512 bytes tests/test_fpu.console_out | 1 + 5 files changed, 88 insertions(+), 1 deletion(-) diff --git a/decode1.vhdl b/decode1.vhdl index ddcbb3c..c0c3465 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -419,6 +419,7 @@ architecture behaviour of decode1 is 2#10010# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fdivs 2#10100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fsubs 2#10101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fadds + 2#11000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- 
fres 2#11001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fmuls others => illegal_inst ); @@ -473,6 +474,7 @@ architecture behaviour of decode1 is 2#0010# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fdiv 2#0100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fsub 2#0101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fadd + 2#1000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fre 2#1001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fmul others => illegal_inst ); diff --git a/fpu.vhdl b/fpu.vhdl index 2584e1c..fee1776 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -41,11 +41,13 @@ architecture behaviour of fpu is DO_FCFID, DO_FCTI, DO_FRSP, DO_FRI, DO_FADD, DO_FMUL, DO_FDIV, + DO_FRE, FRI_1, ADD_SHIFT, ADD_2, ADD_3, MULT_1, LOOKUP, DIV_2, DIV_3, DIV_4, DIV_5, DIV_6, + FRE_1, INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, @@ -639,6 +641,8 @@ begin v.state := DO_FDIV; when "10100" | "10101" => v.state := DO_FADD; + when "11000" => + v.state := DO_FRE; when "11001" => v.is_multiply := '1'; v.state := DO_FMUL; @@ -1041,6 +1045,36 @@ begin arith_done := '1'; end if; + when DO_FRE => + opsel_a <= AIN_B; + v.result_class := r.b.class; + v.result_sign := r.b.negative; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + if r.b.class = NAN and r.b.mantissa(53) = '0' then + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + case r.b.class is + when FINITE => + v.result_exp := - r.b.exponent; + if r.b.mantissa(54) = '0' then + v.state := RENORM_B; + else + v.state := 
FRE_1; + end if; + when NAN => + -- result is B + arith_done := '1'; + when INFINITY => + v.result_class := ZERO; + arith_done := '1'; + when ZERO => + v.result_class := INFINITY; + zero_divide := '1'; + arith_done := '1'; + end case; + when RENORM_A => renormalize := '1'; v.state := RENORM_A2; @@ -1149,7 +1183,11 @@ begin opsel_a <= AIN_B; -- wait one cycle for inverse_table[B] lookup v.first := '1'; - v.state := DIV_2; + if r.insn(4) = '0' then + v.state := DIV_2; + else + v.state := FRE_1; + end if; when DIV_2 => -- compute Y = inverse_table[B] (when count=0); P = 2 - B * Y @@ -1221,6 +1259,12 @@ begin end if; v.state := FINISH; + when FRE_1 => + opsel_r <= RES_MISC; + misc_sel <= "0111"; + v.shift := to_signed(1, EXP_BITS); + v.state := NORMALIZE; + when INT_SHIFT => opsel_r <= RES_SHIFT; set_x := '1'; @@ -1609,6 +1653,8 @@ begin when "0110" => -- fmrgew result misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32); + when "0111" => + misc := 10x"000" & inverse_est & 35x"000000000"; when "1000" => -- max positive result for fctiw[z] misc := x"000000007fffffff"; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index cbb0ee2..e62ce27 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1111,6 +1111,43 @@ int fpu_test_17(void) return trapit(0, test17); } +struct recipvals { + unsigned long val; + unsigned long inv; +} recipvals[] = { + { 0x0000000000000000, 0x7ff0000000000000 }, + { 0xfff0000000000000, 0x8000000000000000 }, + { 0x3ff0000000000000, 0x3feff00400000000 }, + { 0xbff0000000000000, 0xbfeff00400000000 }, + { 0x4008000000000000, 0x3fd54e3800000000 }, + { 0xc03ffffffdffffbf, 0xbfa0040000000000 }, +}; + +int test18(long arg) +{ + long i; + unsigned long result; + struct recipvals *vp = recipvals; + + set_fpscr(FPS_RN_NEAR); + for (i = 0; i < sizeof(recipvals) / sizeof(recipvals[0]); ++i, ++vp) { + asm("lfd 6,0(%0); fre 7,6; stfd 7,0(%1)" + : : "b" (&vp->val), "b" (&result) : "memory"); + if (result != vp->inv) { + print_hex(i, 2, " "); + 
print_hex(result, 16, " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_18(void) +{ + enable_fp(); + return trapit(0, test18); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -1153,6 +1190,7 @@ int main(void) do_test(15, fpu_test_15); do_test(16, fpu_test_16); do_test(17, fpu_test_17); + do_test(18, fpu_test_18); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index dc5af293a27cd0d08eda8272695afbe7e9ba89c5..572aad0ef8df74ffd3de84f0d0435c5bf1706b02 100755 GIT binary patch delta 2463 zcmai0eQcBE7C-O1wsyGGZftE=y6#=K;aail#^~OCv2M&!IvtA_KQK-=U}hJ%+~soh z0(T$9R0G~XhUZ3!!B~irX+$O6HPb(25@IGI!9Os&0d*UNbSPuQO?nl4?m6v(yIvAb za(aHx@Ak%o^+g#f-9fI$0acLlan?0~jFJ3t$8n~2*)+@_A;zQUza zaNpXca@>7%@9BF=jbTRxz~s3fglGa%vm3+p71Mi||NIMpljqL-&m8VGS{*EjlVqwn z*az^^x&3pJyR4c^uwE)9aWvv*XFO%j@gX`CW|zUHKx^46!*~YllYBkwOpNpZu4{6(bkaqAG~HR$1!u76vr#(mD(qt z38{Zfa9x%(xvUd=Esfg2r$g$$F+IEd3ay;2)p7g9gM5^Fgs z)->q=-fvy0U9%yiI+@F;iHj!6w3Xl~Exu_7owmvNN5V}v&;vsgQEma!jfrYq3InJpp z&vVi4Z>cwb7EKcLse^Z|?{0xS_00_I&3i`b#EW?=Sm$!MvRIe?JKYfWx{g_$1#Yt46$>{}PX$D{!R-?@6XbX|qk6pJFjw@?9mWF(l_Ic=}3qa)ZX(tzepYv#$?(6oz`a}iJ&T*C03!l$n4^>6QlENzK z1^iQCq;G(-N`vQpd>0=O=4^J(@+(obgt;QR%iC#MNwb{h0ICzZ-)r==PDNs!n#8;c z?ZMkCH*At72@$V0(=(}%R;RlsOYY4HfdYI-R?AUjacrL4! 
z$}=+85ZP0r>9D+`n|Ut(8RE4G?(es_QR1%Av(Bh!dX}2SN0nkdH#3zwoqOZf<|8CT zk6Do>Bk69lWaIBa2KB&!Cqn8qdhZl^n0rMIuZz8GQVfi3iElBq>BG3LxOc(4;pO5( z(ka~IdEa0QPpF^-+7$?FCtaYVvQZy)Ounmu?sry04`cbbe!GU6c0QLvuSnW0&X>?} zV<$B{#*-z@rsS3dO8}g^0d^O2Ry5@I6J*SFlE;WsD6%ovO+G_h0TW~#`^h!m3UM9)?=H)ehrUI3nOi>b z9X>^&dJOlL+2svm_#TDMaXd|-e;lVM42`2hri1U6IfOyCv@k5AxU3c(3{ zkHWwNo~CeS0;ed%C(z)v%gP0Gd+m}RA6jP9$^cy#u-R+XSyBK-F5nigb-7AVi3(xk zT*(yj2Z<{pu8WC=)H--WT4C^|*Y&m-p#D2u9UrNSG=GiaXI_VHKf!-Samwq`jS!rU zqP={vZiZmzMGTZXbT$p$!6p0)!3Kgem+-Z6hifyzCG>%h@-8b4Ewug{QpW-V)X^4q zCo}0M%?~lWRBqP|5*(aFw`$jo67*kAz#9b5C!kZ&QdZv!F*tv{*5B4EAijY#fX}M= z*_#Qip??ettH~P3gT%GrQFVoGgwXUA{8_Dc8+9~s^|u1?9TyO{i#UFW?Ll&txS^}K zu_B<|MKBXTs<_uSNND6oA=)HTJYZ1hjB=9!8gzA(M%PlCYi34>UAzPtugu-pIV(hn z39=$`ANlAk)>pc+(tZ+xDK?e|$rapLxlpTK7h+rFU}e1qi`P0MExr#;=x#{G+6H3< v{okPeo4%Tebg*M{<3t3|`*^H2pRGM6#7!Y;hj|Q|@G<;KvP6uJwdnr~ok z{*;dIguW9(1Y@QQ#`4R4oRMFJbxr+>{4&PO$gM|iJ#y>UupJRe0^6}9Nt}vp+y3(l zi8i~=z}V=;8=My~R8!jQFB-!#hP} zA&}$!t%yBDpG5W9V3v2%DLk-%^M(bCbr^3fg?yhh@pv$7@mZ%RUu%&LUxP!waZsKh zgDADP&;l9iBB2Sk;nf>1tBb^WK@jJ=Np4$ak^X{PzNygXs}x${rf;s?yv8D}b#mo? 
zN>O?fYW;HL8&+7P3*hOW`k8-;yxVM%7SRH1{&d$~|6F;~N{e(15&}{|8=w?QAT}UZ zu3T-AqLE8d{DO`%CgEXOT{RpBIouHm#`$(F;o08}faCt)f|~yf;|4 zXzE{$sP!?K_z+(a8E!u4W@2ULf} zL3f0AXd}LdiV2f6-h@ASLGQf>b@AxLjqwn2a{+`%FaJ|+(?d;TX6e{N9FIO@AU4t{ zRKeOvN8LU&r(_K#nn(*`nq2Y(okL>`aZxzYXK6f9GavUDOLyv?*0^Gyn|G*BhI3zH zW%bX{%#zNaNMN-1L{ndJS~54yP4w2sl;TKoZFX3kfpAI1H8 zyKcvBnpfvQA|b}>@Yt*(yht)gU8HPJG>oV`}lGq{(#T}yA!_l zXtEDTOq*Y(W%*U8D@{zxvD$T`U&>g`p(3`Ku-NHZC1cTs$2k{fGAJK$CZ|sI#2R`N z7ptcogvT%zy_K=8bIBY&B`v8`M$lnPk_mMiFL6GEO9~Yxs#*4B&Wnk_AkKtYkK8y` z3gSvIm;TCm3lSK^y@ffBoE9quaW$Az$b}PuLDhz-yu$e<#=t8%RP49{DamnScOSfg zCA%MXC2PbD{m_o3d;of}G!DQ7mW}~Xr)b2H0Z7538iZ0T(jaWYVjhHDSgeE4j-_=F zda;}zgb6Hc2-K+>F=_}>QZ=Q;L!4KVnS~zlo|(0;c(5!*CN6_8Cq>3u{$ros$Ik^X z!<>_rLW>!Loa-=jrD_z#2up@xELEqdM7VJn)RJD&g0N>8a->MbIfQw?L4~AKOd&K~ zg*^zv@r#_h3Y`cG5h_RFnxs*bBkUdp4}(U&6X8y%GK46OB8s{W)dr2?62jvI17?Pk z7dd0aaMBRwABE@({7*~fD6%o}kAbzqeM6SQg2*%m4-E4Y>5aN^+4K89Z$w-la_5nw z8z~AgZzJbA4()0A@@xfTt3hs@?Ng4(?=~J60Tl~9v^Y*N7cz}u`gwOax6-{B)RGQX z3vw1BFrqL`n0^zm-KdW)oZy@&3X<1jnvsj4#u&uy#H@7m8*^kw(hCLnbV; Date: Wed, 29 Jul 2020 20:26:39 +1000 Subject: [PATCH 20/30] FPU: Implement fsel Signed-off-by: Paul Mackerras --- decode1.vhdl | 1 + fpu.vhdl | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/decode1.vhdl b/decode1.vhdl index c0c3465..09aaf91 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -474,6 +474,7 @@ architecture behaviour of decode1 is 2#0010# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fdiv 2#0100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fsub 2#0101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fadd + 2#0111# => (FPU, OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fsel 2#1000# => (FPU, OP_FPOP, NONE, 
FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fre 2#1001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fmul others => illegal_inst diff --git a/fpu.vhdl b/fpu.vhdl index fee1776..59e6f5d 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -42,6 +42,7 @@ architecture behaviour of fpu is DO_FRSP, DO_FRI, DO_FADD, DO_FMUL, DO_FDIV, DO_FRE, + DO_FSEL, FRI_1, ADD_SHIFT, ADD_2, ADD_3, MULT_1, @@ -641,6 +642,8 @@ begin v.state := DO_FDIV; when "10100" | "10101" => v.state := DO_FADD; + when "10111" => + v.state := DO_FSEL; when "11000" => v.state := DO_FRE; when "11001" => @@ -1045,6 +1048,24 @@ begin arith_done := '1'; end if; + when DO_FSEL => + opsel_a <= AIN_A; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then + v.result_sign := r.c.negative; + v.result_exp := r.c.exponent; + v.result_class := r.c.class; + opsel_a <= AIN_C; + else + v.result_sign := r.b.negative; + v.result_exp := r.b.exponent; + v.result_class := r.b.class; + opsel_a <= AIN_B; + end if; + v.quieten_nan := '0'; + arith_done := '1'; + when DO_FRE => opsel_a <= AIN_B; v.result_class := r.b.class; From 49f3d1e77a14e8c2d8c35c1316b8d35754a1b428 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 30 Jul 2020 10:00:25 +1000 Subject: [PATCH 21/30] FPU: Implement fcmpu and fcmpo Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 ++ fpu.vhdl | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 09aaf91..ba9964e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -436,6 +436,8 @@ architecture behaviour of decode1 is constant decode_op_63l_array : op_63_subop_array_0_t := ( -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe + 
2#000000000# => (FPU, OP_FPOP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 0/0=fcmpu + 2#000000001# => (FPU, OP_FPOP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 1/0=fcmpo 2#000000010# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 2/0=mcrfs 2#011000001# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 1/6=mtfsb1 2#011000010# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 2/6=mtfsb0 diff --git a/fpu.vhdl b/fpu.vhdl index 59e6f5d..c726be3 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -37,7 +37,7 @@ architecture behaviour of fpu is type state_t is (IDLE, DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, - DO_FMR, DO_FMRG, + DO_FMR, DO_FMRG, DO_FCMP, DO_FCFID, DO_FCTI, DO_FRSP, DO_FRI, DO_FADD, DO_FMUL, DO_FDIV, @@ -45,6 +45,7 @@ architecture behaviour of fpu is DO_FSEL, FRI_1, ADD_SHIFT, ADD_2, ADD_3, + CMP_1, CMP_2, MULT_1, LOOKUP, DIV_2, DIV_3, DIV_4, DIV_5, DIV_6, @@ -603,7 +604,11 @@ begin if e_in.valid = '1' then case e_in.insn(5 downto 1) is when "00000" => - v.state := DO_MCRFS; + if e_in.insn(7) = '1' then + v.state := DO_MCRFS; + else + v.state := DO_FCMP; + end if; when "00110" => if e_in.insn(10) = '0' then if e_in.insn(8) = '0' then @@ -669,6 +674,62 @@ begin v.instr_done := '1'; v.state := IDLE; + when DO_FCMP => + -- fcmp[uo] + v.instr_done := '1'; + v.state := IDLE; + update_fx := '1'; + opsel_a <= AIN_B; + opsel_r <= RES_SUM; + v.result_exp := r.b.exponent; + if (r.a.class = NAN and r.a.mantissa(53) = '0') or + (r.b.class = NAN and r.b.mantissa(53) = '0') then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + if r.insn(6) = '1' and r.fpscr(FPSCR_VE) = '0' then + v.fpscr(FPSCR_VXVC) := '1'; 
+ end if; + invalid := '1'; + v.cr_result := "0001"; -- unordered + elsif r.a.class = NAN or r.b.class = NAN then + if r.insn(6) = '1' then + -- fcmpo + v.fpscr(FPSCR_VXVC) := '1'; + invalid := '1'; + end if; + v.cr_result := "0001"; -- unordered + elsif r.a.class = ZERO and r.b.class = ZERO then + v.cr_result := "0010"; -- equal + elsif r.a.negative /= r.b.negative then + v.cr_result := r.a.negative & r.b.negative & "00"; + elsif r.a.class = ZERO then + -- A and B are the same sign from here down + v.cr_result := not r.b.negative & r.b.negative & "00"; + elsif r.a.class = INFINITY then + if r.b.class = INFINITY then + v.cr_result := "0010"; + else + v.cr_result := r.a.negative & not r.a.negative & "00"; + end if; + elsif r.b.class = ZERO then + -- A is finite from here down + v.cr_result := r.a.negative & not r.a.negative & "00"; + elsif r.b.class = INFINITY then + v.cr_result := not r.b.negative & r.b.negative & "00"; + elsif r.exp_cmp = '1' then + -- A and B are both finite from here down + v.cr_result := r.a.negative & not r.a.negative & "00"; + elsif r.a.exponent /= r.b.exponent then + -- A exponent is smaller than B + v.cr_result := not r.a.negative & r.a.negative & "00"; + else + -- Prepare to subtract mantissas, put B in R + v.cr_result := "0000"; + v.instr_done := '0'; + v.state := CMP_1; + end if; + v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result; + when DO_MTFSB => -- mtfsb{0,1} j := to_integer(unsigned(insn_bt(r.insn))); @@ -1193,6 +1254,26 @@ begin v.state := NORMALIZE; end if; + when CMP_1 => + opsel_a <= AIN_A; + opsel_b <= BIN_R; + opsel_binv <= '1'; + carry_in <= '1'; + v.state := CMP_2; + + when CMP_2 => + if r.r(63) = '1' then + -- A is smaller in magnitude + v.cr_result := not r.a.negative & r.a.negative & "00"; + elsif (r_hi_nz or r_lo_nz) = '0' then + v.cr_result := "0010"; + else + v.cr_result := r.a.negative & not r.a.negative & "00"; + end if; + v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result; + v.instr_done := '1'; + v.state := IDLE; + 
when MULT_1 => f_to_multiply.valid <= r.first; opsel_r <= RES_MULT; From e1bbb786c078b7ecdae7d027860d44949eb87bb2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 30 Jul 2020 13:38:09 +1000 Subject: [PATCH 22/30] tests/fpu: Add tests for fsel and fcmpu Signed-off-by: Paul Mackerras --- tests/fpu/fpu.c | 98 +++++++++++++++++++++++++++++++++++++ tests/test_fpu.bin | Bin 24512 -> 25024 bytes tests/test_fpu.console_out | 2 + 3 files changed, 100 insertions(+) diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index e62ce27..06da475 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1148,6 +1148,102 @@ int fpu_test_18(void) return trapit(0, test18); } +#define RES_B 0x7ffaaaaaaaaaaaaa +#define RES_C 0x000bbbbbbbbbbbbb + +struct selvals { + unsigned long val; + unsigned long result; +} selvals[] = { + { 0x0000000000000000, RES_C }, + { 0x8000000000000000, RES_C }, + { 0x3ff0000000000000, RES_C }, + { 0xbff0000000000000, RES_B }, + { 0x7ff0000000000000, RES_C }, + { 0xfff0000000000000, RES_B }, + { 0x7ff8000000000000, RES_B }, + { 0xfff8000000000000, RES_B }, + { 0x0000000000000001, RES_C }, + { 0x8000000000000001, RES_B }, + { 0xffffffffffffffff, RES_B }, +}; + +int test19(long arg) +{ + long i; + unsigned long result; + unsigned long frb = RES_B; + unsigned long frc = RES_C; + struct selvals *vp = selvals; + + for (i = 0; i < sizeof(selvals) / sizeof(selvals[0]); ++i, ++vp) { + asm("lfd 6,0(%0); lfd 10,0(%1); lfd 22,0(%2); fsel 0,6,22,10; stfd 0,0(%3)" + : : "b" (&vp->val), "b" (&frb), "b" (&frc), "b" (&result) : "memory"); + if (result != vp->result) { + print_hex(i, 2, " "); + print_hex(result, 16, " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_19(void) +{ + enable_fp(); + return trapit(0, test19); +} + +#define LT 8 +#define GT 4 +#define EQ 2 +#define UN 1 + +struct cmpvals { + unsigned long vala, valb; + unsigned long result; +} cmpvals[] = { + { 0x0000000000000000, 0x0000000000000000, EQ }, + { 0x8000000000000000, 0x0000000000000000, EQ }, + 
{ 0x3ff0000000000000, 0x3ff0000000000000, EQ }, + { 0x3ff0000000000001, 0x3ff0000000000000, GT }, + { 0x3ff0000000000000, 0x3ff0000000000001, LT }, + { 0xbff0000000000000, 0x3ff0000000000000, LT }, + { 0x7ff0000000000000, 0x7ff0000000000000, EQ }, + { 0xfff0000000000000, 0x7ff0000000000000, LT }, + { 0x7ff8000000000000, 0x7ff0000000000000, UN }, + { 0xfff8000000000000, 0x7ff0000000000000, UN }, + { 0x0000000000000001, 0x0000000000000001, EQ }, + { 0x8000000000000001, 0x7ff0000000000000, LT }, + { 0xffffffffffffffff, 0x7ff0000000000000, UN }, + { 0xffffffffffffffff, 0xffffffffffffffff, UN }, +}; + +int test20(long arg) +{ + long i; + unsigned long cr; + struct cmpvals *vp = cmpvals; + + for (i = 0; i < sizeof(cmpvals) / sizeof(cmpvals[0]); ++i, ++vp) { + asm("lfd 6,0(%1); lfd 10,8(%1); fcmpu 7,6,10; mfcr %0" + : "=r" (cr) : "b" (&vp->vala) : "memory"); + cr &= 0xf; + if (cr != vp->result) { + print_hex(i, 2, " "); + print_hex(cr, 1, " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_20(void) +{ + enable_fp(); + return trapit(0, test20); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -1191,6 +1287,8 @@ int main(void) do_test(16, fpu_test_16); do_test(17, fpu_test_17); do_test(18, fpu_test_18); + do_test(19, fpu_test_19); + do_test(20, fpu_test_20); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 572aad0ef8df74ffd3de84f0d0435c5bf1706b02..0f9e03a588b27413f4d3522e6be6362f86d9204d 100755 GIT binary patch delta 3674 zcmai03s93+7C!$^5`rQPk_RA<5D%a6KiGTWBvEsKM4)2+uoVW zIp;gyJ@>nh|Awy5xvsNZI)G(9Kxy^4@X|)IwXnLh8o)-_I>OcwwyxH>dT;t{&b{~L zTwa~ivFA_EI4n&D6Trw{i7Dv8Sq|i=+!A*AwkVdd*pC&P{M_@z?Y-PLA!nsJ$IDwC z0PUAgJ|%aj+)24zm-hv7{y8Ab7VE}d)w5hHj;fyLT02Xl%XwaviU(q5M7WDd8rUpN z*c+3rU10!ghnV6M#V+&66TdQ*8>7^DQC3O;6pjSg%hkn`tW9pSjMh}zge~eC&W*e# zSEES*YqwOwT_}oM3jw;!U*ZytnU|&DcFkJFjyGJwv76Yg%f`AK8D?nJyt4`yXv_Hx zuizeSId>bcYM1f$WX#rWl&@Lk5+2}PT|Rnrk^GKd;vHRyykU(?__YMq#YXZKYjI0# 
ziM+4cC47z1ary7Yy&yN%xP&Dvz~;C}Ui~X+7T*7|rj-M@Mj)i10Xx3j)CZDUtH>XX< zq-o3fZ%6TAe45f`hlm&fn3Y>6KprIWBN2cA`VX_ ztm1#8L2Y8z)XlMAZKFMu^66U|hFfuYVm9Abk6RN<_&KrIo0!Gh5JwZ2Yy0bKghVq@=qUN_xyYhfb~4Ux!&U|6psPI`<1X{#0YJ}kdb9CfUU(vOcpQ6IwUka zy6&(*oA1@fVwsWFMgA~b*HVf(a0n;EYwR{i5e{m=%7JKBL8j>=oS&BJh$8818D6pb z5nBS2;M~R}c!b^{q_EJe)e$U z*&PxpXvRZ?lO4{W^|Q6VABfq0uy`BA0{M)?;(CgMit0$w1?<+JE$R!*LR;;7{{h?0 zwqbTtqzV+X>cLyX6un7Uuu270>A}CGtl@r(*@ohAwT}A6*V<3<#9G@33$1kpI}ES$ z)(0~37G*UicWEJM5 zy~y|M!Ch(N2fjKwe#)g3hfaAAM`)mR9~Mu~9yjCB`{M`hr#N(=7eAi9aH)sBHFn)D z(sjH0y;yeX(G!P}F76}{liB5snC$=fE4&W75o)9heSeZ4n@8`>NWeGJ5me!u&{5Rc8ksTXN|p=v$5NNqYcWZ0joIPtrmR{Y@Z zPb>aAaR7c)ag5MGq)^2dCgJfJYtfszo7;(-vu1PeVMo>+uC4Q*SsTL~KL1G$(^q`! zaG26wcbkpYOFx>TxViu~)(GG>>|IGX+i*E;oN}Z8(|(7r!~7p3-Y8*jk72Qd z)e<&kNGz+T+shsk#%CWR9YQp_Mq`r~5D|?!Js%O)8TDK`m&JacV(7lyc-bWkN^kqt z0yV`mUvUW+Db`ap_5|{7TwtUhd-f25P3gw=dc(rPHCElVy!zcIqbyOJe{vW}B&hN!S$tjG3FlH(bHIx!JtyD!xYM@m2g_ZW5n% z4NsBD7{E(pDh6@A z_MkRDiT8LgkBo8l@{ zO$NxI8gx;hi+IxMEs=OF{K6;;F;_^;YZ6sDt$yMbV7HK}06D;hf8k9bNuEaT9GqfG zR#?eqRG;E~wN%BtL#1>PMqMuyrE&Re{hzAA0N4I{sDT_#x zny@3c@QJBdQAMusEx8MF*k&E1u;K6zD{B&NKA|V{0>N`}J z(=}FdRrf^E$x+gO(){ySWKPYj`d$=w5Xc%U$*z7MVJ-$xV;f~*9p5oOJ8Q#zQDg{f zjP0bpgRq&bE;WX|N4<5FZbj!^^Lh@OR%UiqSS)g^dr`*&k13V9IQm~jYW`7P`fr^z zZ=nqDF4ly33zayyQp=M`S){WD$pMlWunwi)3H+6 z$LfiJ7Q}<~f_Tt!c#+HN{QctY7{?i@(Y5k7=lIxLI3fPLU1|@L!}2L!bSW0E|hvuS0H;=dWe4%7eu delta 2747 zcmai0eN0qW7QgQ@Z;E!L!^aFC!tjQV5fG+`0uN@u!4C>UMQV+$THP6`Ev?3oxXa=$ zI@4xsqzM)|EwxQUN3tcXX&c)i4Q-lTx-|{Dny`Opv}ikw(*|10(iL%9wRg{bGb#`p zPjWct{Lb&5d+xdSjShV&4E<9u5n`(##P0m+A^SeGJBh>YB*cN(F2r^rwri)`am2J- z@E+MxL__9}4uAHDYpNrojF5?cBU6@$yPlw<^(xrtRmSRg9L83ag}JAYdwO(aUM_@n zKCJh;2su7FaG%^{y&H3{jvkHV!h4V?N0Jv#YnKW=Fr$4^=<(TO>nRWXyd45a1@r9|&a#$H}Y$dF|P)aoqsqyb*hy{O*(qDTJV2RNQ+hAe#&o!SbO z`g$7N0*CeW!ZUDLzn)stz?|43UfAZ5e}SJ9OTnM0p`q?eSG;~E1 z97w7b2mauZFG6f`X;<<_F~RAPYk2^j$+*l3J;~K#aEC|!6qcowf|R16?=*rrrCQwh ztVhmOz!NDNQM1z{kHHg*ONHyuzWC3=e*Bvq?Pwzl8Xe;IU{3v5m6Swkz#pFkQkJSK 
zEp^eQL{hUkfe(f_f?&*COgAS&b=r1%E*|>Q^5V|wNzE}FhLSfOXdUl?K$@9$?u9r* zHTCFWiy@C{yWkbW2K~{!jdB{!Tq4eK#(J$<;+By(iNupSC`sQ#pMDei((Q4aDBfmu zoTLhqsFX*~B!F4U;GKcrsG;OAOd1b6kM4a>ZoPTMReNiG<@e+UaxAnAakS$(xGUe$ z7>NrbaV*9%`kce_0>%Py$pa5Hwlx!@+z*#CzO3qu>{5sojW<~x+Hrmkz9JT5Uj5FQ zct^K?y;r3HtNabdGMj|Az@D{gPOW0(&90S=@%&n`^J)#k*{tVj=nW{zE;7$2N!%lzO{iGP|f?_UdCHWwo2Hj>gqTFrK|yhIOW=Xln=b zndWXgabxbb7cri_(EW(AZ9(QzXjp>pwHvlCG0)k$cwU9- zd~f?_Sl#)XzqtzWrWO*G&TngCVfdXkOu5<$NuzRu=OS1+e#NIEej$!RhT0rU+`&bp z#u<6N{4;hRggtpC_&lf0)sL5`AurqbrJg{a&-110{0GC5&oO>U_v36sP9^3Du`0az z96s4HrA~#C$EWAM4<}C6AxR(ylv8-IV>g`2g}4tMp4)L}J9gZA-){eY?ZMq1M9zb{ zbt-ip&}{{hW_Q~U$Clm@cEVfv%Y|k*o4-PM*GCIlqFjUFOB@feu^;=QqT6o$#1~4+ zZ!}{&tV5V5-i#yPAH56&p%3>eJp=LcQ+qKA`~V{bb|WwmYD>Nglzs(M!h@6yYx~F_fwUB$cQ7_3&)FG z3+_cl%X@`0aBlgDMc#l+B)d~0_I=3eT47o1&6s#!McR(9C;Ow(r~iuHry{v!v#}^* zE7RP?Yzj|1Cx*D<^*Uw0tb9Up+X-oZly`)Cv=ndhjTEZ{*X~iqf(UZaq9T!4FEQ4@ z0ThneP+Jk3Ld%6?8@XO)te*oY9BV=yM@&M?g=1}~(}-np0EOF5RMj_(T_OaYEH%=> zZ=j{rOiz9beQ2zHIA3a@&-!5oP16|YWdrRTgAz1@W3UO0e+*jCL>J2R6iz)#@q? zQ&E$m2pRsJJU^CxyA4e=F2=;P=x*6R+24PQmhWaSFuLq&KJWoB0?o<%I zDmSU8(QWz;{)dEA6<*2rkYdRY*PuHMZ&)(byV1)ZhksZMVkf#+Va$@Q9zbtk0_v>> z@dCOvu+Ew;-a_vZ_wpo>kmes58{iE+7Sn)ddRN%CFoslptFC%DJo+`r!D`*sjFjEz zZp8mxNt}+d2Gxz2<0_<7RD!lbB@Saa1<4hw3$$uX{`cN_`3$lUJA@ct$mMZV8L>fV zuc%WWLNDPOd|hGE4WK)GjWLISjI}WmHb(0}SDBeRJy3 diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index a5c08ea..aea206f 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -16,3 +16,5 @@ test 15:PASS test 16:PASS test 17:PASS test 18:PASS +test 19:PASS +test 20:PASS From 394f993e75abaf1d23eba046c8bec59664439b3d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 30 Jul 2020 16:11:58 +1000 Subject: [PATCH 23/30] FPU: Implement frsqrte[s] and a test for frsqrte This implements frsqrte by table lookup. 
We first normalize the input if necessary and adjust so that the exponent is even, giving us a mantissa value in the range [1.0, 4.0), which is then used to look up an entry in a 768-entry table. The 768 entries are appended to the table for reciprocal estimates, giving a table of 1024 entries in total. frsqrtes is implemented identically to frsqrte. The estimate supplied is accurate to 1 part in 1024 or better. Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 + fpu.vhdl | 194 +++++++++++++++++++++++++++++++++++-- tests/fpu/fpu.c | 48 +++++++++ tests/test_fpu.bin | Bin 25024 -> 29376 bytes tests/test_fpu.console_out | 1 + 5 files changed, 239 insertions(+), 6 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index ba9964e..7163ff9 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -421,6 +421,7 @@ architecture behaviour of decode1 is 2#10101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fadds 2#11000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fres 2#11001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fmuls + 2#11010# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- frsqrtes others => illegal_inst ); @@ -479,6 +480,7 @@ architecture behaviour of decode1 is 2#0111# => (FPU, OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fsel 2#1000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fre 2#1001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fmul + 2#1010# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', 
'0', '0', '0', '0', '0', RC, '0', '0'), -- frsqrte others => illegal_inst ); diff --git a/fpu.vhdl b/fpu.vhdl index c726be3..0cbd43f 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -41,7 +41,7 @@ architecture behaviour of fpu is DO_FCFID, DO_FCTI, DO_FRSP, DO_FRI, DO_FADD, DO_FMUL, DO_FDIV, - DO_FRE, + DO_FRE, DO_FRSQRTE, DO_FSEL, FRI_1, ADD_SHIFT, ADD_2, ADD_3, @@ -50,6 +50,7 @@ architecture behaviour of fpu is LOOKUP, DIV_2, DIV_3, DIV_4, DIV_5, DIV_6, FRE_1, + RSQRT_1, INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, @@ -98,11 +99,12 @@ architecture behaviour of fpu is exp_cmp : std_ulogic; add_bsmall : std_ulogic; is_multiply : std_ulogic; + is_sqrt : std_ulogic; first : std_ulogic; count : unsigned(1 downto 0); end record; - type lookup_table is array(0 to 255) of std_ulogic_vector(17 downto 0); + type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); signal r, rin : reg_type; @@ -160,6 +162,8 @@ architecture behaviour of fpu is constant MULADD_A : std_ulogic_vector(1 downto 0) := "10"; -- Inverse lookup table, indexed by the top 8 fraction bits + -- The first 256 entries are the reciprocal (1/x) lookup table, + -- and the remaining 768 entries are the reciprocal square root table. -- Output range is [0.5, 1) in 0.19 format, though the top -- bit isn't stored since it is always 1. 
-- Each output value is the inverse of the center of the input @@ -199,7 +203,109 @@ architecture behaviour of fpu is 18x"04321", 18x"040dd", 18x"03e9b", 18x"03c5c", 18x"03a1f", 18x"037e4", 18x"035ac", 18x"03376", 18x"03142", 18x"02f11", 18x"02ce2", 18x"02ab5", 18x"0288b", 18x"02663", 18x"0243d", 18x"02219", 18x"01ff7", 18x"01dd8", 18x"01bbb", 18x"019a0", 18x"01787", 18x"01570", 18x"0135b", 18x"01149", - 18x"00f39", 18x"00d2a", 18x"00b1e", 18x"00914", 18x"0070c", 18x"00506", 18x"00302", 18x"00100" + 18x"00f39", 18x"00d2a", 18x"00b1e", 18x"00914", 18x"0070c", 18x"00506", 18x"00302", 18x"00100", + -- 1/sqrt(x) lookup table + -- Input is in the range [1, 4), i.e. two bits to the left of the + -- binary point. Those 2 bits index the following 3 blocks of 256 values. + -- 1.0 ... 1.9999 + 18x"3fe00", 18x"3fa06", 18x"3f612", 18x"3f224", 18x"3ee3a", 18x"3ea58", 18x"3e67c", 18x"3e2a4", + 18x"3ded2", 18x"3db06", 18x"3d73e", 18x"3d37e", 18x"3cfc2", 18x"3cc0a", 18x"3c85a", 18x"3c4ae", + 18x"3c106", 18x"3bd64", 18x"3b9c8", 18x"3b630", 18x"3b29e", 18x"3af10", 18x"3ab86", 18x"3a802", + 18x"3a484", 18x"3a108", 18x"39d94", 18x"39a22", 18x"396b6", 18x"3934e", 18x"38fea", 18x"38c8c", + 18x"38932", 18x"385dc", 18x"3828a", 18x"37f3e", 18x"37bf6", 18x"378b2", 18x"37572", 18x"37236", + 18x"36efe", 18x"36bca", 18x"3689a", 18x"36570", 18x"36248", 18x"35f26", 18x"35c06", 18x"358ea", + 18x"355d4", 18x"352c0", 18x"34fb0", 18x"34ca4", 18x"3499c", 18x"34698", 18x"34398", 18x"3409c", + 18x"33da2", 18x"33aac", 18x"337bc", 18x"334cc", 18x"331e2", 18x"32efc", 18x"32c18", 18x"32938", + 18x"3265a", 18x"32382", 18x"320ac", 18x"31dd8", 18x"31b0a", 18x"3183e", 18x"31576", 18x"312b0", + 18x"30fee", 18x"30d2e", 18x"30a74", 18x"307ba", 18x"30506", 18x"30254", 18x"2ffa4", 18x"2fcf8", + 18x"2fa4e", 18x"2f7a8", 18x"2f506", 18x"2f266", 18x"2efca", 18x"2ed2e", 18x"2ea98", 18x"2e804", + 18x"2e572", 18x"2e2e4", 18x"2e058", 18x"2ddce", 18x"2db48", 18x"2d8c6", 18x"2d646", 18x"2d3c8", + 18x"2d14c", 18x"2ced4", 
18x"2cc5e", 18x"2c9ea", 18x"2c77a", 18x"2c50c", 18x"2c2a2", 18x"2c038", + 18x"2bdd2", 18x"2bb70", 18x"2b90e", 18x"2b6b0", 18x"2b454", 18x"2b1fa", 18x"2afa4", 18x"2ad4e", + 18x"2aafc", 18x"2a8ac", 18x"2a660", 18x"2a414", 18x"2a1cc", 18x"29f86", 18x"29d42", 18x"29b00", + 18x"298c2", 18x"29684", 18x"2944a", 18x"29210", 18x"28fda", 18x"28da6", 18x"28b74", 18x"28946", + 18x"28718", 18x"284ec", 18x"282c4", 18x"2809c", 18x"27e78", 18x"27c56", 18x"27a34", 18x"27816", + 18x"275fa", 18x"273e0", 18x"271c8", 18x"26fb0", 18x"26d9c", 18x"26b8a", 18x"2697a", 18x"2676c", + 18x"26560", 18x"26356", 18x"2614c", 18x"25f46", 18x"25d42", 18x"25b40", 18x"2593e", 18x"25740", + 18x"25542", 18x"25348", 18x"2514e", 18x"24f58", 18x"24d62", 18x"24b6e", 18x"2497c", 18x"2478c", + 18x"2459e", 18x"243b0", 18x"241c6", 18x"23fde", 18x"23df6", 18x"23c10", 18x"23a2c", 18x"2384a", + 18x"2366a", 18x"2348c", 18x"232ae", 18x"230d2", 18x"22efa", 18x"22d20", 18x"22b4a", 18x"22976", + 18x"227a2", 18x"225d2", 18x"22402", 18x"22234", 18x"22066", 18x"21e9c", 18x"21cd2", 18x"21b0a", + 18x"21944", 18x"2177e", 18x"215ba", 18x"213fa", 18x"21238", 18x"2107a", 18x"20ebc", 18x"20d00", + 18x"20b46", 18x"2098e", 18x"207d6", 18x"20620", 18x"2046c", 18x"202b8", 18x"20108", 18x"1ff58", + 18x"1fda8", 18x"1fbfc", 18x"1fa50", 18x"1f8a4", 18x"1f6fc", 18x"1f554", 18x"1f3ae", 18x"1f208", + 18x"1f064", 18x"1eec2", 18x"1ed22", 18x"1eb82", 18x"1e9e4", 18x"1e846", 18x"1e6aa", 18x"1e510", + 18x"1e378", 18x"1e1e0", 18x"1e04a", 18x"1deb4", 18x"1dd20", 18x"1db8e", 18x"1d9fc", 18x"1d86c", + 18x"1d6de", 18x"1d550", 18x"1d3c4", 18x"1d238", 18x"1d0ae", 18x"1cf26", 18x"1cd9e", 18x"1cc18", + 18x"1ca94", 18x"1c910", 18x"1c78c", 18x"1c60a", 18x"1c48a", 18x"1c30c", 18x"1c18e", 18x"1c010", + 18x"1be94", 18x"1bd1a", 18x"1bba0", 18x"1ba28", 18x"1b8b2", 18x"1b73c", 18x"1b5c6", 18x"1b452", + 18x"1b2e0", 18x"1b16e", 18x"1affe", 18x"1ae8e", 18x"1ad20", 18x"1abb4", 18x"1aa46", 18x"1a8dc", + -- 2.0 ... 
2.9999 + 18x"1a772", 18x"1a608", 18x"1a4a0", 18x"1a33a", 18x"1a1d4", 18x"1a070", 18x"19f0c", 18x"19da8", + 18x"19c48", 18x"19ae6", 18x"19986", 18x"19828", 18x"196ca", 18x"1956e", 18x"19412", 18x"192b8", + 18x"1915e", 18x"19004", 18x"18eae", 18x"18d56", 18x"18c00", 18x"18aac", 18x"18958", 18x"18804", + 18x"186b2", 18x"18562", 18x"18412", 18x"182c2", 18x"18174", 18x"18026", 18x"17eda", 18x"17d8e", + 18x"17c44", 18x"17afa", 18x"179b2", 18x"1786a", 18x"17724", 18x"175de", 18x"17498", 18x"17354", + 18x"17210", 18x"170ce", 18x"16f8c", 18x"16e4c", 18x"16d0c", 18x"16bcc", 18x"16a8e", 18x"16950", + 18x"16814", 18x"166d8", 18x"1659e", 18x"16464", 18x"1632a", 18x"161f2", 18x"160ba", 18x"15f84", + 18x"15e4e", 18x"15d1a", 18x"15be6", 18x"15ab2", 18x"15980", 18x"1584e", 18x"1571c", 18x"155ec", + 18x"154bc", 18x"1538e", 18x"15260", 18x"15134", 18x"15006", 18x"14edc", 18x"14db0", 18x"14c86", + 18x"14b5e", 18x"14a36", 18x"1490e", 18x"147e6", 18x"146c0", 18x"1459a", 18x"14476", 18x"14352", + 18x"14230", 18x"1410c", 18x"13fea", 18x"13eca", 18x"13daa", 18x"13c8a", 18x"13b6c", 18x"13a4e", + 18x"13930", 18x"13814", 18x"136f8", 18x"135dc", 18x"134c2", 18x"133a8", 18x"1328e", 18x"13176", + 18x"1305e", 18x"12f48", 18x"12e30", 18x"12d1a", 18x"12c06", 18x"12af2", 18x"129de", 18x"128ca", + 18x"127b8", 18x"126a6", 18x"12596", 18x"12486", 18x"12376", 18x"12266", 18x"12158", 18x"1204a", + 18x"11f3e", 18x"11e32", 18x"11d26", 18x"11c1a", 18x"11b10", 18x"11a06", 18x"118fc", 18x"117f4", + 18x"116ec", 18x"115e4", 18x"114de", 18x"113d8", 18x"112d2", 18x"111ce", 18x"110ca", 18x"10fc6", + 18x"10ec2", 18x"10dc0", 18x"10cbe", 18x"10bbc", 18x"10abc", 18x"109bc", 18x"108bc", 18x"107be", + 18x"106c0", 18x"105c2", 18x"104c4", 18x"103c8", 18x"102cc", 18x"101d0", 18x"100d6", 18x"0ffdc", + 18x"0fee2", 18x"0fdea", 18x"0fcf0", 18x"0fbf8", 18x"0fb02", 18x"0fa0a", 18x"0f914", 18x"0f81e", + 18x"0f72a", 18x"0f636", 18x"0f542", 18x"0f44e", 18x"0f35a", 18x"0f268", 18x"0f176", 18x"0f086", + 18x"0ef94", 18x"0eea4", 
18x"0edb4", 18x"0ecc6", 18x"0ebd6", 18x"0eae8", 18x"0e9fa", 18x"0e90e", + 18x"0e822", 18x"0e736", 18x"0e64a", 18x"0e55e", 18x"0e474", 18x"0e38a", 18x"0e2a0", 18x"0e1b8", + 18x"0e0d0", 18x"0dfe8", 18x"0df00", 18x"0de1a", 18x"0dd32", 18x"0dc4c", 18x"0db68", 18x"0da82", + 18x"0d99e", 18x"0d8ba", 18x"0d7d6", 18x"0d6f4", 18x"0d612", 18x"0d530", 18x"0d44e", 18x"0d36c", + 18x"0d28c", 18x"0d1ac", 18x"0d0cc", 18x"0cfee", 18x"0cf0e", 18x"0ce30", 18x"0cd54", 18x"0cc76", + 18x"0cb9a", 18x"0cabc", 18x"0c9e0", 18x"0c906", 18x"0c82a", 18x"0c750", 18x"0c676", 18x"0c59c", + 18x"0c4c4", 18x"0c3ea", 18x"0c312", 18x"0c23a", 18x"0c164", 18x"0c08c", 18x"0bfb6", 18x"0bee0", + 18x"0be0a", 18x"0bd36", 18x"0bc62", 18x"0bb8c", 18x"0baba", 18x"0b9e6", 18x"0b912", 18x"0b840", + 18x"0b76e", 18x"0b69c", 18x"0b5cc", 18x"0b4fa", 18x"0b42a", 18x"0b35a", 18x"0b28a", 18x"0b1bc", + 18x"0b0ee", 18x"0b01e", 18x"0af50", 18x"0ae84", 18x"0adb6", 18x"0acea", 18x"0ac1e", 18x"0ab52", + 18x"0aa86", 18x"0a9bc", 18x"0a8f0", 18x"0a826", 18x"0a75c", 18x"0a694", 18x"0a5ca", 18x"0a502", + 18x"0a43a", 18x"0a372", 18x"0a2aa", 18x"0a1e4", 18x"0a11c", 18x"0a056", 18x"09f90", 18x"09ecc", + -- 3.0 ... 
3.9999 + 18x"09e06", 18x"09d42", 18x"09c7e", 18x"09bba", 18x"09af6", 18x"09a32", 18x"09970", 18x"098ae", + 18x"097ec", 18x"0972a", 18x"09668", 18x"095a8", 18x"094e8", 18x"09426", 18x"09368", 18x"092a8", + 18x"091e8", 18x"0912a", 18x"0906c", 18x"08fae", 18x"08ef0", 18x"08e32", 18x"08d76", 18x"08cba", + 18x"08bfe", 18x"08b42", 18x"08a86", 18x"089ca", 18x"08910", 18x"08856", 18x"0879c", 18x"086e2", + 18x"08628", 18x"08570", 18x"084b6", 18x"083fe", 18x"08346", 18x"0828e", 18x"081d8", 18x"08120", + 18x"0806a", 18x"07fb4", 18x"07efe", 18x"07e48", 18x"07d92", 18x"07cde", 18x"07c2a", 18x"07b76", + 18x"07ac2", 18x"07a0e", 18x"0795a", 18x"078a8", 18x"077f4", 18x"07742", 18x"07690", 18x"075de", + 18x"0752e", 18x"0747c", 18x"073cc", 18x"0731c", 18x"0726c", 18x"071bc", 18x"0710c", 18x"0705e", + 18x"06fae", 18x"06f00", 18x"06e52", 18x"06da4", 18x"06cf6", 18x"06c4a", 18x"06b9c", 18x"06af0", + 18x"06a44", 18x"06998", 18x"068ec", 18x"06840", 18x"06796", 18x"066ea", 18x"06640", 18x"06596", + 18x"064ec", 18x"06442", 18x"0639a", 18x"062f0", 18x"06248", 18x"061a0", 18x"060f8", 18x"06050", + 18x"05fa8", 18x"05f00", 18x"05e5a", 18x"05db4", 18x"05d0e", 18x"05c68", 18x"05bc2", 18x"05b1c", + 18x"05a76", 18x"059d2", 18x"0592e", 18x"05888", 18x"057e4", 18x"05742", 18x"0569e", 18x"055fa", + 18x"05558", 18x"054b6", 18x"05412", 18x"05370", 18x"052ce", 18x"0522e", 18x"0518c", 18x"050ec", + 18x"0504a", 18x"04faa", 18x"04f0a", 18x"04e6a", 18x"04dca", 18x"04d2c", 18x"04c8c", 18x"04bee", + 18x"04b50", 18x"04ab0", 18x"04a12", 18x"04976", 18x"048d8", 18x"0483a", 18x"0479e", 18x"04700", + 18x"04664", 18x"045c8", 18x"0452c", 18x"04490", 18x"043f6", 18x"0435a", 18x"042c0", 18x"04226", + 18x"0418a", 18x"040f0", 18x"04056", 18x"03fbe", 18x"03f24", 18x"03e8c", 18x"03df2", 18x"03d5a", + 18x"03cc2", 18x"03c2a", 18x"03b92", 18x"03afa", 18x"03a62", 18x"039cc", 18x"03934", 18x"0389e", + 18x"03808", 18x"03772", 18x"036dc", 18x"03646", 18x"035b2", 18x"0351c", 18x"03488", 18x"033f2", + 18x"0335e", 18x"032ca", 
18x"03236", 18x"031a2", 18x"03110", 18x"0307c", 18x"02fea", 18x"02f56", + 18x"02ec4", 18x"02e32", 18x"02da0", 18x"02d0e", 18x"02c7c", 18x"02bec", 18x"02b5a", 18x"02aca", + 18x"02a38", 18x"029a8", 18x"02918", 18x"02888", 18x"027f8", 18x"0276a", 18x"026da", 18x"0264a", + 18x"025bc", 18x"0252e", 18x"024a0", 18x"02410", 18x"02384", 18x"022f6", 18x"02268", 18x"021da", + 18x"0214e", 18x"020c0", 18x"02034", 18x"01fa8", 18x"01f1c", 18x"01e90", 18x"01e04", 18x"01d78", + 18x"01cee", 18x"01c62", 18x"01bd8", 18x"01b4c", 18x"01ac2", 18x"01a38", 18x"019ae", 18x"01924", + 18x"0189c", 18x"01812", 18x"01788", 18x"01700", 18x"01676", 18x"015ee", 18x"01566", 18x"014de", + 18x"01456", 18x"013ce", 18x"01346", 18x"012c0", 18x"01238", 18x"011b2", 18x"0112c", 18x"010a4", + 18x"0101e", 18x"00f98", 18x"00f12", 18x"00e8c", 18x"00e08", 18x"00d82", 18x"00cfe", 18x"00c78", + 18x"00bf4", 18x"00b70", 18x"00aec", 18x"00a68", 18x"009e4", 18x"00960", 18x"008dc", 18x"00858", + 18x"007d6", 18x"00752", 18x"006d0", 18x"0064e", 18x"005cc", 18x"0054a", 18x"004c8", 18x"00446", + 18x"003c4", 18x"00342", 18x"002c2", 18x"00240", 18x"001c0", 18x"00140", 18x"000c0", 18x"00040" ); -- Left and right shifter with 120 bit input and 64 bit output. 
@@ -424,9 +530,17 @@ begin -- synchronous reads from lookup table lut_access: process(clk) + variable addrhi : std_ulogic_vector(1 downto 0); + variable addr : std_ulogic_vector(9 downto 0); begin if rising_edge(clk) then - inverse_est <= '1' & inverse_table(to_integer(unsigned(r.b.mantissa(53 downto 46)))); + if r.is_sqrt = '1' then + addrhi := r.b.mantissa(55 downto 54); + else + addrhi := "00"; + end if; + addr := addrhi & r.b.mantissa(53 downto 46); + inverse_est <= '1' & inverse_table(to_integer(unsigned(addr))); end if; end process; @@ -488,6 +602,8 @@ begin variable pcmpb_eq : std_ulogic; variable pcmpb_lt : std_ulogic; variable pshift : std_ulogic; + variable renorm_sqrt : std_ulogic; + variable sqrt_exp : signed(EXP_BITS-1 downto 0); begin v := r; illegal := '0'; @@ -519,6 +635,7 @@ begin v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN); v.is_subtract := '0'; v.is_multiply := '0'; + v.is_sqrt := '0'; v.add_bsmall := '0'; adec := decode_dp(e_in.fra, int_input); bdec := decode_dp(e_in.frb, int_input); @@ -599,6 +716,7 @@ begin msel_inv <= '0'; set_y := '0'; pshift := '0'; + renorm_sqrt := '0'; case r.state is when IDLE => if e_in.valid = '1' then @@ -654,6 +772,9 @@ begin when "11001" => v.is_multiply := '1'; v.state := DO_FMUL; + when "11010" => + v.is_sqrt := '1'; + v.state := DO_FRSQRTE; when others => illegal := '1'; end case; @@ -1157,6 +1278,48 @@ begin arith_done := '1'; end case; + when DO_FRSQRTE => + opsel_a <= AIN_B; + v.result_class := r.b.class; + v.result_sign := r.b.negative; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + if r.b.class = NAN and r.b.mantissa(53) = '0' then + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + v.shift := to_signed(1, EXP_BITS); + case r.b.class is + when FINITE => + v.result_exp := r.b.exponent; + if r.b.negative = '1' then + v.fpscr(FPSCR_VXSQRT) := '1'; + qnan_result := '1'; + arith_done := '1'; + elsif r.b.mantissa(54) = '0' then + v.state := RENORM_B; + elsif r.b.exponent(0) = '0' 
then + v.state := RSQRT_1; + else + v.state := RENORM_B2; + end if; + when NAN => + -- result is B + arith_done := '1'; + when INFINITY => + if r.b.negative = '1' then + v.fpscr(FPSCR_VXSQRT) := '1'; + qnan_result := '1'; + else + v.result_class := ZERO; + end if; + arith_done := '1'; + when ZERO => + v.result_class := INFINITY; + zero_divide := '1'; + arith_done := '1'; + end case; + when RENORM_A => renormalize := '1'; v.state := RENORM_A2; @@ -1184,11 +1347,16 @@ begin when RENORM_B => renormalize := '1'; + renorm_sqrt := r.is_sqrt; v.state := RENORM_B2; when RENORM_B2 => set_b := '1'; - v.result_exp := r.result_exp + r.shift; + if r.is_sqrt = '0' then + v.result_exp := r.result_exp + r.shift; + else + v.result_exp := new_exp; + end if; v.state := LOOKUP; when RENORM_C => @@ -1287,8 +1455,10 @@ begin v.first := '1'; if r.insn(4) = '0' then v.state := DIV_2; - else + elsif r.insn(2) = '0' then v.state := FRE_1; + else + v.state := RSQRT_1; end if; when DIV_2 => @@ -1367,6 +1537,14 @@ begin v.shift := to_signed(1, EXP_BITS); v.state := NORMALIZE; + when RSQRT_1 => + opsel_r <= RES_MISC; + misc_sel <= "0111"; + sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1); + v.result_exp := - sqrt_exp; + v.shift := to_signed(1, EXP_BITS); + v.state := NORMALIZE; + when INT_SHIFT => opsel_r <= RES_SHIFT; set_x := '1'; @@ -1807,6 +1985,10 @@ begin if renormalize = '1' then clz := count_left_zeroes(r.r); + if renorm_sqrt = '1' then + -- make denormalized value end up with even exponent + clz(0) := '1'; + end if; v.shift := resize(signed('0' & clz) - 9, EXP_BITS); end if; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 06da475..d9c5c06 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1244,6 +1244,53 @@ int fpu_test_20(void) return trapit(0, test20); } +struct isqrtvals { + unsigned long val; + unsigned long inv; +} isqrtvals[] = { + { 0x0000000000000000, 0x7ff0000000000000 }, + { 0x8000000000000000, 0xfff0000000000000 }, + { 0xfff0000000000000, 
0x7ff8000000000000 }, + { 0x7ff0000000000000, 0x0000000000000000 }, + { 0xfff123456789abcd, 0xfff923456789abcd }, + { 0x3ff0000000000000, 0x3feff80000000000 }, + { 0x4000000000000000, 0x3fe69dc800000000 }, + { 0x4010000000000000, 0x3fdff80000000000 }, + { 0xbff0000000000000, 0x7ff8000000000000 }, + { 0x4008000000000000, 0x3fe2781800000000 }, + { 0x7fd0000000000000, 0x1ffff80000000000 }, + { 0x0008000000000000, 0x5fe69dc800000000 }, + { 0x0004000000000000, 0x5feff80000000000 }, + { 0x0002000000000000, 0x5ff69dc800000000 }, + { 0x0000000000000002, 0x61769dc800000000 }, + { 0x0000000000000001, 0x617ff80000000000 }, +}; + +int test21(long arg) +{ + long i; + unsigned long result; + struct isqrtvals *vp = isqrtvals; + + set_fpscr(FPS_RN_NEAR); + for (i = 0; i < sizeof(isqrtvals) / sizeof(isqrtvals[0]); ++i, ++vp) { + asm("lfd 6,0(%0); frsqrte 7,6; stfd 7,0(%1)" + : : "b" (&vp->val), "b" (&result) : "memory"); + if (result != vp->inv) { + print_hex(i, 2, " "); + print_hex(result, 16, " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_21(void) +{ + enable_fp(); + return trapit(0, test21); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -1289,6 +1336,7 @@ int main(void) do_test(18, fpu_test_18); do_test(19, fpu_test_19); do_test(20, fpu_test_20); + do_test(21, fpu_test_21); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 0f9e03a588b27413f4d3522e6be6362f86d9204d..0253720609c301172916dc31fbe1acd21c7c41bd 100755 GIT binary patch delta 3862 zcmb7H3s6+o89w*It}7xgA`7f6>@LVlL`2jJEV#TBkwJVEsPO^u8J%XdrUqMNiL=S5 z;}Ff?Qfzf9qD;q{3=U}~X|07c&7jd1XA%b=;T5#2Qi9fywLWgYvwMM6X`P;#!~cKZ z|2Y3W=ihTLEq@VOZVFM1S<@NIFaLa8{w~N3Y)5`MV`bpBg4+si>xSwbhoWW+&4*Sd ziQ1$yHE&MxZ_kocEFva{y_G4;S?Tds(rFjYYA~lTcKdq2EH7rr!Kbz%E5-UQ=g)t^ z*oEuu&)BF&JJke_Ja`NgR@)fcf4xre(7Lj*vfu`~pbejh6?wkCsC5;NdK%JZ^BR1a z5t&(8qgU*1M}2d?{}%$M8%0f}Q|@^r%f0iNm7)TEECxr?uL7cl2KqE$W~}0}I)de$ zkxfonZb`fS)LpZ-p*KyI_gF+$CLvTmB~C1gpaoO@h)^+bx=9H&m|0${EA;q$xX)*- 
zCC#_+PicKtDh!+}`kLvTz$L@ZS|&Jq;5`0`yqM=k!IFKoU3&ixoeGK;+9^yICbkt& zhOSUNyO?Sqk_)I)7cR`Ar@DkByOHJLDftkp1^i)F3sVXi&VBUJXzt`pS`#!KdG`nD zRZWF<=^@<`RA$nECa=WyMM?AmGYLAFjE9{cMoe*>ZL3uy0OP|mz z{R_)na*i&;iYY*@k!ruRWbd7CG z?b6b*oHlqkMV8v7@!U!4v6Gk42?p9Xb@TJ&NfJV*3o~d&XenxcCv>H5uYRn9r=id& zRRK)r$z%xCD5=j=u3@d1tD{CkM2Nzrhp@bQEQiLy52uR%aC!|1V#h|B9JWCBAlM_j z*Kk|W>tS1T-5b|Q2K*w7tlS~X$1Pe`Eg@S1i zEmBX9BMc)k6NMv`ADP9YY9iwTdDPoDf6u5R)Dtn49Fgvz1X>mCM_JRNc*vG%apIT$ zs4tvmL}*+|X9{&RFfEB&%~5e;s6Q1(@h9RX)wrF?WFZqqxon~YLzd75I=||DwGdfUp#iMU*iLBoF-2?aOKzH!d zxe4s~G>_?%=op*Q2ZM6!BYq!NOs%Sv*JbJbH@6Hpz#{;d~=Q zCb+tztZ2|ynaI|;Hdm!^skp!%`nc_h*znDM9-SmIHYl4?YP_vas4T`X+@HUvw_}=Z zgLnjAw#Ukw$_bs9p7}3+^`^yhE(Q%ZmX85G8L%;Carr&(qd}gB_j#UlmtA7z)noWf zmcD|Mz^Yu2a}1?7(fQZ|!!>?QtK-J5@wIy|uQ3-sFRRhy%JY&MpA0@*V+Ykw@Ap;C zQot+k7j!s&ws4Ft#U~5Lo5dOPywG{6i5WJw$#tj>9QNHtFHPmyyF8Qo9mx+~13POl zA36!1Z?_8DrrqT3s;l4{#h;=<+`so=@PBo2KO5nU;Ld>a?~xb(1Q(qZJIMX{?+Hg> z;~h1!_+B`=URwX-wH4y!?jrtL6wp=LY~1(w02qD+_XFR@m4^sYk)y!0o`7 z6|7>mGAVe0U20VbH5@Dp?a~i74UWiPXImv zT&`eW;C|qx3Z7^Kk-W$*Wh#Upa3%081y2J04A`V#EwFFCUGi73Kky1*LBW%Oj{x6O zJ{$so`+$2CJOwy@F?K+~!GR#&0MQD0)UX$hDasq>Mn=y}pGeYvgxk}8_;37GJ!R0~B zv*2zpM$6}!#BJAU=ez{5@&+|RJh(wu=7ou&U(*nTv6FOCn3&f|b0M~MQZYnbC+&o2 z>7-f+M<;y*q3xn85KR>PG zk!7BX4o!xg(rv}GT?wbo5|yM^;x1>+M0qwVO> jTDRslw;kPC&bN7FIBn&0SqP`Um-X52@sVOK@Ok#`G^7^~WLnzYb&fGoc z_q*qud+xpGzPp{bh0fm#cE;R!j1^b>VNUUG$SStExPq~Aa9hD`1-G@TY4c(GV!?N~ zbdhLSc5K@zEuHd?;q*@ow2G>js|iE#}uX0F*H7rv)(u0P?Wt+k(EnuH5$^yA{(tR{0^>SWV%&#jkL0&^L|%fKFsqO z>vUXr?VRJ5he{%I#6Tnen1_5k2~;KRxV5L(Rt%jiJkoC)Ep^T z?`m$OpSDR?aL843<=N_)boPiU* z#g??7`Y2W;FvTI$4ilg1h-H)TK7D2J=U}s@iLH@TV&&Dn%lZ<}>k_yz1N9`u2Z@t4 zOlDh~vAgzTsqWED?>Us$j3j$(E5yqYWKUR&IE@K4JkFhjG_2`kX$Q(gU|#9WoXUC+ zvq~4|QCi~je!EPu`U5(WSjf$zfz{Z6!#+-h;nKnAaIq8p+=O33?Mbj)c5z>Ab39QkdcQbNq4WVNh zHI$WZolNu!txumlQPWr7p6J8--=64SV8ht|O>~}g3P#fty-Xiv42P(fO~5m=pMExf zvG5LkHa}ZvX&;~ed;pFxXQ9hGHgSZ5xITB=0(AA;hk2xlcS%t;JK3&EiTP}Y*MsMK zuEE~vc1@K0Tj1KnAJtFZOFj7Ae(uLlxEOF%;6ewKRkip6xv`Dhp5FnSgO}IOlgXVh 
zS3DHYEFmVsMTtOLFbd|1Unz=n0Qv`59kz-7QYRQwEZD{zI1Lx6{Y zSF1SG3nF{HEET9k7;ruCLKV*iz5r}hu^u?2RF=Y391gq&SWxji;17ZCtB-XA@Gane z6&rvvHed&UEjS7twz^#OZ5})lW#a!mIE+*Oy6AlL$>KL=VO`ljMD?7`&J?<+KbyY^ zOP*UB>Kl-lrMSYvig&~F*mI8MKHJc*0$Kfu7g>W2y7;MXcObqX)42eAu|uA4ZoeJ> z%1qdBOy-;y_~Sp7Sz-obsrwn*zkqYF*qifeAXnSFa{>i##utim3y?QGI;`1aa2`&e zan6k{{Z&y$A$jB68g#=oMLEt1G|p9{9|vcF}=(Kz=GdULzreqErqx23jwJlbgL6~gJyTf(Vx Ljkdk9x-j@Z46(Zz diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index aea206f..b6bc733 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -18,3 +18,4 @@ test 17:PASS test 18:PASS test 19:PASS test 20:PASS +test 21:PASS From c350bc1f25733d2dc2a6ce6f23172b78744cb9b1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 31 Jul 2020 12:02:55 +1000 Subject: [PATCH 24/30] FPU: Implement fsqrt[s] and add a test for fsqrt This implements the floating square-root calculation using a table lookup of the inverse square root approximation, followed by three iterations of Goldschmidt's algorithm, which gives estimates of both sqrt(FRB) and 1/sqrt(FRB). Then the residual is calculated as FRB - R * R and that is multiplied by the 1/sqrt(FRB) estimate to get an adjustment to R. The residual and the adjustment can be negative, and since we have an unsigned multiplier, the upper bits can be wrong. In practice the adjustment fits into an 8-bit signed value, and the bottom 8 bits of the adjustment product are correct, so we sign-extend them, divide by 4 (because R is in 10.54 format) and add them to R. Finally the residual is calculated again and compared to 2*R+1 to see if a final increment is needed. Then the result is rounded and written back. This implements fsqrts as fsqrt, but with rounding to single precision and underflow/overflow calculation using the single-precision exponent range. This could be optimized later. 
Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 + fpu.vhdl | 217 ++++++++++++++++++++++++++++++++++++- tests/fpu/fpu.c | 48 ++++++++ tests/test_fpu.bin | Bin 29376 -> 29632 bytes tests/test_fpu.console_out | 1 + 5 files changed, 262 insertions(+), 6 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 7163ff9..e821469 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -419,6 +419,7 @@ architecture behaviour of decode1 is 2#10010# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fdivs 2#10100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fsubs 2#10101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fadds + 2#10110# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fsqrts 2#11000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fres 2#11001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fmuls 2#11010# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- frsqrtes @@ -477,6 +478,7 @@ architecture behaviour of decode1 is 2#0010# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fdiv 2#0100# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fsub 2#0101# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fadd + 2#0110# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', 
RC, '0', '0'), -- fsqrt 2#0111# => (FPU, OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fsel 2#1000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fre 2#1001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fmul diff --git a/fpu.vhdl b/fpu.vhdl index 0cbd43f..244454e 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -40,7 +40,7 @@ architecture behaviour of fpu is DO_FMR, DO_FMRG, DO_FCMP, DO_FCFID, DO_FCTI, DO_FRSP, DO_FRI, - DO_FADD, DO_FMUL, DO_FDIV, + DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FRE, DO_FRSQRTE, DO_FSEL, FRI_1, @@ -51,6 +51,9 @@ architecture behaviour of fpu is DIV_2, DIV_3, DIV_4, DIV_5, DIV_6, FRE_1, RSQRT_1, + SQRT_1, SQRT_2, SQRT_3, SQRT_4, + SQRT_5, SQRT_6, SQRT_7, SQRT_8, + SQRT_9, SQRT_10, SQRT_11, SQRT_12, INT_SHIFT, INT_ROUND, INT_ISHIFT, INT_FINAL, INT_CHECK, INT_OFLOW, FINISH, NORMALIZE, @@ -140,6 +143,7 @@ architecture behaviour of fpu is constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00"; constant BIN_R : std_ulogic_vector(1 downto 0) := "01"; constant BIN_MASK : std_ulogic_vector(1 downto 0) := "10"; + constant BIN_PS6 : std_ulogic_vector(1 downto 0) := "11"; constant RES_SUM : std_ulogic_vector(1 downto 0) := "00"; constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01"; @@ -604,6 +608,7 @@ begin variable pshift : std_ulogic; variable renorm_sqrt : std_ulogic; variable sqrt_exp : signed(EXP_BITS-1 downto 0); + variable shiftin : std_ulogic; begin v := r; illegal := '0'; @@ -717,6 +722,7 @@ begin set_y := '0'; pshift := '0'; renorm_sqrt := '0'; + shiftin := '0'; case r.state is when IDLE => if e_in.valid = '1' then @@ -765,6 +771,9 @@ begin v.state := DO_FDIV; when "10100" | "10101" => v.state := DO_FADD; + when "10110" => + v.is_sqrt := '1'; + v.state := DO_FSQRT; when "10111" => v.state := DO_FSEL; when "11000" => @@ 
-1248,6 +1257,43 @@ begin v.quieten_nan := '0'; arith_done := '1'; + when DO_FSQRT => + opsel_a <= AIN_B; + v.result_class := r.b.class; + v.result_sign := r.b.negative; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + if r.b.class = NAN and r.b.mantissa(53) = '0' then + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + case r.b.class is + when FINITE => + v.result_exp := r.b.exponent; + if r.b.negative = '1' then + v.fpscr(FPSCR_VXSQRT) := '1'; + qnan_result := '1'; + arith_done := '1'; + elsif r.b.mantissa(54) = '0' then + v.state := RENORM_B; + elsif r.b.exponent(0) = '0' then + v.state := SQRT_1; + else + v.shift := to_signed(1, EXP_BITS); + v.state := RENORM_B2; + end if; + when NAN | ZERO => + -- result is B + arith_done := '1'; + when INFINITY => + if r.b.negative = '1' then + v.fpscr(FPSCR_VXSQRT) := '1'; + qnan_result := '1'; + -- else result is B + end if; + arith_done := '1'; + end case; + when DO_FRE => opsel_a <= AIN_B; v.result_class := r.b.class; @@ -1454,7 +1500,11 @@ begin -- wait one cycle for inverse_table[B] lookup v.first := '1'; if r.insn(4) = '0' then - v.state := DIV_2; + if r.insn(3) = '0' then + v.state := DIV_2; + else + v.state := SQRT_1; + end if; elsif r.insn(2) = '0' then v.state := FRE_1; else @@ -1545,6 +1595,156 @@ begin v.shift := to_signed(1, EXP_BITS); v.state := NORMALIZE; + when SQRT_1 => + -- put invsqr[B] in R and compute P = invsqr[B] * B + -- also transfer B (in R) to A + set_a := '1'; + opsel_r <= RES_MISC; + misc_sel <= "0111"; + msel_1 <= MUL1_B; + msel_2 <= MUL2_LUT; + f_to_multiply.valid <= '1'; + v.shift := to_signed(-1, EXP_BITS); + v.count := "00"; + v.state := SQRT_2; + + when SQRT_2 => + -- shift R right one place + -- not expecting multiplier result yet + opsel_r <= RES_SHIFT; + v.first := '1'; + v.state := SQRT_3; + + when SQRT_3 => + -- put R into Y, wait for product from multiplier + msel_2 <= MUL2_R; + set_y := r.first; + pshift := '1'; + if multiply_to_f.valid = '1' then + -- put result 
into R + opsel_r <= RES_MULT; + v.first := '1'; + v.state := SQRT_4; + end if; + + when SQRT_4 => + -- compute 1.5 - Y * P + msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + msel_add <= MULADD_CONST; + msel_inv <= '1'; + f_to_multiply.valid <= r.first; + pshift := '1'; + if multiply_to_f.valid = '1' then + v.state := SQRT_5; + end if; + + when SQRT_5 => + -- compute Y = Y * P + msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + f_to_multiply.valid <= '1'; + v.first := '1'; + v.state := SQRT_6; + + when SQRT_6 => + -- pipeline in R = R * P + msel_1 <= MUL1_R; + msel_2 <= MUL2_P; + f_to_multiply.valid <= r.first; + pshift := '1'; + if multiply_to_f.valid = '1' then + v.first := '1'; + v.state := SQRT_7; + end if; + + when SQRT_7 => + -- first multiply is done, put result in Y + msel_2 <= MUL2_P; + set_y := r.first; + -- wait for second multiply (should be here already) + pshift := '1'; + if multiply_to_f.valid = '1' then + -- put result into R + opsel_r <= RES_MULT; + v.first := '1'; + v.count := r.count + 1; + if r.count < 2 then + v.state := SQRT_4; + else + v.first := '1'; + v.state := SQRT_8; + end if; + end if; + + when SQRT_8 => + -- compute P = A - R * R, which can be +ve or -ve + -- we arranged for B to be put into A earlier + msel_1 <= MUL1_R; + msel_2 <= MUL2_R; + msel_add <= MULADD_A; + msel_inv <= '1'; + pshift := '1'; + f_to_multiply.valid <= r.first; + if multiply_to_f.valid = '1' then + v.first := '1'; + v.state := SQRT_9; + end if; + + when SQRT_9 => + -- compute P = P * Y + -- since Y is an estimate of 1/sqrt(B), this makes P an + -- estimate of the adjustment needed to R. Since the error + -- could be negative and we have an unsigned multiplier, the + -- upper bits can be wrong, but it turns out the lowest 8 bits + -- are correct and are all we need (given 3 iterations through + -- SQRT_4 to SQRT_7). 
+ msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + pshift := '1'; + f_to_multiply.valid <= r.first; + if multiply_to_f.valid = '1' then + v.state := SQRT_10; + end if; + + when SQRT_10 => + -- Add the bottom 8 bits of P, sign-extended, + -- divided by 4, onto R. + -- The division by 4 is because R is 10.54 format + -- whereas P is 8.56 format. + opsel_b <= BIN_PS6; + sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1); + v.result_exp := sqrt_exp; + v.shift := to_signed(1, EXP_BITS); + v.first := '1'; + v.state := SQRT_11; + + when SQRT_11 => + -- compute P = A - R * R (remainder) + -- also put 2 * R + 1 into B for comparison with P + msel_1 <= MUL1_R; + msel_2 <= MUL2_R; + msel_add <= MULADD_A; + msel_inv <= '1'; + f_to_multiply.valid <= r.first; + shiftin := '1'; + set_b := r.first; + if multiply_to_f.valid = '1' then + v.state := SQRT_12; + end if; + + when SQRT_12 => + -- test if remainder is 0 or >= B = 2*R + 1 + if pcmpb_lt = '1' then + -- square root is correct, set X if remainder non-zero + v.x := r.p(58) or px_nz; + else + -- square root needs to be incremented by 1 + carry_in <= '1'; + v.x := not pcmpb_eq; + end if; + v.state := FINISH; + when INT_SHIFT => opsel_r <= RES_SHIFT; set_x := '1'; @@ -1828,8 +2028,12 @@ begin maddend := (others => '0'); case msel_add is when MULADD_CONST => - -- addend is 2.0 in 16.112 format - maddend(113) := '1'; -- 2.0 + -- addend is 2.0 or 1.5 in 16.112 format + if r.is_sqrt = '0' then + maddend(113) := '1'; -- 2.0 + else + maddend(112 downto 111) := "11"; -- 1.5 + end if; when MULADD_A => -- addend is A in 16.112 format maddend(121 downto 58) := r.a.mantissa; @@ -1895,14 +2099,15 @@ begin when BIN_MASK => in_b0 := mask; when others => - in_b0 := (others => '0'); + -- BIN_PS6, 6 LSBs of P/4 sign-extended to 64 + in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 2)), 64)); end case; if opsel_binv = '1' then in_b0 := not in_b0; end if; in_b <= in_b0; if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= 
to_signed(63, EXP_BITS) then - shift_res := shifter_64(r.r & x"00000000000000", + shift_res := shifter_64(r.r & shiftin & 55x"00000000000000", std_ulogic_vector(r.shift(6 downto 0))); else shift_res := (others => '0'); diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index d9c5c06..b72b01e 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1291,6 +1291,53 @@ int fpu_test_21(void) return trapit(0, test21); } +struct sqrtvals { + unsigned long val; + unsigned long inv; +} sqrtvals[] = { + { 0x0000000000000000, 0x0000000000000000 }, + { 0x8000000000000000, 0x8000000000000000 }, + { 0xfff0000000000000, 0x7ff8000000000000 }, + { 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0xfff123456789abcd, 0xfff923456789abcd }, + { 0x3ff0000000000000, 0x3ff0000000000000 }, + { 0x4000000000000000, 0x3ff6a09e667f3bcd }, + { 0x4010000000000000, 0x4000000000000000 }, + { 0xbff0000000000000, 0x7ff8000000000000 }, + { 0x4008000000000000, 0x3ffbb67ae8584caa }, + { 0x7fd0000000000000, 0x5fe0000000000000 }, + { 0x0008000000000000, 0x1ff6a09e667f3bcd }, + { 0x0004000000000000, 0x1ff0000000000000 }, + { 0x0002000000000000, 0x1fe6a09e667f3bcd }, + { 0x0000000000000002, 0x1e66a09e667f3bcd }, + { 0x0000000000000001, 0x1e60000000000000 }, +}; + +int test22(long arg) +{ + long i; + unsigned long result; + struct sqrtvals *vp = sqrtvals; + + set_fpscr(FPS_RN_NEAR); + for (i = 0; i < sizeof(sqrtvals) / sizeof(sqrtvals[0]); ++i, ++vp) { + asm("lfd 6,0(%0); fsqrt 7,6; stfd 7,0(%1)" + : : "b" (&vp->val), "b" (&result) : "memory"); + if (result != vp->inv) { + print_hex(i, 2, " "); + print_hex(result, 16, " "); + return i + 1; + } + } + return 0; +} + +int fpu_test_22(void) +{ + enable_fp(); + return trapit(0, test22); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -1337,6 +1384,7 @@ int main(void) do_test(19, fpu_test_19); do_test(20, fpu_test_20); do_test(21, fpu_test_21); + do_test(22, fpu_test_22); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 
0253720609c301172916dc31fbe1acd21c7c41bd..e3783415a22e72bae0935c2959ab8c4a5bf5a316 100755 GIT binary patch delta 2670 zcmai04Qx}_6~6Cb8;82-3&cOMWBbK%jBOm601Gd*;}}|kNg#M#HYn|=Q)m+cXjEbd zSQ>lCs{&JL!f7XlXqu*`VvKgsu!^aW1%xJG>y}QG(S+atdoA}~<$6En9E1ca2&u0B!_?|+=$|H=tLq5~Bex#8^~kM%I=Z>t zF_$~mzG@DynR|5iUmhrp+sX;K-akfZ4M8;C_19GnH{4z>@qT_}ug;_d`Up_^TtZCI4+j$V7q=vgbq4oUh8o-~~N2nQeT#5q6tb z^I3WrH@kVI5sVf;KiUARELDmVc_g^UVt^BtwYp@(8p({^7D@dmrCmOOL?x7!S*y6O z;5qBN)0sf|H7lvhr${HOMR%6MK!O~h-WWV7aPuBL)D~=K(RqxbI!F{* zL2LVsG1+iH+Is6s7{gT#jBqAM~x~4fq3Zzgj zxb26S;-cNHWs0ArzNuIXF1razilR*Pc9EOkKOMK<#iAyMn;U_b9Y1G*Z6?;41GO`s zmsgwVzJIHbK8`?K<#Bk;Q3*?(K9=0&baQ8bbFRoa*u}2=kXIxeyZUE6u+C{M9?hJp zguIqoFp)kP;!)PeQjG6DfPm{f2qVtiWhrc`K)BTVc#n5}Mr$tKnv9g-QKu1VWyf{h zM}q9Bg_4ox60$~afi}ZEg!dvVgG;b2&9KPj<<7xRUELAVctFyS9Rc>S)QDLXFMP`0 zlGnpHCw}F7Y1W~}&vL{k>;7A##&s6S>r}{jiPT5$p_w5Spok+aa>1Qg+!+mYc$%PO zmXo^)kIlMwg}+UHdxh<&`L+r-%Io~!3a3j;P;_^NWAKXS3+`9&axwn!!MWmj-0R(w z#apICghu%rji%`ECvQwqH>*Ek6Gi;?Buwd3J_F`S+}3{cM*49mu}32IoZayv)V{%Q zZ~o7Vaw4B2b3eJwl_2*u&eaalnjhnv4`92Po?UG0cMiKCw>Q6lLMYLToU6NN&I&G{ z{XppOBAlp+Nk{Q@ia41S#>bx;-J=t;8P3oA+N$I5&fG}Woyb40i|d2oc}MP#eE;G4Lcn?eHRn%i7J*0Fb!*02~XWy`hl zes&KbuAPMJe2`UynH^<6P%zkisZ7B|a{or@2!_lA9H=HE$b}ez#<&0u@e-v;^voDn zhvO^c4l)9bag8|IKBQDc&x~=+IQAoFX9ODaZ8#1icbO1)Tq@whe}@)nHou`C-a20v3a`n3 z98u#CrTOd}Xd;e3A-4&+O^kRXYXNzcdpJ9gv!ErEO%@KS7O>+cQFCGle&MyL3<^Rn z4?&ODtqLNPhGE?6P;EwdWEhM-n`%G8gCD^{gl7=?u0RvQF@&iru*c_68I*)<9)TWY z0|*aag=;>GYAwPI*I-(OMb(OM-p4Q-;bDYlWvJ{!_yl~n!m64;lpKYg3ga{pi!-PQ ziC$-677KkC3UIQ*sthCQgyD+9yjDaLH|~7aM^LvPIRa+?!>SQPZDU~Z`%BsP_Dsdb zjNfSz;{wPzkz*5?EyYoX+)DVsU!#g6I(8Erl@4tm!p2W2?c`9gC|krn@{dD(Wug1R zXOy=z|lK#$*PUM<2j(d;;F8bi2kd#VBS>9%bO4lzQ00XpDOahoPGb{52b1 z+~DdCg;v_&eNGRNCm(=|KQcmVgBdo4?u)$VjXcxQ`8oc}VI~A*m);Fw^7rXDg@3b{ zsRM_Wt{mO|&aI5TFT)Q#naPvPde(HTNIR0u#C#@y&B~`Ur5WNGOQ!bBgs|lwJ#QxV PHP~k`f}*|m>vZbhTi11S delta 2324 zcmai!4NOy46vywo@+!KnD=MXhQXZvXsajD{9#Wv4h!qf<%@4Mz0iEBQ!MPc9l@ik} 
zY-0$XZ9%eSgo;_5nqs#^<44d;Go5p8aR#SC@ncq9?vTsiX_W5opBJk`GEv+Wx=Kk^+RN6(yiY!B+y3CZYiqNBI8#zV;aXPO>3 z)REIjxz-mrsyv*a*i9XVI-_dibU1J$qz)Rh8-!}g$$ z#*U3mG!o~5h_5xDvGw_cbmS;uYHSWw8sXj8WeV?Hue|*#Y>l&mNu#CTmO!4SlXq-p&A2QG#wqEE>ChWjD8BiUS6&){m0Be&c?GJqg`)Na zuY4L}bXgm9GsLPFz4BDPfCimXm;;~brijj&UU@f+iO&KlUMUb56+cDXHOnifN0=X3 zCG101=km&%-~jsU37kz!m?E0Wyz(Gq;}k#Qy%O(0yneON295d=bkrr-uiqI}8&8~2 zZq`FXqW;+uTpZC5Cw~_gp*I{=o*?M#(~Laj;+7$`4zXY@x4e z;FILFd^N;qp}S%s&v={$7lA58!(m!VA*~w*D^d>f)mW1yHn^xwUU`=l3&bOTHE@C3 zMUv%On46jab*2VxGNf9tw>J5m@Z+XWhrqnlX70F`YN01%@JUJVVQLn*(=s{TnPw66 za4ziwzF0OKTT@3Y`%*?gaJVgUwAmu~p*?*x_)U6nrYoVvoWVDKGF#~1D!6a9LNI;U zv=H80O@uHXer_KtKK0jqPX_ion6M>5n_w@{_ly#bMD^a z=j2BAWl~Y|1kVupXEX__`jCv8T4pXG`>|h z2G_^?pY@%WiBwb}k)k!|+BzX;vd?&BTNH6`ole$8qoxO=*F=%@?EkU>#Gb8u$m|iH z`mr@wvZlvIo@W|KNcww(yf>P6gwLqYniU9TMF}2LCu0{8@=#-@9W&!J zV>ht!;A{q_@)yRw-~fZOC73P9B&<9*b7O8pX5s*Y*9OeJ$j%Z1%d?Yd<5 z*pH>}Ec9j@sQw(NCK{;J1y(H1E-1xP*##@GG<88emX0pifhE)h$FZopp%;tX4XPXi zE$;>^mfCJ8#p3UV6<9jEp&knf!VWCvARNcy3PLZIsvxLr2HFyYQd_)OHGq&XylzX# zYeMDfVNB1v$B;0$AuB^x$w9#&2l-r>q_W32s$DAbe7C~EFL?qzn|t7}&8P^Y{H6zb zZ5D+&3Qsr>hTJs843tghp(xj=aHDLv0B@k&jM8)wno%A@8N3Jw(HllN=MwZHl|({5 z`V};J21NnN8JA%q$|WfEA()4<4yC_e`cW2K;Z!_{vJxirmsWUaf`7A0UeW763&axR*?dQ1HXe4vfiTaW2}}Rfx%Da-(mwYF9#TA zOgk61fjTU)nYS6^hzGaVU^>C!7%G Date: Fri, 31 Jul 2020 16:46:12 +1000 Subject: [PATCH 25/30] FPU: Implement ftdiv and ftsqrt Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 ++ fpu.vhdl | 68 ++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index e821469..bd7f0f3 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -441,6 +441,8 @@ architecture behaviour of decode1 is 2#000000000# => (FPU, OP_FPOP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 0/0=fcmpu 2#000000001# => (FPU, OP_FPOP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', 
NONE, '0', '0'), -- 1/0=fcmpo 2#000000010# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 2/0=mcrfs + 2#000000100# => (FPU, OP_FPOP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 4/0=ftdiv + 2#000000101# => (FPU, OP_FPOP, NONE, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 5/0=ftsqrt 2#011000001# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 1/6=mtfsb1 2#011000010# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 2/6=mtfsb0 2#011000100# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/6=mtfsfi diff --git a/fpu.vhdl b/fpu.vhdl index 244454e..90670e9 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -37,7 +37,7 @@ architecture behaviour of fpu is type state_t is (IDLE, DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, - DO_FMR, DO_FMRG, DO_FCMP, + DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT, DO_FCFID, DO_FCTI, DO_FRSP, DO_FRI, DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, @@ -51,6 +51,7 @@ architecture behaviour of fpu is DIV_2, DIV_3, DIV_4, DIV_5, DIV_6, FRE_1, RSQRT_1, + FTDIV_1, SQRT_1, SQRT_2, SQRT_3, SQRT_4, SQRT_5, SQRT_6, SQRT_7, SQRT_8, SQRT_9, SQRT_10, SQRT_11, SQRT_12, @@ -105,6 +106,7 @@ architecture behaviour of fpu is is_sqrt : std_ulogic; first : std_ulogic; count : unsigned(1 downto 0); + doing_ftdiv : std_ulogic_vector(1 downto 0); end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -642,6 +644,8 @@ begin v.is_multiply := '0'; v.is_sqrt := '0'; v.add_bsmall := '0'; + v.doing_ftdiv := "00"; + adec := decode_dp(e_in.fra, int_input); bdec := decode_dp(e_in.frb, int_input); cdec := decode_dp(e_in.frc, 
int_input); @@ -659,8 +663,16 @@ begin r_lo_nz <= or (r.r(30 downto 2)); if r.single_prec = '0' then - max_exp := to_signed(1023, EXP_BITS); - min_exp := to_signed(-1022, EXP_BITS); + if r.doing_ftdiv(1) = '0' then + max_exp := to_signed(1023, EXP_BITS); + else + max_exp := to_signed(1020, EXP_BITS); + end if; + if r.doing_ftdiv(0) = '0' then + min_exp := to_signed(-1022, EXP_BITS); + else + min_exp := to_signed(-1021, EXP_BITS); + end if; bias_exp := to_signed(1536, EXP_BITS); else max_exp := to_signed(127, EXP_BITS); @@ -728,7 +740,13 @@ begin if e_in.valid = '1' then case e_in.insn(5 downto 1) is when "00000" => - if e_in.insn(7) = '1' then + if e_in.insn(8) = '1' then + if e_in.insn(6) = '0' then + v.state := DO_FTDIV; + else + v.state := DO_FTSQRT; + end if; + elsif e_in.insn(7) = '1' then v.state := DO_MCRFS; else v.state := DO_FCMP; @@ -804,6 +822,38 @@ begin v.instr_done := '1'; v.state := IDLE; + when DO_FTDIV => + v.instr_done := '1'; + v.state := IDLE; + v.cr_result := "0000"; + if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or + (r.b.class = FINITE and r.b.mantissa(54) = '0') then + v.cr_result(2) := '1'; + end if; + if r.a.class = NAN or r.a.class = INFINITY or + r.b.class = NAN or r.b.class = ZERO or r.b.class = INFINITY or + (r.a.class = FINITE and r.a.exponent <= to_signed(-970, EXP_BITS)) then + v.cr_result(1) := '1'; + else + v.doing_ftdiv := "11"; + v.first := '1'; + v.state := FTDIV_1; + v.instr_done := '0'; + end if; + + when DO_FTSQRT => + v.instr_done := '1'; + v.state := IDLE; + v.cr_result := "0000"; + if r.b.class = ZERO or r.b.class = INFINITY or + (r.b.class = FINITE and r.b.mantissa(54) = '0') then + v.cr_result(2) := '1'; + end if; + if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO + or r.b.negative = '1' or r.b.exponent <= to_signed(-970, EXP_BITS) then + v.cr_result(1) := '1'; + end if; + when DO_FCMP => -- fcmp[uo] v.instr_done := '1'; @@ -1587,6 +1637,16 @@ begin v.shift := to_signed(1, EXP_BITS); 
v.state := NORMALIZE; + when FTDIV_1 => + v.cr_result(1) := exp_tiny or exp_huge; + if exp_tiny = '1' or exp_huge = '1' or r.a.class = ZERO or r.first = '0' then + v.instr_done := '1'; + v.state := IDLE; + else + v.shift := r.a.exponent; + v.doing_ftdiv := "10"; + end if; + when RSQRT_1 => opsel_r <= RES_MISC; misc_sel <= "0111"; From dc1544db691a82dccdd6f6d43224d833dd4a1433 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 1 Aug 2020 19:17:36 +1000 Subject: [PATCH 26/30] FPU: Implement floating multiply-add instructions This implements fmadd, fmsub, fnmadd, fnmsub and their single-precision counterparts. The single-precision versions operate the same as the double-precision versions until the final rounding and overflow/underflow steps. This adds an S register to store the low bits of the product. S shifts into R on left shifts, and can be negated, but doesn't do any other arithmetic. This adds a test for the double-precision versions of these instructions. Signed-off-by: Paul Mackerras --- decode1.vhdl | 8 ++ fpu.vhdl | 244 +++++++++++++++++++++++++++++++++++-- tests/fpu/fpu.c | 71 +++++++++++ tests/test_fpu.bin | Bin 29632 -> 30416 bytes tests/test_fpu.console_out | 1 + 5 files changed, 314 insertions(+), 10 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index bd7f0f3..5d6a557 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -423,6 +423,10 @@ architecture behaviour of decode1 is 2#11000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fres 2#11001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fmuls 2#11010# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- frsqrtes + 2#11100# => (FPU, OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fmsubs + 2#11101# => (FPU, 
OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fmadds + 2#11110# => (FPU, OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fnmsubs + 2#11111# => (FPU, OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- fnmadds others => illegal_inst ); @@ -485,6 +489,10 @@ architecture behaviour of decode1 is 2#1000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fre 2#1001# => (FPU, OP_FPOP, FRA, NONE, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fmul 2#1010# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- frsqrte + 2#1100# => (FPU, OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fmsub + 2#1101# => (FPU, OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fmadd + 2#1110# => (FPU, OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fnmsub + 2#1111# => (FPU, OP_FPOP, FRA, FRB, FRC, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- fnmadd others => illegal_inst ); diff --git a/fpu.vhdl b/fpu.vhdl index 90670e9..5e30386 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -40,13 +40,15 @@ architecture behaviour of fpu is DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT, DO_FCFID, DO_FCTI, DO_FRSP, DO_FRI, - DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, + DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT, DO_FMADD, DO_FRE, DO_FRSQRTE, DO_FSEL, FRI_1, ADD_SHIFT, ADD_2, ADD_3, CMP_1, CMP_2, MULT_1, + FMADD_1, FMADD_2, FMADD_3, + FMADD_4, FMADD_5, FMADD_6, LOOKUP, DIV_2, DIV_3, DIV_4, DIV_5, DIV_6, FRE_1, @@ 
-82,6 +84,7 @@ architecture behaviour of fpu is b : fpu_reg_type; c : fpu_reg_type; r : std_ulogic_vector(63 downto 0); -- 10.54 format + s : std_ulogic_vector(55 downto 0); -- extended fraction x : std_ulogic; p : std_ulogic_vector(63 downto 0); -- 8.56 format y : std_ulogic_vector(63 downto 0); -- 8.56 format @@ -101,6 +104,7 @@ architecture behaviour of fpu is round_mode : std_ulogic_vector(2 downto 0); is_subtract : std_ulogic; exp_cmp : std_ulogic; + madd_cmp : std_ulogic; add_bsmall : std_ulogic; is_multiply : std_ulogic; is_sqrt : std_ulogic; @@ -117,6 +121,7 @@ architecture behaviour of fpu is signal opsel_a : std_ulogic_vector(1 downto 0); signal opsel_b : std_ulogic_vector(1 downto 0); signal opsel_r : std_ulogic_vector(1 downto 0); + signal opsel_s : std_ulogic_vector(1 downto 0); signal opsel_ainv : std_ulogic; signal opsel_amask : std_ulogic; signal opsel_binv : std_ulogic; @@ -127,6 +132,7 @@ architecture behaviour of fpu is signal lost_bits : std_ulogic; signal r_hi_nz : std_ulogic; signal r_lo_nz : std_ulogic; + signal s_nz : std_ulogic; signal misc_sel : std_ulogic_vector(3 downto 0); signal f_to_multiply : MultiplyInputType; signal multiply_to_f : MultiplyOutputType; @@ -152,6 +158,11 @@ architecture behaviour of fpu is constant RES_MULT : std_ulogic_vector(1 downto 0) := "10"; constant RES_MISC : std_ulogic_vector(1 downto 0) := "11"; + constant S_ZERO : std_ulogic_vector(1 downto 0) := "00"; + constant S_NEG : std_ulogic_vector(1 downto 0) := "01"; + constant S_SHIFT : std_ulogic_vector(1 downto 0) := "10"; + constant S_MULT : std_ulogic_vector(1 downto 0) := "11"; + -- msel values constant MUL1_A : std_ulogic_vector(1 downto 0) := "00"; constant MUL1_B : std_ulogic_vector(1 downto 0) := "01"; @@ -163,9 +174,10 @@ architecture behaviour of fpu is constant MUL2_P : std_ulogic_vector(1 downto 0) := "10"; constant MUL2_R : std_ulogic_vector(1 downto 0) := "11"; - constant MULADD_ZERO : std_ulogic_vector(1 downto 0) := "00"; + constant MULADD_ZERO : 
std_ulogic_vector(1 downto 0) := "00"; constant MULADD_CONST : std_ulogic_vector(1 downto 0) := "01"; constant MULADD_A : std_ulogic_vector(1 downto 0) := "10"; + constant MULADD_RS : std_ulogic_vector(1 downto 0) := "11"; -- Inverse lookup table, indexed by the top 8 fraction bits -- The first 256 entries are the reciprocal (1/x) lookup table, @@ -597,20 +609,22 @@ begin variable need_check : std_ulogic; variable msb : std_ulogic; variable is_add : std_ulogic; - variable qnan_result : std_ulogic; variable longmask : std_ulogic; variable set_a : std_ulogic; variable set_b : std_ulogic; variable set_c : std_ulogic; - variable px_nz : std_ulogic; - variable maddend : std_ulogic_vector(127 downto 0); variable set_y : std_ulogic; + variable set_s : std_ulogic; + variable qnan_result : std_ulogic; + variable px_nz : std_ulogic; variable pcmpb_eq : std_ulogic; variable pcmpb_lt : std_ulogic; variable pshift : std_ulogic; variable renorm_sqrt : std_ulogic; variable sqrt_exp : signed(EXP_BITS-1 downto 0); variable shiftin : std_ulogic; + variable mulexp : signed(EXP_BITS-1 downto 0); + variable maddend : std_ulogic_vector(127 downto 0); begin v := r; illegal := '0'; @@ -657,10 +671,15 @@ begin if adec.exponent > bdec.exponent then v.exp_cmp := '1'; end if; + v.madd_cmp := '0'; + if (adec.exponent + cdec.exponent + 1) >= bdec.exponent then + v.madd_cmp := '1'; + end if; end if; r_hi_nz <= or (r.r(55 downto 31)); r_lo_nz <= or (r.r(30 downto 2)); + s_nz <= or (r.s); if r.single_prec = '0' then if r.doing_ftdiv(1) = '0' then @@ -711,6 +730,7 @@ begin opsel_b <= BIN_ZERO; opsel_binv <= '0'; opsel_r <= RES_SUM; + opsel_s <= S_ZERO; carry_in <= '0'; misc_sel <= "0000"; fpscr_mask := (others => '1'); @@ -725,6 +745,7 @@ begin set_a := '0'; set_b := '0'; set_c := '0'; + set_s := '0'; f_to_multiply.is_32bit <= '0'; f_to_multiply.valid <= '0'; msel_1 <= MUL1_A; @@ -802,12 +823,15 @@ begin when "11010" => v.is_sqrt := '1'; v.state := DO_FRSQRTE; + when "11100" | "11101" | "11110" | 
"11111" => + v.state := DO_FMADD; when others => illegal := '1'; end case; end if; v.x := '0'; v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX); + set_s := '1'; when DO_MCRFS => j := to_integer(unsigned(insn_bfa(r.insn))); @@ -1416,6 +1440,99 @@ begin arith_done := '1'; end case; + when DO_FMADD => + -- fmadd, fmsub, fnmadd, fnmsub + opsel_a <= AIN_A; + v.result_sign := r.a.negative; + v.result_class := r.a.class; + v.result_exp := r.a.exponent; + v.fpscr(FPSCR_FR) := '0'; + v.fpscr(FPSCR_FI) := '0'; + is_add := r.a.negative xor r.c.negative xor r.b.negative xor r.insn(1); + if r.a.class = FINITE and r.c.class = FINITE and + (r.b.class = FINITE or r.b.class = ZERO) then + v.is_subtract := not is_add; + mulexp := r.a.exponent + r.c.exponent; + v.result_exp := mulexp; + opsel_a <= AIN_B; + -- Make sure A and C are normalized + if r.a.mantissa(54) = '0' then + opsel_a <= AIN_A; + v.state := RENORM_A; + elsif r.c.mantissa(54) = '0' then + opsel_a <= AIN_C; + v.state := RENORM_C; + elsif r.b.class = ZERO then + -- no addend, degenerates to multiply + v.result_sign := r.a.negative xor r.c.negative xor r.insn(2); + f_to_multiply.valid <= '1'; + v.is_multiply := '1'; + v.state := MULT_1; + elsif r.madd_cmp = '0' then + -- addend is bigger, do multiply first + v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2)); + f_to_multiply.valid <= '1'; + v.state := FMADD_1; + else + -- product is bigger, shift B right and use it as the + -- addend to the multiplier + v.shift := r.b.exponent - mulexp + to_signed(64, EXP_BITS); + -- for subtract, multiplier does B - A * C + v.result_sign := not (r.a.negative xor r.c.negative xor r.insn(2) xor is_add); + v.result_exp := r.b.exponent; + v.state := FMADD_2; + end if; + else + if (r.a.class = NAN and r.a.mantissa(53) = '0') or + (r.b.class = NAN and r.b.mantissa(53) = '0') or + (r.c.class = NAN and r.c.mantissa(53) = '0') then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + if r.a.class = NAN then 
+ -- nothing to do, result is A + elsif r.b.class = NAN then + -- result is B + v.result_class := NAN; + v.result_sign := r.b.negative; + opsel_a <= AIN_B; + elsif r.c.class = NAN then + -- result is C + v.result_class := NAN; + v.result_sign := r.c.negative; + opsel_a <= AIN_C; + elsif (r.a.class = ZERO and r.c.class = INFINITY) or + (r.a.class = INFINITY and r.c.class = ZERO) then + -- invalid operation, construct QNaN + v.fpscr(FPSCR_VXIMZ) := '1'; + qnan_result := '1'; + elsif r.a.class = INFINITY or r.c.class = INFINITY then + if r.b.class = INFINITY and is_add = '0' then + -- invalid operation, construct QNaN + v.fpscr(FPSCR_VXISI) := '1'; + qnan_result := '1'; + else + -- result is infinity + v.result_class := INFINITY; + v.result_sign := r.a.negative xor r.c.negative xor r.insn(2); + end if; + else + -- Here A is zero, C is zero, or B is infinity + -- Result is +/-B in all of those cases + v.result_class := r.b.class; + v.result_exp := r.b.exponent; + if v.result_class /= ZERO or is_add = '1' then + v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2)); + else + -- have to be careful about rule for 0 - 0 result sign + v.result_sign := (r.round_mode(1) and r.round_mode(0)) xor r.insn(2); + end if; + opsel_a <= AIN_B; + end if; + arith_done := '1'; + end if; + when RENORM_A => renormalize := '1'; v.state := RENORM_A2; @@ -1426,8 +1543,16 @@ begin if r.insn(4) = '1' then opsel_a <= AIN_C; if r.c.mantissa(54) = '1' then - v.first := '1'; - v.state := MULT_1; + if r.insn(3) = '0' or r.b.class = ZERO then + v.first := '1'; + v.state := MULT_1; + else + v.madd_cmp := '0'; + if new_exp + 1 >= r.b.exponent then + v.madd_cmp := '1'; + end if; + v.state := DO_FMADD; + end if; else v.state := RENORM_C; end if; @@ -1462,11 +1587,20 @@ begin when RENORM_C2 => set_c := '1'; v.result_exp := new_exp; - v.first := '1'; - v.state := MULT_1; + if r.insn(3) = '0' or r.b.class = ZERO then + v.first := '1'; + v.state := MULT_1; + else + v.madd_cmp := '0'; + if new_exp + 
1 >= r.b.exponent then + v.madd_cmp := '1'; + end if; + v.state := DO_FMADD; + end if; when ADD_SHIFT => opsel_r <= RES_SHIFT; + v.x := s_nz; set_x := '1'; longmask := '0'; v.state := ADD_2; @@ -1545,6 +1679,78 @@ begin v.state := FINISH; end if; + when FMADD_1 => + -- Addend is bigger here + v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2)); + -- note v.shift is at most -2 here + v.shift := r.result_exp - r.b.exponent; + opsel_r <= RES_MULT; + opsel_s <= S_MULT; + set_s := '1'; + f_to_multiply.valid <= r.first; + if multiply_to_f.valid = '1' then + v.state := ADD_SHIFT; + end if; + + when FMADD_2 => + -- Product is potentially bigger here + set_s := '1'; + opsel_s <= S_SHIFT; + v.shift := r.shift - to_signed(64, EXP_BITS); + v.state := FMADD_3; + + when FMADD_3 => + opsel_r <= RES_SHIFT; + v.first := '1'; + v.state := FMADD_4; + + when FMADD_4 => + msel_add <= MULADD_RS; + f_to_multiply.valid <= r.first; + msel_inv <= r.is_subtract; + opsel_r <= RES_MULT; + opsel_s <= S_MULT; + set_s := '1'; + v.shift := to_signed(56, EXP_BITS); + if multiply_to_f.valid = '1' then + if multiply_to_f.result(121) = '1' then + v.state := FMADD_5; + else + v.state := FMADD_6; + end if; + end if; + + when FMADD_5 => + -- negate R:S:X + v.result_sign := not r.result_sign; + opsel_ainv <= '1'; + carry_in <= not (s_nz or r.x); + opsel_s <= S_NEG; + set_s := '1'; + v.shift := to_signed(56, EXP_BITS); + v.state := FMADD_6; + + when FMADD_6 => + if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then + if s_nz = '0' then + -- must be a subtraction, and r.x must be zero + v.result_class := ZERO; + v.result_sign := r.round_mode(1) and r.round_mode(0); + arith_done := '1'; + else + -- R is all zeroes but there are non-zero bits in S + -- so shift them into R and set S to 0 + opsel_r <= RES_SHIFT; + set_s := '1'; + -- stay in state FMADD_6 + end if; + elsif r.r(56 downto 54) = "001" then + v.state := FINISH; + else + renormalize := '1'; + v.state := NORMALIZE; + end if; + 
when LOOKUP => opsel_a <= AIN_B; -- wait one cycle for inverse_table[B] lookup @@ -2097,6 +2303,9 @@ begin when MULADD_A => -- addend is A in 16.112 format maddend(121 downto 58) := r.a.mantissa; + when MULADD_RS => + -- addend is concatenation of R and S in 16.112 format + maddend := "000000" & r.r & r.s & "00"; when others => end case; if msel_inv = '1' then @@ -2167,7 +2376,7 @@ begin end if; in_b <= in_b0; if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then - shift_res := shifter_64(r.r & shiftin & 55x"00000000000000", + shift_res := shifter_64(r.r & (shiftin or r.s(55)) & r.s(54 downto 0), std_ulogic_vector(r.shift(6 downto 0))); else shift_res := (others => '0'); @@ -2230,6 +2439,21 @@ begin result <= misc; end case; v.r := result; + if set_s = '1' then + case opsel_s is + when S_NEG => + v.s := std_ulogic_vector(unsigned(not r.s) + (not r.x)); + when S_MULT => + v.s := multiply_to_f.result(57 downto 2); + when S_SHIFT => + v.s := shift_res(63 downto 8); + if shift_res(7 downto 0) /= x"00" then + v.x := '1'; + end if; + when others => + v.s := (others => '0'); + end case; + end if; if set_a = '1' then v.a.exponent := new_exp; diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index b72b01e..52f21d0 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1338,6 +1338,76 @@ int fpu_test_22(void) return trapit(0, test22); } +struct fmavals { + unsigned long ra; + unsigned long rc; + unsigned long rb; + unsigned long fma; + unsigned long fms; + unsigned long nfma; + unsigned long nfms; +} fmavals[] = { + { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x8000000000000000, 0x8000000000000000 }, + { 0x0000000000000000, 0x7ffc000000000000, 0x0000000000000000, + 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000, 0x7ffc000000000000 }, + { 0x0000000000000000, 0x7ffc000000000000, 0x7ffb000000000000, + 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000, 0x7ffb000000000000 }, 
+ { 0x7ffa000000000000, 0x7ffc000000000000, 0x7ffb000000000000, + 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000, 0x7ffa000000000000 }, + { 0x3ff0000000000000, 0x8000000000000000, 0x678123456789abcd, + 0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd }, + { 0x3ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, + 0x678123456789abcd, 0xe78123456789abcd, 0xe78123456789abcd, 0x678123456789abcd }, + { 0x7ff0000000000000, 0xbff0000000000000, 0x678123456789abcd, + 0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000 }, + { 0x7ff0000000000000, 0x0000000000000000, 0x678123456789abcd, + 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000 }, + { 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000020000000, + 0x4000000010000000, 0xbe80000000000000, 0xc000000010000000, 0x3e80000000000000 }, + { 0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000000, + 0x4000000000000001, 0x3cc0000000000000, 0xc000000000000001, 0xbcc0000000000000 }, + { 0x3ff0000000000003, 0x3ff0000000000002, 0x3ff0000000000000, + 0x4000000000000002, 0x3cd4000000000002, 0xc000000000000002, 0xbcd4000000000002 }, + { 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb020000000000000, + 0xaca765753908cd20, 0x3030000000000000, 0x2ca765753908cd20, 0xb030000000000000 }, + { 0x3006a09e667f3bcd, 0x4006a09e667f3bcd, 0xb020000000000000, + 0x2cd3b3efbf5e2229, 0x3030000000000000, 0xacd3b3efbf5e2229, 0xb030000000000000 }, + { 0x3006a09e667f3bcc, 0x4006a09e667f3bcd, 0xb060003450000000, + 0xb05e0068a0000000, 0x3061003450000000, 0x305e0068a0000000, 0xb061003450000000 }, +}; + +int test23(long arg) +{ + long i; + unsigned long results[4]; + struct fmavals *vp = fmavals; + + set_fpscr(FPS_RN_NEAR); + for (i = 0; i < sizeof(fmavals) / sizeof(fmavals[0]); ++i, ++vp) { + asm("lfd 6,0(%0); lfd 7,8(%0); lfd 8,16(%0); fmadd 0,6,7,8; stfd 0,0(%1)" + : : "b" (&vp->ra), "b" (results) : "memory"); + asm("fmsub 1,6,7,8; fnmadd 2,6,7,8; fnmsub 
3,6,7,8; stfd 1,8(%0); stfd 2,16(%0); stfd 3,24(%0)" + : : "b" (results) : "memory"); + if (results[0] != vp->fma || results[1] != vp->fms || + results[2] != vp->nfma || results[3] != vp->nfms) { + print_hex(i, 2, " "); + print_hex(results[0], 16, " "); + print_hex(results[1], 16, " "); + print_hex(results[2], 16, " "); + print_hex(results[3], 16, "\r\n"); + return i + 1; + } + } + return 0; +} + +int fpu_test_23(void) +{ + enable_fp(); + return trapit(0, test23); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -1385,6 +1455,7 @@ int main(void) do_test(20, fpu_test_20); do_test(21, fpu_test_21); do_test(22, fpu_test_22); + do_test(23, fpu_test_23); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index e3783415a22e72bae0935c2959ab8c4a5bf5a316..50831cb20db40951fb7b0e508677cfddb62e46c6 100755 GIT binary patch delta 3557 zcmai04Ny~87QXM2#HcY13gIut2Lw!_vZ;SOOn`(cA|N6~mvvhjbgZadhdQ=J7fII1 zcC@1uIo&RGsUlMiW37XYGm9Oy(b+oaYM0%rMFGXuuHY)AKWkB`Z_j;6)UrC>nalT` z?|%2*^X|Ft5&w0r{T7!@h&`Q<+_IA+a@Qj+CvW7I5#mH`HEOF-TfN-<#`ffC++VjZ znM$`#KUnk0Gp@qAgfv3>{*F$8C+-4*MB|mT&YPl`$KrsNl@{=xLhtF$_F-csxXyus zMi(LbIuAW&tiQmGV{diV2gd??kT7So7fx%;bnpYXttmqA<%Snf+L{bI#!VdQ&BEk} zv#<_Mk4w@1oJ4XCit&S@=(qOW59Frtf;wWRdH{zA$?OZdiz2c{mDDv^W%yJ}>R@NY zGR_M$a%$vJ1IalqEzy)IikmVC@mm{UUZj=oH^7^bs}#iP5k9&B`=V07ppE8AAYHqF z%ZF|F_ixaxT|fGST}-iDS?m$!u>fjgRNPH?KW47HW06NV49~`<0w1g5YGF$3TzSr~J%UNn zT4Gg{l)$mrY`JcsM|cm8qtPCRVi6R?Wy=qhc!ZIX*dC|i%Aq?BGm4ajpT8~U;RD)O zU3E)5!g{zGm&$E}?Ybn$&=ql!$R=^`!%>}2R<9#D;56!>cD!!P3mBY_W~WJ<*Wh4c z9K9F^7si*;h0!omZyq~8j^ymd6O_i2t1Gc^YxOC#awRnBv#Bf&Zs^Ul;$6^ADAKxD zE)&LMp?Ff(D~e4v4RH%-te%ifXUD?H32SI;1I*-e$1p~NqT${64Q#`)31-?B1sMq$)L94B2{qKc64Z%WHkO>2O&heZCh-J| zrW(wc%pT#@J7OM3T*9Fd%c=@VyigwrrAe`{)sRmscfln?7BlOU%#qCe$H8Bk_rt4( zIH*j@rzL-ZlSxyV_imDzUK@ugOn_}ksgyK8b+R<@ak80C9tW3`-(t2^6EW|xlI>{X zBE<4#h_`4s(JVX~7%X(Qo zUknJn#$Y@@7+ZpII>y2y^=Bt*LM>DVCzhe`8Ir@}=EmUWo5f9bBgU3sT#2y-V<8xq 
zi=#K$IU-sGVoKwI;}fCB6fu7@-Xaa*2LFj?v+#Q{@4~(NE(=zB6cHz|il>bZT_y+P?~ipd)7Yaw z&hspKGH{F#o8%krQQ|sdq;=AzaXcpXV?1Mpz5#Y#)l$NZnA|xyJ_#tNg2j_Rer$Fd zzkPakzd_H_vYR2T^OWqu(e!9`L-5{|pSZ)YJ!KkxGe1k2MqBs#?xwsR=4uQSsCQ7j z`k%dF%BuE0Hj|ewt|%_GHRCw(CTu^9Iy+c79#9J&IJqT!xvZeK(k9e%`ajwr!I(;wlYj6#I%KwEx%&PD80U# z)KbpnIWLOWP+(1i9ZjV8f+!X;0UOio*lST6KxB<+#n@l@QWTq+fQ@P8jC6=1&k!5a zs<5k2GcW-g!&>Y*)Vc_P)oJmxy#s2}QfSvjIEGMs3HsCYwCWP5XX&ZC6H*YGJD~uf zy%W|T40b{d0^bD(5OTWUbA*a6=trpU0<~37{auiP(AxzC2%2tKgCKN64T7^94j^pq zhR+dNx}hJTqZ`yVJtdbR1;Kb3_luK<5mJcE7OQAQ+I$(-*kUs}kdiB+s6$|l$9@wv zC2AE+m_5=#c5%{qHwUs*2wVKuE0rKfmbo~(EW zX-*I1rY9(lAhlnGwMefaJ$Ds$AXUln@V~ZNSh^Uaw8S`L{Z={`MGK*KFuy} z?I<;I-~){gl=1`g)8gD!-{%oAogWCQZ@JVI{v}DTU#u4~2gTtB6O{qY3 z=GMa_WH-fw+BwwNMAq`L*P|u_lRZz~i_8h@?GtqoNgqK66CjM(ki*8%64G^H6wn8F%!yRjcZZ8BRP8`Ivw ze(t}bSm`Ts+*QHbub6y=g}2r4C1-+vtPBT#vBEWkzfd1-q4-G{@`lFm1?N8v?*@It z=DA2{;7K@uzgMN^KKtR86>}2*uwocq55^%}5wM5m4SYJxADS=u4Y=U&iNQ=CPK1G_ zQSj&Iqg^DlRA7t2pz}%iAO-daJRwrd3S-BP&3z32u-K0sn>!;=yn)OR2J}#z7@Fv} z1*_~282m@ryUjMBaBz{01W>*lKr6uV9zj>yF(oetjeG&3m0&&(Cp@mu=vti+M;%vcH8_P*4LyXHT!I&Y7 LTYUDH+!6l+$lio8 delta 2528 zcmah}4Qx}_6~6DrHgW5@4vAyOG0wA-II&666e@U$^J6;@j6;K$W)!h5a-fBDP{t<4 zLYCPU@+PPRNqQ;`wk!?IR2gCu#caw@hms0KEt{%Lq8%l~2_;>VKr7RBG=!47bDtUg z84_2z@B8j|zIWcae{b|-Vdx{lPKdjj5P$2)BL7~*=gDq=Dl|Z4u9<~YZMqm}vsP$>7$T&0=AOGL%lnA0>zGS{v+{RA zU)B~O22{UHUz$&R!*YsawT$hnC1l9e3~Ti+YR-pW>3^t-v_z#p&cX>p85Cq21szmp zuNM?>5TBhep1q#FVudop9_3tPRJsi}4J%>Npb?Dlv0;sJ*QTiSf~@T_YN+p72pQ{@ z!KSG6K4j#qd^x9CIrMZ?TFVE}ouk2$ewVXG+5Nq!^jCPov=T&Y~3SscyP2qKK_L#t8X`yNZge=a*N)z!N#Tw zmKY#m?cfz`vN>ra8+O|B;J7W12D)IxcF2_6c3f(|b1nS0`Twh7XMR0x*27=&-{h}% z6gaWi!~?hO&t;D_=X?&j5WM> zXCYjUvwDQEJU%nWqx@EoU?3UpEGJv!+3N3Um3TGO_joz3Kqm|@bqR07-KFs`X*(`y z$UZmUrF7(cI}d)!FPy7{YsI+D5AwW#o*Et_RkH8fo^;H!TFz4~$0gDld4OlJWJQxe zLUO_VSloH=lmfl^D7;sC`hnH`Ir;GFdeHN*>Ry!dJfym{X!?3}Hz4QnFN8lpMmhdl zVRiX(I`I2=NBN$#u&zzspy@ew_4;dR>7D7<`J1BrQ)6s%l@a44X6v~*o4OK8Tp$q_ zoS)@F^u9(rI{$Z{ic!yywV&VD%2B(E@9HO6!%x3p4Q^a3?&n7w7dwNWj{6_Kh(;(k 
zh=L=2^vP|4Mnp3|9W=Ii`sHj<%eh(p^^;(F44PJi)%Qp7%PXE1z7zXEBBH-lB>uf< zsFYl*W44)-3etIEJ^4jCa{6ZaUIlSfd|gW;?h4I)%`N+M-x$MLX}wft zXG}!ojcc9Qdr>Rk0yd7_*iWK%g%G$?X{Eu-U~!ev9b?dlFf|4tmxXGsKp%o`9L^y~ z7`N{&BEWS?H;8s6n_i4lM{I0U?Cq1oR;U5^xToJpl=X{sb%_yp;e;wS^`V zP=jEcfEEPz1cVUUCZG?YdjifO3{Jp8wMlh24QDY4mKsydAkwx;#>{*c*hK9AKy4Rl zyST7Q?IEuUby|0ZClt^U?}W<3C!0ji8PipNdqmq!_Q_zR18|ldpVWifgdJgH1f5R-& z{Yanu2()gC>Lk*)QgjsQv(sEvevUK=!|ptlQAJ2(2Kvxig;Y0-)TA!Jl7l$x2el{P zv>(~$vtRA#Dvb7`My|slPmP)nr*PN}2R(JAC5v!XKKW{e`1ZR|D@Khwd0U6Q8MTdY z*VCYkAv*(4d5g@W$l5+-tY1LGYORQCNWZTot5hC5Bf(0u{^Dnh?dKV=@x=U``e4|b zDGb89-lf6~5b82>$bT3s;VH0j*DKhKw-{@WKT&r}6L;5L65!PzIpTr9Ep2?m=8mlY E0#G$xNB{r; diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index 9b97cb5..ed759a5 100644 --- a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -20,3 +20,4 @@ test 19:PASS test 20:PASS test 21:PASS test 22:PASS +test 23:PASS From b0b3c0dc70855480fef3278925521bee7dd7de34 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 1 Sep 2020 11:13:17 +1000 Subject: [PATCH 27/30] FPU: Add comments specifying the expectation of r.shift for each state Signed-off-by: Paul Mackerras --- fpu.vhdl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fpu.vhdl b/fpu.vhdl index 5e30386..ec18953 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -1599,6 +1599,7 @@ begin end if; when ADD_SHIFT => + -- r.shift = - exponent difference opsel_r <= RES_SHIFT; v.x := s_nz; set_x := '1'; @@ -1619,6 +1620,7 @@ begin when ADD_3 => -- check for overflow or negative result (can't get both) + -- r.shift = -1 if r.r(63) = '1' then -- result is opposite sign to expected v.result_sign := not r.result_sign; @@ -1694,12 +1696,14 @@ begin when FMADD_2 => -- Product is potentially bigger here + -- r.shift = addend exp - product exp + 64 set_s := '1'; opsel_s <= S_SHIFT; v.shift := r.shift - to_signed(64, EXP_BITS); v.state := FMADD_3; when FMADD_3 => + -- r.shift = addend exp - product exp opsel_r <= RES_SHIFT; v.first 
:= '1'; v.state := FMADD_4; @@ -1731,6 +1735,7 @@ begin v.state := FMADD_6; when FMADD_6 => + -- r.shift = 56 (or 0, but only if r is now nonzero) if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then if s_nz = '0' then -- must be a subtraction, and r.x must be zero @@ -1877,6 +1882,7 @@ begin when SQRT_2 => -- shift R right one place -- not expecting multiplier result yet + -- r.shift = -1 opsel_r <= RES_SHIFT; v.first := '1'; v.state := SQRT_3; @@ -2012,12 +2018,14 @@ begin v.state := FINISH; when INT_SHIFT => + -- r.shift = b.exponent - 52 opsel_r <= RES_SHIFT; set_x := '1'; v.state := INT_ROUND; v.shift := to_signed(-2, EXP_BITS); when INT_ROUND => + -- r.shift = -2 opsel_r <= RES_SHIFT; round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign); v.fpscr(FPSCR_FR downto FPSCR_FI) := round; @@ -2030,6 +2038,7 @@ begin end if; when INT_ISHIFT => + -- r.shift = b.exponent - 54; opsel_r <= RES_SHIFT; v.state := INT_FINAL; @@ -2087,6 +2096,7 @@ begin arith_done := '1'; when FRI_1 => + -- r.shift = b.exponent - 52 opsel_r <= RES_SHIFT; set_x := '1'; v.shift := to_signed(-2, EXP_BITS); @@ -2114,6 +2124,7 @@ begin when NORMALIZE => -- Shift so we have 9 leading zeroes (we know R is non-zero) + -- r.shift = clz(r.r) - 9 opsel_r <= RES_SHIFT; set_x := '1'; if exp_tiny = '1' then @@ -2127,6 +2138,7 @@ begin end if; when ROUND_UFLOW => + -- r.shift = - amount by which exponent underflows v.tiny := '1'; if r.fpscr(FPSCR_UE) = '0' then -- disabled underflow exception case @@ -2204,6 +2216,7 @@ begin when ROUNDING_2 => -- Check for overflow during rounding + -- r.shift = -1 v.x := '0'; if r.r(55) = '1' then opsel_r <= RES_SHIFT; @@ -2221,6 +2234,7 @@ begin end if; when ROUNDING_3 => + -- r.shift = clz(r.r) - 9 mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec); if mant_nz = '0' then v.result_class := ZERO; @@ -2242,6 +2256,7 @@ begin end if; when DENORM => + -- r.shift = result_exp - -1022 opsel_r <= RES_SHIFT; arith_done := '1'; From 
fb5115c9445fe03142946b195028d90a00c6acee Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 1 Sep 2020 15:09:17 +1000 Subject: [PATCH 28/30] FPU: Decide on A input selection a cycle earlier This moves opsel_a into the reg_type record, meaning that the A multiplexer input now needs to be decided a cycle earlier. This helps timing by eliminating the combinatorial path from r.state and other things to opsel_a and thence to in_a and result. This means that some things now take an extra cycle, in particular some of the exception cases such as when one or both operands are NaNs. The NaN handling has been moved out to its own state, which simplifies the logic for exception cases in other places. Additions or subtractions where FRB's exponent is smaller than FRA's will also take an extra cycle. Signed-off-by: Paul Mackerras --- fpu.vhdl | 330 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 177 insertions(+), 153 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index ec18953..9c18e47 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -44,7 +44,7 @@ architecture behaviour of fpu is DO_FRE, DO_FRSQRTE, DO_FSEL, FRI_1, - ADD_SHIFT, ADD_2, ADD_3, + ADD_1, ADD_SHIFT, ADD_2, ADD_3, CMP_1, CMP_2, MULT_1, FMADD_1, FMADD_2, FMADD_3, @@ -65,7 +65,8 @@ architecture behaviour of fpu is DENORM, RENORM_A, RENORM_A2, RENORM_B, RENORM_B2, - RENORM_C, RENORM_C2); + RENORM_C, RENORM_C2, + NAN_RESULT, EXC_RESULT); type reg_type is record state : state_t; @@ -111,6 +112,12 @@ architecture behaviour of fpu is first : std_ulogic; count : unsigned(1 downto 0); doing_ftdiv : std_ulogic_vector(1 downto 0); + opsel_a : std_ulogic_vector(1 downto 0); + use_a : std_ulogic; + use_b : std_ulogic; + use_c : std_ulogic; + invalid : std_ulogic; + negate : std_ulogic; end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -118,7 +125,6 @@ architecture behaviour of fpu is signal r, rin : reg_type; signal fp_result : std_ulogic_vector(63 downto 0); - signal 
opsel_a : std_ulogic_vector(1 downto 0); signal opsel_b : std_ulogic_vector(1 downto 0); signal opsel_r : std_ulogic_vector(1 downto 0); signal opsel_s : std_ulogic_vector(1 downto 0); @@ -724,7 +730,7 @@ begin v.update_fprf := '0'; v.shift := to_signed(0, EXP_BITS); v.first := '0'; - opsel_a <= AIN_R; + v.opsel_a := AIN_R; opsel_ainv <= '0'; opsel_amask <= '0'; opsel_b <= BIN_ZERO; @@ -758,6 +764,11 @@ begin shiftin := '0'; case r.state is when IDLE => + v.use_a := '0'; + v.use_b := '0'; + v.use_c := '0'; + v.invalid := '0'; + v.negate := '0'; if e_in.valid = '1' then case e_in.insn(5 downto 1) is when "00000" => @@ -770,6 +781,7 @@ begin elsif e_in.insn(7) = '1' then v.state := DO_MCRFS; else + v.opsel_a := AIN_B; v.state := DO_FCMP; end if; when "00110" => @@ -789,14 +801,17 @@ begin v.state := DO_MTFSF; end if; when "01000" => + v.opsel_a := AIN_B; if e_in.insn(9 downto 8) /= "11" then v.state := DO_FMR; else v.state := DO_FRI; end if; when "01100" => + v.opsel_a := AIN_B; v.state := DO_FRSP; when "01110" => + v.opsel_a := AIN_B; if int_input = '1' then -- fcfid[u][s] v.state := DO_FCFID; @@ -805,25 +820,45 @@ begin end if; when "01111" => v.round_mode := "001"; + v.opsel_a := AIN_B; v.state := DO_FCTI; when "10010" => + v.opsel_a := AIN_A; + if v.b.mantissa(54) = '0' and v.a.mantissa(54) = '1' then + v.opsel_a := AIN_B; + end if; v.state := DO_FDIV; when "10100" | "10101" => + v.opsel_a := AIN_A; v.state := DO_FADD; when "10110" => v.is_sqrt := '1'; + v.opsel_a := AIN_B; v.state := DO_FSQRT; when "10111" => v.state := DO_FSEL; when "11000" => + v.opsel_a := AIN_B; v.state := DO_FRE; when "11001" => v.is_multiply := '1'; + v.opsel_a := AIN_A; + if v.c.mantissa(54) = '0' and v.a.mantissa(54) = '1' then + v.opsel_a := AIN_C; + end if; v.state := DO_FMUL; when "11010" => v.is_sqrt := '1'; + v.opsel_a := AIN_B; v.state := DO_FRSQRTE; when "11100" | "11101" | "11110" | "11111" => + if v.a.mantissa(54) = '0' then + v.opsel_a := AIN_A; + elsif v.c.mantissa(54) = '0' 
then + v.opsel_a := AIN_C; + else + v.opsel_a := AIN_B; + end if; v.state := DO_FMADD; when others => illegal := '1'; @@ -880,11 +915,10 @@ begin when DO_FCMP => -- fcmp[uo] + -- r.opsel_a = AIN_B v.instr_done := '1'; v.state := IDLE; update_fx := '1'; - opsel_a <= AIN_B; - opsel_r <= RES_SUM; v.result_exp := r.b.exponent; if (r.a.class = NAN and r.a.mantissa(53) = '0') or (r.b.class = NAN and r.b.mantissa(53) = '0') then @@ -930,6 +964,7 @@ begin -- Prepare to subtract mantissas, put B in R v.cr_result := "0000"; v.instr_done := '0'; + v.opsel_a := AIN_A; v.state := CMP_1; end if; v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result; @@ -1017,7 +1052,7 @@ begin v.state := IDLE; when DO_FMR => - opsel_a <= AIN_B; + -- r.opsel_a = AIN_B v.result_class := r.b.class; v.result_exp := r.b.exponent; v.quieten_nan := '0'; @@ -1037,7 +1072,7 @@ begin v.state := IDLE; when DO_FRI => -- fri[nzpm] - opsel_a <= AIN_B; + -- r.opsel_a = AIN_B v.result_class := r.b.class; v.result_sign := r.b.negative; v.result_exp := r.b.exponent; @@ -1062,7 +1097,7 @@ begin end if; when DO_FRSP => - opsel_a <= AIN_B; + -- r.opsel_a = AIN_B, r.shift = 0 v.result_class := r.b.class; v.result_sign := r.b.negative; v.result_exp := r.b.exponent; @@ -1092,7 +1127,7 @@ begin -- instr bit 9: 1=dword 0=word -- instr bit 8: 1=unsigned 0=signed -- instr bit 1: 1=round to zero 0=use fpscr[RN] - opsel_a <= AIN_B; + -- r.opsel_a = AIN_B v.result_class := r.b.class; v.result_sign := r.b.negative; v.result_exp := r.b.exponent; @@ -1130,8 +1165,8 @@ begin end case; when DO_FCFID => + -- r.opsel_a = AIN_B v.result_sign := '0'; - opsel_a <= AIN_B; if r.insn(8) = '0' and r.b.negative = '1' then -- fcfid[s] with negative operand, set R = -B opsel_ainv <= '1'; @@ -1150,16 +1185,19 @@ begin when DO_FADD => -- fadd[s] and fsub[s] - opsel_a <= AIN_A; + -- r.opsel_a = AIN_A v.result_sign := r.a.negative; v.result_class := r.a.class; v.result_exp := r.a.exponent; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; + v.use_a 
:= '1'; + v.use_b := '1'; is_add := r.a.negative xor r.b.negative xor r.insn(1); if r.a.class = FINITE and r.b.class = FINITE then v.is_subtract := not is_add; v.add_bsmall := r.exp_cmp; + v.opsel_a := AIN_B; if r.exp_cmp = '0' then v.shift := r.a.exponent - r.b.exponent; v.result_sign := r.b.negative xnor r.insn(1); @@ -1169,77 +1207,55 @@ begin v.state := ADD_SHIFT; end if; else - opsel_a <= AIN_B; - v.shift := r.b.exponent - r.a.exponent; - v.result_exp := r.b.exponent; - v.state := ADD_SHIFT; + v.state := ADD_1; end if; else - if (r.a.class = NAN and r.a.mantissa(53) = '0') or - (r.b.class = NAN and r.b.mantissa(53) = '0') then - -- Signalling NAN - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; - if r.a.class = NAN then - -- nothing to do, result is A - elsif r.b.class = NAN then - v.result_class := NAN; - v.result_sign := r.b.negative; - opsel_a <= AIN_B; + if r.a.class = NAN or r.b.class = NAN then + v.state := NAN_RESULT; elsif r.a.class = INFINITY and r.b.class = INFINITY and is_add = '0' then -- invalid operation, construct QNaN v.fpscr(FPSCR_VXISI) := '1'; qnan_result := '1'; + arith_done := '1'; elsif r.a.class = ZERO and r.b.class = ZERO and is_add = '0' then -- return -0 for rounding to -infinity v.result_sign := r.round_mode(1) and r.round_mode(0); + arith_done := '1'; elsif r.a.class = INFINITY or r.b.class = ZERO then - -- nothing to do, result is A + -- result is A + v.opsel_a := AIN_A; + v.state := EXC_RESULT; else -- result is +/- B - v.result_sign := r.b.negative xnor r.insn(1); - v.result_class := r.b.class; - v.result_exp := r.b.exponent; - opsel_a <= AIN_B; + v.opsel_a := AIN_B; + v.negate := not r.insn(1); + v.state := EXC_RESULT; end if; - arith_done := '1'; end if; when DO_FMUL => -- fmul[s] - opsel_a <= AIN_A; - v.result_sign := r.a.negative; + -- r.opsel_a = AIN_A unless C is denorm and A isn't + v.result_sign := r.a.negative xor r.c.negative; v.result_class := r.a.class; - v.result_exp := r.a.exponent; v.fpscr(FPSCR_FR) := 
'0'; v.fpscr(FPSCR_FI) := '0'; + v.use_a := '1'; + v.use_c := '1'; if r.a.class = FINITE and r.c.class = FINITE then - v.result_sign := r.a.negative xor r.c.negative; v.result_exp := r.a.exponent + r.c.exponent; -- Renormalize denorm operands if r.a.mantissa(54) = '0' then v.state := RENORM_A; elsif r.c.mantissa(54) = '0' then - opsel_a <= AIN_C; v.state := RENORM_C; else f_to_multiply.valid <= '1'; v.state := MULT_1; end if; else - if (r.a.class = NAN and r.a.mantissa(53) = '0') or - (r.c.class = NAN and r.c.mantissa(53) = '0') then - -- Signalling NAN - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; - if r.a.class = NAN then - -- result is A - elsif r.c.class = NAN then - v.result_class := NAN; - v.result_sign := r.c.negative; - opsel_a <= AIN_C; + if r.a.class = NAN or r.c.class = NAN then + v.state := NAN_RESULT; elsif (r.a.class = INFINITY and r.c.class = ZERO) or (r.a.class = ZERO and r.c.class = INFINITY) then -- invalid operation, construct QNaN @@ -1247,22 +1263,22 @@ begin qnan_result := '1'; elsif r.a.class = ZERO or r.a.class = INFINITY then -- result is +/- A - v.result_sign := r.a.negative xor r.c.negative; + arith_done := '1'; else -- r.c.class is ZERO or INFINITY - v.result_class := r.c.class; - v.result_sign := r.a.negative xor r.c.negative; + v.opsel_a := AIN_C; + v.negate := r.a.negative; + v.state := EXC_RESULT; end if; - arith_done := '1'; end if; when DO_FDIV => - opsel_a <= AIN_A; - v.result_sign := r.a.negative; + -- r.opsel_a = AIN_A unless B is denorm and A isn't v.result_class := r.a.class; - v.result_exp := r.a.exponent; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; + v.use_a := '1'; + v.use_b := '1'; v.result_sign := r.a.negative xor r.b.negative; v.result_exp := r.a.exponent - r.b.exponent; v.count := "00"; @@ -1271,26 +1287,14 @@ begin if r.a.mantissa(54) = '0' then v.state := RENORM_A; elsif r.b.mantissa(54) = '0' then - opsel_a <= AIN_B; v.state := RENORM_B; else v.first := '1'; v.state := DIV_2; end if; else - if 
(r.a.class = NAN and r.a.mantissa(53) = '0') or - (r.b.class = NAN and r.b.mantissa(53) = '0') then - -- Signalling NAN - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; - if r.a.class = NAN then - -- result is A - v.result_sign := r.a.negative; - elsif r.b.class = NAN then - v.result_class := NAN; - v.result_sign := r.b.negative; - opsel_a <= AIN_B; + if r.a.class = NAN or r.b.class = NAN then + v.state := NAN_RESULT; elsif r.b.class = INFINITY then if r.a.class = INFINITY then v.fpscr(FPSCR_VXIDI) := '1'; @@ -1298,6 +1302,7 @@ begin else v.result_class := ZERO; end if; + arith_done := '1'; elsif r.b.class = ZERO then if r.a.class = ZERO then v.fpscr(FPSCR_VXZDZ) := '1'; @@ -1308,46 +1313,36 @@ begin end if; v.result_class := INFINITY; end if; - -- else r.b.class = FINITE, result_class = r.a.class + arith_done := '1'; + else -- r.b.class = FINITE, result_class = r.a.class + arith_done := '1'; end if; - arith_done := '1'; end if; when DO_FSEL => - opsel_a <= AIN_A; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then - v.result_sign := r.c.negative; - v.result_exp := r.c.exponent; - v.result_class := r.c.class; - opsel_a <= AIN_C; + v.opsel_a := AIN_C; else - v.result_sign := r.b.negative; - v.result_exp := r.b.exponent; - v.result_class := r.b.class; - opsel_a <= AIN_B; + v.opsel_a := AIN_B; end if; v.quieten_nan := '0'; - arith_done := '1'; + v.state := EXC_RESULT; when DO_FSQRT => - opsel_a <= AIN_B; + -- r.opsel_a = AIN_B v.result_class := r.b.class; v.result_sign := r.b.negative; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - if r.b.class = NAN and r.b.mantissa(53) = '0' then - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; + v.use_b := '1'; case r.b.class is when FINITE => v.result_exp := r.b.exponent; if r.b.negative = '1' then v.fpscr(FPSCR_VXSQRT) := '1'; qnan_result := '1'; - arith_done := '1'; elsif r.b.mantissa(54) = '0' then v.state := RENORM_B; elsif 
r.b.exponent(0) = '0' then @@ -1356,7 +1351,9 @@ begin v.shift := to_signed(1, EXP_BITS); v.state := RENORM_B2; end if; - when NAN | ZERO => + when NAN => + v.state := NAN_RESULT; + when ZERO => -- result is B arith_done := '1'; when INFINITY => @@ -1369,15 +1366,12 @@ begin end case; when DO_FRE => - opsel_a <= AIN_B; + -- r.opsel_a = AIN_B v.result_class := r.b.class; v.result_sign := r.b.negative; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - if r.b.class = NAN and r.b.mantissa(53) = '0' then - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; + v.use_b := '1'; case r.b.class is when FINITE => v.result_exp := - r.b.exponent; @@ -1387,8 +1381,7 @@ begin v.state := FRE_1; end if; when NAN => - -- result is B - arith_done := '1'; + v.state := NAN_RESULT; when INFINITY => v.result_class := ZERO; arith_done := '1'; @@ -1399,15 +1392,12 @@ begin end case; when DO_FRSQRTE => - opsel_a <= AIN_B; + -- r.opsel_a = AIN_B v.result_class := r.b.class; v.result_sign := r.b.negative; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - if r.b.class = NAN and r.b.mantissa(53) = '0' then - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; + v.use_b := '1'; v.shift := to_signed(1, EXP_BITS); case r.b.class is when FINITE => @@ -1415,7 +1405,6 @@ begin if r.b.negative = '1' then v.fpscr(FPSCR_VXSQRT) := '1'; qnan_result := '1'; - arith_done := '1'; elsif r.b.mantissa(54) = '0' then v.state := RENORM_B; elsif r.b.exponent(0) = '0' then @@ -1424,8 +1413,7 @@ begin v.state := RENORM_B2; end if; when NAN => - -- result is B - arith_done := '1'; + v.state := NAN_RESULT; when INFINITY => if r.b.negative = '1' then v.fpscr(FPSCR_VXSQRT) := '1'; @@ -1442,25 +1430,26 @@ begin when DO_FMADD => -- fmadd, fmsub, fnmadd, fnmsub - opsel_a <= AIN_A; + -- r.opsel_a = AIN_A if A is denorm, else AIN_C if C is denorm, + -- else AIN_B v.result_sign := r.a.negative; v.result_class := r.a.class; v.result_exp := r.a.exponent; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; + 
v.use_a := '1'; + v.use_b := '1'; + v.use_c := '1'; is_add := r.a.negative xor r.c.negative xor r.b.negative xor r.insn(1); if r.a.class = FINITE and r.c.class = FINITE and (r.b.class = FINITE or r.b.class = ZERO) then v.is_subtract := not is_add; mulexp := r.a.exponent + r.c.exponent; v.result_exp := mulexp; - opsel_a <= AIN_B; -- Make sure A and C are normalized if r.a.mantissa(54) = '0' then - opsel_a <= AIN_A; v.state := RENORM_A; elsif r.c.mantissa(54) = '0' then - opsel_a <= AIN_C; v.state := RENORM_C; elsif r.b.class = ZERO then -- no addend, degenerates to multiply @@ -1483,25 +1472,8 @@ begin v.state := FMADD_2; end if; else - if (r.a.class = NAN and r.a.mantissa(53) = '0') or - (r.b.class = NAN and r.b.mantissa(53) = '0') or - (r.c.class = NAN and r.c.mantissa(53) = '0') then - -- Signalling NAN - v.fpscr(FPSCR_VXSNAN) := '1'; - invalid := '1'; - end if; - if r.a.class = NAN then - -- nothing to do, result is A - elsif r.b.class = NAN then - -- result is B - v.result_class := NAN; - v.result_sign := r.b.negative; - opsel_a <= AIN_B; - elsif r.c.class = NAN then - -- result is C - v.result_class := NAN; - v.result_sign := r.c.negative; - opsel_a <= AIN_C; + if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then + v.state := NAN_RESULT; elsif (r.a.class = ZERO and r.c.class = INFINITY) or (r.a.class = INFINITY and r.c.class = ZERO) then -- invalid operation, construct QNaN @@ -1516,32 +1488,36 @@ begin -- result is infinity v.result_class := INFINITY; v.result_sign := r.a.negative xor r.c.negative xor r.insn(2); + arith_done := '1'; end if; else -- Here A is zero, C is zero, or B is infinity -- Result is +/-B in all of those cases - v.result_class := r.b.class; - v.result_exp := r.b.exponent; - if v.result_class /= ZERO or is_add = '1' then - v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2)); + v.opsel_a := AIN_B; + if r.b.class /= ZERO or is_add = '1' then + v.negate := not (r.insn(1) xor r.insn(2)); else -- have to be careful about 
rule for 0 - 0 result sign - v.result_sign := (r.round_mode(1) and r.round_mode(0)) xor r.insn(2); + v.negate := r.b.negative xor (r.round_mode(1) and r.round_mode(0)) xor r.insn(2); end if; - opsel_a <= AIN_B; + v.state := EXC_RESULT; end if; - arith_done := '1'; end if; when RENORM_A => renormalize := '1'; v.state := RENORM_A2; + if r.insn(4) = '1' then + v.opsel_a := AIN_C; + else + v.opsel_a := AIN_B; + end if; when RENORM_A2 => + -- r.opsel_a = AIN_C for fmul/fmadd, AIN_B for fdiv set_a := '1'; v.result_exp := new_exp; if r.insn(4) = '1' then - opsel_a <= AIN_C; if r.c.mantissa(54) = '1' then if r.insn(3) = '0' or r.b.class = ZERO then v.first := '1'; @@ -1551,18 +1527,18 @@ begin if new_exp + 1 >= r.b.exponent then v.madd_cmp := '1'; end if; + v.opsel_a := AIN_B; v.state := DO_FMADD; end if; else v.state := RENORM_C; end if; else - opsel_a <= AIN_B; - if r.b.mantissa(54) = '1' then - v.first := '1'; - v.state := DIV_2; - else - v.state := RENORM_B; + if r.b.mantissa(54) = '1' then + v.first := '1'; + v.state := DIV_2; + else + v.state := RENORM_B; end if; end if; @@ -1578,6 +1554,7 @@ begin else v.result_exp := new_exp; end if; + v.opsel_a := AIN_B; v.state := LOOKUP; when RENORM_C => @@ -1595,23 +1572,31 @@ begin if new_exp + 1 >= r.b.exponent then v.madd_cmp := '1'; end if; + v.opsel_a := AIN_B; v.state := DO_FMADD; end if; + when ADD_1 => + -- transferring B to R + v.shift := r.b.exponent - r.a.exponent; + v.result_exp := r.b.exponent; + v.state := ADD_SHIFT; + when ADD_SHIFT => -- r.shift = - exponent difference opsel_r <= RES_SHIFT; v.x := s_nz; set_x := '1'; longmask := '0'; - v.state := ADD_2; - - when ADD_2 => if r.add_bsmall = '1' then - opsel_a <= AIN_A; + v.opsel_a := AIN_A; else - opsel_a <= AIN_B; + v.opsel_a := AIN_B; end if; + v.state := ADD_2; + + when ADD_2 => + -- r.opsel_a = AIN_A if r.add_bsmall = 1 else AIN_B opsel_b <= BIN_R; opsel_binv <= r.is_subtract; carry_in <= r.is_subtract and not r.x; @@ -1655,7 +1640,7 @@ begin end if; when 
CMP_1 => - opsel_a <= AIN_A; + -- r.opsel_a = AIN_A opsel_b <= BIN_R; opsel_binv <= '1'; carry_in <= '1'; @@ -1696,7 +1681,7 @@ begin when FMADD_2 => -- Product is potentially bigger here - -- r.shift = addend exp - product exp + 64 + -- r.shift = addend exp - product exp + 64, r.r = r.b.mantissa set_s := '1'; opsel_s <= S_SHIFT; v.shift := r.shift - to_signed(64, EXP_BITS); @@ -1757,7 +1742,7 @@ begin end if; when LOOKUP => - opsel_a <= AIN_B; + -- r.opsel_a = AIN_B -- wait one cycle for inverse_table[B] lookup v.first := '1'; if r.insn(4) = '0' then @@ -2260,6 +2245,41 @@ begin opsel_r <= RES_SHIFT; arith_done := '1'; + when NAN_RESULT => + if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(53) = '0') or + (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(53) = '0') or + (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(53) = '0') then + -- Signalling NAN + v.fpscr(FPSCR_VXSNAN) := '1'; + invalid := '1'; + end if; + if r.use_a = '1' and r.a.class = NAN then + v.opsel_a := AIN_A; + elsif r.use_b = '1' and r.b.class = NAN then + v.opsel_a := AIN_B; + elsif r.use_c = '1' and r.c.class = NAN then + v.opsel_a := AIN_C; + end if; + v.state := EXC_RESULT; + + when EXC_RESULT => + -- r.opsel_a = AIN_A, AIN_B or AIN_C according to which input is the result + case r.opsel_a is + when AIN_B => + v.result_sign := r.b.negative xor r.negate; + v.result_exp := r.b.exponent; + v.result_class := r.b.class; + when AIN_C => + v.result_sign := r.c.negative xor r.negate; + v.result_exp := r.c.exponent; + v.result_class := r.c.class; + when others => + v.result_sign := r.a.negative xor r.negate; + v.result_exp := r.a.exponent; + v.result_class := r.a.class; + end case; + arith_done := '1'; + end case; if zero_divide = '1' then @@ -2271,11 +2291,15 @@ begin v.result_sign := '0'; misc_sel <= "0001"; opsel_r <= RES_MISC; + arith_done := '1'; + end if; + if invalid = '1' then + v.invalid := '1'; end if; if arith_done = '1' then -- Enabled invalid exception doesn't write result or 
FPRF -- Neither does enabled zero-divide exception - if (invalid and r.fpscr(FPSCR_VE)) = '0' and + if (v.invalid and r.fpscr(FPSCR_VE)) = '0' and (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then v.writing_back := '1'; v.update_fprf := '1'; @@ -2355,7 +2379,7 @@ begin else mask := right_mask(unsigned(mshift(5 downto 0))); end if; - case opsel_a is + case r.opsel_a is when AIN_R => in_a0 := r.r; when AIN_A => From e1ca023bad2d11a9ae16da14b327a1329a6f50d9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 1 Sep 2020 15:28:19 +1000 Subject: [PATCH 29/30] FPU: Decide on mask length a cycle earlier This moves longmask into the reg_type record, meaning that it now needs to be decided a cycle earlier, in order to help timing. Signed-off-by: Paul Mackerras --- fpu.vhdl | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index 9c18e47..d79cec6 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -118,6 +118,7 @@ architecture behaviour of fpu is use_c : std_ulogic; invalid : std_ulogic; negate : std_ulogic; + longmask : std_ulogic; end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -615,7 +616,6 @@ begin variable need_check : std_ulogic; variable msb : std_ulogic; variable is_add : std_ulogic; - variable longmask : std_ulogic; variable set_a : std_ulogic; variable set_b : std_ulogic; variable set_c : std_ulogic; @@ -644,6 +644,7 @@ begin v.fe_mode := or (e_in.fe_mode); v.dest_fpr := e_in.frt; v.single_prec := e_in.single; + v.longmask := e_in.single; v.int_result := '0'; v.rc := e_in.rc; v.is_cmp := e_in.out_cr; @@ -747,7 +748,6 @@ begin renormalize := '0'; set_x := '0'; qnan_result := '0'; - longmask := r.single_prec; set_a := '0'; set_b := '0'; set_c := '0'; @@ -1204,6 +1204,7 @@ begin if r.a.exponent = r.b.exponent then v.state := ADD_2; else + v.longmask := '0'; v.state := ADD_SHIFT; end if; else @@ -1580,14 +1581,15 @@ begin -- transferring B to R v.shift := r.b.exponent - r.a.exponent; 
v.result_exp := r.b.exponent; + v.longmask := '0'; v.state := ADD_SHIFT; when ADD_SHIFT => - -- r.shift = - exponent difference + -- r.shift = - exponent difference, r.longmask = 0 opsel_r <= RES_SHIFT; v.x := s_nz; set_x := '1'; - longmask := '0'; + v.longmask := r.single_prec; if r.add_bsmall = '1' then v.opsel_a := AIN_A; else @@ -1676,6 +1678,7 @@ begin set_s := '1'; f_to_multiply.valid <= r.first; if multiply_to_f.valid = '1' then + v.longmask := '0'; v.state := ADD_SHIFT; end if; @@ -2367,7 +2370,7 @@ begin -- Data path. -- This has A and B input multiplexers, an adder, a shifter, -- count-leading-zeroes logic, and a result mux. - if longmask = '1' then + if r.longmask = '1' then mshift := r.shift + to_signed(-29, EXP_BITS); else mshift := r.shift; From 73f819301ba25ddc3855bba8e2f3334ca70b5aef Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 12 Sep 2020 20:13:24 +1000 Subject: [PATCH 30/30] FPU: Do masking after adder rather than on A input The masking enabled by opsel_amask is only used when rounding, to trim the rounded result to the required precision. We now do the masking after the adder rather than before (on the A input). This gives the same result and helps timing. The path from r.shift through the mask generator and adder to v.r was showing up as a critical path. 
Signed-off-by: Paul Mackerras --- fpu.vhdl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fpu.vhdl b/fpu.vhdl index d79cec6..023dbf2 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -130,7 +130,7 @@ architecture behaviour of fpu is signal opsel_r : std_ulogic_vector(1 downto 0); signal opsel_s : std_ulogic_vector(1 downto 0); signal opsel_ainv : std_ulogic; - signal opsel_amask : std_ulogic; + signal opsel_mask : std_ulogic; signal opsel_binv : std_ulogic; signal in_a : std_ulogic_vector(63 downto 0); signal in_b : std_ulogic_vector(63 downto 0); @@ -631,6 +631,7 @@ begin variable shiftin : std_ulogic; variable mulexp : signed(EXP_BITS-1 downto 0); variable maddend : std_ulogic_vector(127 downto 0); + variable sum : std_ulogic_vector(63 downto 0); begin v := r; illegal := '0'; @@ -733,7 +734,7 @@ begin v.first := '0'; v.opsel_a := AIN_R; opsel_ainv <= '0'; - opsel_amask <= '0'; + opsel_mask <= '0'; opsel_b <= BIN_ZERO; opsel_binv <= '0'; opsel_r <= RES_SUM; @@ -2176,7 +2177,7 @@ begin end if; when ROUNDING => - opsel_amask <= '1'; + opsel_mask <= '1'; round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign); v.fpscr(FPSCR_FR downto FPSCR_FI) := round; if round(1) = '1' then @@ -2398,9 +2399,6 @@ begin if opsel_ainv = '1' then in_a0 := not in_a0; end if; - if opsel_amask = '1' then - in_a0 := in_a0 and not mask; - end if; in_a <= in_a0; case opsel_b is when BIN_ZERO => @@ -2423,9 +2421,13 @@ begin else shift_res := (others => '0'); end if; + sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in); + if opsel_mask = '1' then + sum := sum and not mask; + end if; case opsel_r is when RES_SUM => - result <= std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in); + result <= sum; when RES_SHIFT => result <= shift_res; when RES_MULT =>