From 9a8a8e50f8e886a90315091fe8d9e584c8429493 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 2 Jul 2022 14:17:18 +1000 Subject: [PATCH] FPU: Add stage-2 stall ability to FPU This makes the FPU able to stall other units at execute stage 2 and be stalled by other units (specifically the LSU). This means that the completion and writeback for an instruction can now end up being deferred until the second cycle of a following instruction, i.e. the cycle when the state machine has gone through IDLE state into one of the DO_* states, which means we need to latch the destination FPR number, CR mask, etc. from the previous instruction so that we present the correct information to writeback. The advantage of this is that we can get rid of the in_progress signal from the LSU. Signed-off-by: Paul Mackerras --- common.vhdl | 5 +- core.vhdl | 1 + execute1.vhdl | 14 ++-- fpu.vhdl | 169 ++++++++++++++++++++++++++++++------------------ loadstore1.vhdl | 3 - 5 files changed, 118 insertions(+), 74 deletions(-) diff --git a/common.vhdl b/common.vhdl index ac733db..ea6a8d8 100644 --- a/common.vhdl +++ b/common.vhdl @@ -480,7 +480,6 @@ package common is type Loadstore1ToExecute1Type is record busy : std_ulogic; l2stall : std_ulogic; - in_progress : std_ulogic; end record; type Loadstore1ToDcacheType is record @@ -640,16 +639,18 @@ package common is frt : gspr_index_t; rc : std_ulogic; out_cr : std_ulogic; + stall : std_ulogic; end record; constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'), itag => instr_tag_init, insn => (others => '0'), fe_mode => "00", rc => '0', fra => (others => '0'), frb => (others => '0'), frc => (others => '0'), frt => (others => '0'), - single => '0', out_cr => '0'); + single => '0', out_cr => '0', stall => '0'); type FPUToExecute1Type is record busy : std_ulogic; + f2stall : std_ulogic; exception : std_ulogic; end record; constant FPUToExecute1Init : FPUToExecute1Type := (others => '0'); diff --git a/core.vhdl b/core.vhdl index 84604c6..23f7e82 100644 --- a/core.vhdl +++ b/core.vhdl @@ -384,6 +384,7 @@ begin port map ( clk => clk, rst => rst_fpu, + flush_in => flush, e_in => execute1_to_fpu, e_out => fpu_to_execute1, w_out => fpu_to_writeback diff --git a/execute1.vhdl b/execute1.vhdl index 75e8275..57f90b0 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -442,9 +442,9 @@ begin -- writeback, unless a pipeline flush has happened in the meantime. xerc_in <= ex1.xerc when ex1.xerc_valid = '1' else e_in.xerc; - with e_in.unit select busy_out <= - l_in.busy or l_in.in_progress or ex1.e.valid or ex1.busy or fp_in.busy when FPU, - l_in.busy or ex1.busy or fp_in.busy when others; + -- N.B. the busy signal from each source includes the + -- stage2 stall from that source in it. + busy_out <= l_in.busy or ex1.busy or fp_in.busy; valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt); @@ -1299,8 +1299,7 @@ begin end if; end if; - v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or - ex1.busy or fp_in.busy); + v.no_instr_avail := not (e_in.valid or l_in.busy or ex1.busy or fp_in.busy); go := valid_in and not exception; v.instr_dispatch := go; @@ -1436,7 +1435,7 @@ begin lv.is_32bit := e_in.is_32bit; lv.repeat := e_in.repeat; lv.second := e_in.second; - lv.e2stall := '0'; + lv.e2stall := fp_in.f2stall; -- Outputs to FPU fv.op := e_in.insn_type; @@ -1451,6 +1450,7 @@ begin fv.frt := e_in.write_reg; fv.rc := e_in.rc; fv.out_cr := e_in.output_cr; + fv.stall := l_in.l2stall; -- Update registers ex1in <= v; @@ -1472,7 +1472,7 @@ begin ctrl.cfar when SPRSEL_CFAR, assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; - stage2_stall <= l_in.l2stall or fp_in.busy; + stage2_stall <= l_in.l2stall or fp_in.f2stall; -- Second execute stage control execute2_1: process(all) diff --git a/fpu.vhdl b/fpu.vhdl index fad09cc..a20a7a0 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -15,6 +15,7 @@ entity fpu is port ( clk : in std_ulogic; rst : in std_ulogic; + flush_in : in std_ulogic; e_in : in Execute1ToFPUType; e_out : out FPUToExecute1Type; @@ -35,7 +36,7 @@ architecture behaviour of fpu is mantissa : std_ulogic_vector(63 downto 0); -- 10.54 format end record; - type state_t is (IDLE, + type state_t is (IDLE, DO_ILLEGAL, DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT, DO_FCFID, DO_FCTI, @@ -71,7 +72,9 @@ architecture behaviour of fpu is type reg_type is record state : state_t; busy : std_ulogic; + f2stall : std_ulogic; instr_done : std_ulogic; + complete : std_ulogic; do_intr : std_ulogic; illegal : std_ulogic; op : insn_type_t; @@ -83,7 +86,9 @@ architecture behaviour of fpu is rc : std_ulogic; is_cmp : std_ulogic; single_prec : std_ulogic; + sp_result : std_ulogic; fpscr : std_ulogic_vector(31 downto 0); + comm_fpscr : std_ulogic_vector(31 downto 0); -- committed FPSCR value a : fpu_reg_type; b : fpu_reg_type; c : fpu_reg_type; @@ -96,13 +101,17 @@ architecture behaviour of fpu is result_class : fp_number_class; result_exp : signed(EXP_BITS-1 downto 0); shift : signed(EXP_BITS-1 downto 0); - writing_back : std_ulogic; + writing_fpr : std_ulogic; + write_reg : gspr_index_t; + complete_tag : instr_tag_t; + writing_cr : std_ulogic; int_result : std_ulogic; cr_result : std_ulogic_vector(3 downto 0); cr_mask : std_ulogic_vector(7 downto 0); old_exc : std_ulogic_vector(4 downto 0); update_fprf : std_ulogic; quieten_nan : std_ulogic; + nsnan_result : std_ulogic; tiny : std_ulogic; denorm : std_ulogic; round_mode : std_ulogic_vector(2 downto 0); @@ -542,17 +551,30 @@ begin fpu_0: process(clk) begin if rising_edge(clk) then - if rst = '1' then + if rst = '1' or flush_in = '1' then r.state <= IDLE; r.busy <= '0'; + r.f2stall <= '0'; r.instr_done <= '0'; + r.complete <= '0'; + r.illegal <= '0'; r.do_intr <= '0'; + r.writing_fpr <= '0'; + r.writing_cr <= '0'; r.fpscr <= (others => '0'); - r.writing_back <= '0'; - r.dest_fpr <= (others =>'0'); + r.write_reg <= (others =>'0'); + r.complete_tag.valid <= '0'; r.cr_mask <= (others =>'0'); r.cr_result <= (others =>'0'); r.instr_tag.valid <= '0'; + if rst = '1' then + r.fpscr <= (others => '0'); + r.comm_fpscr <= (others => '0'); + elsif r.do_intr = '0' then + -- flush_in = 1 and not due to us generating an interrupt, + -- roll back to committed fpscr + r.fpscr <= r.comm_fpscr; + end if; else assert not (r.state /= IDLE and e_in.valid = '1') severity failure; r <= rin; @@ -577,14 +599,19 @@ begin end process; e_out.busy <= r.busy; + e_out.f2stall <= r.f2stall; e_out.exception <= r.fpscr(FPSCR_FEX); - w_out.valid <= r.instr_done and not r.do_intr; - w_out.instr_tag <= r.instr_tag; - w_out.write_enable <= r.writing_back; - w_out.write_reg <= r.dest_fpr; + -- Note that the cycle where r.complete = 1 for an instruction can be as + -- late as the second cycle of the following instruction (i.e. in the state + -- following IDLE state). Hence it is important that none of the fields of + -- r that are used below are modified in IDLE state. + w_out.valid <= r.complete; + w_out.instr_tag <= r.complete_tag; + w_out.write_enable <= r.writing_fpr and r.complete; + w_out.write_reg <= r.write_reg; w_out.write_data <= fp_result; - w_out.write_cr_enable <= r.instr_done and (r.rc or r.is_cmp); + w_out.write_cr_enable <= r.writing_cr and r.complete; w_out.write_cr_mask <= r.cr_mask; w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result; @@ -599,7 +626,6 @@ begin variable bdec : fpu_reg_type; variable cdec : fpu_reg_type; variable fpscr_mask : std_ulogic_vector(31 downto 0); - variable illegal : std_ulogic; variable j, k : integer; variable flm : std_ulogic_vector(7 downto 0); variable int_input : std_ulogic; @@ -644,12 +670,22 @@ begin variable maddend : std_ulogic_vector(127 downto 0); variable sum : std_ulogic_vector(63 downto 0); variable round_inc : std_ulogic_vector(63 downto 0); + variable int_result : std_ulogic; + variable illegal : std_ulogic; begin v := r; - illegal := '0'; - v.busy := '0'; + v.complete := '0'; + v.do_intr := '0'; int_input := '0'; + if r.complete = '1' or r.do_intr = '1' then + v.instr_done := '0'; + v.writing_fpr := '0'; + v.writing_cr := '0'; + v.comm_fpscr := r.fpscr; + v.illegal := '0'; + end if; + -- capture incoming instruction if e_in.valid = '1' then v.insn := e_in.insn; @@ -660,14 +696,8 @@ begin v.dest_fpr := e_in.frt; v.single_prec := e_in.single; v.longmask := e_in.single; - v.int_result := '0'; v.rc := e_in.rc; v.is_cmp := e_in.out_cr; - if e_in.out_cr = '0' then - v.cr_mask := num_to_fxm(1); - else - v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn)))); - end if; int_input := '0'; if e_in.op = OP_FPOP_I then int_input := '1'; @@ -741,8 +771,6 @@ begin pcmpb_lt := '1'; end if; - v.writing_back := '0'; - v.instr_done := '0'; v.update_fprf := '0'; v.shift := to_signed(0, EXP_BITS); v.first := '0'; @@ -777,6 +805,8 @@ begin pshift := '0'; renorm_sqrt := '0'; shiftin := '0'; + int_result := '0'; + illegal := '0'; case r.state is when IDLE => v.use_a := '0'; @@ -785,6 +815,7 @@ begin v.invalid := '0'; v.negate := '0'; if e_in.valid = '1' then + v.busy := '1'; case e_in.insn(5 downto 1) is when "00000" => if e_in.insn(8) = '1' then @@ -876,13 +907,17 @@ begin end if; v.state := DO_FMADD; when others => - illegal := '1'; + v.state := DO_ILLEGAL; end case; end if; v.x := '0'; v.old_exc := r.fpscr(FPSCR_VX downto FPSCR_XX); set_s := '1'; + when DO_ILLEGAL => + illegal := '1'; + v.instr_done := '1'; + when DO_MCRFS => j := to_integer(unsigned(insn_bfa(r.insn))); for i in 0 to 7 loop @@ -894,11 +929,9 @@ begin end loop; v.fpscr := r.fpscr and (fpscr_mask or x"6007F8FF"); v.instr_done := '1'; - v.state := IDLE; when DO_FTDIV => v.instr_done := '1'; - v.state := IDLE; v.cr_result := "0000"; if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or (r.b.class = FINITE and r.b.mantissa(53) = '0') then @@ -917,7 +950,6 @@ begin when DO_FTSQRT => v.instr_done := '1'; - v.state := IDLE; v.cr_result := "0000"; if r.b.class = ZERO or r.b.class = INFINITY or (r.b.class = FINITE and r.b.mantissa(53) = '0') then @@ -932,7 +964,6 @@ begin -- fcmp[uo] -- r.opsel_a = AIN_B v.instr_done := '1'; - v.state := IDLE; update_fx := '1'; v.result_exp := r.b.exponent; if (r.a.class = NAN and r.a.mantissa(53) = '0') or @@ -993,7 +1024,6 @@ begin end if; end loop; v.instr_done := '1'; - v.state := IDLE; when DO_MTFSFI => -- mtfsfi @@ -1007,20 +1037,17 @@ begin end loop; end if; v.instr_done := '1'; - v.state := IDLE; when DO_FMRG => -- fmrgew, fmrgow opsel_r <= RES_MISC; misc_sel <= "01" & r.insn(8) & '0'; - v.int_result := '1'; - v.writing_back := '1'; + int_result := '1'; + v.writing_fpr := '1'; v.instr_done := '1'; - v.state := IDLE; when DO_MFFS => - v.int_result := '1'; - v.writing_back := '1'; + v.writing_fpr := '1'; opsel_r <= RES_MISC; case r.insn(20 downto 16) is when "00000" => @@ -1044,10 +1071,11 @@ begin -- mffsl fpscr_mask := x"0007F0FF"; when others => - illegal := '1'; + v.illegal := '1'; + v.writing_fpr := '0'; end case; + int_result := '1'; v.instr_done := '1'; - v.state := IDLE; when DO_MTFSF => if r.insn(25) = '1' then @@ -1064,7 +1092,6 @@ begin end if; end loop; v.instr_done := '1'; - v.state := IDLE; when DO_FMR => -- r.opsel_a = AIN_B @@ -1082,9 +1109,8 @@ begin else v.result_sign := r.a.negative; -- fcpsgn end if; - v.writing_back := '1'; + v.writing_fpr := '1'; v.instr_done := '1'; - v.state := IDLE; when DO_FRI => -- fri[nzpm] -- r.opsel_a = AIN_B @@ -1153,7 +1179,7 @@ begin invalid := '1'; end if; - v.int_result := '1'; + int_result := '1'; case r.b.class is when ZERO => arith_done := '1'; @@ -1671,7 +1697,6 @@ begin end if; v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result; v.instr_done := '1'; - v.state := IDLE; when MULT_1 => f_to_multiply.valid <= r.first; @@ -1849,7 +1874,6 @@ begin v.cr_result(1) := exp_tiny or exp_huge; if exp_tiny = '1' or exp_huge = '1' or r.a.class = ZERO or r.first = '0' then v.instr_done := '1'; - v.state := IDLE; else v.shift := r.a.exponent; v.doing_ftdiv := "10"; @@ -2054,6 +2078,7 @@ begin when others => -- fctidu[z] need_check := r.r(63); end case; + int_result := '1'; if need_check = '1' then v.state := INT_CHECK; else @@ -2080,6 +2105,7 @@ begin v.fpscr(FPSCR_XX) := '1'; end if; end if; + int_result := '1'; arith_done := '1'; when INT_OFLOW => @@ -2090,6 +2116,7 @@ begin end if; v.fpscr(FPSCR_VXCVI) := '1'; invalid := '1'; + int_result := '1'; arith_done := '1'; when FRI_1 => @@ -2306,11 +2333,10 @@ begin -- Neither does enabled zero-divide exception if (v.invalid and r.fpscr(FPSCR_VE)) = '0' and (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then - v.writing_back := '1'; + v.writing_fpr := '1'; v.update_fprf := '1'; end if; v.instr_done := '1'; - v.state := IDLE; update_fx := '1'; end if; @@ -2530,12 +2556,6 @@ begin v.shift := resize(signed('0' & clz) - 9, EXP_BITS); end if; - if r.int_result = '1' then - fp_result <= r.r; - else - fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r, - r.single_prec, r.quieten_nan); - end if; if r.update_fprf = '1' then v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class, r.r(54) and not r.denorm); @@ -2549,24 +2569,49 @@ begin (v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then v.fpscr(FPSCR_FX) := '1'; end if; - if r.rc = '1' then - v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); - end if; - v.illegal := illegal; - if illegal = '1' then - v.instr_done := '0'; - v.do_intr := '1'; - v.writing_back := '0'; - v.busy := '0'; - v.state := IDLE; + if v.instr_done = '1' then + if r.state /= IDLE then + v.state := IDLE; + v.busy := '0'; + v.f2stall := '0'; + if r.rc = '1' then + v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); + end if; + v.sp_result := r.single_prec; + v.int_result := int_result; + v.illegal := illegal; + v.nsnan_result := v.quieten_nan; + if r.is_cmp = '0' then + v.cr_mask := num_to_fxm(1); + else + v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(r.insn)))); + end if; + v.writing_cr := r.is_cmp or r.rc; + v.write_reg := r.dest_fpr; + v.complete_tag := r.instr_tag; + end if; + if e_in.stall = '0' then + v.complete := not v.illegal; + v.do_intr := (v.fpscr(FPSCR_FEX) and r.fe_mode) or v.illegal; + end if; + -- N.B. We rely on execute1 to prevent any new instruction + -- coming in while e_in.stall = 1, without us needing to + -- have busy asserted. else - v.do_intr := v.instr_done and v.fpscr(FPSCR_FEX) and r.fe_mode; - if v.state /= IDLE or v.do_intr = '1' then - v.busy := '1'; + if r.state /= IDLE and e_in.stall = '0' then + v.f2stall := '1'; end if; end if; + -- This mustn't depend on any fields of r that are modified in IDLE state. + if r.int_result = '1' then + fp_result <= r.r; + else + fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r, + r.sp_result, r.nsnan_result); + end if; + rin <= v; end process; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index bd62f0b..ff2633b 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -159,7 +159,6 @@ architecture behave of loadstore1 is signal flush : std_ulogic; signal busy : std_ulogic; signal complete : std_ulogic; - signal in_progress : std_ulogic; signal flushing : std_ulogic; signal store_sp_data : std_ulogic_vector(31 downto 0); @@ -523,7 +522,6 @@ begin busy <= dc_stall or d_in.error or r1.busy or r2.busy; complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or r3.complete; - in_progress <= r1.req.valid or (r2.req.valid and not complete); -- Processing done in the first cycle of a load/store instruction loadstore1_1: process(all) @@ -981,7 +979,6 @@ begin -- update busy signal back to execute1 e_out.busy <= busy; e_out.l2stall <= dc_stall or d_in.error or r2.busy; - e_out.in_progress <= in_progress; events <= r3.events;