From 17fd069640c240054db07746543252c89322407f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 28 Dec 2020 15:15:30 +1100 Subject: [PATCH] core: Allow multiple loadstore instructions to be in flight The idea here is that we can have multiple instructions in progress at the same time as long as they all go to the same unit, because that unit will keep them in order. If we get an instruction for a different unit, we wait for all the previous instructions to finish before executing it. Since the loadstore unit is the only one that is currently pipelined, this boils down to saying that loadstore instructions can go ahead while l_in.in_progress = 1 but other instructions have to wait until it is 0. In addition, control now deasserts valid and asserts stall whenever r_int.outstanding >= PIPELINE_DEPTH + 1; the PIPELINE_DEPTH default changes from 2 to 3, and decode2 no longer overrides it. execute1 also skips its check for pending exceptions while l_in.in_progress = 1. This gives a 2% increase on coremark performance on the Arty A7-100 (from ~190 to ~194). Signed-off-by: Paul Mackerras --- common.vhdl | 1 + control.vhdl | 6 +++++- decode2.vhdl | 3 +-- execute1.vhdl | 7 +++++-- loadstore1.vhdl | 12 +++++++----- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/common.vhdl b/common.vhdl index 35f782b..69dde30 100644 --- a/common.vhdl +++ b/common.vhdl @@ -368,6 +368,7 @@ package common is type Loadstore1ToExecute1Type is record busy : std_ulogic; + in_progress : std_ulogic; end record; type Loadstore1ToDcacheType is record diff --git a/control.vhdl b/control.vhdl index f14e350..34c35e2 100644 --- a/control.vhdl +++ b/control.vhdl @@ -7,7 +7,7 @@ use work.common.all; entity control is generic ( EX1_BYPASS : boolean := true; - PIPELINE_DEPTH : natural := 2 + PIPELINE_DEPTH : natural := 3 ); port ( clk : in std_ulogic; @@ -239,6 +239,10 @@ begin elsif complete_in.valid = '1' then v_int.outstanding := r_int.outstanding - 1; end if; + if r_int.outstanding >= PIPELINE_DEPTH + 1 then + valid_tmp := '0'; + stall_tmp := '1'; + end if; if rst = '1' then v_int := reg_internal_init; diff --git a/decode2.vhdl b/decode2.vhdl index 732cfe0..f9fa541 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -300,8 +300,7 @@ architecture behaviour 
of decode2 is begin control_0: entity work.control generic map ( - EX1_BYPASS => EX1_BYPASS, - PIPELINE_DEPTH => 1 + EX1_BYPASS => EX1_BYPASS ) port map ( clk => clk, diff --git a/execute1.vhdl b/execute1.vhdl index 0eaf55a..c0434a0 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -293,7 +293,10 @@ begin -- writeback. xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc; - busy_out <= l_in.busy or r.busy or fp_in.busy; + with e_in.unit select busy_out <= + l_in.busy or r.busy or fp_in.busy when LDST, + l_in.busy or l_in.in_progress or r.busy or fp_in.busy when others; + valid_in <= e_in.valid and not busy_out and not flush_in; terminate_out <= r.terminate; @@ -744,7 +747,7 @@ begin -- Determine if there is any exception to be taken -- before/instead of executing this instruction - if valid_in = '1' and e_in.second = '0' then + if valid_in = '1' and e_in.second = '0' and l_in.in_progress = '0' then if HAS_FPU and r.fp_exception_next = '1' then -- This is used for FP-type program interrupts that -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. 
diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 66700e8..ee4507b 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -155,6 +155,7 @@ architecture behave of loadstore1 is signal busy : std_ulogic; signal complete : std_ulogic; + signal in_progress : std_ulogic; signal flushing : std_ulogic; signal store_sp_data : std_ulogic_vector(31 downto 0); @@ -494,13 +495,13 @@ begin req_in <= v; end process; - --busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or - -- (r1.issued and d_in.error) or - -- stage2_busy_next or - -- (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index)); + busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or + (r1.issued and d_in.error) or + stage2_busy_next or + (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index)); complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or (r2.wait_mmu and m_in.done) or r3.convert_lfs; - busy <= r1.req.valid or (r2.req.valid and not complete); + in_progress <= r1.req.valid or (r2.req.valid and not complete); stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and not (r2.req.valid and r2.req.mmu_op); @@ -940,6 +941,7 @@ begin -- update busy signal back to execute1 e_out.busy <= busy; + e_out.in_progress <= in_progress; -- Busy calculation. stage3_busy_next <= r2.req.valid and not (complete or part_done or exception);