core: Allow multiple loadstore instructions to be in flight

The idea here is that we can have multiple instructions in progress at
the same time as long as they all go to the same unit, because that
unit will keep them in order.  If we get an instruction for a
different unit, we wait for all the previous instructions to finish
before executing it.  Since the loadstore unit is the only one that is
currently pipelined, this boils down to saying that loadstore
instructions can go ahead while l_in.in_progress = 1 but other
instructions have to wait until it is 0.

This gives a 2% increase in CoreMark performance on the Arty A7-100
(from ~190 to ~194).

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/269/head
Paul Mackerras 4 years ago
parent f583d088b7
commit 17fd069640

@ -368,6 +368,7 @@ package common is


type Loadstore1ToExecute1Type is record type Loadstore1ToExecute1Type is record
busy : std_ulogic; busy : std_ulogic;
in_progress : std_ulogic;
end record; end record;


type Loadstore1ToDcacheType is record type Loadstore1ToDcacheType is record

@ -7,7 +7,7 @@ use work.common.all;
entity control is entity control is
generic ( generic (
EX1_BYPASS : boolean := true; EX1_BYPASS : boolean := true;
PIPELINE_DEPTH : natural := 2 PIPELINE_DEPTH : natural := 3
); );
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
@ -239,6 +239,10 @@ begin
elsif complete_in.valid = '1' then elsif complete_in.valid = '1' then
v_int.outstanding := r_int.outstanding - 1; v_int.outstanding := r_int.outstanding - 1;
end if; end if;
if r_int.outstanding >= PIPELINE_DEPTH + 1 then
valid_tmp := '0';
stall_tmp := '1';
end if;


if rst = '1' then if rst = '1' then
v_int := reg_internal_init; v_int := reg_internal_init;

@ -300,8 +300,7 @@ architecture behaviour of decode2 is
begin begin
control_0: entity work.control control_0: entity work.control
generic map ( generic map (
EX1_BYPASS => EX1_BYPASS, EX1_BYPASS => EX1_BYPASS
PIPELINE_DEPTH => 1
) )
port map ( port map (
clk => clk, clk => clk,

@ -293,7 +293,10 @@ begin
-- writeback. -- writeback.
xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc; xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc;


busy_out <= l_in.busy or r.busy or fp_in.busy; with e_in.unit select busy_out <=
l_in.busy or r.busy or fp_in.busy when LDST,
l_in.busy or l_in.in_progress or r.busy or fp_in.busy when others;

valid_in <= e_in.valid and not busy_out and not flush_in; valid_in <= e_in.valid and not busy_out and not flush_in;


terminate_out <= r.terminate; terminate_out <= r.terminate;
@ -744,7 +747,7 @@ begin


-- Determine if there is any exception to be taken -- Determine if there is any exception to be taken
-- before/instead of executing this instruction -- before/instead of executing this instruction
if valid_in = '1' and e_in.second = '0' then if valid_in = '1' and e_in.second = '0' and l_in.in_progress = '0' then
if HAS_FPU and r.fp_exception_next = '1' then if HAS_FPU and r.fp_exception_next = '1' then
-- This is used for FP-type program interrupts that -- This is used for FP-type program interrupts that
-- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.

@ -155,6 +155,7 @@ architecture behave of loadstore1 is


signal busy : std_ulogic; signal busy : std_ulogic;
signal complete : std_ulogic; signal complete : std_ulogic;
signal in_progress : std_ulogic;
signal flushing : std_ulogic; signal flushing : std_ulogic;


signal store_sp_data : std_ulogic_vector(31 downto 0); signal store_sp_data : std_ulogic_vector(31 downto 0);
@ -494,13 +495,13 @@ begin
req_in <= v; req_in <= v;
end process; end process;


--busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or
-- (r1.issued and d_in.error) or (r1.issued and d_in.error) or
-- stage2_busy_next or stage2_busy_next or
-- (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index)); (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index));
complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or
(r2.wait_mmu and m_in.done) or r3.convert_lfs; (r2.wait_mmu and m_in.done) or r3.convert_lfs;
busy <= r1.req.valid or (r2.req.valid and not complete); in_progress <= r1.req.valid or (r2.req.valid and not complete);


stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and
not (r2.req.valid and r2.req.mmu_op); not (r2.req.valid and r2.req.mmu_op);
@ -940,6 +941,7 @@ begin


-- update busy signal back to execute1 -- update busy signal back to execute1
e_out.busy <= busy; e_out.busy <= busy;
e_out.in_progress <= in_progress;


-- Busy calculation. -- Busy calculation.
stage3_busy_next <= r2.req.valid and not (complete or part_done or exception); stage3_busy_next <= r2.req.valid and not (complete or part_done or exception);

Loading…
Cancel
Save