core: Allow multiple loadstore instructions to be in flight

The idea here is that we can have multiple instructions in progress at
the same time as long as they all go to the same unit, because that
unit will keep them in order.  If we get an instruction for a
different unit, we wait for all the previous instructions to finish
before executing it.  Since the loadstore unit is the only one that is
currently pipelined, this boils down to saying that loadstore
instructions can go ahead while l_in.in_progress = 1 but other
instructions have to wait until it is 0.

This gives a 2% increase in CoreMark performance on the Arty A7-100
(score goes from ~190 to ~194).

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
cache-tlb-parameters-2
Paul Mackerras 4 years ago
parent f583d088b7
commit 17fd069640

@ -368,6 +368,7 @@ package common is

-- Status record passed from loadstore1 to execute1 (direction per the type name).
-- NOTE(review): presumably this is the type of loadstore1's e_out port and of
-- execute1's l_in port -- confirm against the entity declarations.
type Loadstore1ToExecute1Type is record
-- Back-pressure flag.  Where execute1 reads l_in.busy, it holds off every
-- instruction, including ones destined for the LDST unit.
busy : std_ulogic;
-- Set while a load/store is still working its way through the loadstore
-- pipeline.  Where execute1 reads l_in.in_progress, LDST-unit instructions
-- may still issue while it is '1'; instructions for any other unit must
-- wait until it returns to '0'.
in_progress : std_ulogic;
end record;

type Loadstore1ToDcacheType is record

@ -7,7 +7,7 @@ use work.common.all;
entity control is
generic (
EX1_BYPASS : boolean := true;
PIPELINE_DEPTH : natural := 2
PIPELINE_DEPTH : natural := 3
);
port (
clk : in std_ulogic;
@ -239,6 +239,10 @@ begin
elsif complete_in.valid = '1' then
v_int.outstanding := r_int.outstanding - 1;
end if;
if r_int.outstanding >= PIPELINE_DEPTH + 1 then
valid_tmp := '0';
stall_tmp := '1';
end if;

if rst = '1' then
v_int := reg_internal_init;

@ -300,8 +300,7 @@ architecture behaviour of decode2 is
begin
control_0: entity work.control
generic map (
EX1_BYPASS => EX1_BYPASS,
PIPELINE_DEPTH => 1
EX1_BYPASS => EX1_BYPASS
)
port map (
clk => clk,

@ -293,7 +293,10 @@ begin
-- writeback.
xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc;

busy_out <= l_in.busy or r.busy or fp_in.busy;
with e_in.unit select busy_out <=
l_in.busy or r.busy or fp_in.busy when LDST,
l_in.busy or l_in.in_progress or r.busy or fp_in.busy when others;

valid_in <= e_in.valid and not busy_out and not flush_in;

terminate_out <= r.terminate;
@ -744,7 +747,7 @@ begin

-- Determine if there is any exception to be taken
-- before/instead of executing this instruction
if valid_in = '1' and e_in.second = '0' then
if valid_in = '1' and e_in.second = '0' and l_in.in_progress = '0' then
if HAS_FPU and r.fp_exception_next = '1' then
-- This is used for FP-type program interrupts that
-- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.

@ -155,6 +155,7 @@ architecture behave of loadstore1 is

signal busy : std_ulogic;
signal complete : std_ulogic;
signal in_progress : std_ulogic;
signal flushing : std_ulogic;

signal store_sp_data : std_ulogic_vector(31 downto 0);
@ -494,13 +495,13 @@ begin
req_in <= v;
end process;

--busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or
-- (r1.issued and d_in.error) or
-- stage2_busy_next or
-- (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index));
busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or
(r1.issued and d_in.error) or
stage2_busy_next or
(r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index));
complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or
(r2.wait_mmu and m_in.done) or r3.convert_lfs;
busy <= r1.req.valid or (r2.req.valid and not complete);
in_progress <= r1.req.valid or (r2.req.valid and not complete);

stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and
not (r2.req.valid and r2.req.mmu_op);
@ -940,6 +941,7 @@ begin

-- update busy signal back to execute1
e_out.busy <= busy;
e_out.in_progress <= in_progress;

-- Busy calculation.
stage3_busy_next <= r2.req.valid and not (complete or part_done or exception);

Loading…
Cancel
Save