loadstore1: Reduce busy cycles

This reduces the number of cycles where loadstore1 asserts its busy
output, leading to increased throughput of loads and stores.  Loads
that hit in the cache can now be executed at the rate of one every two
cycles.  Stores take 4 cycles assuming the wishbone slave responds
with an ack the cycle after we assert strobe.

To achieve this, the state machine code is split into two parts, one
for when we have an existing instruction in progress, and one for
starting a new instruction.  We can now combinatorially clear busy and
start a new instruction in the same cycle that we get a done signal
from the dcache; in other words we are completing one instruction and
potentially writing back results in the same cycle that we start a new
instruction and send its address and data to the dcache.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/208/head
Paul Mackerras 5 years ago
parent 1d09daae03
commit 209aa9ce3f

@ -60,14 +60,14 @@ architecture behaviour of execute1 is
slow_op_rc : std_ulogic; slow_op_rc : std_ulogic;
slow_op_oe : std_ulogic; slow_op_oe : std_ulogic;
slow_op_xerc : xer_common_t; slow_op_xerc : xer_common_t;
ldst_nia : std_ulogic_vector(63 downto 0); last_nia : std_ulogic_vector(63 downto 0);
log_addr_spr : std_ulogic_vector(31 downto 0); log_addr_spr : std_ulogic_vector(31 downto 0);
end record; end record;
constant reg_type_init : reg_type := constant reg_type_init : reg_type :=
(e => Execute1ToWritebackInit, busy => '0', lr_update => '0', terminate => '0', (e => Execute1ToWritebackInit, busy => '0', lr_update => '0', terminate => '0',
mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0', mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0',
slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
next_lr => (others => '0'), ldst_nia => (others => '0'), others => (others => '0')); next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0'));


signal r, rin : reg_type; signal r, rin : reg_type;


@ -455,6 +455,9 @@ begin
v.e.exc_write_enable := '0'; v.e.exc_write_enable := '0';
v.e.exc_write_reg := fast_spr_num(SPR_SRR0); v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
v.e.exc_write_data := e_in.nia; v.e.exc_write_data := e_in.nia;
if valid_in = '1' then
v.last_nia := e_in.nia;
end if;


if ctrl.irq_state = WRITE_SRR1 then if ctrl.irq_state = WRITE_SRR1 then
v.e.exc_write_reg := fast_spr_num(SPR_SRR1); v.e.exc_write_reg := fast_spr_num(SPR_SRR1);
@ -921,8 +924,6 @@ begin


elsif valid_in = '1' then elsif valid_in = '1' then
-- instruction for other units, i.e. LDST -- instruction for other units, i.e. LDST
v.ldst_nia := e_in.nia;
v.e.valid := '0';
if e_in.unit = LDST then if e_in.unit = LDST then
lv.valid := '1'; lv.valid := '1';
end if; end if;
@ -1026,8 +1027,8 @@ begin
end if; end if;
v.e.exc_write_enable := '1'; v.e.exc_write_enable := '1';
v.e.exc_write_reg := fast_spr_num(SPR_SRR0); v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
v.e.exc_write_data := r.ldst_nia; v.e.exc_write_data := r.last_nia;
report "ldst exception writing srr0=" & to_hstring(r.ldst_nia); report "ldst exception writing srr0=" & to_hstring(r.last_nia);
ctrl_tmp.irq_state <= WRITE_SRR1; ctrl_tmp.irq_state <= WRITE_SRR1;
end if; end if;



@ -47,7 +47,6 @@ architecture behave of loadstore1 is
); );


type reg_stage_t is record type reg_stage_t is record
busy : std_ulogic;
-- latch most of the input request -- latch most of the input request
load : std_ulogic; load : std_ulogic;
tlbie : std_ulogic; tlbie : std_ulogic;
@ -126,7 +125,6 @@ begin
if rising_edge(clk) then if rising_edge(clk) then
if rst = '1' then if rst = '1' then
r.state <= IDLE; r.state <= IDLE;
r.busy <= '0';
else else
r <= rin; r <= rin;
end if; end if;
@ -143,7 +141,7 @@ begin
variable long_sel : std_ulogic_vector(15 downto 0); variable long_sel : std_ulogic_vector(15 downto 0);
variable byte_sel : std_ulogic_vector(7 downto 0); variable byte_sel : std_ulogic_vector(7 downto 0);
variable req : std_ulogic; variable req : std_ulogic;
variable stall : std_ulogic; variable busy : std_ulogic;
variable addr : std_ulogic_vector(63 downto 0); variable addr : std_ulogic_vector(63 downto 0);
variable wdata : std_ulogic_vector(63 downto 0); variable wdata : std_ulogic_vector(63 downto 0);
variable write_enable : std_ulogic; variable write_enable : std_ulogic;
@ -165,15 +163,12 @@ begin
begin begin
v := r; v := r;
req := '0'; req := '0';
stall := '0';
done := '0';
byte_sel := (others => '0'); byte_sel := (others => '0');
addr := lsu_sum; addr := lsu_sum;
v.mfspr := '0'; v.mfspr := '0';
mmu_mtspr := '0'; mmu_mtspr := '0';
itlb_fault := '0'; itlb_fault := '0';
sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
exception := '0';
dsisr := (others => '0'); dsisr := (others => '0');
mmureq := '0'; mmureq := '0';


@ -232,130 +227,18 @@ begin
-- compute (addr + 8) & ~7 for the second doubleword when unaligned -- compute (addr + 8) & ~7 for the second doubleword when unaligned
next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";


done := '0';
exception := '0';
case r.state is case r.state is
when IDLE => when IDLE =>
if l_in.valid = '1' then
v.addr := lsu_sum;
v.load := '0';
v.dcbz := '0';
v.tlbie := '0';
v.instr_fault := '0';
v.dwords_done := '0';
case l_in.op is
when OP_STORE =>
req := '1';
when OP_LOAD =>
req := '1';
v.load := '1';
when OP_DCBZ =>
req := '1';
v.dcbz := '1';
when OP_TLBIE =>
mmureq := '1';
stall := '1';
v.tlbie := '1';
v.state := TLBIE_WAIT;
when OP_MFSPR =>
v.mfspr := '1';
-- partial decode on SPR number should be adequate given
-- the restricted set that get sent down this path
if sprn(9) = '0' and sprn(5) = '0' then
if sprn(0) = '0' then
v.sprval := x"00000000" & r.dsisr;
else
v.sprval := r.dar;
end if;
else
-- reading one of the SPRs in the MMU
v.sprval := m_in.sprval;
end if;
v.state := SPR_CMPLT;
when OP_MTSPR =>
if sprn(9) = '0' and sprn(5) = '0' then
if sprn(0) = '0' then
v.dsisr := l_in.data(31 downto 0);
else
v.dar := l_in.data;
end if;
v.state := SPR_CMPLT;
else
-- writing one of the SPRs in the MMU
mmu_mtspr := '1';
stall := '1';
v.state := TLBIE_WAIT;
end if;
when OP_FETCH_FAILED =>
-- send it to the MMU to do the radix walk
addr := l_in.nia;
v.addr := l_in.nia;
v.instr_fault := '1';
mmureq := '1';
stall := '1';
v.state := MMU_LOOKUP;
when others =>
assert false report "unknown op sent to loadstore1";
end case;

v.write_reg := l_in.write_reg;
v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse;
v.sign_extend := l_in.sign_extend;
v.update := l_in.update;
v.update_reg := l_in.update_reg;
v.xerc := l_in.xerc;
v.reserve := l_in.reserve;
v.rc := l_in.rc;
v.nc := l_in.ci;
v.virt_mode := l_in.virt_mode;
v.priv_mode := l_in.priv_mode;

-- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc------- for a real-mode access.
--
-- This will have to be replaced by a combination of implementing the
-- proper HV CI load/store instructions and having an MMU to get the I
-- bit otherwise.
if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then
v.nc := '1';
end if;

-- Do length_to_sel and work out if we are doing 2 dwords
long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
byte_sel := long_sel(7 downto 0);
v.first_bytes := byte_sel;
v.second_bytes := long_sel(15 downto 8);

-- Do byte reversing and rotating for stores in the first cycle
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;

if req = '1' then
stall := '1';
if long_sel(15 downto 8) = "00000000" then
v.state := ACK_WAIT;
else
v.state := SECOND_REQ;
end if;
end if;
end if;


when SECOND_REQ => when SECOND_REQ =>
addr := next_addr; addr := next_addr;
byte_sel := r.second_bytes; byte_sel := r.second_bytes;
req := '1'; req := '1';
stall := '1';
v.state := ACK_WAIT; v.state := ACK_WAIT;


when ACK_WAIT => when ACK_WAIT =>
stall := '1';
if d_in.valid = '1' then if d_in.valid = '1' then
if d_in.error = '1' then if d_in.error = '1' then
-- dcache will discard the second request if it -- dcache will discard the second request if it
@ -393,7 +276,6 @@ begin
else else
-- stores write back rA update in this cycle -- stores write back rA update in this cycle
do_update := r.update; do_update := r.update;
stall := '0';
done := '1'; done := '1';
v.state := IDLE; v.state := IDLE;
end if; end if;
@ -402,7 +284,6 @@ begin
end if; end if;


when MMU_LOOKUP => when MMU_LOOKUP =>
stall := '1';
if r.dwords_done = '1' then if r.dwords_done = '1' then
addr := next_addr; addr := next_addr;
byte_sel := r.second_bytes; byte_sel := r.second_bytes;
@ -423,7 +304,6 @@ begin
end if; end if;
else else
-- nothing to do, the icache retries automatically -- nothing to do, the icache retries automatically
stall := '0';
done := '1'; done := '1';
v.state := IDLE; v.state := IDLE;
end if; end if;
@ -439,10 +319,8 @@ begin
end if; end if;


when TLBIE_WAIT => when TLBIE_WAIT =>
stall := '1';
if m_in.done = '1' then if m_in.done = '1' then
-- tlbie is finished -- tlbie is finished
stall := '0';
done := '1'; done := '1';
v.state := IDLE; v.state := IDLE;
end if; end if;
@ -458,6 +336,117 @@ begin


end case; end case;


busy := '1';
if r.state = IDLE or done = '1' then
busy := '0';
end if;

-- Note that l_in.valid is gated with busy inside execute1
if l_in.valid = '1' then
v.addr := lsu_sum;
v.load := '0';
v.dcbz := '0';
v.tlbie := '0';
v.instr_fault := '0';
v.dwords_done := '0';
v.write_reg := l_in.write_reg;
v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse;
v.sign_extend := l_in.sign_extend;
v.update := l_in.update;
v.update_reg := l_in.update_reg;
v.xerc := l_in.xerc;
v.reserve := l_in.reserve;
v.rc := l_in.rc;
v.nc := l_in.ci;
v.virt_mode := l_in.virt_mode;
v.priv_mode := l_in.priv_mode;

-- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc------- for a real-mode access.
if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then
v.nc := '1';
end if;

-- Do length_to_sel and work out if we are doing 2 dwords
long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
byte_sel := long_sel(7 downto 0);
v.first_bytes := byte_sel;
v.second_bytes := long_sel(15 downto 8);

-- Do byte reversing and rotating for stores in the first cycle
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;

case l_in.op is
when OP_STORE =>
req := '1';
when OP_LOAD =>
req := '1';
v.load := '1';
when OP_DCBZ =>
req := '1';
v.dcbz := '1';
when OP_TLBIE =>
mmureq := '1';
v.tlbie := '1';
v.state := TLBIE_WAIT;
when OP_MFSPR =>
v.mfspr := '1';
-- partial decode on SPR number should be adequate given
-- the restricted set that get sent down this path
if sprn(9) = '0' and sprn(5) = '0' then
if sprn(0) = '0' then
v.sprval := x"00000000" & r.dsisr;
else
v.sprval := r.dar;
end if;
else
-- reading one of the SPRs in the MMU
v.sprval := m_in.sprval;
end if;
v.state := SPR_CMPLT;
when OP_MTSPR =>
if sprn(9) = '0' and sprn(5) = '0' then
if sprn(0) = '0' then
v.dsisr := l_in.data(31 downto 0);
else
v.dar := l_in.data;
end if;
v.state := SPR_CMPLT;
else
-- writing one of the SPRs in the MMU
mmu_mtspr := '1';
v.state := TLBIE_WAIT;
end if;
when OP_FETCH_FAILED =>
-- send it to the MMU to do the radix walk
addr := l_in.nia;
v.addr := l_in.nia;
v.instr_fault := '1';
mmureq := '1';
v.state := MMU_LOOKUP;
when others =>
assert false report "unknown op sent to loadstore1";
end case;

if req = '1' then
if long_sel(15 downto 8) = "00000000" then
v.state := ACK_WAIT;
else
v.state := SECOND_REQ;
end if;
end if;
end if;

-- Update outputs to dcache -- Update outputs to dcache
d_out.valid <= req; d_out.valid <= req;
d_out.load <= v.load; d_out.load <= v.load;
@ -504,7 +493,7 @@ begin
l_out.store_done <= d_in.store_done; l_out.store_done <= d_in.store_done;


-- update exception info back to execute1 -- update exception info back to execute1
e_out.busy <= r.busy; e_out.busy <= busy;
e_out.exception <= exception; e_out.exception <= exception;
e_out.instr_fault <= r.instr_fault; e_out.instr_fault <= r.instr_fault;
e_out.invalid <= m_in.invalid; e_out.invalid <= m_in.invalid;
@ -519,8 +508,6 @@ begin
end if; end if;
end if; end if;


v.busy := stall;

-- Update registers -- Update registers
rin <= v; rin <= v;


@ -529,7 +516,7 @@ begin
ls1_log: process(clk) ls1_log: process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
log_data <= r.busy & log_data <= e_out.busy &
e_out.exception & e_out.exception &
l_out.valid & l_out.valid &
m_out.valid & m_out.valid &

Loading…
Cancel
Save