Merge pull request #322 from paulusmack/fixes

Fix bug with load hitting two previous stores
dcache-nc-fix
Michael Neuling 3 years ago committed by GitHub
commit 18eb029f0a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -122,7 +122,7 @@ architecture rtl of dcache is
type cache_valids_t is array(index_t) of cache_way_valids_t;
type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;

-- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
-- Storage. Hopefully implemented in LUTs
signal cache_tags : cache_tags_array_t;
signal cache_tag_set : cache_tags_set_t;
signal cache_valids : cache_valids_t;
@ -317,15 +317,13 @@ architecture rtl of dcache is
tlb_hit_way : tlb_way_t;
tlb_hit_index : tlb_index_t;

-- 2-stage data buffer for data forwarded from writes to reads
forward_data1 : std_ulogic_vector(63 downto 0);
forward_data2 : std_ulogic_vector(63 downto 0);
forward_sel1 : std_ulogic_vector(7 downto 0);
forward_valid1 : std_ulogic;
forward_way1 : way_t;
forward_row1 : row_t;
use_forward1 : std_ulogic;
-- data buffer for data forwarded from writes to reads
forward_data : std_ulogic_vector(63 downto 0);
forward_tag : cache_tag_t;
forward_sel : std_ulogic_vector(7 downto 0);
forward_valid : std_ulogic;
forward_row : row_t;
data_out : std_ulogic_vector(63 downto 0);

-- Cache miss state (reload state machine)
state : state_t;
@ -387,12 +385,16 @@ architecture rtl of dcache is
signal r0_valid : std_ulogic;
signal r0_stall : std_ulogic;

signal use_forward1_next : std_ulogic;
signal use_forward2_next : std_ulogic;
signal fwd_same_tag : std_ulogic;
signal use_forward_st : std_ulogic;
signal use_forward_rl : std_ulogic;
signal use_forward2 : std_ulogic;

-- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out : cache_ram_out_t;
signal ram_wr_data : cache_row_t;
signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0);

-- PLRU output interface
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
@ -576,6 +578,7 @@ begin
r.doall := m_in.doall;
r.tlbld := m_in.tlbld;
r.mmu_req := '1';
r.d_valid := '1';
else
r.req := d_in;
r.req.data := (others => '0');
@ -583,21 +586,19 @@ begin
r.doall := '0';
r.tlbld := '0';
r.mmu_req := '0';
r.d_valid := '0';
end if;
r.d_valid := '0';
if rst = '1' then
r0_full <= '0';
elsif (r1.full = '0' and d_in.hold = '0') or r0_full = '0' then
r0 <= r;
r0_full <= r.req.valid;
end if;
-- Sample data the cycle after a request comes in from loadstore1.
-- If another request has come in already then the data will get
-- put directly into req.data below.
if r0.req.valid = '1' and r.req.valid = '0' and r0.d_valid = '0' and
r0.mmu_req = '0' then
elsif r0.d_valid = '0' then
-- Sample data the cycle after a request comes in from loadstore1.
-- If this request is already moving into r1 then the data will get
-- put directly into req.data in the dcache_slow process below.
r0.req.data <= d_in.data;
r0.d_valid <= '1';
r0.d_valid <= r0.req.valid;
end if;
end if;
end process;
@ -834,6 +835,8 @@ begin
variable hit_way_set : hit_way_set_t;
variable rel_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0);
variable rel_match : std_ulogic;
variable fwd_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0);
variable fwd_match : std_ulogic;
begin
-- Extract line, row and tag from request
req_index <= get_index(r0.req.addr);
@ -849,8 +852,10 @@ begin
hit_way := 0;
is_hit := '0';
rel_match := '0';
fwd_match := '0';
if r0.req.virt_mode = '1' then
rel_matches := (others => '0');
fwd_matches := (others => '0');
for j in tlb_way_t loop
hit_way_set(j) := 0;
s_hit := '0';
@ -870,11 +875,15 @@ begin
if s_tag = r1.reload_tag then
rel_matches(j) := '1';
end if;
if s_tag = r1.forward_tag then
fwd_matches(j) := '1';
end if;
end loop;
if tlb_hit = '1' then
is_hit := hit_set(tlb_hit_way);
hit_way := hit_way_set(tlb_hit_way);
rel_match := rel_matches(tlb_hit_way);
fwd_match := fwd_matches(tlb_hit_way);
end if;
else
s_tag := get_tag(r0.req.addr);
@ -888,39 +897,28 @@ begin
if s_tag = r1.reload_tag then
rel_match := '1';
end if;
if s_tag = r1.forward_tag then
fwd_match := '1';
end if;
end if;
req_same_tag <= rel_match;

-- See if the request matches the line currently being reloaded
if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and
rel_match = '1' then
-- For a store, consider this a hit even if the row isn't valid
-- since it will be by the time we perform the store.
-- For a load, check the appropriate row valid bit.
is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE);
hit_way := replace_way;
end if;
fwd_same_tag <= fwd_match;

-- Whether to use forwarded data for a load or not
use_forward1_next <= '0';
if get_row(r1.req.real_addr) = req_row and r1.req.hit_way = hit_way then
-- Only need to consider r1.write_bram here, since if we are
-- writing refill data here, then we don't have a cache hit this
-- cycle on the line being refilled. (There is the possibility
-- that the load following the load miss that started the refill
-- could be to the old contents of the victim line, since it is a
-- couple of cycles after the refill starts before we see the
-- updated cache tag. In that case we don't use the bypass.)
use_forward1_next <= r1.write_bram;
use_forward_st <= '0';
use_forward_rl <= '0';
if r1.store_row = req_row and rel_match = '1' then
-- Use the forwarding path if this cycle is a write to this row
use_forward_st <= r1.write_bram;
if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then
use_forward_rl <= '1';
end if;
end if;
use_forward2_next <= '0';
if r1.forward_row1 = req_row and r1.forward_way1 = hit_way then
use_forward2_next <= r1.forward_valid1;
use_forward2 <= '0';
if r1.forward_row = req_row and fwd_match = '1' then
use_forward2 <= r1.forward_valid;
end if;

-- The way that matched on a hit
req_hit_way <= hit_way;

-- The way to replace on a miss
if r1.write_tag = '1' then
replace_way <= to_integer(unsigned(plru_victim(r1.store_index)));
@ -928,6 +926,23 @@ begin
replace_way <= r1.store_way;
end if;

-- See if the request matches the line currently being reloaded
if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and
rel_match = '1' then
-- Ignore is_hit from above, because a load miss writes the new tag
-- but doesn't clear the valid bit on the line before refilling it.
-- For a store, consider this a hit even if the row isn't valid
-- since it will be by the time we perform the store.
-- For a load, check the appropriate row valid bit; but also,
-- if use_forward_rl is 1 then we can consider this a hit.
is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE) or
use_forward_rl;
hit_way := replace_way;
end if;

-- The way that matched on a hit
req_hit_way <= hit_way;

-- work out whether we have permission for this access
-- NB we don't yet implement AMR, thus no KUAP
rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed);
@ -1023,28 +1038,9 @@ begin
-- Return data for loads & completion control logic
--
writeback_control: process(all)
variable data_out : std_ulogic_vector(63 downto 0);
variable data_fwd : std_ulogic_vector(63 downto 0);
variable j : integer;
begin
-- Use the bypass if are reading the row that was written 1 or 2 cycles
-- ago, including for the slow_valid = 1 case (i.e. completing a load
-- miss or a non-cacheable load).
if r1.use_forward1 = '1' then
data_fwd := r1.forward_data1;
else
data_fwd := r1.forward_data2;
end if;
data_out := cache_out(r1.hit_way);
for i in 0 to 7 loop
j := i * 8;
if r1.forward_sel(i) = '1' then
data_out(j + 7 downto j) := data_fwd(j + 7 downto j);
end if;
end loop;

d_out.valid <= r1.ls_valid;
d_out.data <= data_out;
d_out.data <= r1.data_out;
d_out.store_done <= not r1.stcx_fail;
d_out.error <= r1.ls_error;
d_out.cache_paradox <= r1.cache_paradox;
@ -1052,7 +1048,7 @@ begin
-- Outputs to MMU
m_out.done <= r1.mmu_done;
m_out.err <= r1.mmu_error;
m_out.data <= data_out;
m_out.data <= r1.data_out;

-- We have a valid load or store hit or we just completed a slow
-- op such as a load miss, a NC load or a store
@ -1076,7 +1072,7 @@ begin
-- Request came from loadstore1...
-- Load hit case is the standard path
if r1.hit_load_valid = '1' then
report "completing load hit data=" & to_hstring(data_out);
report "completing load hit data=" & to_hstring(r1.data_out);
end if;

-- error cases complete without stalling
@ -1086,7 +1082,7 @@ begin

-- Slow ops (load miss, NC, stores)
if r1.slow_valid = '1' then
report "completing store or load miss data=" & to_hstring(data_out);
report "completing store or load miss data=" & to_hstring(r1.data_out);
end if;

else
@ -1108,6 +1104,13 @@ begin

end process;

-- RAM write data and select multiplexers
ram_wr_data <= r1.req.data when r1.write_bram = '1' else
wishbone_in.dat when r1.dcbz = '0' else
(others => '0');
ram_wr_select <= r1.req.byte_sel when r1.write_bram = '1' else
(others => '1');

--
-- Generate a cache RAM for each way. This handles the normal
-- reads, writes from reloads and the special store-hit update
@ -1132,7 +1135,7 @@ begin
generic map (
ROW_BITS => ROW_BITS,
WIDTH => wishbone_data_bits,
ADD_BUF => true
ADD_BUF => false
)
port map (
clk => clk,
@ -1141,7 +1144,7 @@ begin
rd_data => dout,
wr_sel => wr_sel_m,
wr_addr => wr_addr,
wr_data => wr_data
wr_data => ram_wr_data
);
process(all)
begin
@ -1157,37 +1160,13 @@ begin
-- For timing, the mux on wr_data/sel/addr is not dependent on anything
-- other than the current state.
--
wr_sel_m <= (others => '0');
wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));

do_write <= '0';
if r1.write_bram = '1' then
-- Write store data to BRAM. This happens one cycle after the
-- store is in r0.
wr_data <= r1.req.data;
wr_sel <= r1.req.byte_sel;
wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.req.real_addr), ROW_BITS));
if i = r1.req.hit_way then
do_write <= '1';
end if;
else
-- Otherwise, we might be doing a reload or a DCBZ
if r1.dcbz = '1' then
wr_data <= (others => '0');
else
wr_data <= wishbone_in.dat;
end if;
wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));
wr_sel <= (others => '1');

if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then
do_write <= '1';
end if;
end if;

-- Mask write selects with do_write since BRAM doesn't
-- have a global write-enable
if do_write = '1' then
wr_sel_m <= wr_sel;
wr_sel_m <= (others => '0');
if i = replace_way and
(r1.write_bram = '1' or
(r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1')) then
wr_sel_m <= ram_wr_select;
end if;

end process;
@ -1198,20 +1177,60 @@ begin
-- It also handles error cases (TLB miss, cache paradox)
--
dcache_fast_hit : process(clk)
variable j : integer;
variable sel : std_ulogic_vector(1 downto 0);
variable data_out : std_ulogic_vector(63 downto 0);
begin
if rising_edge(clk) then
if req_op /= OP_NONE then
report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(r0.req.addr) &
" nc:" & std_ulogic'image(r0.req.nc) &
" idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) &
" way: " & integer'image(req_hit_way);
end if;
report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(r0.req.addr) &
" nc:" & std_ulogic'image(r0.req.nc) &
" idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) &
" way: " & integer'image(req_hit_way);
end if;
if r0_valid = '1' then
r1.mmu_req <= r0.mmu_req;
end if;

-- Bypass/forwarding multiplexer for load data.
-- Use the bypass if are reading the row of BRAM that was written 0 or 1
-- cycles ago, including for the slow_valid = 1 cases (i.e. completing a
-- load miss or a non-cacheable load), which are handled via the r1.full case.
for i in 0 to 7 loop
if r1.full = '1' or use_forward_rl = '1' then
sel := '0' & r1.dcbz;
elsif use_forward_st = '1' and r1.req.byte_sel(i) = '1' then
sel := "01";
elsif use_forward2 = '1' and r1.forward_sel(i) = '1' then
sel := "10";
else
sel := "11";
end if;
j := i * 8;
case sel is
when "00" =>
data_out(j + 7 downto j) := wishbone_in.dat(j + 7 downto j);
when "01" =>
data_out(j + 7 downto j) := r1.req.data(j + 7 downto j);
when "10" =>
data_out(j + 7 downto j) := r1.forward_data(j + 7 downto j);
when others =>
data_out(j + 7 downto j) := cache_out(req_hit_way)(j + 7 downto j);
end case;
end loop;
r1.data_out <= data_out;

r1.forward_data <= ram_wr_data;
r1.forward_tag <= r1.reload_tag;
r1.forward_row <= r1.store_row;
r1.forward_sel <= ram_wr_select;
r1.forward_valid <= r1.write_bram;
if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then
r1.forward_valid <= '1';
end if;

-- Fast path for load/store hits. Set signals for the writeback controls.
r1.hit_way <= req_hit_way;
r1.hit_index <= req_index;
@ -1263,38 +1282,11 @@ begin
-- operates at stage 1.
--
dcache_slow : process(clk)
variable stbs_done : boolean;
variable stbs_done : boolean;
variable req : mem_access_request_t;
variable acks : unsigned(2 downto 0);
begin
if rising_edge(clk) then
r1.use_forward1 <= use_forward1_next;
r1.forward_sel <= (others => '0');
if use_forward1_next = '1' then
r1.forward_sel <= r1.req.byte_sel;
elsif use_forward2_next = '1' then
r1.forward_sel <= r1.forward_sel1;
end if;

r1.forward_data2 <= r1.forward_data1;
if r1.write_bram = '1' then
r1.forward_data1 <= r1.req.data;
r1.forward_sel1 <= r1.req.byte_sel;
r1.forward_way1 <= r1.req.hit_way;
r1.forward_row1 <= get_row(r1.req.real_addr);
r1.forward_valid1 <= '1';
else
if r1.dcbz = '1' then
r1.forward_data1 <= (others => '0');
else
r1.forward_data1 <= wishbone_in.dat;
end if;
r1.forward_sel1 <= (others => '1');
r1.forward_way1 <= replace_way;
r1.forward_row1 <= r1.store_row;
r1.forward_valid1 <= '0';
end if;

ev.dcache_refill <= '0';
ev.load_miss <= '0';
ev.store_miss <= '0';
@ -1488,17 +1480,17 @@ begin
end if;

-- Incoming acks processing
r1.forward_valid1 <= wishbone_in.ack;
if wishbone_in.ack = '1' then
r1.rows_valid(r1.store_row mod ROW_PER_LINE) <= '1';
-- If this is the data we were looking for, we can
-- complete the request next cycle.
-- Compare the whole address in case the request in
-- r1.req is not the one that started this refill.
if req.valid = '1' and req.same_tag = '1' and
((r1.dcbz = '1' and req.dcbz = '1') or
(r1.dcbz = '0' and req.op = OP_LOAD_MISS)) and
r1.store_row = get_row(req.real_addr) then
-- (Cases where req comes from r0 are handled as a load
-- hit.)
if r1.full = '1' and r1.req.same_tag = '1' and
((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op = OP_LOAD_MISS) and
r1.store_row = get_row(r1.req.real_addr) then
r1.full <= '0';
r1.slow_valid <= '1';
if r1.mmu_req = '0' then
@ -1506,8 +1498,6 @@ begin
else
r1.mmu_done <= '1';
end if;
r1.forward_sel <= (others => '1');
r1.use_forward1 <= '1';
end if;

-- Check for completion
@ -1551,6 +1541,8 @@ begin
(req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then
r1.wb.stb <= '1';
stbs_done := false;
r1.store_way <= req.hit_way;
r1.store_row <= get_row(req.real_addr);
if req.op = OP_STORE_HIT then
r1.write_bram <= '1';
end if;
@ -1592,8 +1584,6 @@ begin
else
r1.mmu_done <= '1';
end if;
r1.forward_sel <= (others => '1');
r1.use_forward1 <= '1';
r1.wb.cyc <= '0';
r1.wb.stb <= '0';
end if;

@ -162,3 +162,26 @@ test_bdnzl:
li %r3,0
9: mtlr %r10
blr

/* Test that a load that hits stores gets the correct data */
.global test_loadhitstore
test_loadhitstore:
addi %r5,%r1,-16
ld %r0,0(%r5)
li %r0,0
std %r0,0(%r5)
li %r6,0x66
li %r7,0x77
.balign 64
nop
nop
nop
nop
stb %r6,2(%r5)
stb %r7,3(%r5)
ld %r0,0(%r5)
sldi %r6,%r6,16
sldi %r7,%r7,24
or %r7,%r6,%r7
subf %r3,%r0,%r7
blr

@ -15,6 +15,7 @@ extern long test_addpcis_2(void);
extern long test_mfpvr(void);
extern long test_mtpvr(void);
extern long test_bdnzl(void);
extern long test_loadhitstore(void);

// i < 100
void print_test_number(int i)
@ -66,5 +67,12 @@ int main(void)
} else
puts(PASS);

print_test_number(6);
if (test_loadhitstore() != 0) {
fail = 1;
puts(FAIL);
} else
puts(PASS);

return fail;
}

Binary file not shown.

@ -3,3 +3,4 @@ Test 02:PASS
Test 03:PASS
Test 04:PASS
Test 05:PASS
Test 06:PASS

Loading…
Cancel
Save