dcache: Implement the dcbz instruction

This adds logic to dcache and loadstore1 to implement dcbz.  For now
it zeroes a single cache line (by default 64 bytes), not 128 bytes
like IBM Power processors do.

The dcbz operation is performed much like a load miss, except that
we are writing zeroes to memory instead of reading.  As each ack
comes back, we write zeroes to the BRAM instead of data from memory.
In this way we zero the line in memory and also zero the line of
cache memory, establishing the line in the cache if it wasn't already
resident.  If it was already resident then we overwrite the existing
line in the cache.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/166/head
Paul Mackerras 4 years ago
parent 167e37d667
commit 041d6bef60

@ -236,6 +236,7 @@ package common is
type Loadstore1ToDcacheType is record
valid : std_ulogic;
load : std_ulogic;
dcbz : std_ulogic;
nc : std_ulogic;
reserve : std_ulogic;
addr : std_ulogic_vector(63 downto 0);

@ -581,8 +581,12 @@ begin
wr_data <= r0.data;
wr_sel <= r0.byte_sel;
else
-- Otherwise, we might be doing a reload
wr_data <= wishbone_in.dat;
-- Otherwise, we might be doing a reload or a DCBZ
if r1.req.dcbz = '1' then
wr_data <= (others => '0');
else
wr_data <= wishbone_in.dat;
end if;
wr_sel <= (others => '1');
wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));
end if;
@ -718,18 +722,54 @@ begin
r1.wb.we <= '0';
r1.state <= NC_LOAD_WAIT_ACK;

when OP_STORE_HIT | OP_STORE_MISS =>
r1.wb.sel <= r0.byte_sel;
r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= r0.data;
if cancel_store = '0' then
when OP_STORE_HIT | OP_STORE_MISS =>
if r0.dcbz = '0' then
r1.wb.sel <= r0.byte_sel;
r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= r0.data;
if cancel_store = '0' then
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.wb.we <= '1';
r1.state <= STORE_WAIT_ACK;
else
r1.stcx_fail <= '1';
r1.state <= IDLE;
end if;
else
-- dcbz is handled much like a load miss except
-- that we are writing to memory instead of reading
r1.store_index <= req_index;
r1.store_row <= get_row(req_laddr);

if req_op = OP_STORE_HIT then
r1.store_way <= req_hit_way;
else
r1.store_way <= replace_way;

-- Force misses on the victim way while zeroing
cache_valids(req_index)(replace_way) <= '0';

-- Store new tag in selected way
for i in 0 to NUM_WAYS-1 loop
if i = replace_way then
tagset := cache_tags(req_index);
write_tag(i, tagset, req_tag);
cache_tags(req_index) <= tagset;
end if;
end loop;
end if;

-- Set up for wishbone writes
r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0);
r1.wb.sel <= (others => '1');
r1.wb.we <= '1';
r1.wb.dat <= (others => '0');
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.wb.we <= '1';
r1.state <= STORE_WAIT_ACK;
else
r1.stcx_fail <= '1';
r1.state <= IDLE;

-- Handle the rest like a load miss
r1.state <= RELOAD_WAIT_ACK;
end if;

-- OP_NONE and OP_BAD do nothing
@ -766,7 +806,7 @@ begin
-- not idle, which we don't currently know how to deal
-- with.
--
if r1.store_row = get_row(r1.req.addr) then
if r1.store_row = get_row(r1.req.addr) and r1.req.dcbz = '0' then
r1.slow_data <= wishbone_in.dat;
end if;


@ -164,7 +164,7 @@ architecture behaviour of decode1 is
2#0000110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst
2#0100010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt
2#0011110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst
-- 2#1111110110# dcbz
2#1111110110# => (LDST, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- dcbz
2#0110001001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdeu
2#1110001001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdeuo
2#0110001011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divweu

@ -43,6 +43,7 @@ architecture behave of loadstore1 is
type reg_stage_t is record
-- latch most of the input request
load : std_ulogic;
dcbz : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
store_data : std_ulogic_vector(63 downto 0);
load_data : std_ulogic_vector(63 downto 0);
@ -198,8 +199,11 @@ begin
when IDLE =>
if l_in.valid = '1' then
v.load := '0';
v.dcbz := '0';
if l_in.op = OP_LOAD then
v.load := '1';
elsif l_in.op = OP_DCBZ then
v.dcbz := '1';
end if;
v.addr := lsu_sum;
v.write_reg := l_in.write_reg;
@ -293,6 +297,7 @@ begin
-- Update outputs to dcache
d_out.valid <= req;
d_out.load <= v.load;
d_out.dcbz <= v.dcbz;
d_out.nc <= v.nc;
d_out.reserve <= v.reserve;
d_out.addr <= addr;

Loading…
Cancel
Save