dcache: Implement data cache touch and flush instructions

This implements dcbf, dcbt and dcbtst in the dcache.  The dcbst (data
cache block store) instruction remains a no-op because our dcache is
write-through and therefore never has modified data that could need to
be written back.

Dcbt (data cache block touch) and dcbtst (data cache block touch for
store) behave similarly except that dcbtst is a no-op on a read-only
page.  Neither instruction ever causes an interrupt.  If they miss in
the cache and the page is cacheable, they are handled like a load miss
except that they complete as soon as the state machine starts
handling the load miss rather than waiting for any data.

Dcbf (data cache block flush) can cause a data storage interrupt.  If
it hits in the cache, the state machine goes to a new FLUSH_CYCLE
state in which the cache line valid bit is cleared.

In order to avoid having more than 8 values in op_t, this combines
OP_STORE_MISS and OP_STORE_HIT into a single OP_STORE value.  A new
OP_NOP value is used for operations which can complete immediately
without changing any dcache state (currently a dcbt/dcbtst that would
cause an access exception or that targets a non-cacheable page, or a
dcbf that misses the cache).

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/434/head
Paul Mackerras 4 years ago
parent b181d28df2
commit ba4614c5f4

@ -603,6 +603,8 @@ package common is
hold : std_ulogic;
load : std_ulogic; -- is this a load
dcbz : std_ulogic;
flush : std_ulogic;
touch : std_ulogic;
nc : std_ulogic;
reserve : std_ulogic;
atomic_qw : std_ulogic; -- part of a quadword atomic op
@ -614,6 +616,9 @@ package common is
data : std_ulogic_vector(63 downto 0); -- valid the cycle after .valid = 1
byte_sel : std_ulogic_vector(7 downto 0);
end record;
-- Default (all-zero) value for a Loadstore1ToDcacheType request record:
-- the addr and data vectors and byte_sel are zeroed explicitly, and every
-- remaining field is set to '0' through the "others" association.
constant Loadstore1ToDcacheInit : Loadstore1ToDcacheType :=
(addr => (others => '0'), data => (others => '0'), byte_sel => x"00",
others => '0');

type DcacheToLoadstore1Type is record
valid : std_ulogic;

@ -187,15 +187,17 @@ architecture rtl of dcache is
OP_LOAD_HIT, -- Cache hit on load
OP_LOAD_MISS, -- Load missing cache
OP_LOAD_NC, -- Non-cachable load
OP_STORE_HIT, -- Store hitting cache
OP_STORE_MISS); -- Store missing cache
OP_STORE, -- Store, whether hitting or missing cache
OP_NOP, -- nothing to do, just complete the op
OP_MISC); -- Flush

-- Cache state machine
type state_t is (IDLE, -- Normal load hit processing
RELOAD_WAIT_ACK, -- Cache reload wait ack
STORE_WAIT_ACK, -- Store wait ack
NC_LOAD_WAIT_ACK, -- Non-cachable load wait ack
DO_STCX); -- Check for stcx. validity
DO_STCX, -- Check for stcx. validity
FLUSH_CYCLE); -- Cycle for invalidating cache line

--
-- Dcache operations:
@ -289,12 +291,15 @@ architecture rtl of dcache is
op : op_t;
valid : std_ulogic;
dcbz : std_ulogic;
flush : std_ulogic;
touch : std_ulogic;
reserve : std_ulogic;
first_dw : std_ulogic;
last_dw : std_ulogic;
real_addr : real_addr_t;
data : std_ulogic_vector(63 downto 0);
byte_sel : std_ulogic_vector(7 downto 0);
is_hit : std_ulogic;
hit_way : way_t;
same_tag : std_ulogic;
mmu_req : std_ulogic;
@ -377,6 +382,7 @@ architecture rtl of dcache is
-- Async signals on incoming request
signal req_index : index_t;
signal req_hit_way : way_t;
signal req_is_hit : std_ulogic;
signal req_tag : cache_tag_t;
signal req_op : op_t;
signal req_data : std_ulogic_vector(63 downto 0);
@ -568,12 +574,9 @@ begin
assert (d_in.valid and m_in.valid) = '0' report
"request collision loadstore vs MMU";
if m_in.valid = '1' then
r.req := Loadstore1ToDcacheInit;
r.req.valid := '1';
r.req.load := not (m_in.tlbie or m_in.tlbld);
r.req.dcbz := '0';
r.req.nc := '0';
r.req.reserve := '0';
r.req.virt_mode := '0';
r.req.priv_mode := '1';
r.req.addr := m_in.addr;
r.req.data := m_in.pte;
@ -1077,13 +1080,17 @@ begin
-- since it will be by the time we perform the store.
-- For a load, check the appropriate row valid bit; but also,
-- if use_forward_rl is 1 then we can consider this a hit.
is_hit := not r0.req.load or r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or
-- For a touch, since the line we want is being reloaded already,
-- consider this a hit.
is_hit := not r0.req.load or r0.req.touch or
r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or
use_forward_rl;
hit_way := replace_way;
end if;

-- The way that matched on a hit
req_hit_way <= hit_way;
req_is_hit <= is_hit;

-- work out whether we have permission for this access
-- NB we don't yet implement AMR, thus no KUAP
@ -1098,17 +1105,32 @@ begin
nc := r0.req.nc or perm_attr.nocache;
op := OP_NONE;
if go = '1' then
if access_ok = '0' then
if r0.req.touch = '1' then
if access_ok = '1' and is_hit = '0' and nc = '0' then
op := OP_LOAD_MISS;
elsif access_ok = '1' and is_hit = '1' and nc = '0' then
-- Make this OP_LOAD_HIT so the PLRU gets updated
op := OP_LOAD_HIT;
else
op := OP_NOP;
end if;
elsif access_ok = '0' then
op := OP_BAD;
elsif r0.req.flush = '1' then
if is_hit = '0' then
op := OP_NOP;
else
op := OP_MISC;
end if;
else
opsel := r0.req.load & nc & is_hit;
case opsel is
when "101" => op := OP_LOAD_HIT;
when "100" => op := OP_LOAD_MISS;
when "110" => op := OP_LOAD_NC;
when "001" => op := OP_STORE_HIT;
when "000" => op := OP_STORE_MISS;
when "010" => op := OP_STORE_MISS;
when "001" => op := OP_STORE;
when "000" => op := OP_STORE;
when "010" => op := OP_STORE;
when "011" => op := OP_BAD;
when "111" => op := OP_BAD;
when others => op := OP_NONE;
@ -1348,8 +1370,8 @@ begin
end if;

-- The cache hit indication is used for PLRU updates
if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then
r1.cache_hit <= '1';
if req_op = OP_LOAD_HIT or req_op = OP_STORE then
r1.cache_hit <= req_is_hit;
else
r1.cache_hit <= '0';
end if;
@ -1430,7 +1452,7 @@ begin
r1.ls_valid <= '0';
-- complete tlbies and TLB loads in the third cycle
r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld);
if req_op = OP_LOAD_HIT then
if req_op = OP_LOAD_HIT or req_op = OP_NOP then
if r0.mmu_req = '0' then
r1.ls_valid <= '1';
else
@ -1446,7 +1468,7 @@ begin
if req_go = '1' and access_ok = '1' and r0.req.load = '1' and
r0.req.reserve = '1' and r0.req.atomic_first = '1' then
reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS);
if req_op = OP_LOAD_HIT then
if req_is_hit = '1' then
reservation.valid <= not req_snoop_hit;
end if;
end if;
@ -1485,6 +1507,8 @@ begin
req.valid := req_go;
req.mmu_req := r0.mmu_req;
req.dcbz := r0.req.dcbz;
req.flush := r0.req.flush;
req.touch := r0.req.touch;
req.reserve := r0.req.reserve;
req.first_dw := r0.req.atomic_first;
req.last_dw := r0.req.atomic_last;
@ -1504,12 +1528,13 @@ begin
req.byte_sel := r0.req.byte_sel;
end if;
req.hit_way := req_hit_way;
req.is_hit := req_is_hit;
req.same_tag := req_same_tag;

-- Store the incoming request from r0, if it is a slow request
-- Note that r1.full = 1 implies req_op = OP_NONE
if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC or
req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then
req_op = OP_STORE or req_op = OP_MISC then
r1.req <= req;
r1.full <= '1';
end if;
@ -1523,7 +1548,7 @@ begin
r1.victim_way <= plru_victim;
report "victim way:" & to_hstring(plru_victim);
end if;
if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then
if req_op = OP_LOAD_MISS or (r0.req.dcbz = '1' and req_is_hit = '0') then
r1.choose_victim <= '1';
end if;

@ -1555,7 +1580,7 @@ begin
r1.reload_tag <= get_tag(req.real_addr);
r1.req.same_tag <= '1';

if req.op = OP_STORE_HIT then
if req.is_hit = '1' then
r1.store_way <= req.hit_way;
end if;

@ -1585,13 +1610,20 @@ begin
r1.write_tag <= '1';
ev.load_miss <= '1';

-- If this is a touch, complete the instruction
if req.touch = '1' then
r1.full <= '0';
r1.slow_valid <= '1';
r1.ls_valid <= '1';
end if;

when OP_LOAD_NC =>
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.wb.we <= '0';
r1.state <= NC_LOAD_WAIT_ACK;

when OP_STORE_HIT | OP_STORE_MISS =>
when OP_STORE =>
if req.reserve = '1' then
-- stcx needs to wait until next cycle
-- for the reservation address check
@ -1605,9 +1637,7 @@ begin
else
r1.mmu_done <= '1';
end if;
if req.op = OP_STORE_HIT then
r1.write_bram <= '1';
end if;
r1.write_bram <= req.is_hit;
r1.wb.we <= '1';
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
@ -1615,21 +1645,24 @@ begin
-- dcbz is handled much like a load miss except
-- that we are writing to memory instead of reading
r1.state <= RELOAD_WAIT_ACK;
if req.op = OP_STORE_MISS then
r1.write_tag <= '1';
end if;
r1.write_tag <= not req.is_hit;
r1.wb.we <= '1';
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
end if;
if req.op = OP_STORE_MISS then
ev.store_miss <= '1';
if req.op = OP_STORE then
ev.store_miss <= not req.is_hit;
end if;

when OP_MISC =>
r1.state <= FLUSH_CYCLE;

-- OP_NONE and OP_BAD do nothing
-- OP_BAD was handled above already
-- OP_BAD & OP_NOP were handled above already
when OP_NONE =>
when OP_BAD =>
when OP_NOP =>

end case;

when RELOAD_WAIT_ACK =>
@ -1712,14 +1745,12 @@ begin
end if;
assert not is_X(acks);
if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and
(req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then
req.op = OP_STORE then
r1.wb.stb <= '1';
stbs_done := false;
r1.store_way <= req.hit_way;
r1.store_row <= get_row(req.real_addr);
if req.op = OP_STORE_HIT then
r1.write_bram <= '1';
end if;
r1.write_bram <= req.is_hit;
r1.full <= '0';
r1.slow_valid <= '1';
-- Store requests never come from the MMU
@ -1783,9 +1814,7 @@ begin
if wishbone_in.stall = '0' then
-- Store has been accepted, so now we can write the
-- cache data RAM
if r1.req.op = OP_STORE_HIT then
r1.write_bram <= '1';
end if;
r1.write_bram <= req.is_hit;
r1.wb.stb <= '0';
end if;
if wishbone_in.ack = '1' then
@ -1802,6 +1831,12 @@ begin
end if;
end if;

when FLUSH_CYCLE =>
cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '0';
r1.full <= '0';
r1.slow_valid <= '1';
r1.ls_valid <= '1';
r1.state <= IDLE;
end case;
end if;
end if;

@ -129,10 +129,10 @@ architecture behaviour of decode1 is
INSN_crorc => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_crxor => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_darn => (ALU, NONE, OP_DARN, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_dcbf => (ALU, NONE, OP_DCBF, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_dcbf => (LDST, NONE, OP_DCBF, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_dcbst => (ALU, NONE, OP_DCBST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_dcbt => (ALU, NONE, OP_XCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_dcbtst => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_dcbt => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_dcbtst => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_dcbz => (LDST, NONE, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_divd => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RCOE, '0', '0', NONE),
INSN_divde => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RCOE, '0', '0', NONE),
@ -200,7 +200,7 @@ architecture behaviour of decode1 is
INSN_ftdiv => (FPU, FPU, OP_FP_CMP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_ftsqrt => (FPU, FPU, OP_FP_CMP, NONE, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_icbi => (ALU, NONE, OP_ICBI, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE),
INSN_icbt => (ALU, NONE, OP_XCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_icbt => (ALU, NONE, OP_ICBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_isel => (ALU, NONE, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_isync => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
INSN_lbarx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE),

@ -7,8 +7,9 @@ package decode_types is
OP_BCD, OP_BPERM, OP_BREV,
OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
OP_COUNTB, OP_CROP,
OP_DARN, OP_DCBF, OP_DCBST, OP_XCBT, OP_DCBTST,
OP_DCBZ, OP_ICBI,
OP_DARN, OP_DCBF, OP_DCBST, OP_DCBZ,
OP_SPARE,
OP_ICBI, OP_ICBT,
OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC,
OP_DIV, OP_DIVE, OP_MOD,
OP_EXTS, OP_EXTSWSLI,

@ -1184,7 +1184,7 @@ begin
else
illegal := '1';
end if;
when OP_NOP | OP_DCBF | OP_DCBST | OP_XCBT | OP_DCBTST =>
when OP_NOP | OP_DCBST | OP_ICBT =>
-- Do nothing
when OP_ADD =>
if e_in.output_carry = '1' then
@ -1653,11 +1653,10 @@ begin
v.e.srr1 := (others => '0');
v.e.srr1(47 - 33) := '1';
v.e.srr1(47 - 34) := ex1.prev_prefixed;
if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or
ex1.prev_op = OP_XCBT or ex1.prev_op = OP_DCBST or ex1.prev_op = OP_DCBF then
if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or ex1.prev_op = OP_ICBT or
ex1.prev_op = OP_DCBF then
v.e.srr1(47 - 35) := '1';
elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ or
ex1.prev_op = OP_DCBTST then
elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ then
v.e.srr1(47 - 36) := '1';
end if;


@ -61,6 +61,8 @@ architecture behave of loadstore1 is
dc_req : std_ulogic;
load : std_ulogic;
store : std_ulogic;
flush : std_ulogic;
touch : std_ulogic;
tlbie : std_ulogic;
dcbz : std_ulogic;
read_spr : std_ulogic;
@ -100,7 +102,8 @@ architecture behave of loadstore1 is
two_dwords : std_ulogic;
incomplete : std_ulogic;
end record;
constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0',
constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0',
flush => '0', touch => '0', tlbie => '0',
dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0',
instr_fault => '0', do_update => '0',
mode_32bit => '0', prefixed => '0',
@ -470,7 +473,7 @@ begin
addr_mask := std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1);

-- Do length_to_sel and work out if we are doing 2 dwords
long_sel := xfer_data_sel(v.length, addr(2 downto 0));
long_sel := xfer_data_sel(l_in.length, addr(2 downto 0));
v.byte_sel := long_sel(7 downto 0);
v.second_bytes := long_sel(15 downto 8);
if long_sel(15 downto 8) /= "00000000" then
@ -505,6 +508,9 @@ begin
case l_in.op is
when OP_STORE =>
v.store := '1';
if l_in.length = "0000" then
v.touch := '1';
end if;
when OP_LOAD =>
if l_in.update = '0' or l_in.second = '0' then
v.load := '1';
@ -512,10 +518,16 @@ begin
-- Allow an extra cycle for SP->DP precision conversion
v.load_sp := '1';
end if;
if l_in.length = "0000" then
v.touch := '1';
end if;
else
-- write back address to RA
v.do_update := '1';
end if;
when OP_DCBF =>
v.load := '1';
v.flush := '1';
when OP_DCBZ =>
v.dcbz := '1';
v.align_intr := v.nc;
@ -541,7 +553,7 @@ begin
-- Work out controls for load and store formatting
brev_lenm1 := "000";
if v.byte_reverse = '1' then
brev_lenm1 := unsigned(v.length(2 downto 0)) - 1;
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
v.brev_mask := brev_lenm1;

@ -882,7 +894,8 @@ begin

if d_in.valid = '1' then
if r2.req.incomplete = '0' then
write_enable := r2.req.load and not r2.req.load_sp;
write_enable := r2.req.load and not r2.req.load_sp and
not r2.req.flush and not r2.req.touch;
-- stores write back rA update
do_update := r2.req.update and r2.req.store;
end if;
@ -977,6 +990,8 @@ begin
d_out.valid <= stage1_dcreq;
d_out.load <= stage1_req.load;
d_out.dcbz <= stage1_req.dcbz;
d_out.flush <= stage1_req.flush;
d_out.touch <= stage1_req.touch;
d_out.nc <= stage1_req.nc;
d_out.reserve <= stage1_req.reserve;
d_out.atomic_qw <= stage1_req.atomic_qw;
@ -990,6 +1005,8 @@ begin
d_out.valid <= req;
d_out.load <= r2.req.load;
d_out.dcbz <= r2.req.dcbz;
d_out.flush <= r2.req.flush;
d_out.touch <= r2.req.touch;
d_out.nc <= r2.req.nc;
d_out.reserve <= r2.req.reserve;
d_out.atomic_qw <= r2.req.atomic_qw;

Loading…
Cancel
Save