From ba4614c5f4cd6fa56079151a6be44f92790e0b2b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 11 May 2021 20:26:09 +1000 Subject: [PATCH] dcache: Implement data cache touch and flush instructions This implements dcbf, dcbt and dcbtst in the dcache. The dcbst (data cache block store) instruction remains a no-op because our dcache is write-through and therefore never has modified data that could need to be written back. Dcbt (data cache block touch) and dcbtst (data cache block touch for store) behave similarly except that dcbtst is a no-op on a read-only page. Neither instruction ever causes an interrupt. If they miss in the cache and the page is cacheable, they are handled like a load miss except that they complete as soon as the state machine starts handling the load miss, rather than waiting for any data. Dcbf (data cache block flush) can cause a data storage interrupt. If it hits in the cache, the state machine goes to a new FLUSH_CYCLE state in which the cache line valid bit is cleared. In order to avoid having more than 8 values in op_t, this combines OP_STORE_MISS and OP_STORE_HIT into a single OP_STORE value. A new OP_NOP value is used for operations which can complete immediately without changing any dcache state (now used for dcbt/dcbtst that would cause an access exception or that target a non-cacheable page, or dcbf that misses the cache). 
Signed-off-by: Paul Mackerras --- common.vhdl | 5 +++ dcache.vhdl | 109 ++++++++++++++++++++++++++++++---------------- decode1.vhdl | 8 ++-- decode_types.vhdl | 5 ++- execute1.vhdl | 11 +++-- loadstore1.vhdl | 25 +++++++++-- 6 files changed, 110 insertions(+), 53 deletions(-) diff --git a/common.vhdl b/common.vhdl index 6df5b6b..b1a2c8e 100644 --- a/common.vhdl +++ b/common.vhdl @@ -603,6 +603,8 @@ package common is hold : std_ulogic; load : std_ulogic; -- is this a load dcbz : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; nc : std_ulogic; reserve : std_ulogic; atomic_qw : std_ulogic; -- part of a quadword atomic op @@ -614,6 +616,9 @@ package common is data : std_ulogic_vector(63 downto 0); -- valid the cycle after .valid = 1 byte_sel : std_ulogic_vector(7 downto 0); end record; + constant Loadstore1ToDcacheInit : Loadstore1ToDcacheType := + (addr => (others => '0'), data => (others => '0'), byte_sel => x"00", + others => '0'); type DcacheToLoadstore1Type is record valid : std_ulogic; diff --git a/dcache.vhdl b/dcache.vhdl index 68f3b60..82ae791 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -187,15 +187,17 @@ architecture rtl of dcache is OP_LOAD_HIT, -- Cache hit on load OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable load - OP_STORE_HIT, -- Store hitting cache - OP_STORE_MISS); -- Store missing cache - + OP_STORE, -- Store, whether hitting or missing cache + OP_NOP, -- nothing to do, just complete the op + OP_MISC); -- Flush + -- Cache state machine type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK, -- Non-cachable load wait ack - DO_STCX); -- Check for stcx. validity + DO_STCX, -- Check for stcx. 
validity + FLUSH_CYCLE); -- Cycle for invalidating cache line -- -- Dcache operations: @@ -289,12 +291,15 @@ architecture rtl of dcache is op : op_t; valid : std_ulogic; dcbz : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; reserve : std_ulogic; first_dw : std_ulogic; last_dw : std_ulogic; real_addr : real_addr_t; data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); + is_hit : std_ulogic; hit_way : way_t; same_tag : std_ulogic; mmu_req : std_ulogic; @@ -377,6 +382,7 @@ architecture rtl of dcache is -- Async signals on incoming request signal req_index : index_t; signal req_hit_way : way_t; + signal req_is_hit : std_ulogic; signal req_tag : cache_tag_t; signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); @@ -568,12 +574,9 @@ begin assert (d_in.valid and m_in.valid) = '0' report "request collision loadstore vs MMU"; if m_in.valid = '1' then + r.req := Loadstore1ToDcacheInit; r.req.valid := '1'; r.req.load := not (m_in.tlbie or m_in.tlbld); - r.req.dcbz := '0'; - r.req.nc := '0'; - r.req.reserve := '0'; - r.req.virt_mode := '0'; r.req.priv_mode := '1'; r.req.addr := m_in.addr; r.req.data := m_in.pte; @@ -1077,13 +1080,17 @@ begin -- since it will be by the time we perform the store. -- For a load, check the appropriate row valid bit; but also, -- if use_forward_rl is 1 then we can consider this a hit. - is_hit := not r0.req.load or r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or + -- For a touch, since the line we want is being reloaded already, + -- consider this a hit. 
+ is_hit := not r0.req.load or r0.req.touch or + r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or use_forward_rl; hit_way := replace_way; end if; -- The way that matched on a hit req_hit_way <= hit_way; + req_is_hit <= is_hit; -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP @@ -1098,17 +1105,32 @@ begin nc := r0.req.nc or perm_attr.nocache; op := OP_NONE; if go = '1' then - if access_ok = '0' then + if r0.req.touch = '1' then + if access_ok = '1' and is_hit = '0' and nc = '0' then + op := OP_LOAD_MISS; + elsif access_ok = '1' and is_hit = '1' and nc = '0' then + -- Make this OP_LOAD_HIT so the PLRU gets updated + op := OP_LOAD_HIT; + else + op := OP_NOP; + end if; + elsif access_ok = '0' then op := OP_BAD; + elsif r0.req.flush = '1' then + if is_hit = '0' then + op := OP_NOP; + else + op := OP_MISC; + end if; else opsel := r0.req.load & nc & is_hit; case opsel is when "101" => op := OP_LOAD_HIT; when "100" => op := OP_LOAD_MISS; when "110" => op := OP_LOAD_NC; - when "001" => op := OP_STORE_HIT; - when "000" => op := OP_STORE_MISS; - when "010" => op := OP_STORE_MISS; + when "001" => op := OP_STORE; + when "000" => op := OP_STORE; + when "010" => op := OP_STORE; when "011" => op := OP_BAD; when "111" => op := OP_BAD; when others => op := OP_NONE; @@ -1348,8 +1370,8 @@ begin end if; -- The cache hit indication is used for PLRU updates - if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then - r1.cache_hit <= '1'; + if req_op = OP_LOAD_HIT or req_op = OP_STORE then + r1.cache_hit <= req_is_hit; else r1.cache_hit <= '0'; end if; @@ -1430,7 +1452,7 @@ begin r1.ls_valid <= '0'; -- complete tlbies and TLB loads in the third cycle r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); - if req_op = OP_LOAD_HIT then + if req_op = OP_LOAD_HIT or req_op = OP_NOP then if r0.mmu_req = '0' then r1.ls_valid <= '1'; else @@ -1446,7 +1468,7 @@ begin if req_go = '1' and access_ok = '1' and r0.req.load = '1' and 
r0.req.reserve = '1' and r0.req.atomic_first = '1' then reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); - if req_op = OP_LOAD_HIT then + if req_is_hit = '1' then reservation.valid <= not req_snoop_hit; end if; end if; @@ -1485,6 +1507,8 @@ begin req.valid := req_go; req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; + req.flush := r0.req.flush; + req.touch := r0.req.touch; req.reserve := r0.req.reserve; req.first_dw := r0.req.atomic_first; req.last_dw := r0.req.atomic_last; @@ -1504,12 +1528,13 @@ begin req.byte_sel := r0.req.byte_sel; end if; req.hit_way := req_hit_way; + req.is_hit := req_is_hit; req.same_tag := req_same_tag; -- Store the incoming request from r0, if it is a slow request -- Note that r1.full = 1 implies req_op = OP_NONE if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC or - req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then + req_op = OP_STORE or req_op = OP_MISC then r1.req <= req; r1.full <= '1'; end if; @@ -1523,7 +1548,7 @@ begin r1.victim_way <= plru_victim; report "victim way:" & to_hstring(plru_victim); end if; - if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then + if req_op = OP_LOAD_MISS or (r0.req.dcbz = '1' and req_is_hit = '0') then r1.choose_victim <= '1'; end if; @@ -1555,7 +1580,7 @@ begin r1.reload_tag <= get_tag(req.real_addr); r1.req.same_tag <= '1'; - if req.op = OP_STORE_HIT then + if req.is_hit = '1' then r1.store_way <= req.hit_way; end if; @@ -1585,13 +1610,20 @@ begin r1.write_tag <= '1'; ev.load_miss <= '1'; + -- If this is a touch, complete the instruction + if req.touch = '1' then + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + end if; + when OP_LOAD_NC => r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; - when OP_STORE_HIT | OP_STORE_MISS => + when OP_STORE => if req.reserve = '1' then -- stcx needs to wait until next cycle -- for the reservation address check @@ -1605,9 +1637,7 @@ begin else r1.mmu_done <= '1'; end if; - 
if req.op = OP_STORE_HIT then - r1.write_bram <= '1'; - end if; + r1.write_bram <= req.is_hit; r1.wb.we <= '1'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -1615,21 +1645,24 @@ begin -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading r1.state <= RELOAD_WAIT_ACK; - if req.op = OP_STORE_MISS then - r1.write_tag <= '1'; - end if; + r1.write_tag <= not req.is_hit; r1.wb.we <= '1'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; end if; - if req.op = OP_STORE_MISS then - ev.store_miss <= '1'; + if req.op = OP_STORE then + ev.store_miss <= not req.is_hit; end if; + when OP_MISC => + r1.state <= FLUSH_CYCLE; + -- OP_NONE and OP_BAD do nothing - -- OP_BAD was handled above already + -- OP_BAD & OP_NOP were handled above already when OP_NONE => when OP_BAD => + when OP_NOP => + end case; when RELOAD_WAIT_ACK => @@ -1712,14 +1745,12 @@ begin end if; assert not is_X(acks); if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and - (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then + req.op = OP_STORE then r1.wb.stb <= '1'; stbs_done := false; r1.store_way <= req.hit_way; r1.store_row <= get_row(req.real_addr); - if req.op = OP_STORE_HIT then - r1.write_bram <= '1'; - end if; + r1.write_bram <= req.is_hit; r1.full <= '0'; r1.slow_valid <= '1'; -- Store requests never come from the MMU @@ -1783,9 +1814,7 @@ begin if wishbone_in.stall = '0' then -- Store has been accepted, so now we can write the -- cache data RAM - if r1.req.op = OP_STORE_HIT then - r1.write_bram <= '1'; - end if; + r1.write_bram <= req.is_hit; r1.wb.stb <= '0'; end if; if wishbone_in.ack = '1' then @@ -1802,6 +1831,12 @@ begin end if; end if; + when FLUSH_CYCLE => + cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '0'; + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + r1.state <= IDLE; end case; end if; end if; diff --git a/decode1.vhdl b/decode1.vhdl index 643523b..9047cf8 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -129,10 +129,10 
@@ architecture behaviour of decode1 is INSN_crorc => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_crxor => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_darn => (ALU, NONE, OP_DARN, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbf => (ALU, NONE, OP_DCBF, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbf => (LDST, NONE, OP_DCBF, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbst => (ALU, NONE, OP_DCBST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbt => (ALU, NONE, OP_XCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbtst => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbt => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbtst => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbz => (LDST, NONE, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_divd => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RCOE, '0', '0', NONE), INSN_divde => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RCOE, '0', '0', NONE), @@ 
-200,7 +200,7 @@ architecture behaviour of decode1 is INSN_ftdiv => (FPU, FPU, OP_FP_CMP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_ftsqrt => (FPU, FPU, OP_FP_CMP, NONE, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_icbi => (ALU, NONE, OP_ICBI, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), - INSN_icbt => (ALU, NONE, OP_XCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_icbt => (ALU, NONE, OP_ICBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_isel => (ALU, NONE, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_isync => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_lbarx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), diff --git a/decode_types.vhdl b/decode_types.vhdl index 03e958b..5695643 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -7,8 +7,9 @@ package decode_types is OP_BCD, OP_BPERM, OP_BREV, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB, OP_COUNTB, OP_CROP, - OP_DARN, OP_DCBF, OP_DCBST, OP_XCBT, OP_DCBTST, - OP_DCBZ, OP_ICBI, + OP_DARN, OP_DCBF, OP_DCBST, OP_DCBZ, + OP_SPARE, + OP_ICBI, OP_ICBT, OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC, OP_DIV, OP_DIVE, OP_MOD, OP_EXTS, OP_EXTSWSLI, diff --git a/execute1.vhdl b/execute1.vhdl index ecb1e63..84a6fbe 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1184,8 +1184,8 @@ begin else illegal := '1'; end if; - when OP_NOP | OP_DCBF | OP_DCBST | OP_XCBT | OP_DCBTST => - -- Do nothing + when 
OP_NOP | OP_DCBST | OP_ICBT => + -- Do nothing when OP_ADD => if e_in.output_carry = '1' then if e_in.input_carry /= OV then @@ -1653,11 +1653,10 @@ begin v.e.srr1 := (others => '0'); v.e.srr1(47 - 33) := '1'; v.e.srr1(47 - 34) := ex1.prev_prefixed; - if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or - ex1.prev_op = OP_XCBT or ex1.prev_op = OP_DCBST or ex1.prev_op = OP_DCBF then + if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or ex1.prev_op = OP_ICBT or + ex1.prev_op = OP_DCBF then v.e.srr1(47 - 35) := '1'; - elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ or - ex1.prev_op = OP_DCBTST then + elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ then v.e.srr1(47 - 36) := '1'; end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index e69a27e..69d053d 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -61,6 +61,8 @@ architecture behave of loadstore1 is dc_req : std_ulogic; load : std_ulogic; store : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; read_spr : std_ulogic; @@ -100,7 +102,8 @@ architecture behave of loadstore1 is two_dwords : std_ulogic; incomplete : std_ulogic; end record; - constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0', + constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', + flush => '0', touch => '0', tlbie => '0', dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0', instr_fault => '0', do_update => '0', mode_32bit => '0', prefixed => '0', @@ -470,7 +473,7 @@ begin addr_mask := std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1); -- Do length_to_sel and work out if we are doing 2 dwords - long_sel := xfer_data_sel(v.length, addr(2 downto 0)); + long_sel := xfer_data_sel(l_in.length, addr(2 downto 0)); v.byte_sel := long_sel(7 downto 0); v.second_bytes := long_sel(15 downto 8); if long_sel(15 downto 8) /= "00000000" then @@ -505,6 +508,9 @@ begin case l_in.op is when 
OP_STORE => v.store := '1'; + if l_in.length = "0000" then + v.touch := '1'; + end if; when OP_LOAD => if l_in.update = '0' or l_in.second = '0' then v.load := '1'; @@ -512,10 +518,16 @@ begin -- Allow an extra cycle for SP->DP precision conversion v.load_sp := '1'; end if; + if l_in.length = "0000" then + v.touch := '1'; + end if; else -- write back address to RA v.do_update := '1'; end if; + when OP_DCBF => + v.load := '1'; + v.flush := '1'; when OP_DCBZ => v.dcbz := '1'; v.align_intr := v.nc; @@ -541,7 +553,7 @@ begin -- Work out controls for load and store formatting brev_lenm1 := "000"; if v.byte_reverse = '1' then - brev_lenm1 := unsigned(v.length(2 downto 0)) - 1; + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; end if; v.brev_mask := brev_lenm1; @@ -882,7 +894,8 @@ begin if d_in.valid = '1' then if r2.req.incomplete = '0' then - write_enable := r2.req.load and not r2.req.load_sp; + write_enable := r2.req.load and not r2.req.load_sp and + not r2.req.flush and not r2.req.touch; -- stores write back rA update do_update := r2.req.update and r2.req.store; end if; @@ -977,6 +990,8 @@ begin d_out.valid <= stage1_dcreq; d_out.load <= stage1_req.load; d_out.dcbz <= stage1_req.dcbz; + d_out.flush <= stage1_req.flush; + d_out.touch <= stage1_req.touch; d_out.nc <= stage1_req.nc; d_out.reserve <= stage1_req.reserve; d_out.atomic_qw <= stage1_req.atomic_qw; @@ -990,6 +1005,8 @@ begin d_out.valid <= req; d_out.load <= r2.req.load; d_out.dcbz <= r2.req.dcbz; + d_out.flush <= r2.req.flush; + d_out.touch <= r2.req.touch; d_out.nc <= r2.req.nc; d_out.reserve <= r2.req.reserve; d_out.atomic_qw <= r2.req.atomic_qw;