diff --git a/common.vhdl b/common.vhdl index 6df5b6b..b1a2c8e 100644 --- a/common.vhdl +++ b/common.vhdl @@ -603,6 +603,8 @@ package common is hold : std_ulogic; load : std_ulogic; -- is this a load dcbz : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; nc : std_ulogic; reserve : std_ulogic; atomic_qw : std_ulogic; -- part of a quadword atomic op @@ -614,6 +616,9 @@ package common is data : std_ulogic_vector(63 downto 0); -- valid the cycle after .valid = 1 byte_sel : std_ulogic_vector(7 downto 0); end record; + constant Loadstore1ToDcacheInit : Loadstore1ToDcacheType := + (addr => (others => '0'), data => (others => '0'), byte_sel => x"00", + others => '0'); type DcacheToLoadstore1Type is record valid : std_ulogic; diff --git a/dcache.vhdl b/dcache.vhdl index 68f3b60..82ae791 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -187,15 +187,17 @@ architecture rtl of dcache is OP_LOAD_HIT, -- Cache hit on load OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable load - OP_STORE_HIT, -- Store hitting cache - OP_STORE_MISS); -- Store missing cache - + OP_STORE, -- Store, whether hitting or missing cache + OP_NOP, -- nothing to do, just complete the op + OP_MISC); -- Flush + -- Cache state machine type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack STORE_WAIT_ACK, -- Store wait ack NC_LOAD_WAIT_ACK, -- Non-cachable load wait ack - DO_STCX); -- Check for stcx. validity + DO_STCX, -- Check for stcx. validity + FLUSH_CYCLE); -- Cycle for invalidating cache line -- -- Dcache operations: @@ -289,12 +291,15 @@ architecture rtl of dcache is op : op_t; valid : std_ulogic; dcbz : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; reserve : std_ulogic; first_dw : std_ulogic; last_dw : std_ulogic; real_addr : real_addr_t; data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); + is_hit : std_ulogic; hit_way : way_t; same_tag : std_ulogic; mmu_req : std_ulogic; @@ -377,6 +382,7 @@ architecture rtl of dcache is -- Async signals on incoming request signal req_index : index_t; signal req_hit_way : way_t; + signal req_is_hit : std_ulogic; signal req_tag : cache_tag_t; signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); @@ -568,12 +574,9 @@ begin assert (d_in.valid and m_in.valid) = '0' report "request collision loadstore vs MMU"; if m_in.valid = '1' then + r.req := Loadstore1ToDcacheInit; r.req.valid := '1'; r.req.load := not (m_in.tlbie or m_in.tlbld); - r.req.dcbz := '0'; - r.req.nc := '0'; - r.req.reserve := '0'; - r.req.virt_mode := '0'; r.req.priv_mode := '1'; r.req.addr := m_in.addr; r.req.data := m_in.pte; @@ -1077,13 +1080,17 @@ begin -- since it will be by the time we perform the store. -- For a load, check the appropriate row valid bit; but also, -- if use_forward_rl is 1 then we can consider this a hit. - is_hit := not r0.req.load or r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or + -- For a touch, since the line we want is being reloaded already, + -- consider this a hit. + is_hit := not r0.req.load or r0.req.touch or + r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or use_forward_rl; hit_way := replace_way; end if; -- The way that matched on a hit req_hit_way <= hit_way; + req_is_hit <= is_hit; -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP @@ -1098,17 +1105,32 @@ begin nc := r0.req.nc or perm_attr.nocache; op := OP_NONE; if go = '1' then - if access_ok = '0' then + if r0.req.touch = '1' then + if access_ok = '1' and is_hit = '0' and nc = '0' then + op := OP_LOAD_MISS; + elsif access_ok = '1' and is_hit = '1' and nc = '0' then + -- Make this OP_LOAD_HIT so the PLRU gets updated + op := OP_LOAD_HIT; + else + op := OP_NOP; + end if; + elsif access_ok = '0' then op := OP_BAD; + elsif r0.req.flush = '1' then + if is_hit = '0' then + op := OP_NOP; + else + op := OP_MISC; + end if; else opsel := r0.req.load & nc & is_hit; case opsel is when "101" => op := OP_LOAD_HIT; when "100" => op := OP_LOAD_MISS; when "110" => op := OP_LOAD_NC; - when "001" => op := OP_STORE_HIT; - when "000" => op := OP_STORE_MISS; - when "010" => op := OP_STORE_MISS; + when "001" => op := OP_STORE; + when "000" => op := OP_STORE; + when "010" => op := OP_STORE; when "011" => op := OP_BAD; when "111" => op := OP_BAD; when others => op := OP_NONE; @@ -1348,8 +1370,8 @@ begin end if; -- The cache hit indication is used for PLRU updates - if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then - r1.cache_hit <= '1'; + if req_op = OP_LOAD_HIT or req_op = OP_STORE then + r1.cache_hit <= req_is_hit; else r1.cache_hit <= '0'; end if; @@ -1430,7 +1452,7 @@ begin r1.ls_valid <= '0'; -- complete tlbies and TLB loads in the third cycle r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); - if req_op = OP_LOAD_HIT then + if req_op = OP_LOAD_HIT or req_op = OP_NOP then if r0.mmu_req = '0' then r1.ls_valid <= '1'; else @@ -1446,7 +1468,7 @@ begin if req_go = '1' and access_ok = '1' and r0.req.load = '1' and r0.req.reserve = '1' and r0.req.atomic_first = '1' then reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); - if req_op = OP_LOAD_HIT then + if req_is_hit = '1' then reservation.valid <= not req_snoop_hit; end if; end if; @@ -1485,6 +1507,8 @@ begin req.valid := req_go; req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; + req.flush := r0.req.flush; + req.touch := r0.req.touch; req.reserve := r0.req.reserve; req.first_dw := r0.req.atomic_first; req.last_dw := r0.req.atomic_last; @@ -1504,12 +1528,13 @@ begin req.byte_sel := r0.req.byte_sel; end if; req.hit_way := req_hit_way; + req.is_hit := req_is_hit; req.same_tag := req_same_tag; -- Store the incoming request from r0, if it is a slow request -- Note that r1.full = 1 implies req_op = OP_NONE if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC or - req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then + req_op = OP_STORE or req_op = OP_MISC then r1.req <= req; r1.full <= '1'; end if; @@ -1523,7 +1548,7 @@ begin r1.victim_way <= plru_victim; report "victim way:" & to_hstring(plru_victim); end if; - if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then + if req_op = OP_LOAD_MISS or (r0.req.dcbz = '1' and req_is_hit = '0') then r1.choose_victim <= '1'; end if; @@ -1555,7 +1580,7 @@ begin r1.reload_tag <= get_tag(req.real_addr); r1.req.same_tag <= '1'; - if req.op = OP_STORE_HIT then + if req.is_hit = '1' then r1.store_way <= req.hit_way; end if; @@ -1585,13 +1610,20 @@ begin r1.write_tag <= '1'; ev.load_miss <= '1'; + -- If this is a touch, complete the instruction + if req.touch = '1' then + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + end if; + when OP_LOAD_NC => r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; - when OP_STORE_HIT | OP_STORE_MISS => + when OP_STORE => if req.reserve = '1' then -- stcx needs to wait until next cycle -- for the reservation address check @@ -1605,9 +1637,7 @@ begin else r1.mmu_done <= '1'; end if; - if req.op = OP_STORE_HIT then - r1.write_bram <= '1'; - end if; + r1.write_bram <= req.is_hit; r1.wb.we <= '1'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -1615,21 +1645,24 @@ begin -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading r1.state <= RELOAD_WAIT_ACK; - if req.op = OP_STORE_MISS then - r1.write_tag <= '1'; - end if; + r1.write_tag <= not req.is_hit; r1.wb.we <= '1'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; end if; - if req.op = OP_STORE_MISS then - ev.store_miss <= '1'; + if req.op = OP_STORE then + ev.store_miss <= not req.is_hit; end if; + when OP_MISC => + r1.state <= FLUSH_CYCLE; + -- OP_NONE and OP_BAD do nothing - -- OP_BAD was handled above already + -- OP_BAD & OP_NOP were handled above already when OP_NONE => when OP_BAD => + when OP_NOP => + end case; when RELOAD_WAIT_ACK => @@ -1712,14 +1745,12 @@ begin end if; assert not is_X(acks); if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and - (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then + req.op = OP_STORE then r1.wb.stb <= '1'; stbs_done := false; r1.store_way <= req.hit_way; r1.store_row <= get_row(req.real_addr); - if req.op = OP_STORE_HIT then - r1.write_bram <= '1'; - end if; + r1.write_bram <= req.is_hit; r1.full <= '0'; r1.slow_valid <= '1'; -- Store requests never come from the MMU @@ -1783,9 +1814,7 @@ begin if wishbone_in.stall = '0' then -- Store has been accepted, so now we can write the -- cache data RAM - if r1.req.op = OP_STORE_HIT then - r1.write_bram <= '1'; - end if; + r1.write_bram <= req.is_hit; r1.wb.stb <= '0'; end if; if wishbone_in.ack = '1' then @@ -1802,6 +1831,12 @@ begin end if; end if; + when FLUSH_CYCLE => + cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '0'; + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + r1.state <= IDLE; end case; end if; end if; diff --git a/decode1.vhdl b/decode1.vhdl index 643523b..9047cf8 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -129,10 +129,10 @@ architecture behaviour of decode1 is INSN_crorc => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_crxor => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_darn => (ALU, NONE, OP_DARN, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbf => (ALU, NONE, OP_DCBF, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbf => (LDST, NONE, OP_DCBF, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbst => (ALU, NONE, OP_DCBST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbt => (ALU, NONE, OP_XCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbtst => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbt => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbtst => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbz => (LDST, NONE, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_divd => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RCOE, '0', '0', NONE), INSN_divde => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RCOE, '0', '0', NONE), @@ -200,7 +200,7 @@ architecture behaviour of decode1 is INSN_ftdiv => (FPU, FPU, OP_FP_CMP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_ftsqrt => (FPU, FPU, OP_FP_CMP, NONE, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_icbi => (ALU, NONE, OP_ICBI, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), - INSN_icbt => (ALU, NONE, OP_XCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_icbt => (ALU, NONE, OP_ICBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_isel => (ALU, NONE, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_isync => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_lbarx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), diff --git a/decode_types.vhdl b/decode_types.vhdl index 03e958b..5695643 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -7,8 +7,9 @@ package decode_types is OP_BCD, OP_BPERM, OP_BREV, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB, OP_COUNTB, OP_CROP, - OP_DARN, OP_DCBF, OP_DCBST, OP_XCBT, OP_DCBTST, - OP_DCBZ, OP_ICBI, + OP_DARN, OP_DCBF, OP_DCBST, OP_DCBZ, + OP_SPARE, + OP_ICBI, OP_ICBT, OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC, OP_DIV, OP_DIVE, OP_MOD, OP_EXTS, OP_EXTSWSLI, diff --git a/execute1.vhdl b/execute1.vhdl index ecb1e63..84a6fbe 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1184,8 +1184,8 @@ begin else illegal := '1'; end if; - when OP_NOP | OP_DCBF | OP_DCBST | OP_XCBT | OP_DCBTST => - -- Do nothing + when OP_NOP | OP_DCBST | OP_ICBT => + -- Do nothing when OP_ADD => if e_in.output_carry = '1' then if e_in.input_carry /= OV then @@ -1653,11 +1653,10 @@ begin v.e.srr1 := (others => '0'); v.e.srr1(47 - 33) := '1'; v.e.srr1(47 - 34) := ex1.prev_prefixed; - if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or - ex1.prev_op = OP_XCBT or ex1.prev_op = OP_DCBST or ex1.prev_op = OP_DCBF then + if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or ex1.prev_op = OP_ICBT or + ex1.prev_op = OP_DCBF then v.e.srr1(47 - 35) := '1'; - elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ or - ex1.prev_op = OP_DCBTST then + elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ then v.e.srr1(47 - 36) := '1'; end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index e69a27e..69d053d 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -61,6 +61,8 @@ architecture behave of loadstore1 is dc_req : std_ulogic; load : std_ulogic; store : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; read_spr : std_ulogic; @@ -100,7 +102,8 @@ architecture behave of loadstore1 is two_dwords : std_ulogic; incomplete : std_ulogic; end record; - constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0', + constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', + flush => '0', touch => '0', tlbie => '0', dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0', instr_fault => '0', do_update => '0', mode_32bit => '0', prefixed => '0', @@ -470,7 +473,7 @@ begin addr_mask := std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1); -- Do length_to_sel and work out if we are doing 2 dwords - long_sel := xfer_data_sel(v.length, addr(2 downto 0)); + long_sel := xfer_data_sel(l_in.length, addr(2 downto 0)); v.byte_sel := long_sel(7 downto 0); v.second_bytes := long_sel(15 downto 8); if long_sel(15 downto 8) /= "00000000" then @@ -505,6 +508,9 @@ begin case l_in.op is when OP_STORE => v.store := '1'; + if l_in.length = "0000" then + v.touch := '1'; + end if; when OP_LOAD => if l_in.update = '0' or l_in.second = '0' then v.load := '1'; @@ -512,10 +518,16 @@ begin -- Allow an extra cycle for SP->DP precision conversion v.load_sp := '1'; end if; + if l_in.length = "0000" then + v.touch := '1'; + end if; else -- write back address to RA v.do_update := '1'; end if; + when OP_DCBF => + v.load := '1'; + v.flush := '1'; when OP_DCBZ => v.dcbz := '1'; v.align_intr := v.nc; @@ -541,7 +553,7 @@ begin -- Work out controls for load and store formatting brev_lenm1 := "000"; if v.byte_reverse = '1' then - brev_lenm1 := unsigned(v.length(2 downto 0)) - 1; + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; end if; v.brev_mask := brev_lenm1; @@ -882,7 +894,8 @@ begin if d_in.valid = '1' then if r2.req.incomplete = '0' then - write_enable := r2.req.load and not r2.req.load_sp; + write_enable := r2.req.load and not r2.req.load_sp and + not r2.req.flush and not r2.req.touch; -- stores write back rA update do_update := r2.req.update and r2.req.store; end if; @@ -977,6 +990,8 @@ begin d_out.valid <= stage1_dcreq; d_out.load <= stage1_req.load; d_out.dcbz <= stage1_req.dcbz; + d_out.flush <= stage1_req.flush; + d_out.touch <= stage1_req.touch; d_out.nc <= stage1_req.nc; d_out.reserve <= stage1_req.reserve; d_out.atomic_qw <= stage1_req.atomic_qw; @@ -990,6 +1005,8 @@ begin d_out.valid <= req; d_out.load <= r2.req.load; d_out.dcbz <= r2.req.dcbz; + d_out.flush <= r2.req.flush; + d_out.touch <= r2.req.touch; d_out.nc <= r2.req.nc; d_out.reserve <= r2.req.reserve; d_out.atomic_qw <= r2.req.atomic_qw;