From 750b3a8e284ef721c101bbf82f88743ae2428459 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 6 Apr 2020 17:54:45 +1000 Subject: [PATCH] dcache: Implement data TLB This adds a TLB to dcache, providing the ability to translate addresses for loads and stores. No protection mechanism has been implemented yet. The MSR_DR bit controls whether addresses are translated through the TLB. The TLB is a fixed-pagesize, set-associative cache. Currently the page size is 4kB and the TLB is 2-way set associative with 64 entries per set. This implements the tlbie instruction. RB bits 10 and 11 control whether the whole TLB is invalidated (if either bit is 1) or just a single entry corresponding to the effective page number in bits 12-63 of RB. As an extension until we get a hardware page table walk, a tlbie instruction with RB bit 9 set to 1 and RB bits 10 and 11 both 0 (i.e. RB bits 11 downto 9 = 001) will load an entry into the TLB. The TLB entry value is in RS in the format of a radix PTE. Currently there is no proper handling of TLB misses. The load or store will not be performed but no interrupt is generated. In order to make timing at 100MHz on the Arty A7-100, we compare the real address from each way of the TLB with the tag from each way of the cache in parallel (requiring # TLB ways * # cache ways comparators). Then the result is selected based on which way hit in the TLB. That avoids a timing path going through the TLB EA comparators, the multiplexer that selects the RA, and the cache tag comparators. The hack where addresses of the form 0xc------- are marked as cache-inhibited is kept for now but restricted to real-mode accesses. 
Signed-off-by: Paul Mackerras --- common.vhdl | 10 +- dcache.vhdl | 398 +++++++++++++++++++++++++++++++++++++++------- dcache_tb.vhdl | 1 + decode1.vhdl | 1 + decode_types.vhdl | 2 +- execute1.vhdl | 2 + loadstore1.vhdl | 22 ++- 7 files changed, 371 insertions(+), 65 deletions(-) diff --git a/common.vhdl b/common.vhdl index d3d30e7..6741044 100644 --- a/common.vhdl +++ b/common.vhdl @@ -216,7 +216,7 @@ package common is type Execute1ToLoadstore1Type is record valid : std_ulogic; - op : insn_type_t; -- what ld/st or m[tf]spr to do + op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read @@ -231,18 +231,21 @@ package common is reserve : std_ulogic; -- set for larx/stcx. rc : std_ulogic; -- set for stcx. spr_num : spr_num_t; -- SPR number for mfspr/mtspr + virt_mode : std_ulogic; -- do translation through TLB end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, - reserve => '0', rc => '0', + reserve => '0', rc => '0', virt_mode => '0', spr_num => 0, others => (others => '0')); type Loadstore1ToDcacheType is record valid : std_ulogic; - load : std_ulogic; + load : std_ulogic; -- is this a load + tlbie : std_ulogic; -- is this a tlbie dcbz : std_ulogic; nc : std_ulogic; reserve : std_ulogic; + virt_mode : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); @@ -253,6 +256,7 @@ package common is data : std_ulogic_vector(63 downto 0); store_done : std_ulogic; error : std_ulogic; + tlb_miss : std_ulogic; end record; type Loadstore1ToWritebackType is record diff --git a/dcache.vhdl b/dcache.vhdl index 7d61a85..3464c0d 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -25,7 +25,13 @@ entity dcache is -- 
Number of lines in a set NUM_LINES : positive := 32; -- Number of ways - NUM_WAYS : positive := 4 + NUM_WAYS : positive := 4; + -- L1 DTLB entries per set + TLB_SET_SIZE : positive := 64; + -- L1 DTLB number of sets + TLB_NUM_WAYS : positive := 2; + -- L1 DTLB log_2(page_size) + TLB_LG_PGSZ : positive := 12 ); port ( clk : in std_ulogic; @@ -56,6 +62,8 @@ architecture rtl of dcache is -- Bit fields counts in the address + -- REAL_ADDR_BITS is the number of real address bits that we store + constant REAL_ADDR_BITS : positive := 56; -- ROW_BITS is the number of bits to select a row constant ROW_BITS : natural := log2(BRAM_ROWS); -- ROW_LINEBITS is the number of bits to select a row within a line @@ -66,8 +74,10 @@ architecture rtl of dcache is constant ROW_OFF_BITS : natural := log2(ROW_SIZE); -- INDEX_BITS is the number if bits to select a cache line constant INDEX_BITS : natural := log2(NUM_LINES); + -- SET_SIZE_BITS is the log base 2 of the set size + constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; -- TAG_BITS is the number of bits of the tag part of the address - constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; + constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; -- WAY_BITS is the number of bits to select a way constant WAY_BITS : natural := log2(NUM_WAYS); @@ -80,7 +90,7 @@ architecture rtl of dcache is -- .. | |- --| ROW_OFF_BITS (3) -- .. |----- ---| | ROW_BITS (8) -- .. |-----| | INDEX_BITS (5) - -- .. --------| | TAG_BITS (53) + -- .. --------| | TAG_BITS (45) subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; @@ -110,6 +120,32 @@ architecture rtl of dcache is attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; + -- L1 TLB. 
+ constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE); + constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS); + constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_SET_BITS); + constant TLB_TAG_WAY_BITS : natural := TLB_NUM_WAYS * TLB_EA_TAG_BITS; + constant TLB_PTE_BITS : natural := 64; + constant TLB_PTE_WAY_BITS : natural := TLB_NUM_WAYS * TLB_PTE_BITS; + + subtype tlb_way_t is integer range 0 to TLB_NUM_WAYS - 1; + subtype tlb_index_t is integer range 0 to TLB_SET_SIZE - 1; + subtype tlb_way_valids_t is std_ulogic_vector(TLB_NUM_WAYS-1 downto 0); + type tlb_valids_t is array(tlb_index_t) of tlb_way_valids_t; + subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); + subtype tlb_way_tags_t is std_ulogic_vector(TLB_TAG_WAY_BITS-1 downto 0); + type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t; + subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); + subtype tlb_way_ptes_t is std_ulogic_vector(TLB_PTE_WAY_BITS-1 downto 0); + type tlb_ptes_t is array(tlb_index_t) of tlb_way_ptes_t; + type hit_way_set_t is array(tlb_way_t) of way_t; + + signal dtlb_valids : tlb_valids_t; + signal dtlb_tags : tlb_tags_t; + signal dtlb_ptes : tlb_ptes_t; + attribute ram_style of dtlb_tags : signal is "distributed"; + attribute ram_style of dtlb_ptes : signal is "distributed"; + signal r0 : Loadstore1ToDcacheType; -- Type of operation on a "valid" input @@ -168,6 +204,13 @@ architecture rtl of dcache is store_way : way_t; store_row : row_t; store_index : index_t; + + -- Signals to complete with error + error_done : std_ulogic; + tlb_miss : std_ulogic; + + -- completion signal for tlbie + tlbie_done : std_ulogic; end record; signal r1 : reg_stage_1_t; @@ -208,6 +251,21 @@ architecture rtl of dcache is -- Wishbone read/write/cache write formatting signals signal bus_sel : std_ulogic_vector(7 downto 0); + -- TLB signals + signal tlb_tag_way : tlb_way_tags_t; + signal tlb_pte_way : tlb_way_ptes_t; + signal tlb_valid_way : tlb_way_valids_t; + 
signal tlb_req_index : tlb_index_t; + signal tlb_hit : std_ulogic; + signal tlb_hit_way : tlb_way_t; + signal pte : tlb_pte_t; + signal ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + signal valid_ra : std_ulogic; + + -- TLB PLRU output interface + type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_victim : tlb_plru_out_t; + -- -- Helper functions to decode incoming requests -- @@ -215,13 +273,13 @@ architecture rtl of dcache is -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))); end; -- Return the cache row index (data memory) for an address function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; -- Returns whether this is the last row of a line @@ -269,9 +327,9 @@ architecture rtl of dcache is end; -- Get the tag value from the address - function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is + function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)) return cache_tag_t is begin - return addr(63 downto 64-TAG_BITS); + return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS); end; -- Read a tag from a tag memory row @@ -287,6 +345,38 @@ architecture rtl of dcache is tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; end; + -- Read a TLB tag from a TLB tag memory row + function read_tlb_tag(way: tlb_way_t; tags: tlb_way_tags_t) return tlb_tag_t is + variable j : integer; + begin + j := way * TLB_EA_TAG_BITS; + return tags(j + TLB_EA_TAG_BITS - 1 downto j); + end; + + -- Write a TLB tag to a TLB tag memory row + procedure write_tlb_tag(way: tlb_way_t; tags: inout 
tlb_way_tags_t; + tag: tlb_tag_t) is + variable j : integer; + begin + j := way * TLB_EA_TAG_BITS; + tags(j + TLB_EA_TAG_BITS - 1 downto j) := tag; + end; + + -- Read a PTE from a TLB PTE memory row + function read_tlb_pte(way: tlb_way_t; ptes: tlb_way_ptes_t) return tlb_pte_t is + variable j : integer; + begin + j := way * TLB_PTE_BITS; + return ptes(j + TLB_PTE_BITS - 1 downto j); + end; + + procedure write_tlb_pte(way: tlb_way_t; ptes: inout tlb_way_ptes_t; newpte: tlb_pte_t) is + variable j : integer; + begin + j := way * TLB_PTE_BITS; + ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; + end; + begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -297,13 +387,158 @@ begin report "geometry bits don't add up" severity FAILURE; assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS) report "geometry bits don't add up" severity FAILURE; assert (64 = wishbone_data_bits) report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; - + + -- Latch the request in r0 as long as we're not stalling + stage_0 : process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r0.valid <= '0'; + elsif stall_out = '0' then + r0 <= d_in; + end if; + end if; + end process; + + -- TLB + -- Operates in the second cycle on the request latched in r0. + -- TLB updates write the entry at the end of the second cycle. 
+ tlb_read : process(clk) + variable index : tlb_index_t; + begin + if rising_edge(clk) then + if stall_out = '1' then + -- keep reading the same thing while stalled + index := tlb_req_index; + else + index := to_integer(unsigned(d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 + downto TLB_LG_PGSZ))); + end if; + tlb_valid_way <= dtlb_valids(index); + tlb_tag_way <= dtlb_tags(index); + tlb_pte_way <= dtlb_ptes(index); + end if; + end process; + + -- Generate TLB PLRUs + maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate + begin + tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate + -- TLB PLRU interface + signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_acc_en : std_ulogic; + signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + begin + tlb_plru : entity work.plru + generic map ( + BITS => TLB_WAY_BITS + ) + port map ( + clk => clk, + rst => rst, + acc => tlb_plru_acc, + acc_en => tlb_plru_acc_en, + lru => tlb_plru_out + ); + + process(tlb_req_index, tlb_hit, tlb_hit_way, tlb_plru_out) + begin + -- PLRU interface + if tlb_hit = '1' and tlb_req_index = i then + tlb_plru_acc_en <= '1'; + else + tlb_plru_acc_en <= '0'; + end if; + tlb_plru_acc <= std_ulogic_vector(to_unsigned(tlb_hit_way, TLB_WAY_BITS)); + tlb_plru_victim(i) <= tlb_plru_out; + end process; + end generate; + end generate; + + tlb_search : process(all) + variable hitway : tlb_way_t; + variable hit : std_ulogic; + variable eatag : tlb_tag_t; + begin + tlb_req_index <= to_integer(unsigned(r0.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 + downto TLB_LG_PGSZ))); + hitway := 0; + hit := '0'; + eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + for i in tlb_way_t loop + if tlb_valid_way(i) = '1' and + read_tlb_tag(i, tlb_tag_way) = eatag then + hitway := i; + hit := '1'; + end if; + end loop; + tlb_hit <= hit and r0.valid; + tlb_hit_way <= hitway; + pte <= read_tlb_pte(hitway, tlb_pte_way); + valid_ra <= tlb_hit or not r0.virt_mode; + if r0.virt_mode = '1' then + ra <= 
pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + r0.addr(TLB_LG_PGSZ - 1 downto 0); + else + ra <= r0.addr(REAL_ADDR_BITS - 1 downto 0); + end if; + end process; + + tlb_update : process(clk) + variable tlbie : std_ulogic; + variable tlbia : std_ulogic; + variable tlbwe : std_ulogic; + variable repl_way : tlb_way_t; + variable eatag : tlb_tag_t; + variable tagset : tlb_way_tags_t; + variable pteset : tlb_way_ptes_t; + begin + if rising_edge(clk) then + tlbie := '0'; + tlbia := '0'; + tlbwe := '0'; + if r0.valid = '1' and stall_out = '0' and r0.tlbie = '1' then + if r0.addr(11 downto 10) /= "00" then + tlbia := '1'; + elsif r0.addr(9) = '1' then + tlbwe := '1'; + else + tlbie := '1'; + end if; + end if; + if rst = '1' or tlbia = '1' then + -- clear all valid bits at once + for i in tlb_index_t loop + dtlb_valids(i) <= (others => '0'); + end loop; + elsif tlbie = '1' then + if tlb_hit = '1' then + dtlb_valids(tlb_req_index)(tlb_hit_way) <= '0'; + end if; + elsif tlbwe = '1' then + if tlb_hit = '1' then + repl_way := tlb_hit_way; + else + repl_way := to_integer(unsigned(tlb_plru_victim(tlb_req_index))); + end if; + eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + tagset := tlb_tag_way; + write_tlb_tag(repl_way, tagset, eatag); + dtlb_tags(tlb_req_index) <= tagset; + pteset := tlb_pte_way; + write_tlb_pte(repl_way, pteset, r0.data); + dtlb_ptes(tlb_req_index) <= pteset; + dtlb_valids(tlb_req_index)(repl_way) <= '1'; + end if; + end if; + end process; + -- Generate PLRUs maybe_plrus: if NUM_WAYS > 1 generate begin @@ -341,53 +576,73 @@ begin end generate; end generate; - -- Latch the request in r0 as long as we're not stalling - stage_0 : process(clk) - begin - if rising_edge(clk) then - if rst = '1' then - r0.valid <= '0'; - elsif stall_out = '0' then - r0 <= d_in; - end if; - end if; - end process; - -- Cache request parsing and hit detection dcache_request : process(all) variable is_hit : std_ulogic; variable hit_way : way_t; variable op : op_t; - variable tmp 
: std_ulogic_vector(63 downto 0); - variable data : std_ulogic_vector(63 downto 0); - variable opsel : std_ulogic_vector(3 downto 0); + variable opsel : std_ulogic_vector(2 downto 0); variable go : std_ulogic; + variable s_hit : std_ulogic; + variable s_tag : cache_tag_t; + variable s_pte : tlb_pte_t; + variable s_ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable hit_way_set : hit_way_set_t; begin -- Extract line, row and tag from request req_index <= get_index(r0.addr); req_row <= get_row(r0.addr); - req_tag <= get_tag(r0.addr); + req_tag <= get_tag(ra); -- Only do anything if not being stalled by stage 1 - go := r0.valid and not stall_out; + go := r0.valid and not stall_out and not r0.tlbie; -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed -- - req_laddr <= r0.addr(63 downto LINE_OFF_BITS) & + req_laddr <= (63 downto REAL_ADDR_BITS => '0') & + ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way - hit_way := 0; - is_hit := '0'; - for i in way_t loop - if go = '1' and cache_valids(req_index)(i) = '1' then - if read_tag(i, cache_tags(req_index)) = req_tag then - hit_way := i; - is_hit := '1'; - end if; - end if; - end loop; + -- In order to make timing in virtual mode, when we are using the TLB, + -- we compare each way with each of the real addresses from each way of + -- the TLB, and then decide later which match to use. 
+ hit_way := 0; + is_hit := '0'; + if r0.virt_mode = '1' then + for j in tlb_way_t loop + hit_way_set(j) := 0; + s_hit := '0'; + s_pte := read_tlb_pte(j, tlb_pte_way); + s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + r0.addr(TLB_LG_PGSZ - 1 downto 0); + s_tag := get_tag(s_ra); + for i in way_t loop + if go = '1' and cache_valids(req_index)(i) = '1' and + read_tag(i, cache_tags(req_index)) = s_tag and + tlb_valid_way(j) = '1' then + hit_way_set(j) := i; + s_hit := '1'; + end if; + end loop; + hit_set(j) := s_hit; + end loop; + if tlb_hit = '1' then + is_hit := hit_set(tlb_hit_way); + hit_way := hit_way_set(tlb_hit_way); + end if; + else + s_tag := get_tag(r0.addr(REAL_ADDR_BITS - 1 downto 0)); + for i in way_t loop + if go = '1' and cache_valids(req_index)(i) = '1' and + read_tag(i, cache_tags(req_index)) = s_tag then + hit_way := i; + is_hit := '1'; + end if; + end loop; + end if; -- The way that matched on a hit req_hit_way <= hit_way; @@ -398,19 +653,25 @@ begin -- Combine the request and cache his status to decide what -- operation needs to be done -- - opsel := go & r0.load & r0.nc & is_hit; - case opsel is - when "1101" => op := OP_LOAD_HIT; - when "1100" => op := OP_LOAD_MISS; - when "1110" => op := OP_LOAD_NC; - when "1001" => op := OP_STORE_HIT; - when "1000" => op := OP_STORE_MISS; - when "1010" => op := OP_STORE_MISS; - when "1011" => op := OP_BAD; - when "1111" => op := OP_BAD; - when others => op := OP_NONE; - end case; - + op := OP_NONE; + if go = '1' then + if valid_ra = '1' then + opsel := r0.load & r0.nc & is_hit; + case opsel is + when "101" => op := OP_LOAD_HIT; + when "100" => op := OP_LOAD_MISS; + when "110" => op := OP_LOAD_NC; + when "001" => op := OP_STORE_HIT; + when "000" => op := OP_STORE_MISS; + when "010" => op := OP_STORE_MISS; + when "011" => op := OP_BAD; + when "111" => op := OP_BAD; + when others => op := OP_NONE; + end case; + else + op := OP_BAD; + end if; + end if; req_op <= op; -- Version of the row number that is 
valid one cycle earlier @@ -427,9 +688,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- TODO: Generate errors - -- err_nc_collision <= '1' when req_op = OP_BAD else '0'; - -- Generate stalls from stage 1 state machine stall_out <= '1' when r1.state /= IDLE else '0'; @@ -477,6 +735,8 @@ begin d_out.valid <= '0'; d_out.data <= cache_out(r1.hit_way); d_out.store_done <= '0'; + d_out.error <= '0'; + d_out.tlb_miss <= '0'; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -502,6 +762,20 @@ begin d_out.valid <= '1'; end if; + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing ld/st with error"; + d_out.error <= '1'; + d_out.tlb_miss <= r1.tlb_miss; + d_out.valid <= '1'; + end if; + + -- tlbie is handled above and doesn't go through the cache state machine + if r1.tlbie_done = '1' then + report "completing tlbie"; + d_out.valid <= '1'; + end if; + -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then -- If it's a load, enable register writeback and switch @@ -609,6 +883,7 @@ begin -- -- Cache hit synchronous machine for the easy case. This handles load hits. 
+ -- It also handles error cases (TLB miss, cache paradox) -- dcache_fast_hit : process(clk) begin @@ -636,6 +911,16 @@ begin else r1.hit_load_valid <= '0'; end if; + + if req_op = OP_BAD then + r1.error_done <= '1'; + r1.tlb_miss <= not valid_ra; + else + r1.error_done <= '0'; + end if; + + -- complete tlbies in the third cycle + r1.tlbie_done <= r0.valid and r0.tlbie and not stall_out; end if; end process; @@ -717,7 +1002,7 @@ begin when OP_LOAD_NC => r1.wb.sel <= r0.byte_sel; - r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; @@ -726,7 +1011,7 @@ begin when OP_STORE_HIT | OP_STORE_MISS => if r0.dcbz = '0' then r1.wb.sel <= r0.byte_sel; - r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.dat <= r0.data; if cancel_store = '0' then r1.wb.cyc <= '1'; @@ -774,6 +1059,7 @@ begin end if; -- OP_NONE and OP_BAD do nothing + -- OP_BAD was handled above already when OP_NONE => when OP_BAD => end case; diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index bd8341a..66b938f 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -68,6 +68,7 @@ begin -- Clear stuff d_in.valid <= '0'; d_in.load <= '0'; + d_in.tlbie <= '0'; d_in.nc <= '0'; d_in.addr <= (others => '0'); d_in.data <= (others => '0'); diff --git a/decode1.vhdl b/decode1.vhdl index 70099d4..fd799fe 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -323,6 +323,7 @@ architecture behaviour of decode1 is 2#1001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sync 2#0001000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- td 2#0000000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- tw + 2#0100110010# => (LDST, 
OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbie 2#0100111100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- xor others => illegal_inst ); diff --git a/decode_types.vhdl b/decode_types.vhdl index 07c486a..ef51bd0 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -16,7 +16,7 @@ package decode_types is OP_POPCNT, OP_PRTY, OP_RFID, OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB, OP_SHL, OP_SHR, - OP_SYNC, OP_TRAP, + OP_SYNC, OP_TLBIE, OP_TRAP, OP_XOR ); type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR); diff --git a/execute1.vhdl b/execute1.vhdl index 490723e..98b95dc 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -88,6 +88,7 @@ architecture behaviour of execute1 is OP_MFMSR => SUPER, OP_MTMSRD => SUPER, OP_RFID => SUPER, + OP_TLBIE => SUPER, others => USER ); @@ -988,6 +989,7 @@ begin e_in.insn(5 downto 1) = "10101" then lv.ci := '1'; end if; + lv.virt_mode := ctrl.msr(MSR_DR); -- Update registers rin <= v; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 7ddbbc0..d5a59e8 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -43,6 +43,7 @@ architecture behave of loadstore1 is type reg_stage_t is record -- latch most of the input request load : std_ulogic; + tlbie : std_ulogic; dcbz : std_ulogic; addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); @@ -57,6 +58,7 @@ architecture behave of loadstore1 is reserve : std_ulogic; rc : std_ulogic; nc : std_ulogic; -- non-cacheable access + virt_mode : std_ulogic; state : state_t; second_bytes : std_ulogic_vector(7 downto 0); dar : std_ulogic_vector(63 downto 0); @@ -207,6 +209,7 @@ begin if l_in.valid = '1' then v.load := '0'; v.dcbz := '0'; + v.tlbie := '0'; case l_in.op is when OP_STORE => req := '1'; @@ -216,6 +219,9 @@ begin when OP_DCBZ => req := '1'; v.dcbz := '1'; + when OP_TLBIE => + req := '1'; + v.tlbie := '1'; when OP_MFSPR => 
done := '1'; mfspr := '1'; @@ -250,14 +256,15 @@ begin v.reserve := l_in.reserve; v.rc := l_in.rc; v.nc := l_in.ci; + v.virt_mode := l_in.virt_mode; -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- + -- is the form 0xc------- for a real-mode access. -- -- This will have to be replaced by a combination of implementing the -- proper HV CI load/store instructions and having an MMU to get the I -- bit otherwise. - if lsu_sum(31 downto 28) = "1100" then + if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then v.nc := '1'; end if; @@ -269,10 +276,13 @@ begin v.addr := lsu_sum; -- Do byte reversing and rotating for stores in the first cycle - byte_offset := unsigned(lsu_sum(2 downto 0)); + byte_offset := "000"; brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + if v.tlbie = '0' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; end if; for i in 0 to 7 loop k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; @@ -332,12 +342,14 @@ begin -- Update outputs to dcache d_out.valid <= req; d_out.load <= v.load; + d_out.tlbie <= v.tlbie; d_out.dcbz <= v.dcbz; d_out.nc <= v.nc; d_out.reserve <= v.reserve; d_out.addr <= addr; d_out.data <= v.store_data; d_out.byte_sel <= byte_sel; + d_out.virt_mode <= v.virt_mode; -- Update outputs to writeback -- Multiplex either cache data to the destination GPR or