diff --git a/common.vhdl b/common.vhdl index d3d30e7..6741044 100644 --- a/common.vhdl +++ b/common.vhdl @@ -216,7 +216,7 @@ package common is type Execute1ToLoadstore1Type is record valid : std_ulogic; - op : insn_type_t; -- what ld/st or m[tf]spr to do + op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read @@ -231,18 +231,21 @@ package common is reserve : std_ulogic; -- set for larx/stcx. rc : std_ulogic; -- set for stcx. spr_num : spr_num_t; -- SPR number for mfspr/mtspr + virt_mode : std_ulogic; -- do translation through TLB end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, - reserve => '0', rc => '0', + reserve => '0', rc => '0', virt_mode => '0', spr_num => 0, others => (others => '0')); type Loadstore1ToDcacheType is record valid : std_ulogic; - load : std_ulogic; + load : std_ulogic; -- is this a load + tlbie : std_ulogic; -- is this a tlbie dcbz : std_ulogic; nc : std_ulogic; reserve : std_ulogic; + virt_mode : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); @@ -253,6 +256,7 @@ package common is data : std_ulogic_vector(63 downto 0); store_done : std_ulogic; error : std_ulogic; + tlb_miss : std_ulogic; end record; type Loadstore1ToWritebackType is record diff --git a/dcache.vhdl b/dcache.vhdl index 7d61a85..3464c0d 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -25,7 +25,13 @@ entity dcache is -- Number of lines in a set NUM_LINES : positive := 32; -- Number of ways - NUM_WAYS : positive := 4 + NUM_WAYS : positive := 4; + -- L1 DTLB entries per set + TLB_SET_SIZE : positive := 64; + -- L1 DTLB number of sets + TLB_NUM_WAYS : positive := 2; + -- L1 DTLB log_2(page_size) + TLB_LG_PGSZ : positive := 12 ); port ( clk : in std_ulogic; @@ -56,6 +62,8 @@ architecture rtl of dcache is -- Bit fields counts in the address + -- REAL_ADDR_BITS is the number of real address bits that we store + constant REAL_ADDR_BITS : positive := 56; -- ROW_BITS is the number of bits to select a row constant ROW_BITS : natural := log2(BRAM_ROWS); -- ROW_LINEBITS is the number of bits to select a row within a line @@ -66,8 +74,10 @@ architecture rtl of dcache is constant ROW_OFF_BITS : natural := log2(ROW_SIZE); -- INDEX_BITS is the number if bits to select a cache line constant INDEX_BITS : natural := log2(NUM_LINES); + -- SET_SIZE_BITS is the log base 2 of the set size + constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; -- TAG_BITS is the number of bits of the tag part of the address - constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; + constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; -- WAY_BITS is the number of bits to select a way constant WAY_BITS : natural := log2(NUM_WAYS); @@ -80,7 +90,7 @@ architecture rtl of dcache is -- .. | |- --| ROW_OFF_BITS (3) -- .. |----- ---| | ROW_BITS (8) -- .. |-----| | INDEX_BITS (5) - -- .. --------| | TAG_BITS (53) + -- .. --------| | TAG_BITS (45) subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; @@ -110,6 +120,32 @@ architecture rtl of dcache is attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; + -- L1 TLB. + constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE); + constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS); + constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_SET_BITS); + constant TLB_TAG_WAY_BITS : natural := TLB_NUM_WAYS * TLB_EA_TAG_BITS; + constant TLB_PTE_BITS : natural := 64; + constant TLB_PTE_WAY_BITS : natural := TLB_NUM_WAYS * TLB_PTE_BITS; + + subtype tlb_way_t is integer range 0 to TLB_NUM_WAYS - 1; + subtype tlb_index_t is integer range 0 to TLB_SET_SIZE - 1; + subtype tlb_way_valids_t is std_ulogic_vector(TLB_NUM_WAYS-1 downto 0); + type tlb_valids_t is array(tlb_index_t) of tlb_way_valids_t; + subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); + subtype tlb_way_tags_t is std_ulogic_vector(TLB_TAG_WAY_BITS-1 downto 0); + type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t; + subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); + subtype tlb_way_ptes_t is std_ulogic_vector(TLB_PTE_WAY_BITS-1 downto 0); + type tlb_ptes_t is array(tlb_index_t) of tlb_way_ptes_t; + type hit_way_set_t is array(tlb_way_t) of way_t; + + signal dtlb_valids : tlb_valids_t; + signal dtlb_tags : tlb_tags_t; + signal dtlb_ptes : tlb_ptes_t; + attribute ram_style of dtlb_tags : signal is "distributed"; + attribute ram_style of dtlb_ptes : signal is "distributed"; + signal r0 : Loadstore1ToDcacheType; -- Type of operation on a "valid" input @@ -168,6 +204,13 @@ architecture rtl of dcache is store_way : way_t; store_row : row_t; store_index : index_t; + + -- Signals to complete with error + error_done : std_ulogic; + tlb_miss : std_ulogic; + + -- completion signal for tlbie + tlbie_done : std_ulogic; end record; signal r1 : reg_stage_1_t; @@ -208,6 +251,21 @@ architecture rtl of dcache is -- Wishbone read/write/cache write formatting signals signal bus_sel : std_ulogic_vector(7 downto 0); + -- TLB signals + signal tlb_tag_way : tlb_way_tags_t; + signal tlb_pte_way : tlb_way_ptes_t; + signal tlb_valid_way : tlb_way_valids_t; + signal tlb_req_index : tlb_index_t; + signal tlb_hit : std_ulogic; + signal tlb_hit_way : tlb_way_t; + signal pte : tlb_pte_t; + signal ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + signal valid_ra : std_ulogic; + + -- TLB PLRU output interface + type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_victim : tlb_plru_out_t; + -- -- Helper functions to decode incoming requests -- @@ -215,13 +273,13 @@ architecture rtl of dcache is -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))); end; -- Return the cache row index (data memory) for an address function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; -- Returns whether this is the last row of a line @@ -269,9 +327,9 @@ architecture rtl of dcache is end; -- Get the tag value from the address - function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is + function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)) return cache_tag_t is begin - return addr(63 downto 64-TAG_BITS); + return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS); end; -- Read a tag from a tag memory row @@ -287,6 +345,38 @@ architecture rtl of dcache is tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; end; + -- Read a TLB tag from a TLB tag memory row + function read_tlb_tag(way: tlb_way_t; tags: tlb_way_tags_t) return tlb_tag_t is + variable j : integer; + begin + j := way * TLB_EA_TAG_BITS; + return tags(j + TLB_EA_TAG_BITS - 1 downto j); + end; + + -- Write a TLB tag to a TLB tag memory row + procedure write_tlb_tag(way: tlb_way_t; tags: inout tlb_way_tags_t; + tag: tlb_tag_t) is + variable j : integer; + begin + j := way * TLB_EA_TAG_BITS; + tags(j + TLB_EA_TAG_BITS - 1 downto j) := tag; + end; + + -- Read a PTE from a TLB PTE memory row + function read_tlb_pte(way: tlb_way_t; ptes: tlb_way_ptes_t) return tlb_pte_t is + variable j : integer; + begin + j := way * TLB_PTE_BITS; + return ptes(j + TLB_PTE_BITS - 1 downto j); + end; + + procedure write_tlb_pte(way: tlb_way_t; ptes: inout tlb_way_ptes_t; newpte: tlb_pte_t) is + variable j : integer; + begin + j := way * TLB_PTE_BITS; + ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; + end; + begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -297,13 +387,158 @@ begin report "geometry bits don't add up" severity FAILURE; assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS) report "geometry bits don't add up" severity FAILURE; assert (64 = wishbone_data_bits) report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; - + + -- Latch the request in r0 as long as we're not stalling + stage_0 : process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r0.valid <= '0'; + elsif stall_out = '0' then + r0 <= d_in; + end if; + end if; + end process; + + -- TLB + -- Operates in the second cycle on the request latched in r0. + -- TLB updates write the entry at the end of the second cycle. + tlb_read : process(clk) + variable index : tlb_index_t; + begin + if rising_edge(clk) then + if stall_out = '1' then + -- keep reading the same thing while stalled + index := tlb_req_index; + else + index := to_integer(unsigned(d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 + downto TLB_LG_PGSZ))); + end if; + tlb_valid_way <= dtlb_valids(index); + tlb_tag_way <= dtlb_tags(index); + tlb_pte_way <= dtlb_ptes(index); + end if; + end process; + + -- Generate TLB PLRUs + maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate + begin + tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate + -- TLB PLRU interface + signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_acc_en : std_ulogic; + signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + begin + tlb_plru : entity work.plru + generic map ( + BITS => TLB_WAY_BITS + ) + port map ( + clk => clk, + rst => rst, + acc => tlb_plru_acc, + acc_en => tlb_plru_acc_en, + lru => tlb_plru_out + ); + + process(tlb_req_index, tlb_hit, tlb_hit_way, tlb_plru_out) + begin + -- PLRU interface + if tlb_hit = '1' and tlb_req_index = i then + tlb_plru_acc_en <= '1'; + else + tlb_plru_acc_en <= '0'; + end if; + tlb_plru_acc <= std_ulogic_vector(to_unsigned(tlb_hit_way, TLB_WAY_BITS)); + tlb_plru_victim(i) <= tlb_plru_out; + end process; + end generate; + end generate; + + tlb_search : process(all) + variable hitway : tlb_way_t; + variable hit : std_ulogic; + variable eatag : tlb_tag_t; + begin + tlb_req_index <= to_integer(unsigned(r0.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 + downto TLB_LG_PGSZ))); + hitway := 0; + hit := '0'; + eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + for i in tlb_way_t loop + if tlb_valid_way(i) = '1' and + read_tlb_tag(i, tlb_tag_way) = eatag then + hitway := i; + hit := '1'; + end if; + end loop; + tlb_hit <= hit and r0.valid; + tlb_hit_way <= hitway; + pte <= read_tlb_pte(hitway, tlb_pte_way); + valid_ra <= tlb_hit or not r0.virt_mode; + if r0.virt_mode = '1' then + ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + r0.addr(TLB_LG_PGSZ - 1 downto 0); + else + ra <= r0.addr(REAL_ADDR_BITS - 1 downto 0); + end if; + end process; + + tlb_update : process(clk) + variable tlbie : std_ulogic; + variable tlbia : std_ulogic; + variable tlbwe : std_ulogic; + variable repl_way : tlb_way_t; + variable eatag : tlb_tag_t; + variable tagset : tlb_way_tags_t; + variable pteset : tlb_way_ptes_t; + begin + if rising_edge(clk) then + tlbie := '0'; + tlbia := '0'; + tlbwe := '0'; + if r0.valid = '1' and stall_out = '0' and r0.tlbie = '1' then + if r0.addr(11 downto 10) /= "00" then + tlbia := '1'; + elsif r0.addr(9) = '1' then + tlbwe := '1'; + else + tlbie := '1'; + end if; + end if; + if rst = '1' or tlbia = '1' then + -- clear all valid bits at once + for i in tlb_index_t loop + dtlb_valids(i) <= (others => '0'); + end loop; + elsif tlbie = '1' then + if tlb_hit = '1' then + dtlb_valids(tlb_req_index)(tlb_hit_way) <= '0'; + end if; + elsif tlbwe = '1' then + if tlb_hit = '1' then + repl_way := tlb_hit_way; + else + repl_way := to_integer(unsigned(tlb_plru_victim(tlb_req_index))); + end if; + eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + tagset := tlb_tag_way; + write_tlb_tag(repl_way, tagset, eatag); + dtlb_tags(tlb_req_index) <= tagset; + pteset := tlb_pte_way; + write_tlb_pte(repl_way, pteset, r0.data); + dtlb_ptes(tlb_req_index) <= pteset; + dtlb_valids(tlb_req_index)(repl_way) <= '1'; + end if; + end if; + end process; + -- Generate PLRUs maybe_plrus: if NUM_WAYS > 1 generate begin @@ -341,53 +576,73 @@ begin end generate; end generate; - -- Latch the request in r0 as long as we're not stalling - stage_0 : process(clk) - begin - if rising_edge(clk) then - if rst = '1' then - r0.valid <= '0'; - elsif stall_out = '0' then - r0 <= d_in; - end if; - end if; - end process; - -- Cache request parsing and hit detection dcache_request : process(all) variable is_hit : std_ulogic; variable hit_way : way_t; variable op : op_t; - variable tmp : std_ulogic_vector(63 downto 0); - variable data : std_ulogic_vector(63 downto 0); - variable opsel : std_ulogic_vector(3 downto 0); + variable opsel : std_ulogic_vector(2 downto 0); variable go : std_ulogic; + variable s_hit : std_ulogic; + variable s_tag : cache_tag_t; + variable s_pte : tlb_pte_t; + variable s_ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable hit_way_set : hit_way_set_t; begin -- Extract line, row and tag from request req_index <= get_index(r0.addr); req_row <= get_row(r0.addr); - req_tag <= get_tag(r0.addr); + req_tag <= get_tag(ra); -- Only do anything if not being stalled by stage 1 - go := r0.valid and not stall_out; + go := r0.valid and not stall_out and not r0.tlbie; -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed -- - req_laddr <= r0.addr(63 downto LINE_OFF_BITS) & + req_laddr <= (63 downto REAL_ADDR_BITS => '0') & + ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way - hit_way := 0; - is_hit := '0'; - for i in way_t loop - if go = '1' and cache_valids(req_index)(i) = '1' then - if read_tag(i, cache_tags(req_index)) = req_tag then - hit_way := i; - is_hit := '1'; - end if; - end if; - end loop; + -- In order to make timing in virtual mode, when we are using the TLB, + -- we compare each way with each of the real addresses from each way of + -- the TLB, and then decide later which match to use. + hit_way := 0; + is_hit := '0'; + if r0.virt_mode = '1' then + for j in tlb_way_t loop + hit_way_set(j) := 0; + s_hit := '0'; + s_pte := read_tlb_pte(j, tlb_pte_way); + s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + r0.addr(TLB_LG_PGSZ - 1 downto 0); + s_tag := get_tag(s_ra); + for i in way_t loop + if go = '1' and cache_valids(req_index)(i) = '1' and + read_tag(i, cache_tags(req_index)) = s_tag and + tlb_valid_way(j) = '1' then + hit_way_set(j) := i; + s_hit := '1'; + end if; + end loop; + hit_set(j) := s_hit; + end loop; + if tlb_hit = '1' then + is_hit := hit_set(tlb_hit_way); + hit_way := hit_way_set(tlb_hit_way); + end if; + else + s_tag := get_tag(r0.addr(REAL_ADDR_BITS - 1 downto 0)); + for i in way_t loop + if go = '1' and cache_valids(req_index)(i) = '1' and + read_tag(i, cache_tags(req_index)) = s_tag then + hit_way := i; + is_hit := '1'; + end if; + end loop; + end if; -- The way that matched on a hit req_hit_way <= hit_way; @@ -398,19 +653,25 @@ begin -- Combine the request and cache his status to decide what -- operation needs to be done -- - opsel := go & r0.load & r0.nc & is_hit; - case opsel is - when "1101" => op := OP_LOAD_HIT; - when "1100" => op := OP_LOAD_MISS; - when "1110" => op := OP_LOAD_NC; - when "1001" => op := OP_STORE_HIT; - when "1000" => op := OP_STORE_MISS; - when "1010" => op := OP_STORE_MISS; - when "1011" => op := OP_BAD; - when "1111" => op := OP_BAD; - when others => op := OP_NONE; - end case; - + op := OP_NONE; + if go = '1' then + if valid_ra = '1' then + opsel := r0.load & r0.nc & is_hit; + case opsel is + when "101" => op := OP_LOAD_HIT; + when "100" => op := OP_LOAD_MISS; + when "110" => op := OP_LOAD_NC; + when "001" => op := OP_STORE_HIT; + when "000" => op := OP_STORE_MISS; + when "010" => op := OP_STORE_MISS; + when "011" => op := OP_BAD; + when "111" => op := OP_BAD; + when others => op := OP_NONE; + end case; + else + op := OP_BAD; + end if; + end if; req_op <= op; -- Version of the row number that is valid one cycle earlier @@ -427,9 +688,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- TODO: Generate errors - -- err_nc_collision <= '1' when req_op = OP_BAD else '0'; - -- Generate stalls from stage 1 state machine stall_out <= '1' when r1.state /= IDLE else '0'; @@ -477,6 +735,8 @@ begin d_out.valid <= '0'; d_out.data <= cache_out(r1.hit_way); d_out.store_done <= '0'; + d_out.error <= '0'; + d_out.tlb_miss <= '0'; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -502,6 +762,20 @@ begin d_out.valid <= '1'; end if; + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing ld/st with error"; + d_out.error <= '1'; + d_out.tlb_miss <= r1.tlb_miss; + d_out.valid <= '1'; + end if; + + -- tlbie is handled above and doesn't go through the cache state machine + if r1.tlbie_done = '1' then + report "completing tlbie"; + d_out.valid <= '1'; + end if; + -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then -- If it's a load, enable register writeback and switch @@ -609,6 +883,7 @@ begin -- -- Cache hit synchronous machine for the easy case. This handles load hits. + -- It also handles error cases (TLB miss, cache paradox) -- dcache_fast_hit : process(clk) begin @@ -636,6 +911,16 @@ begin else r1.hit_load_valid <= '0'; end if; + + if req_op = OP_BAD then + r1.error_done <= '1'; + r1.tlb_miss <= not valid_ra; + else + r1.error_done <= '0'; + end if; + + -- complete tlbies in the third cycle + r1.tlbie_done <= r0.valid and r0.tlbie and not stall_out; end if; end process; @@ -717,7 +1002,7 @@ begin when OP_LOAD_NC => r1.wb.sel <= r0.byte_sel; - r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; @@ -726,7 +1011,7 @@ begin when OP_STORE_HIT | OP_STORE_MISS => if r0.dcbz = '0' then r1.wb.sel <= r0.byte_sel; - r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.dat <= r0.data; if cancel_store = '0' then r1.wb.cyc <= '1'; @@ -774,6 +1059,7 @@ begin end if; -- OP_NONE and OP_BAD do nothing + -- OP_BAD was handled above already when OP_NONE => when OP_BAD => end case; diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index bd8341a..66b938f 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -68,6 +68,7 @@ begin -- Clear stuff d_in.valid <= '0'; d_in.load <= '0'; + d_in.tlbie <= '0'; d_in.nc <= '0'; d_in.addr <= (others => '0'); d_in.data <= (others => '0'); diff --git a/decode1.vhdl b/decode1.vhdl index 70099d4..fd799fe 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -323,6 +323,7 @@ architecture behaviour of decode1 is 2#1001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sync 2#0001000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- td 2#0000000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- tw + 2#0100110010# => (LDST, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbie 2#0100111100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- xor others => illegal_inst ); diff --git a/decode_types.vhdl b/decode_types.vhdl index 07c486a..ef51bd0 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -16,7 +16,7 @@ package decode_types is OP_POPCNT, OP_PRTY, OP_RFID, OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB, OP_SHL, OP_SHR, - OP_SYNC, OP_TRAP, + OP_SYNC, OP_TLBIE, OP_TRAP, OP_XOR ); type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR); diff --git a/execute1.vhdl b/execute1.vhdl index 490723e..98b95dc 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -88,6 +88,7 @@ architecture behaviour of execute1 is OP_MFMSR => SUPER, OP_MTMSRD => SUPER, OP_RFID => SUPER, + OP_TLBIE => SUPER, others => USER ); @@ -988,6 +989,7 @@ begin e_in.insn(5 downto 1) = "10101" then lv.ci := '1'; end if; + lv.virt_mode := ctrl.msr(MSR_DR); -- Update registers rin <= v; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 7ddbbc0..d5a59e8 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -43,6 +43,7 @@ architecture behave of loadstore1 is type reg_stage_t is record -- latch most of the input request load : std_ulogic; + tlbie : std_ulogic; dcbz : std_ulogic; addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); @@ -57,6 +58,7 @@ architecture behave of loadstore1 is reserve : std_ulogic; rc : std_ulogic; nc : std_ulogic; -- non-cacheable access + virt_mode : std_ulogic; state : state_t; second_bytes : std_ulogic_vector(7 downto 0); dar : std_ulogic_vector(63 downto 0); @@ -207,6 +209,7 @@ begin if l_in.valid = '1' then v.load := '0'; v.dcbz := '0'; + v.tlbie := '0'; case l_in.op is when OP_STORE => req := '1'; @@ -216,6 +219,9 @@ begin when OP_DCBZ => req := '1'; v.dcbz := '1'; + when OP_TLBIE => + req := '1'; + v.tlbie := '1'; when OP_MFSPR => done := '1'; mfspr := '1'; @@ -250,14 +256,15 @@ begin v.reserve := l_in.reserve; v.rc := l_in.rc; v.nc := l_in.ci; + v.virt_mode := l_in.virt_mode; -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- + -- is the form 0xc------- for a real-mode access. -- -- This will have to be replaced by a combination of implementing the -- proper HV CI load/store instructions and having an MMU to get the I -- bit otherwise. - if lsu_sum(31 downto 28) = "1100" then + if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then v.nc := '1'; end if; @@ -269,10 +276,13 @@ begin v.addr := lsu_sum; -- Do byte reversing and rotating for stores in the first cycle - byte_offset := unsigned(lsu_sum(2 downto 0)); + byte_offset := "000"; brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + if v.tlbie = '0' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; end if; for i in 0 to 7 loop k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; @@ -332,12 +342,14 @@ begin -- Update outputs to dcache d_out.valid <= req; d_out.load <= v.load; + d_out.tlbie <= v.tlbie; d_out.dcbz <= v.dcbz; d_out.nc <= v.nc; d_out.reserve <= v.reserve; d_out.addr <= addr; d_out.data <= v.store_data; d_out.byte_sel <= byte_sel; + d_out.virt_mode <= v.virt_mode; -- Update outputs to writeback -- Multiplex either cache data to the destination GPR or