diff --git a/Makefile b/Makefile index b9ad461..5b3b7dd 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ common.o: decode_types.o control.o: gpr_hazard.o cr_hazard.o common.o sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o -core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o +core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o mmu.o dcache.o writeback.o core_debug.o core_debug.o: common.o countzero.o: countzero_tb.o: common.o glibc_random.o countzero.o @@ -58,10 +58,11 @@ icache_tb.o: common.o wishbone_types.o icache.o wishbone_bram_wrapper.o dcache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o dcache_tb.o: common.o wishbone_types.o dcache.o wishbone_bram_wrapper.o insn_helpers.o: -loadstore1.o: common.o helpers.o decode_types.o +loadstore1.o: common.o decode_types.o logical.o: decode_types.o multiply_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o multiply.o multiply.o: common.o decode_types.o +mmu.o: common.o divider_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o divider.o divider.o: common.o decode_types.o ppc_fx_insns.o: helpers.o diff --git a/common.vhdl b/common.vhdl index ed97e0c..aaf176d 100644 --- a/common.vhdl +++ b/common.vhdl @@ -24,6 +24,8 @@ package common is constant SPR_XER : spr_num_t := 1; constant SPR_LR : spr_num_t := 8; constant SPR_CTR : spr_num_t := 9; + constant SPR_DSISR : spr_num_t := 18; + constant SPR_DAR : spr_num_t := 19; constant SPR_TB : spr_num_t := 268; constant SPR_DEC : spr_num_t := 22; constant SPR_SRR0 : spr_num_t := 26; @@ -37,6 +39,8 @@ package common is constant SPR_SPRG3U : spr_num_t := 259; constant SPR_HSPRG0 : spr_num_t := 304; constant SPR_HSPRG1 : spr_num_t := 305; + constant SPR_PID : spr_num_t := 48; + constant SPR_PRTBL : spr_num_t := 720; -- GPR indices in the register file (GPR only) subtype gpr_index_t is std_ulogic_vector(4 downto 0); @@ -86,6 +90,8 @@ package common is type Fetch1ToIcacheType is record req: std_ulogic; + virt_mode : std_ulogic; + priv_mode : std_ulogic; stop_mark: std_ulogic; nia: std_ulogic_vector(63 downto 0); end record; @@ -93,6 +99,7 @@ package common is type IcacheToFetch2Type is record valid: std_ulogic; stop_mark: std_ulogic; + fetch_failed: std_ulogic; nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); end record; @@ -100,10 +107,12 @@ package common is type Fetch2ToDecode1Type is record valid: std_ulogic; stop_mark : std_ulogic; + fetch_failed: std_ulogic; nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); end record; - constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', others => (others => '0')); + constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', fetch_failed => '0', + others => (others => '0')); type Decode1ToDecode2Type is record valid: std_ulogic; @@ -208,13 +217,18 @@ package common is type Execute1ToFetch1Type is record redirect: std_ulogic; + virt_mode: std_ulogic; + priv_mode: std_ulogic; redirect_nia: std_ulogic_vector(63 downto 0); end record; - constant Execute1ToFetch1TypeInit : Execute1ToFetch1Type := (redirect => '0', others => (others => '0')); + constant Execute1ToFetch1TypeInit : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0', + priv_mode => '0', others => (others => '0')); type Execute1ToLoadstore1Type is record valid : std_ulogic; - op : insn_type_t; -- what ld/st op to do + op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do + nia : std_ulogic_vector(63 downto 0); + insn : std_ulogic_vector(31 downto 0); addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read @@ -228,17 +242,32 @@ package common is xerc : xer_common_t; reserve : std_ulogic; -- set for larx/stcx. rc : std_ulogic; -- set for stcx. + virt_mode : std_ulogic; -- do translation through TLB + priv_mode : std_ulogic; -- privileged mode (MSR[PR] = 0) end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, - reserve => '0', rc => '0', others => (others => '0')); + reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0', + others => (others => '0')); + + type Loadstore1ToExecute1Type is record + exception : std_ulogic; + invalid : std_ulogic; + perm_error : std_ulogic; + rc_error : std_ulogic; + badtree : std_ulogic; + segment_fault : std_ulogic; + instr_fault : std_ulogic; + end record; type Loadstore1ToDcacheType is record valid : std_ulogic; - load : std_ulogic; + load : std_ulogic; -- is this a load dcbz : std_ulogic; nc : std_ulogic; reserve : std_ulogic; + virt_mode : std_ulogic; + priv_mode : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); @@ -249,6 +278,54 @@ package common is data : std_ulogic_vector(63 downto 0); store_done : std_ulogic; error : std_ulogic; + cache_paradox : std_ulogic; + end record; + + type Loadstore1ToMmuType is record + valid : std_ulogic; + tlbie : std_ulogic; + slbia : std_ulogic; + mtspr : std_ulogic; + iside : std_ulogic; + load : std_ulogic; + priv : std_ulogic; + sprn : std_ulogic_vector(9 downto 0); + addr : std_ulogic_vector(63 downto 0); + rs : std_ulogic_vector(63 downto 0); + end record; + + type MmuToLoadstore1Type is record + done : std_ulogic; + invalid : std_ulogic; + badtree : std_ulogic; + segerr : std_ulogic; + perm_error : std_ulogic; + rc_error : std_ulogic; + sprval : std_ulogic_vector(63 downto 0); + end record; + + type MmuToDcacheType is record + valid : std_ulogic; + tlbie : std_ulogic; + doall : std_ulogic; + tlbld : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + pte : std_ulogic_vector(63 downto 0); + end record; + + type DcacheToMmuType is record + stall : std_ulogic; + done : std_ulogic; + err : std_ulogic; + data : std_ulogic_vector(63 downto 0); + end record; + + type MmuToIcacheType is record + tlbld : std_ulogic; + tlbie : std_ulogic; + doall : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + pte : std_ulogic_vector(63 downto 0); end record; type Loadstore1ToWritebackType is record diff --git a/core.vhdl b/core.vhdl index 9895dc8..b0ccb7b 100644 --- a/core.vhdl +++ b/core.vhdl @@ -47,6 +47,7 @@ architecture behave of core is -- icache signals signal fetch1_to_icache : Fetch1ToIcacheType; signal icache_to_fetch2 : IcacheToFetch2Type; + signal mmu_to_icache : MmuToIcacheType; -- decode signals signal decode1_to_decode2: Decode1ToDecode2Type; @@ -68,11 +69,16 @@ architecture behave of core is -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; + signal loadstore1_to_execute1: Loadstore1ToExecute1Type; signal loadstore1_to_writeback: Loadstore1ToWritebackType; + signal loadstore1_to_mmu: Loadstore1ToMmuType; + signal mmu_to_loadstore1: MmuToLoadstore1Type; -- dcache signals signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal dcache_to_loadstore1: DcacheToLoadstore1Type; + signal mmu_to_dcache: MmuToDcacheType; + signal dcache_to_mmu: DcacheToMmuType; -- local signals signal fetch1_stall_in : std_ulogic; @@ -100,6 +106,13 @@ architecture behave of core is signal dbg_core_rst: std_ulogic; signal dbg_icache_rst: std_ulogic; + signal dbg_gpr_req : std_ulogic; + signal dbg_gpr_ack : std_ulogic; + signal dbg_gpr_addr : gspr_index_t; + signal dbg_gpr_data : std_ulogic_vector(63 downto 0); + + signal msr : std_ulogic_vector(63 downto 0); + -- Debug status signal dbg_core_is_stopped: std_ulogic; @@ -121,6 +134,7 @@ architecture behave of core is attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of mmu_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of debug_0 : label is keep_h(DISABLE_FLATTEN); @@ -158,6 +172,7 @@ begin rst => icache_rst, i_in => fetch1_to_icache, i_out => icache_to_fetch2, + m_in => mmu_to_icache, flush_in => flush, stall_out => icache_stall_out, wishbone_out => wishbone_insn_out, @@ -220,6 +235,10 @@ begin d_in => decode2_to_register_file, d_out => register_file_to_decode2, w_in => writeback_to_register_file, + dbg_gpr_req => dbg_gpr_req, + dbg_gpr_ack => dbg_gpr_ack, + dbg_gpr_addr => dbg_gpr_addr, + dbg_gpr_data => dbg_gpr_data, sim_dump => terminate, sim_dump_done => sim_cr_dump ); @@ -247,10 +266,12 @@ begin stall_out => ex1_stall_out, e_in => decode2_to_execute1, i_in => xics_in, + l_in => loadstore1_to_execute1, l_out => execute1_to_loadstore1, f_out => execute1_to_fetch1, e_out => execute1_to_writeback, icache_inval => ex1_icache_inval, + dbg_msr_out => msr, terminate_out => terminate ); @@ -259,13 +280,27 @@ begin clk => clk, rst => core_rst, l_in => execute1_to_loadstore1, + e_out => loadstore1_to_execute1, l_out => loadstore1_to_writeback, d_out => loadstore1_to_dcache, d_in => dcache_to_loadstore1, + m_out => loadstore1_to_mmu, + m_in => mmu_to_loadstore1, dc_stall => dcache_stall_out, stall_out => ls1_stall_out ); + mmu_0: entity work.mmu + port map ( + clk => clk, + rst => core_rst, + l_in => loadstore1_to_mmu, + l_out => mmu_to_loadstore1, + d_out => mmu_to_dcache, + d_in => dcache_to_mmu, + i_out => mmu_to_icache + ); + dcache_0: entity work.dcache generic map( LINE_SIZE => 64, @@ -277,6 +312,8 @@ begin rst => core_rst, d_in => loadstore1_to_dcache, d_out => dcache_to_loadstore1, + m_in => mmu_to_dcache, + m_out => dcache_to_mmu, stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out @@ -308,6 +345,11 @@ begin terminate => terminate, core_stopped => dbg_core_is_stopped, nia => fetch1_to_icache.nia, + msr => msr, + dbg_gpr_req => dbg_gpr_req, + dbg_gpr_ack => dbg_gpr_ack, + dbg_gpr_addr => dbg_gpr_addr, + dbg_gpr_data => dbg_gpr_data, terminated_out => terminated_out ); diff --git a/core_debug.vhdl b/core_debug.vhdl index ae4414e..c97213b 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -26,6 +26,13 @@ entity core_debug is terminate : in std_ulogic; core_stopped : in std_ulogic; nia : in std_ulogic_vector(63 downto 0); + msr : in std_ulogic_vector(63 downto 0); + + -- GSPR register read port + dbg_gpr_req : out std_ulogic; + dbg_gpr_ack : in std_ulogic; + dbg_gpr_addr : out gspr_index_t; + dbg_gpr_data : in std_ulogic_vector(63 downto 0); -- Misc terminated_out : out std_ulogic @@ -61,6 +68,15 @@ architecture behave of core_debug is -- NIA register (read only for now) constant DBG_CORE_NIA : std_ulogic_vector(3 downto 0) := "0010"; + -- MSR (read only) + constant DBG_CORE_MSR : std_ulogic_vector(3 downto 0) := "0011"; + + -- GSPR register index + constant DBG_CORE_GSPR_INDEX : std_ulogic_vector(3 downto 0) := "0100"; + + -- GSPR register data + constant DBG_CORE_GSPR_DATA : std_ulogic_vector(3 downto 0) := "0101"; + -- Some internal wires signal stat_reg : std_ulogic_vector(63 downto 0); @@ -70,10 +86,15 @@ architecture behave of core_debug is signal do_reset : std_ulogic; signal do_icreset : std_ulogic; signal terminated : std_ulogic; + signal do_gspr_rd : std_ulogic; + signal gspr_index : gspr_index_t; begin - -- Single cycle register accesses on DMI - dmi_ack <= dmi_req; + -- Single cycle register accesses on DMI except for GSPR data + dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA + else dbg_gpr_ack; + dbg_gpr_req <= dmi_req when dmi_addr = DBG_CORE_GSPR_DATA + else '0'; -- Status register read composition stat_reg <= (2 => terminated, @@ -85,6 +106,8 @@ begin with dmi_addr select dmi_dout <= stat_reg when DBG_CORE_STAT, nia when DBG_CORE_NIA, + msr when DBG_CORE_MSR, + dbg_gpr_data when DBG_CORE_GSPR_DATA, (others => '0') when others; -- DMI writes @@ -126,6 +149,8 @@ begin stopping <= '0'; terminated <= '0'; end if; + elsif dmi_addr = DBG_CORE_GSPR_INDEX then + gspr_index <= dmi_din(gspr_index_t'left downto 0); end if; else report("DMI read from " & to_string(dmi_addr)); @@ -143,6 +168,8 @@ begin end if; end process; + dbg_gpr_addr <= gspr_index; + -- Core control signals generated by the debug module core_stop <= stopping and not do_step; core_rst <= do_reset; diff --git a/dcache.vhdl b/dcache.vhdl index 7d61a85..a9b5c4a 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -25,7 +25,13 @@ entity dcache is -- Number of lines in a set NUM_LINES : positive := 32; -- Number of ways - NUM_WAYS : positive := 4 + NUM_WAYS : positive := 4; + -- L1 DTLB entries per set + TLB_SET_SIZE : positive := 64; + -- L1 DTLB number of sets + TLB_NUM_WAYS : positive := 2; + -- L1 DTLB log_2(page_size) + TLB_LG_PGSZ : positive := 12 ); port ( clk : in std_ulogic; @@ -34,6 +40,9 @@ entity dcache is d_in : in Loadstore1ToDcacheType; d_out : out DcacheToLoadstore1Type; + m_in : in MmuToDcacheType; + m_out : out DcacheToMmuType; + stall_out : out std_ulogic; wishbone_out : out wishbone_master_out; @@ -56,6 +65,8 @@ architecture rtl of dcache is -- Bit fields counts in the address + -- REAL_ADDR_BITS is the number of real address bits that we store + constant REAL_ADDR_BITS : positive := 56; -- ROW_BITS is the number of bits to select a row constant ROW_BITS : natural := log2(BRAM_ROWS); -- ROW_LINEBITS is the number of bits to select a row within a line @@ -66,8 +77,10 @@ architecture rtl of dcache is constant ROW_OFF_BITS : natural := log2(ROW_SIZE); -- INDEX_BITS is the number if bits to select a cache line constant INDEX_BITS : natural := log2(NUM_LINES); + -- SET_SIZE_BITS is the log base 2 of the set size + constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; -- TAG_BITS is the number of bits of the tag part of the address - constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; + constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; -- WAY_BITS is the number of bits to select a way constant WAY_BITS : natural := log2(NUM_WAYS); @@ -80,7 +93,7 @@ architecture rtl of dcache is -- .. | |- --| ROW_OFF_BITS (3) -- .. |----- ---| | ROW_BITS (8) -- .. |-----| | INDEX_BITS (5) - -- .. --------| | TAG_BITS (53) + -- .. --------| | TAG_BITS (45) subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; @@ -110,7 +123,55 @@ architecture rtl of dcache is attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; - signal r0 : Loadstore1ToDcacheType; + -- L1 TLB. + constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE); + constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS); + constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_SET_BITS); + constant TLB_TAG_WAY_BITS : natural := TLB_NUM_WAYS * TLB_EA_TAG_BITS; + constant TLB_PTE_BITS : natural := 64; + constant TLB_PTE_WAY_BITS : natural := TLB_NUM_WAYS * TLB_PTE_BITS; + + subtype tlb_way_t is integer range 0 to TLB_NUM_WAYS - 1; + subtype tlb_index_t is integer range 0 to TLB_SET_SIZE - 1; + subtype tlb_way_valids_t is std_ulogic_vector(TLB_NUM_WAYS-1 downto 0); + type tlb_valids_t is array(tlb_index_t) of tlb_way_valids_t; + subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); + subtype tlb_way_tags_t is std_ulogic_vector(TLB_TAG_WAY_BITS-1 downto 0); + type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t; + subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); + subtype tlb_way_ptes_t is std_ulogic_vector(TLB_PTE_WAY_BITS-1 downto 0); + type tlb_ptes_t is array(tlb_index_t) of tlb_way_ptes_t; + type hit_way_set_t is array(tlb_way_t) of way_t; + + signal dtlb_valids : tlb_valids_t; + signal dtlb_tags : tlb_tags_t; + signal dtlb_ptes : tlb_ptes_t; + attribute ram_style of dtlb_tags : signal is "distributed"; + attribute ram_style of dtlb_ptes : signal is "distributed"; + + -- Record for storing permission, attribute, etc. bits from a PTE + type perm_attr_t is record + reference : std_ulogic; + changed : std_ulogic; + nocache : std_ulogic; + priv : std_ulogic; + rd_perm : std_ulogic; + wr_perm : std_ulogic; + end record; + + function extract_perm_attr(pte : std_ulogic_vector(TLB_PTE_BITS - 1 downto 0)) return perm_attr_t is + variable pa : perm_attr_t; + begin + pa.reference := pte(8); + pa.changed := pte(7); + pa.nocache := pte(5); + pa.priv := pte(3); + pa.rd_perm := pte(2); + pa.wr_perm := pte(1); + return pa; + end; + + constant real_mode_perm_attr : perm_attr_t := (nocache => '0', others => '1'); -- Type of operation on a "valid" input type op_t is (OP_NONE, @@ -118,6 +179,7 @@ architecture rtl of dcache is OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable load OP_BAD, -- BAD: Cache hit on NC load/store + OP_TLB_ERR, -- TLB miss or protection/RC failure OP_STORE_HIT, -- Store hitting cache OP_STORE_MISS); -- Store missing cache @@ -144,12 +206,25 @@ architecture rtl of dcache is -- first stage emits a stall for a complex op. -- + -- Stage 0 register, basically contains just the latched request + type reg_stage_0_t is record + req : Loadstore1ToDcacheType; + tlbie : std_ulogic; + doall : std_ulogic; + tlbld : std_ulogic; + mmu_req : std_ulogic; -- indicates source of request + end record; + + signal r0 : reg_stage_0_t; + signal r0_valid : std_ulogic; + -- First stage register, contains state for stage 1 of load hits -- and for the state machine used by all other operations -- type reg_stage_1_t is record -- Latch the complete request from ls1 req : Loadstore1ToDcacheType; + mmu_req : std_ulogic; -- Cache hit state hit_way : way_t; @@ -168,6 +243,13 @@ architecture rtl of dcache is store_way : way_t; store_row : row_t; store_index : index_t; + + -- Signals to complete with error + error_done : std_ulogic; + cache_paradox : std_ulogic; + + -- completion signal for tlbie + tlbie_done : std_ulogic; end record; signal r1 : reg_stage_1_t; @@ -208,6 +290,24 @@ architecture rtl of dcache is -- Wishbone read/write/cache write formatting signals signal bus_sel : std_ulogic_vector(7 downto 0); + -- TLB signals + signal tlb_tag_way : tlb_way_tags_t; + signal tlb_pte_way : tlb_way_ptes_t; + signal tlb_valid_way : tlb_way_valids_t; + signal tlb_req_index : tlb_index_t; + signal tlb_hit : std_ulogic; + signal tlb_hit_way : tlb_way_t; + signal pte : tlb_pte_t; + signal ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + signal valid_ra : std_ulogic; + signal perm_attr : perm_attr_t; + signal rc_ok : std_ulogic; + signal perm_ok : std_ulogic; + + -- TLB PLRU output interface + type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_victim : tlb_plru_out_t; + -- -- Helper functions to decode incoming requests -- @@ -215,13 +315,13 @@ architecture rtl of dcache is -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))); end; -- Return the cache row index (data memory) for an address function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; -- Returns whether this is the last row of a line @@ -269,9 +369,9 @@ architecture rtl of dcache is end; -- Get the tag value from the address - function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is + function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)) return cache_tag_t is begin - return addr(63 downto 64-TAG_BITS); + return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS); end; -- Read a tag from a tag memory row @@ -287,6 +387,38 @@ architecture rtl of dcache is tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; end; + -- Read a TLB tag from a TLB tag memory row + function read_tlb_tag(way: tlb_way_t; tags: tlb_way_tags_t) return tlb_tag_t is + variable j : integer; + begin + j := way * TLB_EA_TAG_BITS; + return tags(j + TLB_EA_TAG_BITS - 1 downto j); + end; + + -- Write a TLB tag to a TLB tag memory row + procedure write_tlb_tag(way: tlb_way_t; tags: inout tlb_way_tags_t; + tag: tlb_tag_t) is + variable j : integer; + begin + j := way * TLB_EA_TAG_BITS; + tags(j + TLB_EA_TAG_BITS - 1 downto j) := tag; + end; + + -- Read a PTE from a TLB PTE memory row + function read_tlb_pte(way: tlb_way_t; ptes: tlb_way_ptes_t) return tlb_pte_t is + variable j : integer; + begin + j := way * TLB_PTE_BITS; + return ptes(j + TLB_PTE_BITS - 1 downto j); + end; + + procedure write_tlb_pte(way: tlb_way_t; ptes: inout tlb_way_ptes_t; newpte: tlb_pte_t) is + variable j : integer; + begin + j := way * TLB_PTE_BITS; + ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; + end; + begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -297,13 +429,188 @@ begin report "geometry bits don't add up" severity FAILURE; assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS) report "geometry bits don't add up" severity FAILURE; assert (64 = wishbone_data_bits) report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; - + + -- Latch the request in r0.req as long as we're not stalling + stage_0 : process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r0.req.valid <= '0'; + elsif stall_out = '0' then + assert (d_in.valid and m_in.valid) = '0' report + "request collision loadstore vs MMU"; + if m_in.valid = '1' then + r0.req.valid <= '1'; + r0.req.load <= not (m_in.tlbie or m_in.tlbld); + r0.req.dcbz <= '0'; + r0.req.nc <= '0'; + r0.req.reserve <= '0'; + r0.req.virt_mode <= '0'; + r0.req.priv_mode <= '1'; + r0.req.addr <= m_in.addr; + r0.req.data <= m_in.pte; + r0.req.byte_sel <= (others => '1'); + r0.tlbie <= m_in.tlbie; + r0.doall <= m_in.doall; + r0.tlbld <= m_in.tlbld; + r0.mmu_req <= '1'; + else + r0.req <= d_in; + r0.tlbie <= '0'; + r0.doall <= '0'; + r0.tlbld <= '0'; + r0.mmu_req <= '0'; + end if; + end if; + end if; + end process; + + -- we don't yet handle collisions between loadstore1 requests and MMU requests + m_out.stall <= '0'; + + -- Hold off the request in r0 when stalling, + -- and cancel it if we get an error in a previous request. + r0_valid <= r0.req.valid and not stall_out and not r1.error_done; + + -- TLB + -- Operates in the second cycle on the request latched in r0.req. + -- TLB updates write the entry at the end of the second cycle. + tlb_read : process(clk) + variable index : tlb_index_t; + variable addrbits : std_ulogic_vector(TLB_SET_BITS - 1 downto 0); + begin + if rising_edge(clk) then + if stall_out = '1' then + -- keep reading the same thing while stalled + index := tlb_req_index; + else + if m_in.valid = '1' then + addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); + else + addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); + end if; + index := to_integer(unsigned(addrbits)); + end if; + tlb_valid_way <= dtlb_valids(index); + tlb_tag_way <= dtlb_tags(index); + tlb_pte_way <= dtlb_ptes(index); + end if; + end process; + + -- Generate TLB PLRUs + maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate + begin + tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate + -- TLB PLRU interface + signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_acc_en : std_ulogic; + signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + begin + tlb_plru : entity work.plru + generic map ( + BITS => TLB_WAY_BITS + ) + port map ( + clk => clk, + rst => rst, + acc => tlb_plru_acc, + acc_en => tlb_plru_acc_en, + lru => tlb_plru_out + ); + + process(tlb_req_index, tlb_hit, tlb_hit_way, tlb_plru_out) + begin + -- PLRU interface + if tlb_hit = '1' and tlb_req_index = i then + tlb_plru_acc_en <= '1'; + else + tlb_plru_acc_en <= '0'; + end if; + tlb_plru_acc <= std_ulogic_vector(to_unsigned(tlb_hit_way, TLB_WAY_BITS)); + tlb_plru_victim(i) <= tlb_plru_out; + end process; + end generate; + end generate; + + tlb_search : process(all) + variable hitway : tlb_way_t; + variable hit : std_ulogic; + variable eatag : tlb_tag_t; + begin + tlb_req_index <= to_integer(unsigned(r0.req.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 + downto TLB_LG_PGSZ))); + hitway := 0; + hit := '0'; + eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + for i in tlb_way_t loop + if tlb_valid_way(i) = '1' and + read_tlb_tag(i, tlb_tag_way) = eatag then + hitway := i; + hit := '1'; + end if; + end loop; + tlb_hit <= hit and r0_valid; + tlb_hit_way <= hitway; + if tlb_hit = '1' then + pte <= read_tlb_pte(hitway, tlb_pte_way); + else + pte <= (others => '0'); + end if; + valid_ra <= tlb_hit or not r0.req.virt_mode; + if r0.req.virt_mode = '1' then + ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + r0.req.addr(TLB_LG_PGSZ - 1 downto 0); + perm_attr <= extract_perm_attr(pte); + else + ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto 0); + perm_attr <= real_mode_perm_attr; + end if; + end process; + + tlb_update : process(clk) + variable tlbie : std_ulogic; + variable tlbwe : std_ulogic; + variable repl_way : tlb_way_t; + variable eatag : tlb_tag_t; + variable tagset : tlb_way_tags_t; + variable pteset : tlb_way_ptes_t; + begin + if rising_edge(clk) then + tlbie := r0_valid and r0.tlbie; + tlbwe := r0_valid and r0.tlbld; + if rst = '1' or (tlbie = '1' and r0.doall = '1') then + -- clear all valid bits at once + for i in tlb_index_t loop + dtlb_valids(i) <= (others => '0'); + end loop; + elsif tlbie = '1' then + if tlb_hit = '1' then + dtlb_valids(tlb_req_index)(tlb_hit_way) <= '0'; + end if; + elsif tlbwe = '1' then + if tlb_hit = '1' then + repl_way := tlb_hit_way; + else + repl_way := to_integer(unsigned(tlb_plru_victim(tlb_req_index))); + end if; + eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + tagset := tlb_tag_way; + write_tlb_tag(repl_way, tagset, eatag); + dtlb_tags(tlb_req_index) <= tagset; + pteset := tlb_pte_way; + write_tlb_pte(repl_way, pteset, r0.req.data); + dtlb_ptes(tlb_req_index) <= pteset; + dtlb_valids(tlb_req_index)(repl_way) <= '1'; + end if; + end if; + end process; + -- Generate PLRUs maybe_plrus: if NUM_WAYS > 1 generate begin @@ -341,53 +648,74 @@ begin end generate; end generate; - -- Latch the request in r0 as long as we're not stalling - stage_0 : process(clk) - begin - if rising_edge(clk) then - if rst = '1' then - r0.valid <= '0'; - elsif stall_out = '0' then - r0 <= d_in; - end if; - end if; - end process; - -- Cache request parsing and hit detection dcache_request : process(all) variable is_hit : std_ulogic; variable hit_way : way_t; variable op : op_t; - variable tmp : std_ulogic_vector(63 downto 0); - variable data : std_ulogic_vector(63 downto 0); - variable opsel : std_ulogic_vector(3 downto 0); + variable opsel : std_ulogic_vector(2 downto 0); variable go : std_ulogic; + variable nc : std_ulogic; + variable s_hit : std_ulogic; + variable s_tag : cache_tag_t; + variable s_pte : tlb_pte_t; + variable s_ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable hit_way_set : hit_way_set_t; begin -- Extract line, row and tag from request - req_index <= get_index(r0.addr); - req_row <= get_row(r0.addr); - req_tag <= get_tag(r0.addr); + req_index <= get_index(r0.req.addr); + req_row <= get_row(r0.req.addr); + req_tag <= get_tag(ra); -- Only do anything if not being stalled by stage 1 - go := r0.valid and not stall_out; + go := r0_valid and not (r0.tlbie or r0.tlbld); -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed -- - req_laddr <= r0.addr(63 downto LINE_OFF_BITS) & + req_laddr <= (63 downto REAL_ADDR_BITS => '0') & + ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way - hit_way := 0; - is_hit := '0'; - for i in way_t loop - if go = '1' and cache_valids(req_index)(i) = '1' then - if read_tag(i, cache_tags(req_index)) = req_tag then - hit_way := i; - is_hit := '1'; - end if; - end if; - end loop; + -- In order to make timing in virtual mode, when we are using the TLB, + -- we compare each way with each of the real addresses from each way of + -- the TLB, and then decide later which match to use. + hit_way := 0; + is_hit := '0'; + if r0.req.virt_mode = '1' then + for j in tlb_way_t loop + hit_way_set(j) := 0; + s_hit := '0'; + s_pte := read_tlb_pte(j, tlb_pte_way); + s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + r0.req.addr(TLB_LG_PGSZ - 1 downto 0); + s_tag := get_tag(s_ra); + for i in way_t loop + if go = '1' and cache_valids(req_index)(i) = '1' and + read_tag(i, cache_tags(req_index)) = s_tag and + tlb_valid_way(j) = '1' then + hit_way_set(j) := i; + s_hit := '1'; + end if; + end loop; + hit_set(j) := s_hit; + end loop; + if tlb_hit = '1' then + is_hit := hit_set(tlb_hit_way); + hit_way := hit_way_set(tlb_hit_way); + end if; + else + s_tag := get_tag(r0.req.addr(REAL_ADDR_BITS - 1 downto 0)); + for i in way_t loop + if go = '1' and cache_valids(req_index)(i) = '1' and + read_tag(i, cache_tags(req_index)) = s_tag then + hit_way := i; + is_hit := '1'; + end if; + end loop; + end if; -- The way that matched on a hit req_hit_way <= hit_way; @@ -395,22 +723,35 @@ begin -- The way to replace on a miss replace_way <= to_integer(unsigned(plru_victim(req_index))); - -- Combine the request and cache his status to decide what + -- work out whether we have permission for this access + -- NB we don't yet implement AMR, thus no KUAP + rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed); + perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and + (perm_attr.wr_perm or (r0.req.load and perm_attr.rd_perm)); + + -- Combine the request and cache hit status to decide what -- operation needs to be done -- - opsel := go & r0.load & r0.nc & is_hit; - case opsel is - when "1101" => op := OP_LOAD_HIT; - when "1100" => op := OP_LOAD_MISS; - when "1110" => op := OP_LOAD_NC; - when "1001" => op := OP_STORE_HIT; - when "1000" => op := OP_STORE_MISS; - when "1010" => op := OP_STORE_MISS; - when "1011" => op := OP_BAD; - when "1111" => op := OP_BAD; - when others => op := OP_NONE; - end case; - + nc := r0.req.nc or perm_attr.nocache; + op := OP_NONE; + if go = '1' then + if valid_ra = '1' and rc_ok = '1' and perm_ok = '1' then + opsel := r0.req.load & nc & is_hit; + case opsel is + when "101" => op := OP_LOAD_HIT; + when "100" => op := OP_LOAD_MISS; + when "110" => op := OP_LOAD_NC; + when "001" => op := OP_STORE_HIT; + when "000" => op := OP_STORE_MISS; + when "010" => op := OP_STORE_MISS; + when "011" => op := OP_BAD; + when "111" => op := OP_BAD; + when others => op := OP_NONE; + end case; + else + op := OP_TLB_ERR; + end if; + end if; req_op <= op; -- Version of the row number that is valid one cycle earlier @@ -418,7 +759,11 @@ begin -- If we're stalling then we need to keep reading the last -- row requested. if stall_out = '0' then - early_req_row <= get_row(d_in.addr); + if m_in.valid = '1' then + early_req_row <= get_row(m_in.addr); + else + early_req_row <= get_row(d_in.addr); + end if; else early_req_row <= req_row; end if; @@ -427,9 +772,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- TODO: Generate errors - -- err_nc_collision <= '1' when req_op = OP_BAD else '0'; - -- Generate stalls from stage 1 state machine stall_out <= '1' when r1.state /= IDLE else '0'; @@ -439,17 +781,17 @@ begin cancel_store <= '0'; set_rsrv <= '0'; clear_rsrv <= '0'; - if stall_out = '0' and r0.valid = '1' and r0.reserve = '1' then + if r0_valid = '1' and r0.req.reserve = '1' then -- XXX generate alignment interrupt if address is not aligned - -- XXX or if r0.nc = '1' - if r0.load = '1' then + -- XXX or if r0.req.nc = '1' + if r0.req.load = '1' then -- load with reservation set_rsrv <= '1'; else -- store conditional clear_rsrv <= '1'; if reservation.valid = '0' or - r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then + r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then cancel_store <= '1'; end if; end if; @@ -463,7 +805,7 @@ begin reservation.valid <= '0'; elsif set_rsrv = '1' then reservation.valid <= '1'; - reservation.addr <= r0.addr(63 downto LINE_OFF_BITS); + reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); end if; end if; end process; @@ -477,6 +819,13 @@ begin d_out.valid <= '0'; d_out.data <= cache_out(r1.hit_way); d_out.store_done <= '0'; + d_out.error <= '0'; + d_out.cache_paradox <= '0'; + + -- Outputs to MMU + m_out.done <= r1.tlbie_done; + m_out.err <= '0'; + m_out.data <= cache_out(r1.hit_way); -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -496,30 +845,63 @@ begin "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; - -- Load hit case is the standard path - if r1.hit_load_valid = '1' then - report "completing load hit"; - d_out.valid <= '1'; - end if; + if r1.mmu_req = '0' then + -- Request came from loadstore1... + -- Load hit case is the standard path + if r1.hit_load_valid = '1' then + report "completing load hit"; + d_out.valid <= '1'; + end if; - -- Slow ops (load miss, NC, stores) - if r1.slow_valid = '1' then - -- If it's a load, enable register writeback and switch - -- mux accordingly - -- - if r1.req.load then - -- Read data comes from the slow data latch - d_out.data <= r1.slow_data; - end if; - d_out.store_done <= '1'; + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing ld/st with error"; + d_out.error <= '1'; + d_out.cache_paradox <= r1.cache_paradox; + d_out.valid <= '1'; + end if; - report "completing store or load miss"; - d_out.valid <= '1'; - end if; + -- Slow ops (load miss, NC, stores) + if r1.slow_valid = '1' then + -- If it's a load, enable register writeback and switch + -- mux accordingly + -- + if r1.req.load then + -- Read data comes from the slow data latch + d_out.data <= r1.slow_data; + end if; + d_out.store_done <= '1'; + + report "completing store or load miss"; + d_out.valid <= '1'; + end if; + + if r1.stcx_fail = '1' then + d_out.store_done <= '0'; + d_out.valid <= '1'; + end if; + + else + -- Request came from MMU + if r1.hit_load_valid = '1' then + report "completing load hit to MMU, data=" & to_hstring(m_out.data); + m_out.done <= '1'; + end if; + + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing MMU ld with error"; + m_out.err <= '1'; + m_out.done <= '1'; + end if; - if r1.stcx_fail = '1' then - d_out.store_done <= '0'; - d_out.valid <= '1'; + -- Slow ops (i.e. load miss) + if r1.slow_valid = '1' then + -- Read data comes from the slow data latch + m_out.data <= r1.slow_data; + report "completing MMU load miss, data=" & to_hstring(m_out.data); + m_out.done <= '1'; + end if; end if; end process; @@ -578,8 +960,8 @@ begin if r1.state = IDLE then -- In IDLE state, the only write path is the store-hit update case wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= r0.data; - wr_sel <= r0.byte_sel; + wr_data <= r0.req.data; + wr_sel <= r0.req.byte_sel; else -- Otherwise, we might be doing a reload or a DCBZ if r1.req.dcbz = '1' then @@ -609,21 +991,23 @@ begin -- -- Cache hit synchronous machine for the easy case. This handles load hits. + -- It also handles error cases (TLB miss, cache paradox) -- dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- If we have a request incoming, we have to latch it as r0.valid + -- If we have a request incoming, we have to latch it as r0.req.valid -- is only set for a single cycle. It's up to the control logic to -- ensure we don't override an uncompleted request (for now we are -- single issue on load/stores so we are fine, later, we can generate -- a stall output if necessary). if req_op /= OP_NONE and stall_out = '0' then - r1.req <= r0; + r1.req <= r0.req; + r1.mmu_req <= r0.mmu_req; report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(r0.addr) & - " nc:" & std_ulogic'image(r0.nc) & + " addr:" & to_hstring(r0.req.addr) & + " nc:" & std_ulogic'image(r0.req.nc) & " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); @@ -636,6 +1020,23 @@ begin else r1.hit_load_valid <= '0'; end if; + + if req_op = OP_TLB_ERR then + report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & + " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); + r1.error_done <= '1'; + r1.cache_paradox <= '0'; + elsif req_op = OP_BAD then + report "Signalling cache paradox"; + r1.error_done <= '1'; + r1.cache_paradox <= '1'; + else + r1.error_done <= '0'; + r1.cache_paradox <= '0'; + end if; + + -- complete tlbies and TLB loads in the third cycle + r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); end if; end process; @@ -681,7 +1082,7 @@ begin when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(r0.addr) & + report "cache miss addr:" & to_hstring(r0.req.addr) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); @@ -716,18 +1117,18 @@ begin r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => - r1.wb.sel <= r0.byte_sel; - r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.sel <= r0.req.byte_sel; + r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - if r0.dcbz = '0' then - r1.wb.sel <= r0.byte_sel; - r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= r0.data; + if r0.req.dcbz = '0' then + r1.wb.sel <= r0.req.byte_sel; + r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; + r1.wb.dat <= r0.req.data; if cancel_store = '0' then r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -774,8 +1175,10 @@ begin end if; -- OP_NONE and OP_BAD do nothing + -- OP_BAD was handled above already when OP_NONE => - when OP_BAD => + when OP_BAD => + when OP_TLB_ERR => end case; when RELOAD_WAIT_ACK => diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index bd8341a..48c6877 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -15,6 +15,9 @@ architecture behave of dcache_tb is signal d_in : Loadstore1ToDcacheType; signal d_out : DcacheToLoadstore1Type; + signal m_in : MmuToDcacheType; + signal m_out : DcacheToMmuType; + signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -30,6 +33,8 @@ begin rst => rst, d_in => d_in, d_out => d_out, + m_in => m_in, + m_out => m_out, wishbone_out => wb_bram_in, wishbone_in => wb_bram_out ); @@ -71,6 +76,9 @@ begin d_in.nc <= '0'; d_in.addr <= (others => '0'); d_in.data <= (others => '0'); + m_in.valid <= '0'; + m_in.addr <= (others => '0'); + m_in.pte <= (others => '0'); wait for 4*clk_period; wait until rising_edge(clk); diff --git a/decode1.vhdl b/decode1.vhdl index 785b669..4cd195f 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -181,11 +181,13 @@ architecture behaviour of decode1 is 2#1111101001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divdo 2#0111101011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divw 2#1111101011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divwo + 2#1101010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- eieio 2#0100011100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- eqv 2#1110111010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsb 2#1110011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsh 2#1111011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsw - -- 2#110111101-# extswsli + 2#1101111010# => (ALU, OP_EXTSWSLI, NONE, CONST_SH, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extswsli + 2#1101111011# => (ALU, OP_EXTSWSLI, NONE, CONST_SH, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extswsli 2#1111010110# => (ALU, OP_ICBI, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- icbi 2#0000010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- icbt 2#0000001111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- isel @@ -247,7 +249,7 @@ architecture behaviour of decode1 is -- 2#1001000000# mcrxrx 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf 2#0001010011# => (ALU, OP_MFMSR, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mfmsr - 2#0101010011# => (ALU, OP_MFSPR, SPR, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr + 2#0101010011# => (ALU, OP_MFSPR, SPR, NONE, RS, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr 2#0100001001# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- modud 2#0100001011# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- moduw 2#1100001001# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- modsd @@ -280,6 +282,7 @@ architecture behaviour of decode1 is 2#0010111010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd 2#0010011010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw -- 2#0010000000# setb + 2#0111110010# => (LDST, OP_TLBIE, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- slbia 2#0000011011# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- sld 2#0000011000# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- slw 2#1100011010# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- srad @@ -321,6 +324,8 @@ architecture behaviour of decode1 is 2#1001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sync 2#0001000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- td 2#0000000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- tw + 2#0100110010# => (LDST, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbie + 2#0100010010# => (LDST, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbiel 2#0100111100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- xor others => illegal_inst ); @@ -342,9 +347,10 @@ architecture behaviour of decode1 is others => decode_rom_init ); - -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl - -- op in out A out in out len ext pipe - constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); + -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl + -- op in out A out in out len ext pipe + constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); + constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); begin decode1_0: process(clk) @@ -361,6 +367,7 @@ begin variable v : Decode1ToDecode2Type; variable majorop : major_opcode_t; variable op_19_bits: std_ulogic_vector(2 downto 0); + variable sprn : spr_num_t; begin v := r; @@ -376,7 +383,15 @@ begin end if; majorop := unsigned(f_in.insn(31 downto 26)); - if majorop = "011111" then + if f_in.fetch_failed = '1' then + v.valid := '1'; + -- Only send down a single OP_FETCH_FAILED + if r.decode.insn_type = OP_FETCH_FAILED then + v.valid := '0'; + end if; + v.decode := fetch_fail_inst; + + elsif majorop = "011111" then -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); @@ -427,10 +442,17 @@ begin end if; end if; elsif v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then - v.ispr1 := fast_spr_num(decode_spr_num(f_in.insn)); + sprn := decode_spr_num(f_in.insn); + v.ispr1 := fast_spr_num(sprn); -- Make slow SPRs single issue if is_fast_spr(v.ispr1) = '0' then v.decode.sgl_pipe := '1'; + -- send MMU-related SPRs to loadstore1 + case sprn is + when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL => + v.decode.unit := LDST; + when others => + end case; end if; elsif v.decode.insn_type = OP_RFID then report "PPC RFID"; diff --git a/decode2.vhdl b/decode2.vhdl index edcc50c..b239392 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -285,9 +285,9 @@ begin decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1); - r_out.read1_enable <= decoded_reg_a.reg_valid; - r_out.read2_enable <= decoded_reg_b.reg_valid; - r_out.read3_enable <= decoded_reg_c.reg_valid; + r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid; + r_out.read2_enable <= decoded_reg_b.reg_valid and d_in.valid; + r_out.read3_enable <= decoded_reg_c.reg_valid and d_in.valid; case d_in.decode.length is when is1B => diff --git a/decode_types.vhdl b/decode_types.vhdl index 07c486a..8f000a0 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -16,8 +16,9 @@ package decode_types is OP_POPCNT, OP_PRTY, OP_RFID, OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB, OP_SHL, OP_SHR, - OP_SYNC, OP_TRAP, - OP_XOR + OP_SYNC, OP_TLBIE, OP_TRAP, + OP_XOR, + OP_FETCH_FAILED ); type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR); diff --git a/execute1.vhdl b/execute1.vhdl index 8286d30..688f93c 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -23,6 +23,7 @@ entity execute1 is stall_out : out std_ulogic; e_in : in Decode2ToExecute1Type; + l_in : in Loadstore1ToExecute1Type; i_in : in XicsToExecute1Type; @@ -32,6 +33,8 @@ entity execute1 is e_out : out Execute1ToWritebackType; + dbg_msr_out : out std_ulogic_vector(63 downto 0); + icache_inval : out std_ulogic; terminate_out : out std_ulogic ); @@ -49,6 +52,7 @@ architecture behaviour of execute1 is slow_op_rc : std_ulogic; slow_op_oe : std_ulogic; slow_op_xerc : xer_common_t; + ldst_nia : std_ulogic_vector(63 downto 0); end record; constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, lr_update => '0', @@ -63,6 +67,7 @@ architecture behaviour of execute1 is signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); signal ctrl_tmp: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; + signal rot_sign_ext: std_ulogic; signal rotator_result: std_ulogic_vector(63 downto 0); signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); @@ -85,6 +90,7 @@ architecture behaviour of execute1 is OP_MFMSR => SUPER, OP_MTMSRD => SUPER, OP_RFID => SUPER, + OP_TLBIE => SUPER, others => USER ); @@ -174,6 +180,7 @@ begin arith => e_in.is_signed, clear_left => rot_clear_left, clear_right => rot_clear_right, + sign_ext_rs => rot_sign_ext, result => rotator_result, carry_out => rotator_carry ); @@ -215,6 +222,8 @@ begin d_out => divider_to_x ); + dbg_msr_out <= ctrl.msr; + a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1; b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3; @@ -421,6 +430,9 @@ begin icache_inval <= '0'; stall_out <= '0'; f_out <= Execute1ToFetch1TypeInit; + -- send MSR[IR] and ~MSR[PR] up to fetch1 + f_out.virt_mode <= ctrl.msr(MSR_IR); + f_out.priv_mode <= not ctrl.msr(MSR_PR); -- Next insn adder used in a couple of places next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4); @@ -429,6 +441,7 @@ begin right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; + rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; ctrl_tmp.irq_state <= WRITE_SRR0; exception := '0'; @@ -438,9 +451,9 @@ begin v.e.exc_write_reg := fast_spr_num(SPR_SRR0); v.e.exc_write_data := e_in.nia; - if ctrl.irq_state = WRITE_SRR1 then - v.e.exc_write_reg := fast_spr_num(SPR_SRR1); - v.e.exc_write_data := ctrl.srr1; + if ctrl.irq_state = WRITE_SRR1 then + v.e.exc_write_reg := fast_spr_num(SPR_SRR1); + v.e.exc_write_data := ctrl.srr1; v.e.exc_write_enable := '1'; ctrl_tmp.msr(MSR_SF) <= '1'; ctrl_tmp.msr(MSR_EE) <= '0'; @@ -450,13 +463,15 @@ begin ctrl_tmp.msr(MSR_RI) <= '0'; ctrl_tmp.msr(MSR_LE) <= '1'; f_out.redirect <= '1'; + f_out.virt_mode <= '0'; + f_out.priv_mode <= '1'; f_out.redirect_nia <= ctrl.irq_nia; v.e.valid := e_in.valid; report "Writing SRR1: " & to_hstring(ctrl.srr1); elsif irq_valid = '1' and e_in.valid = '1' then -- we need two cycles to write srr0 and 1 - -- will need more when we have to write DSISR, DAR and HIER + -- will need more when we have to write HEIR -- Don't deliver the interrupt until we have a valid instruction -- coming in, so we have a valid NIA to put in SRR0. exception := '1'; @@ -487,13 +502,12 @@ begin when OP_ILLEGAL => -- we need two cycles to write srr0 and 1 - -- will need more when we have to write DSISR, DAR and HIER + -- will need more when we have to write HEIR illegal := '1'; when OP_SC => -- check bit 1 of the instruction is 1 so we know this is sc; -- 0 would mean scv, so generate an illegal instruction interrupt -- we need two cycles to write srr0 and 1 - -- will need more when we have to write DSISR, DAR and HIER if e_in.insn(1) = '1' then exception := '1'; exception_nextpc := '1'; @@ -642,6 +656,8 @@ begin when OP_RFID => f_out.redirect <= '1'; + f_out.virt_mode <= b_in(MSR_IR) or b_in(MSR_PR); + f_out.priv_mode <= not b_in(MSR_PR); f_out.redirect_nia <= a_in(63 downto 2) & "00"; -- srr0 -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. @@ -732,6 +748,7 @@ begin when OP_MFSPR => report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(a_in); + result_en := '1'; if is_fast_spr(e_in.read_reg1) then result := a_in; if decode_spr_num(e_in.insn) = SPR_XER then @@ -750,11 +767,15 @@ begin result := ctrl.tb; when SPR_DEC => result := ctrl.dec; - when others => - result := (others => '0'); + when others => + -- mfspr from unimplemented SPRs should be a nop in + -- supervisor mode and a program interrupt for user mode + result := c_in; + if ctrl.msr(MSR_PR) = '1' then + illegal := '1'; + end if; end case; end if; - result_en := '1'; when OP_MFCR => if e_in.insn(20) = '0' then -- mfcr @@ -820,6 +841,11 @@ begin when SPR_DEC => ctrl_tmp.dec <= c_in; when others => + -- mtspr to unimplemented SPRs should be a nop in + -- supervisor mode and a program interrupt for user mode + if ctrl.msr(MSR_PR) = '1' then + illegal := '1'; + end if; end case; end if; when OP_POPCNT => @@ -828,7 +854,7 @@ begin when OP_PRTY => result := parity_result; result_en := '1'; - when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR => + when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI => result := rotator_result; if e_in.output_carry = '1' then set_carry(v.e, rotator_carry, rotator_carry); @@ -882,6 +908,7 @@ begin elsif e_in.valid = '1' then -- instruction for other units, i.e. LDST + v.ldst_nia := e_in.nia; v.e.valid := '0'; if e_in.unit = LDST then lv.valid := '1'; @@ -952,8 +979,38 @@ begin v.e.write_data := result; v.e.write_enable := result_en; + -- generate DSI or DSegI for load/store exceptions + -- or ISI or ISegI for instruction fetch exceptions + if l_in.exception = '1' then + ctrl_tmp.srr1 <= msr_copy(ctrl.msr); + if l_in.instr_fault = '0' then + if l_in.segment_fault = '0' then + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#300#, 64)); + else + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#380#, 64)); + end if; + else + if l_in.segment_fault = '0' then + ctrl_tmp.srr1(63 - 33) <= l_in.invalid; + ctrl_tmp.srr1(63 - 35) <= l_in.perm_error; -- noexec fault + ctrl_tmp.srr1(63 - 44) <= l_in.badtree; + ctrl_tmp.srr1(63 - 45) <= l_in.rc_error; + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#400#, 64)); + else + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#480#, 64)); + end if; + end if; + v.e.exc_write_enable := '1'; + v.e.exc_write_reg := fast_spr_num(SPR_SRR0); + v.e.exc_write_data := r.ldst_nia; + report "ldst exception writing srr0=" & to_hstring(r.ldst_nia); + ctrl_tmp.irq_state <= WRITE_SRR1; + v.e.valid := '1'; -- complete the original load or store + end if; + -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; + lv.nia := e_in.nia; lv.addr1 := a_in; lv.addr2 := b_in; lv.data := c_in; @@ -966,11 +1023,14 @@ begin lv.xerc := v.e.xerc; lv.reserve := e_in.reserve; lv.rc := e_in.rc; + lv.insn := e_in.insn; -- decode l*cix and st*cix instructions here if e_in.insn(31 downto 26) = "011111" and e_in.insn(10 downto 9) = "11" and e_in.insn(5 downto 1) = "10101" then lv.ci := '1'; end if; + lv.virt_mode := ctrl.msr(MSR_DR); + lv.priv_mode := not ctrl.msr(MSR_PR); -- Update registers rin <= v; diff --git a/fetch1.vhdl b/fetch1.vhdl index 301f317..cb1d1df 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -42,6 +42,8 @@ begin if rising_edge(clk) then if r /= r_next then report "fetch1 rst:" & std_ulogic'image(rst) & + " IR:" & std_ulogic'image(e_in.virt_mode) & + " P:" & std_ulogic'image(e_in.priv_mode) & " R:" & std_ulogic'image(e_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & @@ -67,9 +69,13 @@ begin else v.nia := RESET_ADDRESS; end if; + v.virt_mode := '0'; + v.priv_mode := '1'; v_int.stop_state := RUNNING; elsif e_in.redirect = '1' then v.nia := e_in.redirect_nia; + v.virt_mode := e_in.virt_mode; + v.priv_mode := e_in.priv_mode; elsif stall_in = '0' then -- For debug stop/step to work properly we need a little bit of diff --git a/fetch2.vhdl b/fetch2.vhdl index 5474ca6..13ff56e 100644 --- a/fetch2.vhdl +++ b/fetch2.vhdl @@ -46,6 +46,7 @@ begin " F:" & std_ulogic'image(flush_in) & " T:" & std_ulogic'image(rin.stop_mark) & " V:" & std_ulogic'image(rin.valid) & + " FF:" & std_ulogic'image(rin.fetch_failed) & " nia:" & to_hstring(rin.nia); end if; @@ -84,6 +85,7 @@ begin v.valid := v_i_in.valid; v.stop_mark := v_i_in.stop_mark; + v.fetch_failed := v_i_in.fetch_failed; v.nia := v_i_in.nia; v.insn := v_i_in.insn; @@ -94,12 +96,14 @@ begin -- if flush_in = '1' then v_int.stash.valid := '0'; + v_int.stash.fetch_failed := '0'; end if; -- If we are flushing or the instruction comes with a stop mark -- we tag it as invalid so it doesn't get decoded and executed if flush_in = '1' or v.stop_mark = '1' then v.valid := '0'; + v.fetch_failed := '0'; end if; -- Clear stash on reset diff --git a/icache.vhdl b/icache.vhdl index 3eaf548..35d64a8 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -35,7 +35,13 @@ entity icache is -- Number of lines in a set NUM_LINES : positive := 32; -- Number of ways - NUM_WAYS : positive := 4 + NUM_WAYS : positive := 4; + -- L1 ITLB number of entries (direct mapped) + TLB_SIZE : positive := 64; + -- L1 ITLB log_2(page_size) + TLB_LG_PGSZ : positive := 12; + -- Number of real address bits that we store + REAL_ADDR_BITS : positive := 56 ); port ( clk : in std_ulogic; @@ -44,6 +50,8 @@ entity icache is i_in : in Fetch1ToIcacheType; i_out : out IcacheToFetch2Type; + m_in : in MmuToIcacheType; + stall_out : out std_ulogic; flush_in : in std_ulogic; @@ -78,10 +86,12 @@ architecture rtl of icache is constant LINE_OFF_BITS : natural := log2(LINE_SIZE); -- ROW_OFF_BITS is the number of bits for the offset in a row constant ROW_OFF_BITS : natural := log2(ROW_SIZE); - -- INDEX_BITS is the number if bits to select a cache line + -- INDEX_BITS is the number of bits to select a cache line constant INDEX_BITS : natural := log2(NUM_LINES); + -- SET_SIZE_BITS is the log base 2 of the set size + constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; -- TAG_BITS is the number of bits of the tag part of the address - constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; + constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; -- WAY_BITS is the number of bits to select a way constant WAY_BITS : natural := log2(NUM_WAYS); @@ -126,6 +136,27 @@ architecture rtl of icache is attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; + -- L1 ITLB. + constant TLB_BITS : natural := log2(TLB_SIZE); + constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS); + constant TLB_PTE_BITS : natural := 64; + + subtype tlb_index_t is integer range 0 to TLB_SIZE - 1; + type tlb_valids_t is array(tlb_index_t) of std_ulogic; + subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); + type tlb_tags_t is array(tlb_index_t) of tlb_tag_t; + subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); + type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t; + + signal itlb_valids : tlb_valids_t; + signal itlb_tags : tlb_tags_t; + signal itlb_ptes : tlb_ptes_t; + attribute ram_style of itlb_tags : signal is "distributed"; + attribute ram_style of itlb_ptes : signal is "distributed"; + + -- Privilege bit from PTE EAA field + signal eaa_priv : std_ulogic; + -- Cache reload state machine type state_t is (IDLE, WAIT_ACK); @@ -142,6 +173,9 @@ architecture rtl of icache is store_way : way_t; store_index : index_t; store_row : row_t; + + -- TLB miss state + fetch_failed : std_ulogic; end record; signal r : reg_internal_t; @@ -155,6 +189,12 @@ architecture rtl of icache is signal req_is_miss : std_ulogic; signal req_laddr : std_ulogic_vector(63 downto 0); + signal tlb_req_index : tlb_index_t; + signal real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + signal ra_valid : std_ulogic; + signal priv_fault : std_ulogic; + signal access_ok : std_ulogic; + -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -167,13 +207,13 @@ architecture rtl of icache is -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))); end; -- Return the cache row index (data memory) for an address function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; -- Returns whether this is the last row of a line @@ -231,9 +271,9 @@ architecture rtl of icache is end; -- Get the tag value from the address - function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is + function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)) return cache_tag_t is begin - return addr(63 downto 64-TAG_BITS); + return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS); end; -- Read a tag from a tag memory row @@ -249,6 +289,15 @@ architecture rtl of icache is tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; end; + -- Simple hash for direct-mapped TLB index + function hash_ea(addr: std_ulogic_vector(63 downto 0)) return tlb_index_t is + variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0); + begin + hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ) + xor addr(TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto TLB_LG_PGSZ + TLB_BITS) + xor addr(TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto TLB_LG_PGSZ + 2 * TLB_BITS); + return to_integer(unsigned(hash)); + end; begin assert LINE_SIZE mod ROW_SIZE = 0; @@ -260,9 +309,9 @@ begin report "geometry bits don't add up" severity FAILURE; assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS) report "geometry bits don't add up" severity FAILURE; sim_debug: if SIM generate @@ -356,6 +405,56 @@ begin end generate; end generate; + -- TLB hit detection and real address generation + itlb_lookup : process(all) + variable pte : tlb_pte_t; + variable ttag : tlb_tag_t; + begin + tlb_req_index <= hash_ea(i_in.nia); + pte := itlb_ptes(tlb_req_index); + ttag := itlb_tags(tlb_req_index); + if i_in.virt_mode = '1' then + real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + i_in.nia(TLB_LG_PGSZ - 1 downto 0); + if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then + ra_valid <= itlb_valids(tlb_req_index); + else + ra_valid <= '0'; + end if; + eaa_priv <= pte(3); + else + real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0); + ra_valid <= '1'; + eaa_priv <= '1'; + end if; + + -- no IAMR, so no KUEP support for now + priv_fault <= eaa_priv and not i_in.priv_mode; + access_ok <= ra_valid and not priv_fault; + end process; + + -- iTLB update + itlb_update: process(clk) + variable wr_index : tlb_index_t; + begin + if rising_edge(clk) then + wr_index := hash_ea(m_in.addr); + if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then + -- clear all valid bits + for i in tlb_index_t loop + itlb_valids(i) <= '0'; + end loop; + elsif m_in.tlbie = '1' then + -- clear entry regardless of hit or miss + itlb_valids(wr_index) <= '0'; + elsif m_in.tlbld = '1' then + itlb_tags(wr_index) <= m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS); + itlb_ptes(wr_index) <= m_in.pte; + itlb_valids(wr_index) <= '1'; + end if; + end if; + end process; + -- Cache hit detection, output to fetch2 and other misc logic icache_comb : process(all) variable is_hit : std_ulogic; @@ -364,12 +463,13 @@ begin -- Extract line, row and tag from request req_index <= get_index(i_in.nia); req_row <= get_row(i_in.nia); - req_tag <= get_tag(i_in.nia); + req_tag <= get_tag(real_addr); -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed -- - req_laddr <= i_in.nia(63 downto LINE_OFF_BITS) & + req_laddr <= (63 downto REAL_ADDR_BITS => '0') & + real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way @@ -385,8 +485,13 @@ begin end loop; -- Generate the "hit" and "miss" signals for the synchronous blocks - req_is_hit <= i_in.req and is_hit and not flush_in and not rst; - req_is_miss <= i_in.req and not is_hit and not flush_in; + if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' then + req_is_hit <= is_hit; + req_is_miss <= not is_hit; + else + req_is_hit <= '0'; + req_is_miss <= '0'; + end if; req_hit_way <= hit_way; -- The way to replace on a miss @@ -404,9 +509,10 @@ begin i_out.valid <= r.hit_valid; i_out.nia <= r.hit_nia; i_out.stop_mark <= r.hit_smark; + i_out.fetch_failed <= r.fetch_failed; - -- Stall fetch1 if we have a miss - stall_out <= not is_hit; + -- Stall fetch1 if we have a miss on cache or TLB or a protection fault + stall_out <= not (is_hit and access_ok); -- Wishbone requests output (from the cache miss reload machine) wishbone_out <= r.wb; @@ -419,22 +525,21 @@ begin -- On a hit, latch the request for the next cycle, when the BRAM data -- will be available on the cache_out output of the corresponding way -- + r.hit_valid <= req_is_hit; + -- Send stop marks and NIA down regardless of validity + r.hit_smark <= i_in.stop_mark; + r.hit_nia <= i_in.nia; if req_is_hit = '1' then r.hit_way <= req_hit_way; - r.hit_nia <= i_in.nia; r.hit_smark <= i_in.stop_mark; - r.hit_valid <= '1'; report "cache hit nia:" & to_hstring(i_in.nia) & + " IR:" & std_ulogic'image(i_in.virt_mode) & " SM:" & std_ulogic'image(i_in.stop_mark) & " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & - " way: " & integer'image(req_hit_way); - else - r.hit_valid <= '0'; - - -- Send stop marks down regardless of validity - r.hit_smark <= i_in.stop_mark; + " way:" & integer'image(req_hit_way) & + " RA:" & to_hstring(real_addr); end if; end if; end process; @@ -468,10 +573,12 @@ begin -- We need to read a cache line if req_is_miss = '1' then report "cache miss nia:" & to_hstring(i_in.nia) & + " IR:" & std_ulogic'image(i_in.virt_mode) & " SM:" & std_ulogic'image(i_in.stop_mark) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & - " tag:" & to_hstring(req_tag); + " tag:" & to_hstring(req_tag) & + " RA:" & to_hstring(real_addr); -- Force misses on that way while reloading that line cache_valids(req_index)(replace_way) <= '0'; @@ -539,6 +646,13 @@ begin end if; end case; end if; + + -- TLB miss and protection fault processing + if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then + r.fetch_failed <= '0'; + elsif i_in.req = '1' and access_ok = '0' then + r.fetch_failed <= '1'; + end if; end if; end process; end; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index ea5cf3a..09a644b 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -15,6 +15,8 @@ architecture behave of icache_tb is signal i_out : Fetch1ToIcacheType; signal i_in : IcacheToFetch2Type; + signal m_out : MmuToIcacheType; + signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -30,6 +32,7 @@ begin rst => rst, i_in => i_out, i_out => i_in, + m_in => m_out, flush_in => '0', wishbone_out => wb_bram_in, wishbone_in => wb_bram_out @@ -70,6 +73,11 @@ begin i_out.nia <= (others => '0'); i_out.stop_mark <= '0'; + m_out.tlbld <= '0'; + m_out.tlbie <= '0'; + m_out.addr <= (others => '0'); + m_out.pte <= (others => '0'); + wait until rising_edge(clk); wait until rising_edge(clk); wait until rising_edge(clk); diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 90650db..e71ad74 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -5,7 +5,6 @@ use ieee.numeric_std.all; library work; use work.decode_types.all; use work.common.all; -use work.helpers.all; -- 2 cycle LSU -- We calculate the address in the first cycle @@ -16,11 +15,15 @@ entity loadstore1 is rst : in std_ulogic; l_in : in Execute1ToLoadstore1Type; + e_out : out Loadstore1ToExecute1Type; l_out : out Loadstore1ToWritebackType; d_out : out Loadstore1ToDcacheType; d_in : in DcacheToLoadstore1Type; + m_out : out Loadstore1ToMmuType; + m_in : in MmuToLoadstore1Type; + dc_stall : in std_ulogic; stall_out : out std_ulogic ); @@ -35,14 +38,16 @@ architecture behave of loadstore1 is -- State machine for unaligned loads/stores type state_t is (IDLE, -- ready for instruction SECOND_REQ, -- send 2nd request of unaligned xfer - FIRST_ACK_WAIT, -- waiting for 1st ack from dcache - LAST_ACK_WAIT, -- waiting for last ack from dcache - LD_UPDATE -- writing rA with computed addr on load + ACK_WAIT, -- waiting for ack from dcache + LD_UPDATE, -- writing rA with computed addr on load + MMU_LOOKUP, -- waiting for MMU to look up translation + TLBIE_WAIT -- waiting for MMU to finish doing a tlbie ); type reg_stage_t is record -- latch most of the input request load : std_ulogic; + tlbie : std_ulogic; dcbz : std_ulogic; addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); @@ -57,8 +62,15 @@ architecture behave of loadstore1 is reserve : std_ulogic; rc : std_ulogic; nc : std_ulogic; -- non-cacheable access + virt_mode : std_ulogic; + priv_mode : std_ulogic; state : state_t; + dwords_done : std_ulogic; + first_bytes : std_ulogic_vector(7 downto 0); second_bytes : std_ulogic_vector(7 downto 0); + dar : std_ulogic_vector(63 downto 0); + dsisr : std_ulogic_vector(31 downto 0); + instr_fault : std_ulogic; end record; type byte_sel_t is array(0 to 7) of std_ulogic; @@ -135,6 +147,15 @@ begin variable use_second : byte_sel_t; variable trim_ctl : trim_ctl_t; variable negative : std_ulogic; + variable mfspr : std_ulogic; + variable sprn : std_ulogic_vector(9 downto 0); + variable sprval : std_ulogic_vector(63 downto 0); + variable exception : std_ulogic; + variable next_addr : std_ulogic_vector(63 downto 0); + variable mmureq : std_ulogic; + variable dsisr : std_ulogic_vector(31 downto 0); + variable mmu_mtspr : std_ulogic; + variable itlb_fault : std_ulogic; begin v := r; req := '0'; @@ -142,6 +163,14 @@ begin done := '0'; byte_sel := (others => '0'); addr := lsu_sum; + mfspr := '0'; + mmu_mtspr := '0'; + itlb_fault := '0'; + sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); + sprval := (others => '0'); -- avoid inferred latches + exception := '0'; + dsisr := (others => '0'); + mmureq := '0'; write_enable := '0'; do_update := '0'; @@ -195,17 +224,73 @@ begin end case; end loop; + -- compute (addr + 8) & ~7 for the second doubleword when unaligned + next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + case r.state is when IDLE => if l_in.valid = '1' then + v.addr := lsu_sum; v.load := '0'; v.dcbz := '0'; - if l_in.op = OP_LOAD then + v.tlbie := '0'; + v.instr_fault := '0'; + v.dwords_done := '0'; + case l_in.op is + when OP_STORE => + req := '1'; + when OP_LOAD => + req := '1'; v.load := '1'; - elsif l_in.op = OP_DCBZ then + when OP_DCBZ => + req := '1'; v.dcbz := '1'; - end if; - v.addr := lsu_sum; + when OP_TLBIE => + mmureq := '1'; + stall := '1'; + v.tlbie := '1'; + v.state := TLBIE_WAIT; + when OP_MFSPR => + done := '1'; + mfspr := '1'; + -- partial decode on SPR number should be adequate given + -- the restricted set that get sent down this path + if sprn(9) = '0' and sprn(5) = '0' then + if sprn(0) = '0' then + sprval := x"00000000" & r.dsisr; + else + sprval := r.dar; + end if; + else + -- reading one of the SPRs in the MMU + sprval := m_in.sprval; + end if; + when OP_MTSPR => + if sprn(9) = '0' and sprn(5) = '0' then + if sprn(0) = '0' then + v.dsisr := l_in.data(31 downto 0); + else + v.dar := l_in.data; + end if; + done := '1'; + else + -- writing one of the SPRs in the MMU + mmu_mtspr := '1'; + stall := '1'; + v.state := TLBIE_WAIT; + end if; + when OP_FETCH_FAILED => + -- send it to the MMU to do the radix walk + addr := l_in.nia; + v.addr := l_in.nia; + v.instr_fault := '1'; + mmureq := '1'; + stall := '1'; + v.state := MMU_LOOKUP; + when others => + assert false report "unknown op sent to loadstore1"; + end case; + v.write_reg := l_in.write_reg; v.length := l_in.length; v.byte_reverse := l_in.byte_reverse; @@ -216,24 +301,25 @@ begin v.reserve := l_in.reserve; v.rc := l_in.rc; v.nc := l_in.ci; + v.virt_mode := l_in.virt_mode; + v.priv_mode := l_in.priv_mode; -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- + -- is the form 0xc------- for a real-mode access. -- -- This will have to be replaced by a combination of implementing the -- proper HV CI load/store instructions and having an MMU to get the I -- bit otherwise. - if lsu_sum(31 downto 28) = "1100" then + if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then v.nc := '1'; end if; -- Do length_to_sel and work out if we are doing 2 dwords long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); byte_sel := long_sel(7 downto 0); + v.first_bytes := byte_sel; v.second_bytes := long_sel(15 downto 8); - v.addr := lsu_sum; - -- Do byte reversing and rotating for stores in the first cycle byte_offset := unsigned(lsu_sum(2 downto 0)); brev_lenm1 := "000"; @@ -246,52 +332,121 @@ begin v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); end loop; - req := '1'; - stall := '1'; - if long_sel(15 downto 8) = "00000000" then - v.state := LAST_ACK_WAIT; - else - v.state := SECOND_REQ; + if req = '1' then + stall := '1'; + if long_sel(15 downto 8) = "00000000" then + v.state := ACK_WAIT; + else + v.state := SECOND_REQ; + end if; end if; end if; when SECOND_REQ => - -- compute (addr + 8) & ~7 for the second doubleword when unaligned - addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + addr := next_addr; byte_sel := r.second_bytes; req := '1'; stall := '1'; - v.state := FIRST_ACK_WAIT; + v.state := ACK_WAIT; - when FIRST_ACK_WAIT => + when ACK_WAIT => stall := '1'; if d_in.valid = '1' then - v.state := LAST_ACK_WAIT; - if r.load = '1' then - v.load_data := data_permuted; + if d_in.error = '1' then + -- dcache will discard the second request if it + -- gets an error on the 1st of two requests + if r.dwords_done = '1' then + addr := next_addr; + else + addr := r.addr; + end if; + if d_in.cache_paradox = '1' then + -- signal an interrupt straight away + exception := '1'; + dsisr(63 - 38) := not r.load; + -- XXX there is no architected bit for this + dsisr(63 - 35) := d_in.cache_paradox; + v.state := IDLE; + else + -- Look up the translation for TLB miss + -- and also for permission error and RC error + -- in case the PTE has been updated. + mmureq := '1'; + v.state := MMU_LOOKUP; + end if; + else + if two_dwords = '1' and r.dwords_done = '0' then + v.dwords_done := '1'; + if r.load = '1' then + v.load_data := data_permuted; + end if; + else + write_enable := r.load; + if r.load = '1' and r.update = '1' then + -- loads with rA update need an extra cycle + v.state := LD_UPDATE; + else + -- stores write back rA update in this cycle + do_update := r.update; + stall := '0'; + done := '1'; + v.state := IDLE; + end if; + end if; end if; end if; - when LAST_ACK_WAIT => + when MMU_LOOKUP => stall := '1'; - if d_in.valid = '1' then - write_enable := r.load; - if r.load = '1' and r.update = '1' then - -- loads with rA update need an extra cycle - v.state := LD_UPDATE; + if r.dwords_done = '1' then + addr := next_addr; + byte_sel := r.second_bytes; + else + addr := r.addr; + byte_sel := r.first_bytes; + end if; + if m_in.done = '1' then + if m_in.invalid = '0' and m_in.perm_error = '0' and m_in.rc_error = '0' and + m_in.badtree = '0' and m_in.segerr = '0' then + if r.instr_fault = '0' then + -- retry the request now that the MMU has installed a TLB entry + req := '1'; + if two_dwords = '1' and r.dwords_done = '0' then + v.state := SECOND_REQ; + else + v.state := ACK_WAIT; + end if; + else + -- nothing to do, the icache retries automatically + stall := '0'; + done := '1'; + v.state := IDLE; + end if; else - -- stores write back rA update in this cycle - do_update := r.update; - stall := '0'; - done := '1'; + exception := '1'; + dsisr(63 - 33) := m_in.invalid; + dsisr(63 - 36) := m_in.perm_error; + dsisr(63 - 38) := not r.load; + dsisr(63 - 44) := m_in.badtree; + dsisr(63 - 45) := m_in.rc_error; v.state := IDLE; end if; end if; + when TLBIE_WAIT => + stall := '1'; + if m_in.done = '1' then + -- tlbie is finished + stall := '0'; + done := '1'; + v.state := IDLE; + end if; + when LD_UPDATE => do_update := '1'; v.state := IDLE; done := '1'; + end case; -- Update outputs to dcache @@ -303,12 +458,30 @@ begin d_out.addr <= addr; d_out.data <= v.store_data; d_out.byte_sel <= byte_sel; + d_out.virt_mode <= v.virt_mode; + d_out.priv_mode <= v.priv_mode; + + -- Update outputs to MMU + m_out.valid <= mmureq; + m_out.iside <= v.instr_fault; + m_out.load <= r.load; + m_out.priv <= r.priv_mode; + m_out.tlbie <= v.tlbie; + m_out.mtspr <= mmu_mtspr; + m_out.sprn <= sprn; + m_out.addr <= addr; + m_out.slbia <= l_in.insn(7); + m_out.rs <= l_in.data; -- Update outputs to writeback -- Multiplex either cache data to the destination GPR or -- the address for the rA update. l_out.valid <= done; - if do_update = '1' then + if mfspr = '1' then + l_out.write_enable <= '1'; + l_out.write_reg <= l_in.write_reg; + l_out.write_data <= sprval; + elsif do_update = '1' then l_out.write_enable <= '1'; l_out.write_reg <= r.update_reg; l_out.write_data <= r.addr; @@ -321,6 +494,21 @@ begin l_out.rc <= r.rc and done; l_out.store_done <= d_in.store_done; + -- update exception info back to execute1 + e_out.exception <= exception; + e_out.instr_fault <= r.instr_fault; + e_out.invalid <= m_in.invalid; + e_out.badtree <= m_in.badtree; + e_out.perm_error <= m_in.perm_error; + e_out.rc_error <= m_in.rc_error; + e_out.segment_fault <= m_in.segerr; + if exception = '1' and r.instr_fault = '0' then + v.dar := addr; + if m_in.segerr = '0' then + v.dsisr := dsisr; + end if; + end if; + stall_out <= stall; -- Update registers diff --git a/microwatt.core b/microwatt.core index a1ae14b..9cc51ee 100644 --- a/microwatt.core +++ b/microwatt.core @@ -25,6 +25,7 @@ filesets: - control.vhdl - execute1.vhdl - loadstore1.vhdl + - mmu.vhdl - dcache.vhdl - multiply.vhdl - divider.vhdl diff --git a/mmu.vhdl b/mmu.vhdl new file mode 100644 index 0000000..0eefbab --- /dev/null +++ b/mmu.vhdl @@ -0,0 +1,473 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.common.all; + +-- Radix MMU +-- Supports 4-level trees as in arch 3.0B, but not the two-step translation for +-- guests under a hypervisor (i.e. there is no gRA -> hRA translation). + +entity mmu is + port ( + clk : in std_ulogic; + rst : in std_ulogic; + + l_in : in Loadstore1ToMmuType; + l_out : out MmuToLoadstore1Type; + + d_out : out MmuToDcacheType; + d_in : in DcacheToMmuType; + + i_out : out MmuToIcacheType + ); +end mmu; + +architecture behave of mmu is + + type state_t is (IDLE, + TLB_WAIT, + PROC_TBL_READ, + PROC_TBL_WAIT, + SEGMENT_CHECK, + RADIX_LOOKUP, + RADIX_READ_WAIT, + RADIX_LOAD_TLB, + RADIX_ERROR + ); + + type reg_stage_t is record + -- latched request from loadstore1 + valid : std_ulogic; + iside : std_ulogic; + store : std_ulogic; + priv : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + -- config SPRs + prtbl : std_ulogic_vector(63 downto 0); + pid : std_ulogic_vector(31 downto 0); + -- internal state + state : state_t; + pgtbl0 : std_ulogic_vector(63 downto 0); + pt0_valid : std_ulogic; + pgtbl3 : std_ulogic_vector(63 downto 0); + pt3_valid : std_ulogic; + shift : unsigned(5 downto 0); + mask_size : unsigned(4 downto 0); + pgbase : std_ulogic_vector(55 downto 0); + pde : std_ulogic_vector(63 downto 0); + invalid : std_ulogic; + badtree : std_ulogic; + segerror : std_ulogic; + perm_err : std_ulogic; + rc_error : std_ulogic; + end record; + + signal r, rin : reg_stage_t; + + signal addrsh : std_ulogic_vector(15 downto 0); + signal mask : std_ulogic_vector(15 downto 0); + signal finalmask : std_ulogic_vector(43 downto 0); + +begin + -- Multiplex internal SPR values back to loadstore1, selected + -- by l_in.sprn. + l_out.sprval <= r.prtbl when l_in.sprn(9) = '1' else x"00000000" & r.pid; + + mmu_0: process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r.state <= IDLE; + r.valid <= '0'; + r.pt0_valid <= '0'; + r.pt3_valid <= '0'; + r.prtbl <= (others => '0'); + else + if rin.valid = '1' then + report "MMU got tlb miss for " & to_hstring(rin.addr); + end if; + if l_out.done = '1' then + report "MMU completing op with invalid=" & std_ulogic'image(l_out.invalid) & + " badtree=" & std_ulogic'image(l_out.badtree); + end if; + if rin.state = RADIX_LOOKUP then + report "radix lookup shift=" & integer'image(to_integer(rin.shift)) & + " msize=" & integer'image(to_integer(rin.mask_size)); + end if; + if r.state = RADIX_LOOKUP then + report "send load addr=" & to_hstring(d_out.addr) & + " addrsh=" & to_hstring(addrsh) & " mask=" & to_hstring(mask); + end if; + r <= rin; + end if; + end if; + end process; + + -- Shift address bits 61--12 right by 0--47 bits and + -- supply the least significant 16 bits of the result. + addrshifter: process(all) + variable sh1 : std_ulogic_vector(30 downto 0); + variable sh2 : std_ulogic_vector(18 downto 0); + variable result : std_ulogic_vector(15 downto 0); + begin + case r.shift(5 downto 4) is + when "00" => + sh1 := r.addr(42 downto 12); + when "01" => + sh1 := r.addr(58 downto 28); + when others => + sh1 := "0000000000000" & r.addr(61 downto 44); + end case; + case r.shift(3 downto 2) is + when "00" => + sh2 := sh1(18 downto 0); + when "01" => + sh2 := sh1(22 downto 4); + when "10" => + sh2 := sh1(26 downto 8); + when others => + sh2 := sh1(30 downto 12); + end case; + case r.shift(1 downto 0) is + when "00" => + result := sh2(15 downto 0); + when "01" => + result := sh2(16 downto 1); + when "10" => + result := sh2(17 downto 2); + when others => + result := sh2(18 downto 3); + end case; + addrsh <= result; + end process; + + -- generate mask for extracting address fields for PTE address generation + addrmaskgen: process(all) + variable m : std_ulogic_vector(15 downto 0); + begin + -- mask_count has to be >= 5 + m := x"001f"; + for i in 5 to 15 loop + if i < to_integer(r.mask_size) then + m(i) := '1'; + end if; + end loop; + mask <= m; + end process; + + -- generate mask for extracting address bits to go in TLB entry + -- in order to support pages > 4kB + finalmaskgen: process(all) + variable m : std_ulogic_vector(43 downto 0); + begin + m := (others => '0'); + for i in 0 to 43 loop + if i < to_integer(r.shift) then + m(i) := '1'; + end if; + end loop; + finalmask <= m; + end process; + + mmu_1: process(all) + variable v : reg_stage_t; + variable dcreq : std_ulogic; + variable done : std_ulogic; + variable tlb_load : std_ulogic; + variable itlb_load : std_ulogic; + variable tlbie_req : std_ulogic; + variable inval_all : std_ulogic; + variable prtbl_rd : std_ulogic; + variable pt_valid : std_ulogic; + variable effpid : std_ulogic_vector(31 downto 0); + variable prtable_addr : std_ulogic_vector(63 downto 0); + variable rts : unsigned(5 downto 0); + variable mbits : unsigned(5 downto 0); + variable pgtable_addr : std_ulogic_vector(63 downto 0); + variable pte : std_ulogic_vector(63 downto 0); + variable tlb_data : std_ulogic_vector(63 downto 0); + variable nonzero : std_ulogic; + variable pgtbl : std_ulogic_vector(63 downto 0); + variable perm_ok : std_ulogic; + variable rc_ok : std_ulogic; + variable addr : std_ulogic_vector(63 downto 0); + variable data : std_ulogic_vector(63 downto 0); + begin + v := r; + v.valid := '0'; + dcreq := '0'; + done := '0'; + v.invalid := '0'; + v.badtree := '0'; + v.segerror := '0'; + v.perm_err := '0'; + v.rc_error := '0'; + tlb_load := '0'; + itlb_load := '0'; + tlbie_req := '0'; + inval_all := '0'; + prtbl_rd := '0'; + + -- Radix tree data structures in memory are big-endian, + -- so we need to byte-swap them + for i in 0 to 7 loop + data(i * 8 + 7 downto i * 8) := d_in.data((7 - i) * 8 + 7 downto (7 - i) * 8); + end loop; + + case r.state is + when IDLE => + if l_in.addr(63) = '0' then + pgtbl := r.pgtbl0; + pt_valid := r.pt0_valid; + else + pgtbl := r.pgtbl3; + pt_valid := r.pt3_valid; + end if; + -- rts == radix tree size, # address bits being translated + rts := unsigned('0' & pgtbl(62 downto 61) & pgtbl(7 downto 5)); + -- mbits == # address bits to index top level of tree + mbits := unsigned('0' & pgtbl(4 downto 0)); + -- set v.shift to rts so that we can use finalmask for the segment check + v.shift := rts; + v.mask_size := mbits(4 downto 0); + v.pgbase := pgtbl(55 downto 8) & x"00"; + + if l_in.valid = '1' then + v.addr := l_in.addr; + v.iside := l_in.iside; + v.store := not (l_in.load or l_in.iside); + v.priv := l_in.priv; + if l_in.tlbie = '1' then + dcreq := '1'; + tlbie_req := '1'; + -- Invalidate all iTLB/dTLB entries for tlbie with + -- RB[IS] != 0 or RB[AP] != 0, or for slbia + inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or + l_in.addr(7) or l_in.addr(6) or l_in.addr(5); + -- The RIC field of the tlbie instruction comes across on the + -- sprn bus as bits 2--3. RIC=2 flushes process table caches. + if l_in.sprn(3) = '1' then + v.pt0_valid := '0'; + v.pt3_valid := '0'; + end if; + v.state := TLB_WAIT; + else + v.valid := '1'; + if pt_valid = '0' then + -- need to fetch process table entry + -- set v.shift so we can use finalmask for generating + -- the process table entry address + v.shift := unsigned('0' & r.prtbl(4 downto 0)); + v.state := PROC_TBL_READ; + elsif mbits = 0 then + -- Use RPDS = 0 to disable radix tree walks + v.state := RADIX_ERROR; + v.invalid := '1'; + else + v.state := SEGMENT_CHECK; + end if; + end if; + end if; + if l_in.mtspr = '1' then + -- Move to PID needs to invalidate L1 TLBs and cached + -- pgtbl0 value. Move to PRTBL does that plus + -- invalidating the cached pgtbl3 value as well. + if l_in.sprn(9) = '0' then + v.pid := l_in.rs(31 downto 0); + else + v.prtbl := l_in.rs; + v.pt3_valid := '0'; + end if; + v.pt0_valid := '0'; + dcreq := '1'; + tlbie_req := '1'; + inval_all := '1'; + v.state := TLB_WAIT; + end if; + + when TLB_WAIT => + if d_in.done = '1' then + done := '1'; + v.state := IDLE; + end if; + + when PROC_TBL_READ => + dcreq := '1'; + prtbl_rd := '1'; + v.state := PROC_TBL_WAIT; + + when PROC_TBL_WAIT => + if d_in.done = '1' then + if d_in.err = '0' then + if r.addr(63) = '1' then + v.pgtbl3 := data; + v.pt3_valid := '1'; + else + v.pgtbl0 := data; + v.pt0_valid := '1'; + end if; + -- rts == radix tree size, # address bits being translated + rts := unsigned('0' & data(62 downto 61) & data(7 downto 5)); + -- mbits == # address bits to index top level of tree + mbits := unsigned('0' & data(4 downto 0)); + -- set v.shift to rts so that we can use finalmask for the segment check + v.shift := rts; + v.mask_size := mbits(4 downto 0); + v.pgbase := data(55 downto 8) & x"00"; + if mbits = 0 then + v.state := RADIX_ERROR; + v.invalid := '1'; + else + v.state := SEGMENT_CHECK; + end if; + else + v.state := RADIX_ERROR; + v.badtree := '1'; + end if; + end if; + + when SEGMENT_CHECK => + mbits := '0' & r.mask_size; + v.shift := r.shift + (31 - 12) - mbits; + nonzero := or(r.addr(61 downto 31) and not finalmask(30 downto 0)); + if r.addr(63) /= r.addr(62) or nonzero = '1' then + v.state := RADIX_ERROR; + v.segerror := '1'; + elsif mbits < 5 or mbits > 16 or mbits > (r.shift + (31 - 12)) then + v.state := RADIX_ERROR; + v.badtree := '1'; + else + v.state := RADIX_LOOKUP; + end if; + + when RADIX_LOOKUP => + dcreq := '1'; + v.state := RADIX_READ_WAIT; + + when RADIX_READ_WAIT => + if d_in.done = '1' then + if d_in.err = '0' then + v.pde := data; + -- test valid bit + if data(63) = '1' then + -- test leaf bit + if data(62) = '1' then + -- check permissions and RC bits + perm_ok := '0'; + if r.priv = '1' or data(3) = '0' then + if r.iside = '0' then + perm_ok := data(1) or (data(2) and not r.store); + else + -- no IAMR, so no KUEP support for now + -- deny execute permission if cache inhibited + perm_ok := data(0) and not data(5); + end if; + end if; + rc_ok := data(8) and (data(7) or not r.store); + if perm_ok = '1' and rc_ok = '1' then + v.state := RADIX_LOAD_TLB; + else + v.state := RADIX_ERROR; + v.perm_err := not perm_ok; + -- permission error takes precedence over RC error + v.rc_error := perm_ok; + end if; + else + mbits := unsigned('0' & data(4 downto 0)); + if mbits < 5 or mbits > 16 or mbits > r.shift then + v.state := RADIX_ERROR; + v.badtree := '1'; + else + v.shift := v.shift - mbits; + v.mask_size := mbits(4 downto 0); + v.pgbase := data(55 downto 8) & x"00"; + v.state := RADIX_LOOKUP; + end if; + end if; + else + -- non-present PTE, generate a DSI + v.state := RADIX_ERROR; + v.invalid := '1'; + end if; + else + v.state := RADIX_ERROR; + v.badtree := '1'; + end if; + end if; + + when RADIX_LOAD_TLB => + tlb_load := '1'; + if r.iside = '0' then + dcreq := '1'; + v.state := TLB_WAIT; + else + itlb_load := '1'; + done := '1'; + v.state := IDLE; + end if; + + when RADIX_ERROR => + done := '1'; + v.state := IDLE; + + end case; + + if r.addr(63) = '1' then + effpid := x"00000000"; + else + effpid := r.pid; + end if; + prtable_addr := x"00" & r.prtbl(55 downto 36) & + ((r.prtbl(35 downto 12) and not finalmask(23 downto 0)) or + (effpid(31 downto 8) and finalmask(23 downto 0))) & + effpid(7 downto 0) & "0000"; + + pgtable_addr := x"00" & r.pgbase(55 downto 19) & + ((r.pgbase(18 downto 3) and not mask) or (addrsh and mask)) & + "000"; + pte := x"00" & + ((r.pde(55 downto 12) and not finalmask) or (r.addr(55 downto 12) and finalmask)) + & r.pde(11 downto 0); + + -- update registers + rin <= v; + + -- drive outputs + if tlbie_req = '1' then + addr := l_in.addr; + tlb_data := l_in.rs; + elsif tlb_load = '1' then + addr := r.addr(63 downto 12) & x"000"; + tlb_data := pte; + elsif prtbl_rd = '1' then + addr := prtable_addr; + tlb_data := (others => '0'); + else + addr := pgtable_addr; + tlb_data := (others => '0'); + end if; + + l_out.done <= done; + l_out.invalid <= r.invalid; + l_out.badtree <= r.badtree; + l_out.segerr <= r.segerror; + l_out.perm_error <= r.perm_err; + l_out.rc_error <= r.rc_error; + + d_out.valid <= dcreq; + d_out.tlbie <= tlbie_req; + d_out.doall <= inval_all; + d_out.tlbld <= tlb_load; + d_out.addr <= addr; + d_out.pte <= tlb_data; + + i_out.tlbld <= itlb_load; + i_out.tlbie <= tlbie_req; + i_out.doall <= inval_all; + i_out.addr <= addr; + i_out.pte <= tlb_data; + + end process; +end; diff --git a/register_file.vhdl b/register_file.vhdl index 6a4c989..2cffeea 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -17,6 +17,11 @@ entity register_file is w_in : in WritebackToRegisterFileType; + dbg_gpr_req : in std_ulogic; + dbg_gpr_ack : out std_ulogic; + dbg_gpr_addr : in gspr_index_t; + dbg_gpr_data : out std_ulogic_vector(63 downto 0); + -- debug sim_dump : in std_ulogic; sim_dump_done : out std_ulogic @@ -26,6 +31,9 @@ end entity register_file; architecture behaviour of register_file is type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0); signal registers : regfile := (others => (others => '0')); + signal rd_port_b : std_ulogic_vector(63 downto 0); + signal dbg_data : std_ulogic_vector(63 downto 0); + signal dbg_ack : std_ulogic; begin -- synchronous writes register_write_0: process(clk) @@ -45,6 +53,7 @@ begin -- asynchronous reads register_read_0: process(all) + variable b_addr : gspr_index_t; begin if d_in.read1_enable = '1' then report "Reading GPR " & to_hstring(d_in.read1_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read1_reg)))); @@ -56,7 +65,14 @@ begin report "Reading GPR " & to_hstring(d_in.read3_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read3_reg)))); end if; d_out.read1_data <= registers(to_integer(unsigned(d_in.read1_reg))); - d_out.read2_data <= registers(to_integer(unsigned(d_in.read2_reg))); + -- B read port is multiplexed with reads from the debug circuitry + if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then + b_addr := dbg_gpr_addr; + else + b_addr := d_in.read2_reg; + end if; + rd_port_b <= registers(to_integer(unsigned(b_addr))); + d_out.read2_data <= rd_port_b; d_out.read3_data <= registers(to_integer(unsigned(gpr_to_gspr(d_in.read3_reg)))); -- Forward any written data @@ -73,6 +89,24 @@ begin end if; end process register_read_0; + -- Latch read data and ack if dbg read requested and B port not busy + dbg_register_read: process(clk) + begin + if rising_edge(clk) then + if dbg_gpr_req = '1' then + if d_in.read2_enable = '0' and dbg_ack = '0' then + dbg_data <= rd_port_b; + dbg_ack <= '1'; + end if; + else + dbg_ack <= '0'; + end if; + end if; + end process; + + dbg_gpr_ack <= dbg_ack; + dbg_gpr_data <= dbg_data; + -- Dump registers if core terminates sim_dump_test: if SIM generate dump_registers: process(all) diff --git a/rotator.vhdl b/rotator.vhdl index d8a8ee9..fef9788 100644 --- a/rotator.vhdl +++ b/rotator.vhdl @@ -15,6 +15,7 @@ entity rotator is arith: in std_ulogic; clear_left: in std_ulogic; clear_right: in std_ulogic; + sign_ext_rs: in std_ulogic; result: out std_ulogic_vector(63 downto 0); carry_out: out std_ulogic ); @@ -57,13 +58,18 @@ architecture behaviour of rotator is begin rotator_0: process(all) + variable hi32: std_ulogic_vector(31 downto 0); begin -- First replicate bottom 32 bits to both halves if 32-bit if is_32bit = '1' then - repl32 <= rs(31 downto 0) & rs(31 downto 0); - else - repl32 <= rs; + hi32 := rs(31 downto 0); + elsif sign_ext_rs = '1' then + -- sign extend bottom 32 bits + hi32 := (others => rs(31)); + else + hi32 := rs(63 downto 32); end if; + repl32 <= hi32 & rs(31 downto 0); -- Negate shift count for right shifts if right_shift = '1' then diff --git a/rotator_tb.vhdl b/rotator_tb.vhdl index 3cb46b0..62a09ce 100644 --- a/rotator_tb.vhdl +++ b/rotator_tb.vhdl @@ -19,6 +19,7 @@ architecture behave of rotator_tb is signal is_32bit, right_shift, arith, clear_left, clear_right: std_ulogic := '0'; signal result: std_ulogic_vector(63 downto 0); signal carry_out: std_ulogic; + signal extsw: std_ulogic; begin rotator_0: entity work.rotator @@ -32,6 +33,7 @@ begin arith => arith, clear_left => clear_left, clear_right => clear_right, + sign_ext_rs => extsw, result => result, carry_out => carry_out ); @@ -48,6 +50,7 @@ begin arith <= '0'; clear_left <= '1'; clear_right <= '1'; + extsw <= '0'; rlwnm_loop : for i in 0 to 1000 loop rs <= pseudorand(64); shift <= pseudorand(7); @@ -263,6 +266,31 @@ begin report "bad srad expected " & to_hstring(behave_ca_ra) & " got " & to_hstring(carry_out & result); end loop; + -- extswsli + report "test extswsli"; + ra <= (others => '0'); + is_32bit <= '0'; + right_shift <= '0'; + arith <= '0'; + clear_left <= '0'; + clear_right <= '0'; + extsw <= '1'; + extswsli_loop : for i in 0 to 1000 loop + rs <= pseudorand(64); + shift <= '0' & pseudorand(6); + wait for clk_period; + behave_ra := rs; + behave_ra(63 downto 32) := (others => rs(31)); + behave_ra := std_ulogic_vector(shift_left(unsigned(behave_ra), + to_integer(unsigned(shift)))); + --report "rs = " & to_hstring(rs); + --report "ra = " & to_hstring(ra); + --report "shift = " & to_hstring(shift); + --report "result = " & to_hstring(carry_out & result); + assert behave_ra = result + report "bad extswsli expected " & to_hstring(behave_ra) & " got " & to_hstring(result); + end loop; + assert false report "end of test" severity failure; wait; end process; diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index 7bcf4f9..8359242 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -33,6 +33,10 @@ #define DBG_CORE_STAT_TERM (1 << 2) #define DBG_CORE_NIA 0x12 +#define DBG_CORE_MSR 0x13 + +#define DBG_CORE_GSPR_INDEX 0x14 +#define DBG_CORE_GSPR_DATA 0x15 static bool debug; @@ -356,11 +360,12 @@ static int dmi_write(uint8_t addr, uint64_t data) static void core_status(void) { - uint64_t stat, nia; + uint64_t stat, nia, msr; const char *statstr, *statstr2; check(dmi_read(DBG_CORE_STAT, &stat), "reading core status"); check(dmi_read(DBG_CORE_NIA, &nia), "reading core NIA"); + check(dmi_read(DBG_CORE_MSR, &msr), "reading core MSR"); if (debug) printf("Core status = 0x%llx\n", (unsigned long long)stat); @@ -378,6 +383,7 @@ static void core_status(void) statstr = "odd state (TERM but no STOP)"; printf("Core: %s%s\n", statstr, statstr2); printf(" NIA: %016llx\n", (unsigned long long)nia); + printf(" MSR: %016llx\n", msr); } static void core_stop(void) @@ -413,19 +419,47 @@ static void icache_reset(void) check(dmi_write(DBG_CORE_CTRL, DBG_CORE_CTRL_ICRESET), "resetting icache"); } +static const char *fast_spr_names[] = +{ + "lr", "ctr", "srr0", "srr1", "hsrr0", "hsrr1", + "sprg0", "sprg1", "sprg2", "sprg3", + "hsprg0", "hsprg1", "xer" +}; + +static void gpr_read(uint64_t reg, uint64_t count) +{ + uint64_t data; + + reg &= 0x3f; + if (reg + count > 64) + count = 64 - reg; + for (; count != 0; --count, ++reg) { + check(dmi_write(DBG_CORE_GSPR_INDEX, reg), "setting GPR index"); + data = 0xdeadbeef; + check(dmi_read(DBG_CORE_GSPR_DATA, &data), "reading GPR data"); + if (reg <= 31) + printf("r%d", reg); + else if ((reg - 32) < sizeof(fast_spr_names) / sizeof(fast_spr_names[0])) + printf("%s", fast_spr_names[reg - 32]); + else + printf("gspr%d", reg); + printf(":\t%016llx\n", data); + } +} + static void mem_read(uint64_t addr, uint64_t count) { uint64_t data; int i, rc; - rc = dmi_write(2, 0x7ff); + rc = dmi_write(DBG_WB_CTRL, 0x7ff); if (rc < 0) return; - rc = dmi_write(0, addr); + rc = dmi_write(DBG_WB_ADDR, addr); if (rc < 0) return; for (i = 0; i < count; i++) { - rc = dmi_read(1, &data); + rc = dmi_read(DBG_WB_DATA, &data); if (rc < 0) return; printf("%016llx: %016llx\n", @@ -435,6 +469,13 @@ static void mem_read(uint64_t addr, uint64_t count) } } +static void mem_write(uint64_t addr, uint64_t data) +{ + check(dmi_write(DBG_WB_CTRL, 0x7ff), "writing WB_CTRL"); + check(dmi_write(DBG_WB_ADDR, addr), "writing WB_ADDR"); + check(dmi_write(DBG_WB_DATA, data), "writing WB_DATA"); +} + static void load(const char *filename, uint64_t addr) { uint64_t data; @@ -445,13 +486,8 @@ static void load(const char *filename, uint64_t addr) fprintf(stderr, "Failed to open '%s': %s\n", filename, strerror(errno)); exit(1); } - // XX dumb, do better - rc = dmi_write(2, 0x7ff); - if (rc < 0) - return; - rc = dmi_write(0, addr); - if (rc < 0) - return; + check(dmi_write(DBG_WB_CTRL, 0x7ff), "writing WB_CTRL"); + check(dmi_write(DBG_WB_ADDR, addr), "writing WB_ADDR"); count = 0; for (;;) { data = 0; @@ -459,7 +495,7 @@ static void load(const char *filename, uint64_t addr) if (rc <= 0) break; // if (rc < 8) XXX fixup endian ? - dmi_write(1, data); + check(dmi_write(DBG_WB_DATA, data), "writing WB_DATA"); count += 8; if (!(count % 1024)) printf("%x...\n", count); @@ -544,6 +580,8 @@ int main(int argc, char *argv[]) dmi_write(addr, data); } else if (strcmp(argv[i], "creset") == 0) { core_reset(); + } else if (strcmp(argv[i], "icreset") == 0) { + icache_reset(); } else if (strcmp(argv[i], "stop") == 0) { core_stop(); } else if (strcmp(argv[i], "start") == 0) { @@ -563,6 +601,14 @@ int main(int argc, char *argv[]) if (((i+1) < argc) && isdigit(argv[i+1][0])) count = strtoul(argv[++i], NULL, 16); mem_read(addr, count); + } else if (strcmp(argv[i], "mw") == 0) { + uint64_t addr, data; + + if ((i+2) >= argc) + usage(argv[0]); + addr = strtoul(argv[++i], NULL, 16); + data = strtoul(argv[++i], NULL, 16); + mem_write(addr, data); } else if (strcmp(argv[i], "load") == 0) { const char *filename; uint64_t addr = 0; @@ -573,6 +619,15 @@ int main(int argc, char *argv[]) if (((i+1) < argc) && isdigit(argv[i+1][0])) addr = strtoul(argv[++i], NULL, 16); load(filename, addr); + } else if (strcmp(argv[i], "gpr") == 0) { + uint64_t reg, count = 1; + + if ((i+1) >= argc) + usage(argv[0]); + reg = strtoul(argv[++i], NULL, 10); + if (((i+1) < argc) && isdigit(argv[i+1][0])) + count = strtoul(argv[++i], NULL, 10); + gpr_read(reg, count); } else { fprintf(stderr, "Unknown command %s\n", argv[i]); exit(1); diff --git a/tests/mmu/Makefile b/tests/mmu/Makefile new file mode 100644 index 0000000..84f7ff2 --- /dev/null +++ b/tests/mmu/Makefile @@ -0,0 +1,3 @@ +TEST=mmu + +include ../Makefile.test diff --git a/tests/mmu/head.S b/tests/mmu/head.S new file mode 100644 index 0000000..824ad67 --- /dev/null +++ b/tests/mmu/head.S @@ -0,0 +1,179 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Load an immediate 64-bit value into a register */ +#define LOAD_IMM64(r, e) \ + lis r,(e)@highest; \ + ori r,r,(e)@higher; \ + rldicr r,r, 32, 31; \ + oris r,r, (e)@h; \ + ori r,r, (e)@l; + + .section ".head","ax" + + /* + * Microwatt currently enters in LE mode at 0x0, so we don't need to + * do any endian fix ups + */ + . = 0 +.global _start +_start: + LOAD_IMM64(%r10,__bss_start) + LOAD_IMM64(%r11,__bss_end) + subf %r11,%r10,%r11 + addi %r11,%r11,63 + srdi. %r11,%r11,6 + beq 2f + mtctr %r11 +1: dcbz 0,%r10 + addi %r10,%r10,64 + bdnz 1b + +2: LOAD_IMM64(%r1,__stack_top) + li %r0,0 + stdu %r0,-16(%r1) + LOAD_IMM64(%r12, main) + mtctr %r12 + bctrl + attn // terminate on exit + b . + + /* Read a location with translation on */ + .globl test_read +test_read: + mfmsr %r9 + ori %r8,%r9,0x10 /* set MSR_DR */ + mtmsrd %r8,0 + mr %r6,%r3 + li %r3,0 + ld %r5,0(%r6) + li %r3,1 + /* land here if DSI occurred */ + mtmsrd %r9,0 + std %r5,0(%r4) + blr + + /* Write a location with translation on */ + .globl test_write +test_write: + mfmsr %r9 + ori %r8,%r9,0x10 /* set MSR_DR */ + mtmsrd %r8,0 + mr %r6,%r3 + li %r3,0 + std %r4,0(%r6) + li %r3,1 + /* land here if DSI occurred */ + mtmsrd %r9,0 + blr + + /* Do a dcbz with translation on */ + .globl test_dcbz +test_dcbz: + mfmsr %r9 + ori %r8,%r9,0x10 /* set MSR_DR */ + mtmsrd %r8,0 + mr %r6,%r3 + li %r3,0 + dcbz 0,%r6 + li %r3,1 + /* land here if DSI occurred */ + mtmsrd %r9,0 + blr + + .globl test_exec +test_exec: + mtsrr0 %r4 + mtsrr1 %r5 + rfid + +#define EXCEPTION(nr) \ + .= nr ;\ + attn + + /* DSI vector - skip the failing instruction + the next one */ + . = 0x300 + mtsprg0 %r10 + mfsrr0 %r10 + addi %r10,%r10,8 + mtsrr0 %r10 + rfid + + EXCEPTION(0x380) + + /* + * ISI vector - jump to LR to return from the test, + * with r3 cleared + */ + . = 0x400 + li %r3,0 + blr + + /* More exception stubs */ + EXCEPTION(0x480) + EXCEPTION(0x500) + EXCEPTION(0x600) + EXCEPTION(0x700) + EXCEPTION(0x800) + EXCEPTION(0x900) + EXCEPTION(0x980) + EXCEPTION(0xa00) + EXCEPTION(0xb00) + + /* + * System call - used to exit from tests where MSR[PR] + * may have been set. + */ + . = 0xc00 + blr + + EXCEPTION(0xd00) + EXCEPTION(0xe00) + EXCEPTION(0xe20) + EXCEPTION(0xe40) + EXCEPTION(0xe60) + EXCEPTION(0xe80) + EXCEPTION(0xf00) + EXCEPTION(0xf20) + EXCEPTION(0xf40) + EXCEPTION(0xf60) + EXCEPTION(0xf80) + + . = 0x1000 + /* + * This page gets mapped at various locations and + * the tests try to execute from it. + * r3 contains the test number. + */ + .globl test_start +test_start: + nop + nop + cmpdi %r3,1 + beq test_1 + cmpdi %r3,2 + beq test_2 +test_return: + li %r3,1 + sc + + . = 0x1ff8 + /* test a branch near the end of a page */ +test_1: b test_return + + /* test flowing from one page to the next */ +test_2: nop + b test_return diff --git a/tests/mmu/mmu.c b/tests/mmu/mmu.c new file mode 100644 index 0000000..a5d086b --- /dev/null +++ b/tests/mmu/mmu.c @@ -0,0 +1,688 @@ +#include +#include +#include + +#include "console.h" + +#define MSR_DR 0x10 +#define MSR_IR 0x20 + +extern int test_read(long *addr, long *ret, long init); +extern int test_write(long *addr, long val); +extern int test_dcbz(long *addr); +extern int test_exec(int testno, unsigned long pc, unsigned long msr); + +static inline void do_tlbie(unsigned long rb, unsigned long rs) +{ + __asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory"); +} + +#define DSISR 18 +#define DAR 19 +#define SRR0 26 +#define SRR1 27 +#define PID 48 +#define PRTBL 720 + +static inline unsigned long mfspr(int sprnum) +{ + long val; + + __asm__ volatile("mfspr %0,%1" : "=r" (val) : "i" (sprnum)); + return val; +} + +static inline void mtspr(int sprnum, unsigned long val) +{ + __asm__ volatile("mtspr %0,%1" : : "i" (sprnum), "r" (val)); +} + +static inline void store_pte(unsigned long *p, unsigned long pte) +{ + __asm__ volatile("stdbrx %1,0,%0" : : "r" (p), "r" (pte) : "memory"); +} + +void print_string(const char *str) +{ + for (; *str; ++str) + putchar(*str); +} + +void print_hex(unsigned long val) +{ + int i, x; + + for (i = 60; i >= 0; i -= 4) { + x = (val >> i) & 0xf; + if (x >= 10) + putchar(x + 'a' - 10); + else + putchar(x + '0'); + } +} + +// i < 100 +void print_test_number(int i) +{ + print_string("test "); + putchar(48 + i/10); + putchar(48 + i%10); + putchar(':'); +} + +#define CACHE_LINE_SIZE 64 + +void zero_memory(void *ptr, unsigned long nbytes) +{ + unsigned long nb, i, nl; + void *p; + + for (; nbytes != 0; nbytes -= nb, ptr += nb) { + nb = -((unsigned long)ptr) & (CACHE_LINE_SIZE - 1); + if (nb == 0 && nbytes >= CACHE_LINE_SIZE) { + nl = nbytes / CACHE_LINE_SIZE; + p = ptr; + for (i = 0; i < nl; ++i) { + __asm__ volatile("dcbz 0,%0" : : "r" (p) : "memory"); + p += CACHE_LINE_SIZE; + } + nb = nl * CACHE_LINE_SIZE; + } else { + if (nb > nbytes) + nb = nbytes; + for (i = 0; i < nb; ++i) + ((unsigned char *)ptr)[i] = 0; + } + } +} + +#define PERM_EX 0x001 +#define PERM_WR 0x002 +#define PERM_RD 0x004 +#define PERM_PRIV 0x008 +#define ATTR_NC 0x020 +#define CHG 0x080 +#define REF 0x100 + +#define DFLT_PERM (PERM_WR | PERM_RD | REF | CHG) + +/* + * Set up an MMU translation tree using memory starting at the 64k point. + * We use 2 levels, mapping 2GB (the minimum size possible), with a + * 8kB PGD level pointing to 4kB PTE pages. + */ +unsigned long *pgdir = (unsigned long *) 0x10000; +unsigned long *proc_tbl = (unsigned long *) 0x12000; +unsigned long free_ptr = 0x13000; +void *eas_mapped[4]; +int neas_mapped; + +void init_mmu(void) +{ + /* set up process table */ + zero_memory(proc_tbl, 512 * sizeof(unsigned long)); + mtspr(PRTBL, (unsigned long)proc_tbl); + mtspr(PID, 1); + zero_memory(pgdir, 1024 * sizeof(unsigned long)); + /* RTS = 0 (2GB address space), RPDS = 10 (1024-entry top level) */ + store_pte(&proc_tbl[2 * 1], (unsigned long) pgdir | 10); + do_tlbie(0xc00, 0); /* invalidate all TLB entries */ +} + +static unsigned long *read_pgd(unsigned long i) +{ + unsigned long ret; + + __asm__ volatile("ldbrx %0,%1,%2" : "=r" (ret) : "b" (pgdir), + "r" (i * sizeof(unsigned long))); + return (unsigned long *) (ret & 0x00ffffffffffff00); +} + +void map(void *ea, void *pa, unsigned long perm_attr) +{ + unsigned long epn = (unsigned long) ea >> 12; + unsigned long i, j; + unsigned long *ptep; + + i = (epn >> 9) & 0x3ff; + j = epn & 0x1ff; + if (pgdir[i] == 0) { + zero_memory((void *)free_ptr, 512 * sizeof(unsigned long)); + store_pte(&pgdir[i], 0x8000000000000000 | free_ptr | 9); + free_ptr += 512 * sizeof(unsigned long); + } + ptep = read_pgd(i); + store_pte(&ptep[j], 0xc000000000000000 | ((unsigned long)pa & 0x00fffffffffff000) | perm_attr); + eas_mapped[neas_mapped++] = ea; +} + +void unmap(void *ea) +{ + unsigned long epn = (unsigned long) ea >> 12; + unsigned long i, j; + unsigned long *ptep; + + i = (epn >> 9) & 0x3ff; + j = epn & 0x1ff; + if (pgdir[i] == 0) + return; + ptep = read_pgd(i); + ptep[j] = 0; + do_tlbie(((unsigned long)ea & ~0xfff), 0); +} + +void unmap_all(void) +{ + int i; + + for (i = 0; i < neas_mapped; ++i) + unmap(eas_mapped[i]); + neas_mapped = 0; +} + +int mmu_test_1(void) +{ + long *ptr = (long *) 0x123000; + long val; + + /* this should fail */ + if (test_read(ptr, &val, 0xdeadbeefd00d)) + return 1; + /* dest reg of load should be unchanged */ + if (val != 0xdeadbeefd00d) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != (long) ptr || mfspr(DSISR) != 0x40000000) + return 3; + return 0; +} + +int mmu_test_2(void) +{ + long *mem = (long *) 0x8000; + long *ptr = (long *) 0x124000; + long *ptr2 = (long *) 0x1124000; + long val; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* initialize the memory content */ + mem[33] = 0xbadc0ffee; + /* this should succeed and be a cache miss */ + if (!test_read(&ptr[33], &val, 0xdeadbeefd00d)) + return 1; + /* dest reg of load should have the value written */ + if (val != 0xbadc0ffee) + return 2; + /* load a second TLB entry in the same set as the first */ + map(ptr2, mem, DFLT_PERM); + /* this should succeed and be a cache hit */ + if (!test_read(&ptr2[33], &val, 0xdeadbeefd00d)) + return 3; + /* dest reg of load should have the value written */ + if (val != 0xbadc0ffee) + return 4; + /* check that the first entry still works */ + if (!test_read(&ptr[33], &val, 0xdeadbeefd00d)) + return 5; + if (val != 0xbadc0ffee) + return 6; + return 0; +} + +int mmu_test_3(void) +{ + long *mem = (long *) 0x9000; + long *ptr = (long *) 0x14a000; + long val; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* initialize the memory content */ + mem[45] = 0xfee1800d4ea; + /* this should succeed and be a cache miss */ + if (!test_read(&ptr[45], &val, 0xdeadbeefd0d0)) + return 1; + /* dest reg of load should have the value written */ + if (val != 0xfee1800d4ea) + return 2; + /* remove the PTE */ + unmap(ptr); + /* this should fail */ + if (test_read(&ptr[45], &val, 0xdeadbeefd0d0)) + return 3; + /* dest reg of load should be unchanged */ + if (val != 0xdeadbeefd0d0) + return 4; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != (long) &ptr[45] || mfspr(DSISR) != 0x40000000) + return 5; + return 0; +} + +int mmu_test_4(void) +{ + long *mem = (long *) 0xa000; + long *ptr = (long *) 0x10b000; + long *ptr2 = (long *) 0x110b000; + long val; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* initialize the memory content */ + mem[27] = 0xf00f00f00f00; + /* this should succeed and be a cache miss */ + if (!test_write(&ptr[27], 0xe44badc0ffee)) + return 1; + /* memory should now have the value written */ + if (mem[27] != 0xe44badc0ffee) + return 2; + /* load a second TLB entry in the same set as the first */ + map(ptr2, mem, DFLT_PERM); + /* this should succeed and be a cache hit */ + if (!test_write(&ptr2[27], 0x6e11ae)) + return 3; + /* memory should have the value written */ + if (mem[27] != 0x6e11ae) + return 4; + /* check that the first entry still exists */ + /* (assumes TLB is 2-way associative or more) */ + if (!test_read(&ptr[27], &val, 0xdeadbeefd00d)) + return 5; + if (val != 0x6e11ae) + return 6; + return 0; +} + +int mmu_test_5(void) +{ + long *mem = (long *) 0xbffd; + long *ptr = (long *) 0x39fffd; + long val; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* this should fail */ + if (test_read(ptr, &val, 0xdeadbeef0dd0)) + return 1; + /* dest reg of load should be unchanged */ + if (val != 0xdeadbeef0dd0) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != ((long)ptr & ~0xfff) + 0x1000 || mfspr(DSISR) != 0x40000000) + return 3; + return 0; +} + +int mmu_test_6(void) +{ + long *mem = (long *) 0xbffd; + long *ptr = (long *) 0x39fffd; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* initialize memory */ + *mem = 0x123456789abcdef0; + /* this should fail */ + if (test_write(ptr, 0xdeadbeef0dd0)) + return 1; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != ((long)ptr & ~0xfff) + 0x1000 || mfspr(DSISR) != 0x42000000) + return 2; + return 0; +} + +int mmu_test_7(void) +{ + long *mem = (long *) 0x8000; + long *ptr = (long *) 0x124000; + long val; + + *mem = 0x123456789abcdef0; + /* create PTE without R or C */ + map(ptr, mem, PERM_RD | PERM_WR); + /* this should fail */ + if (test_read(ptr, &val, 0xdeadd00dbeef)) + return 1; + /* dest reg of load should be unchanged */ + if (val != 0xdeadd00dbeef) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != (long) ptr || mfspr(DSISR) != 0x00040000) + return 3; + /* this should fail */ + if (test_write(ptr, 0xdeadbeef0dd0)) + return 4; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != (long)ptr || mfspr(DSISR) != 0x02040000) + return 5; + /* memory should be unchanged */ + if (*mem != 0x123456789abcdef0) + return 6; + return 0; +} + +int mmu_test_8(void) +{ + long *mem = (long *) 0x8000; + long *ptr = (long *) 0x124000; + long val; + + *mem = 0x123456789abcdef0; + /* create PTE with R but not C */ + map(ptr, mem, REF | PERM_RD | PERM_WR); + /* this should succeed */ + if (!test_read(ptr, &val, 0xdeadd00dbeef)) + return 1; + /* this should fail */ + if (test_write(ptr, 0xdeadbeef0dd1)) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != (long)ptr || mfspr(DSISR) != 0x02040000) + return 3; + /* memory should be unchanged */ + if (*mem != 0x123456789abcdef0) + return 4; + return 0; +} + +int mmu_test_9(void) +{ + long *mem = (long *) 0x8000; + long *ptr = (long *) 0x124000; + long val; + + *mem = 0x123456789abcdef0; + /* create PTE without read or write permission */ + map(ptr, mem, REF); + /* this should fail */ + if (test_read(ptr, &val, 0xdeadd00dbeef)) + return 1; + /* dest reg of load should be unchanged */ + if (val != 0xdeadd00dbeef) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != (long) ptr || mfspr(DSISR) != 0x08000000) + return 3; + /* this should fail */ + if (test_write(ptr, 0xdeadbeef0dd1)) + return 4; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != (long)ptr || mfspr(DSISR) != 0x0a000000) + return 5; + /* memory should be unchanged */ + if (*mem != 0x123456789abcdef0) + return 6; + return 0; +} + +int mmu_test_10(void) +{ + long *mem = (long *) 0x8000; + long *ptr = (long *) 0x124000; + long val; + + *mem = 0x123456789abcdef0; + /* create PTE with read but not write permission */ + map(ptr, mem, REF | PERM_RD); + /* this should succeed */ + if (!test_read(ptr, &val, 0xdeadd00dbeef)) + return 1; + /* this should fail */ + if (test_write(ptr, 0xdeadbeef0dd1)) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != (long)ptr || mfspr(DSISR) != 0x0a000000) + return 3; + /* memory should be unchanged */ + if (*mem != 0x123456789abcdef0) + return 4; + return 0; +} + +int mmu_test_11(void) +{ + unsigned long ptr = 0x523000; + + /* this should fail */ + if (test_exec(0, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != (long) ptr || mfspr(SRR1) != 0x40000020) + return 2; + return 0; +} + +int mmu_test_12(void) +{ + unsigned long mem = 0x1000; + unsigned long ptr = 0x324000; + unsigned long ptr2 = 0x1324000; + + /* create PTE */ + map((void *)ptr, (void *)mem, PERM_EX | REF); + /* this should succeed and be a cache miss */ + if (!test_exec(0, ptr, MSR_IR)) + return 1; + /* create a second PTE */ + map((void *)ptr2, (void *)mem, PERM_EX | REF); + /* this should succeed and be a cache hit */ + if (!test_exec(0, ptr2, MSR_IR)) + return 2; + return 0; +} + +int mmu_test_13(void) +{ + unsigned long mem = 0x1000; + unsigned long ptr = 0x349000; + unsigned long ptr2 = 0x34a000; + + /* create a PTE */ + map((void *)ptr, (void *)mem, PERM_EX | REF); + /* this should succeed */ + if (!test_exec(1, ptr, MSR_IR)) + return 1; + /* invalidate the PTE */ + unmap((void *)ptr); + /* install a second PTE */ + map((void *)ptr2, (void *)mem, PERM_EX | REF); + /* this should fail */ + if (test_exec(1, ptr, MSR_IR)) + return 2; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != (long) ptr || mfspr(SRR1) != 0x40000020) + return 3; + return 0; +} + +int mmu_test_14(void) +{ + unsigned long mem = 0x1000; + unsigned long mem2 = 0x2000; + unsigned long ptr = 0x30a000; + unsigned long ptr2 = 0x30b000; + + /* create a PTE */ + map((void *)ptr, (void *)mem, PERM_EX | REF); + /* this should fail due to second page not being mapped */ + if (test_exec(2, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != ptr2 || mfspr(SRR1) != 0x40000020) + return 2; + /* create a PTE for the second page */ + map((void *)ptr2, (void *)mem2, PERM_EX | REF); + /* this should succeed */ + if (!test_exec(2, ptr, MSR_IR)) + return 3; + return 0; +} + +int mmu_test_15(void) +{ + unsigned long mem = 0x1000; + unsigned long ptr = 0x324000; + + /* create a PTE without execute permission */ + map((void *)ptr, (void *)mem, DFLT_PERM); + /* this should fail */ + if (test_exec(0, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != ptr || mfspr(SRR1) != 0x10000020) + return 2; + return 0; +} + +int mmu_test_16(void) +{ + unsigned long mem = 0x1000; + unsigned long mem2 = 0x2000; + unsigned long ptr = 0x30a000; + unsigned long ptr2 = 0x30b000; + + /* create a PTE */ + map((void *)ptr, (void *)mem, PERM_EX | REF); + /* create a PTE for the second page without execute permission */ + map((void *)ptr2, (void *)mem2, PERM_RD | REF); + /* this should fail due to second page being no-execute */ + if (test_exec(2, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != ptr2 || mfspr(SRR1) != 0x10000020) + return 2; + /* create a PTE for the second page with execute permission */ + map((void *)ptr2, (void *)mem2, PERM_RD | PERM_EX | REF); + /* this should succeed */ + if (!test_exec(2, ptr, MSR_IR)) + return 3; + return 0; +} + +int mmu_test_17(void) +{ + unsigned long mem = 0x1000; + unsigned long ptr = 0x349000; + + /* create a PTE without the ref bit set */ + map((void *)ptr, (void *)mem, PERM_EX); + /* this should fail */ + if (test_exec(2, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != (long) ptr || mfspr(SRR1) != 0x00040020) + return 2; + /* create a PTE without ref or execute permission */ + unmap((void *)ptr); + map((void *)ptr, (void *)mem, 0); + /* this should fail */ + if (test_exec(2, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + /* RC update fail bit should not be set */ + if (mfspr(SRR0) != (long) ptr || mfspr(SRR1) != 0x10000020) + return 2; + return 0; +} + +int mmu_test_18(void) +{ + long *mem = (long *) 0x8000; + long *ptr = (long *) 0x124000; + long *ptr2 = (long *) 0x1124000; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* this should succeed and be a cache miss */ + if (!test_dcbz(&ptr[129])) + return 1; + /* create a second PTE */ + map(ptr2, mem, DFLT_PERM); + /* this should succeed and be a cache hit */ + if (!test_dcbz(&ptr2[130])) + return 2; + return 0; +} + +int mmu_test_19(void) +{ + long *mem = (long *) 0x8000; + long *ptr = (long *) 0x124000; + + *mem = 0x123456789abcdef0; + /* create PTE with read but not write permission */ + map(ptr, mem, REF | PERM_RD); + /* this should fail and create a TLB entry */ + if (test_write(ptr, 0xdeadbeef0dd1)) + return 1; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != (long)ptr || mfspr(DSISR) != 0x0a000000) + return 2; + /* Update the PTE to have write permission */ + map(ptr, mem, REF | CHG | PERM_RD | PERM_WR); + /* this should succeed */ + if (!test_write(ptr, 0xdeadbeef0dd1)) + return 3; + return 0; +} + +int fail = 0; + +void do_test(int num, int (*test)(void)) +{ + int ret; + + mtspr(DSISR, 0); + mtspr(DAR, 0); + unmap_all(); + print_test_number(num); + ret = test(); + if (ret == 0) { + print_string("PASS\r\n"); + } else { + fail = 1; + print_string("FAIL "); + putchar(ret + '0'); + if (num <= 10 || num == 19) { + print_string(" DAR="); + print_hex(mfspr(DAR)); + print_string(" DSISR="); + print_hex(mfspr(DSISR)); + } else { + print_string(" SRR0="); + print_hex(mfspr(SRR0)); + print_string(" SRR1="); + print_hex(mfspr(SRR1)); + } + print_string("\r\n"); + } +} + +int main(void) +{ + potato_uart_init(); + init_mmu(); + + do_test(1, mmu_test_1); + do_test(2, mmu_test_2); + do_test(3, mmu_test_3); + do_test(4, mmu_test_4); + do_test(5, mmu_test_5); + do_test(6, mmu_test_6); + do_test(7, mmu_test_7); + do_test(8, mmu_test_8); + do_test(9, mmu_test_9); + do_test(10, mmu_test_10); + do_test(11, mmu_test_11); + do_test(12, mmu_test_12); + do_test(13, mmu_test_13); + do_test(14, mmu_test_14); + do_test(15, mmu_test_15); + do_test(16, mmu_test_16); + do_test(17, mmu_test_17); + do_test(18, mmu_test_18); + do_test(19, mmu_test_19); + + return fail; +} diff --git a/tests/mmu/powerpc.lds b/tests/mmu/powerpc.lds new file mode 100644 index 0000000..99611ab --- /dev/null +++ b/tests/mmu/powerpc.lds @@ -0,0 +1,27 @@ +SECTIONS +{ + . = 0; + _start = .; + .head : { + KEEP(*(.head)) + } + . = ALIGN(0x1000); + .text : { *(.text) *(.text.*) *(.rodata) *(.rodata.*) } + . = ALIGN(0x1000); + .data : { *(.data) *(.data.*) *(.got) *(.toc) } + . = ALIGN(0x80); + __bss_start = .; + .bss : { + *(.dynsbss) + *(.sbss) + *(.scommon) + *(.dynbss) + *(.bss) + *(.common) + *(.bss.*) + } + . = ALIGN(0x80); + __bss_end = .; + . = . + 0x4000; + __stack_top = .; +} diff --git a/tests/privileged/privileged.c b/tests/privileged/privileged.c index 073dc07..98c037c 100644 --- a/tests/privileged/privileged.c +++ b/tests/privileged/privileged.c @@ -13,6 +13,8 @@ extern int call_with_msr(unsigned long arg, int (*fn)(unsigned long), unsigned l #define SRR0 26 #define SRR1 27 +#define PID 48 +#define PRTBL 720 static inline unsigned long mfspr(int sprnum) { @@ -55,6 +57,93 @@ void print_test_number(int i) putchar(':'); } +static inline void store_pte(unsigned long *p, unsigned long pte) +{ + __asm__ volatile("stdbrx %1,0,%0" : : "r" (p), "r" (pte) : "memory"); +} + +#define CACHE_LINE_SIZE 64 + +void zero_memory(void *ptr, unsigned long nbytes) +{ + unsigned long nb, i, nl; + void *p; + + for (; nbytes != 0; nbytes -= nb, ptr += nb) { + nb = -((unsigned long)ptr) & (CACHE_LINE_SIZE - 1); + if (nb == 0 && nbytes >= CACHE_LINE_SIZE) { + nl = nbytes / CACHE_LINE_SIZE; + p = ptr; + for (i = 0; i < nl; ++i) { + __asm__ volatile("dcbz 0,%0" : : "r" (p) : "memory"); + p += CACHE_LINE_SIZE; + } + nb = nl * CACHE_LINE_SIZE; + } else { + if (nb > nbytes) + nb = nbytes; + for (i = 0; i < nb; ++i) + ((unsigned char *)ptr)[i] = 0; + } + } +} + +#define PERM_EX 0x001 +#define PERM_WR 0x002 +#define PERM_RD 0x004 +#define PERM_PRIV 0x008 +#define ATTR_NC 0x020 +#define CHG 0x080 +#define REF 0x100 + +#define DFLT_PERM (PERM_WR | PERM_RD | REF | CHG) + +/* + * Set up an MMU translation tree using memory starting at the 64k point. + * We use 2 levels, mapping 2GB (the minimum size possible), with a + * 8kB PGD level pointing to 4kB PTE pages. + */ +unsigned long *pgdir = (unsigned long *) 0x10000; +unsigned long *proc_tbl = (unsigned long *) 0x12000; +unsigned long free_ptr = 0x13000; + +void init_mmu(void) +{ + /* set up process table */ + zero_memory(proc_tbl, 512 * sizeof(unsigned long)); + /* RTS = 0 (2GB address space), RPDS = 10 (1024-entry top level) */ + store_pte(&proc_tbl[2 * 1], (unsigned long) pgdir | 10); + mtspr(PRTBL, (unsigned long)proc_tbl); + mtspr(PID, 1); + zero_memory(pgdir, 1024 * sizeof(unsigned long)); +} + +static unsigned long *read_pgd(unsigned long i) +{ + unsigned long ret; + + __asm__ volatile("ldbrx %0,%1,%2" : "=r" (ret) : "b" (pgdir), + "r" (i * sizeof(unsigned long))); + return (unsigned long *) (ret & 0x00ffffffffffff00); +} + +void map(unsigned long ea, unsigned long pa, unsigned long perm_attr) +{ + unsigned long epn = ea >> 12; + unsigned long i, j; + unsigned long *ptep; + + i = (epn >> 9) & 0x3ff; + j = epn & 0x1ff; + if (pgdir[i] == 0) { + zero_memory((void *)free_ptr, 512 * sizeof(unsigned long)); + store_pte(&pgdir[i], 0x8000000000000000 | free_ptr | 9); + free_ptr += 512 * sizeof(unsigned long); + } + ptep = read_pgd(i); + store_pte(&ptep[j], 0xc000000000000000 | (pa & 0x00fffffffffff000) | perm_attr); +} + int priv_fn_1(unsigned long x) { __asm__ volatile("attn"); @@ -140,6 +229,9 @@ void do_test(int num, int (*fn)(unsigned long)) int main(void) { potato_uart_init(); + init_mmu(); + map(0x2000, 0x2000, REF | CHG | PERM_RD | PERM_EX); /* map code page */ + map(0x7000, 0x7000, REF | CHG | PERM_RD | PERM_WR); /* map stack page */ do_test(1, priv_fn_1); do_test(2, priv_fn_2); diff --git a/tests/test_mmu.bin b/tests/test_mmu.bin new file mode 100755 index 0000000..706f0d8 Binary files /dev/null and b/tests/test_mmu.bin differ diff --git a/tests/test_mmu.console_out b/tests/test_mmu.console_out new file mode 100644 index 0000000..cb4ad85 --- /dev/null +++ b/tests/test_mmu.console_out @@ -0,0 +1,19 @@ +test 01:PASS +test 02:PASS +test 03:PASS +test 04:PASS +test 05:PASS +test 06:PASS +test 07:PASS +test 08:PASS +test 09:PASS +test 10:PASS +test 11:PASS +test 12:PASS +test 13:PASS +test 14:PASS +test 15:PASS +test 16:PASS +test 17:PASS +test 18:PASS +test 19:PASS diff --git a/tests/test_privileged.bin b/tests/test_privileged.bin index 5b8ce63..340b7c0 100755 Binary files a/tests/test_privileged.bin and b/tests/test_privileged.bin differ diff --git a/tests/update_console_tests b/tests/update_console_tests index 94e74d1..d8fb44e 100755 --- a/tests/update_console_tests +++ b/tests/update_console_tests @@ -3,7 +3,7 @@ # Script to update console related tests from source # -for i in sc illegal decrementer xics privileged ; do +for i in sc illegal decrementer xics privileged mmu ; do cd $i make cd -