From 8160f4f8214e982284b2ce2678c8298073b4267c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Apr 2020 11:10:56 +1000 Subject: [PATCH] Add framework for implementing an MMU This adds a new module to implement an MMU. At the moment it doesn't do very much. Tlbie instructions now get sent by loadstore1 to mmu, which sends them to dcache, rather than loadstore1 sending them directly to dcache. TLB misses from dcache now get sent by loadstore1 to mmu, which currently just returns an error. Loadstore1 then generates a DSI in response to the error return from mmu. Signed-off-by: Paul Mackerras --- Makefile | 5 +- common.vhdl | 25 ++++++++- core.vhdl | 19 +++++++ dcache.vhdl | 140 ++++++++++++++++++++++++++++++------------------ dcache_tb.vhdl | 9 +++- loadstore1.vhdl | 105 +++++++++++++++++++++++++++--------- microwatt.core | 1 + mmu.vhdl | 109 +++++++++++++++++++++++++++++++++++++ 8 files changed, 332 insertions(+), 81 deletions(-) create mode 100644 mmu.vhdl diff --git a/Makefile b/Makefile index ed74176..48be5b4 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ common.o: decode_types.o control.o: gpr_hazard.o cr_hazard.o common.o sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o -core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o +core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o mmu.o dcache.o writeback.o core_debug.o core_debug.o: common.o countzero.o: countzero_tb.o: common.o glibc_random.o countzero.o @@ -58,10 +58,11 @@ icache_tb.o: common.o wishbone_types.o icache.o wishbone_bram_wrapper.o dcache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o dcache_tb.o: common.o wishbone_types.o dcache.o wishbone_bram_wrapper.o insn_helpers.o: -loadstore1.o: common.o helpers.o decode_types.o +loadstore1.o: common.o decode_types.o logical.o: decode_types.o multiply_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o multiply.o multiply.o: common.o decode_types.o +mmu.o: common.o divider_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o divider.o divider.o: common.o decode_types.o ppc_fx_insns.o: helpers.o diff --git a/common.vhdl b/common.vhdl index e8ec19e..3ee19d7 100644 --- a/common.vhdl +++ b/common.vhdl @@ -246,7 +246,6 @@ package common is type Loadstore1ToDcacheType is record valid : std_ulogic; load : std_ulogic; -- is this a load - tlbie : std_ulogic; -- is this a tlbie dcbz : std_ulogic; nc : std_ulogic; reserve : std_ulogic; @@ -267,6 +266,30 @@ package common is rc_error : std_ulogic; end record; + type Loadstore1ToMmuType is record + valid : std_ulogic; + tlbie : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + rs : std_ulogic_vector(63 downto 0); + end record; + + type MmuToLoadstore1Type is record + done : std_ulogic; + error : std_ulogic; + end record; + + type MmuToDcacheType is record + valid : std_ulogic; + tlbie : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + pte : std_ulogic_vector(63 downto 0); + end record; + + type DcacheToMmuType is record + stall : std_ulogic; + done : std_ulogic; + end record; + type Loadstore1ToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; diff --git a/core.vhdl b/core.vhdl index 0cb2ecd..c870404 100644 --- a/core.vhdl +++ b/core.vhdl @@ -65,10 +65,14 @@ architecture behave of core is signal execute1_to_loadstore1: Execute1ToLoadstore1Type; signal loadstore1_to_execute1: Loadstore1ToExecute1Type; signal loadstore1_to_writeback: Loadstore1ToWritebackType; + signal loadstore1_to_mmu: Loadstore1ToMmuType; + signal mmu_to_loadstore1: MmuToLoadstore1Type; -- dcache signals signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal dcache_to_loadstore1: DcacheToLoadstore1Type; + signal mmu_to_dcache: MmuToDcacheType; + signal dcache_to_mmu: DcacheToMmuType; -- local signals signal fetch1_stall_in : std_ulogic; @@ -124,6 +128,7 @@ architecture behave of core is attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of mmu_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of debug_0 : label is keep_h(DISABLE_FLATTEN); @@ -270,10 +275,22 @@ begin l_out => loadstore1_to_writeback, d_out => loadstore1_to_dcache, d_in => dcache_to_loadstore1, + m_out => loadstore1_to_mmu, + m_in => mmu_to_loadstore1, dc_stall => dcache_stall_out, stall_out => ls1_stall_out ); + mmu_0: entity work.mmu + port map ( + clk => clk, + rst => core_rst, + l_in => loadstore1_to_mmu, + l_out => mmu_to_loadstore1, + d_out => mmu_to_dcache, + d_in => dcache_to_mmu + ); + dcache_0: entity work.dcache generic map( LINE_SIZE => 64, @@ -285,6 +302,8 @@ begin rst => core_rst, d_in => loadstore1_to_dcache, d_out => dcache_to_loadstore1, + m_in => mmu_to_dcache, + m_out => dcache_to_mmu, stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out diff --git a/dcache.vhdl b/dcache.vhdl index 03b3886..126df48 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -40,6 +40,9 @@ entity dcache is d_in : in Loadstore1ToDcacheType; d_out : out DcacheToLoadstore1Type; + m_in : in MmuToDcacheType; + m_out : out DcacheToMmuType; + stall_out : out std_ulogic; wishbone_out : out wishbone_master_out; @@ -146,9 +149,6 @@ architecture rtl of dcache is attribute ram_style of dtlb_tags : signal is "distributed"; attribute ram_style of dtlb_ptes : signal is "distributed"; - signal r0 : Loadstore1ToDcacheType; - signal r0_valid : std_ulogic; - -- Record for storing permission, attribute, etc. bits from a PTE type perm_attr_t is record reference : std_ulogic; @@ -205,6 +205,15 @@ architecture rtl of dcache is -- first stage emits a stall for a complex op. -- + -- Stage 0 register, basically contains just the latched request + type reg_stage_0_t is record + req : Loadstore1ToDcacheType; + tlbie : std_ulogic; + end record; + + signal r0 : reg_stage_0_t; + signal r0_valid : std_ulogic; + -- First stage register, contains state for stage 1 of load hits -- and for the state machine used by all other operations -- @@ -424,35 +433,61 @@ begin assert (64 = wishbone_data_bits) report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; - -- Latch the request in r0 as long as we're not stalling + -- Latch the request in r0.req as long as we're not stalling stage_0 : process(clk) begin if rising_edge(clk) then if rst = '1' then - r0.valid <= '0'; + r0.req.valid <= '0'; elsif stall_out = '0' then - r0 <= d_in; + assert (d_in.valid and m_in.valid) = '0' report + "request collision loadstore vs MMU"; + if m_in.valid = '1' then + r0.req.valid <= '1'; + r0.req.load <= '0'; + r0.req.dcbz <= '0'; + r0.req.nc <= '0'; + r0.req.reserve <= '0'; + r0.req.virt_mode <= '0'; + r0.req.priv_mode <= '1'; + r0.req.addr <= m_in.addr; + r0.req.data <= m_in.pte; + r0.req.byte_sel <= (others => '1'); + r0.tlbie <= m_in.tlbie; + assert m_in.tlbie = '1' report "unknown request from MMU"; + else + r0.req <= d_in; + r0.tlbie <= '0'; + end if; end if; end if; end process; + -- we don't yet handle collisions between loadstore1 requests and MMU requests + m_out.stall <= '0'; + -- Hold off the request in r0 when stalling, -- and cancel it if we get an error in a previous request. - r0_valid <= r0.valid and not stall_out and not r1.error_done; + r0_valid <= r0.req.valid and not stall_out and not r1.error_done; -- TLB - -- Operates in the second cycle on the request latched in r0. + -- Operates in the second cycle on the request latched in r0.req. -- TLB updates write the entry at the end of the second cycle. tlb_read : process(clk) variable index : tlb_index_t; + variable addrbits : std_ulogic_vector(TLB_SET_BITS - 1 downto 0); begin if rising_edge(clk) then if stall_out = '1' then -- keep reading the same thing while stalled index := tlb_req_index; else - index := to_integer(unsigned(d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 - downto TLB_LG_PGSZ))); + if m_in.valid = '1' then + addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); + else + addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); + end if; + index := to_integer(unsigned(addrbits)); end if; tlb_valid_way <= dtlb_valids(index); tlb_tag_way <= dtlb_tags(index); @@ -500,11 +535,11 @@ begin variable hit : std_ulogic; variable eatag : tlb_tag_t; begin - tlb_req_index <= to_integer(unsigned(r0.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 + tlb_req_index <= to_integer(unsigned(r0.req.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ))); hitway := 0; hit := '0'; - eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); for i in tlb_way_t loop if tlb_valid_way(i) = '1' and read_tlb_tag(i, tlb_tag_way) = eatag then @@ -515,13 +550,13 @@ begin tlb_hit <= hit and r0_valid; tlb_hit_way <= hitway; pte <= read_tlb_pte(hitway, tlb_pte_way); - valid_ra <= tlb_hit or not r0.virt_mode; - if r0.virt_mode = '1' then + valid_ra <= tlb_hit or not r0.req.virt_mode; + if r0.req.virt_mode = '1' then ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & - r0.addr(TLB_LG_PGSZ - 1 downto 0); + r0.req.addr(TLB_LG_PGSZ - 1 downto 0); perm_attr <= extract_perm_attr(pte); else - ra <= r0.addr(REAL_ADDR_BITS - 1 downto 0); + ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto 0); perm_attr <= real_mode_perm_attr; end if; end process; @@ -540,9 +575,9 @@ begin tlbia := '0'; tlbwe := '0'; if r0_valid = '1' and r0.tlbie = '1' then - if r0.addr(11 downto 10) /= "00" then + if r0.req.addr(11 downto 10) /= "00" then tlbia := '1'; - elsif r0.addr(9) = '1' then + elsif r0.req.addr(9) = '1' then tlbwe := '1'; else tlbie := '1'; @@ -563,15 +598,16 @@ begin else repl_way := to_integer(unsigned(tlb_plru_victim(tlb_req_index))); end if; - eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); tagset := tlb_tag_way; write_tlb_tag(repl_way, tagset, eatag); dtlb_tags(tlb_req_index) <= tagset; pteset := tlb_pte_way; - write_tlb_pte(repl_way, pteset, r0.data); + write_tlb_pte(repl_way, pteset, r0.req.data); dtlb_ptes(tlb_req_index) <= pteset; dtlb_valids(tlb_req_index)(repl_way) <= '1'; end if; + m_out.done <= r0_valid and r0.tlbie; end if; end process; @@ -628,8 +664,8 @@ begin variable hit_way_set : hit_way_set_t; begin -- Extract line, row and tag from request - req_index <= get_index(r0.addr); - req_row <= get_row(r0.addr); + req_index <= get_index(r0.req.addr); + req_row <= get_row(r0.req.addr); req_tag <= get_tag(ra); -- Only do anything if not being stalled by stage 1 @@ -648,13 +684,13 @@ begin -- the TLB, and then decide later which match to use. hit_way := 0; is_hit := '0'; - if r0.virt_mode = '1' then + if r0.req.virt_mode = '1' then for j in tlb_way_t loop hit_way_set(j) := 0; s_hit := '0'; s_pte := read_tlb_pte(j, tlb_pte_way); s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & - r0.addr(TLB_LG_PGSZ - 1 downto 0); + r0.req.addr(TLB_LG_PGSZ - 1 downto 0); s_tag := get_tag(s_ra); for i in way_t loop if go = '1' and cache_valids(req_index)(i) = '1' and @@ -671,7 +707,7 @@ begin hit_way := hit_way_set(tlb_hit_way); end if; else - s_tag := get_tag(r0.addr(REAL_ADDR_BITS - 1 downto 0)); + s_tag := get_tag(r0.req.addr(REAL_ADDR_BITS - 1 downto 0)); for i in way_t loop if go = '1' and cache_valids(req_index)(i) = '1' and read_tag(i, cache_tags(req_index)) = s_tag then @@ -689,18 +725,18 @@ begin -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP - rc_ok <= perm_attr.reference and (r0.load or perm_attr.changed); - perm_ok <= (r0.priv_mode or not perm_attr.priv) and - (perm_attr.wr_perm or (r0.load and perm_attr.rd_perm)); + rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed); + perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and + (perm_attr.wr_perm or (r0.req.load and perm_attr.rd_perm)); -- Combine the request and cache hit status to decide what -- operation needs to be done -- - nc := r0.nc or perm_attr.nocache; + nc := r0.req.nc or perm_attr.nocache; op := OP_NONE; if go = '1' then if valid_ra = '1' and rc_ok = '1' and perm_ok = '1' then - opsel := r0.load & nc & is_hit; + opsel := r0.req.load & nc & is_hit; case opsel is when "101" => op := OP_LOAD_HIT; when "100" => op := OP_LOAD_MISS; @@ -723,7 +759,11 @@ begin -- If we're stalling then we need to keep reading the last -- row requested. if stall_out = '0' then - early_req_row <= get_row(d_in.addr); + if m_in.valid = '1' then + early_req_row <= get_row(m_in.addr); + else + early_req_row <= get_row(d_in.addr); + end if; else early_req_row <= req_row; end if; @@ -741,17 +781,17 @@ begin cancel_store <= '0'; set_rsrv <= '0'; clear_rsrv <= '0'; - if r0_valid = '1' and r0.reserve = '1' then + if r0_valid = '1' and r0.req.reserve = '1' then -- XXX generate alignment interrupt if address is not aligned - -- XXX or if r0.nc = '1' - if r0.load = '1' then + -- XXX or if r0.req.nc = '1' + if r0.req.load = '1' then -- load with reservation set_rsrv <= '1'; else -- store conditional clear_rsrv <= '1'; if reservation.valid = '0' or - r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then + r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then cancel_store <= '1'; end if; end if; @@ -765,7 +805,7 @@ begin reservation.valid <= '0'; elsif set_rsrv = '1' then reservation.valid <= '1'; - reservation.addr <= r0.addr(63 downto LINE_OFF_BITS); + reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); end if; end if; end process; @@ -818,12 +858,6 @@ begin d_out.valid <= '1'; end if; - -- tlbie is handled above and doesn't go through the cache state machine - if r1.tlbie_done = '1' then - report "completing tlbie"; - d_out.valid <= '1'; - end if; - -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then -- If it's a load, enable register writeback and switch @@ -900,8 +934,8 @@ begin if r1.state = IDLE then -- In IDLE state, the only write path is the store-hit update case wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= r0.data; - wr_sel <= r0.byte_sel; + wr_data <= r0.req.data; + wr_sel <= r0.req.byte_sel; else -- Otherwise, we might be doing a reload or a DCBZ if r1.req.dcbz = '1' then @@ -936,17 +970,17 @@ begin dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- If we have a request incoming, we have to latch it as r0.valid + -- If we have a request incoming, we have to latch it as r0.req.valid -- is only set for a single cycle. It's up to the control logic to -- ensure we don't override an uncompleted request (for now we are -- single issue on load/stores so we are fine, later, we can generate -- a stall output if necessary). if req_op /= OP_NONE and stall_out = '0' then - r1.req <= r0; + r1.req <= r0.req; report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(r0.addr) & - " nc:" & std_ulogic'image(r0.nc) & + " addr:" & to_hstring(r0.req.addr) & + " nc:" & std_ulogic'image(r0.req.nc) & " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); @@ -1018,7 +1052,7 @@ begin when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(r0.addr) & + report "cache miss addr:" & to_hstring(r0.req.addr) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); @@ -1053,7 +1087,7 @@ begin r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => - r1.wb.sel <= r0.byte_sel; + r1.wb.sel <= r0.req.byte_sel; r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -1061,10 +1095,10 @@ begin r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - if r0.dcbz = '0' then - r1.wb.sel <= r0.byte_sel; + if r0.req.dcbz = '0' then + r1.wb.sel <= r0.req.byte_sel; r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= r0.data; + r1.wb.dat <= r0.req.data; if cancel_store = '0' then r1.wb.cyc <= '1'; r1.wb.stb <= '1'; diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index 66b938f..48c6877 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -15,6 +15,9 @@ architecture behave of dcache_tb is signal d_in : Loadstore1ToDcacheType; signal d_out : DcacheToLoadstore1Type; + signal m_in : MmuToDcacheType; + signal m_out : DcacheToMmuType; + signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -30,6 +33,8 @@ begin rst => rst, d_in => d_in, d_out => d_out, + m_in => m_in, + m_out => m_out, wishbone_out => wb_bram_in, wishbone_in => wb_bram_out ); @@ -68,10 +73,12 @@ begin -- Clear stuff d_in.valid <= '0'; d_in.load <= '0'; - d_in.tlbie <= '0'; d_in.nc <= '0'; d_in.addr <= (others => '0'); d_in.data <= (others => '0'); + m_in.valid <= '0'; + m_in.addr <= (others => '0'); + m_in.pte <= (others => '0'); wait for 4*clk_period; wait until rising_edge(clk); diff --git a/loadstore1.vhdl b/loadstore1.vhdl index c54e47b..d5dd010 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -5,7 +5,6 @@ use ieee.numeric_std.all; library work; use work.decode_types.all; use work.common.all; -use work.helpers.all; -- 2 cycle LSU -- We calculate the address in the first cycle @@ -22,6 +21,9 @@ entity loadstore1 is d_out : out Loadstore1ToDcacheType; d_in : in DcacheToLoadstore1Type; + m_out : out Loadstore1ToMmuType; + m_in : in MmuToLoadstore1Type; + dc_stall : in std_ulogic; stall_out : out std_ulogic ); @@ -38,7 +40,9 @@ architecture behave of loadstore1 is SECOND_REQ, -- send 2nd request of unaligned xfer FIRST_ACK_WAIT, -- waiting for 1st ack from dcache LAST_ACK_WAIT, -- waiting for last ack from dcache - LD_UPDATE -- writing rA with computed addr on load + LD_UPDATE, -- writing rA with computed addr on load + MMU_LOOKUP_1ST, -- waiting for MMU to look up translation + MMU_LOOKUP_LAST ); type reg_stage_t is record @@ -62,6 +66,7 @@ architecture behave of loadstore1 is virt_mode : std_ulogic; priv_mode : std_ulogic; state : state_t; + first_bytes : std_ulogic_vector(7 downto 0); second_bytes : std_ulogic_vector(7 downto 0); dar : std_ulogic_vector(63 downto 0); dsisr : std_ulogic_vector(31 downto 0); @@ -146,6 +151,7 @@ begin variable sprval : std_ulogic_vector(63 downto 0); variable exception : std_ulogic; variable next_addr : std_ulogic_vector(63 downto 0); + variable mmureq : std_ulogic; variable dsisr : std_ulogic_vector(31 downto 0); begin v := r; @@ -158,6 +164,7 @@ begin sprval := (others => '0'); -- avoid inferred latches exception := '0'; dsisr := (others => '0'); + mmureq := '0'; write_enable := '0'; do_update := '0'; @@ -230,7 +237,7 @@ begin req := '1'; v.dcbz := '1'; when OP_TLBIE => - req := '1'; + mmureq := '1'; v.tlbie := '1'; when OP_MFSPR => done := '1'; @@ -282,18 +289,14 @@ begin -- Do length_to_sel and work out if we are doing 2 dwords long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); byte_sel := long_sel(7 downto 0); + v.first_bytes := byte_sel; v.second_bytes := long_sel(15 downto 8); - v.addr := lsu_sum; - -- Do byte reversing and rotating for stores in the first cycle - byte_offset := "000"; + byte_offset := unsigned(lsu_sum(2 downto 0)); brev_lenm1 := "000"; - if v.tlbie = '0' then - byte_offset := unsigned(lsu_sum(2 downto 0)); - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; - end if; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; end if; for i in 0 to 7 loop k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; @@ -309,6 +312,10 @@ begin v.state := SECOND_REQ; end if; end if; + if mmureq = '1' then + stall := '1'; + v.state := LAST_ACK_WAIT; + end if; end if; when SECOND_REQ => @@ -323,12 +330,19 @@ begin if d_in.valid = '1' then if d_in.error = '1' then -- dcache will discard the second request - exception := '1'; - dsisr(30) := d_in.tlb_miss; - dsisr(63 - 36) := d_in.perm_error; - dsisr(63 - 38) := not r.load; - dsisr(63 - 45) := d_in.rc_error; - v.state := IDLE; + addr := r.addr; + if d_in.tlb_miss = '1' then + -- give it to the MMU to look up + mmureq := '1'; + v.state := MMU_LOOKUP_1ST; + else + -- signal an interrupt straight away + exception := '1'; + dsisr(63 - 36) := d_in.perm_error; + dsisr(63 - 38) := not r.load; + dsisr(63 - 45) := d_in.rc_error; + v.state := IDLE; + end if; else v.state := LAST_ACK_WAIT; if r.load = '1' then @@ -337,6 +351,32 @@ begin end if; end if; + when MMU_LOOKUP_1ST | MMU_LOOKUP_LAST => + stall := '1'; + if two_dwords = '1' and r.state = MMU_LOOKUP_LAST then + addr := next_addr; + byte_sel := r.second_bytes; + else + addr := r.addr; + byte_sel := r.first_bytes; + end if; + if m_in.done = '1' then + if m_in.error = '0' then + -- retry the request now that the MMU has installed a TLB entry + req := '1'; + if r.state = MMU_LOOKUP_1ST then + v.state := SECOND_REQ; + else + v.state := LAST_ACK_WAIT; + end if; + else + exception := '1'; + dsisr(63 - 33) := '1'; + dsisr(63 - 38) := not r.load; + v.state := IDLE; + end if; + end if; + when LAST_ACK_WAIT => stall := '1'; if d_in.valid = '1' then @@ -346,12 +386,18 @@ begin else addr := r.addr; end if; - exception := '1'; - dsisr(30) := d_in.tlb_miss; - dsisr(63 - 36) := d_in.perm_error; - dsisr(63 - 38) := not r.load; - dsisr(63 - 45) := d_in.rc_error; - v.state := IDLE; + if d_in.tlb_miss = '1' then + -- give it to the MMU to look up + mmureq := '1'; + v.state := MMU_LOOKUP_LAST; + else + -- signal an interrupt straight away + exception := '1'; + dsisr(63 - 36) := d_in.perm_error; + dsisr(63 - 38) := not r.load; + dsisr(63 - 45) := d_in.rc_error; + v.state := IDLE; + end if; else write_enable := r.load; if r.load = '1' and r.update = '1' then @@ -366,6 +412,12 @@ begin end if; end if; end if; + if m_in.done = '1' then + -- tlbie is finished + stall := '0'; + done := '1'; + v.state := IDLE; + end if; when LD_UPDATE => do_update := '1'; @@ -376,7 +428,6 @@ begin -- Update outputs to dcache d_out.valid <= req; d_out.load <= v.load; - d_out.tlbie <= v.tlbie; d_out.dcbz <= v.dcbz; d_out.nc <= v.nc; d_out.reserve <= v.reserve; @@ -386,6 +437,12 @@ begin d_out.virt_mode <= v.virt_mode; d_out.priv_mode <= v.priv_mode; + -- Update outputs to MMU + m_out.valid <= mmureq; + m_out.tlbie <= v.tlbie; + m_out.addr <= addr; + m_out.rs <= l_in.data; + -- Update outputs to writeback -- Multiplex either cache data to the destination GPR or -- the address for the rA update. diff --git a/microwatt.core b/microwatt.core index a2d6ab5..180e0a5 100644 --- a/microwatt.core +++ b/microwatt.core @@ -25,6 +25,7 @@ filesets: - control.vhdl - execute1.vhdl - loadstore1.vhdl + - mmu.vhdl - dcache.vhdl - multiply.vhdl - divider.vhdl diff --git a/mmu.vhdl b/mmu.vhdl new file mode 100644 index 0000000..2e6d0fd --- /dev/null +++ b/mmu.vhdl @@ -0,0 +1,109 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.common.all; + +-- Radix MMU +-- Supports 4-level trees as in arch 3.0B, but not the two-step translation for +-- guests under a hypervisor (i.e. there is no gRA -> hRA translation). + +entity mmu is + port ( + clk : in std_ulogic; + rst : in std_ulogic; + + l_in : in Loadstore1ToMmuType; + l_out : out MmuToLoadstore1Type; + + d_out : out MmuToDcacheType; + d_in : in DcacheToMmuType + ); +end mmu; + +architecture behave of mmu is + + type state_t is (IDLE, + TLBIE_WAIT, + RADIX_LOOKUP_0 + ); + + type reg_stage_t is record + -- latched request from loadstore1 + valid : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + state : state_t; + end record; + + signal r, rin : reg_stage_t; + +begin + + mmu_0: process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r.state <= IDLE; + r.valid <= '0'; + else + if rin.valid = '1' then + report "MMU got tlb miss for " & to_hstring(rin.addr); + end if; + if l_out.done = '1' then + report "MMU completing miss with error=" & std_ulogic'image(l_out.error); + end if; + r <= rin; + end if; + end if; + end process; + + mmu_1: process(all) + variable v : reg_stage_t; + variable dcreq : std_ulogic; + variable done : std_ulogic; + variable err : std_ulogic; + begin + v.valid := l_in.valid; + v.addr := l_in.addr; + v.state := r.state; + dcreq := '0'; + done := '0'; + err := '0'; + + case r.state is + when IDLE => + if l_in.valid = '1' then + if l_in.tlbie = '1' then + dcreq := '1'; + v.state := TLBIE_WAIT; + else + v.state := RADIX_LOOKUP_0; + end if; + end if; + + when TLBIE_WAIT => + if d_in.done = '1' then + done := '1'; + v.state := IDLE; + end if; + + when RADIX_LOOKUP_0 => + done := '1'; + err := '1'; + v.state := IDLE; + end case; + + -- update registers + rin <= v; + + -- drive outputs + l_out.done <= done; + l_out.error <= err; + + d_out.valid <= dcreq; + d_out.tlbie <= l_in.tlbie; + d_out.addr <= l_in.addr; + d_out.pte <= l_in.rs; + end process; +end;