Add framework for implementing an MMU

This adds a new module to implement an MMU.  At the moment it doesn't
do very much.  Tlbie instructions now get sent by loadstore1 to mmu,
which sends them to dcache, rather than loadstore1 sending them
directly to dcache.  TLB misses from dcache now get sent by loadstore1
to mmu, which currently just returns an error.  Loadstore1 then
generates a DSI in response to the error return from mmu.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/169/head
Paul Mackerras 4 years ago
parent d47fbf88d1
commit 8160f4f821

@ -31,7 +31,7 @@ common.o: decode_types.o
control.o: gpr_hazard.o cr_hazard.o common.o control.o: gpr_hazard.o cr_hazard.o common.o
sim_jtag.o: sim_jtag_socket.o sim_jtag.o: sim_jtag_socket.o
core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o
core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o mmu.o dcache.o writeback.o core_debug.o
core_debug.o: common.o core_debug.o: common.o
countzero.o: countzero.o:
countzero_tb.o: common.o glibc_random.o countzero.o countzero_tb.o: common.o glibc_random.o countzero.o
@ -58,10 +58,11 @@ icache_tb.o: common.o wishbone_types.o icache.o wishbone_bram_wrapper.o
dcache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o dcache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o
dcache_tb.o: common.o wishbone_types.o dcache.o wishbone_bram_wrapper.o dcache_tb.o: common.o wishbone_types.o dcache.o wishbone_bram_wrapper.o
insn_helpers.o: insn_helpers.o:
loadstore1.o: common.o helpers.o decode_types.o loadstore1.o: common.o decode_types.o
logical.o: decode_types.o logical.o: decode_types.o
multiply_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o multiply.o multiply_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o multiply.o
multiply.o: common.o decode_types.o multiply.o: common.o decode_types.o
mmu.o: common.o
divider_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o divider.o divider_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o divider.o
divider.o: common.o decode_types.o divider.o: common.o decode_types.o
ppc_fx_insns.o: helpers.o ppc_fx_insns.o: helpers.o

@ -246,7 +246,6 @@ package common is
type Loadstore1ToDcacheType is record type Loadstore1ToDcacheType is record
valid : std_ulogic; valid : std_ulogic;
load : std_ulogic; -- is this a load load : std_ulogic; -- is this a load
tlbie : std_ulogic; -- is this a tlbie
dcbz : std_ulogic; dcbz : std_ulogic;
nc : std_ulogic; nc : std_ulogic;
reserve : std_ulogic; reserve : std_ulogic;
@ -267,6 +266,30 @@ package common is
rc_error : std_ulogic; rc_error : std_ulogic;
end record; end record;


-- Request from loadstore1 to the MMU.
-- loadstore1 raises valid either for a tlbie instruction or after the
-- dcache has reported a TLB miss for a load/store.
type Loadstore1ToMmuType is record
-- a request is being presented to the MMU
valid : std_ulogic;
-- request is a tlbie; when clear the MMU treats it as a TLB-miss lookup
tlbie : std_ulogic;
-- address the request applies to
addr : std_ulogic_vector(63 downto 0);
-- driven by loadstore1 from its l_in.data input; the MMU passes it on
-- to the dcache in the pte field of MmuToDcacheType
rs : std_ulogic_vector(63 downto 0);
end record;

-- Response from the MMU back to loadstore1.
type MmuToLoadstore1Type is record
-- the MMU has finished handling the request (tlbie or lookup)
done : std_ulogic;
-- the lookup failed; loadstore1 responds by raising an exception
-- instead of retrying the access
error : std_ulogic;
end record;

-- Request from the MMU to the dcache.
-- At present the only request the dcache expects here is a tlbie; it
-- asserts "unknown request from MMU" for anything else.
type MmuToDcacheType is record
-- a request is being presented to the dcache
valid : std_ulogic;
-- request is a TLB operation
tlbie : std_ulogic;
-- address of the request; the dcache also uses it to index its TLB
addr : std_ulogic_vector(63 downto 0);
-- value the dcache latches as the request data and writes into the
-- TLB when the operation is a TLB write
pte : std_ulogic_vector(63 downto 0);
end record;

-- Response from the dcache back to the MMU.
type DcacheToMmuType is record
-- back-pressure towards the MMU; the dcache currently ties this to '0'
-- because collisions with loadstore1 requests are not handled yet
stall : std_ulogic;
-- registered strobe: the dcache has processed a valid tlbie request
done : std_ulogic;
end record;

type Loadstore1ToWritebackType is record type Loadstore1ToWritebackType is record
valid : std_ulogic; valid : std_ulogic;
write_enable: std_ulogic; write_enable: std_ulogic;

@ -65,10 +65,14 @@ architecture behave of core is
signal execute1_to_loadstore1: Execute1ToLoadstore1Type; signal execute1_to_loadstore1: Execute1ToLoadstore1Type;
signal loadstore1_to_execute1: Loadstore1ToExecute1Type; signal loadstore1_to_execute1: Loadstore1ToExecute1Type;
signal loadstore1_to_writeback: Loadstore1ToWritebackType; signal loadstore1_to_writeback: Loadstore1ToWritebackType;
signal loadstore1_to_mmu: Loadstore1ToMmuType;
signal mmu_to_loadstore1: MmuToLoadstore1Type;


-- dcache signals -- dcache signals
signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal loadstore1_to_dcache: Loadstore1ToDcacheType;
signal dcache_to_loadstore1: DcacheToLoadstore1Type; signal dcache_to_loadstore1: DcacheToLoadstore1Type;
signal mmu_to_dcache: MmuToDcacheType;
signal dcache_to_mmu: DcacheToMmuType;


-- local signals -- local signals
signal fetch1_stall_in : std_ulogic; signal fetch1_stall_in : std_ulogic;
@ -124,6 +128,7 @@ architecture behave of core is
attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of mmu_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of debug_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of debug_0 : label is keep_h(DISABLE_FLATTEN);
@ -270,10 +275,22 @@ begin
l_out => loadstore1_to_writeback, l_out => loadstore1_to_writeback,
d_out => loadstore1_to_dcache, d_out => loadstore1_to_dcache,
d_in => dcache_to_loadstore1, d_in => dcache_to_loadstore1,
m_out => loadstore1_to_mmu,
m_in => mmu_to_loadstore1,
dc_stall => dcache_stall_out, dc_stall => dcache_stall_out,
stall_out => ls1_stall_out stall_out => ls1_stall_out
); );


-- MMU instance.  It sits between loadstore1 and the dcache: requests come
-- in from loadstore1 on l_in, completions go back on l_out, and the MMU
-- talks to the dcache over its own d_out/d_in request/response pair.
mmu_0: entity work.mmu
port map (
clk => clk,
rst => core_rst,
l_in => loadstore1_to_mmu,
l_out => mmu_to_loadstore1,
d_out => mmu_to_dcache,
d_in => dcache_to_mmu
);

dcache_0: entity work.dcache dcache_0: entity work.dcache
generic map( generic map(
LINE_SIZE => 64, LINE_SIZE => 64,
@ -285,6 +302,8 @@ begin
rst => core_rst, rst => core_rst,
d_in => loadstore1_to_dcache, d_in => loadstore1_to_dcache,
d_out => dcache_to_loadstore1, d_out => dcache_to_loadstore1,
m_in => mmu_to_dcache,
m_out => dcache_to_mmu,
stall_out => dcache_stall_out, stall_out => dcache_stall_out,
wishbone_in => wishbone_data_in, wishbone_in => wishbone_data_in,
wishbone_out => wishbone_data_out wishbone_out => wishbone_data_out

@ -40,6 +40,9 @@ entity dcache is
d_in : in Loadstore1ToDcacheType; d_in : in Loadstore1ToDcacheType;
d_out : out DcacheToLoadstore1Type; d_out : out DcacheToLoadstore1Type;


m_in : in MmuToDcacheType;
m_out : out DcacheToMmuType;

stall_out : out std_ulogic; stall_out : out std_ulogic;


wishbone_out : out wishbone_master_out; wishbone_out : out wishbone_master_out;
@ -146,9 +149,6 @@ architecture rtl of dcache is
attribute ram_style of dtlb_tags : signal is "distributed"; attribute ram_style of dtlb_tags : signal is "distributed";
attribute ram_style of dtlb_ptes : signal is "distributed"; attribute ram_style of dtlb_ptes : signal is "distributed";


signal r0 : Loadstore1ToDcacheType;
signal r0_valid : std_ulogic;

-- Record for storing permission, attribute, etc. bits from a PTE -- Record for storing permission, attribute, etc. bits from a PTE
type perm_attr_t is record type perm_attr_t is record
reference : std_ulogic; reference : std_ulogic;
@ -205,6 +205,15 @@ architecture rtl of dcache is
-- first stage emits a stall for a complex op. -- first stage emits a stall for a complex op.
-- --


-- Stage 0 register, basically contains just the latched request.
-- The request is taken from the MMU port when m_in.valid is set and from
-- loadstore1 (d_in) otherwise.
type reg_stage_0_t is record
-- the latched request, in loadstore1 request format
req : Loadstore1ToDcacheType;
-- copy of m_in.tlbie for requests from the MMU; always '0' for
-- requests latched from loadstore1
tlbie : std_ulogic;
end record;

-- r0 holds the stage 0 state; r0_valid is r0.req.valid gated by stall_out
-- and by an error on a previous request (r1.error_done).
signal r0 : reg_stage_0_t;
signal r0_valid : std_ulogic;
-- First stage register, contains state for stage 1 of load hits -- First stage register, contains state for stage 1 of load hits
-- and for the state machine used by all other operations -- and for the state machine used by all other operations
-- --
@ -424,35 +433,61 @@ begin
assert (64 = wishbone_data_bits) assert (64 = wishbone_data_bits)
report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE;


-- Latch the request in r0 as long as we're not stalling -- Latch the request in r0.req as long as we're not stalling
stage_0 : process(clk) stage_0 : process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
if rst = '1' then if rst = '1' then
r0.valid <= '0'; r0.req.valid <= '0';
elsif stall_out = '0' then elsif stall_out = '0' then
r0 <= d_in; assert (d_in.valid and m_in.valid) = '0' report
"request collision loadstore vs MMU";
if m_in.valid = '1' then
r0.req.valid <= '1';
r0.req.load <= '0';
r0.req.dcbz <= '0';
r0.req.nc <= '0';
r0.req.reserve <= '0';
r0.req.virt_mode <= '0';
r0.req.priv_mode <= '1';
r0.req.addr <= m_in.addr;
r0.req.data <= m_in.pte;
r0.req.byte_sel <= (others => '1');
r0.tlbie <= m_in.tlbie;
assert m_in.tlbie = '1' report "unknown request from MMU";
else
r0.req <= d_in;
r0.tlbie <= '0';
end if;
end if; end if;
end if; end if;
end process; end process;


-- we don't yet handle collisions between loadstore1 requests and MMU requests
m_out.stall <= '0';

-- Hold off the request in r0 when stalling, -- Hold off the request in r0 when stalling,
-- and cancel it if we get an error in a previous request. -- and cancel it if we get an error in a previous request.
r0_valid <= r0.valid and not stall_out and not r1.error_done; r0_valid <= r0.req.valid and not stall_out and not r1.error_done;


-- TLB -- TLB
-- Operates in the second cycle on the request latched in r0. -- Operates in the second cycle on the request latched in r0.req.
-- TLB updates write the entry at the end of the second cycle. -- TLB updates write the entry at the end of the second cycle.
tlb_read : process(clk) tlb_read : process(clk)
variable index : tlb_index_t; variable index : tlb_index_t;
variable addrbits : std_ulogic_vector(TLB_SET_BITS - 1 downto 0);
begin begin
if rising_edge(clk) then if rising_edge(clk) then
if stall_out = '1' then if stall_out = '1' then
-- keep reading the same thing while stalled -- keep reading the same thing while stalled
index := tlb_req_index; index := tlb_req_index;
else else
index := to_integer(unsigned(d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 if m_in.valid = '1' then
downto TLB_LG_PGSZ))); addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ);
else
addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ);
end if;
index := to_integer(unsigned(addrbits));
end if; end if;
tlb_valid_way <= dtlb_valids(index); tlb_valid_way <= dtlb_valids(index);
tlb_tag_way <= dtlb_tags(index); tlb_tag_way <= dtlb_tags(index);
@ -500,11 +535,11 @@ begin
variable hit : std_ulogic; variable hit : std_ulogic;
variable eatag : tlb_tag_t; variable eatag : tlb_tag_t;
begin begin
tlb_req_index <= to_integer(unsigned(r0.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 tlb_req_index <= to_integer(unsigned(r0.req.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1
downto TLB_LG_PGSZ))); downto TLB_LG_PGSZ)));
hitway := 0; hitway := 0;
hit := '0'; hit := '0';
eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS);
for i in tlb_way_t loop for i in tlb_way_t loop
if tlb_valid_way(i) = '1' and if tlb_valid_way(i) = '1' and
read_tlb_tag(i, tlb_tag_way) = eatag then read_tlb_tag(i, tlb_tag_way) = eatag then
@ -515,13 +550,13 @@ begin
tlb_hit <= hit and r0_valid; tlb_hit <= hit and r0_valid;
tlb_hit_way <= hitway; tlb_hit_way <= hitway;
pte <= read_tlb_pte(hitway, tlb_pte_way); pte <= read_tlb_pte(hitway, tlb_pte_way);
valid_ra <= tlb_hit or not r0.virt_mode; valid_ra <= tlb_hit or not r0.req.virt_mode;
if r0.virt_mode = '1' then if r0.req.virt_mode = '1' then
ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
r0.addr(TLB_LG_PGSZ - 1 downto 0); r0.req.addr(TLB_LG_PGSZ - 1 downto 0);
perm_attr <= extract_perm_attr(pte); perm_attr <= extract_perm_attr(pte);
else else
ra <= r0.addr(REAL_ADDR_BITS - 1 downto 0); ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto 0);
perm_attr <= real_mode_perm_attr; perm_attr <= real_mode_perm_attr;
end if; end if;
end process; end process;
@ -540,9 +575,9 @@ begin
tlbia := '0'; tlbia := '0';
tlbwe := '0'; tlbwe := '0';
if r0_valid = '1' and r0.tlbie = '1' then if r0_valid = '1' and r0.tlbie = '1' then
if r0.addr(11 downto 10) /= "00" then if r0.req.addr(11 downto 10) /= "00" then
tlbia := '1'; tlbia := '1';
elsif r0.addr(9) = '1' then elsif r0.req.addr(9) = '1' then
tlbwe := '1'; tlbwe := '1';
else else
tlbie := '1'; tlbie := '1';
@ -563,15 +598,16 @@ begin
else else
repl_way := to_integer(unsigned(tlb_plru_victim(tlb_req_index))); repl_way := to_integer(unsigned(tlb_plru_victim(tlb_req_index)));
end if; end if;
eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS);
tagset := tlb_tag_way; tagset := tlb_tag_way;
write_tlb_tag(repl_way, tagset, eatag); write_tlb_tag(repl_way, tagset, eatag);
dtlb_tags(tlb_req_index) <= tagset; dtlb_tags(tlb_req_index) <= tagset;
pteset := tlb_pte_way; pteset := tlb_pte_way;
write_tlb_pte(repl_way, pteset, r0.data); write_tlb_pte(repl_way, pteset, r0.req.data);
dtlb_ptes(tlb_req_index) <= pteset; dtlb_ptes(tlb_req_index) <= pteset;
dtlb_valids(tlb_req_index)(repl_way) <= '1'; dtlb_valids(tlb_req_index)(repl_way) <= '1';
end if; end if;
m_out.done <= r0_valid and r0.tlbie;
end if; end if;
end process; end process;


@ -628,8 +664,8 @@ begin
variable hit_way_set : hit_way_set_t; variable hit_way_set : hit_way_set_t;
begin begin
-- Extract line, row and tag from request -- Extract line, row and tag from request
req_index <= get_index(r0.addr); req_index <= get_index(r0.req.addr);
req_row <= get_row(r0.addr); req_row <= get_row(r0.req.addr);
req_tag <= get_tag(ra); req_tag <= get_tag(ra);


-- Only do anything if not being stalled by stage 1 -- Only do anything if not being stalled by stage 1
@ -648,13 +684,13 @@ begin
-- the TLB, and then decide later which match to use. -- the TLB, and then decide later which match to use.
hit_way := 0; hit_way := 0;
is_hit := '0'; is_hit := '0';
if r0.virt_mode = '1' then if r0.req.virt_mode = '1' then
for j in tlb_way_t loop for j in tlb_way_t loop
hit_way_set(j) := 0; hit_way_set(j) := 0;
s_hit := '0'; s_hit := '0';
s_pte := read_tlb_pte(j, tlb_pte_way); s_pte := read_tlb_pte(j, tlb_pte_way);
s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
r0.addr(TLB_LG_PGSZ - 1 downto 0); r0.req.addr(TLB_LG_PGSZ - 1 downto 0);
s_tag := get_tag(s_ra); s_tag := get_tag(s_ra);
for i in way_t loop for i in way_t loop
if go = '1' and cache_valids(req_index)(i) = '1' and if go = '1' and cache_valids(req_index)(i) = '1' and
@ -671,7 +707,7 @@ begin
hit_way := hit_way_set(tlb_hit_way); hit_way := hit_way_set(tlb_hit_way);
end if; end if;
else else
s_tag := get_tag(r0.addr(REAL_ADDR_BITS - 1 downto 0)); s_tag := get_tag(r0.req.addr(REAL_ADDR_BITS - 1 downto 0));
for i in way_t loop for i in way_t loop
if go = '1' and cache_valids(req_index)(i) = '1' and if go = '1' and cache_valids(req_index)(i) = '1' and
read_tag(i, cache_tags(req_index)) = s_tag then read_tag(i, cache_tags(req_index)) = s_tag then
@ -689,18 +725,18 @@ begin


-- work out whether we have permission for this access -- work out whether we have permission for this access
-- NB we don't yet implement AMR, thus no KUAP -- NB we don't yet implement AMR, thus no KUAP
rc_ok <= perm_attr.reference and (r0.load or perm_attr.changed); rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed);
perm_ok <= (r0.priv_mode or not perm_attr.priv) and perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and
(perm_attr.wr_perm or (r0.load and perm_attr.rd_perm)); (perm_attr.wr_perm or (r0.req.load and perm_attr.rd_perm));


-- Combine the request and cache hit status to decide what -- Combine the request and cache hit status to decide what
-- operation needs to be done -- operation needs to be done
-- --
nc := r0.nc or perm_attr.nocache; nc := r0.req.nc or perm_attr.nocache;
op := OP_NONE; op := OP_NONE;
if go = '1' then if go = '1' then
if valid_ra = '1' and rc_ok = '1' and perm_ok = '1' then if valid_ra = '1' and rc_ok = '1' and perm_ok = '1' then
opsel := r0.load & nc & is_hit; opsel := r0.req.load & nc & is_hit;
case opsel is case opsel is
when "101" => op := OP_LOAD_HIT; when "101" => op := OP_LOAD_HIT;
when "100" => op := OP_LOAD_MISS; when "100" => op := OP_LOAD_MISS;
@ -723,7 +759,11 @@ begin
-- If we're stalling then we need to keep reading the last -- If we're stalling then we need to keep reading the last
-- row requested. -- row requested.
if stall_out = '0' then if stall_out = '0' then
early_req_row <= get_row(d_in.addr); if m_in.valid = '1' then
early_req_row <= get_row(m_in.addr);
else
early_req_row <= get_row(d_in.addr);
end if;
else else
early_req_row <= req_row; early_req_row <= req_row;
end if; end if;
@ -741,17 +781,17 @@ begin
cancel_store <= '0'; cancel_store <= '0';
set_rsrv <= '0'; set_rsrv <= '0';
clear_rsrv <= '0'; clear_rsrv <= '0';
if r0_valid = '1' and r0.reserve = '1' then if r0_valid = '1' and r0.req.reserve = '1' then
-- XXX generate alignment interrupt if address is not aligned -- XXX generate alignment interrupt if address is not aligned
-- XXX or if r0.nc = '1' -- XXX or if r0.req.nc = '1'
if r0.load = '1' then if r0.req.load = '1' then
-- load with reservation -- load with reservation
set_rsrv <= '1'; set_rsrv <= '1';
else else
-- store conditional -- store conditional
clear_rsrv <= '1'; clear_rsrv <= '1';
if reservation.valid = '0' or if reservation.valid = '0' or
r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
cancel_store <= '1'; cancel_store <= '1';
end if; end if;
end if; end if;
@ -765,7 +805,7 @@ begin
reservation.valid <= '0'; reservation.valid <= '0';
elsif set_rsrv = '1' then elsif set_rsrv = '1' then
reservation.valid <= '1'; reservation.valid <= '1';
reservation.addr <= r0.addr(63 downto LINE_OFF_BITS); reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS);
end if; end if;
end if; end if;
end process; end process;
@ -818,12 +858,6 @@ begin
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;


-- tlbie is handled above and doesn't go through the cache state machine
if r1.tlbie_done = '1' then
report "completing tlbie";
d_out.valid <= '1';
end if;

-- Slow ops (load miss, NC, stores) -- Slow ops (load miss, NC, stores)
if r1.slow_valid = '1' then if r1.slow_valid = '1' then
-- If it's a load, enable register writeback and switch -- If it's a load, enable register writeback and switch
@ -900,8 +934,8 @@ begin
if r1.state = IDLE then if r1.state = IDLE then
-- In IDLE state, the only write path is the store-hit update case -- In IDLE state, the only write path is the store-hit update case
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= r0.data; wr_data <= r0.req.data;
wr_sel <= r0.byte_sel; wr_sel <= r0.req.byte_sel;
else else
-- Otherwise, we might be doing a reload or a DCBZ -- Otherwise, we might be doing a reload or a DCBZ
if r1.req.dcbz = '1' then if r1.req.dcbz = '1' then
@ -936,17 +970,17 @@ begin
dcache_fast_hit : process(clk) dcache_fast_hit : process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
-- If we have a request incoming, we have to latch it as r0.valid -- If we have a request incoming, we have to latch it as r0.req.valid
-- is only set for a single cycle. It's up to the control logic to -- is only set for a single cycle. It's up to the control logic to
-- ensure we don't override an uncompleted request (for now we are -- ensure we don't override an uncompleted request (for now we are
-- single issue on load/stores so we are fine, later, we can generate -- single issue on load/stores so we are fine, later, we can generate
-- a stall output if necessary). -- a stall output if necessary).


if req_op /= OP_NONE and stall_out = '0' then if req_op /= OP_NONE and stall_out = '0' then
r1.req <= r0; r1.req <= r0.req;
report "op:" & op_t'image(req_op) & report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(r0.addr) & " addr:" & to_hstring(r0.req.addr) &
" nc:" & std_ulogic'image(r0.nc) & " nc:" & std_ulogic'image(r0.req.nc) &
" idx:" & integer'image(req_index) & " idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) & " tag:" & to_hstring(req_tag) &
" way: " & integer'image(req_hit_way); " way: " & integer'image(req_hit_way);
@ -1018,7 +1052,7 @@ begin
when OP_LOAD_MISS => when OP_LOAD_MISS =>
-- Normal load cache miss, start the reload machine -- Normal load cache miss, start the reload machine
-- --
report "cache miss addr:" & to_hstring(r0.addr) & report "cache miss addr:" & to_hstring(r0.req.addr) &
" idx:" & integer'image(req_index) & " idx:" & integer'image(req_index) &
" way:" & integer'image(replace_way) & " way:" & integer'image(replace_way) &
" tag:" & to_hstring(req_tag); " tag:" & to_hstring(req_tag);
@ -1053,7 +1087,7 @@ begin
r1.state <= RELOAD_WAIT_ACK; r1.state <= RELOAD_WAIT_ACK;


when OP_LOAD_NC => when OP_LOAD_NC =>
r1.wb.sel <= r0.byte_sel; r1.wb.sel <= r0.req.byte_sel;
r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000";
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';
@ -1061,10 +1095,10 @@ begin
r1.state <= NC_LOAD_WAIT_ACK; r1.state <= NC_LOAD_WAIT_ACK;


when OP_STORE_HIT | OP_STORE_MISS => when OP_STORE_HIT | OP_STORE_MISS =>
if r0.dcbz = '0' then if r0.req.dcbz = '0' then
r1.wb.sel <= r0.byte_sel; r1.wb.sel <= r0.req.byte_sel;
r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= r0.data; r1.wb.dat <= r0.req.data;
if cancel_store = '0' then if cancel_store = '0' then
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';

@ -15,6 +15,9 @@ architecture behave of dcache_tb is
signal d_in : Loadstore1ToDcacheType; signal d_in : Loadstore1ToDcacheType;
signal d_out : DcacheToLoadstore1Type; signal d_out : DcacheToLoadstore1Type;


signal m_in : MmuToDcacheType;
signal m_out : DcacheToMmuType;

signal wb_bram_in : wishbone_master_out; signal wb_bram_in : wishbone_master_out;
signal wb_bram_out : wishbone_slave_out; signal wb_bram_out : wishbone_slave_out;


@ -30,6 +33,8 @@ begin
rst => rst, rst => rst,
d_in => d_in, d_in => d_in,
d_out => d_out, d_out => d_out,
m_in => m_in,
m_out => m_out,
wishbone_out => wb_bram_in, wishbone_out => wb_bram_in,
wishbone_in => wb_bram_out wishbone_in => wb_bram_out
); );
@ -68,10 +73,12 @@ begin
-- Clear stuff -- Clear stuff
d_in.valid <= '0'; d_in.valid <= '0';
d_in.load <= '0'; d_in.load <= '0';
d_in.tlbie <= '0';
d_in.nc <= '0'; d_in.nc <= '0';
d_in.addr <= (others => '0'); d_in.addr <= (others => '0');
d_in.data <= (others => '0'); d_in.data <= (others => '0');
m_in.valid <= '0';
m_in.addr <= (others => '0');
m_in.pte <= (others => '0');


wait for 4*clk_period; wait for 4*clk_period;
wait until rising_edge(clk); wait until rising_edge(clk);

@ -5,7 +5,6 @@ use ieee.numeric_std.all;
library work; library work;
use work.decode_types.all; use work.decode_types.all;
use work.common.all; use work.common.all;
use work.helpers.all;


-- 2 cycle LSU -- 2 cycle LSU
-- We calculate the address in the first cycle -- We calculate the address in the first cycle
@ -22,6 +21,9 @@ entity loadstore1 is
d_out : out Loadstore1ToDcacheType; d_out : out Loadstore1ToDcacheType;
d_in : in DcacheToLoadstore1Type; d_in : in DcacheToLoadstore1Type;


m_out : out Loadstore1ToMmuType;
m_in : in MmuToLoadstore1Type;

dc_stall : in std_ulogic; dc_stall : in std_ulogic;
stall_out : out std_ulogic stall_out : out std_ulogic
); );
@ -38,7 +40,9 @@ architecture behave of loadstore1 is
SECOND_REQ, -- send 2nd request of unaligned xfer SECOND_REQ, -- send 2nd request of unaligned xfer
FIRST_ACK_WAIT, -- waiting for 1st ack from dcache FIRST_ACK_WAIT, -- waiting for 1st ack from dcache
LAST_ACK_WAIT, -- waiting for last ack from dcache LAST_ACK_WAIT, -- waiting for last ack from dcache
LD_UPDATE -- writing rA with computed addr on load LD_UPDATE, -- writing rA with computed addr on load
MMU_LOOKUP_1ST, -- waiting for MMU to look up translation
MMU_LOOKUP_LAST
); );


type reg_stage_t is record type reg_stage_t is record
@ -62,6 +66,7 @@ architecture behave of loadstore1 is
virt_mode : std_ulogic; virt_mode : std_ulogic;
priv_mode : std_ulogic; priv_mode : std_ulogic;
state : state_t; state : state_t;
first_bytes : std_ulogic_vector(7 downto 0);
second_bytes : std_ulogic_vector(7 downto 0); second_bytes : std_ulogic_vector(7 downto 0);
dar : std_ulogic_vector(63 downto 0); dar : std_ulogic_vector(63 downto 0);
dsisr : std_ulogic_vector(31 downto 0); dsisr : std_ulogic_vector(31 downto 0);
@ -146,6 +151,7 @@ begin
variable sprval : std_ulogic_vector(63 downto 0); variable sprval : std_ulogic_vector(63 downto 0);
variable exception : std_ulogic; variable exception : std_ulogic;
variable next_addr : std_ulogic_vector(63 downto 0); variable next_addr : std_ulogic_vector(63 downto 0);
variable mmureq : std_ulogic;
variable dsisr : std_ulogic_vector(31 downto 0); variable dsisr : std_ulogic_vector(31 downto 0);
begin begin
v := r; v := r;
@ -158,6 +164,7 @@ begin
sprval := (others => '0'); -- avoid inferred latches sprval := (others => '0'); -- avoid inferred latches
exception := '0'; exception := '0';
dsisr := (others => '0'); dsisr := (others => '0');
mmureq := '0';


write_enable := '0'; write_enable := '0';
do_update := '0'; do_update := '0';
@ -230,7 +237,7 @@ begin
req := '1'; req := '1';
v.dcbz := '1'; v.dcbz := '1';
when OP_TLBIE => when OP_TLBIE =>
req := '1'; mmureq := '1';
v.tlbie := '1'; v.tlbie := '1';
when OP_MFSPR => when OP_MFSPR =>
done := '1'; done := '1';
@ -282,18 +289,14 @@ begin
-- Do length_to_sel and work out if we are doing 2 dwords -- Do length_to_sel and work out if we are doing 2 dwords
long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
byte_sel := long_sel(7 downto 0); byte_sel := long_sel(7 downto 0);
v.first_bytes := byte_sel;
v.second_bytes := long_sel(15 downto 8); v.second_bytes := long_sel(15 downto 8);


v.addr := lsu_sum;

-- Do byte reversing and rotating for stores in the first cycle -- Do byte reversing and rotating for stores in the first cycle
byte_offset := "000"; byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000"; brev_lenm1 := "000";
if v.tlbie = '0' then if l_in.byte_reverse = '1' then
byte_offset := unsigned(lsu_sum(2 downto 0)); brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
end if; end if;
for i in 0 to 7 loop for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
@ -309,6 +312,10 @@ begin
v.state := SECOND_REQ; v.state := SECOND_REQ;
end if; end if;
end if; end if;
if mmureq = '1' then
stall := '1';
v.state := LAST_ACK_WAIT;
end if;
end if; end if;


when SECOND_REQ => when SECOND_REQ =>
@ -323,12 +330,19 @@ begin
if d_in.valid = '1' then if d_in.valid = '1' then
if d_in.error = '1' then if d_in.error = '1' then
-- dcache will discard the second request -- dcache will discard the second request
exception := '1'; addr := r.addr;
dsisr(30) := d_in.tlb_miss; if d_in.tlb_miss = '1' then
dsisr(63 - 36) := d_in.perm_error; -- give it to the MMU to look up
dsisr(63 - 38) := not r.load; mmureq := '1';
dsisr(63 - 45) := d_in.rc_error; v.state := MMU_LOOKUP_1ST;
v.state := IDLE; else
-- signal an interrupt straight away
exception := '1';
dsisr(63 - 36) := d_in.perm_error;
dsisr(63 - 38) := not r.load;
dsisr(63 - 45) := d_in.rc_error;
v.state := IDLE;
end if;
else else
v.state := LAST_ACK_WAIT; v.state := LAST_ACK_WAIT;
if r.load = '1' then if r.load = '1' then
@ -337,6 +351,32 @@ begin
end if; end if;
end if; end if;


when MMU_LOOKUP_1ST | MMU_LOOKUP_LAST =>
stall := '1';
if two_dwords = '1' and r.state = MMU_LOOKUP_LAST then
addr := next_addr;
byte_sel := r.second_bytes;
else
addr := r.addr;
byte_sel := r.first_bytes;
end if;
if m_in.done = '1' then
if m_in.error = '0' then
-- retry the request now that the MMU has installed a TLB entry
req := '1';
if r.state = MMU_LOOKUP_1ST then
v.state := SECOND_REQ;
else
v.state := LAST_ACK_WAIT;
end if;
else
exception := '1';
dsisr(63 - 33) := '1';
dsisr(63 - 38) := not r.load;
v.state := IDLE;
end if;
end if;

when LAST_ACK_WAIT => when LAST_ACK_WAIT =>
stall := '1'; stall := '1';
if d_in.valid = '1' then if d_in.valid = '1' then
@ -346,12 +386,18 @@ begin
else else
addr := r.addr; addr := r.addr;
end if; end if;
exception := '1'; if d_in.tlb_miss = '1' then
dsisr(30) := d_in.tlb_miss; -- give it to the MMU to look up
dsisr(63 - 36) := d_in.perm_error; mmureq := '1';
dsisr(63 - 38) := not r.load; v.state := MMU_LOOKUP_LAST;
dsisr(63 - 45) := d_in.rc_error; else
v.state := IDLE; -- signal an interrupt straight away
exception := '1';
dsisr(63 - 36) := d_in.perm_error;
dsisr(63 - 38) := not r.load;
dsisr(63 - 45) := d_in.rc_error;
v.state := IDLE;
end if;
else else
write_enable := r.load; write_enable := r.load;
if r.load = '1' and r.update = '1' then if r.load = '1' and r.update = '1' then
@ -366,6 +412,12 @@ begin
end if; end if;
end if; end if;
end if; end if;
if m_in.done = '1' then
-- tlbie is finished
stall := '0';
done := '1';
v.state := IDLE;
end if;


when LD_UPDATE => when LD_UPDATE =>
do_update := '1'; do_update := '1';
@ -376,7 +428,6 @@ begin
-- Update outputs to dcache -- Update outputs to dcache
d_out.valid <= req; d_out.valid <= req;
d_out.load <= v.load; d_out.load <= v.load;
d_out.tlbie <= v.tlbie;
d_out.dcbz <= v.dcbz; d_out.dcbz <= v.dcbz;
d_out.nc <= v.nc; d_out.nc <= v.nc;
d_out.reserve <= v.reserve; d_out.reserve <= v.reserve;
@ -386,6 +437,12 @@ begin
d_out.virt_mode <= v.virt_mode; d_out.virt_mode <= v.virt_mode;
d_out.priv_mode <= v.priv_mode; d_out.priv_mode <= v.priv_mode;


-- Update outputs to MMU
m_out.valid <= mmureq;
m_out.tlbie <= v.tlbie;
m_out.addr <= addr;
m_out.rs <= l_in.data;

-- Update outputs to writeback -- Update outputs to writeback
-- Multiplex either cache data to the destination GPR or -- Multiplex either cache data to the destination GPR or
-- the address for the rA update. -- the address for the rA update.

@ -25,6 +25,7 @@ filesets:
- control.vhdl - control.vhdl
- execute1.vhdl - execute1.vhdl
- loadstore1.vhdl - loadstore1.vhdl
- mmu.vhdl
- dcache.vhdl - dcache.vhdl
- multiply.vhdl - multiply.vhdl
- divider.vhdl - divider.vhdl

@ -0,0 +1,109 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.common.all;

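-- Radix MMU
-- Intended to support 4-level trees as in arch 3.0B, but not the two-step
-- translation for guests under a hypervisor (i.e. there is no gRA -> hRA
-- translation).
-- NOTE: the tree walk is not implemented yet.  For now this module only
-- forwards tlbie requests to the dcache and answers every TLB-miss lookup
-- with an error.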

-- MMU entity.  Takes requests from loadstore1 and issues requests to the
-- dcache; each side is a request record plus a response record.
entity mmu is
port (
-- clock; all state is updated on the rising edge
clk : in std_ulogic;
-- synchronous reset; returns the state machine to IDLE
rst : in std_ulogic;

-- request from loadstore1 (tlbie or TLB-miss lookup)
l_in : in Loadstore1ToMmuType;
-- completion/error status back to loadstore1
l_out : out MmuToLoadstore1Type;

-- request to the dcache (currently only used for tlbie)
d_out : out MmuToDcacheType;
-- completion status from the dcache
d_in : in DcacheToMmuType
);
end mmu;

architecture behave of mmu is

-- MMU state machine:
--   IDLE           waiting for a request from loadstore1
--   TLBIE_WAIT     tlbie has been passed to the dcache, waiting for d_in.done
--   RADIX_LOOKUP_0 TLB-miss lookup; currently completes immediately with
--                  an error
type state_t is (IDLE,
TLBIE_WAIT,
RADIX_LOOKUP_0
);

-- Registered state of the MMU.
type reg_stage_t is record
-- latched request from loadstore1
-- (valid and addr are re-latched from l_in every cycle and are currently
-- only read by the simulation report in the clocked process)
valid : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
-- current state of the state machine
state : state_t;
end record;

-- r is the registered state, rin the next-state value computed
-- combinationally and clocked into r.
signal r, rin : reg_stage_t;

begin

-- State register for the MMU.
-- On every rising clock edge the next-state value rin (computed by the
-- combinational process) is copied into r.  A synchronous reset returns the
-- state machine to IDLE and clears the latched valid bit; r.addr is left
-- unreset since it is only meaningful together with r.valid.
--
-- The report statements are simulation-only diagnostics.  Requests and
-- completions are labelled according to whether they are a tlbie or a
-- TLB-miss lookup, so that tlbie traffic is not logged as a "tlb miss".
mmu_0: process(clk)
begin
    if rising_edge(clk) then
        if rst = '1' then
            r.state <= IDLE;
            r.valid <= '0';
        else
            -- rin.valid mirrors l_in.valid, so l_in.tlbie describes the
            -- same request that is being latched here.
            if rin.valid = '1' then
                if l_in.tlbie = '1' then
                    report "MMU got tlbie for " & to_hstring(rin.addr);
                else
                    report "MMU got tlb miss for " & to_hstring(rin.addr);
                end if;
            end if;
            -- done is only raised from TLBIE_WAIT (tlbie finished) or from
            -- RADIX_LOOKUP_0 (lookup finished), so the current state tells
            -- us which kind of operation is completing.
            if l_out.done = '1' then
                if r.state = TLBIE_WAIT then
                    report "MMU completing tlbie";
                else
                    report "MMU completing miss with error=" & std_ulogic'image(l_out.error);
                end if;
            end if;
            r <= rin;
        end if;
    end if;
end process;

-- Combinational next-state and output logic for the MMU.
-- Computes rin from the current state r and the inputs, and drives the
-- response to loadstore1 and the request to the dcache.
mmu_1: process(all)
    variable nxt      : reg_stage_t;
    variable dc_valid : std_ulogic;
    variable finished : std_ulogic;
    variable failed   : std_ulogic;
begin
    -- Defaults: re-latch the incoming request, hold the current state,
    -- and issue no request or completion.
    nxt := (valid => l_in.valid, addr => l_in.addr, state => r.state);
    dc_valid := '0';
    finished := '0';
    failed := '0';

    case r.state is
        when IDLE =>
            if l_in.valid = '1' and l_in.tlbie = '1' then
                -- tlbie: hand it to the dcache and wait for it to finish
                dc_valid := '1';
                nxt.state := TLBIE_WAIT;
            elsif l_in.valid = '1' then
                -- anything else is a TLB-miss lookup
                nxt.state := RADIX_LOOKUP_0;
            end if;

        when TLBIE_WAIT =>
            -- stay here until the dcache reports the tlbie as done
            if d_in.done = '1' then
                finished := '1';
                nxt.state := IDLE;
            end if;

        when RADIX_LOOKUP_0 =>
            -- no lookup is performed: complete straight away with an error
            finished := '1';
            failed := '1';
            nxt.state := IDLE;
    end case;

    -- request to the dcache; payload fields are passed straight through
    -- from the loadstore1 request
    d_out.valid <= dc_valid;
    d_out.tlbie <= l_in.tlbie;
    d_out.addr <= l_in.addr;
    d_out.pte <= l_in.rs;

    -- response to loadstore1
    l_out.done <= finished;
    l_out.error <= failed;

    -- next value of the state register
    rin <= nxt;
end process;
end;
Loading…
Cancel
Save