MMU: Implement radix page table machinery

This adds the necessary machinery to the MMU for it to do radix page
table walks.  The core elements are a shifter that can shift the
address right by between 0 and 47 bits, a mask generator that can
generate a mask of between 5 and 16 bits, a final mask generator,
and new states in the state machine.

(The final mask generator is used for transferring bits of the
original address into the resulting TLB entry when the leaf PTE
corresponds to a page size larger than 4kB.)

The hardware does not implement a partition table or a process table.
Software is expected to load the appropriate process table entry
into a new SPR called PGTBL0, SPR 720.  The contents should be
formatted as described in Book III section 5.7.6.2 of the Power ISA
v3.0B.  PGTBL0 is set to 0 on hard reset.  At present, the top two bits
of the address (the quadrant) are ignored.

There is currently no caching of any step in the translation process
or of the final result, other than the entry created in the dTLB.
That entry is a 4k page entry even if the leaf PTE found in the walk
corresponds to a larger page size.

This implementation can handle almost any page table layout and any
page size.  The RTS field (in PGTBL0) can have any value between 0
and 31, corresponding to a total address space size between 2^31
and 2^62 bytes.  The RPDS field of PGTBL0 can be any value between
5 and 16, except that a value of 0 is taken to disable radix page
table walking (for use when one is using software loading of TLB
entries).  The NLS field of the page directory entries can have any
value between 5 and 16.  The minimum page size is 4kB, meaning that
the sum of RPDS and the NLS values of the PDEs found on the path to
a leaf PTE must be less than or equal to RTS + 31 - 12.

The PGTBL0 SPR is in the mmu module; thus this adds a path for
loadstore1 to read and write SPRs in mmu.  This adds code in dcache
to service doubleword read requests from the MMU, as well as requests
to write dTLB entries.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/169/head
Paul Mackerras 5 years ago
parent 8160f4f821
commit 4e6fc6811a

@ -39,6 +39,7 @@ package common is
constant SPR_SPRG3U : spr_num_t := 259;
constant SPR_HSPRG0 : spr_num_t := 304;
constant SPR_HSPRG1 : spr_num_t := 305;
constant SPR_PGTBL0 : spr_num_t := 720;

-- GPR indices in the register file (GPR only)
subtype gpr_index_t is std_ulogic_vector(4 downto 0);
@ -269,18 +270,23 @@ package common is
-- Signals driven by loadstore1 into the MMU: translation/invalidate
-- requests plus the write path for SPRs that live in the MMU.
type Loadstore1ToMmuType is record
-- loadstore1 is presenting a request to the MMU
valid : std_ulogic;
-- the request is a TLB invalidate rather than a translation request
tlbie : std_ulogic;
-- write rs into an SPR held in the MMU
mtspr : std_ulogic;
-- low 4 bits of the SPR number from loadstore1
sprn : std_ulogic_vector(3 downto 0);
-- address to be translated (or passed to dcache for a tlbie)
addr : std_ulogic_vector(63 downto 0);
-- data operand: new SPR value for mtspr, forwarded as d_out.pte for tlbie
rs : std_ulogic_vector(63 downto 0);
end record;

-- Completion status and SPR read data returned by the MMU to loadstore1.
-- NOTE(review): "done" is declared twice and "error" sits next to what look
-- like its replacements (invalid/badtree); this listing appears to interleave
-- removed and added lines of a diff -- confirm against the real file.
type MmuToLoadstore1Type is record
done : std_ulogic;
error : std_ulogic;
-- the MMU has finished the current operation
done : std_ulogic;
-- no valid translation was found; loadstore1 copies this to dsisr(63 - 33)
invalid : std_ulogic;
-- the page table tree was malformed or a table read returned an error;
-- loadstore1 copies this to dsisr(63 - 44)
badtree : std_ulogic;
-- value of the SPR held in the MMU (driven from r.pgtbl0)
sprval : std_ulogic_vector(63 downto 0);
end record;

-- Requests from the MMU to the dcache.  With valid set, tlbie selects a
-- TLB invalidate and tlbld a dTLB entry write; with neither set the dcache
-- treats the request as a load (r0.req.load <= not (tlbie or tlbld)).
type MmuToDcacheType is record
-- a request is being presented this cycle
valid : std_ulogic;
-- request is a TLB invalidate
tlbie : std_ulogic;
-- request is a write of a dTLB entry
tlbld : std_ulogic;
-- address of the request: table entry address for a read, 4kB-aligned
-- address for a TLB load, or the loadstore1 address for a tlbie
addr : std_ulogic_vector(63 downto 0);
-- PTE value to install for a TLB load (loadstore1's rs for a tlbie,
-- zero for a read)
pte : std_ulogic_vector(63 downto 0);
end record;
@ -288,6 +294,8 @@ package common is
-- Responses from the dcache back to the MMU.
type DcacheToMmuType is record
-- NOTE(review): the driver of stall is not visible in this excerpt
stall : std_ulogic;
-- the MMU's request (tlbie, TLB load or doubleword read) has completed
done : std_ulogic;
-- the MMU's read completed with an error
err : std_ulogic;
-- doubleword read data (cache hit data, or the slow data latch on a miss)
data : std_ulogic_vector(63 downto 0);
end record;

type Loadstore1ToWritebackType is record

@ -209,6 +209,8 @@ architecture rtl of dcache is
-- Stage 0 register: the latched incoming request plus flags describing
-- which kind of MMU operation (if any) it is.
type reg_stage_0_t is record
-- the request itself, either copied from loadstore1 (d_in) or built
-- from the MMU's request (m_in)
req : Loadstore1ToDcacheType;
-- request is a TLB invalidate from the MMU
tlbie : std_ulogic;
-- request is a dTLB entry write from the MMU
tlbld : std_ulogic;
-- '1' when the request was taken from the MMU, '0' when from loadstore1
mmu_req : std_ulogic; -- indicates source of request
end record;

signal r0 : reg_stage_0_t;
@ -220,6 +222,7 @@ architecture rtl of dcache is
type reg_stage_1_t is record
-- Latch the complete request from ls1
req : Loadstore1ToDcacheType;
mmu_req : std_ulogic;

-- Cache hit state
hit_way : way_t;
@ -444,7 +447,7 @@ begin
"request collision loadstore vs MMU";
if m_in.valid = '1' then
r0.req.valid <= '1';
r0.req.load <= '0';
r0.req.load <= not (m_in.tlbie or m_in.tlbld);
r0.req.dcbz <= '0';
r0.req.nc <= '0';
r0.req.reserve <= '0';
@ -454,10 +457,13 @@ begin
r0.req.data <= m_in.pte;
r0.req.byte_sel <= (others => '1');
r0.tlbie <= m_in.tlbie;
assert m_in.tlbie = '1' report "unknown request from MMU";
r0.tlbld <= m_in.tlbld;
r0.mmu_req <= '1';
else
r0.req <= d_in;
r0.tlbie <= '0';
r0.tlbld <= '0';
r0.mmu_req <= '0';
end if;
end if;
end if;
@ -549,7 +555,11 @@ begin
end loop;
tlb_hit <= hit and r0_valid;
tlb_hit_way <= hitway;
pte <= read_tlb_pte(hitway, tlb_pte_way);
if tlb_hit = '1' then
pte <= read_tlb_pte(hitway, tlb_pte_way);
else
pte <= (others => '0');
end if;
valid_ra <= tlb_hit or not r0.req.virt_mode;
if r0.req.virt_mode = '1' then
ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
@ -573,7 +583,7 @@ begin
if rising_edge(clk) then
tlbie := '0';
tlbia := '0';
tlbwe := '0';
tlbwe := r0_valid and r0.tlbld;
if r0_valid = '1' and r0.tlbie = '1' then
if r0.req.addr(11 downto 10) /= "00" then
tlbia := '1';
@ -607,7 +617,6 @@ begin
dtlb_ptes(tlb_req_index) <= pteset;
dtlb_valids(tlb_req_index)(repl_way) <= '1';
end if;
m_out.done <= r0_valid and r0.tlbie;
end if;
end process;

@ -669,7 +678,7 @@ begin
req_tag <= get_tag(ra);

-- Only do anything if not being stalled by stage 1
go := r0_valid and not r0.tlbie;
go := r0_valid and not (r0.tlbie or r0.tlbld);

-- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed
@ -824,6 +833,11 @@ begin
d_out.perm_error <= '0';
d_out.rc_error <= '0';

-- Outputs to MMU
m_out.done <= r1.tlbie_done;
m_out.err <= '0';
m_out.data <= cache_out(r1.hit_way);

-- We have a valid load or store hit or we just completed a slow
-- op such as a load miss, a NC load or a store
--
@ -842,40 +856,65 @@ begin
"unexpected hit_load_delayed collision with slow_valid"
severity FAILURE;

-- Load hit case is the standard path
if r1.hit_load_valid = '1' then
report "completing load hit";
d_out.valid <= '1';
end if;
if r1.mmu_req = '0' then
-- Request came from loadstore1...
-- Load hit case is the standard path
if r1.hit_load_valid = '1' then
report "completing load hit";
d_out.valid <= '1';
end if;

-- error cases complete without stalling
if r1.error_done = '1' then
report "completing ld/st with error";
d_out.error <= '1';
d_out.tlb_miss <= r1.tlb_miss;
d_out.perm_error <= r1.perm_error;
d_out.rc_error <= r1.rc_error;
d_out.valid <= '1';
end if;
-- error cases complete without stalling
if r1.error_done = '1' then
report "completing ld/st with error";
d_out.error <= '1';
d_out.tlb_miss <= r1.tlb_miss;
d_out.perm_error <= r1.perm_error;
d_out.rc_error <= r1.rc_error;
d_out.valid <= '1';
end if;

-- Slow ops (load miss, NC, stores)
if r1.slow_valid = '1' then
-- If it's a load, enable register writeback and switch
-- mux accordingly
--
if r1.req.load then
-- Read data comes from the slow data latch
d_out.data <= r1.slow_data;
end if;
d_out.store_done <= '1';
-- Slow ops (load miss, NC, stores)
if r1.slow_valid = '1' then
-- If it's a load, enable register writeback and switch
-- mux accordingly
--
if r1.req.load then
-- Read data comes from the slow data latch
d_out.data <= r1.slow_data;
end if;
d_out.store_done <= '1';

report "completing store or load miss";
d_out.valid <= '1';
end if;
report "completing store or load miss";
d_out.valid <= '1';
end if;

if r1.stcx_fail = '1' then
d_out.store_done <= '0';
d_out.valid <= '1';
end if;

else
-- Request came from MMU
if r1.hit_load_valid = '1' then
report "completing load hit to MMU, data=" & to_hstring(m_out.data);
m_out.done <= '1';
end if;

if r1.stcx_fail = '1' then
d_out.store_done <= '0';
d_out.valid <= '1';
-- error cases complete without stalling
if r1.error_done = '1' then
report "completing MMU ld with error";
m_out.err <= '1';
m_out.done <= '1';
end if;

-- Slow ops (i.e. load miss)
if r1.slow_valid = '1' then
-- Read data comes from the slow data latch
m_out.data <= r1.slow_data;
report "completing MMU load miss, data=" & to_hstring(m_out.data);
m_out.done <= '1';
end if;
end if;

end process;
@ -978,6 +1017,7 @@ begin

if req_op /= OP_NONE and stall_out = '0' then
r1.req <= r0.req;
r1.mmu_req <= r0.mmu_req;
report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(r0.req.addr) &
" nc:" & std_ulogic'image(r0.req.nc) &
@ -995,8 +1035,8 @@ begin
end if;

if req_op = OP_BAD then
report "Signalling ld/st error valid_ra=" & " rc_ok=" & std_ulogic'image(rc_ok) &
" perm_ok=" & std_ulogic'image(perm_ok);
report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) &
" rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok);
r1.error_done <= '1';
r1.tlb_miss <= not valid_ra;
r1.perm_error <= valid_ra and not perm_ok;
@ -1005,8 +1045,8 @@ begin
r1.error_done <= '0';
end if;

-- complete tlbies in the third cycle
r1.tlbie_done <= r0_valid and r0.tlbie;
-- complete tlbies and TLB loads in the third cycle
r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld);
end if;
end process;


@ -438,7 +438,7 @@ begin
v.decode.sgl_pipe := '1';
-- send MMU-related SPRs to loadstore1
case sprn is
when SPR_DAR | SPR_DSISR =>
when SPR_DAR | SPR_DSISR | SPR_PGTBL0 =>
v.decode.unit := LDST;
when others =>
end case;

@ -153,6 +153,7 @@ begin
variable next_addr : std_ulogic_vector(63 downto 0);
variable mmureq : std_ulogic;
variable dsisr : std_ulogic_vector(31 downto 0);
variable mmu_mtspr : std_ulogic;
begin
v := r;
req := '0';
@ -161,6 +162,8 @@ begin
byte_sel := (others => '0');
addr := lsu_sum;
mfspr := '0';
mmu_mtspr := '0';
sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10));
sprval := (others => '0'); -- avoid inferred latches
exception := '0';
dsisr := (others => '0');
@ -244,19 +247,27 @@ begin
mfspr := '1';
-- partial decode on SPR number should be adequate given
-- the restricted set that get sent down this path
sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10));
if sprn(0) = '0' then
sprval := x"00000000" & r.dsisr;
if sprn(9) = '0' then
if sprn(0) = '0' then
sprval := x"00000000" & r.dsisr;
else
sprval := r.dar;
end if;
else
sprval := r.dar;
-- reading one of the SPRs in the MMU
sprval := m_in.sprval;
end if;
when OP_MTSPR =>
done := '1';
sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10));
if sprn(0) = '0' then
v.dsisr := l_in.data(31 downto 0);
if sprn(9) = '0' then
if sprn(0) = '0' then
v.dsisr := l_in.data(31 downto 0);
else
v.dar := l_in.data;
end if;
else
v.dar := l_in.data;
-- writing one of the SPRs in the MMU
mmu_mtspr := '1';
end if;
when others =>
assert false report "unknown op sent to loadstore1";
@ -361,7 +372,7 @@ begin
byte_sel := r.first_bytes;
end if;
if m_in.done = '1' then
if m_in.error = '0' then
if m_in.invalid = '0' and m_in.badtree = '0' then
-- retry the request now that the MMU has installed a TLB entry
req := '1';
if r.state = MMU_LOOKUP_1ST then
@ -371,8 +382,9 @@ begin
end if;
else
exception := '1';
dsisr(63 - 33) := '1';
dsisr(63 - 33) := m_in.invalid;
dsisr(63 - 38) := not r.load;
dsisr(63 - 44) := m_in.badtree;
v.state := IDLE;
end if;
end if;
@ -440,6 +452,8 @@ begin
-- Update outputs to MMU
m_out.valid <= mmureq;
m_out.tlbie <= v.tlbie;
m_out.mtspr <= mmu_mtspr;
m_out.sprn <= sprn(3 downto 0);
m_out.addr <= addr;
m_out.rs <= l_in.data;


@ -25,20 +25,37 @@ end mmu;
architecture behave of mmu is

-- States of the MMU state machine.
-- NOTE(review): TLBIE_WAIT and RADIX_LOOKUP_0 look like the pre-change names
-- of TLB_WAIT and RADIX_LOOKUP left over from a diff listing (there is also
-- no comma after RADIX_LOOKUP_0) -- confirm against the real file.
-- IDLE: waiting for a request or SPR write from loadstore1
type state_t is (IDLE,
TLBIE_WAIT,
RADIX_LOOKUP_0
-- waiting for dcache to signal completion of a tlbie or a TLB load
TLB_WAIT,
-- issue a dcache read of the next page table entry
RADIX_LOOKUP,
-- waiting for that dcache read to complete
RADIX_READ_WAIT,
-- ask dcache to write the dTLB entry built from the leaf PTE
RADIX_LOAD_TLB,
-- complete the request with invalid = '1'
RADIX_NO_TRANS,
-- complete the request with badtree = '1'
RADIX_BAD_TREE
);

-- Registered state of the MMU: the latched request, the PGTBL0 SPR and
-- the working values of the radix tree walk.
type reg_stage_t is record
-- latched request from loadstore1
-- set for one cycle when a translation request is accepted in IDLE
valid : std_ulogic;
-- address being translated, captured when a request is accepted
addr : std_ulogic_vector(63 downto 0);
-- internal state
state : state_t;
-- PGTBL0 SPR contents; cleared on reset, written when l_in.mtspr = '1'
pgtbl0 : std_ulogic_vector(63 downto 0);
-- right-shift applied to address bits 61..12 to extract the index for
-- the current tree level; also sets how many bits finalmask covers
shift : unsigned(5 downto 0);
-- number of index bits at the current tree level (drives the mask generator)
mask_size : unsigned(4 downto 0);
-- base address of the current page table/directory (low 8 bits zero)
pgbase : std_ulogic_vector(55 downto 0);
-- most recent entry read from the tree, after byte-swapping
pde : std_ulogic_vector(63 downto 0);
end record;

signal r, rin : reg_stage_t;

signal addrsh : std_ulogic_vector(15 downto 0);
signal mask : std_ulogic_vector(15 downto 0);
signal finalmask : std_ulogic_vector(43 downto 0);

begin
-- Multiplex internal SPR values back to loadstore1, selected
-- by l_in.sprn. Easy when there's only one...
l_out.sprval <= r.pgtbl0;

mmu_0: process(clk)
begin
@ -46,64 +63,237 @@ begin
if rst = '1' then
r.state <= IDLE;
r.valid <= '0';
r.pgtbl0 <= (others => '0');
else
if rin.valid = '1' then
report "MMU got tlb miss for " & to_hstring(rin.addr);
end if;
if l_out.done = '1' then
report "MMU completing miss with error=" & std_ulogic'image(l_out.error);
report "MMU completing op with invalid=" & std_ulogic'image(l_out.invalid) &
" badtree=" & std_ulogic'image(l_out.badtree);
end if;
if rin.state = RADIX_LOOKUP then
report "radix lookup shift=" & integer'image(to_integer(rin.shift)) &
" msize=" & integer'image(to_integer(rin.mask_size));
end if;
if r.state = RADIX_LOOKUP then
report "send load addr=" & to_hstring(d_out.addr) &
" addrsh=" & to_hstring(addrsh) & " mask=" & to_hstring(mask);
end if;
r <= rin;
end if;
end if;
end process;

-- Three-stage shifter for page table index extraction.  Address bits
-- 61 downto 12 are shifted right by r.shift (0 to 47) and the low
-- 16 bits of the shifted value are driven onto addrsh.
addrshifter: process(all)
    variable coarse : std_ulogic_vector(30 downto 0);
    variable medium : std_ulogic_vector(18 downto 0);
    variable fine   : std_ulogic_vector(15 downto 0);
begin
    -- Stage 1: shift by 0, 16 or 32 as selected by r.shift(5 downto 4)
    case r.shift(5 downto 4) is
        when "00" =>
            coarse := r.addr(42 downto 12);
        when "01" =>
            coarse := r.addr(58 downto 28);
        when others =>
            -- only 18 address bits are left; the upper 13 bits are zero
            coarse := (others => '0');
            coarse(17 downto 0) := r.addr(61 downto 44);
    end case;

    -- Stage 2: shift by 0, 4, 8 or 12 as selected by r.shift(3 downto 2)
    case r.shift(3 downto 2) is
        when "00" =>
            medium := coarse(18 downto 0);
        when "01" =>
            medium := coarse(22 downto 4);
        when "10" =>
            medium := coarse(26 downto 8);
        when others =>
            medium := coarse(30 downto 12);
    end case;

    -- Stage 3: shift by 0 to 3 as selected by r.shift(1 downto 0)
    case r.shift(1 downto 0) is
        when "00" =>
            fine := medium(15 downto 0);
        when "01" =>
            fine := medium(16 downto 1);
        when "10" =>
            fine := medium(17 downto 2);
        when others =>
            fine := medium(18 downto 3);
    end case;

    addrsh <= fine;
end process;

-- Generate the 16-bit mask that selects the index field of the shifted
-- address when forming a page table entry address.  The low five bits
-- are always set (r.mask_size has to be >= 5); each of bits 5 to 15 is
-- set when its position is below r.mask_size.
addrmaskgen: process(all)
    variable field_mask : std_ulogic_vector(15 downto 0);
begin
    field_mask := (4 downto 0 => '1', others => '0');
    for idx in 5 to 15 loop
        if to_integer(r.mask_size) > idx then
            field_mask(idx) := '1';
        end if;
    end loop;
    mask <= field_mask;
end process;

-- Generate the 44-bit mask of address bits that are copied from the
-- original address into the TLB entry, so that leaf PTEs for pages
-- larger than 4kB can be installed.  Bit i is set when i < r.shift.
finalmaskgen: process(all)
    variable page_mask : std_ulogic_vector(43 downto 0);
begin
    for idx in 0 to 43 loop
        if to_integer(r.shift) > idx then
            page_mask(idx) := '1';
        else
            page_mask(idx) := '0';
        end if;
    end loop;
    finalmask <= page_mask;
end process;

-- Combinational next-state and output logic for the MMU state machine.
-- Computes the next register values (rin) from the current ones (r) and
-- the inputs from loadstore1 (l_in) and dcache (d_in), and drives the
-- responses to loadstore1 (l_out) and the requests to dcache (d_out).
-- NOTE(review): this listing appears to interleave removed and added lines
-- of a diff (for example the adjacent "when TLBIE_WAIT" / "when TLB_WAIT"
-- choices below); confirm against the real file before relying on it.
mmu_1: process(all)
variable v : reg_stage_t;
variable dcreq : std_ulogic;
variable done : std_ulogic;
variable err : std_ulogic;
variable invalid : std_ulogic;
variable badtree : std_ulogic;
variable tlb_load : std_ulogic;
variable tlbie_req : std_ulogic;
variable rts : unsigned(5 downto 0);
variable mbits : unsigned(5 downto 0);
variable pgtable_addr : std_ulogic_vector(63 downto 0);
variable pte : std_ulogic_vector(63 downto 0);
variable data : std_ulogic_vector(63 downto 0);
begin
-- NOTE(review): the next three assignments are overwritten by "v := r"
-- immediately below; they look like leftover pre-change lines.
v.valid := l_in.valid;
v.addr := l_in.addr;
v.state := r.state;
-- start from the current register contents; valid and all the request
-- and status strobes default to inactive each evaluation
v := r;
v.valid := '0';
dcreq := '0';
done := '0';
err := '0';
invalid := '0';
badtree := '0';
tlb_load := '0';
tlbie_req := '0';

-- Radix tree data structures in memory are big-endian,
-- so we need to byte-swap them
for i in 0 to 7 loop
data(i * 8 + 7 downto i * 8) := d_in.data((7 - i) * 8 + 7 downto (7 - i) * 8);
end loop;

case r.state is
when IDLE =>
-- While idle, keep the walk parameters primed from PGTBL0 so that a
-- lookup can start as soon as a request arrives.
-- rts == radix tree size, # address bits being translated
-- (RTS field + 31, less the 12 offset bits of a 4kB page)
rts := unsigned('0' & r.pgtbl0(62 downto 61) & r.pgtbl0(7 downto 5)) + (31 - 12);
-- mbits == # address bits to index top level of tree
mbits := unsigned('0' & r.pgtbl0(4 downto 0));
v.shift := rts - mbits;
v.mask_size := mbits(4 downto 0);
v.pgbase := r.pgtbl0(55 downto 8) & x"00";

if l_in.valid = '1' then
v.addr := l_in.addr;
if l_in.tlbie = '1' then
-- forward the invalidate to dcache and wait for it to finish
dcreq := '1';
v.state := TLBIE_WAIT;
tlbie_req := '1';
v.state := TLB_WAIT;
else
v.state := RADIX_LOOKUP_0;
v.valid := '1';
-- for now, take RPDS = 0 to disable radix translation
if mbits = 0 then
v.state := RADIX_NO_TRANS;
elsif mbits < 5 or mbits > 16 or mbits > rts then
-- top-level index size out of range, or larger than the tree size
v.state := RADIX_BAD_TREE;
else
v.state := RADIX_LOOKUP;
end if;
end if;
end if;
-- SPR writes from loadstore1 are only acted on in this state
if l_in.mtspr = '1' then
v.pgtbl0 := l_in.rs;
end if;

when TLBIE_WAIT =>
when TLB_WAIT =>
-- dcache is performing a tlbie or TLB load; report completion
-- to loadstore1 when it signals done
if d_in.done = '1' then
done := '1';
v.state := IDLE;
end if;

when RADIX_LOOKUP_0 =>
when RADIX_LOOKUP =>
-- request a read from dcache; with neither tlbie_req nor tlb_load
-- set, d_out.addr below is pgtable_addr
dcreq := '1';
v.state := RADIX_READ_WAIT;

when RADIX_READ_WAIT =>
if d_in.done = '1' then
if d_in.err = '0' then
v.pde := data;
-- test valid bit
if data(63) = '1' then
-- test leaf bit
if data(62) = '1' then
v.state := RADIX_LOAD_TLB;
else
-- directory entry: its low 5 bits give the index size of the
-- next level, bits 55..8 the base address of the next table
mbits := unsigned('0' & data(4 downto 0));
if mbits < 5 or mbits > 16 or mbits > r.shift then
v.state := RADIX_BAD_TREE;
else
v.shift := v.shift - mbits;
v.mask_size := mbits(4 downto 0);
v.pgbase := data(55 downto 8) & x"00";
v.state := RADIX_LOOKUP;
end if;
end if;
else
-- non-present PTE, generate a DSI
v.state := RADIX_NO_TRANS;
end if;
else
-- the dcache read itself failed
v.state := RADIX_BAD_TREE;
end if;
end if;

when RADIX_LOAD_TLB =>
-- ask dcache to install the TLB entry built from the leaf PTE
tlb_load := '1';
dcreq := '1';
v.state := TLB_WAIT;

when RADIX_NO_TRANS =>
done := '1';
invalid := '1';
v.state := IDLE;

when RADIX_BAD_TREE =>
done := '1';
err := '1';
badtree := '1';
v.state := IDLE;
end case;

-- Address of the next table entry: the index bits (addrsh and mask)
-- replace the corresponding low bits of pgbase(18 downto 3); the
-- trailing "000" makes the address doubleword (8-byte) aligned.
pgtable_addr := x"00" & r.pgbase(55 downto 19) &
((r.pgbase(18 downto 3) and not mask) or (addrsh and mask)) &
"000";
-- PTE for the TLB entry: bits of the page number covered by finalmask
-- (those below r.shift) come from the original address rather than the
-- leaf PTE; the top 8 bits are forced to zero.
pte := x"00" &
((r.pde(55 downto 12) and not finalmask) or (r.addr(55 downto 12) and finalmask))
& r.pde(11 downto 0);

-- update registers
rin <= v;

-- drive outputs
l_out.done <= done;
l_out.error <= err;
l_out.invalid <= invalid;
l_out.badtree <= badtree;

d_out.valid <= dcreq;
-- NOTE(review): the next three assignments are superseded by the later
-- assignments to the same signals in this process; they look like
-- leftover pre-change lines.
d_out.tlbie <= l_in.tlbie;
d_out.addr <= l_in.addr;
d_out.pte <= l_in.rs;
d_out.tlbie <= tlbie_req;
d_out.tlbld <= tlb_load;
-- select address/PTE according to the kind of dcache request
if tlbie_req = '1' then
d_out.addr <= l_in.addr;
d_out.pte <= l_in.rs;
elsif tlb_load = '1' then
-- TLB entries are created for the 4kB page containing r.addr
d_out.addr <= r.addr(63 downto 12) & x"000";
d_out.pte <= pte;
else
d_out.addr <= pgtable_addr;
d_out.pte <= (others => '0');
end if;
end process;
end;

Loading…
Cancel
Save