dcache: Implement data TLB

This adds a TLB to dcache, providing the ability to translate
addresses for loads and stores.  No protection mechanism has been
implemented yet.  The MSR_DR bit controls whether addresses are
translated through the TLB.

The TLB is a fixed-pagesize, set-associative cache.  Currently
the page size is 4kB and the TLB is 2-way set associative with 64
entries per set.

This implements the tlbie instruction.  RB bits 10 and 11 control
whether the whole TLB is invalidated (if either bit is 1) or just
a single entry corresponding to the effective page number in bits
12-63 of RB.

As an extension until we get a hardware page table walk, a tlbie
instruction with RB bits 9-11 set to 001 will load an entry into
the TLB.  The TLB entry value is in RS in the format of a radix PTE.

Currently there is no proper handling of TLB misses.  The load or
store will not be performed but no interrupt is generated.

In order to make timing at 100MHz on the Arty A7-100, we compare
the real address from each way of the TLB with the tag from each way
of the cache in parallel (requiring # TLB ways * # cache ways
comparators).  Then the result is selected based on which way hit in
the TLB.  That avoids a timing path going through the TLB EA
comparators, the multiplexer that selects the RA, and the cache tag
comparators.

The hack where addresses of the form 0xc------- are marked as
cache-inhibited is kept for now but restricted to real-mode accesses.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/169/head
Paul Mackerras 5 years ago
parent 635e316f9b
commit 750b3a8e28

@ -216,7 +216,7 @@ package common is


type Execute1ToLoadstore1Type is record type Execute1ToLoadstore1Type is record
valid : std_ulogic; valid : std_ulogic;
op : insn_type_t; -- what ld/st or m[tf]spr to do op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do
addr1 : std_ulogic_vector(63 downto 0); addr1 : std_ulogic_vector(63 downto 0);
addr2 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0);
data : std_ulogic_vector(63 downto 0); -- data to write, unused for read data : std_ulogic_vector(63 downto 0); -- data to write, unused for read
@ -231,18 +231,21 @@ package common is
reserve : std_ulogic; -- set for larx/stcx. reserve : std_ulogic; -- set for larx/stcx.
rc : std_ulogic; -- set for stcx. rc : std_ulogic; -- set for stcx.
spr_num : spr_num_t; -- SPR number for mfspr/mtspr spr_num : spr_num_t; -- SPR number for mfspr/mtspr
virt_mode : std_ulogic; -- do translation through TLB
end record; end record;
constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
sign_extend => '0', update => '0', xerc => xerc_init, sign_extend => '0', update => '0', xerc => xerc_init,
reserve => '0', rc => '0', reserve => '0', rc => '0', virt_mode => '0',
spr_num => 0, others => (others => '0')); spr_num => 0, others => (others => '0'));


type Loadstore1ToDcacheType is record type Loadstore1ToDcacheType is record
valid : std_ulogic; valid : std_ulogic;
load : std_ulogic; load : std_ulogic; -- is this a load
tlbie : std_ulogic; -- is this a tlbie
dcbz : std_ulogic; dcbz : std_ulogic;
nc : std_ulogic; nc : std_ulogic;
reserve : std_ulogic; reserve : std_ulogic;
virt_mode : std_ulogic;
addr : std_ulogic_vector(63 downto 0); addr : std_ulogic_vector(63 downto 0);
data : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0);
byte_sel : std_ulogic_vector(7 downto 0); byte_sel : std_ulogic_vector(7 downto 0);
@ -253,6 +256,7 @@ package common is
data : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0);
store_done : std_ulogic; store_done : std_ulogic;
error : std_ulogic; error : std_ulogic;
tlb_miss : std_ulogic;
end record; end record;


type Loadstore1ToWritebackType is record type Loadstore1ToWritebackType is record

@ -25,7 +25,13 @@ entity dcache is
-- Number of lines in a set -- Number of lines in a set
NUM_LINES : positive := 32; NUM_LINES : positive := 32;
-- Number of ways -- Number of ways
NUM_WAYS : positive := 4 NUM_WAYS : positive := 4;
-- L1 DTLB entries per set
TLB_SET_SIZE : positive := 64;
-- L1 DTLB number of sets
TLB_NUM_WAYS : positive := 2;
-- L1 DTLB log_2(page_size)
TLB_LG_PGSZ : positive := 12
); );
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
@ -56,6 +62,8 @@ architecture rtl of dcache is


-- Bit fields counts in the address -- Bit fields counts in the address


-- REAL_ADDR_BITS is the number of real address bits that we store
constant REAL_ADDR_BITS : positive := 56;
-- ROW_BITS is the number of bits to select a row -- ROW_BITS is the number of bits to select a row
constant ROW_BITS : natural := log2(BRAM_ROWS); constant ROW_BITS : natural := log2(BRAM_ROWS);
-- ROW_LINEBITS is the number of bits to select a row within a line -- ROW_LINEBITS is the number of bits to select a row within a line
@ -66,8 +74,10 @@ architecture rtl of dcache is
constant ROW_OFF_BITS : natural := log2(ROW_SIZE); constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
-- INDEX_BITS is the number if bits to select a cache line -- INDEX_BITS is the number if bits to select a cache line
constant INDEX_BITS : natural := log2(NUM_LINES); constant INDEX_BITS : natural := log2(NUM_LINES);
-- SET_SIZE_BITS is the log base 2 of the set size
constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
-- TAG_BITS is the number of bits of the tag part of the address -- TAG_BITS is the number of bits of the tag part of the address
constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
-- WAY_BITS is the number of bits to select a way -- WAY_BITS is the number of bits to select a way
constant WAY_BITS : natural := log2(NUM_WAYS); constant WAY_BITS : natural := log2(NUM_WAYS);


@ -80,7 +90,7 @@ architecture rtl of dcache is
-- .. | |- --| ROW_OFF_BITS (3) -- .. | |- --| ROW_OFF_BITS (3)
-- .. |----- ---| | ROW_BITS (8) -- .. |----- ---| | ROW_BITS (8)
-- .. |-----| | INDEX_BITS (5) -- .. |-----| | INDEX_BITS (5)
-- .. --------| | TAG_BITS (53) -- .. --------| | TAG_BITS (45)


subtype row_t is integer range 0 to BRAM_ROWS-1; subtype row_t is integer range 0 to BRAM_ROWS-1;
subtype index_t is integer range 0 to NUM_LINES-1; subtype index_t is integer range 0 to NUM_LINES-1;
@ -110,6 +120,32 @@ architecture rtl of dcache is
attribute ram_style : string; attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed"; attribute ram_style of cache_tags : signal is "distributed";


-- L1 TLB.
constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE);
constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS);
constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_SET_BITS);
constant TLB_TAG_WAY_BITS : natural := TLB_NUM_WAYS * TLB_EA_TAG_BITS;
constant TLB_PTE_BITS : natural := 64;
constant TLB_PTE_WAY_BITS : natural := TLB_NUM_WAYS * TLB_PTE_BITS;

subtype tlb_way_t is integer range 0 to TLB_NUM_WAYS - 1;
subtype tlb_index_t is integer range 0 to TLB_SET_SIZE - 1;
subtype tlb_way_valids_t is std_ulogic_vector(TLB_NUM_WAYS-1 downto 0);
type tlb_valids_t is array(tlb_index_t) of tlb_way_valids_t;
subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
subtype tlb_way_tags_t is std_ulogic_vector(TLB_TAG_WAY_BITS-1 downto 0);
type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t;
subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
subtype tlb_way_ptes_t is std_ulogic_vector(TLB_PTE_WAY_BITS-1 downto 0);
type tlb_ptes_t is array(tlb_index_t) of tlb_way_ptes_t;
type hit_way_set_t is array(tlb_way_t) of way_t;

signal dtlb_valids : tlb_valids_t;
signal dtlb_tags : tlb_tags_t;
signal dtlb_ptes : tlb_ptes_t;
attribute ram_style of dtlb_tags : signal is "distributed";
attribute ram_style of dtlb_ptes : signal is "distributed";

signal r0 : Loadstore1ToDcacheType; signal r0 : Loadstore1ToDcacheType;


-- Type of operation on a "valid" input -- Type of operation on a "valid" input
@ -168,6 +204,13 @@ architecture rtl of dcache is
store_way : way_t; store_way : way_t;
store_row : row_t; store_row : row_t;
store_index : index_t; store_index : index_t;

-- Signals to complete with error
error_done : std_ulogic;
tlb_miss : std_ulogic;

-- completion signal for tlbie
tlbie_done : std_ulogic;
end record; end record;


signal r1 : reg_stage_1_t; signal r1 : reg_stage_1_t;
@ -208,6 +251,21 @@ architecture rtl of dcache is
-- Wishbone read/write/cache write formatting signals -- Wishbone read/write/cache write formatting signals
signal bus_sel : std_ulogic_vector(7 downto 0); signal bus_sel : std_ulogic_vector(7 downto 0);


-- TLB signals
signal tlb_tag_way : tlb_way_tags_t;
signal tlb_pte_way : tlb_way_ptes_t;
signal tlb_valid_way : tlb_way_valids_t;
signal tlb_req_index : tlb_index_t;
signal tlb_hit : std_ulogic;
signal tlb_hit_way : tlb_way_t;
signal pte : tlb_pte_t;
signal ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
signal valid_ra : std_ulogic;

-- TLB PLRU output interface
type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_victim : tlb_plru_out_t;

-- --
-- Helper functions to decode incoming requests -- Helper functions to decode incoming requests
-- --
@ -215,13 +273,13 @@ architecture rtl of dcache is
-- Return the cache line index (tag index) for an address -- Return the cache line index (tag index) for an address
function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is
begin begin
return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS))); return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)));
end; end;


-- Return the cache row index (data memory) for an address -- Return the cache row index (data memory) for an address
function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is
begin begin
return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS))); return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)));
end; end;


-- Returns whether this is the last row of a line -- Returns whether this is the last row of a line
@ -269,9 +327,9 @@ architecture rtl of dcache is
end; end;


-- Get the tag value from the address -- Get the tag value from the address
function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)) return cache_tag_t is
begin begin
return addr(63 downto 64-TAG_BITS); return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
end; end;


-- Read a tag from a tag memory row -- Read a tag from a tag memory row
@ -287,6 +345,38 @@ architecture rtl of dcache is
tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
end; end;


-- Read a TLB tag from a TLB tag memory row
function read_tlb_tag(way: tlb_way_t; tags: tlb_way_tags_t) return tlb_tag_t is
variable j : integer;
begin
j := way * TLB_EA_TAG_BITS;
return tags(j + TLB_EA_TAG_BITS - 1 downto j);
end;

-- Write a TLB tag to a TLB tag memory row
procedure write_tlb_tag(way: tlb_way_t; tags: inout tlb_way_tags_t;
tag: tlb_tag_t) is
variable j : integer;
begin
j := way * TLB_EA_TAG_BITS;
tags(j + TLB_EA_TAG_BITS - 1 downto j) := tag;
end;

-- Read a PTE from a TLB PTE memory row
function read_tlb_pte(way: tlb_way_t; ptes: tlb_way_ptes_t) return tlb_pte_t is
variable j : integer;
begin
j := way * TLB_PTE_BITS;
return ptes(j + TLB_PTE_BITS - 1 downto j);
end;

procedure write_tlb_pte(way: tlb_way_t; ptes: inout tlb_way_ptes_t; newpte: tlb_pte_t) is
variable j : integer;
begin
j := way * TLB_PTE_BITS;
ptes(j + TLB_PTE_BITS - 1 downto j) := newpte;
end;

begin begin


assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
@ -297,13 +387,158 @@ begin
report "geometry bits don't add up" severity FAILURE; report "geometry bits don't add up" severity FAILURE;
assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
report "geometry bits don't add up" severity FAILURE; report "geometry bits don't add up" severity FAILURE;
assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
report "geometry bits don't add up" severity FAILURE; report "geometry bits don't add up" severity FAILURE;
assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS) assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
report "geometry bits don't add up" severity FAILURE; report "geometry bits don't add up" severity FAILURE;
assert (64 = wishbone_data_bits) assert (64 = wishbone_data_bits)
report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE;

-- Latch the request in r0 as long as we're not stalling
stage_0 : process(clk)
begin
if rising_edge(clk) then
if rst = '1' then
r0.valid <= '0';
elsif stall_out = '0' then
r0 <= d_in;
end if;
end if;
end process;

-- TLB
-- Operates in the second cycle on the request latched in r0.
-- TLB updates write the entry at the end of the second cycle.
tlb_read : process(clk)
variable index : tlb_index_t;
begin
if rising_edge(clk) then
if stall_out = '1' then
-- keep reading the same thing while stalled
index := tlb_req_index;
else
index := to_integer(unsigned(d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1
downto TLB_LG_PGSZ)));
end if;
tlb_valid_way <= dtlb_valids(index);
tlb_tag_way <= dtlb_tags(index);
tlb_pte_way <= dtlb_ptes(index);
end if;
end process;

-- Generate TLB PLRUs
maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate
begin
tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate
-- TLB PLRU interface
signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
signal tlb_plru_acc_en : std_ulogic;
signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
begin
tlb_plru : entity work.plru
generic map (
BITS => TLB_WAY_BITS
)
port map (
clk => clk,
rst => rst,
acc => tlb_plru_acc,
acc_en => tlb_plru_acc_en,
lru => tlb_plru_out
);

process(tlb_req_index, tlb_hit, tlb_hit_way, tlb_plru_out)
begin
-- PLRU interface
if tlb_hit = '1' and tlb_req_index = i then
tlb_plru_acc_en <= '1';
else
tlb_plru_acc_en <= '0';
end if;
tlb_plru_acc <= std_ulogic_vector(to_unsigned(tlb_hit_way, TLB_WAY_BITS));
tlb_plru_victim(i) <= tlb_plru_out;
end process;
end generate;
end generate;

tlb_search : process(all)
variable hitway : tlb_way_t;
variable hit : std_ulogic;
variable eatag : tlb_tag_t;
begin
tlb_req_index <= to_integer(unsigned(r0.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1
downto TLB_LG_PGSZ)));
hitway := 0;
hit := '0';
eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS);
for i in tlb_way_t loop
if tlb_valid_way(i) = '1' and
read_tlb_tag(i, tlb_tag_way) = eatag then
hitway := i;
hit := '1';
end if;
end loop;
tlb_hit <= hit and r0.valid;
tlb_hit_way <= hitway;
pte <= read_tlb_pte(hitway, tlb_pte_way);
valid_ra <= tlb_hit or not r0.virt_mode;
if r0.virt_mode = '1' then
ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
r0.addr(TLB_LG_PGSZ - 1 downto 0);
else
ra <= r0.addr(REAL_ADDR_BITS - 1 downto 0);
end if;
end process;

tlb_update : process(clk)
variable tlbie : std_ulogic;
variable tlbia : std_ulogic;
variable tlbwe : std_ulogic;
variable repl_way : tlb_way_t;
variable eatag : tlb_tag_t;
variable tagset : tlb_way_tags_t;
variable pteset : tlb_way_ptes_t;
begin
if rising_edge(clk) then
tlbie := '0';
tlbia := '0';
tlbwe := '0';
if r0.valid = '1' and stall_out = '0' and r0.tlbie = '1' then
if r0.addr(11 downto 10) /= "00" then
tlbia := '1';
elsif r0.addr(9) = '1' then
tlbwe := '1';
else
tlbie := '1';
end if;
end if;
if rst = '1' or tlbia = '1' then
-- clear all valid bits at once
for i in tlb_index_t loop
dtlb_valids(i) <= (others => '0');
end loop;
elsif tlbie = '1' then
if tlb_hit = '1' then
dtlb_valids(tlb_req_index)(tlb_hit_way) <= '0';
end if;
elsif tlbwe = '1' then
if tlb_hit = '1' then
repl_way := tlb_hit_way;
else
repl_way := to_integer(unsigned(tlb_plru_victim(tlb_req_index)));
end if;
eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS);
tagset := tlb_tag_way;
write_tlb_tag(repl_way, tagset, eatag);
dtlb_tags(tlb_req_index) <= tagset;
pteset := tlb_pte_way;
write_tlb_pte(repl_way, pteset, r0.data);
dtlb_ptes(tlb_req_index) <= pteset;
dtlb_valids(tlb_req_index)(repl_way) <= '1';
end if;
end if;
end process;

-- Generate PLRUs -- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate maybe_plrus: if NUM_WAYS > 1 generate
begin begin
@ -341,53 +576,73 @@ begin
end generate; end generate;
end generate; end generate;


-- Latch the request in r0 as long as we're not stalling
stage_0 : process(clk)
begin
if rising_edge(clk) then
if rst = '1' then
r0.valid <= '0';
elsif stall_out = '0' then
r0 <= d_in;
end if;
end if;
end process;

-- Cache request parsing and hit detection -- Cache request parsing and hit detection
dcache_request : process(all) dcache_request : process(all)
variable is_hit : std_ulogic; variable is_hit : std_ulogic;
variable hit_way : way_t; variable hit_way : way_t;
variable op : op_t; variable op : op_t;
variable tmp : std_ulogic_vector(63 downto 0); variable opsel : std_ulogic_vector(2 downto 0);
variable data : std_ulogic_vector(63 downto 0);
variable opsel : std_ulogic_vector(3 downto 0);
variable go : std_ulogic; variable go : std_ulogic;
variable s_hit : std_ulogic;
variable s_tag : cache_tag_t;
variable s_pte : tlb_pte_t;
variable s_ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0);
variable hit_way_set : hit_way_set_t;
begin begin
-- Extract line, row and tag from request -- Extract line, row and tag from request
req_index <= get_index(r0.addr); req_index <= get_index(r0.addr);
req_row <= get_row(r0.addr); req_row <= get_row(r0.addr);
req_tag <= get_tag(r0.addr); req_tag <= get_tag(ra);


-- Only do anything if not being stalled by stage 1 -- Only do anything if not being stalled by stage 1
go := r0.valid and not stall_out; go := r0.valid and not stall_out and not r0.tlbie;


-- Calculate address of beginning of cache line, will be -- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed -- used for cache miss processing if needed
-- --
req_laddr <= r0.addr(63 downto LINE_OFF_BITS) & req_laddr <= (63 downto REAL_ADDR_BITS => '0') &
ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0'); (LINE_OFF_BITS-1 downto 0 => '0');


-- Test if pending request is a hit on any way -- Test if pending request is a hit on any way
hit_way := 0; -- In order to make timing in virtual mode, when we are using the TLB,
is_hit := '0'; -- we compare each way with each of the real addresses from each way of
for i in way_t loop -- the TLB, and then decide later which match to use.
if go = '1' and cache_valids(req_index)(i) = '1' then hit_way := 0;
if read_tag(i, cache_tags(req_index)) = req_tag then is_hit := '0';
hit_way := i; if r0.virt_mode = '1' then
is_hit := '1'; for j in tlb_way_t loop
end if; hit_way_set(j) := 0;
end if; s_hit := '0';
end loop; s_pte := read_tlb_pte(j, tlb_pte_way);
s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
r0.addr(TLB_LG_PGSZ - 1 downto 0);
s_tag := get_tag(s_ra);
for i in way_t loop
if go = '1' and cache_valids(req_index)(i) = '1' and
read_tag(i, cache_tags(req_index)) = s_tag and
tlb_valid_way(j) = '1' then
hit_way_set(j) := i;
s_hit := '1';
end if;
end loop;
hit_set(j) := s_hit;
end loop;
if tlb_hit = '1' then
is_hit := hit_set(tlb_hit_way);
hit_way := hit_way_set(tlb_hit_way);
end if;
else
s_tag := get_tag(r0.addr(REAL_ADDR_BITS - 1 downto 0));
for i in way_t loop
if go = '1' and cache_valids(req_index)(i) = '1' and
read_tag(i, cache_tags(req_index)) = s_tag then
hit_way := i;
is_hit := '1';
end if;
end loop;
end if;


-- The way that matched on a hit -- The way that matched on a hit
req_hit_way <= hit_way; req_hit_way <= hit_way;
@ -398,19 +653,25 @@ begin
-- Combine the request and cache his status to decide what -- Combine the request and cache his status to decide what
-- operation needs to be done -- operation needs to be done
-- --
opsel := go & r0.load & r0.nc & is_hit; op := OP_NONE;
case opsel is if go = '1' then
when "1101" => op := OP_LOAD_HIT; if valid_ra = '1' then
when "1100" => op := OP_LOAD_MISS; opsel := r0.load & r0.nc & is_hit;
when "1110" => op := OP_LOAD_NC; case opsel is
when "1001" => op := OP_STORE_HIT; when "101" => op := OP_LOAD_HIT;
when "1000" => op := OP_STORE_MISS; when "100" => op := OP_LOAD_MISS;
when "1010" => op := OP_STORE_MISS; when "110" => op := OP_LOAD_NC;
when "1011" => op := OP_BAD; when "001" => op := OP_STORE_HIT;
when "1111" => op := OP_BAD; when "000" => op := OP_STORE_MISS;
when others => op := OP_NONE; when "010" => op := OP_STORE_MISS;
end case; when "011" => op := OP_BAD;

when "111" => op := OP_BAD;
when others => op := OP_NONE;
end case;
else
op := OP_BAD;
end if;
end if;
req_op <= op; req_op <= op;


-- Version of the row number that is valid one cycle earlier -- Version of the row number that is valid one cycle earlier
@ -427,9 +688,6 @@ begin
-- Wire up wishbone request latch out of stage 1 -- Wire up wishbone request latch out of stage 1
wishbone_out <= r1.wb; wishbone_out <= r1.wb;


-- TODO: Generate errors
-- err_nc_collision <= '1' when req_op = OP_BAD else '0';

-- Generate stalls from stage 1 state machine -- Generate stalls from stage 1 state machine
stall_out <= '1' when r1.state /= IDLE else '0'; stall_out <= '1' when r1.state /= IDLE else '0';


@ -477,6 +735,8 @@ begin
d_out.valid <= '0'; d_out.valid <= '0';
d_out.data <= cache_out(r1.hit_way); d_out.data <= cache_out(r1.hit_way);
d_out.store_done <= '0'; d_out.store_done <= '0';
d_out.error <= '0';
d_out.tlb_miss <= '0';


-- We have a valid load or store hit or we just completed a slow -- We have a valid load or store hit or we just completed a slow
-- op such as a load miss, a NC load or a store -- op such as a load miss, a NC load or a store
@ -502,6 +762,20 @@ begin
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;


-- error cases complete without stalling
if r1.error_done = '1' then
report "completing ld/st with error";
d_out.error <= '1';
d_out.tlb_miss <= r1.tlb_miss;
d_out.valid <= '1';
end if;

-- tlbie is handled above and doesn't go through the cache state machine
if r1.tlbie_done = '1' then
report "completing tlbie";
d_out.valid <= '1';
end if;

-- Slow ops (load miss, NC, stores) -- Slow ops (load miss, NC, stores)
if r1.slow_valid = '1' then if r1.slow_valid = '1' then
-- If it's a load, enable register writeback and switch -- If it's a load, enable register writeback and switch
@ -609,6 +883,7 @@ begin


-- --
-- Cache hit synchronous machine for the easy case. This handles load hits. -- Cache hit synchronous machine for the easy case. This handles load hits.
-- It also handles error cases (TLB miss, cache paradox)
-- --
dcache_fast_hit : process(clk) dcache_fast_hit : process(clk)
begin begin
@ -636,6 +911,16 @@ begin
else else
r1.hit_load_valid <= '0'; r1.hit_load_valid <= '0';
end if; end if;

if req_op = OP_BAD then
r1.error_done <= '1';
r1.tlb_miss <= not valid_ra;
else
r1.error_done <= '0';
end if;

-- complete tlbies in the third cycle
r1.tlbie_done <= r0.valid and r0.tlbie and not stall_out;
end if; end if;
end process; end process;


@ -717,7 +1002,7 @@ begin


when OP_LOAD_NC => when OP_LOAD_NC =>
r1.wb.sel <= r0.byte_sel; r1.wb.sel <= r0.byte_sel;
r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000";
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';
r1.wb.we <= '0'; r1.wb.we <= '0';
@ -726,7 +1011,7 @@ begin
when OP_STORE_HIT | OP_STORE_MISS => when OP_STORE_HIT | OP_STORE_MISS =>
if r0.dcbz = '0' then if r0.dcbz = '0' then
r1.wb.sel <= r0.byte_sel; r1.wb.sel <= r0.byte_sel;
r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= r0.data; r1.wb.dat <= r0.data;
if cancel_store = '0' then if cancel_store = '0' then
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
@ -774,6 +1059,7 @@ begin
end if; end if;


-- OP_NONE and OP_BAD do nothing -- OP_NONE and OP_BAD do nothing
-- OP_BAD was handled above already
when OP_NONE => when OP_NONE =>
when OP_BAD => when OP_BAD =>
end case; end case;

@ -68,6 +68,7 @@ begin
-- Clear stuff -- Clear stuff
d_in.valid <= '0'; d_in.valid <= '0';
d_in.load <= '0'; d_in.load <= '0';
d_in.tlbie <= '0';
d_in.nc <= '0'; d_in.nc <= '0';
d_in.addr <= (others => '0'); d_in.addr <= (others => '0');
d_in.data <= (others => '0'); d_in.data <= (others => '0');

@ -323,6 +323,7 @@ architecture behaviour of decode1 is
2#1001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sync 2#1001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sync
2#0001000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- td 2#0001000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- td
2#0000000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- tw 2#0000000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- tw
2#0100110010# => (LDST, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbie
2#0100111100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- xor 2#0100111100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- xor
others => illegal_inst others => illegal_inst
); );

@ -16,7 +16,7 @@ package decode_types is
OP_POPCNT, OP_PRTY, OP_RFID, OP_POPCNT, OP_PRTY, OP_RFID,
OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB, OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB,
OP_SHL, OP_SHR, OP_SHL, OP_SHR,
OP_SYNC, OP_TRAP, OP_SYNC, OP_TLBIE, OP_TRAP,
OP_XOR OP_XOR
); );
type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR); type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR);

@ -88,6 +88,7 @@ architecture behaviour of execute1 is
OP_MFMSR => SUPER, OP_MFMSR => SUPER,
OP_MTMSRD => SUPER, OP_MTMSRD => SUPER,
OP_RFID => SUPER, OP_RFID => SUPER,
OP_TLBIE => SUPER,
others => USER others => USER
); );


@ -988,6 +989,7 @@ begin
e_in.insn(5 downto 1) = "10101" then e_in.insn(5 downto 1) = "10101" then
lv.ci := '1'; lv.ci := '1';
end if; end if;
lv.virt_mode := ctrl.msr(MSR_DR);


-- Update registers -- Update registers
rin <= v; rin <= v;

@ -43,6 +43,7 @@ architecture behave of loadstore1 is
type reg_stage_t is record type reg_stage_t is record
-- latch most of the input request -- latch most of the input request
load : std_ulogic; load : std_ulogic;
tlbie : std_ulogic;
dcbz : std_ulogic; dcbz : std_ulogic;
addr : std_ulogic_vector(63 downto 0); addr : std_ulogic_vector(63 downto 0);
store_data : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0);
@ -57,6 +58,7 @@ architecture behave of loadstore1 is
reserve : std_ulogic; reserve : std_ulogic;
rc : std_ulogic; rc : std_ulogic;
nc : std_ulogic; -- non-cacheable access nc : std_ulogic; -- non-cacheable access
virt_mode : std_ulogic;
state : state_t; state : state_t;
second_bytes : std_ulogic_vector(7 downto 0); second_bytes : std_ulogic_vector(7 downto 0);
dar : std_ulogic_vector(63 downto 0); dar : std_ulogic_vector(63 downto 0);
@ -207,6 +209,7 @@ begin
if l_in.valid = '1' then if l_in.valid = '1' then
v.load := '0'; v.load := '0';
v.dcbz := '0'; v.dcbz := '0';
v.tlbie := '0';
case l_in.op is case l_in.op is
when OP_STORE => when OP_STORE =>
req := '1'; req := '1';
@ -216,6 +219,9 @@ begin
when OP_DCBZ => when OP_DCBZ =>
req := '1'; req := '1';
v.dcbz := '1'; v.dcbz := '1';
when OP_TLBIE =>
req := '1';
v.tlbie := '1';
when OP_MFSPR => when OP_MFSPR =>
done := '1'; done := '1';
mfspr := '1'; mfspr := '1';
@ -250,14 +256,15 @@ begin
v.reserve := l_in.reserve; v.reserve := l_in.reserve;
v.rc := l_in.rc; v.rc := l_in.rc;
v.nc := l_in.ci; v.nc := l_in.ci;
v.virt_mode := l_in.virt_mode;


-- XXX Temporary hack. Mark the op as non-cachable if the address -- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc------- -- is the form 0xc------- for a real-mode access.
-- --
-- This will have to be replaced by a combination of implementing the -- This will have to be replaced by a combination of implementing the
-- proper HV CI load/store instructions and having an MMU to get the I -- proper HV CI load/store instructions and having an MMU to get the I
-- bit otherwise. -- bit otherwise.
if lsu_sum(31 downto 28) = "1100" then if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then
v.nc := '1'; v.nc := '1';
end if; end if;


@ -269,10 +276,13 @@ begin
v.addr := lsu_sum; v.addr := lsu_sum;


-- Do byte reversing and rotating for stores in the first cycle -- Do byte reversing and rotating for stores in the first cycle
byte_offset := unsigned(lsu_sum(2 downto 0)); byte_offset := "000";
brev_lenm1 := "000"; brev_lenm1 := "000";
if l_in.byte_reverse = '1' then if v.tlbie = '0' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; byte_offset := unsigned(lsu_sum(2 downto 0));
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
end if; end if;
for i in 0 to 7 loop for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
@ -332,12 +342,14 @@ begin
-- Update outputs to dcache -- Update outputs to dcache
d_out.valid <= req; d_out.valid <= req;
d_out.load <= v.load; d_out.load <= v.load;
d_out.tlbie <= v.tlbie;
d_out.dcbz <= v.dcbz; d_out.dcbz <= v.dcbz;
d_out.nc <= v.nc; d_out.nc <= v.nc;
d_out.reserve <= v.reserve; d_out.reserve <= v.reserve;
d_out.addr <= addr; d_out.addr <= addr;
d_out.data <= v.store_data; d_out.data <= v.store_data;
d_out.byte_sel <= byte_sel; d_out.byte_sel <= byte_sel;
d_out.virt_mode <= v.virt_mode;


-- Update outputs to writeback -- Update outputs to writeback
-- Multiplex either cache data to the destination GPR or -- Multiplex either cache data to the destination GPR or

Loading…
Cancel
Save