@ -5,16 +5,34 @@ use std.textio.all;
library work;
library work;
use work.wishbone_types.all;
use work.wishbone_types.all;
use work.utils.all;
use work.helpers.all;
entity litedram_wrapper is
entity litedram_wrapper is
generic (
generic (
DRAM_ABITS : positive;
DRAM_ABITS : positive;
DRAM_ALINES : positive;
DRAM_ALINES : positive;
-- Pseudo-ROM payload
-- Pseudo-ROM payload
PAYLOAD_SIZE : natural;
PAYLOAD_SIZE : natural;
PAYLOAD_FILE : string;
PAYLOAD_FILE : string;
-- L2 cache --
-- Line size in bytes
LINE_SIZE : positive := 128;
-- Number of lines in a set
NUM_LINES : positive := 32;
-- Number of ways
NUM_WAYS : positive := 4;
-- Max number of stores in the queue
STOREQ_DEPTH : positive := 8;
-- Don't send loads until all pending stores acked in litedram
NO_LS_OVERLAP : boolean := false;
-- Debug
-- Debug
LITEDRAM_TRACE : boolean := false
LITEDRAM_TRACE : boolean := false;
TRACE : boolean := false
);
);
port(
port(
-- LiteDRAM generates the system clock and reset
-- LiteDRAM generates the system clock and reset
@ -123,13 +141,11 @@ architecture behaviour of litedram_wrapper is
signal user_port0_rdata_ready : std_ulogic;
signal user_port0_rdata_ready : std_ulogic;
signal user_port0_rdata_data : std_ulogic_vector(127 downto 0);
signal user_port0_rdata_data : std_ulogic_vector(127 downto 0);
signal ad3 : std_ulogic;
signal wb_ctrl_adr : std_ulogic_vector(29 downto 0);
signal wb_ctrl_adr : std_ulogic_vector(29 downto 0);
signal wb_ctrl_dat_w : std_ulogic_vector(31 downto 0);
signal wb_ctrl_dat_w : std_ulogic_vector(31 downto 0);
signal wb_ctrl_dat_r : std_ulogic_vector(31 downto 0);
signal wb_ctrl_dat_r : std_ulogic_vector(31 downto 0);
signal wb_ctrl_sel : std_ulogic_vector(3 downto 0);
signal wb_ctrl_sel : std_ulogic_vector(3 downto 0);
signal wb_ctrl_cyc : std_ulogic;
signal wb_ctrl_cyc : std_ulogic := '0';
signal wb_ctrl_stb : std_ulogic;
signal wb_ctrl_stb : std_ulogic;
signal wb_ctrl_ack : std_ulogic;
signal wb_ctrl_ack : std_ulogic;
signal wb_ctrl_we : std_ulogic;
signal wb_ctrl_we : std_ulogic;
@ -137,11 +153,239 @@ architecture behaviour of litedram_wrapper is
signal wb_init_in : wb_io_master_out;
signal wb_init_in : wb_io_master_out;
signal wb_init_out : wb_io_slave_out;
signal wb_init_out : wb_io_slave_out;
type state_t is (CMD, MWRITE, MREAD);
-- DRAM data port width in bits, and the matching number of byte
-- select bits (one per byte of the data port)
constant DRAM_DBITS : natural := 128;
constant DRAM_SBITS : natural := (DRAM_DBITS / 8);
-- BRAM organisation: each BRAM row is as wide as the DRAM data port
-- (DRAM_DBITS) and consecutive rows are used to make up a cache "line"
--
-- ROW_SIZE is the width in bytes of the BRAM (based on litedram, so 128-bits)
constant ROW_SIZE : natural := DRAM_DBITS / 8;
-- ROW_PER_LINE is the number of rows (litedram transactions) in a line
constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
-- BRAM_ROWS is the number of rows in BRAM needed to represent one
-- full way of the cache (NUM_LINES lines of ROW_PER_LINE rows each)
constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
-- Bit fields counts in the address
-- ROW_BITS is the number of bits to select a row
constant ROW_BITS : natural := log2(BRAM_ROWS);
-- ROW_LINEBITS is the number of bits to select a row within a line
constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
-- LINE_OFF_BITS is the number of bits for the offset in a cache line
constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
-- ROW_OFF_BITS is the number of bits for the offset in a row
constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
-- REAL_ADDR_BITS is the number of real address bits that we store:
-- the DRAM address bits plus the byte offset bits within a row
constant REAL_ADDR_BITS : positive := DRAM_ABITS + ROW_OFF_BITS;
-- INDEX_BITS is the number of bits to select a cache line
constant INDEX_BITS : natural := log2(NUM_LINES);
-- SET_SIZE_BITS is the log base 2 of the set size
constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
-- TAG_BITS is the number of bits of the tag part of the address
constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
-- WAY_BITS is the number of bits to select a way
constant WAY_BITS : natural := log2(NUM_WAYS);
-- Integer index types for a BRAM row, a cache line (set index) and a way
subtype row_t is integer range 0 to BRAM_ROWS-1;
subtype index_t is integer range 0 to NUM_LINES-1;
subtype way_t is integer range 0 to NUM_WAYS-1;
-- One row of cache data: as wide as the DRAM data port
subtype cache_row_t is std_ulogic_vector(DRAM_DBITS-1 downto 0);
-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-- not handle a clean (commented) definition of the cache tags as a 3d
-- memory. For now, work around it by putting all the tags of a set
-- side by side in a single vector of TAG_BITS * NUM_WAYS bits.
subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
-- type cache_tags_set_t is array(way_t) of cache_tag_t;
-- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
type cache_tags_array_t is array(index_t) of cache_tags_set_t;
-- The cache valid bits: one bit per way for each set index
subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
type cache_valids_t is array(index_t) of cache_way_valids_t;
-- Tag and valid storage. The tags are tagged with ram_style
-- "distributed" below to request LUT RAM rather than BRAM.
signal cache_tags : cache_tags_array_t;
signal cache_valids : cache_valids_t;
attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed";
--
-- Store queue signals
--
-- We store a single wishbone dword per entry (64-bit) but all
-- 16 sel bits for the DRAM.
-- XXX Investigate storing only AD3 and 8 sel bits if it's better
-- An entry is therefore wishbone_data_bits + DRAM_SBITS bits wide.
constant STOREQ_BITS : positive := wishbone_data_bits + DRAM_SBITS;
-- Read (pop) side handshake and data
signal storeq_rd_ready : std_ulogic;
signal storeq_rd_valid : std_ulogic;
signal storeq_rd_data : std_ulogic_vector(STOREQ_BITS-1 downto 0);
-- Write (push) side handshake and data
signal storeq_wr_ready : std_ulogic;
signal storeq_wr_valid : std_ulogic;
signal storeq_wr_data : std_ulogic_vector(STOREQ_BITS-1 downto 0);
--
-- Cache management signals
--
-- Cache state machine. Declared exactly once: a second declaration of
-- "state" in the same declarative region is illegal.
type state_t is (IDLE,             -- Normal load hit processing
                 REFILL_WAIT_ACK); -- Cache refill wait ack
signal state : state_t;
-- Latched WB request (registered copy of the incoming wishbone request).
signal wb_req : wishbone_master_out := wishbone_master_out_init;
-- Read pipeline (to handle cache RAM latency): two stages carrying the
-- ack, address bit 3 and hit way of a load hit
signal read_ack_0 : std_ulogic;
signal read_ack_1 : std_ulogic;
signal read_ad3_0 : std_ulogic;
signal read_ad3_1 : std_ulogic;
signal read_way_0 : way_t;
signal read_way_1 : way_t;
-- Async (combinational) signals decoding the latched request
type req_op_t is (OP_NONE,
OP_LOAD_HIT,
OP_LOAD_MISS,
OP_STORE_HIT,
OP_STORE_MISS);
signal req_index : index_t;
signal req_row : row_t;
signal req_hit_way : way_t;
signal req_tag : cache_tag_t;
signal req_op : req_op_t;
-- Address of the start of the cache line containing the request
signal req_laddr : std_ulogic_vector(REAL_ADDR_BITS-1 downto 0);
-- Address bit 3 of the request (selects the 64-bit half of a DRAM row)
signal req_ad3 : std_ulogic;
-- Byte write enables and write data expanded to the DRAM port width
signal req_we : std_ulogic_vector(DRAM_SBITS-1 downto 0);
signal req_wdata : std_ulogic_vector(DRAM_DBITS-1 downto 0);
signal accept_store : std_ulogic;
-- Line refill command signals and latches
signal refill_cmd_valid : std_ulogic;
signal refill_cmd_addr : std_ulogic_vector(DRAM_ABITS-1 downto 0);
signal refill_way : way_t;
signal refill_index : index_t;
signal refill_row : row_t;
-- Cache RAM interface: read data of each way
type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out : cache_ram_out_t;
-- PLRU output interface: PLRU output for each set index
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_victim : plru_out_t;
--
-- Helper functions to decode incoming requests
--
-- Extract the cache line index (which is also the tag row index) from
-- an address: bits SET_SIZE_BITS-1 downto LINE_OFF_BITS.
function get_index(addr: wishbone_addr_type) return index_t is
    variable idx_bits : unsigned(SET_SIZE_BITS - 1 downto LINE_OFF_BITS);
begin
    idx_bits := unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS));
    return to_integer(idx_bits);
end;
-- Extract the cache row index (data memory address) from an address:
-- bits SET_SIZE_BITS-1 downto ROW_OFF_BITS.
function get_row(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto 0)) return row_t is
    variable row_bits_v : unsigned(SET_SIZE_BITS - 1 downto ROW_OFF_BITS);
begin
    row_bits_v := unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS));
    return to_integer(row_bits_v);
end;
-- Tell whether a DRAM (row granular) address designates the last row of
-- its cache line, i.e. whether all the row-in-line bits are set.
function is_last_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS))
    return boolean is
    constant last_in_line : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
begin
    if addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) /= last_in_line then
        return false;
    end if;
    return true;
end;
-- Tell whether a cache row index designates the last row of its cache
-- line, i.e. whether its ROW_LINEBITS low order bits are all set.
function is_last_row(row: row_t) return boolean is
    constant row_slv : std_ulogic_vector(ROW_BITS-1 downto 0) :=
        std_ulogic_vector(to_unsigned(row, ROW_BITS));
    constant last_in_line : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
begin
    if row_slv(ROW_LINEBITS-1 downto 0) /= last_in_line then
        return false;
    end if;
    return true;
end;
-- Compute the DRAM address of the next row of the current cache line.
-- Only the row-in-line field is incremented (and wraps around); the
-- bits above it are passed through unchanged, which keeps the adder
-- ROW_LINEBITS wide.
function next_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS))
    return std_ulogic_vector is
    variable bumped : unsigned(ROW_LINEBITS-1 downto 0);
    variable nxt    : std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS);
begin
    nxt    := addr;
    bumped := unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) + 1;
    nxt(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := std_ulogic_vector(bumped);
    return nxt;
end;
-- Return the next row in the current cache line. We use a dedicated
-- function in order to limit the size of the generated adder to be
-- only the bits within a cache line (3 bits with default settings).
--
-- Only the ROW_LINEBITS low order bits of the row index are
-- incremented, so the result wraps around inside the line instead of
-- carrying into the line index bits.
function next_row(row: row_t) return row_t is
    variable row_v   : std_ulogic_vector(ROW_BITS-1 downto 0);
    variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
begin
    row_v   := std_ulogic_vector(to_unsigned(row, ROW_BITS));
    row_idx := row_v(ROW_LINEBITS-1 downto 0);
    row_v(ROW_LINEBITS-1 downto 0) := std_ulogic_vector(unsigned(row_idx) + 1);
    return to_integer(unsigned(row_v));
end;
-- Extract the tag from an address: bits REAL_ADDR_BITS-1 downto
-- SET_SIZE_BITS.
function get_tag(addr: wishbone_addr_type) return cache_tag_t is
    variable tag_v : cache_tag_t;
begin
    tag_v := addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
    return tag_v;
end;
-- Pick the tag of a given way out of a tag memory row. Way N occupies
-- bits (N+1)*TAG_BITS-1 downto N*TAG_BITS of the row.
function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
    constant base : natural := way * TAG_BITS;
begin
    return tagset(base + TAG_BITS - 1 downto base);
end;
-- Replace the tag of a given way inside a tag memory row, leaving the
-- tags of the other ways untouched.
procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
                    tag: cache_tag_t) is
    constant base : natural := way * TAG_BITS;
begin
    tagset(base + TAG_BITS - 1 downto base) := tag;
end;
begin
begin
-- Static sanity checks on the cache geometry generics and on the
-- address bit fields derived from them
assert LINE_SIZE mod ROW_SIZE = 0
    report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
assert ispow2(LINE_SIZE)
    report "LINE_SIZE not power of 2" severity FAILURE;
assert ispow2(NUM_LINES)
    report "NUM_LINES not power of 2" severity FAILURE;
assert ispow2(ROW_PER_LINE)
    report "ROW_PER_LINE not power of 2" severity FAILURE;

-- The bit fields must tile the row index and the real address exactly
assert INDEX_BITS + ROW_LINEBITS = ROW_BITS
    report "geometry bits don't add up" severity FAILURE;
assert ROW_OFF_BITS + ROW_LINEBITS = LINE_OFF_BITS
    report "geometry bits don't add up" severity FAILURE;
assert TAG_BITS + INDEX_BITS + LINE_OFF_BITS = REAL_ADDR_BITS
    report "geometry bits don't add up" severity FAILURE;
assert TAG_BITS + ROW_BITS + ROW_OFF_BITS = REAL_ADDR_BITS
    report "geometry bits don't add up" severity FAILURE;

-- The wishbone <-> DRAM data muxing below is written for a 128-bit port
assert DRAM_DBITS = 128
    report "Can't yet handle a DRAM width that isn't 128-bits" severity FAILURE;
-- alternate core reset address set when DRAM is not initialized.
-- alternate core reset address set when DRAM is not initialized.
core_alt_reset <= not init_done;
core_alt_reset <= not init_done;
@ -170,7 +414,15 @@ begin
wb_init_in.stb <= wb_ctrl_in.stb;
wb_init_in.stb <= wb_ctrl_in.stb;
wb_init_in.cyc <= wb_ctrl_in.cyc and wb_ctrl_is_init;
wb_init_in.cyc <= wb_ctrl_in.cyc and wb_ctrl_is_init;
-- DRAM CSR IN signals
-- DRAM CSR IN signals. Extra latch to help with timing
csr_latch: process(system_clk)
begin
if rising_edge(system_clk) then
if system_reset = '1' then
wb_ctrl_cyc <= '0';
wb_ctrl_stb <= '0';
else
-- XXX Maybe only update addr when cyc = '1' to save power ?
wb_ctrl_adr <= x"0000" & wb_ctrl_in.adr(15 downto 2);
wb_ctrl_adr <= x"0000" & wb_ctrl_in.adr(15 downto 2);
wb_ctrl_dat_w <= wb_ctrl_in.dat;
wb_ctrl_dat_w <= wb_ctrl_in.dat;
wb_ctrl_sel <= wb_ctrl_in.sel;
wb_ctrl_sel <= wb_ctrl_in.sel;
@ -178,7 +430,19 @@ begin
wb_ctrl_cyc <= wb_ctrl_in.cyc and wb_ctrl_is_csr;
wb_ctrl_cyc <= wb_ctrl_in.cyc and wb_ctrl_is_csr;
wb_ctrl_stb <= wb_ctrl_in.stb and wb_ctrl_is_csr;
wb_ctrl_stb <= wb_ctrl_in.stb and wb_ctrl_is_csr;
-- Ctrl bus wishbone OUT signals
-- Clear stb on ack otherwise the memory will latch
-- the write twice which breaks levelling. On the next
-- cycle we will latch an updated stb that takes the
-- ack into account.
if wb_ctrl_ack = '1' then
wb_ctrl_stb <= '0';
end if;
end if;
end if;
end process;
-- Ctrl bus wishbone OUT signals. XXX Consider adding latch on
-- CSR response to help timing
wb_ctrl_out.ack <= wb_ctrl_ack when wb_ctrl_is_csr = '1'
wb_ctrl_out.ack <= wb_ctrl_ack when wb_ctrl_is_csr = '1'
else wb_init_out.ack;
else wb_init_out.ack;
wb_ctrl_out.dat <= wb_ctrl_dat_r when wb_ctrl_is_csr = '1'
wb_ctrl_out.dat <= wb_ctrl_dat_r when wb_ctrl_is_csr = '1'
@ -186,56 +450,531 @@ begin
wb_ctrl_out.stall <= wb_init_out.stall when wb_ctrl_is_init else
wb_ctrl_out.stall <= wb_init_out.stall when wb_ctrl_is_init else
'0' when wb_ctrl_in.cyc = '0' else not wb_ctrl_ack;
'0' when wb_ctrl_in.cyc = '0' else not wb_ctrl_ack;
-- Instantiate one cache data RAM per way, together with the logic
-- driving its read and write ports
rams: for i in 0 to NUM_WAYS-1 generate
    signal do_read  : std_ulogic;
    signal do_write : std_ulogic;
    signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    signal wr_data  : std_ulogic_vector(DRAM_DBITS-1 downto 0);
    signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
    signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
    signal dout     : cache_row_t;
begin
    way: entity work.cache_ram
        generic map (
            ROW_BITS => ROW_BITS,
            WIDTH    => DRAM_DBITS,
            ADD_BUF  => true
        )
        port map (
            clk     => system_clk,
            rd_en   => do_read,
            rd_addr => rd_addr,
            rd_data => dout,
            wr_sel  => wr_sel_m,
            wr_addr => wr_addr,
            wr_data => wr_data
        );

    process(all)
    begin
        -- Read port: always enabled, addressed by the row of the
        -- latched request; the output goes to this way's slot of
        -- cache_out
        rd_addr      <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
        do_read      <= '1';
        cache_out(i) <= dout;

        -- Write port source: DRAM refill data outside of IDLE,
        -- wishbone store data otherwise
        if state /= IDLE then
            wr_addr <= std_ulogic_vector(to_unsigned(refill_row, ROW_BITS));
            wr_data <= user_port0_rdata_data;
            wr_sel  <= (others => '1');
        else
            wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
            wr_data <= req_wdata;
            wr_sel  <= req_we;
        end if;

        -- This way is written on a store hit to it, or when refill
        -- data arrives while it is the way being refilled
        if (req_op = OP_STORE_HIT and req_hit_way = i) or
           (user_port0_rdata_valid = '1' and refill_way = i) then
            do_write <= '1';
        else
            do_write <= '0';
        end if;

        -- Mask write selects with do_write since BRAM doesn't always
        -- have a global write-enable (Vivado generates TDP instead
        -- of SDP when using one, thus doubling cache BRAM usage).
        for b in 0 to ROW_SIZE-1 loop
            wr_sel_m(b) <= wr_sel(b) and do_write;
        end loop;

        -- Optional write trace, once per clock
        if TRACE then
            if rising_edge(system_clk) and do_write = '1' then
                report "cache write way:" & integer'image(i) &
                    " addr:" & to_hstring(wr_addr) &
                    " sel:" & to_hstring(wr_sel_m) &
                    " data:" & to_hstring(wr_data);
            end if;
        end if;
    end process;
end generate;
-- Instantiate one PLRU per cache line index. None are generated for
-- a single way cache.
maybe_plrus: if NUM_WAYS > 1 generate
begin
    plrus: for i in 0 to NUM_LINES-1 generate
        -- PLRU interface
        signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
        signal plru_acc_en : std_ulogic;
        signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
    begin
        plru : entity work.plru
            generic map (
                BITS => WAY_BITS
            )
            port map (
                clk    => system_clk,
                rst    => system_reset,
                acc    => plru_acc,
                acc_en => plru_acc_en,
                lru    => plru_out
            );

        process(req_index, req_op, req_hit_way, plru_out)
        begin
            -- Publish this index's PLRU output
            plru_victim(i) <= plru_out;

            -- The accessed way is always presented; the access is only
            -- enabled on a load or store hit to this index
            plru_acc    <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));
            plru_acc_en <= '0';
            if req_index = i and
               (req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT) then
                plru_acc_en <= '1';
            end if;
        end process;
    end generate;
end generate;
--
-- Wishbone interface:
--
-- - Incoming wishbone request latch (to help with timing)
-- - Read response pipeline (to match BRAM output buffer delay)
-- - Stall generation
--
-- XXX TODO: Properly handle cyc drops before all acks are sent...
--
request_latch: process(system_clk)
begin
    if rising_edge(system_clk) then
        -- A new request (or the absence of one) is latched on every
        -- cycle where we are not stalling the wishbone master.
        if wb_out.stall = '0' then
            if TRACE and wb_in.cyc = '1' and wb_in.stb = '1' then
                report "latch new wb req ! addr:" & to_hstring(wb_in.adr) &
                    " we:" & std_ulogic'image(wb_in.we) &
                    " sel:" & to_hstring(wb_in.sel);
            end if;

            if wb_in.cyc /= '1' then
                -- No cycle in progress: only track cyc/stb so that
                -- addr/data are not constantly updated for unrelated
                -- requests
                wb_req.cyc <= wb_in.cyc;
                wb_req.stb <= wb_in.stb;
            else
                wb_req <= wb_in;
            end if;
        end if;
    end if;
end process;
--
--
-- Read response pipeline
--
-- XXX Might have to put store acks in there too (see comment in wb_response)
read_pipe: process(system_clk)
begin
    if rising_edge(system_clk) then
        -- Stage 0: capture a load hit along with its address bit 3
        -- and hit way
        if req_op = OP_LOAD_HIT then
            read_ack_0 <= '1';
        else
            read_ack_0 <= '0';
        end if;
        read_ad3_0 <= req_ad3;
        read_way_0 <= req_hit_way;

        -- Stage 1: delay everything by one more cycle
        read_ack_1 <= read_ack_0;
        read_ad3_1 <= read_ad3_0;
        read_way_1 <= read_way_0;

        if TRACE then
            case req_op is
                when OP_LOAD_HIT =>
                    report "Load hit addr:" & to_hstring(wb_req.adr) &
                        " idx:" & integer'image(req_index) &
                        " tag:" & to_hstring(req_tag) &
                        " way:" & integer'image(req_hit_way);
                when OP_LOAD_MISS =>
                    report "Load miss addr:" & to_hstring(wb_req.adr);
                when others =>
                    null;
            end case;
            if read_ack_0 = '1' then
                report "read data:" & to_hstring(cache_out(read_way_0));
            end if;
        end if;
    end if;
end process;
-- Wishbone response generation: store acceptance, stall, read data
-- mux and ack.
--
-- This process only drives accept_store and the wb_out fields. The
-- LiteDRAM user port signals are driven by the store queue control and
-- command mux processes, so they must not be assigned here as well.
wb_reponse: process(all)
    variable rdata      : std_ulogic_vector(DRAM_DBITS-1 downto 0);
    variable store_done : std_ulogic;
begin
    -- Can we accept a store ? This is set when IDLE and the store
    -- queue & command queue are not full.
    --
    -- Note: This is only used to control the WB request latch, stall
    -- and store "early complete". We don't want to use this to control
    -- cmd_valid to DRAM as this would create a circular dependency inside
    -- LiteDRAM as cmd_ready I think is driven from cmd_valid.
    --
    -- The state machine that controls the command queue must thus
    -- reproduce this logic at least partially.
    --
    -- Note also that user_port0_cmd_ready from LiteDRAM is combinational
    -- from user_port0_cmd_valid. IE. we won't know that LiteDRAM cannot
    -- accept a command until we try to send one.
    --
    if state = IDLE then
        accept_store <= user_port0_cmd_ready and storeq_wr_ready;

        -- Corner case !!! The read acks pipeline takes two extra cycles
        -- which means a store ack can collide with a previous load hit
        -- ack. Thus we stall stores if we have a load ack pending.
        if read_ack_0 = '1' or read_ack_1 = '1' then
            accept_store <= '0';
        end if;
    else
        accept_store <= '0';
    end if;

    -- Generate stalls. For loads, we stall if we are going to take a load
    -- miss or are in the middle of a refill. For stores, if we can't
    -- accept it.
    case state is
    when IDLE =>
        case req_op is
        when OP_LOAD_MISS =>
            wb_out.stall <= '1';
        when OP_STORE_MISS | OP_STORE_HIT =>
            wb_out.stall <= not accept_store;
        when others =>
            wb_out.stall <= '0';
        end case;
    when REFILL_WAIT_ACK =>
        wb_out.stall <= '1';
    end case;

    -- Data out mux: pick the way that hit, then the 64-bit half
    -- selected by the pipelined address bit 3
    rdata := cache_out(read_way_1);
    wb_out.dat <= rdata(127 downto 64) when read_ad3_1 = '1' else rdata(63 downto 0);

    -- Early-complete stores on wishbone.
    if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
        store_done := accept_store;
    else
        store_done := '0';
    end if;

    -- Generate ACKs on read hits and store complete
    --
    -- XXXX TODO: This can happen on store right behind loads !
    -- This probably need to be fixed by putting store acks in
    -- the same pipeline as the read acks. TODO: Create a testbench
    -- to exercise those corner cases as the core can't yet.
    --
    wb_out.ack <= read_ack_1 or store_done;

    -- A store must never complete while a load ack is still in the
    -- read pipeline. read_ack_1 is the one that is ORed with
    -- store_done above, so check it in addition to read_ack_0.
    assert (read_ack_0 = '0' and read_ack_1 /= '1') or store_done = '0' report
        "Read ack and store ack collision !"
        severity failure;
end process;
--
-- Cache request decode
--
-- Combinational decode of the latched wishbone request: extracts the
-- index, row and tag, looks the tag up in every way and classifies the
-- request as a load/store hit/miss.
request_decode: process(all)
    variable valid   : std_ulogic;
    variable is_hit  : std_ulogic;
    variable hit_way : way_t;
begin
    -- Extract line, row and tag from request
    req_index <= get_index(wb_req.adr);
    req_row   <= get_row(wb_req.adr(REAL_ADDR_BITS-1 downto 0));
    req_tag   <= get_tag(wb_req.adr);

    -- Calculate address of beginning of cache line, will be
    -- used for cache miss processing if needed
    req_laddr <= wb_req.adr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) &
                 (LINE_OFF_BITS-1 downto 0 => '0');

    -- Do we have a valid request in the WB latch ? A request is only
    -- considered while in the IDLE state.
    if state = IDLE then
        valid := wb_req.cyc and wb_req.stb;
    else
        valid := '0';
    end if;

    -- Store signals: the 64-bit wishbone data is replicated on both
    -- halves of the DRAM word and address bit 3 picks which half of
    -- the byte enables is active
    req_ad3   <= wb_req.adr(3);
    req_wdata <= wb_req.dat & wb_req.dat;
    req_we    <= wb_req.sel & "00000000" when req_ad3 = '1' else
                 "00000000" & wb_req.sel;

    -- Test if pending request is a hit on any way
    hit_way := 0;
    is_hit  := '0';
    for i in way_t loop
        if valid = '1' and cache_valids(req_index)(i) = '1' then
            if read_tag(i, cache_tags(req_index)) = req_tag then
                hit_way := i;
                is_hit  := '1';
            end if;
        end if;
    end loop;

    -- Generate the req op. Since "valid" is forced to 0 outside of
    -- the IDLE state, an op other than OP_NONE is only ever generated
    -- in IDLE; our PLRU and ACK generation rely on this.
    req_op <= OP_NONE;
    if valid = '1' then
        if wb_req.we = '1' then
            if is_hit = '1' then
                req_op <= OP_STORE_HIT;
            else
                req_op <= OP_STORE_MISS;
            end if;
        else
            if is_hit = '1' then
                req_op <= OP_LOAD_HIT;
            else
                req_op <= OP_LOAD_MISS;
            end if;
        end if;
    end if;
    req_hit_way <= hit_way;
end process;
--
-- Store queue
--
-- FIFO of pending stores. It holds up to STOREQ_DEPTH entries of
-- STOREQ_BITS bits each.
store_queue: entity work.sync_fifo
generic map (
DEPTH => STOREQ_DEPTH,
WIDTH => STOREQ_BITS
)
port map (
clk => system_clk,
reset => system_reset,
rd_ready => storeq_rd_ready,
rd_valid => storeq_rd_valid,
rd_data => storeq_rd_data,
wr_ready => storeq_wr_ready,
wr_valid => storeq_wr_valid,
wr_data => storeq_wr_data
);
-- Store queue control: pushes accepted stores into the queue and feeds
-- the LiteDRAM write data port from the queue output.
storeq_control : process(all)
    variable stq_data : wishbone_data_type;
    variable stq_sel  : std_ulogic_vector(DRAM_SBITS-1 downto 0);
begin
    -- Queue entry layout: wishbone data in the upper bits, the DRAM
    -- byte enables in the low DRAM_SBITS bits
    storeq_wr_data <= wb_req.dat & req_we;

    -- Only accept store if we can send a command
    if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
        storeq_wr_valid <= user_port0_cmd_ready;
    else
        storeq_wr_valid <= '0';
    end if;

    -- Unpack the head of the queue and present it to LiteDRAM. The
    -- 64-bit data is replicated on both halves of the DRAM word, the
    -- byte enables select the half actually written.
    stq_data := storeq_rd_data(storeq_rd_data'left downto DRAM_SBITS);
    stq_sel  := storeq_rd_data(DRAM_SBITS-1 downto 0);
    user_port0_wdata_data  <= stq_data & stq_data;
    user_port0_wdata_we    <= stq_sel;
    user_port0_wdata_valid <= storeq_rd_valid;
    storeq_rd_ready        <= user_port0_wdata_ready;

    if TRACE then
        if rising_edge(system_clk) then
            -- Only trace actual stores: a plain "else" here would
            -- report a store miss for loads and idle cycles as well.
            if req_op = OP_STORE_HIT then
                report "Store hit to:" &
                    to_hstring(wb_req.adr(DRAM_ABITS+3 downto 0)) &
                    " data:" & to_hstring(req_wdata) &
                    " we:" & to_hstring(req_we) &
                    " V:" & std_ulogic'image(accept_store);
            elsif req_op = OP_STORE_MISS then
                report "Store miss to:" &
                    to_hstring(wb_req.adr(DRAM_ABITS+3 downto 0)) &
                    " data:" & to_hstring(req_wdata) &
                    " we:" & to_hstring(req_we) &
                    " V:" & std_ulogic'image(accept_store);
            end if;
            if storeq_wr_valid = '1' and storeq_wr_ready = '1' then
                report "storeq push " & to_hstring(storeq_wr_data);
            end if;
            if storeq_rd_valid = '1' and storeq_rd_ready = '1' then
                report "storeq pop " & to_hstring(storeq_rd_data);
            end if;
        end if;
    end if;
end process;
-- LiteDRAM command port mux between stores and refill reads
dram_commands: process(all)
begin
    -- Default (load path): commands come from the latch controlled by
    -- the refill machine and are always reads
    user_port0_cmd_addr  <= refill_cmd_addr;
    user_port0_cmd_valid <= refill_cmd_valid;
    user_port0_cmd_we    <= '0';

    -- Stores in IDLE override the above: forward the request address
    -- directly and only send the command if the FIFO can accept a store
    if state = IDLE and (req_op = OP_STORE_HIT or req_op = OP_STORE_MISS) then
        user_port0_cmd_addr  <= wb_req.adr(DRAM_ABITS+3 downto 4);
        user_port0_cmd_we    <= '1';
        user_port0_cmd_valid <= storeq_wr_ready;
    end if;

    -- We are always ready to take read data
    user_port0_rdata_ready <= '1'; -- Always 1
end process;
-- LiteDRAM refill machine
--
-- This handles the cache line refills
--
refill_machine : process(system_clk)
variable tagset : cache_tags_set_t;
variable cmds_done : boolean;
variable replace_way : way_t;
variable wait_qdrain : boolean;
begin
if rising_edge(system_clk) then
-- On reset, clear all valid bits to force misses
if system_reset = '1' then
if system_reset = '1' then
state <= CMD;
for i in index_t loop
cache_valids(i) <= (others => '0');
end loop;
state <= IDLE;
refill_cmd_valid <= '0';
else
else
-- Main state machine
case state is
case state is
when CMD =>
when IDLE =>
if (user_port0_cmd_ready and user_port0_cmd_valid) = '1' then
assert refill_cmd_valid = '0' report "refill cmd valid in IDLE state !"
state <= MWRITE when wb_in.we = '1' else MREAD;
severity failure;
-- If NO_LS_OVERLAP is set, disallow a load miss if the store
-- queue still has data in it.
wait_qdrain := false;
if NO_LS_OVERLAP then
wait_qdrain := storeq_rd_valid = '1';
end if;
-- We need to read a cache line
if req_op = OP_LOAD_MISS and not wait_qdrain then
-- Grab way to replace
replace_way := to_integer(unsigned(plru_victim(req_index)));
-- Force misses on that way while refilling that line
cache_valids(req_index)(replace_way) <= '0';
-- Store new tag in selected way
for i in 0 to NUM_WAYS-1 loop
if i = replace_way then
tagset := cache_tags(req_index);
write_tag(i, tagset, req_tag);
cache_tags(req_index) <= tagset;
end if;
end if;
when MWRITE =>
end loop;
if user_port0_wdata_ready = '1' then
state <= CMD;
-- Keep track of our index and way for subsequent stores
refill_index <= req_index;
refill_way <= replace_way;
refill_row <= get_row(req_laddr);
-- Prep for first DRAM read
--
-- XXX TODO: We could start a cycle early here by using
-- combo logic to generate the first command in
-- "dram_commands". In fact, we could make refill_cmd_addr
-- only contain the "counter" bits and wire it with the
-- other bits from req_laddr.
refill_cmd_addr <= req_laddr(DRAM_ABITS+3 downto 4);
refill_cmd_valid <= '1';
if TRACE then
report "refill addr " & to_hstring(req_laddr);
end if;
end if;
when MREAD =>
-- Track that we had one request sent
state <= REFILL_WAIT_ACK;
end if;
when REFILL_WAIT_ACK =>
-- Commands are all sent if user_port0_cmd_valid is 0
cmds_done := refill_cmd_valid = '0';
-- If we are still sending requests, was one accepted ?
if user_port0_cmd_ready = '1' and not cmds_done then
-- That was the last word ? We are done sending. Clear
-- command valid and set cmds_done so we can handle an
-- eventual last ack on the same cycle.
--
if TRACE then
report "got refill cmd ack !";
end if;
if is_last_row_addr(refill_cmd_addr) then
refill_cmd_valid <= '0';
cmds_done := true;
if TRACE then
report "all refill cmds done !";
end if;
else
-- Calculate the next row address
refill_cmd_addr <= next_row_addr(refill_cmd_addr);
if TRACE then
report "refill addr " &
to_hstring(next_row_addr(refill_cmd_addr));
end if;
end if;
end if;
-- Incoming read data processing
if user_port0_rdata_valid = '1' then
if user_port0_rdata_valid = '1' then
state <= CMD;
if TRACE then
report "got refill data ack !";
end if;
-- Check for completion
if cmds_done and is_last_row(refill_row) then
if TRACE then
report "all refill data done !";
end if;
-- Cache line is now valid
cache_valids(refill_index)(refill_way) <= '1';
-- We are done
state <= IDLE;
end if;
-- Increment store row counter
refill_row <= next_row(refill_row);
end if;
end if;
end case;
end case;
end if;
end if;