litedram: Add an L2 cache with store queue

This adds a cache between the wishbone and litedram with the following
features (at this point, it's still evolving)

  - 128-byte line width in order to have a reasonable amount of
litedram pipelining on the 128-bit wide data port.

  - Configurable geometry otherwise

  - Stores are acked immediately on wishbone, whether hit or miss
(minus a 2-cycle delay if there's a previous load response in the
way), and sent to LiteDRAM via an 8-entry (configurable) store queue
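
  - For reference, with the default geometry (128-byte lines, 32 lines,
4 ways) the cache data array works out to 128 * 32 * 4 = 16 KiB.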

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

@@ -50,7 +50,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
core.vhdl

soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl \
soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
wishbone_debug_master.vhdl xics.vhdl syscon.vhdl soc.vhdl

soc_sim_files = sim_console.vhdl sim_uart.vhdl sim_bram_helpers.vhdl \

File diff suppressed because one or more lines are too long

@@ -0,0 +1,84 @@
$ version 1.1

# Signals in entities :
/core_dram_tb/dram/rst
/core_dram_tb/dram/system_clk
/core_dram_tb/dram/system_reset
/core_dram_tb/dram/wb_in
/core_dram_tb/dram/wb_out
/core_dram_tb/dram/user_port0_cmd_valid
/core_dram_tb/dram/user_port0_cmd_ready
/core_dram_tb/dram/user_port0_cmd_we
/core_dram_tb/dram/user_port0_cmd_addr
/core_dram_tb/dram/user_port0_wdata_valid
/core_dram_tb/dram/user_port0_wdata_ready
/core_dram_tb/dram/user_port0_wdata_we
/core_dram_tb/dram/user_port0_wdata_data
/core_dram_tb/dram/user_port0_rdata_valid
/core_dram_tb/dram/user_port0_rdata_ready
/core_dram_tb/dram/user_port0_rdata_data
/core_dram_tb/dram/cache_tags
/core_dram_tb/dram/cache_valids
/core_dram_tb/dram/storeq_rd_ready
/core_dram_tb/dram/storeq_rd_valid
/core_dram_tb/dram/storeq_rd_data
/core_dram_tb/dram/storeq_wr_ready
/core_dram_tb/dram/storeq_wr_valid
/core_dram_tb/dram/storeq_wr_data
/core_dram_tb/dram/accept_store
/core_dram_tb/dram/state
/core_dram_tb/dram/wb_req
/core_dram_tb/dram/store_queued
/core_dram_tb/dram/read_ack_0
/core_dram_tb/dram/read_ack_1
/core_dram_tb/dram/read_ad3_0
/core_dram_tb/dram/read_ad3_1
/core_dram_tb/dram/read_way_0
/core_dram_tb/dram/read_way_1
/core_dram_tb/dram/req_index
/core_dram_tb/dram/req_row
/core_dram_tb/dram/req_hit_way
/core_dram_tb/dram/req_tag
/core_dram_tb/dram/req_op
/core_dram_tb/dram/req_laddr
/core_dram_tb/dram/req_ad3
/core_dram_tb/dram/req_we
/core_dram_tb/dram/req_wdata
/core_dram_tb/dram/store_way
/core_dram_tb/dram/store_index
/core_dram_tb/dram/store_row
/core_dram_tb/dram/cache_out
/core_dram_tb/dram/plru_victim
/core_dram_tb/dram/replace_way
/core_dram_tb/dram/rams/do_read
/core_dram_tb/dram/rams/do_write
/core_dram_tb/dram/rams/rd_addr
/core_dram_tb/dram/rams/wr_addr
/core_dram_tb/dram/rams/wr_data
/core_dram_tb/dram/rams/wr_sel
/core_dram_tb/dram/rams/wr_sel_m
/core_dram_tb/dram/rams/dout
/core_dram_tb/dram/rams/way/clk
/core_dram_tb/dram/rams/way/rd_en
/core_dram_tb/dram/rams/way/rd_addr
/core_dram_tb/dram/rams/way/rd_data
/core_dram_tb/dram/rams/way/wr_sel
/core_dram_tb/dram/rams/way/wr_addr
/core_dram_tb/dram/rams/way/wr_data
/core_dram_tb/dram/rams/way/rd_data0
/core_dram_tb/dram/store_queue/wr_ready
/core_dram_tb/dram/store_queue/wr_valid
/core_dram_tb/dram/store_queue/wr_data
/core_dram_tb/dram/store_queue/rd_ready
/core_dram_tb/dram/store_queue/rd_valid
/core_dram_tb/dram/store_queue/rd_data
/core_dram_tb/dram/store_queue/rd_idx
/core_dram_tb/dram/store_queue/rd_next
/core_dram_tb/dram/store_queue/wr_idx
/core_dram_tb/dram/store_queue/wr_next
/core_dram_tb/dram/store_queue/op_prev
/core_dram_tb/dram/store_queue/op_next
/core_dram_tb/dram/store_queue/full
/core_dram_tb/dram/store_queue/empty
/core_dram_tb/dram/store_queue/push
/core_dram_tb/dram/store_queue/pop

@@ -5,16 +5,34 @@ use std.textio.all;

library work;
use work.wishbone_types.all;
use work.utils.all;
use work.helpers.all;

entity litedram_wrapper is
generic (
DRAM_ABITS : positive;
DRAM_ALINES : positive;

-- Pseudo-ROM payload
PAYLOAD_SIZE : natural;
PAYLOAD_FILE : string;

-- L2 cache --

-- Line size in bytes
LINE_SIZE : positive := 128;
-- Number of lines in a set
NUM_LINES : positive := 32;
-- Number of ways
NUM_WAYS : positive := 4;
-- Max number of stores in the queue
STOREQ_DEPTH : positive := 8;
-- Don't send loads until all pending stores acked in litedram
NO_LS_OVERLAP : boolean := false;

-- Debug
LITEDRAM_TRACE : boolean := false
LITEDRAM_TRACE : boolean := false;
TRACE : boolean := false
);
port(
-- LiteDRAM generates the system clock and reset
@@ -123,13 +141,11 @@ architecture behaviour of litedram_wrapper is
signal user_port0_rdata_ready : std_ulogic;
signal user_port0_rdata_data : std_ulogic_vector(127 downto 0);

signal ad3 : std_ulogic;

signal wb_ctrl_adr : std_ulogic_vector(29 downto 0);
signal wb_ctrl_dat_w : std_ulogic_vector(31 downto 0);
signal wb_ctrl_dat_r : std_ulogic_vector(31 downto 0);
signal wb_ctrl_sel : std_ulogic_vector(3 downto 0);
signal wb_ctrl_cyc : std_ulogic;
signal wb_ctrl_cyc : std_ulogic := '0';
signal wb_ctrl_stb : std_ulogic;
signal wb_ctrl_ack : std_ulogic;
signal wb_ctrl_we : std_ulogic;
@@ -137,11 +153,239 @@ architecture behaviour of litedram_wrapper is
signal wb_init_in : wb_io_master_out;
signal wb_init_out : wb_io_slave_out;

type state_t is (CMD, MWRITE, MREAD);
-- DRAM data port width
constant DRAM_DBITS : natural := 128;
constant DRAM_SBITS : natural := (DRAM_DBITS / 8);

-- BRAM organisation: We never access more than wishbone_data_bits at
-- a time so to save resources we make the array only that wide, and
-- use consecutive indices to make a cache "line"
--
-- ROW_SIZE is the width in bytes of the BRAM (based on litedram, so 128-bits)
constant ROW_SIZE : natural := DRAM_DBITS / 8;
-- ROW_PER_LINE is the number of rows (litedram transactions) in a line
constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
-- BRAM_ROWS is the number of rows in BRAM needed to represent the full
-- cache
constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;

-- Bit fields counts in the address

-- ROW_BITS is the number of bits to select a row
constant ROW_BITS : natural := log2(BRAM_ROWS);
-- ROW_LINEBITS is the number of bits to select a row within a line
constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
-- LINE_OFF_BITS is the number of bits for the offset in a cache line
constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
-- ROW_OFF_BITS is the number of bits for the offset in a row
constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
-- REAL_ADDR_BITS is the number of real address bits that we store
constant REAL_ADDR_BITS : positive := DRAM_ABITS + ROW_OFF_BITS;
-- INDEX_BITS is the number of bits to select a cache line
constant INDEX_BITS : natural := log2(NUM_LINES);
-- SET_SIZE_BITS is the log base 2 of the set size
constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
-- TAG_BITS is the number of bits of the tag part of the address
constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
-- WAY_BITS is the number of bits to select a way
constant WAY_BITS : natural := log2(NUM_WAYS);
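
-- For reference, with the default generics (LINE_SIZE = 128,
-- NUM_LINES = 32, NUM_WAYS = 4) and the 128-bit DRAM port, these
-- work out to: ROW_OFF_BITS = 4, ROW_LINEBITS = 3, LINE_OFF_BITS = 7,
-- INDEX_BITS = 5, SET_SIZE_BITS = 12 and WAY_BITS = 2, i.e. an
-- address decomposes as | tag | 5-bit index | 7-bit line offset |.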

subtype row_t is integer range 0 to BRAM_ROWS-1;
subtype index_t is integer range 0 to NUM_LINES-1;
subtype way_t is integer range 0 to NUM_WAYS-1;

-- The cache data BRAM organized as described above for each way
subtype cache_row_t is std_ulogic_vector(DRAM_DBITS-1 downto 0);

-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-- not handle a clean (commented) definition of the cache tags as a 3d
-- memory. For now, work around it by putting all the tags of a set
-- in one wide row of LUTRAM.
subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
-- type cache_tags_set_t is array(way_t) of cache_tag_t;
-- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
type cache_tags_array_t is array(index_t) of cache_tags_set_t;

-- The cache valid bits
subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
type cache_valids_t is array(index_t) of cache_way_valids_t;

-- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
signal cache_tags : cache_tags_array_t;
signal cache_valids : cache_valids_t;

attribute ram_style : string;
attribute ram_style of cache_tags : signal is "distributed";

--
-- Store queue signals
--
-- We store a single wishbone dword per entry (64-bit) but all
-- 16 sel bits for the DRAM.
-- XXX Investigate storing only AD3 and 8 sel bits if it's better
constant STOREQ_BITS : positive := wishbone_data_bits + DRAM_SBITS;
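--
-- A queue entry is thus | 64-bit wishbone data | 16 DRAM sel bits |:
-- it is packed as "wb_req.dat & req_we" on the write side below and
-- split back apart when popped towards the DRAM write data port.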

signal storeq_rd_ready : std_ulogic;
signal storeq_rd_valid : std_ulogic;
signal storeq_rd_data : std_ulogic_vector(STOREQ_BITS-1 downto 0);
signal storeq_wr_ready : std_ulogic;
signal storeq_wr_valid : std_ulogic;
signal storeq_wr_data : std_ulogic_vector(STOREQ_BITS-1 downto 0);

--
-- Cache management signals
--

-- Cache state machine
type state_t is (IDLE, -- Normal load hit processing
REFILL_WAIT_ACK); -- Cache refill wait ack
signal state : state_t;

-- Latched WB request.
signal wb_req : wishbone_master_out := wishbone_master_out_init;

-- Read pipeline (to handle cache RAM latency)
signal read_ack_0 : std_ulogic;
signal read_ack_1 : std_ulogic;
signal read_ad3_0 : std_ulogic;
signal read_ad3_1 : std_ulogic;
signal read_way_0 : way_t;
signal read_way_1 : way_t;

-- Async signals decoding latched request
type req_op_t is (OP_NONE,
OP_LOAD_HIT,
OP_LOAD_MISS,
OP_STORE_HIT,
OP_STORE_MISS);

signal req_index : index_t;
signal req_row : row_t;
signal req_hit_way : way_t;
signal req_tag : cache_tag_t;
signal req_op : req_op_t;
signal req_laddr : std_ulogic_vector(REAL_ADDR_BITS-1 downto 0);
signal req_ad3 : std_ulogic;
signal req_we : std_ulogic_vector(DRAM_SBITS-1 downto 0);
signal req_wdata : std_ulogic_vector(DRAM_DBITS-1 downto 0);
signal accept_store : std_ulogic;

-- Line refill command signals and latches
signal refill_cmd_valid : std_ulogic;
signal refill_cmd_addr : std_ulogic_vector(DRAM_ABITS-1 downto 0);
signal refill_way : way_t;
signal refill_index : index_t;
signal refill_row : row_t;

-- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out : cache_ram_out_t;

-- PLRU output interface
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_victim : plru_out_t;

--
-- Helper functions to decode incoming requests
--

-- Return the cache line index (tag index) for an address
function get_index(addr: wishbone_addr_type) return index_t is
begin
return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)));
end;

-- Return the cache row index (data memory) for an address
function get_row(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto 0)) return row_t is
begin
return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)));
end;

-- Returns whether this is the last row of a line. It takes a DRAM address
function is_last_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS))
return boolean is
constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
begin
return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones;
end;

-- Returns whether this is the last row of a line
function is_last_row(row: row_t) return boolean is
variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
begin
row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
return row_v(ROW_LINEBITS-1 downto 0) = ones;
end;

-- Return the address of the next row in the current cache line. It takes a
-- DRAM address
function next_row_addr(addr: std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS))
return std_ulogic_vector is
variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
variable result : std_ulogic_vector(REAL_ADDR_BITS-1 downto ROW_OFF_BITS);
begin
-- Is there no simpler way in VHDL to generate that 3-bit adder ?
row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
result := addr;
result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
return result;
end;

-- Return the next row in the current cache line. We use a dedicated
-- function in order to limit the size of the generated adder to be
-- only the bits within a cache line (3 bits with default settings)
--
function next_row(row: row_t) return row_t is
variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
begin
row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
row_idx := row_v(ROW_LINEBITS-1 downto 0);
row_v(ROW_LINEBITS-1 downto 0) := std_ulogic_vector(unsigned(row_idx) + 1);
return to_integer(unsigned(row_v));
end;
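
-- For reference, with the default geometry the row index within a
-- line is addr(6 downto 4), so the next_row* helpers above only
-- increment those 3 bits and row 7 wraps back to row 0 of the same
-- line rather than carrying into the index bits.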

-- Get the tag value from the address
function get_tag(addr: wishbone_addr_type) return cache_tag_t is
begin
return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
end;

-- Read a tag from a tag memory row
function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
begin
return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
end;

-- Write a tag to tag memory row
procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
tag: cache_tag_t) is
begin
tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
end;
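
-- For example, with NUM_WAYS = 4 a tags row is laid out as
-- | way 3 tag | way 2 tag | way 1 tag | way 0 tag |, so
-- read_tag(1, tagset) returns tagset(2*TAG_BITS-1 downto TAG_BITS).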

begin

-- Sanity checks
assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
report "geometry bits don't add up" severity FAILURE;
assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
report "geometry bits don't add up" severity FAILURE;
assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
report "geometry bits don't add up" severity FAILURE;
assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
report "geometry bits don't add up" severity FAILURE;
assert (128 = DRAM_DBITS)
report "Can't yet handle a DRAM width that isn't 128-bits" severity FAILURE;

-- alternate core reset address set when DRAM is not initialized.
core_alt_reset <= not init_done;

@@ -170,7 +414,15 @@ begin
wb_init_in.stb <= wb_ctrl_in.stb;
wb_init_in.cyc <= wb_ctrl_in.cyc and wb_ctrl_is_init;

-- DRAM CSR IN signals
-- DRAM CSR IN signals. Extra latch to help with timing
csr_latch: process(system_clk)
begin
if rising_edge(system_clk) then
if system_reset = '1' then
wb_ctrl_cyc <= '0';
wb_ctrl_stb <= '0';
else
-- XXX Maybe only update addr when cyc = '1' to save power ?
wb_ctrl_adr <= x"0000" & wb_ctrl_in.adr(15 downto 2);
wb_ctrl_dat_w <= wb_ctrl_in.dat;
wb_ctrl_sel <= wb_ctrl_in.sel;
@@ -178,7 +430,19 @@ begin
wb_ctrl_cyc <= wb_ctrl_in.cyc and wb_ctrl_is_csr;
wb_ctrl_stb <= wb_ctrl_in.stb and wb_ctrl_is_csr;

-- Ctrl bus wishbone OUT signals
-- Clear stb on ack otherwise the memory will latch
-- the write twice which breaks levelling. On the next
-- cycle we will latch an updated stb that takes the
-- ack into account.
if wb_ctrl_ack = '1' then
wb_ctrl_stb <= '0';
end if;
end if;
end if;
end process;

-- Ctrl bus wishbone OUT signals. XXX Consider adding latch on
-- CSR response to help timing
wb_ctrl_out.ack <= wb_ctrl_ack when wb_ctrl_is_csr = '1'
else wb_init_out.ack;
wb_ctrl_out.dat <= wb_ctrl_dat_r when wb_ctrl_is_csr = '1'
@@ -186,56 +450,531 @@ begin
wb_ctrl_out.stall <= wb_init_out.stall when wb_ctrl_is_init else
'0' when wb_ctrl_in.cyc = '0' else not wb_ctrl_ack;


-- Generate a cache RAM for each way
rams: for i in 0 to NUM_WAYS-1 generate
signal do_read : std_ulogic;
signal do_write : std_ulogic;
signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
signal wr_data : std_ulogic_vector(DRAM_DBITS-1 downto 0);
signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
signal dout : cache_row_t;
begin
way: entity work.cache_ram
generic map (
ROW_BITS => ROW_BITS,
WIDTH => DRAM_DBITS,
ADD_BUF => true
)
port map (
clk => system_clk,
rd_en => do_read,
rd_addr => rd_addr,
rd_data => dout,
wr_sel => wr_sel_m,
wr_addr => wr_addr,
wr_data => wr_data
);
process(all)
begin
--
-- Read port
--
do_read <= '1';
cache_out(i) <= dout;
rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));

--
-- Write mux: cache refills from DRAM or writes from Wishbone
--
if state = IDLE then
-- Write from wishbone
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= req_wdata;
wr_sel <= req_we;
else
-- Refill from DRAM
wr_data <= user_port0_rdata_data;
wr_sel <= (others => '1');
wr_addr <= std_ulogic_vector(to_unsigned(refill_row, ROW_BITS));
end if;

--
-- Write enable logic
--
do_write <= '0';
if req_op = OP_STORE_HIT and req_hit_way = i then
do_write <= '1';
elsif user_port0_rdata_valid = '1' and refill_way = i then
do_write <= '1';
end if;

-- Mask write selects with do_write since BRAM doesn't always
-- have a global write-enable (Vivado generates TDP instead
-- of SDP when using one, thus doubling cache BRAM usage).
for i in 0 to ROW_SIZE-1 loop
wr_sel_m(i) <= wr_sel(i) and do_write;
end loop;

if TRACE and rising_edge(system_clk) then
if do_write = '1' then
report "cache write way:" & integer'image(i) &
" addr:" & to_hstring(wr_addr) &
" sel:" & to_hstring(wr_sel_m) &
" data:" & to_hstring(wr_data);
end if;
end if;
end process;
end generate;

-- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate
begin
plrus: for i in 0 to NUM_LINES-1 generate
-- PLRU interface
signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
signal plru_acc_en : std_ulogic;
signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
begin
plru : entity work.plru
generic map (
BITS => WAY_BITS
)
port map (
clk => system_clk,
rst => system_reset,
acc => plru_acc,
acc_en => plru_acc_en,
lru => plru_out
);

process(req_index, req_op, req_hit_way, plru_out)
begin
-- PLRU interface
if (req_op = OP_LOAD_HIT or
req_op = OP_STORE_HIT) and req_index = i then
plru_acc_en <= '1';
else
plru_acc_en <= '0';
end if;
plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));
plru_victim(i) <= plru_out;
end process;
end generate;
end generate;

--
-- Wishbone interface:
--
-- - Incoming wishbone request latch (to help with timing)
-- - Read response pipeline (to match BRAM output buffer delay)
-- - Stall generation
--
-- XXX TODO: Properly handle cyc drops before all acks are sent...
--
request_latch: process(system_clk)
begin
if rising_edge(system_clk) then
-- We can latch a new request if we are idle (for now). We also
-- latch the absence of a request. This pipeline takes one request
-- per cycle unless we are non-IDLE.
--
if wb_out.stall = '0' then
-- Avoid constantly updating addr/data for unrelated requests
if wb_in.cyc = '1' then
wb_req <= wb_in;
else
wb_req.cyc <= wb_in.cyc;
wb_req.stb <= wb_in.stb;
end if;

if TRACE then
if wb_in.cyc = '1' and wb_in.stb = '1' then
report "latch new wb req ! addr:" & to_hstring(wb_in.adr) &
" we:" & std_ulogic'image(wb_in.we) &
" sel:" & to_hstring(wb_in.sel);
end if;
end if;
end if;
end if;
end process;

--
-- Read response pipeline
--
-- XXX Might have to put store acks in there too (see comment in wb_response)
read_pipe: process(system_clk)
begin
if rising_edge(system_clk) then
read_ack_0 <= '1' when req_op = OP_LOAD_HIT else '0';
read_ad3_0 <= req_ad3;
read_way_0 <= req_hit_way;

read_ack_1 <= read_ack_0;
read_ad3_1 <= read_ad3_0;
read_way_1 <= read_way_0;

if TRACE then
if req_op = OP_LOAD_HIT then
report "Load hit addr:" & to_hstring(wb_req.adr) &
" idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) &
" way:" & integer'image(req_hit_way);
elsif req_op = OP_LOAD_MISS then
report "Load miss addr:" & to_hstring(wb_req.adr);
end if;
if read_ack_0 = '1' then
report "read data:" & to_hstring(cache_out(read_way_0));
end if;
end if;
end if;
end process;
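
-- For reference: the cache RAMs are instantiated with ADD_BUF => true,
-- so read data arrives two cycles after a hit is detected; the two
-- read_ack_* stages above delay the wishbone ack to line up with it
-- (the "2 cycles delay" mentioned in the commit message).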

wb_response: process(all)
variable rdata : std_ulogic_vector(DRAM_DBITS-1 downto 0);
variable store_done : std_ulogic;
begin
-- Can we accept a store ? This is set when IDLE and the store
-- queue & command queue are not full.
--
-- Data bus wishbone to LiteDRAM native port
-- Note: This is only used to control the WB request latch, stall
-- and store "early complete". We don't want to use this to control
-- cmd_valid to DRAM as this would create a circular dependency inside
-- LiteDRAM as cmd_ready I think is driven from cmd_valid.
--
-- Address bit 3 selects the top or bottom half of the data
-- bus (64-bit wishbone vs. 128-bit DRAM interface)
-- The state machine that controls the command queue must thus
-- reproduce this logic at least partially.
--
-- XXX TODO: Figure out how to pipeline this
-- Note also that user_port0_cmd_ready from LiteDRAM is combinational
-- from user_port0_cmd_valid, i.e. we won't know that LiteDRAM cannot
-- accept a command until we try to send one.
--
ad3 <= wb_in.adr(3);
if state = IDLE then
accept_store <= user_port0_cmd_ready and storeq_wr_ready;

-- Wishbone port IN signals
user_port0_cmd_valid <= wb_in.cyc and wb_in.stb when state = CMD else '0';
user_port0_cmd_we <= wb_in.we when state = CMD else '0';
user_port0_wdata_valid <= '1' when state = MWRITE else '0';
user_port0_rdata_ready <= '1' when state = MREAD else '0';
user_port0_cmd_addr <= wb_in.adr(DRAM_ABITS+3 downto 4);
user_port0_wdata_data <= wb_in.dat & wb_in.dat;
user_port0_wdata_we <= wb_in.sel & "00000000" when ad3 = '1' else
"00000000" & wb_in.sel;
-- Corner case !!! The read acks pipeline takes two extra cycles
-- which means a store ack can collide with a previous load hit
-- ack. Thus we stall stores if we have a load ack pending.
if read_ack_0 = '1' or read_ack_1 = '1' then
accept_store <= '0';
end if;
else
accept_store <= '0';
end if;

-- Generate stalls. For loads, we stall if we are going to take a load
-- miss or are in the middle of a refill. For stores, if we can't
-- accept it.
case state is
when IDLE =>
case req_op is
when OP_LOAD_MISS =>
wb_out.stall <= '1';
when OP_STORE_MISS | OP_STORE_HIT =>
wb_out.stall <= not accept_store;
when others =>
wb_out.stall <= '0';
end case;
when REFILL_WAIT_ACK =>
wb_out.stall <= '1';
end case;
-- Wishbone OUT signals
wb_out.ack <= user_port0_wdata_ready when state = MWRITE else
user_port0_rdata_valid when state = MREAD else '0';
-- Data out mux
rdata := cache_out(read_way_1);
wb_out.dat <= rdata(127 downto 64) when read_ad3_1 = '1' else rdata(63 downto 0);

wb_out.dat <= user_port0_rdata_data(127 downto 64) when ad3 = '1' else
user_port0_rdata_data(63 downto 0);
-- Early-complete stores on wishbone.
if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
store_done := accept_store;
else
store_done := '0';
end if;

-- We don't do pipelining yet.
wb_out.stall <= '0' when wb_in.cyc = '0' else not wb_out.ack;
-- Generate ACKs on read hits and store complete
--
-- XXX TODO: This can happen on stores right behind loads !
-- This probably needs to be fixed by putting store acks in
-- the same pipeline as the read acks. TODO: Create a testbench
-- to exercise those corner cases as the core can't yet.
--
wb_out.ack <= read_ack_1 or store_done;
assert read_ack_0 = '0' or store_done = '0' report
"Read ack and store ack collision !"
severity failure;
end process;

-- DRAM user port State machine
sm: process(system_clk)
--
-- Cache request decode
--
request_decode: process(all)
variable valid : std_ulogic;
variable is_hit : std_ulogic;
variable hit_way : way_t;
begin
-- Extract line, row and tag from request
req_index <= get_index(wb_req.adr);
req_row <= get_row(wb_req.adr(REAL_ADDR_BITS-1 downto 0));
req_tag <= get_tag(wb_req.adr);

-- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed
req_laddr <= wb_req.adr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0');


-- Do we have a valid request in the WB latch ?
if state = IDLE then
valid := wb_req.cyc and wb_req.stb;
else
valid := '0';
end if;

-- Store signals
req_ad3 <= wb_req.adr(3);
req_wdata <= wb_req.dat & wb_req.dat;
req_we <= wb_req.sel & "00000000" when req_ad3 = '1' else
"00000000" & wb_req.sel;

-- Test if pending request is a hit on any way
hit_way := 0;
is_hit := '0';
for i in way_t loop
if valid = '1' and cache_valids(req_index)(i) = '1' then
if read_tag(i, cache_tags(req_index)) = req_tag then
hit_way := i;
is_hit := '1';
end if;
end if;
end loop;

-- Generate the req op. We only allow OP_LOAD_* when in the
-- IDLE state as our PLRU and ACK generation rely on this;
-- stores are likewise only accepted in the IDLE state.
--
req_op <= OP_NONE;
if valid = '1' then
if wb_req.we = '1' then
if is_hit = '1' then
req_op <= OP_STORE_HIT;
else
req_op <= OP_STORE_MISS;
end if;
else
if is_hit = '1' then
req_op <= OP_LOAD_HIT;
else
req_op <= OP_LOAD_MISS;
end if;
end if;
end if;
req_hit_way <= hit_way;
end process;

--
-- Store queue
--
-- For now, queue up to STOREQ_DEPTH stores (8 by default)
store_queue: entity work.sync_fifo
generic map (
DEPTH => STOREQ_DEPTH,
WIDTH => STOREQ_BITS
)
port map (
clk => system_clk,
reset => system_reset,
rd_ready => storeq_rd_ready,
rd_valid => storeq_rd_valid,
rd_data => storeq_rd_data,
wr_ready => storeq_wr_ready,
wr_valid => storeq_wr_valid,
wr_data => storeq_wr_data
);

storeq_control : process(all)
variable stq_data : wishbone_data_type;
variable stq_sel : std_ulogic_vector(DRAM_SBITS-1 downto 0);
begin
storeq_wr_data <= wb_req.dat & req_we;

-- Only accept store if we can send a command
if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
storeq_wr_valid <= user_port0_cmd_ready;
else
storeq_wr_valid <= '0';
end if;

stq_data := storeq_rd_data(storeq_rd_data'left downto DRAM_SBITS);
stq_sel := storeq_rd_data(DRAM_SBITS-1 downto 0);
user_port0_wdata_data <= stq_data & stq_data;
user_port0_wdata_we <= stq_sel;
user_port0_wdata_valid <= storeq_rd_valid;
storeq_rd_ready <= user_port0_wdata_ready;

if TRACE then
if rising_edge(system_clk) then
if req_op = OP_STORE_HIT then
report "Store hit to:" &
to_hstring(wb_req.adr(DRAM_ABITS+3 downto 0)) &
" data:" & to_hstring(req_wdata) &
" we:" & to_hstring(req_we) &
" V:" & std_ulogic'image(accept_store);
else
report "Store miss to:" &
to_hstring(wb_req.adr(DRAM_ABITS+3 downto 0)) &
" data:" & to_hstring(req_wdata) &
" we:" & to_hstring(req_we) &
" V:" & std_ulogic'image(accept_store);
end if;
if storeq_wr_valid = '1' and storeq_wr_ready = '1' then
report "storeq push " & to_hstring(storeq_wr_data);
end if;
if storeq_rd_valid = '1' and storeq_rd_ready = '1' then
report "storeq pop " & to_hstring(storeq_rd_data);
end if;
end if;
end if;
end process;

-- LiteDRAM command mux
dram_commands: process(all)
begin
if state = IDLE and (req_op = OP_STORE_HIT or req_op = OP_STORE_MISS) then
-- For stores, forward signals directly. Only send command if
-- the FIFO can accept a store
user_port0_cmd_addr <= wb_req.adr(DRAM_ABITS+3 downto 4);
user_port0_cmd_we <= '1';
user_port0_cmd_valid <= storeq_wr_ready;
else
-- For loads, we route via a latch controlled by the refill machine
user_port0_cmd_addr <= refill_cmd_addr;
user_port0_cmd_valid <= refill_cmd_valid;
user_port0_cmd_we <= '0';
end if;
user_port0_rdata_ready <= '1'; -- Always 1
end process;

-- LiteDRAM refill machine
--
-- This handles the cache line refills
--
refill_machine : process(system_clk)
variable tagset : cache_tags_set_t;
variable cmds_done : boolean;
variable replace_way : way_t;
variable wait_qdrain : boolean;
begin
if rising_edge(system_clk) then
-- On reset, clear all valid bits to force misses
if system_reset = '1' then
state <= CMD;
for i in index_t loop
cache_valids(i) <= (others => '0');
end loop;
state <= IDLE;
refill_cmd_valid <= '0';
else
-- Main state machine
case state is
when CMD =>
if (user_port0_cmd_ready and user_port0_cmd_valid) = '1' then
state <= MWRITE when wb_in.we = '1' else MREAD;
when IDLE =>
assert refill_cmd_valid = '0' report "refill cmd valid in IDLE state !"
severity failure;

-- If NO_LS_OVERLAP is set, disallow a load miss if the store
-- queue still has data in it.
wait_qdrain := false;
if NO_LS_OVERLAP then
wait_qdrain := storeq_rd_valid = '1';
end if;

-- We need to read a cache line
if req_op = OP_LOAD_MISS and not wait_qdrain then
-- Grab way to replace
replace_way := to_integer(unsigned(plru_victim(req_index)));

-- Force misses on that way while refilling that line
cache_valids(req_index)(replace_way) <= '0';

-- Store new tag in selected way
for i in 0 to NUM_WAYS-1 loop
if i = replace_way then
tagset := cache_tags(req_index);
write_tag(i, tagset, req_tag);
cache_tags(req_index) <= tagset;
end if;
when MWRITE =>
if user_port0_wdata_ready = '1' then
state <= CMD;
end loop;

-- Keep track of our index and way for the subsequent refill writes
refill_index <= req_index;
refill_way <= replace_way;
refill_row <= get_row(req_laddr);

-- Prep for first DRAM read
--
-- XXX TODO: We could start a cycle early here by using
-- combo logic to generate the first command in
-- "dram_commands". In fact, we could make refill_cmd_addr
-- only contain the "counter" bits and wire it with the
-- other bits from req_laddr.
refill_cmd_addr <= req_laddr(DRAM_ABITS+3 downto 4);
refill_cmd_valid <= '1';

if TRACE then
report "refill addr " & to_hstring(req_laddr);
end if;
when MREAD =>

-- Track that we had one request sent
state <= REFILL_WAIT_ACK;
end if;

when REFILL_WAIT_ACK =>
-- Commands are all sent if user_port0_cmd_valid is 0
cmds_done := refill_cmd_valid = '0';

-- If we are still sending requests, was one accepted ?
if user_port0_cmd_ready = '1' and not cmds_done then
-- That was the last word ? We are done sending. Clear
-- command valid and set cmds_done so we can handle an
-- eventual last ack on the same cycle.
--
if TRACE then
report "got refill cmd ack !";
end if;
if is_last_row_addr(refill_cmd_addr) then
refill_cmd_valid <= '0';
cmds_done := true;
if TRACE then
report "all refill cmds done !";
end if;
else
-- Calculate the next row address
refill_cmd_addr <= next_row_addr(refill_cmd_addr);
if TRACE then
report "refill addr " &
to_hstring(next_row_addr(refill_cmd_addr));
end if;
end if;
end if;

-- Incoming read data processing
if user_port0_rdata_valid = '1' then
state <= CMD;
if TRACE then
report "got refill data ack !";
end if;
-- Check for completion
if cmds_done and is_last_row(refill_row) then
if TRACE then
report "all refill data done !";
end if;
-- Cache line is now valid
cache_valids(refill_index)(refill_way) <= '1';
-- We are done
state <= IDLE;
end if;

-- Increment refill row counter
refill_row <= next_row(refill_row);
end if;
end case;
end if;

@@ -9,9 +9,9 @@
#define CONFIG_CPU_NOP "nop"

#ifdef __SIM__
#define MEMTEST_BUS_SIZE 16
#define MEMTEST_DATA_SIZE 16
#define MEMTEST_ADDR_SIZE 16
#define MEMTEST_BUS_SIZE 512//16
#define MEMTEST_DATA_SIZE 1024//16
#define MEMTEST_ADDR_SIZE 128//16
#define CONFIG_SIM_DISABLE_DELAYS
#endif


File diff suppressed because it is too large

@@ -1,5 +1,5 @@
//--------------------------------------------------------------------------------
// Auto-generated by Migen (0d16e03) & LiteX (564d731a) on 2020-05-26 20:37:38
// Auto-generated by Migen (0d16e03) & LiteX (564d731a) on 2020-05-30 20:25:53
//--------------------------------------------------------------------------------
module litedram_core(
input wire clk,

File diff suppressed because it is too large

@@ -1,5 +1,5 @@
//--------------------------------------------------------------------------------
// Auto-generated by Migen (0d16e03) & LiteX (564d731a) on 2020-05-26 20:37:40
// Auto-generated by Migen (0d16e03) & LiteX (564d731a) on 2020-05-30 20:25:55
//--------------------------------------------------------------------------------
module litedram_core(
input wire clk,

@@ -510,7 +510,7 @@ a64b5a7d14004a39
0000000000000000
0000000000000000
0000000000000000
384296003c4c0001
384297003c4c0001
fbc1fff07c0802a6
f8010010fbe1fff8
3be10020f821fe91
@@ -519,11 +519,11 @@ f8c101a838800140
38c101987c651b78
7fe3fb78f8e101b0
f92101c0f90101b8
48000da5f94101c8
48000d65f94101c8
7c7e1b7860000000
480008bd7fe3fb78
4800087d7fe3fb78
3821017060000000
480013647fc3f378
480013247fc3f378
0100000000000000
4e80002000000280
0000000000000000
@@ -531,67 +531,67 @@ f92101c0f90101b8
4e8000204c00012c
0000000000000000
3c4c000100000000
7c0802a63842955c
7c0802a63842965c
7d800026fbe1fff8
91810008f8010010
480007b1f821ff91
48000771f821ff91
3c62ffff60000000
4bffff3538637de8
4bffff3538637ca8
548400023880ffff
7c8026ea7c0004ac
3fe0c0003c62ffff
63ff000838637e08
63ff000838637cc8
3c62ffff4bffff11
38637e287bff0020
38637ce87bff0020
7c0004ac4bffff01
73e900017fe0feea
3c62ffff41820010
4bfffee538637e40
4bfffee538637d00
4d80000073e90002
3c62ffff41820010
4bfffecd38637e48
4bfffecd38637d08
4e00000073e90004
3c62ffff41820010
4bfffeb538637e50
3be2804860000000
4bfffeb538637d10
3bff7fa03fe2ffff
4bfffea57fe3fb78
3c80c00041920028
7884002060840010
7c8026ea7c0004ac
7884b2823c62ffff
4bfffe7d38637e58
4bfffe7d38637d18
3c80c000418e004c
7884002060840018
7c8026ea7c0004ac
788465023c62ffff
4bfffe5538637e78
4bfffe5538637d38
608400303c80c000
7c0004ac78840020
3c62ffff7c8026ea
38637e987884b282
38637d587884b282
3d20c0004bfffe31
7929002061290020
7d204eea7c0004ac
3c62ffff3c80000f
38637eb860844240
38637d7860844240
4bfffe057c892392
4bfffdfd7fe3fb78
3ca2ffff418e0028
3c62ffff3c82ffff
38847ee838a57ed8
4bfffddd38637ef0
60000000480004c1
38847da838a57d98
4bfffddd38637db0
6000000048000481
3c62ffff41920020
4bfffdc538637f20
4bfffdc538637de0
8181000838210070
480011807d818120
38637f383c62ffff
480011407d818120
38637df83c62ffff
3c80f0004bfffda9
6084400038a0ffff
7884002054a50422
480008553c604000
480008153c604000
3c62ffff60000000
4bfffd7d38637f58
4bfffd7d38637e18
e801001038210070
ebe1fff881810008
7d8181207c0803a6
@@ -605,138 +605,130 @@ ebe1fff881810008
4e8000207d20572a
0000000000000000
3c4c000100000000
7c0802a63842930c
7c0802a63842940c
614a08003d40c010
794a002039200001
f821ffa1f8010010
7d20572a7c0004ac
3862802860000000
38637f803c62ffff
600000004bfffce1
e801001038210060
4e8000207c0803a6
0100000000000000
3c4c000100000080
7c0802a6384292b4
6129000c3d204000
3fc0aaaa48000ffd
f821ff713f804000
63deaaaa3fa04000
639c00043fe04000
93df000063bd0008
93dd000093dc0000
4bfffce993c90000
813f000060000000
7d29f278815c0000
7d2900347f8af000
692900015529d97e
7fff07b43be90001
7d3f07b4409e0008
7f89f000813d0000
3bff0001419e000c
3d2040007fff07b4
812900006129000c
2f8aaaaa6d2a5555
3bff0001419e000c
3fc055557fff07b4
3d2040003fa04000
3f80400063de5555
63bd000461290008
93dd000093dc0000
3d20400093c90000
93c900006129000c
600000004bfffc4d
7f89f000813c0000
3bff0001419e000c
813d00007fff07b4
2f8a55556d2a5555
3bff0001419e000c
3d2040007fff07b4
8129000061290008
2f8a55556d2a5555
3bff0001419e000c
3d2040007fff07b4
812900006129000c
2f8a55556d2a5555
3bff0001419e0028
3c62ffff7fff07b4
7fe4fb7838a00100
4bfffb5538637f70
4800000c60000000
409effe02fbf0000
3ce0802039000004
60e700037d0903a6
392000013d404000
7928f84278e70020
7d2900d0792907e0
7d293838394a0004
912afffc7d294278
4bfffb794200ffe4
3900000460000000
7d0903a63ce08020
3d40400060e70003
392000013bc00000
7928f84278e70020
7d2900d0792907e0
7d2942787d293838
7f884840810a0000
3bde0001419e000c
394a00047fde07b4
2fbe00004200ffd4
3c62ffff419e001c
7fc4f37838a00004
4bfffa9538637f98
3d20400060000000
6129000839400000
914900003ba00000
394000013d204000
914900006129000c
394000023d204000
9149000061290010
394000033d204000
9149000061290014
600000004bfffabd
3940000039200004
3d2a10007d2903a6
8129000879291764
7f8950005529043e
3bbd0001419e000c
394a00017fbd07b4
2fbd00004200ffdc
3c62ffff419e001c
7fa4eb7838a00004
4bfff9f538637fc0
7ffefa1460000000
7fffea143bc00000
409e00a42f9f0000
38637fe83c62ffff
600000004bfff9d1
3f8040007f5602a6
639c00043f604000
3fa0400039200001
3fc0400093db0000
63bd0008913c0000
63de000c39200002
39200003913d0000
7ff602a6913e0000
600000004bfff9fd
815b00007d3602a6
815d0000815c0000
7cb602a6815e0000
7ca5485038803200
7ca42b967d3fd050
3c62ffff7c844b96
788404a078a504a0
3bc0000138637ff8
600000004bfff941
7fc3f37838210090
0000000048000cd4
0000068001000000
38428f183c4c0001
600000007c0802a6
48000c6138628050
7c0802a6384293b4
38637e303c62ffff
f821ff7148000fc1
600000004bfffca1
3d40aaaa39000080
3d2040007d0903a6
91490000614aaaaa
4200fff839290004
4bfffce93f60aaaa
3fa0aaaa60000000
637baaaa3f82ffff
3be000003bc00000
3b9c7e4063bdaaaa
3d3e10007b7b0020
792917647fc407b4
7f85e80080a90000
3bff0001419e001c
7f83e3787f66db78
4bfffc257fff07b4
3bde000160000000
409effc82bbe0080
3d40555539000080
3d2040007d0903a6
91490000614a5555
4200fff839290004
600000004bfffc65
3f82ffff3fa05555
3bc000003f605555
3b9c7e6063bd5555
3d3e1000637b5555
792917647fc407b4
7f85e80080a90000
3bff0001419e001c
7f83e3787f66db78
4bfffba57fff07b4
3bde000160000000
409effc82bbe0080
419e001c2fbf0000
38a001003c62ffff
38637e807fe4fb78
600000004bfffb79
3fc2ffff3c62ffff
3bde7ec038637ea8
600000004bfffb61
3d20400039400100
390000017d4903a6
3929000439480001
9149fffc79480020
4bfffba94200fff0
3940010060000000
7d4903a639200000
3d09100038c00001
7908176439460001
794600207d2407b4
7f8a284080a80000
7fc3f378419e0014
600000004bfffaf9
392900014bffff98
3c62ffff4200ffcc
4bfffadd38637ee0
3920002060000000
7d2903a639400000
794800203d2a1000
394a000139290002
9109000079291764
4bfffb214200ffe8
3f82ffff60000000
3bc000003ba00000
3d3d10003b9c7ef8
792917647fa607b4
5529043e81290008
7d2507b47f893000
3bde0001419e001c
7f83e3787cc43378
4bfffa657fde07b4
3bbd000160000000
409effc02bbd0020
419e001c2fbe0000
38a000203c62ffff
38637f187fc4f378
600000004bfffa39
386000007ffff214
409e00b02f9f0000
38637f403c62ffff
600000004bfffa19
394001007c9602a6
7d4903a678840020
3d49100039200000
794a176479280020
910a000039290001
7ff602a64200ffec
3fe0000c7c9f2050
7fff239663ff8000
600000004bfffa45
7d3602a67bff0020
7929002039000100
3d4040007d0903a6
394a0004810a0000
7cb602a64200fff8
3ca0000c7d254850
3c62ffff60a58000
7fe4fb787ca54b96
78a5032038637f50
600000004bfff981
3821009038600001
0000000048000cd8
0000058001000000
384290583c4c0001
3c62ffff7c0802a6
48000c6138637fa8
3f60c010f821ff71
637b10003be00000
4bfff8f57b7b0020
4bfff9357b7b0020
7c0004ac60000000
3f40c0107fe0df2a
7b5a0020635a1008
@@ -756,22 +748,22 @@ f821ff713f804000
7d20ef2a7c0004ac
7c0004ac39200002
3860000f7d20f72a
7c0004ac4bfffb09
7c0004ac4bfffb49
392000037fe0ef2a
7d20f72a7c0004ac
4bfffaed3860000f
4bfffb2d3860000f
7c0004ac39200006
3b8000017d20ef2a
7f80f72a7c0004ac
4bfffacd3860000f
4bfffb0d3860000f
7c0004ac39200920
7c0004ac7d20ef2a
3860000f7fe0f72a
392004004bfffab1
392004004bfffaf1
7d20ef2a7c0004ac
7fe0f72a7c0004ac
4bfffa9538600003
4bfffb294bfffad5
4bfffad538600003
4bfffb694bfffb15
4082001c2c230000
7f80df2a7c0004ac
7f80d72a7c0004ac
@@ -780,27 +772,27 @@ f821ff713f804000
4bffffec38600001
0100000000000000
3c4c000100000680
3d20c00038428d94
3d20c00038428ed4
6129200060000000
f92280b879290020
f922801079290020
612900203d20c000
7c0004ac79290020
3d40001c7d204eea
7d295392614a2000
394a0018e94280b8
394a0018e9428010
7c0004ac3929ffff
4e8000207d2057ea
0000000000000000
3c4c000100000000
6000000038428d34
39290010e92280b8
6000000038428e74
39290010e9228010
7d204eea7c0004ac
4082ffe871290008
e94280b85469063e
e94280105469063e
7d2057ea7c0004ac
000000004e800020
0000000000000000
38428cf03c4c0001
38428e303c4c0001
fbc1fff07c0802a6
3bc3fffffbe1fff8
f821ffd1f8010010
@@ -874,7 +866,7 @@ f924000039290002
7c6307b43863ffe0
000000004e800020
0000000000000000
38428aa03c4c0001
38428be03c4c0001
3d2037367c0802a6
612935347d908026
65293332792907c6
@@ -908,7 +900,7 @@ fbfd00007fe9fa14
4bfffff07d29f392
0300000000000000
3c4c000100000580
7c0802a638428994
7c0802a638428ad4
f821ffb1480006e9
7c7f1b78eb630000
7cbd2b787c9c2378
@@ -924,7 +916,7 @@ f821ffb1480006e9
4bffffb8f93f0000
0100000000000000
3c4c000100000580
7c0802a638428914
7c0802a638428a54
f821ffa148000661
7c9b23787c7d1b78
388000007ca32b78
@@ -955,16 +947,16 @@ e95d00009b270000
f95d0000394a0001
000000004bffffa8
0000078001000000
384288183c4c0001
384289583c4c0001
480005397c0802a6
7c741b79f821fed1
38600000f8610060
2fa4000041820068
39210040419e0060
3ac4ffff60000000
3ac4ffff3e42ffff
f92100703b410020
3ae0000060000000
3a428068392280b0
3a527fc039228008
f92100783ba10060
ebc1006089250000
419e00102fa90000
@@ -1196,16 +1188,35 @@ e8010010ebc1fff0
20676e69746f6f42
415244206d6f7266
0000000a2e2e2e4d
20747365746d656d
000a2e2e2e737562
7830203a7375625b
7830203a5d783025
2073762078383025
000a783830257830
257830207375625b
257830203a5d7830
3020737620783830
00000a7838302578
20747365746d654d
6c69616620737562
252f6425203a6465
73726f7272652064
000000000000000a
20747365746d654d
6961662061746164
2f6425203a64656c
726f727265206425
0000000000000a73
20747365746d656d
0a2e2e2e61746164
0000000000000000
783020617461645b
7830203a5d783025
2073762078383025
000a783830257830
20747365746d656d
0a2e2e2e72646461
0000000000000000
783020726464615b
7830203a5d783025
2073762078383025
000a783830257830
20747365746d654d
6961662072646461
2f6425203a64656c

@@ -1,5 +1,5 @@
//--------------------------------------------------------------------------------
// Auto-generated by Migen (0d16e03) & LiteX (564d731a) on 2020-05-26 20:37:42
// Auto-generated by Migen (0d16e03) & LiteX (564d731a) on 2020-05-30 20:25:57
//--------------------------------------------------------------------------------
module litedram_core(
input wire clk,

@@ -48,6 +48,7 @@ filesets:
- soc.vhdl
- xics.vhdl
- syscon.vhdl
- sync_fifo.vhdl
file_type : vhdlSource-2008

fpga:

@@ -0,0 +1,163 @@
-- Synchronous FIFO with a protocol similar to AXI
--
-- The outputs are generated combinationally from the inputs
-- in order to allow for back-to-back transfers with the type
-- of flow control used by buses like AXI, pipelined WB or the
-- LiteDRAM native port when the FIFO is full.
--
-- That means that care needs to be taken by the user not to
-- generate the inputs combinationally from the outputs, otherwise
-- it would create a logic loop.
--
-- If breaking that loop is required, a stash buffer could be
-- added to break the flow control "loop" between the read and
-- the write port.
--
library ieee;
use ieee.std_logic_1164.all;

library work;
use work.utils.all;

entity sync_fifo is
generic(
-- Fifo depth in entries
DEPTH : natural := 64;

-- Fifo width in bits
WIDTH : natural := 32;

-- When INIT_ZERO is set, the memory is pre-initialized to 0's
INIT_ZERO : boolean := false
);
port(
-- Control lines:
clk : in std_ulogic;
reset : in std_ulogic;

-- Write port
wr_ready : out std_ulogic;
wr_valid : in std_ulogic;
wr_data : in std_ulogic_vector(WIDTH - 1 downto 0);

-- Read port
rd_ready : in std_ulogic;
rd_valid : out std_ulogic;
rd_data : out std_ulogic_vector(WIDTH - 1 downto 0)
);
end entity sync_fifo;

architecture behaviour of sync_fifo is

subtype data_t is std_ulogic_vector(WIDTH - 1 downto 0);
type memory_t is array(0 to DEPTH - 1) of data_t;

function init_mem return memory_t is
variable m : memory_t;
begin
if INIT_ZERO then
for i in 0 to DEPTH - 1 loop
m(i) := (others => '0');
end loop;
end if;
return m;
end function;

signal memory : memory_t := init_mem;

subtype index_t is integer range 0 to DEPTH - 1;
signal rd_idx : index_t;
signal rd_next : index_t;
signal wr_idx : index_t;
signal wr_next : index_t;

function next_index(idx : index_t) return index_t is
variable r : index_t;
begin
if ispow2(DEPTH) then
r := (idx + 1) mod DEPTH;
else
r := idx + 1;
if r = DEPTH then
r := 0;
end if;
end if;
return r;
end function;
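
-- Note that for a power-of-2 DEPTH the "mod" above reduces to simple
-- bit truncation, so the index counters synthesize to small wrapping
-- adders rather than needing a comparator.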
type op_t is (OP_POP, OP_PUSH);
signal op_prev : op_t := OP_POP;
signal op_next : op_t;

signal full, empty : std_ulogic;
signal push, pop : std_ulogic;
begin

-- Current state at last clock edge
empty <= '1' when rd_idx = wr_idx and op_prev = OP_POP else '0';
full <= '1' when rd_idx = wr_idx and op_prev = OP_PUSH else '0';
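
-- Note that rd_idx = wr_idx alone is ambiguous (the FIFO is either
-- empty or full); op_prev disambiguates, since only a push can have
-- made it full and only a pop can have made it empty.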

-- We can accept new data if we aren't full or we are but
-- the read port is going to accept data this cycle
wr_ready <= rd_ready or not full;

-- We can provide data if we aren't empty or we are but
-- the write port is going to provide data this cycle
rd_valid <= wr_valid or not empty;

-- Internal control signals
push <= wr_ready and wr_valid;
pop <= rd_ready and rd_valid;

-- Next state
rd_next <= next_index(rd_idx) when pop = '1' else rd_idx;
wr_next <= next_index(wr_idx) when push = '1' else wr_idx;
with push & pop select op_next <=
OP_PUSH when "10",
OP_POP when "01",
op_prev when others;

-- Read port output
rd_data <= memory(rd_idx) when empty = '0' else wr_data;

-- Read counter
reader: process(clk)
begin
if rising_edge(clk) then
if reset = '1' then
rd_idx <= 0;
else
rd_idx <= rd_next;
end if;
end if;
end process;

-- Write counter and memory write
producer: process(clk)
begin
if rising_edge(clk) then
if reset = '1' then
wr_idx <= 0;
else
wr_idx <= wr_next;

if push = '1' then
memory(wr_idx) <= wr_data;
end if;
end if;
end if;
end process;

-- Previous op latch used for generating empty/full
op: process(clk)
begin
if rising_edge(clk) then
if reset = '1' then
op_prev <= OP_POP;
else
op_prev <= op_next;
end if;
end if;
end process;

end architecture behaviour;