|
|
|
--
|
|
|
|
-- Set associative dcache write-through
|
|
|
|
--
|
|
|
|
-- TODO (in no specific order):
|
|
|
|
--
|
|
|
|
-- * See list in icache.vhdl
|
|
|
|
-- * Complete load misses on the cycle when WB data comes instead of
|
|
|
|
-- at the end of line (this requires dealing with requests coming in
|
|
|
|
-- while not idle...)
|
|
|
|
-- * Load with update could use one less non-pipelined cycle by moving
|
|
|
|
-- the register update to the pipeline bubble that exists when going
|
|
|
|
-- back to the IDLE state.
|
|
|
|
--
|
|
|
|
library ieee;
|
|
|
|
use ieee.std_logic_1164.all;
|
|
|
|
use ieee.numeric_std.all;
|
|
|
|
|
|
|
|
library work;
|
|
|
|
use work.utils.all;
|
|
|
|
use work.common.all;
|
|
|
|
use work.helpers.all;
|
|
|
|
use work.wishbone_types.all;
|
|
|
|
|
|
|
|
entity dcache is
    generic (
        -- Size of one cache line, in bytes
        LINE_SIZE : positive := 64;
        -- Number of cache lines held by each way (index_t spans 0 to NUM_LINES-1)
        NUM_LINES : positive := 32;
        -- Associativity: number of ways
        NUM_WAYS  : positive := 4
    );
    port (
        clk          : in  std_ulogic;
        rst          : in  std_ulogic;

        -- Incoming load/store request and outgoing writeback result
        d_in         : in  Loadstore1ToDcacheType;
        d_out        : out DcacheToWritebackType;

        -- Driven high whenever the stage 1 state machine is not IDLE
        stall_out    : out std_ulogic;

        -- Wishbone master interface (driven from the stage 1 request latch)
        wishbone_out : out wishbone_master_out;
        wishbone_in  : in  wishbone_slave_out
    );
end entity dcache;
|
|
|
|
|
|
|
|
architecture rtl of dcache is
|
|
|
|
-- BRAM organisation: We never access more than wishbone_data_bits at
|
|
|
|
-- a time so to save resources we make the array only that wide, and
|
|
|
|
-- use consecutive indices to make a cache "line"
|
|
|
|
--
|
|
|
|
-- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
|
|
|
|
constant ROW_SIZE : natural := wishbone_data_bits / 8;
|
|
|
|
-- ROW_PER_LINE is the number of rows (wishbone transactions) in a line
|
|
|
|
constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
|
|
|
|
-- BRAM_ROWS is the number of rows in BRAM needed to represent the full
|
|
|
|
-- dcache
|
|
|
|
constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
|
|
|
|
|
|
|
|
-- Bit fields counts in the address
|
|
|
|
|
|
|
|
-- ROW_BITS is the number of bits to select a row
|
|
|
|
constant ROW_BITS : natural := log2(BRAM_ROWS);
|
|
|
|
-- ROW_LINEBITS is the number of bits to select a row within a line
|
|
|
|
constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
|
|
|
|
-- LINE_OFF_BITS is the number of bits for the offset in a cache line
|
|
|
|
constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
|
|
|
|
-- ROW_OFF_BITS is the number of bits for the offset in a row
|
|
|
|
constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
|
|
|
|
-- INDEX_BITS is the number of bits to select a cache line
|
|
|
|
constant INDEX_BITS : natural := log2(NUM_LINES);
|
|
|
|
-- TAG_BITS is the number of bits of the tag part of the address
|
|
|
|
constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS;
|
|
|
|
-- WAY_BITS is the number of bits to select a way
|
|
|
|
constant WAY_BITS : natural := log2(NUM_WAYS);
|
|
|
|
|
|
|
|
-- Example of layout for 32 lines of 64 bytes:
--
-- ..  tag    |index|  line  |
-- ..         |   row   |    |
-- ..         |     |---|    | ROW_LINEBITS  (3)
-- ..         |     |--- - --| LINE_OFF_BITS (6)
-- ..         |         |- --| ROW_OFF_BITS  (3)
-- ..         |----- ---|    | ROW_BITS      (8)
-- ..         |-----|        | INDEX_BITS    (5)
-- .. --------|              | TAG_BITS      (53)
|
|
|
|
|
|
|
|
subtype row_t is integer range 0 to BRAM_ROWS-1;
|
|
|
|
subtype index_t is integer range 0 to NUM_LINES-1;
|
|
|
|
subtype way_t is integer range 0 to NUM_WAYS-1;
|
|
|
|
|
|
|
|
-- The cache data BRAM organized as described above for each way
|
|
|
|
subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0);
|
|
|
|
|
|
|
|
-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-- not handle a clean (commented) definition of the cache tags as a 3d
-- memory. For now, work around it by putting all the tags of a set
-- into a single wide vector of TAG_BITS * NUM_WAYS bits.
|
|
|
|
subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
|
|
|
|
-- type cache_tags_set_t is array(way_t) of cache_tag_t;
|
|
|
|
-- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
|
|
|
|
constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
|
|
|
|
subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
|
|
|
|
type cache_tags_array_t is array(index_t) of cache_tags_set_t;
|
|
|
|
|
|
|
|
-- The cache valid bits
|
|
|
|
subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
|
|
|
|
type cache_valids_t is array(index_t) of cache_way_valids_t;
|
|
|
|
|
|
|
|
-- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
|
|
|
|
signal cache_tags : cache_tags_array_t;
|
|
|
|
signal cache_valids : cache_valids_t;
|
|
|
|
|
|
|
|
attribute ram_style : string;
|
|
|
|
attribute ram_style of cache_tags : signal is "distributed";
|
|
|
|
|
|
|
|
-- Type of operation on a "valid" input
|
|
|
|
type op_t is (OP_NONE,
|
|
|
|
OP_LOAD_HIT, -- Cache hit on load
|
|
|
|
OP_LOAD_MISS, -- Load missing cache
|
|
|
|
OP_LOAD_NC, -- Non-cachable load
|
|
|
|
OP_BAD, -- BAD: Cache hit on NC load/store
|
|
|
|
OP_STORE_HIT, -- Store hitting cache
|
|
|
|
OP_STORE_MISS); -- Store missing cache
|
|
|
|
|
|
|
|
-- Cache state machine
|
|
|
|
type state_t is (IDLE, -- Normal load hit processing
|
|
|
|
NEXT_DWORD, -- Starting the 2nd xfer of misaligned
|
|
|
|
LOAD_UPDATE, -- Load with update extra cycle
|
|
|
|
LOAD_UPDATE2, -- Load with update extra cycle
|
|
|
|
RELOAD_WAIT_ACK, -- Cache reload wait ack
|
|
|
|
STORE_WAIT_ACK, -- Store wait ack
|
|
|
|
NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack
|
|
|
|
|
|
|
|
|
|
|
|
--
|
|
|
|
-- Dcache operations:
|
|
|
|
--
|
|
|
|
-- In order to make timing, we use the BRAMs with an output buffer,
|
|
|
|
-- which means that the BRAM output is delayed by an extra cycle.
|
|
|
|
--
|
|
|
|
-- Thus, the dcache has a 2-stage internal pipeline for cache hits
|
|
|
|
-- with no stalls.
|
|
|
|
--
|
|
|
|
-- All other operations are handled via stalling in the first stage.
|
|
|
|
--
|
|
|
|
-- The second stage can thus complete a hit at the same time as the
|
|
|
|
-- first stage emits a stall for a complex op.
|
|
|
|
--
|
|
|
|
|
|
|
|
-- Stage 1 register: holds the first-stage state of load hits together
-- with everything the state machine needs for all other operations.
type reg_stage_1_t is record
    -- Full copy of the request received from loadstore1
    req            : Loadstore1ToDcacheType;

    -- Result of the hit lookup
    hit_way        : way_t;
    hit_load_valid : std_ulogic;

    -- Bookkeeping for the second transfer of a misaligned access
    two_dwords     : std_ulogic;
    second_dword   : std_ulogic;
    next_addr      : std_ulogic_vector(63 downto 0);
    next_sel       : std_ulogic_vector(7 downto 0);

    -- Pending register update (load/store with update forms)
    update_valid   : std_ulogic;

    -- Holding buffer for "slow" reads (load misses and non-cachable loads)
    slow_data      : std_ulogic_vector(63 downto 0);
    slow_valid     : std_ulogic;

    -- State machine, wishbone request latch and reload/store target
    state          : state_t;
    wb             : wishbone_master_out;
    store_way      : way_t;
    store_row      : row_t;
    store_index    : index_t;
end record;
|
|
|
|
|
|
|
|
signal r1 : reg_stage_1_t;
|
|
|
|
|
|
|
|
-- Second stage register, only used for load hits.
--
-- It carries the fields of a load hit that writeback_control needs one
-- cycle later, when the buffered BRAM output becomes available.
type reg_stage_2_t is record
    -- Way that hit, used to select the matching cache RAM output
    hit_way        : way_t;
    -- Delayed copy of the stage 1 load-hit valid flag
    hit_load_valid : std_ulogic;
    -- Set when the load is an update form (completion is then deferred)
    load_is_update : std_ulogic;
    -- Destination register of the load
    load_reg       : std_ulogic_vector(4 downto 0);
    -- Formatting controls forwarded to writeback
    data_shift     : std_ulogic_vector(2 downto 0);
    length         : std_ulogic_vector(3 downto 0);
    sign_extend    : std_ulogic;
    byte_reverse   : std_ulogic;
    -- Common XER bits forwarded to writeback
    xerc           : xer_common_t;
    -- Misaligned access tracking: last / second doubleword of the access
    last_dword     : std_ulogic;
    second_dword   : std_ulogic;
end record;
|
|
|
|
|
|
|
|
signal r2 : reg_stage_2_t;
|
|
|
|
|
|
|
|
-- Async signals on incoming request
|
|
|
|
signal req_index : index_t;
|
|
|
|
signal req_row : row_t;
|
|
|
|
signal req_hit_way : way_t;
|
|
|
|
signal req_tag : cache_tag_t;
|
|
|
|
signal req_op : op_t;
|
|
|
|
signal req_data : std_ulogic_vector(63 downto 0);
|
|
|
|
signal req_addr : std_ulogic_vector(63 downto 0);
|
|
|
|
signal req_laddr : std_ulogic_vector(63 downto 0);
|
|
|
|
signal req_sel : std_ulogic_vector(7 downto 0);
|
|
|
|
|
|
|
|
-- Cache RAM interface
|
|
|
|
type cache_ram_out_t is array(way_t) of cache_row_t;
|
|
|
|
signal cache_out : cache_ram_out_t;
|
|
|
|
|
|
|
|
-- PLRU output interface
|
|
|
|
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
|
|
|
|
signal plru_victim : plru_out_t;
|
|
|
|
signal replace_way : way_t;
|
|
|
|
|
|
|
|
-- Wishbone read/write/cache write formatting signals
|
|
|
|
signal bus_sel : std_ulogic_vector(15 downto 0);
|
|
|
|
|
|
|
|
signal two_dwords : std_ulogic;
|
|
|
|
|
|
|
|
--
|
|
|
|
-- Helper functions to decode incoming requests
|
|
|
|
--
|
|
|
|
|
|
|
|
-- Extract the cache line index (which is also the tag RAM index) from
-- a 64-bit address: the bits between the tag and the line offset.
function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is
    variable index_field : unsigned(63-TAG_BITS-LINE_OFF_BITS downto 0);
begin
    index_field := unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS));
    return to_integer(index_field);
end function get_index;
|
|
|
|
|
|
|
|
-- Extract the data RAM row number from a 64-bit address: everything
-- below the tag, down to (but excluding) the byte offset within a row.
function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is
    variable row_field : unsigned(63-TAG_BITS-ROW_OFF_BITS downto 0);
begin
    row_field := unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS));
    return to_integer(row_field);
end function get_row;
|
|
|
|
|
|
|
|
-- True when a wishbone address points at the last row of its cache
-- line, i.e. the row-within-line bits of the address are all ones.
function is_last_row_addr(addr: wishbone_addr_type) return boolean is
    constant all_ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
    variable row_in_line : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
begin
    row_in_line := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
    return row_in_line = all_ones;
end function is_last_row_addr;
|
|
|
|
|
|
|
|
-- True when a row number designates the last row of its cache line,
-- i.e. its low ROW_LINEBITS bits are all ones.
function is_last_row(row: row_t) return boolean is
    constant all_ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
    variable row_vec : std_ulogic_vector(ROW_BITS-1 downto 0);
begin
    row_vec := std_ulogic_vector(to_unsigned(row, ROW_BITS));
    return row_vec(ROW_LINEBITS-1 downto 0) = all_ones;
end function is_last_row;
|
|
|
|
|
|
|
|
-- Return the address of the next row within the current cache line.
-- Only the row-within-line field is incremented (and wraps around), so
-- the adder is ROW_LINEBITS wide and all other address bits are kept.
function next_row_addr(addr: wishbone_addr_type) return std_ulogic_vector is
    variable bumped : unsigned(ROW_LINEBITS-1 downto 0);
    variable result : wishbone_addr_type;
begin
    bumped := unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) + 1;
    result := addr;
    result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := std_ulogic_vector(bumped);
    return result;
end function next_row_addr;
|
|
|
|
|
|
|
|
-- Return the next row in the current cache line. We use a dedicated
-- function in order to limit the size of the generated adder to be
-- only the bits within a cache line (3 bits with default settings).
--
-- Only the low ROW_LINEBITS bits of the row number are incremented; they
-- wrap around within the line and the upper (line index) bits are kept.
function next_row(row: row_t) return row_t is
    variable row_v : unsigned(ROW_BITS-1 downto 0);
begin
    row_v := to_unsigned(row, ROW_BITS);
    -- Narrow increment: the result is truncated to ROW_LINEBITS bits
    row_v(ROW_LINEBITS-1 downto 0) := row_v(ROW_LINEBITS-1 downto 0) + 1;
    return to_integer(row_v);
end function next_row;
|
|
|
|
|
|
|
|
-- Extract the tag, i.e. the top TAG_BITS bits, of a 64-bit address
function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is
    variable tag : cache_tag_t;
begin
    tag := addr(63 downto 64-TAG_BITS);
    return tag;
end function get_tag;
|
|
|
|
|
|
|
|
-- Fetch the tag of one way from a tag memory row. The row packs the
-- tags of all ways side by side, way 0 in the least significant bits.
function read_tag(way: way_t; tagset: cache_tags_set_t) return cache_tag_t is
    constant low_bit : natural := way * TAG_BITS;
begin
    return tagset(low_bit + TAG_BITS - 1 downto low_bit);
end function read_tag;
|
|
|
|
|
|
|
|
-- Store the tag of one way into a tag memory row, leaving the slices
-- that belong to the other ways untouched.
procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
                    tag: cache_tag_t) is
    constant low_bit : natural := way * TAG_BITS;
begin
    tagset(low_bit + TAG_BITS - 1 downto low_bit) := tag;
end procedure write_tag;
|
|
|
|
|
|
|
|
-- Turn an access length (1, 2, 4 or 8 bytes, binary encoded) into an
-- 8-bit byte-enable mask aligned at byte 0. Any other length value
-- yields an all-zero mask.
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
    if length = "0001" then
        return "00000001";
    elsif length = "0010" then
        return "00000011";
    elsif length = "0100" then
        return "00001111";
    elsif length = "1000" then
        return "11111111";
    end if;
    return "00000000";
end function length_to_sel;
|
|
|
|
|
|
|
|
-- Compute the wishbone byte enables for an access.
-- The result is 16 bits wide: the byte-0 aligned mask for the access
-- size is shifted left by the low three address bits, so bits 7..0 are
-- the selects of the first transfer and bits 15..8 those of a second
-- transfer needed by an unaligned load or store.
function wishbone_data_sel(size : in std_logic_vector(3 downto 0);
                           address : in std_logic_vector(63 downto 0))
    return std_ulogic_vector is
    variable wide_sel : unsigned(15 downto 0);
    variable byte_off : natural;
begin
    wide_sel := (others => '0');
    wide_sel(7 downto 0) := unsigned(length_to_sel(size));
    byte_off := to_integer(unsigned(address(2 downto 0)));
    return std_ulogic_vector(shift_left(wide_sel, byte_off));
end function wishbone_data_sel;
|
|
|
|
|
|
|
|
begin
|
|
|
|
|
|
|
|
assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
|
|
|
|
assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
|
|
|
|
assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
|
|
|
|
assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
|
|
|
|
assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
|
|
|
|
report "geometry bits don't add up" severity FAILURE;
|
|
|
|
assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
|
|
|
|
report "geometry bits don't add up" severity FAILURE;
|
|
|
|
assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
|
|
|
|
report "geometry bits don't add up" severity FAILURE;
|
|
|
|
assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
|
|
|
|
report "geometry bits don't add up" severity FAILURE;
|
|
|
|
assert (64 = wishbone_data_bits)
|
|
|
|
report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE;
|
|
|
|
|
|
|
|
-- Instantiate one PLRU per cache line index. Nothing is generated for
-- a single-way cache.
maybe_plrus: if NUM_WAYS > 1 generate
begin
    plrus: for i in 0 to NUM_LINES-1 generate
        -- Local wiring to this index's PLRU instance
        signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
        signal plru_acc_en : std_ulogic;
        signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
    begin
        plru : entity work.plru
            generic map (
                BITS => WAY_BITS
            )
            port map (
                clk    => clk,
                rst    => rst,
                acc    => plru_acc,
                acc_en => plru_acc_en,
                lru    => plru_out
            );

        -- Signal an access to this PLRU on a load or store hit whose
        -- index matches this instance.
        plru_acc_en <= '1' when (req_op = OP_LOAD_HIT or
                                 req_op = OP_STORE_HIT) and req_index = i
                       else '0';

        -- The accessed way is always the way that hit
        plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));

        -- Publish this index's PLRU output for replacement way selection
        plru_victim(i) <= plru_out;
    end generate;
end generate;
|
|
|
|
|
|
|
|
-- Cache request parsing and hit detection.
--
-- Combinational process. It selects the request to look up (the new
-- request on d_in, or the latched second doubleword of a misaligned
-- access while in NEXT_DWORD), splits its address into index/row/tag,
-- compares the tag against every valid way and classifies the request
-- into an op_t value on req_op.
dcache_request : process(all)
    variable is_hit  : std_ulogic;
    variable hit_way : way_t;
    variable op      : op_t;
    variable opsel   : std_ulogic_vector(3 downto 0);
    variable go      : std_ulogic;
    variable is_load : std_ulogic;
    variable is_nc   : std_ulogic;
begin
    -- Extract line, row and tag from request
    if r1.state /= NEXT_DWORD then
        -- Normal case: take the request straight from loadstore1
        req_addr <= d_in.addr;
        req_data <= d_in.data;
        req_sel <= bus_sel(7 downto 0);
        go := d_in.valid;
        is_load := d_in.load;
        is_nc := d_in.nc;
    else
        -- Second half of a misaligned access: use the address and byte
        -- selects latched in stage 1; the request is always valid here.
        req_addr <= r1.next_addr;
        req_data <= r1.req.data;
        req_sel <= r1.next_sel;
        go := '1';
        is_load := r1.req.load;
        is_nc := r1.req.nc;
    end if;

    req_index <= get_index(req_addr);
    req_row <= get_row(req_addr);
    req_tag <= get_tag(req_addr);

    -- Calculate address of beginning of cache line, will be
    -- used for cache miss processing if needed
    --
    req_laddr <= req_addr(63 downto LINE_OFF_BITS) &
                 (LINE_OFF_BITS-1 downto 0 => '0');

    -- Test if pending request is a hit on any way. If more than one
    -- way matched, the highest-numbered one would win.
    hit_way := 0;
    is_hit := '0';
    for i in way_t loop
        if go = '1' and cache_valids(req_index)(i) = '1' then
            if read_tag(i, cache_tags(req_index)) = req_tag then
                hit_way := i;
                is_hit := '1';
            end if;
        end if;
    end loop;

    -- The way that matched on a hit
    req_hit_way <= hit_way;

    -- The way to replace on a miss
    replace_way <= to_integer(unsigned(plru_victim(req_index)));

    -- Combine the request and cache hit status to decide what
    -- operation needs to be done
    --
    opsel := go & is_load & is_nc & is_hit;
    case opsel is
        when "1101" => op := OP_LOAD_HIT;
        when "1100" => op := OP_LOAD_MISS;
        when "1110" => op := OP_LOAD_NC;
        when "1001" => op := OP_STORE_HIT;
        when "1000" => op := OP_STORE_MISS;
        when "1010" => op := OP_STORE_MISS;
        when "1011" => op := OP_BAD;
        when "1111" => op := OP_BAD;
        when others => op := OP_NONE;
    end case;

    req_op <= op;

end process;
|
|
|
|
|
|
|
|
-- Wire up wishbone request latch out of stage 1
|
|
|
|
wishbone_out <= r1.wb;
|
|
|
|
|
|
|
|
-- Wishbone read and write and BRAM write sel bits generation
|
|
|
|
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);
|
|
|
|
|
|
|
|
-- See if the operation crosses two doublewords
|
|
|
|
two_dwords <= or (bus_sel(15 downto 8));
|
|
|
|
|
|
|
|
-- TODO: Generate errors
|
|
|
|
-- err_nc_collision <= '1' when req_op = OP_BAD else '0';
|
|
|
|
|
|
|
|
-- Generate stalls from stage 1 state machine
|
|
|
|
stall_out <= '1' when r1.state /= IDLE else '0';
|
|
|
|
|
|
|
|
-- Writeback (loads and reg updates) & completion control logic
--
-- Combinational process driving d_out. Three sources can produce a
-- result in a given cycle: a delayed load hit (r2.hit_load_valid), a
-- completed slow operation (r1.slow_valid) and a register update
-- (r1.update_valid). The assertions below check that no two of them
-- are active in the same cycle.
writeback_control: process(all)
begin

    -- The mux on d_out.write reg defaults to the normal load hit case.
    d_out.write_enable <= '0';
    d_out.valid <= '0';
    d_out.write_reg <= r2.load_reg;
    d_out.write_data <= cache_out(r2.hit_way);
    d_out.write_len <= r2.length;
    d_out.write_shift <= r2.data_shift;
    d_out.sign_extend <= r2.sign_extend;
    d_out.byte_reverse <= r2.byte_reverse;
    d_out.second_word <= r2.second_dword;
    -- NOTE(review): confirm that r2.xerc is loaded in the stage 1 ->
    -- stage 2 transfer; no such assignment is visible in this section.
    d_out.xerc <= r2.xerc;

    -- We have a valid load or store hit or we just completed a slow
    -- op such as a load miss, a NC load or a store
    --
    -- Note: the load hit is delayed by one cycle. However it can still
    -- not collide with r1.slow_valid (well unless I miscalculated) because
    -- slow_valid can only be set on a subsequent request and not on its
    -- first cycle (the state machine must have advanced), which makes
    -- slow_valid at least 2 cycles from the previous hit_load_valid.
    --

    -- Sanity: Only one of these must be set in any given cycle
    assert (r1.update_valid and r2.hit_load_valid) /= '1' report
        "unexpected hit_load_delayed collision with update_valid"
        severity FAILURE;
    assert (r1.slow_valid and r2.hit_load_valid) /= '1' report
        "unexpected hit_load_delayed collision with slow_valid"
        severity FAILURE;
    assert (r1.slow_valid and r1.update_valid) /= '1' report
        "unexpected update_valid collision with slow_valid"
        severity FAILURE;

    -- Delayed load hit case is the standard path
    if r2.hit_load_valid = '1' then
        d_out.write_enable <= '1';

        -- If there isn't another dword to go and
        -- it's not a load with update, complete it now
        if r2.last_dword = '1' and r2.load_is_update = '0' then
            report "completing load hit";
            d_out.valid <= '1';
        end if;
    end if;

    -- Slow ops (load miss, NC, stores)
    if r1.slow_valid = '1' then
        -- If it's a load, enable register writeback and switch
        -- mux accordingly
        --
        if r1.req.load = '1' then
            d_out.write_reg <= r1.req.write_reg;
            d_out.write_enable <= '1';

            -- Read data comes from the slow data latch, formatter
            -- from the latched request.
            --
            d_out.write_data <= r1.slow_data;
            d_out.write_shift <= r1.req.addr(2 downto 0);
            d_out.sign_extend <= r1.req.sign_extend;
            d_out.byte_reverse <= r1.req.byte_reverse;
            d_out.write_len <= r1.req.length;
            d_out.xerc <= r1.req.xerc;
            d_out.second_word <= r1.second_dword;
        end if;

        -- If it's a store or a non-update load form, complete now
        -- unless we need to do another dword transfer
        if (r1.req.load = '0' or r1.req.update = '0') and
            (r1.two_dwords = '0' or r1.second_dword = '1') then
            report "completing store or load miss";
            d_out.valid <= '1';
        end if;
    end if;

    -- We have a register update to do.
    if r1.update_valid = '1' then
        d_out.write_enable <= '1';
        d_out.write_reg <= r1.req.update_reg;

        -- Change the read data mux to the address that's going into
        -- the register and the formatter does nothing.
        --
        d_out.write_data <= r1.req.addr;
        d_out.write_shift <= "000";
        d_out.write_len <= "1000";
        d_out.sign_extend <= '0';
        d_out.byte_reverse <= '0';
        d_out.xerc <= r1.req.xerc;
        d_out.second_word <= '0';

        -- If it was a load, this completes the operation (load with
        -- update case).
        --
        if r1.req.load = '1' then
            report "completing after load update";
            d_out.valid <= '1';
        end if;
    end if;

end process;
|
|
|
|
|
|
|
|
--
-- Generate a cache RAM for each way. This handles the normal
-- reads, writes from reloads and the special store-hit update
-- path as well.
--
-- Note: the BRAMs have an extra read buffer, meaning the output
--       is pipelined an extra cycle. This differs from the
--       icache. The writeback logic needs to take that into
--       account by using 1-cycle delayed signals for load hits.
--
rams: for i in 0 to NUM_WAYS-1 generate
    signal do_read  : std_ulogic;
    signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    signal do_write : std_ulogic;
    signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
    signal wr_data  : std_ulogic_vector(wishbone_data_bits-1 downto 0);
    signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
    signal dout     : cache_row_t;
begin
    way: entity work.cache_ram
        generic map (
            ROW_BITS => ROW_BITS,
            WIDTH => wishbone_data_bits,
            ADD_BUF => true
        )
        port map (
            clk     => clk,
            rd_en   => do_read,
            rd_addr => rd_addr,
            rd_data => dout,
            wr_en   => do_write,
            wr_sel  => wr_sel,
            wr_addr => wr_addr,
            wr_data => wr_data
        );

    -- Combinational read/write port control for way i
    process(all)
        variable reloading : boolean;
    begin
        -- Cache hit reads: every way is read on every cycle at the
        -- requested row; the way is selected later from cache_out.
        do_read <= '1';
        rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
        cache_out(i) <= dout;

        -- Write mux:
        --
        -- Defaults to wishbone read responses (cache refill),
        --
        -- For timing, the mux on wr_data/sel/addr is not dependent on anything
        -- other than the current state. Only the do_write signal is.
        --
        if r1.state = IDLE or r1.state = NEXT_DWORD then
            -- In these states, the only write path is the store-hit update case
            wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
            wr_data <= req_data;
            wr_sel  <= req_sel;
        else
            -- Otherwise, we might be doing a reload
            wr_data <= wishbone_in.dat;
            wr_sel  <= (others => '1');
            wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));
        end if;

        -- The two actual write cases here
        do_write <= '0';
        reloading := r1.state = RELOAD_WAIT_ACK;
        -- Reload: write each acked wishbone word into the way being refilled
        if reloading and wishbone_in.ack = '1' and r1.store_way = i then
            do_write <= '1';
        end if;
        -- Store hit: update the row in the way that hit
        if req_op = OP_STORE_HIT and req_hit_way = i then
            assert not reloading report "Store hit while in state:" &
                state_t'image(r1.state)
                severity FAILURE;
            do_write <= '1';
        end if;
    end process;
end generate;
|
|
|
|
|
|
|
|
--
|
|
|
|
-- Cache hit synchronous machine for the easy case. This handles
|
|
|
|
-- non-update form load hits and stage 1 to stage 2 transfers
|
|
|
|
--
|
|
|
|
dcache_fast_hit : process(clk)
    -- Clocked process with three jobs on every rising edge:
    --   * flag a load hit (and remember the hit way) in r1,
    --   * latch an incoming request into r1, or mark the second dword
    --     while the slow machine sits in NEXT_DWORD,
    --   * copy the stage 1 (r1) load information into stage 2 (r2).
    -- All right-hand sides are signal reads, so they see the values from
    -- before this clock edge regardless of statement order.
begin
    if rising_edge(clk) then
        -- Fast path for load hits: hit_load_valid is a one-cycle flag that
        -- is cleared unless the current request decodes as a load hit.
        r1.hit_load_valid <= '0';
        if req_op = OP_LOAD_HIT then
            r1.hit_load_valid <= '1';
            r1.hit_way <= req_hit_way;
        end if;

        -- d_in.valid is only set for a single cycle, so an incoming request
        -- has to be latched here. It is up to the control logic to ensure
        -- an uncompleted request is not overwritten (loads/stores are
        -- single issue for now; a stall output could be added later).
        if d_in.valid = '1' and req_op /= OP_NONE then
            report "op:" & op_t'image(req_op) &
                " addr:" & to_hstring(d_in.addr) &
                " upd:" & std_ulogic'image(d_in.update) &
                " nc:" & std_ulogic'image(d_in.nc) &
                " reg:" & to_hstring(d_in.write_reg) &
                " idx:" & integer'image(req_index) &
                " tag:" & to_hstring(req_tag) &
                " way: " & integer'image(req_hit_way);

            r1.req <= d_in;
            r1.two_dwords <= two_dwords;
            r1.second_dword <= '0';
            -- Byte selects for the second doubleword are the upper half
            -- of bus_sel
            r1.next_sel <= bus_sel(15 downto 8);
            -- Address of the following doubleword: bump addr(63 downto 3)
            -- by one and clear the three low bits
            r1.next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";
        elsif r1.state = NEXT_DWORD then
            r1.second_dword <= '1';
        end if;

        -- stage 1 -> stage 2
        r2.hit_way        <= r1.hit_way;
        r2.hit_load_valid <= r1.hit_load_valid;
        r2.load_reg       <= r1.req.write_reg;
        r2.load_is_update <= r1.req.update;
        r2.length         <= r1.req.length;
        r2.data_shift     <= r1.req.addr(2 downto 0);
        r2.byte_reverse   <= r1.req.byte_reverse;
        r2.sign_extend    <= r1.req.sign_extend;
        r2.second_dword   <= r1.second_dword;
        -- This is the last dword when it is the second one, or when the
        -- access does not span two dwords at all
        r2.last_dword     <= r1.second_dword or not r1.two_dwords;
    end if;
end process;
|
|
|
|
|
|
|
|
--
|
|
|
|
-- Every other case is handled by this state machine:
|
|
|
|
--
|
|
|
|
-- * Cache load miss/reload (in conjunction with "rams")
|
|
|
|
-- * Load hits for update forms
|
|
|
|
-- * Load hits for non-cachable forms
|
|
|
|
-- * Stores (the collision case is handled in "rams")
|
|
|
|
--
|
|
|
|
-- All wishbone request generation is done here. This machine
|
|
|
|
-- operates at stage 1.
|
|
|
|
--
|
|
|
|
dcache_slow : process(clk)
    -- Clocked state machine driving r1.state, the wishbone master
    -- registers in r1.wb, the cache tag/valid arrays and the one-cycle
    -- r1.slow_valid / r1.update_valid pulses.
    --
    -- stbs_done: true once every strobe of a line reload has been issued
    -- tagset:    scratch copy of one index's tags while a way is rewritten
    variable stbs_done : boolean;
    variable tagset    : cache_tags_set_t;
begin
    if rising_edge(clk) then
        if rst = '1' then
            -- On reset, clear all valid bits to force misses
            for idx in index_t loop
                cache_valids(idx) <= (others => '0');
            end loop;

            r1.state <= IDLE;
            r1.update_valid <= '0';
            r1.slow_valid <= '0';
            r1.wb.stb <= '0';
            r1.wb.cyc <= '0';

            -- Not useful normally but helps avoiding tons of sim warnings
            r1.wb.adr <= (others => '0');
        else
            -- Both flags are one-cycle pulses: default them low, the state
            -- handlers below raise them when needed
            r1.update_valid <= '0';
            r1.slow_valid <= '0';

            -- We cannot currently process a new request when not idle
            assert d_in.valid = '0' or r1.state = IDLE report "request " &
                op_t'image(req_op) & " while in state " & state_t'image(r1.state)
                severity FAILURE;

            -- Main state machine
            case r1.state is
                when IDLE | NEXT_DWORD =>
                    case req_op is
                        when OP_LOAD_HIT =>
                            if r1.state /= IDLE then
                                -- NEXT_DWORD: second half of a two-dword
                                -- load hit; decide on the latched request
                                if r1.req.update = '1' then
                                    r1.state <= LOAD_UPDATE;
                                else
                                    r1.state <= IDLE;
                                end if;
                            else
                                -- IDLE: a load spanning two dwords needs
                                -- the state machine for its second half
                                if two_dwords = '1' then
                                    r1.state <= NEXT_DWORD;
                                elsif d_in.update = '1' then
                                    -- Load with update hit, we need the
                                    -- delayed update cycle
                                    r1.state <= LOAD_UPDATE;
                                end if;
                            end if;

                        when OP_LOAD_MISS =>
                            -- Normal load cache miss, start the reload machine
                            report "cache miss addr:" & to_hstring(req_addr) &
                                " idx:" & integer'image(req_index) &
                                " way:" & integer'image(replace_way) &
                                " tag:" & to_hstring(req_tag);

                            -- Start the wishbone read cycle at req_laddr
                            -- with all byte lanes selected
                            r1.wb.cyc <= '1';
                            r1.wb.stb <= '1';
                            r1.wb.we  <= '0';
                            r1.wb.sel <= (others => '1');
                            r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0);

                            -- Keep track of our index, way and row for the
                            -- writes into the cache RAM as acks come back
                            r1.store_row   <= get_row(req_laddr);
                            r1.store_way   <= replace_way;
                            r1.store_index <= req_index;

                            -- Force misses on that way while reloading that line
                            cache_valids(req_index)(replace_way) <= '0';

                            -- Store new tag in selected way
                            for w in 0 to NUM_WAYS-1 loop
                                if w = replace_way then
                                    tagset := cache_tags(req_index);
                                    write_tag(w, tagset, req_tag);
                                    cache_tags(req_index) <= tagset;
                                end if;
                            end loop;

                            -- First request is out, wait for the acks
                            r1.state <= RELOAD_WAIT_ACK;

                        when OP_LOAD_NC =>
                            -- Single wishbone read at the dword-aligned
                            -- request address, using the request byte selects
                            r1.wb.cyc <= '1';
                            r1.wb.stb <= '1';
                            r1.wb.we  <= '0';
                            r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
                            r1.wb.sel <= req_sel;
                            r1.state <= NC_LOAD_WAIT_ACK;

                        when OP_STORE_HIT | OP_STORE_MISS =>
                            -- Single wishbone write at the dword-aligned
                            -- request address
                            r1.wb.cyc <= '1';
                            r1.wb.stb <= '1';
                            r1.wb.we  <= '1';
                            r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
                            r1.wb.sel <= req_sel;
                            r1.wb.dat <= req_data;

                            -- For store-with-update do the register update
                            r1.update_valid <= d_in.valid and d_in.update;

                            r1.state <= STORE_WAIT_ACK;

                        -- OP_NONE and OP_BAD do nothing
                        when OP_NONE | OP_BAD =>
                            null;
                    end case;

                when RELOAD_WAIT_ACK =>
                    -- Requests are all sent if stb is 0
                    stbs_done := r1.wb.stb = '0';

                    -- If we are still sending requests, was one accepted ?
                    if (not stbs_done) and (wishbone_in.stall = '0') then
                        -- Calculate the next row address
                        r1.wb.adr <= next_row_addr(r1.wb.adr);

                        -- That was the last word ? We are done sending.
                        -- Clear stb and set stbs_done so we can handle an
                        -- eventual last ack on the same cycle.
                        if is_last_row_addr(r1.wb.adr) then
                            stbs_done := true;
                            r1.wb.stb <= '0';
                        end if;
                    end if;

                    -- Incoming acks processing
                    if wishbone_in.ack = '1' then
                        -- Increment store row counter
                        r1.store_row <= next_row(r1.store_row);

                        -- Is this the data we were looking for ? Latch it
                        -- so we can respond later. The pending miss is not
                        -- completed immediately: we wait for the whole line
                        -- to be loaded, because completing early could let
                        -- new requests in while not idle, which we don't
                        -- currently know how to deal with.
                        if r1.store_row = get_row(r1.req.addr) then
                            r1.slow_data <= wishbone_in.dat;
                        end if;

                        -- Check for completion
                        if stbs_done and is_last_row(r1.store_row) then
                            -- Complete wishbone cycle
                            r1.wb.cyc <= '0';

                            -- Cache line is now valid
                            cache_valids(r1.store_index)(r1.store_way) <= '1';

                            -- Write back the load data that we got, and
                            -- start the second dword if necessary.
                            -- Otherwise, see if we also need to do the
                            -- deferred update cycle.
                            r1.slow_valid <= '1';
                            if r1.two_dwords and not r1.second_dword then
                                r1.state <= NEXT_DWORD;
                            elsif r1.req.update = '1' then
                                report "completing miss with load-update !";
                                r1.state <= LOAD_UPDATE2;
                            else
                                report "completing miss !";
                                r1.state <= IDLE;
                            end if;
                        end if;
                    end if;

                when LOAD_UPDATE =>
                    -- We need the extra cycle to complete a load with update
                    r1.state <= LOAD_UPDATE2;

                when LOAD_UPDATE2 =>
                    -- Pulse update_valid for one cycle, then return to idle
                    r1.update_valid <= '1';
                    r1.state <= IDLE;

                when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK =>
                    -- Clear stb when slave accepted request
                    if wishbone_in.stall = '0' then
                        r1.wb.stb <= '0';
                    end if;

                    -- Got ack ? complete.
                    if wishbone_in.ack = '1' then
                        -- Close the wishbone cycle and signal completion
                        r1.wb.cyc <= '0';
                        r1.wb.stb <= '0';
                        r1.slow_valid <= '1';

                        -- Only the non-cacheable load returns data
                        if r1.state = NC_LOAD_WAIT_ACK then
                            r1.slow_data <= wishbone_in.dat;
                        end if;

                        -- Second dword still to do, deferred register
                        -- update (loads only), or done
                        if r1.two_dwords and not r1.second_dword then
                            r1.state <= NEXT_DWORD;
                        elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then
                            r1.state <= LOAD_UPDATE2;
                        else
                            r1.state <= IDLE;
                        end if;
                    end if;
            end case;
        end if;
    end if;
end process;
|
|
|
|
end;
|