dcache: Add support for unaligned loads and stores

For an unaligned load or store, we do the first doubleword (dword) of
the transfer as normal, but then go to a new NEXT_DWORD state of the
state machine to do the cache tag lookup for the second dword of the
transfer.  From the NEXT_DWORD state we have much the same transitions
to other states as from the IDLE state (the transitions for OP_LOAD_HIT
differ slightly, while those for the other op values are almost identical).

We now do the preparation of the data to be written in loadstore1,
that is, byte reversal if necessary and rotation by a number of
bytes based on the low 3 bits of the address.  We rotate rather than
shift so that the bytes that need to go into the second doubleword
end up in the right place, in the low bytes of the data sent to
dcache.  The rotation and byte reversal are done in a single step
with one multiplexer per byte by setting the select inputs for each
byte appropriately.

This also fixes writeback to not write the register value until it
has received both pieces of an unaligned load value.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
pull/153/head
Paul Mackerras 5 years ago
parent 1587d9e6eb
commit 94dd8bc480

@ -124,6 +124,7 @@ architecture rtl of dcache is
-- Cache state machine -- Cache state machine
type state_t is (IDLE, -- Normal load hit processing type state_t is (IDLE, -- Normal load hit processing
NEXT_DWORD, -- Starting the 2nd xfer of misaligned
LOAD_UPDATE, -- Load with update extra cycle LOAD_UPDATE, -- Load with update extra cycle
LOAD_UPDATE2, -- Load with update extra cycle LOAD_UPDATE2, -- Load with update extra cycle
RELOAD_WAIT_ACK, -- Cache reload wait ack RELOAD_WAIT_ACK, -- Cache reload wait ack
@ -157,6 +158,12 @@ architecture rtl of dcache is
hit_way : way_t; hit_way : way_t;
hit_load_valid : std_ulogic; hit_load_valid : std_ulogic;


-- Info for doing the second transfer of a misaligned load/store
two_dwords : std_ulogic;
second_dword : std_ulogic;
next_addr : std_ulogic_vector(63 downto 0);
next_sel : std_ulogic_vector(7 downto 0);

-- Register update (load/store with update) -- Register update (load/store with update)
update_valid : std_ulogic; update_valid : std_ulogic;


@ -186,6 +193,8 @@ architecture rtl of dcache is
sign_extend : std_ulogic; sign_extend : std_ulogic;
byte_reverse : std_ulogic; byte_reverse : std_ulogic;
xerc : xer_common_t; xerc : xer_common_t;
last_dword : std_ulogic;
second_dword : std_ulogic;
end record; end record;


signal r2 : reg_stage_2_t; signal r2 : reg_stage_2_t;
@ -196,7 +205,10 @@ architecture rtl of dcache is
signal req_hit_way : way_t; signal req_hit_way : way_t;
signal req_tag : cache_tag_t; signal req_tag : cache_tag_t;
signal req_op : op_t; signal req_op : op_t;
signal req_data : std_ulogic_vector(63 downto 0);
signal req_addr : std_ulogic_vector(63 downto 0);
signal req_laddr : std_ulogic_vector(63 downto 0); signal req_laddr : std_ulogic_vector(63 downto 0);
signal req_sel : std_ulogic_vector(7 downto 0);


-- Cache RAM interface -- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t; type cache_ram_out_t is array(way_t) of cache_row_t;
@ -208,8 +220,9 @@ architecture rtl of dcache is
signal replace_way : way_t; signal replace_way : way_t;


-- Wishbone read/write/cache write formatting signals -- Wishbone read/write/cache write formatting signals
signal bus_sel : wishbone_sel_type; signal bus_sel : std_ulogic_vector(15 downto 0);
signal store_data : wishbone_data_type;
signal two_dwords : std_ulogic;
-- --
-- Helper functions to decode incoming requests -- Helper functions to decode incoming requests
@ -307,17 +320,17 @@ architecture rtl of dcache is
end case; end case;
end function length_to_sel; end function length_to_sel;


-- Calculate shift and byte enables for wishbone -- Calculate byte enables for wishbone
function wishbone_data_shift(address : in std_ulogic_vector(63 downto 0)) return natural is -- This returns 16 bits, giving the select signals for two transfers,
begin -- to account for unaligned loads or stores
return to_integer(unsigned(address(2 downto 0))) * 8;
end function wishbone_data_shift;

function wishbone_data_sel(size : in std_logic_vector(3 downto 0); function wishbone_data_sel(size : in std_logic_vector(3 downto 0);
address : in std_logic_vector(63 downto 0)) address : in std_logic_vector(63 downto 0))
return std_ulogic_vector is return std_ulogic_vector is
variable longsel : std_ulogic_vector(15 downto 0);
begin begin
return std_ulogic_vector(shift_left(unsigned(length_to_sel(size)), longsel := (others => '0');
longsel(7 downto 0) := length_to_sel(size);
return std_ulogic_vector(shift_left(unsigned(longsel),
to_integer(unsigned(address(2 downto 0))))); to_integer(unsigned(address(2 downto 0)))));
end function wishbone_data_sel; end function wishbone_data_sel;


@ -383,23 +396,43 @@ begin
variable tmp : std_ulogic_vector(63 downto 0); variable tmp : std_ulogic_vector(63 downto 0);
variable data : std_ulogic_vector(63 downto 0); variable data : std_ulogic_vector(63 downto 0);
variable opsel : std_ulogic_vector(3 downto 0); variable opsel : std_ulogic_vector(3 downto 0);
variable go : std_ulogic;
variable is_load : std_ulogic;
variable is_nc : std_ulogic;
begin begin
-- Extract line, row and tag from request -- Extract line, row and tag from request
req_index <= get_index(d_in.addr); if r1.state /= NEXT_DWORD then
req_row <= get_row(d_in.addr); req_addr <= d_in.addr;
req_tag <= get_tag(d_in.addr); req_data <= d_in.data;
req_sel <= bus_sel(7 downto 0);
go := d_in.valid;
is_load := d_in.load;
is_nc := d_in.nc;

else
req_addr <= r1.next_addr;
req_data <= r1.req.data;
req_sel <= r1.next_sel;
go := '1';
is_load := r1.req.load;
is_nc := r1.req.nc;
end if;

req_index <= get_index(req_addr);
req_row <= get_row(req_addr);
req_tag <= get_tag(req_addr);


-- Calculate address of beginning of cache line, will be -- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed -- used for cache miss processing if needed
-- --
req_laddr <= d_in.addr(63 downto LINE_OFF_BITS) & req_laddr <= req_addr(63 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0'); (LINE_OFF_BITS-1 downto 0 => '0');


-- Test if pending request is a hit on any way -- Test if pending request is a hit on any way
hit_way := 0; hit_way := 0;
is_hit := '0'; is_hit := '0';
for i in way_t loop for i in way_t loop
if d_in.valid = '1' and cache_valids(req_index)(i) = '1' then if go = '1' and cache_valids(req_index)(i) = '1' then
if read_tag(i, cache_tags(req_index)) = req_tag then if read_tag(i, cache_tags(req_index)) = req_tag then
hit_way := i; hit_way := i;
is_hit := '1'; is_hit := '1';
@ -416,7 +449,7 @@ begin
-- Combine the request and cache hit status to decide what -- Combine the request and cache hit status to decide what
-- operation needs to be done -- operation needs to be done
-- --
opsel := d_in.valid & d_in.load & d_in.nc & is_hit; opsel := go & is_load & is_nc & is_hit;
case opsel is case opsel is
when "1101" => op := OP_LOAD_HIT; when "1101" => op := OP_LOAD_HIT;
when "1100" => op := OP_LOAD_MISS; when "1100" => op := OP_LOAD_MISS;
@ -433,22 +466,15 @@ begin


end process; end process;


--
-- Misc signal assignments
--

-- Wire up wishbone request latch out of stage 1 -- Wire up wishbone request latch out of stage 1
wishbone_out <= r1.wb; wishbone_out <= r1.wb;


-- Wishbone & BRAM write data formatting for stores (most of it already
-- happens in loadstore1, this is the remaining data shifting)
--
store_data <= std_logic_vector(shift_left(unsigned(d_in.data),
wishbone_data_shift(d_in.addr)));

-- Wishbone read and write and BRAM write sel bits generation -- Wishbone read and write and BRAM write sel bits generation
bus_sel <= wishbone_data_sel(d_in.length, d_in.addr); bus_sel <= wishbone_data_sel(d_in.length, d_in.addr);


-- See if the operation crosses two doublewords
two_dwords <= or (bus_sel(15 downto 8));

-- TODO: Generate errors -- TODO: Generate errors
-- err_nc_collision <= '1' when req_op = OP_BAD else '0'; -- err_nc_collision <= '1' when req_op = OP_BAD else '0';


@ -469,7 +495,7 @@ begin
d_out.write_shift <= r2.data_shift; d_out.write_shift <= r2.data_shift;
d_out.sign_extend <= r2.sign_extend; d_out.sign_extend <= r2.sign_extend;
d_out.byte_reverse <= r2.byte_reverse; d_out.byte_reverse <= r2.byte_reverse;
d_out.second_word <= '0'; d_out.second_word <= r2.second_dword;
d_out.xerc <= r2.xerc; d_out.xerc <= r2.xerc;


-- We have a valid load or store hit or we just completed a slow -- We have a valid load or store hit or we just completed a slow
@ -497,8 +523,10 @@ begin
if r2.hit_load_valid = '1' then if r2.hit_load_valid = '1' then
d_out.write_enable <= '1'; d_out.write_enable <= '1';


-- If it's not a load with update, complete it now -- If there isn't another dword to go and
if r2.load_is_update = '0' then -- it's not a load with update, complete it now
if r2.last_dword = '1' and r2.load_is_update = '0' then
report "completing load hit";
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;
end if; end if;
@ -521,10 +549,14 @@ begin
d_out.byte_reverse <= r1.req.byte_reverse; d_out.byte_reverse <= r1.req.byte_reverse;
d_out.write_len <= r1.req.length; d_out.write_len <= r1.req.length;
d_out.xerc <= r1.req.xerc; d_out.xerc <= r1.req.xerc;
d_out.second_word <= r1.second_dword;
end if; end if;


-- If it's a store or a non-update load form, complete now -- If it's a store or a non-update load form, complete now
if r1.req.load = '0' or r1.req.update = '0' then -- unless we need to do another dword transfer
if (r1.req.load = '0' or r1.req.update = '0') and
(r1.two_dwords = '0' or r1.second_dword = '1') then
report "completing store or load miss";
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;
end if; end if;
@ -543,11 +575,13 @@ begin
d_out.sign_extend <= '0'; d_out.sign_extend <= '0';
d_out.byte_reverse <= '0'; d_out.byte_reverse <= '0';
d_out.xerc <= r1.req.xerc; d_out.xerc <= r1.req.xerc;
d_out.second_word <= '0';


-- If it was a load, this completes the operation (load with -- If it was a load, this completes the operation (load with
-- update case). -- update case).
-- --
if r1.req.load = '1' then if r1.req.load = '1' then
report "completing after load update";
d_out.valid <= '1'; d_out.valid <= '1';
end if; end if;
end if; end if;
@ -605,11 +639,11 @@ begin
-- For timing, the mux on wr_data/sel/addr is not dependent on anything -- For timing, the mux on wr_data/sel/addr is not dependent on anything
-- other than the current state. Only the do_write signal is. -- other than the current state. Only the do_write signal is.
-- --
if r1.state = IDLE then if r1.state = IDLE or r1.state = NEXT_DWORD then
-- When IDLE, the only write path is the store-hit update case -- In these states, the only write path is the store-hit update case
wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_data <= store_data; wr_data <= req_data;
wr_sel <= bus_sel; wr_sel <= req_sel;
else else
-- Otherwise, we might be doing a reload -- Otherwise, we might be doing a reload
wr_data <= wishbone_in.dat; wr_data <= wishbone_in.dat;
@ -648,6 +682,8 @@ begin
r2.length <= r1.req.length; r2.length <= r1.req.length;
r2.sign_extend <= r1.req.sign_extend; r2.sign_extend <= r1.req.sign_extend;
r2.byte_reverse <= r1.req.byte_reverse; r2.byte_reverse <= r1.req.byte_reverse;
r2.second_dword <= r1.second_dword;
r2.last_dword <= r1.second_dword or not r1.two_dwords;


-- If we have a request incoming, we have to latch it as d_in.valid -- If we have a request incoming, we have to latch it as d_in.valid
-- is only set for a single cycle. It's up to the control logic to -- is only set for a single cycle. It's up to the control logic to
@ -655,8 +691,12 @@ begin
-- single issue on load/stores so we are fine, later, we can generate -- single issue on load/stores so we are fine, later, we can generate
-- a stall output if necessary). -- a stall output if necessary).


if req_op /= OP_NONE then if req_op /= OP_NONE and d_in.valid = '1' then
r1.req <= d_in; r1.req <= d_in;
r1.second_dword <= '0';
r1.two_dwords <= two_dwords;
r1.next_addr <= std_ulogic_vector(unsigned(d_in.addr(63 downto 3)) + 1) & "000";
r1.next_sel <= bus_sel(15 downto 8);


report "op:" & op_t'image(req_op) & report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(d_in.addr) & " addr:" & to_hstring(d_in.addr) &
@ -666,6 +706,8 @@ begin
" idx:" & integer'image(req_index) & " idx:" & integer'image(req_index) &
" tag:" & to_hstring(req_tag) & " tag:" & to_hstring(req_tag) &
" way: " & integer'image(req_hit_way); " way: " & integer'image(req_hit_way);
elsif r1.state = NEXT_DWORD then
r1.second_dword <= '1';
end if; end if;


-- Fast path for load/store hits. Set signals for the writeback controls. -- Fast path for load/store hits. Set signals for the writeback controls.
@ -713,24 +755,36 @@ begin
r1.update_valid <= '0'; r1.update_valid <= '0';


-- We cannot currently process a new request when not idle -- We cannot currently process a new request when not idle
assert req_op = OP_NONE or r1.state = IDLE report "request " & assert d_in.valid = '0' or r1.state = IDLE report "request " &
op_t'image(req_op) & " while in state " & state_t'image(r1.state) op_t'image(req_op) & " while in state " & state_t'image(r1.state)
severity FAILURE; severity FAILURE;


-- Main state machine -- Main state machine
case r1.state is case r1.state is
when IDLE => when IDLE | NEXT_DWORD =>
case req_op is case req_op is
when OP_LOAD_HIT => when OP_LOAD_HIT =>
if r1.state = IDLE then
-- If the load is misaligned then we will need to start
-- the state machine
if two_dwords = '1' then
r1.state <= NEXT_DWORD;
elsif d_in.update = '1' then
-- We have a load with update hit, we need the delayed update cycle -- We have a load with update hit, we need the delayed update cycle
if d_in.update = '1' then
r1.state <= LOAD_UPDATE; r1.state <= LOAD_UPDATE;
end if; end if;
else
if r1.req.update = '1' then
r1.state <= LOAD_UPDATE;
else
r1.state <= IDLE;
end if;
end if;


when OP_LOAD_MISS => when OP_LOAD_MISS =>
-- Normal load cache miss, start the reload machine -- Normal load cache miss, start the reload machine
-- --
report "cache miss addr:" & to_hstring(d_in.addr) & report "cache miss addr:" & to_hstring(req_addr) &
" idx:" & integer'image(req_index) & " idx:" & integer'image(req_index) &
" way:" & integer'image(replace_way) & " way:" & integer'image(replace_way) &
" tag:" & to_hstring(req_tag); " tag:" & to_hstring(req_tag);
@ -765,8 +819,8 @@ begin
r1.state <= RELOAD_WAIT_ACK; r1.state <= RELOAD_WAIT_ACK;


when OP_LOAD_NC => when OP_LOAD_NC =>
r1.wb.sel <= bus_sel; r1.wb.sel <= req_sel;
r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000"; r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';
r1.wb.we <= '0'; r1.wb.we <= '0';
@ -774,12 +828,10 @@ begin


when OP_STORE_HIT | OP_STORE_MISS => when OP_STORE_HIT | OP_STORE_MISS =>
-- For store-with-update do the register update -- For store-with-update do the register update
if d_in.update = '1' then r1.update_valid <= d_in.valid and d_in.update;
r1.update_valid <= '1'; r1.wb.sel <= req_sel;
end if; r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000";
r1.wb.sel <= bus_sel; r1.wb.dat <= req_data;
r1.wb.adr <= d_in.addr(r1.wb.adr'left downto 3) & "000";
r1.wb.dat <= store_data;
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';
r1.wb.we <= '1'; r1.wb.we <= '1';
@ -831,11 +883,13 @@ begin
-- Cache line is now valid -- Cache line is now valid
cache_valids(r1.store_index)(r1.store_way) <= '1'; cache_valids(r1.store_index)(r1.store_way) <= '1';


-- Complete the load that missed. For load with update -- Write back the load data that we got, and start
-- the second dword if necessary. Otherwise, see if
-- we also need to do the deferred update cycle. -- we also need to do the deferred update cycle.
--
r1.slow_valid <= '1'; r1.slow_valid <= '1';
if r1.req.update = '1' then if r1.two_dwords and not r1.second_dword then
r1.state <= NEXT_DWORD;
elsif r1.req.update = '1' then
r1.state <= LOAD_UPDATE2; r1.state <= LOAD_UPDATE2;
report "completing miss with load-update !"; report "completing miss with load-update !";
else else
@ -864,12 +918,15 @@ begin


-- Got ack ? complete. -- Got ack ? complete.
if wishbone_in.ack = '1' then if wishbone_in.ack = '1' then
if r1.two_dwords and not r1.second_dword then
r1.state <= NEXT_DWORD;
elsif r1.state = NC_LOAD_WAIT_ACK and r1.req.update = '1' then
r1.state <= LOAD_UPDATE2;
else
r1.state <= IDLE; r1.state <= IDLE;
end if;
if r1.state = NC_LOAD_WAIT_ACK then if r1.state = NC_LOAD_WAIT_ACK then
r1.slow_data <= wishbone_in.dat; r1.slow_data <= wishbone_in.dat;
if r1.req.update = '1' then
r1.state <= LOAD_UPDATE2;
end if;
end if; end if;
r1.slow_valid <= '1'; r1.slow_valid <= '1';
r1.wb.cyc <= '0'; r1.wb.cyc <= '0';

@ -35,12 +35,15 @@ begin


loadstore1_1: process(all) loadstore1_1: process(all)
variable v : Loadstore1ToDcacheType; variable v : Loadstore1ToDcacheType;
variable brev_lenm1 : unsigned(2 downto 0);
variable byte_offset : unsigned(2 downto 0);
variable j : integer;
variable k : unsigned(2 downto 0);
begin begin
v := r; v := r;


v.valid := l_in.valid; v.valid := l_in.valid;
v.load := l_in.load; v.load := l_in.load;
v.data := l_in.data;
v.write_reg := l_in.write_reg; v.write_reg := l_in.write_reg;
v.length := l_in.length; v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse; v.byte_reverse := l_in.byte_reverse;
@ -63,9 +66,18 @@ begin


-- XXX Do length_to_sel here ? -- XXX Do length_to_sel here ?


-- byte reverse stores in the first cycle -- Do byte reversing and rotating for stores in the first cycle
if v.load = '0' and l_in.byte_reverse = '1' then if v.load = '0' then
v.data := byte_reverse(l_in.data, to_integer(unsigned(l_in.length))); byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;
end if; end if;


v.addr := lsu_sum; v.addr := lsu_sum;

@ -116,12 +116,12 @@ begin
if l_in.byte_reverse = '1' then if l_in.byte_reverse = '1' then
brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1; brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1;
end if; end if;
w_out.write_enable <= '1';
second_word <= l_in.second_word; second_word <= l_in.second_word;
if l_in.valid = '0' and (data_len + byte_offset > 8) then if l_in.valid = '0' and (data_len + byte_offset > 8) then
partial_write <= '1'; partial_write <= '1';
end if; end if;
xe := l_in.xerc; xe := l_in.xerc;
w_out.write_enable <= not partial_write or second_word;
end if; end if;


-- shift and byte-reverse data bytes -- shift and byte-reverse data bytes

Loading…
Cancel
Save