Merge pull request #233 from paulusmack/master

Changes to improve timing
pull/236/head
Michael Neuling 4 years ago committed by GitHub
commit 178d7680af
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -315,6 +315,7 @@ package common is


type MmuToLoadstore1Type is record type MmuToLoadstore1Type is record
done : std_ulogic; done : std_ulogic;
err : std_ulogic;
invalid : std_ulogic; invalid : std_ulogic;
badtree : std_ulogic; badtree : std_ulogic;
segerr : std_ulogic; segerr : std_ulogic;

@ -202,7 +202,8 @@ begin
SIM => SIM, SIM => SIM,
LINE_SIZE => 64, LINE_SIZE => 64,
NUM_LINES => 64, NUM_LINES => 64,
NUM_WAYS => 2 NUM_WAYS => 2,
LOG_LENGTH => LOG_LENGTH
) )
port map( port map(
clk => clk, clk => clk,
@ -222,6 +223,9 @@ begin
icache_stall_in <= decode1_busy; icache_stall_in <= decode1_busy;


decode1_0: entity work.decode1 decode1_0: entity work.decode1
generic map(
LOG_LENGTH => LOG_LENGTH
)
port map ( port map (
clk => clk, clk => clk,
rst => rst_dec1, rst => rst_dec1,
@ -239,7 +243,8 @@ begin


decode2_0: entity work.decode2 decode2_0: entity work.decode2
generic map ( generic map (
EX1_BYPASS => EX1_BYPASS EX1_BYPASS => EX1_BYPASS,
LOG_LENGTH => LOG_LENGTH
) )
port map ( port map (
clk => clk, clk => clk,
@ -261,7 +266,8 @@ begin


register_file_0: entity work.register_file register_file_0: entity work.register_file
generic map ( generic map (
SIM => SIM SIM => SIM,
LOG_LENGTH => LOG_LENGTH
) )
port map ( port map (
clk => clk, clk => clk,
@ -279,7 +285,8 @@ begin


cr_file_0: entity work.cr_file cr_file_0: entity work.cr_file
generic map ( generic map (
SIM => SIM SIM => SIM,
LOG_LENGTH => LOG_LENGTH
) )
port map ( port map (
clk => clk, clk => clk,
@ -292,7 +299,8 @@ begin


execute1_0: entity work.execute1 execute1_0: entity work.execute1
generic map ( generic map (
EX1_BYPASS => EX1_BYPASS EX1_BYPASS => EX1_BYPASS,
LOG_LENGTH => LOG_LENGTH
) )
port map ( port map (
clk => clk, clk => clk,
@ -315,6 +323,9 @@ begin
); );


loadstore1_0: entity work.loadstore1 loadstore1_0: entity work.loadstore1
generic map (
LOG_LENGTH => LOG_LENGTH
)
port map ( port map (
clk => clk, clk => clk,
rst => rst_ls1, rst => rst_ls1,
@ -344,7 +355,8 @@ begin
generic map( generic map(
LINE_SIZE => 64, LINE_SIZE => 64,
NUM_LINES => 64, NUM_LINES => 64,
NUM_WAYS => 2 NUM_WAYS => 2,
LOG_LENGTH => LOG_LENGTH
) )
port map ( port map (
clk => clk, clk => clk,

@ -15,123 +15,81 @@ entity zero_counter is
end entity zero_counter; end entity zero_counter;


architecture behaviour of zero_counter is architecture behaviour of zero_counter is
type intermediate_result is record -- Reverse the order of bits in a word
v16: std_ulogic_vector(15 downto 0); function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
sel_hi: std_ulogic_vector(1 downto 0); variable ret: std_ulogic_vector(a'left downto a'right);
is_32bit: std_ulogic; begin
count_right: std_ulogic; for i in a'right to a'left loop
end record; ret(a'left + a'right - i) := a(i);

end loop;
signal r, r_in : intermediate_result; return ret;
end;


-- Return the index of the leftmost or rightmost 1 in a set of 4 bits. -- If there is only one bit set in a doubleword, return its bit number
-- Assumes v is not "0000"; if it is, return (right ? "11" : "00"). -- (counting from the right). Each bit of the result is obtained by
function encoder(v: std_ulogic_vector(3 downto 0); right: std_ulogic) return std_ulogic_vector is -- ORing together 32 bits of the input:
-- bit 0 = a[1] or a[3] or a[5] or ...
-- bit 1 = a[2] or a[3] or a[6] or a[7] or ...
-- bit 2 = a[4..7] or a[12..15] or ...
-- bit 5 = a[32..63] ORed together
function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
variable ret: std_ulogic_vector(5 downto 0);
variable stride: natural;
variable bit: std_ulogic;
variable k: natural;
begin begin
if right = '0' then stride := 2;
if v(3) = '1' then for i in 0 to 5 loop
return "11"; bit := '0';
elsif v(2) = '1' then for j in 0 to (64 / stride) - 1 loop
return "10"; k := j * stride;
elsif v(1) = '1' then bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
return "01"; end loop;
else ret(i) := bit;
return "00"; stride := stride * 2;
end if; end loop;
else return ret;
if v(0) = '1' then
return "00";
elsif v(1) = '1' then
return "01";
elsif v(2) = '1' then
return "10";
else
return "11";
end if;
end if;
end; end;


signal inp : std_ulogic_vector(63 downto 0);
signal sum : std_ulogic_vector(64 downto 0);
signal msb_r : std_ulogic;
signal onehot : std_ulogic_vector(63 downto 0);
signal onehot_r : std_ulogic_vector(63 downto 0);
signal bitnum : std_ulogic_vector(5 downto 0);

begin begin
zerocounter_0: process(clk) countzero_r: process(clk)
begin begin
if rising_edge(clk) then if rising_edge(clk) then
r <= r_in; msb_r <= sum(64);
onehot_r <= onehot;
end if; end if;
end process; end process;


zerocounter_1: process(all) countzero: process(all)
variable v: intermediate_result;
variable y, z: std_ulogic_vector(3 downto 0);
variable sel: std_ulogic_vector(5 downto 0);
variable v4: std_ulogic_vector(3 downto 0);

begin begin
-- Test 4 groups of 16 bits each.
-- The top 2 groups are considered to be zero in 32-bit mode.
z(0) := or (rs(15 downto 0));
z(1) := or (rs(31 downto 16));
z(2) := or (rs(47 downto 32));
z(3) := or (rs(63 downto 48));
if is_32bit = '0' then if is_32bit = '0' then
v.sel_hi := encoder(z, count_right); if count_right = '0' then
inp <= bit_reverse(rs);
else
inp <= rs;
end if;
else else
v.sel_hi(1) := '0'; inp(63 downto 32) <= x"FFFFFFFF";
if count_right = '0' then if count_right = '0' then
v.sel_hi(0) := z(1); inp(31 downto 0) <= bit_reverse(rs(31 downto 0));
else else
v.sel_hi(0) := not z(0); inp(31 downto 0) <= rs(31 downto 0);
end if; end if;
end if; end if;


-- Select the leftmost/rightmost non-zero group of 16 bits sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);
case v.sel_hi is onehot <= sum(63 downto 0) and inp;
when "00" =>
v.v16 := rs(15 downto 0);
when "01" =>
v.v16 := rs(31 downto 16);
when "10" =>
v.v16 := rs(47 downto 32);
when others =>
v.v16 := rs(63 downto 48);
end case;

-- Latch this and do the rest in the next cycle, for the sake of timing
v.is_32bit := is_32bit;
v.count_right := count_right;
r_in <= v;
sel(5 downto 4) := r.sel_hi;

-- Test 4 groups of 4 bits
y(0) := or (r.v16(3 downto 0));
y(1) := or (r.v16(7 downto 4));
y(2) := or (r.v16(11 downto 8));
y(3) := or (r.v16(15 downto 12));
sel(3 downto 2) := encoder(y, r.count_right);

-- Select the leftmost/rightmost non-zero group of 4 bits
case sel(3 downto 2) is
when "00" =>
v4 := r.v16(3 downto 0);
when "01" =>
v4 := r.v16(7 downto 4);
when "10" =>
v4 := r.v16(11 downto 8);
when others =>
v4 := r.v16(15 downto 12);
end case;

sel(1 downto 0) := encoder(v4, r.count_right);


-- sel is now the index of the leftmost/rightmost 1 bit in rs -- The following occurs after a clock edge
if v4 = "0000" then bitnum <= bit_number(onehot_r);
-- operand is zero, return 32 for 32-bit, else 64
result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000";
elsif r.count_right = '0' then
-- return (63 - sel), trimmed to 5 bits in 32-bit mode
result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0);
else
result <= x"00000000000000" & "00" & sel;
end if;


result <= x"00000000000000" & "0" & msb_r & bitnum;
end process; end process;
end behaviour; end behaviour;

@ -7,7 +7,9 @@ use work.common.all;


entity cr_file is entity cr_file is
generic ( generic (
SIM : boolean := false SIM : boolean := false;
-- Non-zero to enable log data collection
LOG_LENGTH : natural := 0
); );
port( port(
clk : in std_logic; clk : in std_logic;
@ -29,7 +31,6 @@ architecture behaviour of cr_file is
signal crs_updated : std_ulogic_vector(31 downto 0); signal crs_updated : std_ulogic_vector(31 downto 0);
signal xerc : xer_common_t := xerc_init; signal xerc : xer_common_t := xerc_init;
signal xerc_updated : xer_common_t; signal xerc_updated : xer_common_t;
signal log_data : std_ulogic_vector(12 downto 0);
begin begin
cr_create_0: process(all) cr_create_0: process(all)
variable hi, lo : integer := 0; variable hi, lo : integer := 0;
@ -91,14 +92,18 @@ begin
end process; end process;
end generate; end generate;


cr_log: process(clk) cf_log: if LOG_LENGTH > 0 generate
signal log_data : std_ulogic_vector(12 downto 0);
begin begin
if rising_edge(clk) then cr_log: process(clk)
log_data <= w_in.write_cr_enable & begin
w_in.write_cr_data(31 downto 28) & if rising_edge(clk) then
w_in.write_cr_mask; log_data <= w_in.write_cr_enable &
end if; w_in.write_cr_data(31 downto 28) &
end process; w_in.write_cr_mask;
log_out <= log_data; end if;
end process;
log_out <= log_data;
end generate;


end architecture behaviour; end architecture behaviour;

@ -31,7 +31,9 @@ entity dcache is
-- L1 DTLB number of sets -- L1 DTLB number of sets
TLB_NUM_WAYS : positive := 2; TLB_NUM_WAYS : positive := 2;
-- L1 DTLB log_2(page_size) -- L1 DTLB log_2(page_size)
TLB_LG_PGSZ : positive := 12 TLB_LG_PGSZ : positive := 12;
-- Non-zero to enable log data collection
LOG_LENGTH : natural := 0
); );
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
@ -226,13 +228,14 @@ architecture rtl of dcache is


type mem_access_request_t is record type mem_access_request_t is record
op : op_t; op : op_t;
valid : std_ulogic;
dcbz : std_ulogic; dcbz : std_ulogic;
real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
data : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0);
byte_sel : std_ulogic_vector(7 downto 0); byte_sel : std_ulogic_vector(7 downto 0);
hit_way : way_t; hit_way : way_t;
repl_way : way_t;
same_tag : std_ulogic; same_tag : std_ulogic;
mmu_req : std_ulogic;
end record; end record;


-- First stage register, contains state for stage 1 of load hits -- First stage register, contains state for stage 1 of load hits
@ -247,6 +250,13 @@ architecture rtl of dcache is
-- Cache hit state -- Cache hit state
hit_way : way_t; hit_way : way_t;
hit_load_valid : std_ulogic; hit_load_valid : std_ulogic;
hit_index : index_t;
cache_hit : std_ulogic;

-- TLB hit state
tlb_hit : std_ulogic;
tlb_hit_way : tlb_way_t;
tlb_hit_index : tlb_index_t;


-- 2-stage data buffer for data forwarded from writes to reads -- 2-stage data buffer for data forwarded from writes to reads
forward_data1 : std_ulogic_vector(63 downto 0); forward_data1 : std_ulogic_vector(63 downto 0);
@ -272,16 +282,18 @@ architecture rtl of dcache is
end_row_ix : row_in_line_t; end_row_ix : row_in_line_t;
rows_valid : row_per_line_valid_t; rows_valid : row_per_line_valid_t;
acks_pending : unsigned(2 downto 0); acks_pending : unsigned(2 downto 0);

inc_acks : std_ulogic;
-- Signals to complete with error dec_acks : std_ulogic;
error_done : std_ulogic;
-- Signals to complete (possibly with error)
ls_valid : std_ulogic;
ls_error : std_ulogic;
mmu_done : std_ulogic;
mmu_error : std_ulogic;
cache_paradox : std_ulogic; cache_paradox : std_ulogic;


-- Signal to complete a failed stcx. -- Signal to complete a failed stcx.
stcx_fail : std_ulogic; stcx_fail : std_ulogic;

-- completion signal for tlbie
tlbie_done : std_ulogic;
end record; end record;


signal r1 : reg_stage_1_t; signal r1 : reg_stage_1_t;
@ -303,6 +315,7 @@ architecture rtl of dcache is
signal req_op : op_t; signal req_op : op_t;
signal req_data : std_ulogic_vector(63 downto 0); signal req_data : std_ulogic_vector(63 downto 0);
signal req_same_tag : std_ulogic; signal req_same_tag : std_ulogic;
signal req_go : std_ulogic;


signal early_req_row : row_t; signal early_req_row : row_t;


@ -455,8 +468,6 @@ architecture rtl of dcache is
ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; ptes(j + TLB_PTE_BITS - 1 downto j) := newpte;
end; end;


signal log_data : std_ulogic_vector(19 downto 0);

begin begin


assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
@ -566,15 +577,15 @@ begin
lru => tlb_plru_out lru => tlb_plru_out
); );


process(tlb_req_index, tlb_hit, tlb_hit_way, tlb_plru_out) process(all)
begin begin
-- PLRU interface -- PLRU interface
if tlb_hit = '1' and tlb_req_index = i then if r1.tlb_hit_index = i then
tlb_plru_acc_en <= '1'; tlb_plru_acc_en <= r1.tlb_hit;
else else
tlb_plru_acc_en <= '0'; tlb_plru_acc_en <= '0';
end if; end if;
tlb_plru_acc <= std_ulogic_vector(to_unsigned(tlb_hit_way, TLB_WAY_BITS)); tlb_plru_acc <= std_ulogic_vector(to_unsigned(r1.tlb_hit_way, TLB_WAY_BITS));
tlb_plru_victim(i) <= tlb_plru_out; tlb_plru_victim(i) <= tlb_plru_out;
end process; end process;
end generate; end generate;
@ -677,16 +688,15 @@ begin
lru => plru_out lru => plru_out
); );


process(req_index, req_op, req_hit_way, plru_out) process(all)
begin begin
-- PLRU interface -- PLRU interface
if (req_op = OP_LOAD_HIT or if r1.hit_index = i then
req_op = OP_STORE_HIT) and req_index = i then plru_acc_en <= r1.cache_hit;
plru_acc_en <= '1';
else else
plru_acc_en <= '0'; plru_acc_en <= '0';
end if; end if;
plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); plru_acc <= std_ulogic_vector(to_unsigned(r1.hit_way, WAY_BITS));
plru_victim(i) <= plru_out; plru_victim(i) <= plru_out;
end process; end process;
end generate; end generate;
@ -730,7 +740,7 @@ begin
req_row <= get_row(r0.req.addr); req_row <= get_row(r0.req.addr);
req_tag <= get_tag(ra); req_tag <= get_tag(ra);


go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.error_done; go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.ls_error;


-- Test if pending request is a hit on any way -- Test if pending request is a hit on any way
-- In order to make timing in virtual mode, when we are using the TLB, -- In order to make timing in virtual mode, when we are using the TLB,
@ -788,7 +798,7 @@ begin
-- since it will be by the time we perform the store. -- since it will be by the time we perform the store.
-- For a load, check the appropriate row valid bit. -- For a load, check the appropriate row valid bit.
is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE); is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE);
hit_way := r1.store_way; hit_way := replace_way;
end if; end if;


-- Whether to use forwarded data for a load or not -- Whether to use forwarded data for a load or not
@ -811,8 +821,12 @@ begin
-- The way that matched on a hit -- The way that matched on a hit
req_hit_way <= hit_way; req_hit_way <= hit_way;


-- The way to replace on a miss -- The way to replace on a miss
replace_way <= to_integer(unsigned(plru_victim(req_index))); if r1.write_tag = '1' then
replace_way <= to_integer(unsigned(plru_victim(r1.store_index)));
else
replace_way <= r1.store_way;
end if;


-- work out whether we have permission for this access -- work out whether we have permission for this access
-- NB we don't yet implement AMR, thus no KUAP -- NB we don't yet implement AMR, thus no KUAP
@ -847,6 +861,7 @@ begin
end if; end if;
end if; end if;
req_op <= op; req_op <= op;
req_go <= go;


-- Version of the row number that is valid one cycle earlier -- Version of the row number that is valid one cycle earlier
-- in the cases where we need to read the cache data BRAM. -- in the cases where we need to read the cache data BRAM.
@ -928,15 +943,15 @@ begin
end if; end if;
end loop; end loop;


d_out.valid <= '0'; d_out.valid <= r1.ls_valid;
d_out.data <= data_out; d_out.data <= data_out;
d_out.store_done <= '0'; d_out.store_done <= not r1.stcx_fail;
d_out.error <= '0'; d_out.error <= r1.ls_error;
d_out.cache_paradox <= '0'; d_out.cache_paradox <= r1.cache_paradox;


-- Outputs to MMU -- Outputs to MMU
m_out.done <= r1.tlbie_done; m_out.done <= r1.mmu_done;
m_out.err <= '0'; m_out.err <= r1.mmu_error;
m_out.data <= data_out; m_out.data <= data_out;


-- We have a valid load or store hit or we just completed a slow -- We have a valid load or store hit or we just completed a slow
@ -962,47 +977,32 @@ begin
-- Load hit case is the standard path -- Load hit case is the standard path
if r1.hit_load_valid = '1' then if r1.hit_load_valid = '1' then
report "completing load hit data=" & to_hstring(data_out); report "completing load hit data=" & to_hstring(data_out);
d_out.valid <= '1';
end if; end if;


-- error cases complete without stalling -- error cases complete without stalling
if r1.error_done = '1' then if r1.ls_error = '1' then
report "completing ld/st with error"; report "completing ld/st with error";
d_out.error <= '1';
d_out.cache_paradox <= r1.cache_paradox;
d_out.valid <= '1';
end if; end if;


-- Slow ops (load miss, NC, stores) -- Slow ops (load miss, NC, stores)
if r1.slow_valid = '1' then if r1.slow_valid = '1' then
d_out.store_done <= '1';
report "completing store or load miss data=" & to_hstring(data_out); report "completing store or load miss data=" & to_hstring(data_out);
d_out.valid <= '1';
end if;

if r1.stcx_fail = '1' then
d_out.store_done <= '0';
d_out.valid <= '1';
end if; end if;


else else
-- Request came from MMU -- Request came from MMU
if r1.hit_load_valid = '1' then if r1.hit_load_valid = '1' then
report "completing load hit to MMU, data=" & to_hstring(m_out.data); report "completing load hit to MMU, data=" & to_hstring(m_out.data);
m_out.done <= '1';
end if; end if;


-- error cases complete without stalling -- error cases complete without stalling
if r1.error_done = '1' then if r1.mmu_error = '1' then
report "completing MMU ld with error"; report "completing MMU ld with error";
m_out.err <= '1';
m_out.done <= '1';
end if; end if;


-- Slow ops (i.e. load miss) -- Slow ops (i.e. load miss)
if r1.slow_valid = '1' then if r1.slow_valid = '1' then
report "completing MMU load miss, data=" & to_hstring(m_out.data); report "completing MMU load miss, data=" & to_hstring(m_out.data);
m_out.done <= '1';
end if; end if;
end if; end if;


@ -1079,7 +1079,7 @@ begin
wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));
wr_sel <= (others => '1'); wr_sel <= (others => '1');


if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r1.store_way = i then if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then
do_write <= '1'; do_write <= '1';
end if; end if;
end if; end if;
@ -1113,20 +1113,28 @@ begin
end if; end if;


-- Fast path for load/store hits. Set signals for the writeback controls. -- Fast path for load/store hits. Set signals for the writeback controls.
r1.hit_way <= req_hit_way;
r1.hit_index <= req_index;
if req_op = OP_LOAD_HIT then if req_op = OP_LOAD_HIT then
r1.hit_way <= req_hit_way;
r1.hit_load_valid <= '1'; r1.hit_load_valid <= '1';
else else
r1.hit_load_valid <= '0'; r1.hit_load_valid <= '0';
end if; end if;
if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then
r1.cache_hit <= '1';
else
r1.cache_hit <= '0';
end if;


if req_op = OP_BAD then if req_op = OP_BAD then
report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) &
" rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok);
r1.error_done <= '1'; r1.ls_error <= not r0.mmu_req;
r1.mmu_error <= r0.mmu_req;
r1.cache_paradox <= access_ok; r1.cache_paradox <= access_ok;
else else
r1.error_done <= '0'; r1.ls_error <= '0';
r1.mmu_error <= '0';
r1.cache_paradox <= '0'; r1.cache_paradox <= '0';
end if; end if;


@ -1136,8 +1144,11 @@ begin
r1.stcx_fail <= '0'; r1.stcx_fail <= '0';
end if; end if;


-- complete tlbies and TLB loads in the third cycle -- Record TLB hit information for updating TLB PLRU
r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); r1.tlb_hit <= tlb_hit;
r1.tlb_hit_way <= tlb_hit_way;
r1.tlb_hit_index <= tlb_req_index;

end if; end if;
end process; end process;


@ -1179,7 +1190,7 @@ begin
r1.forward_data1 <= wishbone_in.dat; r1.forward_data1 <= wishbone_in.dat;
end if; end if;
r1.forward_sel1 <= (others => '1'); r1.forward_sel1 <= (others => '1');
r1.forward_way1 <= r1.store_way; r1.forward_way1 <= replace_way;
r1.forward_row1 <= r1.store_row; r1.forward_row1 <= r1.store_row;
r1.forward_valid1 <= '0'; r1.forward_valid1 <= '0';
end if; end if;
@ -1194,6 +1205,8 @@ begin
r1.slow_valid <= '0'; r1.slow_valid <= '0';
r1.wb.cyc <= '0'; r1.wb.cyc <= '0';
r1.wb.stb <= '0'; r1.wb.stb <= '0';
r1.ls_valid <= '0';
r1.mmu_done <= '0';


-- Not useful normally but helps avoiding tons of sim warnings -- Not useful normally but helps avoiding tons of sim warnings
r1.wb.adr <= (others => '0'); r1.wb.adr <= (others => '0');
@ -1201,15 +1214,29 @@ begin
-- One cycle pulses reset -- One cycle pulses reset
r1.slow_valid <= '0'; r1.slow_valid <= '0';
r1.write_bram <= '0'; r1.write_bram <= '0';
r1.inc_acks <= '0';
r1.dec_acks <= '0';

r1.ls_valid <= '0';
-- complete tlbies and TLB loads in the third cycle
r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld);
if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then
if r0.mmu_req = '0' then
r1.ls_valid <= '1';
else
r1.mmu_done <= '1';
end if;
end if;


if r1.write_tag = '1' then if r1.write_tag = '1' then
-- Store new tag in selected way -- Store new tag in selected way
for i in 0 to NUM_WAYS-1 loop for i in 0 to NUM_WAYS-1 loop
if i = r1.store_way then if i = replace_way then
cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <= cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
(TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag; (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
end if; end if;
end loop; end loop;
r1.store_way <= replace_way;
r1.write_tag <= '0'; r1.write_tag <= '0';
end if; end if;


@ -1219,12 +1246,23 @@ begin
req := r1.req; req := r1.req;
else else
req.op := req_op; req.op := req_op;
req.valid := req_go;
req.mmu_req := r0.mmu_req;
req.dcbz := r0.req.dcbz; req.dcbz := r0.req.dcbz;
req.real_addr := ra; req.real_addr := ra;
req.data := r0.req.data; -- Force data to 0 for dcbz
req.byte_sel := r0.req.byte_sel; if r0.req.dcbz = '0' then
req.data := r0.req.data;
else
req.data := (others => '0');
end if;
-- Select all bytes for dcbz and for cacheable loads
if r0.req.dcbz = '1' or (r0.req.load = '1' and r0.req.nc = '0') then
req.byte_sel := (others => '1');
else
req.byte_sel := r0.req.byte_sel;
end if;
req.hit_way := req_hit_way; req.hit_way := req_hit_way;
req.repl_way := replace_way;
req.same_tag := req_same_tag; req.same_tag := req_same_tag;


-- Store the incoming request from r0, if it is a slow request -- Store the incoming request from r0, if it is a slow request
@ -1240,7 +1278,9 @@ begin
case r1.state is case r1.state is
when IDLE => when IDLE =>
r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0);
r1.dcbz <= '0'; r1.wb.sel <= req.byte_sel;
r1.wb.dat <= req.data;
r1.dcbz <= req.dcbz;


-- Keep track of our index and way for subsequent stores. -- Keep track of our index and way for subsequent stores.
r1.store_index <= get_index(req.real_addr); r1.store_index <= get_index(req.real_addr);
@ -1251,8 +1291,6 @@ begin


if req.op = OP_STORE_HIT then if req.op = OP_STORE_HIT then
r1.store_way <= req.hit_way; r1.store_way <= req.hit_way;
else
r1.store_way <= req.repl_way;
end if; end if;


-- Reset per-row valid bits, ready for handling OP_LOAD_MISS -- Reset per-row valid bits, ready for handling OP_LOAD_MISS
@ -1269,11 +1307,9 @@ begin
-- --
report "cache miss real addr:" & to_hstring(req.real_addr) & report "cache miss real addr:" & to_hstring(req.real_addr) &
" idx:" & integer'image(get_index(req.real_addr)) & " idx:" & integer'image(get_index(req.real_addr)) &
" way:" & integer'image(req.repl_way) &
" tag:" & to_hstring(get_tag(req.real_addr)); " tag:" & to_hstring(get_tag(req.real_addr));


-- Start the wishbone cycle -- Start the wishbone cycle
r1.wb.sel <= (others => '1');
r1.wb.we <= '0'; r1.wb.we <= '0';
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';
@ -1283,7 +1319,6 @@ begin
r1.write_tag <= '1'; r1.write_tag <= '1';


when OP_LOAD_NC => when OP_LOAD_NC =>
r1.wb.sel <= req.byte_sel;
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
r1.wb.stb <= '1'; r1.wb.stb <= '1';
r1.wb.we <= '0'; r1.wb.we <= '0';
@ -1291,27 +1326,25 @@ begin


when OP_STORE_HIT | OP_STORE_MISS => when OP_STORE_HIT | OP_STORE_MISS =>
if req.dcbz = '0' then if req.dcbz = '0' then
r1.wb.sel <= req.byte_sel;
r1.wb.dat <= req.data;
r1.state <= STORE_WAIT_ACK; r1.state <= STORE_WAIT_ACK;
r1.acks_pending <= to_unsigned(1, 3); r1.acks_pending <= to_unsigned(1, 3);
r1.full <= '0'; r1.full <= '0';
r1.slow_valid <= '1'; r1.slow_valid <= '1';
if req.mmu_req = '0' then
r1.ls_valid <= '1';
else
r1.mmu_done <= '1';
end if;
if req.op = OP_STORE_HIT then if req.op = OP_STORE_HIT then
r1.write_bram <= '1'; r1.write_bram <= '1';
end if; end if;
else else
-- dcbz is handled much like a load miss except -- dcbz is handled much like a load miss except
-- that we are writing to memory instead of reading -- that we are writing to memory instead of reading

-- Start the wishbone writes
r1.wb.sel <= (others => '1');
r1.wb.dat <= (others => '0');

-- Handle the rest like a load miss
r1.state <= RELOAD_WAIT_ACK; r1.state <= RELOAD_WAIT_ACK;
r1.write_tag <= '1'; if req.op = OP_STORE_MISS then
r1.dcbz <= '1'; r1.write_tag <= '1';
end if;
end if; end if;
r1.wb.we <= '1'; r1.wb.we <= '1';
r1.wb.cyc <= '1'; r1.wb.cyc <= '1';
@ -1357,6 +1390,11 @@ begin
r1.store_row = get_row(r1.req.real_addr) then r1.store_row = get_row(r1.req.real_addr) then
r1.full <= '0'; r1.full <= '0';
r1.slow_valid <= '1'; r1.slow_valid <= '1';
if r1.mmu_req = '0' then
r1.ls_valid <= '1';
else
r1.mmu_done <= '1';
end if;
r1.forward_sel <= (others => '1'); r1.forward_sel <= (others => '1');
r1.use_forward1 <= '1'; r1.use_forward1 <= '1';
end if; end if;
@ -1379,15 +1417,26 @@ begin
when STORE_WAIT_ACK => when STORE_WAIT_ACK =>
stbs_done := r1.wb.stb = '0'; stbs_done := r1.wb.stb = '0';
acks := r1.acks_pending; acks := r1.acks_pending;
if r1.inc_acks /= r1.dec_acks then
if r1.inc_acks = '1' then
acks := acks + 1;
else
acks := acks - 1;
end if;
end if;
r1.acks_pending <= acks;
-- Clear stb when slave accepted request -- Clear stb when slave accepted request
if wishbone_in.stall = '0' then if wishbone_in.stall = '0' then
-- See if there is another store waiting to be done -- See if there is another store waiting to be done
-- which is in the same real page. -- which is in the same real page.
if acks < 7 and req.same_tag = '1' and if req.valid = '1' then
(req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then r1.wb.adr(SET_SIZE_BITS - 1 downto 0) <=
r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); req.real_addr(SET_SIZE_BITS - 1 downto 0);
r1.wb.dat <= req.data; r1.wb.dat <= req.data;
r1.wb.sel <= req.byte_sel; r1.wb.sel <= req.byte_sel;
end if;
if acks < 7 and req.same_tag = '1' and
(req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then
r1.wb.stb <= '1'; r1.wb.stb <= '1';
stbs_done := false; stbs_done := false;
if req.op = OP_STORE_HIT then if req.op = OP_STORE_HIT then
@ -1395,7 +1444,10 @@ begin
end if; end if;
r1.full <= '0'; r1.full <= '0';
r1.slow_valid <= '1'; r1.slow_valid <= '1';
acks := acks + 1; -- Store requests never come from the MMU
r1.ls_valid <= '1';
stbs_done := false;
r1.inc_acks <= '1';
else else
r1.wb.stb <= '0'; r1.wb.stb <= '0';
stbs_done := true; stbs_done := true;
@ -1409,9 +1461,8 @@ begin
r1.wb.cyc <= '0'; r1.wb.cyc <= '0';
r1.wb.stb <= '0'; r1.wb.stb <= '0';
end if; end if;
acks := acks - 1; r1.dec_acks <= '1';
end if; end if;
r1.acks_pending <= acks;


when NC_LOAD_WAIT_ACK => when NC_LOAD_WAIT_ACK =>
-- Clear stb when slave accepted request -- Clear stb when slave accepted request
@ -1424,6 +1475,11 @@ begin
r1.state <= IDLE; r1.state <= IDLE;
r1.full <= '0'; r1.full <= '0';
r1.slow_valid <= '1'; r1.slow_valid <= '1';
if r1.mmu_req = '0' then
r1.ls_valid <= '1';
else
r1.mmu_done <= '1';
end if;
r1.forward_sel <= (others => '1'); r1.forward_sel <= (others => '1');
r1.use_forward1 <= '1'; r1.use_forward1 <= '1';
r1.wb.cyc <= '0'; r1.wb.cyc <= '0';
@ -1434,21 +1490,25 @@ begin
end if; end if;
end process; end process;


dcache_log: process(clk) dc_log: if LOG_LENGTH > 0 generate
signal log_data : std_ulogic_vector(19 downto 0);
begin begin
if rising_edge(clk) then dcache_log: process(clk)
log_data <= r1.wb.adr(5 downto 3) & begin
wishbone_in.stall & if rising_edge(clk) then
wishbone_in.ack & log_data <= r1.wb.adr(5 downto 3) &
r1.wb.stb & r1.wb.cyc & wishbone_in.stall &
d_out.error & wishbone_in.ack &
d_out.valid & r1.wb.stb & r1.wb.cyc &
std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) & d_out.error &
stall_out & d_out.valid &
std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) & std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) &
valid_ra & stall_out &
std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3)); std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) &
end if; valid_ra &
end process; std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3));
log_out <= log_data; end if;
end process;
log_out <= log_data;
end generate;
end; end;

@ -7,6 +7,10 @@ use work.common.all;
use work.decode_types.all; use work.decode_types.all;


entity decode1 is entity decode1 is
generic (
-- Non-zero to enable log data collection
LOG_LENGTH : natural := 0
);
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
rst : in std_ulogic; rst : in std_ulogic;
@ -47,7 +51,7 @@ architecture behaviour of decode1 is
15 => (ALU, OP_ADD, RA_OR_ZERO, CONST_SI_HI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addis 15 => (ALU, OP_ADD, RA_OR_ZERO, CONST_SI_HI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addis
28 => (ALU, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andi. 28 => (ALU, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andi.
29 => (ALU, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andis. 29 => (ALU, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andis.
0 => (ALU, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- attn 0 => (ALU, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- attn
18 => (ALU, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b 18 => (ALU, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b
16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc 16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc
11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi
@ -73,9 +77,9 @@ architecture behaviour of decode1 is
45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu
36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw
37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwu 37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwu
8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic
2 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi 2 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi
3 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- twi 3 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- twi
26 => (ALU, OP_XOR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xori 26 => (ALU, OP_XOR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xori
27 => (ALU, OP_XOR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xoris 27 => (ALU, OP_XOR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xoris
others => illegal_inst others => illegal_inst
@ -357,8 +361,6 @@ architecture behaviour of decode1 is
constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');


signal log_data : std_ulogic_vector(12 downto 0);

begin begin
decode1_0: process(clk) decode1_0: process(clk)
begin begin
@ -524,15 +526,19 @@ begin
flush_out <= f.redirect; flush_out <= f.redirect;
end process; end process;


dec1_log : process(clk) d1_log: if LOG_LENGTH > 0 generate
signal log_data : std_ulogic_vector(12 downto 0);
begin begin
if rising_edge(clk) then dec1_log : process(clk)
log_data <= std_ulogic_vector(to_unsigned(insn_type_t'pos(r.decode.insn_type), 6)) & begin
r.nia(5 downto 2) & if rising_edge(clk) then
std_ulogic_vector(to_unsigned(unit_t'pos(r.decode.unit), 2)) & log_data <= std_ulogic_vector(to_unsigned(insn_type_t'pos(r.decode.insn_type), 6)) &
r.valid; r.nia(5 downto 2) &
end if; std_ulogic_vector(to_unsigned(unit_t'pos(r.decode.unit), 2)) &
end process; r.valid;
log_out <= log_data; end if;
end process;
log_out <= log_data;
end generate;


end architecture behaviour; end architecture behaviour;

@ -10,7 +10,9 @@ use work.insn_helpers.all;


entity decode2 is entity decode2 is
generic ( generic (
EX1_BYPASS : boolean := true EX1_BYPASS : boolean := true;
-- Non-zero to enable log data collection
LOG_LENGTH : natural := 0
); );
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
@ -47,8 +49,6 @@ architecture behaviour of decode2 is


signal deferred : std_ulogic; signal deferred : std_ulogic;


signal log_data : std_ulogic_vector(9 downto 0);

type decode_input_reg_t is record type decode_input_reg_t is record
reg_valid : std_ulogic; reg_valid : std_ulogic;
reg : gspr_index_t; reg : gspr_index_t;
@ -415,18 +415,22 @@ begin
e_out <= r.e; e_out <= r.e;
end process; end process;


dec2_log : process(clk) d2_log: if LOG_LENGTH > 0 generate
signal log_data : std_ulogic_vector(9 downto 0);
begin begin
if rising_edge(clk) then dec2_log : process(clk)
log_data <= r.e.nia(5 downto 2) & begin
r.e.valid & if rising_edge(clk) then
stopped_out & log_data <= r.e.nia(5 downto 2) &
stall_out & r.e.valid &
r.e.bypass_data3 & stopped_out &
r.e.bypass_data2 & stall_out &
r.e.bypass_data1; r.e.bypass_data3 &
end if; r.e.bypass_data2 &
end process; r.e.bypass_data1;
log_out <= log_data; end if;
end process;
log_out <= log_data;
end generate;


end architecture behaviour; end architecture behaviour;

@ -12,7 +12,9 @@ use work.ppc_fx_insns.all;


entity execute1 is entity execute1 is
generic ( generic (
EX1_BYPASS : boolean := true EX1_BYPASS : boolean := true;
-- Non-zero to enable log data collection
LOG_LENGTH : natural := 0
); );
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
@ -97,7 +99,6 @@ architecture behaviour of execute1 is
-- signals for logging -- signals for logging
signal exception_log : std_ulogic; signal exception_log : std_ulogic;
signal irq_valid_log : std_ulogic; signal irq_valid_log : std_ulogic;
signal log_data : std_ulogic_vector(14 downto 0);


type privilege_level is (USER, SUPER); type privilege_level is (USER, SUPER);
type op_privilege_array is array(insn_type_t) of privilege_level; type op_privilege_array is array(insn_type_t) of privilege_level;
@ -619,12 +620,12 @@ begin
end loop; end loop;
else else
-- trap instructions (tw, twi, td, tdi) -- trap instructions (tw, twi, td, tdi)
v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
-- set bit 46 to say trap occurred
ctrl_tmp.srr1(63 - 46) <= '1';
if or (trapval and insn_to(e_in.insn)) = '1' then if or (trapval and insn_to(e_in.insn)) = '1' then
-- generate trap-type program interrupt -- generate trap-type program interrupt
exception := '1'; exception := '1';
v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
-- set bit 46 to say trap occurred
ctrl_tmp.srr1(63 - 46) <= '1';
report "trap"; report "trap";
end if; end if;
end if; end if;
@ -1083,21 +1084,25 @@ begin
irq_valid_log <= irq_valid; irq_valid_log <= irq_valid;
end process; end process;


ex1_log : process(clk) e1_log: if LOG_LENGTH > 0 generate
signal log_data : std_ulogic_vector(14 downto 0);
begin begin
if rising_edge(clk) then ex1_log : process(clk)
log_data <= ctrl.msr(MSR_EE) & ctrl.msr(MSR_PR) & begin
ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & if rising_edge(clk) then
exception_log & log_data <= ctrl.msr(MSR_EE) & ctrl.msr(MSR_PR) &
irq_valid_log & ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) &
std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) & exception_log &
"000" & irq_valid_log &
r.e.write_enable & std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) &
r.e.valid & "000" &
f_out.redirect & r.e.write_enable &
r.busy & r.e.valid &
flush_out; f_out.redirect &
end if; r.busy &
end process; flush_out;
log_out <= log_data; end if;
end process;
log_out <= log_data;
end generate;
end architecture behaviour; end architecture behaviour;

@ -47,7 +47,9 @@ entity icache is
-- L1 ITLB log_2(page_size) -- L1 ITLB log_2(page_size)
TLB_LG_PGSZ : positive := 12; TLB_LG_PGSZ : positive := 12;
-- Number of real address bits that we store -- Number of real address bits that we store
REAL_ADDR_BITS : positive := 56 REAL_ADDR_BITS : positive := 56;
-- Non-zero to enable log data collection
LOG_LENGTH : natural := 0
); );
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
@ -207,9 +209,6 @@ architecture rtl of icache is
signal access_ok : std_ulogic; signal access_ok : std_ulogic;
signal use_previous : std_ulogic; signal use_previous : std_ulogic;


-- Output data to logger
signal log_data : std_ulogic_vector(53 downto 0);

-- Cache RAM interface -- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t; type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out : cache_ram_out_t; signal cache_out : cache_ram_out_t;
@ -379,7 +378,7 @@ begin
begin begin
do_read <= not (stall_in or use_previous); do_read <= not (stall_in or use_previous);
do_write <= '0'; do_write <= '0';
if wishbone_in.ack = '1' and r.store_way = i then if wishbone_in.ack = '1' and replace_way = i then
do_write <= '1'; do_write <= '1';
end if; end if;
cache_out(i) <= dout; cache_out(i) <= dout;
@ -413,15 +412,15 @@ begin
lru => plru_out lru => plru_out
); );


process(req_index, req_is_hit, req_hit_way, req_is_hit, plru_out) process(all)
begin begin
-- PLRU interface -- PLRU interface
if req_is_hit = '1' and req_index = i then if get_index(r.hit_nia) = i then
plru_acc_en <= req_is_hit; plru_acc_en <= r.hit_valid;
else else
plru_acc_en <= '0'; plru_acc_en <= '0';
end if; end if;
plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); plru_acc <= std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
plru_victim(i) <= plru_out; plru_victim(i) <= plru_out;
end process; end process;
end generate; end generate;
@ -531,8 +530,12 @@ begin
end if; end if;
req_hit_way <= hit_way; req_hit_way <= hit_way;


-- The way to replace on a miss -- The way to replace on a miss
replace_way <= to_integer(unsigned(plru_victim(req_index))); if r.state = CLR_TAG then
replace_way <= to_integer(unsigned(plru_victim(r.store_index)));
else
replace_way <= r.store_way;
end if;


-- Output instruction from current cache row -- Output instruction from current cache row
-- --
@ -642,7 +645,6 @@ begin


-- Keep track of our index and way for subsequent stores -- Keep track of our index and way for subsequent stores
r.store_index <= req_index; r.store_index <= req_index;
r.store_way <= replace_way;
r.store_row <= get_row(req_laddr); r.store_row <= get_row(req_laddr);
r.store_tag <= req_tag; r.store_tag <= req_tag;
r.store_valid <= '1'; r.store_valid <= '1';
@ -661,12 +663,15 @@ begin


when CLR_TAG | WAIT_ACK => when CLR_TAG | WAIT_ACK =>
if r.state = CLR_TAG then if r.state = CLR_TAG then
-- Get victim way from plru
r.store_way <= replace_way;

-- Force misses on that way while reloading that line -- Force misses on that way while reloading that line
cache_valids(req_index)(r.store_way) <= '0'; cache_valids(req_index)(replace_way) <= '0';


-- Store new tag in selected way -- Store new tag in selected way
for i in 0 to NUM_WAYS-1 loop for i in 0 to NUM_WAYS-1 loop
if i = r.store_way then if i = replace_way then
tagset := cache_tags(r.store_index); tagset := cache_tags(r.store_index);
write_tag(i, tagset, r.store_tag); write_tag(i, tagset, r.store_tag);
cache_tags(r.store_index) <= tagset; cache_tags(r.store_index) <= tagset;
@ -702,7 +707,7 @@ begin
r.wb.cyc <= '0'; r.wb.cyc <= '0';


-- Cache line is now valid -- Cache line is now valid
cache_valids(r.store_index)(r.store_way) <= r.store_valid and not inval_in; cache_valids(r.store_index)(replace_way) <= r.store_valid and not inval_in;


-- We are done -- We are done
r.state <= IDLE; r.state <= IDLE;
@ -723,35 +728,36 @@ begin
end if; end if;
end process; end process;


data_log: process(clk) icache_log: if LOG_LENGTH > 0 generate
variable lway: way_t; -- Output data to logger
variable wstate: std_ulogic; signal log_data : std_ulogic_vector(53 downto 0);
begin begin
if rising_edge(clk) then data_log: process(clk)
if req_is_hit then variable lway: way_t;
variable wstate: std_ulogic;
begin
if rising_edge(clk) then
lway := req_hit_way; lway := req_hit_way;
else wstate := '0';
lway := replace_way; if r.state /= IDLE then
end if; wstate := '1';
wstate := '0'; end if;
if r.state /= IDLE then log_data <= i_out.valid &
wstate := '1'; i_out.insn &
wishbone_in.ack &
r.wb.adr(5 downto 3) &
r.wb.stb & r.wb.cyc &
wishbone_in.stall &
stall_out &
r.fetch_failed &
r.hit_nia(5 downto 2) &
wstate &
std_ulogic_vector(to_unsigned(lway, 3)) &
req_is_hit & req_is_miss &
access_ok &
ra_valid;
end if; end if;
log_data <= i_out.valid & end process;
i_out.insn & log_out <= log_data;
wishbone_in.ack & end generate;
r.wb.adr(5 downto 3) &
r.wb.stb & r.wb.cyc &
wishbone_in.stall &
stall_out &
r.fetch_failed &
r.hit_nia(5 downto 2) &
wstate &
std_ulogic_vector(to_unsigned(lway, 3)) &
req_is_hit & req_is_miss &
access_ok &
ra_valid;
end if;
end process;
log_out <= log_data;
end; end;

@ -10,6 +10,10 @@ use work.common.all;
-- We calculate the address in the first cycle -- We calculate the address in the first cycle


entity loadstore1 is entity loadstore1 is
generic (
-- Non-zero to enable log data collection
LOG_LENGTH : natural := 0
);
port ( port (
clk : in std_ulogic; clk : in std_ulogic;
rst : in std_ulogic; rst : in std_ulogic;
@ -40,10 +44,9 @@ architecture behave of loadstore1 is
type state_t is (IDLE, -- ready for instruction type state_t is (IDLE, -- ready for instruction
SECOND_REQ, -- send 2nd request of unaligned xfer SECOND_REQ, -- send 2nd request of unaligned xfer
ACK_WAIT, -- waiting for ack from dcache ACK_WAIT, -- waiting for ack from dcache
LD_UPDATE, -- writing rA with computed addr on load
MMU_LOOKUP, -- waiting for MMU to look up translation MMU_LOOKUP, -- waiting for MMU to look up translation
TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie
SPR_CMPLT -- complete a mf/tspr operation COMPLETE -- extra cycle to complete an operation
); );


type reg_stage_t is record type reg_stage_t is record
@ -69,12 +72,18 @@ architecture behave of loadstore1 is
priv_mode : std_ulogic; priv_mode : std_ulogic;
state : state_t; state : state_t;
dwords_done : std_ulogic; dwords_done : std_ulogic;
last_dword : std_ulogic;
first_bytes : std_ulogic_vector(7 downto 0); first_bytes : std_ulogic_vector(7 downto 0);
second_bytes : std_ulogic_vector(7 downto 0); second_bytes : std_ulogic_vector(7 downto 0);
dar : std_ulogic_vector(63 downto 0); dar : std_ulogic_vector(63 downto 0);
dsisr : std_ulogic_vector(31 downto 0); dsisr : std_ulogic_vector(31 downto 0);
instr_fault : std_ulogic; instr_fault : std_ulogic;
sprval : std_ulogic_vector(63 downto 0); sprval : std_ulogic_vector(63 downto 0);
busy : std_ulogic;
wait_dcache : std_ulogic;
wait_mmu : std_ulogic;
do_update : std_ulogic;
extra_cycle : std_ulogic;
end record; end record;


type byte_sel_t is array(0 to 7) of std_ulogic; type byte_sel_t is array(0 to 7) of std_ulogic;
@ -84,8 +93,6 @@ architecture behave of loadstore1 is
signal r, rin : reg_stage_t; signal r, rin : reg_stage_t;
signal lsu_sum : std_ulogic_vector(63 downto 0); signal lsu_sum : std_ulogic_vector(63 downto 0);


signal log_data : std_ulogic_vector(9 downto 0);

-- Generate byte enables from sizes -- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin begin
@ -125,6 +132,8 @@ begin
if rising_edge(clk) then if rising_edge(clk) then
if rst = '1' then if rst = '1' then
r.state <= IDLE; r.state <= IDLE;
r.busy <= '0';
r.do_update <= '0';
else else
r <= rin; r <= rin;
end if; end if;
@ -143,13 +152,14 @@ begin
variable req : std_ulogic; variable req : std_ulogic;
variable busy : std_ulogic; variable busy : std_ulogic;
variable addr : std_ulogic_vector(63 downto 0); variable addr : std_ulogic_vector(63 downto 0);
variable maddr : std_ulogic_vector(63 downto 0);
variable wdata : std_ulogic_vector(63 downto 0); variable wdata : std_ulogic_vector(63 downto 0);
variable write_enable : std_ulogic; variable write_enable : std_ulogic;
variable do_update : std_ulogic; variable do_update : std_ulogic;
variable two_dwords : std_ulogic;
variable done : std_ulogic; variable done : std_ulogic;
variable data_permuted : std_ulogic_vector(63 downto 0); variable data_permuted : std_ulogic_vector(63 downto 0);
variable data_trimmed : std_ulogic_vector(63 downto 0); variable data_trimmed : std_ulogic_vector(63 downto 0);
variable store_data : std_ulogic_vector(63 downto 0);
variable use_second : byte_sel_t; variable use_second : byte_sel_t;
variable trim_ctl : trim_ctl_t; variable trim_ctl : trim_ctl_t;
variable negative : std_ulogic; variable negative : std_ulogic;
@ -163,8 +173,6 @@ begin
begin begin
v := r; v := r;
req := '0'; req := '0';
byte_sel := (others => '0');
addr := lsu_sum;
v.mfspr := '0'; v.mfspr := '0';
mmu_mtspr := '0'; mmu_mtspr := '0';
itlb_fault := '0'; itlb_fault := '0';
@ -173,8 +181,9 @@ begin
mmureq := '0'; mmureq := '0';


write_enable := '0'; write_enable := '0';
do_update := '0';
two_dwords := or (r.second_bytes); do_update := r.do_update;
v.do_update := '0';


-- load data formatting -- load data formatting
byte_offset := unsigned(r.addr(2 downto 0)); byte_offset := unsigned(r.addr(2 downto 0));
@ -204,10 +213,10 @@ begin
-- trim and sign-extend -- trim and sign-extend
for i in 0 to 7 loop for i in 0 to 7 loop
if i < to_integer(unsigned(r.length)) then if i < to_integer(unsigned(r.length)) then
if two_dwords = '1' then if r.dwords_done = '1' then
trim_ctl(i) := '1' & not use_second(i); trim_ctl(i) := '1' & not use_second(i);
else else
trim_ctl(i) := not use_second(i) & '0'; trim_ctl(i) := "10";
end if; end if;
else else
trim_ctl(i) := '0' & (negative and r.sign_extend); trim_ctl(i) := '0' & (negative and r.sign_extend);
@ -224,121 +233,127 @@ begin
end case; end case;
end loop; end loop;


-- Byte reversing and rotating for stores
-- Done in the first cycle (when l_in.valid = 1)
store_data := r.store_data;
if l_in.valid = '1' then
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1;
j := to_integer(k) * 8;
store_data(i * 8 + 7 downto i * 8) := l_in.data(j + 7 downto j);
end loop;
end if;
v.store_data := store_data;

-- compute (addr + 8) & ~7 for the second doubleword when unaligned -- compute (addr + 8) & ~7 for the second doubleword when unaligned
next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";


-- Busy calculation.
-- We need to minimize the delay from clock to busy valid because it
-- gates the start of execution of the next instruction.
busy := r.busy and not ((r.wait_dcache and d_in.valid) or (r.wait_mmu and m_in.done));
v.busy := busy;

done := '0'; done := '0';
if r.state /= IDLE and busy = '0' then
done := '1';
end if;
exception := '0'; exception := '0';

if r.dwords_done = '1' or r.state = SECOND_REQ then
maddr := next_addr;
byte_sel := r.second_bytes;
else
maddr := r.addr;
byte_sel := r.first_bytes;
end if;
addr := maddr;

case r.state is case r.state is
when IDLE => when IDLE =>


when SECOND_REQ => when SECOND_REQ =>
addr := next_addr;
byte_sel := r.second_bytes;
req := '1'; req := '1';
v.state := ACK_WAIT; v.state := ACK_WAIT;
v.last_dword := '0';


when ACK_WAIT => when ACK_WAIT =>
if d_in.error = '1' then
-- dcache will discard the second request if it
-- gets an error on the 1st of two requests
if d_in.cache_paradox = '1' then
-- signal an interrupt straight away
exception := '1';
dsisr(63 - 38) := not r.load;
-- XXX there is no architected bit for this
dsisr(63 - 35) := d_in.cache_paradox;
else
-- Look up the translation for TLB miss
-- and also for permission error and RC error
-- in case the PTE has been updated.
mmureq := '1';
v.state := MMU_LOOKUP;
end if;
end if;
if d_in.valid = '1' then if d_in.valid = '1' then
if d_in.error = '1' then if r.last_dword = '0' then
-- dcache will discard the second request if it v.dwords_done := '1';
-- gets an error on the 1st of two requests v.last_dword := '1';
if r.dwords_done = '1' then if r.load = '1' then
addr := next_addr; v.load_data := data_permuted;
else
addr := r.addr;
end if;
if d_in.cache_paradox = '1' then
-- signal an interrupt straight away
exception := '1';
dsisr(63 - 38) := not r.load;
-- XXX there is no architected bit for this
dsisr(63 - 35) := d_in.cache_paradox;
v.state := IDLE;
else
-- Look up the translation for TLB miss
-- and also for permission error and RC error
-- in case the PTE has been updated.
mmureq := '1';
v.state := MMU_LOOKUP;
end if; end if;
else else
if two_dwords = '1' and r.dwords_done = '0' then write_enable := r.load;
v.dwords_done := '1'; if r.extra_cycle = '1' then
if r.load = '1' then -- loads with rA update need an extra cycle
v.load_data := data_permuted; v.state := COMPLETE;
end if; v.do_update := r.update;
else else
write_enable := r.load; -- stores write back rA update in this cycle
if r.load = '1' and r.update = '1' then do_update := r.update;
-- loads with rA update need an extra cycle
v.state := LD_UPDATE;
else
-- stores write back rA update in this cycle
do_update := r.update;
done := '1';
v.state := IDLE;
end if;
end if; end if;
v.busy := '0';
end if; end if;
end if; end if;
-- r.wait_dcache gets set one cycle after we come into ACK_WAIT state,
-- which is OK because the dcache always takes at least two cycles.
v.wait_dcache := r.last_dword and not r.extra_cycle;


when MMU_LOOKUP => when MMU_LOOKUP =>
if r.dwords_done = '1' then
addr := next_addr;
byte_sel := r.second_bytes;
else
addr := r.addr;
byte_sel := r.first_bytes;
end if;
if m_in.done = '1' then if m_in.done = '1' then
if m_in.invalid = '0' and m_in.perm_error = '0' and m_in.rc_error = '0' and if r.instr_fault = '0' then
m_in.badtree = '0' and m_in.segerr = '0' then -- retry the request now that the MMU has installed a TLB entry
if r.instr_fault = '0' then req := '1';
-- retry the request now that the MMU has installed a TLB entry if r.last_dword = '0' then
req := '1'; v.state := SECOND_REQ;
if two_dwords = '1' and r.dwords_done = '0' then
v.state := SECOND_REQ;
else
v.state := ACK_WAIT;
end if;
else else
-- nothing to do, the icache retries automatically v.state := ACK_WAIT;
done := '1';
v.state := IDLE;
end if; end if;
else
exception := '1';
dsisr(63 - 33) := m_in.invalid;
dsisr(63 - 36) := m_in.perm_error;
dsisr(63 - 38) := not r.load;
dsisr(63 - 44) := m_in.badtree;
dsisr(63 - 45) := m_in.rc_error;
v.state := IDLE;
end if; end if;
end if; end if;

if m_in.err = '1' then
when TLBIE_WAIT => exception := '1';
if m_in.done = '1' then dsisr(63 - 33) := m_in.invalid;
-- tlbie is finished dsisr(63 - 36) := m_in.perm_error;
done := '1'; dsisr(63 - 38) := not r.load;
v.state := IDLE; dsisr(63 - 44) := m_in.badtree;
dsisr(63 - 45) := m_in.rc_error;
end if; end if;


when LD_UPDATE => when TLBIE_WAIT =>
do_update := '1';
v.state := IDLE;
done := '1';


when SPR_CMPLT => when COMPLETE =>
done := '1';
v.state := IDLE;


end case; end case;


busy := '1'; if done = '1' or exception = '1' then
if r.state = IDLE or done = '1' then v.state := IDLE;
busy := '0'; v.busy := '0';
end if; end if;


-- Note that l_in.valid is gated with busy inside execute1 -- Note that l_in.valid is gated with busy inside execute1
@ -349,6 +364,7 @@ begin
v.tlbie := '0'; v.tlbie := '0';
v.instr_fault := '0'; v.instr_fault := '0';
v.dwords_done := '0'; v.dwords_done := '0';
v.last_dword := '1';
v.write_reg := l_in.write_reg; v.write_reg := l_in.write_reg;
v.length := l_in.length; v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse; v.byte_reverse := l_in.byte_reverse;
@ -361,6 +377,13 @@ begin
v.nc := l_in.ci; v.nc := l_in.ci;
v.virt_mode := l_in.virt_mode; v.virt_mode := l_in.virt_mode;
v.priv_mode := l_in.priv_mode; v.priv_mode := l_in.priv_mode;
v.wait_dcache := '0';
v.wait_mmu := '0';
v.do_update := '0';
v.extra_cycle := '0';

addr := lsu_sum;
maddr := l_in.addr2; -- address from RB for tlbie


-- XXX Temporary hack. Mark the op as non-cachable if the address -- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc------- for a real-mode access. -- is the form 0xc------- for a real-mode access.
@ -374,24 +397,14 @@ begin
v.first_bytes := byte_sel; v.first_bytes := byte_sel;
v.second_bytes := long_sel(15 downto 8); v.second_bytes := long_sel(15 downto 8);


-- Do byte reversing and rotating for stores in the first cycle
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;

case l_in.op is case l_in.op is
when OP_STORE => when OP_STORE =>
req := '1'; req := '1';
when OP_LOAD => when OP_LOAD =>
req := '1'; req := '1';
v.load := '1'; v.load := '1';
-- Allow an extra cycle for RA update on loads
v.extra_cycle := l_in.update;
when OP_DCBZ => when OP_DCBZ =>
req := '1'; req := '1';
v.dcbz := '1'; v.dcbz := '1';
@ -399,6 +412,7 @@ begin
mmureq := '1'; mmureq := '1';
v.tlbie := '1'; v.tlbie := '1';
v.state := TLBIE_WAIT; v.state := TLBIE_WAIT;
v.wait_mmu := '1';
when OP_MFSPR => when OP_MFSPR =>
v.mfspr := '1'; v.mfspr := '1';
-- partial decode on SPR number should be adequate given -- partial decode on SPR number should be adequate given
@ -413,7 +427,7 @@ begin
-- reading one of the SPRs in the MMU -- reading one of the SPRs in the MMU
v.sprval := m_in.sprval; v.sprval := m_in.sprval;
end if; end if;
v.state := SPR_CMPLT; v.state := COMPLETE;
when OP_MTSPR => when OP_MTSPR =>
if sprn(9) = '0' and sprn(5) = '0' then if sprn(9) = '0' and sprn(5) = '0' then
if sprn(0) = '0' then if sprn(0) = '0' then
@ -421,19 +435,20 @@ begin
else else
v.dar := l_in.data; v.dar := l_in.data;
end if; end if;
v.state := SPR_CMPLT; v.state := COMPLETE;
else else
-- writing one of the SPRs in the MMU -- writing one of the SPRs in the MMU
mmu_mtspr := '1'; mmu_mtspr := '1';
v.state := TLBIE_WAIT; v.state := TLBIE_WAIT;
v.wait_mmu := '1';
end if; end if;
when OP_FETCH_FAILED => when OP_FETCH_FAILED =>
-- send it to the MMU to do the radix walk -- send it to the MMU to do the radix walk
addr := l_in.nia; maddr := l_in.nia;
v.addr := l_in.nia;
v.instr_fault := '1'; v.instr_fault := '1';
mmureq := '1'; mmureq := '1';
v.state := MMU_LOOKUP; v.state := MMU_LOOKUP;
v.wait_mmu := '1';
when others => when others =>
assert false report "unknown op sent to loadstore1"; assert false report "unknown op sent to loadstore1";
end case; end case;
@ -445,6 +460,8 @@ begin
v.state := SECOND_REQ; v.state := SECOND_REQ;
end if; end if;
end if; end if;

v.busy := req or mmureq or mmu_mtspr;
end if; end if;


-- Update outputs to dcache -- Update outputs to dcache
@ -454,7 +471,7 @@ begin
d_out.nc <= v.nc; d_out.nc <= v.nc;
d_out.reserve <= v.reserve; d_out.reserve <= v.reserve;
d_out.addr <= addr; d_out.addr <= addr;
d_out.data <= v.store_data; d_out.data <= store_data;
d_out.byte_sel <= byte_sel; d_out.byte_sel <= byte_sel;
d_out.virt_mode <= v.virt_mode; d_out.virt_mode <= v.virt_mode;
d_out.priv_mode <= v.priv_mode; d_out.priv_mode <= v.priv_mode;
@ -467,7 +484,7 @@ begin
m_out.tlbie <= v.tlbie; m_out.tlbie <= v.tlbie;
m_out.mtspr <= mmu_mtspr; m_out.mtspr <= mmu_mtspr;
m_out.sprn <= sprn; m_out.sprn <= sprn;
m_out.addr <= addr; m_out.addr <= maddr;
m_out.slbia <= l_in.insn(7); m_out.slbia <= l_in.insn(7);
m_out.rs <= l_in.data; m_out.rs <= l_in.data;


@ -513,18 +530,23 @@ begin


end process; end process;


ls1_log: process(clk) l1_log: if LOG_LENGTH > 0 generate
signal log_data : std_ulogic_vector(9 downto 0);
begin begin
if rising_edge(clk) then ls1_log: process(clk)
log_data <= e_out.busy & begin
e_out.exception & if rising_edge(clk) then
l_out.valid & log_data <= e_out.busy &
m_out.valid & e_out.exception &
d_out.valid & l_out.valid &
m_in.done & m_out.valid &
r.dwords_done & d_out.valid &
std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3)); m_in.done &
end if; r.dwords_done &
end process; std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3));
log_out <= log_data; end if;
end process;
log_out <= log_data;
end generate;

end; end;

@ -35,7 +35,7 @@ architecture behave of mmu is
RADIX_LOOKUP, RADIX_LOOKUP,
RADIX_READ_WAIT, RADIX_READ_WAIT,
RADIX_LOAD_TLB, RADIX_LOAD_TLB,
RADIX_ERROR RADIX_FINISH
); );


type reg_stage_t is record type reg_stage_t is record
@ -51,6 +51,8 @@ architecture behave of mmu is
pid : std_ulogic_vector(31 downto 0); pid : std_ulogic_vector(31 downto 0);
-- internal state -- internal state
state : state_t; state : state_t;
done : std_ulogic;
err : std_ulogic;
pgtbl0 : std_ulogic_vector(63 downto 0); pgtbl0 : std_ulogic_vector(63 downto 0);
pt0_valid : std_ulogic; pt0_valid : std_ulogic;
pgtbl3 : std_ulogic_vector(63 downto 0); pgtbl3 : std_ulogic_vector(63 downto 0);
@ -91,7 +93,10 @@ begin
report "MMU got tlb miss for " & to_hstring(rin.addr); report "MMU got tlb miss for " & to_hstring(rin.addr);
end if; end if;
if l_out.done = '1' then if l_out.done = '1' then
report "MMU completing op with invalid=" & std_ulogic'image(l_out.invalid) & report "MMU completing op without error";
end if;
if l_out.err = '1' then
report "MMU completing op with err invalid=" & std_ulogic'image(l_out.invalid) &
" badtree=" & std_ulogic'image(l_out.badtree); " badtree=" & std_ulogic'image(l_out.badtree);
end if; end if;
if rin.state = RADIX_LOOKUP then if rin.state = RADIX_LOOKUP then
@ -176,7 +181,6 @@ begin
mmu_1: process(all) mmu_1: process(all)
variable v : reg_stage_t; variable v : reg_stage_t;
variable dcreq : std_ulogic; variable dcreq : std_ulogic;
variable done : std_ulogic;
variable tlb_load : std_ulogic; variable tlb_load : std_ulogic;
variable itlb_load : std_ulogic; variable itlb_load : std_ulogic;
variable tlbie_req : std_ulogic; variable tlbie_req : std_ulogic;
@ -199,7 +203,8 @@ begin
v := r; v := r;
v.valid := '0'; v.valid := '0';
dcreq := '0'; dcreq := '0';
done := '0'; v.done := '0';
v.err := '0';
v.invalid := '0'; v.invalid := '0';
v.badtree := '0'; v.badtree := '0';
v.segerror := '0'; v.segerror := '0';
@ -262,7 +267,7 @@ begin
v.state := PROC_TBL_READ; v.state := PROC_TBL_READ;
elsif mbits = 0 then elsif mbits = 0 then
-- Use RPDS = 0 to disable radix tree walks -- Use RPDS = 0 to disable radix tree walks
v.state := RADIX_ERROR; v.state := RADIX_FINISH;
v.invalid := '1'; v.invalid := '1';
else else
v.state := SEGMENT_CHECK; v.state := SEGMENT_CHECK;
@ -291,8 +296,7 @@ begin


when TLB_WAIT => when TLB_WAIT =>
if d_in.done = '1' then if d_in.done = '1' then
done := '1'; v.state := RADIX_FINISH;
v.state := IDLE;
end if; end if;


when PROC_TBL_READ => when PROC_TBL_READ =>
@ -302,43 +306,42 @@ begin


when PROC_TBL_WAIT => when PROC_TBL_WAIT =>
if d_in.done = '1' then if d_in.done = '1' then
if d_in.err = '0' then if r.addr(63) = '1' then
if r.addr(63) = '1' then v.pgtbl3 := data;
v.pgtbl3 := data; v.pt3_valid := '1';
v.pt3_valid := '1'; else
else v.pgtbl0 := data;
v.pgtbl0 := data; v.pt0_valid := '1';
v.pt0_valid := '1'; end if;
end if; -- rts == radix tree size, # address bits being translated
-- rts == radix tree size, # address bits being translated rts := unsigned('0' & data(62 downto 61) & data(7 downto 5));
rts := unsigned('0' & data(62 downto 61) & data(7 downto 5)); -- mbits == # address bits to index top level of tree
-- mbits == # address bits to index top level of tree mbits := unsigned('0' & data(4 downto 0));
mbits := unsigned('0' & data(4 downto 0)); -- set v.shift to rts so that we can use finalmask for the segment check
-- set v.shift to rts so that we can use finalmask for the segment check v.shift := rts;
v.shift := rts; v.mask_size := mbits(4 downto 0);
v.mask_size := mbits(4 downto 0); v.pgbase := data(55 downto 8) & x"00";
v.pgbase := data(55 downto 8) & x"00"; if mbits = 0 then
if mbits = 0 then v.state := RADIX_FINISH;
v.state := RADIX_ERROR; v.invalid := '1';
v.invalid := '1';
else
v.state := SEGMENT_CHECK;
end if;
else else
v.state := RADIX_ERROR; v.state := SEGMENT_CHECK;
v.badtree := '1';
end if; end if;
end if; end if;
if d_in.err = '1' then
v.state := RADIX_FINISH;
v.badtree := '1';
end if;


when SEGMENT_CHECK => when SEGMENT_CHECK =>
mbits := '0' & r.mask_size; mbits := '0' & r.mask_size;
v.shift := r.shift + (31 - 12) - mbits; v.shift := r.shift + (31 - 12) - mbits;
nonzero := or(r.addr(61 downto 31) and not finalmask(30 downto 0)); nonzero := or(r.addr(61 downto 31) and not finalmask(30 downto 0));
if r.addr(63) /= r.addr(62) or nonzero = '1' then if r.addr(63) /= r.addr(62) or nonzero = '1' then
v.state := RADIX_ERROR; v.state := RADIX_FINISH;
v.segerror := '1'; v.segerror := '1';
elsif mbits < 5 or mbits > 16 or mbits > (r.shift + (31 - 12)) then elsif mbits < 5 or mbits > 16 or mbits > (r.shift + (31 - 12)) then
v.state := RADIX_ERROR; v.state := RADIX_FINISH;
v.badtree := '1'; v.badtree := '1';
else else
v.state := RADIX_LOOKUP; v.state := RADIX_LOOKUP;
@ -350,54 +353,53 @@ begin


when RADIX_READ_WAIT => when RADIX_READ_WAIT =>
if d_in.done = '1' then if d_in.done = '1' then
if d_in.err = '0' then v.pde := data;
v.pde := data; -- test valid bit
-- test valid bit if data(63) = '1' then
if data(63) = '1' then -- test leaf bit
-- test leaf bit if data(62) = '1' then
if data(62) = '1' then -- check permissions and RC bits
-- check permissions and RC bits perm_ok := '0';
perm_ok := '0'; if r.priv = '1' or data(3) = '0' then
if r.priv = '1' or data(3) = '0' then if r.iside = '0' then
if r.iside = '0' then perm_ok := data(1) or (data(2) and not r.store);
perm_ok := data(1) or (data(2) and not r.store);
else
-- no IAMR, so no KUEP support for now
-- deny execute permission if cache inhibited
perm_ok := data(0) and not data(5);
end if;
end if;
rc_ok := data(8) and (data(7) or not r.store);
if perm_ok = '1' and rc_ok = '1' then
v.state := RADIX_LOAD_TLB;
else else
v.state := RADIX_ERROR; -- no IAMR, so no KUEP support for now
v.perm_err := not perm_ok; -- deny execute permission if cache inhibited
-- permission error takes precedence over RC error perm_ok := data(0) and not data(5);
v.rc_error := perm_ok;
end if; end if;
end if;
rc_ok := data(8) and (data(7) or not r.store);
if perm_ok = '1' and rc_ok = '1' then
v.state := RADIX_LOAD_TLB;
else else
mbits := unsigned('0' & data(4 downto 0)); v.state := RADIX_FINISH;
if mbits < 5 or mbits > 16 or mbits > r.shift then v.perm_err := not perm_ok;
v.state := RADIX_ERROR; -- permission error takes precedence over RC error
v.badtree := '1'; v.rc_error := perm_ok;
else
v.shift := v.shift - mbits;
v.mask_size := mbits(4 downto 0);
v.pgbase := data(55 downto 8) & x"00";
v.state := RADIX_LOOKUP;
end if;
end if; end if;
else else
-- non-present PTE, generate a DSI mbits := unsigned('0' & data(4 downto 0));
v.state := RADIX_ERROR; if mbits < 5 or mbits > 16 or mbits > r.shift then
v.invalid := '1'; v.state := RADIX_FINISH;
v.badtree := '1';
else
v.shift := v.shift - mbits;
v.mask_size := mbits(4 downto 0);
v.pgbase := data(55 downto 8) & x"00";
v.state := RADIX_LOOKUP;
end if;
end if; end if;
else else
v.state := RADIX_ERROR; -- non-present PTE, generate a DSI
v.badtree := '1'; v.state := RADIX_FINISH;
v.invalid := '1';
end if; end if;
end if; end if;
if d_in.err = '1' then
v.state := RADIX_FINISH;
v.badtree := '1';
end if;


when RADIX_LOAD_TLB => when RADIX_LOAD_TLB =>
tlb_load := '1'; tlb_load := '1';
@ -406,16 +408,19 @@ begin
v.state := TLB_WAIT; v.state := TLB_WAIT;
else else
itlb_load := '1'; itlb_load := '1';
done := '1';
v.state := IDLE; v.state := IDLE;
end if; end if;


when RADIX_ERROR => when RADIX_FINISH =>
done := '1';
v.state := IDLE; v.state := IDLE;


end case; end case;


if v.state = RADIX_FINISH or (v.state = RADIX_LOAD_TLB and r.iside = '1') then
v.err := v.invalid or v.badtree or v.segerror or v.perm_err or v.rc_error;
v.done := not v.err;
end if;

if r.addr(63) = '1' then if r.addr(63) = '1' then
effpid := x"00000000"; effpid := x"00000000";
else else
@ -451,7 +456,8 @@ begin
tlb_data := (others => '0'); tlb_data := (others => '0');
end if; end if;


l_out.done <= done; l_out.done <= r.done;
l_out.err <= r.err;
l_out.invalid <= r.invalid; l_out.invalid <= r.invalid;
l_out.badtree <= r.badtree; l_out.badtree <= r.badtree;
l_out.segerr <= r.segerror; l_out.segerr <= r.segerror;

@ -7,7 +7,9 @@ use work.common.all;


entity register_file is entity register_file is
generic ( generic (
SIM : boolean := false SIM : boolean := false;
-- Non-zero to enable log data collection
LOG_LENGTH : natural := 0
); );
port( port(
clk : in std_logic; clk : in std_logic;
@ -36,7 +38,6 @@ architecture behaviour of register_file is
signal rd_port_b : std_ulogic_vector(63 downto 0); signal rd_port_b : std_ulogic_vector(63 downto 0);
signal dbg_data : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0);
signal dbg_ack : std_ulogic; signal dbg_ack : std_ulogic;
signal log_data : std_ulogic_vector(70 downto 0);
begin begin
-- synchronous writes -- synchronous writes
register_write_0: process(clk) register_write_0: process(clk)
@ -134,13 +135,18 @@ begin
sim_dump_done <= '0'; sim_dump_done <= '0';
end generate; end generate;


reg_log: process(clk) rf_log: if LOG_LENGTH > 0 generate
signal log_data : std_ulogic_vector(70 downto 0);
begin begin
if rising_edge(clk) then reg_log: process(clk)
log_data <= w_in.write_data & begin
w_in.write_enable & if rising_edge(clk) then
w_in.write_reg; log_data <= w_in.write_data &
end if; w_in.write_enable &
end process; w_in.write_reg;
log_out <= log_data; end if;
end process;
log_out <= log_data;
end generate;

end architecture behaviour; end architecture behaviour;

Loading…
Cancel
Save