diff --git a/common.vhdl b/common.vhdl index a193df1..28b3434 100644 --- a/common.vhdl +++ b/common.vhdl @@ -315,6 +315,7 @@ package common is type MmuToLoadstore1Type is record done : std_ulogic; + err : std_ulogic; invalid : std_ulogic; badtree : std_ulogic; segerr : std_ulogic; diff --git a/core.vhdl b/core.vhdl index 4a83d69..c7dd3f6 100644 --- a/core.vhdl +++ b/core.vhdl @@ -202,7 +202,8 @@ begin SIM => SIM, LINE_SIZE => 64, NUM_LINES => 64, - NUM_WAYS => 2 + NUM_WAYS => 2, + LOG_LENGTH => LOG_LENGTH ) port map( clk => clk, @@ -222,6 +223,9 @@ begin icache_stall_in <= decode1_busy; decode1_0: entity work.decode1 + generic map( + LOG_LENGTH => LOG_LENGTH + ) port map ( clk => clk, rst => rst_dec1, @@ -239,7 +243,8 @@ begin decode2_0: entity work.decode2 generic map ( - EX1_BYPASS => EX1_BYPASS + EX1_BYPASS => EX1_BYPASS, + LOG_LENGTH => LOG_LENGTH ) port map ( clk => clk, @@ -261,7 +266,8 @@ begin register_file_0: entity work.register_file generic map ( - SIM => SIM + SIM => SIM, + LOG_LENGTH => LOG_LENGTH ) port map ( clk => clk, @@ -279,7 +285,8 @@ begin cr_file_0: entity work.cr_file generic map ( - SIM => SIM + SIM => SIM, + LOG_LENGTH => LOG_LENGTH ) port map ( clk => clk, @@ -292,7 +299,8 @@ begin execute1_0: entity work.execute1 generic map ( - EX1_BYPASS => EX1_BYPASS + EX1_BYPASS => EX1_BYPASS, + LOG_LENGTH => LOG_LENGTH ) port map ( clk => clk, @@ -315,6 +323,9 @@ begin ); loadstore1_0: entity work.loadstore1 + generic map ( + LOG_LENGTH => LOG_LENGTH + ) port map ( clk => clk, rst => rst_ls1, @@ -344,7 +355,8 @@ begin generic map( LINE_SIZE => 64, NUM_LINES => 64, - NUM_WAYS => 2 + NUM_WAYS => 2, + LOG_LENGTH => LOG_LENGTH ) port map ( clk => clk, diff --git a/countzero.vhdl b/countzero.vhdl index 50e6ead..18aa043 100644 --- a/countzero.vhdl +++ b/countzero.vhdl @@ -15,123 +15,81 @@ entity zero_counter is end entity zero_counter; architecture behaviour of zero_counter is - type intermediate_result is record - v16: std_ulogic_vector(15 downto 0); - sel_hi: std_ulogic_vector(1 downto 0); - is_32bit: std_ulogic; - count_right: std_ulogic; - end record; - - signal r, r_in : intermediate_result; + -- Reverse the order of bits in a word + function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is + variable ret: std_ulogic_vector(a'left downto a'right); + begin + for i in a'right to a'left loop + ret(a'left + a'right - i) := a(i); + end loop; + return ret; + end; - -- Return the index of the leftmost or rightmost 1 in a set of 4 bits. - -- Assumes v is not "0000"; if it is, return (right ? "11" : "00"). - function encoder(v: std_ulogic_vector(3 downto 0); right: std_ulogic) return std_ulogic_vector is + -- If there is only one bit set in a doubleword, return its bit number + -- (counting from the right). Each bit of the result is obtained by + -- ORing together 32 bits of the input: + -- bit 0 = a[1] or a[3] or a[5] or ... + -- bit 1 = a[2] or a[3] or a[6] or a[7] or ... + -- bit 2 = a[4..7] or a[12..15] or ... + -- bit 5 = a[32..63] ORed together + function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable ret: std_ulogic_vector(5 downto 0); + variable stride: natural; + variable bit: std_ulogic; + variable k: natural; begin - if right = '0' then - if v(3) = '1' then - return "11"; - elsif v(2) = '1' then - return "10"; - elsif v(1) = '1' then - return "01"; - else - return "00"; - end if; - else - if v(0) = '1' then - return "00"; - elsif v(1) = '1' then - return "01"; - elsif v(2) = '1' then - return "10"; - else - return "11"; - end if; - end if; + stride := 2; + for i in 0 to 5 loop + bit := '0'; + for j in 0 to (64 / stride) - 1 loop + k := j * stride; + bit := bit or (or a(k + stride - 1 downto k + (stride / 2))); + end loop; + ret(i) := bit; + stride := stride * 2; + end loop; + return ret; end; + signal inp : std_ulogic_vector(63 downto 0); + signal sum : std_ulogic_vector(64 downto 0); + signal msb_r : std_ulogic; + signal onehot : std_ulogic_vector(63 downto 0); + signal onehot_r : std_ulogic_vector(63 downto 0); + signal bitnum : std_ulogic_vector(5 downto 0); + begin - zerocounter_0: process(clk) + countzero_r: process(clk) begin - if rising_edge(clk) then - r <= r_in; + if rising_edge(clk) then + msb_r <= sum(64); + onehot_r <= onehot; end if; end process; - zerocounter_1: process(all) - variable v: intermediate_result; - variable y, z: std_ulogic_vector(3 downto 0); - variable sel: std_ulogic_vector(5 downto 0); - variable v4: std_ulogic_vector(3 downto 0); - + countzero: process(all) begin - -- Test 4 groups of 16 bits each. - -- The top 2 groups are considered to be zero in 32-bit mode. - z(0) := or (rs(15 downto 0)); - z(1) := or (rs(31 downto 16)); - z(2) := or (rs(47 downto 32)); - z(3) := or (rs(63 downto 48)); if is_32bit = '0' then - v.sel_hi := encoder(z, count_right); + if count_right = '0' then + inp <= bit_reverse(rs); + else + inp <= rs; + end if; else - v.sel_hi(1) := '0'; + inp(63 downto 32) <= x"FFFFFFFF"; if count_right = '0' then - v.sel_hi(0) := z(1); + inp(31 downto 0) <= bit_reverse(rs(31 downto 0)); else - v.sel_hi(0) := not z(0); + inp(31 downto 0) <= rs(31 downto 0); end if; end if; - -- Select the leftmost/rightmost non-zero group of 16 bits - case v.sel_hi is - when "00" => - v.v16 := rs(15 downto 0); - when "01" => - v.v16 := rs(31 downto 16); - when "10" => - v.v16 := rs(47 downto 32); - when others => - v.v16 := rs(63 downto 48); - end case; - - -- Latch this and do the rest in the next cycle, for the sake of timing - v.is_32bit := is_32bit; - v.count_right := count_right; - r_in <= v; - sel(5 downto 4) := r.sel_hi; - - -- Test 4 groups of 4 bits - y(0) := or (r.v16(3 downto 0)); - y(1) := or (r.v16(7 downto 4)); - y(2) := or (r.v16(11 downto 8)); - y(3) := or (r.v16(15 downto 12)); - sel(3 downto 2) := encoder(y, r.count_right); - - -- Select the leftmost/rightmost non-zero group of 4 bits - case sel(3 downto 2) is - when "00" => - v4 := r.v16(3 downto 0); - when "01" => - v4 := r.v16(7 downto 4); - when "10" => - v4 := r.v16(11 downto 8); - when others => - v4 := r.v16(15 downto 12); - end case; - - sel(1 downto 0) := encoder(v4, r.count_right); + sum <= std_ulogic_vector(unsigned('0' & not inp) + 1); + onehot <= sum(63 downto 0) and inp; - -- sel is now the index of the leftmost/rightmost 1 bit in rs - if v4 = "0000" then - -- operand is zero, return 32 for 32-bit, else 64 - result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000"; - elsif r.count_right = '0' then - -- return (63 - sel), trimmed to 5 bits in 32-bit mode - result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0); - else - result <= x"00000000000000" & "00" & sel; - end if; + -- The following occurs after a clock edge + bitnum <= bit_number(onehot_r); + result <= x"00000000000000" & "0" & msb_r & bitnum; end process; end behaviour; diff --git a/cr_file.vhdl b/cr_file.vhdl index 37fa76b..3e65663 100644 --- a/cr_file.vhdl +++ b/cr_file.vhdl @@ -7,7 +7,9 @@ use work.common.all; entity cr_file is generic ( - SIM : boolean := false + SIM : boolean := false; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port( clk : in std_logic; @@ -29,7 +31,6 @@ architecture behaviour of cr_file is signal crs_updated : std_ulogic_vector(31 downto 0); signal xerc : xer_common_t := xerc_init; signal xerc_updated : xer_common_t; - signal log_data : std_ulogic_vector(12 downto 0); begin cr_create_0: process(all) variable hi, lo : integer := 0; @@ -91,14 +92,18 @@ begin end process; end generate; - cr_log: process(clk) + cf_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(12 downto 0); begin - if rising_edge(clk) then - log_data <= w_in.write_cr_enable & - w_in.write_cr_data(31 downto 28) & - w_in.write_cr_mask; - end if; - end process; - log_out <= log_data; + cr_log: process(clk) + begin + if rising_edge(clk) then + log_data <= w_in.write_cr_enable & + w_in.write_cr_data(31 downto 28) & + w_in.write_cr_mask; + end if; + end process; + log_out <= log_data; + end generate; end architecture behaviour; diff --git a/dcache.vhdl b/dcache.vhdl index 9ecb6a9..956768c 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -31,7 +31,9 @@ entity dcache is -- L1 DTLB number of sets TLB_NUM_WAYS : positive := 2; -- L1 DTLB log_2(page_size) - TLB_LG_PGSZ : positive := 12 + TLB_LG_PGSZ : positive := 12; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port ( clk : in std_ulogic; @@ -226,13 +228,14 @@ architecture rtl of dcache is type mem_access_request_t is record op : op_t; + valid : std_ulogic; dcbz : std_ulogic; real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); hit_way : way_t; - repl_way : way_t; same_tag : std_ulogic; + mmu_req : std_ulogic; end record; -- First stage register, contains state for stage 1 of load hits @@ -247,6 +250,13 @@ architecture rtl of dcache is -- Cache hit state hit_way : way_t; hit_load_valid : std_ulogic; + hit_index : index_t; + cache_hit : std_ulogic; + + -- TLB hit state + tlb_hit : std_ulogic; + tlb_hit_way : tlb_way_t; + tlb_hit_index : tlb_index_t; -- 2-stage data buffer for data forwarded from writes to reads forward_data1 : std_ulogic_vector(63 downto 0); @@ -272,16 +282,18 @@ architecture rtl of dcache is end_row_ix : row_in_line_t; rows_valid : row_per_line_valid_t; acks_pending : unsigned(2 downto 0); - - -- Signals to complete with error - error_done : std_ulogic; + inc_acks : std_ulogic; + dec_acks : std_ulogic; + + -- Signals to complete (possibly with error) + ls_valid : std_ulogic; + ls_error : std_ulogic; + mmu_done : std_ulogic; + mmu_error : std_ulogic; cache_paradox : std_ulogic; -- Signal to complete a failed stcx. stcx_fail : std_ulogic; - - -- completion signal for tlbie - tlbie_done : std_ulogic; end record; signal r1 : reg_stage_1_t; @@ -303,6 +315,7 @@ architecture rtl of dcache is signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); signal req_same_tag : std_ulogic; + signal req_go : std_ulogic; signal early_req_row : row_t; @@ -455,8 +468,6 @@ architecture rtl of dcache is ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; end; - signal log_data : std_ulogic_vector(19 downto 0); - begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -566,15 +577,15 @@ begin lru => tlb_plru_out ); - process(tlb_req_index, tlb_hit, tlb_hit_way, tlb_plru_out) + process(all) begin -- PLRU interface - if tlb_hit = '1' and tlb_req_index = i then - tlb_plru_acc_en <= '1'; + if r1.tlb_hit_index = i then + tlb_plru_acc_en <= r1.tlb_hit; else tlb_plru_acc_en <= '0'; end if; - tlb_plru_acc <= std_ulogic_vector(to_unsigned(tlb_hit_way, TLB_WAY_BITS)); + tlb_plru_acc <= std_ulogic_vector(to_unsigned(r1.tlb_hit_way, TLB_WAY_BITS)); tlb_plru_victim(i) <= tlb_plru_out; end process; end generate; @@ -677,16 +688,15 @@ begin lru => plru_out ); - process(req_index, req_op, req_hit_way, plru_out) + process(all) begin -- PLRU interface - if (req_op = OP_LOAD_HIT or - req_op = OP_STORE_HIT) and req_index = i then - plru_acc_en <= '1'; + if r1.hit_index = i then + plru_acc_en <= r1.cache_hit; else plru_acc_en <= '0'; end if; - plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); + plru_acc <= std_ulogic_vector(to_unsigned(r1.hit_way, WAY_BITS)); plru_victim(i) <= plru_out; end process; end generate; @@ -730,7 +740,7 @@ begin req_row <= get_row(r0.req.addr); req_tag <= get_tag(ra); - go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.error_done; + go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.ls_error; -- Test if pending request is a hit on any way -- In order to make timing in virtual mode, when we are using the TLB, @@ -788,7 +798,7 @@ begin -- since it will be by the time we perform the store. -- For a load, check the appropriate row valid bit. is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE); - hit_way := r1.store_way; + hit_way := replace_way; end if; -- Whether to use forwarded data for a load or not @@ -811,8 +821,12 @@ begin -- The way that matched on a hit req_hit_way <= hit_way; - -- The way to replace on a miss - replace_way <= to_integer(unsigned(plru_victim(req_index))); + -- The way to replace on a miss + if r1.write_tag = '1' then + replace_way <= to_integer(unsigned(plru_victim(r1.store_index))); + else + replace_way <= r1.store_way; + end if; -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP @@ -847,6 +861,7 @@ begin end if; end if; req_op <= op; + req_go <= go; -- Version of the row number that is valid one cycle earlier -- in the cases where we need to read the cache data BRAM. @@ -928,15 +943,15 @@ begin end if; end loop; - d_out.valid <= '0'; + d_out.valid <= r1.ls_valid; d_out.data <= data_out; - d_out.store_done <= '0'; - d_out.error <= '0'; - d_out.cache_paradox <= '0'; + d_out.store_done <= not r1.stcx_fail; + d_out.error <= r1.ls_error; + d_out.cache_paradox <= r1.cache_paradox; -- Outputs to MMU - m_out.done <= r1.tlbie_done; - m_out.err <= '0'; + m_out.done <= r1.mmu_done; + m_out.err <= r1.mmu_error; m_out.data <= data_out; -- We have a valid load or store hit or we just completed a slow @@ -962,47 +977,32 @@ begin -- Load hit case is the standard path if r1.hit_load_valid = '1' then report "completing load hit data=" & to_hstring(data_out); - d_out.valid <= '1'; end if; -- error cases complete without stalling - if r1.error_done = '1' then + if r1.ls_error = '1' then report "completing ld/st with error"; - d_out.error <= '1'; - d_out.cache_paradox <= r1.cache_paradox; - d_out.valid <= '1'; end if; -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then - d_out.store_done <= '1'; report "completing store or load miss data=" & to_hstring(data_out); - d_out.valid <= '1'; - end if; - - if r1.stcx_fail = '1' then - d_out.store_done <= '0'; - d_out.valid <= '1'; end if; else -- Request came from MMU if r1.hit_load_valid = '1' then report "completing load hit to MMU, data=" & to_hstring(m_out.data); - m_out.done <= '1'; end if; -- error cases complete without stalling - if r1.error_done = '1' then + if r1.mmu_error = '1' then report "completing MMU ld with error"; - m_out.err <= '1'; - m_out.done <= '1'; end if; -- Slow ops (i.e. load miss) if r1.slow_valid = '1' then report "completing MMU load miss, data=" & to_hstring(m_out.data); - m_out.done <= '1'; end if; end if; @@ -1079,7 +1079,7 @@ begin wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); wr_sel <= (others => '1'); - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r1.store_way = i then + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then do_write <= '1'; end if; end if; @@ -1113,20 +1113,28 @@ begin end if; -- Fast path for load/store hits. Set signals for the writeback controls. + r1.hit_way <= req_hit_way; + r1.hit_index <= req_index; if req_op = OP_LOAD_HIT then - r1.hit_way <= req_hit_way; r1.hit_load_valid <= '1'; else r1.hit_load_valid <= '0'; end if; + if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then + r1.cache_hit <= '1'; + else + r1.cache_hit <= '0'; + end if; if req_op = OP_BAD then report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); - r1.error_done <= '1'; + r1.ls_error <= not r0.mmu_req; + r1.mmu_error <= r0.mmu_req; r1.cache_paradox <= access_ok; else - r1.error_done <= '0'; + r1.ls_error <= '0'; + r1.mmu_error <= '0'; r1.cache_paradox <= '0'; end if; @@ -1136,8 +1144,11 @@ begin r1.stcx_fail <= '0'; end if; - -- complete tlbies and TLB loads in the third cycle - r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); + -- Record TLB hit information for updating TLB PLRU + r1.tlb_hit <= tlb_hit; + r1.tlb_hit_way <= tlb_hit_way; + r1.tlb_hit_index <= tlb_req_index; + end if; end process; @@ -1179,7 +1190,7 @@ begin r1.forward_data1 <= wishbone_in.dat; end if; r1.forward_sel1 <= (others => '1'); - r1.forward_way1 <= r1.store_way; + r1.forward_way1 <= replace_way; r1.forward_row1 <= r1.store_row; r1.forward_valid1 <= '0'; end if; @@ -1194,6 +1205,8 @@ begin r1.slow_valid <= '0'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; + r1.ls_valid <= '0'; + r1.mmu_done <= '0'; -- Not useful normally but helps avoiding tons of sim warnings r1.wb.adr <= (others => '0'); @@ -1201,15 +1214,29 @@ begin -- One cycle pulses reset r1.slow_valid <= '0'; r1.write_bram <= '0'; + r1.inc_acks <= '0'; + r1.dec_acks <= '0'; + + r1.ls_valid <= '0'; + -- complete tlbies and TLB loads in the third cycle + r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); + if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then + if r0.mmu_req = '0' then + r1.ls_valid <= '1'; + else + r1.mmu_done <= '1'; + end if; + end if; if r1.write_tag = '1' then -- Store new tag in selected way for i in 0 to NUM_WAYS-1 loop - if i = r1.store_way then + if i = replace_way then cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <= (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag; end if; end loop; + r1.store_way <= replace_way; r1.write_tag <= '0'; end if; @@ -1219,12 +1246,23 @@ begin req := r1.req; else req.op := req_op; + req.valid := req_go; + req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; req.real_addr := ra; - req.data := r0.req.data; - req.byte_sel := r0.req.byte_sel; + -- Force data to 0 for dcbz + if r0.req.dcbz = '0' then + req.data := r0.req.data; + else + req.data := (others => '0'); + end if; + -- Select all bytes for dcbz and for cacheable loads + if r0.req.dcbz = '1' or (r0.req.load = '1' and r0.req.nc = '0') then + req.byte_sel := (others => '1'); + else + req.byte_sel := r0.req.byte_sel; + end if; req.hit_way := req_hit_way; - req.repl_way := replace_way; req.same_tag := req_same_tag; -- Store the incoming request from r0, if it is a slow request @@ -1240,7 +1278,9 @@ begin case r1.state is when IDLE => r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); - r1.dcbz <= '0'; + r1.wb.sel <= req.byte_sel; + r1.wb.dat <= req.data; + r1.dcbz <= req.dcbz; -- Keep track of our index and way for subsequent stores. r1.store_index <= get_index(req.real_addr); @@ -1251,8 +1291,6 @@ begin if req.op = OP_STORE_HIT then r1.store_way <= req.hit_way; - else - r1.store_way <= req.repl_way; end if; -- Reset per-row valid bits, ready for handling OP_LOAD_MISS @@ -1269,11 +1307,9 @@ begin -- report "cache miss real addr:" & to_hstring(req.real_addr) & " idx:" & integer'image(get_index(req.real_addr)) & - " way:" & integer'image(req.repl_way) & " tag:" & to_hstring(get_tag(req.real_addr)); -- Start the wishbone cycle - r1.wb.sel <= (others => '1'); r1.wb.we <= '0'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -1283,7 +1319,6 @@ begin r1.write_tag <= '1'; when OP_LOAD_NC => - r1.wb.sel <= req.byte_sel; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; @@ -1291,27 +1326,25 @@ begin when OP_STORE_HIT | OP_STORE_MISS => if req.dcbz = '0' then - r1.wb.sel <= req.byte_sel; - r1.wb.dat <= req.data; r1.state <= STORE_WAIT_ACK; r1.acks_pending <= to_unsigned(1, 3); r1.full <= '0'; r1.slow_valid <= '1'; + if req.mmu_req = '0' then + r1.ls_valid <= '1'; + else + r1.mmu_done <= '1'; + end if; if req.op = OP_STORE_HIT then r1.write_bram <= '1'; end if; else -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading - - -- Start the wishbone writes - r1.wb.sel <= (others => '1'); - r1.wb.dat <= (others => '0'); - - -- Handle the rest like a load miss r1.state <= RELOAD_WAIT_ACK; - r1.write_tag <= '1'; - r1.dcbz <= '1'; + if req.op = OP_STORE_MISS then + r1.write_tag <= '1'; + end if; end if; r1.wb.we <= '1'; r1.wb.cyc <= '1'; @@ -1357,6 +1390,11 @@ begin r1.store_row = get_row(r1.req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; + if r1.mmu_req = '0' then + r1.ls_valid <= '1'; + else + r1.mmu_done <= '1'; + end if; r1.forward_sel <= (others => '1'); r1.use_forward1 <= '1'; end if; @@ -1379,15 +1417,26 @@ begin when STORE_WAIT_ACK => stbs_done := r1.wb.stb = '0'; acks := r1.acks_pending; + if r1.inc_acks /= r1.dec_acks then + if r1.inc_acks = '1' then + acks := acks + 1; + else + acks := acks - 1; + end if; + end if; + r1.acks_pending <= acks; -- Clear stb when slave accepted request if wishbone_in.stall = '0' then -- See if there is another store waiting to be done -- which is in the same real page. - if acks < 7 and req.same_tag = '1' and - (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then - r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); + if req.valid = '1' then + r1.wb.adr(SET_SIZE_BITS - 1 downto 0) <= + req.real_addr(SET_SIZE_BITS - 1 downto 0); r1.wb.dat <= req.data; r1.wb.sel <= req.byte_sel; + end if; + if acks < 7 and req.same_tag = '1' and + (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then r1.wb.stb <= '1'; stbs_done := false; if req.op = OP_STORE_HIT then @@ -1395,7 +1444,10 @@ begin end if; r1.full <= '0'; r1.slow_valid <= '1'; - acks := acks + 1; + -- Store requests never come from the MMU + r1.ls_valid <= '1'; + stbs_done := false; + r1.inc_acks <= '1'; else r1.wb.stb <= '0'; stbs_done := true; @@ -1409,9 +1461,8 @@ begin r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; - acks := acks - 1; + r1.dec_acks <= '1'; end if; - r1.acks_pending <= acks; when NC_LOAD_WAIT_ACK => -- Clear stb when slave accepted request @@ -1424,6 +1475,11 @@ begin r1.state <= IDLE; r1.full <= '0'; r1.slow_valid <= '1'; + if r1.mmu_req = '0' then + r1.ls_valid <= '1'; + else + r1.mmu_done <= '1'; + end if; r1.forward_sel <= (others => '1'); r1.use_forward1 <= '1'; r1.wb.cyc <= '0'; @@ -1434,21 +1490,25 @@ begin end if; end process; - dcache_log: process(clk) + dc_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(19 downto 0); begin - if rising_edge(clk) then - log_data <= r1.wb.adr(5 downto 3) & - wishbone_in.stall & - wishbone_in.ack & - r1.wb.stb & r1.wb.cyc & - d_out.error & - d_out.valid & - std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) & - stall_out & - std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) & - valid_ra & - std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3)); - end if; - end process; - log_out <= log_data; + dcache_log: process(clk) + begin + if rising_edge(clk) then + log_data <= r1.wb.adr(5 downto 3) & + wishbone_in.stall & + wishbone_in.ack & + r1.wb.stb & r1.wb.cyc & + d_out.error & + d_out.valid & + std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) & + stall_out & + std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) & + valid_ra & + std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3)); + end if; + end process; + log_out <= log_data; + end generate; end; diff --git a/decode1.vhdl b/decode1.vhdl index 29b7a05..f553e2d 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -7,6 +7,10 @@ use work.common.all; use work.decode_types.all; entity decode1 is + generic ( + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 + ); port ( clk : in std_ulogic; rst : in std_ulogic; @@ -47,7 +51,7 @@ architecture behaviour of decode1 is 15 => (ALU, OP_ADD, RA_OR_ZERO, CONST_SI_HI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addis 28 => (ALU, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andi. 29 => (ALU, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andis. - 0 => (ALU, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- attn + 0 => (ALU, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- attn 18 => (ALU, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b 16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi @@ -73,9 +77,9 @@ architecture behaviour of decode1 is 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw 37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwu - 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic - 2 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi - 3 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- twi + 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic + 2 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi + 3 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- twi 26 => (ALU, OP_XOR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xori 27 => (ALU, OP_XOR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xoris others => illegal_inst @@ -357,8 +361,6 @@ architecture behaviour of decode1 is constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); - signal log_data : std_ulogic_vector(12 downto 0); - begin decode1_0: process(clk) begin @@ -524,15 +526,19 @@ begin flush_out <= f.redirect; end process; - dec1_log : process(clk) + d1_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(12 downto 0); begin - if rising_edge(clk) then - log_data <= std_ulogic_vector(to_unsigned(insn_type_t'pos(r.decode.insn_type), 6)) & - r.nia(5 downto 2) & - std_ulogic_vector(to_unsigned(unit_t'pos(r.decode.unit), 2)) & - r.valid; - end if; - end process; - log_out <= log_data; + dec1_log : process(clk) + begin + if rising_edge(clk) then + log_data <= std_ulogic_vector(to_unsigned(insn_type_t'pos(r.decode.insn_type), 6)) & + r.nia(5 downto 2) & + std_ulogic_vector(to_unsigned(unit_t'pos(r.decode.unit), 2)) & + r.valid; + end if; + end process; + log_out <= log_data; + end generate; end architecture behaviour; diff --git a/decode2.vhdl b/decode2.vhdl index d724874..62c574c 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -10,7 +10,9 @@ use work.insn_helpers.all; entity decode2 is generic ( - EX1_BYPASS : boolean := true + EX1_BYPASS : boolean := true; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port ( clk : in std_ulogic; @@ -47,8 +49,6 @@ architecture behaviour of decode2 is signal deferred : std_ulogic; - signal log_data : std_ulogic_vector(9 downto 0); - type decode_input_reg_t is record reg_valid : std_ulogic; reg : gspr_index_t; @@ -415,18 +415,22 @@ begin e_out <= r.e; end process; - dec2_log : process(clk) + d2_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(9 downto 0); begin - if rising_edge(clk) then - log_data <= r.e.nia(5 downto 2) & - r.e.valid & - stopped_out & - stall_out & - r.e.bypass_data3 & - r.e.bypass_data2 & - r.e.bypass_data1; - end if; - end process; - log_out <= log_data; + dec2_log : process(clk) + begin + if rising_edge(clk) then + log_data <= r.e.nia(5 downto 2) & + r.e.valid & + stopped_out & + stall_out & + r.e.bypass_data3 & + r.e.bypass_data2 & + r.e.bypass_data1; + end if; + end process; + log_out <= log_data; + end generate; end architecture behaviour; diff --git a/execute1.vhdl b/execute1.vhdl index 3b2007a..2722570 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -12,7 +12,9 @@ use work.ppc_fx_insns.all; entity execute1 is generic ( - EX1_BYPASS : boolean := true + EX1_BYPASS : boolean := true; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port ( clk : in std_ulogic; @@ -97,7 +99,6 @@ architecture behaviour of execute1 is -- signals for logging signal exception_log : std_ulogic; signal irq_valid_log : std_ulogic; - signal log_data : std_ulogic_vector(14 downto 0); type privilege_level is (USER, SUPER); type op_privilege_array is array(insn_type_t) of privilege_level; @@ -619,12 +620,12 @@ begin end loop; else -- trap instructions (tw, twi, td, tdi) + v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + -- set bit 46 to say trap occurred + ctrl_tmp.srr1(63 - 46) <= '1'; if or (trapval and insn_to(e_in.insn)) = '1' then -- generate trap-type program interrupt exception := '1'; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); - -- set bit 46 to say trap occurred - ctrl_tmp.srr1(63 - 46) <= '1'; report "trap"; end if; end if; @@ -1083,21 +1084,25 @@ begin irq_valid_log <= irq_valid; end process; - ex1_log : process(clk) + e1_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(14 downto 0); begin - if rising_edge(clk) then - log_data <= ctrl.msr(MSR_EE) & ctrl.msr(MSR_PR) & - ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & - exception_log & - irq_valid_log & - std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) & - "000" & - r.e.write_enable & - r.e.valid & - f_out.redirect & - r.busy & - flush_out; - end if; - end process; - log_out <= log_data; + ex1_log : process(clk) + begin + if rising_edge(clk) then + log_data <= ctrl.msr(MSR_EE) & ctrl.msr(MSR_PR) & + ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & + exception_log & + irq_valid_log & + std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) & + "000" & + r.e.write_enable & + r.e.valid & + f_out.redirect & + r.busy & + flush_out; + end if; + end process; + log_out <= log_data; + end generate; end architecture behaviour; diff --git a/icache.vhdl b/icache.vhdl index dab2505..3f1c15f 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -47,7 +47,9 @@ entity icache is -- L1 ITLB log_2(page_size) TLB_LG_PGSZ : positive := 12; -- Number of real address bits that we store - REAL_ADDR_BITS : positive := 56 + REAL_ADDR_BITS : positive := 56; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port ( clk : in std_ulogic; @@ -207,9 +209,6 @@ architecture rtl of icache is signal access_ok : std_ulogic; signal use_previous : std_ulogic; - -- Output data to logger - signal log_data : std_ulogic_vector(53 downto 0); - -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -379,7 +378,7 @@ begin begin do_read <= not (stall_in or use_previous); do_write <= '0'; - if wishbone_in.ack = '1' and r.store_way = i then + if wishbone_in.ack = '1' and replace_way = i then do_write <= '1'; end if; cache_out(i) <= dout; @@ -413,15 +412,15 @@ begin lru => plru_out ); - process(req_index, req_is_hit, req_hit_way, req_is_hit, plru_out) + process(all) begin -- PLRU interface - if req_is_hit = '1' and req_index = i then - plru_acc_en <= req_is_hit; + if get_index(r.hit_nia) = i then + plru_acc_en <= r.hit_valid; else plru_acc_en <= '0'; end if; - plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); + plru_acc <= std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS)); plru_victim(i) <= plru_out; end process; end generate; @@ -531,8 +530,12 @@ begin end if; req_hit_way <= hit_way; - -- The way to replace on a miss - replace_way <= to_integer(unsigned(plru_victim(req_index))); + -- The way to replace on a miss + if r.state = CLR_TAG then + replace_way <= to_integer(unsigned(plru_victim(r.store_index))); + else + replace_way <= r.store_way; + end if; -- Output instruction from current cache row -- @@ -642,7 +645,6 @@ begin -- Keep track of our index and way for subsequent stores r.store_index <= req_index; - r.store_way <= replace_way; r.store_row <= get_row(req_laddr); r.store_tag <= req_tag; r.store_valid <= '1'; @@ -661,12 +663,15 @@ begin when CLR_TAG | WAIT_ACK => if r.state = CLR_TAG then + -- Get victim way from plru + r.store_way <= replace_way; + -- Force misses on that way while reloading that line - cache_valids(req_index)(r.store_way) <= '0'; + cache_valids(req_index)(replace_way) <= '0'; -- Store new tag in selected way for i in 0 to NUM_WAYS-1 loop - if i = r.store_way then + if i = replace_way then tagset := cache_tags(r.store_index); write_tag(i, tagset, r.store_tag); cache_tags(r.store_index) <= tagset; @@ -702,7 +707,7 @@ begin r.wb.cyc <= '0'; -- Cache line is now valid - cache_valids(r.store_index)(r.store_way) <= r.store_valid and not inval_in; + cache_valids(r.store_index)(replace_way) <= r.store_valid and not inval_in; -- We are done r.state <= IDLE; @@ -723,35 +728,36 @@ begin end if; end process; - data_log: process(clk) - variable lway: way_t; - variable wstate: std_ulogic; + icache_log: if LOG_LENGTH > 0 generate + -- Output data to logger + signal log_data : std_ulogic_vector(53 downto 0); begin - if rising_edge(clk) then - if req_is_hit then + data_log: process(clk) + variable lway: way_t; + variable wstate: std_ulogic; + begin + if rising_edge(clk) then lway := req_hit_way; - else - lway := replace_way; - end if; - wstate := '0'; - if r.state /= IDLE then - wstate := '1'; + wstate := '0'; + if r.state /= IDLE then + wstate := '1'; + end if; + log_data <= i_out.valid & + i_out.insn & + wishbone_in.ack & + r.wb.adr(5 downto 3) & + r.wb.stb & r.wb.cyc & + wishbone_in.stall & + stall_out & + r.fetch_failed & + r.hit_nia(5 downto 2) & + wstate & + std_ulogic_vector(to_unsigned(lway, 3)) & + req_is_hit & req_is_miss & + access_ok & + ra_valid; end if; - log_data <= i_out.valid & - i_out.insn & - wishbone_in.ack & - r.wb.adr(5 downto 3) & - r.wb.stb & r.wb.cyc & - wishbone_in.stall & - stall_out & - r.fetch_failed & - r.hit_nia(5 downto 2) & - wstate & - std_ulogic_vector(to_unsigned(lway, 3)) & - req_is_hit & req_is_miss & - access_ok & - ra_valid; - end if; - end process; - log_out <= log_data; + end process; + log_out <= log_data; + end generate; end; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index cf00987..62914c0 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -10,6 +10,10 @@ use work.common.all; -- We calculate the address in the first cycle entity loadstore1 is + generic ( + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 + ); port ( clk : in std_ulogic; rst : in std_ulogic; @@ -40,10 +44,9 @@ architecture behave of loadstore1 is type state_t is (IDLE, -- ready for instruction SECOND_REQ, -- send 2nd request of unaligned xfer ACK_WAIT, -- waiting for ack from dcache - LD_UPDATE, -- writing rA with computed addr on load MMU_LOOKUP, -- waiting for MMU to look up translation TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie - SPR_CMPLT -- complete a mf/tspr operation + COMPLETE -- extra cycle to complete an operation ); type reg_stage_t is record @@ -69,12 +72,18 @@ architecture behave of loadstore1 is priv_mode : std_ulogic; state : state_t; dwords_done : std_ulogic; + last_dword : std_ulogic; first_bytes : std_ulogic_vector(7 downto 0); second_bytes : std_ulogic_vector(7 downto 0); dar : std_ulogic_vector(63 downto 0); dsisr : std_ulogic_vector(31 downto 0); instr_fault : std_ulogic; sprval : std_ulogic_vector(63 downto 0); + busy : std_ulogic; + wait_dcache : std_ulogic; + wait_mmu : std_ulogic; + do_update : std_ulogic; + extra_cycle : std_ulogic; end record; type byte_sel_t is array(0 to 7) of std_ulogic; @@ -84,8 +93,6 @@ architecture behave of loadstore1 is signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); - signal log_data : std_ulogic_vector(9 downto 0); - -- Generate byte enables from sizes function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is begin @@ -125,6 +132,8 @@ begin if rising_edge(clk) then if rst = '1' then r.state <= IDLE; + r.busy <= '0'; + r.do_update <= '0'; else r <= rin; end if; @@ -143,13 +152,14 @@ begin variable req : std_ulogic; variable busy : std_ulogic; variable addr : std_ulogic_vector(63 downto 0); + variable maddr : std_ulogic_vector(63 downto 0); variable wdata : std_ulogic_vector(63 downto 0); variable write_enable : std_ulogic; variable do_update : std_ulogic; - variable two_dwords : std_ulogic; variable done : std_ulogic; variable data_permuted : std_ulogic_vector(63 downto 0); variable data_trimmed : std_ulogic_vector(63 downto 0); + variable store_data : std_ulogic_vector(63 downto 0); variable use_second : byte_sel_t; variable trim_ctl : trim_ctl_t; variable negative : std_ulogic; @@ -163,8 +173,6 @@ begin begin v := r; req := '0'; - byte_sel := (others => '0'); - addr := lsu_sum; v.mfspr := '0'; mmu_mtspr := '0'; itlb_fault := '0'; @@ -173,8 +181,9 @@ begin mmureq := '0'; write_enable := '0'; - do_update := '0'; - two_dwords := or (r.second_bytes); + + do_update := r.do_update; + v.do_update := '0'; -- load data formatting byte_offset := unsigned(r.addr(2 downto 0)); @@ -204,10 +213,10 @@ begin -- trim and sign-extend for i in 0 to 7 loop if i < to_integer(unsigned(r.length)) then - if two_dwords = '1' then + if r.dwords_done = '1' then trim_ctl(i) := '1' & not use_second(i); else - trim_ctl(i) := not use_second(i) & '0'; + trim_ctl(i) := "10"; end if; else trim_ctl(i) := '0' & (negative and r.sign_extend); @@ -224,121 +233,127 @@ begin end case; end loop; + -- Byte reversing and rotating for stores + -- Done in the first cycle (when l_in.valid = 1) + store_data := r.store_data; + if l_in.valid = '1' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + brev_lenm1 := "000"; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; + for i in 0 to 7 loop + k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1; + j := to_integer(k) * 8; + store_data(i * 8 + 7 downto i * 8) := l_in.data(j + 7 downto j); + end loop; + end if; + v.store_data := store_data; + -- compute (addr + 8) & ~7 for the second doubleword when unaligned next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + -- Busy calculation. + -- We need to minimize the delay from clock to busy valid because it + -- gates the start of execution of the next instruction. + busy := r.busy and not ((r.wait_dcache and d_in.valid) or (r.wait_mmu and m_in.done)); + v.busy := busy; + done := '0'; + if r.state /= IDLE and busy = '0' then + done := '1'; + end if; exception := '0'; + + if r.dwords_done = '1' or r.state = SECOND_REQ then + maddr := next_addr; + byte_sel := r.second_bytes; + else + maddr := r.addr; + byte_sel := r.first_bytes; + end if; + addr := maddr; + case r.state is when IDLE => when SECOND_REQ => - addr := next_addr; - byte_sel := r.second_bytes; req := '1'; v.state := ACK_WAIT; + v.last_dword := '0'; when ACK_WAIT => + if d_in.error = '1' then + -- dcache will discard the second request if it + -- gets an error on the 1st of two requests + if d_in.cache_paradox = '1' then + -- signal an interrupt straight away + exception := '1'; + dsisr(63 - 38) := not r.load; + -- XXX there is no architected bit for this + dsisr(63 - 35) := d_in.cache_paradox; + else + -- Look up the translation for TLB miss + -- and also for permission error and RC error + -- in case the PTE has been updated. + mmureq := '1'; + v.state := MMU_LOOKUP; + end if; + end if; if d_in.valid = '1' then - if d_in.error = '1' then - -- dcache will discard the second request if it - -- gets an error on the 1st of two requests - if r.dwords_done = '1' then - addr := next_addr; - else - addr := r.addr; - end if; - if d_in.cache_paradox = '1' then - -- signal an interrupt straight away - exception := '1'; - dsisr(63 - 38) := not r.load; - -- XXX there is no architected bit for this - dsisr(63 - 35) := d_in.cache_paradox; - v.state := IDLE; - else - -- Look up the translation for TLB miss - -- and also for permission error and RC error - -- in case the PTE has been updated. - mmureq := '1'; - v.state := MMU_LOOKUP; + if r.last_dword = '0' then + v.dwords_done := '1'; + v.last_dword := '1'; + if r.load = '1' then + v.load_data := data_permuted; end if; else - if two_dwords = '1' and r.dwords_done = '0' then - v.dwords_done := '1'; - if r.load = '1' then - v.load_data := data_permuted; - end if; + write_enable := r.load; + if r.extra_cycle = '1' then + -- loads with rA update need an extra cycle + v.state := COMPLETE; + v.do_update := r.update; else - write_enable := r.load; - if r.load = '1' and r.update = '1' then - -- loads with rA update need an extra cycle - v.state := LD_UPDATE; - else - -- stores write back rA update in this cycle - do_update := r.update; - done := '1'; - v.state := IDLE; - end if; + -- stores write back rA update in this cycle + do_update := r.update; end if; + v.busy := '0'; end if; end if; + -- r.wait_dcache gets set one cycle after we come into ACK_WAIT state, + -- which is OK because the dcache always takes at least two cycles. + v.wait_dcache := r.last_dword and not r.extra_cycle; when MMU_LOOKUP => - if r.dwords_done = '1' then - addr := next_addr; - byte_sel := r.second_bytes; - else - addr := r.addr; - byte_sel := r.first_bytes; - end if; if m_in.done = '1' then - if m_in.invalid = '0' and m_in.perm_error = '0' and m_in.rc_error = '0' and - m_in.badtree = '0' and m_in.segerr = '0' then - if r.instr_fault = '0' then - -- retry the request now that the MMU has installed a TLB entry - req := '1'; - if two_dwords = '1' and r.dwords_done = '0' then - v.state := SECOND_REQ; - else - v.state := ACK_WAIT; - end if; + if r.instr_fault = '0' then + -- retry the request now that the MMU has installed a TLB entry + req := '1'; + if r.last_dword = '0' then + v.state := SECOND_REQ; else - -- nothing to do, the icache retries automatically - done := '1'; - v.state := IDLE; + v.state := ACK_WAIT; end if; - else - exception := '1'; - dsisr(63 - 33) := m_in.invalid; - dsisr(63 - 36) := m_in.perm_error; - dsisr(63 - 38) := not r.load; - dsisr(63 - 44) := m_in.badtree; - dsisr(63 - 45) := m_in.rc_error; - v.state := IDLE; end if; end if; - - when TLBIE_WAIT => - if m_in.done = '1' then - -- tlbie is finished - done := '1'; - v.state := IDLE; + if m_in.err = '1' then + exception := '1'; + dsisr(63 - 33) := m_in.invalid; + dsisr(63 - 36) := m_in.perm_error; + dsisr(63 - 38) := not r.load; + dsisr(63 - 44) := m_in.badtree; + dsisr(63 - 45) := m_in.rc_error; end if; - when LD_UPDATE => - do_update := '1'; - v.state := IDLE; - done := '1'; + when TLBIE_WAIT => - when SPR_CMPLT => - done := '1'; - v.state := IDLE; + when COMPLETE => end case; - busy := '1'; - if r.state = IDLE or done = '1' then - busy := '0'; + if done = '1' or exception = '1' then + v.state := IDLE; + v.busy := '0'; end if; -- Note that l_in.valid is gated with busy inside execute1 @@ -349,6 +364,7 @@ begin v.tlbie := '0'; v.instr_fault := '0'; v.dwords_done := '0'; + v.last_dword := '1'; v.write_reg := l_in.write_reg; v.length := l_in.length; v.byte_reverse := l_in.byte_reverse; @@ -361,6 +377,13 @@ begin v.nc := l_in.ci; v.virt_mode := l_in.virt_mode; v.priv_mode := l_in.priv_mode; + v.wait_dcache := '0'; + v.wait_mmu := '0'; + v.do_update := '0'; + v.extra_cycle := '0'; + + addr := lsu_sum; + maddr := l_in.addr2; -- address from RB for tlbie -- XXX Temporary hack. Mark the op as non-cachable if the address -- is the form 0xc------- for a real-mode access. @@ -374,24 +397,14 @@ begin v.first_bytes := byte_sel; v.second_bytes := long_sel(15 downto 8); - -- Do byte reversing and rotating for stores in the first cycle - byte_offset := unsigned(lsu_sum(2 downto 0)); - brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; - end if; - for i in 0 to 7 loop - k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; - j := to_integer(k) * 8; - v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); - end loop; - case l_in.op is when OP_STORE => req := '1'; when OP_LOAD => req := '1'; v.load := '1'; + -- Allow an extra cycle for RA update on loads + v.extra_cycle := l_in.update; when OP_DCBZ => req := '1'; v.dcbz := '1'; @@ -399,6 +412,7 @@ begin mmureq := '1'; v.tlbie := '1'; v.state := TLBIE_WAIT; + v.wait_mmu := '1'; when OP_MFSPR => v.mfspr := '1'; -- partial decode on SPR number should be adequate given @@ -413,7 +427,7 @@ begin -- reading one of the SPRs in the MMU v.sprval := m_in.sprval; end if; - v.state := SPR_CMPLT; + v.state := COMPLETE; when OP_MTSPR => if sprn(9) = '0' and sprn(5) = '0' then if sprn(0) = '0' then @@ -421,19 +435,20 @@ begin else v.dar := l_in.data; end if; - v.state := SPR_CMPLT; + v.state := COMPLETE; else -- writing one of the SPRs in the MMU mmu_mtspr := '1'; v.state := TLBIE_WAIT; + v.wait_mmu := '1'; end if; when OP_FETCH_FAILED => -- send it to the MMU to do the radix walk - addr := l_in.nia; - v.addr := l_in.nia; + maddr := l_in.nia; v.instr_fault := '1'; mmureq := '1'; v.state := MMU_LOOKUP; + v.wait_mmu := '1'; when others => assert false report "unknown op sent to loadstore1"; end case; @@ -445,6 +460,8 @@ begin v.state := SECOND_REQ; end if; end if; + + v.busy := req or mmureq or mmu_mtspr; end if; -- Update outputs to dcache @@ -454,7 +471,7 @@ begin d_out.nc <= v.nc; d_out.reserve <= v.reserve; d_out.addr <= addr; - d_out.data <= v.store_data; + d_out.data <= store_data; d_out.byte_sel <= byte_sel; d_out.virt_mode <= v.virt_mode; d_out.priv_mode <= v.priv_mode; @@ -467,7 +484,7 @@ begin m_out.tlbie <= v.tlbie; m_out.mtspr <= mmu_mtspr; m_out.sprn <= sprn; - m_out.addr <= addr; + m_out.addr <= maddr; m_out.slbia <= l_in.insn(7); m_out.rs <= l_in.data; @@ -513,18 +530,23 @@ begin end process; - ls1_log: process(clk) + l1_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(9 downto 0); begin - if rising_edge(clk) then - log_data <= e_out.busy & - e_out.exception & - l_out.valid & - m_out.valid & - d_out.valid & - m_in.done & - r.dwords_done & - std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3)); - end if; - end process; - log_out <= log_data; + ls1_log: process(clk) + begin + if rising_edge(clk) then + log_data <= e_out.busy & + e_out.exception & + l_out.valid & + m_out.valid & + d_out.valid & + m_in.done & + r.dwords_done & + std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3)); + end if; + end process; + log_out <= log_data; + end generate; + end; diff --git a/mmu.vhdl b/mmu.vhdl index fc2dd7a..09df3ae 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -35,7 +35,7 @@ architecture behave of mmu is RADIX_LOOKUP, RADIX_READ_WAIT, RADIX_LOAD_TLB, - RADIX_ERROR + RADIX_FINISH ); type reg_stage_t is record @@ -51,6 +51,8 @@ architecture behave of mmu is pid : std_ulogic_vector(31 downto 0); -- internal state state : state_t; + done : std_ulogic; + err : std_ulogic; pgtbl0 : std_ulogic_vector(63 downto 0); pt0_valid : std_ulogic; pgtbl3 : std_ulogic_vector(63 downto 0); @@ -91,7 +93,10 @@ begin report "MMU got tlb miss for " & to_hstring(rin.addr); end if; if l_out.done = '1' then - report "MMU completing op with invalid=" & std_ulogic'image(l_out.invalid) & + report "MMU completing op without error"; + end if; + if l_out.err = '1' then + report "MMU completing op with err invalid=" & std_ulogic'image(l_out.invalid) & " badtree=" & std_ulogic'image(l_out.badtree); end if; if rin.state = RADIX_LOOKUP then @@ -176,7 +181,6 @@ begin mmu_1: process(all) variable v : reg_stage_t; variable dcreq : std_ulogic; - variable done : std_ulogic; variable tlb_load : std_ulogic; variable itlb_load : std_ulogic; variable tlbie_req : std_ulogic; @@ -199,7 +203,8 @@ begin v := r; v.valid := '0'; dcreq := '0'; - done := '0'; + v.done := '0'; + v.err := '0'; v.invalid := '0'; v.badtree := '0'; v.segerror := '0'; @@ -262,7 +267,7 @@ begin v.state := PROC_TBL_READ; elsif mbits = 0 then -- Use RPDS = 0 to disable radix tree walks - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.invalid := '1'; else v.state := SEGMENT_CHECK; @@ -291,8 +296,7 @@ begin when TLB_WAIT => if d_in.done = '1' then - done := '1'; - v.state := IDLE; + v.state := RADIX_FINISH; end if; when PROC_TBL_READ => @@ -302,43 +306,42 @@ begin when PROC_TBL_WAIT => if d_in.done = '1' then - if d_in.err = '0' then - if r.addr(63) = '1' then - v.pgtbl3 := data; - v.pt3_valid := '1'; - else - v.pgtbl0 := data; - v.pt0_valid := '1'; - end if; - -- rts == radix tree size, # address bits being translated - rts := unsigned('0' & data(62 downto 61) & data(7 downto 5)); - -- mbits == # address bits to index top level of tree - mbits := unsigned('0' & data(4 downto 0)); - -- set v.shift to rts so that we can use finalmask for the segment check - v.shift := rts; - v.mask_size := mbits(4 downto 0); - v.pgbase := data(55 downto 8) & x"00"; - if mbits = 0 then - v.state := RADIX_ERROR; - v.invalid := '1'; - else - v.state := SEGMENT_CHECK; - end if; + if r.addr(63) = '1' then + v.pgtbl3 := data; + v.pt3_valid := '1'; + else + v.pgtbl0 := data; + v.pt0_valid := '1'; + end if; + -- rts == radix tree size, # address bits being translated + rts := unsigned('0' & data(62 downto 61) & data(7 downto 5)); + -- mbits == # address bits to index top level of tree + mbits := unsigned('0' & data(4 downto 0)); + -- set v.shift to rts so that we can use finalmask for the segment check + v.shift := rts; + v.mask_size := mbits(4 downto 0); + v.pgbase := data(55 downto 8) & x"00"; + if mbits = 0 then + v.state := RADIX_FINISH; + v.invalid := '1'; else - v.state := RADIX_ERROR; - v.badtree := '1'; + v.state := SEGMENT_CHECK; end if; end if; + if d_in.err = '1' then + v.state := RADIX_FINISH; + v.badtree := '1'; + end if; when SEGMENT_CHECK => mbits := '0' & r.mask_size; v.shift := r.shift + (31 - 12) - mbits; nonzero := or(r.addr(61 downto 31) and not finalmask(30 downto 0)); if r.addr(63) /= r.addr(62) or nonzero = '1' then - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.segerror := '1'; elsif mbits < 5 or mbits > 16 or mbits > (r.shift + (31 - 12)) then - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.badtree := '1'; else v.state := RADIX_LOOKUP; @@ -350,54 +353,53 @@ begin when RADIX_READ_WAIT => if d_in.done = '1' then - if d_in.err = '0' then - v.pde := data; - -- test valid bit - if data(63) = '1' then - -- test leaf bit - if data(62) = '1' then - -- check permissions and RC bits - perm_ok := '0'; - if r.priv = '1' or data(3) = '0' then - if r.iside = '0' then - perm_ok := data(1) or (data(2) and not r.store); - else - -- no IAMR, so no KUEP support for now - -- deny execute permission if cache inhibited - perm_ok := data(0) and not data(5); - end if; - end if; - rc_ok := data(8) and (data(7) or not r.store); - if perm_ok = '1' and rc_ok = '1' then - v.state := RADIX_LOAD_TLB; + v.pde := data; + -- test valid bit + if data(63) = '1' then + -- test leaf bit + if data(62) = '1' then + -- check permissions and RC bits + perm_ok := '0'; + if r.priv = '1' or data(3) = '0' then + if r.iside = '0' then + perm_ok := data(1) or (data(2) and not r.store); else - v.state := RADIX_ERROR; - v.perm_err := not perm_ok; - -- permission error takes precedence over RC error - v.rc_error := perm_ok; + -- no IAMR, so no KUEP support for now + -- deny execute permission if cache inhibited + perm_ok := data(0) and not data(5); end if; + end if; + rc_ok := data(8) and (data(7) or not r.store); + if perm_ok = '1' and rc_ok = '1' then + v.state := RADIX_LOAD_TLB; else - mbits := unsigned('0' & data(4 downto 0)); - if mbits < 5 or mbits > 16 or mbits > r.shift then - v.state := RADIX_ERROR; - v.badtree := '1'; - else - v.shift := v.shift - mbits; - v.mask_size := mbits(4 downto 0); - v.pgbase := data(55 downto 8) & x"00"; - v.state := RADIX_LOOKUP; - end if; + v.state := RADIX_FINISH; + v.perm_err := not perm_ok; + -- permission error takes precedence over RC error + v.rc_error := perm_ok; end if; else - -- non-present PTE, generate a DSI - v.state := RADIX_ERROR; - v.invalid := '1'; + mbits := unsigned('0' & data(4 downto 0)); + if mbits < 5 or mbits > 16 or mbits > r.shift then + v.state := RADIX_FINISH; + v.badtree := '1'; + else + v.shift := v.shift - mbits; + v.mask_size := mbits(4 downto 0); + v.pgbase := data(55 downto 8) & x"00"; + v.state := RADIX_LOOKUP; + end if; end if; else - v.state := RADIX_ERROR; - v.badtree := '1'; + -- non-present PTE, generate a DSI + v.state := RADIX_FINISH; + v.invalid := '1'; end if; end if; + if d_in.err = '1' then + v.state := RADIX_FINISH; + v.badtree := '1'; + end if; when RADIX_LOAD_TLB => tlb_load := '1'; @@ -406,16 +408,19 @@ begin v.state := TLB_WAIT; else itlb_load := '1'; - done := '1'; v.state := IDLE; end if; - when RADIX_ERROR => - done := '1'; + when RADIX_FINISH => v.state := IDLE; end case; + if v.state = RADIX_FINISH or (v.state = RADIX_LOAD_TLB and r.iside = '1') then + v.err := v.invalid or v.badtree or v.segerror or v.perm_err or v.rc_error; + v.done := not v.err; + end if; + if r.addr(63) = '1' then effpid := x"00000000"; else @@ -451,7 +456,8 @@ begin tlb_data := (others => '0'); end if; - l_out.done <= done; + l_out.done <= r.done; + l_out.err <= r.err; l_out.invalid <= r.invalid; l_out.badtree <= r.badtree; l_out.segerr <= r.segerror; diff --git a/register_file.vhdl b/register_file.vhdl index 260255e..10f28a4 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -7,7 +7,9 @@ use work.common.all; entity register_file is generic ( - SIM : boolean := false + SIM : boolean := false; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port( clk : in std_logic; @@ -36,7 +38,6 @@ architecture behaviour of register_file is signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); signal dbg_ack : std_ulogic; - signal log_data : std_ulogic_vector(70 downto 0); begin -- synchronous writes register_write_0: process(clk) @@ -134,13 +135,18 @@ begin sim_dump_done <= '0'; end generate; - reg_log: process(clk) + rf_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(70 downto 0); begin - if rising_edge(clk) then - log_data <= w_in.write_data & - w_in.write_enable & - w_in.write_reg; - end if; - end process; - log_out <= log_data; + reg_log: process(clk) + begin + if rising_edge(clk) then + log_data <= w_in.write_data & + w_in.write_enable & + w_in.write_reg; + end if; + end process; + log_out <= log_data; + end generate; + end architecture behaviour;