From 65c43b488bff9157d2a9e83ba6ce5b851604536a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 13 Aug 2021 19:29:48 +1000 Subject: [PATCH] PMU: Add several more events This implements most of the architected PMU events. The ones missing are mostly the ones that depend on which level of the cache hierarchy data is fetched from. The events implemented here, and their raw event codes, are: Floating-point operation completed (100f4) Load completed (100fc) Store completed (200f0) Icache miss (200fc) ITLB miss (100f6) ITLB miss resolved (400fc) Dcache load miss (400f0) Dcache load miss resolved (300f8) Dcache store miss (300f0) DTLB miss (300fc) DTLB miss resolved (200f6) No instruction available and none being executed (100f8) Instruction dispatched (200f2, 300f2, 400f2) Taken branch instruction completed (200fa) Branch mispredicted (400f6) External interrupt taken (200f8) Signed-off-by: Paul Mackerras --- common.vhdl | 24 +++++++++++++++++++++++- core.vhdl | 9 +++++++++ dcache.vhdl | 19 +++++++++++++++++++ execute1.vhdl | 39 ++++++++++++++++++++++++++++++++++++++- icache.vhdl | 6 ++++++ loadstore1.vhdl | 10 ++++++++++ pmu.vhdl | 4 ++-- writeback.vhdl | 1 + 8 files changed, 108 insertions(+), 4 deletions(-) diff --git a/common.vhdl b/common.vhdl index 467f2a8..a6562e0 100644 --- a/common.vhdl +++ b/common.vhdl @@ -209,6 +209,11 @@ package common is next_predicted: std_ulogic; end record; + type IcacheEventType is record + icache_miss : std_ulogic; + itlb_miss_resolved : std_ulogic; + end record; + type Decode1ToDecode2Type is record valid: std_ulogic; stop_mark : std_ulogic; @@ -347,8 +352,10 @@ package common is itlb_miss_resolved : std_ulogic; icache_miss : std_ulogic; dc_miss_resolved : std_ulogic; + dc_load_miss : std_ulogic; dc_ld_miss_resolved : std_ulogic; dc_store_miss : std_ulogic; + dtlb_miss : std_ulogic; dtlb_miss_resolved : std_ulogic; ld_miss_nocache : std_ulogic; ld_fill_nocache : std_ulogic; @@ -468,6 +475,14 @@ package common is cache_paradox : std_ulogic; end record; + type DcacheEventType is record + load_miss : std_ulogic; + store_miss : std_ulogic; + dcache_refill : std_ulogic; + dtlb_miss : std_ulogic; + dtlb_miss_resolved : std_ulogic; + end record; + type Loadstore1ToMmuType is record valid : std_ulogic; tlbie : std_ulogic; @@ -537,6 +552,12 @@ package common is interrupt => '0', intr_vec => 0, srr0 => (others => '0'), srr1 => (others => '0')); + type Loadstore1EventType is record + load_complete : std_ulogic; + store_complete : std_ulogic; + itlb_miss : std_ulogic; + end record; + type Execute1ToWritebackType is record valid: std_ulogic; instr_tag : instr_tag_t; @@ -668,7 +689,8 @@ package common is write_cr_data => (others => '0')); type WritebackEventType is record - instr_complete : std_ulogic; + instr_complete : std_ulogic; + fp_complete : std_ulogic; end record; end common; diff --git a/core.vhdl b/core.vhdl index 5d8a822..32bfe88 100644 --- a/core.vhdl +++ b/core.vhdl @@ -148,6 +148,9 @@ architecture behave of core is signal msr : std_ulogic_vector(63 downto 0); -- PMU event bus + signal icache_events : IcacheEventType; + signal loadstore_events : Loadstore1EventType; + signal dcache_events : DcacheEventType; signal writeback_events : WritebackEventType; -- Debug status @@ -247,6 +250,7 @@ begin wishbone_out => wishbone_insn_out, wishbone_in => wishbone_insn_in, wb_snoop_in => wb_snoop_in, + events => icache_events, log_out => log_data(96 downto 43) ); @@ -356,6 +360,9 @@ begin icache_inval => ex1_icache_inval, dbg_msr_out => msr, wb_events => writeback_events, + ls_events => loadstore_events, + dc_events => dcache_events, + ic_events => icache_events, terminate_out => terminate, log_out => log_data(134 downto 120), log_rd_addr => log_rd_addr, @@ -397,6 +404,7 @@ begin m_out => loadstore1_to_mmu, m_in => mmu_to_loadstore1, dc_stall => dcache_stall_out, + events => loadstore_events, log_out => log_data(149 downto 140) ); @@ -431,6 +439,7 @@ begin wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out, snoop_in => wb_snoop_in, + events => dcache_events, log_out => log_data(170 downto 151) ); diff --git a/dcache.vhdl b/dcache.vhdl index 50060b2..90771f5 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -46,6 +46,8 @@ entity dcache is wishbone_out : out wishbone_master_out; wishbone_in : in wishbone_slave_out; + events : out DcacheEventType; + log_out : out std_ulogic_vector(19 downto 0) ); end entity dcache; @@ -355,6 +357,8 @@ architecture rtl of dcache is signal r1 : reg_stage_1_t; + signal ev : DcacheEventType; + -- Reservation information -- type reservation_t is record @@ -412,6 +416,7 @@ architecture rtl of dcache is signal rc_ok : std_ulogic; signal perm_ok : std_ulogic; signal access_ok : std_ulogic; + signal tlb_miss : std_ulogic; -- TLB PLRU output interface type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); @@ -605,6 +610,8 @@ begin r0_valid <= r0_full and not r1.full and not d_in.hold; stall_out <= r0_stall; + events <= ev; + -- TLB -- Operates in the second cycle on the request latched in r0.req. -- TLB updates write the entry at the end of the second cycle. @@ -689,6 +696,7 @@ begin pte <= (others => '0'); end if; valid_ra <= tlb_hit or not r0.req.virt_mode; + tlb_miss <= r0_valid and r0.req.virt_mode and not tlb_hit; if r0.req.virt_mode = '1' then ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) & @@ -712,6 +720,7 @@ begin if rising_edge(clk) then tlbie := r0_valid and r0.tlbie; tlbwe := r0_valid and r0.tlbld; + ev.dtlb_miss_resolved <= tlbwe; if rst = '1' or (tlbie = '1' and r0.doall = '1') then -- clear all valid bits at once for i in tlb_index_t loop @@ -1286,6 +1295,11 @@ begin r1.forward_valid1 <= '0'; end if; + ev.dcache_refill <= '0'; + ev.load_miss <= '0'; + ev.store_miss <= '0'; + ev.dtlb_miss <= tlb_miss; + -- On reset, clear all valid bits to force misses if rst = '1' then for i in index_t loop @@ -1417,6 +1431,7 @@ begin -- Track that we had one request sent r1.state <= RELOAD_WAIT_ACK; r1.write_tag <= '1'; + ev.load_miss <= '1'; when OP_LOAD_NC => r1.wb.cyc <= '1'; @@ -1449,6 +1464,9 @@ begin r1.wb.we <= '1'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; + if req.op = OP_STORE_MISS then + ev.store_miss <= '1'; + end if; -- OP_NONE and OP_BAD do nothing -- OP_BAD & OP_STCX_FAIL were handled above already @@ -1500,6 +1518,7 @@ begin -- Cache line is now valid cache_valids(r1.store_index)(r1.store_way) <= '1'; + ev.dcache_refill <= not r1.dcbz; r1.state <= IDLE; end if; diff --git a/execute1.vhdl b/execute1.vhdl index 5d2fa79..3f21757 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -47,6 +47,9 @@ entity execute1 is -- PMU event buses wb_events : in WritebackEventType; + ls_events : in Loadstore1EventType; + dc_events : in DcacheEventType; + ic_events : in IcacheEventType; log_out : out std_ulogic_vector(14 downto 0); log_rd_addr : out std_ulogic_vector(31 downto 0); @@ -70,6 +73,11 @@ architecture behaviour of execute1 is mul_finish : std_ulogic; div_in_progress : std_ulogic; cntz_in_progress : std_ulogic; + no_instr_avail : std_ulogic; + instr_dispatch : std_ulogic; + ext_interrupt : std_ulogic; + taken_branch_event : std_ulogic; + br_mispredict : std_ulogic; log_addr_spr : std_ulogic_vector(31 downto 0); end record; constant reg_type_init : reg_type := @@ -78,6 +86,8 @@ architecture behaviour of execute1 is busy => '0', terminate => '0', intr_pending => '0', fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', + no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', + taken_branch_event => '0', br_mispredict => '0', others => (others => '0')); signal r, rin : reg_type; @@ -302,7 +312,24 @@ begin c_in <= e_in.read_data3; cr_in <= e_in.cr; - x_to_pmu.occur <= (instr_complete => wb_events.instr_complete, others => '0'); + x_to_pmu.occur <= (instr_complete => wb_events.instr_complete, + fp_complete => wb_events.fp_complete, + ld_complete => ls_events.load_complete, + st_complete => ls_events.store_complete, + itlb_miss => ls_events.itlb_miss, + dc_load_miss => dc_events.load_miss, + dc_ld_miss_resolved => dc_events.dcache_refill, + dc_store_miss => dc_events.store_miss, + dtlb_miss => dc_events.dtlb_miss, + dtlb_miss_resolved => dc_events.dtlb_miss_resolved, + icache_miss => ic_events.icache_miss, + itlb_miss_resolved => ic_events.itlb_miss_resolved, + no_instr_avail => r.no_instr_avail, + dispatch => r.instr_dispatch, + ext_interrupt => r.ext_interrupt, + br_taken_complete => r.taken_branch_event, + br_mispredict => r.br_mispredict, + others => '0'); x_to_pmu.nia <= current.nia; x_to_pmu.addr <= (others => '0'); x_to_pmu.addr_v <= '0'; @@ -715,6 +742,9 @@ begin v.div_in_progress := '0'; v.cntz_in_progress := '0'; v.mul_finish := '0'; + v.ext_interrupt := '0'; + v.taken_branch_event := '0'; + v.br_mispredict := '0'; x_to_pmu.mfspr <= '0'; x_to_pmu.mtspr <= '0'; @@ -804,6 +834,7 @@ begin elsif ext_irq_in = '1' then v.e.intr_vec := 16#500#; report "IRQ valid: External"; + v.ext_interrupt := '1'; end if; exception := '1'; @@ -836,6 +867,9 @@ begin v.intr_pending := '0'; end if; + v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or r.busy or fp_in.busy); + v.instr_dispatch := valid_in and not exception and not illegal; + if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then v.e.valid := '1'; @@ -905,6 +939,7 @@ begin if ctrl.msr(MSR_BE) = '1' then do_trace := '1'; end if; + v.taken_branch_event := '1'; when OP_BC | OP_BCREG => -- read_data1 is CTR -- for OP_BCREG, read_data2 is target register (CTR, LR or TAR) @@ -920,6 +955,7 @@ begin taken_branch := r.br_taken; end if; v.br_taken := taken_branch; + v.taken_branch_event := taken_branch; abs_branch := e_in.br_abs; if e_in.repeat = '0' or e_in.second = '1' then is_branch := '1'; @@ -1114,6 +1150,7 @@ begin end if; if taken_branch /= e_in.br_pred then v.e.redirect := '1'; + v.br_mispredict := is_direct_branch; end if; v.e.br_last := is_direct_branch; v.e.br_taken := taken_branch; diff --git a/icache.vhdl b/icache.vhdl index 4d3b12e..57d3437 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -70,6 +70,7 @@ entity icache is wb_snoop_in : in wishbone_master_out := wishbone_master_out_init; + events : out IcacheEventType; log_out : out std_ulogic_vector(53 downto 0) ); end entity icache; @@ -197,6 +198,8 @@ architecture rtl of icache is signal r : reg_internal_t; + signal ev : IcacheEventType; + -- Async signals on incoming request signal req_index : index_t; signal req_row : row_t; @@ -494,6 +497,7 @@ begin itlb_ptes(wr_index) <= m_in.pte; itlb_valids(wr_index) <= '1'; end if; + ev.itlb_miss_resolved <= m_in.tlbld and not rst; end if; end process; @@ -627,6 +631,7 @@ begin variable snoop_cache_tags : cache_tags_set_t; begin if rising_edge(clk) then + ev.icache_miss <= '0'; -- On reset, clear all valid bits to force misses if rst = '1' then for i in index_t loop @@ -699,6 +704,7 @@ begin " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag) & " RA:" & to_hstring(real_addr); + ev.icache_miss <= '1'; -- Keep track of our index and way for subsequent stores r.store_index <= req_index; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 243b99d..21ed836 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -33,6 +33,8 @@ entity loadstore1 is dc_stall : in std_ulogic; + events : out Loadstore1EventType; + log_out : out std_ulogic_vector(9 downto 0) ); end loadstore1; @@ -146,6 +148,7 @@ architecture behave of loadstore1 is intr_vec : integer range 0 to 16#fff#; nia : std_ulogic_vector(63 downto 0); srr1 : std_ulogic_vector(15 downto 0); + events : Loadstore1EventType; end record; signal req_in : request_t; @@ -668,6 +671,7 @@ begin do_update := '0'; v.convert_lfs := '0'; v.srr1 := (others => '0'); + v.events := (others => '0'); -- load data formatting -- shift and byte-reverse data bytes @@ -796,6 +800,7 @@ begin mmu_mtspr := r2.req.write_spr; if r2.req.instr_fault = '1' then v.state := MMU_LOOKUP; + v.events.itlb_miss := '1'; else v.state := TLBIE_WAIT; end if; @@ -838,6 +843,9 @@ begin v.state := IDLE; end if; + v.events.load_complete := r2.req.load and complete; + v.events.store_complete := (r2.req.store or r2.req.dcbz) and complete; + -- generate DSI or DSegI for load/store exceptions -- or ISI or ISegI for instruction fetch exceptions v.interrupt := exception; @@ -946,6 +954,8 @@ begin e_out.in_progress <= in_progress; e_out.interrupt <= r3.interrupt; + events <= r3.events; + -- Busy calculation. stage3_busy_next <= r2.req.valid and not (complete or part_done or exception); diff --git a/pmu.vhdl b/pmu.vhdl index cf5e7f5..2967f4e 100644 --- a/pmu.vhdl +++ b/pmu.vhdl @@ -290,13 +290,13 @@ begin when x"f8" => inc(3) := tbbit; when x"fe" => - inc(3) := p_in.occur.ld_fill_nocache; + inc(3) := p_in.occur.dtlb_miss; when others => end case; case mmcr1(7 downto 0) is when x"f0" => - inc(4) := p_in.occur.dc_store_miss; + inc(4) := p_in.occur.dc_load_miss; when x"f2" => inc(4) := p_in.occur.dispatch; when x"f4" => diff --git a/writeback.vhdl b/writeback.vhdl index b056ee1..a99d4d2 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -104,6 +104,7 @@ begin complete_out <= fp_in.instr_tag; end if; events.instr_complete <= complete_out.valid; + events.fp_complete <= fp_in.valid; intr := e_in.interrupt or l_in.interrupt or fp_in.interrupt;