From 144d8e3c612e603496481547ea2f27b9a26c0253 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Jul 2020 10:04:56 +1000 Subject: [PATCH 01/18] icache: Do PLRU update one cycle later This does the PLRU update based on r.hit_valid and r.hit_way rather than req_is_hit and req_hit_way, which means there is now a register between the TLB and cache tag lookup and the PLRU update, which should help with timing. As a result, the PLRU victim way selection becomes valid one cycle later, in the cycle when r.state = CLR_TAG. So we have to use the PLRU output directly in the CLR_TAG state and r.store_way in the WAIT_ACK state. Signed-off-by: Paul Mackerras --- icache.vhdl | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/icache.vhdl b/icache.vhdl index dab2505..0aebbc2 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -379,7 +379,7 @@ begin begin do_read <= not (stall_in or use_previous); do_write <= '0'; - if wishbone_in.ack = '1' and r.store_way = i then + if wishbone_in.ack = '1' and replace_way = i then do_write <= '1'; end if; cache_out(i) <= dout; @@ -413,15 +413,15 @@ begin lru => plru_out ); - process(req_index, req_is_hit, req_hit_way, req_is_hit, plru_out) + process(all) begin -- PLRU interface - if req_is_hit = '1' and req_index = i then - plru_acc_en <= req_is_hit; + if get_index(r.hit_nia) = i then + plru_acc_en <= r.hit_valid; else plru_acc_en <= '0'; end if; - plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); + plru_acc <= std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS)); plru_victim(i) <= plru_out; end process; end generate; @@ -531,8 +531,12 @@ begin end if; req_hit_way <= hit_way; - -- The way to replace on a miss - replace_way <= to_integer(unsigned(plru_victim(req_index))); + -- The way to replace on a miss + if r.state = CLR_TAG then + replace_way <= to_integer(unsigned(plru_victim(r.store_index))); + else + replace_way <= r.store_way; + end if; -- Output instruction from current cache row -- @@ -642,7 +646,6 @@ begin -- Keep track of our index and way for subsequent stores r.store_index <= req_index; - r.store_way <= replace_way; r.store_row <= get_row(req_laddr); r.store_tag <= req_tag; r.store_valid <= '1'; @@ -661,12 +664,15 @@ begin when CLR_TAG | WAIT_ACK => if r.state = CLR_TAG then + -- Get victim way from plru + r.store_way <= replace_way; + -- Force misses on that way while reloading that line - cache_valids(req_index)(r.store_way) <= '0'; + cache_valids(req_index)(replace_way) <= '0'; -- Store new tag in selected way for i in 0 to NUM_WAYS-1 loop - if i = r.store_way then + if i = replace_way then tagset := cache_tags(r.store_index); write_tag(i, tagset, r.store_tag); cache_tags(r.store_index) <= tagset; @@ -702,7 +708,7 @@ begin r.wb.cyc <= '0'; -- Cache line is now valid - cache_valids(r.store_index)(r.store_way) <= r.store_valid and not inval_in; + cache_valids(r.store_index)(replace_way) <= r.store_valid and not inval_in; -- We are done r.state <= IDLE; @@ -728,11 +734,7 @@ begin variable wstate: std_ulogic; begin if rising_edge(clk) then - if req_is_hit then - lway := req_hit_way; - else - lway := replace_way; - end if; + lway := req_hit_way; wstate := '0'; if r.state /= IDLE then wstate := '1'; From 31587affb38980d5780888ecdbb9a2444da5d7a1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Jul 2020 17:47:52 +1000 Subject: [PATCH 02/18] dcache: Do PLRU update one cycle later This does the PLRU update based on r1.cache_hit and r1.hit_way rather than req_op and req_hit_way, which means there is now a register between the TLB and cache tag lookup and the PLRU update, which should help with timing. The PLRU victim selection now becomes valid one cycle later, in the cycle where r1.write_tag = 1. We now have replace_way coming from the PLRU when r1.write_tag = 1 and from r1.store_way at other times, and we use that instead of r1.store_way in situations where we need it to be valid in the first cycle of the RELOAD_WAIT_ACK state. Signed-off-by: Paul Mackerras --- dcache.vhdl | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 9ecb6a9..1c20de5 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -231,7 +231,6 @@ architecture rtl of dcache is data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); hit_way : way_t; - repl_way : way_t; same_tag : std_ulogic; end record; @@ -247,6 +246,8 @@ architecture rtl of dcache is -- Cache hit state hit_way : way_t; hit_load_valid : std_ulogic; + hit_index : index_t; + cache_hit : std_ulogic; -- 2-stage data buffer for data forwarded from writes to reads forward_data1 : std_ulogic_vector(63 downto 0); @@ -677,16 +678,15 @@ begin lru => plru_out ); - process(req_index, req_op, req_hit_way, plru_out) + process(all) begin -- PLRU interface - if (req_op = OP_LOAD_HIT or - req_op = OP_STORE_HIT) and req_index = i then - plru_acc_en <= '1'; + if r1.hit_index = i then + plru_acc_en <= r1.cache_hit; else plru_acc_en <= '0'; end if; - plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS)); + plru_acc <= std_ulogic_vector(to_unsigned(r1.hit_way, WAY_BITS)); plru_victim(i) <= plru_out; end process; end generate; @@ -788,7 +788,7 @@ begin -- since it will be by the time we perform the store. -- For a load, check the appropriate row valid bit. is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE); - hit_way := r1.store_way; + hit_way := replace_way; end if; -- Whether to use forwarded data for a load or not @@ -811,8 +811,12 @@ begin -- The way that matched on a hit req_hit_way <= hit_way; - -- The way to replace on a miss - replace_way <= to_integer(unsigned(plru_victim(req_index))); + -- The way to replace on a miss + if r1.write_tag = '1' then + replace_way <= to_integer(unsigned(plru_victim(r1.store_index))); + else + replace_way <= r1.store_way; + end if; -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP @@ -1079,7 +1083,7 @@ begin wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); wr_sel <= (others => '1'); - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r1.store_way = i then + if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then do_write <= '1'; end if; end if; @@ -1113,12 +1117,18 @@ begin end if; -- Fast path for load/store hits. Set signals for the writeback controls. + r1.hit_way <= req_hit_way; + r1.hit_index <= req_index; if req_op = OP_LOAD_HIT then - r1.hit_way <= req_hit_way; r1.hit_load_valid <= '1'; else r1.hit_load_valid <= '0'; end if; + if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then + r1.cache_hit <= '1'; + else + r1.cache_hit <= '0'; + end if; if req_op = OP_BAD then report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & @@ -1179,7 +1189,7 @@ begin r1.forward_data1 <= wishbone_in.dat; end if; r1.forward_sel1 <= (others => '1'); - r1.forward_way1 <= r1.store_way; + r1.forward_way1 <= replace_way; r1.forward_row1 <= r1.store_row; r1.forward_valid1 <= '0'; end if; @@ -1205,11 +1215,12 @@ begin if r1.write_tag = '1' then -- Store new tag in selected way for i in 0 to NUM_WAYS-1 loop - if i = r1.store_way then + if i = replace_way then cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <= (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag; end if; end loop; + r1.store_way <= replace_way; r1.write_tag <= '0'; end if; @@ -1224,7 +1235,6 @@ begin req.data := r0.req.data; req.byte_sel := r0.req.byte_sel; req.hit_way := req_hit_way; - req.repl_way := replace_way; req.same_tag := req_same_tag; -- Store the incoming request from r0, if it is a slow request @@ -1251,8 +1261,6 @@ begin if req.op = OP_STORE_HIT then r1.store_way <= req.hit_way; - else - r1.store_way <= req.repl_way; end if; -- Reset per-row valid bits, ready for handling OP_LOAD_MISS @@ -1269,7 +1277,6 @@ begin -- report "cache miss real addr:" & to_hstring(req.real_addr) & " idx:" & integer'image(get_index(req.real_addr)) & - " way:" & integer'image(req.repl_way) & " tag:" & to_hstring(get_tag(req.real_addr)); -- Start the wishbone cycle @@ -1310,7 +1317,9 @@ begin -- Handle the rest like a load miss r1.state <= RELOAD_WAIT_ACK; - r1.write_tag <= '1'; + if req.op = OP_STORE_MISS then + r1.write_tag <= '1'; + end if; r1.dcbz <= '1'; end if; r1.wb.we <= '1'; From 9160e29c56b3ae4da4b4b8e6abef04c95dd75f20 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Jul 2020 19:07:47 +1000 Subject: [PATCH 03/18] execute1: Ease timing on redirect_nia This eliminates a dependency of r.f.redirect_nia on the carry out from the main adder in the case of a conditional trap instruction. We can set r.f.redirect_nia unconditionally, even if no interrupt is generated. Signed-off-by: Paul Mackerras --- execute1.vhdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index 3b2007a..fb760d1 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -619,12 +619,12 @@ begin end loop; else -- trap instructions (tw, twi, td, tdi) + v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + -- set bit 46 to say trap occurred + ctrl_tmp.srr1(63 - 46) <= '1'; if or (trapval and insn_to(e_in.insn)) = '1' then -- generate trap-type program interrupt exception := '1'; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); - -- set bit 46 to say trap occurred - ctrl_tmp.srr1(63 - 46) <= '1'; report "trap"; end if; end if; From b2ba024a4885f026200bdfb6ef804ed0f9dd72ca Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Jul 2020 20:11:15 +1000 Subject: [PATCH 04/18] loadstore1: Eliminate two_dwords variable The computation of two_dwords from r.second_bytes has shown up as part of a critical path at times. Instead we add a 'last_dword' flag to the reg_stage_t record which tells us more directly whether a valid flag coming in from dcache means that the instruction is done, thereby shortening the path to the busy output back to execute1. This also simplifies some of the trim_ctl logic. The two_dwords = 0 case could never have use_second(i) = 1 for any of the bytes being transferred, so "not use_second(i)" is always 1. Signed-off-by: Paul Mackerras --- loadstore1.vhdl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/loadstore1.vhdl b/loadstore1.vhdl index cf00987..dd11221 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -69,6 +69,7 @@ architecture behave of loadstore1 is priv_mode : std_ulogic; state : state_t; dwords_done : std_ulogic; + last_dword : std_ulogic; first_bytes : std_ulogic_vector(7 downto 0); second_bytes : std_ulogic_vector(7 downto 0); dar : std_ulogic_vector(63 downto 0); @@ -146,7 +147,6 @@ begin variable wdata : std_ulogic_vector(63 downto 0); variable write_enable : std_ulogic; variable do_update : std_ulogic; - variable two_dwords : std_ulogic; variable done : std_ulogic; variable data_permuted : std_ulogic_vector(63 downto 0); variable data_trimmed : std_ulogic_vector(63 downto 0); @@ -174,7 +174,6 @@ begin write_enable := '0'; do_update := '0'; - two_dwords := or (r.second_bytes); -- load data formatting byte_offset := unsigned(r.addr(2 downto 0)); @@ -204,10 +203,10 @@ begin -- trim and sign-extend for i in 0 to 7 loop if i < to_integer(unsigned(r.length)) then - if two_dwords = '1' then + if r.dwords_done = '1' then trim_ctl(i) := '1' & not use_second(i); else - trim_ctl(i) := not use_second(i) & '0'; + trim_ctl(i) := "10"; end if; else trim_ctl(i) := '0' & (negative and r.sign_extend); @@ -237,6 +236,7 @@ begin byte_sel := r.second_bytes; req := '1'; v.state := ACK_WAIT; + v.last_dword := '0'; when ACK_WAIT => if d_in.valid = '1' then @@ -263,8 +263,9 @@ begin v.state := MMU_LOOKUP; end if; else - if two_dwords = '1' and r.dwords_done = '0' then + if r.last_dword = '0' then v.dwords_done := '1'; + v.last_dword := '1'; if r.load = '1' then v.load_data := data_permuted; end if; @@ -297,7 +298,7 @@ begin if r.instr_fault = '0' then -- retry the request now that the MMU has installed a TLB entry req := '1'; - if two_dwords = '1' and r.dwords_done = '0' then + if r.last_dword = '0' then v.state := SECOND_REQ; else v.state := ACK_WAIT; @@ -349,6 +350,7 @@ begin v.tlbie := '0'; v.instr_fault := '0'; v.dwords_done := '0'; + v.last_dword := '1'; v.write_reg := l_in.write_reg; v.length := l_in.length; v.byte_reverse := l_in.byte_reverse; From c01e1c7b91068e04751b19b8b22e4c452c94080b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Jul 2020 20:32:35 +1000 Subject: [PATCH 05/18] dcache: Update TLB PLRU one cycle later This puts the inputs to the TLB PLRU through a register stage, so the TLB PLRU update is done in the cycle after the TLB tag matching rather than the same cycle. This improves timing. The PLRU output is only used when writing the TLB in response to a tlbwe request from the MMU, and that doesn't happen within one cycle of a virtual-mode load or store, so the fact that the tlb victim way information is delayed by one cycle doesn't create any problems. Signed-off-by: Paul Mackerras --- dcache.vhdl | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 1c20de5..4d9843e 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -249,6 +249,11 @@ architecture rtl of dcache is hit_index : index_t; cache_hit : std_ulogic; + -- TLB hit state + tlb_hit : std_ulogic; + tlb_hit_way : tlb_way_t; + tlb_hit_index : tlb_index_t; + -- 2-stage data buffer for data forwarded from writes to reads forward_data1 : std_ulogic_vector(63 downto 0); forward_data2 : std_ulogic_vector(63 downto 0); @@ -567,15 +572,15 @@ begin lru => tlb_plru_out ); - process(tlb_req_index, tlb_hit, tlb_hit_way, tlb_plru_out) + process(all) begin -- PLRU interface - if tlb_hit = '1' and tlb_req_index = i then - tlb_plru_acc_en <= '1'; + if r1.tlb_hit_index = i then + tlb_plru_acc_en <= r1.tlb_hit; else tlb_plru_acc_en <= '0'; end if; - tlb_plru_acc <= std_ulogic_vector(to_unsigned(tlb_hit_way, TLB_WAY_BITS)); + tlb_plru_acc <= std_ulogic_vector(to_unsigned(r1.tlb_hit_way, TLB_WAY_BITS)); tlb_plru_victim(i) <= tlb_plru_out; end process; end generate; @@ -1146,6 +1151,11 @@ begin r1.stcx_fail <= '0'; end if; + -- Record TLB hit information for updating TLB PLRU + r1.tlb_hit <= tlb_hit; + r1.tlb_hit_way <= tlb_hit_way; + r1.tlb_hit_index <= tlb_req_index; + -- complete tlbies and TLB loads in the third cycle r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); end if; From 1be6fbac3333a7f99c0cf7151100e8032f4cea50 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 11 Jul 2020 09:10:24 +1000 Subject: [PATCH 06/18] dcache: Remove dependency of r1.wb.adr/dat/sel on req_op This improves timing by setting r1.wb.{adr,dat,sel} to the next request when doing a write cycle on the wishbone before we know whether the next request has a TLB and cache hit or not, i.e. without depending on req_op. r1.wb.stb still depends on req_op. This contains a workaround for what is probably a bug elsewhere, in that changing r1.wb.sel unconditionally once we see stall=0 from the wishbone causes incorrect behaviour. Making it conditional on there being a valid following request appears to fix the problem. Signed-off-by: Paul Mackerras --- dcache.vhdl | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 4d9843e..4c1db9b 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -226,6 +226,7 @@ architecture rtl of dcache is type mem_access_request_t is record op : op_t; + valid : std_ulogic; dcbz : std_ulogic; real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); data : std_ulogic_vector(63 downto 0); @@ -309,6 +310,7 @@ architecture rtl of dcache is signal req_op : op_t; signal req_data : std_ulogic_vector(63 downto 0); signal req_same_tag : std_ulogic; + signal req_go : std_ulogic; signal early_req_row : row_t; @@ -856,6 +858,7 @@ begin end if; end if; req_op <= op; + req_go <= go; -- Version of the row number that is valid one cycle earlier -- in the cases where we need to read the cache data BRAM. @@ -1240,6 +1243,7 @@ begin req := r1.req; else req.op := req_op; + req.valid := req_go; req.dcbz := r0.req.dcbz; req.real_addr := ra; req.data := r0.req.data; @@ -1402,11 +1406,14 @@ begin if wishbone_in.stall = '0' then -- See if there is another store waiting to be done -- which is in the same real page. - if acks < 7 and req.same_tag = '1' and - (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then - r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); + if req.valid = '1' then + r1.wb.adr(SET_SIZE_BITS - 1 downto 0) <= + req.real_addr(SET_SIZE_BITS - 1 downto 0); r1.wb.dat <= req.data; r1.wb.sel <= req.byte_sel; + end if; + if acks < 7 and req.same_tag = '1' and + (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then r1.wb.stb <= '1'; stbs_done := false; if req.op = OP_STORE_HIT then From 1f2058a0edca1d53157c3e87425118712d89004f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 11 Jul 2020 11:00:53 +1000 Subject: [PATCH 07/18] MMU: Improve timing of done signal back to loadstore1 This makes the l_out.done signal come from a clean latch, which improves timing. The cost is that TLB load and invalidation operations to the dcache now signal done back to loadstore1 one cycle later than before, but that doesn't seem to affect overall performance noticeably. Signed-off-by: Paul Mackerras --- mmu.vhdl | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/mmu.vhdl b/mmu.vhdl index fc2dd7a..f1d69e2 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -35,7 +35,7 @@ architecture behave of mmu is RADIX_LOOKUP, RADIX_READ_WAIT, RADIX_LOAD_TLB, - RADIX_ERROR + RADIX_FINISH ); type reg_stage_t is record @@ -51,6 +51,7 @@ architecture behave of mmu is pid : std_ulogic_vector(31 downto 0); -- internal state state : state_t; + done : std_ulogic; pgtbl0 : std_ulogic_vector(63 downto 0); pt0_valid : std_ulogic; pgtbl3 : std_ulogic_vector(63 downto 0); @@ -176,7 +177,6 @@ begin mmu_1: process(all) variable v : reg_stage_t; variable dcreq : std_ulogic; - variable done : std_ulogic; variable tlb_load : std_ulogic; variable itlb_load : std_ulogic; variable tlbie_req : std_ulogic; @@ -199,7 +199,7 @@ begin v := r; v.valid := '0'; dcreq := '0'; - done := '0'; + v.done := '0'; v.invalid := '0'; v.badtree := '0'; v.segerror := '0'; @@ -262,7 +262,7 @@ begin v.state := PROC_TBL_READ; elsif mbits = 0 then -- Use RPDS = 0 to disable radix tree walks - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.invalid := '1'; else v.state := SEGMENT_CHECK; @@ -291,8 +291,7 @@ begin when TLB_WAIT => if d_in.done = '1' then - done := '1'; - v.state := IDLE; + v.state := RADIX_FINISH; end if; when PROC_TBL_READ => @@ -319,13 +318,13 @@ begin v.mask_size := mbits(4 downto 0); v.pgbase := data(55 downto 8) & x"00"; if mbits = 0 then - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.invalid := '1'; else v.state := SEGMENT_CHECK; end if; else - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.badtree := '1'; end if; end if; @@ -335,10 +334,10 @@ begin v.shift := r.shift + (31 - 12) - mbits; nonzero := or(r.addr(61 downto 31) and not finalmask(30 downto 0)); if r.addr(63) /= r.addr(62) or nonzero = '1' then - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.segerror := '1'; elsif mbits < 5 or mbits > 16 or mbits > (r.shift + (31 - 12)) then - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.badtree := '1'; else v.state := RADIX_LOOKUP; @@ -371,7 +370,7 @@ begin if perm_ok = '1' and rc_ok = '1' then v.state := RADIX_LOAD_TLB; else - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.perm_err := not perm_ok; -- permission error takes precedence over RC error v.rc_error := perm_ok; @@ -379,7 +378,7 @@ begin else mbits := unsigned('0' & data(4 downto 0)); if mbits < 5 or mbits > 16 or mbits > r.shift then - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.badtree := '1'; else v.shift := v.shift - mbits; @@ -390,11 +389,11 @@ begin end if; else -- non-present PTE, generate a DSI - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.invalid := '1'; end if; else - v.state := RADIX_ERROR; + v.state := RADIX_FINISH; v.badtree := '1'; end if; end if; @@ -406,16 +405,18 @@ begin v.state := TLB_WAIT; else itlb_load := '1'; - done := '1'; v.state := IDLE; end if; - when RADIX_ERROR => - done := '1'; + when RADIX_FINISH => v.state := IDLE; end case; + if v.state = RADIX_FINISH or (v.state = RADIX_LOAD_TLB and r.iside = '1') then + v.done := '1'; + end if; + if r.addr(63) = '1' then effpid := x"00000000"; else @@ -451,7 +452,7 @@ begin tlb_data := (others => '0'); end if; - l_out.done <= done; + l_out.done <= r.done; l_out.invalid <= r.invalid; l_out.badtree <= r.badtree; l_out.segerr <= r.segerror; From 03a3a5d326d8c79f4fd14668534571049d70eaf7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 11 Jul 2020 12:05:43 +1000 Subject: [PATCH 08/18] countzero: Faster algorithm for count leading/trailing zeroes This uses an algorithm for count leading/trailing zeroes that is faster on FPGAs, which makes timing easier. cntlz* and cnttz* still take two cycles, though. For count trailing zeroes, we compute x & -x, which for non-zero x has a single 1 bit in the position of the least-significant 1 bit in x. This one-hot representation can then be converted to a bit number with six 32-input OR gates. For count leading zeroes, we simply do a bit-reversal on x and then use the same algorithm. Signed-off-by: Paul Mackerras --- countzero.vhdl | 156 ++++++++++++++++++------------------------------- 1 file changed, 57 insertions(+), 99 deletions(-) diff --git a/countzero.vhdl b/countzero.vhdl index 50e6ead..18aa043 100644 --- a/countzero.vhdl +++ b/countzero.vhdl @@ -15,123 +15,81 @@ entity zero_counter is end entity zero_counter; architecture behaviour of zero_counter is - type intermediate_result is record - v16: std_ulogic_vector(15 downto 0); - sel_hi: std_ulogic_vector(1 downto 0); - is_32bit: std_ulogic; - count_right: std_ulogic; - end record; - - signal r, r_in : intermediate_result; + -- Reverse the order of bits in a word + function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is + variable ret: std_ulogic_vector(a'left downto a'right); + begin + for i in a'right to a'left loop + ret(a'left + a'right - i) := a(i); + end loop; + return ret; + end; - -- Return the index of the leftmost or rightmost 1 in a set of 4 bits. - -- Assumes v is not "0000"; if it is, return (right ? "11" : "00"). - function encoder(v: std_ulogic_vector(3 downto 0); right: std_ulogic) return std_ulogic_vector is + -- If there is only one bit set in a doubleword, return its bit number + -- (counting from the right). Each bit of the result is obtained by + -- ORing together 32 bits of the input: + -- bit 0 = a[1] or a[3] or a[5] or ... + -- bit 1 = a[2] or a[3] or a[6] or a[7] or ... + -- bit 2 = a[4..7] or a[12..15] or ... + -- bit 5 = a[32..63] ORed together + function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable ret: std_ulogic_vector(5 downto 0); + variable stride: natural; + variable bit: std_ulogic; + variable k: natural; begin - if right = '0' then - if v(3) = '1' then - return "11"; - elsif v(2) = '1' then - return "10"; - elsif v(1) = '1' then - return "01"; - else - return "00"; - end if; - else - if v(0) = '1' then - return "00"; - elsif v(1) = '1' then - return "01"; - elsif v(2) = '1' then - return "10"; - else - return "11"; - end if; - end if; + stride := 2; + for i in 0 to 5 loop + bit := '0'; + for j in 0 to (64 / stride) - 1 loop + k := j * stride; + bit := bit or (or a(k + stride - 1 downto k + (stride / 2))); + end loop; + ret(i) := bit; + stride := stride * 2; + end loop; + return ret; end; + signal inp : std_ulogic_vector(63 downto 0); + signal sum : std_ulogic_vector(64 downto 0); + signal msb_r : std_ulogic; + signal onehot : std_ulogic_vector(63 downto 0); + signal onehot_r : std_ulogic_vector(63 downto 0); + signal bitnum : std_ulogic_vector(5 downto 0); + begin - zerocounter_0: process(clk) + countzero_r: process(clk) begin - if rising_edge(clk) then - r <= r_in; + if rising_edge(clk) then + msb_r <= sum(64); + onehot_r <= onehot; end if; end process; - zerocounter_1: process(all) - variable v: intermediate_result; - variable y, z: std_ulogic_vector(3 downto 0); - variable sel: std_ulogic_vector(5 downto 0); - variable v4: std_ulogic_vector(3 downto 0); - + countzero: process(all) begin - -- Test 4 groups of 16 bits each. - -- The top 2 groups are considered to be zero in 32-bit mode. - z(0) := or (rs(15 downto 0)); - z(1) := or (rs(31 downto 16)); - z(2) := or (rs(47 downto 32)); - z(3) := or (rs(63 downto 48)); if is_32bit = '0' then - v.sel_hi := encoder(z, count_right); + if count_right = '0' then + inp <= bit_reverse(rs); + else + inp <= rs; + end if; else - v.sel_hi(1) := '0'; + inp(63 downto 32) <= x"FFFFFFFF"; if count_right = '0' then - v.sel_hi(0) := z(1); + inp(31 downto 0) <= bit_reverse(rs(31 downto 0)); else - v.sel_hi(0) := not z(0); + inp(31 downto 0) <= rs(31 downto 0); end if; end if; - -- Select the leftmost/rightmost non-zero group of 16 bits - case v.sel_hi is - when "00" => - v.v16 := rs(15 downto 0); - when "01" => - v.v16 := rs(31 downto 16); - when "10" => - v.v16 := rs(47 downto 32); - when others => - v.v16 := rs(63 downto 48); - end case; - - -- Latch this and do the rest in the next cycle, for the sake of timing - v.is_32bit := is_32bit; - v.count_right := count_right; - r_in <= v; - sel(5 downto 4) := r.sel_hi; - - -- Test 4 groups of 4 bits - y(0) := or (r.v16(3 downto 0)); - y(1) := or (r.v16(7 downto 4)); - y(2) := or (r.v16(11 downto 8)); - y(3) := or (r.v16(15 downto 12)); - sel(3 downto 2) := encoder(y, r.count_right); - - -- Select the leftmost/rightmost non-zero group of 4 bits - case sel(3 downto 2) is - when "00" => - v4 := r.v16(3 downto 0); - when "01" => - v4 := r.v16(7 downto 4); - when "10" => - v4 := r.v16(11 downto 8); - when others => - v4 := r.v16(15 downto 12); - end case; - - sel(1 downto 0) := encoder(v4, r.count_right); + sum <= std_ulogic_vector(unsigned('0' & not inp) + 1); + onehot <= sum(63 downto 0) and inp; - -- sel is now the index of the leftmost/rightmost 1 bit in rs - if v4 = "0000" then - -- operand is zero, return 32 for 32-bit, else 64 - result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000"; - elsif r.count_right = '0' then - -- return (63 - sel), trimmed to 5 bits in 32-bit mode - result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0); - else - result <= x"00000000000000" & "00" & sel; - end if; + -- The following occurs after a clock edge + bitnum <= bit_number(onehot_r); + result <= x"00000000000000" & "0" & msb_r & bitnum; end process; end behaviour; From 893d2bc6a24f9d1dbae374e7aa01b2abda57d140 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 11 Jul 2020 15:40:27 +1000 Subject: [PATCH 09/18] core: Don't generate logic for log data when LOG_LENGTH = 0 This adds "if LOG_LENGTH > 0 generate" to the places in the core where log output data is latched, so that when LOG_LENGTH = 0 we don't create the logic to collect the data which won't be stored. Signed-off-by: Paul Mackerras --- core.vhdl | 24 +++++++++++++----- cr_file.vhdl | 25 +++++++++++-------- dcache.vhdl | 42 +++++++++++++++++-------------- decode1.vhdl | 28 +++++++++++++-------- decode2.vhdl | 34 ++++++++++++++----------- execute1.vhdl | 41 ++++++++++++++++-------------- icache.vhdl | 62 ++++++++++++++++++++++++---------------------- loadstore1.vhdl | 37 ++++++++++++++++----------- register_file.vhdl | 26 +++++++++++-------- 9 files changed, 186 insertions(+), 133 deletions(-) diff --git a/core.vhdl b/core.vhdl index 4a83d69..c7dd3f6 100644 --- a/core.vhdl +++ b/core.vhdl @@ -202,7 +202,8 @@ begin SIM => SIM, LINE_SIZE => 64, NUM_LINES => 64, - NUM_WAYS => 2 + NUM_WAYS => 2, + LOG_LENGTH => LOG_LENGTH ) port map( clk => clk, @@ -222,6 +223,9 @@ begin icache_stall_in <= decode1_busy; decode1_0: entity work.decode1 + generic map( + LOG_LENGTH => LOG_LENGTH + ) port map ( clk => clk, rst => rst_dec1, @@ -239,7 +243,8 @@ begin decode2_0: entity work.decode2 generic map ( - EX1_BYPASS => EX1_BYPASS + EX1_BYPASS => EX1_BYPASS, + LOG_LENGTH => LOG_LENGTH ) port map ( clk => clk, @@ -261,7 +266,8 @@ begin register_file_0: entity work.register_file generic map ( - SIM => SIM + SIM => SIM, + LOG_LENGTH => LOG_LENGTH ) port map ( clk => clk, @@ -279,7 +285,8 @@ begin cr_file_0: entity work.cr_file generic map ( - SIM => SIM + SIM => SIM, + LOG_LENGTH => LOG_LENGTH ) port map ( clk => clk, @@ -292,7 +299,8 @@ begin execute1_0: entity work.execute1 generic map ( - EX1_BYPASS => EX1_BYPASS + EX1_BYPASS => EX1_BYPASS, + LOG_LENGTH => LOG_LENGTH ) port map ( clk => clk, @@ -315,6 +323,9 @@ begin ); loadstore1_0: entity work.loadstore1 + generic map ( + LOG_LENGTH => LOG_LENGTH + ) port map ( clk => clk, rst => rst_ls1, @@ -344,7 +355,8 @@ begin generic map( LINE_SIZE => 64, NUM_LINES => 64, - NUM_WAYS => 2 + NUM_WAYS => 2, + LOG_LENGTH => LOG_LENGTH ) port map ( clk => clk, diff --git a/cr_file.vhdl b/cr_file.vhdl index 37fa76b..3e65663 100644 --- a/cr_file.vhdl +++ b/cr_file.vhdl @@ -7,7 +7,9 @@ use work.common.all; entity cr_file is generic ( - SIM : boolean := false + SIM : boolean := false; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port( clk : in std_logic; @@ -29,7 +31,6 @@ architecture behaviour of cr_file is signal crs_updated : std_ulogic_vector(31 downto 0); signal xerc : xer_common_t := xerc_init; signal xerc_updated : xer_common_t; - signal log_data : std_ulogic_vector(12 downto 0); begin cr_create_0: process(all) variable hi, lo : integer := 0; @@ -91,14 +92,18 @@ begin end process; end generate; - cr_log: process(clk) + cf_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(12 downto 0); begin - if rising_edge(clk) then - log_data <= w_in.write_cr_enable & - w_in.write_cr_data(31 downto 28) & - w_in.write_cr_mask; - end if; - end process; - log_out <= log_data; + cr_log: process(clk) + begin + if rising_edge(clk) then + log_data <= w_in.write_cr_enable & + w_in.write_cr_data(31 downto 28) & + w_in.write_cr_mask; + end if; + end process; + log_out <= log_data; + end generate; end architecture behaviour; diff --git a/dcache.vhdl b/dcache.vhdl index 4c1db9b..ac92305 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -31,7 +31,9 @@ entity dcache is -- L1 DTLB number of sets TLB_NUM_WAYS : positive := 2; -- L1 DTLB log_2(page_size) - TLB_LG_PGSZ : positive := 12 + TLB_LG_PGSZ : positive := 12; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port ( clk : in std_ulogic; @@ -463,8 +465,6 @@ architecture rtl of dcache is ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; end; - signal log_data : std_ulogic_vector(19 downto 0); - begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -1460,21 +1460,25 @@ begin end if; end process; - dcache_log: process(clk) + dc_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(19 downto 0); begin - if rising_edge(clk) then - log_data <= r1.wb.adr(5 downto 3) & - wishbone_in.stall & - wishbone_in.ack & - r1.wb.stb & r1.wb.cyc & - d_out.error & - d_out.valid & - std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) & - stall_out & - std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) & - valid_ra & - std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3)); - end if; - end process; - log_out <= log_data; + dcache_log: process(clk) + begin + if rising_edge(clk) then + log_data <= r1.wb.adr(5 downto 3) & + wishbone_in.stall & + wishbone_in.ack & + r1.wb.stb & r1.wb.cyc & + d_out.error & + d_out.valid & + std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) & + stall_out & + std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) & + valid_ra & + std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3)); + end if; + end process; + log_out <= log_data; + end generate; end; diff --git a/decode1.vhdl b/decode1.vhdl index 29b7a05..d8c2b61 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -7,6 +7,10 @@ use work.common.all; use work.decode_types.all; entity decode1 is + generic ( + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 + ); port ( clk : in std_ulogic; rst : in std_ulogic; @@ -357,8 +361,6 @@ architecture behaviour of decode1 is constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); - signal log_data : std_ulogic_vector(12 downto 0); - begin decode1_0: process(clk) begin @@ -524,15 +526,19 @@ begin flush_out <= f.redirect; end process; - dec1_log : process(clk) + d1_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(12 downto 0); begin - if rising_edge(clk) then - log_data <= std_ulogic_vector(to_unsigned(insn_type_t'pos(r.decode.insn_type), 6)) & - r.nia(5 downto 2) & - std_ulogic_vector(to_unsigned(unit_t'pos(r.decode.unit), 2)) & - r.valid; - end if; - end process; - log_out <= log_data; + dec1_log : process(clk) + begin + if rising_edge(clk) then + log_data <= std_ulogic_vector(to_unsigned(insn_type_t'pos(r.decode.insn_type), 6)) & + r.nia(5 downto 2) & + std_ulogic_vector(to_unsigned(unit_t'pos(r.decode.unit), 2)) & + r.valid; + end if; + end process; + log_out <= log_data; + end generate; end architecture behaviour; diff --git a/decode2.vhdl b/decode2.vhdl index d724874..62c574c 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -10,7 +10,9 @@ use work.insn_helpers.all; entity decode2 is generic ( - EX1_BYPASS : boolean := true + EX1_BYPASS : boolean := true; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port ( clk : in std_ulogic; @@ -47,8 +49,6 @@ architecture behaviour of decode2 is signal deferred : std_ulogic; - signal log_data : std_ulogic_vector(9 downto 0); - type decode_input_reg_t is record reg_valid : std_ulogic; reg : gspr_index_t; @@ -415,18 +415,22 @@ begin e_out <= r.e; end process; - dec2_log : process(clk) + d2_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(9 downto 0); begin - if rising_edge(clk) then - log_data <= r.e.nia(5 downto 2) & - r.e.valid & - stopped_out & - stall_out & - r.e.bypass_data3 & - r.e.bypass_data2 & - r.e.bypass_data1; - end if; - end process; - log_out <= log_data; + dec2_log : process(clk) + begin + if rising_edge(clk) then + log_data <= r.e.nia(5 downto 2) & + r.e.valid & + stopped_out & + stall_out & + r.e.bypass_data3 & + r.e.bypass_data2 & + r.e.bypass_data1; + end if; + end process; + log_out <= log_data; + end generate; end architecture behaviour; diff --git a/execute1.vhdl b/execute1.vhdl index fb760d1..2722570 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -12,7 +12,9 @@ use work.ppc_fx_insns.all; entity execute1 is generic ( - EX1_BYPASS : boolean := true + EX1_BYPASS : boolean := true; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port ( clk : in std_ulogic; @@ -97,7 +99,6 @@ architecture behaviour of execute1 is -- signals for logging signal exception_log : std_ulogic; signal irq_valid_log : std_ulogic; - signal log_data : std_ulogic_vector(14 downto 0); type privilege_level is (USER, SUPER); type op_privilege_array is array(insn_type_t) of privilege_level; @@ -1083,21 +1084,25 @@ begin irq_valid_log <= irq_valid; end process; - ex1_log : process(clk) + e1_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(14 downto 0); begin - if rising_edge(clk) then - log_data <= ctrl.msr(MSR_EE) & ctrl.msr(MSR_PR) & - ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & - exception_log & - irq_valid_log & - std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) & - "000" & - r.e.write_enable & - r.e.valid & - f_out.redirect & - r.busy & - flush_out; - end if; - end process; - log_out <= log_data; + ex1_log : process(clk) + begin + if rising_edge(clk) then + log_data <= ctrl.msr(MSR_EE) & ctrl.msr(MSR_PR) & + ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & + exception_log & + irq_valid_log & + std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) & + "000" & + r.e.write_enable & + r.e.valid & + f_out.redirect & + r.busy & + flush_out; + end if; + end process; + log_out <= log_data; + end generate; end architecture behaviour; diff --git a/icache.vhdl b/icache.vhdl index 0aebbc2..3f1c15f 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -47,7 +47,9 @@ entity icache is -- L1 ITLB log_2(page_size) TLB_LG_PGSZ : positive := 12; -- Number of real address bits that we store - REAL_ADDR_BITS : positive := 56 + REAL_ADDR_BITS : positive := 56; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port ( clk : in std_ulogic; @@ -207,9 +209,6 @@ architecture rtl of icache is signal access_ok : std_ulogic; signal use_previous : std_ulogic; - -- Output data to logger - signal log_data : std_ulogic_vector(53 downto 0); - -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -729,31 +728,36 @@ begin end if; end process; - data_log: process(clk) - variable lway: way_t; - variable wstate: std_ulogic; + icache_log: if LOG_LENGTH > 0 generate + -- Output data to logger + signal log_data : std_ulogic_vector(53 downto 0); begin - if rising_edge(clk) then - lway := req_hit_way; - wstate := '0'; - if r.state /= IDLE then - wstate := '1'; + data_log: process(clk) + variable lway: way_t; + variable wstate: std_ulogic; + begin + if rising_edge(clk) then + lway := req_hit_way; + wstate := '0'; + if r.state /= IDLE then + wstate := '1'; + end if; + log_data <= i_out.valid & + i_out.insn & + wishbone_in.ack & + r.wb.adr(5 downto 3) & + r.wb.stb & r.wb.cyc & + wishbone_in.stall & + stall_out & + r.fetch_failed & + r.hit_nia(5 downto 2) & + wstate & + std_ulogic_vector(to_unsigned(lway, 3)) & + req_is_hit & req_is_miss & + access_ok & + ra_valid; end if; - log_data <= i_out.valid & - i_out.insn & - wishbone_in.ack & - r.wb.adr(5 downto 3) & - r.wb.stb & r.wb.cyc & - wishbone_in.stall & - stall_out & - r.fetch_failed & - r.hit_nia(5 downto 2) & - wstate & - std_ulogic_vector(to_unsigned(lway, 3)) & - req_is_hit & req_is_miss & - access_ok & - ra_valid; - end if; - end process; - log_out <= log_data; + end process; + log_out <= log_data; + end generate; end; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index dd11221..0387e40 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -10,6 +10,10 @@ use work.common.all; -- We calculate the address in the first cycle entity loadstore1 is + generic ( + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 + ); port ( clk : in std_ulogic; rst : in std_ulogic; @@ -85,8 +89,6 @@ architecture behave of loadstore1 is signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); - signal log_data : std_ulogic_vector(9 downto 0); - -- Generate byte enables from sizes function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is begin @@ -515,18 +517,23 @@ begin end process; - ls1_log: process(clk) + l1_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(9 downto 0); begin - if rising_edge(clk) then - log_data <= e_out.busy & - e_out.exception & - l_out.valid & - m_out.valid & - d_out.valid & - m_in.done & - r.dwords_done & - std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3)); - end if; - end process; - log_out <= log_data; + ls1_log: process(clk) + begin + if rising_edge(clk) then + log_data <= e_out.busy & + e_out.exception & + l_out.valid & + m_out.valid & + d_out.valid & + m_in.done & + r.dwords_done & + std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3)); + end if; + end process; + log_out <= log_data; + end generate; + end; diff --git a/register_file.vhdl b/register_file.vhdl index 260255e..10f28a4 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -7,7 +7,9 @@ use work.common.all; entity register_file is generic ( - SIM : boolean := false + SIM : boolean := false; + -- Non-zero to enable log data collection + LOG_LENGTH : natural := 0 ); port( clk : in std_logic; @@ -36,7 +38,6 @@ architecture behaviour of register_file is signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); signal dbg_ack : std_ulogic; - signal log_data : std_ulogic_vector(70 downto 0); begin -- synchronous writes register_write_0: process(clk) @@ -134,13 +135,18 @@ begin sim_dump_done <= '0'; end generate; - reg_log: process(clk) + rf_log: if LOG_LENGTH > 0 generate + signal log_data : std_ulogic_vector(70 downto 0); begin - if rising_edge(clk) then - log_data <= w_in.write_data & - w_in.write_enable & - w_in.write_reg; - end if; - end process; - log_out <= log_data; + reg_log: process(clk) + begin + if rising_edge(clk) then + log_data <= w_in.write_data & + w_in.write_enable & + w_in.write_reg; + end if; + end process; + log_out <= log_data; + end generate; + end architecture behaviour; From dc8980d5a5dfab744e0df551a63208994927802d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 11 Jul 2020 17:46:03 +1000 Subject: [PATCH 10/18] dcache: Improve timing of valid/done outputs This makes d_out.valid and m_out.done come directly from registers in order to improve timing. The inputs to the registers are set by the same conditions that cause r1.hit_load_valid, r1.slow_valid, r1.error_done and r1.stcx_fail to be set. Note that the STORE_WAIT_ACK state doesn't test r1.mmu_req but assumes that the request came from loadstore1. This is because we normally have r1.full = 0 in this state, which means that r1.mmu_req can change at any time. Signed-off-by: Paul Mackerras --- dcache.vhdl | 68 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index ac92305..5cbc910 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -235,6 +235,7 @@ architecture rtl of dcache is byte_sel : std_ulogic_vector(7 downto 0); hit_way : way_t; same_tag : std_ulogic; + mmu_req : std_ulogic; end record; -- First stage register, contains state for stage 1 of load hits @@ -282,15 +283,14 @@ architecture rtl of dcache is rows_valid : row_per_line_valid_t; acks_pending : unsigned(2 downto 0); - -- Signals to complete with error + -- Signals to complete (possibly with error) + ls_valid : std_ulogic; + mmu_done : std_ulogic; error_done : std_ulogic; cache_paradox : std_ulogic; -- Signal to complete a failed stcx. stcx_fail : std_ulogic; - - -- completion signal for tlbie - tlbie_done : std_ulogic; end record; signal r1 : reg_stage_1_t; @@ -940,15 +940,15 @@ begin end if; end loop; - d_out.valid <= '0'; + d_out.valid <= r1.ls_valid; d_out.data <= data_out; - d_out.store_done <= '0'; - d_out.error <= '0'; - d_out.cache_paradox <= '0'; + d_out.store_done <= not r1.stcx_fail; + d_out.error <= r1.error_done; + d_out.cache_paradox <= r1.cache_paradox; -- Outputs to MMU - m_out.done <= r1.tlbie_done; - m_out.err <= '0'; + m_out.done <= r1.mmu_done; + m_out.err <= r1.error_done; m_out.data <= data_out; -- We have a valid load or store hit or we just completed a slow @@ -974,47 +974,32 @@ begin -- Load hit case is the standard path if r1.hit_load_valid = '1' then report "completing load hit data=" & to_hstring(data_out); - d_out.valid <= '1'; end if; -- error cases complete without stalling if r1.error_done = '1' then report "completing ld/st with error"; - d_out.error <= '1'; - d_out.cache_paradox <= r1.cache_paradox; - d_out.valid <= '1'; end if; -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then - d_out.store_done <= '1'; report "completing store or load miss data=" & to_hstring(data_out); - d_out.valid <= '1'; - end if; - - if r1.stcx_fail = '1' then - d_out.store_done <= '0'; - d_out.valid <= '1'; end if; else -- Request came from MMU if r1.hit_load_valid = '1' then report "completing load hit to MMU, data=" & to_hstring(m_out.data); - m_out.done <= '1'; end if; -- error cases complete without stalling if r1.error_done = '1' then report "completing MMU ld with error"; - m_out.err <= '1'; - m_out.done <= '1'; end if; -- Slow ops (i.e. load miss) if r1.slow_valid = '1' then report "completing MMU load miss, data=" & to_hstring(m_out.data); - m_out.done <= '1'; end if; end if; @@ -1159,8 +1144,6 @@ begin r1.tlb_hit_way <= tlb_hit_way; r1.tlb_hit_index <= tlb_req_index; - -- complete tlbies and TLB loads in the third cycle - r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); end if; end process; @@ -1217,6 +1200,8 @@ begin r1.slow_valid <= '0'; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; + r1.ls_valid <= '0'; + r1.mmu_done <= '0'; -- Not useful normally but helps avoiding tons of sim warnings r1.wb.adr <= (others => '0'); @@ -1225,6 +1210,17 @@ begin r1.slow_valid <= '0'; r1.write_bram <= '0'; + r1.ls_valid <= '0'; + -- complete tlbies and TLB loads in the third cycle + r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); + if req_op = OP_LOAD_HIT or req_op = OP_BAD or req_op = OP_STCX_FAIL then + if r0.mmu_req = '0' then + r1.ls_valid <= '1'; + else + r1.mmu_done <= '1'; + end if; + end if; + if r1.write_tag = '1' then -- Store new tag in selected way for i in 0 to NUM_WAYS-1 loop @@ -1244,6 +1240,7 @@ begin else req.op := req_op; req.valid := req_go; + req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; req.real_addr := ra; req.data := r0.req.data; @@ -1318,6 +1315,11 @@ begin r1.acks_pending <= to_unsigned(1, 3); r1.full <= '0'; r1.slow_valid <= '1'; + if req.mmu_req = '0' then + r1.ls_valid <= '1'; + else + r1.mmu_done <= '1'; + end if; if req.op = OP_STORE_HIT then r1.write_bram <= '1'; end if; @@ -1380,6 +1382,11 @@ begin r1.store_row = get_row(r1.req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; + if r1.mmu_req = '0' then + r1.ls_valid <= '1'; + else + r1.mmu_done <= '1'; + end if; r1.forward_sel <= (others => '1'); r1.use_forward1 <= '1'; end if; @@ -1421,6 +1428,8 @@ begin end if; r1.full <= '0'; r1.slow_valid <= '1'; + -- Store requests never come from the MMU + r1.ls_valid <= '1'; acks := acks + 1; else r1.wb.stb <= '0'; @@ -1450,6 +1459,11 @@ begin r1.state <= IDLE; r1.full <= '0'; r1.slow_valid <= '1'; + if r1.mmu_req = '0' then + r1.ls_valid <= '1'; + else + r1.mmu_done <= '1'; + end if; r1.forward_sel <= (others => '1'); r1.use_forward1 <= '1'; r1.wb.cyc <= '0'; From 56420e74f3e92d09f02db12ca09bdedb95018ef7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 11 Jul 2020 22:23:31 +1000 Subject: [PATCH 11/18] dcache: Ease timing on calculation of acks remaining This moves the incrementing or decrementing of r1.acks_pending to the cycle after a strobe is output or an ack is seen on the wishbone, and simplifies the logic that determines whether the cycle is now complete. This means that the path from seeing req_op equal to OP_STORE_HIT or OP_STORE_MISS to setting r1.state and r1.cyc now just involves the stbs_done bit rather than a more complex calculation involving the possibly incremented r1.acks_pending. Signed-off-by: Paul Mackerras --- dcache.vhdl | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 5cbc910..a24dc15 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -282,6 +282,8 @@ architecture rtl of dcache is end_row_ix : row_in_line_t; rows_valid : row_per_line_valid_t; acks_pending : unsigned(2 downto 0); + inc_acks : std_ulogic; + dec_acks : std_ulogic; -- Signals to complete (possibly with error) ls_valid : std_ulogic; @@ -1209,6 +1211,8 @@ begin -- One cycle pulses reset r1.slow_valid <= '0'; r1.write_bram <= '0'; + r1.inc_acks <= '0'; + r1.dec_acks <= '0'; r1.ls_valid <= '0'; -- complete tlbies and TLB loads in the third cycle @@ -1409,6 +1413,14 @@ begin when STORE_WAIT_ACK => stbs_done := r1.wb.stb = '0'; acks := r1.acks_pending; + if r1.inc_acks /= r1.dec_acks then + if r1.inc_acks = '1' then + acks := acks + 1; + else + acks := acks - 1; + end if; + end if; + r1.acks_pending <= acks; -- Clear stb when slave accepted request if wishbone_in.stall = '0' then -- See if there is another store waiting to be done @@ -1430,7 +1442,8 @@ begin r1.slow_valid <= '1'; -- Store requests never come from the MMU r1.ls_valid <= '1'; - acks := acks + 1; + stbs_done := false; + r1.inc_acks <= '1'; else r1.wb.stb <= '0'; stbs_done := true; @@ -1444,9 +1457,8 @@ begin r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; - acks := acks - 1; + r1.dec_acks <= '1'; end if; - r1.acks_pending <= acks; when NC_LOAD_WAIT_ACK => -- Clear stb when slave accepted request From c180ed0af0f2cee202ae81df75c89b81341c150c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 12 Jul 2020 20:05:53 +1000 Subject: [PATCH 12/18] dcache: Output separate done-without-error and error-done signals This reduces the complexity of the logic in the places where these signals are used. Signed-off-by: Paul Mackerras --- dcache.vhdl | 21 +++++---- loadstore1.vhdl | 73 +++++++++++++++-------------- mmu.vhdl | 120 ++++++++++++++++++++++++------------------------ 3 files changed, 107 insertions(+), 107 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index a24dc15..08b1664 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -287,8 +287,9 @@ architecture rtl of dcache is -- Signals to complete (possibly with error) ls_valid : std_ulogic; + ls_error : std_ulogic; mmu_done : std_ulogic; - error_done : std_ulogic; + mmu_error : std_ulogic; cache_paradox : std_ulogic; -- Signal to complete a failed stcx. @@ -739,7 +740,7 @@ begin req_row <= get_row(r0.req.addr); req_tag <= get_tag(ra); - go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.error_done; + go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.ls_error; -- Test if pending request is a hit on any way -- In order to make timing in virtual mode, when we are using the TLB, @@ -945,12 +946,12 @@ begin d_out.valid <= r1.ls_valid; d_out.data <= data_out; d_out.store_done <= not r1.stcx_fail; - d_out.error <= r1.error_done; + d_out.error <= r1.ls_error; d_out.cache_paradox <= r1.cache_paradox; -- Outputs to MMU m_out.done <= r1.mmu_done; - m_out.err <= r1.error_done; + m_out.err <= r1.mmu_error; m_out.data <= data_out; -- We have a valid load or store hit or we just completed a slow @@ -979,7 +980,7 @@ begin end if; -- error cases complete without stalling - if r1.error_done = '1' then + if r1.ls_error = '1' then report "completing ld/st with error"; end if; @@ -995,7 +996,7 @@ begin end if; -- error cases complete without stalling - if r1.error_done = '1' then + if r1.mmu_error = '1' then report "completing MMU ld with error"; end if; @@ -1128,10 +1129,12 @@ begin if req_op = OP_BAD then report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); - r1.error_done <= '1'; + r1.ls_error <= not r0.mmu_req; + r1.mmu_error <= r0.mmu_req; r1.cache_paradox <= access_ok; else - r1.error_done <= '0'; + r1.ls_error <= '0'; + r1.mmu_error <= '0'; r1.cache_paradox <= '0'; end if; @@ -1217,7 +1220,7 @@ begin r1.ls_valid <= '0'; -- complete tlbies and TLB loads in the third cycle r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); - if req_op = OP_LOAD_HIT or req_op = OP_BAD or req_op = OP_STCX_FAIL then + if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then if r0.mmu_req = '0' then r1.ls_valid <= '1'; else diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 0387e40..4e1f943 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -241,47 +241,46 @@ begin v.last_dword := '0'; when ACK_WAIT => + if d_in.error = '1' then + -- dcache will discard the second request if it + -- gets an error on the 1st of two requests + if r.dwords_done = '1' then + addr := next_addr; + else + addr := r.addr; + end if; + if d_in.cache_paradox = '1' then + -- signal an interrupt straight away + exception := '1'; + dsisr(63 - 38) := not r.load; + -- XXX there is no architected bit for this + dsisr(63 - 35) := d_in.cache_paradox; + v.state := IDLE; + else + -- Look up the translation for TLB miss + -- and also for permission error and RC error + -- in case the PTE has been updated. + mmureq := '1'; + v.state := MMU_LOOKUP; + end if; + end if; if d_in.valid = '1' then - if d_in.error = '1' then - -- dcache will discard the second request if it - -- gets an error on the 1st of two requests - if r.dwords_done = '1' then - addr := next_addr; - else - addr := r.addr; - end if; - if d_in.cache_paradox = '1' then - -- signal an interrupt straight away - exception := '1'; - dsisr(63 - 38) := not r.load; - -- XXX there is no architected bit for this - dsisr(63 - 35) := d_in.cache_paradox; - v.state := IDLE; - else - -- Look up the translation for TLB miss - -- and also for permission error and RC error - -- in case the PTE has been updated. - mmureq := '1'; - v.state := MMU_LOOKUP; + if r.last_dword = '0' then + v.dwords_done := '1'; + v.last_dword := '1'; + if r.load = '1' then + v.load_data := data_permuted; end if; else - if r.last_dword = '0' then - v.dwords_done := '1'; - v.last_dword := '1'; - if r.load = '1' then - v.load_data := data_permuted; - end if; + write_enable := r.load; + if r.load = '1' and r.update = '1' then + -- loads with rA update need an extra cycle + v.state := LD_UPDATE; else - write_enable := r.load; - if r.load = '1' and r.update = '1' then - -- loads with rA update need an extra cycle - v.state := LD_UPDATE; - else - -- stores write back rA update in this cycle - do_update := r.update; - done := '1'; - v.state := IDLE; - end if; + -- stores write back rA update in this cycle + do_update := r.update; + done := '1'; + v.state := IDLE; end if; end if; end if; diff --git a/mmu.vhdl b/mmu.vhdl index f1d69e2..6458a6e 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -301,33 +301,32 @@ begin when PROC_TBL_WAIT => if d_in.done = '1' then - if d_in.err = '0' then - if r.addr(63) = '1' then - v.pgtbl3 := data; - v.pt3_valid := '1'; - else - v.pgtbl0 := data; - v.pt0_valid := '1'; - end if; - -- rts == radix tree size, # address bits being translated - rts := unsigned('0' & data(62 downto 61) & data(7 downto 5)); - -- mbits == # address bits to index top level of tree - mbits := unsigned('0' & data(4 downto 0)); - -- set v.shift to rts so that we can use finalmask for the segment check - v.shift := rts; - v.mask_size := mbits(4 downto 0); - v.pgbase := data(55 downto 8) & x"00"; - if mbits = 0 then - v.state := RADIX_FINISH; - v.invalid := '1'; - else - v.state := SEGMENT_CHECK; - end if; + if r.addr(63) = '1' then + v.pgtbl3 := data; + v.pt3_valid := '1'; else + v.pgtbl0 := data; + v.pt0_valid := '1'; + end if; + -- rts == radix tree size, # address bits being translated + rts := unsigned('0' & data(62 downto 61) & data(7 downto 5)); + -- mbits == # address bits to index top level of tree + mbits := unsigned('0' & data(4 downto 0)); + -- set v.shift to rts so that we can use finalmask for the segment check + v.shift := rts; + v.mask_size := mbits(4 downto 0); + v.pgbase := data(55 downto 8) & x"00"; + if mbits = 0 then v.state := RADIX_FINISH; - v.badtree := '1'; + v.invalid := '1'; + else + v.state := SEGMENT_CHECK; end if; end if; + if d_in.err = '1' then + v.state := RADIX_FINISH; + v.badtree := '1'; + end if; when SEGMENT_CHECK => mbits := '0' & r.mask_size; @@ -349,54 +348,53 @@ begin when RADIX_READ_WAIT => if d_in.done = '1' then - if d_in.err = '0' then - v.pde := data; - -- test valid bit - if data(63) = '1' then - -- test leaf bit - if data(62) = '1' then - -- check permissions and RC bits - perm_ok := '0'; - if r.priv = '1' or data(3) = '0' then - if r.iside = '0' then - perm_ok := data(1) or (data(2) and not r.store); - else - -- no IAMR, so no KUEP support for now - -- deny execute permission if cache inhibited - perm_ok := data(0) and not data(5); - end if; - end if; - rc_ok := data(8) and (data(7) or not r.store); - if perm_ok = '1' and rc_ok = '1' then - v.state := RADIX_LOAD_TLB; + v.pde := data; + -- test valid bit + if data(63) = '1' then + -- test leaf bit + if data(62) = '1' then + -- check permissions and RC bits + perm_ok := '0'; + if r.priv = '1' or data(3) = '0' then + if r.iside = '0' then + perm_ok := data(1) or (data(2) and not r.store); else - v.state := RADIX_FINISH; - v.perm_err := not perm_ok; - -- permission error takes precedence over RC error - v.rc_error := perm_ok; + -- no IAMR, so no KUEP support for now + -- deny execute permission if cache inhibited + perm_ok := data(0) and not data(5); end if; + end if; + rc_ok := data(8) and (data(7) or not r.store); + if perm_ok = '1' and rc_ok = '1' then + v.state := RADIX_LOAD_TLB; else - mbits := unsigned('0' & data(4 downto 0)); - if mbits < 5 or mbits > 16 or mbits > r.shift then - v.state := RADIX_FINISH; - v.badtree := '1'; - else - v.shift := v.shift - mbits; - v.mask_size := mbits(4 downto 0); - v.pgbase := data(55 downto 8) & x"00"; - v.state := RADIX_LOOKUP; - end if; + v.state := RADIX_FINISH; + v.perm_err := not perm_ok; + -- permission error takes precedence over RC error + v.rc_error := perm_ok; end if; else - -- non-present PTE, generate a DSI - v.state := RADIX_FINISH; - v.invalid := '1'; + mbits := unsigned('0' & data(4 downto 0)); + if mbits < 5 or mbits > 16 or mbits > r.shift then + v.state := RADIX_FINISH; + v.badtree := '1'; + else + v.shift := v.shift - mbits; + v.mask_size := mbits(4 downto 0); + v.pgbase := data(55 downto 8) & x"00"; + v.state := RADIX_LOOKUP; + end if; end if; else + -- non-present PTE, generate a DSI v.state := RADIX_FINISH; - v.badtree := '1'; + v.invalid := '1'; end if; end if; + if d_in.err = '1' then + v.state := RADIX_FINISH; + v.badtree := '1'; + end if; when RADIX_LOAD_TLB => tlb_load := '1'; From 91cbeee77cfebe1da3d9484d34b3c72af90d444b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 13 Jul 2020 12:18:53 +1000 Subject: [PATCH 13/18] loadstore1: Generate busy signal earlier This makes the calculation of busy as simple as possible and dependent only on register outputs. The timing of busy is critical, as it gates the valid signal for the next instruction, and therefore any delays in dropping busy at the end of a load or store directly impact the timing of a host of other paths. This also separates the 'done without error' and 'done with error' cases from the MMU into separate signals that are both driven directly from registers. Signed-off-by: Paul Mackerras --- common.vhdl | 1 + loadstore1.vhdl | 92 ++++++++++++++++++++++++++++++------------------- mmu.vhdl | 11 ++++-- 3 files changed, 66 insertions(+), 38 deletions(-) diff --git a/common.vhdl b/common.vhdl index a193df1..28b3434 100644 --- a/common.vhdl +++ b/common.vhdl @@ -315,6 +315,7 @@ package common is type MmuToLoadstore1Type is record done : std_ulogic; + err : std_ulogic; invalid : std_ulogic; badtree : std_ulogic; segerr : std_ulogic; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 4e1f943..660e6c0 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -80,6 +80,9 @@ architecture behave of loadstore1 is dsisr : std_ulogic_vector(31 downto 0); instr_fault : std_ulogic; sprval : std_ulogic_vector(63 downto 0); + busy : std_ulogic; + wait_dcache : std_ulogic; + wait_mmu : std_ulogic; end record; type byte_sel_t is array(0 to 7) of std_ulogic; @@ -128,6 +131,9 @@ begin if rising_edge(clk) then if rst = '1' then r.state <= IDLE; + r.busy <= '0'; + r.wait_dcache <= '0'; + r.wait_mmu <= '0'; else r <= rin; end if; @@ -228,8 +234,17 @@ begin -- compute (addr + 8) & ~7 for the second doubleword when unaligned next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + -- Busy calculation. + -- We need to minimize the delay from clock to busy valid because it + -- gates the start of execution of the next instruction. + busy := r.busy or (r.wait_dcache and not d_in.valid) or (r.wait_mmu and not m_in.done); + done := '0'; + if r.state /= IDLE and busy = '0' then + done := '1'; + end if; exception := '0'; + case r.state is when IDLE => @@ -255,7 +270,6 @@ begin dsisr(63 - 38) := not r.load; -- XXX there is no architected bit for this dsisr(63 - 35) := d_in.cache_paradox; - v.state := IDLE; else -- Look up the translation for TLB miss -- and also for permission error and RC error @@ -279,8 +293,6 @@ begin else -- stores write back rA update in this cycle do_update := r.update; - done := '1'; - v.state := IDLE; end if; end if; end if; @@ -294,53 +306,36 @@ begin byte_sel := r.first_bytes; end if; if m_in.done = '1' then - if m_in.invalid = '0' and m_in.perm_error = '0' and m_in.rc_error = '0' and - m_in.badtree = '0' and m_in.segerr = '0' then - if r.instr_fault = '0' then - -- retry the request now that the MMU has installed a TLB entry - req := '1'; - if r.last_dword = '0' then - v.state := SECOND_REQ; - else - v.state := ACK_WAIT; - end if; + if r.instr_fault = '0' then + -- retry the request now that the MMU has installed a TLB entry + req := '1'; + if r.last_dword = '0' then + v.state := SECOND_REQ; else - -- nothing to do, the icache retries automatically - done := '1'; - v.state := IDLE; + v.state := ACK_WAIT; end if; - else - exception := '1'; - dsisr(63 - 33) := m_in.invalid; - dsisr(63 - 36) := m_in.perm_error; - dsisr(63 - 38) := not r.load; - dsisr(63 - 44) := m_in.badtree; - dsisr(63 - 45) := m_in.rc_error; - v.state := IDLE; end if; end if; + if m_in.err = '1' then + exception := '1'; + dsisr(63 - 33) := m_in.invalid; + dsisr(63 - 36) := m_in.perm_error; + dsisr(63 - 38) := not r.load; + dsisr(63 - 44) := m_in.badtree; + dsisr(63 - 45) := m_in.rc_error; + end if; when TLBIE_WAIT => - if m_in.done = '1' then - -- tlbie is finished - done := '1'; - v.state := IDLE; - end if; when LD_UPDATE => do_update := '1'; - v.state := IDLE; - done := '1'; when SPR_CMPLT => - done := '1'; - v.state := IDLE; end case; - busy := '1'; - if r.state = IDLE or done = '1' then - busy := '0'; + if done = '1' or exception = '1' then + v.state := IDLE; end if; -- Note that l_in.valid is gated with busy inside execute1 @@ -450,6 +445,31 @@ begin end if; end if; + -- Work out whether we'll be busy next cycle + v.busy := '0'; + v.wait_dcache := '0'; + v.wait_mmu := '0'; + case v.state is + when SECOND_REQ => + v.busy := '1'; + when ACK_WAIT => + if v.last_dword = '0' or (v.load = '1' and v.update = '1') then + v.busy := '1'; + else + v.wait_dcache := '1'; + end if; + when MMU_LOOKUP => + if v.instr_fault = '0' then + v.busy := '1'; + else + v.wait_mmu := '1'; + end if; + when TLBIE_WAIT => + v.wait_mmu := '1'; + when others => + -- not busy next cycle + end case; + -- Update outputs to dcache d_out.valid <= req; d_out.load <= v.load; diff --git a/mmu.vhdl b/mmu.vhdl index 6458a6e..09df3ae 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -52,6 +52,7 @@ architecture behave of mmu is -- internal state state : state_t; done : std_ulogic; + err : std_ulogic; pgtbl0 : std_ulogic_vector(63 downto 0); pt0_valid : std_ulogic; pgtbl3 : std_ulogic_vector(63 downto 0); @@ -92,7 +93,10 @@ begin report "MMU got tlb miss for " & to_hstring(rin.addr); end if; if l_out.done = '1' then - report "MMU completing op with invalid=" & std_ulogic'image(l_out.invalid) & + report "MMU completing op without error"; + end if; + if l_out.err = '1' then + report "MMU completing op with err invalid=" & std_ulogic'image(l_out.invalid) & " badtree=" & std_ulogic'image(l_out.badtree); end if; if rin.state = RADIX_LOOKUP then @@ -200,6 +204,7 @@ begin v.valid := '0'; dcreq := '0'; v.done := '0'; + v.err := '0'; v.invalid := '0'; v.badtree := '0'; v.segerror := '0'; @@ -412,7 +417,8 @@ begin end case; if v.state = RADIX_FINISH or (v.state = RADIX_LOAD_TLB and r.iside = '1') then - v.done := '1'; + v.err := v.invalid or v.badtree or v.segerror or v.perm_err or v.rc_error; + v.done := not v.err; end if; if r.addr(63) = '1' then @@ -451,6 +457,7 @@ begin end if; l_out.done <= r.done; + l_out.err <= r.err; l_out.invalid <= r.invalid; l_out.badtree <= r.badtree; l_out.segerr <= r.segerror; From b80e81e1234022c9c22725489b5031dc6e6b06ab Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 13 Jul 2020 17:43:52 +1000 Subject: [PATCH 14/18] loadstore1: Separate address calculation for MMU to ease timing This computes the address sent to the MMU separately from that sent to the dcache. This means that the address sent to the MMU doesn't have the delay through the lsu_sum adder, making it available earlier. The path through the lsu_sum adder and through the MMU to the MMU done and err outputs showed up as a critical path on some builds. Signed-off-by: Paul Mackerras --- loadstore1.vhdl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 660e6c0..16df9b1 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -152,6 +152,7 @@ begin variable req : std_ulogic; variable busy : std_ulogic; variable addr : std_ulogic_vector(63 downto 0); + variable maddr : std_ulogic_vector(63 downto 0); variable wdata : std_ulogic_vector(63 downto 0); variable write_enable : std_ulogic; variable do_update : std_ulogic; @@ -173,6 +174,7 @@ begin req := '0'; byte_sel := (others => '0'); addr := lsu_sum; + maddr := l_in.addr2; -- address from RB for tlbie v.mfspr := '0'; mmu_mtspr := '0'; itlb_fault := '0'; @@ -260,9 +262,9 @@ begin -- dcache will discard the second request if it -- gets an error on the 1st of two requests if r.dwords_done = '1' then - addr := next_addr; + maddr := next_addr; else - addr := r.addr; + maddr := r.addr; end if; if d_in.cache_paradox = '1' then -- signal an interrupt straight away @@ -427,8 +429,7 @@ begin end if; when OP_FETCH_FAILED => -- send it to the MMU to do the radix walk - addr := l_in.nia; - v.addr := l_in.nia; + maddr := l_in.nia; v.instr_fault := '1'; mmureq := '1'; v.state := MMU_LOOKUP; @@ -490,7 +491,7 @@ begin m_out.tlbie <= v.tlbie; m_out.mtspr <= mmu_mtspr; m_out.sprn <= sprn; - m_out.addr <= addr; + m_out.addr <= maddr; m_out.slbia <= l_in.insn(7); m_out.rs <= l_in.data; From 36297d35f8ef27fa5e6ef02c8698373fb8ee69bb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 15 Jul 2020 09:00:44 +1000 Subject: [PATCH 15/18] decode1: Fix formatting Commit d5c8c33baecc ("decode1: Reformat to 4-space indentation") resulted in some rows of major_decode_rom_array being misaligned. This fixes it. No code change. Signed-off-by: Paul Mackerras --- decode1.vhdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index d8c2b61..f553e2d 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -51,7 +51,7 @@ architecture behaviour of decode1 is 15 => (ALU, OP_ADD, RA_OR_ZERO, CONST_SI_HI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addis 28 => (ALU, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andi. 29 => (ALU, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andis. - 0 => (ALU, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- attn + 0 => (ALU, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- attn 18 => (ALU, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b 16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi @@ -77,9 +77,9 @@ architecture behaviour of decode1 is 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw 37 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwu - 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic - 2 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi - 3 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- twi + 8 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic + 2 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi + 3 => (ALU, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- twi 26 => (ALU, OP_XOR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xori 27 => (ALU, OP_XOR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xoris others => illegal_inst From 128fe8ac264f57c1e0df23c22333b0c9d52a8f8d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Jul 2020 19:04:37 +1000 Subject: [PATCH 16/18] dcache: Ease timing on wishbone data and byte selects This eliminates a path where the inputs to r1.wb.dat and r1.wb.sel depend on req_op, which depends on the TLB and cache hit detection. In fact they only need to depend on the nature of the request in r0.req (i.e. DCBZ, store, cacheable load, or non-cacheable load). This sets them at the beginning of the code for IDLE state rather than inside the req_op case statement. Signed-off-by: Paul Mackerras --- dcache.vhdl | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 08b1664..956768c 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -1250,8 +1250,18 @@ begin req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; req.real_addr := ra; - req.data := r0.req.data; - req.byte_sel := r0.req.byte_sel; + -- Force data to 0 for dcbz + if r0.req.dcbz = '0' then + req.data := r0.req.data; + else + req.data := (others => '0'); + end if; + -- Select all bytes for dcbz and for cacheable loads + if r0.req.dcbz = '1' or (r0.req.load = '1' and r0.req.nc = '0') then + req.byte_sel := (others => '1'); + else + req.byte_sel := r0.req.byte_sel; + end if; req.hit_way := req_hit_way; req.same_tag := req_same_tag; @@ -1268,7 +1278,9 @@ begin case r1.state is when IDLE => r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0); - r1.dcbz <= '0'; + r1.wb.sel <= req.byte_sel; + r1.wb.dat <= req.data; + r1.dcbz <= req.dcbz; -- Keep track of our index and way for subsequent stores. r1.store_index <= get_index(req.real_addr); @@ -1298,7 +1310,6 @@ begin " tag:" & to_hstring(get_tag(req.real_addr)); -- Start the wishbone cycle - r1.wb.sel <= (others => '1'); r1.wb.we <= '0'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -1308,7 +1319,6 @@ begin r1.write_tag <= '1'; when OP_LOAD_NC => - r1.wb.sel <= req.byte_sel; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; @@ -1316,8 +1326,6 @@ begin when OP_STORE_HIT | OP_STORE_MISS => if req.dcbz = '0' then - r1.wb.sel <= req.byte_sel; - r1.wb.dat <= req.data; r1.state <= STORE_WAIT_ACK; r1.acks_pending <= to_unsigned(1, 3); r1.full <= '0'; @@ -1333,17 +1341,10 @@ begin else -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading - - -- Start the wishbone writes - r1.wb.sel <= (others => '1'); - r1.wb.dat <= (others => '0'); - - -- Handle the rest like a load miss r1.state <= RELOAD_WAIT_ACK; if req.op = OP_STORE_MISS then r1.write_tag <= '1'; end if; - r1.dcbz <= '1'; end if; r1.wb.we <= '1'; r1.wb.cyc <= '1'; From 2cb1d7671ed5e4909b3d810a0e4461fdc22db12a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 16 Jul 2020 09:26:47 +1000 Subject: [PATCH 17/18] loadstore1: Further tweaks to improve synthesis with yosys/nextpnr This reworks the way that the busy and done signals are generated in loadstore in order to work around some problems where yosys/nextpnr are reporting combinatorial loops (not in fact on the current code but on minor variations needed for supporting the FPU). It seems that yosys has problems with the case statement on v.state. This also lifts the maddr and byte_sel generation out of the case statement. The overall result is a slight reduction in resource usage (~30 6-input LUTs on the A7-100). Signed-off-by: Paul Mackerras --- loadstore1.vhdl | 99 +++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 56 deletions(-) diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 16df9b1..87f4710 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -44,10 +44,9 @@ architecture behave of loadstore1 is type state_t is (IDLE, -- ready for instruction SECOND_REQ, -- send 2nd request of unaligned xfer ACK_WAIT, -- waiting for ack from dcache - LD_UPDATE, -- writing rA with computed addr on load MMU_LOOKUP, -- waiting for MMU to look up translation TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie - SPR_CMPLT -- complete a mf/tspr operation + COMPLETE -- extra cycle to complete an operation ); type reg_stage_t is record @@ -83,6 +82,8 @@ architecture behave of loadstore1 is busy : std_ulogic; wait_dcache : std_ulogic; wait_mmu : std_ulogic; + do_update : std_ulogic; + extra_cycle : std_ulogic; end record; type byte_sel_t is array(0 to 7) of std_ulogic; @@ -132,8 +133,7 @@ begin if rst = '1' then r.state <= IDLE; r.busy <= '0'; - r.wait_dcache <= '0'; - r.wait_mmu <= '0'; + r.do_update <= '0'; else r <= rin; end if; @@ -172,9 +172,6 @@ begin begin v := r; req := '0'; - byte_sel := (others => '0'); - addr := lsu_sum; - maddr := l_in.addr2; -- address from RB for tlbie v.mfspr := '0'; mmu_mtspr := '0'; itlb_fault := '0'; @@ -183,7 +180,9 @@ begin mmureq := '0'; write_enable := '0'; - do_update := '0'; + + do_update := r.do_update; + v.do_update := '0'; -- load data formatting byte_offset := unsigned(r.addr(2 downto 0)); @@ -239,7 +238,8 @@ begin -- Busy calculation. -- We need to minimize the delay from clock to busy valid because it -- gates the start of execution of the next instruction. - busy := r.busy or (r.wait_dcache and not d_in.valid) or (r.wait_mmu and not m_in.done); + busy := r.busy and not ((r.wait_dcache and d_in.valid) or (r.wait_mmu and m_in.done)); + v.busy := busy; done := '0'; if r.state /= IDLE and busy = '0' then @@ -247,12 +247,19 @@ begin end if; exception := '0'; + if r.dwords_done = '1' or r.state = SECOND_REQ then + maddr := next_addr; + byte_sel := r.second_bytes; + else + maddr := r.addr; + byte_sel := r.first_bytes; + end if; + addr := maddr; + case r.state is when IDLE => when SECOND_REQ => - addr := next_addr; - byte_sel := r.second_bytes; req := '1'; v.state := ACK_WAIT; v.last_dword := '0'; @@ -261,11 +268,6 @@ begin if d_in.error = '1' then -- dcache will discard the second request if it -- gets an error on the 1st of two requests - if r.dwords_done = '1' then - maddr := next_addr; - else - maddr := r.addr; - end if; if d_in.cache_paradox = '1' then -- signal an interrupt straight away exception := '1'; @@ -289,24 +291,22 @@ begin end if; else write_enable := r.load; - if r.load = '1' and r.update = '1' then + if r.extra_cycle = '1' then -- loads with rA update need an extra cycle - v.state := LD_UPDATE; + v.state := COMPLETE; + v.do_update := r.update; else -- stores write back rA update in this cycle do_update := r.update; end if; + v.busy := '0'; end if; end if; + -- r.wait_dcache gets set one cycle after we come into ACK_WAIT state, + -- which is OK because the dcache always takes at least two cycles. + v.wait_dcache := r.last_dword and not r.extra_cycle; when MMU_LOOKUP => - if r.dwords_done = '1' then - addr := next_addr; - byte_sel := r.second_bytes; - else - addr := r.addr; - byte_sel := r.first_bytes; - end if; if m_in.done = '1' then if r.instr_fault = '0' then -- retry the request now that the MMU has installed a TLB entry @@ -329,15 +329,13 @@ begin when TLBIE_WAIT => - when LD_UPDATE => - do_update := '1'; - - when SPR_CMPLT => + when COMPLETE => end case; if done = '1' or exception = '1' then v.state := IDLE; + v.busy := '0'; end if; -- Note that l_in.valid is gated with busy inside execute1 @@ -361,6 +359,13 @@ begin v.nc := l_in.ci; v.virt_mode := l_in.virt_mode; v.priv_mode := l_in.priv_mode; + v.wait_dcache := '0'; + v.wait_mmu := '0'; + v.do_update := '0'; + v.extra_cycle := '0'; + + addr := lsu_sum; + maddr := l_in.addr2; -- address from RB for tlbie -- XXX Temporary hack. Mark the op as non-cachable if the address -- is the form 0xc------- for a real-mode access. @@ -392,6 +397,8 @@ begin when OP_LOAD => req := '1'; v.load := '1'; + -- Allow an extra cycle for RA update on loads + v.extra_cycle := l_in.update; when OP_DCBZ => req := '1'; v.dcbz := '1'; @@ -399,6 +406,7 @@ begin mmureq := '1'; v.tlbie := '1'; v.state := TLBIE_WAIT; + v.wait_mmu := '1'; when OP_MFSPR => v.mfspr := '1'; -- partial decode on SPR number should be adequate given @@ -413,7 +421,7 @@ begin -- reading one of the SPRs in the MMU v.sprval := m_in.sprval; end if; - v.state := SPR_CMPLT; + v.state := COMPLETE; when OP_MTSPR => if sprn(9) = '0' and sprn(5) = '0' then if sprn(0) = '0' then @@ -421,11 +429,12 @@ begin else v.dar := l_in.data; end if; - v.state := SPR_CMPLT; + v.state := COMPLETE; else -- writing one of the SPRs in the MMU mmu_mtspr := '1'; v.state := TLBIE_WAIT; + v.wait_mmu := '1'; end if; when OP_FETCH_FAILED => -- send it to the MMU to do the radix walk @@ -433,6 +442,7 @@ begin v.instr_fault := '1'; mmureq := '1'; v.state := MMU_LOOKUP; + v.wait_mmu := '1'; when others => assert false report "unknown op sent to loadstore1"; end case; @@ -444,32 +454,9 @@ begin v.state := SECOND_REQ; end if; end if; - end if; - -- Work out whether we'll be busy next cycle - v.busy := '0'; - v.wait_dcache := '0'; - v.wait_mmu := '0'; - case v.state is - when SECOND_REQ => - v.busy := '1'; - when ACK_WAIT => - if v.last_dword = '0' or (v.load = '1' and v.update = '1') then - v.busy := '1'; - else - v.wait_dcache := '1'; - end if; - when MMU_LOOKUP => - if v.instr_fault = '0' then - v.busy := '1'; - else - v.wait_mmu := '1'; - end if; - when TLBIE_WAIT => - v.wait_mmu := '1'; - when others => - -- not busy next cycle - end case; + v.busy := req or mmureq or mmu_mtspr; + end if; -- Update outputs to dcache d_out.valid <= req; From ea0b843662b37dbf36c09d8837408cf29ef2b897 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 18 Jul 2020 16:37:03 +1000 Subject: [PATCH 18/18] loadstore1: Better expression for store data formatting This rearranges the code used for store data formatting so that the "for i in 0 to 7" loop indexes the output bytes rather than the input bytes. The new expression is formally identical to the old but is easier to synthesize. This reduces the number of LUTs by about 250 on the Artix-7 and improves timing. Signed-off-by: Paul Mackerras --- loadstore1.vhdl | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 87f4710..62914c0 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -159,6 +159,7 @@ begin variable done : std_ulogic; variable data_permuted : std_ulogic_vector(63 downto 0); variable data_trimmed : std_ulogic_vector(63 downto 0); + variable store_data : std_ulogic_vector(63 downto 0); variable use_second : byte_sel_t; variable trim_ctl : trim_ctl_t; variable negative : std_ulogic; @@ -232,6 +233,23 @@ begin end case; end loop; + -- Byte reversing and rotating for stores + -- Done in the first cycle (when l_in.valid = 1) + store_data := r.store_data; + if l_in.valid = '1' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + brev_lenm1 := "000"; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; + for i in 0 to 7 loop + k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1; + j := to_integer(k) * 8; + store_data(i * 8 + 7 downto i * 8) := l_in.data(j + 7 downto j); + end loop; + end if; + v.store_data := store_data; + -- compute (addr + 8) & ~7 for the second doubleword when unaligned next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; @@ -379,18 +397,6 @@ begin v.first_bytes := byte_sel; v.second_bytes := long_sel(15 downto 8); - -- Do byte reversing and rotating for stores in the first cycle - byte_offset := unsigned(lsu_sum(2 downto 0)); - brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; - end if; - for i in 0 to 7 loop - k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; - j := to_integer(k) * 8; - v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); - end loop; - case l_in.op is when OP_STORE => req := '1'; @@ -465,7 +471,7 @@ begin d_out.nc <= v.nc; d_out.reserve <= v.reserve; d_out.addr <= addr; - d_out.data <= v.store_data; + d_out.data <= store_data; d_out.byte_sel <= byte_sel; d_out.virt_mode <= v.virt_mode; d_out.priv_mode <= v.priv_mode;