From 49ec80ac3e19967da9e2521dc5556755f304482d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 Oct 2021 17:23:08 +1100 Subject: [PATCH 1/5] fetch1/icache1: Remove the use_previous logic This removes logic that I added some time ago with the thought that it would enable us to do prefetching in the icache. This logic detects when the fetch address is an odd multiple of 4 and the next address in sequence from the previous cycle. In that case the instruction we want is in the output register of the icache RAM already so there is no need to do another read or any icache tag or TLB lookup. However, this logic adds complexity, and removing it improves timing, so this removes it. Signed-off-by: Paul Mackerras --- fetch1.vhdl | 7 +++---- icache.vhdl | 20 ++++---------------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/fetch1.vhdl b/fetch1.vhdl index ac08eba..4c4a6a8 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -89,9 +89,8 @@ begin r_int.predicted_taken <= r_next_int.predicted_taken; r_int.pred_not_taken <= r_next_int.pred_not_taken; r_int.predicted_nia <= r_next_int.predicted_nia; - r_int.rd_is_niap4 <= r_next.sequential; + r_int.rd_is_niap4 <= r_next_int.rd_is_niap4; end if; - r.sequential <= r_next.sequential and advance_nia; -- always send the up-to-date stop mark and req r.stop_mark <= stop_in; r.req <= not rst; @@ -145,11 +144,11 @@ begin begin v := r; v_int := r_int; - v.sequential := '0'; v.predicted := '0'; v.pred_ntaken := '0'; v_int.predicted_taken := '0'; v_int.pred_not_taken := '0'; + v_int.rd_is_niap4 := '0'; if rst = '1' then if alt_reset_in = '1' then @@ -180,7 +179,7 @@ begin v.nia := r_int.predicted_nia; v.predicted := '1'; else - v.sequential := '1'; + v_int.rd_is_niap4 := '1'; v.pred_ntaken := r_int.pred_not_taken; v.nia := std_ulogic_vector(unsigned(r.nia) + 4); if r_int.mode_32bit = '1' then diff --git a/icache.vhdl b/icache.vhdl index d4be935..a51209e 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -212,7 +212,6 @@ 
architecture rtl of icache is signal ra_valid : std_ulogic; signal priv_fault : std_ulogic; signal access_ok : std_ulogic; - signal use_previous : std_ulogic; -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; @@ -397,7 +396,7 @@ begin wr_dat(ii * 8 + 7 downto ii * 8) <= wishbone_in.dat(j * 8 + 7 downto j * 8); end loop; end if; - do_read <= not (stall_in or use_previous); + do_read <= not stall_in; do_write <= '0'; if wishbone_in.ack = '1' and replace_way = i then do_write <= '1'; @@ -503,16 +502,6 @@ begin variable is_hit : std_ulogic; variable hit_way : way_t; begin - -- i_in.sequential means that i_in.nia this cycle is 4 more than - -- last cycle. If we read more than 32 bits at a time, had a cache hit - -- last cycle, and we don't want the first 32-bit chunk, then we can - -- keep the data we read last cycle and just use that. - if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then - use_previous <= i_in.req and i_in.sequential and r.hit_valid; - else - use_previous <= '0'; - end if; - -- Extract line, row and tag from request req_index <= get_index(i_in.nia); req_row <= get_row(i_in.nia); @@ -542,7 +531,7 @@ begin end loop; -- Generate the "hit" and "miss" signals for the synchronous blocks - if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' and use_previous = '0' then + if i_in.req = '1' and access_ok = '1' and flush_in = '0' and rst = '0' then req_is_hit <= is_hit; req_is_miss <= not is_hit; else @@ -576,7 +565,7 @@ begin i_out.next_pred_ntaken <= i_in.pred_ntaken; -- Stall fetch1 if we have a miss on cache or TLB or a protection fault - stall_out <= not (is_hit and access_ok) and not use_previous; + stall_out <= not (is_hit and access_ok); -- Wishbone requests output (from the cache miss reload machine) wishbone_out <= r.wb; @@ -588,8 +577,7 @@ begin if rising_edge(clk) then -- keep outputs to fetch2 unchanged on a stall -- except that flush or reset sets valid to 0 - -- If use_previous, keep the same data as 
last cycle and use the second half - if stall_in = '1' or use_previous = '1' then + if stall_in = '1' then if rst = '1' or flush_in = '1' then r.hit_valid <= '0'; end if; From 4cf2921b0bd433f870878dd56b3f2bac3a3860c6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 Oct 2021 17:46:44 +1100 Subject: [PATCH 2/5] soc: Re-do peripheral address decode to improve timing This generates a series of io_cycle_* signals which are clean latches and which become the 'cyc' signals of the wishbone buses going to various peripherals (syscon, uarts, XICS, GPIO, etc.). Effectively this is done by moving the address decoding into the slave_io_latch process. The slave_io_type, which drives the multiplexer which selects which wishbone to look for a response on, is reduced to just 8 values in the expectation that an 8-way multiplexer will use less logic than one with more than 8 inputs. With this change, timing is considerably better on the A7-100T. Signed-off-by: Paul Mackerras --- soc.vhdl | 239 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 129 insertions(+), 110 deletions(-) diff --git a/soc.vhdl b/soc.vhdl index b23336c..ee1ea3e 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -239,12 +239,20 @@ architecture behaviour of soc is SLAVE_IO_ICP, SLAVE_IO_ICS, SLAVE_IO_UART1, - SLAVE_IO_SPI_FLASH_REG, - SLAVE_IO_SPI_FLASH_MAP, + SLAVE_IO_SPI_FLASH, SLAVE_IO_GPIO, - SLAVE_IO_EXTERNAL, - SLAVE_IO_NONE); - signal slave_io_dbg : slave_io_type; + SLAVE_IO_EXTERNAL); + signal current_io_decode : slave_io_type; + + signal io_cycle_none : std_ulogic; + signal io_cycle_syscon : std_ulogic; + signal io_cycle_uart : std_ulogic; + signal io_cycle_uart1 : std_ulogic; + signal io_cycle_icp : std_ulogic; + signal io_cycle_ics : std_ulogic; + signal io_cycle_spi_flash : std_ulogic; + signal io_cycle_gpio : std_ulogic; + signal io_cycle_external : std_ulogic; function wishbone_widen_data(wb : wb_io_master_out) return wishbone_master_out is variable wwb : wishbone_master_out; @@ -465,14 
+473,20 @@ begin -- Misc variable has_top : boolean; variable has_bot : boolean; + variable do_cyc : std_ulogic; + variable end_cyc : std_ulogic; + variable slave_io : slave_io_type; + variable match : std_ulogic_vector(31 downto 12); begin if rising_edge(system_clk) then + do_cyc := '0'; + end_cyc := '0'; if (rst) then state := IDLE; wb_io_out.ack <= '0'; wb_io_out.stall <= '0'; - wb_sio_out.cyc <= '0'; wb_sio_out.stb <= '0'; + end_cyc := '1'; has_top := false; has_bot := false; else @@ -488,7 +502,7 @@ begin wb_io_out.stall <= '1'; -- Start cycle downstream - wb_sio_out.cyc <= '1'; + do_cyc := '1'; wb_sio_out.stb <= '1'; -- Copy write enable to IO out, copy address as well @@ -551,8 +565,8 @@ begin -- Wait for new ack state := WAIT_ACK_TOP; else - -- We are done, ack up, clear cyc downstram - wb_sio_out.cyc <= '0'; + -- We are done, ack up, clear cyc downstream + end_cyc := '1'; -- And ack & unstall upstream wb_io_out.ack <= '1'; @@ -576,7 +590,7 @@ begin end if; -- We are done, ack up, clear cyc downstram - wb_sio_out.cyc <= '0'; + end_cyc := '1'; -- And ack & unstall upstream wb_io_out.ack <= '1'; @@ -587,144 +601,149 @@ begin end if; end case; end if; + + -- Create individual registered cycle signals for the wishbones + -- going to the various peripherals + if do_cyc = '1' or end_cyc = '1' then + io_cycle_none <= '0'; + io_cycle_syscon <= '0'; + io_cycle_uart <= '0'; + io_cycle_uart1 <= '0'; + io_cycle_icp <= '0'; + io_cycle_ics <= '0'; + io_cycle_spi_flash <= '0'; + io_cycle_gpio <= '0'; + io_cycle_external <= '0'; + wb_sio_out.cyc <= '0'; + wb_ext_is_dram_init <= '0'; + wb_spiflash_is_map <= '0'; + wb_spiflash_is_reg <= '0'; + wb_ext_is_dram_csr <= '0'; + wb_ext_is_eth <= '0'; + wb_ext_is_sdcard <= '0'; + end if; + if do_cyc = '1' then + -- Decode I/O address + -- This is real address bits 29 downto 12 + match := "11" & wb_io_in.adr(26 downto 9); + slave_io := SLAVE_IO_SYSCON; + if std_match(match, x"FF---") and HAS_DRAM then + slave_io := SLAVE_IO_EXTERNAL; 
+ io_cycle_external <= '1'; + wb_ext_is_dram_init <= '1'; + elsif std_match(match, x"F----") then + slave_io := SLAVE_IO_SPI_FLASH; + io_cycle_spi_flash <= '1'; + wb_spiflash_is_map <= '1'; + elsif std_match(match, x"C8---") then + -- Ext IO "chip selects" + if std_match(match, x"--00-") and HAS_DRAM then + slave_io := SLAVE_IO_EXTERNAL; + io_cycle_external <= '1'; + wb_ext_is_dram_csr <= '1'; + elsif (std_match(match, x"--02-") or std_match(match, x"--03-")) and + HAS_LITEETH then + slave_io := SLAVE_IO_EXTERNAL; + io_cycle_external <= '1'; + wb_ext_is_eth <= '1'; + elsif std_match(match, x"--04-") and HAS_SD_CARD then + slave_io := SLAVE_IO_EXTERNAL; + io_cycle_external <= '1'; + wb_ext_is_sdcard <= '1'; + else + io_cycle_none <= '1'; + end if; + elsif std_match(match, x"C0000") then + slave_io := SLAVE_IO_SYSCON; + io_cycle_syscon <= '1'; + elsif std_match(match, x"C0002") then + slave_io := SLAVE_IO_UART; + io_cycle_uart <= '1'; + elsif std_match(match, x"C0003") then + slave_io := SLAVE_IO_UART1; + io_cycle_uart1 <= '1'; + elsif std_match(match, x"C0004") then + slave_io := SLAVE_IO_ICP; + io_cycle_icp <= '1'; + elsif std_match(match, x"C0005") then + slave_io := SLAVE_IO_ICS; + io_cycle_ics <= '1'; + elsif std_match(match, x"C0006") then + slave_io := SLAVE_IO_SPI_FLASH; + io_cycle_spi_flash <= '1'; + wb_spiflash_is_reg <= '1'; + elsif std_match(match, x"C0007") then + slave_io := SLAVE_IO_GPIO; + io_cycle_gpio <= '1'; + else + io_cycle_none <= '1'; + end if; + current_io_decode <= slave_io; + wb_sio_out.cyc <= '1'; + end if; end if; end process; - -- IO wishbone slave intercon. + -- IO wishbone slave interconnect. -- - slave_io_intercon: process(wb_sio_out, wb_syscon_out, wb_uart0_out, wb_uart1_out, - wb_ext_io_out, wb_xics_icp_out, wb_xics_ics_out, - wb_spiflash_out) - variable slave_io : slave_io_type; - - variable match : std_ulogic_vector(31 downto 12); - variable ext_valid : boolean; + slave_io_intercon: process(all) begin - - -- Simple address decoder. 
- slave_io := SLAVE_IO_NONE; - match := "11" & wb_sio_out.adr(27 downto 10); - if std_match(match, x"FF---") and HAS_DRAM then - slave_io := SLAVE_IO_EXTERNAL; - elsif std_match(match, x"F----") then - slave_io := SLAVE_IO_SPI_FLASH_MAP; - elsif std_match(match, x"C0000") then - slave_io := SLAVE_IO_SYSCON; - elsif std_match(match, x"C0002") then - slave_io := SLAVE_IO_UART; - elsif std_match(match, x"C0003") then - slave_io := SLAVE_IO_UART1; - elsif std_match(match, x"C8---") then - slave_io := SLAVE_IO_EXTERNAL; - elsif std_match(match, x"C0004") then - slave_io := SLAVE_IO_ICP; - elsif std_match(match, x"C0005") then - slave_io := SLAVE_IO_ICS; - elsif std_match(match, x"C0006") then - slave_io := SLAVE_IO_SPI_FLASH_REG; - elsif std_match(match, x"C0007") then - slave_io := SLAVE_IO_GPIO; - end if; - slave_io_dbg <= slave_io; wb_uart0_in <= wb_sio_out; - wb_uart0_in.cyc <= '0'; + wb_uart0_in.cyc <= io_cycle_uart; wb_uart1_in <= wb_sio_out; - wb_uart1_in.cyc <= '0'; + wb_uart1_in.cyc <= io_cycle_uart1; + wb_spiflash_in <= wb_sio_out; - wb_spiflash_in.cyc <= '0'; - wb_spiflash_is_reg <= '0'; - wb_spiflash_is_map <= '0'; + wb_spiflash_in.cyc <= io_cycle_spi_flash; + -- Clear top bits so they don't make their way to the + -- flash chip. + wb_spiflash_in.adr(27 downto 26) <= "00"; + wb_gpio_in <= wb_sio_out; - wb_gpio_in.cyc <= '0'; + wb_gpio_in.cyc <= io_cycle_gpio; -- Only give xics 8 bits of wb addr (for now...) 
wb_xics_icp_in <= wb_sio_out; wb_xics_icp_in.adr <= (others => '0'); wb_xics_icp_in.adr(5 downto 0) <= wb_sio_out.adr(5 downto 0); - wb_xics_icp_in.cyc <= '0'; + wb_xics_icp_in.cyc <= io_cycle_icp; wb_xics_ics_in <= wb_sio_out; wb_xics_ics_in.adr <= (others => '0'); wb_xics_ics_in.adr(9 downto 0) <= wb_sio_out.adr(9 downto 0); - wb_xics_ics_in.cyc <= '0'; + wb_xics_ics_in.cyc <= io_cycle_ics; wb_ext_io_in <= wb_sio_out; - wb_ext_io_in.cyc <= '0'; + wb_ext_io_in.cyc <= io_cycle_external; wb_syscon_in <= wb_sio_out; - wb_syscon_in.cyc <= '0'; - - wb_ext_is_dram_csr <= '0'; - wb_ext_is_dram_init <= '0'; - wb_ext_is_eth <= '0'; - wb_ext_is_sdcard <= '0'; + wb_syscon_in.cyc <= io_cycle_syscon; - -- Default response, ack & return all 1's - wb_sio_in.dat <= (others => '1'); - wb_sio_in.ack <= wb_sio_out.stb and wb_sio_out.cyc; - wb_sio_in.stall <= '0'; - - case slave_io is + case current_io_decode is when SLAVE_IO_EXTERNAL => - -- Ext IO "chip selects" - -- - -- DRAM init is special at 0xFF* so we just test the top - -- bit. Everything else is at 0xC8* so we test only bits - -- 23 downto 16 (21 downto 14 in the wishbone addr). 
- -- - ext_valid := false; - if wb_sio_out.adr(27) = '1' and HAS_DRAM then -- DRAM init is special - wb_ext_is_dram_init <= '1'; - ext_valid := true; - elsif wb_sio_out.adr(21 downto 14) = x"00" and HAS_DRAM then - wb_ext_is_dram_csr <= '1'; - ext_valid := true; - elsif wb_sio_out.adr(21 downto 14) = x"02" and HAS_LITEETH then - wb_ext_is_eth <= '1'; - ext_valid := true; - elsif wb_sio_out.adr(21 downto 14) = x"03" and HAS_LITEETH then - wb_ext_is_eth <= '1'; - ext_valid := true; - elsif wb_sio_out.adr(21 downto 14) = x"04" and HAS_SD_CARD then - wb_ext_is_sdcard <= '1'; - ext_valid := true; - end if; - if ext_valid then - wb_ext_io_in.cyc <= wb_sio_out.cyc; - wb_sio_in <= wb_ext_io_out; - end if; - + wb_sio_in <= wb_ext_io_out; when SLAVE_IO_SYSCON => - wb_syscon_in.cyc <= wb_sio_out.cyc; wb_sio_in <= wb_syscon_out; when SLAVE_IO_UART => - wb_uart0_in.cyc <= wb_sio_out.cyc; wb_sio_in <= wb_uart0_out; when SLAVE_IO_ICP => - wb_xics_icp_in.cyc <= wb_sio_out.cyc; wb_sio_in <= wb_xics_icp_out; when SLAVE_IO_ICS => - wb_xics_ics_in.cyc <= wb_sio_out.cyc; wb_sio_in <= wb_xics_ics_out; when SLAVE_IO_UART1 => - wb_uart1_in.cyc <= wb_sio_out.cyc; wb_sio_in <= wb_uart1_out; - when SLAVE_IO_SPI_FLASH_MAP => - -- Clear top bits so they don't make their way to the - -- fash chip. 
- wb_spiflash_in.adr(27 downto 26) <= "00"; - wb_spiflash_in.cyc <= wb_sio_out.cyc; + when SLAVE_IO_SPI_FLASH => wb_sio_in <= wb_spiflash_out; - wb_spiflash_is_map <= '1'; - when SLAVE_IO_SPI_FLASH_REG => - wb_spiflash_in.cyc <= wb_sio_out.cyc; - wb_sio_in <= wb_spiflash_out; - wb_spiflash_is_reg <= '1'; when SLAVE_IO_GPIO => - wb_gpio_in.cyc <= wb_sio_out.cyc; wb_sio_in <= wb_gpio_out; - when others => end case; + -- Default response, ack & return all 1's + if io_cycle_none = '1' then + wb_sio_in.dat <= (others => '1'); + wb_sio_in.ack <= wb_sio_out.stb and wb_sio_out.cyc; + wb_sio_in.stall <= '0'; + end if; + end process; -- Syscon slave From 10869888833847d6b899dc7c23c002f3fce85f71 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 21 Feb 2022 09:58:07 +1100 Subject: [PATCH 3/5] countzero: Use alternative algorithm for higher bits This implements an alternative count-leading-zeroes algorithm which uses fewer LUTs to generate the higher-order bits (2..5) of the result. By doing (v | -v) rather than (v & -v), we get a value which has ones from the MSB down to the rightmost 1 bit in v and then zeroes down to the LSB. This means that we can generate the MSB of the result (the result being the index of the rightmost 1 bit in v) just by looking at bits 63 and 31 of (v | -v), assuming that v is 64 bits. Bit 4 of the result requires looking at bits 63, 47, 31 and 15. In contrast, each bit of the result using (v & -v), which has a single 1, requires ORing together 32 bits. It turns out that the minimum LUT usage comes from using (v & -v) to generate bits 0 and 1 of the result, and using (v | -v) to generate bits 2 to 5. This saves almost 60 6-input LUTs on the Artix-7. 
Signed-off-by: Paul Mackerras --- countbits.vhdl | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/countbits.vhdl b/countbits.vhdl index 134540f..ab5be2a 100644 --- a/countbits.vhdl +++ b/countbits.vhdl @@ -20,10 +20,11 @@ end entity bit_counter; architecture behaviour of bit_counter is -- signals for count-leading/trailing-zeroes signal inp : std_ulogic_vector(63 downto 0); + signal inp_r : std_ulogic_vector(63 downto 0); signal sum : std_ulogic_vector(64 downto 0); - signal msb_r : std_ulogic; + signal sum_r : std_ulogic_vector(64 downto 0); signal onehot : std_ulogic_vector(63 downto 0); - signal onehot_r : std_ulogic_vector(63 downto 0); + signal edge : std_ulogic_vector(63 downto 0); signal bitnum : std_ulogic_vector(5 downto 0); signal cntz : std_ulogic_vector(63 downto 0); @@ -45,16 +46,36 @@ architecture behaviour of bit_counter is signal pc32 : sixbit2; signal popcnt : std_ulogic_vector(63 downto 0); + function edgelocation(v: std_ulogic_vector; nbits: natural) return std_ulogic_vector is + variable p: std_ulogic_vector(nbits - 1 downto 0); + variable stride: natural; + variable b: std_ulogic; + variable k: natural; + begin + stride := 2; + for i in 0 to nbits - 1 loop + b := '0'; + for j in 0 to (2**nbits / stride) - 1 loop + k := j * stride; + b := b or (v(k + stride - 1) and not v(k + (stride/2) - 1)); + end loop; + p(i) := b; + stride := stride * 2; + end loop; + return p; + end function; + begin countzero_r: process(clk) begin if rising_edge(clk) then - msb_r <= sum(64); - onehot_r <= onehot; + inp_r <= inp; + sum_r <= sum; end if; end process; countzero: process(all) + variable bitnum_e, bitnum_o : std_ulogic_vector(5 downto 0); begin if is_32bit = '0' then if count_right = '0' then @@ -72,12 +93,16 @@ begin end if; sum <= std_ulogic_vector(unsigned('0' & not inp) + 1); - onehot <= sum(63 downto 0) and inp; -- The following occurs after a clock edge - bitnum <= bit_number(onehot_r); + edge <= 
sum_r(63 downto 0) or inp_r; + bitnum_e := edgelocation(edge, 6); + onehot <= sum_r(63 downto 0) and inp_r; + bitnum_o := bit_number(onehot); + bitnum(5 downto 2) <= bitnum_e(5 downto 2); + bitnum(1 downto 0) <= bitnum_o(1 downto 0); - cntz <= 57x"0" & msb_r & bitnum; + cntz <= 57x"0" & sum_r(64) & bitnum; end process; popcnt_r: process(clk) From 1720a0584a66976ba711f26409d79847b7017c9b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 21 Feb 2022 12:06:11 +1100 Subject: [PATCH 4/5] Use alternative count-leading-zeroes algorithm in the FPU and LSU Signed-off-by: Paul Mackerras --- countbits.vhdl | 19 ------------------- helpers.vhdl | 31 ++++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/countbits.vhdl b/countbits.vhdl index ab5be2a..b16baa0 100644 --- a/countbits.vhdl +++ b/countbits.vhdl @@ -46,25 +46,6 @@ architecture behaviour of bit_counter is signal pc32 : sixbit2; signal popcnt : std_ulogic_vector(63 downto 0); - function edgelocation(v: std_ulogic_vector; nbits: natural) return std_ulogic_vector is - variable p: std_ulogic_vector(nbits - 1 downto 0); - variable stride: natural; - variable b: std_ulogic; - variable k: natural; - begin - stride := 2; - for i in 0 to nbits - 1 loop - b := '0'; - for j in 0 to (2**nbits / stride) - 1 loop - k := j * stride; - b := b or (v(k + stride - 1) and not v(k + (stride/2) - 1)); - end loop; - p(i) := b; - stride := stride * 2; - end loop; - return p; - end function; - begin countzero_r: process(clk) begin diff --git a/helpers.vhdl b/helpers.vhdl index 834e386..654c113 100644 --- a/helpers.vhdl +++ b/helpers.vhdl @@ -28,6 +28,7 @@ package helpers is function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector; function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; + function edgelocation(v: std_ulogic_vector; nbits: natural) return std_ulogic_vector; function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector; end package 
helpers; @@ -247,16 +248,44 @@ package body helpers is return ret; end; + -- Assuming the input 'v' is a value of the form 1...10...0, + -- the output is the bit number of the rightmost 1 bit in v. + -- If v is zero, the result is zero. + function edgelocation(v: std_ulogic_vector; nbits: natural) return std_ulogic_vector is + variable p: std_ulogic_vector(nbits - 1 downto 0); + variable stride: natural; + variable b: std_ulogic; + variable k: natural; + begin + stride := 2; + for i in 0 to nbits - 1 loop + b := '0'; + for j in 0 to (2**nbits / stride) - 1 loop + k := j * stride; + b := b or (v(k + stride - 1) and not v(k + (stride/2) - 1)); + end loop; + p(i) := b; + stride := stride * 2; + end loop; + return p; + end function; + -- Count leading zeroes operation -- Assumes the value passed in is not zero (if it is, zero is returned) function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is variable rev: std_ulogic_vector(val'left downto val'right); variable sum: std_ulogic_vector(val'left downto val'right); variable onehot: std_ulogic_vector(val'left downto val'right); + variable edge: std_ulogic_vector(val'left downto val'right); + variable bn, bn_e, bn_o: std_ulogic_vector(5 downto 0); begin rev := bit_reverse(val); sum := std_ulogic_vector(- signed(rev)); onehot := sum and rev; - return bit_number(std_ulogic_vector(resize(unsigned(onehot), 64))); + edge := sum or rev; + bn_e := edgelocation(std_ulogic_vector(resize(signed(edge), 64)), 6); + bn_o := bit_number(std_ulogic_vector(resize(unsigned(onehot), 64))); + bn := bn_e(5 downto 2) & bn_o(1 downto 0); + return bn; end; end package body helpers; From 0aa898c7a6975732f21481487ae44fa7cd5d3503 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 19 Oct 2021 15:13:31 +1100 Subject: [PATCH 5/5] xics: Rework the irq_gen process At present, the loop in the irq_gen process generates a chain of comparators and other logic to work out the source number and priority of the most-favoured (lowest 
priority number) pending interrupt. This replaces that chain with (1) logic to generate an array of bits, one per priority, indicating whether any interrupt is pending at that priority, (2) a priority encoder to select the most favoured priority with an interrupt pending, (3) logic to generate an array of bits, one per source, indicating whether an interrupt is pending at the priority calculated in step 2, and (4) a priority encoder to work out the lowest numbered source that has an interrupt pending at the selected priority. This reduces LUT utilization. The priority encoder function implemented here uses the optimized count-leading-zeroes logic from helpers.vhdl. Signed-off-by: Paul Mackerras --- helpers.vhdl | 21 ++++++++----- xics.vhdl | 83 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 69 insertions(+), 35 deletions(-) diff --git a/helpers.vhdl b/helpers.vhdl index 654c113..bb69927 100644 --- a/helpers.vhdl +++ b/helpers.vhdl @@ -30,6 +30,7 @@ package helpers is function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; function edgelocation(v: std_ulogic_vector; nbits: natural) return std_ulogic_vector; function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector; + function count_right_zeroes(val: std_ulogic_vector) return std_ulogic_vector; end package helpers; package body helpers is @@ -270,22 +271,28 @@ package body helpers is return p; end function; - -- Count leading zeroes operation + -- Count leading zeroes operations -- Assumes the value passed in is not zero (if it is, zero is returned) - function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is - variable rev: std_ulogic_vector(val'left downto val'right); + function count_right_zeroes(val: std_ulogic_vector) return std_ulogic_vector is variable sum: std_ulogic_vector(val'left downto val'right); variable onehot: std_ulogic_vector(val'left downto val'right); variable edge: std_ulogic_vector(val'left downto val'right); 
variable bn, bn_e, bn_o: std_ulogic_vector(5 downto 0); begin - rev := bit_reverse(val); - sum := std_ulogic_vector(- signed(rev)); - onehot := sum and rev; - edge := sum or rev; + sum := std_ulogic_vector(- signed(val)); + onehot := sum and val; + edge := sum or val; bn_e := edgelocation(std_ulogic_vector(resize(signed(edge), 64)), 6); bn_o := bit_number(std_ulogic_vector(resize(unsigned(onehot), 64))); bn := bn_e(5 downto 2) & bn_o(1 downto 0); return bn; end; + + function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is + variable rev: std_ulogic_vector(val'left downto val'right); + begin + rev := bit_reverse(val); + return count_right_zeroes(rev); + end; + end package body helpers; diff --git a/xics.vhdl b/xics.vhdl index 0e3a1da..6daa5d4 100644 --- a/xics.vhdl +++ b/xics.vhdl @@ -54,9 +54,6 @@ architecture behaviour of xics_icp is signal r, r_next : reg_internal_t; - -- hardwire the hardware IRQ priority - constant HW_PRIORITY : std_ulogic_vector(7 downto 0) := x"80"; - -- 8 bit offsets for each presentation constant XIRR_POLL : std_ulogic_vector(7 downto 0) := x"00"; constant XIRR : std_ulogic_vector(7 downto 0) := x"04"; @@ -207,12 +204,14 @@ use ieee.numeric_std.all; library work; use work.common.all; +use work.utils.all; use work.wishbone_types.all; +use work.helpers.all; entity xics_ics is generic ( SRC_NUM : integer range 1 to 256 := 16; - PRIO_BITS : integer range 1 to 8 := 8 + PRIO_BITS : integer range 1 to 8 := 3 ); port ( clk : in std_logic; @@ -228,12 +227,16 @@ end xics_ics; architecture rtl of xics_ics is + constant SRC_NUM_BITS : natural := log2(SRC_NUM); + subtype pri_t is std_ulogic_vector(PRIO_BITS-1 downto 0); type xive_t is record pri : pri_t; end record; constant pri_masked : pri_t := (others => '1'); + subtype pri_vector_t is std_ulogic_vector(2**PRIO_BITS - 1 downto 0); + type xive_array_t is array(0 to SRC_NUM-1) of xive_t; signal xives : xive_array_t; @@ -262,8 +265,15 @@ architecture rtl of xics_ics is end 
function; function prio_pack(pri8: std_ulogic_vector(7 downto 0)) return pri_t is + variable masked : std_ulogic_vector(7 downto 0); begin - return pri8(PRIO_BITS-1 downto 0); + masked := x"00"; + masked(PRIO_BITS - 1 downto 0) := (others => '1'); + if unsigned(pri8) >= unsigned(masked) then -- explicit numeric compare: values at or above the all-ones code saturate to pri_masked + return pri_masked; + else + return pri8(PRIO_BITS-1 downto 0); + end if; end function; function prio_unpack(pri: pri_t) return std_ulogic_vector is @@ -276,8 +286,27 @@ architecture rtl of xics_ics is r(PRIO_BITS-1 downto 0) := pri; end if; return r; - end function; + end function; + function prio_decode(pri: pri_t) return pri_vector_t is + variable v: pri_vector_t; + begin + v := (others => '0'); + v(to_integer(unsigned(pri))) := '1'; + return v; + end function; + + -- Assumes nbits <= 6; v is 2^nbits wide + function priority_encoder(v: std_ulogic_vector; nbits: natural) return std_ulogic_vector is + variable h: std_ulogic_vector(2**nbits - 1 downto 0); + variable p: std_ulogic_vector(5 downto 0); + begin + -- Set the lowest-priority (highest-numbered) bit + h := v; + h(2**nbits - 1) := '1'; + p := count_right_zeroes(h); + return p(nbits - 1 downto 0); + end function; -- Register map -- 0 : Config @@ -391,35 +420,33 @@ begin end process; irq_gen: process(all) - variable max_idx : integer range 0 to SRC_NUM-1; + variable max_idx : std_ulogic_vector(SRC_NUM_BITS - 1 downto 0); variable max_pri : pri_t; - - -- A more favored than b ? 
- function a_mf_b(a: pri_t; b: pri_t) return boolean is - variable a_i : unsigned(PRIO_BITS-1 downto 0); - variable b_i : unsigned(PRIO_BITS-1 downto 0); - begin - a_i := unsigned(a); - b_i := unsigned(b); - report "a_mf_b a=" & to_hstring(a) & - " b=" & to_hstring(b) & - " r=" & boolean'image(a < b); - return a_i < b_i; - end function; + variable pending_pri : pri_vector_t; + variable pending_at_pri : std_ulogic_vector(SRC_NUM - 1 downto 0); begin - -- XXX FIXME: Use a tree - max_pri := pri_masked; - max_idx := 0; + -- Work out the most-favoured (lowest) priority of the pending interrupts + pending_pri := (others => '0'); for i in 0 to SRC_NUM - 1 loop - if int_level_l(i) = '1' and a_mf_b(xives(i).pri, max_pri) then - max_pri := xives(i).pri; - max_idx := i; + if int_level_l(i) = '1' then + pending_pri := pending_pri or prio_decode(xives(i).pri); end if; end loop; + max_pri := priority_encoder(pending_pri, PRIO_BITS); + + -- Work out which interrupts are pending at that priority + pending_at_pri := (others => '0'); + for i in 0 to SRC_NUM - 1 loop + if int_level_l(i) = '1' and xives(i).pri = max_pri then + pending_at_pri(i) := '1'; + end if; + end loop; + max_idx := priority_encoder(pending_at_pri, SRC_NUM_BITS); + if max_pri /= pri_masked then - report "MFI: " & integer'image(max_idx) & " pri=" & to_hstring(prio_unpack(max_pri)); + report "MFI: " & integer'image(to_integer(unsigned(max_idx))) & " pri=" & to_hstring(prio_unpack(max_pri)); end if; - icp_out_next.src <= std_ulogic_vector(to_unsigned(max_idx, 4)); + icp_out_next.src <= max_idx; icp_out_next.pri <= prio_unpack(max_pri); end process;