From 559b3bcf2d700cb189f5fa3a78639c09baae88ff Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 09:21:42 +1100 Subject: [PATCH 01/21] pp_uart: reformat Signed-off-by: Benjamin Herrenschmidt --- fpga/pp_soc_uart.vhd | 630 ++++++++++++++++++++++--------------------- 1 file changed, 316 insertions(+), 314 deletions(-) diff --git a/fpga/pp_soc_uart.vhd b/fpga/pp_soc_uart.vhd index 1d5c629..879ea26 100644 --- a/fpga/pp_soc_uart.vhd +++ b/fpga/pp_soc_uart.vhd @@ -34,351 +34,353 @@ use ieee.numeric_std.all; --! - Bit 0: data received (receive buffer not empty) --! - Bit 1: ready to send data (transmit buffer empty) entity pp_soc_uart is - generic( - FIFO_DEPTH : natural := 64 --! Depth of the input and output FIFOs. + generic( + FIFO_DEPTH : natural := 64 --! Depth of the input and output FIFOs. ); - port( - clk : in std_logic; - reset : in std_logic; - - -- UART ports: - txd : out std_logic; - rxd : in std_logic; - - -- Interrupt signal: - irq : out std_logic; - - -- Wishbone ports: - wb_adr_in : in std_logic_vector(11 downto 0); - wb_dat_in : in std_logic_vector( 7 downto 0); - wb_dat_out : out std_logic_vector( 7 downto 0); - wb_we_in : in std_logic; - wb_cyc_in : in std_logic; - wb_stb_in : in std_logic; - wb_ack_out : out std_logic + port( + clk : in std_logic; + reset : in std_logic; + + -- UART ports: + txd : out std_logic; + rxd : in std_logic; + + -- Interrupt signal: + irq : out std_logic; + + -- Wishbone ports: + wb_adr_in : in std_logic_vector(11 downto 0); + wb_dat_in : in std_logic_vector( 7 downto 0); + wb_dat_out : out std_logic_vector( 7 downto 0); + wb_we_in : in std_logic; + wb_cyc_in : in std_logic; + wb_stb_in : in std_logic; + wb_ack_out : out std_logic ); end entity pp_soc_uart; architecture behaviour of pp_soc_uart is - subtype bitnumber is natural range 0 to 7; --! Type representing the index of a bit. + subtype bitnumber is natural range 0 to 7; --! Type representing the index of a bit. 
- -- UART sample clock signals: - signal sample_clk : std_logic; - signal sample_clk_divisor : std_logic_vector(7 downto 0); - signal sample_clk_counter : std_logic_vector(sample_clk_divisor'range); + -- UART sample clock signals: + signal sample_clk : std_logic; + signal sample_clk_divisor : std_logic_vector(7 downto 0); + signal sample_clk_counter : std_logic_vector(sample_clk_divisor'range); - -- UART receive process signals: - type rx_state_type is (IDLE, RECEIVE, STARTBIT, STOPBIT); - signal rx_state : rx_state_type; - signal rx_byte : std_logic_vector(7 downto 0); - signal rx_current_bit : bitnumber; + -- UART receive process signals: + type rx_state_type is (IDLE, RECEIVE, STARTBIT, STOPBIT); + signal rx_state : rx_state_type; + signal rx_byte : std_logic_vector(7 downto 0); + signal rx_current_bit : bitnumber; - subtype rx_sample_counter_type is natural range 0 to 15; - signal rx_sample_counter : rx_sample_counter_type; - signal rx_sample_value : rx_sample_counter_type; + subtype rx_sample_counter_type is natural range 0 to 15; + signal rx_sample_counter : rx_sample_counter_type; + signal rx_sample_value : rx_sample_counter_type; - subtype rx_sample_delay_type is natural range 0 to 7; - signal rx_sample_delay : rx_sample_delay_type; + subtype rx_sample_delay_type is natural range 0 to 7; + signal rx_sample_delay : rx_sample_delay_type; - -- UART transmit process signals: - type tx_state_type is (IDLE, TRANSMIT, STOPBIT); - signal tx_state : tx_state_type; - signal tx_byte : std_logic_vector(7 downto 0); - signal tx_current_bit : bitnumber; + -- UART transmit process signals: + type tx_state_type is (IDLE, TRANSMIT, STOPBIT); + signal tx_state : tx_state_type; + signal tx_byte : std_logic_vector(7 downto 0); + signal tx_current_bit : bitnumber; - -- UART transmit clock: - subtype uart_tx_counter_type is natural range 0 to 15; - signal uart_tx_counter : uart_tx_counter_type := 0; - signal uart_tx_clk : std_logic; + -- UART transmit clock: + subtype 
uart_tx_counter_type is natural range 0 to 15; + signal uart_tx_counter : uart_tx_counter_type := 0; + signal uart_tx_clk : std_logic; - -- Buffer signals: - signal send_buffer_full, send_buffer_empty : std_logic; - signal recv_buffer_full, recv_buffer_empty : std_logic; - signal send_buffer_input, send_buffer_output : std_logic_vector(7 downto 0); - signal recv_buffer_input, recv_buffer_output : std_logic_vector(7 downto 0); - signal send_buffer_push, send_buffer_pop : std_logic := '0'; - signal recv_buffer_push, recv_buffer_pop : std_logic := '0'; + -- Buffer signals: + signal send_buffer_full, send_buffer_empty : std_logic; + signal recv_buffer_full, recv_buffer_empty : std_logic; + signal send_buffer_input, send_buffer_output : std_logic_vector(7 downto 0); + signal recv_buffer_input, recv_buffer_output : std_logic_vector(7 downto 0); + signal send_buffer_push, send_buffer_pop : std_logic := '0'; + signal recv_buffer_push, recv_buffer_pop : std_logic := '0'; - -- IRQ enable signals: - signal irq_recv_enable, irq_tx_ready_enable : std_logic := '0'; + -- IRQ enable signals: + signal irq_recv_enable, irq_tx_ready_enable : std_logic := '0'; - -- Wishbone signals: - type wb_state_type is (IDLE, WRITE_ACK, READ_ACK); - signal wb_state : wb_state_type; + -- Wishbone signals: + type wb_state_type is (IDLE, WRITE_ACK, READ_ACK); + signal wb_state : wb_state_type; - signal wb_ack : std_logic; --! Wishbone acknowledge signal + signal wb_ack : std_logic; --! 
Wishbone acknowledge signal begin - irq <= (irq_recv_enable and (not recv_buffer_empty)) - or (irq_tx_ready_enable and send_buffer_empty); - - ---------- UART receive ---------- - - recv_buffer_input <= rx_byte; - - uart_receive: process(clk) - begin - if rising_edge(clk) then - if reset = '1' then - rx_state <= IDLE; - recv_buffer_push <= '0'; + irq <= (irq_recv_enable and (not recv_buffer_empty)) + or (irq_tx_ready_enable and send_buffer_empty); + + ---------- UART receive ---------- + + recv_buffer_input <= rx_byte; + + uart_receive: process(clk) + begin + if rising_edge(clk) then + if reset = '1' then + rx_state <= IDLE; + recv_buffer_push <= '0'; + else + case rx_state is + when IDLE => + if recv_buffer_push = '1' then + recv_buffer_push <= '0'; + end if; + + if sample_clk = '1' and rxd = '0' then + rx_sample_value <= rx_sample_counter; + rx_sample_delay <= 0; + rx_current_bit <= 0; + rx_state <= STARTBIT; + end if; + when STARTBIT => + if sample_clk = '1' then + if rx_sample_delay = 7 then + rx_state <= RECEIVE; + rx_sample_value <= rx_sample_counter; + rx_sample_delay <= 0; else - case rx_state is - when IDLE => - if recv_buffer_push = '1' then - recv_buffer_push <= '0'; - end if; - - if sample_clk = '1' and rxd = '0' then - rx_sample_value <= rx_sample_counter; - rx_sample_delay <= 0; - rx_current_bit <= 0; - rx_state <= STARTBIT; - end if; - when STARTBIT => - if sample_clk = '1' then - if rx_sample_delay = 7 then - rx_state <= RECEIVE; - rx_sample_value <= rx_sample_counter; - rx_sample_delay <= 0; - else - rx_sample_delay <= rx_sample_delay + 1; - end if; - end if; - when RECEIVE => - if sample_clk = '1' and rx_sample_counter = rx_sample_value then - if rx_current_bit /= 7 then - rx_byte(rx_current_bit) <= rxd; - rx_current_bit <= rx_current_bit + 1; - else - rx_byte(rx_current_bit) <= rxd; - rx_state <= STOPBIT; - end if; - end if; - when STOPBIT => - if sample_clk = '1' and rx_sample_counter = rx_sample_value then - rx_state <= IDLE; - - if 
recv_buffer_full = '0' then - recv_buffer_push <= '1'; - end if; - end if; - end case; + rx_sample_delay <= rx_sample_delay + 1; end if; - end if; - end process uart_receive; - - sample_counter: process(clk) - begin - if rising_edge(clk) then - if reset = '1' then - rx_sample_counter <= 0; - elsif sample_clk = '1' then - if rx_sample_counter = 15 then - rx_sample_counter <= 0; - else - rx_sample_counter <= rx_sample_counter + 1; - end if; + end if; + when RECEIVE => + if sample_clk = '1' and rx_sample_counter = rx_sample_value then + if rx_current_bit /= 7 then + rx_byte(rx_current_bit) <= rxd; + rx_current_bit <= rx_current_bit + 1; + else + rx_byte(rx_current_bit) <= rxd; + rx_state <= STOPBIT; end if; - end if; - end process sample_counter; - - ---------- UART transmit ---------- + end if; + when STOPBIT => + if sample_clk = '1' and rx_sample_counter = rx_sample_value then + rx_state <= IDLE; - tx_byte <= send_buffer_output; - - uart_transmit: process(clk) - begin - if rising_edge(clk) then - if reset = '1' then - txd <= '1'; - tx_state <= IDLE; - send_buffer_pop <= '0'; - tx_current_bit <= 0; - else - case tx_state is - when IDLE => - if send_buffer_empty = '0' and uart_tx_clk = '1' then - txd <= '0'; - send_buffer_pop <= '1'; - tx_current_bit <= 0; - tx_state <= TRANSMIT; - elsif uart_tx_clk = '1' then - txd <= '1'; - end if; - when TRANSMIT => - if send_buffer_pop = '1' then - send_buffer_pop <= '0'; - elsif uart_tx_clk = '1' and tx_current_bit = 7 then - txd <= tx_byte(tx_current_bit); - tx_state <= STOPBIT; - elsif uart_tx_clk = '1' then - txd <= tx_byte(tx_current_bit); - tx_current_bit <= tx_current_bit + 1; - end if; - when STOPBIT => - if uart_tx_clk = '1' then - txd <= '1'; - tx_state <= IDLE; - end if; - end case; + if recv_buffer_full = '0' then + recv_buffer_push <= '1'; end if; + end if; + end case; + end if; + end if; + end process uart_receive; + + sample_counter: process(clk) + begin + if rising_edge(clk) then + if reset = '1' then + 
rx_sample_counter <= 0; + elsif sample_clk = '1' then + if rx_sample_counter = 15 then + rx_sample_counter <= 0; + else + rx_sample_counter <= rx_sample_counter + 1; end if; - end process uart_transmit; - - uart_tx_clock_generator: process(clk) - begin - if rising_edge(clk) then - if reset = '1' then - uart_tx_counter <= 0; - uart_tx_clk <= '0'; - else - if sample_clk = '1' then - if uart_tx_counter = 15 then - uart_tx_counter <= 0; - uart_tx_clk <= '1'; - else - uart_tx_counter <= uart_tx_counter + 1; - uart_tx_clk <= '0'; - end if; - else - uart_tx_clk <= '0'; - end if; - end if; + end if; + end if; + end process sample_counter; + + ---------- UART transmit ---------- + + tx_byte <= send_buffer_output; + + uart_transmit: process(clk) + begin + if rising_edge(clk) then + if reset = '1' then + txd <= '1'; + tx_state <= IDLE; + send_buffer_pop <= '0'; + tx_current_bit <= 0; + else + case tx_state is + when IDLE => + if send_buffer_empty = '0' and uart_tx_clk = '1' then + txd <= '0'; + send_buffer_pop <= '1'; + tx_current_bit <= 0; + tx_state <= TRANSMIT; + elsif uart_tx_clk = '1' then + txd <= '1'; + end if; + when TRANSMIT => + if send_buffer_pop = '1' then + send_buffer_pop <= '0'; + elsif uart_tx_clk = '1' and tx_current_bit = 7 then + txd <= tx_byte(tx_current_bit); + tx_state <= STOPBIT; + elsif uart_tx_clk = '1' then + txd <= tx_byte(tx_current_bit); + tx_current_bit <= tx_current_bit + 1; + end if; + when STOPBIT => + if uart_tx_clk = '1' then + txd <= '1'; + tx_state <= IDLE; + end if; + end case; + end if; + end if; + end process uart_transmit; + + uart_tx_clock_generator: process(clk) + begin + if rising_edge(clk) then + if reset = '1' then + uart_tx_counter <= 0; + uart_tx_clk <= '0'; + else + if sample_clk = '1' then + if uart_tx_counter = 15 then + uart_tx_counter <= 0; + uart_tx_clk <= '1'; + else + uart_tx_counter <= uart_tx_counter + 1; + uart_tx_clk <= '0'; + end if; + else + uart_tx_clk <= '0'; end if; - end process uart_tx_clock_generator; - - 
---------- Sample clock generator ---------- - - sample_clock_generator: process(clk) - begin - if rising_edge(clk) then - if reset = '1' then - sample_clk_counter <= (others => '0'); - sample_clk <= '0'; - else - if sample_clk_divisor /= x"00" then - if sample_clk_counter = sample_clk_divisor then - sample_clk_counter <= (others => '0'); - sample_clk <= '1'; - else - sample_clk_counter <= std_logic_vector(unsigned(sample_clk_counter) + 1); - sample_clk <= '0'; - end if; - end if; - end if; + end if; + end if; + end process uart_tx_clock_generator; + + ---------- Sample clock generator ---------- + + sample_clock_generator: process(clk) + begin + if rising_edge(clk) then + if reset = '1' then + sample_clk_counter <= (others => '0'); + sample_clk <= '0'; + else + if sample_clk_divisor /= x"00" then + if sample_clk_counter = sample_clk_divisor then + sample_clk_counter <= (others => '0'); + sample_clk <= '1'; + else + sample_clk_counter <= std_logic_vector(unsigned(sample_clk_counter) + 1); + sample_clk <= '0'; + end if; end if; - end process sample_clock_generator; - - ---------- Data Buffers ---------- - - send_buffer: entity work.pp_fifo - generic map( - DEPTH => FIFO_DEPTH, - WIDTH => 8 - ) port map( - clk => clk, - reset => reset, - full => send_buffer_full, - empty => send_buffer_empty, - data_in => send_buffer_input, - data_out => send_buffer_output, - push => send_buffer_push, - pop => send_buffer_pop + end if; + end if; + end process sample_clock_generator; + + ---------- Data Buffers ---------- + + send_buffer: entity work.pp_fifo + generic map( + DEPTH => FIFO_DEPTH, + WIDTH => 8 + ) port map( + clk => clk, + reset => reset, + full => send_buffer_full, + empty => send_buffer_empty, + data_in => send_buffer_input, + data_out => send_buffer_output, + push => send_buffer_push, + pop => send_buffer_pop ); - recv_buffer: entity work.pp_fifo - generic map( - DEPTH => FIFO_DEPTH, - WIDTH => 8 - ) port map( - clk => clk, - reset => reset, - full => 
recv_buffer_full, - empty => recv_buffer_empty, - data_in => recv_buffer_input, - data_out => recv_buffer_output, - push => recv_buffer_push, - pop => recv_buffer_pop + recv_buffer: entity work.pp_fifo + generic map( + DEPTH => FIFO_DEPTH, + WIDTH => 8 + ) port map( + clk => clk, + reset => reset, + full => recv_buffer_full, + empty => recv_buffer_empty, + data_in => recv_buffer_input, + data_out => recv_buffer_output, + push => recv_buffer_push, + pop => recv_buffer_pop ); - ---------- Wishbone Interface ---------- - - wb_ack_out <= wb_ack and wb_cyc_in and wb_stb_in; - - wishbone: process(clk) - begin - if rising_edge(clk) then - if reset = '1' then - wb_ack <= '0'; - wb_state <= IDLE; - send_buffer_push <= '0'; - recv_buffer_pop <= '0'; - sample_clk_divisor <= (others => '0'); - irq_recv_enable <= '0'; - irq_tx_ready_enable <= '0'; - else - case wb_state is - when IDLE => - if wb_cyc_in = '1' and wb_stb_in = '1' then - if wb_we_in = '1' then -- Write to register - if wb_adr_in = x"000" then - send_buffer_input <= wb_dat_in; - send_buffer_push <= '1'; - elsif wb_adr_in = x"018" then - sample_clk_divisor <= wb_dat_in; - elsif wb_adr_in = x"020" then - irq_recv_enable <= wb_dat_in(0); - irq_tx_ready_enable <= wb_dat_in(1); - end if; - - -- Invalid writes are acked and ignored. 
- - wb_ack <= '1'; - wb_state <= WRITE_ACK; - else -- Read from register - if wb_adr_in = x"008" then - recv_buffer_pop <= '1'; - elsif wb_adr_in = x"010" then - wb_dat_out <= x"0" & send_buffer_full & recv_buffer_full & send_buffer_empty & recv_buffer_empty; - wb_ack <= '1'; - elsif wb_adr_in = x"018" then - wb_dat_out <= sample_clk_divisor; - wb_ack <= '1'; - elsif wb_adr_in = x"020" then - wb_dat_out <= (0 => irq_recv_enable, 1 => irq_tx_ready_enable, others => '0'); - wb_ack <= '1'; - else - wb_dat_out <= (others => '0'); - wb_ack <= '1'; - end if; - wb_state <= READ_ACK; - end if; - end if; - when WRITE_ACK => - send_buffer_push <= '0'; - - if wb_stb_in = '0' then - wb_ack <= '0'; - wb_state <= IDLE; - end if; - when READ_ACK => - if recv_buffer_pop = '1' then - recv_buffer_pop <= '0'; - else - wb_dat_out <= recv_buffer_output; - wb_ack <= '1'; - end if; - - if wb_stb_in = '0' then - wb_ack <= '0'; - wb_state <= IDLE; - end if; - end case; + ---------- Wishbone Interface ---------- + + wb_ack_out <= wb_ack and wb_cyc_in and wb_stb_in; + + wishbone: process(clk) + begin + if rising_edge(clk) then + if reset = '1' then + wb_ack <= '0'; + wb_state <= IDLE; + send_buffer_push <= '0'; + recv_buffer_pop <= '0'; + sample_clk_divisor <= (others => '0'); + irq_recv_enable <= '0'; + irq_tx_ready_enable <= '0'; + else + case wb_state is + when IDLE => + if wb_cyc_in = '1' and wb_stb_in = '1' then + if wb_we_in = '1' then -- Write to register + if wb_adr_in = x"000" then + send_buffer_input <= wb_dat_in; + send_buffer_push <= '1'; + elsif wb_adr_in = x"018" then + sample_clk_divisor <= wb_dat_in; + elsif wb_adr_in = x"020" then + irq_recv_enable <= wb_dat_in(0); + irq_tx_ready_enable <= wb_dat_in(1); + end if; + + -- Invalid writes are acked and ignored. 
+ wb_ack <= '1'; + wb_state <= WRITE_ACK; + else -- Read from register + if wb_adr_in = x"008" then + recv_buffer_pop <= '1'; + elsif wb_adr_in = x"010" then + wb_dat_out <= x"0" & send_buffer_full & recv_buffer_full & + send_buffer_empty & recv_buffer_empty; + wb_ack <= '1'; + elsif wb_adr_in = x"018" then + wb_dat_out <= sample_clk_divisor; + wb_ack <= '1'; + elsif wb_adr_in = x"020" then + wb_dat_out <= (0 => irq_recv_enable, + 1 => irq_tx_ready_enable, + others => '0'); + wb_ack <= '1'; + else + wb_dat_out <= (others => '0'); + wb_ack <= '1'; + end if; + wb_state <= READ_ACK; end if; - end if; - end process wishbone; + end if; + when WRITE_ACK => + send_buffer_push <= '0'; + + if wb_stb_in = '0' then + wb_ack <= '0'; + wb_state <= IDLE; + end if; + when READ_ACK => + if recv_buffer_pop = '1' then + recv_buffer_pop <= '0'; + else + wb_dat_out <= recv_buffer_output; + wb_ack <= '1'; + end if; + + if wb_stb_in = '0' then + wb_ack <= '0'; + wb_state <= IDLE; + end if; + end case; + end if; + end if; + end process wishbone; end architecture behaviour; From 79101041d6ea581cefdbabdf9fe96b1c4f76fcd5 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 17 Oct 2019 20:07:18 +1100 Subject: [PATCH 02/21] wishbone: Add stall signal Pipelined wishbone needs it Signed-off-by: Benjamin Herrenschmidt --- wishbone_types.vhdl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/wishbone_types.vhdl b/wishbone_types.vhdl index 12f0bc7..1b8a28b 100644 --- a/wishbone_types.vhdl +++ b/wishbone_types.vhdl @@ -21,9 +21,10 @@ package wishbone_types is constant wishbone_master_out_init : wishbone_master_out := (cyc => '0', stb => '0', we => '0', others => (others => '0')); type wishbone_slave_out is record - dat : wishbone_data_type; - ack : std_ulogic; + dat : wishbone_data_type; + ack : std_ulogic; + stall : std_ulogic; end record; - constant wishbone_slave_out_init : wishbone_slave_out := (ack => '0', others => (others => '0')); + constant 
wishbone_slave_out_init : wishbone_slave_out := (ack => '0', stall => '0', others => (others => '0')); end package wishbone_types; From b1424e859e878a881dc4525c1b5da5c9f692aa3f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 10:26:09 +1100 Subject: [PATCH 03/21] icache_tb: Initialize stop_mark Too much red in gtkwave.. Signed-off-by: Benjamin Herrenschmidt --- icache_tb.vhdl | 1 + 1 file changed, 1 insertion(+) diff --git a/icache_tb.vhdl b/icache_tb.vhdl index a82912e..a50cabe 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -68,6 +68,7 @@ begin begin i_out.req <= '0'; i_out.nia <= (others => '0'); + i_out.stop_mark <= '0'; wait for 4*clk_period; From 7a4a9b6377cb04438ab02a25fe167b7e026b01fb Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 10:27:02 +1100 Subject: [PATCH 04/21] wb_arbiter: Forward stall signals They are set to '1' for non-selected devices Signed-off-by: Benjamin Herrenschmidt --- wishbone_arbiter.vhdl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wishbone_arbiter.vhdl b/wishbone_arbiter.vhdl index d839b31..8e2358b 100644 --- a/wishbone_arbiter.vhdl +++ b/wishbone_arbiter.vhdl @@ -43,6 +43,9 @@ begin wb1_out.ack <= wb_in.ack when state = WB1_BUSY else '0'; wb2_out.ack <= wb_in.ack when state = WB2_BUSY else '0'; wb3_out.ack <= wb_in.ack when state = WB3_BUSY else '0'; + wb1_out.stall <= wb_in.stall when state = WB1_BUSY else '1'; + wb2_out.stall <= wb_in.stall when state = WB2_BUSY else '1'; + wb3_out.stall <= wb_in.stall when state = WB3_BUSY else '1'; end process; wishbone_arbiter_process: process(clk) From df1a9237f6ae06414ec93eda3adfc147756ed3fd Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 10:27:56 +1100 Subject: [PATCH 05/21] intercon: Generate stall signals for non-pipelined slaves So far the UART and the "miss" case. 
Memory will be pipelined Signed-off-by: Benjamin Herrenschmidt --- soc.vhdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/soc.vhdl b/soc.vhdl index 458a751..950d0dd 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -136,6 +136,7 @@ begin when others => wb_master_in.dat <= (others => '1'); wb_master_in.ack <= wb_master_out.stb and wb_master_out.cyc; + wb_master_in.stall <= '0'; end case; end process slave_intercon; @@ -164,6 +165,7 @@ begin wb_ack_out => wb_uart0_out.ack ); wb_uart0_out.dat <= x"00000000000000" & uart_dat8; + wb_uart0_out.stall <= '0' when wb_uart0_in.cyc = '0' else not wb_uart0_out.ack; -- BRAM Memory slave bram0: entity work.mw_soc_memory From 37acb35773e10f96907f6feda6a7c1922a5baf08 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 10:30:39 +1100 Subject: [PATCH 06/21] simple_ram: Add pipelining support The generic PIPELINE_DEPTH can be set to 0 to keep it operating as a non-pipelined slave, or a larger value indicating the amount of extra cycles between requests and acks. It will always generate a valid stall signal, so it can be used in either mode with a pipelined master (but only in non-pipelined mode with a non-pipelined master). 
Signed-off-by: Benjamin Herrenschmidt --- simple_ram_behavioural.vhdl | 84 ++++++++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 15 deletions(-) diff --git a/simple_ram_behavioural.vhdl b/simple_ram_behavioural.vhdl index 0f6a90a..64135b8 100644 --- a/simple_ram_behavioural.vhdl +++ b/simple_ram_behavioural.vhdl @@ -9,8 +9,9 @@ use work.simple_ram_behavioural_helpers.all; entity mw_soc_memory is generic ( - RAM_INIT_FILE : string; - MEMORY_SIZE : integer + RAM_INIT_FILE : string; + MEMORY_SIZE : integer; + PIPELINE_DEPTH : integer := 0 ); port ( @@ -29,48 +30,101 @@ architecture behave of mw_soc_memory is signal ret_ack : std_ulogic := '0'; signal identifier : integer := behavioural_initialize(filename => RAM_INIT_FILE, size => MEMORY_SIZE); signal reload : integer := 0; + signal ret_dat : wishbone_data_type; + + subtype pipe_idx_t is integer range 0 to PIPELINE_DEPTH-1; + type pipe_ack_t is array(pipe_idx_t) of std_ulogic; + type pipe_dat_t is array(pipe_idx_t) of wishbone_data_type; begin - wishbone_process: process(clk) - variable ret_dat: std_ulogic_vector(63 downto 0) := (others => '0'); - variable adr: std_ulogic_vector(63 downto 0); + + pipe_big: if PIPELINE_DEPTH > 1 generate + signal pipe_ack : pipe_ack_t; + signal pipe_dat : pipe_dat_t; + begin + wishbone_out.stall <= '0'; + wishbone_out.ack <= pipe_ack(0); + wishbone_out.dat <= pipe_dat(0); + + pipe_big_sync: process(clk) + begin + if rising_edge(clk) then + pipe_stages: for i in 0 to PIPELINE_DEPTH-2 loop + pipe_ack(i) <= pipe_ack(i+1); + pipe_dat(i) <= pipe_dat(i+1); + end loop; + pipe_ack(PIPELINE_DEPTH-1) <= ret_ack; + pipe_dat(PIPELINE_DEPTH-1) <= ret_dat; + end if; + end process; + end generate; + + pipe_one: if PIPELINE_DEPTH = 1 generate + signal pipe_ack : std_ulogic; + signal pipe_dat : wishbone_data_type; begin - wishbone_out.ack <= ret_ack and wishbone_in.cyc and wishbone_in.stb; - wishbone_out.dat <= ret_dat; + wishbone_out.stall <= '0'; + wishbone_out.ack <= pipe_ack; + 
wishbone_out.dat <= pipe_dat; + + pipe_one_sync: process(clk) + begin + if rising_edge(clk) then + pipe_ack <= ret_ack; + pipe_dat <= ret_dat; + end if; + end process; + end generate; + pipe_none: if PIPELINE_DEPTH = 0 generate + begin + wishbone_out.ack <= ret_ack; + wishbone_out.dat <= ret_dat; + wishbone_out.stall <= wishbone_in.cyc and not ret_ack; + end generate; + + wishbone_process: process(clk) + variable ret_dat_v : wishbone_data_type; + variable adr : std_ulogic_vector(63 downto 0); + begin if rising_edge(clk) then if rst = '1' then state <= IDLE; ret_ack <= '0'; else - ret_dat := x"FFFFFFFFFFFFFFFF"; + ret_dat <= x"FFFFFFFFFFFFFFFF"; + ret_ack <= '0'; -- Active if wishbone_in.cyc = '1' then case state is when IDLE => if wishbone_in.stb = '1' then + adr := (wishbone_in.adr'left downto 0 => wishbone_in.adr, + others => '0'); -- write - adr := (wishbone_in.adr'left downto 0 => wishbone_in.adr, others => '0'); if wishbone_in.we = '1' then assert not(is_x(wishbone_in.dat)) and not(is_x(wishbone_in.adr)) severity failure; report "RAM writing " & to_hstring(wishbone_in.dat) & " to " & to_hstring(wishbone_in.adr); behavioural_write(wishbone_in.dat, adr, to_integer(unsigned(wishbone_in.sel)), identifier); reload <= reload + 1; ret_ack <= '1'; - state <= ACK; + if PIPELINE_DEPTH = 0 then + state <= ACK; + end if; else - behavioural_read(ret_dat, adr, to_integer(unsigned(wishbone_in.sel)), identifier, reload); - report "RAM reading from " & to_hstring(wishbone_in.adr) & " returns " & to_hstring(ret_dat); + behavioural_read(ret_dat_v, adr, to_integer(unsigned(wishbone_in.sel)), identifier, reload); + report "RAM reading from " & to_hstring(wishbone_in.adr) & " returns " & to_hstring(ret_dat_v); + ret_dat <= ret_dat_v; ret_ack <= '1'; - state <= ACK; + if PIPELINE_DEPTH = 0 then + state <= ACK; + end if; end if; end if; when ACK => - ret_ack <= '0'; state <= IDLE; end case; else - ret_ack <= '0'; state <= IDLE; end if; end if; From 
e638c3e8ae9ca95c2dc5c831eb5fd7f72b173825 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 21:22:33 +1100 Subject: [PATCH 07/21] fpga/bram: Generate stall signal This doesn't yet pipeline the block RAM, just generate a valid stall signal so it's compatible with a pipelined master Signed-off-by: Benjamin Herrenschmidt --- fpga/mw_soc_memory.vhdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fpga/mw_soc_memory.vhdl b/fpga/mw_soc_memory.vhdl index e9ace36..af31c89 100644 --- a/fpga/mw_soc_memory.vhdl +++ b/fpga/mw_soc_memory.vhdl @@ -65,7 +65,8 @@ begin wb_adr_in <= wishbone_in.adr(log2(MEMORY_SIZE) - 1 downto 0); - wishbone_out.ack <= read_ack and wishbone_in.stb; + wishbone_out.ack <= read_ack and wishbone_in.cyc and wishbone_in.stb; + wishbone_out.stall <= '0' when wishbone_in.cyc = '0' else not wishbone_out.ack; memory_0: process(clk) begin From d363daa6928f50f869ff65f6c3b19b2725790334 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 10:32:46 +1100 Subject: [PATCH 08/21] dcache: Add wishbone pipelining support Signed-off-by: Benjamin Herrenschmidt --- dcache.vhdl | 104 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 21 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index 7657dbd..f12fd35 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -187,6 +187,7 @@ architecture rtl of dcache is state : state_t; wb : wishbone_master_out; store_way : way_t; + store_row : row_t; store_index : index_t; end record; @@ -213,6 +214,7 @@ architecture rtl of dcache is signal req_hit_way : way_t; signal req_tag : cache_tag_t; signal req_op : op_t; + signal req_laddr : std_ulogic_vector(63 downto 0); -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; @@ -244,12 +246,21 @@ architecture rtl of dcache is end; -- Returns whether this is the last row of a line - function is_last_row(addr: wishbone_addr_type) return boolean is + function 
is_last_row_addr(addr: wishbone_addr_type) return boolean is constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); begin return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; end; + -- Returns whether this is the last row of a line + function is_last_row(row: row_t) return boolean is + variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); + constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + begin + row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); + return row_v(ROW_LINEBITS-1 downto 0) = ones; + end; + -- Return the address of the next row in the current cache line function next_row_addr(addr: wishbone_addr_type) return std_ulogic_vector is variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0); @@ -263,6 +274,21 @@ architecture rtl of dcache is return result; end; + -- Return the next row in the current cache line. We use a dedicated + -- function in order to limit the size of the generated adder to be + -- only the bits within a cache line (3 bits with default settings) + -- + function next_row(row: row_t) return row_t is + variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); + variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0); + variable result : std_ulogic_vector(ROW_BITS-1 downto 0); + begin + row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); + row_idx := row_v(ROW_LINEBITS-1 downto 0); + row_v(ROW_LINEBITS-1 downto 0) := std_ulogic_vector(unsigned(row_idx) + 1); + return to_integer(unsigned(row_v)); + end; + -- Get the tag value from the address function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is begin @@ -381,6 +407,12 @@ begin req_row <= get_row(d_in.addr); req_tag <= get_tag(d_in.addr); + -- Calculate address of beginning of cache line, will be + -- used for cache miss processing if needed + -- + req_laddr <= d_in.addr(63 downto LINE_OFF_BITS) & + (LINE_OFF_BITS-1 downto 0 => '0'); + -- Test if pending request is a hit on any way 
hit_way := 0; is_hit := '0'; @@ -573,7 +605,8 @@ begin wr_data => wr_data ); process(all) - variable tmp_adr : std_ulogic_vector(63 downto 0); + variable tmp_adr : std_ulogic_vector(63 downto 0); + variable reloading : boolean; begin -- Cache hit reads do_read <= '1'; @@ -596,17 +629,17 @@ begin -- Otherwise, we might be doing a reload wr_data <= wishbone_in.dat; wr_sel <= (others => '1'); - tmp_adr := (r1.wb.adr'left downto 0 => r1.wb.adr, others => '0'); - wr_addr <= std_ulogic_vector(to_unsigned(get_row(tmp_adr), ROW_BITS)); + wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); end if; -- The two actual write cases here do_write <= '0'; - if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r1.store_way = i then + reloading := r1.state = RELOAD_WAIT_ACK; + if reloading and wishbone_in.ack = '1' and r1.store_way = i then do_write <= '1'; end if; if req_op = OP_STORE_HIT and req_hit_way = i then - assert r1.state /= RELOAD_WAIT_ACK report "Store hit while in state:" & + assert not reloading report "Store hit while in state:" & state_t'image(r1.state) severity FAILURE; do_write <= '1'; @@ -637,7 +670,7 @@ begin -- single issue on load/stores so we are fine, later, we can generate -- a stall output if necessary). - if d_in.valid = '1' then + if req_op /= OP_NONE then r1.req <= d_in; report "op:" & op_t'image(req_op) & @@ -672,7 +705,8 @@ begin -- operates at stage 1. -- dcache_slow : process(clk) - variable tagset : cache_tags_set_t; + variable tagset : cache_tags_set_t; + variable stbs_done : boolean; begin if rising_edge(clk) then -- On reset, clear all valid bits to force misses @@ -731,16 +765,18 @@ begin -- Keep track of our index and way for subsequent stores. r1.store_index <= req_index; r1.store_way <= replace_way; + r1.store_row <= get_row(req_laddr); -- Prep for first wishbone read. 
We calculate the address of - -- the start of the cache line + -- the start of the cache line and start the WB cycle -- - r1.wb.adr <= d_in.addr(r1.wb.adr'left downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0); r1.wb.sel <= (others => '1'); r1.wb.we <= '0'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; + + -- Track that we had one request sent r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => @@ -770,6 +806,25 @@ begin end case; when RELOAD_WAIT_ACK => + -- Requests are all sent if stb is 0 + stbs_done := r1.wb.stb = '0'; + + -- If we are still sending requests, was one accepted ? + if wishbone_in.stall = '0' and not stbs_done then + -- That was the last word ? We are done sending. Clear + -- stb and set stbs_done so we can handle an eventual last + -- ack on the same cycle. + -- + if is_last_row_addr(r1.wb.adr) then + r1.wb.stb <= '0'; + stbs_done := true; + end if; + + -- Calculate the next row address + r1.wb.adr <= next_row_addr(r1.wb.adr); + end if; + + -- Incoming acks processing if wishbone_in.ack = '1' then -- Is this the data we were looking for ? Latch it so -- we can respond later. We don't currently complete the @@ -779,16 +834,17 @@ begin -- not idle, which we don't currently know how to deal -- with. -- - if r1.wb.adr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = - r1.req.addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) then + if r1.store_row = get_row(r1.req.addr) then r1.slow_data <= wishbone_in.dat; end if; - -- That was the last word ? We are done - if is_last_row(r1.wb.adr) then - cache_valids(r1.store_index)(r1.store_way) <= '1'; + -- Check for completion + if stbs_done and is_last_row(r1.store_row) then + -- Complete wishbone cycle r1.wb.cyc <= '0'; - r1.wb.stb <= '0'; + + -- Cache line is now valid + cache_valids(r1.store_index)(r1.store_way) <= '1'; -- Complete the load that missed. For load with update -- we also need to do the deferred update cycle. 
@@ -801,10 +857,10 @@ begin r1.state <= IDLE; report "completing miss !"; end if; - else - -- Otherwise, calculate the next row address - r1.wb.adr <= next_row_addr(r1.wb.adr); end if; + + -- Increment store row counter + r1.store_row <= next_row(r1.store_row); end if; when LOAD_UPDATE => @@ -816,7 +872,13 @@ begin r1.state <= IDLE; when STORE_WAIT_ACK | NC_LOAD_WAIT_ACK => - if wishbone_in.ack = '1' then + -- Clear stb when slave accepted request + if wishbone_in.stall = '0' then + r1.wb.stb <= '0'; + end if; + + -- Got ack ? complete. + if wishbone_in.ack = '1' then if r1.state = NC_LOAD_WAIT_ACK then r1.slow_data <= wishbone_in.dat; end if; From 3df018cdc054426ae38312fb4b63c90116ccb83f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 10:33:04 +1100 Subject: [PATCH 09/21] icache: Add wishbone pipelining support Signed-off-by: Benjamin Herrenschmidt --- icache.vhdl | 86 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 15 deletions(-) diff --git a/icache.vhdl b/icache.vhdl index fccff9a..70226a8 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -159,6 +159,7 @@ architecture rtl of icache is wb : wishbone_master_out; store_way : way_t; store_index : index_t; + store_row : row_t; end record; signal r : reg_internal_t; @@ -170,6 +171,7 @@ architecture rtl of icache is signal req_tag : cache_tag_t; signal req_is_hit : std_ulogic; signal req_is_miss : std_ulogic; + signal req_laddr : std_ulogic_vector(63 downto 0); -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; @@ -193,12 +195,21 @@ architecture rtl of icache is end; -- Returns whether this is the last row of a line - function is_last_row(addr: wishbone_addr_type) return boolean is + function is_last_row_addr(addr: wishbone_addr_type) return boolean is constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); begin return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones; end; + -- Returns whether this is the 
last row of a line + function is_last_row(row: row_t) return boolean is + variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); + constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1'); + begin + row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); + return row_v(ROW_LINEBITS-1 downto 0) = ones; + end; + -- Return the address of the next row in the current cache line function next_row_addr(addr: wishbone_addr_type) return std_ulogic_vector is @@ -213,6 +224,21 @@ architecture rtl of icache is return result; end; + -- Return the next row in the current cache line. We use a dedicated + -- function in order to limit the size of the generated adder to be + -- only the bits within a cache line (3 bits with default settings) + -- + function next_row(row: row_t) return row_t is + variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0); + variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0); + variable result : std_ulogic_vector(ROW_BITS-1 downto 0); + begin + row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS)); + row_idx := row_v(ROW_LINEBITS-1 downto 0); + row_v(ROW_LINEBITS-1 downto 0) := std_ulogic_vector(unsigned(row_idx) + 1); + return to_integer(unsigned(row_v)); + end; + -- Read the instruction word for the given address in the current cache row function read_insn_word(addr: std_ulogic_vector(63 downto 0); data: cache_row_t) return std_ulogic_vector is @@ -298,7 +324,6 @@ begin wr_data => wishbone_in.dat ); process(all) - variable tmp_adr : std_ulogic_vector(63 downto 0); begin do_read <= '1'; do_write <= '0'; @@ -307,8 +332,7 @@ begin end if; cache_out(i) <= dout; rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - tmp_adr := (r.wb.adr'left downto 0 => r.wb.adr, others => '0'); - wr_addr <= std_ulogic_vector(to_unsigned(get_row(tmp_adr), ROW_BITS)); + wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS)); end process; end generate; @@ -358,6 +382,12 @@ begin req_row <= get_row(i_in.nia); req_tag <= 
get_tag(i_in.nia); + -- Calculate address of beginning of cache line, will be + -- used for cache miss processing if needed + -- + req_laddr <= i_in.nia(63 downto LINE_OFF_BITS) & + (LINE_OFF_BITS-1 downto 0 => '0'); + -- Test if pending request is a hit on any way hit_way := 0; is_hit := '0'; @@ -427,7 +457,8 @@ begin -- Cache miss/reload synchronous machine icache_miss : process(clk) - variable tagset : cache_tags_set_t; + variable tagset : cache_tags_set_t; + variable stbs_done : boolean; begin if rising_edge(clk) then -- On reset, clear all valid bits to force misses @@ -473,29 +504,54 @@ begin -- Keep track of our index and way for subsequent stores r.store_index <= req_index; r.store_way <= replace_way; + r.store_row <= get_row(req_laddr); -- Prep for first wishbone read. We calculate the address of - -- the start of the cache line + -- the start of the cache line and start the WB cycle. -- - r.wb.adr <= i_in.nia(r.wb.adr'left downto LINE_OFF_BITS) & - (LINE_OFF_BITS-1 downto 0 => '0'); + r.wb.adr <= req_laddr(r.wb.adr'left downto 0); r.wb.cyc <= '1'; r.wb.stb <= '1'; + -- Track that we had one request sent r.state <= WAIT_ACK; end if; + when WAIT_ACK => + -- Requests are all sent if stb is 0 + stbs_done := r.wb.stb = '0'; + + -- If we are still sending requests, was one accepted ? + if wishbone_in.stall = '0' and not stbs_done then + -- That was the last word ? We are done sending. Clear + -- stb and set stbs_done so we can handle an eventual last + -- ack on the same cycle. + -- + if is_last_row_addr(r.wb.adr) then + r.wb.stb <= '0'; + stbs_done := true; + end if; + + -- Calculate the next row address + r.wb.adr <= next_row_addr(r.wb.adr); + end if; + + -- Incoming acks processing if wishbone_in.ack = '1' then - -- That was the last word ? 
We are done - if is_last_row(r.wb.adr) then - cache_valids(r.store_index)(r.store_way) <= '1'; + -- Check for completion + if stbs_done and is_last_row(r.store_row) then + -- Complete wishbone cycle r.wb.cyc <= '0'; - r.wb.stb <= '0'; + + -- Cache line is now valid + cache_valids(r.store_index)(r.store_way) <= '1'; + + -- We are done r.state <= IDLE; - else - -- Otherwise, calculate the next row address - r.wb.adr <= next_row_addr(r.wb.adr); end if; + + -- Increment store row counter + r.store_row <= next_row(r.store_row); end if; end case; end if; From c22734d0d99531b84e38950fb8ec461ab0a256b3 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 10:33:31 +1100 Subject: [PATCH 10/21] wb_debug: Add wishbone pipelining support Signed-off-by: Benjamin Herrenschmidt --- wishbone_debug_master.vhdl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/wishbone_debug_master.vhdl b/wishbone_debug_master.vhdl index 3ba6b21..11b9ee3 100644 --- a/wishbone_debug_master.vhdl +++ b/wishbone_debug_master.vhdl @@ -124,7 +124,6 @@ begin -- We always move WB cyc and stb simultaneously (no pipelining yet...) wb_out.cyc <= '1' when state = WB_CYCLE else '0'; - wb_out.stb <= '1' when state = WB_CYCLE else '0'; -- Data latch. WB will take the read data away as soon as the cycle -- terminates but we must maintain it on DMI until req goes down, so @@ -145,14 +144,23 @@ begin if rising_edge(clk) then if (rst) then state <= IDLE; + wb_out.stb <= '0'; else case state is when IDLE => if dmi_req = '1' and dmi_addr = DBG_WB_DATA then state <= WB_CYCLE; + wb_out.stb <= '1'; end if; when WB_CYCLE => + if wb_in.stall = '0' then + wb_out.stb <= '0'; + end if; if wb_in.ack then + -- We shouldn't get the ack if we hadn't already cleared + -- stb above but if this happen, don't leave it dangling. 
-- + wb_out.stb <= '0'; state <= DMI_WAIT; end if; when DMI_WAIT => From 365f60b69391e193032d8b3bb961b2da707fd5c7 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 19 Oct 2019 10:34:48 +1100 Subject: [PATCH 11/21] simple_ram: Turn on pipelining With a 1 cycle delay Signed-off-by: Benjamin Herrenschmidt --- simple_ram_behavioural.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simple_ram_behavioural.vhdl b/simple_ram_behavioural.vhdl index 64135b8..d6255b8 100644 --- a/simple_ram_behavioural.vhdl +++ b/simple_ram_behavioural.vhdl @@ -11,7 +11,7 @@ entity mw_soc_memory is generic ( RAM_INIT_FILE : string; MEMORY_SIZE : integer; - PIPELINE_DEPTH : integer := 0 + PIPELINE_DEPTH : integer := 1 ); port ( From 48f260761b72924030abc8ff6bc1a727f2dbf331 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 21 Oct 2019 15:11:47 +1100 Subject: [PATCH 12/21] writeback: Slightly improve timing The CR update currently depends on the complete data formatting mux chain. This makes it source its inputs from a bit earlier in the chain, thus improving timing a bit Signed-off-by: Benjamin Herrenschmidt --- writeback.vhdl | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/writeback.vhdl b/writeback.vhdl index 0d9397c..e2b74f8 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -44,6 +44,7 @@ architecture behaviour of writeback is signal sign_extend : std_ulogic; signal negative : std_ulogic; signal second_word : std_ulogic; + signal zero : std_ulogic; begin writeback_0: process(clk) begin @@ -155,7 +156,9 @@ begin -- If the data can arrive split over two cycles, this will be correct -- provided we don't have both sign extension and byte reversal.
- negative <= (data_len(2) and data_permuted(31)) or (data_len(1) and data_permuted(15)) or + negative <= (data_len(3) and data_permuted(63)) or + (data_len(2) and data_permuted(31)) or + (data_len(1) and data_permuted(15)) or (data_len(0) and data_permuted(7)); -- trim and sign-extend @@ -170,12 +173,16 @@ begin trim_ctl(i) <= '0' & (negative and sign_extend); end if; end loop; + zero <= not negative; for i in 0 to 7 loop case trim_ctl(i) is when "11" => data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8); when "10" => data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8); + if or data_permuted(i * 8 + 7 downto i * 8) /= '0' then + zero <= '0'; + end if; when "01" => data_trimmed(i * 8 + 7 downto i * 8) <= x"FF"; when others => @@ -190,9 +197,9 @@ begin if rc = '1' then c_out.write_cr_enable <= '1'; c_out.write_cr_mask <= num_to_fxm(0); - if data_trimmed(63) = '1' then + if negative = '1' then c_out.write_cr_data <= x"80000000"; - elsif or (data_trimmed(62 downto 0)) = '1' then + elsif zero = '0' then c_out.write_cr_data <= x"40000000"; else c_out.write_cr_data <= x"20000000"; From d2762e70e5fb519d8fcca210b366c9c8bbdd696a Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 21 Oct 2019 15:15:07 +1100 Subject: [PATCH 13/21] Add option to not flatten hierarchy Vivado by default tries to flatten the module hierarchy to improve placement and timing. However this makes debugging timing issues really hard as the net names in the timing report can be pretty bogus. This adds a generic that can be used to control attributes to stop Vivado from flattening the main core components. The resulting design will have worse timing overall but it will be easier to understand what the worst timing paths are and address them.
Signed-off-by: Benjamin Herrenschmidt --- core.vhdl | 26 +++++++++++++++++++++++++- fpga/toplevel.vhdl | 6 ++++-- microwatt.core | 11 +++++++++++ soc.vhdl | 6 ++++-- 4 files changed, 44 insertions(+), 5 deletions(-) diff --git a/core.vhdl b/core.vhdl index 810a279..22f7dca 100644 --- a/core.vhdl +++ b/core.vhdl @@ -8,7 +8,8 @@ use work.wishbone_types.all; entity core is generic ( - SIM : boolean := false + SIM : boolean := false; + DISABLE_FLATTEN : boolean := false ); port ( clk : in std_logic; @@ -93,6 +94,29 @@ architecture behave of core is -- Debug status signal dbg_core_is_stopped: std_ulogic; + function keep_h(disable : boolean) return string is + begin + if disable then + return "yes"; + else + return "no"; + end if; + end function; + attribute keep_hierarchy : string; + attribute keep_hierarchy of fetch1_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of icache_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of fetch2_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of decode1_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of decode2_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of multiply_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of divider_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of debug_0 : label is keep_h(DISABLE_FLATTEN); begin core_rst <= dbg_core_rst or rst; diff --git a/fpga/toplevel.vhdl b/fpga/toplevel.vhdl index d73c802..38af730 100644 --- 
a/fpga/toplevel.vhdl +++ b/fpga/toplevel.vhdl @@ -7,7 +7,8 @@ entity toplevel is RAM_INIT_FILE : string := "firmware.hex"; RESET_LOW : boolean := true; CLK_INPUT : positive := 100000000; - CLK_FREQUENCY : positive := 100000000 + CLK_FREQUENCY : positive := 100000000; + DISABLE_FLATTEN_CORE : boolean := false ); port( ext_clk : in std_ulogic; @@ -62,7 +63,8 @@ begin MEMORY_SIZE => MEMORY_SIZE, RAM_INIT_FILE => RAM_INIT_FILE, RESET_LOW => RESET_LOW, - SIM => false + SIM => false, + DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE ) port map ( system_clk => system_clk, diff --git a/microwatt.core b/microwatt.core index 5fb9a7a..35ada86 100644 --- a/microwatt.core +++ b/microwatt.core @@ -93,6 +93,7 @@ targets: - ram_init_file - clk_input - clk_frequency + - disable_flatten_core tools: vivado: {part : xc7a100tcsg324-1} toplevel : toplevel @@ -105,6 +106,7 @@ targets: - ram_init_file - clk_input - clk_frequency + - disable_flatten_core tools: vivado: {part : xc7a200tsbg484-1} toplevel : toplevel @@ -117,6 +119,7 @@ targets: - ram_init_file - clk_input - clk_frequency + - disable_flatten_core tools: vivado: {part : xc7a35ticsg324-1L} toplevel : toplevel @@ -129,6 +132,7 @@ targets: - ram_init_file - clk_input - clk_frequency + - disable_flatten_core tools: vivado: {part : xc7a100ticsg324-1L} toplevel : toplevel @@ -142,6 +146,7 @@ targets: - reset_low=false - clk_input=12000000 - clk_frequency + - disable_flatten_core tools: vivado: {part : xc7a35tcpg236-1} toplevel : toplevel @@ -179,3 +184,9 @@ parameters: description : Generated system clock frequency in HZ (for top-generic based boards) paramtype : generic default : 50000000 + + disable_flatten_core: + datatype : bool + description : Prevent Vivado from flattening the main core components + paramtype : generic + default : false diff --git a/soc.vhdl b/soc.vhdl index 950d0dd..b9a8215 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -17,7 +17,8 @@ entity soc is MEMORY_SIZE : positive; RAM_INIT_FILE : string; RESET_LOW : boolean; - 
SIM : boolean + SIM : boolean; + DISABLE_FLATTEN_CORE : boolean := false ); port( rst : in std_ulogic; @@ -76,7 +77,8 @@ begin -- Processor core processor: entity work.core generic map( - SIM => SIM + SIM => SIM, + DISABLE_FLATTEN => DISABLE_FLATTEN_CORE ) port map( clk => system_clk, From 797b1bb045161cf33f3086cd464ae88ee3c7d2be Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 21 Oct 2019 22:57:51 +1100 Subject: [PATCH 14/21] decode: Reformat decode_types.vhdl Signed-off-by: Benjamin Herrenschmidt --- decode_types.vhdl | 173 +++++++++++++++++++++++----------------------- 1 file changed, 87 insertions(+), 86 deletions(-) diff --git a/decode_types.vhdl b/decode_types.vhdl index 2d85b27..9736f58 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -2,92 +2,93 @@ library ieee; use ieee.std_logic_1164.all; package decode_types is - type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD, - OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG, - OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB, - OP_CNTZ, OP_CRAND, - OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC, - OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, - OP_DCBZ, OP_DIV, OP_EXTS, - OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, - OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, OP_MCRF, - OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFSPR, OP_MOD, - OP_MTCRF, OP_MTSPR, OP_MUL_L64, - OP_MUL_H64, OP_MUL_H32, OP_OR, - OP_POPCNTB, OP_POPCNTD, OP_POPCNTW, OP_PRTYD, - OP_PRTYW, OP_RLC, OP_RLCL, OP_RLCR, OP_SETB, - OP_SHL, OP_SHR, - OP_SYNC, OP_TD, OP_TDI, OP_TW, - OP_TWI, OP_XOR, OP_SIM_CONFIG); - - type input_reg_a_t is (NONE, RA, RA_OR_ZERO); - type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DS, CONST_M1, CONST_SH, CONST_SH32); - type input_reg_c_t is (NONE, RS); - type output_reg_a_t is (NONE, RT, RA); - type rc_t is (NONE, ONE, RC); - type carry_in_t is (ZERO, CA, ONE); - - constant SH_OFFSET : integer := 0; - constant 
MB_OFFSET : integer := 1; - constant ME_OFFSET : integer := 1; - constant SH32_OFFSET : integer := 0; - constant MB32_OFFSET : integer := 1; - constant ME32_OFFSET : integer := 2; - - constant FXM_OFFSET : integer := 0; - - constant BO_OFFSET : integer := 0; - constant BI_OFFSET : integer := 1; - constant BH_OFFSET : integer := 2; - - constant BF_OFFSET : integer := 0; - constant L_OFFSET : integer := 1; - - constant TOO_OFFSET : integer := 0; - - type unit_t is (NONE, ALU, LDST, MUL, DIV); - type length_t is (NONE, is1B, is2B, is4B, is8B); - - type decode_rom_t is record - unit : unit_t; - insn_type : insn_type_t; - input_reg_a : input_reg_a_t; - input_reg_b : input_reg_b_t; - input_reg_c : input_reg_c_t; - output_reg_a : output_reg_a_t; - - input_cr : std_ulogic; - output_cr : std_ulogic; - - invert_a : std_ulogic; - invert_out : std_ulogic; - input_carry : carry_in_t; - output_carry : std_ulogic; - - -- load/store signals - length : length_t; - byte_reverse : std_ulogic; - sign_extend : std_ulogic; - update : std_ulogic; - reserve : std_ulogic; - - -- multiplier and ALU signals - is_32bit : std_ulogic; - is_signed : std_ulogic; - - rc : rc_t; - lr : std_ulogic; - - sgl_pipe : std_ulogic; - end record; - constant decode_rom_init : decode_rom_t := (unit => NONE, - insn_type => OP_ILLEGAL, input_reg_a => NONE, - input_reg_b => NONE, input_reg_c => NONE, - output_reg_a => NONE, input_cr => '0', output_cr => '0', - invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', - length => NONE, byte_reverse => '0', sign_extend => '0', - update => '0', reserve => '0', is_32bit => '0', - is_signed => '0', rc => NONE, lr => '0', sgl_pipe => '0'); + type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD, + OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG, + OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB, + OP_CNTZ, OP_CRAND, + OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC, + OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, + 
OP_DCBZ, OP_DIV, OP_EXTS, + OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, + OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, OP_MCRF, + OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFSPR, OP_MOD, + OP_MTCRF, OP_MTSPR, OP_MUL_L64, + OP_MUL_H64, OP_MUL_H32, OP_OR, + OP_POPCNTB, OP_POPCNTD, OP_POPCNTW, OP_PRTYD, + OP_PRTYW, OP_RLC, OP_RLCL, OP_RLCR, OP_SETB, + OP_SHL, OP_SHR, + OP_SYNC, OP_TD, OP_TDI, OP_TW, + OP_TWI, OP_XOR, OP_SIM_CONFIG + ); + + type input_reg_a_t is (NONE, RA, RA_OR_ZERO); + type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DS, CONST_M1, CONST_SH, CONST_SH32); + type input_reg_c_t is (NONE, RS); + type output_reg_a_t is (NONE, RT, RA); + type rc_t is (NONE, ONE, RC); + type carry_in_t is (ZERO, CA, ONE); + + constant SH_OFFSET : integer := 0; + constant MB_OFFSET : integer := 1; + constant ME_OFFSET : integer := 1; + constant SH32_OFFSET : integer := 0; + constant MB32_OFFSET : integer := 1; + constant ME32_OFFSET : integer := 2; + + constant FXM_OFFSET : integer := 0; + + constant BO_OFFSET : integer := 0; + constant BI_OFFSET : integer := 1; + constant BH_OFFSET : integer := 2; + + constant BF_OFFSET : integer := 0; + constant L_OFFSET : integer := 1; + + constant TOO_OFFSET : integer := 0; + + type unit_t is (NONE, ALU, LDST, MUL, DIV); + type length_t is (NONE, is1B, is2B, is4B, is8B); + + type decode_rom_t is record + unit : unit_t; + insn_type : insn_type_t; + input_reg_a : input_reg_a_t; + input_reg_b : input_reg_b_t; + input_reg_c : input_reg_c_t; + output_reg_a : output_reg_a_t; + + input_cr : std_ulogic; + output_cr : std_ulogic; + + invert_a : std_ulogic; + invert_out : std_ulogic; + input_carry : carry_in_t; + output_carry : std_ulogic; + + -- load/store signals + length : length_t; + byte_reverse : std_ulogic; + sign_extend : std_ulogic; + update : std_ulogic; + reserve : std_ulogic; + + -- multiplier and ALU signals + is_32bit : std_ulogic; + is_signed : std_ulogic; + + rc : rc_t; + 
lr : std_ulogic; + + sgl_pipe : std_ulogic; + end record; + constant decode_rom_init : decode_rom_t := (unit => NONE, + insn_type => OP_ILLEGAL, input_reg_a => NONE, + input_reg_b => NONE, input_reg_c => NONE, + output_reg_a => NONE, input_cr => '0', output_cr => '0', + invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', + length => NONE, byte_reverse => '0', sign_extend => '0', + update => '0', reserve => '0', is_32bit => '0', + is_signed => '0', rc => NONE, lr => '0', sgl_pipe => '0'); end decode_types; From 3349bdc79891364f702b8ce977d14abfda46ae01 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 22 Oct 2019 16:05:18 +1100 Subject: [PATCH 15/21] ram: Add block RAM pipelining This adds an output buffer to help with timing and allows the BRAMs to actually pipeline. Signed-off-by: Benjamin Herrenschmidt --- fpga/mw_soc_memory.vhdl | 89 ++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/fpga/mw_soc_memory.vhdl b/fpga/mw_soc_memory.vhdl index af31c89..7e998b2 100644 --- a/fpga/mw_soc_memory.vhdl +++ b/fpga/mw_soc_memory.vhdl @@ -4,6 +4,7 @@ library ieee; use ieee.std_logic_1164.all; +use ieee.std_logic_unsigned.all; use ieee.numeric_std.all; use std.textio.all; @@ -29,9 +30,10 @@ entity mw_soc_memory is end entity mw_soc_memory; architecture behaviour of mw_soc_memory is - signal wb_adr_in : std_logic_vector(log2(MEMORY_SIZE) - 1 downto 0); + -- RAM type definition type ram_t is array(0 to (MEMORY_SIZE / 8) - 1) of std_logic_vector(63 downto 0); + -- RAM loading impure function init_ram(name : STRING) return ram_t is file ram_file : text open read_mode is name; variable ram_line : line; @@ -48,58 +50,61 @@ architecture behaviour of mw_soc_memory is return temp_ram; end function; + -- RAM instance signal memory : ram_t := init_ram(RAM_INIT_FILE); - attribute ram_style : string; attribute ram_style of memory : signal is "block"; - attribute ram_decomp : string; - attribute 
ram_decomp of memory : signal is "power"; - - type state_type is (IDLE, ACK); - signal state : state_type; - - signal read_ack : std_logic; - + attribute ram_decomp of memory : signal is "power"; + + -- RAM interface + constant ram_addr_bits : integer := log2(MEMORY_SIZE) - 3; + signal ram_addr : std_logic_vector(ram_addr_bits - 1 downto 0); + signal ram_di : std_logic_vector(63 downto 0); + signal ram_do : std_logic_vector(63 downto 0); + signal ram_sel : std_logic_vector(7 downto 0); + signal ram_we : std_ulogic; + + -- Others + signal ram_obuf : std_logic_vector(63 downto 0); + signal ack, ack_obuf : std_ulogic; begin - wb_adr_in <= wishbone_in.adr(log2(MEMORY_SIZE) - 1 downto 0); + -- Actual RAM template + memory_0: process(clk) + begin + if rising_edge(clk) then + if ram_we = '1' then + for i in 0 to 7 loop + if ram_sel(i) = '1' then + memory(conv_integer(ram_addr))((i + 1) * 8 - 1 downto i * 8) <= + ram_di((i + 1) * 8 - 1 downto i * 8); + end if; + end loop; + end if; + ram_do <= memory(conv_integer(ram_addr)); + ram_obuf <= ram_do; + end if; + end process; - wishbone_out.ack <= read_ack and wishbone_in.cyc and wishbone_in.stb; - wishbone_out.stall <= '0' when wishbone_in.cyc = '0' else not wishbone_out.ack; + -- Wishbone interface + ram_addr <= wishbone_in.adr(ram_addr_bits + 2 downto 3); + ram_di <= wishbone_in.dat; + ram_sel <= wishbone_in.sel; + ram_we <= wishbone_in.we and wishbone_in.stb and wishbone_in.cyc; + wishbone_out.stall <= '0'; + wishbone_out.ack <= ack_obuf; + wishbone_out.dat <= ram_obuf; - memory_0: process(clk) + wb_0: process(clk) begin if rising_edge(clk) then - if rst = '1' then - read_ack <= '0'; - state <= IDLE; + if rst = '1' or wishbone_in.cyc = '0' then + ack_obuf <= '0'; + ack <= '0'; else - if wishbone_in.cyc = '1' then - case state is - when IDLE => - if wishbone_in.stb = '1' and wishbone_in.we = '1' then - for i in 0 to 7 loop - if wishbone_in.sel(i) = '1' then - memory(to_integer(unsigned(wb_adr_in(wb_adr_in'left downto 
3))))(((i + 1) * 8) - 1 downto i * 8) - <= wishbone_in.dat(((i + 1) * 8) - 1 downto i * 8); - end if; - end loop; - read_ack <= '1'; - state <= ACK; - elsif wishbone_in.stb = '1' then - wishbone_out.dat <= memory(to_integer(unsigned(wb_adr_in(wb_adr_in'left downto 3)))); - read_ack <= '1'; - state <= ACK; - end if; - when ACK => - read_ack <= '0'; - state <= IDLE; - end case; - else - state <= IDLE; - read_ack <= '0'; - end if; + ack <= wishbone_in.stb; + ack_obuf <= ack; end if; end if; end process; From 9a63c098a5471e40ca0364a867d30204f0288bc4 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 23 Oct 2019 10:52:37 +1100 Subject: [PATCH 16/21] Move log2/ispow2 to a utils package (Out of icache and dcache) Signed-off-by: Benjamin Herrenschmidt --- Makefile | 5 +++-- dcache.vhdl | 21 +-------------------- icache.vhdl | 21 +-------------------- utils.vhdl | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 40 insertions(+), 42 deletions(-) create mode 100644 utils.vhdl diff --git a/Makefile b/Makefile index 3056c53..1c68ff4 100644 --- a/Makefile +++ b/Makefile @@ -35,10 +35,11 @@ helpers.o: cache_ram.o: plru.o: plru_tb.o: plru.o -icache.o: common.o wishbone_types.o plru.o cache_ram.o icache_tb.o: common.o wishbone_types.o icache.o simple_ram_behavioural.o -dcache.o: common.o wishbone_types.o plru.o cache_ram.o dcache_tb.o: common.o wishbone_types.o dcache.o simple_ram_behavioural.o +utils.o: +icache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o +dcache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o insn_helpers.o: loadstore1.o: common.o helpers.o logical.o: decode_types.o diff --git a/dcache.vhdl b/dcache.vhdl index f12fd35..7d6e74c 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -16,6 +16,7 @@ use ieee.std_logic_1164.all; use ieee.numeric_std.all; library work; +use work.utils.all; use work.common.all; use work.helpers.all; use work.wishbone_types.all; @@ -44,26 +45,6 @@ entity dcache is end entity dcache; 
architecture rtl of dcache is - function log2(i : natural) return integer is - variable tmp : integer := i; - variable ret : integer := 0; - begin - while tmp > 1 loop - ret := ret + 1; - tmp := tmp / 2; - end loop; - return ret; - end function; - - function ispow2(i : integer) return boolean is - begin - if to_integer(to_unsigned(i, 32) and to_unsigned(i - 1, 32)) = 0 then - return true; - else - return false; - end if; - end function; - -- BRAM organisation: We never access more than wishbone_data_bits at -- a time so to save resources we make the array only that wide, and -- use consecutive indices for to make a cache "line" diff --git a/icache.vhdl b/icache.vhdl index 70226a8..20d5724 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -21,6 +21,7 @@ use ieee.std_logic_1164.all; use ieee.numeric_std.all; library work; +use work.utils.all; use work.common.all; use work.wishbone_types.all; @@ -51,26 +52,6 @@ entity icache is end entity icache; architecture rtl of icache is - function log2(i : natural) return integer is - variable tmp : integer := i; - variable ret : integer := 0; - begin - while tmp > 1 loop - ret := ret + 1; - tmp := tmp / 2; - end loop; - return ret; - end function; - - function ispow2(i : integer) return boolean is - begin - if to_integer(to_unsigned(i, 32) and to_unsigned(i - 1, 32)) = 0 then - return true; - else - return false; - end if; - end function; - -- BRAM organisation: We never access more than wishbone_data_bits at -- a time so to save resources we make the array only that wide, and -- use consecutive indices for to make a cache "line" diff --git a/utils.vhdl b/utils.vhdl new file mode 100644 index 0000000..7238641 --- /dev/null +++ b/utils.vhdl @@ -0,0 +1,35 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +package utils is + + function log2(i : natural) return integer; + function ispow2(i : integer) return boolean; + +end utils; + +package body utils is + + function log2(i : natural) return integer is + 
variable tmp : integer := i; + variable ret : integer := 0; + begin + while tmp > 1 loop + ret := ret + 1; + tmp := tmp / 2; + end loop; + return ret; + end function; + + function ispow2(i : integer) return boolean is + begin + if to_integer(to_unsigned(i, 32) and to_unsigned(i - 1, 32)) = 0 then + return true; + else + return false; + end if; + end function; + +end utils; + From 8e0389b9736c60572e13ef5eeb50d3a775c3ffc6 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 23 Oct 2019 12:08:55 +1100 Subject: [PATCH 17/21] ram: Rework main RAM interface This replaces the simple_ram_behavioural and mw_soc_memory modules with a common wishbone_bram_wrapper.vhdl that interfaces the pipelined WB with a lower-level RAM module, along with an FPGA and a sim variants of the latter. Signed-off-by: Benjamin Herrenschmidt --- Makefile | 32 +-- README.md | 2 +- core_tb.vhdl | 2 +- dcache_tb.vhdl | 10 +- dmi_dtm_tb.vhdl | 4 +- fpga/main_bram.vhdl | 83 ++++++ fpga/mw_soc_memory.vhdl | 112 -------- icache_tb.vhdl | 34 ++- microwatt.core | 6 +- scripts/run_test.sh | 2 +- scripts/test_micropython.py | 2 +- scripts/test_micropython_long.py | 2 +- sim_bram.vhdl | 67 +++++ ...ural_helpers.vhdl => sim_bram_helpers.vhdl | 12 +- ...ioural_helpers_c.c => sim_bram_helpers_c.c | 0 simple_ram_behavioural.vhdl | 133 ---------- simple_ram_behavioural_tb.vhdl | 246 ------------------ soc.vhdl | 2 +- ...behavioural_tb.bin => wishbone_bram_tb.bin | Bin wishbone_bram_tb.vhdl | 175 +++++++++++++ wishbone_bram_wrapper.vhdl | 76 ++++++ 21 files changed, 462 insertions(+), 540 deletions(-) create mode 100644 fpga/main_bram.vhdl delete mode 100644 fpga/mw_soc_memory.vhdl create mode 100644 sim_bram.vhdl rename simple_ram_behavioural_helpers.vhdl => sim_bram_helpers.vhdl (84%) rename simple_ram_behavioural_helpers_c.c => sim_bram_helpers_c.c (100%) delete mode 100644 simple_ram_behavioural.vhdl delete mode 100644 simple_ram_behavioural_tb.vhdl rename simple_ram_behavioural_tb.bin => 
wishbone_bram_tb.bin (100%) create mode 100644 wishbone_bram_tb.vhdl create mode 100644 wishbone_bram_wrapper.vhdl diff --git a/Makefile b/Makefile index 1c68ff4..85a0262 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ GHDL=ghdl GHDLFLAGS=--std=08 -Psim-unisim CFLAGS=-O2 -Wall -all = core_tb simple_ram_behavioural_tb soc_reset_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \ - rotator_tb countzero_tb +all = core_tb soc_reset_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \ + rotator_tb countzero_tb wishbone_bram_tb # XXX # loadstore_tb fetch_tb @@ -35,11 +35,14 @@ helpers.o: cache_ram.o: plru.o: plru_tb.o: plru.o -icache_tb.o: common.o wishbone_types.o icache.o simple_ram_behavioural.o -dcache_tb.o: common.o wishbone_types.o dcache.o simple_ram_behavioural.o utils.o: +sim_bram.o: sim_bram_helpers.o utils.o +wishbone_bram_wrapper.o: wishbone_types.o sim_bram.o utils.o +wishbone_bram_tb.o: wishbone_bram_wrapper.o icache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o +icache_tb.o: common.o wishbone_types.o icache.o wishbone_bram_wrapper.o dcache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o +dcache_tb.o: common.o wishbone_types.o dcache.o wishbone_bram_wrapper.o insn_helpers.o: loadstore1.o: common.o helpers.o logical.o: decode_types.o @@ -52,11 +55,8 @@ register_file.o: common.o rotator.o: common.o rotator_tb.o: common.o glibc_random.o ppc_fx_insns.o insn_helpers.o rotator.o sim_console.o: -simple_ram_behavioural_helpers.o: -simple_ram_behavioural_tb.o: wishbone_types.o simple_ram_behavioural.o -simple_ram_behavioural.o: wishbone_types.o simple_ram_behavioural_helpers.o sim_uart.o: wishbone_types.o sim_console.o -soc.o: common.o wishbone_types.o core.o wishbone_arbiter.o sim_uart.o simple_ram_behavioural.o dmi_dtm_xilinx.o wishbone_debug_master.o +soc.o: common.o wishbone_types.o core.o wishbone_arbiter.o sim_uart.o wishbone_bram_wrapper.o dmi_dtm_xilinx.o wishbone_debug_master.o wishbone_arbiter.o: 
wishbone_types.o wishbone_types.o: writeback.o: common.o crhelpers.o @@ -74,17 +74,17 @@ fpga/soc_reset_tb.o: fpga/soc_reset.o soc_reset_tb: fpga/soc_reset_tb.o fpga/soc_reset.o $(GHDL) -e $(GHDLFLAGS) soc_reset_tb -core_tb: core_tb.o simple_ram_behavioural_helpers_c.o sim_console_c.o sim_jtag_socket_c.o - $(GHDL) -e $(GHDLFLAGS) -Wl,simple_ram_behavioural_helpers_c.o -Wl,sim_console_c.o -Wl,sim_jtag_socket_c.o $@ +core_tb: core_tb.o sim_bram_helpers_c.o sim_console_c.o sim_jtag_socket_c.o + $(GHDL) -e $(GHDLFLAGS) -Wl,sim_bram_helpers_c.o -Wl,sim_console_c.o -Wl,sim_jtag_socket_c.o $@ fetch_tb: fetch_tb.o $(GHDL) -e $(GHDLFLAGS) $@ icache_tb: icache_tb.o - $(GHDL) -e $(GHDLFLAGS) -Wl,simple_ram_behavioural_helpers_c.o $@ + $(GHDL) -e $(GHDLFLAGS) -Wl,sim_bram_helpers_c.o $@ dcache_tb: dcache_tb.o - $(GHDL) -e $(GHDLFLAGS) -Wl,simple_ram_behavioural_helpers_c.o $@ + $(GHDL) -e $(GHDLFLAGS) -Wl,sim_bram_helpers_c.o $@ plru_tb: plru_tb.o $(GHDL) -e $(GHDLFLAGS) $@ @@ -107,11 +107,11 @@ countzero_tb: countzero_tb.o simple_ram_tb: simple_ram_tb.o $(GHDL) -e $(GHDLFLAGS) $@ -simple_ram_behavioural_tb: simple_ram_behavioural_helpers_c.o simple_ram_behavioural_tb.o - $(GHDL) -e $(GHDLFLAGS) -Wl,simple_ram_behavioural_helpers_c.o $@ +wishbone_bram_tb: sim_bram_helpers_c.o wishbone_bram_tb.o + $(GHDL) -e $(GHDLFLAGS) -Wl,sim_bram_helpers_c.o $@ -dmi_dtm_tb: dmi_dtm_tb.o simple_ram_behavioural_helpers_c.o - $(GHDL) -e $(GHDLFLAGS) -Wl,simple_ram_behavioural_helpers_c.o $@ +dmi_dtm_tb: dmi_dtm_tb.o sim_bram_helpers_c.o + $(GHDL) -e $(GHDLFLAGS) -Wl,sim_bram_helpers_c.o $@ tests = $(sort $(patsubst tests/%.out,%,$(wildcard tests/*.out))) diff --git a/README.md b/README.md index 86e9e22..7c6bc11 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ make - Link in the micropython image: ``` -ln -s ../micropython/ports/powerpc/build/firmware.bin simple_ram_behavioural.bin +ln -s ../micropython/ports/powerpc/build/firmware.bin main_ram.bin ``` - Now run microwatt, sending debug 
output to /dev/null: diff --git a/core_tb.vhdl b/core_tb.vhdl index 672b424..90fc30c 100644 --- a/core_tb.vhdl +++ b/core_tb.vhdl @@ -20,7 +20,7 @@ begin generic map( SIM => true, MEMORY_SIZE => 524288, - RAM_INIT_FILE => "simple_ram_behavioural.bin", + RAM_INIT_FILE => "main_ram.bin", RESET_LOW => false ) port map( diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index 0edbdb7..437fd7d 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -35,9 +35,9 @@ begin ); -- BRAM Memory slave - bram0: entity work.mw_soc_memory + bram0: entity work.wishbone_bram_wrapper generic map( - MEMORY_SIZE => 128, + MEMORY_SIZE => 1024, RAM_INIT_FILE => "icache_test.bin" ) port map( @@ -121,7 +121,6 @@ begin d_in.valid <= '1'; wait until rising_edge(clk); d_in.valid <= '0'; - wait until rising_edge(clk) and d_out.write_enable = '1'; assert d_out.valid = '1'; assert d_out.write_data = x"0000004100000040" @@ -130,7 +129,10 @@ begin " expected 0000004100000040" severity failure; - wait for clk_period*4; + wait until rising_edge(clk); + wait until rising_edge(clk); + wait until rising_edge(clk); + wait until rising_edge(clk); assert false report "end of test" severity failure; wait; diff --git a/dmi_dtm_tb.vhdl b/dmi_dtm_tb.vhdl index fe60c12..0694266 100644 --- a/dmi_dtm_tb.vhdl +++ b/dmi_dtm_tb.vhdl @@ -50,8 +50,8 @@ begin dmi_ack => dmi_ack ); - simple_ram_0: entity work.mw_soc_memory - generic map(RAM_INIT_FILE => "simple_ram_behavioural.bin", + simple_ram_0: entity work.wishbone_bram_wrapper + generic map(RAM_INIT_FILE => "main_ram.bin", MEMORY_SIZE => 524288) port map(clk => clk, rst => rst, wishbone_in => wishbone_ram_out, diff --git a/fpga/main_bram.vhdl b/fpga/main_bram.vhdl new file mode 100644 index 0000000..810d60c --- /dev/null +++ b/fpga/main_bram.vhdl @@ -0,0 +1,83 @@ +-- Single port Block RAM with one cycle output buffer + +library ieee; +use ieee.std_logic_1164.all; +use ieee.std_logic_unsigned.all; +use ieee.numeric_std.all; +use std.textio.all; + +library work; + +entity 
main_bram is + generic( + WIDTH : natural := 64; + HEIGHT_BITS : natural := 1024; + MEMORY_SIZE : natural := 65536; + RAM_INIT_FILE : string + ); + port( + clk : in std_logic; + addr : in std_logic_vector(HEIGHT_BITS - 1 downto 0) ; + di : in std_logic_vector(WIDTH-1 downto 0); + do : out std_logic_vector(WIDTH-1 downto 0); + sel : in std_logic_vector((WIDTH/8)-1 downto 0); + re : in std_ulogic; + we : in std_ulogic + ); +end entity main_bram; + +architecture behaviour of main_bram is + + constant WIDTH_BYTES : natural := WIDTH / 8; + + -- RAM type definition + type ram_t is array(0 to (MEMORY_SIZE / WIDTH_BYTES) - 1) of std_logic_vector(WIDTH-1 downto 0); + + -- RAM loading + impure function init_ram(name : STRING) return ram_t is + file ram_file : text open read_mode is name; + variable ram_line : line; + variable temp_word : std_logic_vector(WIDTH-1 downto 0); + variable temp_ram : ram_t := (others => (others => '0')); + begin + for i in 0 to (MEMORY_SIZE / WIDTH_BYTES) - 1 loop + exit when endfile(ram_file); + readline(ram_file, ram_line); + hread(ram_line, temp_word); + temp_ram(i) := temp_word; + end loop; + + return temp_ram; + end function; + + -- RAM instance + signal memory : ram_t := init_ram(RAM_INIT_FILE); + attribute ram_style : string; + attribute ram_style of memory : signal is "block"; + attribute ram_decomp : string; + attribute ram_decomp of memory : signal is "power"; + + -- Others + signal obuf : std_logic_vector(WIDTH-1 downto 0); +begin + + -- Actual RAM template + memory_0: process(clk) + begin + if rising_edge(clk) then + if we = '1' then + for i in 0 to 7 loop + if sel(i) = '1' then + memory(conv_integer(addr))((i + 1) * 8 - 1 downto i * 8) <= + di((i + 1) * 8 - 1 downto i * 8); + end if; + end loop; + end if; + if re = '1' then + obuf <= memory(conv_integer(addr)); + end if; + do <= obuf; + end if; + end process; + +end architecture behaviour; diff --git a/fpga/mw_soc_memory.vhdl b/fpga/mw_soc_memory.vhdl deleted file mode 100644 index 
7e998b2..0000000 --- a/fpga/mw_soc_memory.vhdl +++ /dev/null @@ -1,112 +0,0 @@ --- Based on: --- The Potato Processor - A simple processor for FPGAs --- (c) Kristian Klomsten Skordal 2014 - 2015 - -library ieee; -use ieee.std_logic_1164.all; -use ieee.std_logic_unsigned.all; -use ieee.numeric_std.all; -use std.textio.all; - -library work; -use work.wishbone_types.all; - -use work.pp_utilities.all; - ---! @brief Simple memory module for use in Wishbone-based systems. -entity mw_soc_memory is - generic( - MEMORY_SIZE : natural := 4096; --! Memory size in bytes. - RAM_INIT_FILE : string - ); - port( - clk : in std_logic; - rst : in std_logic; - - -- Wishbone interface: - wishbone_in : in wishbone_master_out; - wishbone_out : out wishbone_slave_out - ); -end entity mw_soc_memory; - -architecture behaviour of mw_soc_memory is - -- RAM type definition - type ram_t is array(0 to (MEMORY_SIZE / 8) - 1) of std_logic_vector(63 downto 0); - - -- RAM loading - impure function init_ram(name : STRING) return ram_t is - file ram_file : text open read_mode is name; - variable ram_line : line; - variable temp_word : std_logic_vector(63 downto 0); - variable temp_ram : ram_t := (others => (others => '0')); - begin - for i in 0 to (MEMORY_SIZE/8)-1 loop - exit when endfile(ram_file); - readline(ram_file, ram_line); - hread(ram_line, temp_word); - temp_ram(i) := temp_word; - end loop; - - return temp_ram; - end function; - - -- RAM instance - signal memory : ram_t := init_ram(RAM_INIT_FILE); - attribute ram_style : string; - attribute ram_style of memory : signal is "block"; - attribute ram_decomp : string; - attribute ram_decomp of memory : signal is "power"; - - -- RAM interface - constant ram_addr_bits : integer := log2(MEMORY_SIZE) - 3; - signal ram_addr : std_logic_vector(ram_addr_bits - 1 downto 0); - signal ram_di : std_logic_vector(63 downto 0); - signal ram_do : std_logic_vector(63 downto 0); - signal ram_sel : std_logic_vector(7 downto 0); - signal ram_we : std_ulogic; - - 
-- Others - signal ram_obuf : std_logic_vector(63 downto 0); - signal ack, ack_obuf : std_ulogic; -begin - - -- Actual RAM template - memory_0: process(clk) - begin - if rising_edge(clk) then - if ram_we = '1' then - for i in 0 to 7 loop - if ram_sel(i) = '1' then - memory(conv_integer(ram_addr))((i + 1) * 8 - 1 downto i * 8) <= - ram_di((i + 1) * 8 - 1 downto i * 8); - end if; - end loop; - end if; - ram_do <= memory(conv_integer(ram_addr)); - ram_obuf <= ram_do; - end if; - end process; - - -- Wishbone interface - ram_addr <= wishbone_in.adr(ram_addr_bits + 2 downto 3); - ram_di <= wishbone_in.dat; - ram_sel <= wishbone_in.sel; - ram_we <= wishbone_in.we and wishbone_in.stb and wishbone_in.cyc; - wishbone_out.stall <= '0'; - wishbone_out.ack <= ack_obuf; - wishbone_out.dat <= ram_obuf; - - wb_0: process(clk) - begin - if rising_edge(clk) then - if rst = '1' or wishbone_in.cyc = '0' then - ack_obuf <= '0'; - ack <= '0'; - else - ack <= wishbone_in.stb; - ack_obuf <= ack; - end if; - end if; - end process; - -end architecture behaviour; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index a50cabe..ea5cf3a 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -36,9 +36,9 @@ begin ); -- BRAM Memory slave - bram0: entity work.mw_soc_memory + bram0: entity work.wishbone_bram_wrapper generic map( - MEMORY_SIZE => 128, + MEMORY_SIZE => 1024, RAM_INIT_FILE => "icache_test.bin" ) port map( @@ -70,14 +70,18 @@ begin i_out.nia <= (others => '0'); i_out.stop_mark <= '0'; - wait for 4*clk_period; + wait until rising_edge(clk); + wait until rising_edge(clk); + wait until rising_edge(clk); + wait until rising_edge(clk); i_out.req <= '1'; i_out.nia <= x"0000000000000004"; wait for 30*clk_period; + wait until rising_edge(clk); - assert i_in.valid = '1'; + assert i_in.valid = '1' severity failure; assert i_in.insn = x"00000001" report "insn @" & to_hstring(i_out.nia) & "=" & to_hstring(i_in.insn) & @@ -86,27 +90,29 @@ begin i_out.req <= '0'; - wait for clk_period; + wait until 
rising_edge(clk); -- hit i_out.req <= '1'; i_out.nia <= x"0000000000000008"; - wait for clk_period; - assert i_in.valid = '1'; + wait until rising_edge(clk); + wait until rising_edge(clk); + assert i_in.valid = '1' severity failure; assert i_in.insn = x"00000002" report "insn @" & to_hstring(i_out.nia) & "=" & to_hstring(i_in.insn) & " expected 00000002" severity failure; - wait for clk_period; + wait until rising_edge(clk); -- another miss i_out.req <= '1'; i_out.nia <= x"0000000000000040"; wait for 30*clk_period; + wait until rising_edge(clk); - assert i_in.valid = '1'; + assert i_in.valid = '1' severity failure; assert i_in.insn = x"00000010" report "insn @" & to_hstring(i_out.nia) & "=" & to_hstring(i_in.insn) & @@ -116,13 +122,15 @@ begin -- test something that aliases i_out.req <= '1'; i_out.nia <= x"0000000000000100"; - wait for clk_period; - assert i_in.valid = '0'; - wait for clk_period; + wait until rising_edge(clk); + wait until rising_edge(clk); + assert i_in.valid = '0' severity failure; + wait until rising_edge(clk); wait for 30*clk_period; + wait until rising_edge(clk); - assert i_in.valid = '1'; + assert i_in.valid = '1' severity failure; assert i_in.insn = x"00000040" report "insn @" & to_hstring(i_out.nia) & "=" & to_hstring(i_in.insn) & diff --git a/microwatt.core b/microwatt.core index 35ada86..ee48376 100644 --- a/microwatt.core +++ b/microwatt.core @@ -36,20 +36,22 @@ filesets: - plru.vhdl - cache_ram.vhdl - core_debug.vhdl + - utils.vhdl file_type : vhdlSource-2008 soc: files: - wishbone_arbiter.vhdl - wishbone_debug_master.vhdl + - wishbone_bram_wrapper.vhdl - soc.vhdl file_type : vhdlSource-2008 fpga: files: - - fpga/pp_fifo.vhd - - fpga/mw_soc_memory.vhdl + - fpga/main_bram.vhdl - fpga/soc_reset.vhdl + - fpga/pp_fifo.vhd - fpga/pp_soc_uart.vhd - fpga/pp_utilities.vhd - fpga/toplevel.vhdl diff --git a/scripts/run_test.sh b/scripts/run_test.sh index b6f2ee6..ef737fe 100755 --- a/scripts/run_test.sh +++ b/scripts/run_test.sh @@ -21,7 +21,7 @@ 
Y=$(${MICROWATT_DIR}/scripts/hash.py tests/${TEST}.out) cd $TMPDIR -cp ${MICROWATT_DIR}/tests/${TEST}.bin simple_ram_behavioural.bin +cp ${MICROWATT_DIR}/tests/${TEST}.bin main_ram.bin X=$( ${MICROWATT_DIR}/core_tb | ${MICROWATT_DIR}/scripts/hash.py ) diff --git a/scripts/test_micropython.py b/scripts/test_micropython.py index d7ffb2d..edc076f 100755 --- a/scripts/test_micropython.py +++ b/scripts/test_micropython.py @@ -13,7 +13,7 @@ cwd = os.getcwd() os.chdir(tempdir.name) copyfile(os.path.join(cwd, 'tests/micropython.bin'), - os.path.join(tempdir.name, 'simple_ram_behavioural.bin')) + os.path.join(tempdir.name, 'main_ram.bin')) cmd = [ os.path.join(cwd, './core_tb') ] diff --git a/scripts/test_micropython_long.py b/scripts/test_micropython_long.py index 805c6b2..6dea3a4 100755 --- a/scripts/test_micropython_long.py +++ b/scripts/test_micropython_long.py @@ -13,7 +13,7 @@ cwd = os.getcwd() os.chdir(tempdir.name) copyfile(os.path.join(cwd, 'tests/micropython.bin'), - os.path.join(tempdir.name, 'simple_ram_behavioural.bin')) + os.path.join(tempdir.name, 'main_ram.bin')) cmd = [ os.path.join(cwd, './core_tb') ] diff --git a/sim_bram.vhdl b/sim_bram.vhdl new file mode 100644 index 0000000..d2d4f1b --- /dev/null +++ b/sim_bram.vhdl @@ -0,0 +1,67 @@ +-- Single port Block RAM with one cycle output buffer +-- +-- Simulated via C helpers + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; +use std.textio.all; + +library work; +use work.utils.all; +use work.sim_bram_helpers.all; + +entity main_bram is + generic( + WIDTH : natural := 64; + HEIGHT_BITS : natural := 1024; + MEMORY_SIZE : natural := 65536; + RAM_INIT_FILE : string + ); + port( + clk : in std_logic; + addr : in std_logic_vector(HEIGHT_BITS - 1 downto 0) ; + di : in std_logic_vector(WIDTH-1 downto 0); + do : out std_logic_vector(WIDTH-1 downto 0); + sel : in std_logic_vector((WIDTH/8)-1 downto 0); + re : in std_ulogic; + we : in std_ulogic + ); +end entity main_bram; + +architecture sim of 
main_bram is + + constant WIDTH_BYTES : natural := WIDTH / 8; + constant pad_zeros : std_ulogic_vector(log2(WIDTH_BYTES)-1 downto 0) + := (others => '0'); + + signal identifier : integer := behavioural_initialize(filename => RAM_INIT_FILE, + size => MEMORY_SIZE); + -- Others + signal obuf : std_logic_vector(WIDTH-1 downto 0); +begin + + -- Actual RAM template + memory_0: process(clk) + variable ret_dat_v : std_ulogic_vector(63 downto 0); + variable addr64 : std_ulogic_vector(63 downto 0); + begin + if rising_edge(clk) then + addr64 := (others => '0'); + addr64(HEIGHT_BITS + 2 downto 3) := addr; + if we = '1' then + report "RAM writing " & to_hstring(di) & " to " & + to_hstring(addr & pad_zeros) & " sel:" & to_hstring(sel); + behavioural_write(di, addr64, to_integer(unsigned(sel)), identifier); + end if; + if re = '1' then + behavioural_read(ret_dat_v, addr64, to_integer(unsigned(sel)), identifier); + report "RAM reading from " & to_hstring(addr & pad_zeros) & + " returns " & to_hstring(ret_dat_v); + obuf <= ret_dat_v(obuf'left downto 0); + end if; + do <= obuf; + end if; + end process; + +end architecture sim; diff --git a/simple_ram_behavioural_helpers.vhdl b/sim_bram_helpers.vhdl similarity index 84% rename from simple_ram_behavioural_helpers.vhdl rename to sim_bram_helpers.vhdl index 507594f..c511a6e 100644 --- a/simple_ram_behavioural_helpers.vhdl +++ b/sim_bram_helpers.vhdl @@ -1,24 +1,24 @@ library ieee; use ieee.std_logic_1164.all; -package simple_ram_behavioural_helpers is +package sim_bram_helpers is function behavioural_initialize (filename: String; size: integer) return integer; attribute foreign of behavioural_initialize : function is "VHPIDIRECT behavioural_initialize"; - procedure behavioural_read (val: out std_ulogic_vector(63 downto 0); addr: std_ulogic_vector(63 downto 0); length: integer; identifier: integer; reload: integer); + procedure behavioural_read (val: out std_ulogic_vector(63 downto 0); addr: std_ulogic_vector(63 downto 0); length: 
integer; identifier:integer); attribute foreign of behavioural_read : procedure is "VHPIDIRECT behavioural_read"; procedure behavioural_write (val: std_ulogic_vector(63 downto 0); addr: std_ulogic_vector(63 downto 0); length: integer; identifier: integer); attribute foreign of behavioural_write : procedure is "VHPIDIRECT behavioural_write"; -end simple_ram_behavioural_helpers; +end sim_bram_helpers; -package body simple_ram_behavioural_helpers is +package body sim_bram_helpers is function behavioural_initialize (filename: String; size: integer) return integer is begin assert false report "VHPI" severity failure; end behavioural_initialize; - procedure behavioural_read (val: out std_ulogic_vector(63 downto 0); addr: std_ulogic_vector(63 downto 0); length: integer; identifier: integer; reload: integer) is + procedure behavioural_read (val: out std_ulogic_vector(63 downto 0); addr: std_ulogic_vector(63 downto 0); length: integer; identifier: integer) is begin assert false report "VHPI" severity failure; end behavioural_read; @@ -27,4 +27,4 @@ package body simple_ram_behavioural_helpers is begin assert false report "VHPI" severity failure; end behavioural_write; -end simple_ram_behavioural_helpers; +end sim_bram_helpers; diff --git a/simple_ram_behavioural_helpers_c.c b/sim_bram_helpers_c.c similarity index 100% rename from simple_ram_behavioural_helpers_c.c rename to sim_bram_helpers_c.c diff --git a/simple_ram_behavioural.vhdl b/simple_ram_behavioural.vhdl deleted file mode 100644 index d6255b8..0000000 --- a/simple_ram_behavioural.vhdl +++ /dev/null @@ -1,133 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; -use std.textio.all; - -library work; -use work.wishbone_types.all; -use work.simple_ram_behavioural_helpers.all; - -entity mw_soc_memory is - generic ( - RAM_INIT_FILE : string; - MEMORY_SIZE : integer; - PIPELINE_DEPTH : integer := 1 - ); - - port ( - clk : in std_ulogic; - rst : in std_ulogic; - - wishbone_in : in 
wishbone_master_out; - wishbone_out : out wishbone_slave_out - ); -end mw_soc_memory; - -architecture behave of mw_soc_memory is - type wishbone_state_t is (IDLE, ACK); - - signal state : wishbone_state_t := IDLE; - signal ret_ack : std_ulogic := '0'; - signal identifier : integer := behavioural_initialize(filename => RAM_INIT_FILE, size => MEMORY_SIZE); - signal reload : integer := 0; - signal ret_dat : wishbone_data_type; - - subtype pipe_idx_t is integer range 0 to PIPELINE_DEPTH-1; - type pipe_ack_t is array(pipe_idx_t) of std_ulogic; - type pipe_dat_t is array(pipe_idx_t) of wishbone_data_type; -begin - - pipe_big: if PIPELINE_DEPTH > 1 generate - signal pipe_ack : pipe_ack_t; - signal pipe_dat : pipe_dat_t; - begin - wishbone_out.stall <= '0'; - wishbone_out.ack <= pipe_ack(0); - wishbone_out.dat <= pipe_dat(0); - - pipe_big_sync: process(clk) - begin - if rising_edge(clk) then - pipe_stages: for i in 0 to PIPELINE_DEPTH-2 loop - pipe_ack(i) <= pipe_ack(i+1); - pipe_dat(i) <= pipe_dat(i+1); - end loop; - pipe_ack(PIPELINE_DEPTH-1) <= ret_ack; - pipe_dat(PIPELINE_DEPTH-1) <= ret_dat; - end if; - end process; - end generate; - - pipe_one: if PIPELINE_DEPTH = 1 generate - signal pipe_ack : std_ulogic; - signal pipe_dat : wishbone_data_type; - begin - wishbone_out.stall <= '0'; - wishbone_out.ack <= pipe_ack; - wishbone_out.dat <= pipe_dat; - - pipe_one_sync: process(clk) - begin - if rising_edge(clk) then - pipe_ack <= ret_ack; - pipe_dat <= ret_dat; - end if; - end process; - end generate; - - pipe_none: if PIPELINE_DEPTH = 0 generate - begin - wishbone_out.ack <= ret_ack; - wishbone_out.dat <= ret_dat; - wishbone_out.stall <= wishbone_in.cyc and not ret_ack; - end generate; - - wishbone_process: process(clk) - variable ret_dat_v : wishbone_data_type; - variable adr : std_ulogic_vector(63 downto 0); - begin - if rising_edge(clk) then - if rst = '1' then - state <= IDLE; - ret_ack <= '0'; - else - ret_dat <= x"FFFFFFFFFFFFFFFF"; - ret_ack <= '0'; - - -- Active - 
if wishbone_in.cyc = '1' then - case state is - when IDLE => - if wishbone_in.stb = '1' then - adr := (wishbone_in.adr'left downto 0 => wishbone_in.adr, - others => '0'); - -- write - if wishbone_in.we = '1' then - assert not(is_x(wishbone_in.dat)) and not(is_x(wishbone_in.adr)) severity failure; - report "RAM writing " & to_hstring(wishbone_in.dat) & " to " & to_hstring(wishbone_in.adr); - behavioural_write(wishbone_in.dat, adr, to_integer(unsigned(wishbone_in.sel)), identifier); - reload <= reload + 1; - ret_ack <= '1'; - if PIPELINE_DEPTH = 0 then - state <= ACK; - end if; - else - behavioural_read(ret_dat_v, adr, to_integer(unsigned(wishbone_in.sel)), identifier, reload); - report "RAM reading from " & to_hstring(wishbone_in.adr) & " returns " & to_hstring(ret_dat_v); - ret_dat <= ret_dat_v; - ret_ack <= '1'; - if PIPELINE_DEPTH = 0 then - state <= ACK; - end if; - end if; - end if; - when ACK => - state <= IDLE; - end case; - else - state <= IDLE; - end if; - end if; - end if; - end process; -end behave; diff --git a/simple_ram_behavioural_tb.vhdl b/simple_ram_behavioural_tb.vhdl deleted file mode 100644 index bee7d2e..0000000 --- a/simple_ram_behavioural_tb.vhdl +++ /dev/null @@ -1,246 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -library work; -use work.wishbone_types.all; - -entity simple_ram_behavioural_tb is -end simple_ram_behavioural_tb; - -architecture behave of simple_ram_behavioural_tb is - signal clk : std_ulogic; - signal rst : std_ulogic := '1'; - - constant clk_period : time := 10 ns; - - signal w_in : wishbone_slave_out; - signal w_out : wishbone_master_out; - - impure function to_adr(a: integer) return std_ulogic_vector is - begin - return std_ulogic_vector(to_unsigned(a, w_out.adr'length)); - end; -begin - simple_ram_0: entity work.mw_soc_memory - generic map ( - RAM_INIT_FILE => "simple_ram_behavioural_tb.bin", - MEMORY_SIZE => 16 - ) - port map ( - clk => clk, - rst => rst, - wishbone_out => w_in, - 
wishbone_in => w_out - ); - - clock: process - begin - clk <= '1'; - wait for clk_period / 2; - clk <= '0'; - wait for clk_period / 2; - end process clock; - - stim: process - begin - w_out.adr <= (others => '0'); - w_out.dat <= (others => '0'); - w_out.cyc <= '0'; - w_out.stb <= '0'; - w_out.sel <= (others => '0'); - w_out.we <= '0'; - - wait for clk_period; - rst <= '0'; - - wait for clk_period; - - w_out.cyc <= '1'; - - -- test various read lengths and alignments - w_out.stb <= '1'; - w_out.sel <= "00000001"; - w_out.adr <= to_adr(0); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(7 downto 0) = x"00" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "00000001"; - w_out.adr <= to_adr(1); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(7 downto 0) = x"01" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "00000001"; - w_out.adr <= to_adr(7); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(7 downto 0) = x"07" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "00000011"; - w_out.adr <= to_adr(0); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(15 downto 0) = x"0100" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "00000011"; - w_out.adr <= to_adr(1); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(15 downto 0) = x"0201" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "00000011"; - w_out.adr <= to_adr(7); - assert w_in.ack = '0'; - wait 
for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(15 downto 0) = x"0807" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "00001111"; - w_out.adr <= to_adr(0); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(31 downto 0) = x"03020100" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "00001111"; - w_out.adr <= to_adr(1); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(31 downto 0) = x"04030201" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "00001111"; - w_out.adr <= to_adr(7); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(31 downto 0) = x"0A090807" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "11111111"; - w_out.adr <= to_adr(0); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(63 downto 0) = x"0706050403020100" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "11111111"; - w_out.adr <= to_adr(1); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(63 downto 0) = x"0807060504030201" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "11111111"; - w_out.adr <= to_adr(7); - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(63 downto 0) = x"0E0D0C0B0A090807" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - -- test various write lengths and alignments - 
w_out.stb <= '1'; - w_out.sel <= "00000001"; - w_out.adr <= to_adr(0); - w_out.we <= '1'; - w_out.dat(7 downto 0) <= x"0F"; - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "00000001"; - w_out.adr <= to_adr(0); - w_out.we <= '0'; - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat(7 downto 0) = x"0F" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "11111111"; - w_out.adr <= to_adr(7); - w_out.we <= '1'; - w_out.dat <= x"BADC0FFEBADC0FFE"; - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - w_out.stb <= '1'; - w_out.sel <= "11111111"; - w_out.adr <= to_adr(7); - w_out.we <= '0'; - assert w_in.ack = '0'; - wait for clk_period; - assert w_in.ack = '1'; - assert w_in.dat = x"BADC0FFEBADC0FFE" report to_hstring(w_in.dat); - w_out.stb <= '0'; - wait for clk_period; - assert w_in.ack = '0'; - - assert false report "end of test" severity failure; - wait; - end process; -end behave; diff --git a/soc.vhdl b/soc.vhdl index b9a8215..6ed6bf7 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -170,7 +170,7 @@ begin wb_uart0_out.stall <= '0' when wb_uart0_in.cyc = '0' else not wb_uart0_out.ack; -- BRAM Memory slave - bram0: entity work.mw_soc_memory + bram0: entity work.wishbone_bram_wrapper generic map( MEMORY_SIZE => MEMORY_SIZE, RAM_INIT_FILE => RAM_INIT_FILE diff --git a/simple_ram_behavioural_tb.bin b/wishbone_bram_tb.bin similarity index 100% rename from simple_ram_behavioural_tb.bin rename to wishbone_bram_tb.bin diff --git a/wishbone_bram_tb.vhdl b/wishbone_bram_tb.vhdl new file mode 100644 index 0000000..be64db6 --- /dev/null +++ b/wishbone_bram_tb.vhdl @@ -0,0 +1,175 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; 
+ +library work; +use work.wishbone_types.all; + +entity wishbone_bram_tb is +end wishbone_bram_tb; + +architecture behave of wishbone_bram_tb is + signal clk : std_ulogic; + signal rst : std_ulogic := '1'; + + constant clk_period : time := 10 ns; + + signal w_in : wishbone_slave_out; + signal w_out : wishbone_master_out; + + impure function to_adr(a: integer) return std_ulogic_vector is + begin + return std_ulogic_vector(to_unsigned(a, w_out.adr'length)); + end; +begin + simple_ram_0: entity work.wishbone_bram_wrapper + generic map ( + RAM_INIT_FILE => "wishbone_bram_tb.bin", + MEMORY_SIZE => 16 + ) + port map ( + clk => clk, + rst => rst, + wishbone_out => w_in, + wishbone_in => w_out + ); + + clock: process + begin + clk <= '1'; + wait for clk_period / 2; + clk <= '0'; + wait for clk_period / 2; + end process clock; + + stim: process + begin + w_out.adr <= (others => '0'); + w_out.dat <= (others => '0'); + w_out.cyc <= '0'; + w_out.stb <= '0'; + w_out.sel <= (others => '0'); + w_out.we <= '0'; + + wait until rising_edge(clk); + rst <= '0'; + wait until rising_edge(clk); + + w_out.cyc <= '1'; + + -- Test read 0 + w_out.stb <= '1'; + w_out.sel <= "11111111"; + w_out.adr <= to_adr(0); + assert w_in.ack = '0'; + wait until rising_edge(clk); + w_out.stb <= '0'; + wait until rising_edge(clk); + wait until rising_edge(clk); + assert w_in.ack = '1'; + assert w_in.dat(63 downto 0) = x"0706050403020100" report to_hstring(w_in.dat); + wait until rising_edge(clk); + assert w_in.ack = '0'; + + -- Test read 8 + w_out.stb <= '1'; + w_out.sel <= "11111111"; + w_out.adr <= to_adr(8); + assert w_in.ack = '0'; + wait until rising_edge(clk); + w_out.stb <= '0'; + wait until rising_edge(clk); + wait until rising_edge(clk); + assert w_in.ack = '1'; + assert w_in.dat(63 downto 0) = x"0F0E0D0C0B0A0908" report to_hstring(w_in.dat); + wait until rising_edge(clk); + assert w_in.ack = '0'; + + -- Test write byte at 0 + w_out.stb <= '1'; + w_out.sel <= "00000001"; + w_out.adr <= to_adr(0); 
+ w_out.we <= '1'; + w_out.dat(7 downto 0) <= x"0F"; + assert w_in.ack = '0'; + wait until rising_edge(clk); + w_out.stb <= '0'; + wait until rising_edge(clk) and w_in.ack = '1'; + wait until rising_edge(clk); + assert w_in.ack = '0'; + + -- Test read back + w_out.stb <= '1'; + w_out.sel <= "11111111"; + w_out.adr <= to_adr(0); + w_out.we <= '0'; + assert w_in.ack = '0'; + wait until rising_edge(clk); + w_out.stb <= '0'; + wait until rising_edge(clk); + wait until rising_edge(clk); + assert w_in.ack = '1'; + assert w_in.dat(63 downto 0) = x"070605040302010F" report to_hstring(w_in.dat); + wait until rising_edge(clk); + assert w_in.ack = '0'; + + -- Test write dword at 4 + w_out.stb <= '1'; + w_out.sel <= "11110000"; + w_out.adr <= to_adr(0); + w_out.we <= '1'; + w_out.dat(63 downto 32) <= x"BAADFEED"; + assert w_in.ack = '0'; + wait until rising_edge(clk); + w_out.stb <= '0'; + wait until rising_edge(clk) and w_in.ack = '1'; + wait until rising_edge(clk); + assert w_in.ack = '0'; + + -- Test read back + w_out.stb <= '1'; + w_out.sel <= "11111111"; + w_out.adr <= to_adr(0); + w_out.we <= '0'; + assert w_in.ack = '0'; + wait until rising_edge(clk); + w_out.stb <= '0'; + wait until rising_edge(clk); + wait until rising_edge(clk); + assert w_in.ack = '1'; + assert w_in.dat(63 downto 0) = x"BAADFEED0302010F" report to_hstring(w_in.dat); + wait until rising_edge(clk); + assert w_in.ack = '0'; + + -- Test write qword at 8 + w_out.stb <= '1'; + w_out.sel <= "11111111"; + w_out.adr <= to_adr(8); + w_out.we <= '1'; + w_out.dat(63 downto 0) <= x"0001020304050607"; + assert w_in.ack = '0'; + wait until rising_edge(clk); + w_out.stb <= '0'; + wait until rising_edge(clk) and w_in.ack = '1'; + wait until rising_edge(clk); + assert w_in.ack = '0'; + + -- Test read back + w_out.stb <= '1'; + w_out.sel <= "11111111"; + w_out.adr <= to_adr(8); + w_out.we <= '0'; + assert w_in.ack = '0'; + wait until rising_edge(clk); + w_out.stb <= '0'; + wait until rising_edge(clk); + wait until 
rising_edge(clk); + assert w_in.ack = '1'; + assert w_in.dat(63 downto 0) = x"0001020304050607" report to_hstring(w_in.dat); + wait until rising_edge(clk); + assert w_in.ack = '0'; + + assert false report "end of test" severity failure; + wait; + end process; +end behave; diff --git a/wishbone_bram_wrapper.vhdl b/wishbone_bram_wrapper.vhdl new file mode 100644 index 0000000..a711c3d --- /dev/null +++ b/wishbone_bram_wrapper.vhdl @@ -0,0 +1,76 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; +use std.textio.all; + +library work; +use work.utils.all; +use work.wishbone_types.all; + +--! @brief Simple memory module for use in Wishbone-based systems. +entity wishbone_bram_wrapper is + generic( + MEMORY_SIZE : natural := 4096; --! Memory size in bytes. + RAM_INIT_FILE : string + ); + port( + clk : in std_logic; + rst : in std_logic; + + -- Wishbone interface: + wishbone_in : in wishbone_master_out; + wishbone_out : out wishbone_slave_out + ); +end entity wishbone_bram_wrapper; + +architecture behaviour of wishbone_bram_wrapper is + constant ram_addr_bits : integer := log2(MEMORY_SIZE) - 3; + + -- RAM interface + signal ram_addr : std_logic_vector(ram_addr_bits - 1 downto 0); + signal ram_we : std_ulogic; + signal ram_re : std_ulogic; + + -- Others + signal ack, ack_buf : std_ulogic; +begin + + -- Actual RAM template + ram_0: entity work.main_bram + generic map( + WIDTH => 64, + HEIGHT_BITS => ram_addr_bits, + MEMORY_SIZE => MEMORY_SIZE, + RAM_INIT_FILE => RAM_INIT_FILE + ) + port map( + clk => clk, + addr => ram_addr, + di => wishbone_in.dat, + do => wishbone_out.dat, + sel => wishbone_in.sel, + re => ram_re, + we => ram_we + ); + + -- Wishbone interface + ram_addr <= wishbone_in.adr(ram_addr_bits + 2 downto 3); + ram_we <= wishbone_in.stb and wishbone_in.cyc and wishbone_in.we; + ram_re <= wishbone_in.stb and wishbone_in.cyc and not wishbone_in.we; + wishbone_out.stall <= '0'; + wishbone_out.ack <= ack_buf; + + wb_0: process(clk) + begin + if 
rising_edge(clk) then + if rst = '1' or wishbone_in.cyc = '0' then + ack_buf <= '0'; + ack <= '0'; + else + ack <= wishbone_in.stb; + ack_buf <= ack; + end if; + end if; + end process; + +end architecture behaviour; From 336f0e0690e391700ea4e636824f641927e20512 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 23 Oct 2019 14:00:30 +1100 Subject: [PATCH 18/21] ram: Ack stores early Stores only need a single cycle, so we can ack them early if there isn't an older ack already in the pipeline Signed-off-by: Benjamin Herrenschmidt --- wishbone_bram_wrapper.vhdl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/wishbone_bram_wrapper.vhdl b/wishbone_bram_wrapper.vhdl index a711c3d..14520b5 100644 --- a/wishbone_bram_wrapper.vhdl +++ b/wishbone_bram_wrapper.vhdl @@ -67,8 +67,16 @@ begin ack_buf <= '0'; ack <= '0'; else - ack <= wishbone_in.stb; - ack_buf <= ack; + -- On loads, we have a delay cycle due to BRAM buffering + -- but not on stores. So try to send an early ack on a + -- store if we aren't behind an existing load ack. + -- + if ram_we = '1' and ack = '0' then + ack_buf <= '1'; + else + ack <= wishbone_in.stb; + ack_buf <= ack; + end if; end if; end if; end process; From 472d8f94a240a3f739ed9fae574e97e1fc57ab8f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 23 Oct 2019 14:01:48 +1100 Subject: [PATCH 19/21] wb_arbiter: Avoid IDLE cycle when not changing master Consecutive accesses from the same master shouldn't need an IDLE cycle. Completely remove the IDLE state and switch master when the bus is idle, but stay on the last selected one between cycles. 
Signed-off-by: Benjamin Herrenschmidt --- wishbone_arbiter.vhdl | 69 ++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 37 deletions(-) diff --git a/wishbone_arbiter.vhdl b/wishbone_arbiter.vhdl index 8e2358b..b206df1 100644 --- a/wishbone_arbiter.vhdl +++ b/wishbone_arbiter.vhdl @@ -24,58 +24,53 @@ entity wishbone_arbiter is end wishbone_arbiter; architecture behave of wishbone_arbiter is - type wishbone_arbiter_state_t is (IDLE, WB1_BUSY, WB2_BUSY, WB3_BUSY); - signal state : wishbone_arbiter_state_t := IDLE; + type wb_arb_master_t is (WB1, WB2, WB3); + signal candidate, selected : wb_arb_master_t; begin - wishbone_muxes: process(state, wb_in, wb1_in, wb2_in, wb3_in) + wishbone_muxes: process(selected, wb_in, wb1_in, wb2_in, wb3_in) begin -- Requests from masters are fully muxed - wb_out <= wb1_in when state = WB1_BUSY else - wb2_in when state = WB2_BUSY else - wb3_in when state = WB3_BUSY else - wishbone_master_out_init; + wb_out <= wb1_in when selected = WB1 else + wb2_in when selected = WB2 else + wb3_in when selected = WB3; -- Responses from slave don't need to mux the data bus wb1_out.dat <= wb_in.dat; wb2_out.dat <= wb_in.dat; wb3_out.dat <= wb_in.dat; - wb1_out.ack <= wb_in.ack when state = WB1_BUSY else '0'; - wb2_out.ack <= wb_in.ack when state = WB2_BUSY else '0'; - wb3_out.ack <= wb_in.ack when state = WB3_BUSY else '0'; - wb1_out.stall <= wb_in.stall when state = WB1_BUSY else '1'; - wb2_out.stall <= wb_in.stall when state = WB2_BUSY else '1'; - wb3_out.stall <= wb_in.stall when state = WB3_BUSY else '1'; + wb1_out.ack <= wb_in.ack when selected = WB1 else '0'; + wb2_out.ack <= wb_in.ack when selected = WB2 else '0'; + wb3_out.ack <= wb_in.ack when selected = WB3 else '0'; + wb1_out.stall <= wb_in.stall when selected = WB1 else '1'; + wb2_out.stall <= wb_in.stall when selected = WB2 else '1'; + wb3_out.stall <= wb_in.stall when selected = WB3 else '1'; + end process; + + -- Candidate selection is dumb, priority order... 
we could + -- instead consider some form of fairness but it's not really + -- an issue at the moment. + -- + wishbone_candidate: process(wb1_in.cyc, wb2_in.cyc, wb3_in.cyc) + begin + if wb1_in.cyc = '1' then + candidate <= WB1; + elsif wb2_in.cyc = '1' then + candidate <= WB2; + elsif wb3_in.cyc = '1' then + candidate <= WB3; + else + candidate <= selected; + end if; end process; wishbone_arbiter_process: process(clk) begin if rising_edge(clk) then if rst = '1' then - state <= IDLE; - else - case state is - when IDLE => - if wb1_in.cyc = '1' then - state <= WB1_BUSY; - elsif wb2_in.cyc = '1' then - state <= WB2_BUSY; - elsif wb3_in.cyc = '1' then - state <= WB3_BUSY; - end if; - when WB1_BUSY => - if wb1_in.cyc = '0' then - state <= IDLE; - end if; - when WB2_BUSY => - if wb2_in.cyc = '0' then - state <= IDLE; - end if; - when WB3_BUSY => - if wb3_in.cyc = '0' then - state <= IDLE; - end if; - end case; + selected <= WB1; + elsif wb_out.cyc = '0' then + selected <= candidate; end if; end if; end process; From bc2acfde2fd0aaf9a3a6b0af04e1175eb03762e8 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 23 Oct 2019 14:28:12 +1100 Subject: [PATCH 20/21] wb_arbiter: Make arbiter size parametric Signed-off-by: Benjamin Herrenschmidt --- soc.vhdl | 23 +++++++++++++--- wishbone_arbiter.vhdl | 63 +++++++++++++++++-------------------------- wishbone_types.vhdl | 3 +++ 3 files changed, 46 insertions(+), 43 deletions(-) diff --git a/soc.vhdl b/soc.vhdl index 6ed6bf7..fb8a36d 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -43,6 +43,12 @@ architecture behaviour of soc is signal wishbone_debug_in : wishbone_slave_out; signal wishbone_debug_out : wishbone_master_out; + -- Arbiter array (ghdl doesn't support assigning the array + -- elements in the entity instantiation) + constant NUM_WB_MASTERS : positive := 3; + signal wb_masters_out : wishbone_master_out_vector(0 to NUM_WB_MASTERS-1); + signal wb_masters_in : wishbone_slave_out_vector(0 to NUM_WB_MASTERS-1); + -- Wishbone 
master (output of arbiter): signal wb_master_in : wishbone_slave_out; signal wb_master_out : wishbone_master_out; @@ -96,13 +102,22 @@ begin ); -- Wishbone bus master arbiter & mux + wb_masters_out <= (0 => wishbone_dcore_out, + 1 => wishbone_icore_out, + 2 => wishbone_debug_out); + wishbone_dcore_in <= wb_masters_in(0); + wishbone_icore_in <= wb_masters_in(1); + wishbone_debug_in <= wb_masters_in(2); wishbone_arbiter_0: entity work.wishbone_arbiter + generic map( + NUM_MASTERS => NUM_WB_MASTERS + ) port map( clk => system_clk, rst => rst, - wb1_in => wishbone_dcore_out, wb1_out => wishbone_dcore_in, - wb2_in => wishbone_icore_out, wb2_out => wishbone_icore_in, - wb3_in => wishbone_debug_out, wb3_out => wishbone_debug_in, - wb_out => wb_master_out, wb_in => wb_master_in + wb_masters_in => wb_masters_out, + wb_masters_out => wb_masters_in, + wb_slave_out => wb_master_out, + wb_slave_in => wb_master_in ); -- Wishbone slaves address decoder & mux diff --git a/wishbone_arbiter.vhdl b/wishbone_arbiter.vhdl index b206df1..63b3c4a 100644 --- a/wishbone_arbiter.vhdl +++ b/wishbone_arbiter.vhdl @@ -6,70 +6,55 @@ use work.wishbone_types.all; -- TODO: Use an array of master/slaves with parametric size entity wishbone_arbiter is + generic( + NUM_MASTERS : positive := 3 + ); port (clk : in std_ulogic; rst : in std_ulogic; - wb1_in : in wishbone_master_out; - wb1_out : out wishbone_slave_out; + wb_masters_in : in wishbone_master_out_vector(0 to NUM_MASTERS-1); + wb_masters_out : out wishbone_slave_out_vector(0 to NUM_MASTERS-1); - wb2_in : in wishbone_master_out; - wb2_out : out wishbone_slave_out; - - wb3_in : in wishbone_master_out; - wb3_out : out wishbone_slave_out; - - wb_out : out wishbone_master_out; - wb_in : in wishbone_slave_out + wb_slave_out : out wishbone_master_out; + wb_slave_in : in wishbone_slave_out ); end wishbone_arbiter; architecture behave of wishbone_arbiter is - type wb_arb_master_t is (WB1, WB2, WB3); + subtype wb_arb_master_t is integer range 0 to 
NUM_MASTERS-1; signal candidate, selected : wb_arb_master_t; begin - wishbone_muxes: process(selected, wb_in, wb1_in, wb2_in, wb3_in) + wishbone_muxes: process(selected, wb_slave_in, wb_masters_in) begin - -- Requests from masters are fully muxed - wb_out <= wb1_in when selected = WB1 else - wb2_in when selected = WB2 else - wb3_in when selected = WB3; - - -- Responses from slave don't need to mux the data bus - wb1_out.dat <= wb_in.dat; - wb2_out.dat <= wb_in.dat; - wb3_out.dat <= wb_in.dat; - wb1_out.ack <= wb_in.ack when selected = WB1 else '0'; - wb2_out.ack <= wb_in.ack when selected = WB2 else '0'; - wb3_out.ack <= wb_in.ack when selected = WB3 else '0'; - wb1_out.stall <= wb_in.stall when selected = WB1 else '1'; - wb2_out.stall <= wb_in.stall when selected = WB2 else '1'; - wb3_out.stall <= wb_in.stall when selected = WB3 else '1'; + wb_slave_out <= wb_masters_in(selected); + for i in 0 to NUM_MASTERS-1 loop + wb_masters_out(i).dat <= wb_slave_in.dat; + wb_masters_out(i).ack <= wb_slave_in.ack when selected = i else '0'; + wb_masters_out(i).stall <= wb_slave_in.stall when selected = i else '1'; + end loop; end process; -- Candidate selection is dumb, priority order... we could -- instead consider some form of fairness but it's not really -- an issue at the moment. 
-- - wishbone_candidate: process(wb1_in.cyc, wb2_in.cyc, wb3_in.cyc) + wishbone_candidate: process(all) begin - if wb1_in.cyc = '1' then - candidate <= WB1; - elsif wb2_in.cyc = '1' then - candidate <= WB2; - elsif wb3_in.cyc = '1' then - candidate <= WB3; - else - candidate <= selected; - end if; + candidate <= selected; + for i in NUM_MASTERS-1 downto 0 loop + if wb_masters_in(i).cyc = '1' then + candidate <= i; + end if; + end loop; end process; wishbone_arbiter_process: process(clk) begin if rising_edge(clk) then if rst = '1' then - selected <= WB1; - elsif wb_out.cyc = '0' then + selected <= 0; + elsif wb_slave_out.cyc = '0' then selected <= candidate; end if; end if; diff --git a/wishbone_types.vhdl b/wishbone_types.vhdl index 1b8a28b..d1f2a45 100644 --- a/wishbone_types.vhdl +++ b/wishbone_types.vhdl @@ -27,4 +27,7 @@ package wishbone_types is end record; constant wishbone_slave_out_init : wishbone_slave_out := (ack => '0', stall => '0', others => (others => '0')); + type wishbone_master_out_vector is array (natural range <>) of wishbone_master_out; + type wishbone_slave_out_vector is array (natural range <>) of wishbone_slave_out; + end package wishbone_types; From cff4b13a9b8d0036f9441c150bfd8e1d7bc620e3 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 23 Oct 2019 15:06:39 +1100 Subject: [PATCH 21/21] wb_arbiter: Early master selection This flips the arbiter muxes on the same cycle as a new request comes in, thus avoiding a cycle latency. 
Signed-off-by: Benjamin Herrenschmidt --- wishbone_arbiter.vhdl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/wishbone_arbiter.vhdl b/wishbone_arbiter.vhdl index 63b3c4a..cb632bf 100644 --- a/wishbone_arbiter.vhdl +++ b/wishbone_arbiter.vhdl @@ -23,15 +23,23 @@ end wishbone_arbiter; architecture behave of wishbone_arbiter is subtype wb_arb_master_t is integer range 0 to NUM_MASTERS-1; signal candidate, selected : wb_arb_master_t; + signal busy : std_ulogic; begin - wishbone_muxes: process(selected, wb_slave_in, wb_masters_in) + busy <= wb_masters_in(selected).cyc; + + wishbone_muxes: process(selected, candidate, busy, wb_slave_in, wb_masters_in) + variable early_sel : wb_arb_master_t; begin - wb_slave_out <= wb_masters_in(selected); + early_sel := selected; + if busy = '0' then + early_sel := candidate; + end if; + wb_slave_out <= wb_masters_in(early_sel); for i in 0 to NUM_MASTERS-1 loop wb_masters_out(i).dat <= wb_slave_in.dat; - wb_masters_out(i).ack <= wb_slave_in.ack when selected = i else '0'; - wb_masters_out(i).stall <= wb_slave_in.stall when selected = i else '1'; + wb_masters_out(i).ack <= wb_slave_in.ack when early_sel = i else '0'; + wb_masters_out(i).stall <= wb_slave_in.stall when early_sel = i else '1'; end loop; end process; @@ -54,7 +62,7 @@ begin if rising_edge(clk) then if rst = '1' then selected <= 0; - elsif wb_slave_out.cyc = '0' then + elsif busy = '0' then selected <= candidate; end if; end if;