From 1d7de2f1dae295364848940f31c991c8b095f4aa Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Tue, 22 Feb 2022 09:30:05 +1100
Subject: [PATCH] register_file: Make read access to register file synchronous

With this, the register RAM is read synchronously using the addresses supplied by decode1. That means the register RAM can now be block RAM rather than LUT RAM.

Debug accesses are done via the B port on cycles when decode1 indicates that there is no valid instruction or the instruction doesn't use a [F]RB operand.

We latch the addresses being read in each cycle and use the same address next cycle if stalled.

Data that is being written is latched and a multiplexer on each read port then supplies the latched write data if the read address for that port equals the write address.

Signed-off-by: Paul Mackerras
---
 common.vhdl | 3 ++
 decode1.vhdl | 14 ++++++
 register_file.vhdl | 117 +++++++++++++++++++++++++++++++--------------
 3 files changed, 99 insertions(+), 35 deletions(-)

diff --git a/common.vhdl b/common.vhdl index 0349a6e..4d6cb91 100644 --- a/common.vhdl +++ b/common.vhdl @@ -280,6 +280,9 @@ package common is reg_1_addr : gspr_index_t; reg_2_addr : gspr_index_t; reg_3_addr : gspr_index_t; + read_1_enable : std_ulogic; + read_2_enable : std_ulogic; + read_3_enable : std_ulogic; end record; type bypass_data_t is record diff --git a/decode1.vhdl b/decode1.vhdl index 36d511b..cc93dfc 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -641,6 +641,7 @@ begin variable bv : br_predictor_t; variable fprs, fprabc : std_ulogic; variable in3rc : std_ulogic; + variable may_read_rb : std_ulogic; begin v := Decode1ToDecode2Init; vi := reg_internal_t_init; @@ -654,6 +655,7 @@ begin fprs := '0'; fprabc := '0'; in3rc := '0'; + may_read_rb := '0'; if f_in.valid = '1' then report "Decode insn " & to_hstring(f_in.insn) & " at " & to_hstring(f_in.nia); @@ -675,10 +677,16 @@ begin vi.override := not decode_op_4_valid(to_integer(unsigned(minor4op))); v.decode := 
decode_op_4_array(to_integer(unsigned(f_in.insn(5 downto 0)))); in3rc := '1'; + may_read_rb := '1'; + + when 23 => + -- rlwnm[.] + may_read_rb := '1'; when 31 => -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); + may_read_rb := '1'; if std_match(f_in.insn(10 downto 1), "01-1010011") then -- mfspr or mtspr @@ -728,6 +736,7 @@ begin when 30 => v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1)))); + may_read_rb := f_in.insn(4); when 52 | 53 | 54 | 55 => -- stfd[u] and stfs[u] @@ -748,6 +757,7 @@ begin in3rc := '1'; fprabc := '1'; fprs := '1'; + may_read_rb := '1'; end if; when 62 => @@ -764,6 +774,7 @@ begin in3rc := '1'; fprabc := '1'; fprs := '1'; + may_read_rb := '1'; end if; when others => @@ -777,6 +788,9 @@ begin else vr.reg_3_addr := fprs & insn_rs(f_in.insn); end if; + vr.read_1_enable := f_in.valid and not f_in.fetch_failed; + vr.read_2_enable := f_in.valid and not f_in.fetch_failed and may_read_rb; + vr.read_3_enable := f_in.valid and not f_in.fetch_failed; if f_in.fetch_failed = '1' then v.valid := '1'; diff --git a/register_file.vhdl b/register_file.vhdl index bc40c3f..a8ddee2 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -38,17 +38,27 @@ end entity register_file; architecture behaviour of register_file is type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0); signal registers : regfile := (others => (others => '0')); - signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); signal dbg_ack : std_ulogic; + signal dbg_gpr_done : std_ulogic; signal addr_1_reg : gspr_index_t; signal addr_2_reg : gspr_index_t; signal addr_3_reg : gspr_index_t; + signal rd_2 : std_ulogic; + signal fwd_1 : std_ulogic; + signal fwd_2 : std_ulogic; + signal fwd_3 : std_ulogic; + signal data_1 : std_ulogic_vector(63 downto 0); + signal data_2 : std_ulogic_vector(63 downto 0); + signal data_3 : std_ulogic_vector(63 downto 0); + 
signal prev_write_data : std_ulogic_vector(63 downto 0); + begin - -- synchronous writes + -- synchronous reads and writes register_write_0: process(clk) variable a_addr, b_addr, c_addr : gspr_index_t; variable w_addr : gspr_index_t; + variable b_enable : std_ulogic; begin if rising_edge(clk) then if w_in.write_enable = '1' then @@ -66,57 +76,94 @@ begin a_addr := d1_in.reg_1_addr; b_addr := d1_in.reg_2_addr; c_addr := d1_in.reg_3_addr; - - if stall = '0' then + b_enable := d1_in.read_2_enable; + if stall = '1' then + a_addr := addr_1_reg; + b_addr := addr_2_reg; + c_addr := addr_3_reg; + b_enable := rd_2; + else addr_1_reg <= a_addr; addr_2_reg <= b_addr; addr_3_reg <= c_addr; + rd_2 <= b_enable; end if; + + fwd_1 <= '0'; + fwd_2 <= '0'; + fwd_3 <= '0'; + if w_in.write_enable = '1' then + if w_addr = a_addr then + fwd_1 <= '1'; + end if; + if w_addr = b_addr then + fwd_2 <= '1'; + end if; + if w_addr = c_addr then + fwd_3 <= '1'; + end if; + end if; + + -- Do debug reads to GPRs and FPRs using the B port when it is not in use + if dbg_gpr_req = '1' then + if b_enable = '0' then + b_addr := dbg_gpr_addr(5 downto 0); + dbg_gpr_done <= '1'; + end if; + else + dbg_gpr_done <= '0'; + end if; + + if not HAS_FPU then + -- Make it obvious that we only want 32 GSPRs for a no-FPU implementation + a_addr(5) := '0'; + b_addr(5) := '0'; + c_addr(5) := '0'; + end if; + data_1 <= registers(to_integer(unsigned(a_addr))); + data_2 <= registers(to_integer(unsigned(b_addr))); + data_3 <= registers(to_integer(unsigned(c_addr))); + + prev_write_data <= w_in.write_data; + assert (d_in.read1_enable = '0') or (d_in.read1_reg = addr_1_reg) severity failure; assert (d_in.read2_enable = '0') or (d_in.read2_reg = addr_2_reg) severity failure; assert (d_in.read3_enable = '0') or (d_in.read3_reg = addr_3_reg) severity failure; end if; end process register_write_0; - -- asynchronous reads + -- asynchronous forwarding of write data register_read_0: process(all) - variable a_addr, b_addr, c_addr 
: gspr_index_t; - variable w_addr : gspr_index_t; + variable out_data_1 : std_ulogic_vector(63 downto 0); + variable out_data_2 : std_ulogic_vector(63 downto 0); + variable out_data_3 : std_ulogic_vector(63 downto 0); begin - a_addr := d_in.read1_reg; - b_addr := d_in.read2_reg; - c_addr := d_in.read3_reg; - w_addr := w_in.write_reg; - if not HAS_FPU then - -- Make it obvious that we only want 32 GSPRs for a no-FPU implementation - a_addr(5) := '0'; - b_addr(5) := '0'; - c_addr(5) := '0'; - w_addr(5) := '0'; + out_data_1 := data_1; + out_data_2 := data_2; + out_data_3 := data_3; + if fwd_1 = '1' then + out_data_1 := prev_write_data; end if; + if fwd_2 = '1' then + out_data_2 := prev_write_data; + end if; + if fwd_3 = '1' then + out_data_3 := prev_write_data; + end if; + if d_in.read1_enable = '1' then - report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr)))); + report "Reading GPR " & to_hstring(addr_1_reg) & " " & to_hstring(out_data_1); end if; if d_in.read2_enable = '1' then - report "Reading GPR " & to_hstring(b_addr) & " " & to_hstring(registers(to_integer(unsigned(b_addr)))); + report "Reading GPR " & to_hstring(addr_2_reg) & " " & to_hstring(out_data_2); end if; if d_in.read3_enable = '1' then - report "Reading GPR " & to_hstring(c_addr) & " " & to_hstring(registers(to_integer(unsigned(c_addr)))); - end if; - d_out.read1_data <= registers(to_integer(unsigned(a_addr))); - -- B read port is multiplexed with reads from the debug circuitry - if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then - b_addr := dbg_gpr_addr; - if not HAS_FPU then - b_addr(5) := '0'; - end if; + report "Reading GPR " & to_hstring(addr_3_reg) & " " & to_hstring(out_data_3); end if; - rd_port_b <= registers(to_integer(unsigned(b_addr))); - d_out.read2_data <= rd_port_b; - d_out.read3_data <= registers(to_integer(unsigned(c_addr))); - -- Forwarding of written data is now done explicitly with a bypass path - -- from 
writeback to decode2. + d_out.read1_data <= out_data_1; + d_out.read2_data <= out_data_2; + d_out.read3_data <= out_data_3; end process register_read_0; -- Latch read data and ack if dbg read requested and B port not busy @@ -124,8 +171,8 @@ begin begin if rising_edge(clk) then if dbg_gpr_req = '1' then - if d_in.read2_enable = '0' and dbg_ack = '0' then - dbg_data <= rd_port_b; + if dbg_ack = '0' and dbg_gpr_done = '1' then + dbg_data <= data_2; dbg_ack <= '1'; end if; else