From 8a0a907e2faae2fb579260f64a40fffb00182638 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 2 May 2020 13:28:19 +1000 Subject: [PATCH 01/26] Implement the extswsli instruction This mainly required the addition of an entry to the opcode 31 decode table and a 32-bit sign-extender in the rotator. Signed-off-by: Paul Mackerras --- decode1.vhdl | 3 ++- execute1.vhdl | 5 ++++- rotator.vhdl | 12 +++++++++--- rotator_tb.vhdl | 28 ++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 785b669..466a8ba 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -185,7 +185,8 @@ architecture behaviour of decode1 is 2#1110111010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsb 2#1110011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsh 2#1111011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsw - -- 2#110111101-# extswsli + 2#1101111010# => (ALU, OP_EXTSWSLI, NONE, CONST_SH, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extswsli + 2#1101111011# => (ALU, OP_EXTSWSLI, NONE, CONST_SH, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extswsli 2#1111010110# => (ALU, OP_ICBI, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- icbi 2#0000010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- icbt 2#0000001111# => (ALU, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- isel diff --git a/execute1.vhdl b/execute1.vhdl index 8286d30..6940049 100644 --- a/execute1.vhdl +++ 
b/execute1.vhdl @@ -63,6 +63,7 @@ architecture behaviour of execute1 is signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); signal ctrl_tmp: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; + signal rot_sign_ext: std_ulogic; signal rotator_result: std_ulogic_vector(63 downto 0); signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); @@ -174,6 +175,7 @@ begin arith => e_in.is_signed, clear_left => rot_clear_left, clear_right => rot_clear_right, + sign_ext_rs => rot_sign_ext, result => rotator_result, carry_out => rotator_carry ); @@ -429,6 +431,7 @@ begin right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; + rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; ctrl_tmp.irq_state <= WRITE_SRR0; exception := '0'; @@ -828,7 +831,7 @@ begin when OP_PRTY => result := parity_result; result_en := '1'; - when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR => + when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI => result := rotator_result; if e_in.output_carry = '1' then set_carry(v.e, rotator_carry, rotator_carry); diff --git a/rotator.vhdl b/rotator.vhdl index d8a8ee9..fef9788 100644 --- a/rotator.vhdl +++ b/rotator.vhdl @@ -15,6 +15,7 @@ entity rotator is arith: in std_ulogic; clear_left: in std_ulogic; clear_right: in std_ulogic; + sign_ext_rs: in std_ulogic; result: out std_ulogic_vector(63 downto 0); carry_out: out std_ulogic ); @@ -57,13 +58,18 @@ architecture behaviour of rotator is begin rotator_0: process(all) + variable hi32: std_ulogic_vector(31 downto 0); begin -- First replicate bottom 32 bits to both halves if 32-bit if is_32bit = '1' then - repl32 <= rs(31 downto 0) & rs(31 downto 0); - else - repl32 <= rs; + hi32 := 
rs(31 downto 0); + elsif sign_ext_rs = '1' then + -- sign extend bottom 32 bits + hi32 := (others => rs(31)); + else + hi32 := rs(63 downto 32); end if; + repl32 <= hi32 & rs(31 downto 0); -- Negate shift count for right shifts if right_shift = '1' then diff --git a/rotator_tb.vhdl b/rotator_tb.vhdl index 3cb46b0..62a09ce 100644 --- a/rotator_tb.vhdl +++ b/rotator_tb.vhdl @@ -19,6 +19,7 @@ architecture behave of rotator_tb is signal is_32bit, right_shift, arith, clear_left, clear_right: std_ulogic := '0'; signal result: std_ulogic_vector(63 downto 0); signal carry_out: std_ulogic; + signal extsw: std_ulogic; begin rotator_0: entity work.rotator @@ -32,6 +33,7 @@ begin arith => arith, clear_left => clear_left, clear_right => clear_right, + sign_ext_rs => extsw, result => result, carry_out => carry_out ); @@ -48,6 +50,7 @@ begin arith <= '0'; clear_left <= '1'; clear_right <= '1'; + extsw <= '0'; rlwnm_loop : for i in 0 to 1000 loop rs <= pseudorand(64); shift <= pseudorand(7); @@ -263,6 +266,31 @@ begin report "bad srad expected " & to_hstring(behave_ca_ra) & " got " & to_hstring(carry_out & result); end loop; + -- extswsli + report "test extswsli"; + ra <= (others => '0'); + is_32bit <= '0'; + right_shift <= '0'; + arith <= '0'; + clear_left <= '0'; + clear_right <= '0'; + extsw <= '1'; + extswsli_loop : for i in 0 to 1000 loop + rs <= pseudorand(64); + shift <= '0' & pseudorand(6); + wait for clk_period; + behave_ra := rs; + behave_ra(63 downto 32) := (others => rs(31)); + behave_ra := std_ulogic_vector(shift_left(unsigned(behave_ra), + to_integer(unsigned(shift)))); + --report "rs = " & to_hstring(rs); + --report "ra = " & to_hstring(ra); + --report "shift = " & to_hstring(shift); + --report "result = " & to_hstring(carry_out & result); + assert behave_ra = result + report "bad extswsli expected " & to_hstring(behave_ra) & " got " & to_hstring(result); + end loop; + assert false report "end of test" severity failure; wait; end process; From 
517a91ce5ef492574d2fc8db2d7e18a66ef4cbd9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 2 May 2020 13:45:39 +1000 Subject: [PATCH 02/26] decode1: Implement eieio as a nop Signed-off-by: Paul Mackerras --- decode1.vhdl | 1 + 1 file changed, 1 insertion(+) diff --git a/decode1.vhdl b/decode1.vhdl index 466a8ba..0e88ef3 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -181,6 +181,7 @@ architecture behaviour of decode1 is 2#1111101001# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- divdo 2#0111101011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divw 2#1111101011# => (ALU, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- divwo + 2#1101010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- eieio 2#0100011100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- eqv 2#1110111010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsb 2#1110011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsh From 5d282a950c5be5a92b2d705f529f13fc55cd0c59 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 2 May 2020 20:08:10 +1000 Subject: [PATCH 03/26] Improve architectural compliance of mfspr and mtspr Mfspr from an unimplemented SPR should be a no-op in privileged state, so in this case we need to write back whatever was previously in the destination register. For problem state, both mtspr and mfspr to unimplemented SPRs should cause a program interrupt. There are special cases in the architecture for SPRs 0, 4 5 and 6 which we still don't implement. 
Signed-off-by: Paul Mackerras --- decode1.vhdl | 2 +- execute1.vhdl | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 0e88ef3..a819b79 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -249,7 +249,7 @@ architecture behaviour of decode1 is -- 2#1001000000# mcrxrx 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf 2#0001010011# => (ALU, OP_MFMSR, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mfmsr - 2#0101010011# => (ALU, OP_MFSPR, SPR, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr + 2#0101010011# => (ALU, OP_MFSPR, SPR, NONE, RS, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr 2#0100001001# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- modud 2#0100001011# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- moduw 2#1100001001# => (ALU, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- modsd diff --git a/execute1.vhdl b/execute1.vhdl index 6940049..c479a45 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -735,6 +735,7 @@ begin when OP_MFSPR => report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(a_in); + result_en := '1'; if is_fast_spr(e_in.read_reg1) then result := a_in; if decode_spr_num(e_in.insn) = SPR_XER then @@ -753,11 +754,15 @@ begin result := ctrl.tb; when SPR_DEC => result := ctrl.dec; - when others => - result := (others => '0'); + when others => + -- mfspr from unimplemented SPRs should be a nop in + -- supervisor mode and a program interrupt for user mode + result := c_in; + if 
ctrl.msr(MSR_PR) = '1' then + illegal := '1'; + end if; end case; end if; - result_en := '1'; when OP_MFCR => if e_in.insn(20) = '0' then -- mfcr @@ -823,6 +828,11 @@ begin when SPR_DEC => ctrl_tmp.dec <= c_in; when others => + -- mtspr to unimplemented SPRs should be a nop in + -- supervisor mode and a program interrupt for user mode + if ctrl.msr(MSR_PR) = '1' then + illegal := '1'; + end if; end case; end if; when OP_POPCNT => From dd2e71930c6854f2d251526ff29041c5d90e8e24 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 2 May 2020 13:26:30 +1000 Subject: [PATCH 04/26] debug: Provide a way to examine GPRs, fast SPRs and MSR This provides commands on the debug interface to read the value of the MSR or any of the 64 GSPR register file entries. The GSPR values are read using the B port of the register file in a cycle when decode2 is not using it. Signed-off-by: Paul Mackerras --- core.vhdl | 17 +++++++++++++++++ core_debug.vhdl | 31 +++++++++++++++++++++++++++++-- decode2.vhdl | 6 +++--- execute1.vhdl | 4 ++++ register_file.vhdl | 36 +++++++++++++++++++++++++++++++++++- 5 files changed, 88 insertions(+), 6 deletions(-) diff --git a/core.vhdl b/core.vhdl index 0e60905..acb37cc 100644 --- a/core.vhdl +++ b/core.vhdl @@ -95,6 +95,13 @@ architecture behave of core is signal dbg_core_rst: std_ulogic; signal dbg_icache_rst: std_ulogic; + signal dbg_gpr_req : std_ulogic; + signal dbg_gpr_ack : std_ulogic; + signal dbg_gpr_addr : gspr_index_t; + signal dbg_gpr_data : std_ulogic_vector(63 downto 0); + + signal msr : std_ulogic_vector(63 downto 0); + -- Debug status signal dbg_core_is_stopped: std_ulogic; @@ -213,6 +220,10 @@ begin d_in => decode2_to_register_file, d_out => register_file_to_decode2, w_in => writeback_to_register_file, + dbg_gpr_req => dbg_gpr_req, + dbg_gpr_ack => dbg_gpr_ack, + dbg_gpr_addr => dbg_gpr_addr, + dbg_gpr_data => dbg_gpr_data, sim_dump => terminate, sim_dump_done => sim_cr_dump ); @@ -244,6 +255,7 @@ begin f_out => execute1_to_fetch1, 
e_out => execute1_to_writeback, icache_inval => ex1_icache_inval, + dbg_msr_out => msr, terminate_out => terminate ); @@ -301,6 +313,11 @@ begin terminate => terminate, core_stopped => dbg_core_is_stopped, nia => fetch1_to_icache.nia, + msr => msr, + dbg_gpr_req => dbg_gpr_req, + dbg_gpr_ack => dbg_gpr_ack, + dbg_gpr_addr => dbg_gpr_addr, + dbg_gpr_data => dbg_gpr_data, terminated_out => terminated_out ); diff --git a/core_debug.vhdl b/core_debug.vhdl index ae4414e..c97213b 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -26,6 +26,13 @@ entity core_debug is terminate : in std_ulogic; core_stopped : in std_ulogic; nia : in std_ulogic_vector(63 downto 0); + msr : in std_ulogic_vector(63 downto 0); + + -- GSPR register read port + dbg_gpr_req : out std_ulogic; + dbg_gpr_ack : in std_ulogic; + dbg_gpr_addr : out gspr_index_t; + dbg_gpr_data : in std_ulogic_vector(63 downto 0); -- Misc terminated_out : out std_ulogic @@ -61,6 +68,15 @@ architecture behave of core_debug is -- NIA register (read only for now) constant DBG_CORE_NIA : std_ulogic_vector(3 downto 0) := "0010"; + -- MSR (read only) + constant DBG_CORE_MSR : std_ulogic_vector(3 downto 0) := "0011"; + + -- GSPR register index + constant DBG_CORE_GSPR_INDEX : std_ulogic_vector(3 downto 0) := "0100"; + + -- GSPR register data + constant DBG_CORE_GSPR_DATA : std_ulogic_vector(3 downto 0) := "0101"; + -- Some internal wires signal stat_reg : std_ulogic_vector(63 downto 0); @@ -70,10 +86,15 @@ architecture behave of core_debug is signal do_reset : std_ulogic; signal do_icreset : std_ulogic; signal terminated : std_ulogic; + signal do_gspr_rd : std_ulogic; + signal gspr_index : gspr_index_t; begin - -- Single cycle register accesses on DMI - dmi_ack <= dmi_req; + -- Single cycle register accesses on DMI except for GSPR data + dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA + else dbg_gpr_ack; + dbg_gpr_req <= dmi_req when dmi_addr = DBG_CORE_GSPR_DATA + else '0'; -- Status register read composition 
stat_reg <= (2 => terminated, @@ -85,6 +106,8 @@ begin with dmi_addr select dmi_dout <= stat_reg when DBG_CORE_STAT, nia when DBG_CORE_NIA, + msr when DBG_CORE_MSR, + dbg_gpr_data when DBG_CORE_GSPR_DATA, (others => '0') when others; -- DMI writes @@ -126,6 +149,8 @@ begin stopping <= '0'; terminated <= '0'; end if; + elsif dmi_addr = DBG_CORE_GSPR_INDEX then + gspr_index <= dmi_din(gspr_index_t'left downto 0); end if; else report("DMI read from " & to_string(dmi_addr)); @@ -143,6 +168,8 @@ begin end if; end process; + dbg_gpr_addr <= gspr_index; + -- Core control signals generated by the debug module core_stop <= stopping and not do_step; core_rst <= do_reset; diff --git a/decode2.vhdl b/decode2.vhdl index edcc50c..b239392 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -285,9 +285,9 @@ begin decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1); - r_out.read1_enable <= decoded_reg_a.reg_valid; - r_out.read2_enable <= decoded_reg_b.reg_valid; - r_out.read3_enable <= decoded_reg_c.reg_valid; + r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid; + r_out.read2_enable <= decoded_reg_b.reg_valid and d_in.valid; + r_out.read3_enable <= decoded_reg_c.reg_valid and d_in.valid; case d_in.decode.length is when is1B => diff --git a/execute1.vhdl b/execute1.vhdl index c479a45..82776e2 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -32,6 +32,8 @@ entity execute1 is e_out : out Execute1ToWritebackType; + dbg_msr_out : out std_ulogic_vector(63 downto 0); + icache_inval : out std_ulogic; terminate_out : out std_ulogic ); @@ -217,6 +219,8 @@ begin d_out => divider_to_x ); + dbg_msr_out <= ctrl.msr; + a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1; b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else 
e_in.read_data3; diff --git a/register_file.vhdl b/register_file.vhdl index 6a4c989..2cffeea 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -17,6 +17,11 @@ entity register_file is w_in : in WritebackToRegisterFileType; + dbg_gpr_req : in std_ulogic; + dbg_gpr_ack : out std_ulogic; + dbg_gpr_addr : in gspr_index_t; + dbg_gpr_data : out std_ulogic_vector(63 downto 0); + -- debug sim_dump : in std_ulogic; sim_dump_done : out std_ulogic @@ -26,6 +31,9 @@ end entity register_file; architecture behaviour of register_file is type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0); signal registers : regfile := (others => (others => '0')); + signal rd_port_b : std_ulogic_vector(63 downto 0); + signal dbg_data : std_ulogic_vector(63 downto 0); + signal dbg_ack : std_ulogic; begin -- synchronous writes register_write_0: process(clk) @@ -45,6 +53,7 @@ begin -- asynchronous reads register_read_0: process(all) + variable b_addr : gspr_index_t; begin if d_in.read1_enable = '1' then report "Reading GPR " & to_hstring(d_in.read1_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read1_reg)))); @@ -56,7 +65,14 @@ begin report "Reading GPR " & to_hstring(d_in.read3_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read3_reg)))); end if; d_out.read1_data <= registers(to_integer(unsigned(d_in.read1_reg))); - d_out.read2_data <= registers(to_integer(unsigned(d_in.read2_reg))); + -- B read port is multiplexed with reads from the debug circuitry + if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then + b_addr := dbg_gpr_addr; + else + b_addr := d_in.read2_reg; + end if; + rd_port_b <= registers(to_integer(unsigned(b_addr))); + d_out.read2_data <= rd_port_b; d_out.read3_data <= registers(to_integer(unsigned(gpr_to_gspr(d_in.read3_reg)))); -- Forward any written data @@ -73,6 +89,24 @@ begin end if; end process register_read_0; + -- Latch read data and ack if dbg read requested and B port not busy + dbg_register_read: process(clk) + 
begin + if rising_edge(clk) then + if dbg_gpr_req = '1' then + if d_in.read2_enable = '0' and dbg_ack = '0' then + dbg_data <= rd_port_b; + dbg_ack <= '1'; + end if; + else + dbg_ack <= '0'; + end if; + end if; + end process; + + dbg_gpr_ack <= dbg_ack; + dbg_gpr_data <= dbg_data; + -- Dump registers if core terminates sim_dump_test: if SIM generate dump_registers: process(all) From 3340d8aa9fac6acaef70d3f48ed8e47d8afb7374 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 2 May 2020 13:31:07 +1000 Subject: [PATCH 05/26] mw_debug: Add support for reading GSPRs and writing memory This adds a "gpr" command for reading 1 or more GPRs/fast SPRs, and a "mw" command for writing an 8-byte value to memory. It also adds an "icreset" command for resetting the instruction cache and fixes the "creset" command to actually reset the core instead of starting it. The MSR is now printed along with the NIA in the status information. Signed-off-by: Paul Mackerras --- scripts/mw_debug/mw_debug.c | 81 +++++++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index f1a7cab..8359242 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -33,6 +33,10 @@ #define DBG_CORE_STAT_TERM (1 << 2) #define DBG_CORE_NIA 0x12 +#define DBG_CORE_MSR 0x13 + +#define DBG_CORE_GSPR_INDEX 0x14 +#define DBG_CORE_GSPR_DATA 0x15 static bool debug; @@ -356,11 +360,12 @@ static int dmi_write(uint8_t addr, uint64_t data) static void core_status(void) { - uint64_t stat, nia; + uint64_t stat, nia, msr; const char *statstr, *statstr2; check(dmi_read(DBG_CORE_STAT, &stat), "reading core status"); check(dmi_read(DBG_CORE_NIA, &nia), "reading core NIA"); + check(dmi_read(DBG_CORE_MSR, &msr), "reading core MSR"); if (debug) printf("Core status = 0x%llx\n", (unsigned long long)stat); @@ -378,6 +383,7 @@ static void core_status(void) statstr = "odd state (TERM but no STOP)"; printf("Core: 
%s%s\n", statstr, statstr2); printf(" NIA: %016llx\n", (unsigned long long)nia); + printf(" MSR: %016llx\n", (unsigned long long)msr); } static void core_stop(void) @@ -392,7 +398,7 @@ static void core_start(void) static void core_reset(void) { - check(dmi_write(DBG_CORE_CTRL, DBG_CORE_CTRL_START), "resetting core"); + check(dmi_write(DBG_CORE_CTRL, DBG_CORE_CTRL_RESET), "resetting core"); } static void core_step(void) @@ -413,19 +419,47 @@ static void icache_reset(void) check(dmi_write(DBG_CORE_CTRL, DBG_CORE_CTRL_ICRESET), "resetting icache"); } +static const char *fast_spr_names[] = /* names for GSPR indices 32 and up, in index order */ +{ + "lr", "ctr", "srr0", "srr1", "hsrr0", "hsrr1", + "sprg0", "sprg1", "sprg2", "sprg3", + "hsprg0", "hsprg1", "xer" +}; + +static void gpr_read(uint64_t reg, uint64_t count) /* print count GSPR entries starting at index reg */ +{ + uint64_t data; + + reg &= 0x3f; /* GSPR index space is 0..63 */ + if (count > 64 - reg) /* clamp without letting reg + count wrap */ + count = 64 - reg; + for (; count != 0; --count, ++reg) { + check(dmi_write(DBG_CORE_GSPR_INDEX, reg), "setting GPR index"); + data = 0xdeadbeef; + check(dmi_read(DBG_CORE_GSPR_DATA, &data), "reading GPR data"); + if (reg <= 31) + printf("r%d", (int)reg); /* reg <= 63 here, so the cast is lossless */ + else if ((reg - 32) < sizeof(fast_spr_names) / sizeof(fast_spr_names[0])) + printf("%s", fast_spr_names[reg - 32]); + else + printf("gspr%d", (int)reg); + printf(":\t%016llx\n", (unsigned long long)data); + } +} + static void mem_read(uint64_t addr, uint64_t count) { uint64_t data; int i, rc; - rc = dmi_write(2, 0x7ff); + rc = dmi_write(DBG_WB_CTRL, 0x7ff); if (rc < 0) return; - rc = dmi_write(0, addr); + rc = dmi_write(DBG_WB_ADDR, addr); if (rc < 0) return; for (i = 0; i < count; i++) { - rc = dmi_read(1, &data); + rc = dmi_read(DBG_WB_DATA, &data); if (rc < 0) return; printf("%016llx: %016llx\n", @@ -435,6 +469,13 @@ static void mem_read(uint64_t addr, uint64_t count) } } +static void mem_write(uint64_t addr, uint64_t data) /* write one 64-bit word through the wishbone debug registers */ +{ + check(dmi_write(DBG_WB_CTRL, 0x7ff), "writing WB_CTRL"); + check(dmi_write(DBG_WB_ADDR, addr), "writing WB_ADDR"); + check(dmi_write(DBG_WB_DATA, data), "writing WB_DATA"); +} + static void load(const char *filename, 
uint64_t addr) { uint64_t data; @@ -445,13 +486,8 @@ static void load(const char *filename, uint64_t addr) fprintf(stderr, "Failed to open '%s': %s\n", filename, strerror(errno)); exit(1); } - // XX dumb, do better - rc = dmi_write(2, 0x7ff); - if (rc < 0) - return; - rc = dmi_write(0, addr); - if (rc < 0) - return; + check(dmi_write(DBG_WB_CTRL, 0x7ff), "writing WB_CTRL"); + check(dmi_write(DBG_WB_ADDR, addr), "writing WB_ADDR"); count = 0; for (;;) { data = 0; @@ -459,7 +495,7 @@ static void load(const char *filename, uint64_t addr) if (rc <= 0) break; // if (rc < 8) XXX fixup endian ? - dmi_write(1, data); + check(dmi_write(DBG_WB_DATA, data), "writing WB_DATA"); count += 8; if (!(count % 1024)) printf("%x...\n", count); @@ -544,6 +580,8 @@ int main(int argc, char *argv[]) dmi_write(addr, data); } else if (strcmp(argv[i], "creset") == 0) { core_reset(); + } else if (strcmp(argv[i], "icreset") == 0) { + icache_reset(); } else if (strcmp(argv[i], "stop") == 0) { core_stop(); } else if (strcmp(argv[i], "start") == 0) { @@ -563,6 +601,14 @@ int main(int argc, char *argv[]) if (((i+1) < argc) && isdigit(argv[i+1][0])) count = strtoul(argv[++i], NULL, 16); mem_read(addr, count); + } else if (strcmp(argv[i], "mw") == 0) { + uint64_t addr, data; + + if ((i+2) >= argc) + usage(argv[0]); + addr = strtoull(argv[++i], NULL, 16); /* strtoull keeps all 64 bits where long is 32-bit */ + data = strtoull(argv[++i], NULL, 16); + mem_write(addr, data); } else if (strcmp(argv[i], "load") == 0) { const char *filename; uint64_t addr = 0; @@ -573,6 +619,15 @@ int main(int argc, char *argv[]) if (((i+1) < argc) && isdigit(argv[i+1][0])) addr = strtoul(argv[++i], NULL, 16); load(filename, addr); + } else if (strcmp(argv[i], "gpr") == 0) { + uint64_t reg, count = 1; + + if ((i+1) >= argc) + usage(argv[0]); + reg = strtoul(argv[++i], NULL, 10); + if (((i+1) < argc) && isdigit(argv[i+1][0])) + count = strtoul(argv[++i], NULL, 10); + gpr_read(reg, count); } else { fprintf(stderr, "Unknown command %s\n", argv[i]); exit(1); From 
635e316f9b77e83db47889b4c4985b5a12141498 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Apr 2020 16:53:39 +1000 Subject: [PATCH 06/26] Pass mtspr/mfspr to MMU-related SPRs down to loadstore1 This arranges for some mfspr and mtspr to get sent to loadstore1 instead of being handled in execute1. In particular, DAR and DSISR are handled this way. They are therefore "slow" SPRs. While we're at it, fix the spelling of HEIR and remove mention of DAR and DSISR from the comments in execute1. Signed-off-by: Paul Mackerras --- common.vhdl | 8 +++++-- decode1.vhdl | 10 ++++++++- execute1.vhdl | 6 ++--- loadstore1.vhdl | 59 ++++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 67 insertions(+), 16 deletions(-) diff --git a/common.vhdl b/common.vhdl index ed97e0c..d3d30e7 100644 --- a/common.vhdl +++ b/common.vhdl @@ -24,6 +24,8 @@ package common is constant SPR_XER : spr_num_t := 1; constant SPR_LR : spr_num_t := 8; constant SPR_CTR : spr_num_t := 9; + constant SPR_DSISR : spr_num_t := 18; + constant SPR_DAR : spr_num_t := 19; constant SPR_TB : spr_num_t := 268; constant SPR_DEC : spr_num_t := 22; constant SPR_SRR0 : spr_num_t := 26; @@ -214,7 +216,7 @@ package common is type Execute1ToLoadstore1Type is record valid : std_ulogic; - op : insn_type_t; -- what ld/st op to do + op : insn_type_t; -- what ld/st or m[tf]spr to do addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read @@ -228,10 +230,12 @@ package common is xerc : xer_common_t; reserve : std_ulogic; -- set for larx/stcx. rc : std_ulogic; -- set for stcx. 
+ spr_num : spr_num_t; -- SPR number for mfspr/mtspr end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, - reserve => '0', rc => '0', others => (others => '0')); + reserve => '0', rc => '0', + spr_num => 0, others => (others => '0')); type Loadstore1ToDcacheType is record valid : std_ulogic; diff --git a/decode1.vhdl b/decode1.vhdl index a819b79..70099d4 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -363,6 +363,7 @@ begin variable v : Decode1ToDecode2Type; variable majorop : major_opcode_t; variable op_19_bits: std_ulogic_vector(2 downto 0); + variable sprn : spr_num_t; begin v := r; @@ -429,10 +430,17 @@ begin end if; end if; elsif v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then - v.ispr1 := fast_spr_num(decode_spr_num(f_in.insn)); + sprn := decode_spr_num(f_in.insn); + v.ispr1 := fast_spr_num(sprn); -- Make slow SPRs single issue if is_fast_spr(v.ispr1) = '0' then v.decode.sgl_pipe := '1'; + -- send MMU-related SPRs to loadstore1 + case sprn is + when SPR_DAR | SPR_DSISR => + v.decode.unit := LDST; + when others => + end case; end if; elsif v.decode.insn_type = OP_RFID then report "PPC RFID"; diff --git a/execute1.vhdl b/execute1.vhdl index 82776e2..490723e 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -463,7 +463,7 @@ begin elsif irq_valid = '1' and e_in.valid = '1' then -- we need two cycles to write srr0 and 1 - -- will need more when we have to write DSISR, DAR and HIER + -- will need more when we have to write HEIR -- Don't deliver the interrupt until we have a valid instruction -- coming in, so we have a valid NIA to put in SRR0. 
exception := '1'; @@ -494,13 +494,12 @@ begin when OP_ILLEGAL => -- we need two cycles to write srr0 and 1 - -- will need more when we have to write DSISR, DAR and HIER + -- will need more when we have to write HEIR illegal := '1'; when OP_SC => -- check bit 1 of the instruction is 1 so we know this is sc; -- 0 would mean scv, so generate an illegal instruction interrupt -- we need two cycles to write srr0 and 1 - -- will need more when we have to write DSISR, DAR and HIER if e_in.insn(1) = '1' then exception := '1'; exception_nextpc := '1'; @@ -983,6 +982,7 @@ begin lv.xerc := v.e.xerc; lv.reserve := e_in.reserve; lv.rc := e_in.rc; + lv.spr_num := decode_spr_num(e_in.insn); -- decode l*cix and st*cix instructions here if e_in.insn(31 downto 26) = "011111" and e_in.insn(10 downto 9) = "11" and e_in.insn(5 downto 1) = "10101" then diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 90650db..7ddbbc0 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -59,6 +59,8 @@ architecture behave of loadstore1 is nc : std_ulogic; -- non-cacheable access state : state_t; second_bytes : std_ulogic_vector(7 downto 0); + dar : std_ulogic_vector(63 downto 0); + dsisr : std_ulogic_vector(31 downto 0); end record; type byte_sel_t is array(0 to 7) of std_ulogic; @@ -135,6 +137,9 @@ begin variable use_second : byte_sel_t; variable trim_ctl : trim_ctl_t; variable negative : std_ulogic; + variable mfspr : std_ulogic; + variable sprn : std_ulogic_vector(9 downto 0); + variable sprval : std_ulogic_vector(63 downto 0); begin v := r; req := '0'; @@ -142,6 +147,8 @@ begin done := '0'; byte_sel := (others => '0'); addr := lsu_sum; + mfspr := '0'; + sprval := (others => '0'); -- avoid inferred latches write_enable := '0'; do_update := '0'; @@ -200,11 +207,38 @@ begin if l_in.valid = '1' then v.load := '0'; v.dcbz := '0'; - if l_in.op = OP_LOAD then + case l_in.op is + when OP_STORE => + req := '1'; + when OP_LOAD => + req := '1'; v.load := '1'; - elsif l_in.op = OP_DCBZ then + when OP_DCBZ => 
+ req := '1'; v.dcbz := '1'; - end if; + when OP_MFSPR => + done := '1'; + mfspr := '1'; + -- partial decode on SPR number should be adequate given + -- the restricted set that get sent down this path + sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); + if sprn(0) = '0' then + sprval := x"00000000" & r.dsisr; + else + sprval := r.dar; + end if; + when OP_MTSPR => + done := '1'; + sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); + if sprn(0) = '0' then + v.dsisr := l_in.data(31 downto 0); + else + v.dar := l_in.data; + end if; + when others => + assert false report "unknown op sent to loadstore1"; + end case; + v.addr := lsu_sum; v.write_reg := l_in.write_reg; v.length := l_in.length; @@ -246,12 +280,13 @@ begin v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8); end loop; - req := '1'; - stall := '1'; - if long_sel(15 downto 8) = "00000000" then - v.state := LAST_ACK_WAIT; - else - v.state := SECOND_REQ; + if req = '1' then + stall := '1'; + if long_sel(15 downto 8) = "00000000" then + v.state := LAST_ACK_WAIT; + else + v.state := SECOND_REQ; + end if; end if; end if; @@ -308,7 +343,11 @@ begin -- Multiplex either cache data to the destination GPR or -- the address for the rA update. l_out.valid <= done; - if do_update = '1' then + if mfspr = '1' then + l_out.write_enable <= '1'; + l_out.write_reg <= l_in.write_reg; + l_out.write_data <= sprval; + elsif do_update = '1' then l_out.write_enable <= '1'; l_out.write_reg <= r.update_reg; l_out.write_data <= r.addr; From 750b3a8e284ef721c101bbf82f88743ae2428459 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 6 Apr 2020 17:54:45 +1000 Subject: [PATCH 07/26] dcache: Implement data TLB This adds a TLB to dcache, providing the ability to translate addresses for loads and stores. No protection mechanism has been implemented yet. The MSR_DR bit controls whether addresses are translated through the TLB. The TLB is a fixed-pagesize, set-associative cache. 
Currently the page size is 4kB and the TLB is 2-way set associative with 64 entries per set. This implements the tlbie instruction. RB bits 10 and 11 control whether the whole TLB is invalidated (if either bit is 1) or just a single entry corresponding to the effective page number in bits 12-63 of RB. As an extension until we get a hardware page table walk, a tlbie instruction with RB bits 9-11 set to 001 will load an entry into the TLB. The TLB entry value is in RS in the format of a radix PTE. Currently there is no proper handling of TLB misses. The load or store will not be performed but no interrupt is generated. In order to make timing at 100MHz on the Arty A7-100, we compare the real address from each way of the TLB with the tag from each way of the cache in parallel (requiring # TLB ways * # cache ways comparators). Then the result is selected based on which way hit in the TLB. That avoids a timing path going through the TLB EA comparators, the multiplexer that selects the RA, and the cache tag comparators. The hack where addresses of the form 0xc------- are marked as cache-inhibited is kept for now but restricted to real-mode accesses. Signed-off-by: Paul Mackerras --- common.vhdl | 10 +- dcache.vhdl | 398 +++++++++++++++++++++++++++++++++++++++------- dcache_tb.vhdl | 1 + decode1.vhdl | 1 + decode_types.vhdl | 2 +- execute1.vhdl | 2 + loadstore1.vhdl | 22 ++- 7 files changed, 371 insertions(+), 65 deletions(-) diff --git a/common.vhdl b/common.vhdl index d3d30e7..6741044 100644 --- a/common.vhdl +++ b/common.vhdl @@ -216,7 +216,7 @@ package common is type Execute1ToLoadstore1Type is record valid : std_ulogic; - op : insn_type_t; -- what ld/st or m[tf]spr to do + op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read @@ -231,18 +231,21 @@ package common is reserve : std_ulogic; -- set for larx/stcx. 
rc : std_ulogic; -- set for stcx. spr_num : spr_num_t; -- SPR number for mfspr/mtspr + virt_mode : std_ulogic; -- do translation through TLB end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, - reserve => '0', rc => '0', + reserve => '0', rc => '0', virt_mode => '0', spr_num => 0, others => (others => '0')); type Loadstore1ToDcacheType is record valid : std_ulogic; - load : std_ulogic; + load : std_ulogic; -- is this a load + tlbie : std_ulogic; -- is this a tlbie dcbz : std_ulogic; nc : std_ulogic; reserve : std_ulogic; + virt_mode : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); @@ -253,6 +256,7 @@ package common is data : std_ulogic_vector(63 downto 0); store_done : std_ulogic; error : std_ulogic; + tlb_miss : std_ulogic; end record; type Loadstore1ToWritebackType is record diff --git a/dcache.vhdl b/dcache.vhdl index 7d61a85..3464c0d 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -25,7 +25,13 @@ entity dcache is -- Number of lines in a set NUM_LINES : positive := 32; -- Number of ways - NUM_WAYS : positive := 4 + NUM_WAYS : positive := 4; + -- L1 DTLB entries per set + TLB_SET_SIZE : positive := 64; + -- L1 DTLB number of sets + TLB_NUM_WAYS : positive := 2; + -- L1 DTLB log_2(page_size) + TLB_LG_PGSZ : positive := 12 ); port ( clk : in std_ulogic; @@ -56,6 +62,8 @@ architecture rtl of dcache is -- Bit fields counts in the address + -- REAL_ADDR_BITS is the number of real address bits that we store + constant REAL_ADDR_BITS : positive := 56; -- ROW_BITS is the number of bits to select a row constant ROW_BITS : natural := log2(BRAM_ROWS); -- ROW_LINEBITS is the number of bits to select a row within a line @@ -66,8 +74,10 @@ architecture rtl of dcache is constant ROW_OFF_BITS : natural := log2(ROW_SIZE); -- INDEX_BITS is the number if bits 
to select a cache line constant INDEX_BITS : natural := log2(NUM_LINES); + -- SET_SIZE_BITS is the log base 2 of the set size + constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; -- TAG_BITS is the number of bits of the tag part of the address - constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; + constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; -- WAY_BITS is the number of bits to select a way constant WAY_BITS : natural := log2(NUM_WAYS); @@ -80,7 +90,7 @@ architecture rtl of dcache is -- .. | |- --| ROW_OFF_BITS (3) -- .. |----- ---| | ROW_BITS (8) -- .. |-----| | INDEX_BITS (5) - -- .. --------| | TAG_BITS (53) + -- .. --------| | TAG_BITS (45) subtype row_t is integer range 0 to BRAM_ROWS-1; subtype index_t is integer range 0 to NUM_LINES-1; @@ -110,6 +120,32 @@ architecture rtl of dcache is attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; + -- L1 TLB. + constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE); + constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS); + constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_SET_BITS); + constant TLB_TAG_WAY_BITS : natural := TLB_NUM_WAYS * TLB_EA_TAG_BITS; + constant TLB_PTE_BITS : natural := 64; + constant TLB_PTE_WAY_BITS : natural := TLB_NUM_WAYS * TLB_PTE_BITS; + + subtype tlb_way_t is integer range 0 to TLB_NUM_WAYS - 1; + subtype tlb_index_t is integer range 0 to TLB_SET_SIZE - 1; + subtype tlb_way_valids_t is std_ulogic_vector(TLB_NUM_WAYS-1 downto 0); + type tlb_valids_t is array(tlb_index_t) of tlb_way_valids_t; + subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); + subtype tlb_way_tags_t is std_ulogic_vector(TLB_TAG_WAY_BITS-1 downto 0); + type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t; + subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); + subtype tlb_way_ptes_t is std_ulogic_vector(TLB_PTE_WAY_BITS-1 downto 0); + type tlb_ptes_t is array(tlb_index_t) of tlb_way_ptes_t; + 
type hit_way_set_t is array(tlb_way_t) of way_t; + + signal dtlb_valids : tlb_valids_t; + signal dtlb_tags : tlb_tags_t; + signal dtlb_ptes : tlb_ptes_t; + attribute ram_style of dtlb_tags : signal is "distributed"; + attribute ram_style of dtlb_ptes : signal is "distributed"; + signal r0 : Loadstore1ToDcacheType; -- Type of operation on a "valid" input @@ -168,6 +204,13 @@ architecture rtl of dcache is store_way : way_t; store_row : row_t; store_index : index_t; + + -- Signals to complete with error + error_done : std_ulogic; + tlb_miss : std_ulogic; + + -- completion signal for tlbie + tlbie_done : std_ulogic; end record; signal r1 : reg_stage_1_t; @@ -208,6 +251,21 @@ architecture rtl of dcache is -- Wishbone read/write/cache write formatting signals signal bus_sel : std_ulogic_vector(7 downto 0); + -- TLB signals + signal tlb_tag_way : tlb_way_tags_t; + signal tlb_pte_way : tlb_way_ptes_t; + signal tlb_valid_way : tlb_way_valids_t; + signal tlb_req_index : tlb_index_t; + signal tlb_hit : std_ulogic; + signal tlb_hit_way : tlb_way_t; + signal pte : tlb_pte_t; + signal ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + signal valid_ra : std_ulogic; + + -- TLB PLRU output interface + type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_victim : tlb_plru_out_t; + -- -- Helper functions to decode incoming requests -- @@ -215,13 +273,13 @@ architecture rtl of dcache is -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))); end; -- Return the cache row index (data memory) for an address function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 
downto ROW_OFF_BITS))); end; -- Returns whether this is the last row of a line @@ -269,9 +327,9 @@ architecture rtl of dcache is end; -- Get the tag value from the address - function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is + function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)) return cache_tag_t is begin - return addr(63 downto 64-TAG_BITS); + return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS); end; -- Read a tag from a tag memory row @@ -287,6 +345,38 @@ architecture rtl of dcache is tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; end; + -- Read a TLB tag from a TLB tag memory row + function read_tlb_tag(way: tlb_way_t; tags: tlb_way_tags_t) return tlb_tag_t is + variable j : integer; + begin + j := way * TLB_EA_TAG_BITS; + return tags(j + TLB_EA_TAG_BITS - 1 downto j); + end; + + -- Write a TLB tag to a TLB tag memory row + procedure write_tlb_tag(way: tlb_way_t; tags: inout tlb_way_tags_t; + tag: tlb_tag_t) is + variable j : integer; + begin + j := way * TLB_EA_TAG_BITS; + tags(j + TLB_EA_TAG_BITS - 1 downto j) := tag; + end; + + -- Read a PTE from a TLB PTE memory row + function read_tlb_pte(way: tlb_way_t; ptes: tlb_way_ptes_t) return tlb_pte_t is + variable j : integer; + begin + j := way * TLB_PTE_BITS; + return ptes(j + TLB_PTE_BITS - 1 downto j); + end; + + procedure write_tlb_pte(way: tlb_way_t; ptes: inout tlb_way_ptes_t; newpte: tlb_pte_t) is + variable j : integer; + begin + j := way * TLB_PTE_BITS; + ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; + end; + begin assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; @@ -297,13 +387,158 @@ begin report "geometry bits don't add up" severity FAILURE; assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) report "geometry bits don't add 
up" severity FAILURE; - assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS) report "geometry bits don't add up" severity FAILURE; assert (64 = wishbone_data_bits) report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; - + + -- Latch the request in r0 as long as we're not stalling + stage_0 : process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r0.valid <= '0'; + elsif stall_out = '0' then + r0 <= d_in; + end if; + end if; + end process; + + -- TLB + -- Operates in the second cycle on the request latched in r0. + -- TLB updates write the entry at the end of the second cycle. + tlb_read : process(clk) + variable index : tlb_index_t; + begin + if rising_edge(clk) then + if stall_out = '1' then + -- keep reading the same thing while stalled + index := tlb_req_index; + else + index := to_integer(unsigned(d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 + downto TLB_LG_PGSZ))); + end if; + tlb_valid_way <= dtlb_valids(index); + tlb_tag_way <= dtlb_tags(index); + tlb_pte_way <= dtlb_ptes(index); + end if; + end process; + + -- Generate TLB PLRUs + maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate + begin + tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate + -- TLB PLRU interface + signal tlb_plru_acc : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal tlb_plru_acc_en : std_ulogic; + signal tlb_plru_out : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + begin + tlb_plru : entity work.plru + generic map ( + BITS => TLB_WAY_BITS + ) + port map ( + clk => clk, + rst => rst, + acc => tlb_plru_acc, + acc_en => tlb_plru_acc_en, + lru => tlb_plru_out + ); + + process(tlb_req_index, tlb_hit, tlb_hit_way, tlb_plru_out) + begin + -- PLRU interface + if tlb_hit = '1' and tlb_req_index = i then + tlb_plru_acc_en <= '1'; + else + tlb_plru_acc_en <= '0'; + end if; + tlb_plru_acc <= std_ulogic_vector(to_unsigned(tlb_hit_way, TLB_WAY_BITS)); + tlb_plru_victim(i) <= tlb_plru_out; + end process; + end 
generate; + end generate; + + tlb_search : process(all) + variable hitway : tlb_way_t; + variable hit : std_ulogic; + variable eatag : tlb_tag_t; + begin + tlb_req_index <= to_integer(unsigned(r0.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 + downto TLB_LG_PGSZ))); + hitway := 0; + hit := '0'; + eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + for i in tlb_way_t loop + if tlb_valid_way(i) = '1' and + read_tlb_tag(i, tlb_tag_way) = eatag then + hitway := i; + hit := '1'; + end if; + end loop; + tlb_hit <= hit and r0.valid; + tlb_hit_way <= hitway; + pte <= read_tlb_pte(hitway, tlb_pte_way); + valid_ra <= tlb_hit or not r0.virt_mode; + if r0.virt_mode = '1' then + ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + r0.addr(TLB_LG_PGSZ - 1 downto 0); + else + ra <= r0.addr(REAL_ADDR_BITS - 1 downto 0); + end if; + end process; + + tlb_update : process(clk) + variable tlbie : std_ulogic; + variable tlbia : std_ulogic; + variable tlbwe : std_ulogic; + variable repl_way : tlb_way_t; + variable eatag : tlb_tag_t; + variable tagset : tlb_way_tags_t; + variable pteset : tlb_way_ptes_t; + begin + if rising_edge(clk) then + tlbie := '0'; + tlbia := '0'; + tlbwe := '0'; + if r0.valid = '1' and stall_out = '0' and r0.tlbie = '1' then + if r0.addr(11 downto 10) /= "00" then + tlbia := '1'; + elsif r0.addr(9) = '1' then + tlbwe := '1'; + else + tlbie := '1'; + end if; + end if; + if rst = '1' or tlbia = '1' then + -- clear all valid bits at once + for i in tlb_index_t loop + dtlb_valids(i) <= (others => '0'); + end loop; + elsif tlbie = '1' then + if tlb_hit = '1' then + dtlb_valids(tlb_req_index)(tlb_hit_way) <= '0'; + end if; + elsif tlbwe = '1' then + if tlb_hit = '1' then + repl_way := tlb_hit_way; + else + repl_way := to_integer(unsigned(tlb_plru_victim(tlb_req_index))); + end if; + eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + tagset := tlb_tag_way; + write_tlb_tag(repl_way, tagset, eatag); + dtlb_tags(tlb_req_index) <= tagset; + pteset := tlb_pte_way; + 
write_tlb_pte(repl_way, pteset, r0.data); + dtlb_ptes(tlb_req_index) <= pteset; + dtlb_valids(tlb_req_index)(repl_way) <= '1'; + end if; + end if; + end process; + -- Generate PLRUs maybe_plrus: if NUM_WAYS > 1 generate begin @@ -341,53 +576,73 @@ begin end generate; end generate; - -- Latch the request in r0 as long as we're not stalling - stage_0 : process(clk) - begin - if rising_edge(clk) then - if rst = '1' then - r0.valid <= '0'; - elsif stall_out = '0' then - r0 <= d_in; - end if; - end if; - end process; - -- Cache request parsing and hit detection dcache_request : process(all) variable is_hit : std_ulogic; variable hit_way : way_t; variable op : op_t; - variable tmp : std_ulogic_vector(63 downto 0); - variable data : std_ulogic_vector(63 downto 0); - variable opsel : std_ulogic_vector(3 downto 0); + variable opsel : std_ulogic_vector(2 downto 0); variable go : std_ulogic; + variable s_hit : std_ulogic; + variable s_tag : cache_tag_t; + variable s_pte : tlb_pte_t; + variable s_ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); + variable hit_set : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable hit_way_set : hit_way_set_t; begin -- Extract line, row and tag from request req_index <= get_index(r0.addr); req_row <= get_row(r0.addr); - req_tag <= get_tag(r0.addr); + req_tag <= get_tag(ra); -- Only do anything if not being stalled by stage 1 - go := r0.valid and not stall_out; + go := r0.valid and not stall_out and not r0.tlbie; -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed -- - req_laddr <= r0.addr(63 downto LINE_OFF_BITS) & + req_laddr <= (63 downto REAL_ADDR_BITS => '0') & + ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way - hit_way := 0; - is_hit := '0'; - for i in way_t loop - if go = '1' and cache_valids(req_index)(i) = '1' then - if read_tag(i, cache_tags(req_index)) = req_tag then - hit_way := i; - is_hit := 
'1'; - end if; - end if; - end loop; + -- In order to make timing in virtual mode, when we are using the TLB, + -- we compare each way with each of the real addresses from each way of + -- the TLB, and then decide later which match to use. + hit_way := 0; + is_hit := '0'; + if r0.virt_mode = '1' then + for j in tlb_way_t loop + hit_way_set(j) := 0; + s_hit := '0'; + s_pte := read_tlb_pte(j, tlb_pte_way); + s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + r0.addr(TLB_LG_PGSZ - 1 downto 0); + s_tag := get_tag(s_ra); + for i in way_t loop + if go = '1' and cache_valids(req_index)(i) = '1' and + read_tag(i, cache_tags(req_index)) = s_tag and + tlb_valid_way(j) = '1' then + hit_way_set(j) := i; + s_hit := '1'; + end if; + end loop; + hit_set(j) := s_hit; + end loop; + if tlb_hit = '1' then + is_hit := hit_set(tlb_hit_way); + hit_way := hit_way_set(tlb_hit_way); + end if; + else + s_tag := get_tag(r0.addr(REAL_ADDR_BITS - 1 downto 0)); + for i in way_t loop + if go = '1' and cache_valids(req_index)(i) = '1' and + read_tag(i, cache_tags(req_index)) = s_tag then + hit_way := i; + is_hit := '1'; + end if; + end loop; + end if; -- The way that matched on a hit req_hit_way <= hit_way; @@ -398,19 +653,25 @@ begin -- Combine the request and cache his status to decide what -- operation needs to be done -- - opsel := go & r0.load & r0.nc & is_hit; - case opsel is - when "1101" => op := OP_LOAD_HIT; - when "1100" => op := OP_LOAD_MISS; - when "1110" => op := OP_LOAD_NC; - when "1001" => op := OP_STORE_HIT; - when "1000" => op := OP_STORE_MISS; - when "1010" => op := OP_STORE_MISS; - when "1011" => op := OP_BAD; - when "1111" => op := OP_BAD; - when others => op := OP_NONE; - end case; - + op := OP_NONE; + if go = '1' then + if valid_ra = '1' then + opsel := r0.load & r0.nc & is_hit; + case opsel is + when "101" => op := OP_LOAD_HIT; + when "100" => op := OP_LOAD_MISS; + when "110" => op := OP_LOAD_NC; + when "001" => op := OP_STORE_HIT; + when "000" => op := OP_STORE_MISS; 
+ when "010" => op := OP_STORE_MISS; + when "011" => op := OP_BAD; + when "111" => op := OP_BAD; + when others => op := OP_NONE; + end case; + else + op := OP_BAD; + end if; + end if; req_op <= op; -- Version of the row number that is valid one cycle earlier @@ -427,9 +688,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- TODO: Generate errors - -- err_nc_collision <= '1' when req_op = OP_BAD else '0'; - -- Generate stalls from stage 1 state machine stall_out <= '1' when r1.state /= IDLE else '0'; @@ -477,6 +735,8 @@ begin d_out.valid <= '0'; d_out.data <= cache_out(r1.hit_way); d_out.store_done <= '0'; + d_out.error <= '0'; + d_out.tlb_miss <= '0'; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -502,6 +762,20 @@ begin d_out.valid <= '1'; end if; + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing ld/st with error"; + d_out.error <= '1'; + d_out.tlb_miss <= r1.tlb_miss; + d_out.valid <= '1'; + end if; + + -- tlbie is handled above and doesn't go through the cache state machine + if r1.tlbie_done = '1' then + report "completing tlbie"; + d_out.valid <= '1'; + end if; + -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then -- If it's a load, enable register writeback and switch @@ -609,6 +883,7 @@ begin -- -- Cache hit synchronous machine for the easy case. This handles load hits. 
+ -- It also handles error cases (TLB miss, cache paradox) -- dcache_fast_hit : process(clk) begin @@ -636,6 +911,16 @@ begin else r1.hit_load_valid <= '0'; end if; + + if req_op = OP_BAD then + r1.error_done <= '1'; + r1.tlb_miss <= not valid_ra; + else + r1.error_done <= '0'; + end if; + + -- complete tlbies in the third cycle + r1.tlbie_done <= r0.valid and r0.tlbie and not stall_out; end if; end process; @@ -717,7 +1002,7 @@ begin when OP_LOAD_NC => r1.wb.sel <= r0.byte_sel; - r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; r1.wb.we <= '0'; @@ -726,7 +1011,7 @@ begin when OP_STORE_HIT | OP_STORE_MISS => if r0.dcbz = '0' then r1.wb.sel <= r0.byte_sel; - r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.dat <= r0.data; if cancel_store = '0' then r1.wb.cyc <= '1'; @@ -774,6 +1059,7 @@ begin end if; -- OP_NONE and OP_BAD do nothing + -- OP_BAD was handled above already when OP_NONE => when OP_BAD => end case; diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index bd8341a..66b938f 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -68,6 +68,7 @@ begin -- Clear stuff d_in.valid <= '0'; d_in.load <= '0'; + d_in.tlbie <= '0'; d_in.nc <= '0'; d_in.addr <= (others => '0'); d_in.data <= (others => '0'); diff --git a/decode1.vhdl b/decode1.vhdl index 70099d4..fd799fe 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -323,6 +323,7 @@ architecture behaviour of decode1 is 2#1001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sync 2#0001000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- td 2#0000000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- tw + 2#0100110010# => (LDST, 
OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbie 2#0100111100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- xor others => illegal_inst ); diff --git a/decode_types.vhdl b/decode_types.vhdl index 07c486a..ef51bd0 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -16,7 +16,7 @@ package decode_types is OP_POPCNT, OP_PRTY, OP_RFID, OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB, OP_SHL, OP_SHR, - OP_SYNC, OP_TRAP, + OP_SYNC, OP_TLBIE, OP_TRAP, OP_XOR ); type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR); diff --git a/execute1.vhdl b/execute1.vhdl index 490723e..98b95dc 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -88,6 +88,7 @@ architecture behaviour of execute1 is OP_MFMSR => SUPER, OP_MTMSRD => SUPER, OP_RFID => SUPER, + OP_TLBIE => SUPER, others => USER ); @@ -988,6 +989,7 @@ begin e_in.insn(5 downto 1) = "10101" then lv.ci := '1'; end if; + lv.virt_mode := ctrl.msr(MSR_DR); -- Update registers rin <= v; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 7ddbbc0..d5a59e8 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -43,6 +43,7 @@ architecture behave of loadstore1 is type reg_stage_t is record -- latch most of the input request load : std_ulogic; + tlbie : std_ulogic; dcbz : std_ulogic; addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); @@ -57,6 +58,7 @@ architecture behave of loadstore1 is reserve : std_ulogic; rc : std_ulogic; nc : std_ulogic; -- non-cacheable access + virt_mode : std_ulogic; state : state_t; second_bytes : std_ulogic_vector(7 downto 0); dar : std_ulogic_vector(63 downto 0); @@ -207,6 +209,7 @@ begin if l_in.valid = '1' then v.load := '0'; v.dcbz := '0'; + v.tlbie := '0'; case l_in.op is when OP_STORE => req := '1'; @@ -216,6 +219,9 @@ begin when OP_DCBZ => req := '1'; v.dcbz := '1'; + when OP_TLBIE => + req := '1'; + v.tlbie := '1'; when OP_MFSPR => 
done := '1'; mfspr := '1'; @@ -250,14 +256,15 @@ begin v.reserve := l_in.reserve; v.rc := l_in.rc; v.nc := l_in.ci; + v.virt_mode := l_in.virt_mode; -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- + -- is the form 0xc------- for a real-mode access. -- -- This will have to be replaced by a combination of implementing the -- proper HV CI load/store instructions and having an MMU to get the I -- bit otherwise. - if lsu_sum(31 downto 28) = "1100" then + if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then v.nc := '1'; end if; @@ -269,10 +276,13 @@ begin v.addr := lsu_sum; -- Do byte reversing and rotating for stores in the first cycle - byte_offset := unsigned(lsu_sum(2 downto 0)); + byte_offset := "000"; brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + if v.tlbie = '0' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + end if; end if; for i in 0 to 7 loop k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; @@ -332,12 +342,14 @@ begin -- Update outputs to dcache d_out.valid <= req; d_out.load <= v.load; + d_out.tlbie <= v.tlbie; d_out.dcbz <= v.dcbz; d_out.nc <= v.nc; d_out.reserve <= v.reserve; d_out.addr <= addr; d_out.data <= v.store_data; d_out.byte_sel <= byte_sel; + d_out.virt_mode <= v.virt_mode; -- Update outputs to writeback -- Multiplex either cache data to the destination GPR or From 42d0fcc5119e28c2e8807076f25b19e9b04dea83 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 7 Apr 2020 16:17:37 +1000 Subject: [PATCH 08/26] Implement data storage interrupts This adds a path from loadstore1 back to execute1 for reporting errors, and machinery in execute1 for generating data storage interrupts at vector 0x300. If dcache is given two requests in successive cycles and the first encounters an error (e.g. 
a TLB miss), it will now cancel the second request. Loadstore1 now responds to errors reported by dcache by sending an exception signal to execute1 and returning to the idle state. Execute1 then writes SRR0 and SRR1 and jumps to the 0x300 Data Storage Interrupt vector. DAR and DSISR are held in loadstore1. Signed-off-by: Paul Mackerras --- common.vhdl | 4 ++++ core.vhdl | 3 +++ dcache.vhdl | 15 ++++++++---- execute1.vhdl | 20 +++++++++++++--- loadstore1.vhdl | 61 +++++++++++++++++++++++++++++++++++++------------ 5 files changed, 81 insertions(+), 22 deletions(-) diff --git a/common.vhdl b/common.vhdl index 6741044..59b3744 100644 --- a/common.vhdl +++ b/common.vhdl @@ -238,6 +238,10 @@ package common is reserve => '0', rc => '0', virt_mode => '0', spr_num => 0, others => (others => '0')); + type Loadstore1ToExecute1Type is record + exception : std_ulogic; + end record; + type Loadstore1ToDcacheType is record valid : std_ulogic; load : std_ulogic; -- is this a load diff --git a/core.vhdl b/core.vhdl index acb37cc..0cb2ecd 100644 --- a/core.vhdl +++ b/core.vhdl @@ -63,6 +63,7 @@ architecture behave of core is -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; + signal loadstore1_to_execute1: Loadstore1ToExecute1Type; signal loadstore1_to_writeback: Loadstore1ToWritebackType; -- dcache signals @@ -251,6 +252,7 @@ begin stall_out => ex1_stall_out, e_in => decode2_to_execute1, i_in => xics_in, + l_in => loadstore1_to_execute1, l_out => execute1_to_loadstore1, f_out => execute1_to_fetch1, e_out => execute1_to_writeback, @@ -264,6 +266,7 @@ begin clk => clk, rst => core_rst, l_in => execute1_to_loadstore1, + e_out => loadstore1_to_execute1, l_out => loadstore1_to_writeback, d_out => loadstore1_to_dcache, d_in => dcache_to_loadstore1, diff --git a/dcache.vhdl b/dcache.vhdl index 3464c0d..7895877 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -147,6 +147,7 @@ architecture rtl of dcache is attribute ram_style of dtlb_ptes : signal is "distributed"; 
signal r0 : Loadstore1ToDcacheType; + signal r0_valid : std_ulogic; -- Type of operation on a "valid" input type op_t is (OP_NONE, @@ -406,6 +407,10 @@ begin end if; end process; + -- Hold off the request in r0 when stalling, + -- and cancel it if we get an error in a previous request. + r0_valid <= r0.valid and not stall_out and not r1.error_done; + -- TLB -- Operates in the second cycle on the request latched in r0. -- TLB updates write the entry at the end of the second cycle. @@ -478,7 +483,7 @@ begin hit := '1'; end if; end loop; - tlb_hit <= hit and r0.valid; + tlb_hit <= hit and r0_valid; tlb_hit_way <= hitway; pte <= read_tlb_pte(hitway, tlb_pte_way); valid_ra <= tlb_hit or not r0.virt_mode; @@ -503,7 +508,7 @@ begin tlbie := '0'; tlbia := '0'; tlbwe := '0'; - if r0.valid = '1' and stall_out = '0' and r0.tlbie = '1' then + if r0_valid = '1' and r0.tlbie = '1' then if r0.addr(11 downto 10) /= "00" then tlbia := '1'; elsif r0.addr(9) = '1' then @@ -596,7 +601,7 @@ begin req_tag <= get_tag(ra); -- Only do anything if not being stalled by stage 1 - go := r0.valid and not stall_out and not r0.tlbie; + go := r0_valid and not r0.tlbie; -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed @@ -697,7 +702,7 @@ begin cancel_store <= '0'; set_rsrv <= '0'; clear_rsrv <= '0'; - if stall_out = '0' and r0.valid = '1' and r0.reserve = '1' then + if r0_valid = '1' and r0.reserve = '1' then -- XXX generate alignment interrupt if address is not aligned -- XXX or if r0.nc = '1' if r0.load = '1' then @@ -920,7 +925,7 @@ begin end if; -- complete tlbies in the third cycle - r1.tlbie_done <= r0.valid and r0.tlbie and not stall_out; + r1.tlbie_done <= r0_valid and r0.tlbie; end if; end process; diff --git a/execute1.vhdl b/execute1.vhdl index 98b95dc..e2cb651 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -23,6 +23,7 @@ entity execute1 is stall_out : out std_ulogic; e_in : in Decode2ToExecute1Type; + l_in : in 
Loadstore1ToExecute1Type; i_in : in XicsToExecute1Type; @@ -51,6 +52,7 @@ architecture behaviour of execute1 is slow_op_rc : std_ulogic; slow_op_oe : std_ulogic; slow_op_xerc : xer_common_t; + ldst_nia : std_ulogic_vector(63 downto 0); end record; constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, lr_update => '0', @@ -446,9 +448,9 @@ begin v.e.exc_write_reg := fast_spr_num(SPR_SRR0); v.e.exc_write_data := e_in.nia; - if ctrl.irq_state = WRITE_SRR1 then - v.e.exc_write_reg := fast_spr_num(SPR_SRR1); - v.e.exc_write_data := ctrl.srr1; + if ctrl.irq_state = WRITE_SRR1 then + v.e.exc_write_reg := fast_spr_num(SPR_SRR1); + v.e.exc_write_data := ctrl.srr1; v.e.exc_write_enable := '1'; ctrl_tmp.msr(MSR_SF) <= '1'; ctrl_tmp.msr(MSR_EE) <= '0'; @@ -899,6 +901,7 @@ begin elsif e_in.valid = '1' then -- instruction for other units, i.e. LDST + v.ldst_nia := e_in.nia; v.e.valid := '0'; if e_in.unit = LDST then lv.valid := '1'; @@ -969,6 +972,17 @@ begin v.e.write_data := result; v.e.write_enable := result_en; + -- generate DSI for load/store exceptions + if l_in.exception = '1' then + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#300#, 64)); + ctrl_tmp.srr1 <= msr_copy(ctrl.msr); + v.e.exc_write_enable := '1'; + v.e.exc_write_reg := fast_spr_num(SPR_SRR0); + v.e.exc_write_data := r.ldst_nia; + ctrl_tmp.irq_state <= WRITE_SRR1; + v.e.valid := '1'; -- complete the original load or store + end if; + -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; lv.addr1 := a_in; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index d5a59e8..6ab18f5 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -16,6 +16,7 @@ entity loadstore1 is rst : in std_ulogic; l_in : in Execute1ToLoadstore1Type; + e_out : out Loadstore1ToExecute1Type; l_out : out Loadstore1ToWritebackType; d_out : out Loadstore1ToDcacheType; @@ -142,6 +143,9 @@ begin variable mfspr : std_ulogic; variable sprn : std_ulogic_vector(9 downto 0); variable sprval : std_ulogic_vector(63 downto 0); + 
variable exception : std_ulogic; + variable next_addr : std_ulogic_vector(63 downto 0); + variable dsisr : std_ulogic_vector(31 downto 0); begin v := r; req := '0'; @@ -151,6 +155,8 @@ begin addr := lsu_sum; mfspr := '0'; sprval := (others => '0'); -- avoid inferred latches + exception := '0'; + dsisr := (others => '0'); write_enable := '0'; do_update := '0'; @@ -204,6 +210,9 @@ begin end case; end loop; + -- compute (addr + 8) & ~7 for the second doubleword when unaligned + next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + case r.state is when IDLE => if l_in.valid = '1' then @@ -301,8 +310,7 @@ begin end if; when SECOND_REQ => - -- compute (addr + 8) & ~7 for the second doubleword when unaligned - addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; + addr := next_addr; byte_sel := r.second_bytes; req := '1'; stall := '1'; @@ -311,25 +319,43 @@ begin when FIRST_ACK_WAIT => stall := '1'; if d_in.valid = '1' then - v.state := LAST_ACK_WAIT; - if r.load = '1' then - v.load_data := data_permuted; + if d_in.error = '1' then + -- dcache will discard the second request + exception := '1'; + dsisr(30) := d_in.tlb_miss; + v.state := IDLE; + else + v.state := LAST_ACK_WAIT; + if r.load = '1' then + v.load_data := data_permuted; + end if; end if; end if; when LAST_ACK_WAIT => stall := '1'; if d_in.valid = '1' then - write_enable := r.load; - if r.load = '1' and r.update = '1' then - -- loads with rA update need an extra cycle - v.state := LD_UPDATE; - else - -- stores write back rA update in this cycle - do_update := r.update; - stall := '0'; - done := '1'; + if d_in.error = '1' then + if two_dwords = '1' then + addr := next_addr; + else + addr := r.addr; + end if; + exception := '1'; + dsisr(30) := d_in.tlb_miss; v.state := IDLE; + else + write_enable := r.load; + if r.load = '1' and r.update = '1' then + -- loads with rA update need an extra cycle + v.state := LD_UPDATE; + else + -- stores write back rA update in this cycle + 
do_update := r.update; + stall := '0'; + done := '1'; + v.state := IDLE; + end if; end if; end if; @@ -372,6 +398,13 @@ begin l_out.rc <= r.rc and done; l_out.store_done <= d_in.store_done; + -- update exception info back to execute1 + e_out.exception <= exception; + if exception = '1' then + v.dar := addr; + v.dsisr := dsisr; + end if; + stall_out <= stall; -- Update registers From d47fbf88d14f2ca90d6b37378d698b9133e66631 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Apr 2020 12:43:06 +1000 Subject: [PATCH 09/26] Implement access permission checks This adds logic to the dcache to check the permissions encoded in the PTE that it gets from the dTLB. The bits that are checked are: R must be 1 C must be 1 for a store EAA(0) - if this is 1, MSR[PR] must be 0 EAA(2) must be 1 for a store EAA(1) | EAA(2) must be 1 for a load In addition, ATT(0) is used to indicate a cache-inhibited access. This now implements DSISR bits 36, 38 and 45. (Bit numbers above correspond to the ISA, i.e. using big-endian numbering.) MSR[PR] is now conveyed to loadstore1 for use in permission checking. Signed-off-by: Paul Mackerras --- common.vhdl | 8 +++++-- dcache.vhdl | 55 +++++++++++++++++++++++++++++++++++++++++++++---- execute1.vhdl | 1 + loadstore1.vhdl | 9 ++++++++ 4 files changed, 67 insertions(+), 6 deletions(-) diff --git a/common.vhdl b/common.vhdl index 59b3744..e8ec19e 100644 --- a/common.vhdl +++ b/common.vhdl @@ -230,12 +230,13 @@ package common is xerc : xer_common_t; reserve : std_ulogic; -- set for larx/stcx. rc : std_ulogic; -- set for stcx. 
- spr_num : spr_num_t; -- SPR number for mfspr/mtspr virt_mode : std_ulogic; -- do translation through TLB + priv_mode : std_ulogic; -- privileged mode (MSR[PR] = 0) + spr_num : spr_num_t; -- SPR number for mfspr/mtspr end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, - reserve => '0', rc => '0', virt_mode => '0', + reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0', spr_num => 0, others => (others => '0')); type Loadstore1ToExecute1Type is record @@ -250,6 +251,7 @@ package common is nc : std_ulogic; reserve : std_ulogic; virt_mode : std_ulogic; + priv_mode : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); @@ -261,6 +263,8 @@ package common is store_done : std_ulogic; error : std_ulogic; tlb_miss : std_ulogic; + perm_error : std_ulogic; + rc_error : std_ulogic; end record; type Loadstore1ToWritebackType is record diff --git a/dcache.vhdl b/dcache.vhdl index 7895877..03b3886 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -149,6 +149,30 @@ architecture rtl of dcache is signal r0 : Loadstore1ToDcacheType; signal r0_valid : std_ulogic; + -- Record for storing permission, attribute, etc. 
bits from a PTE + type perm_attr_t is record + reference : std_ulogic; + changed : std_ulogic; + nocache : std_ulogic; + priv : std_ulogic; + rd_perm : std_ulogic; + wr_perm : std_ulogic; + end record; + + function extract_perm_attr(pte : std_ulogic_vector(TLB_PTE_BITS - 1 downto 0)) return perm_attr_t is + variable pa : perm_attr_t; + begin + pa.reference := pte(8); + pa.changed := pte(7); + pa.nocache := pte(5); + pa.priv := pte(3); + pa.rd_perm := pte(2); + pa.wr_perm := pte(1); + return pa; + end; + + constant real_mode_perm_attr : perm_attr_t := (nocache => '0', others => '1'); + -- Type of operation on a "valid" input type op_t is (OP_NONE, OP_LOAD_HIT, -- Cache hit on load @@ -208,7 +232,9 @@ architecture rtl of dcache is -- Signals to complete with error error_done : std_ulogic; - tlb_miss : std_ulogic; + tlb_miss : std_ulogic; -- No entry found in TLB + perm_error : std_ulogic; -- Permissions don't allow access + rc_error : std_ulogic; -- Reference or change bit clear -- completion signal for tlbie tlbie_done : std_ulogic; @@ -262,6 +288,9 @@ architecture rtl of dcache is signal pte : tlb_pte_t; signal ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); signal valid_ra : std_ulogic; + signal perm_attr : perm_attr_t; + signal rc_ok : std_ulogic; + signal perm_ok : std_ulogic; -- TLB PLRU output interface type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0); @@ -490,8 +519,10 @@ begin if r0.virt_mode = '1' then ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & r0.addr(TLB_LG_PGSZ - 1 downto 0); + perm_attr <= extract_perm_attr(pte); else ra <= r0.addr(REAL_ADDR_BITS - 1 downto 0); + perm_attr <= real_mode_perm_attr; end if; end process; @@ -588,6 +619,7 @@ begin variable op : op_t; variable opsel : std_ulogic_vector(2 downto 0); variable go : std_ulogic; + variable nc : std_ulogic; variable s_hit : std_ulogic; variable s_tag : cache_tag_t; variable s_pte : tlb_pte_t; @@ -655,13 +687,20 @@ begin -- The way to replace on a 
miss replace_way <= to_integer(unsigned(plru_victim(req_index))); - -- Combine the request and cache his status to decide what + -- work out whether we have permission for this access + -- NB we don't yet implement AMR, thus no KUAP + rc_ok <= perm_attr.reference and (r0.load or perm_attr.changed); + perm_ok <= (r0.priv_mode or not perm_attr.priv) and + (perm_attr.wr_perm or (r0.load and perm_attr.rd_perm)); + + -- Combine the request and cache hit status to decide what -- operation needs to be done -- + nc := r0.nc or perm_attr.nocache; op := OP_NONE; if go = '1' then - if valid_ra = '1' then - opsel := r0.load & r0.nc & is_hit; + if valid_ra = '1' and rc_ok = '1' and perm_ok = '1' then + opsel := r0.load & nc & is_hit; case opsel is when "101" => op := OP_LOAD_HIT; when "100" => op := OP_LOAD_MISS; @@ -742,6 +781,8 @@ begin d_out.store_done <= '0'; d_out.error <= '0'; d_out.tlb_miss <= '0'; + d_out.perm_error <= '0'; + d_out.rc_error <= '0'; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -772,6 +813,8 @@ begin report "completing ld/st with error"; d_out.error <= '1'; d_out.tlb_miss <= r1.tlb_miss; + d_out.perm_error <= r1.perm_error; + d_out.rc_error <= r1.rc_error; d_out.valid <= '1'; end if; @@ -918,8 +961,12 @@ begin end if; if req_op = OP_BAD then + report "Signalling ld/st error valid_ra=" & " rc_ok=" & std_ulogic'image(rc_ok) & + " perm_ok=" & std_ulogic'image(perm_ok); r1.error_done <= '1'; r1.tlb_miss <= not valid_ra; + r1.perm_error <= valid_ra and not perm_ok; + r1.rc_error <= valid_ra and perm_ok and not rc_ok; else r1.error_done <= '0'; end if; diff --git a/execute1.vhdl b/execute1.vhdl index e2cb651..5e25efc 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1004,6 +1004,7 @@ begin lv.ci := '1'; end if; lv.virt_mode := ctrl.msr(MSR_DR); + lv.priv_mode := not ctrl.msr(MSR_PR); -- Update registers rin <= v; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 6ab18f5..c54e47b 100644 
--- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -60,6 +60,7 @@ architecture behave of loadstore1 is rc : std_ulogic; nc : std_ulogic; -- non-cacheable access virt_mode : std_ulogic; + priv_mode : std_ulogic; state : state_t; second_bytes : std_ulogic_vector(7 downto 0); dar : std_ulogic_vector(63 downto 0); @@ -266,6 +267,7 @@ begin v.rc := l_in.rc; v.nc := l_in.ci; v.virt_mode := l_in.virt_mode; + v.priv_mode := l_in.priv_mode; -- XXX Temporary hack. Mark the op as non-cachable if the address -- is the form 0xc------- for a real-mode access. @@ -323,6 +325,9 @@ begin -- dcache will discard the second request exception := '1'; dsisr(30) := d_in.tlb_miss; + dsisr(63 - 36) := d_in.perm_error; + dsisr(63 - 38) := not r.load; + dsisr(63 - 45) := d_in.rc_error; v.state := IDLE; else v.state := LAST_ACK_WAIT; @@ -343,6 +348,9 @@ begin end if; exception := '1'; dsisr(30) := d_in.tlb_miss; + dsisr(63 - 36) := d_in.perm_error; + dsisr(63 - 38) := not r.load; + dsisr(63 - 45) := d_in.rc_error; v.state := IDLE; else write_enable := r.load; @@ -376,6 +384,7 @@ begin d_out.data <= v.store_data; d_out.byte_sel <= byte_sel; d_out.virt_mode <= v.virt_mode; + d_out.priv_mode <= v.priv_mode; -- Update outputs to writeback -- Multiplex either cache data to the destination GPR or From 8160f4f8214e982284b2ce2678c8298073b4267c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Apr 2020 11:10:56 +1000 Subject: [PATCH 10/26] Add framework for implementing an MMU This adds a new module to implement an MMU. At the moment it doesn't do very much. Tlbie instructions now get sent by loadstore1 to mmu, which sends them to dcache, rather than loadstore1 sending them directly to dcache. TLB misses from dcache now get sent by loadstore1 to mmu, which currently just returns an error. Loadstore1 then generates a DSI in response to the error return from mmu. 
Signed-off-by: Paul Mackerras --- Makefile | 5 +- common.vhdl | 25 ++++++++- core.vhdl | 19 +++++++ dcache.vhdl | 140 ++++++++++++++++++++++++++++++------------------ dcache_tb.vhdl | 9 +++- loadstore1.vhdl | 105 +++++++++++++++++++++++++++--------- microwatt.core | 1 + mmu.vhdl | 109 +++++++++++++++++++++++++++++++++++++ 8 files changed, 332 insertions(+), 81 deletions(-) create mode 100644 mmu.vhdl diff --git a/Makefile b/Makefile index ed74176..48be5b4 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ common.o: decode_types.o control.o: gpr_hazard.o cr_hazard.o common.o sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o -core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o writeback.o core_debug.o +core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o mmu.o dcache.o writeback.o core_debug.o core_debug.o: common.o countzero.o: countzero_tb.o: common.o glibc_random.o countzero.o @@ -58,10 +58,11 @@ icache_tb.o: common.o wishbone_types.o icache.o wishbone_bram_wrapper.o dcache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o dcache_tb.o: common.o wishbone_types.o dcache.o wishbone_bram_wrapper.o insn_helpers.o: -loadstore1.o: common.o helpers.o decode_types.o +loadstore1.o: common.o decode_types.o logical.o: decode_types.o multiply_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o multiply.o multiply.o: common.o decode_types.o +mmu.o: common.o divider_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o divider.o divider.o: common.o decode_types.o ppc_fx_insns.o: helpers.o diff --git a/common.vhdl b/common.vhdl index e8ec19e..3ee19d7 100644 --- a/common.vhdl +++ b/common.vhdl @@ -246,7 +246,6 @@ package common is type Loadstore1ToDcacheType is record valid : std_ulogic; load : std_ulogic; -- is this a load - tlbie : std_ulogic; 
-- is this a tlbie dcbz : std_ulogic; nc : std_ulogic; reserve : std_ulogic; @@ -267,6 +266,30 @@ package common is rc_error : std_ulogic; end record; + type Loadstore1ToMmuType is record + valid : std_ulogic; + tlbie : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + rs : std_ulogic_vector(63 downto 0); + end record; + + type MmuToLoadstore1Type is record + done : std_ulogic; + error : std_ulogic; + end record; + + type MmuToDcacheType is record + valid : std_ulogic; + tlbie : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + pte : std_ulogic_vector(63 downto 0); + end record; + + type DcacheToMmuType is record + stall : std_ulogic; + done : std_ulogic; + end record; + type Loadstore1ToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; diff --git a/core.vhdl b/core.vhdl index 0cb2ecd..c870404 100644 --- a/core.vhdl +++ b/core.vhdl @@ -65,10 +65,14 @@ architecture behave of core is signal execute1_to_loadstore1: Execute1ToLoadstore1Type; signal loadstore1_to_execute1: Loadstore1ToExecute1Type; signal loadstore1_to_writeback: Loadstore1ToWritebackType; + signal loadstore1_to_mmu: Loadstore1ToMmuType; + signal mmu_to_loadstore1: MmuToLoadstore1Type; -- dcache signals signal loadstore1_to_dcache: Loadstore1ToDcacheType; signal dcache_to_loadstore1: DcacheToLoadstore1Type; + signal mmu_to_dcache: MmuToDcacheType; + signal dcache_to_mmu: DcacheToMmuType; -- local signals signal fetch1_stall_in : std_ulogic; @@ -124,6 +128,7 @@ architecture behave of core is attribute keep_hierarchy of cr_file_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of execute1_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of loadstore1_0 : label is keep_h(DISABLE_FLATTEN); + attribute keep_hierarchy of mmu_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of dcache_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of writeback_0 : label is keep_h(DISABLE_FLATTEN); attribute keep_hierarchy of debug_0 : label 
is keep_h(DISABLE_FLATTEN); @@ -270,10 +275,22 @@ begin l_out => loadstore1_to_writeback, d_out => loadstore1_to_dcache, d_in => dcache_to_loadstore1, + m_out => loadstore1_to_mmu, + m_in => mmu_to_loadstore1, dc_stall => dcache_stall_out, stall_out => ls1_stall_out ); + mmu_0: entity work.mmu + port map ( + clk => clk, + rst => core_rst, + l_in => loadstore1_to_mmu, + l_out => mmu_to_loadstore1, + d_out => mmu_to_dcache, + d_in => dcache_to_mmu + ); + dcache_0: entity work.dcache generic map( LINE_SIZE => 64, @@ -285,6 +302,8 @@ begin rst => core_rst, d_in => loadstore1_to_dcache, d_out => dcache_to_loadstore1, + m_in => mmu_to_dcache, + m_out => dcache_to_mmu, stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out diff --git a/dcache.vhdl b/dcache.vhdl index 03b3886..126df48 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -40,6 +40,9 @@ entity dcache is d_in : in Loadstore1ToDcacheType; d_out : out DcacheToLoadstore1Type; + m_in : in MmuToDcacheType; + m_out : out DcacheToMmuType; + stall_out : out std_ulogic; wishbone_out : out wishbone_master_out; @@ -146,9 +149,6 @@ architecture rtl of dcache is attribute ram_style of dtlb_tags : signal is "distributed"; attribute ram_style of dtlb_ptes : signal is "distributed"; - signal r0 : Loadstore1ToDcacheType; - signal r0_valid : std_ulogic; - -- Record for storing permission, attribute, etc. bits from a PTE type perm_attr_t is record reference : std_ulogic; @@ -205,6 +205,15 @@ architecture rtl of dcache is -- first stage emits a stall for a complex op. 
-- + -- Stage 0 register, basically contains just the latched request + type reg_stage_0_t is record + req : Loadstore1ToDcacheType; + tlbie : std_ulogic; + end record; + + signal r0 : reg_stage_0_t; + signal r0_valid : std_ulogic; + -- First stage register, contains state for stage 1 of load hits -- and for the state machine used by all other operations -- @@ -424,35 +433,61 @@ begin assert (64 = wishbone_data_bits) report "Can't yet handle a wishbone width that isn't 64-bits" severity FAILURE; - -- Latch the request in r0 as long as we're not stalling + -- Latch the request in r0.req as long as we're not stalling stage_0 : process(clk) begin if rising_edge(clk) then if rst = '1' then - r0.valid <= '0'; + r0.req.valid <= '0'; elsif stall_out = '0' then - r0 <= d_in; + assert (d_in.valid and m_in.valid) = '0' report + "request collision loadstore vs MMU"; + if m_in.valid = '1' then + r0.req.valid <= '1'; + r0.req.load <= '0'; + r0.req.dcbz <= '0'; + r0.req.nc <= '0'; + r0.req.reserve <= '0'; + r0.req.virt_mode <= '0'; + r0.req.priv_mode <= '1'; + r0.req.addr <= m_in.addr; + r0.req.data <= m_in.pte; + r0.req.byte_sel <= (others => '1'); + r0.tlbie <= m_in.tlbie; + assert m_in.tlbie = '1' report "unknown request from MMU"; + else + r0.req <= d_in; + r0.tlbie <= '0'; + end if; end if; end if; end process; + -- we don't yet handle collisions between loadstore1 requests and MMU requests + m_out.stall <= '0'; + -- Hold off the request in r0 when stalling, -- and cancel it if we get an error in a previous request. - r0_valid <= r0.valid and not stall_out and not r1.error_done; + r0_valid <= r0.req.valid and not stall_out and not r1.error_done; -- TLB - -- Operates in the second cycle on the request latched in r0. + -- Operates in the second cycle on the request latched in r0.req. -- TLB updates write the entry at the end of the second cycle. 
tlb_read : process(clk) variable index : tlb_index_t; + variable addrbits : std_ulogic_vector(TLB_SET_BITS - 1 downto 0); begin if rising_edge(clk) then if stall_out = '1' then -- keep reading the same thing while stalled index := tlb_req_index; else - index := to_integer(unsigned(d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 - downto TLB_LG_PGSZ))); + if m_in.valid = '1' then + addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); + else + addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ); + end if; + index := to_integer(unsigned(addrbits)); end if; tlb_valid_way <= dtlb_valids(index); tlb_tag_way <= dtlb_tags(index); @@ -500,11 +535,11 @@ begin variable hit : std_ulogic; variable eatag : tlb_tag_t; begin - tlb_req_index <= to_integer(unsigned(r0.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 + tlb_req_index <= to_integer(unsigned(r0.req.addr(TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ))); hitway := 0; hit := '0'; - eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); for i in tlb_way_t loop if tlb_valid_way(i) = '1' and read_tlb_tag(i, tlb_tag_way) = eatag then @@ -515,13 +550,13 @@ begin tlb_hit <= hit and r0_valid; tlb_hit_way <= hitway; pte <= read_tlb_pte(hitway, tlb_pte_way); - valid_ra <= tlb_hit or not r0.virt_mode; - if r0.virt_mode = '1' then + valid_ra <= tlb_hit or not r0.req.virt_mode; + if r0.req.virt_mode = '1' then ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & - r0.addr(TLB_LG_PGSZ - 1 downto 0); + r0.req.addr(TLB_LG_PGSZ - 1 downto 0); perm_attr <= extract_perm_attr(pte); else - ra <= r0.addr(REAL_ADDR_BITS - 1 downto 0); + ra <= r0.req.addr(REAL_ADDR_BITS - 1 downto 0); perm_attr <= real_mode_perm_attr; end if; end process; @@ -540,9 +575,9 @@ begin tlbia := '0'; tlbwe := '0'; if r0_valid = '1' and r0.tlbie = '1' then - if r0.addr(11 downto 10) /= "00" then + if r0.req.addr(11 downto 10) /= "00" then tlbia := '1'; - elsif r0.addr(9) = '1' then + 
elsif r0.req.addr(9) = '1' then tlbwe := '1'; else tlbie := '1'; @@ -563,15 +598,16 @@ begin else repl_way := to_integer(unsigned(tlb_plru_victim(tlb_req_index))); end if; - eatag := r0.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); + eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); tagset := tlb_tag_way; write_tlb_tag(repl_way, tagset, eatag); dtlb_tags(tlb_req_index) <= tagset; pteset := tlb_pte_way; - write_tlb_pte(repl_way, pteset, r0.data); + write_tlb_pte(repl_way, pteset, r0.req.data); dtlb_ptes(tlb_req_index) <= pteset; dtlb_valids(tlb_req_index)(repl_way) <= '1'; end if; + m_out.done <= r0_valid and r0.tlbie; end if; end process; @@ -628,8 +664,8 @@ begin variable hit_way_set : hit_way_set_t; begin -- Extract line, row and tag from request - req_index <= get_index(r0.addr); - req_row <= get_row(r0.addr); + req_index <= get_index(r0.req.addr); + req_row <= get_row(r0.req.addr); req_tag <= get_tag(ra); -- Only do anything if not being stalled by stage 1 @@ -648,13 +684,13 @@ begin -- the TLB, and then decide later which match to use. 
hit_way := 0; is_hit := '0'; - if r0.virt_mode = '1' then + if r0.req.virt_mode = '1' then for j in tlb_way_t loop hit_way_set(j) := 0; s_hit := '0'; s_pte := read_tlb_pte(j, tlb_pte_way); s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & - r0.addr(TLB_LG_PGSZ - 1 downto 0); + r0.req.addr(TLB_LG_PGSZ - 1 downto 0); s_tag := get_tag(s_ra); for i in way_t loop if go = '1' and cache_valids(req_index)(i) = '1' and @@ -671,7 +707,7 @@ begin hit_way := hit_way_set(tlb_hit_way); end if; else - s_tag := get_tag(r0.addr(REAL_ADDR_BITS - 1 downto 0)); + s_tag := get_tag(r0.req.addr(REAL_ADDR_BITS - 1 downto 0)); for i in way_t loop if go = '1' and cache_valids(req_index)(i) = '1' and read_tag(i, cache_tags(req_index)) = s_tag then @@ -689,18 +725,18 @@ begin -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP - rc_ok <= perm_attr.reference and (r0.load or perm_attr.changed); - perm_ok <= (r0.priv_mode or not perm_attr.priv) and - (perm_attr.wr_perm or (r0.load and perm_attr.rd_perm)); + rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed); + perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and + (perm_attr.wr_perm or (r0.req.load and perm_attr.rd_perm)); -- Combine the request and cache hit status to decide what -- operation needs to be done -- - nc := r0.nc or perm_attr.nocache; + nc := r0.req.nc or perm_attr.nocache; op := OP_NONE; if go = '1' then if valid_ra = '1' and rc_ok = '1' and perm_ok = '1' then - opsel := r0.load & nc & is_hit; + opsel := r0.req.load & nc & is_hit; case opsel is when "101" => op := OP_LOAD_HIT; when "100" => op := OP_LOAD_MISS; @@ -723,7 +759,11 @@ begin -- If we're stalling then we need to keep reading the last -- row requested. 
if stall_out = '0' then - early_req_row <= get_row(d_in.addr); + if m_in.valid = '1' then + early_req_row <= get_row(m_in.addr); + else + early_req_row <= get_row(d_in.addr); + end if; else early_req_row <= req_row; end if; @@ -741,17 +781,17 @@ begin cancel_store <= '0'; set_rsrv <= '0'; clear_rsrv <= '0'; - if r0_valid = '1' and r0.reserve = '1' then + if r0_valid = '1' and r0.req.reserve = '1' then -- XXX generate alignment interrupt if address is not aligned - -- XXX or if r0.nc = '1' - if r0.load = '1' then + -- XXX or if r0.req.nc = '1' + if r0.req.load = '1' then -- load with reservation set_rsrv <= '1'; else -- store conditional clear_rsrv <= '1'; if reservation.valid = '0' or - r0.addr(63 downto LINE_OFF_BITS) /= reservation.addr then + r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then cancel_store <= '1'; end if; end if; @@ -765,7 +805,7 @@ begin reservation.valid <= '0'; elsif set_rsrv = '1' then reservation.valid <= '1'; - reservation.addr <= r0.addr(63 downto LINE_OFF_BITS); + reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); end if; end if; end process; @@ -818,12 +858,6 @@ begin d_out.valid <= '1'; end if; - -- tlbie is handled above and doesn't go through the cache state machine - if r1.tlbie_done = '1' then - report "completing tlbie"; - d_out.valid <= '1'; - end if; - -- Slow ops (load miss, NC, stores) if r1.slow_valid = '1' then -- If it's a load, enable register writeback and switch @@ -900,8 +934,8 @@ begin if r1.state = IDLE then -- In IDLE state, the only write path is the store-hit update case wr_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); - wr_data <= r0.data; - wr_sel <= r0.byte_sel; + wr_data <= r0.req.data; + wr_sel <= r0.req.byte_sel; else -- Otherwise, we might be doing a reload or a DCBZ if r1.req.dcbz = '1' then @@ -936,17 +970,17 @@ begin dcache_fast_hit : process(clk) begin if rising_edge(clk) then - -- If we have a request incoming, we have to latch it as r0.valid + -- If we have a request 
incoming, we have to latch it as r0.req.valid -- is only set for a single cycle. It's up to the control logic to -- ensure we don't override an uncompleted request (for now we are -- single issue on load/stores so we are fine, later, we can generate -- a stall output if necessary). if req_op /= OP_NONE and stall_out = '0' then - r1.req <= r0; + r1.req <= r0.req; report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(r0.addr) & - " nc:" & std_ulogic'image(r0.nc) & + " addr:" & to_hstring(r0.req.addr) & + " nc:" & std_ulogic'image(r0.req.nc) & " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & " way: " & integer'image(req_hit_way); @@ -1018,7 +1052,7 @@ begin when OP_LOAD_MISS => -- Normal load cache miss, start the reload machine -- - report "cache miss addr:" & to_hstring(r0.addr) & + report "cache miss addr:" & to_hstring(r0.req.addr) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & " tag:" & to_hstring(req_tag); @@ -1053,7 +1087,7 @@ begin r1.state <= RELOAD_WAIT_ACK; when OP_LOAD_NC => - r1.wb.sel <= r0.byte_sel; + r1.wb.sel <= r0.req.byte_sel; r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; @@ -1061,10 +1095,10 @@ begin r1.state <= NC_LOAD_WAIT_ACK; when OP_STORE_HIT | OP_STORE_MISS => - if r0.dcbz = '0' then - r1.wb.sel <= r0.byte_sel; + if r0.req.dcbz = '0' then + r1.wb.sel <= r0.req.byte_sel; r1.wb.adr <= ra(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= r0.data; + r1.wb.dat <= r0.req.data; if cancel_store = '0' then r1.wb.cyc <= '1'; r1.wb.stb <= '1'; diff --git a/dcache_tb.vhdl b/dcache_tb.vhdl index 66b938f..48c6877 100644 --- a/dcache_tb.vhdl +++ b/dcache_tb.vhdl @@ -15,6 +15,9 @@ architecture behave of dcache_tb is signal d_in : Loadstore1ToDcacheType; signal d_out : DcacheToLoadstore1Type; + signal m_in : MmuToDcacheType; + signal m_out : DcacheToMmuType; + signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -30,6 +33,8 @@ begin 
rst => rst, d_in => d_in, d_out => d_out, + m_in => m_in, + m_out => m_out, wishbone_out => wb_bram_in, wishbone_in => wb_bram_out ); @@ -68,10 +73,12 @@ begin -- Clear stuff d_in.valid <= '0'; d_in.load <= '0'; - d_in.tlbie <= '0'; d_in.nc <= '0'; d_in.addr <= (others => '0'); d_in.data <= (others => '0'); + m_in.valid <= '0'; + m_in.addr <= (others => '0'); + m_in.pte <= (others => '0'); wait for 4*clk_period; wait until rising_edge(clk); diff --git a/loadstore1.vhdl b/loadstore1.vhdl index c54e47b..d5dd010 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -5,7 +5,6 @@ use ieee.numeric_std.all; library work; use work.decode_types.all; use work.common.all; -use work.helpers.all; -- 2 cycle LSU -- We calculate the address in the first cycle @@ -22,6 +21,9 @@ entity loadstore1 is d_out : out Loadstore1ToDcacheType; d_in : in DcacheToLoadstore1Type; + m_out : out Loadstore1ToMmuType; + m_in : in MmuToLoadstore1Type; + dc_stall : in std_ulogic; stall_out : out std_ulogic ); @@ -38,7 +40,9 @@ architecture behave of loadstore1 is SECOND_REQ, -- send 2nd request of unaligned xfer FIRST_ACK_WAIT, -- waiting for 1st ack from dcache LAST_ACK_WAIT, -- waiting for last ack from dcache - LD_UPDATE -- writing rA with computed addr on load + LD_UPDATE, -- writing rA with computed addr on load + MMU_LOOKUP_1ST, -- waiting for MMU to look up translation + MMU_LOOKUP_LAST ); type reg_stage_t is record @@ -62,6 +66,7 @@ architecture behave of loadstore1 is virt_mode : std_ulogic; priv_mode : std_ulogic; state : state_t; + first_bytes : std_ulogic_vector(7 downto 0); second_bytes : std_ulogic_vector(7 downto 0); dar : std_ulogic_vector(63 downto 0); dsisr : std_ulogic_vector(31 downto 0); @@ -146,6 +151,7 @@ begin variable sprval : std_ulogic_vector(63 downto 0); variable exception : std_ulogic; variable next_addr : std_ulogic_vector(63 downto 0); + variable mmureq : std_ulogic; variable dsisr : std_ulogic_vector(31 downto 0); begin v := r; @@ -158,6 +164,7 @@ begin sprval := 
(others => '0'); -- avoid inferred latches exception := '0'; dsisr := (others => '0'); + mmureq := '0'; write_enable := '0'; do_update := '0'; @@ -230,7 +237,7 @@ begin req := '1'; v.dcbz := '1'; when OP_TLBIE => - req := '1'; + mmureq := '1'; v.tlbie := '1'; when OP_MFSPR => done := '1'; @@ -282,18 +289,14 @@ begin -- Do length_to_sel and work out if we are doing 2 dwords long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0)); byte_sel := long_sel(7 downto 0); + v.first_bytes := byte_sel; v.second_bytes := long_sel(15 downto 8); - v.addr := lsu_sum; - -- Do byte reversing and rotating for stores in the first cycle - byte_offset := "000"; + byte_offset := unsigned(lsu_sum(2 downto 0)); brev_lenm1 := "000"; - if v.tlbie = '0' then - byte_offset := unsigned(lsu_sum(2 downto 0)); - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; - end if; + if l_in.byte_reverse = '1' then + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; end if; for i in 0 to 7 loop k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset; @@ -309,6 +312,10 @@ begin v.state := SECOND_REQ; end if; end if; + if mmureq = '1' then + stall := '1'; + v.state := LAST_ACK_WAIT; + end if; end if; when SECOND_REQ => @@ -323,12 +330,19 @@ begin if d_in.valid = '1' then if d_in.error = '1' then -- dcache will discard the second request - exception := '1'; - dsisr(30) := d_in.tlb_miss; - dsisr(63 - 36) := d_in.perm_error; - dsisr(63 - 38) := not r.load; - dsisr(63 - 45) := d_in.rc_error; - v.state := IDLE; + addr := r.addr; + if d_in.tlb_miss = '1' then + -- give it to the MMU to look up + mmureq := '1'; + v.state := MMU_LOOKUP_1ST; + else + -- signal an interrupt straight away + exception := '1'; + dsisr(63 - 36) := d_in.perm_error; + dsisr(63 - 38) := not r.load; + dsisr(63 - 45) := d_in.rc_error; + v.state := IDLE; + end if; else v.state := LAST_ACK_WAIT; if r.load = '1' then @@ -337,6 +351,32 @@ begin end if; end if; + when MMU_LOOKUP_1ST | MMU_LOOKUP_LAST => + 
stall := '1'; + if two_dwords = '1' and r.state = MMU_LOOKUP_LAST then + addr := next_addr; + byte_sel := r.second_bytes; + else + addr := r.addr; + byte_sel := r.first_bytes; + end if; + if m_in.done = '1' then + if m_in.error = '0' then + -- retry the request now that the MMU has installed a TLB entry + req := '1'; + if r.state = MMU_LOOKUP_1ST then + v.state := SECOND_REQ; + else + v.state := LAST_ACK_WAIT; + end if; + else + exception := '1'; + dsisr(63 - 33) := '1'; + dsisr(63 - 38) := not r.load; + v.state := IDLE; + end if; + end if; + when LAST_ACK_WAIT => stall := '1'; if d_in.valid = '1' then @@ -346,12 +386,18 @@ begin else addr := r.addr; end if; - exception := '1'; - dsisr(30) := d_in.tlb_miss; - dsisr(63 - 36) := d_in.perm_error; - dsisr(63 - 38) := not r.load; - dsisr(63 - 45) := d_in.rc_error; - v.state := IDLE; + if d_in.tlb_miss = '1' then + -- give it to the MMU to look up + mmureq := '1'; + v.state := MMU_LOOKUP_LAST; + else + -- signal an interrupt straight away + exception := '1'; + dsisr(63 - 36) := d_in.perm_error; + dsisr(63 - 38) := not r.load; + dsisr(63 - 45) := d_in.rc_error; + v.state := IDLE; + end if; else write_enable := r.load; if r.load = '1' and r.update = '1' then @@ -366,6 +412,12 @@ begin end if; end if; end if; + if m_in.done = '1' then + -- tlbie is finished + stall := '0'; + done := '1'; + v.state := IDLE; + end if; when LD_UPDATE => do_update := '1'; @@ -376,7 +428,6 @@ begin -- Update outputs to dcache d_out.valid <= req; d_out.load <= v.load; - d_out.tlbie <= v.tlbie; d_out.dcbz <= v.dcbz; d_out.nc <= v.nc; d_out.reserve <= v.reserve; @@ -386,6 +437,12 @@ begin d_out.virt_mode <= v.virt_mode; d_out.priv_mode <= v.priv_mode; + -- Update outputs to MMU + m_out.valid <= mmureq; + m_out.tlbie <= v.tlbie; + m_out.addr <= addr; + m_out.rs <= l_in.data; + -- Update outputs to writeback -- Multiplex either cache data to the destination GPR or -- the address for the rA update. 
diff --git a/microwatt.core b/microwatt.core index a2d6ab5..180e0a5 100644 --- a/microwatt.core +++ b/microwatt.core @@ -25,6 +25,7 @@ filesets: - control.vhdl - execute1.vhdl - loadstore1.vhdl + - mmu.vhdl - dcache.vhdl - multiply.vhdl - divider.vhdl diff --git a/mmu.vhdl b/mmu.vhdl new file mode 100644 index 0000000..2e6d0fd --- /dev/null +++ b/mmu.vhdl @@ -0,0 +1,109 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.common.all; + +-- Radix MMU +-- Supports 4-level trees as in arch 3.0B, but not the two-step translation for +-- guests under a hypervisor (i.e. there is no gRA -> hRA translation). + +entity mmu is + port ( + clk : in std_ulogic; + rst : in std_ulogic; + + l_in : in Loadstore1ToMmuType; + l_out : out MmuToLoadstore1Type; + + d_out : out MmuToDcacheType; + d_in : in DcacheToMmuType + ); +end mmu; + +architecture behave of mmu is + + type state_t is (IDLE, + TLBIE_WAIT, + RADIX_LOOKUP_0 + ); + + type reg_stage_t is record + -- latched request from loadstore1 + valid : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + state : state_t; + end record; + + signal r, rin : reg_stage_t; + +begin + + mmu_0: process(clk) + begin + if rising_edge(clk) then + if rst = '1' then + r.state <= IDLE; + r.valid <= '0'; + else + if rin.valid = '1' then + report "MMU got tlb miss for " & to_hstring(rin.addr); + end if; + if l_out.done = '1' then + report "MMU completing miss with error=" & std_ulogic'image(l_out.error); + end if; + r <= rin; + end if; + end if; + end process; + + mmu_1: process(all) + variable v : reg_stage_t; + variable dcreq : std_ulogic; + variable done : std_ulogic; + variable err : std_ulogic; + begin + v.valid := l_in.valid; + v.addr := l_in.addr; + v.state := r.state; + dcreq := '0'; + done := '0'; + err := '0'; + + case r.state is + when IDLE => + if l_in.valid = '1' then + if l_in.tlbie = '1' then + dcreq := '1'; + v.state := TLBIE_WAIT; + else + v.state := RADIX_LOOKUP_0; + end if; + end 
if; + + when TLBIE_WAIT => + if d_in.done = '1' then + done := '1'; + v.state := IDLE; + end if; + + when RADIX_LOOKUP_0 => + done := '1'; + err := '1'; + v.state := IDLE; + end case; + + -- update registers + rin <= v; + + -- drive outputs + l_out.done <= done; + l_out.error <= err; + + d_out.valid <= dcreq; + d_out.tlbie <= l_in.tlbie; + d_out.addr <= l_in.addr; + d_out.pte <= l_in.rs; + end process; +end; From 4e6fc6811a17fbc524f0fe632d2d6e5adb268420 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 23 Apr 2020 15:28:22 +1000 Subject: [PATCH 11/26] MMU: Implement radix page table machinery This adds the necessary machinery to the MMU for it to do radix page table walks. The core elements are a shifter that can shift the address right by between 0 and 47 bits, a mask generator that can generate a mask of between 5 and 16 bits, a final mask generator, and new states in the state machine. (The final mask generator is used for transferring bits of the original address into the resulting TLB entry when the leaf PTE corresponds to a page size larger than 4kB.) The hardware does not implement a partition table or a process table. Software is expected to load the appropriate process table entry into a new SPR called PGTBL0, SPR 720. The contents should be formatted as described in Book III section 5.7.6.2 of the Power ISA v3.0B. PGTBL0 is set to 0 on hard reset. At present, the top two bits of the address (the quadrant) are ignored. There is currently no caching of any step in the translation process or of the final result, other than the entry created in the dTLB. That entry is a 4k page entry even if the leaf PTE found in the walk corresponds to a larger page size. This implementation can handle almost any page table layout and any page size. The RTS field (in PGTBL0) can have any value between 0 and 31, corresponding to a total address space size between 2^31 and 2^62 bytes. 
The RPDS field of PGTBL0 can be any value between 5 and 16, except that a value of 0 is taken to disable radix page table walking (for use when one is using software loading of TLB entries). The NLS field of the page directory entries can have any value between 5 and 16. The minimum page size is 4kB, meaning that the sum of RPDS and the NLS values of the PDEs found on the path to a leaf PTE must be less than or equal to RTS + 31 - 12. The PGTBL0 SPR is in the mmu module; thus this adds a path for loadstore1 to read and write SPRs in mmu. This adds code in dcache to service doubleword read requests from the MMU, as well as requests to write dTLB entries. Signed-off-by: Paul Mackerras --- common.vhdl | 12 ++- dcache.vhdl | 120 +++++++++++++++++--------- decode1.vhdl | 2 +- loadstore1.vhdl | 34 +++++--- mmu.vhdl | 224 ++++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 322 insertions(+), 70 deletions(-) diff --git a/common.vhdl b/common.vhdl index 3ee19d7..d617fa4 100644 --- a/common.vhdl +++ b/common.vhdl @@ -39,6 +39,7 @@ package common is constant SPR_SPRG3U : spr_num_t := 259; constant SPR_HSPRG0 : spr_num_t := 304; constant SPR_HSPRG1 : spr_num_t := 305; + constant SPR_PGTBL0 : spr_num_t := 720; -- GPR indices in the register file (GPR only) subtype gpr_index_t is std_ulogic_vector(4 downto 0); @@ -269,18 +270,23 @@ package common is type Loadstore1ToMmuType is record valid : std_ulogic; tlbie : std_ulogic; + mtspr : std_ulogic; + sprn : std_ulogic_vector(3 downto 0); addr : std_ulogic_vector(63 downto 0); rs : std_ulogic_vector(63 downto 0); end record; type MmuToLoadstore1Type is record - done : std_ulogic; - error : std_ulogic; + done : std_ulogic; + invalid : std_ulogic; + badtree : std_ulogic; + sprval : std_ulogic_vector(63 downto 0); end record; type MmuToDcacheType is record valid : std_ulogic; tlbie : std_ulogic; + tlbld : std_ulogic; addr : std_ulogic_vector(63 downto 0); pte : std_ulogic_vector(63 downto 0); end record; @@ -288,6 +294,8 
@@ package common is type DcacheToMmuType is record stall : std_ulogic; done : std_ulogic; + err : std_ulogic; + data : std_ulogic_vector(63 downto 0); end record; type Loadstore1ToWritebackType is record diff --git a/dcache.vhdl b/dcache.vhdl index 126df48..96563a5 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -209,6 +209,8 @@ architecture rtl of dcache is type reg_stage_0_t is record req : Loadstore1ToDcacheType; tlbie : std_ulogic; + tlbld : std_ulogic; + mmu_req : std_ulogic; -- indicates source of request end record; signal r0 : reg_stage_0_t; @@ -220,6 +222,7 @@ architecture rtl of dcache is type reg_stage_1_t is record -- Latch the complete request from ls1 req : Loadstore1ToDcacheType; + mmu_req : std_ulogic; -- Cache hit state hit_way : way_t; @@ -444,7 +447,7 @@ begin "request collision loadstore vs MMU"; if m_in.valid = '1' then r0.req.valid <= '1'; - r0.req.load <= '0'; + r0.req.load <= not (m_in.tlbie or m_in.tlbld); r0.req.dcbz <= '0'; r0.req.nc <= '0'; r0.req.reserve <= '0'; @@ -454,10 +457,13 @@ begin r0.req.data <= m_in.pte; r0.req.byte_sel <= (others => '1'); r0.tlbie <= m_in.tlbie; - assert m_in.tlbie = '1' report "unknown request from MMU"; + r0.tlbld <= m_in.tlbld; + r0.mmu_req <= '1'; else r0.req <= d_in; r0.tlbie <= '0'; + r0.tlbld <= '0'; + r0.mmu_req <= '0'; end if; end if; end if; @@ -549,7 +555,11 @@ begin end loop; tlb_hit <= hit and r0_valid; tlb_hit_way <= hitway; - pte <= read_tlb_pte(hitway, tlb_pte_way); + if tlb_hit = '1' then + pte <= read_tlb_pte(hitway, tlb_pte_way); + else + pte <= (others => '0'); + end if; valid_ra <= tlb_hit or not r0.req.virt_mode; if r0.req.virt_mode = '1' then ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & @@ -573,7 +583,7 @@ begin if rising_edge(clk) then tlbie := '0'; tlbia := '0'; - tlbwe := '0'; + tlbwe := r0_valid and r0.tlbld; if r0_valid = '1' and r0.tlbie = '1' then if r0.req.addr(11 downto 10) /= "00" then tlbia := '1'; @@ -607,7 +617,6 @@ begin dtlb_ptes(tlb_req_index) <= pteset; 
dtlb_valids(tlb_req_index)(repl_way) <= '1'; end if; - m_out.done <= r0_valid and r0.tlbie; end if; end process; @@ -669,7 +678,7 @@ begin req_tag <= get_tag(ra); -- Only do anything if not being stalled by stage 1 - go := r0_valid and not r0.tlbie; + go := r0_valid and not (r0.tlbie or r0.tlbld); -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed @@ -824,6 +833,11 @@ begin d_out.perm_error <= '0'; d_out.rc_error <= '0'; + -- Outputs to MMU + m_out.done <= r1.tlbie_done; + m_out.err <= '0'; + m_out.data <= cache_out(r1.hit_way); + -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store -- @@ -842,40 +856,65 @@ begin "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; - -- Load hit case is the standard path - if r1.hit_load_valid = '1' then - report "completing load hit"; - d_out.valid <= '1'; - end if; + if r1.mmu_req = '0' then + -- Request came from loadstore1... 
+ -- Load hit case is the standard path + if r1.hit_load_valid = '1' then + report "completing load hit"; + d_out.valid <= '1'; + end if; - -- error cases complete without stalling - if r1.error_done = '1' then - report "completing ld/st with error"; - d_out.error <= '1'; - d_out.tlb_miss <= r1.tlb_miss; - d_out.perm_error <= r1.perm_error; - d_out.rc_error <= r1.rc_error; - d_out.valid <= '1'; - end if; + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing ld/st with error"; + d_out.error <= '1'; + d_out.tlb_miss <= r1.tlb_miss; + d_out.perm_error <= r1.perm_error; + d_out.rc_error <= r1.rc_error; + d_out.valid <= '1'; + end if; - -- Slow ops (load miss, NC, stores) - if r1.slow_valid = '1' then - -- If it's a load, enable register writeback and switch - -- mux accordingly - -- - if r1.req.load then - -- Read data comes from the slow data latch - d_out.data <= r1.slow_data; - end if; - d_out.store_done <= '1'; + -- Slow ops (load miss, NC, stores) + if r1.slow_valid = '1' then + -- If it's a load, enable register writeback and switch + -- mux accordingly + -- + if r1.req.load then + -- Read data comes from the slow data latch + d_out.data <= r1.slow_data; + end if; + d_out.store_done <= '1'; - report "completing store or load miss"; - d_out.valid <= '1'; - end if; + report "completing store or load miss"; + d_out.valid <= '1'; + end if; + + if r1.stcx_fail = '1' then + d_out.store_done <= '0'; + d_out.valid <= '1'; + end if; + + else + -- Request came from MMU + if r1.hit_load_valid = '1' then + report "completing load hit to MMU, data=" & to_hstring(m_out.data); + m_out.done <= '1'; + end if; - if r1.stcx_fail = '1' then - d_out.store_done <= '0'; - d_out.valid <= '1'; + -- error cases complete without stalling + if r1.error_done = '1' then + report "completing MMU ld with error"; + m_out.err <= '1'; + m_out.done <= '1'; + end if; + + -- Slow ops (i.e. 
load miss) + if r1.slow_valid = '1' then + -- Read data comes from the slow data latch + m_out.data <= r1.slow_data; + report "completing MMU load miss, data=" & to_hstring(m_out.data); + m_out.done <= '1'; + end if; end if; end process; @@ -978,6 +1017,7 @@ begin if req_op /= OP_NONE and stall_out = '0' then r1.req <= r0.req; + r1.mmu_req <= r0.mmu_req; report "op:" & op_t'image(req_op) & " addr:" & to_hstring(r0.req.addr) & " nc:" & std_ulogic'image(r0.req.nc) & @@ -995,8 +1035,8 @@ begin end if; if req_op = OP_BAD then - report "Signalling ld/st error valid_ra=" & " rc_ok=" & std_ulogic'image(rc_ok) & - " perm_ok=" & std_ulogic'image(perm_ok); + report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & + " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); r1.error_done <= '1'; r1.tlb_miss <= not valid_ra; r1.perm_error <= valid_ra and not perm_ok; @@ -1005,8 +1045,8 @@ begin r1.error_done <= '0'; end if; - -- complete tlbies in the third cycle - r1.tlbie_done <= r0_valid and r0.tlbie; + -- complete tlbies and TLB loads in the third cycle + r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld); end if; end process; diff --git a/decode1.vhdl b/decode1.vhdl index fd799fe..b7212c2 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -438,7 +438,7 @@ begin v.decode.sgl_pipe := '1'; -- send MMU-related SPRs to loadstore1 case sprn is - when SPR_DAR | SPR_DSISR => + when SPR_DAR | SPR_DSISR | SPR_PGTBL0 => v.decode.unit := LDST; when others => end case; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index d5dd010..03aaa6f 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -153,6 +153,7 @@ begin variable next_addr : std_ulogic_vector(63 downto 0); variable mmureq : std_ulogic; variable dsisr : std_ulogic_vector(31 downto 0); + variable mmu_mtspr : std_ulogic; begin v := r; req := '0'; @@ -161,6 +162,8 @@ begin byte_sel := (others => '0'); addr := lsu_sum; mfspr := '0'; + mmu_mtspr := '0'; + sprn := 
std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); sprval := (others => '0'); -- avoid inferred latches exception := '0'; dsisr := (others => '0'); @@ -244,19 +247,27 @@ begin mfspr := '1'; -- partial decode on SPR number should be adequate given -- the restricted set that get sent down this path - sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); - if sprn(0) = '0' then - sprval := x"00000000" & r.dsisr; + if sprn(9) = '0' then + if sprn(0) = '0' then + sprval := x"00000000" & r.dsisr; + else + sprval := r.dar; + end if; else - sprval := r.dar; + -- reading one of the SPRs in the MMU + sprval := m_in.sprval; end if; when OP_MTSPR => done := '1'; - sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); - if sprn(0) = '0' then - v.dsisr := l_in.data(31 downto 0); + if sprn(9) = '0' then + if sprn(0) = '0' then + v.dsisr := l_in.data(31 downto 0); + else + v.dar := l_in.data; + end if; else - v.dar := l_in.data; + -- writing one of the SPRs in the MMU + mmu_mtspr := '1'; end if; when others => assert false report "unknown op sent to loadstore1"; @@ -361,7 +372,7 @@ begin byte_sel := r.first_bytes; end if; if m_in.done = '1' then - if m_in.error = '0' then + if m_in.invalid = '0' and m_in.badtree = '0' then -- retry the request now that the MMU has installed a TLB entry req := '1'; if r.state = MMU_LOOKUP_1ST then @@ -371,8 +382,9 @@ begin end if; else exception := '1'; - dsisr(63 - 33) := '1'; + dsisr(63 - 33) := m_in.invalid; dsisr(63 - 38) := not r.load; + dsisr(63 - 44) := m_in.badtree; v.state := IDLE; end if; end if; @@ -440,6 +452,8 @@ begin -- Update outputs to MMU m_out.valid <= mmureq; m_out.tlbie <= v.tlbie; + m_out.mtspr <= mmu_mtspr; + m_out.sprn <= sprn(3 downto 0); m_out.addr <= addr; m_out.rs <= l_in.data; diff --git a/mmu.vhdl b/mmu.vhdl index 2e6d0fd..fe6ad16 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -25,20 +25,37 @@ end mmu; architecture behave of mmu is type state_t is (IDLE, - TLBIE_WAIT, - RADIX_LOOKUP_0 + TLB_WAIT, + RADIX_LOOKUP, + 
RADIX_READ_WAIT, + RADIX_LOAD_TLB, + RADIX_NO_TRANS, + RADIX_BAD_TREE ); type reg_stage_t is record -- latched request from loadstore1 valid : std_ulogic; addr : std_ulogic_vector(63 downto 0); + -- internal state state : state_t; + pgtbl0 : std_ulogic_vector(63 downto 0); + shift : unsigned(5 downto 0); + mask_size : unsigned(4 downto 0); + pgbase : std_ulogic_vector(55 downto 0); + pde : std_ulogic_vector(63 downto 0); end record; signal r, rin : reg_stage_t; + signal addrsh : std_ulogic_vector(15 downto 0); + signal mask : std_ulogic_vector(15 downto 0); + signal finalmask : std_ulogic_vector(43 downto 0); + begin + -- Multiplex internal SPR values back to loadstore1, selected + -- by l_in.sprn. Easy when there's only one... + l_out.sprval <= r.pgtbl0; mmu_0: process(clk) begin @@ -46,64 +63,237 @@ begin if rst = '1' then r.state <= IDLE; r.valid <= '0'; + r.pgtbl0 <= (others => '0'); else if rin.valid = '1' then report "MMU got tlb miss for " & to_hstring(rin.addr); end if; if l_out.done = '1' then - report "MMU completing miss with error=" & std_ulogic'image(l_out.error); + report "MMU completing op with invalid=" & std_ulogic'image(l_out.invalid) & + " badtree=" & std_ulogic'image(l_out.badtree); + end if; + if rin.state = RADIX_LOOKUP then + report "radix lookup shift=" & integer'image(to_integer(rin.shift)) & + " msize=" & integer'image(to_integer(rin.mask_size)); + end if; + if r.state = RADIX_LOOKUP then + report "send load addr=" & to_hstring(d_out.addr) & + " addrsh=" & to_hstring(addrsh) & " mask=" & to_hstring(mask); end if; r <= rin; end if; end if; end process; + -- Shift address bits 61--12 right by 0--47 bits and + -- supply the least significant 16 bits of the result. 
+ addrshifter: process(all) + variable sh1 : std_ulogic_vector(30 downto 0); + variable sh2 : std_ulogic_vector(18 downto 0); + variable result : std_ulogic_vector(15 downto 0); + begin + case r.shift(5 downto 4) is + when "00" => + sh1 := r.addr(42 downto 12); + when "01" => + sh1 := r.addr(58 downto 28); + when others => + sh1 := "0000000000000" & r.addr(61 downto 44); + end case; + case r.shift(3 downto 2) is + when "00" => + sh2 := sh1(18 downto 0); + when "01" => + sh2 := sh1(22 downto 4); + when "10" => + sh2 := sh1(26 downto 8); + when others => + sh2 := sh1(30 downto 12); + end case; + case r.shift(1 downto 0) is + when "00" => + result := sh2(15 downto 0); + when "01" => + result := sh2(16 downto 1); + when "10" => + result := sh2(17 downto 2); + when others => + result := sh2(18 downto 3); + end case; + addrsh <= result; + end process; + + -- generate mask for extracting address fields for PTE address generation + addrmaskgen: process(all) + variable m : std_ulogic_vector(15 downto 0); + begin + -- mask_count has to be >= 5 + m := x"001f"; + for i in 5 to 15 loop + if i < to_integer(r.mask_size) then + m(i) := '1'; + end if; + end loop; + mask <= m; + end process; + + -- generate mask for extracting address bits to go in TLB entry + -- in order to support pages > 4kB + finalmaskgen: process(all) + variable m : std_ulogic_vector(43 downto 0); + begin + m := (others => '0'); + for i in 0 to 43 loop + if i < to_integer(r.shift) then + m(i) := '1'; + end if; + end loop; + finalmask <= m; + end process; + mmu_1: process(all) variable v : reg_stage_t; variable dcreq : std_ulogic; variable done : std_ulogic; - variable err : std_ulogic; + variable invalid : std_ulogic; + variable badtree : std_ulogic; + variable tlb_load : std_ulogic; + variable tlbie_req : std_ulogic; + variable rts : unsigned(5 downto 0); + variable mbits : unsigned(5 downto 0); + variable pgtable_addr : std_ulogic_vector(63 downto 0); + variable pte : std_ulogic_vector(63 downto 0); + 
variable data : std_ulogic_vector(63 downto 0); begin - v.valid := l_in.valid; - v.addr := l_in.addr; - v.state := r.state; + v := r; + v.valid := '0'; dcreq := '0'; done := '0'; - err := '0'; + invalid := '0'; + badtree := '0'; + tlb_load := '0'; + tlbie_req := '0'; + + -- Radix tree data structures in memory are big-endian, + -- so we need to byte-swap them + for i in 0 to 7 loop + data(i * 8 + 7 downto i * 8) := d_in.data((7 - i) * 8 + 7 downto (7 - i) * 8); + end loop; case r.state is when IDLE => + -- rts == radix tree size, # address bits being translated + rts := unsigned('0' & r.pgtbl0(62 downto 61) & r.pgtbl0(7 downto 5)) + (31 - 12); + -- mbits == # address bits to index top level of tree + mbits := unsigned('0' & r.pgtbl0(4 downto 0)); + v.shift := rts - mbits; + v.mask_size := mbits(4 downto 0); + v.pgbase := r.pgtbl0(55 downto 8) & x"00"; + if l_in.valid = '1' then + v.addr := l_in.addr; if l_in.tlbie = '1' then dcreq := '1'; - v.state := TLBIE_WAIT; + tlbie_req := '1'; + v.state := TLB_WAIT; else - v.state := RADIX_LOOKUP_0; + v.valid := '1'; + -- for now, take RPDS = 0 to disable radix translation + if mbits = 0 then + v.state := RADIX_NO_TRANS; + elsif mbits < 5 or mbits > 16 or mbits > rts then + v.state := RADIX_BAD_TREE; + else + v.state := RADIX_LOOKUP; + end if; end if; end if; + if l_in.mtspr = '1' then + v.pgtbl0 := l_in.rs; + end if; - when TLBIE_WAIT => + when TLB_WAIT => if d_in.done = '1' then done := '1'; v.state := IDLE; end if; - when RADIX_LOOKUP_0 => + when RADIX_LOOKUP => + dcreq := '1'; + v.state := RADIX_READ_WAIT; + + when RADIX_READ_WAIT => + if d_in.done = '1' then + if d_in.err = '0' then + v.pde := data; + -- test valid bit + if data(63) = '1' then + -- test leaf bit + if data(62) = '1' then + v.state := RADIX_LOAD_TLB; + else + mbits := unsigned('0' & data(4 downto 0)); + if mbits < 5 or mbits > 16 or mbits > r.shift then + v.state := RADIX_BAD_TREE; + else + v.shift := v.shift - mbits; + v.mask_size := mbits(4 downto 0); + 
v.pgbase := data(55 downto 8) & x"00"; + v.state := RADIX_LOOKUP; + end if; + end if; + else + -- non-present PTE, generate a DSI + v.state := RADIX_NO_TRANS; + end if; + else + v.state := RADIX_BAD_TREE; + end if; + end if; + + when RADIX_LOAD_TLB => + tlb_load := '1'; + dcreq := '1'; + v.state := TLB_WAIT; + + when RADIX_NO_TRANS => + done := '1'; + invalid := '1'; + v.state := IDLE; + + when RADIX_BAD_TREE => done := '1'; - err := '1'; + badtree := '1'; v.state := IDLE; end case; + pgtable_addr := x"00" & r.pgbase(55 downto 19) & + ((r.pgbase(18 downto 3) and not mask) or (addrsh and mask)) & + "000"; + pte := x"00" & + ((r.pde(55 downto 12) and not finalmask) or (r.addr(55 downto 12) and finalmask)) + & r.pde(11 downto 0); + -- update registers rin <= v; -- drive outputs l_out.done <= done; - l_out.error <= err; + l_out.invalid <= invalid; + l_out.badtree <= badtree; d_out.valid <= dcreq; - d_out.tlbie <= l_in.tlbie; - d_out.addr <= l_in.addr; - d_out.pte <= l_in.rs; + d_out.tlbie <= tlbie_req; + d_out.tlbld <= tlb_load; + if tlbie_req = '1' then + d_out.addr <= l_in.addr; + d_out.pte <= l_in.rs; + elsif tlb_load = '1' then + d_out.addr <= r.addr(63 downto 12) & x"000"; + d_out.pte <= pte; + else + d_out.addr <= pgtable_addr; + d_out.pte <= (others => '0'); + end if; end process; end; From f6a0d7f9daacd657c5bf5ea837a0f9e91bb36bce Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 23 Apr 2020 21:54:08 +1000 Subject: [PATCH 12/26] MMU: Implement data segment interrupts A data segment interrupt (DSegI) occurs when an address to be translated by the MMU is outside the range of the radix tree or the top two bits of the address (the quadrant) are 01 or 10. This is detected in a new state of the MMU state machine, and is sent back to loadstore1 as an error, which sends it on to execute1 to generate an interrupt to the 0x380 vector. 
Signed-off-by: Paul Mackerras --- common.vhdl | 2 ++ execute1.vhdl | 6 ++++- loadstore1.vhdl | 7 ++++-- mmu.vhdl | 64 ++++++++++++++++++++++++++++++------------------- 4 files changed, 52 insertions(+), 27 deletions(-) diff --git a/common.vhdl b/common.vhdl index d617fa4..07d1a36 100644 --- a/common.vhdl +++ b/common.vhdl @@ -242,6 +242,7 @@ package common is type Loadstore1ToExecute1Type is record exception : std_ulogic; + segment_fault : std_ulogic; end record; type Loadstore1ToDcacheType is record @@ -280,6 +281,7 @@ package common is done : std_ulogic; invalid : std_ulogic; badtree : std_ulogic; + segerr : std_ulogic; sprval : std_ulogic_vector(63 downto 0); end record; diff --git a/execute1.vhdl b/execute1.vhdl index 5e25efc..7181f7f 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -974,7 +974,11 @@ begin -- generate DSI for load/store exceptions if l_in.exception = '1' then - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#300#, 64)); + if l_in.segment_fault = '0' then + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#300#, 64)); + else + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#380#, 64)); + end if; ctrl_tmp.srr1 <= msr_copy(ctrl.msr); v.e.exc_write_enable := '1'; v.e.exc_write_reg := fast_spr_num(SPR_SRR0); diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 03aaa6f..a29564b 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -372,7 +372,7 @@ begin byte_sel := r.first_bytes; end if; if m_in.done = '1' then - if m_in.invalid = '0' and m_in.badtree = '0' then + if m_in.invalid = '0' and m_in.badtree = '0' and m_in.segerr = '0' then -- retry the request now that the MMU has installed a TLB entry req := '1'; if r.state = MMU_LOOKUP_1ST then @@ -480,9 +480,12 @@ begin -- update exception info back to execute1 e_out.exception <= exception; + e_out.segment_fault <= m_in.segerr; if exception = '1' then v.dar := addr; - v.dsisr := dsisr; + if m_in.segerr = '0' then + v.dsisr := dsisr; + end if; end if; stall_out <= stall; diff --git a/mmu.vhdl 
b/mmu.vhdl index fe6ad16..293b7a8 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -26,11 +26,11 @@ architecture behave of mmu is type state_t is (IDLE, TLB_WAIT, + SEGMENT_CHECK, RADIX_LOOKUP, RADIX_READ_WAIT, RADIX_LOAD_TLB, - RADIX_NO_TRANS, - RADIX_BAD_TREE + RADIX_ERROR ); type reg_stage_t is record @@ -44,6 +44,9 @@ architecture behave of mmu is mask_size : unsigned(4 downto 0); pgbase : std_ulogic_vector(55 downto 0); pde : std_ulogic_vector(63 downto 0); + invalid : std_ulogic; + badtree : std_ulogic; + segerror : std_ulogic; end record; signal r, rin : reg_stage_t; @@ -155,8 +158,6 @@ begin variable v : reg_stage_t; variable dcreq : std_ulogic; variable done : std_ulogic; - variable invalid : std_ulogic; - variable badtree : std_ulogic; variable tlb_load : std_ulogic; variable tlbie_req : std_ulogic; variable rts : unsigned(5 downto 0); @@ -164,13 +165,15 @@ begin variable pgtable_addr : std_ulogic_vector(63 downto 0); variable pte : std_ulogic_vector(63 downto 0); variable data : std_ulogic_vector(63 downto 0); + variable nonzero : std_ulogic; begin v := r; v.valid := '0'; dcreq := '0'; done := '0'; - invalid := '0'; - badtree := '0'; + v.invalid := '0'; + v.badtree := '0'; + v.segerror := '0'; tlb_load := '0'; tlbie_req := '0'; @@ -183,10 +186,11 @@ begin case r.state is when IDLE => -- rts == radix tree size, # address bits being translated - rts := unsigned('0' & r.pgtbl0(62 downto 61) & r.pgtbl0(7 downto 5)) + (31 - 12); + rts := unsigned('0' & r.pgtbl0(62 downto 61) & r.pgtbl0(7 downto 5)); -- mbits == # address bits to index top level of tree mbits := unsigned('0' & r.pgtbl0(4 downto 0)); - v.shift := rts - mbits; + -- set v.shift to rts so that we can use finalmask for the segment check + v.shift := rts; v.mask_size := mbits(4 downto 0); v.pgbase := r.pgtbl0(55 downto 8) & x"00"; @@ -198,13 +202,12 @@ begin v.state := TLB_WAIT; else v.valid := '1'; - -- for now, take RPDS = 0 to disable radix translation + -- Use RPDS = 0 to disable radix tree walks if 
mbits = 0 then - v.state := RADIX_NO_TRANS; - elsif mbits < 5 or mbits > 16 or mbits > rts then - v.state := RADIX_BAD_TREE; + v.state := RADIX_ERROR; + v.invalid := '1'; else - v.state := RADIX_LOOKUP; + v.state := SEGMENT_CHECK; end if; end if; end if; @@ -218,6 +221,20 @@ begin v.state := IDLE; end if; + when SEGMENT_CHECK => + mbits := '0' & r.mask_size; + v.shift := r.shift + (31 - 12) - mbits; + nonzero := or(r.addr(61 downto 31) and not finalmask(30 downto 0)); + if r.addr(63) /= r.addr(62) or nonzero = '1' then + v.state := RADIX_ERROR; + v.segerror := '1'; + elsif mbits < 5 or mbits > 16 or mbits > (r.shift + (31 - 12)) then + v.state := RADIX_ERROR; + v.badtree := '1'; + else + v.state := RADIX_LOOKUP; + end if; + when RADIX_LOOKUP => dcreq := '1'; v.state := RADIX_READ_WAIT; @@ -234,7 +251,8 @@ begin else mbits := unsigned('0' & data(4 downto 0)); if mbits < 5 or mbits > 16 or mbits > r.shift then - v.state := RADIX_BAD_TREE; + v.state := RADIX_ERROR; + v.badtree := '1'; else v.shift := v.shift - mbits; v.mask_size := mbits(4 downto 0); @@ -244,10 +262,12 @@ begin end if; else -- non-present PTE, generate a DSI - v.state := RADIX_NO_TRANS; + v.state := RADIX_ERROR; + v.invalid := '1'; end if; else - v.state := RADIX_BAD_TREE; + v.state := RADIX_ERROR; + v.badtree := '1'; end if; end if; @@ -256,15 +276,10 @@ begin dcreq := '1'; v.state := TLB_WAIT; - when RADIX_NO_TRANS => + when RADIX_ERROR => done := '1'; - invalid := '1'; v.state := IDLE; - when RADIX_BAD_TREE => - done := '1'; - badtree := '1'; - v.state := IDLE; end case; pgtable_addr := x"00" & r.pgbase(55 downto 19) & @@ -279,8 +294,9 @@ begin -- drive outputs l_out.done <= done; - l_out.invalid <= invalid; - l_out.badtree <= badtree; + l_out.invalid <= r.invalid; + l_out.badtree <= r.badtree; + l_out.segerr <= r.segerror; d_out.valid <= dcreq; d_out.tlbie <= tlbie_req; From 3eb07dc6370c3825394596657ac044c47f5b3cd2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 6 May 2020 20:21:01 +1000 
Subject: [PATCH 13/26] MMU: Refetch PTE on access fault This is required by the architecture. It means that the error bits reported in DSISR or SRR1 now come from the permission/RC check done on the refetched PTE rather than the TLB entry. Unfortunately that somewhat breaks the software-loaded TLB mode of operation in that DSISR/SRR1 always report no PTE rather than permission error or RC failure. This also restructures the loadstore1 state machine a bit, combining the FIRST_ACK_WAIT and LAST_ACK_WAIT states into a single state and the MMU_LOOKUP_1ST and MMU_LOOKUP_LAST states likewise. We now have a 'dwords_done' bit to say whether the first transfer of two (for an unaligned access) has been done. The cache paradox error (where a non-cacheable access finds a hit in the cache) is now the only cause of DSI from the dcache. This should probably be a machine check rather than DSI in fact. Signed-off-by: Paul Mackerras --- common.vhdl | 18 ++++---- dcache.vhdl | 29 ++++++------ loadstore1.vhdl | 114 ++++++++++++++++++++++-------------------------- mmu.vhdl | 27 +++++++++++- 4 files changed, 103 insertions(+), 85 deletions(-) diff --git a/common.vhdl b/common.vhdl index 07d1a36..424259b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -263,26 +263,28 @@ package common is data : std_ulogic_vector(63 downto 0); store_done : std_ulogic; error : std_ulogic; - tlb_miss : std_ulogic; - perm_error : std_ulogic; - rc_error : std_ulogic; + cache_paradox : std_ulogic; end record; type Loadstore1ToMmuType is record valid : std_ulogic; tlbie : std_ulogic; mtspr : std_ulogic; + load : std_ulogic; + priv : std_ulogic; sprn : std_ulogic_vector(3 downto 0); addr : std_ulogic_vector(63 downto 0); rs : std_ulogic_vector(63 downto 0); end record; type MmuToLoadstore1Type is record - done : std_ulogic; - invalid : std_ulogic; - badtree : std_ulogic; - segerr : std_ulogic; - sprval : std_ulogic_vector(63 downto 0); + done : std_ulogic; + invalid : std_ulogic; + badtree : std_ulogic; + segerr : 
std_ulogic; + perm_error : std_ulogic; + rc_error : std_ulogic; + sprval : std_ulogic_vector(63 downto 0); end record; type MmuToDcacheType is record diff --git a/dcache.vhdl b/dcache.vhdl index 96563a5..ed593e8 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -179,6 +179,7 @@ architecture rtl of dcache is OP_LOAD_MISS, -- Load missing cache OP_LOAD_NC, -- Non-cachable load OP_BAD, -- BAD: Cache hit on NC load/store + OP_TLB_ERR, -- TLB miss or protection/RC failure OP_STORE_HIT, -- Store hitting cache OP_STORE_MISS); -- Store missing cache @@ -244,9 +245,7 @@ architecture rtl of dcache is -- Signals to complete with error error_done : std_ulogic; - tlb_miss : std_ulogic; -- No entry found in TLB - perm_error : std_ulogic; -- Permissions don't allow access - rc_error : std_ulogic; -- Reference or change bit clear + cache_paradox : std_ulogic; -- completion signal for tlbie tlbie_done : std_ulogic; @@ -758,7 +757,7 @@ begin when others => op := OP_NONE; end case; else - op := OP_BAD; + op := OP_TLB_ERR; end if; end if; req_op <= op; @@ -829,9 +828,7 @@ begin d_out.data <= cache_out(r1.hit_way); d_out.store_done <= '0'; d_out.error <= '0'; - d_out.tlb_miss <= '0'; - d_out.perm_error <= '0'; - d_out.rc_error <= '0'; + d_out.cache_paradox <= '0'; -- Outputs to MMU m_out.done <= r1.tlbie_done; @@ -868,9 +865,7 @@ begin if r1.error_done = '1' then report "completing ld/st with error"; d_out.error <= '1'; - d_out.tlb_miss <= r1.tlb_miss; - d_out.perm_error <= r1.perm_error; - d_out.rc_error <= r1.rc_error; + d_out.cache_paradox <= r1.cache_paradox; d_out.valid <= '1'; end if; @@ -1034,15 +1029,18 @@ begin r1.hit_load_valid <= '0'; end if; - if req_op = OP_BAD then + if req_op = OP_TLB_ERR then report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); r1.error_done <= '1'; - r1.tlb_miss <= not valid_ra; - r1.perm_error <= valid_ra and not perm_ok; - r1.rc_error <= valid_ra and 
perm_ok and not rc_ok; + r1.cache_paradox <= '0'; + elsif req_op = OP_BAD then + report "Signalling cache paradox"; + r1.error_done <= '1'; + r1.cache_paradox <= '1'; else r1.error_done <= '0'; + r1.cache_paradox <= '0'; end if; -- complete tlbies and TLB loads in the third cycle @@ -1187,7 +1185,8 @@ begin -- OP_NONE and OP_BAD do nothing -- OP_BAD was handled above already when OP_NONE => - when OP_BAD => + when OP_BAD => + when OP_TLB_ERR => end case; when RELOAD_WAIT_ACK => diff --git a/loadstore1.vhdl b/loadstore1.vhdl index a29564b..c56346f 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -38,11 +38,10 @@ architecture behave of loadstore1 is -- State machine for unaligned loads/stores type state_t is (IDLE, -- ready for instruction SECOND_REQ, -- send 2nd request of unaligned xfer - FIRST_ACK_WAIT, -- waiting for 1st ack from dcache - LAST_ACK_WAIT, -- waiting for last ack from dcache + ACK_WAIT, -- waiting for ack from dcache LD_UPDATE, -- writing rA with computed addr on load - MMU_LOOKUP_1ST, -- waiting for MMU to look up translation - MMU_LOOKUP_LAST + MMU_LOOKUP, -- waiting for MMU to look up translation + TLBIE_WAIT -- waiting for MMU to finish doing a tlbie ); type reg_stage_t is record @@ -66,6 +65,7 @@ architecture behave of loadstore1 is virt_mode : std_ulogic; priv_mode : std_ulogic; state : state_t; + dwords_done : std_ulogic; first_bytes : std_ulogic_vector(7 downto 0); second_bytes : std_ulogic_vector(7 downto 0); dar : std_ulogic_vector(63 downto 0); @@ -230,6 +230,7 @@ begin v.load := '0'; v.dcbz := '0'; v.tlbie := '0'; + v.dwords_done := '0'; case l_in.op is when OP_STORE => req := '1'; @@ -241,7 +242,9 @@ begin v.dcbz := '1'; when OP_TLBIE => mmureq := '1'; + stall := '1'; v.tlbie := '1'; + v.state := TLBIE_WAIT; when OP_MFSPR => done := '1'; mfspr := '1'; @@ -318,15 +321,11 @@ begin if req = '1' then stall := '1'; if long_sel(15 downto 8) = "00000000" then - v.state := LAST_ACK_WAIT; + v.state := ACK_WAIT; else v.state := SECOND_REQ; 
end if; end if; - if mmureq = '1' then - stall := '1'; - v.state := LAST_ACK_WAIT; - end if; end if; when SECOND_REQ => @@ -334,37 +333,58 @@ begin byte_sel := r.second_bytes; req := '1'; stall := '1'; - v.state := FIRST_ACK_WAIT; + v.state := ACK_WAIT; - when FIRST_ACK_WAIT => + when ACK_WAIT => stall := '1'; if d_in.valid = '1' then if d_in.error = '1' then - -- dcache will discard the second request - addr := r.addr; - if d_in.tlb_miss = '1' then - -- give it to the MMU to look up - mmureq := '1'; - v.state := MMU_LOOKUP_1ST; + -- dcache will discard the second request if it + -- gets an error on the 1st of two requests + if r.dwords_done = '1' then + addr := next_addr; else + addr := r.addr; + end if; + if d_in.cache_paradox = '1' then -- signal an interrupt straight away exception := '1'; - dsisr(63 - 36) := d_in.perm_error; dsisr(63 - 38) := not r.load; - dsisr(63 - 45) := d_in.rc_error; + -- XXX there is no architected bit for this + dsisr(63 - 35) := d_in.cache_paradox; v.state := IDLE; + else + -- Look up the translation for TLB miss + -- and also for permission error and RC error + -- in case the PTE has been updated. 
+ mmureq := '1'; + v.state := MMU_LOOKUP; end if; else - v.state := LAST_ACK_WAIT; - if r.load = '1' then - v.load_data := data_permuted; + if two_dwords = '1' and r.dwords_done = '0' then + v.dwords_done := '1'; + if r.load = '1' then + v.load_data := data_permuted; + end if; + else + write_enable := r.load; + if r.load = '1' and r.update = '1' then + -- loads with rA update need an extra cycle + v.state := LD_UPDATE; + else + -- stores write back rA update in this cycle + do_update := r.update; + stall := '0'; + done := '1'; + v.state := IDLE; + end if; end if; end if; end if; - when MMU_LOOKUP_1ST | MMU_LOOKUP_LAST => + when MMU_LOOKUP => stall := '1'; - if two_dwords = '1' and r.state = MMU_LOOKUP_LAST then + if r.dwords_done = '1' then addr := next_addr; byte_sel := r.second_bytes; else @@ -372,58 +392,28 @@ begin byte_sel := r.first_bytes; end if; if m_in.done = '1' then - if m_in.invalid = '0' and m_in.badtree = '0' and m_in.segerr = '0' then + if m_in.invalid = '0' and m_in.perm_error = '0' and m_in.rc_error = '0' and + m_in.badtree = '0' and m_in.segerr = '0' then -- retry the request now that the MMU has installed a TLB entry req := '1'; - if r.state = MMU_LOOKUP_1ST then + if two_dwords = '1' and r.dwords_done = '0' then v.state := SECOND_REQ; else - v.state := LAST_ACK_WAIT; + v.state := ACK_WAIT; end if; else exception := '1'; dsisr(63 - 33) := m_in.invalid; + dsisr(63 - 36) := m_in.perm_error; dsisr(63 - 38) := not r.load; dsisr(63 - 44) := m_in.badtree; + dsisr(63 - 45) := m_in.rc_error; v.state := IDLE; end if; end if; - when LAST_ACK_WAIT => + when TLBIE_WAIT => stall := '1'; - if d_in.valid = '1' then - if d_in.error = '1' then - if two_dwords = '1' then - addr := next_addr; - else - addr := r.addr; - end if; - if d_in.tlb_miss = '1' then - -- give it to the MMU to look up - mmureq := '1'; - v.state := MMU_LOOKUP_LAST; - else - -- signal an interrupt straight away - exception := '1'; - dsisr(63 - 36) := d_in.perm_error; - dsisr(63 - 38) := not 
r.load; - dsisr(63 - 45) := d_in.rc_error; - v.state := IDLE; - end if; - else - write_enable := r.load; - if r.load = '1' and r.update = '1' then - -- loads with rA update need an extra cycle - v.state := LD_UPDATE; - else - -- stores write back rA update in this cycle - do_update := r.update; - stall := '0'; - done := '1'; - v.state := IDLE; - end if; - end if; - end if; if m_in.done = '1' then -- tlbie is finished stall := '0'; @@ -451,6 +441,8 @@ begin -- Update outputs to MMU m_out.valid <= mmureq; + m_out.load <= r.load; + m_out.priv <= r.priv_mode; m_out.tlbie <= v.tlbie; m_out.mtspr <= mmu_mtspr; m_out.sprn <= sprn(3 downto 0); diff --git a/mmu.vhdl b/mmu.vhdl index 293b7a8..3a1003c 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -36,6 +36,8 @@ architecture behave of mmu is type reg_stage_t is record -- latched request from loadstore1 valid : std_ulogic; + store : std_ulogic; + priv : std_ulogic; addr : std_ulogic_vector(63 downto 0); -- internal state state : state_t; @@ -47,6 +49,8 @@ architecture behave of mmu is invalid : std_ulogic; badtree : std_ulogic; segerror : std_ulogic; + perm_err : std_ulogic; + rc_error : std_ulogic; end record; signal r, rin : reg_stage_t; @@ -166,6 +170,8 @@ begin variable pte : std_ulogic_vector(63 downto 0); variable data : std_ulogic_vector(63 downto 0); variable nonzero : std_ulogic; + variable perm_ok : std_ulogic; + variable rc_ok : std_ulogic; begin v := r; v.valid := '0'; @@ -174,6 +180,8 @@ begin v.invalid := '0'; v.badtree := '0'; v.segerror := '0'; + v.perm_err := '0'; + v.rc_error := '0'; tlb_load := '0'; tlbie_req := '0'; @@ -196,6 +204,8 @@ begin if l_in.valid = '1' then v.addr := l_in.addr; + v.store := not l_in.load; + v.priv := l_in.priv; if l_in.tlbie = '1' then dcreq := '1'; tlbie_req := '1'; @@ -247,7 +257,20 @@ begin if data(63) = '1' then -- test leaf bit if data(62) = '1' then - v.state := RADIX_LOAD_TLB; + -- check permissions and RC bits + perm_ok := '0'; + if r.priv = '1' or data(3) = '0' then + perm_ok := 
data(1) or (data(2) and not r.store); + end if; + rc_ok := data(8) and (data(7) or not r.store); + if perm_ok = '1' and rc_ok = '1' then + v.state := RADIX_LOAD_TLB; + else + v.state := RADIX_ERROR; + v.perm_err := not perm_ok; + -- permission error takes precedence over RC error + v.rc_error := perm_ok; + end if; else mbits := unsigned('0' & data(4 downto 0)); if mbits < 5 or mbits > 16 or mbits > r.shift then @@ -297,6 +320,8 @@ begin l_out.invalid <= r.invalid; l_out.badtree <= r.badtree; l_out.segerr <= r.segerror; + l_out.perm_error <= r.perm_err; + l_out.rc_error <= r.rc_error; d_out.valid <= dcreq; d_out.tlbie <= tlbie_req; From dee3783d79c40d6a9e986e2d032ae0afad3064ff Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 7 May 2020 10:17:08 +1000 Subject: [PATCH 14/26] MMU: Remove software-loaded dTLB mode This removes the hack where the tlbie instruction could be used to load entries directly into the dTLB, because we don't report the correct DSISR values for accesses that hit software-loaded dTLB entries and have privilege or permission errors. Signed-off-by: Paul Mackerras --- dcache.vhdl | 2 -- 1 file changed, 2 deletions(-) diff --git a/dcache.vhdl b/dcache.vhdl index ed593e8..b75d91f 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -586,8 +586,6 @@ begin if r0_valid = '1' and r0.tlbie = '1' then if r0.req.addr(11 downto 10) /= "00" then tlbia := '1'; - elsif r0.req.addr(9) = '1' then - tlbwe := '1'; else tlbie := '1'; end if; From 882a5a0dc06add8f91b747e8032e044708a32318 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 23 Apr 2020 15:33:36 +1000 Subject: [PATCH 15/26] tests: Add a test for the MMU radix page table walks This adds tests to check that the MMU and dTLB are translating addresses and checking permissions correctly. We use a simple 2-level radix tree. The radix tree maps 2GB of address space and has a 1024-entry page directory pointing to 512-entry page table pages. 
Signed-off-by: Paul Mackerras --- tests/mmu/Makefile | 3 + tests/mmu/head.S | 112 +++++++++ tests/mmu/mmu.c | 468 +++++++++++++++++++++++++++++++++++++ tests/mmu/powerpc.lds | 13 ++ tests/test_mmu.bin | Bin 0 -> 12304 bytes tests/test_mmu.console_out | 10 + tests/update_console_tests | 2 +- 7 files changed, 607 insertions(+), 1 deletion(-) create mode 100644 tests/mmu/Makefile create mode 100644 tests/mmu/head.S create mode 100644 tests/mmu/mmu.c create mode 100644 tests/mmu/powerpc.lds create mode 100755 tests/test_mmu.bin create mode 100644 tests/test_mmu.console_out diff --git a/tests/mmu/Makefile b/tests/mmu/Makefile new file mode 100644 index 0000000..84f7ff2 --- /dev/null +++ b/tests/mmu/Makefile @@ -0,0 +1,3 @@ +TEST=mmu + +include ../Makefile.test diff --git a/tests/mmu/head.S b/tests/mmu/head.S new file mode 100644 index 0000000..3627cff --- /dev/null +++ b/tests/mmu/head.S @@ -0,0 +1,112 @@ +/* Copyright 2013-2014 IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define STACK_TOP 0x4000 + +/* Load an immediate 64-bit value into a register */ +#define LOAD_IMM64(r, e) \ + lis r,(e)@highest; \ + ori r,r,(e)@higher; \ + rldicr r,r, 32, 31; \ + oris r,r, (e)@h; \ + ori r,r, (e)@l; + + .section ".head","ax" + + /* + * Microwatt currently enters in LE mode at 0x0, so we don't need to + * do any endian fix ups + */ + . 
= 0 +.global _start +_start: + b boot_entry + +.global boot_entry +boot_entry: + /* setup stack */ + LOAD_IMM64(%r1, STACK_TOP - 0x100) + LOAD_IMM64(%r12, main) + mtctr %r12 + bctrl + attn // terminate on exit + b . + + /* Read a location with translation on */ + .globl test_read +test_read: + mfmsr %r9 + ori %r8,%r9,0x10 /* set MSR_DR */ + mtmsrd %r8,0 + mr %r6,%r3 + li %r3,0 + ld %r5,0(%r6) + li %r3,1 + /* land here if DSI occurred */ + mtmsrd %r9,0 + std %r5,0(%r4) + blr + + /* Write a location with translation on */ + .globl test_write +test_write: + mfmsr %r9 + ori %r8,%r9,0x10 /* set MSR_DR */ + mtmsrd %r8,0 + mr %r6,%r3 + li %r3,0 + std %r4,0(%r6) + li %r3,1 + /* land here if DSI occurred */ + mtmsrd %r9,0 + blr + +#define EXCEPTION(nr) \ + .= nr ;\ + attn + + /* DSI vector - skip the failing instruction + the next one */ + . = 0x300 + mtsprg0 %r10 + mfsrr0 %r10 + addi %r10,%r10,8 + mtsrr0 %r10 + rfid + + /* More exception stubs */ + EXCEPTION(0x380) + EXCEPTION(0x400) + EXCEPTION(0x480) + EXCEPTION(0x500) + EXCEPTION(0x600) + EXCEPTION(0x700) + EXCEPTION(0x800) + EXCEPTION(0x900) + EXCEPTION(0x980) + EXCEPTION(0xa00) + EXCEPTION(0xb00) + EXCEPTION(0xc00) + EXCEPTION(0xd00) + EXCEPTION(0xe00) + EXCEPTION(0xe20) + EXCEPTION(0xe40) + EXCEPTION(0xe60) + EXCEPTION(0xe80) + EXCEPTION(0xf00) + EXCEPTION(0xf20) + EXCEPTION(0xf40) + EXCEPTION(0xf60) + EXCEPTION(0xf80) diff --git a/tests/mmu/mmu.c b/tests/mmu/mmu.c new file mode 100644 index 0000000..0a717c7 --- /dev/null +++ b/tests/mmu/mmu.c @@ -0,0 +1,468 @@ +#include +#include +#include + +#include "console.h" + +extern int test_read(long *addr, long *ret, long init); +extern int test_write(long *addr, long val); + +static inline void do_tlbie(unsigned long rb, unsigned long rs) +{ + __asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory"); +} + +static inline unsigned long mfspr(int sprnum) +{ + long val; + + __asm__ volatile("mfspr %0,%1" : "=r" (val) : "i" (sprnum)); + return val; +} + +static inline 
void mtspr(int sprnum, unsigned long val) +{ + __asm__ volatile("mtspr %0,%1" : : "i" (sprnum), "r" (val)); +} + +static inline void store_pte(unsigned long *p, unsigned long pte) +{ + __asm__ volatile("stdbrx %1,0,%0" : : "r" (p), "r" (pte) : "memory"); +} + +void print_string(const char *str) +{ + for (; *str; ++str) + putchar(*str); +} + +void print_hex(unsigned long val) +{ + int i, x; + + for (i = 60; i >= 0; i -= 4) { + x = (val >> i) & 0xf; + if (x >= 10) + putchar(x + 'a' - 10); + else + putchar(x + '0'); + } +} + +// i < 100 +void print_test_number(int i) +{ + print_string("test "); + putchar(48 + i/10); + putchar(48 + i%10); + putchar(':'); +} + +#define CACHE_LINE_SIZE 64 + +void zero_memory(void *ptr, unsigned long nbytes) +{ + unsigned long nb, i, nl; + void *p; + + for (; nbytes != 0; nbytes -= nb, ptr += nb) { + nb = -((unsigned long)ptr) & (CACHE_LINE_SIZE - 1); + if (nb == 0 && nbytes >= CACHE_LINE_SIZE) { + nl = nbytes / CACHE_LINE_SIZE; + p = ptr; + for (i = 0; i < nl; ++i) { + __asm__ volatile("dcbz 0,%0" : : "r" (p) : "memory"); + p += CACHE_LINE_SIZE; + } + nb = nl * CACHE_LINE_SIZE; + } else { + if (nb > nbytes) + nb = nbytes; + for (i = 0; i < nb; ++i) + ((unsigned char *)ptr)[i] = 0; + } + } +} + +#define PERM_EX 0x001 +#define PERM_WR 0x002 +#define PERM_RD 0x004 +#define PERM_PRIV 0x008 +#define ATTR_NC 0x020 +#define CHG 0x080 +#define REF 0x100 + +#define DFLT_PERM (PERM_WR | PERM_RD | REF | CHG) + +/* + * Set up an MMU translation tree using memory starting at the 64k point. + * We use 2 levels, mapping 2GB (the minimum size possible), with a + * 8kB PGD level pointing to 4kB PTE pages. 
+ */ +unsigned long *pgdir = (unsigned long *) 0x10000; +unsigned long free_ptr = 0x12000; +void *eas_mapped[4]; +int neas_mapped; + +void init_mmu(void) +{ + zero_memory(pgdir, 1024 * sizeof(unsigned long)); + /* RTS = 0 (2GB address space), RPDS = 10 (1024-entry top level) */ + mtspr(720, (unsigned long) pgdir | 10); + do_tlbie(0xc00, 0); /* invalidate all TLB entries */ +} + +static unsigned long *read_pgd(unsigned long i) +{ + unsigned long ret; + + __asm__ volatile("ldbrx %0,%1,%2" : "=r" (ret) : "b" (pgdir), + "r" (i * sizeof(unsigned long))); + return (unsigned long *) (ret & 0x00ffffffffffff00); +} + +void map(void *ea, void *pa, unsigned long perm_attr) +{ + unsigned long epn = (unsigned long) ea >> 12; + unsigned long i, j; + unsigned long *ptep; + + i = (epn >> 9) & 0x3ff; + j = epn & 0x1ff; + if (pgdir[i] == 0) { + zero_memory((void *)free_ptr, 512 * sizeof(unsigned long)); + store_pte(&pgdir[i], 0x8000000000000000 | free_ptr | 9); + free_ptr += 512 * sizeof(unsigned long); + } + ptep = read_pgd(i); + store_pte(&ptep[j], 0xc000000000000000 | ((unsigned long)pa & 0x00fffffffffff000) | perm_attr); + eas_mapped[neas_mapped++] = ea; +} + +void unmap(void *ea) +{ + unsigned long epn = (unsigned long) ea >> 12; + unsigned long i, j; + unsigned long *ptep; + + i = (epn >> 9) & 0x3ff; + j = epn & 0x1ff; + if (pgdir[i] == 0) + return; + ptep = read_pgd(i); + ptep[j] = 0; + do_tlbie(((unsigned long)ea & ~0xfff), 0); +} + +void unmap_all(void) +{ + int i; + + for (i = 0; i < neas_mapped; ++i) + unmap(eas_mapped[i]); + neas_mapped = 0; +} + +int mmu_test_1(void) +{ + long *ptr = (long *) 0x123000; + long val; + + /* this should fail */ + if (test_read(ptr, &val, 0xdeadbeefd00d)) + return 1; + /* dest reg of load should be unchanged */ + if (val != 0xdeadbeefd00d) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(19) != (long) ptr || mfspr(18) != 0x40000000) + return 3; + return 0; +} + +int mmu_test_2(void) +{ + long *mem = (long *) 0x4000; + 
long *ptr = (long *) 0x124000; + long *ptr2 = (long *) 0x1124000; + long val; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* initialize the memory content */ + mem[33] = 0xbadc0ffee; + /* this should succeed and be a cache miss */ + if (!test_read(&ptr[33], &val, 0xdeadbeefd00d)) + return 1; + /* dest reg of load should have the value written */ + if (val != 0xbadc0ffee) + return 2; + /* load a second TLB entry in the same set as the first */ + map(ptr2, mem, DFLT_PERM); + /* this should succeed and be a cache hit */ + if (!test_read(&ptr2[33], &val, 0xdeadbeefd00d)) + return 3; + /* dest reg of load should have the value written */ + if (val != 0xbadc0ffee) + return 4; + /* check that the first entry still works */ + if (!test_read(&ptr[33], &val, 0xdeadbeefd00d)) + return 5; + if (val != 0xbadc0ffee) + return 6; + return 0; +} + +int mmu_test_3(void) +{ + long *mem = (long *) 0x5000; + long *ptr = (long *) 0x149000; + long val; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* initialize the memory content */ + mem[45] = 0xfee1800d4ea; + /* this should succeed and be a cache miss */ + if (!test_read(&ptr[45], &val, 0xdeadbeefd0d0)) + return 1; + /* dest reg of load should have the value written */ + if (val != 0xfee1800d4ea) + return 2; + /* remove the PTE */ + unmap(ptr); + /* this should fail */ + if (test_read(&ptr[45], &val, 0xdeadbeefd0d0)) + return 3; + /* dest reg of load should be unchanged */ + if (val != 0xdeadbeefd0d0) + return 4; + /* DAR and DSISR should be set correctly */ + if (mfspr(19) != (long) &ptr[45] || mfspr(18) != 0x40000000) + return 5; + return 0; +} + +int mmu_test_4(void) +{ + long *mem = (long *) 0x6000; + long *ptr = (long *) 0x10a000; + long *ptr2 = (long *) 0x110a000; + long val; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* initialize the memory content */ + mem[27] = 0xf00f00f00f00; + /* this should succeed and be a cache miss */ + if (!test_write(&ptr[27], 0xe44badc0ffee)) + return 1; + /* memory should now 
have the value written */ + if (mem[27] != 0xe44badc0ffee) + return 2; + /* load a second TLB entry in the same set as the first */ + map(ptr2, mem, DFLT_PERM); + /* this should succeed and be a cache hit */ + if (!test_write(&ptr2[27], 0x6e11ae)) + return 3; + /* memory should have the value written */ + if (mem[27] != 0x6e11ae) + return 4; + /* check that the first entry still exists */ + /* (assumes TLB is 2-way associative or more) */ + if (!test_read(&ptr[27], &val, 0xdeadbeefd00d)) + return 5; + if (val != 0x6e11ae) + return 6; + return 0; +} + +int mmu_test_5(void) +{ + long *mem = (long *) 0x7ffd; + long *ptr = (long *) 0x39fffd; + long val; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* this should fail */ + if (test_read(ptr, &val, 0xdeadbeef0dd0)) + return 1; + /* dest reg of load should be unchanged */ + if (val != 0xdeadbeef0dd0) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(19) != ((long)ptr & ~0xfff) + 0x1000 || mfspr(18) != 0x40000000) + return 3; + return 0; +} + +int mmu_test_6(void) +{ + long *mem = (long *) 0x7ffd; + long *ptr = (long *) 0x39fffd; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* initialize memory */ + *mem = 0x123456789abcdef0; + /* this should fail */ + if (test_write(ptr, 0xdeadbeef0dd0)) + return 1; + /* DAR and DSISR should be set correctly */ + if (mfspr(19) != ((long)ptr & ~0xfff) + 0x1000 || mfspr(18) != 0x42000000) + return 2; + return 0; +} + +int mmu_test_7(void) +{ + long *mem = (long *) 0x4000; + long *ptr = (long *) 0x124000; + long val; + + *mem = 0x123456789abcdef0; + /* create PTE without R or C */ + map(ptr, mem, PERM_RD | PERM_WR); + /* this should fail */ + if (test_read(ptr, &val, 0xdeadd00dbeef)) + return 1; + /* dest reg of load should be unchanged */ + if (val != 0xdeadd00dbeef) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(19) != (long) ptr || mfspr(18) != 0x00040000) + return 3; + /* this should fail */ + if (test_write(ptr, 0xdeadbeef0dd0)) + 
return 4; + /* DAR and DSISR should be set correctly */ + if (mfspr(19) != (long)ptr || mfspr(18) != 0x02040000) + return 5; + /* memory should be unchanged */ + if (*mem != 0x123456789abcdef0) + return 6; + return 0; +} + +int mmu_test_8(void) +{ + long *mem = (long *) 0x4000; + long *ptr = (long *) 0x124000; + long val; + + *mem = 0x123456789abcdef0; + /* create PTE with R but not C */ + map(ptr, mem, REF | PERM_RD | PERM_WR); + /* this should succeed */ + if (!test_read(ptr, &val, 0xdeadd00dbeef)) + return 1; + /* this should fail */ + if (test_write(ptr, 0xdeadbeef0dd1)) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(19) != (long)ptr || mfspr(18) != 0x02040000) + return 3; + /* memory should be unchanged */ + if (*mem != 0x123456789abcdef0) + return 4; + return 0; +} + +int mmu_test_9(void) +{ + long *mem = (long *) 0x4000; + long *ptr = (long *) 0x124000; + long val; + + *mem = 0x123456789abcdef0; + /* create PTE without read or write permission */ + map(ptr, mem, REF); + /* this should fail */ + if (test_read(ptr, &val, 0xdeadd00dbeef)) + return 1; + /* dest reg of load should be unchanged */ + if (val != 0xdeadd00dbeef) + return 2; + /* DAR and DSISR should be set correctly */ + if (mfspr(19) != (long) ptr || mfspr(18) != 0x08000000) + return 3; + /* this should fail */ + if (test_write(ptr, 0xdeadbeef0dd1)) + return 4; + /* DAR and DSISR should be set correctly */ + if (mfspr(19) != (long)ptr || mfspr(18) != 0x0a000000) + return 5; + /* memory should be unchanged */ + if (*mem != 0x123456789abcdef0) + return 6; + return 0; +} + +int mmu_test_10(void) +{ + long *mem = (long *) 0x4000; + long *ptr = (long *) 0x124000; + long val; + + *mem = 0x123456789abcdef0; + /* create PTE with read but not write permission */ + map(ptr, mem, REF | PERM_RD); + /* this should succeed */ + if (!test_read(ptr, &val, 0xdeadd00dbeef)) + return 1; + /* this should fail */ + if (test_write(ptr, 0xdeadbeef0dd1)) + return 2; + /* DAR and DSISR should be set 
correctly */ + if (mfspr(19) != (long)ptr || mfspr(18) != 0x0a000000) + return 3; + /* memory should be unchanged */ + if (*mem != 0x123456789abcdef0) + return 4; + return 0; +} + +int fail = 0; + +void do_test(int num, int (*test)(void)) +{ + int ret; + + mtspr(18, 0); + mtspr(19, 0); + unmap_all(); + print_test_number(num); + ret = test(); + if (ret == 0) { + print_string("PASS\r\n"); + } else { + fail = 1; + print_string("FAIL "); + putchar(ret + '0'); + print_string(" DAR="); + print_hex(mfspr(19)); + print_string(" DSISR="); + print_hex(mfspr(18)); + print_string("\r\n"); + } +} + +int main(void) +{ + potato_uart_init(); + init_mmu(); + + do_test(1, mmu_test_1); + do_test(2, mmu_test_2); + do_test(3, mmu_test_3); + do_test(4, mmu_test_4); + do_test(5, mmu_test_5); + do_test(6, mmu_test_6); + do_test(7, mmu_test_7); + do_test(8, mmu_test_8); + do_test(9, mmu_test_9); + do_test(10, mmu_test_10); + + return fail; +} diff --git a/tests/mmu/powerpc.lds b/tests/mmu/powerpc.lds new file mode 100644 index 0000000..c4bff13 --- /dev/null +++ b/tests/mmu/powerpc.lds @@ -0,0 +1,13 @@ +SECTIONS +{ + _start = .; + . = 0; + .head : { + KEEP(*(.head)) + } + . = 0x1000; + .text : { *(.text) } + . 
= 0x3000; + .data : { *(.data) } + .bss : { *(.bss) } +} diff --git a/tests/test_mmu.bin b/tests/test_mmu.bin new file mode 100755 index 0000000000000000000000000000000000000000..961e2df8a24786b52775933f1f9573d2beea9c7e GIT binary patch literal 12304 zcmeHMeQaCR6+h2T9Vet|sPacdC-sG7QtaiA465mwCiQcYLO$B!eozY9$1_ez+I6fQ zo#ZU41 zABo*IDI0@G!%uSVyXT&J?m54E&OP_Mln`|g$<0Ky+GuHQ3~dhuYc--o3(>yFXz9Lw zv|F2&?~A0|dyU$X#5!^jKOHHOjVfx0^hjjHmOZQ!X$nzlQbOA!$p+EktW1e@H|BdH z>+ms;DY5vch``s3K&mZlq+AagUg`>@+z%S{L~FiQ6|7kLT9+(Xthjyw9N)OkLht`V z175U$p?6?m*I(%U|E90Mu)9z8U3Q_hSh)2U+W!{La%<(H{qyg@&G7lE$LH7X%a4B5 z`hQvZU*5ubC5(ws?wp9^#>Jl8r1(v4TAa#d#ap>5lFZs2<5L))!uS+uli-<3xxG60 zrocDFwL8D_825RMX%eZ;^t)Ci zUBcf=NSl~Pj)Pxp*<;Xmd`8~)uDlXoRqDS_F7@@7QF&t%ZBr^}k9t4(0;@cA_m$bQ zuZUpaiYRM5UK%vc2y5-3&-3(#KjrJ4zn%~eHJ4*eWmi;j?D#fx{d?~7coW5yauUl| zd7_)1tF-J0PsjBpiYiRoYt!xmPKN!?cGq9CHtY`&)i=sCxOR-%j39OGQh`yB>Khc; z)8pA2i!Ix(j|g92@Ks;G1p88MpYh)Pq^qd0vw~WgCoJ#PtGlB5ap-AXOUfqlN zv5lREL>YWXQZX;1pA%~9vGs1yx&nFoXKrIY4!keyXR+>}L2^RzCLWq}Fj$KZT`{?KGlg$x_iKu?!yu#N3k^YFdKFIX}{;>kRJQaOS=D@ z&v<)S$9FSdV>@tQeyU0+g!TVsVQ0>#L~Di;)G}J?#r}IE)vx=ESX-CTuvRq)@~O%f zKzrF)ui*`!G0{3PFwy4gcb!hg>X#oJh)pGRA=H5Dw6DK%y1l>pZ6D?*@ZF8Hszhrb z3|m~}&q!1js9)~uCo131V>KA_cm6fL0ydz}kCgD-*pF_Zks!zD$kBB4328_n6}G5R zZQ_^df%ipvCRX2tZQc;$IHVj+4{JP!=#|`P>2Qo=_~Oao9&aeBEgl=zUYj!4)g)s6 zy2j&(QJLPFa&dn;s2>JyOC_y2f4#>|?1mJkvwW7xvRNj}X1RN8xo%tT_f5H{rb@V< zU8S!v_X9GZ+gvhQY#EDe88xPiP=-w1{P4-akO%L4j>Nj_jOtt-hI7PX71lFt(SY4h zR6D1QuOu2-**!6f-)L!i02uc&rq7!GCkJ@ml4)NgcInUYQJfR!Fn5}KTWN_eLKM9G z#|wox{ua6s{U~GM=+_Uz-YN%2d+ip;euNUuOh0yOQe&CBY&i~YJ*FHn^$1Z5V|VOU zZ-ms~41TOvK7K62*@LSYOQ##+m%SdpvzO<_FF#KNuqNjT#_?)Ap&W`Og)`!1_{r6{9GFJD z5+xMoBkFuMjsDSe65raBG1gZC?F48SPbPcD7AG|q`UlW=qmAK=DnYv$?IN@*(0b4! 
z{{>3X9+|YTXImTRvafYEFS0F8UIZ?jBPN}E&4&UWDZzheHg|LEco5^xIdJ|{(Y<2n zNRZT_mF8KvIUVKP&i>5#{ltNEFYnuQIvquB??sNs`1#xmpNVdk)`pgy+;4CZT9`) zQJu1Ny5$d$;j&~jm@gu4(g_Td#v$Ag@g zW1C^SIs$CO0NZ}zfavwWpQnKn|1RAtMI4CZGN_&^Ha*T5%(vynPc2P2M#~Peapgu?RL(-nol?$EF5@03UIjQ zEM$8LwmN<@{PBP1H%9h)zuEh4AqM}c-;j5HzhT_g+H1_^H&lLezZv`Jf9E$P^Z3mJ z@K4UejCuY!o$ojIO%!5wGrw7D^WV&G3URox-;7`SVt#{oGVe;+-F=(+pHRXFA8D6Q?^r zob@{!6e)2{d^))B*$p1tA5YKJDnzUHP__)U)gIh0APR*Fcdh?sM6c`wk=JzSG!+xu8vc7k&bIriMZov;NnN z_sRZcanzK9D%UqEY8$qba~I^A^`;8Fth*&?%+moLC2{+HALv~7aD8(`vT7b44`X~W z#!{|F^|ff3wgt4yA297_dcdY>pgo;$msEJxvpt;aUo-!QPFg?o7h&`7&Qdxw# z+~>S|C7M(|22R}P`taO|mai zgSt;9U%!ePZj60sAn)1Q7xem3PZ&y|fs$Tfgg>_M#_hYV@wdS^Fs<7UL zyZ!jTtTmVY@WQ>@&2J0fnR#v$>(#gaDISB*@cwWuucF4Nfd8-`Z!*6v#Gho}eFpne zP|L$_xpsy6%CQ#{E<*2bMBv66fe8CKDt{*8xC#UxVW6??oj6%?#u0=4Gfyx zzOi#7-gt~3*keA@FE&s~EF>+fkffS zLA8w`6~FLBA@~D?&DU1h1itn{zCH}^1m8~PTMc>7yADb1Cg;5@FzpGM@}9!*i{OK? zvwU}(d|`)=b$$`Q3K)-%A?73F?-~4(plx9auW?9PZO-hlRBi%!8ow98GwJYz4tRdO z=wQinZdM3-w^oLG@txt#^!Ydb@wz9(Hr6A)c6=a2JK7oDyG%6jNuqT|_ca)Q@ Date: Mon, 27 Apr 2020 17:43:19 +1000 Subject: [PATCH 16/26] Add TLB to icache This adds a direct-mapped TLB to the icache, with 64 entries by default. Execute1 now sends a "virt_mode" signal from MSR[IR] to fetch1 along with redirects to indicate whether instruction addresses should be translated through the TLB, and fetch1 sends that on to icache. Similarly a "priv_mode" signal is sent to indicate the privilege mode for instruction fetches. This means that changes to MSR[IR] or MSR[PR] don't take effect until the next redirect, meaning an isync, rfid, branch, etc. The icache uses a hash of the effective address (i.e. next instruction address) to index the TLB. The hash is an XOR of three fields of the address; with a 64-entry TLB, the fields are bits 12--17, 18--23 and 24--29 of the address. TLB invalidations simply invalidate the indexed TLB entry without checking the contents. 
If the icache detects a TLB miss with virt_mode=1, it will send a fetch_failed indication through fetch2 to decode1, which will turn it into a special OP_FETCH_FAILED opcode with unit=LDST. That will get sent down to loadstore1 which will currently just raise an Instruction Storage Interrupt (0x400) exception. One bit in the PTE obtained from the TLB is used to check whether an instruction access is allowed -- the privilege bit (bit 3). If bit 3 is 1 and priv_mode=0, then a fetch_failed indication is sent down to fetch2 and to decode1, which generates an OP_FETCH_FAILED. Any PTEs with PTE bit 0 (EAA[3]) clear or bit 8 (R) clear should not be put into the iTLB since such PTEs would not allow execution by any context. Tlbie operations get sent from mmu to icache over a new connection. Unfortunately the privileged instruction tests are broken for now. Signed-off-by: Paul Mackerras --- common.vhdl | 22 +++++- core.vhdl | 5 +- decode1.vhdl | 17 +++-- decode_types.vhdl | 3 +- execute1.vhdl | 29 ++++++-- fetch1.vhdl | 6 ++ fetch2.vhdl | 4 ++ icache.vhdl | 175 +++++++++++++++++++++++++++++++++++++++------- icache_tb.vhdl | 8 +++ loadstore1.vhdl | 22 +++++- mmu.vhdl | 10 ++- 11 files changed, 260 insertions(+), 41 deletions(-) diff --git a/common.vhdl b/common.vhdl index 424259b..ba8aab3 100644 --- a/common.vhdl +++ b/common.vhdl @@ -89,6 +89,8 @@ package common is type Fetch1ToIcacheType is record req: std_ulogic; + virt_mode : std_ulogic; + priv_mode : std_ulogic; stop_mark: std_ulogic; nia: std_ulogic_vector(63 downto 0); end record; @@ -96,6 +98,7 @@ package common is type IcacheToFetch2Type is record valid: std_ulogic; stop_mark: std_ulogic; + fetch_failed: std_ulogic; nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); end record; @@ -103,10 +106,12 @@ package common is type Fetch2ToDecode1Type is record valid: std_ulogic; stop_mark : std_ulogic; + fetch_failed: std_ulogic; nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto
0); end record; - constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', others => (others => '0')); + constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', fetch_failed => '0', + others => (others => '0')); type Decode1ToDecode2Type is record valid: std_ulogic; @@ -211,13 +216,17 @@ package common is type Execute1ToFetch1Type is record redirect: std_ulogic; + virt_mode: std_ulogic; + priv_mode: std_ulogic; redirect_nia: std_ulogic_vector(63 downto 0); end record; - constant Execute1ToFetch1TypeInit : Execute1ToFetch1Type := (redirect => '0', others => (others => '0')); + constant Execute1ToFetch1TypeInit : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0', + priv_mode => '0', others => (others => '0')); type Execute1ToLoadstore1Type is record valid : std_ulogic; op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do + nia : std_ulogic_vector(63 downto 0); addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read @@ -243,6 +252,7 @@ package common is type Loadstore1ToExecute1Type is record exception : std_ulogic; segment_fault : std_ulogic; + instr_fault : std_ulogic; end record; type Loadstore1ToDcacheType is record @@ -270,6 +280,7 @@ package common is valid : std_ulogic; tlbie : std_ulogic; mtspr : std_ulogic; + iside : std_ulogic; load : std_ulogic; priv : std_ulogic; sprn : std_ulogic_vector(3 downto 0); @@ -302,6 +313,13 @@ package common is data : std_ulogic_vector(63 downto 0); end record; + type MmuToIcacheType is record + tlbld : std_ulogic; + tlbie : std_ulogic; + addr : std_ulogic_vector(63 downto 0); + pte : std_ulogic_vector(63 downto 0); + end record; + type Loadstore1ToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; diff --git a/core.vhdl b/core.vhdl index c870404..05fb328 100644 --- a/core.vhdl +++ b/core.vhdl @@ -42,6 +42,7 @@ architecture behave of core 
is -- icache signals signal fetch1_to_icache : Fetch1ToIcacheType; signal icache_to_fetch2 : IcacheToFetch2Type; + signal mmu_to_icache : MmuToIcacheType; -- decode signals signal decode1_to_decode2: Decode1ToDecode2Type; @@ -164,6 +165,7 @@ begin rst => icache_rst, i_in => fetch1_to_icache, i_out => icache_to_fetch2, + m_in => mmu_to_icache, flush_in => flush, stall_out => icache_stall_out, wishbone_out => wishbone_insn_out, @@ -288,7 +290,8 @@ begin l_in => loadstore1_to_mmu, l_out => mmu_to_loadstore1, d_out => mmu_to_dcache, - d_in => dcache_to_mmu + d_in => dcache_to_mmu, + i_out => mmu_to_icache ); dcache_0: entity work.dcache diff --git a/decode1.vhdl b/decode1.vhdl index b7212c2..598e59c 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -345,9 +345,10 @@ architecture behaviour of decode1 is others => decode_rom_init ); - -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl - -- op in out A out in out len ext pipe - constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); + -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl + -- op in out A out in out len ext pipe + constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); + constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); begin decode1_0: process(clk) @@ -380,7 +381,15 @@ begin end if; majorop := unsigned(f_in.insn(31 downto 26)); - if majorop = "011111" then + if f_in.fetch_failed = '1' then + v.valid := '1'; + -- Only send down a single OP_FETCH_FAILED + if r.decode.insn_type = OP_FETCH_FAILED then + v.valid := '0'; + end if; + v.decode := fetch_fail_inst; + + elsif majorop = "011111" then -- major 
opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); diff --git a/decode_types.vhdl b/decode_types.vhdl index ef51bd0..8f000a0 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -17,7 +17,8 @@ package decode_types is OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB, OP_SHL, OP_SHR, OP_SYNC, OP_TLBIE, OP_TRAP, - OP_XOR + OP_XOR, + OP_FETCH_FAILED ); type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR); diff --git a/execute1.vhdl b/execute1.vhdl index 7181f7f..71c79ee 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -430,6 +430,9 @@ begin icache_inval <= '0'; stall_out <= '0'; f_out <= Execute1ToFetch1TypeInit; + -- send MSR[IR] and ~MSR[PR] up to fetch1 + f_out.virt_mode <= ctrl.msr(MSR_IR); + f_out.priv_mode <= not ctrl.msr(MSR_PR); -- Next insn adder used in a couple of places next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4); @@ -460,6 +463,8 @@ begin ctrl_tmp.msr(MSR_RI) <= '0'; ctrl_tmp.msr(MSR_LE) <= '1'; f_out.redirect <= '1'; + f_out.virt_mode <= '0'; + f_out.priv_mode <= '1'; f_out.redirect_nia <= ctrl.irq_nia; v.e.valid := e_in.valid; report "Writing SRR1: " & to_hstring(ctrl.srr1); @@ -651,6 +656,8 @@ begin when OP_RFID => f_out.redirect <= '1'; + f_out.virt_mode <= b_in(MSR_IR) or b_in(MSR_PR); + f_out.priv_mode <= not b_in(MSR_PR); f_out.redirect_nia <= a_in(63 downto 2) & "00"; -- srr0 -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. 
@@ -972,23 +979,35 @@ begin v.e.write_data := result; v.e.write_enable := result_en; - -- generate DSI for load/store exceptions + -- generate DSI or DSegI for load/store exceptions + -- or ISI or ISegI for instruction fetch exceptions if l_in.exception = '1' then - if l_in.segment_fault = '0' then - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#300#, 64)); + ctrl_tmp.srr1 <= msr_copy(ctrl.msr); + if l_in.instr_fault = '0' then + if l_in.segment_fault = '0' then + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#300#, 64)); + else + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#380#, 64)); + end if; else - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#380#, 64)); + if l_in.segment_fault = '0' then + ctrl_tmp.srr1(63 - 33) <= '1'; + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#400#, 64)); + else + ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#480#, 64)); + end if; end if; - ctrl_tmp.srr1 <= msr_copy(ctrl.msr); v.e.exc_write_enable := '1'; v.e.exc_write_reg := fast_spr_num(SPR_SRR0); v.e.exc_write_data := r.ldst_nia; + report "ldst exception writing srr0=" & to_hstring(r.ldst_nia); ctrl_tmp.irq_state <= WRITE_SRR1; v.e.valid := '1'; -- complete the original load or store end if; -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; + lv.nia := e_in.nia; lv.addr1 := a_in; lv.addr2 := b_in; lv.data := c_in; diff --git a/fetch1.vhdl b/fetch1.vhdl index 9cd5445..936e830 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -40,6 +40,8 @@ begin if rising_edge(clk) then if r /= r_next then report "fetch1 rst:" & std_ulogic'image(rst) & + " IR:" & std_ulogic'image(e_in.virt_mode) & + " P:" & std_ulogic'image(e_in.priv_mode) & " R:" & std_ulogic'image(e_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & @@ -61,9 +63,13 @@ begin if rst = '1' then v.nia := RESET_ADDRESS; + v.virt_mode := '0'; + v.priv_mode := '1'; v_int.stop_state := RUNNING; elsif e_in.redirect = '1' then v.nia := e_in.redirect_nia; + 
v.virt_mode := e_in.virt_mode; + v.priv_mode := e_in.priv_mode; elsif stall_in = '0' then -- For debug stop/step to work properly we need a little bit of diff --git a/fetch2.vhdl b/fetch2.vhdl index 99f92ee..cc0727d 100644 --- a/fetch2.vhdl +++ b/fetch2.vhdl @@ -46,6 +46,7 @@ begin " F:" & std_ulogic'image(flush_in) & " T:" & std_ulogic'image(rin.stop_mark) & " V:" & std_ulogic'image(rin.valid) & + " FF:" & std_ulogic'image(rin.fetch_failed) & " nia:" & to_hstring(rin.nia); end if; @@ -84,6 +85,7 @@ begin v.valid := v_i_in.valid; v.stop_mark := v_i_in.stop_mark; + v.fetch_failed := v_i_in.fetch_failed; v.nia := v_i_in.nia; v.insn := v_i_in.insn; @@ -94,12 +96,14 @@ begin -- if flush_in = '1' then v_int.stash.valid := '0'; + v_int.stash.fetch_failed := '0'; end if; -- If we are flushing or the instruction comes with a stop mark -- we tag it as invalid so it doesn't get decoded and executed if flush_in = '1' or v.stop_mark = '1' then v.valid := '0'; + v.fetch_failed := '0'; end if; -- Clear stash on reset diff --git a/icache.vhdl b/icache.vhdl index 343c73a..7d7973d 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -35,7 +35,13 @@ entity icache is -- Number of lines in a set NUM_LINES : positive := 32; -- Number of ways - NUM_WAYS : positive := 4 + NUM_WAYS : positive := 4; + -- L1 ITLB number of entries (direct mapped) + TLB_SIZE : positive := 64; + -- L1 ITLB log_2(page_size) + TLB_LG_PGSZ : positive := 12; + -- Number of real address bits that we store + REAL_ADDR_BITS : positive := 56 ); port ( clk : in std_ulogic; @@ -44,6 +50,8 @@ entity icache is i_in : in Fetch1ToIcacheType; i_out : out IcacheToFetch2Type; + m_in : in MmuToIcacheType; + stall_out : out std_ulogic; flush_in : in std_ulogic; @@ -78,10 +86,12 @@ architecture rtl of icache is constant LINE_OFF_BITS : natural := log2(LINE_SIZE); -- ROW_OFF_BITS is the number of bits for the offset in a row constant ROW_OFF_BITS : natural := log2(ROW_SIZE); - -- INDEX_BITS is the number if bits to select a cache line + 
-- INDEX_BITS is the number of bits to select a cache line constant INDEX_BITS : natural := log2(NUM_LINES); + -- SET_SIZE_BITS is the log base 2 of the set size + constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; -- TAG_BITS is the number of bits of the tag part of the address - constant TAG_BITS : natural := 64 - LINE_OFF_BITS - INDEX_BITS; + constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; -- WAY_BITS is the number of bits to select a way constant WAY_BITS : natural := log2(NUM_WAYS); @@ -126,6 +136,27 @@ architecture rtl of icache is attribute ram_style : string; attribute ram_style of cache_tags : signal is "distributed"; + -- L1 ITLB. + constant TLB_BITS : natural := log2(TLB_SIZE); + constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS); + constant TLB_PTE_BITS : natural := 64; + + subtype tlb_index_t is integer range 0 to TLB_SIZE - 1; + type tlb_valids_t is array(tlb_index_t) of std_ulogic; + subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); + type tlb_tags_t is array(tlb_index_t) of tlb_tag_t; + subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); + type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t; + + signal itlb_valids : tlb_valids_t; + signal itlb_tags : tlb_tags_t; + signal itlb_ptes : tlb_ptes_t; + attribute ram_style of itlb_tags : signal is "distributed"; + attribute ram_style of itlb_ptes : signal is "distributed"; + + -- Privilege bit from PTE EAA field + signal eaa_priv : std_ulogic; + -- Cache reload state machine type state_t is (IDLE, WAIT_ACK); @@ -142,6 +173,9 @@ architecture rtl of icache is store_way : way_t; store_index : index_t; store_row : row_t; + + -- TLB miss state + fetch_failed : std_ulogic; end record; signal r : reg_internal_t; @@ -155,6 +189,12 @@ architecture rtl of icache is signal req_is_miss : std_ulogic; signal req_laddr : std_ulogic_vector(63 downto 0); + signal tlb_req_index : tlb_index_t; + signal real_addr : std_ulogic_vector(REAL_ADDR_BITS 
- 1 downto 0); + signal ra_valid : std_ulogic; + signal priv_fault : std_ulogic; + signal access_ok : std_ulogic; + -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -167,13 +207,13 @@ architecture rtl of icache is -- Return the cache line index (tag index) for an address function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto LINE_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))); end; -- Return the cache row index (data memory) for an address function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is begin - return to_integer(unsigned(addr(63-TAG_BITS downto ROW_OFF_BITS))); + return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))); end; -- Returns whether this is the last row of a line @@ -231,9 +271,9 @@ architecture rtl of icache is end; -- Get the tag value from the address - function get_tag(addr: std_ulogic_vector(63 downto 0)) return cache_tag_t is + function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)) return cache_tag_t is begin - return addr(63 downto 64-TAG_BITS); + return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS); end; -- Read a tag from a tag memory row @@ -249,6 +289,15 @@ architecture rtl of icache is tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag; end; + -- Simple hash for direct-mapped TLB index + function hash_ea(addr: std_ulogic_vector(63 downto 0)) return tlb_index_t is + variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0); + begin + hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ) + xor addr(TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto TLB_LG_PGSZ + TLB_BITS) + xor addr(TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto TLB_LG_PGSZ + 2 * TLB_BITS); + return to_integer(unsigned(hash)); + end; begin assert LINE_SIZE mod ROW_SIZE = 0; @@ -260,9 +309,9 @@ begin report "geometry bits don't add up" severity 
FAILURE; assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) report "geometry bits don't add up" severity FAILURE; - assert (64 = TAG_BITS + ROW_BITS + ROW_OFF_BITS) + assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS) report "geometry bits don't add up" severity FAILURE; sim_debug: if SIM generate @@ -356,6 +405,69 @@ begin end generate; end generate; + -- TLB hit detection and real address generation + itlb_lookup : process(all) + variable pte : tlb_pte_t; + variable ttag : tlb_tag_t; + begin + tlb_req_index <= hash_ea(i_in.nia); + pte := itlb_ptes(tlb_req_index); + ttag := itlb_tags(tlb_req_index); + if i_in.virt_mode = '1' then + real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & + i_in.nia(TLB_LG_PGSZ - 1 downto 0); + if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then + ra_valid <= itlb_valids(tlb_req_index); + else + ra_valid <= '0'; + end if; + eaa_priv <= pte(3); + else + real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0); + ra_valid <= '1'; + eaa_priv <= '1'; + end if; + + -- no IAMR, so no KUEP support for now + priv_fault <= eaa_priv and not i_in.priv_mode; + access_ok <= ra_valid and not priv_fault; + end process; + + -- iTLB update + itlb_update: process(clk) + variable tlbie : std_ulogic; + variable tlbia : std_ulogic; + variable tlbwe : std_ulogic; + variable wr_index : tlb_index_t; + begin + if rising_edge(clk) then + tlbie := '0'; + tlbia := '0'; + tlbwe := m_in.tlbld; + if m_in.tlbie = '1' then + if m_in.addr(11 downto 10) /= "00" then + tlbia := '1'; + else + tlbie := '1'; + end if; + end if; + wr_index := hash_ea(m_in.addr); + if rst = '1' or tlbia = '1' then + -- clear all valid bits + for i in tlb_index_t loop + itlb_valids(i) <= '0'; + end loop; + elsif tlbie = '1' then + -- clear entry regardless of hit or miss + itlb_valids(wr_index) <= '0'; + 
elsif tlbwe = '1' then + itlb_tags(wr_index) <= m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS); + itlb_ptes(wr_index) <= m_in.pte; + itlb_valids(wr_index) <= '1'; + end if; + end if; + end process; + -- Cache hit detection, output to fetch2 and other misc logic icache_comb : process(all) variable is_hit : std_ulogic; @@ -364,12 +476,13 @@ begin -- Extract line, row and tag from request req_index <= get_index(i_in.nia); req_row <= get_row(i_in.nia); - req_tag <= get_tag(i_in.nia); + req_tag <= get_tag(real_addr); -- Calculate address of beginning of cache line, will be -- used for cache miss processing if needed -- - req_laddr <= i_in.nia(63 downto LINE_OFF_BITS) & + req_laddr <= (63 downto REAL_ADDR_BITS => '0') & + real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) & (LINE_OFF_BITS-1 downto 0 => '0'); -- Test if pending request is a hit on any way @@ -385,8 +498,13 @@ begin end loop; -- Generate the "hit" and "miss" signals for the synchronous blocks - req_is_hit <= i_in.req and is_hit and not flush_in; - req_is_miss <= i_in.req and not is_hit and not flush_in; + if i_in.req = '1' and access_ok = '1' and flush_in = '0' then + req_is_hit <= is_hit; + req_is_miss <= not is_hit; + else + req_is_hit <= '0'; + req_is_miss <= '0'; + end if; req_hit_way <= hit_way; -- The way to replace on a miss @@ -404,9 +522,10 @@ begin i_out.valid <= r.hit_valid; i_out.nia <= r.hit_nia; i_out.stop_mark <= r.hit_smark; + i_out.fetch_failed <= r.fetch_failed; - -- Stall fetch1 if we have a miss - stall_out <= not is_hit; + -- Stall fetch1 if we have a miss on cache or TLB or a protection fault + stall_out <= not (is_hit and access_ok); -- Wishbone requests output (from the cache miss reload machine) wishbone_out <= r.wb; @@ -419,22 +538,21 @@ begin -- On a hit, latch the request for the next cycle, when the BRAM data -- will be available on the cache_out output of the corresponding way -- + r.hit_valid <= req_is_hit; + -- Send stop marks and NIA down regardless of validity + r.hit_smark 
<= i_in.stop_mark; + r.hit_nia <= i_in.nia; if req_is_hit = '1' then r.hit_way <= req_hit_way; - r.hit_nia <= i_in.nia; r.hit_smark <= i_in.stop_mark; - r.hit_valid <= '1'; report "cache hit nia:" & to_hstring(i_in.nia) & + " IR:" & std_ulogic'image(i_in.virt_mode) & " SM:" & std_ulogic'image(i_in.stop_mark) & " idx:" & integer'image(req_index) & " tag:" & to_hstring(req_tag) & - " way: " & integer'image(req_hit_way); - else - r.hit_valid <= '0'; - - -- Send stop marks down regardless of validity - r.hit_smark <= i_in.stop_mark; + " way:" & integer'image(req_hit_way) & + " RA:" & to_hstring(real_addr); end if; end if; end process; @@ -468,10 +586,12 @@ begin -- We need to read a cache line if req_is_miss = '1' then report "cache miss nia:" & to_hstring(i_in.nia) & + " IR:" & std_ulogic'image(i_in.virt_mode) & " SM:" & std_ulogic'image(i_in.stop_mark) & " idx:" & integer'image(req_index) & " way:" & integer'image(replace_way) & - " tag:" & to_hstring(req_tag); + " tag:" & to_hstring(req_tag) & + " RA:" & to_hstring(real_addr); -- Force misses on that way while reloading that line cache_valids(req_index)(replace_way) <= '0'; @@ -539,6 +659,13 @@ begin end if; end case; end if; + + -- TLB miss and protection fault processing + if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then + r.fetch_failed <= '0'; + elsif i_in.req = '1' and access_ok = '0' then + r.fetch_failed <= '1'; + end if; end if; end process; end; diff --git a/icache_tb.vhdl b/icache_tb.vhdl index ea5cf3a..09a644b 100644 --- a/icache_tb.vhdl +++ b/icache_tb.vhdl @@ -15,6 +15,8 @@ architecture behave of icache_tb is signal i_out : Fetch1ToIcacheType; signal i_in : IcacheToFetch2Type; + signal m_out : MmuToIcacheType; + signal wb_bram_in : wishbone_master_out; signal wb_bram_out : wishbone_slave_out; @@ -30,6 +32,7 @@ begin rst => rst, i_in => i_out, i_out => i_in, + m_in => m_out, flush_in => '0', wishbone_out => wb_bram_in, wishbone_in => wb_bram_out @@ -70,6 +73,11 @@ begin i_out.nia <= (others => 
'0'); i_out.stop_mark <= '0'; + m_out.tlbld <= '0'; + m_out.tlbie <= '0'; + m_out.addr <= (others => '0'); + m_out.pte <= (others => '0'); + wait until rising_edge(clk); wait until rising_edge(clk); wait until rising_edge(clk); diff --git a/loadstore1.vhdl b/loadstore1.vhdl index c56346f..666cf4e 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -41,7 +41,8 @@ architecture behave of loadstore1 is ACK_WAIT, -- waiting for ack from dcache LD_UPDATE, -- writing rA with computed addr on load MMU_LOOKUP, -- waiting for MMU to look up translation - TLBIE_WAIT -- waiting for MMU to finish doing a tlbie + TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie + DO_ISI ); type reg_stage_t is record @@ -70,6 +71,7 @@ architecture behave of loadstore1 is second_bytes : std_ulogic_vector(7 downto 0); dar : std_ulogic_vector(63 downto 0); dsisr : std_ulogic_vector(31 downto 0); + instr_fault : std_ulogic; end record; type byte_sel_t is array(0 to 7) of std_ulogic; @@ -154,6 +156,7 @@ begin variable mmureq : std_ulogic; variable dsisr : std_ulogic_vector(31 downto 0); variable mmu_mtspr : std_ulogic; + variable itlb_fault : std_ulogic; begin v := r; req := '0'; @@ -163,6 +166,7 @@ begin addr := lsu_sum; mfspr := '0'; mmu_mtspr := '0'; + itlb_fault := '0'; sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); sprval := (others => '0'); -- avoid inferred latches exception := '0'; @@ -230,6 +234,7 @@ begin v.load := '0'; v.dcbz := '0'; v.tlbie := '0'; + v.instr_fault := '0'; v.dwords_done := '0'; case l_in.op is when OP_STORE => @@ -272,6 +277,10 @@ begin -- writing one of the SPRs in the MMU mmu_mtspr := '1'; end if; + when OP_FETCH_FAILED => + -- for now, always signal an ISI in the next cycle + v.instr_fault := '1'; + v.state := DO_ISI; when others => assert false report "unknown op sent to loadstore1"; end case; @@ -425,6 +434,10 @@ begin do_update := '1'; v.state := IDLE; done := '1'; + + when DO_ISI => + exception := '1'; + v.state := IDLE; end case; -- Update outputs 
to dcache @@ -441,6 +454,7 @@ begin -- Update outputs to MMU m_out.valid <= mmureq; + m_out.iside <= itlb_fault; m_out.load <= r.load; m_out.priv <= r.priv_mode; m_out.tlbie <= v.tlbie; @@ -472,9 +486,11 @@ begin -- update exception info back to execute1 e_out.exception <= exception; - e_out.segment_fault <= m_in.segerr; - if exception = '1' then + e_out.segment_fault <= '0'; + e_out.instr_fault <= r.instr_fault; + if exception = '1' and r.instr_fault = '0' then v.dar := addr; + e_out.segment_fault <= m_in.segerr; if m_in.segerr = '0' then v.dsisr := dsisr; end if; diff --git a/mmu.vhdl b/mmu.vhdl index 3a1003c..e26c5a7 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -18,7 +18,9 @@ entity mmu is l_out : out MmuToLoadstore1Type; d_out : out MmuToDcacheType; - d_in : in DcacheToMmuType + d_in : in DcacheToMmuType; + + i_out : out MmuToIcacheType ); end mmu; @@ -336,5 +338,11 @@ begin d_out.addr <= pgtable_addr; d_out.pte <= (others => '0'); end if; + + i_out.tlbld <= '0'; + i_out.tlbie <= tlbie_req; + i_out.addr <= l_in.addr; + i_out.pte <= l_in.rs; + end process; end; From 01046527ba0e720a3f2a97e4d837a5d12ae68061 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 28 Apr 2020 14:54:22 +1000 Subject: [PATCH 17/26] MMU: Do radix page table walks on iTLB misses This hooks up the connections so that an OP_FETCH_FAILED coming down to loadstore1 will get sent to the MMU for it to do a radix tree walk for the instruction address. The MMU then sends the resulting PTE to the icache module to be installed in the iTLB. If no valid PTE can be found, the MMU sends an error signal back to loadstore1 which sends it on to execute1 to generate an ISI. 
Signed-off-by: Paul Mackerras --- common.vhdl | 4 ++++ execute1.vhdl | 5 ++++- loadstore1.vhdl | 42 ++++++++++++++++++++++-------------- mmu.vhdl | 57 +++++++++++++++++++++++++++++++++---------------- 4 files changed, 73 insertions(+), 35 deletions(-) diff --git a/common.vhdl b/common.vhdl index ba8aab3..79bc1bd 100644 --- a/common.vhdl +++ b/common.vhdl @@ -251,6 +251,10 @@ package common is type Loadstore1ToExecute1Type is record exception : std_ulogic; + invalid : std_ulogic; + perm_error : std_ulogic; + rc_error : std_ulogic; + badtree : std_ulogic; segment_fault : std_ulogic; instr_fault : std_ulogic; end record; diff --git a/execute1.vhdl b/execute1.vhdl index 71c79ee..78361c2 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -991,7 +991,10 @@ begin end if; else if l_in.segment_fault = '0' then - ctrl_tmp.srr1(63 - 33) <= '1'; + ctrl_tmp.srr1(63 - 33) <= l_in.invalid; + ctrl_tmp.srr1(63 - 35) <= l_in.perm_error; -- noexec fault + ctrl_tmp.srr1(63 - 44) <= l_in.badtree; + ctrl_tmp.srr1(63 - 45) <= l_in.rc_error; ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#400#, 64)); else ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#480#, 64)); diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 666cf4e..b7b56d4 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -41,8 +41,7 @@ architecture behave of loadstore1 is ACK_WAIT, -- waiting for ack from dcache LD_UPDATE, -- writing rA with computed addr on load MMU_LOOKUP, -- waiting for MMU to look up translation - TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie - DO_ISI + TLBIE_WAIT -- waiting for MMU to finish doing a tlbie ); type reg_stage_t is record @@ -231,6 +230,7 @@ begin case r.state is when IDLE => if l_in.valid = '1' then + v.addr := lsu_sum; v.load := '0'; v.dcbz := '0'; v.tlbie := '0'; @@ -278,14 +278,17 @@ begin mmu_mtspr := '1'; end if; when OP_FETCH_FAILED => - -- for now, always signal an ISI in the next cycle + -- send it to the MMU to do the radix walk + addr := l_in.nia; + v.addr := 
l_in.nia; v.instr_fault := '1'; - v.state := DO_ISI; + mmureq := '1'; + stall := '1'; + v.state := MMU_LOOKUP; when others => assert false report "unknown op sent to loadstore1"; end case; - v.addr := lsu_sum; v.write_reg := l_in.write_reg; v.length := l_in.length; v.byte_reverse := l_in.byte_reverse; @@ -403,12 +406,19 @@ begin if m_in.done = '1' then if m_in.invalid = '0' and m_in.perm_error = '0' and m_in.rc_error = '0' and m_in.badtree = '0' and m_in.segerr = '0' then - -- retry the request now that the MMU has installed a TLB entry - req := '1'; - if two_dwords = '1' and r.dwords_done = '0' then - v.state := SECOND_REQ; + if r.instr_fault = '0' then + -- retry the request now that the MMU has installed a TLB entry + req := '1'; + if two_dwords = '1' and r.dwords_done = '0' then + v.state := SECOND_REQ; + else + v.state := ACK_WAIT; + end if; else - v.state := ACK_WAIT; + -- nothing to do, the icache retries automatically + stall := '0'; + done := '1'; + v.state := IDLE; end if; else exception := '1'; @@ -435,9 +445,6 @@ begin v.state := IDLE; done := '1'; - when DO_ISI => - exception := '1'; - v.state := IDLE; end case; -- Update outputs to dcache @@ -454,7 +461,7 @@ begin -- Update outputs to MMU m_out.valid <= mmureq; - m_out.iside <= itlb_fault; + m_out.iside <= v.instr_fault; m_out.load <= r.load; m_out.priv <= r.priv_mode; m_out.tlbie <= v.tlbie; @@ -486,11 +493,14 @@ begin -- update exception info back to execute1 e_out.exception <= exception; - e_out.segment_fault <= '0'; e_out.instr_fault <= r.instr_fault; + e_out.invalid <= m_in.invalid; + e_out.badtree <= m_in.badtree; + e_out.perm_error <= m_in.perm_error; + e_out.rc_error <= m_in.rc_error; + e_out.segment_fault <= m_in.segerr; if exception = '1' and r.instr_fault = '0' then v.dar := addr; - e_out.segment_fault <= m_in.segerr; if m_in.segerr = '0' then v.dsisr := dsisr; end if; diff --git a/mmu.vhdl b/mmu.vhdl index e26c5a7..e770d99 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -38,6 +38,7 @@ architecture 
behave of mmu is type reg_stage_t is record -- latched request from loadstore1 valid : std_ulogic; + iside : std_ulogic; store : std_ulogic; priv : std_ulogic; addr : std_ulogic_vector(63 downto 0); @@ -165,15 +166,18 @@ begin variable dcreq : std_ulogic; variable done : std_ulogic; variable tlb_load : std_ulogic; + variable itlb_load : std_ulogic; variable tlbie_req : std_ulogic; variable rts : unsigned(5 downto 0); variable mbits : unsigned(5 downto 0); variable pgtable_addr : std_ulogic_vector(63 downto 0); variable pte : std_ulogic_vector(63 downto 0); - variable data : std_ulogic_vector(63 downto 0); + variable tlb_data : std_ulogic_vector(63 downto 0); variable nonzero : std_ulogic; variable perm_ok : std_ulogic; variable rc_ok : std_ulogic; + variable addr : std_ulogic_vector(63 downto 0); + variable data : std_ulogic_vector(63 downto 0); begin v := r; v.valid := '0'; @@ -185,6 +189,7 @@ begin v.perm_err := '0'; v.rc_error := '0'; tlb_load := '0'; + itlb_load := '0'; tlbie_req := '0'; -- Radix tree data structures in memory are big-endian, @@ -206,7 +211,8 @@ begin if l_in.valid = '1' then v.addr := l_in.addr; - v.store := not l_in.load; + v.iside := l_in.iside; + v.store := not (l_in.load or l_in.iside); v.priv := l_in.priv; if l_in.tlbie = '1' then dcreq := '1'; @@ -262,7 +268,13 @@ begin -- check permissions and RC bits perm_ok := '0'; if r.priv = '1' or data(3) = '0' then - perm_ok := data(1) or (data(2) and not r.store); + if r.iside = '0' then + perm_ok := data(1) or (data(2) and not r.store); + else + -- no IAMR, so no KUEP support for now + -- deny execute permission if cache inhibited + perm_ok := data(0) and not data(5); + end if; end if; rc_ok := data(8) and (data(7) or not r.store); if perm_ok = '1' and rc_ok = '1' then @@ -298,8 +310,14 @@ begin when RADIX_LOAD_TLB => tlb_load := '1'; - dcreq := '1'; - v.state := TLB_WAIT; + if r.iside = '0' then + dcreq := '1'; + v.state := TLB_WAIT; + else + itlb_load := '1'; + done := '1'; + v.state := IDLE; 
+ end if; when RADIX_ERROR => done := '1'; @@ -318,6 +336,17 @@ begin rin <= v; -- drive outputs + if tlbie_req = '1' then + addr := l_in.addr; + tlb_data := l_in.rs; + elsif tlb_load = '1' then + addr := r.addr(63 downto 12) & x"000"; + tlb_data := pte; + else + addr := pgtable_addr; + tlb_data := (others => '0'); + end if; + l_out.done <= done; l_out.invalid <= r.invalid; l_out.badtree <= r.badtree; @@ -328,21 +357,13 @@ begin d_out.valid <= dcreq; d_out.tlbie <= tlbie_req; d_out.tlbld <= tlb_load; - if tlbie_req = '1' then - d_out.addr <= l_in.addr; - d_out.pte <= l_in.rs; - elsif tlb_load = '1' then - d_out.addr <= r.addr(63 downto 12) & x"000"; - d_out.pte <= pte; - else - d_out.addr <= pgtable_addr; - d_out.pte <= (others => '0'); - end if; + d_out.addr <= addr; + d_out.pte <= tlb_data; - i_out.tlbld <= '0'; + i_out.tlbld <= itlb_load; i_out.tlbie <= tlbie_req; - i_out.addr <= l_in.addr; - i_out.pte <= l_in.rs; + i_out.addr <= addr; + i_out.pte <= tlb_data; end process; end; From b342312e4e2b0301234c128d2d6a0d350ec30609 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 28 Apr 2020 16:00:00 +1000 Subject: [PATCH 18/26] tests: mmu: Add tests for instruction translation This adds tests of instruction translation to the mmu test. This also clears the BSS and improves the linker script. Signed-off-by: Paul Mackerras --- tests/mmu/head.S | 73 ++++++++++-- tests/mmu/mmu.c | 226 ++++++++++++++++++++++++++++++++----- tests/mmu/powerpc.lds | 26 ++++- tests/test_mmu.bin | Bin 12304 -> 20496 bytes tests/test_mmu.console_out | 7 ++ 5 files changed, 288 insertions(+), 44 deletions(-) diff --git a/tests/mmu/head.S b/tests/mmu/head.S index 3627cff..083b1c5 100644 --- a/tests/mmu/head.S +++ b/tests/mmu/head.S @@ -14,8 +14,6 @@ * limitations under the License. */ -#define STACK_TOP 0x4000 - /* Load an immediate 64-bit value into a register */ #define LOAD_IMM64(r, e) \ lis r,(e)@highest; \ @@ -33,12 +31,20 @@ . 
= 0 .global _start _start: - b boot_entry + LOAD_IMM64(%r10,__bss_start) + LOAD_IMM64(%r11,__bss_end) + subf %r11,%r10,%r11 + addi %r11,%r11,63 + srdi. %r11,%r11,6 + beq 2f + mtctr %r11 +1: dcbz 0,%r10 + addi %r10,%r10,64 + bdnz 1b -.global boot_entry -boot_entry: - /* setup stack */ - LOAD_IMM64(%r1, STACK_TOP - 0x100) +2: LOAD_IMM64(%r1,__stack_top) + li %r0,0 + stdu %r0,-16(%r1) LOAD_IMM64(%r12, main) mtctr %r12 bctrl @@ -74,6 +80,12 @@ test_write: mtmsrd %r9,0 blr + .globl test_exec +test_exec: + mtsrr0 %r4 + mtsrr1 %r5 + rfid + #define EXCEPTION(nr) \ .= nr ;\ attn @@ -86,9 +98,17 @@ test_write: mtsrr0 %r10 rfid - /* More exception stubs */ EXCEPTION(0x380) - EXCEPTION(0x400) + + /* + * ISI vector - jump to LR to return from the test, + * with r3 cleared + */ + . = 0x400 + li %r3,0 + blr + + /* More exception stubs */ EXCEPTION(0x480) EXCEPTION(0x500) EXCEPTION(0x600) @@ -98,7 +118,14 @@ test_write: EXCEPTION(0x980) EXCEPTION(0xa00) EXCEPTION(0xb00) - EXCEPTION(0xc00) + + /* + * System call - used to exit from tests where MSR[PR] + * may have been set. + */ + . = 0xc00 + blr + EXCEPTION(0xd00) EXCEPTION(0xe00) EXCEPTION(0xe20) @@ -110,3 +137,29 @@ test_write: EXCEPTION(0xf40) EXCEPTION(0xf60) EXCEPTION(0xf80) + + . = 0x1000 + /* + * This page gets mapped at various locations and + * the tests try to execute from it. + * r3 contains the test number. + */ + .globl test_start +test_start: + nop + nop + cmpdi %r3,1 + beq test_1 + cmpdi %r3,2 + beq test_2 +test_return: + li %r3,1 + sc + + . 
= 0x1ff8 + /* test a branch near the end of a page */ +test_1: b test_return + + /* test flowing from one page to the next */ +test_2: nop + b test_return diff --git a/tests/mmu/mmu.c b/tests/mmu/mmu.c index 0a717c7..a44c79d 100644 --- a/tests/mmu/mmu.c +++ b/tests/mmu/mmu.c @@ -4,14 +4,23 @@ #include "console.h" +#define MSR_DR 0x10 +#define MSR_IR 0x20 + extern int test_read(long *addr, long *ret, long init); extern int test_write(long *addr, long val); +extern int test_exec(int testno, unsigned long pc, unsigned long msr); static inline void do_tlbie(unsigned long rb, unsigned long rs) { __asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory"); } +#define DSISR 18 +#define DAR 19 +#define SRR0 26 +#define SRR1 27 + static inline unsigned long mfspr(int sprnum) { long val; @@ -135,6 +144,8 @@ void map(void *ea, void *pa, unsigned long perm_attr) free_ptr += 512 * sizeof(unsigned long); } ptep = read_pgd(i); + if (ptep[j]) + do_tlbie(((unsigned long)ea & ~0xfff), 0); store_pte(&ptep[j], 0xc000000000000000 | ((unsigned long)pa & 0x00fffffffffff000) | perm_attr); eas_mapped[neas_mapped++] = ea; } @@ -175,14 +186,14 @@ int mmu_test_1(void) if (val != 0xdeadbeefd00d) return 2; /* DAR and DSISR should be set correctly */ - if (mfspr(19) != (long) ptr || mfspr(18) != 0x40000000) + if (mfspr(DAR) != (long) ptr || mfspr(DSISR) != 0x40000000) return 3; return 0; } int mmu_test_2(void) { - long *mem = (long *) 0x4000; + long *mem = (long *) 0x8000; long *ptr = (long *) 0x124000; long *ptr2 = (long *) 0x1124000; long val; @@ -215,8 +226,8 @@ int mmu_test_2(void) int mmu_test_3(void) { - long *mem = (long *) 0x5000; - long *ptr = (long *) 0x149000; + long *mem = (long *) 0x9000; + long *ptr = (long *) 0x14a000; long val; /* create PTE */ @@ -238,16 +249,16 @@ int mmu_test_3(void) if (val != 0xdeadbeefd0d0) return 4; /* DAR and DSISR should be set correctly */ - if (mfspr(19) != (long) &ptr[45] || mfspr(18) != 0x40000000) + if (mfspr(DAR) != (long) &ptr[45] || 
mfspr(DSISR) != 0x40000000) return 5; return 0; } int mmu_test_4(void) { - long *mem = (long *) 0x6000; - long *ptr = (long *) 0x10a000; - long *ptr2 = (long *) 0x110a000; + long *mem = (long *) 0xa000; + long *ptr = (long *) 0x10b000; + long *ptr2 = (long *) 0x110b000; long val; /* create PTE */ @@ -279,7 +290,7 @@ int mmu_test_4(void) int mmu_test_5(void) { - long *mem = (long *) 0x7ffd; + long *mem = (long *) 0xbffd; long *ptr = (long *) 0x39fffd; long val; @@ -292,14 +303,14 @@ int mmu_test_5(void) if (val != 0xdeadbeef0dd0) return 2; /* DAR and DSISR should be set correctly */ - if (mfspr(19) != ((long)ptr & ~0xfff) + 0x1000 || mfspr(18) != 0x40000000) + if (mfspr(DAR) != ((long)ptr & ~0xfff) + 0x1000 || mfspr(DSISR) != 0x40000000) return 3; return 0; } int mmu_test_6(void) { - long *mem = (long *) 0x7ffd; + long *mem = (long *) 0xbffd; long *ptr = (long *) 0x39fffd; /* create PTE */ @@ -310,14 +321,14 @@ int mmu_test_6(void) if (test_write(ptr, 0xdeadbeef0dd0)) return 1; /* DAR and DSISR should be set correctly */ - if (mfspr(19) != ((long)ptr & ~0xfff) + 0x1000 || mfspr(18) != 0x42000000) + if (mfspr(DAR) != ((long)ptr & ~0xfff) + 0x1000 || mfspr(DSISR) != 0x42000000) return 2; return 0; } int mmu_test_7(void) { - long *mem = (long *) 0x4000; + long *mem = (long *) 0x8000; long *ptr = (long *) 0x124000; long val; @@ -331,13 +342,13 @@ int mmu_test_7(void) if (val != 0xdeadd00dbeef) return 2; /* DAR and DSISR should be set correctly */ - if (mfspr(19) != (long) ptr || mfspr(18) != 0x00040000) + if (mfspr(DAR) != (long) ptr || mfspr(DSISR) != 0x00040000) return 3; /* this should fail */ if (test_write(ptr, 0xdeadbeef0dd0)) return 4; /* DAR and DSISR should be set correctly */ - if (mfspr(19) != (long)ptr || mfspr(18) != 0x02040000) + if (mfspr(DAR) != (long)ptr || mfspr(DSISR) != 0x02040000) return 5; /* memory should be unchanged */ if (*mem != 0x123456789abcdef0) @@ -347,7 +358,7 @@ int mmu_test_7(void) int mmu_test_8(void) { - long *mem = (long *) 0x4000; + 
long *mem = (long *) 0x8000; long *ptr = (long *) 0x124000; long val; @@ -361,7 +372,7 @@ int mmu_test_8(void) if (test_write(ptr, 0xdeadbeef0dd1)) return 2; /* DAR and DSISR should be set correctly */ - if (mfspr(19) != (long)ptr || mfspr(18) != 0x02040000) + if (mfspr(DAR) != (long)ptr || mfspr(DSISR) != 0x02040000) return 3; /* memory should be unchanged */ if (*mem != 0x123456789abcdef0) @@ -371,7 +382,7 @@ int mmu_test_8(void) int mmu_test_9(void) { - long *mem = (long *) 0x4000; + long *mem = (long *) 0x8000; long *ptr = (long *) 0x124000; long val; @@ -385,13 +396,13 @@ int mmu_test_9(void) if (val != 0xdeadd00dbeef) return 2; /* DAR and DSISR should be set correctly */ - if (mfspr(19) != (long) ptr || mfspr(18) != 0x08000000) + if (mfspr(DAR) != (long) ptr || mfspr(DSISR) != 0x08000000) return 3; /* this should fail */ if (test_write(ptr, 0xdeadbeef0dd1)) return 4; /* DAR and DSISR should be set correctly */ - if (mfspr(19) != (long)ptr || mfspr(18) != 0x0a000000) + if (mfspr(DAR) != (long)ptr || mfspr(DSISR) != 0x0a000000) return 5; /* memory should be unchanged */ if (*mem != 0x123456789abcdef0) @@ -401,7 +412,7 @@ int mmu_test_9(void) int mmu_test_10(void) { - long *mem = (long *) 0x4000; + long *mem = (long *) 0x8000; long *ptr = (long *) 0x124000; long val; @@ -415,7 +426,7 @@ int mmu_test_10(void) if (test_write(ptr, 0xdeadbeef0dd1)) return 2; /* DAR and DSISR should be set correctly */ - if (mfspr(19) != (long)ptr || mfspr(18) != 0x0a000000) + if (mfspr(DAR) != (long)ptr || mfspr(DSISR) != 0x0a000000) return 3; /* memory should be unchanged */ if (*mem != 0x123456789abcdef0) @@ -423,14 +434,159 @@ int mmu_test_10(void) return 0; } +int mmu_test_11(void) +{ + unsigned long ptr = 0x523000; + + /* this should fail */ + if (test_exec(0, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != (long) ptr || mfspr(SRR1) != 0x40000020) + return 2; + return 0; +} + +int mmu_test_12(void) +{ + unsigned long mem = 0x1000; + 
unsigned long ptr = 0x324000; + unsigned long ptr2 = 0x1324000; + + /* create PTE */ + map((void *)ptr, (void *)mem, PERM_EX | REF); + /* this should succeed and be a cache miss */ + if (!test_exec(0, ptr, MSR_IR)) + return 1; + /* create a second PTE */ + map((void *)ptr2, (void *)mem, PERM_EX | REF); + /* this should succeed and be a cache hit */ + if (!test_exec(0, ptr2, MSR_IR)) + return 2; + return 0; +} + +int mmu_test_13(void) +{ + unsigned long mem = 0x1000; + unsigned long ptr = 0x349000; + unsigned long ptr2 = 0x34a000; + + /* create a PTE */ + map((void *)ptr, (void *)mem, PERM_EX | REF); + /* this should succeed */ + if (!test_exec(1, ptr, MSR_IR)) + return 1; + /* invalidate the PTE */ + unmap((void *)ptr); + /* install a second PTE */ + map((void *)ptr2, (void *)mem, PERM_EX | REF); + /* this should fail */ + if (test_exec(1, ptr, MSR_IR)) + return 2; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != (long) ptr || mfspr(SRR1) != 0x40000020) + return 3; + return 0; +} + +int mmu_test_14(void) +{ + unsigned long mem = 0x1000; + unsigned long mem2 = 0x2000; + unsigned long ptr = 0x30a000; + unsigned long ptr2 = 0x30b000; + + /* create a PTE */ + map((void *)ptr, (void *)mem, PERM_EX | REF); + /* this should fail due to second page not being mapped */ + if (test_exec(2, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != ptr2 || mfspr(SRR1) != 0x40000020) + return 2; + /* create a PTE for the second page */ + map((void *)ptr2, (void *)mem2, PERM_EX | REF); + /* this should succeed */ + if (!test_exec(2, ptr, MSR_IR)) + return 3; + return 0; +} + +int mmu_test_15(void) +{ + unsigned long mem = 0x1000; + unsigned long ptr = 0x324000; + + /* create a PTE without execute permission */ + map((void *)ptr, (void *)mem, DFLT_PERM); + /* this should fail */ + if (test_exec(0, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != ptr || mfspr(SRR1) != 0x10000020) + 
return 2; + return 0; +} + +int mmu_test_16(void) +{ + unsigned long mem = 0x1000; + unsigned long mem2 = 0x2000; + unsigned long ptr = 0x30a000; + unsigned long ptr2 = 0x30b000; + + /* create a PTE */ + map((void *)ptr, (void *)mem, PERM_EX | REF); + /* create a PTE for the second page without execute permission */ + map((void *)ptr2, (void *)mem2, PERM_RD | REF); + /* this should fail due to second page being no-execute */ + if (test_exec(2, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != ptr2 || mfspr(SRR1) != 0x10000020) + return 2; + /* create a PTE for the second page with execute permission */ + map((void *)ptr2, (void *)mem2, PERM_RD | PERM_EX | REF); + /* this should succeed */ + if (!test_exec(2, ptr, MSR_IR)) + return 3; + return 0; +} + +int mmu_test_17(void) +{ + unsigned long mem = 0x1000; + unsigned long ptr = 0x349000; + + /* create a PTE without the ref bit set */ + map((void *)ptr, (void *)mem, PERM_EX); + /* this should fail */ + if (test_exec(2, ptr, MSR_IR)) + return 1; + /* SRR0 and SRR1 should be set correctly */ + if (mfspr(SRR0) != (long) ptr || mfspr(SRR1) != 0x00040020) + return 2; + /* create a PTE without ref or execute permission */ + map((void *)ptr, (void *)mem, 0); + /* this should fail */ + if (test_exec(2, ptr, MSR_IR)) + return 3; /* second stage: codes 3/4 keep the printed FAIL digit unambiguous */ + /* SRR0 and SRR1 should be set correctly */ + /* RC update fail bit should not be set */ + if (mfspr(SRR0) != (long) ptr || mfspr(SRR1) != 0x10000020) + return 4; /* second-stage SRR0/SRR1 mismatch */ + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) { int ret; - mtspr(18, 0); - mtspr(19, 0); + mtspr(DSISR, 0); + mtspr(DAR, 0); unmap_all(); print_test_number(num); ret = test(); @@ -440,10 +596,17 @@ void do_test(int num, int (*test)(void)) fail = 1; print_string("FAIL "); putchar(ret + '0'); - print_string(" DAR="); - print_hex(mfspr(19)); - print_string(" DSISR="); - print_hex(mfspr(18)); + if (num <= 10) { + print_string(" DAR="); + print_hex(mfspr(DAR)); + 
print_string(" DSISR="); + print_hex(mfspr(DSISR)); + } else { + print_string(" SRR0="); + print_hex(mfspr(SRR0)); + print_string(" SRR1="); + print_hex(mfspr(SRR1)); + } print_string("\r\n"); } } @@ -463,6 +626,13 @@ int main(void) do_test(8, mmu_test_8); do_test(9, mmu_test_9); do_test(10, mmu_test_10); + do_test(11, mmu_test_11); + do_test(12, mmu_test_12); + do_test(13, mmu_test_13); + do_test(14, mmu_test_14); + do_test(15, mmu_test_15); + do_test(16, mmu_test_16); + do_test(17, mmu_test_17); return fail; } diff --git a/tests/mmu/powerpc.lds b/tests/mmu/powerpc.lds index c4bff13..99611ab 100644 --- a/tests/mmu/powerpc.lds +++ b/tests/mmu/powerpc.lds @@ -1,13 +1,27 @@ SECTIONS { - _start = .; . = 0; + _start = .; .head : { KEEP(*(.head)) } - . = 0x1000; - .text : { *(.text) } - . = 0x3000; - .data : { *(.data) } - .bss : { *(.bss) } + . = ALIGN(0x1000); + .text : { *(.text) *(.text.*) *(.rodata) *(.rodata.*) } + . = ALIGN(0x1000); + .data : { *(.data) *(.data.*) *(.got) *(.toc) } + . = ALIGN(0x80); + __bss_start = .; + .bss : { + *(.dynsbss) + *(.sbss) + *(.scommon) + *(.dynbss) + *(.bss) + *(.common) + *(.bss.*) + } + . = ALIGN(0x80); + __bss_end = .; + . = . + 0x4000; + __stack_top = .; } diff --git a/tests/test_mmu.bin b/tests/test_mmu.bin index 961e2df8a24786b52775933f1f9573d2beea9c7e..afae999223bcb15bede8df58d6abc88bc7ea06dc 100755 GIT binary patch literal 20496 zcmeHPdu&uy8vpK0X{WUcQ7rUD~ygJhs>_c=c2Kvk?P)}A-6lF7QD+&>j_MuQxL@~(Q z0(n~?Z%fPe+AZOPd$UoL5xb3C;3BF^kZKfAX}DP-BRc&)9X4o0iGBrTv!WVAPo-3f z-8MSkqbZwD^0;^E3HM+05~5qBbmxzQz!(T5YJx_>b+_R`94Fj&8&^GM%=jO}x~oAB zX*_4HhNe+? 
zgn6B&*opr%Fh+SC51;S4eH4?vRnm8D|2LKYtvWlZun?#rg^J5hPu>`Cq3lP==V_Q; zJ8BPy=pf)A;2_{2;2_{2;2_{2;2_{2;2_{2;2_{2@V!7l9Tbf)G*oDo-wT@^BOL@B z1RMk$1RMk$1RMk$1RMk$1RMk$1RMk$1il^u3i;$7cu{VcCP z;r8fhzl&00Y)G07jFS-W&1T(4peIw~OJ@gGrL%MYmz-9mj|;>(~oIK;A| z(;o(ACx+$y;{4%`d0Az5s#)H)Y|1IGpcPs!ZSXH8uWw#v@txV0?DL|^cV1+d@6W0- zjtEot_0SEy{83`~0Y|#s>H37YcTNs;w*D?6F=Rh(gsuM=`l6$PIYJtJ^HH6(FwtTITAX`a}<{q|Tcgbq*ahJhUNh7*6TFeq1r z4}M>BgWq5uyG?(k$k-*Z$#S{xo#08WYhU~<&lQYqw0M^u{pkU{?3ma1_Z|c9$9T(Y zfdlhXK};j)fAb47ez`xzs_~_{Jy{-%zb8EHEw2%+X)sC``VHdt0Q-zL)}%6 zDxI2|5noLHNT_>nGSc@*vYoN!Jo3IsMh;8A8RJgPv%wqQo7{tUi1tLeKhfc#4_?^Q z%(*77cTdO@h=l(6Hxnm?xKDBU^YBllcTc&vK3Jtc4P2#uB{U~pcUbD_)TV;YI#?#_ zVVSIhl{&!@0R9E7=a5mov6&E-+TA zkNZv@^oFBnU+(BZPC17DgXC?Y$=)zg)w#W|jl^^xY$Up8z{Ipn@05PS1`AW$hCIl> zANiQ+&rFGjSZ>&oWnOwvpGo$A--D9Ta}wg%^1&W7^(iJj6B=)&C(0dQVn7`oRnF zX0}}c?Eq-KdGU}7&pYw#Mj1tZ%RspV$r6GfA$u5xN;l@KRp%N)9-E23tIr9D|)$Yb5J5A5t(zBieS3?{oG9PiPg zzK*=!w-+nF{Szh$)D{`O0GI#%VWQ=c>V;2~X@tE}M?;X5nk+JLNJ;FS_ zRnnV`48$+_%k*WaGtSYoFLf~1HolB$i>0G&`U-0g^pH-2#kv_Akl`|Alu8-*3{d~Y z80Ngo`JgtR>Uf?EVjbeR??*}IM~S&F&$pFRyQ}0{wj>#0yciG0>cGzAa>if~IAtQn z?YL&F7!RU%h8CV<*_Y=tZbv?B#ykwI=oyKdIiJn^&i*s6jlof15VrUk17712u0_}f zvfn)K%f`lH(8Pj$w{v4KkLQPp1&_%HEH25F>?^@fw&&g4Zx%m4PhB2A;a5iD@Kxi7 zJYOC^jNOfvjkFJ4EX+S$4|x>@ss!2`0?X?c_eP*#ZQsNKVJM?hQ%oHQ*h0d z$Inc}kHYc8`!U3jonyV9ipH7tnZs|MId;D^(kJX`12eG(ruYBw6ZBJJ=2^w+x5Dbj zYcl6j>;?QB-&jNB+=tF9I(t;4+uCW`%KLW%IxB2ju}|PU{K+BH*0$*#RxVqQeY`rC z_F=uKE7q~c!5kfVE~*1f`l7}(?W5@@+m39nm zHDl=6Zqpam%jXF`uk(2C&%%5AuFpFvIi9w1JXPtK-!h&uFgEr$1VQ6^_W8HupB>C1>j%j)Zl=Ii{1ddWM`qn(?1-Jb^B-k)rqYY6xd<{uTB=NjIV z+w1JfHY1o-Umn*CtDc3r z-Wu1|SalET`&PuYUt9HT)DNtTYYVM<4(fxe;@VuRehunR!M>ZUdM@hcSI4!VSoJ*A zwFl!`o>k9h{xxy!0^)=HnT+}a_s6v}RvqVL&GVbM_O?}@f_gvducNNYn4{1}^B(I< z_Yl;LT#Xz_syAGS`#F`oZJThHbRF{H$y@5j%!Ax#6Yix1Vp2uFled(O%ggb5WxRa< z2K`Hlad+oko5C94LEn?NMRdMz94j~5HWjcv3(D|+c$j@z@5$R&>SOieyKZ|zen74d)30Sme3d^w0WXp9%!5t~@ePG*& z&$^a})~TQb1B({rZ&6xvUr{S?o2@W^>?&w1wIXKu4|0Mme-{3z?2C?gtBrspbD 
z0t#Y-`u;1#Q`}mc9iUxGeZGH6;Nc}X`4o63GjI*j^=R_r8^R_pj*{y0Rd6)rQ*|bi zp_CIs`7+w&rL-xhCcgq)wHM{EzD@Y{f$ss7G#~n&bnQ}VCFkv|FbRpww-4X)0U-eV zMZN`+FKF|zPsi|$fe#n+@}uJKX?#Q?HM7S2b+a`qvX2lB4&*J0r0&6zJBIwkl3%Zu`dVdINBe7 zUx-T9w*dMA!(P zb}RVfAC51Z{n`%R=8uGU8qZct-JMT0GNx3J_M`nY+K+I1gB-zKN=Owrx83>{BxZ7!4v-kV~=P4Fpi5s z>jq6_9PM~8XV2O+80Gwlfp`C>mK~;@t+uZ$s}t?(J`-X!o`>g!8MX7%Wijs&mIYpp zVJp6$bk)kp34*sa6Vu3^w<+-MNC~l6^2*q8VH$u3bAWZ>lb#cpR*3KO=Y-I>5g(?> zI0KDK{xHpt@5h(W7K4`ixe#+u@`w3a@vQ{Smd8Huyj})c5aqDHW*(@K$YU-c^~nBD ZqtwC6{hW`3fP=u-KtLub?#GFM=zqYWQ&<21 literal 12304 zcmeHMeQaCR6+h2T9Vet|sPacdC-sG7QtaiA465mwCiQcYLO$B!eozY9$1_ez+I6fQ zo#ZU41 zABo*IDI0@G!%uSVyXT&J?m54E&OP_Mln`|g$<0Ky+GuHQ3~dhuYc--o3(>yFXz9Lw zv|F2&?~A0|dyU$X#5!^jKOHHOjVfx0^hjjHmOZQ!X$nzlQbOA!$p+EktW1e@H|BdH z>+ms;DY5vch``s3K&mZlq+AagUg`>@+z%S{L~FiQ6|7kLT9+(Xthjyw9N)OkLht`V z175U$p?6?m*I(%U|E90Mu)9z8U3Q_hSh)2U+W!{La%<(H{qyg@&G7lE$LH7X%a4B5 z`hQvZU*5ubC5(ws?wp9^#>Jl8r1(v4TAa#d#ap>5lFZs2<5L))!uS+uli-<3xxG60 zrocDFwL8D_825RMX%eZ;^t)Ci zUBcf=NSl~Pj)Pxp*<;Xmd`8~)uDlXoRqDS_F7@@7QF&t%ZBr^}k9t4(0;@cA_m$bQ zuZUpaiYRM5UK%vc2y5-3&-3(#KjrJ4zn%~eHJ4*eWmi;j?D#fx{d?~7coW5yauUl| zd7_)1tF-J0PsjBpiYiRoYt!xmPKN!?cGq9CHtY`&)i=sCxOR-%j39OGQh`yB>Khc; z)8pA2i!Ix(j|g92@Ks;G1p88MpYh)Pq^qd0vw~WgCoJ#PtGlB5ap-AXOUfqlN zv5lREL>YWXQZX;1pA%~9vGs1yx&nFoXKrIY4!keyXR+>}L2^RzCLWq}Fj$KZT`{?KGlg$x_iKu?!yu#N3k^YFdKFIX}{;>kRJQaOS=D@ z&v<)S$9FSdV>@tQeyU0+g!TVsVQ0>#L~Di;)G}J?#r}IE)vx=ESX-CTuvRq)@~O%f zKzrF)ui*`!G0{3PFwy4gcb!hg>X#oJh)pGRA=H5Dw6DK%y1l>pZ6D?*@ZF8Hszhrb z3|m~}&q!1js9)~uCo131V>KA_cm6fL0ydz}kCgD-*pF_Zks!zD$kBB4328_n6}G5R zZQ_^df%ipvCRX2tZQc;$IHVj+4{JP!=#|`P>2Qo=_~Oao9&aeBEgl=zUYj!4)g)s6 zy2j&(QJLPFa&dn;s2>JyOC_y2f4#>|?1mJkvwW7xvRNj}X1RN8xo%tT_f5H{rb@V< zU8S!v_X9GZ+gvhQY#EDe88xPiP=-w1{P4-akO%L4j>Nj_jOtt-hI7PX71lFt(SY4h zR6D1QuOu2-**!6f-)L!i02uc&rq7!GCkJ@ml4)NgcInUYQJfR!Fn5}KTWN_eLKM9G z#|wox{ua6s{U~GM=+_Uz-YN%2d+ip;euNUuOh0yOQe&CBY&i~YJ*FHn^$1Z5V|VOU zZ-ms~41TOvK7K62*@LSYOQ##+m%SdpvzO<_FF#KNuqNjT#_?)Ap&W`Og)`!1_{r6{9GFJD 
z5+xMoBkFuMjsDSe65raBG1gZC?F48SPbPcD7AG|q`UlW=qmAK=DnYv$?IN@*(0b4! z{{>3X9+|YTXImTRvafYEFS0F8UIZ?jBPN}E&4&UWDZzheHg|LEco5^xIdJ|{(Y<2n zNRZT_mF8KvIUVKP&i>5#{ltNEFYnuQIvquB??sNs`1#xmpNVdk)`pgy+;4CZT9`) zQJu1Ny5$d$;j&~jm@gu4(g_Td#v$Ag@g zW1C^SIs$CO0NZ}zfavwWpQnKn|1RAtMI4CZGN_&^Ha*T5%(vynPc2P2M#~Peapgu?RL(-nol?$EF5@03UIjQ zEM$8LwmN<@{PBP1H%9h)zuEh4AqM}c-;j5HzhT_g+H1_^H&lLezZv`Jf9E$P^Z3mJ z@K4UejCuY!o$ojIO%!5wGrw7D^WV&G3URox-;7`SVt#{oGVe;+-F=(+pHRXFA8D6Q?^r zob@{!6e)2{d^))B*$p1tA5YKJDnzUHP__)U)gIh0APR*Fcdh?sM6c`wk=JzSG!+xu8vc7k&bIriMZov;NnN z_sRZcanzK9D%UqEY8$qba~I^A^`;8Fth*&?%+moLC2{+HALv~7aD8(`vT7b44`X~W z#!{|F^|ff3wgt4yA297_dcdY>pgo;$msEJxvpt;aUo-!QPFg?o7h&`7&Qdxw# z+~>S|C7M(|22R}P`taO|mai zgSt;9U%!ePZj60sAn)1Q7xem3PZ&y|fs$Tfgg>_M#_hYV@wdS^Fs<7UL zyZ!jTtTmVY@WQ>@&2J0fnR#v$>(#gaDISB*@cwWuucF4Nfd8-`Z!*6v#Gho}eFpne zP|L$_xpsy6%CQ#{E<*2bMBv66fe8CKDt{*8xC#UxVW6??oj6%?#u0=4Gfyx zzOi#7-gt~3*keA@FE&s~EF>+fkffS zLA8w`6~FLBA@~D?&DU1h1itn{zCH}^1m8~PTMc>7yADb1Cg;5@FzpGM@}9!*i{OK? zvwU}(d|`)=b$$`Q3K)-%A?73F?-~4(plx9auW?9PZO-hlRBi%!8ow98GwJYz4tRdO z=wQinZdM3-w^oLG@txt#^!Ydb@wz9(Hr6A)c6=a2JK7oDyG%6jNuqT|_ca)Q@ Date: Thu, 7 May 2020 12:08:43 +1000 Subject: [PATCH 19/26] tests/privileged: Update for instruction translation Since setting MSR[PR] = 1 forces instruction translation on, we need to set up translations for the problem state code to use. 
Signed-off-by: Paul Mackerras --- tests/privileged/privileged.c | 91 ++++++++++++++++++++++++++++++++++ tests/test_privileged.bin | Bin 9900 -> 16400 bytes 2 files changed, 91 insertions(+) diff --git a/tests/privileged/privileged.c b/tests/privileged/privileged.c index 073dc07..eca6e0e 100644 --- a/tests/privileged/privileged.c +++ b/tests/privileged/privileged.c @@ -55,6 +55,94 @@ void print_test_number(int i) putchar(':'); } +static inline void do_tlbie(unsigned long rb, unsigned long rs) +{ + __asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory"); +} + +static inline void store_pte(unsigned long *p, unsigned long pte) +{ + __asm__ volatile("stdbrx %1,0,%0" : : "r" (p), "r" (pte) : "memory"); +} + +#define CACHE_LINE_SIZE 64 + +void zero_memory(void *ptr, unsigned long nbytes) +{ + unsigned long nb, i, nl; /* nb: bytes cleared in this pass; nl: whole cache lines in this pass */ + void *p; + + for (; nbytes != 0; nbytes -= nb, ptr += nb) { + nb = -((unsigned long)ptr) & (CACHE_LINE_SIZE - 1); /* bytes up to the next cache-line boundary, 0 if aligned */ + if (nb == 0 && nbytes >= CACHE_LINE_SIZE) { + nl = nbytes / CACHE_LINE_SIZE; + p = ptr; + for (i = 0; i < nl; ++i) { + __asm__ volatile("dcbz 0,%0" : : "r" (p) : "memory"); + p += CACHE_LINE_SIZE; + } + nb = nl * CACHE_LINE_SIZE; + } else { + if (nb == 0 || nb > nbytes) /* nb == 0: aligned tail shorter than a line; without this the loop never advances */ + nb = nbytes; + for (i = 0; i < nb; ++i) + ((unsigned char *)ptr)[i] = 0; + } + } +} + +#define PERM_EX 0x001 +#define PERM_WR 0x002 +#define PERM_RD 0x004 +#define PERM_PRIV 0x008 +#define ATTR_NC 0x020 +#define CHG 0x080 +#define REF 0x100 + +#define DFLT_PERM (PERM_WR | PERM_RD | REF | CHG) + +/* + * Set up an MMU translation tree using memory starting at the 64k point. + * We use 2 levels, mapping 2GB (the minimum size possible), with a + * 8kB PGD level pointing to 4kB PTE pages.
+ */ +unsigned long *pgdir = (unsigned long *) 0x10000; +unsigned long free_ptr = 0x12000; + +void init_mmu(void) +{ + zero_memory(pgdir, 1024 * sizeof(unsigned long)); + /* RTS = 0 (2GB address space), RPDS = 10 (1024-entry top level) */ + mtspr(720, (unsigned long) pgdir | 10); + do_tlbie(0xc00, 0); /* invalidate all TLB entries */ +} + +static unsigned long *read_pgd(unsigned long i) +{ + unsigned long ret; + + __asm__ volatile("ldbrx %0,%1,%2" : "=r" (ret) : "b" (pgdir), + "r" (i * sizeof(unsigned long))); + return (unsigned long *) (ret & 0x00ffffffffffff00); +} + +void map(unsigned long ea, unsigned long pa, unsigned long perm_attr) +{ + unsigned long epn = ea >> 12; + unsigned long i, j; + unsigned long *ptep; + + i = (epn >> 9) & 0x3ff; + j = epn & 0x1ff; + if (pgdir[i] == 0) { + zero_memory((void *)free_ptr, 512 * sizeof(unsigned long)); + store_pte(&pgdir[i], 0x8000000000000000 | free_ptr | 9); + free_ptr += 512 * sizeof(unsigned long); + } + ptep = read_pgd(i); + store_pte(&ptep[j], 0xc000000000000000 | (pa & 0x00fffffffffff000) | perm_attr); +} + int priv_fn_1(unsigned long x) { __asm__ volatile("attn"); @@ -140,6 +228,9 @@ void do_test(int num, int (*fn)(unsigned long)) int main(void) { potato_uart_init(); + init_mmu(); + map(0x2000, 0x2000, REF | CHG | PERM_RD | PERM_EX); /* map code page */ + map(0x7000, 0x7000, REF | CHG | PERM_RD | PERM_WR); /* map stack page */ do_test(1, priv_fn_1); do_test(2, priv_fn_2); diff --git a/tests/test_privileged.bin b/tests/test_privileged.bin index 5b8ce63ab5e843bf8ff40c84d36833ad52abd0ca..6eb6b536b11ab8f75b5663faf5dcc4fea6c94e0e 100755 GIT binary patch delta 1554 zcmZ8hT}&KR6h1Ss>r!!94bg2OF#AK@L1Hle%~CA0Kr7uX%^I=x!H1<_+8C{1d>CY= zI}mD0(g$$jlOS!Pq%ln+K@&|hX)BtT_|V4GV656!ngHtqF=!SRynbf@L%7M!-0wTz zIrpBQ-AnBwwlorxD{H1|f2wYr6`h6KVxTZDE)sFO#WLGM*RI%$l?^Y?ddx%+G{K>FKCxM21Zt{EZh6SrRR1L^}Ky%vqr~Uw)wo zTr$2agp_i=QiA*Wp`5jmYO+YNHJ^Qao-1%eg!1i(=(EEMyGZSvoUXU1RKbh&cz05_ 
z>M5kvtvioL+(a@g`+iNN@^7E+n1ZVjnJ|sVzBccEY#Oyoszs#MwIVsbjsmtuxaQ}7 zMT#-|lHH6=&wY2FudjrBo#lV2meG>5V(q#`Y!9e$d{?&gv1QeoYo6<$Yf+!y*{pOI^BISt`Ga_O3LG@Q(~ zuZxsX_#^u&lJ2X>nrMqI#PP1t>x&^N|&S^WJS;& zFFn3^G=f1?d-GyY_d&FHq|4bC4Y^JMj&iWp^1<%B9<4uC9{wh4pe0*#sV_B|rDzYg zCAPwS&7PG=?0X;i&SB$uyd+yD;$IM6t6dJ(VhBszUz>y<X3e`;#8hgXDP zIUOiT(1-GbbkJY_VB&PzPl?}u^CsNql}beCp*zPqEQL3?;HnVEm|))t>459m#-83p zJ?K_w4nuQgOOyD(Ggdra#Mj`CW`DZ-5%->7Z~cO+HUb-Pnq760d#k)-I6hTyPVEiv zp22aZAjFIt%CEq^+fS|Dg!6Pwb+%(o2p$k#TlF-+ab0;ar(=)o;JBU_!8PK`i*;^2 zx5bNr;{q*%OXK^0p<3N^TXy;BVYgKG!^QzFVh7iSh)s`{y&0H3U)Jj><7&BUTV-CI(6TtVX4W9tvJ!OQ7D=Q&W@f5_%F3 zse?zwQxPemcNM{l2QT6WcoAzr+S+0T4_lKs{wMCCWZ=R3{ojAy&YRhmrL0_PC0QZz zEdA*qFK9Nq>f%}bCfj2Cv7TpYm4DYm`6Wi@TH)ojr$s_3v0 zx>lE0&MWbtU$ss;rgWqIGg0Gj;+ZKS77Za@9}k4vCVJcNgl-q@m&hNK6k51*ALrdh z$on-qyB|otla_ljeyTxq`-r#iPvxX{t{iFH^#*t)anXGBM+i+?v2a2e3Q_=4rsH`HV5|Bq5a;aMZqmZ7aIONekiH4V258$rrB(V&nH8}8meA}=y`|x kQXx+U>1dZ9O@)I~NFQ~DcnAlb@o6Kag)2Tf?I{8O0m91$6951J From f54a65f8cf69597d1ebaf9b56f6df33a2e5917a4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 2 May 2020 14:23:14 +1000 Subject: [PATCH 20/26] Decode tlbiel as tlbie The Linux kernel contains tlbiel instructions, which we can treat identically to tlbie. 
Signed-off-by: Paul Mackerras --- decode1.vhdl | 1 + 1 file changed, 1 insertion(+) diff --git a/decode1.vhdl b/decode1.vhdl index 598e59c..90a5980 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -324,6 +324,7 @@ architecture behaviour of decode1 is 2#0001000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- td 2#0000000100# => (ALU, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- tw 2#0100110010# => (LDST, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbie + 2#0100010010# => (LDST, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbiel 2#0100111100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- xor others => illegal_inst ); From a658766fcf415bd40aa12cc26d34ec2a686188f8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 7 May 2020 20:02:21 +1000 Subject: [PATCH 21/26] Implement slbia as a dTLB/iTLB flush Slbia (with IH=7) is used in the Linux kernel to flush the ERATs (our iTLB/dTLB), so make it do that. This moves the logic to work out whether to flush a single entry or the whole TLB from dcache and icache into mmu. We now invalidate all dTLB and iTLB entries when the AP (actual pagesize) field of RB is non-zero on a tlbie[l], as well as when IS is non-zero. 
Signed-off-by: Paul Mackerras --- common.vhdl | 7 +++++-- dcache.vhdl | 16 +++++----------- decode1.vhdl | 1 + execute1.vhdl | 2 +- icache.vhdl | 19 +++---------------- loadstore1.vhdl | 3 ++- mmu.vhdl | 8 ++++++++ 7 files changed, 25 insertions(+), 31 deletions(-) diff --git a/common.vhdl b/common.vhdl index 79bc1bd..02f0d3f 100644 --- a/common.vhdl +++ b/common.vhdl @@ -227,6 +227,7 @@ package common is valid : std_ulogic; op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do nia : std_ulogic_vector(63 downto 0); + insn : std_ulogic_vector(31 downto 0); addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read @@ -242,12 +243,11 @@ package common is rc : std_ulogic; -- set for stcx. virt_mode : std_ulogic; -- do translation through TLB priv_mode : std_ulogic; -- privileged mode (MSR[PR] = 0) - spr_num : spr_num_t; -- SPR number for mfspr/mtspr end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0', - spr_num => 0, others => (others => '0')); + others => (others => '0')); type Loadstore1ToExecute1Type is record exception : std_ulogic; @@ -283,6 +283,7 @@ package common is type Loadstore1ToMmuType is record valid : std_ulogic; tlbie : std_ulogic; + slbia : std_ulogic; mtspr : std_ulogic; iside : std_ulogic; load : std_ulogic; @@ -305,6 +306,7 @@ package common is type MmuToDcacheType is record valid : std_ulogic; tlbie : std_ulogic; + doall : std_ulogic; tlbld : std_ulogic; addr : std_ulogic_vector(63 downto 0); pte : std_ulogic_vector(63 downto 0); @@ -320,6 +322,7 @@ package common is type MmuToIcacheType is record tlbld : std_ulogic; tlbie : std_ulogic; + doall : std_ulogic; addr : std_ulogic_vector(63 downto 0); pte : std_ulogic_vector(63 downto 0); end record; 
diff --git a/dcache.vhdl b/dcache.vhdl index b75d91f..a9b5c4a 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -210,6 +210,7 @@ architecture rtl of dcache is type reg_stage_0_t is record req : Loadstore1ToDcacheType; tlbie : std_ulogic; + doall : std_ulogic; tlbld : std_ulogic; mmu_req : std_ulogic; -- indicates source of request end record; @@ -456,11 +457,13 @@ begin r0.req.data <= m_in.pte; r0.req.byte_sel <= (others => '1'); r0.tlbie <= m_in.tlbie; + r0.doall <= m_in.doall; r0.tlbld <= m_in.tlbld; r0.mmu_req <= '1'; else r0.req <= d_in; r0.tlbie <= '0'; + r0.doall <= '0'; r0.tlbld <= '0'; r0.mmu_req <= '0'; end if; @@ -572,7 +575,6 @@ begin tlb_update : process(clk) variable tlbie : std_ulogic; - variable tlbia : std_ulogic; variable tlbwe : std_ulogic; variable repl_way : tlb_way_t; variable eatag : tlb_tag_t; @@ -580,17 +582,9 @@ begin variable pteset : tlb_way_ptes_t; begin if rising_edge(clk) then - tlbie := '0'; - tlbia := '0'; + tlbie := r0_valid and r0.tlbie; tlbwe := r0_valid and r0.tlbld; - if r0_valid = '1' and r0.tlbie = '1' then - if r0.req.addr(11 downto 10) /= "00" then - tlbia := '1'; - else - tlbie := '1'; - end if; - end if; - if rst = '1' or tlbia = '1' then + if rst = '1' or (tlbie = '1' and r0.doall = '1') then -- clear all valid bits at once for i in tlb_index_t loop dtlb_valids(i) <= (others => '0'); diff --git a/decode1.vhdl b/decode1.vhdl index 90a5980..cd17d1e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -282,6 +282,7 @@ architecture behaviour of decode1 is 2#0010111010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd 2#0010011010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw -- 2#0010000000# setb + 2#0111110010# => (LDST, OP_TLBIE, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- slbia 2#0000011011# => (ALU, 
OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- sld 2#0000011000# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- slw 2#1100011010# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- srad diff --git a/execute1.vhdl b/execute1.vhdl index 78361c2..688f93c 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1023,7 +1023,7 @@ begin lv.xerc := v.e.xerc; lv.reserve := e_in.reserve; lv.rc := e_in.rc; - lv.spr_num := decode_spr_num(e_in.insn); + lv.insn := e_in.insn; -- decode l*cix and st*cix instructions here if e_in.insn(31 downto 26) = "011111" and e_in.insn(10 downto 9) = "11" and e_in.insn(5 downto 1) = "10101" then diff --git a/icache.vhdl b/icache.vhdl index 7d7973d..86c2746 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -435,32 +435,19 @@ begin -- iTLB update itlb_update: process(clk) - variable tlbie : std_ulogic; - variable tlbia : std_ulogic; - variable tlbwe : std_ulogic; variable wr_index : tlb_index_t; begin if rising_edge(clk) then - tlbie := '0'; - tlbia := '0'; - tlbwe := m_in.tlbld; - if m_in.tlbie = '1' then - if m_in.addr(11 downto 10) /= "00" then - tlbia := '1'; - else - tlbie := '1'; - end if; - end if; wr_index := hash_ea(m_in.addr); - if rst = '1' or tlbia = '1' then + if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then -- clear all valid bits for i in tlb_index_t loop itlb_valids(i) <= '0'; end loop; - elsif tlbie = '1' then + elsif m_in.tlbie = '1' then -- clear entry regardless of hit or miss itlb_valids(wr_index) <= '0'; - elsif tlbwe = '1' then + elsif m_in.tlbld = '1' then itlb_tags(wr_index) <= m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS); itlb_ptes(wr_index) <= m_in.pte; itlb_valids(wr_index) <= '1'; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index b7b56d4..251f529 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -166,7 
+166,7 @@ begin mfspr := '0'; mmu_mtspr := '0'; itlb_fault := '0'; - sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10)); + sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); sprval := (others => '0'); -- avoid inferred latches exception := '0'; dsisr := (others => '0'); @@ -468,6 +468,7 @@ begin m_out.mtspr <= mmu_mtspr; m_out.sprn <= sprn(3 downto 0); m_out.addr <= addr; + m_out.slbia <= l_in.insn(7); m_out.rs <= l_in.data; -- Update outputs to writeback diff --git a/mmu.vhdl b/mmu.vhdl index e770d99..8415443 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -168,6 +168,7 @@ begin variable tlb_load : std_ulogic; variable itlb_load : std_ulogic; variable tlbie_req : std_ulogic; + variable inval_all : std_ulogic; variable rts : unsigned(5 downto 0); variable mbits : unsigned(5 downto 0); variable pgtable_addr : std_ulogic_vector(63 downto 0); @@ -191,6 +192,7 @@ begin tlb_load := '0'; itlb_load := '0'; tlbie_req := '0'; + inval_all := '0'; -- Radix tree data structures in memory are big-endian, -- so we need to byte-swap them @@ -217,6 +219,10 @@ begin if l_in.tlbie = '1' then dcreq := '1'; tlbie_req := '1'; + -- Invalidate all iTLB/dTLB entries for tlbie with + -- RB[IS] != 0 or RB[AP] != 0, or for slbia + inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or + l_in.addr(7) or l_in.addr(6) or l_in.addr(5); v.state := TLB_WAIT; else v.valid := '1'; @@ -356,12 +362,14 @@ begin d_out.valid <= dcreq; d_out.tlbie <= tlbie_req; + d_out.doall <= inval_all; d_out.tlbld <= tlb_load; d_out.addr <= addr; d_out.pte <= tlb_data; i_out.tlbld <= itlb_load; i_out.tlbie <= tlbie_req; + i_out.doall <= inval_all; i_out.addr <= addr; i_out.pte <= tlb_data; From 8ff8b2f25649d6660eca01b33da9add4bb929123 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 4 May 2020 08:57:05 +1000 Subject: [PATCH 22/26] tests/mmu: Add a test for dcbz with translation on Signed-off-by: Paul Mackerras --- tests/mmu/head.S | 14 ++++++++++++++ tests/mmu/mmu.c | 21 
+++++++++++++++++++++ tests/test_mmu.bin | Bin 20496 -> 20496 bytes tests/test_mmu.console_out | 1 + 4 files changed, 36 insertions(+) diff --git a/tests/mmu/head.S b/tests/mmu/head.S index 083b1c5..824ad67 100644 --- a/tests/mmu/head.S +++ b/tests/mmu/head.S @@ -80,6 +80,20 @@ test_write: mtmsrd %r9,0 blr + /* Do a dcbz with translation on */ + .globl test_dcbz +test_dcbz: + mfmsr %r9 + ori %r8,%r9,0x10 /* set MSR_DR */ + mtmsrd %r8,0 + mr %r6,%r3 + li %r3,0 + dcbz 0,%r6 + li %r3,1 + /* land here if DSI occurred */ + mtmsrd %r9,0 + blr + .globl test_exec test_exec: mtsrr0 %r4 diff --git a/tests/mmu/mmu.c b/tests/mmu/mmu.c index a44c79d..994ffe3 100644 --- a/tests/mmu/mmu.c +++ b/tests/mmu/mmu.c @@ -9,6 +9,7 @@ extern int test_read(long *addr, long *ret, long init); extern int test_write(long *addr, long val); +extern int test_dcbz(long *addr); extern int test_exec(int testno, unsigned long pc, unsigned long msr); static inline void do_tlbie(unsigned long rb, unsigned long rs) @@ -579,6 +580,25 @@ int mmu_test_17(void) return 0; } +int mmu_test_18(void) +{ + long *mem = (long *) 0x8000; + long *ptr = (long *) 0x124000; + long *ptr2 = (long *) 0x1124000; + + /* create PTE */ + map(ptr, mem, DFLT_PERM); + /* this should succeed and be a cache miss */ + if (!test_dcbz(&ptr[129])) + return 1; + /* create a second PTE */ + map(ptr2, mem, DFLT_PERM); + /* this should succeed and be a cache hit */ + if (!test_dcbz(&ptr2[130])) + return 2; + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -633,6 +653,7 @@ int main(void) do_test(15, mmu_test_15); do_test(16, mmu_test_16); do_test(17, mmu_test_17); + do_test(18, mmu_test_18); return fail; } diff --git a/tests/test_mmu.bin b/tests/test_mmu.bin index afae999223bcb15bede8df58d6abc88bc7ea06dc..416ae706ac754e79daab704110bf8e4c8379c8f6 100755 GIT binary patch delta 1155 zcmZ9Ke@t6d6vywmbO@awvn^IOpunILx>*>52W$IUDD(Bl2xi2kY}00)3I4H#uxX4b zKZ8c&Ff_*{%Q6+rSh7EKvXaFm;y|-xGn;_~7d4x4#zYp1c5@nZ%dY3P^$$IHC->gZ 
z`JQvnz4wic!x()>czs*!l^0@D23?|&MW-$s`MIp4o4L`9@7SB67O&Xf&JI4oG*cmV z)piwMFfolW7JeYqrUxR|HvNd&cU26#?W|0>iVL+*x0_#$CD-yA(2*R7+F3-AtSQ5qSyL;&W|#IMidYX62v! zf;+SlFyf1vmCz`m=n|gSs!a1sLL8+Blx87MkZX8HtKiYwSm>(*4ShZI; zC)-4RgxDB{0@liLnvMJ^#E&xMOJ#Ten=O^I6!+p}z`7tob03>Kj zpE&;AlZ9!Q$(z)imRKh{YsLCis;G^y(*H{wvx016%p*1+EQT(E~f LW|7-SOOO8mjc|aI delta 1006 zcmZ9LT})G15Xa9P6fA;Puxda`TUIJi+=>bpq`fT#Y(AUQMDtmv~dwn*EqX~ZaE1|oP?_Y`M>}8OOBlh1D2Uaj|vjH2{H)QDy%*z;S z{36uSFG6PWvo$YpO58r(N2SuuZ}f{)8S!Gh@6 z=EVSd954R~BDl#>0tfN1;~4)ij18(4b1`q4-o|=zArwt16vhy0W<8F03 zMjAxp2BxSe&xCmi8(Q%`2uHieyXOI?U`Unhlj zi609(NGn+vv57c48qAw#VPaiE7~_H-N=D*>lR@3Qpr6IxB>1SX#UTTHk-i>WQU%5~5s+$fT}?d?RkHNrzq7TvGtSXm3rc P{Er_lHOZ@_&8^=6h`vx~ diff --git a/tests/test_mmu.console_out b/tests/test_mmu.console_out index a8e2dcb..a5c08ea 100644 --- a/tests/test_mmu.console_out +++ b/tests/test_mmu.console_out @@ -15,3 +15,4 @@ test 14:PASS test 15:PASS test 16:PASS test 17:PASS +test 18:PASS From f3c6119cf63d21671bbad9bdfb983316b0336dd8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 7 May 2020 20:12:46 +1000 Subject: [PATCH 23/26] tests/mmu: Add a test of PTE refetching on permission error Signed-off-by: Paul Mackerras --- tests/mmu/mmu.c | 28 +++++++++++++++++++++++++--- tests/test_mmu.bin | Bin 20496 -> 20496 bytes tests/test_mmu.console_out | 1 + 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tests/mmu/mmu.c b/tests/mmu/mmu.c index 994ffe3..8281b04 100644 --- a/tests/mmu/mmu.c +++ b/tests/mmu/mmu.c @@ -145,8 +145,6 @@ void map(void *ea, void *pa, unsigned long perm_attr) free_ptr += 512 * sizeof(unsigned long); } ptep = read_pgd(i); - if (ptep[j]) - do_tlbie(((unsigned long)ea & ~0xfff), 0); store_pte(&ptep[j], 0xc000000000000000 | ((unsigned long)pa & 0x00fffffffffff000) | perm_attr); eas_mapped[neas_mapped++] = ea; } @@ -569,6 +567,7 @@ int mmu_test_17(void) if (mfspr(SRR0) != (long) ptr || mfspr(SRR1) != 0x00040020) return 2; /* create a PTE without ref or execute permission */ + 
unmap((void *)ptr); map((void *)ptr, (void *)mem, 0); /* this should fail */ if (test_exec(2, ptr, MSR_IR)) @@ -599,6 +598,28 @@ int mmu_test_18(void) return 0; } +int mmu_test_19(void) +{ + long *mem = (long *) 0x8000; + long *ptr = (long *) 0x124000; + + *mem = 0x123456789abcdef0; + /* create PTE with read but not write permission */ + map(ptr, mem, REF | PERM_RD); + /* this should fail and create a TLB entry */ + if (test_write(ptr, 0xdeadbeef0dd1)) + return 1; + /* DAR and DSISR should be set correctly */ + if (mfspr(DAR) != (long)ptr || mfspr(DSISR) != 0x0a000000) + return 2; + /* Update the PTE to have write permission */ + map(ptr, mem, REF | CHG | PERM_RD | PERM_WR); + /* this should succeed */ + if (!test_write(ptr, 0xdeadbeef0dd1)) + return 3; + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -616,7 +637,7 @@ void do_test(int num, int (*test)(void)) fail = 1; print_string("FAIL "); putchar(ret + '0'); - if (num <= 10) { + if (num <= 10 || num == 19) { print_string(" DAR="); print_hex(mfspr(DAR)); print_string(" DSISR="); @@ -654,6 +675,7 @@ int main(void) do_test(16, mmu_test_16); do_test(17, mmu_test_17); do_test(18, mmu_test_18); + do_test(19, mmu_test_19); return fail; } diff --git a/tests/test_mmu.bin b/tests/test_mmu.bin index 416ae706ac754e79daab704110bf8e4c8379c8f6..a1861b22dbbdfc857e7d512d9a924efe9a997bbb 100755 GIT binary patch delta 2333 zcmZWpeNa@_6~FhntbnjC;NnIJEUX|)Ff3+)x3bH=-G#XDz=0`AjGeYiS0d7=7)fWh zKHQagN&nFd^dz06W0Z7)lg>1bPV7uGleES`Cpsjf9n-O`;73WsLKQSItEjiq#YrL}=Nfcw4GD{Geb3#lUkdA1Uok|Hf(t$?G(VGCkclAKyR-cW`j0N( zr9|xql;~5RDrM&#eOHDR*zwHs#es>P6G2BzIIc!l6ztm>ol+IXB)4$f5i`xy$Cg}i z^p&=N(N!jNSzxtm9lZ>@T@`c|j<{-Q46eEC^exD&3eq-ctEvz@OSJ6vRY~+U7^_-I zPe4j_jivZLledzD$4tYXLs@G0{NYGA25r@{ka8OKS3fPxWTV|WOT#@TyGhs+o%=d& zBfG`>EE3)hl{KgX_hTC)lNi^XlitQmeQ)>-!6WpXyRxphLbAyOp#Fy4atvACT>VA!O zmA)LylPGsSln<$40lj|=FZz6zQ@0+Mbi#+eKgPk!g<7cMf$tE^0ksKW)WZ$4{)X3F+Bwg~7R)^A#_;n1%gV;z%a+Mwc_me!pqt0Wxb3*Lp 
zsc5%=T&k7ul|<+M9dCtGzC8Fx^&;r;@572_1`4owR|R}D3$_G)lyTljLIz@U&)|!M zGu2j@2w3r|{6T)XS-}d+{TobnJuLr#8m3tOci_3;HaP4xK+Tj2k={-O+$##M?WBa&mW8^N7(Tes`9$-; zRbMK+GYvnjFN&Aqr->H~KTng`1yk4sQwJBMD(_D*nPPGH%`x_Kj}!G0iK4;1$UDg{ z?GtrMn2(iC3d%FcZYc*>5p&`CBIM?ExJ)Fa!*V_enVM&Go6n&Jk8bnJ-o6ctcoWd%wt}O(v7P4Cmk<9JhELChjnb= zmFiT*w{)Ds=`vxJxt zH*hI6@cQ|%;he@PO~UURi;TUKj7<_a-MB1w zlZk{bj*ngV*FTN^6{t~92`Rk>chOJlf1s!d-AzGLQ%R9wn%m%LGTE~HyShyzVDuv_ z$9VybMyKI=Q<*VCAmq${Azv;^=MoX&;p-@QS~z+G*)rkH8EBI8g}hnlmWzauS$G4H z=O!GIm&NnYB5wg9$(xJ~;a-pRf{6sRSfxmLP#(R-m;qywwP}8`UpPP;O%k4|D;*DY zaop%%gd#J}F4S%Pma$gc^TF*EoDQT3{Yy8g&ftYV*o8s{u>;J!4iFZnj(M5jm5L_43#ans1>M#+vQ4 z2WU%9+naY7<5lB>pI^fH#~*O*$c-T9!Icj!Ph6*w6E#N9lgps9KaiW65H}iG&pY|G~?u5g+-}is#f1T6Q z+9tK)-x6vq-}O(YKO5HBIb_k=rbPk3D3p8n3zao*yx zkdf?oGR(=(s0$2V5#;=RM?ASyloIeWJ(2oO@{xVnoXck(KMPw(n8g#GUZV9+saD<9 z_J)HN;%9=hshpko*ckO&xUWryn(>!4jYvUb;V`I~piPfK(6?mylwIgw#0_L_4hBZ= z3!2ZGLYKFNVDW@^ic3vRS?T+i)5E5wq#JP|bC)SZh$&A`THKwQW6KF~*%~Tcp-xZd za?Xu5dRPert|&oWZ&p)oIoz6HrrsnNX04Hq!EEa|=^U)L7E1MS(wZrS;J&p~vVkps z98DdJ)z?Y$;7I-`X*=}f7a9_ta(O;!cIYLaV@IOmJG;~Gs|Q{4wl*GNY!xFmMP7KIQuxP z!e*B#(ZR>|0=gz4;wHcEbvN~pSmc+oVQEn*-7^q=ESf_T9>U+Y3~Krr;_YLlkA?YT)ZsCh9&mX)mTe?$jJOmZUKe%*Migfz-@cC)>1E66pTO?&Ku4+O3ulTUgbK^^ zy={t*;@+Wftk`KV{f`B`mJFx0@nA3U7{0&vdf2yMP01&wp=hySXEYKfWNc;x&mktl zsi~-ewt(B|ly<>N=U`fW2+~T1(X+AW+l<+zhHZDbY<-?uYc#df2e~YM4uoVzE|i4n zio4nnOT6v$QP@h1#iZ3#PrS}x18H%!)1r`uxQdOWvYynMg0;vI(I$N}3p~COK7EJF zqHTf4oncY>(ii>*@Sv$3dNFVGdNse2^Tiim^42PVBx~aHRoPKPX-*@f+Xk9tF zY(e4G(<8O*0Cu?qg6@q{F%0#jOVh#TaY`eg$@4Zn9|OO6?1t*zmkSidT!#^*;|y{C zW7!7vr3;|L9S?`g%+glqDI21x#|sa}iK)<0pn=@-T)MgkCYNV~jra_>ur2r$boKH> zH~aWsH{1Kg2G@0SSt3T8XrXvAa#7dZoUvvl@)m-Z8lQt*H2UJilk_}BR=S1Gg1YIb zX{U>Wn{X8zL#`siEed|Ba<#~<;ygVfa3L^71j~Z6Re={dLpKbYXbw-6qH(tL2!AaO zQ9bb~?8LJu7Oi$QC~Uuq`yt-DP+{9utU=s5Lt&d#9E14$2MSxQVlCpeGZhw6aeu_Y zSqhsM#bHM*5<6!rY`Q9lL%eT}!m3pqkGS_kg%zuK0OGBf;20GTMEqo~!bYe#0rA;I z3QJJ2PUz<;>?t0SC~y$s#`y~CiQv&K4m}dF3lw%i6(l0=LVQBSc)i#!jo5+Rp-Jya 
zQU{##PJ+~mz0`UQ&Q#>ko!8)5MH)@&g0#vUYVCrFxCFakeq~N_M;GUx;W3M0#MzD0 zdmX;3%qR$6=bT80%j4X{y+Qj`%tpM^`&fN4yvRym0=H-?i>6l$D{@ zdjqJPLmO{`8JD%)Fd3Ju9#|%)(e57DAdjWJx8PUVOgsKTWCmlEhMww0Cc~VHKGsq~ z_T1+D8~i2}IwMA^(~(lG8=(>B9<)2|z@n-Q+Ia`QtV#<{yUY0{w2(@j1=X9S8Sy&e zxu|syi0(-3M@Y)OS2Qwe+)<5)>O##a)U-wYh^X&AkE%z`hxRG=Ijx57ZdZtYim;aqj5Aw@&TtUk^gB3+JDORFHU zCbM|mznn7>3Wvxb59dDQ#43j1TsTj?!c9T08y40KHY7cI;X&jO#K~ZN&FGkEXvy6@ PQa_gtGK#=rMA!cST}J$J diff --git a/tests/test_mmu.console_out b/tests/test_mmu.console_out index a5c08ea..cb4ad85 100644 --- a/tests/test_mmu.console_out +++ b/tests/test_mmu.console_out @@ -16,3 +16,4 @@ test 15:PASS test 16:PASS test 17:PASS test 18:PASS +test 19:PASS From 2843c99a71ad4b88d8d722bb7bae7d4979b6083c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 24 Apr 2020 10:58:56 +1000 Subject: [PATCH 24/26] MMU: Implement reading of the process table This adds the PID register and repurposes SPR 720 as the PRTBL register, which points to the base of the process table. There doesn't seem to be any point to implementing the partition table given that we don't have hypervisor mode. The MMU caches entry 0 of the process table internally (in pgtbl3) plus the entry indexed by the value in the PID register (pgtbl0). Both caches are invalidated by a tlbie[l] with RIC=2 or by a move to PRTBL. The pgtbl0 cache is invalidated by a move to PID. The dTLB and iTLB are cleared by a move to either PRTBL or PID. Which of the two page table root pointers is used (pgtbl0 or pgtbl3) depends on the MSB of the address being translated. Since the segment checking ensures that address(63) = address(62), this is sufficient to map quadrants 0 and 3. 
Signed-off-by: Paul Mackerras --- common.vhdl | 5 +- decode1.vhdl | 2 +- loadstore1.vhdl | 10 +-- mmu.vhdl | 114 +++++++++++++++++++++++++++++++--- tests/mmu/mmu.c | 11 +++- tests/privileged/privileged.c | 19 +++--- tests/test_mmu.bin | Bin 20496 -> 20504 bytes tests/test_privileged.bin | Bin 16400 -> 16408 bytes 8 files changed, 134 insertions(+), 27 deletions(-) diff --git a/common.vhdl b/common.vhdl index 02f0d3f..aaf176d 100644 --- a/common.vhdl +++ b/common.vhdl @@ -39,7 +39,8 @@ package common is constant SPR_SPRG3U : spr_num_t := 259; constant SPR_HSPRG0 : spr_num_t := 304; constant SPR_HSPRG1 : spr_num_t := 305; - constant SPR_PGTBL0 : spr_num_t := 720; + constant SPR_PID : spr_num_t := 48; + constant SPR_PRTBL : spr_num_t := 720; -- GPR indices in the register file (GPR only) subtype gpr_index_t is std_ulogic_vector(4 downto 0); @@ -288,7 +289,7 @@ package common is iside : std_ulogic; load : std_ulogic; priv : std_ulogic; - sprn : std_ulogic_vector(3 downto 0); + sprn : std_ulogic_vector(9 downto 0); addr : std_ulogic_vector(63 downto 0); rs : std_ulogic_vector(63 downto 0); end record; diff --git a/decode1.vhdl b/decode1.vhdl index cd17d1e..4cd195f 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -449,7 +449,7 @@ begin v.decode.sgl_pipe := '1'; -- send MMU-related SPRs to loadstore1 case sprn is - when SPR_DAR | SPR_DSISR | SPR_PGTBL0 => + when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL => v.decode.unit := LDST; when others => end case; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 251f529..e71ad74 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -255,7 +255,7 @@ begin mfspr := '1'; -- partial decode on SPR number should be adequate given -- the restricted set that get sent down this path - if sprn(9) = '0' then + if sprn(9) = '0' and sprn(5) = '0' then if sprn(0) = '0' then sprval := x"00000000" & r.dsisr; else @@ -266,16 +266,18 @@ begin sprval := m_in.sprval; end if; when OP_MTSPR => - done := '1'; - if sprn(9) = '0' then + if sprn(9) = '0' 
and sprn(5) = '0' then if sprn(0) = '0' then v.dsisr := l_in.data(31 downto 0); else v.dar := l_in.data; end if; + done := '1'; else -- writing one of the SPRs in the MMU mmu_mtspr := '1'; + stall := '1'; + v.state := TLBIE_WAIT; end if; when OP_FETCH_FAILED => -- send it to the MMU to do the radix walk @@ -466,7 +468,7 @@ begin m_out.priv <= r.priv_mode; m_out.tlbie <= v.tlbie; m_out.mtspr <= mmu_mtspr; - m_out.sprn <= sprn(3 downto 0); + m_out.sprn <= sprn; m_out.addr <= addr; m_out.slbia <= l_in.insn(7); m_out.rs <= l_in.data; diff --git a/mmu.vhdl b/mmu.vhdl index 8415443..0eefbab 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -28,6 +28,8 @@ architecture behave of mmu is type state_t is (IDLE, TLB_WAIT, + PROC_TBL_READ, + PROC_TBL_WAIT, SEGMENT_CHECK, RADIX_LOOKUP, RADIX_READ_WAIT, @@ -42,9 +44,15 @@ architecture behave of mmu is store : std_ulogic; priv : std_ulogic; addr : std_ulogic_vector(63 downto 0); + -- config SPRs + prtbl : std_ulogic_vector(63 downto 0); + pid : std_ulogic_vector(31 downto 0); -- internal state state : state_t; pgtbl0 : std_ulogic_vector(63 downto 0); + pt0_valid : std_ulogic; + pgtbl3 : std_ulogic_vector(63 downto 0); + pt3_valid : std_ulogic; shift : unsigned(5 downto 0); mask_size : unsigned(4 downto 0); pgbase : std_ulogic_vector(55 downto 0); @@ -64,8 +72,8 @@ architecture behave of mmu is begin -- Multiplex internal SPR values back to loadstore1, selected - -- by l_in.sprn. Easy when there's only one... - l_out.sprval <= r.pgtbl0; + -- by l_in.sprn. 
+ l_out.sprval <= r.prtbl when l_in.sprn(9) = '1' else x"00000000" & r.pid; mmu_0: process(clk) begin @@ -73,7 +81,9 @@ begin if rst = '1' then r.state <= IDLE; r.valid <= '0'; - r.pgtbl0 <= (others => '0'); + r.pt0_valid <= '0'; + r.pt3_valid <= '0'; + r.prtbl <= (others => '0'); else if rin.valid = '1' then report "MMU got tlb miss for " & to_hstring(rin.addr); @@ -169,12 +179,17 @@ begin variable itlb_load : std_ulogic; variable tlbie_req : std_ulogic; variable inval_all : std_ulogic; + variable prtbl_rd : std_ulogic; + variable pt_valid : std_ulogic; + variable effpid : std_ulogic_vector(31 downto 0); + variable prtable_addr : std_ulogic_vector(63 downto 0); variable rts : unsigned(5 downto 0); variable mbits : unsigned(5 downto 0); variable pgtable_addr : std_ulogic_vector(63 downto 0); variable pte : std_ulogic_vector(63 downto 0); variable tlb_data : std_ulogic_vector(63 downto 0); variable nonzero : std_ulogic; + variable pgtbl : std_ulogic_vector(63 downto 0); variable perm_ok : std_ulogic; variable rc_ok : std_ulogic; variable addr : std_ulogic_vector(63 downto 0); @@ -193,6 +208,7 @@ begin itlb_load := '0'; tlbie_req := '0'; inval_all := '0'; + prtbl_rd := '0'; -- Radix tree data structures in memory are big-endian, -- so we need to byte-swap them @@ -202,14 +218,21 @@ begin case r.state is when IDLE => + if l_in.addr(63) = '0' then + pgtbl := r.pgtbl0; + pt_valid := r.pt0_valid; + else + pgtbl := r.pgtbl3; + pt_valid := r.pt3_valid; + end if; -- rts == radix tree size, # address bits being translated - rts := unsigned('0' & r.pgtbl0(62 downto 61) & r.pgtbl0(7 downto 5)); + rts := unsigned('0' & pgtbl(62 downto 61) & pgtbl(7 downto 5)); -- mbits == # address bits to index top level of tree - mbits := unsigned('0' & r.pgtbl0(4 downto 0)); + mbits := unsigned('0' & pgtbl(4 downto 0)); -- set v.shift to rts so that we can use finalmask for the segment check v.shift := rts; v.mask_size := mbits(4 downto 0); - v.pgbase := r.pgtbl0(55 downto 8) & x"00"; + 
v.pgbase := pgtbl(55 downto 8) & x"00"; if l_in.valid = '1' then v.addr := l_in.addr; @@ -223,11 +246,23 @@ begin -- RB[IS] != 0 or RB[AP] != 0, or for slbia inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or l_in.addr(7) or l_in.addr(6) or l_in.addr(5); + -- The RIC field of the tlbie instruction comes across on the + -- sprn bus as bits 2--3. RIC=2 flushes process table caches. + if l_in.sprn(3) = '1' then + v.pt0_valid := '0'; + v.pt3_valid := '0'; + end if; v.state := TLB_WAIT; else v.valid := '1'; - -- Use RPDS = 0 to disable radix tree walks - if mbits = 0 then + if pt_valid = '0' then + -- need to fetch process table entry + -- set v.shift so we can use finalmask for generating + -- the process table entry address + v.shift := unsigned('0' & r.prtbl(4 downto 0)); + v.state := PROC_TBL_READ; + elsif mbits = 0 then + -- Use RPDS = 0 to disable radix tree walks v.state := RADIX_ERROR; v.invalid := '1'; else @@ -236,7 +271,20 @@ begin end if; end if; if l_in.mtspr = '1' then - v.pgtbl0 := l_in.rs; + -- Move to PID needs to invalidate L1 TLBs and cached + -- pgtbl0 value. Move to PRTBL does that plus + -- invalidating the cached pgtbl3 value as well. 
+ if l_in.sprn(9) = '0' then + v.pid := l_in.rs(31 downto 0); + else + v.prtbl := l_in.rs; + v.pt3_valid := '0'; + end if; + v.pt0_valid := '0'; + dcreq := '1'; + tlbie_req := '1'; + inval_all := '1'; + v.state := TLB_WAIT; end if; when TLB_WAIT => @@ -245,6 +293,41 @@ begin v.state := IDLE; end if; + when PROC_TBL_READ => + dcreq := '1'; + prtbl_rd := '1'; + v.state := PROC_TBL_WAIT; + + when PROC_TBL_WAIT => + if d_in.done = '1' then + if d_in.err = '0' then + if r.addr(63) = '1' then + v.pgtbl3 := data; + v.pt3_valid := '1'; + else + v.pgtbl0 := data; + v.pt0_valid := '1'; + end if; + -- rts == radix tree size, # address bits being translated + rts := unsigned('0' & data(62 downto 61) & data(7 downto 5)); + -- mbits == # address bits to index top level of tree + mbits := unsigned('0' & data(4 downto 0)); + -- set v.shift to rts so that we can use finalmask for the segment check + v.shift := rts; + v.mask_size := mbits(4 downto 0); + v.pgbase := data(55 downto 8) & x"00"; + if mbits = 0 then + v.state := RADIX_ERROR; + v.invalid := '1'; + else + v.state := SEGMENT_CHECK; + end if; + else + v.state := RADIX_ERROR; + v.badtree := '1'; + end if; + end if; + when SEGMENT_CHECK => mbits := '0' & r.mask_size; v.shift := r.shift + (31 - 12) - mbits; @@ -331,6 +414,16 @@ begin end case; + if r.addr(63) = '1' then + effpid := x"00000000"; + else + effpid := r.pid; + end if; + prtable_addr := x"00" & r.prtbl(55 downto 36) & + ((r.prtbl(35 downto 12) and not finalmask(23 downto 0)) or + (effpid(31 downto 8) and finalmask(23 downto 0))) & + effpid(7 downto 0) & "0000"; + pgtable_addr := x"00" & r.pgbase(55 downto 19) & ((r.pgbase(18 downto 3) and not mask) or (addrsh and mask)) & "000"; @@ -348,6 +441,9 @@ begin elsif tlb_load = '1' then addr := r.addr(63 downto 12) & x"000"; tlb_data := pte; + elsif prtbl_rd = '1' then + addr := prtable_addr; + tlb_data := (others => '0'); else addr := pgtable_addr; tlb_data := (others => '0'); diff --git a/tests/mmu/mmu.c b/tests/mmu/mmu.c 
index 8281b04..a5d086b 100644 --- a/tests/mmu/mmu.c +++ b/tests/mmu/mmu.c @@ -21,6 +21,8 @@ static inline void do_tlbie(unsigned long rb, unsigned long rs) #define DAR 19 #define SRR0 26 #define SRR1 27 +#define PID 48 +#define PRTBL 720 static inline unsigned long mfspr(int sprnum) { @@ -110,15 +112,20 @@ void zero_memory(void *ptr, unsigned long nbytes) * 8kB PGD level pointing to 4kB PTE pages. */ unsigned long *pgdir = (unsigned long *) 0x10000; -unsigned long free_ptr = 0x12000; +unsigned long *proc_tbl = (unsigned long *) 0x12000; +unsigned long free_ptr = 0x13000; void *eas_mapped[4]; int neas_mapped; void init_mmu(void) { + /* set up process table */ + zero_memory(proc_tbl, 512 * sizeof(unsigned long)); + mtspr(PRTBL, (unsigned long)proc_tbl); + mtspr(PID, 1); zero_memory(pgdir, 1024 * sizeof(unsigned long)); /* RTS = 0 (2GB address space), RPDS = 10 (1024-entry top level) */ - mtspr(720, (unsigned long) pgdir | 10); + store_pte(&proc_tbl[2 * 1], (unsigned long) pgdir | 10); do_tlbie(0xc00, 0); /* invalidate all TLB entries */ } diff --git a/tests/privileged/privileged.c b/tests/privileged/privileged.c index eca6e0e..98c037c 100644 --- a/tests/privileged/privileged.c +++ b/tests/privileged/privileged.c @@ -13,6 +13,8 @@ extern int call_with_msr(unsigned long arg, int (*fn)(unsigned long), unsigned l #define SRR0 26 #define SRR1 27 +#define PID 48 +#define PRTBL 720 static inline unsigned long mfspr(int sprnum) { @@ -55,11 +57,6 @@ void print_test_number(int i) putchar(':'); } -static inline void do_tlbie(unsigned long rb, unsigned long rs) -{ - __asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory"); -} - static inline void store_pte(unsigned long *p, unsigned long pte) { __asm__ volatile("stdbrx %1,0,%0" : : "r" (p), "r" (pte) : "memory"); @@ -107,14 +104,18 @@ void zero_memory(void *ptr, unsigned long nbytes) * 8kB PGD level pointing to 4kB PTE pages. 
*/ unsigned long *pgdir = (unsigned long *) 0x10000; -unsigned long free_ptr = 0x12000; +unsigned long *proc_tbl = (unsigned long *) 0x12000; +unsigned long free_ptr = 0x13000; void init_mmu(void) { - zero_memory(pgdir, 1024 * sizeof(unsigned long)); + /* set up process table */ + zero_memory(proc_tbl, 512 * sizeof(unsigned long)); /* RTS = 0 (2GB address space), RPDS = 10 (1024-entry top level) */ - mtspr(720, (unsigned long) pgdir | 10); - do_tlbie(0xc00, 0); /* invalidate all TLB entries */ + store_pte(&proc_tbl[2 * 1], (unsigned long) pgdir | 10); + mtspr(PRTBL, (unsigned long)proc_tbl); + mtspr(PID, 1); + zero_memory(pgdir, 1024 * sizeof(unsigned long)); } static unsigned long *read_pgd(unsigned long i) diff --git a/tests/test_mmu.bin b/tests/test_mmu.bin index a1861b22dbbdfc857e7d512d9a924efe9a997bbb..706f0d80dc8d3b8563edbd9a1ba0cc8967b5603e 100755 GIT binary patch delta 1369 zcmZ9MZA=_h6o&6v7|J5uE>bEVEiAOSU{Mz6wlmvZc9&&gKMB}2w!t4uDHa+~5Qx(3 z+Oe5NO=<$3n1G2Etwo!(u?9%|V?tAnVlXLbh%vER6SaI*+gK86sMm90VvILAbKmE@ z=iIq-XEKAMFgyx_eQ@~G{>N-ki*MS#lH%Dc5X2)D%{%UFV*$okY>m5>HEt}M*yK*k z@Y5a}E0ou8wxS#Y2$co!Di&5|=5*=R4i=MJbS)}Xj8=Yx*S%@avKTX~vROSkZ)UJM z%L`YdoQKV7(ZblcEdk7;j6GsYh&wWiDyLH2le(X()-0G=sglM*EhSM;f~8v&#yrZg zkS8JT(YFmx*mE&b<%G-QZ&Ymon8uMk&CrD_dwfueihUg3#XI&eG@-ILY`8g%4tpNH zzxO2wV$QxlA%UCaYcPbfaw{A{yK)1LpwUq-${pX)HDiA(WMTjQ8-|=OFwOB4WZ;1F zJWS#a*FHny8ag~Rc-@!$mOC3KTwB42)2;);t?mUc4po0bOMBf146dt~U!99r+y~(z zN}ej{N5yjgf0tifTID}4jU}bV<4Zg&PE(vD=SQ7~sZLdkhT8Cg-`Wr(QmA6g&>J71 zSyZ%Bu6RB7V!TbysiKR{U-;dmrFH*g+^_pDQ|{g(=GIlx=|$`EPBk9WIt#@*$m##4{%} z*{^dNizUgd6Bq09Po;Nha+aRoOm*qKnk?yTBAy_B<>AIYw-Bq(Yx1u;n~Aeu(BwIt zGl-X||4W@8C0-?eQs+$K;cj|PVhgdvc-D2#bSfk0 zI32d8I7wal9p4E*k${+YmxX8GjT80S+u_=-#eMTx9dm;tJKaiTb$N4b%ENHUO zo}MwW8rl(K7K%0UWt@x@NR}kV?&J5=?IG2N4V!Z delta 1303 zcmZ9JVN6q36vywWJOsx|8F7MQS|qg?AT1zxi>+;eDr;$j5wjYyq!pOpL`5)xt(~2D z`>+I(u>*;IAc~6f!?KNlA51nAH=3EnB@##GhndA>3oPrBvCX7py1R4B{j{5R-u?gn z=iKw&JMYp}_>it)aLk?G=72IBcYLFDmaw*RJXYJaGxaKKV~j`6Qj 
z7rhQv&cDZ{T02+}>dbHq%j=3u{i?c&MY&UHSsB4l-N#clE+s~ z3^GfiJQETkn3vr3>lU^rc5~j5UqLkk?nSi?M|aBE`_MlseHG-%)Ar;Q)MAo}+G1+&<|zlJ-w&buLjZ#W;pdsy3K zm!(ZV(A~Me8{9a({}K3b&}9cF&bY3_6ZE_H(b#&TPuS>S~3Hff#9z)Sxyw-e8_H1c^%lJXd6NS`K;m`;*nmRPbt=9{!N{~uQ-=DenRKx6>lc)>C<_y;ymI@ z{rUxdR2lhX+&HOozv3;#kKfXHgW{KDo?g(ZxPbWQQ##+O_+`oGbgn5j5|0h&{Mn0* z4=N;%oYwg-icQ3SoVM$HSs6uSEDq5G6u(0J@C;2raWU~(oH-PP?eo7MDl|YfzIe?B zx6m3e8wytNU|=^{0|6Th<5a-ryuTvE82wSXEEH)9@2U_H+L=G!$F6|i=qU2N+h%0G zpK^T;vQ5C)Fs=veS}UnatO-_Vr$}|+QBsrHY$8};{eo0ZT8LHID`2!xmy<@;oa-a3 zrudQa4=@$1(h3ZWJ;Y~0o5o3fj^*uDS|6zeJlt;6CP__VqJ4+9K&lL<+s)czQj2(x zybLK9K5aK!%Qi8V`BR9i)LT98bt7xZKChB27v)FL8nTu4k?Qzchz?mvOJ?KbjbU%d zrrjcy#EH-j?LH|l&V|fcnp6%hQ??|Bu~vkzO=~6Pz?yJ9oW)qU*gd%}g#0eKOcY*< sPyXk1lD9~ni*~tWT^~gn=fcHc!c@2l_Rg<`O@J1tF0{~yvTI}i4+K!t1ONa4 diff --git a/tests/test_privileged.bin b/tests/test_privileged.bin index 6eb6b536b11ab8f75b5663faf5dcc4fea6c94e0e..340b7c0f0cf67ab4f669c57e4086487cc4020118 100755 GIT binary patch delta 710 zcmZ9JO=uHQ5Xaxdq^y>A-t4`}@z#{+BKLQ1PK` z!PslNQ-x5}NF54aq;%keQu6z$O>0nKA0q8r7Op1g4~`olenV zsw>efQ+$`^>jB^VS=}dj5@yvtbaaUvruCz+hS&8QkiZRn8~_g``a<<49!pGujrSAr z(!;9-SFGB-BD2k>#A}2V&KEOf>N)6Z%LefmRw1p6>T2kYCzE!*%~y#3 zX$pR~#ZztmoOpTXGig1;xS_%lrj18%7~dPmBZ{wWjUXg-P|!#Y(mmI@i^)-{-zG0Z z7-QyH%$w7(2Lb1b*g_%G8KP6hbu*f+>`~@PdAf&PBle0|KV=bOgGc8L_F7SRhlXW_ zz6E%}Is!wuWSR7rSFNZKY;i8y*qQw~_5>!no!;EN>27wqBlp!IJOPZ&;%93ZzF?QF zgNrHKgb#Sh9*$ky=G+yt6#or-xN1kUk4X{sMF>S0en~7N0EJ+}@TXlHEP57B0{m?c cM*9D9Zq=51ZpyWlp1DZeAs?-1XJse<149PYk^lez delta 699 zcmZ9IUuaTM9LImh=Fnx+B?Fs6UfuOx*T$3uUE4IRWI87p7>r_IvuF_gdvm3E3xXhm z$?v5eMhimft>H_L4ffJoy=@Osqk>s4J@lYWIsJ|^h<&-|o^wCn-}#;IJx_JEOpnT3 zYyN|h`Mwbn0Fe8JLD`9OQidJE*U}`*;dkk%wAQodW`3260my5S-h6hY%T7X0tDx+u zMa2FF-`*{xPL;r|YbJeR#|EtNkxPv<=P8r!srP~ZQneN+fyEsHh=wlDL`wmXw9z(3 zq#?6tA!;S=vQmcXX>43jZD>@-+fcwu;^oNn_}VfSnK3Uys_mWyt95aoPf zYVm4b6W!?49ZK@Edcai7wRnnwI1O^5<`&i1bWbHoya#km^m#~Y;T6aW(z_nj;C(COeahD&gWtdO!|%)9s21y zVA`k9_Y2=7y+8RbG8IG0G!~W2;L8T*Zi*;mqcKY34Q?y;@Y{V(mYj`!Y?0UpVlJA6 
zM28}cO$@4b_66grVt^*zP)FD-K2sI8itDOfcKqRdPSnTEF?i16{egkly+l96KKI4~ zgwHa7S!}8kY!k=)UUnSM`4zT-Oa6($Y>RVE>?QpRT)=g|J-kegh%Y1@5%?9cDFH}= lbprp_K_w;v|Hgq4x~w+mYOP|uX{)VTZyT*{%F)r@{1*#}+x-9l From 941499133e74eb786a27ea84e430f87af2c6f511 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2020 15:43:33 +1000 Subject: [PATCH 25/26] soc: Work around compile error with ghdl 0.37-dev The ghdl packaged in Fedora 31 doesn't like a port map of the form "rst => rst or core_reset", so this works around the problem by doing the OR in a separate statement. Signed-off-by: Paul Mackerras --- soc.vhdl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/soc.vhdl b/soc.vhdl index a8ae3c9..899a71b 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -74,6 +74,7 @@ architecture behaviour of soc is -- Syscon signals signal dram_at_0 : std_ulogic; signal core_reset : std_ulogic; + signal do_core_reset : std_ulogic; signal wb_syscon_in : wishbone_master_out; signal wb_syscon_out : wishbone_slave_out; @@ -112,6 +113,7 @@ architecture behaviour of soc is begin -- Processor core + core_reset <= rst or do_core_reset; processor: entity work.core generic map( SIM => SIM, @@ -120,7 +122,7 @@ begin ) port map( clk => system_clk, - rst => rst or core_reset, + rst => core_reset, alt_reset => alt_reset, wishbone_insn_in => wishbone_icore_in, wishbone_insn_out => wishbone_icore_out, @@ -254,7 +256,7 @@ begin wishbone_in => wb_syscon_in, wishbone_out => wb_syscon_out, dram_at_0 => dram_at_0, - core_reset => core_reset, + core_reset => do_core_reset, soc_reset => open -- XXX TODO ); From eca0fb5bf1950043c4f424fd2dfc4f64694a728b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 15 May 2020 22:01:02 +1000 Subject: [PATCH 26/26] dcache: Fix bug in store hit after dcbz case This fixes a bug where a store that hits in the dcache immediately following a dcbz has its write to the cache RAM suppressed (but not its write to memory). 
If a load to the same location comes along before the cache line gets replaced, the load will return incorrect data. Fixes: 4db1676ef8b3 ("dcache: Don't assert on dcbz cache hit") Signed-off-by: Paul Mackerras --- dcache.vhdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcache.vhdl b/dcache.vhdl index a9b5c4a..1d9cbda 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -980,7 +980,7 @@ begin do_write <= '1'; end if; if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' and - r1.req.dcbz = '0' then + r0.req.dcbz = '0' then assert not reloading report "Store hit while in state:" & state_t'image(r1.state) severity FAILURE;