diff --git a/common.vhdl b/common.vhdl index 6b60e49..0207fe1 100644 --- a/common.vhdl +++ b/common.vhdl @@ -39,6 +39,8 @@ package common is constant SPR_DAR : spr_num_t := 19; constant SPR_TB : spr_num_t := 268; constant SPR_TBU : spr_num_t := 269; + constant SPR_TBLW : spr_num_t := 284; + constant SPR_TBUW : spr_num_t := 285; constant SPR_DEC : spr_num_t := 22; constant SPR_SRR0 : spr_num_t := 26; constant SPR_SRR1 : spr_num_t := 27; @@ -71,6 +73,14 @@ package common is constant SPR_DAWRX1 : spr_num_t := 189; constant SPR_HASHKEYR : spr_num_t := 468; constant SPR_HASHPKEYR : spr_num_t := 469; + constant SPR_DEXCR : spr_num_t := 828; + constant SPR_DEXCRU : spr_num_t := 812; + constant SPR_HDEXCR : spr_num_t := 471; + constant SPR_HDEXCU : spr_num_t := 455; + constant SPR_NOOP0 : spr_num_t := 808; + constant SPR_NOOP1 : spr_num_t := 809; + constant SPR_NOOP2 : spr_num_t := 810; + constant SPR_NOOP3 : spr_num_t := 811; -- PMU registers constant SPR_UPMC1 : spr_num_t := 771; @@ -167,6 +177,7 @@ package common is ispmu : std_ulogic; ronly : std_ulogic; wonly : std_ulogic; + noop : std_ulogic; end record; constant spr_id_init : spr_id := (sel => "0000", others => '0'); @@ -184,6 +195,7 @@ package common is constant SPRSEL_DSCR : spr_selector := 4x"b"; constant SPRSEL_PIR : spr_selector := 4x"c"; constant SPRSEL_CIABR : spr_selector := 4x"d"; + constant SPRSEL_DEXCR : spr_selector := 4x"e"; constant SPRSEL_XER : spr_selector := 4x"f"; -- FSCR and HFSCR bit numbers @@ -266,6 +278,16 @@ package common is pri : std_ulogic_vector(31 downto 0); -- 8 bits each for 4 cpus end record; + -- Bits in each half of DEXCR and HDEXCR + subtype aspect_bits_t is std_ulogic_vector(4 downto 0); + constant aspect_bits_init : aspect_bits_t := (others => '1'); + -- Bit numbers in aspect_bits_t + constant DEXCR_SBHE : integer := 4; -- speculative branch hint enable + constant DEXCR_IBRTPD : integer := 3; -- indirect branch recurrent target prediction disable + constant DEXCR_SRAPD : integer 
:= 2; -- subroutine return address prediction disable + constant DEXCR_NPHIE : integer := 1; -- non-privileged hash instruction enable + constant DEXCR_PHIE : integer := 0; -- privileged hash instruction enable + -- This needs to die... type ctrl_t is record wait_state: std_ulogic; @@ -287,14 +309,26 @@ package common is heir: std_ulogic_vector(63 downto 0); dscr: std_ulogic_vector(24 downto 0); ciabr: std_ulogic_vector(63 downto 0); + dexcr_pnh: aspect_bits_t; + dexcr_pro: aspect_bits_t; + hdexcr_hyp: aspect_bits_t; + hdexcr_enf: aspect_bits_t; end record; constant ctrl_t_init : ctrl_t := (wait_state => '0', run => '1', xer_low => 18x"0", fscr_ic => x"0", fscr_pref => '1', fscr_scv => '1', fscr_tar => '1', fscr_dscr => '1', hfscr_ic => x"0", hfscr_pref => '1', hfscr_tar => '1', hfscr_dscr => '1', hfscr_fp => '1', dscr => (others => '0'), + dexcr_pnh => aspect_bits_init, dexcr_pro => aspect_bits_init, + hdexcr_hyp => aspect_bits_init, hdexcr_enf => aspect_bits_init, others => (others => '0')); + type timebase_ctrl is record + reset : std_ulogic; + rd_prot : std_ulogic; -- read-protect => userspace can't read TB + freeze : std_ulogic; + end record; + type Fetch1ToIcacheType is record req: std_ulogic; fetch_fail : std_ulogic; @@ -604,6 +638,7 @@ package common is e2stall : std_ulogic; msr : std_ulogic_vector(63 downto 0); hashkey : std_ulogic_vector(63 downto 0); + hash_enable : std_ulogic; end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', @@ -616,7 +651,7 @@ package common is length => (others => '0'), mode_32bit => '0', is_32bit => '0', prefixed => '0', repeat => '0', second => '0', e2stall => '0', - msr => (others => '0'), hashkey => (others => '0')); + msr => (others => '0'), hashkey => (others => '0'), hash_enable => '0'); type Loadstore1ToExecute1Type is record busy : std_ulogic; diff --git a/core.vhdl b/core.vhdl index bf0708e..c94db6f 100644 --- a/core.vhdl +++ 
b/core.vhdl @@ -31,8 +31,8 @@ entity core is -- Alternate reset (0xffff0000) for use by DRAM init fw alt_reset : in std_ulogic; - -- Global timebase - timebase : in std_ulogic_vector(63 downto 0); + -- Global timebase control + tb_ctrl : in timebase_ctrl; -- Wishbone interface wishbone_insn_in : in wishbone_slave_out; @@ -309,6 +309,7 @@ begin busy_in => decode2_busy_in, stall_out => decode2_stall_out, flush_in => flush, + tb_ctrl => tb_ctrl, complete_in => complete, stopped_out => dbg_core_is_stopped, d_in => decode1_to_decode2, @@ -376,7 +377,7 @@ begin port map ( clk => clk, rst => rst_ex1, - timebase => timebase, + tb_ctrl => tb_ctrl, flush_in => flush, busy_out => ex1_busy_out, e_in => decode2_to_execute1, diff --git a/decode1.vhdl b/decode1.vhdl index 4be6413..2fb1ad4 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -457,11 +457,20 @@ architecture behaviour of decode1 is i.ispmu := '0'; i.ronly := '0'; i.wonly := '0'; + i.noop := '0'; case sprn is when SPR_TB => i.sel := SPRSEL_TB; + i.ronly := '1'; when SPR_TBU => i.sel := SPRSEL_TBU; + i.ronly := '1'; + when SPR_TBLW => + i.sel := SPRSEL_TB; + i.wonly := '1'; + when SPR_TBUW => + i.sel := SPRSEL_TBU; + i.wonly := '1'; when SPR_DEC => i.sel := SPRSEL_DEC; when SPR_PVR => @@ -499,6 +508,13 @@ architecture behaviour of decode1 is i.sel := SPRSEL_PIR; when SPR_CIABR => i.sel := SPRSEL_CIABR; + when SPR_DEXCR | SPR_HDEXCR => + i.sel := SPRSEL_DEXCR; + when SPR_DEXCRU | SPR_HDEXCU => + i.sel := SPRSEL_DEXCR; + i.ronly := '1'; + when SPR_NOOP0 | SPR_NOOP1 | SPR_NOOP2 | SPR_NOOP3 => + i.noop := '1'; when others => i.valid := '0'; end case; diff --git a/decode2.vhdl b/decode2.vhdl index 1bc8f2b..e99432b 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -27,6 +27,8 @@ entity decode2 is flush_in: in std_ulogic; + tb_ctrl : timebase_ctrl; + d_in : in Decode1ToDecode2Type; e_out : out Decode2ToExecute1Type; @@ -696,9 +698,11 @@ begin if op = OP_MFSPR then if d_in.ram_spr.valid = '1' then v.e.result_sel := "101"; -- 
ramspr_result - elsif d_in.spr_info.valid = '0' or d_in.spr_info.wonly = '1' then + elsif d_in.spr_info.valid = '0' or d_in.spr_info.wonly = '1' or + d_in.spr_info.noop = '1' then -- Privileged mfspr to invalid/unimplemented SPR numbers -- writes the contents of RT back to RT (i.e. it's a no-op) + -- as does any mfspr from the reserved/noop SPR numbers v.e.result_sel := "001"; -- logical_result end if; end if; @@ -706,6 +710,11 @@ begin if (op = OP_MFSPR or op = OP_MTSPR) and d_in.insn(20) = '1' then v.e.privileged := '1'; end if; + -- Reading TB is privileged if syscon_tb_ctrl.rd_protect is 1 + if tb_ctrl.rd_prot = '1' and op = OP_MFSPR and d_in.spr_info.valid = '1' and + (d_in.spr_info.sel = SPRSEL_TB or d_in.spr_info.sel = SPRSEL_TBU) then + v.e.privileged := '1'; + end if; v.e.prefixed := d_in.prefixed; v.e.prefix := d_in.prefix; v.e.illegal_suffix := d_in.illegal_suffix; diff --git a/execute1.vhdl b/execute1.vhdl index 2f6c6ff..ee38863 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -34,7 +34,7 @@ entity execute1 is ext_irq_in : std_ulogic; interrupt_in : WritebackToExecute1Type; - timebase : std_ulogic_vector(63 downto 0); + tb_ctrl : timebase_ctrl; -- asynchronous l_out : out Execute1ToLoadstore1Type; @@ -101,6 +101,8 @@ architecture behaviour of execute1 is write_ciabr : std_ulogic; enter_wait : std_ulogic; scv_trap : std_ulogic; + write_tbl : std_ulogic; + write_tbu : std_ulogic; end record; constant side_effect_init : side_effect_type := (others => '0'); @@ -279,6 +281,10 @@ architecture behaviour of execute1 is signal stage2_stall : std_ulogic; + signal timebase : std_ulogic_vector(63 downto 0); + signal tb_next : std_ulogic_vector(63 downto 0); + signal tb_carry : std_ulogic; + type privilege_level is (USER, SUPER); type op_privilege_array is array(insn_type_t) of privilege_level; constant op_privilege: op_privilege_array := ( @@ -425,6 +431,32 @@ architecture behaviour of execute1 is return ret; end; + -- return contents of DEXCR or HDEXCR + -- top 
32 bits are zeroed for access via non-privileged number + function assemble_dexcr(c: ctrl_t; insn: std_ulogic_vector(31 downto 0)) return std_ulogic_vector is + variable ret : std_ulogic_vector(63 downto 0); + variable spr : std_ulogic_vector(9 downto 0); + variable dexh, dexl : aspect_bits_t; + begin + ret := (others => '0'); + spr := insn(15 downto 11) & insn(20 downto 16); + if spr(9) = '1' then + dexh := c.dexcr_pnh; + dexl := c.dexcr_pro; + else + dexh := c.hdexcr_hyp; + dexl := c.hdexcr_enf; + end if; + if spr(4) = '0' then + dexh := (others => '0'); + end if; + ret := dexh(DEXCR_SBHE) & "00" & dexh(DEXCR_IBRTPD) & dexh(DEXCR_SRAPD) & + dexh(DEXCR_NPHIE) & dexh(DEXCR_PHIE) & 25x"0" & + dexl(DEXCR_SBHE) & "00" & dexl(DEXCR_IBRTPD) & dexl(DEXCR_SRAPD) & + dexl(DEXCR_NPHIE) & dexl(DEXCR_PHIE) & 25x"0"; + return ret; + end; + -- Tell vivado to keep the hierarchy for the random module so that the -- net names in the xdc file match. attribute keep_hierarchy : string; @@ -527,6 +559,43 @@ begin p_out => pmu_to_x ); + -- Timebase just increments at the system clock frequency. + -- Ideally it would (appear to) run at 512MHz like IBM POWER systems, + -- but Linux seems to cope OK with it being 100MHz or whatever. 
+ tbase: process(clk) + begin + if rising_edge(clk) then + if tb_ctrl.reset = '1' then + timebase <= (others => '0'); + tb_carry <= '0'; + else + timebase <= tb_next; + tb_carry <= and(tb_next(31 downto 0)); + end if; + end if; + end process; + + tbase_comb: process(all) + variable thi, tlo : std_ulogic_vector(31 downto 0); + variable carry : std_ulogic; + begin + tlo := timebase(31 downto 0); + thi := timebase(63 downto 32); + carry := '0'; + if stage2_stall = '0' and ex1.se.write_tbl = '1' then + tlo := ex1.e.write_data(31 downto 0); + elsif tb_ctrl.freeze = '0' then + tlo := std_ulogic_vector(unsigned(tlo) + 1); + carry := tb_carry; + end if; + if stage2_stall = '0' and ex1.se.write_tbu = '1' then + thi := ex1.e.write_data(31 downto 0); + else + thi := std_ulogic_vector(unsigned(thi) + carry); + end if; + tb_next <= thi & tlo; + end process; + dbg_ctrl_out <= ctrl; log_rd_addr <= ex2.log_addr_spr; @@ -1306,7 +1375,7 @@ begin when OP_DARN => when OP_MFMSR => when OP_MFSPR => - if e_in.spr_is_ram = '1' then + if e_in.spr_is_ram = '1' or e_in.spr_select.noop = '1' then if e_in.valid = '1' and not is_X(e_in.insn) then report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(alu_result); @@ -1398,6 +1467,10 @@ begin v.se.write_dscr := '1'; when SPRSEL_CIABR => v.se.write_ciabr := '1'; + when SPRSEL_TB => + v.se.write_tbl := '1'; + when SPRSEL_TBU => + v.se.write_tbu := '1'; when others => end case; end if; @@ -1600,6 +1673,7 @@ begin variable go : std_ulogic; variable bypass_valid : std_ulogic; variable is_scv : std_ulogic; + variable dex : aspect_bits_t; begin v := ex1; if busy_out = '0' then @@ -1735,6 +1809,13 @@ begin bperm_start <= go and actions.start_bperm; pmu_trace <= go and actions.do_trace; + -- evaluate DEXCR/HDEXCR bits that apply at present + if ex1.msr(MSR_PR) = '0' then + dex := ctrl.hdexcr_hyp; + else + dex := ctrl.dexcr_pro or ctrl.hdexcr_enf; + end if; + if not HAS_FPU and ex1.div_in_progress = '1' then v.div_in_progress 
:= not divider_to_x.valid; v.busy := not divider_to_x.valid; @@ -1850,6 +1931,11 @@ begin lv.second := e_in.second; lv.e2stall := fp_in.f2stall; lv.hashkey := ramspr_odd; + if e_in.insn(7) = '0' then + lv.hash_enable := dex(DEXCR_PHIE); + else + lv.hash_enable := dex(DEXCR_NPHIE); + end if; -- Outputs to FPU fv.op := e_in.insn_type; @@ -1897,6 +1983,7 @@ begin 39x"0" & ctrl.dscr when SPRSEL_DSCR, 56x"0" & std_ulogic_vector(to_unsigned(CPU_INDEX, 8)) when SPRSEL_PIR, ctrl.ciabr when SPRSEL_CIABR, + assemble_dexcr(ctrl, ex1.insn) when SPRSEL_DEXCR, assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; stage2_stall <= l_in.l2stall or fp_in.f2stall; diff --git a/include/microwatt_soc.h b/include/microwatt_soc.h index 67ea13d..6e367b1 100644 --- a/include/microwatt_soc.h +++ b/include/microwatt_soc.h @@ -67,6 +67,9 @@ #define SYS_REG_GIT_IS_DIRTY (1ull << 63) #define SYS_REG_CPU_CTRL 0x58 #define SYS_REG_CPU_CTRL_ENABLE 0xff +#define SYS_REG_TB_CTRL 0x60 +#define SYS_REG_TB_CTRL_FREEZE 0x01 +#define SYS_REG_TB_CTRL_RD_PROTECT 0x02 /* * Register definitions for the potato UART diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 1785685..e3bd558 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -565,6 +565,7 @@ begin variable sprn : std_ulogic_vector(9 downto 0); variable misaligned : std_ulogic; variable addr_mask : std_ulogic_vector(2 downto 0); + variable hash_nop : std_ulogic; begin v := request_init; sprn := l_in.insn(15 downto 11) & l_in.insn(20 downto 16); @@ -641,7 +642,7 @@ begin if l_in.repeat = '1' and l_in.update = '0' and addr(3) /= l_in.second then misaligned := '1'; end if; - v.align_intr := (l_in.reserve or l_in.hash) and misaligned; + v.align_intr := (l_in.reserve or (l_in.hash and l_in.hash_enable)) and misaligned; v.atomic_first := not misaligned and not l_in.second; v.atomic_last := not misaligned and (l_in.second or not l_in.repeat); @@ -661,6 +662,7 @@ begin end if; end if; + hash_nop := '0'; case l_in.op is when OP_SYNC => v.sync := '1'; @@ 
-671,6 +673,7 @@ begin v.touch := '1'; end if; v.hashst := l_in.hash; + hash_nop := l_in.hash and not l_in.hash_enable; when OP_LOAD => if l_in.update = '0' or l_in.second = '0' then v.load := '1'; @@ -686,6 +689,7 @@ begin v.do_update := '1'; end if; v.hashcmp := l_in.hash; + hash_nop := l_in.hash and not l_in.hash_enable; when OP_DCBF => v.load := '1'; v.flush := '1'; @@ -709,7 +713,8 @@ begin v.mmu_op := '1'; when others => end case; - v.dc_req := l_in.valid and (v.load or v.store or v.sync or v.dcbz) and not v.align_intr; + v.dc_req := l_in.valid and (v.load or v.store or v.sync or v.dcbz) and not v.align_intr and + not hash_nop; v.incomplete := v.dc_req and v.two_dwords; -- Work out controls for load and store formatting diff --git a/soc.vhdl b/soc.vhdl index b3d03b7..bf58826 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -183,6 +183,7 @@ architecture behaviour of soc is signal alt_reset : std_ulogic; signal wb_syscon_in : wb_io_master_out; signal wb_syscon_out : wb_io_slave_out; + signal tb_ctrl : timebase_ctrl; -- UART0 signals: signal wb_uart0_in : wb_io_master_out; @@ -271,8 +272,6 @@ architecture behaviour of soc is signal core_run_out : std_ulogic_vector(NCPUS-1 downto 0); - signal timebase : std_ulogic_vector(63 downto 0); - function wishbone_widen_data(wb : wb_io_master_out) return wishbone_master_out is variable wwb : wishbone_master_out; begin @@ -333,6 +332,7 @@ begin -- either external reset, or from syscon soc_reset <= rst or sw_soc_reset; + tb_ctrl.reset <= soc_reset; resets: process(system_clk) begin @@ -352,21 +352,6 @@ begin end if; end process; - -- Timebase just increments at the system clock frequency. - -- There is currently no way to set it. - -- Ideally it would (appear to) run at 512MHz like IBM POWER systems, - -- but Linux seems to cope OK with it being 100MHz or whatever. 
- tbase: process(system_clk) - begin - if rising_edge(system_clk) then - if soc_reset = '1' then - timebase <= (others => '0'); - else - timebase <= std_ulogic_vector(unsigned(timebase) + 1); - end if; - end if; - end process; - -- Processor cores processors: for i in 0 to NCPUS-1 generate core: entity work.core @@ -391,7 +376,7 @@ begin rst => rst_core(i), alt_reset => alt_reset_d, run_out => core_run_out(i), - timebase => timebase, + tb_ctrl => tb_ctrl, wishbone_insn_in => wb_masters_in(i + NCPUS), wishbone_insn_out => wb_masters_out(i + NCPUS), wishbone_data_in => wb_masters_in(i), @@ -823,7 +808,9 @@ begin dram_at_0 => dram_at_0, core_reset => do_core_reset, soc_reset => sw_soc_reset, - alt_reset => alt_reset + alt_reset => alt_reset, + tb_rdp => tb_ctrl.rd_prot, + tb_frz => tb_ctrl.freeze ); -- diff --git a/syscon.vhdl b/syscon.vhdl index 98990d1..ad9ba2c 100644 --- a/syscon.vhdl +++ b/syscon.vhdl @@ -36,7 +36,9 @@ entity syscon is dram_at_0 : out std_ulogic; core_reset : out std_ulogic_vector(NCPUS-1 downto 0); soc_reset : out std_ulogic; - alt_reset : out std_ulogic + alt_reset : out std_ulogic; + tb_rdp : out std_ulogic; + tb_frz : out std_ulogic ); end entity syscon; @@ -58,6 +60,7 @@ architecture behaviour of syscon is constant SYS_REG_UART1_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001001"; constant SYS_REG_GIT_INFO : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001010"; constant SYS_REG_CPU_CTRL : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001011"; + constant SYS_REG_TB_CTRL : std_ulogic_vector(SYS_REG_BITS-1 downto 0) := "001100"; -- Muxed reg read signal signal reg_out : std_ulogic_vector(63 downto 0); @@ -119,6 +122,7 @@ architecture behaviour of syscon is signal reg_uart1info : std_ulogic_vector(63 downto 0); signal reg_gitinfo : std_ulogic_vector(63 downto 0); signal reg_cpuctrl : std_ulogic_vector(63 downto 0); + signal reg_tbctrl : std_ulogic_vector(63 downto 0); signal info_has_dram : std_ulogic; signal info_has_bram : 
std_ulogic; signal info_has_uart : std_ulogic; @@ -130,6 +134,8 @@ architecture behaviour of syscon is signal info_fl_off : std_ulogic_vector(31 downto 0); signal uinfo_16550 : std_ulogic; signal uinfo_freq : std_ulogic_vector(31 downto 0); + signal tb_rdprot : std_ulogic; + signal tb_freeze : std_ulogic; -- Wishbone response latch signal wb_rsp : wb_io_slave_out; @@ -193,6 +199,8 @@ begin reg_cpuctrl(63 downto 8) <= std_ulogic_vector(to_unsigned(NCPUS, 56)); + reg_tbctrl <= 62x"0" & tb_rdprot & tb_freeze; + -- Wishbone response wb_rsp.ack <= wishbone_in.cyc and wishbone_in.stb; with wishbone_in.adr(SYS_REG_BITS downto 1) select reg_out <= @@ -208,6 +216,7 @@ begin reg_uart1info when SYS_REG_UART1_INFO, reg_gitinfo when SYS_REG_GIT_INFO, reg_cpuctrl when SYS_REG_CPU_CTRL, + reg_tbctrl when SYS_REG_TB_CTRL, (others => '0') when others; wb_rsp.dat <= reg_out(63 downto 32) when wishbone_in.adr(0) = '1' else reg_out(31 downto 0); @@ -222,17 +231,23 @@ begin end if; end process; + -- Timebase control + tb_rdp <= tb_rdprot; + tb_frz <= tb_freeze; + -- Initial state ctrl_init_alt_reset <= '1' when HAS_DRAM else '0'; -- Register writes - regs_write: process(clk) + regs_write : process(clk) begin if rising_edge(clk) then if (rst) then reg_ctrl <= (SYS_REG_CTRL_ALT_RESET => ctrl_init_alt_reset, others => '0'); reg_cpuctrl(7 downto 0) <= x"01"; -- enable cpu 0 only + tb_rdprot <= '0'; + tb_freeze <= '0'; else if wishbone_in.cyc and wishbone_in.stb and wishbone_in.we then -- Change this if CTRL ever has more than 32 bits @@ -245,6 +260,11 @@ begin wishbone_in.adr(0) = '0' and wishbone_in.sel(0) = '1' then reg_cpuctrl(7 downto 0) <= wishbone_in.dat(7 downto 0); end if; + if wishbone_in.adr(SYS_REG_BITS downto 1) = SYS_REG_TB_CTRL and + wishbone_in.adr(0) = '0' and wishbone_in.sel(0) = '1' then + tb_rdprot <= wishbone_in.dat(1); + tb_freeze <= wishbone_in.dat(0); + end if; end if; -- Reset auto-clear