diff --git a/Makefile b/Makefile index fb591a4..01eab73 100644 --- a/Makefile +++ b/Makefile @@ -74,7 +74,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \ execute1.vhdl loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl \ - core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl + core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl bitsort.vhdl soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \ wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \ diff --git a/bitsort.vhdl b/bitsort.vhdl new file mode 100644 index 0000000..f2aeddb --- /dev/null +++ b/bitsort.vhdl @@ -0,0 +1,102 @@ +-- Implements instructions that involve sorting bits, +-- that is, cfuged, pextd and pdepd. +-- +-- cfuged: Sort the bits in the mask in RB into 0s at the left, 1s at the right +-- and move the bits in RS in the same fashion to give the result +-- pextd: Like cfuged but only use the bits of RS where the +-- corresponding bit in RB is 1 +-- pdepd: Inverse of pextd; take the low-order bits of RS and spread them out +-- to the bit positions which have a 1 in RB + +-- NB opc is bits 7-6 of the instruction: +-- 00 = pdepd, 01 = pextd, 10 = cfuged + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.helpers.all; + +entity bit_sorter is + port ( + clk : in std_ulogic; + rst : in std_ulogic; + rs : in std_ulogic_vector(63 downto 0); + rb : in std_ulogic_vector(63 downto 0); + go : in std_ulogic; + opc : in std_ulogic_vector(1 downto 0); + done : out std_ulogic; + result : out std_ulogic_vector(63 downto 0) + ); +end entity bit_sorter; + +architecture behaviour of bit_sorter is + + signal val : std_ulogic_vector(63 downto 0); + signal st : std_ulogic; + signal sd : std_ulogic; + signal opr : std_ulogic_vector(1 downto 0); + signal bc : unsigned(5 downto 
0); + signal jl : unsigned(5 downto 0); + signal jr : unsigned(5 downto 0); + signal sr_ml : std_ulogic_vector(63 downto 0); + signal sr_mr : std_ulogic_vector(63 downto 0); + signal sr_vl : std_ulogic_vector(63 downto 0); + signal sr_vr : std_ulogic_vector(63 downto 0); + +begin + bsort_r: process(clk) + begin + if rising_edge(clk) then + sd <= '0'; + if rst = '1' then + st <= '0'; + opr <= "00"; + val <= (others => '0'); + elsif go = '1' then + st <= '1'; + sr_ml <= rb; + sr_mr <= rb; + sr_vl <= rs; + sr_vr <= rs; + opr <= opc; + val <= (others => '0'); + bc <= to_unsigned(0, 6); + jl <= to_unsigned(63, 6); + jr <= to_unsigned(0, 6); + elsif st = '1' then + if bc = 6x"3f" then + st <= '0'; + sd <= '1'; + end if; + bc <= bc + 1; + if sr_ml(63) = '0' and opr(1) = '1' then + -- cfuged + val(to_integer(jl)) <= sr_vl(63); + jl <= jl - 1; + end if; + if sr_mr(0) = '1' then + if opr = "00" then + -- pdepd + val(to_integer(bc)) <= sr_vr(0); + else + -- cfuged or pextd + val(to_integer(jr)) <= sr_vr(0); + end if; + jr <= jr + 1; + end if; + sr_vl <= sr_vl(62 downto 0) & '0'; + if opr /= "00" or sr_mr(0) = '1' then + sr_vr <= '0' & sr_vr(63 downto 1); + end if; + sr_ml <= sr_ml(62 downto 0) & '0'; + sr_mr <= '0' & sr_mr(63 downto 1); + end if; + end if; + end process; + + done <= sd; + result <= val; + +end behaviour; diff --git a/common.vhdl b/common.vhdl index eefa2fd..76eaec2 100644 --- a/common.vhdl +++ b/common.vhdl @@ -12,6 +12,7 @@ package common is -- MSR bit numbers constant MSR_SF : integer := (63 - 0); -- Sixty-Four bit mode + constant MSR_HV : integer := (63 - 3); -- Hypervisor mode (always 1) constant MSR_EE : integer := (63 - 48); -- External interrupt Enable constant MSR_PR : integer := (63 - 49); -- PRoblem state constant MSR_FP : integer := (63 - 50); -- Floating Point available @@ -54,6 +55,15 @@ package common is constant SPR_PID : spr_num_t := 48; constant SPR_PTCR : spr_num_t := 464; constant SPR_PVR : spr_num_t := 287; + constant SPR_FSCR : spr_num_t 
:= 153; + constant SPR_HFSCR : spr_num_t := 190; + constant SPR_HEIR : spr_num_t := 339; + constant SPR_CTRL : spr_num_t := 136; + constant SPR_CTRLW : spr_num_t := 152; + constant SPR_UDSCR : spr_num_t := 3; + constant SPR_DSCR : spr_num_t := 17; + constant SPR_VRSAVE : spr_num_t := 256; + constant SPR_PIR : spr_num_t := 1023; -- PMU registers constant SPR_UPMC1 : spr_num_t := 771; @@ -131,30 +141,52 @@ package common is constant RAMSPR_SPRG3 : ramspr_index := to_unsigned(3,3); constant RAMSPR_HSPRG1 : ramspr_index := to_unsigned(4,3); constant RAMSPR_CTR : ramspr_index := to_unsigned(5,3); -- must equal RAMSPR_LR + constant RAMSPR_VRSAVE : ramspr_index := to_unsigned(6,3); type ram_spr_info is record index : ramspr_index; isodd : std_ulogic; + is32b : std_ulogic; valid : std_ulogic; end record; constant ram_spr_info_init: ram_spr_info := (index => to_unsigned(0,3), others => '0'); - subtype spr_selector is std_ulogic_vector(2 downto 0); + subtype spr_selector is std_ulogic_vector(3 downto 0); type spr_id is record sel : spr_selector; valid : std_ulogic; ispmu : std_ulogic; - end record; - constant spr_id_init : spr_id := (sel => "000", others => '0'); - - constant SPRSEL_TB : spr_selector := 3x"0"; - constant SPRSEL_TBU : spr_selector := 3x"1"; - constant SPRSEL_DEC : spr_selector := 3x"2"; - constant SPRSEL_PVR : spr_selector := 3x"3"; - constant SPRSEL_LOGA : spr_selector := 3x"4"; - constant SPRSEL_LOGD : spr_selector := 3x"5"; - constant SPRSEL_CFAR : spr_selector := 3x"6"; - constant SPRSEL_XER : spr_selector := 3x"7"; + ronly : std_ulogic; + wonly : std_ulogic; + end record; + constant spr_id_init : spr_id := (sel => "0000", others => '0'); + + constant SPRSEL_TB : spr_selector := 4x"0"; + constant SPRSEL_TBU : spr_selector := 4x"1"; + constant SPRSEL_DEC : spr_selector := 4x"2"; + constant SPRSEL_PVR : spr_selector := 4x"3"; + constant SPRSEL_LOGA : spr_selector := 4x"4"; + constant SPRSEL_LOGD : spr_selector := 4x"5"; + constant SPRSEL_CFAR : spr_selector 
:= 4x"6"; + constant SPRSEL_FSCR : spr_selector := 4x"7"; + constant SPRSEL_HFSCR : spr_selector := 4x"8"; + constant SPRSEL_HEIR : spr_selector := 4x"9"; + constant SPRSEL_CTRL : spr_selector := 4x"a"; + constant SPRSEL_DSCR : spr_selector := 4x"b"; + constant SPRSEL_PIR : spr_selector := 4x"c"; + constant SPRSEL_XER : spr_selector := 4x"f"; + + -- FSCR and HFSCR bit numbers + constant FSCR_PREFIX : integer := 63 - 50; + constant FSCR_SCV : integer := 63 - 51; + constant FSCR_TAR : integer := 63 - 55; + constant FSCR_DSCR : integer := 63 - 61; + constant HFSCR_PREFIX : integer := 63 - 50; + constant HFSCR_MSG : integer := 63 - 53; + constant HFSCR_TAR : integer := 63 - 55; + constant HFSCR_PMUSPR : integer := 63 - 60; + constant HFSCR_DSCR : integer := 63 - 61; + constant HFSCR_FP : integer := 63 - 63; -- FPSCR bit numbers constant FPSCR_FX : integer := 63 - 32; @@ -224,14 +256,32 @@ package common is -- This needs to die... type ctrl_t is record + wait_state: std_ulogic; + run: std_ulogic; tb: std_ulogic_vector(63 downto 0); dec: std_ulogic_vector(63 downto 0); msr: std_ulogic_vector(63 downto 0); cfar: std_ulogic_vector(63 downto 0); xer_low: std_ulogic_vector(17 downto 0); + fscr_ic: std_ulogic_vector(3 downto 0); + fscr_pref: std_ulogic; + fscr_scv: std_ulogic; + fscr_tar: std_ulogic; + fscr_dscr: std_ulogic; + hfscr_ic: std_ulogic_vector(3 downto 0); + hfscr_pref: std_ulogic; + hfscr_tar: std_ulogic; + hfscr_dscr: std_ulogic; + hfscr_fp: std_ulogic; + heir: std_ulogic_vector(63 downto 0); + dscr: std_ulogic_vector(24 downto 0); end record; constant ctrl_t_init : ctrl_t := - (xer_low => 18x"0", others => (others => '0')); + (wait_state => '0', run => '1', xer_low => 18x"0", + fscr_ic => x"0", fscr_pref => '1', fscr_scv => '1', fscr_tar => '1', fscr_dscr => '1', + hfscr_ic => x"0", hfscr_pref => '1', hfscr_tar => '1', hfscr_dscr => '1', hfscr_fp => '1', + dscr => (others => '0'), + others => (others => '0')); type Fetch1ToIcacheType is record req: std_ulogic; 
@@ -270,6 +320,7 @@ package common is type Decode1ToDecode2Type is record valid: std_ulogic; stop_mark : std_ulogic; + second : std_ulogic; nia: std_ulogic_vector(63 downto 0); prefixed: std_ulogic; prefix: std_ulogic_vector(25 downto 0); @@ -286,7 +337,7 @@ package common is reg_c : gspr_index_t; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := - (valid => '0', stop_mark => '0', nia => (others => '0'), + (valid => '0', stop_mark => '0', second => '0', nia => (others => '0'), prefixed => '0', prefix => (others => '0'), insn => (others => '0'), illegal_suffix => '0', misaligned_prefix => '0', decode => decode_rom_init, br_pred => '0', big_endian => '0', @@ -371,11 +422,16 @@ package common is ramspr_wraddr : ramspr_index; ramspr_write_even : std_ulogic; ramspr_write_odd : std_ulogic; + ramspr_32bit : std_ulogic; dbg_spr_access : std_ulogic; dec_ctr : std_ulogic; prefixed : std_ulogic; + prefix : std_ulogic_vector(25 downto 0); illegal_suffix : std_ulogic; misaligned_prefix : std_ulogic; + illegal_form : std_ulogic; + uses_tar : std_ulogic; + uses_dscr : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => ALU, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, @@ -393,9 +449,11 @@ package common is spr_is_ram => '0', ramspr_even_rdaddr => (others => '0'), ramspr_odd_rdaddr => (others => '0'), ramspr_rd_odd => '0', ramspr_wraddr => (others => '0'), ramspr_write_even => '0', ramspr_write_odd => '0', + ramspr_32bit => '0', dbg_spr_access => '0', dec_ctr => '0', - prefixed => '0', illegal_suffix => '0', misaligned_prefix => '0', + prefixed => '0', prefix => (others => '0'), illegal_suffix => '0', + misaligned_prefix => '0', illegal_form => '0', uses_tar => '0', uses_dscr => '0', others => (others => '0')); type MultiplyInputType is record @@ -547,14 +605,23 @@ package common is hold : std_ulogic; load : std_ulogic; -- is this a load dcbz : std_ulogic; + flush : std_ulogic; + touch : 
std_ulogic; + sync : std_ulogic; nc : std_ulogic; reserve : std_ulogic; + atomic_qw : std_ulogic; -- part of a quadword atomic op + atomic_first : std_ulogic; + atomic_last : std_ulogic; virt_mode : std_ulogic; priv_mode : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- valid the cycle after .valid = 1 byte_sel : std_ulogic_vector(7 downto 0); end record; + constant Loadstore1ToDcacheInit : Loadstore1ToDcacheType := + (addr => (others => '0'), data => (others => '0'), byte_sel => x"00", + others => '0'); type DcacheToLoadstore1Type is record valid : std_ulogic; @@ -562,6 +629,7 @@ package common is store_done : std_ulogic; error : std_ulogic; cache_paradox : std_ulogic; + reserve_nc : std_ulogic; end record; type DcacheEventType is record @@ -662,6 +730,8 @@ package common is write_xerc_enable : std_ulogic; xerc : xer_common_t; interrupt : std_ulogic; + hv_intr : std_ulogic; + is_scv : std_ulogic; intr_vec : intr_vector_t; redirect: std_ulogic; redir_mode: std_ulogic_vector(3 downto 0); @@ -678,7 +748,8 @@ package common is write_xerc_enable => '0', xerc => xerc_init, write_data => (others => '0'), write_cr_mask => (others => '0'), write_cr_data => (others => '0'), write_reg => (others => '0'), - interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000", + interrupt => '0', hv_intr => '0', is_scv => '0', intr_vec => 0, + redirect => '0', redir_mode => "0000", last_nia => (others => '0'), br_last => '0', br_taken => '0', abs_br => '0', srr1 => (others => '0'), msr => (others => '0')); @@ -766,13 +837,13 @@ package common is br_last : std_ulogic; br_taken : std_ulogic; interrupt : std_ulogic; - intr_vec : std_ulogic_vector(11 downto 0); + intr_vec : std_ulogic_vector(16 downto 0); end record; constant WritebackToFetch1Init : WritebackToFetch1Type := (redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0', mode_32bit => '0', redirect_nia => (others => '0'), br_last => '0', br_taken => '0', 
br_nia => (others => '0'), - interrupt => '0', intr_vec => x"000"); + interrupt => '0', intr_vec => 17x"0"); type WritebackToRegisterFileType is record write_reg : gspr_index_t; @@ -795,8 +866,10 @@ package common is write_cr_data => (others => '0')); type WritebackToExecute1Type is record - intr : std_ulogic; - srr1 : std_ulogic_vector(15 downto 0); + intr : std_ulogic; + hv_intr : std_ulogic; + scv_int : std_ulogic; + srr1 : std_ulogic_vector(15 downto 0); end record; type WritebackEventType is record diff --git a/core.vhdl b/core.vhdl index 35a860e..187e176 100644 --- a/core.vhdl +++ b/core.vhdl @@ -9,6 +9,7 @@ use work.wishbone_types.all; entity core is generic ( SIM : boolean := false; + CPU_INDEX : natural := 0; DISABLE_FLATTEN : boolean := false; EX1_BYPASS : boolean := true; HAS_FPU : boolean := true; @@ -48,6 +49,7 @@ entity core is ext_irq : in std_ulogic; + run_out : out std_ulogic; terminated_out : out std_logic ); end core; @@ -363,6 +365,7 @@ begin execute1_0: entity work.execute1 generic map ( SIM => SIM, + CPU_INDEX => CPU_INDEX, EX1_BYPASS => EX1_BYPASS, HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH @@ -390,6 +393,7 @@ begin ls_events => loadstore_events, dc_events => dcache_events, ic_events => icache_events, + run_out => run_out, terminate_out => terminate, dbg_spr_req => dbg_spr_req, dbg_spr_ack => dbg_spr_ack, diff --git a/core_debug.vhdl b/core_debug.vhdl index c7215ff..67b41fb 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -294,7 +294,7 @@ begin -- For SPRs, use the same mapping as when the fast SPRs were in the GPR file valid := '1'; - sel := "000"; + sel := "0000"; isram := '1'; raddr := (others => '0'); odd := '0'; @@ -324,10 +324,26 @@ begin sel := SPRSEL_XER; when 5x"0d" => raddr := RAMSPR_TAR; + when 5x"0e" => + isram := '0'; + sel := SPRSEL_FSCR; + when 5x"0f" => + isram := '0'; + sel := SPRSEL_HFSCR; + when 5x"10" => + isram := '0'; + sel := SPRSEL_HEIR; + when 5x"11" => + isram := '0'; + sel := SPRSEL_CFAR; when others => valid 
:= '0'; end case; - dbg_spr_addr <= isram & sel & std_ulogic_vector(raddr) & odd; + if isram = '1' then + dbg_spr_addr <= "1000" & std_ulogic_vector(raddr) & odd; + else + dbg_spr_addr <= "0000" & sel; + end if; spr_index_valid <= valid; end if; end process; diff --git a/dcache.vhdl b/dcache.vhdl index c9541e5..ce7b351 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -181,22 +181,13 @@ architecture rtl of dcache is constant real_mode_perm_attr : perm_attr_t := (nocache => '0', others => '1'); - -- Type of operation on a "valid" input - type op_t is (OP_NONE, - OP_BAD, -- NC cache hit, TLB miss, prot/RC failure - OP_STCX_FAIL, -- conditional store w/o reservation - OP_LOAD_HIT, -- Cache hit on load - OP_LOAD_MISS, -- Load missing cache - OP_LOAD_NC, -- Non-cachable load - OP_STORE_HIT, -- Store hitting cache - OP_STORE_MISS); -- Store missing cache - -- Cache state machine type state_t is (IDLE, -- Normal load hit processing RELOAD_WAIT_ACK, -- Cache reload wait ack STORE_WAIT_ACK, -- Store wait ack - NC_LOAD_WAIT_ACK);-- Non-cachable load wait ack - + NC_LOAD_WAIT_ACK, -- Non-cachable load wait ack + DO_STCX, -- Check for stcx. validity + FLUSH_CYCLE); -- Cycle for invalidating cache line -- -- Dcache operations: @@ -230,8 +221,9 @@ architecture rtl of dcache is -- Clock edge between cycle 1 and cycle 2: -- Request is stored in r1 (assuming r1.full was 0) -- The state machine transitions out of IDLE state for a load miss, - -- a store, a dcbz, or a non-cacheable load. r1.full is set to 1 - -- for a load miss, dcbz or non-cacheable load but not a store. + -- a store, a dcbz, a flush (dcbf) or a non-cacheable load. + -- r1.full is set to 1 for a load miss, dcbz, flush or + -- non-cacheable load but not a store. 
-- -- Cycle 2: Completion signals are asserted for a load hit, -- a store (excluding dcbz), a TLB operation, a conditional @@ -272,6 +264,23 @@ architecture rtl of dcache is -- subsequent load requests to the same line can be completed as -- soon as the necessary data comes in from memory, without -- waiting for the whole line to be read. + -- + -- Aligned loads and stores of a doubleword or less are atomic + -- because they are done in a single wishbone operation. + -- For quadword atomic loads and stores we rely on the wishbone + -- arbiter not interrupting access to a target once it has first + -- given access; i.e. once we have the main wishbone, no other + -- master gets access until we drop cyc. + -- + -- Note on loads potentially hitting the victim line that is + -- currently being replaced: the new tag is available starting + -- with the 3rd cycle of RELOAD_WAIT_ACK state. As long as the + -- first read on the wishbone takes at least one cycle (i.e. the + -- ack doesn't arrive in the same cycle as stb was asserted), + -- r1.full will be true at least until that 3rd cycle and so a load + -- following a load miss can't hit on the old tag of the victim + -- line. As long as ack is not generated combinationally from + -- stb, this will be fine. 
-- Stage 0 register, basically contains just the latched request type reg_stage_0_t is record @@ -287,12 +296,23 @@ architecture rtl of dcache is signal r0_full : std_ulogic; type mem_access_request_t is record - op : op_t; + op_lmiss : std_ulogic; + op_store : std_ulogic; + op_flush : std_ulogic; + op_sync : std_ulogic; + nc : std_ulogic; valid : std_ulogic; dcbz : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; + sync : std_ulogic; + reserve : std_ulogic; + first_dw : std_ulogic; + last_dw : std_ulogic; real_addr : real_addr_t; data : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); + is_hit : std_ulogic; hit_way : way_t; same_tag : std_ulogic; mmu_req : std_ulogic; @@ -306,12 +326,16 @@ architecture rtl of dcache is full : std_ulogic; -- have uncompleted request mmu_req : std_ulogic; -- request is from MMU req : mem_access_request_t; + atomic_more : std_ulogic; -- atomic request isn't finished -- Cache hit state hit_way : way_t; hit_load_valid : std_ulogic; hit_index : index_t; cache_hit : std_ulogic; + prev_hit : std_ulogic; + prev_way : way_t; + prev_hit_reload : std_ulogic; -- TLB hit state tlb_hit : std_ulogic; @@ -352,6 +376,7 @@ architecture rtl of dcache is mmu_done : std_ulogic; mmu_error : std_ulogic; cache_paradox : std_ulogic; + reserve_nc : std_ulogic; -- Signal to complete a failed stcx. 
stcx_fail : std_ulogic; @@ -365,27 +390,34 @@ architecture rtl of dcache is -- type reservation_t is record valid : std_ulogic; - addr : std_ulogic_vector(63 downto LINE_OFF_BITS); + addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); end record; signal reservation : reservation_t; + signal kill_rsrv : std_ulogic; + signal kill_rsrv2 : std_ulogic; -- Async signals on incoming request - signal req_index : index_t; - signal req_hit_way : way_t; - signal req_tag : cache_tag_t; - signal req_op : op_t; - signal req_data : std_ulogic_vector(63 downto 0); - signal req_same_tag : std_ulogic; - signal req_go : std_ulogic; + signal req_index : index_t; + signal req_hit_way : way_t; + signal req_is_hit : std_ulogic; + signal req_tag : cache_tag_t; + signal req_op_load_hit : std_ulogic; + signal req_op_load_miss : std_ulogic; + signal req_op_store : std_ulogic; + signal req_op_flush : std_ulogic; + signal req_op_sync : std_ulogic; + signal req_op_bad : std_ulogic; + signal req_op_nop : std_ulogic; + signal req_data : std_ulogic_vector(63 downto 0); + signal req_same_tag : std_ulogic; + signal req_go : std_ulogic; + signal req_nc : std_ulogic; + signal req_hit_reload : std_ulogic; signal early_req_row : row_t; signal early_rd_valid : std_ulogic; - signal cancel_store : std_ulogic; - signal set_rsrv : std_ulogic; - signal clear_rsrv : std_ulogic; - signal r0_valid : std_ulogic; signal r0_stall : std_ulogic; @@ -427,10 +459,13 @@ architecture rtl of dcache is -- TLB PLRU output interface signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0); + signal snoop_active : std_ulogic; signal snoop_tag_set : cache_tags_set_t; signal snoop_valid : std_ulogic; - signal snoop_wrtag : cache_tag_t; - signal snoop_index : index_t; + signal snoop_paddr : real_addr_t; + signal snoop_addr : real_addr_t; + signal snoop_hits : cache_way_valids_t; + signal req_snoop_hit : std_ulogic; -- -- Helper functions to decode incoming requests @@ -565,12 +600,9 @@ begin assert 
(d_in.valid and m_in.valid) = '0' report "request collision loadstore vs MMU"; if m_in.valid = '1' then + r.req := Loadstore1ToDcacheInit; r.req.valid := '1'; r.req.load := not (m_in.tlbie or m_in.tlbld); - r.req.dcbz := '0'; - r.req.nc := '0'; - r.req.reserve := '0'; - r.req.virt_mode := '0'; r.req.priv_mode := '1'; r.req.addr := m_in.addr; r.req.data := m_in.pte; @@ -861,34 +893,51 @@ begin end if; end process; + -- Snoop logic + -- Don't snoop our own cycles + snoop_addr <= addr_to_real(wb_to_addr(snoop_in.adr)); + snoop_active <= snoop_in.cyc and snoop_in.stb and snoop_in.we and + not (r1.wb.cyc and not wishbone_in.stall); + kill_rsrv <= '1' when (snoop_active = '1' and reservation.valid = '1' and + snoop_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) + else '0'; + -- Cache tag RAM second read port, for snooping cache_tag_read_2 : process(clk) - variable addr : real_addr_t; begin if rising_edge(clk) then - -- Don't snoop our own cycles - snoop_valid <= '0'; - if not (r1.wb.cyc = '1' and wishbone_in.stall = '0') then - if (snoop_in.cyc and snoop_in.stb and snoop_in.we) = '1' then - snoop_valid <= '1'; - addr := addr_to_real(wb_to_addr(snoop_in.adr)); - assert not is_X(addr); - snoop_tag_set <= cache_tags(to_integer(get_index(addr))); - snoop_wrtag <= get_tag(addr); - snoop_index <= get_index(addr); - end if; + if is_X(snoop_addr) then + snoop_tag_set <= (others => 'X'); + else + snoop_tag_set <= cache_tags(to_integer(get_index(snoop_addr))); end if; + snoop_paddr <= snoop_addr; + snoop_valid <= snoop_active; end if; end process; + -- Compare the previous cycle's snooped store address to the reservation, + -- to catch the case where a write happens on cycle 1 of a cached larx + kill_rsrv2 <= '1' when (snoop_valid = '1' and reservation.valid = '1' and + snoop_paddr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) = reservation.addr) + else '0'; + + snoop_tag_match : process(all) + begin + snoop_hits <= (others => '0'); + for i in 0 to NUM_WAYS-1 loop + 
if snoop_valid = '1' and read_tag(i, snoop_tag_set) = get_tag(snoop_paddr) then + snoop_hits(i) <= '1'; + end if; + end loop; + end process; + -- Cache request parsing and hit detection dcache_request : process(all) variable req_row : row_t; variable rindex : index_t; variable is_hit : std_ulogic; variable hit_way : way_t; - variable op : op_t; - variable opsel : std_ulogic_vector(2 downto 0); variable go : std_ulogic; variable nc : std_ulogic; variable s_hit : std_ulogic; @@ -901,6 +950,9 @@ begin variable rel_match : std_ulogic; variable fwd_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); variable fwd_match : std_ulogic; + variable snp_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0); + variable snoop_match : std_ulogic; + variable hit_reload : std_ulogic; begin -- Extract line, row and tag from request rindex := get_index(r0.req.addr); @@ -924,9 +976,11 @@ begin is_hit := '0'; rel_match := '0'; fwd_match := '0'; + snoop_match := '0'; if r0.req.virt_mode = '1' then rel_matches := (others => '0'); fwd_matches := (others => '0'); + snp_matches := (others => '0'); for j in tlb_way_t loop hit_way_set(j) := to_unsigned(0, WAY_BITS); s_hit := '0'; @@ -943,6 +997,9 @@ begin tlb_valid_way(j) = '1' then hit_way_set(j) := to_unsigned(i, WAY_BITS); s_hit := '1'; + if snoop_hits(i) = '1' then + snp_matches(j) := '1'; + end if; end if; end loop; hit_set(j) := s_hit; @@ -959,6 +1016,7 @@ begin hit_way := hit_way_set(to_integer(tlb_hit_way)); rel_match := rel_matches(to_integer(tlb_hit_way)); fwd_match := fwd_matches(to_integer(tlb_hit_way)); + snoop_match := snp_matches(to_integer(tlb_hit_way)); end if; else s_tag := get_tag(r0.req.addr); @@ -970,6 +1028,9 @@ begin read_tag(i, cache_tag_set) = s_tag then hit_way := to_unsigned(i, WAY_BITS); is_hit := '1'; + if snoop_hits(i) = '1' then + snoop_match := '1'; + end if; end if; end loop; if go = '1' and not is_X(r1.reload_tag) and s_tag = r1.reload_tag then @@ -982,6 +1043,13 @@ begin req_same_tag <= rel_match; 
fwd_same_tag <= fwd_match; + -- This is 1 if the snooped write from the previous cycle hits the same + -- cache line that is being accessed in this cycle. + req_snoop_hit <= '0'; + if go = '1' and snoop_match = '1' and get_index(snoop_paddr) = rindex then + req_snoop_hit <= '1'; + end if; + -- Whether to use forwarded data for a load or not use_forward_st <= '0'; use_forward_rl <= '0'; @@ -1029,6 +1097,7 @@ begin assert not is_X(rindex); assert not is_X(r1.store_index); end if; + hit_reload := '0'; if r1.state = RELOAD_WAIT_ACK and rel_match = '1' and rindex = r1.store_index then -- Ignore is_hit from above, because a load miss writes the new tag @@ -1037,13 +1106,29 @@ begin -- since it will be by the time we perform the store. -- For a load, check the appropriate row valid bit; but also, -- if use_forward_rl is 1 then we can consider this a hit. - is_hit := not r0.req.load or r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or + -- For a touch, since the line we want is being reloaded already, + -- consider this a hit. + is_hit := not r0.req.load or r0.req.touch or + r1.rows_valid(to_integer(req_row(ROW_LINEBITS-1 downto 0))) or use_forward_rl; hit_way := replace_way; + hit_reload := is_hit; + elsif r0.req.load = '1' and r0.req.atomic_qw = '1' and r0.req.atomic_first = '0' and + r0.req.nc = '0' and perm_attr.nocache = '0' and r1.prev_hit = '1' then + -- For the second half of an atomic quadword load, just use the + -- same way as the first half, without considering whether the line + -- is valid; it is as if we had read the second dword at the same + -- time as the first dword, and the line was valid back then. + -- (Cases where the line is currently being reloaded are handled above.) + -- NB lq to noncacheable isn't required to be atomic per the ISA. 
+ is_hit := '1'; + hit_way := r1.prev_way; end if; -- The way that matched on a hit req_hit_way <= hit_way; + req_is_hit <= is_hit; + req_hit_reload <= hit_reload; -- work out whether we have permission for this access -- NB we don't yet implement AMR, thus no KUAP @@ -1056,29 +1141,44 @@ begin -- operation needs to be done -- nc := r0.req.nc or perm_attr.nocache; - op := OP_NONE; + req_op_bad <= '0'; + req_op_load_hit <= '0'; + req_op_load_miss <= '0'; + req_op_store <= '0'; + req_op_nop <= '0'; + req_op_flush <= '0'; + req_op_sync <= '0'; if go = '1' then - if access_ok = '0' then - op := OP_BAD; - elsif cancel_store = '1' then - op := OP_STCX_FAIL; + if r0.req.sync = '1' then + req_op_sync <= '1'; + elsif r0.req.touch = '1' then + if access_ok = '1' and is_hit = '0' and nc = '0' then + req_op_load_miss <= '1'; + elsif access_ok = '1' and is_hit = '1' and nc = '0' then + -- Make this OP_LOAD_HIT so the PLRU gets updated + req_op_load_hit <= '1'; + else + req_op_nop <= '1'; + end if; + elsif access_ok = '0' then + req_op_bad <= '1'; + elsif r0.req.flush = '1' then + if is_hit = '0' then + req_op_nop <= '1'; + else + req_op_flush <= '1'; + end if; + elsif nc = '1' and (is_hit = '1' or r0.req.reserve = '1') then + req_op_bad <= '1'; + elsif r0.req.load = '0' then + req_op_store <= '1'; -- includes dcbz else - opsel := r0.req.load & nc & is_hit; - case opsel is - when "101" => op := OP_LOAD_HIT; - when "100" => op := OP_LOAD_MISS; - when "110" => op := OP_LOAD_NC; - when "001" => op := OP_STORE_HIT; - when "000" => op := OP_STORE_MISS; - when "010" => op := OP_STORE_MISS; - when "011" => op := OP_BAD; - when "111" => op := OP_BAD; - when others => op := OP_NONE; - end case; + req_op_load_hit <= is_hit; + req_op_load_miss <= not is_hit; -- includes non-cacheable loads end if; end if; - req_op <= op; req_go <= go; + req_nc <= nc; -- Version of the row number that is valid one cycle earlier -- in the cases where we need to read the cache data BRAM. 
@@ -1101,45 +1201,6 @@ begin -- Wire up wishbone request latch out of stage 1 wishbone_out <= r1.wb; - -- Handle load-with-reservation and store-conditional instructions - reservation_comb: process(all) - begin - cancel_store <= '0'; - set_rsrv <= '0'; - clear_rsrv <= '0'; - if r0_valid = '1' and r0.req.reserve = '1' then - -- XXX generate alignment interrupt if address is not aligned - -- XXX or if r0.req.nc = '1' - if r0.req.load = '1' then - -- load with reservation - set_rsrv <= '1'; - else - -- store conditional - clear_rsrv <= '1'; - if reservation.valid = '0' or - r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then - cancel_store <= '1'; - end if; - end if; - end if; - end process; - - reservation_reg: process(clk) - begin - if rising_edge(clk) then - if rst = '1' then - reservation.valid <= '0'; - elsif r0_valid = '1' and access_ok = '1' then - if clear_rsrv = '1' then - reservation.valid <= '0'; - elsif set_rsrv = '1' then - reservation.valid <= '1'; - reservation.addr <= r0.req.addr(63 downto LINE_OFF_BITS); - end if; - end if; - end if; - end process; - -- Return data for loads & completion control logic -- writeback_control: process(all) @@ -1149,6 +1210,7 @@ begin d_out.store_done <= not r1.stcx_fail; d_out.error <= r1.ls_error; d_out.cache_paradox <= r1.cache_paradox; + d_out.reserve_nc <= r1.reserve_nc; -- Outputs to MMU m_out.done <= r1.mmu_done; @@ -1185,7 +1247,7 @@ begin report "completing ld/st with error"; end if; - -- Slow ops (load miss, NC, stores) + -- Slow ops (load miss, NC, stores, sync) if r1.slow_valid = '1' then report "completing store or load miss data=" & to_hstring(r1.data_out); end if; @@ -1288,14 +1350,6 @@ begin variable data_out : std_ulogic_vector(63 downto 0); begin if rising_edge(clk) then - if req_op /= OP_NONE then - report "op:" & op_t'image(req_op) & - " addr:" & to_hstring(r0.req.addr) & - " nc:" & std_ulogic'image(r0.req.nc) & - " idx:" & to_hstring(req_index) & - " tag:" & to_hstring(req_tag) & - " way: " & 
to_hstring(req_hit_way); - end if; if r0_valid = '1' then r1.mmu_req <= r0.mmu_req; end if; @@ -1341,36 +1395,19 @@ begin r1.forward_valid <= '1'; end if; - -- Fast path for load/store hits. Set signals for the writeback controls. - if req_op = OP_LOAD_HIT then - r1.hit_load_valid <= '1'; - else - r1.hit_load_valid <= '0'; - end if; + r1.hit_load_valid <= req_op_load_hit; + r1.cache_hit <= req_op_load_hit or (req_op_store and req_is_hit); -- causes PLRU update - -- The cache hit indication is used for PLRU updates - if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then - r1.cache_hit <= '1'; - else - r1.cache_hit <= '0'; - end if; - - if req_op = OP_BAD then + r1.cache_paradox <= access_ok and req_nc and req_is_hit; + r1.reserve_nc <= access_ok and r0.req.reserve and req_nc; + if req_op_bad = '1' then report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) & " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok); r1.ls_error <= not r0.mmu_req; r1.mmu_error <= r0.mmu_req; - r1.cache_paradox <= access_ok; else r1.ls_error <= '0'; r1.mmu_error <= '0'; - r1.cache_paradox <= '0'; - end if; - - if req_op = OP_STCX_FAIL then - r1.stcx_fail <= '1'; - else - r1.stcx_fail <= '0'; end if; -- Record TLB hit information for updating TLB PLRU @@ -1423,6 +1460,10 @@ begin r1.acks_pending <= to_unsigned(0, 3); r1.stalled <= '0'; r1.dec_acks <= '0'; + r1.prev_hit <= '0'; + r1.prev_hit_reload <= '0'; + reservation.valid <= '0'; + reservation.addr <= (others => '0'); -- Not useful normally but helps avoiding tons of sim warnings r1.wb.adr <= (others => '0'); @@ -1430,27 +1471,33 @@ begin -- One cycle pulses reset r1.slow_valid <= '0'; r1.write_bram <= '0'; + r1.stcx_fail <= '0'; - r1.ls_valid <= '0'; + r1.ls_valid <= (req_op_load_hit or req_op_nop) and not r0.mmu_req; -- complete tlbies and TLB loads in the third cycle - r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); - if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then - if r0.mmu_req = 
'0' then - r1.ls_valid <= '1'; - else - r1.mmu_done <= '1'; - end if; + r1.mmu_done <= (r0_valid and (r0.tlbie or r0.tlbld)) or + (req_op_load_hit and r0.mmu_req); + + -- The kill_rsrv2 term covers the case where the reservation + -- address was set at the beginning of this cycle, and a store + -- to that address happened in the previous cycle. + if kill_rsrv = '1' or kill_rsrv2 = '1' then + reservation.valid <= '0'; + end if; + if req_go = '1' and access_ok = '1' and r0.req.load = '1' and + r0.req.reserve = '1' and r0.req.atomic_first = '1' then + reservation.addr <= ra(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS); + reservation.valid <= req_is_hit and not req_snoop_hit; end if; -- Do invalidations from snooped stores to memory if snoop_valid = '1' then - assert not is_X(snoop_tag_set); - assert not is_X(snoop_wrtag); + assert not is_X(snoop_paddr); + assert not is_X(snoop_hits); end if; for i in 0 to NUM_WAYS-1 loop - if snoop_valid = '1' and read_tag(i, snoop_tag_set) = snoop_wrtag then - assert not is_X(snoop_index); - cache_valids(to_integer(snoop_index))(i) <= '0'; + if snoop_hits(i) = '1' then + cache_valids(to_integer(get_index(snoop_paddr)))(i) <= '0'; end if; end loop; @@ -1469,14 +1516,24 @@ begin end if; -- Take request from r1.req if there is one there, - -- else from req_op, ra, etc. + -- else from req_op_*, ra, etc. 
if r1.full = '1' then req := r1.req; else - req.op := req_op; + req.op_lmiss := req_op_load_miss; + req.op_store := req_op_store; + req.op_flush := req_op_flush; + req.op_sync := req_op_sync; + req.nc := req_nc; req.valid := req_go; req.mmu_req := r0.mmu_req; req.dcbz := r0.req.dcbz; + req.flush := r0.req.flush; + req.touch := r0.req.touch; + req.sync := r0.req.sync; + req.reserve := r0.req.reserve; + req.first_dw := not r0.req.atomic_qw or r0.req.atomic_first; + req.last_dw := not r0.req.atomic_qw or r0.req.atomic_last; req.real_addr := ra; -- Force data to 0 for dcbz if r0.req.dcbz = '1' then @@ -1493,14 +1550,16 @@ begin req.byte_sel := r0.req.byte_sel; end if; req.hit_way := req_hit_way; + req.is_hit := req_is_hit; req.same_tag := req_same_tag; -- Store the incoming request from r0, if it is a slow request - -- Note that r1.full = 1 implies req_op = OP_NONE - if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC or - req_op = OP_STORE_MISS or req_op = OP_STORE_HIT then + -- Note that r1.full = 1 implies none of the req_op_* are 1. + -- For the sake of timing we put any valid request in r1.req, + -- but only set r1.full if it is a slow request. + if req_go = '1' then r1.req <= req; - r1.full <= '1'; + r1.full <= req_op_load_miss or req_op_store or req_op_flush or req_op_sync; end if; end if; @@ -1512,9 +1571,14 @@ begin r1.victim_way <= plru_victim; report "victim way:" & to_hstring(plru_victim); end if; - if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then + if req_op_load_miss = '1' or (r0.req.dcbz = '1' and req_is_hit = '0') then r1.choose_victim <= '1'; end if; + if req_go = '1' then + r1.prev_hit <= req_is_hit; + r1.prev_way <= req_hit_way; + r1.prev_hit_reload <= req_hit_reload; + end if; -- Update count of pending acks acks := r1.acks_pending; @@ -1536,6 +1600,7 @@ begin r1.wb.sel <= req.byte_sel; r1.wb.dat <= req.data; r1.dcbz <= req.dcbz; + r1.atomic_more <= not req.last_dw; -- Keep track of our index and way for subsequent stores. 
r1.store_index <= get_index(req.real_addr); @@ -1544,44 +1609,52 @@ begin r1.reload_tag <= get_tag(req.real_addr); r1.req.same_tag <= '1'; - if req.op = OP_STORE_HIT then + if req.is_hit = '1' then r1.store_way <= req.hit_way; end if; - -- Reset per-row valid bits, ready for handling OP_LOAD_MISS + -- Reset per-row valid bits, ready for handling the next load miss for i in 0 to ROW_PER_LINE - 1 loop r1.rows_valid(i) <= '0'; end loop; - case req.op is - when OP_LOAD_HIT => - -- stay in IDLE state - - when OP_LOAD_MISS => + if req.op_lmiss = '1' then -- Normal load cache miss, start the reload machine - -- - report "cache miss real addr:" & to_hstring(req.real_addr) & - " idx:" & to_hstring(get_index(req.real_addr)) & - " tag:" & to_hstring(get_tag(req.real_addr)); + -- Or non-cacheable load + if req.nc = '0' then + report "cache miss real addr:" & to_hstring(req.real_addr) & + " idx:" & to_hstring(get_index(req.real_addr)) & + " tag:" & to_hstring(get_tag(req.real_addr)); + end if; -- Start the wishbone cycle r1.wb.we <= '0'; r1.wb.cyc <= '1'; r1.wb.stb <= '1'; - -- Track that we had one request sent - r1.state <= RELOAD_WAIT_ACK; - r1.write_tag <= '1'; - ev.load_miss <= '1'; + if req.nc = '0' then + -- Track that we had one request sent + r1.state <= RELOAD_WAIT_ACK; + r1.write_tag <= '1'; + ev.load_miss <= '1'; - when OP_LOAD_NC => - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; - r1.wb.we <= '0'; - r1.state <= NC_LOAD_WAIT_ACK; + -- If this is a touch, complete the instruction + if req.touch = '1' then + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + end if; + else + r1.state <= NC_LOAD_WAIT_ACK; + end if; + end if; - when OP_STORE_HIT | OP_STORE_MISS => - if req.dcbz = '0' then + if req.op_store = '1' then + if req.reserve = '1' then + -- stcx needs to wait until next cycle + -- for the reservation address check + r1.state <= DO_STCX; + elsif req.dcbz = '0' then r1.state <= STORE_WAIT_ACK; r1.full <= '0'; r1.slow_valid <= '1'; @@ -1590,30 +1663,33 @@ 
begin else r1.mmu_done <= '1'; end if; - if req.op = OP_STORE_HIT then - r1.write_bram <= '1'; - end if; + r1.write_bram <= req.is_hit; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; else -- dcbz is handled much like a load miss except -- that we are writing to memory instead of reading r1.state <= RELOAD_WAIT_ACK; - if req.op = OP_STORE_MISS then - r1.write_tag <= '1'; - end if; - end if; - r1.wb.we <= '1'; - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; - if req.op = OP_STORE_MISS then - ev.store_miss <= '1'; + r1.write_tag <= not req.is_hit; + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; end if; + ev.store_miss <= not req.is_hit; + end if; - -- OP_NONE and OP_BAD do nothing - -- OP_BAD & OP_STCX_FAIL were handled above already - when OP_NONE => - when OP_BAD => - when OP_STCX_FAIL => - end case; + if req.op_flush = '1' then + r1.state <= FLUSH_CYCLE; + end if; + + if req.op_sync = '1' then + -- sync/lwsync can complete now that the state machine + -- is idle. + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + end if; when RELOAD_WAIT_ACK => -- If we are still sending requests, was one accepted ? @@ -1643,7 +1719,7 @@ begin assert not is_X(r1.req.real_addr); end if; if r1.full = '1' and r1.req.same_tag = '1' and - ((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op = OP_LOAD_MISS) and + ((r1.dcbz = '1' and r1.req.dcbz = '1') or r1.req.op_lmiss = '1') and r1.store_row = get_row(r1.req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; @@ -1652,6 +1728,10 @@ begin else r1.mmu_done <= '1'; end if; + -- NB: for lqarx, set the reservation on the first dword + if r1.req.reserve = '1' and r1.req.first_dw = '1' then + reservation.valid <= '1'; + end if; end if; -- Check for completion @@ -1667,6 +1747,10 @@ begin cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '1'; ev.dcache_refill <= not r1.dcbz; + -- Second half of a lq/lqarx can assume a hit on this line now + -- if the first half hit this line. 
+ r1.prev_hit <= r1.prev_hit_reload; + r1.prev_way <= r1.store_way; r1.state <= IDLE; end if; @@ -1680,6 +1764,10 @@ begin if wishbone_in.stall = '0' then -- See if there is another store waiting to be done -- which is in the same real page. + -- This could be either in r1.req or in r0. + -- Ignore store-conditionals, they have to go through + -- DO_STCX state, unless they are the second half of a + -- successful stqcx, which is handled here. if req.valid = '1' then r1.wb.adr(SET_SIZE_BITS - ROW_OFF_BITS - 1 downto 0) <= req.real_addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS); @@ -1687,30 +1775,33 @@ begin r1.wb.sel <= req.byte_sel; end if; assert not is_X(acks); - if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and - (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then - r1.wb.stb <= '1'; - stbs_done := false; - r1.store_way <= req.hit_way; - r1.store_row <= get_row(req.real_addr); - if req.op = OP_STORE_HIT then - r1.write_bram <= '1'; + r1.wb.stb <= '0'; + if req.op_store = '1' and req.same_tag = '1' and req.dcbz = '0' and + (req.reserve = '0' or r1.atomic_more = '1') then + if acks < 7 then + r1.wb.stb <= '1'; + stbs_done := false; + r1.store_way <= req.hit_way; + r1.store_row <= get_row(req.real_addr); + r1.write_bram <= req.is_hit; + r1.atomic_more <= not req.last_dw; + r1.full <= '0'; + r1.slow_valid <= '1'; + -- Store requests never come from the MMU + r1.ls_valid <= '1'; end if; - r1.full <= '0'; - r1.slow_valid <= '1'; - -- Store requests never come from the MMU - r1.ls_valid <= '1'; - stbs_done := false; else - r1.wb.stb <= '0'; stbs_done := true; + if req.valid = '1' then + r1.atomic_more <= '0'; + end if; end if; end if; -- Got ack ? See if complete. 
- if wishbone_in.ack = '1' then + if stbs_done and r1.atomic_more = '0' then assert not is_X(acks); - if stbs_done and acks = 1 then + if acks = 0 or (wishbone_in.ack = '1' and acks = 1) then r1.state <= IDLE; r1.wb.cyc <= '0'; r1.wb.stb <= '0'; @@ -1736,6 +1827,51 @@ begin r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; + + when DO_STCX => + if reservation.valid = '0' or kill_rsrv = '1' or + r1.req.real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) /= reservation.addr then + -- Wrong address, didn't have reservation, or lost reservation + -- Abandon the wishbone cycle if started and fail the stcx. + r1.stcx_fail <= '1'; + r1.full <= '0'; + r1.ls_valid <= '1'; + r1.state <= IDLE; + r1.wb.cyc <= '0'; + r1.wb.stb <= '0'; + reservation.valid <= '0'; + -- If this is the first half of a stqcx., the second half + -- will fail also because the reservation is not valid. + r1.state <= IDLE; + elsif r1.wb.cyc = '0' then + -- Right address and have reservation, so start the + -- wishbone cycle + r1.wb.we <= '1'; + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + elsif r1.wb.stb = '1' and wishbone_in.stall = '0' then + -- Store has been accepted, so now we can write the + -- cache data RAM and complete the request + r1.write_bram <= r1.req.is_hit; + r1.wb.stb <= '0'; + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + reservation.valid <= '0'; + -- For a stqcx, STORE_WAIT_ACK will issue the second half + -- without checking the reservation, which is what we want + -- given that the first half has gone out. + -- With r1.atomic_more set, STORE_WAIT_ACK won't exit to + -- IDLE state until it sees the second half. 
+ r1.state <= STORE_WAIT_ACK; + end if; + + when FLUSH_CYCLE => + cache_valids(to_integer(r1.store_index))(to_integer(r1.store_way)) <= '0'; + r1.full <= '0'; + r1.slow_valid <= '1'; + r1.ls_valid <= '1'; + r1.state <= IDLE; end case; end if; end if; @@ -1753,7 +1889,7 @@ begin r1.wb.stb & r1.wb.cyc & d_out.error & d_out.valid & - std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) & + req_op_load_miss & req_op_store & req_op_bad & stall_out & std_ulogic_vector(resize(tlb_hit_way, 3)) & valid_ra & diff --git a/decode1.vhdl b/decode1.vhdl index 151977d..0ea9ed1 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -44,6 +44,8 @@ architecture behaviour of decode1 is signal decode_rom_addr : insn_code; signal decode : decode_rom_t; + signal double : std_ulogic; + type prefix_state_t is record prefixed : std_ulogic; prefix : std_ulogic_vector(25 downto 0); @@ -106,6 +108,7 @@ architecture behaviour of decode1 is INSN_brd => (ALU, NONE, OP_BREV, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cbcdtd => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cdtbcd => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_cfuged => (ALU, NONE, OP_BSORT, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmp => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), INSN_cmpb => (ALU, NONE, OP_CMPB, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmpeqb => (ALU, NONE, OP_CMPEQB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -113,10 +116,10 @@ architecture behaviour of decode1 is INSN_cmpl => 
(ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmpli => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_cmprb => (ALU, NONE, OP_CMPRB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_cntlzd => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), - INSN_cntlzw => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), - INSN_cnttzd => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), - INSN_cnttzw => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), + INSN_cntlzd => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), + INSN_cntlzw => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), + INSN_cnttzd => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), + INSN_cnttzw => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), INSN_crand => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_crandc => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_creqv => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', 
'0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -126,10 +129,10 @@ architecture behaviour of decode1 is INSN_crorc => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_crxor => (ALU, NONE, OP_CROP, NONE, NONE, NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_darn => (ALU, NONE, OP_DARN, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbf => (ALU, NONE, OP_DCBF, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbf => (LDST, NONE, OP_DCBF, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbst => (ALU, NONE, OP_DCBST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbt => (ALU, NONE, OP_DCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_dcbtst => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbt => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_dcbtst => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_dcbz => (LDST, NONE, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_divd => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RCOE, '0', '0', NONE), INSN_divde => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, 
'0', NONE, '0', '0', '0', '0', '0', '1', RCOE, '0', '0', NONE), @@ -197,7 +200,7 @@ architecture behaviour of decode1 is INSN_ftdiv => (FPU, FPU, OP_FP_CMP, FRA, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_ftsqrt => (FPU, FPU, OP_FP_CMP, NONE, FRB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_icbi => (ALU, NONE, OP_ICBI, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), - INSN_icbt => (ALU, NONE, OP_ICBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), + INSN_icbt => (ALU, NONE, OP_ICBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_isel => (ALU, NONE, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_isync => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_lbarx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), @@ -234,6 +237,8 @@ architecture behaviour of decode1 is INSN_lhzu => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), INSN_lhzux => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), INSN_lhzx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_lq => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DQ, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRTP), + 
INSN_lqarx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', DRTP), INSN_lwa => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DS, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_lwarx => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), INSN_lwaux => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), @@ -281,12 +286,15 @@ architecture behaviour of decode1 is INSN_ori => (ALU, NONE, OP_LOGIC, NONE, CONST_UI, RS, RA, '0', '0', '1', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), INSN_oris => (ALU, NONE, OP_LOGIC, NONE, CONST_UI_HI, RS, RA, '0', '0', '1', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), INSN_paddi => (ALU, NONE, OP_ADD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_pdepd => (ALU, NONE, OP_BSORT, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_pextd => (ALU, NONE, OP_BSORT, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plbz => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_pld => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plfd => (LDST, FPU, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plfs => (LDST, FPU, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, 
'0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), INSN_plha => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plhz => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_plq => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRTP), INSN_plwa => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_plwz => (LDST, NONE, OP_LOAD, RA0_OR_CIA, CONST_PSI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_pnop => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -295,13 +303,15 @@ architecture behaviour of decode1 is INSN_pstfd => (LDST, FPU, OP_STORE, RA0_OR_CIA, CONST_PSI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_pstfs => (LDST, FPU, OP_STORE, RA0_OR_CIA, CONST_PSI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), INSN_psth => (LDST, NONE, OP_STORE, RA0_OR_CIA, CONST_PSI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_pstq => (LDST, NONE, OP_STORE, RA0_OR_CIA, CONST_PSI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRSP), INSN_pstw => (LDST, NONE, OP_STORE, RA0_OR_CIA, CONST_PSI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_popcntb => (ALU, NONE, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), 
- INSN_popcntd => (ALU, NONE, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), - INSN_popcntw => (ALU, NONE, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_popcntb => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_popcntd => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_popcntw => (ALU, NONE, OP_COUNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_prtyd => (ALU, NONE, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_prtyw => (ALU, NONE, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_rfid => (ALU, NONE, OP_RFID, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_rfscv => (ALU, NONE, OP_RFID, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_rldcl => (ALU, NONE, OP_RLCL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), INSN_rldcr => (ALU, NONE, OP_RLCR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), INSN_rldic => (ALU, NONE, OP_RLC, NONE, CONST_SH, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), @@ -352,6 +362,8 @@ architecture behaviour of decode1 is INSN_sthu => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), INSN_sthux => 
(LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), INSN_sthx => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_stq => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRSP), + INSN_stqcx => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', DRSP), INSN_stw => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_stwbrx => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_stwcix => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '1', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -365,7 +377,7 @@ architecture behaviour of decode1 is INSN_subfic => (ALU, NONE, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_subfme => (ALU, NONE, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RCOE, '0', '0', NONE), INSN_subfze => (ALU, NONE, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RCOE, '0', '0', NONE), - INSN_sync => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_sync => (LDST, NONE, OP_SYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), INSN_td => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', 
NONE), INSN_tdi => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_tlbie => (LDST, NONE, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -373,7 +385,7 @@ architecture behaviour of decode1 is INSN_tlbsync => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_tw => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), INSN_twi => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), - INSN_wait => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + INSN_wait => (ALU, NONE, OP_WAIT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), INSN_xor => (ALU, NONE, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), INSN_xori => (ALU, NONE, OP_XOR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), INSN_xoris => (ALU, NONE, OP_XOR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), @@ -384,7 +396,7 @@ architecture behaviour of decode1 is function decode_ram_spr(sprn : spr_num_t) return ram_spr_info is variable ret : ram_spr_info; begin - ret := (index => (others => '0'), isodd => '0', valid => '1'); + ret := (index => (others => '0'), isodd => '0', is32b => '0', valid => '1'); case sprn is when SPR_LR => ret.index := RAMSPR_LR; @@ -418,6 +430,10 @@ architecture behaviour of decode1 is when SPR_HSPRG1 => ret.index := RAMSPR_HSPRG1; 
ret.isodd := '1'; + when SPR_VRSAVE => + ret.index := RAMSPR_VRSAVE; + ret.isodd := '1'; + ret.is32b := '1'; when others => ret.valid := '0'; end case; @@ -427,9 +443,11 @@ architecture behaviour of decode1 is function map_spr(sprn : spr_num_t) return spr_id is variable i : spr_id; begin - i.sel := "000"; + i.sel := "0000"; i.valid := '1'; i.ispmu := '0'; + i.ronly := '0'; + i.wonly := '0'; case sprn is when SPR_TB => i.sel := SPRSEL_TB; @@ -452,6 +470,24 @@ architecture behaviour of decode1 is i.sel := SPRSEL_CFAR; when SPR_XER => i.sel := SPRSEL_XER; + when SPR_FSCR => + i.sel := SPRSEL_FSCR; + when SPR_HFSCR => + i.sel := SPRSEL_HFSCR; + when SPR_HEIR => + i.sel := SPRSEL_HEIR; + when SPR_CTRL => + i.sel := SPRSEL_CTRL; + i.ronly := '1'; + when SPR_CTRLW => + i.sel := SPRSEL_CTRL; + i.wonly := '1'; + when SPR_UDSCR => + i.sel := SPRSEL_DSCR; + when SPR_DSCR => + i.sel := SPRSEL_DSCR; + when SPR_PIR => + i.sel := SPRSEL_PIR; when others => i.valid := '0'; end case; @@ -459,6 +495,8 @@ architecture behaviour of decode1 is end; begin + double <= not r.second when (r.valid = '1' and decode.repeat /= NONE) else '0'; + decode1_0: process(clk) begin if rising_edge(clk) then @@ -471,10 +509,15 @@ begin fetch_failed <= '0'; pr <= prefix_state_init; elsif stall_in = '0' then - r <= rin; - fetch_failed <= f_in.fetch_failed; - if f_in.valid = '1' then - pr <= pr_in; + if double = '0' then + r <= rin; + fetch_failed <= f_in.fetch_failed; + if f_in.valid = '1' then + pr <= pr_in; + end if; + else + r.second <= '1'; + r.reg_c <= rin.reg_c; end if; end if; if rst = '1' then @@ -485,12 +528,12 @@ begin end if; end process; - busy_out <= stall_in; + busy_out <= stall_in or double; decode1_rom: process(clk) begin if rising_edge(clk) then - if stall_in = '0' then + if stall_in = '0' and double = '0' then decode <= decode_rom(decode_rom_addr); end if; end if; @@ -521,7 +564,7 @@ begin v.big_endian := f_in.big_endian; if is_X(f_in.insn) then - v.spr_info := (sel => "XXX", others => 
'X'); + v.spr_info := (sel => "XXXX", others => 'X'); v.ram_spr := (index => (others => 'X'), others => 'X'); else sprn := decode_spr_num(f_in.insn); @@ -620,33 +663,55 @@ begin -- Work out GPR/FPR read addresses -- Note that for prefixed instructions we are working this out based -- only on the suffix. - maybe_rb := '0'; - vr.reg_1_addr := '0' & insn_ra(f_in.insn); - vr.reg_2_addr := '0' & insn_rb(f_in.insn); - vr.reg_3_addr := '0' & insn_rs(f_in.insn); - if icode >= INSN_first_rb then - maybe_rb := '1'; - if icode < INSN_first_frs then - if icode >= INSN_first_rc then - vr.reg_3_addr := '0' & insn_rcreg(f_in.insn); - end if; - else - -- access FRS operand - vr.reg_3_addr(5) := '1'; - if icode >= INSN_first_frab then - -- access FRA and/or FRB operands - vr.reg_1_addr(5) := '1'; - vr.reg_2_addr(5) := '1'; - end if; - if icode >= INSN_first_frabc then - -- access FRC operand - vr.reg_3_addr := '1' & insn_rcreg(f_in.insn); + if double = '0' then + maybe_rb := '0'; + vr.reg_1_addr := '0' & insn_ra(f_in.insn); + vr.reg_2_addr := '0' & insn_rb(f_in.insn); + vr.reg_3_addr := '0' & insn_rs(f_in.insn); + if icode >= INSN_first_rb then + maybe_rb := '1'; + if icode < INSN_first_frs then + if icode >= INSN_first_rc then + vr.reg_3_addr := '0' & insn_rcreg(f_in.insn); + end if; + else + -- access FRS operand + vr.reg_3_addr(5) := '1'; + if icode >= INSN_first_frab then + -- access FRA and/or FRB operands + vr.reg_1_addr(5) := '1'; + vr.reg_2_addr(5) := '1'; + end if; + if icode >= INSN_first_frabc then + -- access FRC operand + vr.reg_3_addr := '1' & insn_rcreg(f_in.insn); + end if; end if; end if; + -- See if this is an instruction where repeat_t = DRSP and we need + -- to read RS|1 followed by RS, i.e. stq or stqcx. 
in LE mode + -- (note we don't have access to the decode for the current instruction) + if (icode = INSN_stq or icode = INSN_stqcx) and f_in.big_endian = '0' then + vr.reg_3_addr(0) := '1'; + end if; + vr.read_1_enable := f_in.valid; + vr.read_2_enable := f_in.valid and maybe_rb; + vr.read_3_enable := f_in.valid; + else + -- second instance of a doubled instruction + vr.reg_1_addr := r.reg_a; + vr.reg_2_addr := r.reg_b; + vr.reg_3_addr := r.reg_c; + vr.read_1_enable := '0'; -- (not actually used) + vr.read_2_enable := '0'; + vr.read_3_enable := '1'; -- (not actually used) + -- For pstq, and for stq and stqcx in BE mode, + -- we need to read register RS|1 in the cycle after we read RS; + -- stq and stqcx in LE mode read RS. + if decode.repeat = DRSP then + vr.reg_3_addr(0) := r.prefixed or f_in.big_endian; + end if; end if; - vr.read_1_enable := f_in.valid; - vr.read_2_enable := f_in.valid and maybe_rb; - vr.read_3_enable := f_in.valid; v.reg_a := vr.reg_1_addr; v.reg_b := vr.reg_2_addr; diff --git a/decode2.vhdl b/decode2.vhdl index a68bc8b..7e993d5 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -232,12 +232,13 @@ architecture behaviour of decode2 is ); constant subresult_select : mux_select_array_t := ( - OP_MUL_L64 => "000", -- muldiv_result - OP_MUL_H64 => "001", - OP_MUL_H32 => "010", - OP_DIV => "011", - OP_DIVE => "011", - OP_MOD => "011", + OP_MUL_L64 => "000", -- multicyc_result + OP_MUL_H64 => "010", + OP_MUL_H32 => "001", + OP_DIV => "101", + OP_DIVE => "101", + OP_MOD => "101", + OP_BSORT => "100", OP_ADDG6S => "001", -- misc_result OP_ISEL => "010", OP_DARN => "011", @@ -347,7 +348,8 @@ begin elsif deferred = '0' then if dc2in.e.valid = '1' then report "execute " & to_hstring(dc2in.e.nia) & - " tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid); + " tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid) & + " rpt=" & std_ulogic'image(dc2in.e.repeat) & " 2nd=" & 
std_ulogic'image(dc2in.e.second) & " wr=" & to_hstring(dc2in.e.write_reg); end if; dc2 <= dc2in; elsif dc2.read_rspr = '0' then @@ -376,6 +378,31 @@ begin dec_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, d_in.prefix); dec_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn); dec_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn); + case d_in.decode.repeat is + when DUPD => + if d_in.second = '1' then + -- update-form loads, 2nd instruction writes RA + dec_o.reg := dec_a.reg; + end if; + when DRSP => + -- non-prefixed stq, stqcx do RS|1, RS in LE mode; others do RS, RS|1 + if d_in.second = (d_in.big_endian or d_in.prefixed) then + dec_c.reg(0) := '1'; -- do RS, RS|1 + end if; + when DRTP => + -- non-prefixed lq, lqarx do RT|1, RT in LE mode; others do RT, RT|1 + if d_in.second = (d_in.big_endian or d_in.prefixed) then + dec_o.reg(0) := '1'; + end if; + when others => + end case; + -- For the second instance of a doubled instruction, we ignore the RA + -- and RB operands, in order to avoid false dependencies on the output + -- of the first instance. 
+ if d_in.second = '1' then + dec_a.reg_valid := '0'; + dec_b.reg_valid := '0'; + end if; if d_in.valid = '0' or d_in.illegal_suffix = '1' then dec_a.reg_valid := '0'; dec_b.reg_valid := '0'; @@ -420,6 +447,8 @@ begin v.e.input_cr := d_in.decode.input_cr; v.e.output_cr := d_in.decode.output_cr; + v.e.spr_select := d_in.spr_info; + -- Work out whether XER SO/OV/OV32 bits are set -- or used by this instruction v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); @@ -450,8 +479,15 @@ begin v.input_ov := '1'; when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR => unit := LDST; + when SPR_TAR => + v.e.uses_tar := '1'; + when SPR_UDSCR => + v.e.uses_dscr := '1'; when others => end case; + if d_in.spr_info.wonly = '1' then + v.e.spr_select.valid := '0'; + end if; end if; when OP_MTSPR => if is_X(d_in.insn) then @@ -468,9 +504,15 @@ begin if d_in.valid = '1' then v.sgl_pipe := '1'; end if; + when SPR_TAR => + v.e.uses_tar := '1'; + when SPR_UDSCR => + v.e.uses_dscr := '1'; when others => end case; - if d_in.spr_info.valid = '1' and d_in.valid = '1' then + if d_in.spr_info.ronly = '1' then + v.e.spr_select.valid := '0'; + elsif d_in.spr_info.valid = '1' and d_in.valid = '1' then v.sgl_pipe := '1'; end if; end if; @@ -496,12 +538,10 @@ begin end if; v.e.dec_ctr := decctr; - v.repeat := d_in.decode.repeat; if d_in.decode.repeat /= NONE then v.e.repeat := '1'; end if; - - v.e.spr_select := d_in.spr_info; + v.e.second := d_in.second; if decctr = '1' then -- read and write CTR @@ -525,12 +565,14 @@ begin v.e.ramspr_rd_odd := '1'; else v.e.ramspr_even_rdaddr := RAMSPR_TAR; + v.e.uses_tar := '1'; end if; sprs_busy := '1'; when OP_MFSPR => v.e.ramspr_even_rdaddr := d_in.ram_spr.index; v.e.ramspr_odd_rdaddr := d_in.ram_spr.index; v.e.ramspr_rd_odd := d_in.ram_spr.isodd; + v.e.ramspr_32bit := d_in.ram_spr.is32b; v.e.spr_is_ram := d_in.ram_spr.valid; sprs_busy := d_in.ram_spr.valid; when OP_MTSPR => @@ -539,8 +581,19 @@ begin v.e.ramspr_write_odd := d_in.ram_spr.valid and d_in.ram_spr.isodd; 
v.e.spr_is_ram := d_in.ram_spr.valid; when OP_RFID => - v.e.ramspr_even_rdaddr := RAMSPR_SRR0; - v.e.ramspr_odd_rdaddr := RAMSPR_SRR1; + if d_in.insn(7) = '1' then + -- rfscv + v.e.ramspr_even_rdaddr := RAMSPR_LR; + v.e.ramspr_odd_rdaddr := RAMSPR_CTR; + elsif d_in.insn(9) = '0' then + -- rfid + v.e.ramspr_even_rdaddr := RAMSPR_SRR0; + v.e.ramspr_odd_rdaddr := RAMSPR_SRR1; + else + -- hrfid + v.e.ramspr_even_rdaddr := RAMSPR_HSRR0; + v.e.ramspr_odd_rdaddr := RAMSPR_HSRR1; + end if; sprs_busy := '1'; when others => end case; @@ -590,23 +643,28 @@ begin if op = OP_MFSPR then if d_in.ram_spr.valid = '1' then v.e.result_sel := "101"; -- ramspr_result - elsif d_in.spr_info.valid = '0' then + elsif d_in.spr_info.valid = '0' or d_in.spr_info.wonly = '1' then -- Privileged mfspr to invalid/unimplemented SPR numbers -- writes the contents of RT back to RT (i.e. it's a no-op) v.e.result_sel := "001"; -- logical_result end if; end if; v.e.prefixed := d_in.prefixed; + v.e.prefix := d_in.prefix; v.e.illegal_suffix := d_in.illegal_suffix; v.e.misaligned_prefix := d_in.misaligned_prefix; - elsif dc2.e.valid = '1' then - -- dc2.busy = 1 and dc2.e.valid = 1, thus this must be a repeated instruction. - -- Set up for the second iteration (if deferred = 1 this will all be ignored) - v.e.second := '1'; - -- DUPD is the only possibility here: - -- update-form loads, 2nd instruction writes RA - v.e.write_reg := dc2.e.read_reg1; + -- check for invalid forms that cause an illegal instruction interrupt + -- Does RA = RT for a load quadword instr, or RB = RT for lqarx? + if d_in.decode.repeat = DRTP and + (insn_ra(d_in.insn) = insn_rt(d_in.insn) or + (d_in.decode.reserve = '1' and insn_rb(d_in.insn) = insn_rt(d_in.insn))) then + v.e.illegal_form := '1'; + end if; + -- Is RS/RT odd for a load/store quadword instruction? 
+ if (d_in.decode.repeat = DRSP or d_in.decode.repeat = DRTP) and d_in.insn(21) = '1' then + v.e.illegal_form := '1'; + end if; end if; -- issue control @@ -695,7 +753,7 @@ begin v.e.valid := control_valid_out; v.e.instr_tag := instr_tag; - v.busy := valid_in and (not control_valid_out or (v.e.repeat and not v.e.second)); + v.busy := valid_in and not control_valid_out; stall_out <= dc2.busy or deferred; diff --git a/decode_types.vhdl b/decode_types.vhdl index 5b21fff..5695643 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -6,9 +6,10 @@ package decode_types is OP_ATTN, OP_B, OP_BC, OP_BCREG, OP_BCD, OP_BPERM, OP_BREV, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB, - OP_CNTZ, OP_CROP, - OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, - OP_DCBZ, OP_ICBI, OP_ICBT, + OP_COUNTB, OP_CROP, + OP_DARN, OP_DCBF, OP_DCBST, OP_DCBZ, + OP_SPARE, + OP_ICBI, OP_ICBT, OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC, OP_DIV, OP_DIVE, OP_MOD, OP_EXTS, OP_EXTSWSLI, @@ -18,12 +19,14 @@ package decode_types is OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64, OP_MUL_H64, OP_MUL_H32, - OP_POPCNT, OP_PRTY, OP_RFID, + OP_BSORT, + OP_PRTY, OP_RFID, OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB, OP_SHL, OP_SHR, OP_SYNC, OP_TLBIE, OP_TRAP, OP_XOR, OP_ADDG6S, + OP_WAIT, OP_FETCH_FAILED ); @@ -106,6 +109,7 @@ package decode_types is INSN_prtyw, INSN_prtyd, -- 70 INSN_rfid, + INSN_rfscv, INSN_rldic, INSN_rldicl, INSN_rldicr, @@ -113,8 +117,8 @@ package decode_types is INSN_rlwimi, INSN_rlwinm, INSN_rnop, - INSN_sc, - INSN_setb, -- 80 + INSN_sc, -- 80 + INSN_setb, INSN_slbia, INSN_sradi, INSN_srawi, @@ -122,9 +126,10 @@ package decode_types is INSN_std, INSN_stdu, INSN_sthu, - INSN_stwu, + INSN_stq, + INSN_stwu, -- 90 INSN_subfic, - INSN_subfme, -- 90 + INSN_subfme, INSN_subfze, INSN_sync, INSN_tdi, @@ -132,23 +137,23 @@ package decode_types is INSN_twi, INSN_wait, INSN_xori, - INSN_xoris, - -- pad to 104 - INSN_063, INSN_064, INSN_065, INSN_066, INSN_067, + 
INSN_xoris, -- 100 + -- pad to 102 + INSN_065, -- Non-prefixed instructions that have a MLS:D prefixed form and -- their corresponding prefixed instructions. -- The non-prefixed versions have even indexes so that we can -- convert them to the prefixed version by setting bit 0 - INSN_addi, -- 104 + INSN_addi, -- 102 INSN_paddi, INSN_lbz, INSN_plbz, INSN_lha, INSN_plha, - INSN_lhz, -- 110 + INSN_lhz, INSN_plhz, - INSN_lwz, + INSN_lwz, -- 110 INSN_plwz, INSN_stb, INSN_pstb, @@ -158,15 +163,18 @@ package decode_types is INSN_pstw, -- Slots for non-prefixed opcodes that are 8LS:D when prefixed - INSN_lhzu, -- 120 + INSN_lhzu, INSN_plwa, + INSN_lq, -- 120 + INSN_plq, INSN_op57, INSN_pld, + INSN_op60, + INSN_pstq, INSN_op61, INSN_pstd, -- pad to 128 to simplify comparison logic - INSN_07e, INSN_07f, -- The following instructions have an RB operand but don't access FPRs INSN_add, @@ -177,11 +185,12 @@ package decode_types is INSN_and, INSN_andc, INSN_bperm, + INSN_cfuged, INSN_cmp, INSN_cmpb, INSN_cmpeqb, - INSN_cmpl, - INSN_cmprb, -- 140 + INSN_cmpl, -- 140 + INSN_cmprb, INSN_dcbf, INSN_dcbst, INSN_dcbt, @@ -190,8 +199,8 @@ package decode_types is INSN_divd, INSN_divdu, INSN_divde, - INSN_divdeu, - INSN_divw, -- 150 + INSN_divdeu, -- 150 + INSN_divw, INSN_divwu, INSN_divwe, INSN_divweu, @@ -200,8 +209,8 @@ package decode_types is INSN_icbt, INSN_isel, INSN_lbarx, - INSN_lbzcix, - INSN_lbzux, -- 160 + INSN_lbzcix, -- 160 + INSN_lbzux, INSN_lbzx, INSN_ldarx, INSN_ldbrx, @@ -210,18 +219,19 @@ package decode_types is INSN_ldux, INSN_lharx, INSN_lhax, - INSN_lhaux, - INSN_lhbrx, -- 170 + INSN_lhaux, -- 170 + INSN_lhbrx, INSN_lhzcix, INSN_lhzx, INSN_lhzux, + INSN_lqarx, INSN_lwarx, INSN_lwax, INSN_lwaux, INSN_lwbrx, - INSN_lwzcix, + INSN_lwzcix, -- 180 INSN_lwzx, - INSN_lwzux, -- 180 + INSN_lwzux, INSN_modsd, INSN_modsw, INSN_moduw, @@ -229,52 +239,55 @@ package decode_types is INSN_mulhw, INSN_mulhwu, INSN_mulhd, - INSN_mulhdu, + INSN_mulhdu, -- 190 INSN_mullw, - INSN_mulld, 
-- 190 + INSN_mulld, INSN_nand, INSN_nor, INSN_or, INSN_orc, + INSN_pdepd, + INSN_pextd, INSN_rldcl, - INSN_rldcr, + INSN_rldcr, -- 200 INSN_rlwnm, INSN_slw, INSN_sld, - INSN_sraw, -- 200 + INSN_sraw, INSN_srad, INSN_srw, INSN_srd, INSN_stbcix, INSN_stbcx, - INSN_stbx, + INSN_stbx, -- 210 INSN_stbux, INSN_stdbrx, INSN_stdcix, - INSN_stdcx, -- 210 + INSN_stdcx, INSN_stdx, INSN_stdux, INSN_sthbrx, INSN_sthcix, INSN_sthcx, - INSN_sthx, + INSN_sthx, -- 220 INSN_sthux, + INSN_stqcx, INSN_stwbrx, INSN_stwcix, - INSN_stwcx, -- 220 + INSN_stwcx, INSN_stwx, INSN_stwux, INSN_subf, INSN_subfc, - INSN_subfe, + INSN_subfe, -- 230 INSN_td, INSN_tlbie, INSN_tlbiel, INSN_tw, - INSN_xor, -- 230 + INSN_xor, - -- pad to 232 to simplify comparison logic - INSN_231, + -- pad to 240 to simplify comparison logic + INSN_236, INSN_237, INSN_238, INSN_239, -- The following instructions have a third input addressed by RC INSN_maddld, @@ -282,9 +295,7 @@ package decode_types is INSN_maddhdu, -- pad to 256 to simplify comparison logic - INSN_235, - INSN_236, INSN_237, INSN_238, INSN_239, - INSN_240, INSN_241, INSN_242, INSN_243, + INSN_243, INSN_244, INSN_245, INSN_246, INSN_247, INSN_248, INSN_249, INSN_250, INSN_251, INSN_252, INSN_253, INSN_254, INSN_255, @@ -434,7 +445,9 @@ package decode_types is type length_t is (NONE, is1B, is2B, is4B, is8B); type repeat_t is (NONE, -- instruction is not repeated - DUPD); -- update-form load + DUPD, -- update-form load + DRSP, -- double RS (RS, RS+1) + DRTP); -- double RT (RT, RT+1, or RT+1, RT) type decode_rom_t is record unit : unit_t; @@ -518,6 +531,7 @@ package body decode_types is when INSN_lhau => return "101011"; when INSN_lhz => return "101000"; when INSN_lhzu => return "101001"; + when INSN_lq => return "111000"; when INSN_lwz => return "100000"; when INSN_lwzu => return "100001"; when INSN_mulli => return "000111"; @@ -537,6 +551,7 @@ package body decode_types is when INSN_sth => return "101100"; when INSN_sthu => return "101101"; when 
INSN_stw => return "100100"; + when INSN_stq => return "111110"; when INSN_stwu => return "100101"; when INSN_subfic => return "001000"; when INSN_tdi => return "000010"; @@ -582,6 +597,7 @@ package body decode_types is when INSN_fnmadd => return "111111"; when INSN_prefix => return "000001"; when INSN_op57 => return "111001"; + when INSN_op60 => return "111100"; when INSN_op61 => return "111101"; when INSN_add => return "011111"; when INSN_addc => return "011111"; @@ -649,6 +665,7 @@ package body decode_types is when INSN_lhzcix => return "011111"; when INSN_lhzux => return "011111"; when INSN_lhzx => return "011111"; + when INSN_lqarx => return "011111"; when INSN_lwarx => return "011111"; when INSN_lwaux => return "011111"; when INSN_lwax => return "011111"; @@ -714,6 +731,7 @@ package body decode_types is when INSN_sthcx => return "011111"; when INSN_sthux => return "011111"; when INSN_sthx => return "011111"; + when INSN_stqcx => return "011111"; when INSN_stwbrx => return "011111"; when INSN_stwcix => return "011111"; when INSN_stwcx => return "011111"; diff --git a/execute1.vhdl b/execute1.vhdl index cf73de5..3b7ec2f 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -15,6 +15,7 @@ entity execute1 is SIM : boolean := false; EX1_BYPASS : boolean := true; HAS_FPU : boolean := true; + CPU_INDEX : natural; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -45,6 +46,7 @@ entity execute1 is dbg_ctrl_out : out ctrl_t; + run_out : out std_ulogic; icache_inval : out std_ulogic; terminate_out : out std_ulogic; @@ -79,12 +81,23 @@ architecture behaviour of execute1 is write_xerlow : std_ulogic; write_dec : std_ulogic; write_cfar : std_ulogic; + set_cfar : std_ulogic; write_loga : std_ulogic; inc_loga : std_ulogic; write_pmuspr : std_ulogic; ramspr_write_even : std_ulogic; ramspr_write_odd : std_ulogic; mult_32s : std_ulogic; + write_fscr : std_ulogic; + write_ic : std_ulogic; + write_hfscr : std_ulogic; + write_hic : std_ulogic; + write_heir : 
std_ulogic; + set_heir : std_ulogic; + write_ctrl : std_ulogic; + write_dscr : std_ulogic; + enter_wait : std_ulogic; + scv_trap : std_ulogic; end record; constant side_effect_init : side_effect_type := (others => '0'); @@ -101,16 +114,18 @@ architecture behaviour of execute1 is direct_branch : std_ulogic; start_mul : std_ulogic; start_div : std_ulogic; + start_bsort : std_ulogic; do_trace : std_ulogic; fp_intr : std_ulogic; res2_sel : std_ulogic_vector(1 downto 0); bypass_valid : std_ulogic; ramspr_odd_data : std_ulogic_vector(63 downto 0); + ic : std_ulogic_vector(3 downto 0); end record; constant actions_type_init : actions_type := (e => Execute1ToWritebackInit, se => side_effect_init, new_msr => (others => '0'), res2_sel => "00", - ramspr_odd_data => 64x"0", others => '0'); + ramspr_odd_data => 64x"0", ic => x"0", others => '0'); type reg_stage1_type is record e : Execute1ToWritebackType; @@ -121,7 +136,7 @@ architecture behaviour of execute1 is prev_op : insn_type_t; prev_prefixed : std_ulogic; oe : std_ulogic; - mul_select : std_ulogic_vector(1 downto 0); + mul_select : std_ulogic_vector(2 downto 0); res2_sel : std_ulogic_vector(1 downto 0); spr_select : spr_id; pmu_spr_num : std_ulogic_vector(4 downto 0); @@ -131,6 +146,7 @@ architecture behaviour of execute1 is mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; + bsort_in_progress : std_ulogic; no_instr_avail : std_ulogic; instr_dispatch : std_ulogic; ext_interrupt : std_ulogic; @@ -141,21 +157,28 @@ architecture behaviour of execute1 is xerc_valid : std_ulogic; ramspr_wraddr : ramspr_index; ramspr_odd_data : std_ulogic_vector(63 downto 0); + ic : std_ulogic_vector(3 downto 0); + prefixed : std_ulogic; + insn : std_ulogic_vector(31 downto 0); + prefix : std_ulogic_vector(25 downto 0); end record; constant reg_stage1_type_init : reg_stage1_type := (e => Execute1ToWritebackInit, se => side_effect_init, busy => '0', fp_exception_next => '0', trace_next => '0', prev_op => 
OP_ILLEGAL, prev_prefixed => '0', - oe => '0', mul_select => "00", res2_sel => "00", + oe => '0', mul_select => "000", res2_sel => "00", spr_select => spr_id_init, pmu_spr_num => 5x"0", redir_to_next => '0', advance_nia => '0', lr_from_next => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', + bsort_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', msr => 64x"0", xerc => xerc_init, xerc_valid => '0', - ramspr_wraddr => (others => '0'), ramspr_odd_data => 64x"0"); + ramspr_wraddr => (others => '0'), ramspr_odd_data => 64x"0", + ic => x"0", + prefixed => '0', insn => 32x"0", prefix => 26x"0"); type reg_stage2_type is record e : Execute1ToWritebackType; @@ -190,7 +213,8 @@ architecture behaviour of execute1 is signal alu_result: std_ulogic_vector(63 downto 0); signal adder_result: std_ulogic_vector(63 downto 0); signal misc_result: std_ulogic_vector(63 downto 0); - signal muldiv_result: std_ulogic_vector(63 downto 0); + signal multicyc_result: std_ulogic_vector(63 downto 0); + signal bsort_result: std_ulogic_vector(63 downto 0); signal spr_result: std_ulogic_vector(63 downto 0); signal next_nia : std_ulogic_vector(63 downto 0); signal s1_sel : std_ulogic_vector(2 downto 0); @@ -215,6 +239,10 @@ architecture behaviour of execute1 is signal x_to_divider: Execute1ToDividerType; signal divider_to_x: DividerToExecute1Type := DividerToExecute1Init; + -- bit-sort unit signals + signal bsort_start : std_ulogic; + signal bsort_done : std_ulogic; + -- random number generator signals signal random_raw : std_ulogic_vector(63 downto 0); signal random_cond : std_ulogic_vector(63 downto 0); @@ -322,6 +350,7 @@ architecture behaviour of execute1 is -- 48:63, and partial function MSR bits lie in the range -- 33:36 and 42:47. (Note this is IBM bit numbering). 
msr_out := (others => '0'); msr_out(63 downto 31) := msr(63 downto 31); + msr_out(MSR_HV) := '1'; -- HV is always set (forced after the 63..31 copy so the copy cannot clear it) msr_out(26 downto 22) := msr(26 downto 22); msr_out(15 downto 0) := msr(15 downto 0); @@ -332,6 +361,9 @@ architecture behaviour of execute1 is return std_ulogic_vector is variable srr1: std_ulogic_vector(63 downto 0); begin srr1(63 downto 31) := msr(63 downto 31); + srr1(63 downto 61) := msr(63 downto 61); + srr1(MSR_HV) := '1'; -- HV is always set; must follow the 63..31 copy above or it is overwritten + srr1(59 downto 31) := msr(59 downto 31); srr1(30 downto 27) := flags(14 downto 11); srr1(26 downto 22) := msr(26 downto 22); @@ -365,6 +397,39 @@ architecture behaviour of execute1 is xerc.ov32 & xerc.ca32 & xer_low(17 downto 0); end; + function assemble_fscr(c: ctrl_t) return std_ulogic_vector is + variable ret : std_ulogic_vector(63 downto 0); + begin + ret := (others => '0'); + ret(59 downto 56) := c.fscr_ic; + ret(FSCR_PREFIX) := c.fscr_pref; + ret(FSCR_SCV) := c.fscr_scv; + ret(FSCR_TAR) := c.fscr_tar; + ret(FSCR_DSCR) := c.fscr_dscr; + return ret; + end; + + function assemble_hfscr(c: ctrl_t) return std_ulogic_vector is + variable ret : std_ulogic_vector(63 downto 0); + begin + ret := (others => '0'); + ret(59 downto 56) := c.hfscr_ic; + ret(HFSCR_PREFIX) := c.hfscr_pref; + ret(HFSCR_TAR) := c.hfscr_tar; + ret(HFSCR_DSCR) := c.hfscr_dscr; + ret(HFSCR_FP) := c.hfscr_fp; + return ret; + end; + + function assemble_ctrl(c: ctrl_t; msrpr: std_ulogic) return std_ulogic_vector is + variable ret : std_ulogic_vector(63 downto 0); + begin + ret := (others => '0'); + ret(0) := c.run; + ret(15) := c.run and not msrpr; + return ret; + end; + -- Tell vivado to keep the hierarchy for the random module so that the -- net names in the xdc file match.
attribute keep_hierarchy : string; @@ -437,6 +502,18 @@ begin ); end generate; + bsort_0: entity work.bit_sorter + port map ( + clk => clk, + rst => rst, + rs => c_in, + rb => b_in, + go => bsort_start, + opc => e_in.insn(7 downto 6), + done => bsort_done, + result => bsort_result + ); + random_0: entity work.random port map ( clk => clk, @@ -484,7 +561,7 @@ begin x_to_pmu.addr_v <= '0'; x_to_pmu.spr_num <= ex1.pmu_spr_num; x_to_pmu.spr_val <= ex1.e.write_data; - x_to_pmu.run <= '1'; + x_to_pmu.run <= ctrl.run; -- XER forwarding. The CA and CA32 bits are only modified by instructions -- that are handled here, so for them we can just use the result most @@ -501,7 +578,7 @@ begin -- N.B. the busy signal from each source includes the -- stage2 stall from that source in it. - busy_out <= l_in.busy or ex1.busy or fp_in.busy; + busy_out <= l_in.busy or ex1.busy or fp_in.busy or ctrl.wait_state; valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt); @@ -533,7 +610,13 @@ begin even_wr_enab := (ex1.se.ramspr_write_even and doit) or interrupt_in.intr; odd_wr_enab := (ex1.se.ramspr_write_odd and doit) or interrupt_in.intr; if interrupt_in.intr = '1' then - wr_addr := RAMSPR_SRR0; + if interrupt_in.hv_intr = '1' then + wr_addr := RAMSPR_HSRR0; + elsif interrupt_in.scv_int = '1' then + wr_addr := RAMSPR_LR; + else + wr_addr := RAMSPR_SRR0; + end if; else wr_addr := ex1.ramspr_wraddr; end if; @@ -573,6 +656,9 @@ begin else ramspr_result <= ramspr_odd; end if; + if e_in.ramspr_32bit = '1' then + ramspr_result(63 downto 32) <= 32x"0"; + end if; end process; ramspr_write: process(clk) @@ -599,7 +685,7 @@ begin adder_result when "000", logical_result when "001", rotator_result when "010", - muldiv_result when "100", + multicyc_result when "100", ramspr_result when "101", misc_result when others; @@ -610,16 +696,18 @@ begin ex1 <= reg_stage1_type_init; ex2 <= reg_stage2_type_init; ctrl <= ctrl_t_init; - ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', 
others => '0'); - ex1.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0'); + ctrl.msr <= (MSR_SF => '1', MSR_HV => '1', MSR_LE => '1', others => '0'); + ex1.msr <= (MSR_SF => '1', MSR_HV => '1', MSR_LE => '1', others => '0'); else ex1 <= ex1in; ex2 <= ex2in; ctrl <= ctrl_tmp; if valid_in = '1' then - report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & + report "CPU " & natural'image(CPU_INDEX) & " execute " & to_hstring(e_in.nia) & + " op=" & insn_type_t'image(e_in.insn_type) & " wr=" & to_hstring(ex1in.e.write_reg) & " we=" & std_ulogic'image(ex1in.e.write_enable) & - " tag=" & integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid); + " tag=" & integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid) & + " 2nd=" & std_ulogic'image(e_in.second); end if; -- We mustn't get stalled on a cycle where execute2 is -- completing an instruction or generating an interrupt @@ -638,7 +726,18 @@ begin if dbg_spr_addr(7) = '1' then dbg_spr_data <= ramspr_result; else - dbg_spr_data <= assemble_xer(xerc_in, ctrl.xer_low); + case dbg_spr_addr(3 downto 0) is + when SPRSEL_FSCR => + dbg_spr_data <= assemble_fscr(ctrl); + when SPRSEL_HFSCR => + dbg_spr_data <= assemble_hfscr(ctrl); + when SPRSEL_HEIR => + dbg_spr_data <= ctrl.heir; + when SPRSEL_CFAR => + dbg_spr_data <= ctrl.cfar; + when others => + dbg_spr_data <= assemble_xer(xerc_in, ctrl.xer_low); + end case; end if; dbg_spr_ack <= '1'; end if; @@ -769,17 +868,21 @@ begin x_to_mult_32s.subtract <= '0'; x_to_mult_32s.addend <= (others => '0'); - case ex1.mul_select is - when "00" => - muldiv_result <= multiply_to_x.result(63 downto 0); - when "01" => - muldiv_result <= multiply_to_x.result(127 downto 64); - when "10" => - muldiv_result <= multiply_to_x.result(63 downto 32) & - multiply_to_x.result(63 downto 32); - when others => - muldiv_result <= divider_to_x.write_reg_data; - end case; + if ex1.mul_select(2) = '0' then + case ex1.mul_select(1 
downto 0) is + when "00" => + multicyc_result <= multiply_to_x.result(63 downto 0); + when "01" => + multicyc_result <= multiply_to_x.result(63 downto 32) & + multiply_to_x.result(63 downto 32); + when others => + multicyc_result <= multiply_to_x.result(127 downto 64); + end case; + elsif ex1.mul_select(0) = '1' and not HAS_FPU then + multicyc_result <= divider_to_x.write_reg_data; + else + multicyc_result <= bsort_result; + end if; -- Compute misc_result case e_in.sub_select is @@ -1047,7 +1150,7 @@ begin slow_op := '0'; owait := '0'; - if e_in.illegal_suffix = '1' then + if e_in.illegal_suffix = '1' or e_in.illegal_form = '1' then illegal := '1'; elsif ex1.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then privileged := '1'; @@ -1058,18 +1161,20 @@ begin when OP_ILLEGAL => illegal := '1'; when OP_SC => - -- check bit 1 of the instruction is 1 so we know this is sc; - -- 0 would mean scv, so generate an illegal instruction interrupt + -- check bit 1 of the instruction to distinguish sc from scv if e_in.insn(1) = '1' then - v.trap := '1'; - v.advance_nia := '1'; + -- sc v.e.intr_vec := 16#C00#; if e_in.valid = '1' then report "sc"; end if; else - illegal := '1'; + -- scv + v.se.scv_trap := '1'; + v.e.intr_vec := to_integer(unsigned(e_in.insn(11 downto 5))) * 32; end if; + v.trap := '1'; + v.advance_nia := '1'; when OP_ATTN => -- check bits 1-10 of the instruction to make sure it's attn -- if not then it is illegal @@ -1081,8 +1186,8 @@ begin else illegal := '1'; end if; - when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT => - -- Do nothing + when OP_NOP | OP_DCBST | OP_ICBT => + -- Do nothing when OP_ADD => if e_in.output_carry = '1' then if e_in.input_carry /= OV then @@ -1126,7 +1231,7 @@ begin if ex1.msr(MSR_BE) = '1' then v.do_trace := '1'; end if; - v.se.write_cfar := '1'; + v.se.set_cfar := '1'; when OP_BC => -- If CTR is being decremented, it is in ramspr_odd. 
bo := insn_bo(e_in.insn); @@ -1145,7 +1250,7 @@ begin if ex1.msr(MSR_BE) = '1' then v.do_trace := '1'; end if; - v.se.write_cfar := v.take_branch; + v.se.set_cfar := v.take_branch; when OP_BCREG => -- If CTR is being decremented, it is in ramspr_odd. -- The target address is in ramspr_result (LR, CTR or TAR). @@ -1158,15 +1263,20 @@ begin if ex1.msr(MSR_BE) = '1' then v.do_trace := '1'; end if; - v.se.write_cfar := v.take_branch; + v.se.set_cfar := v.take_branch; when OP_RFID => + -- rfid, hrfid and rfscv. + -- These all act the same given that we don't have + -- privileged non-hypervisor mode or ultravisor mode. srr1 := ramspr_odd; v.e.redir_mode := (srr1(MSR_IR) or srr1(MSR_PR)) & not srr1(MSR_PR) & not srr1(MSR_LE) & not srr1(MSR_SF); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. - v.new_msr(63 downto 31) := srr1(63 downto 31); + v.new_msr(63 downto 61) := srr1(63 downto 61); + v.new_msr(MSR_HV) := '1'; + v.new_msr(59 downto 31) := srr1(59 downto 31); v.new_msr(26 downto 22) := srr1(26 downto 22); v.new_msr(15 downto 0) := srr1(15 downto 0); if srr1(MSR_PR) = '1' then @@ -1176,14 +1286,14 @@ begin end if; v.se.write_msr := '1'; v.e.redirect := '1'; - v.se.write_cfar := '1'; + v.se.set_cfar := '1'; if HAS_FPU then v.fp_intr := fp_in.exception and (srr1(MSR_FE0) or srr1(MSR_FE1)); end if; v.do_trace := '0'; - when OP_CNTZ | OP_POPCNT => + when OP_COUNTB => v.res2_sel := "01"; slow_op := '1'; when OP_ISEL => @@ -1270,6 +1380,18 @@ begin v.se.write_dec := '1'; when SPRSEL_LOGA => v.se.write_loga := '1'; + when SPRSEL_CFAR => + v.se.write_cfar := '1'; + when SPRSEL_FSCR => + v.se.write_fscr := '1'; + when SPRSEL_HFSCR => + v.se.write_hfscr := '1'; + when SPRSEL_HEIR => + v.se.write_heir := '1'; + when SPRSEL_CTRL => + v.se.write_ctrl := '1'; + when SPRSEL_DSCR => + v.se.write_dscr := '1'; when others => end case; end if; @@ -1293,6 +1415,11 @@ begin when OP_ICBI => v.se.icache_inval := '1'; + when OP_BSORT 
=> + v.start_bsort := '1'; + slow_op := '1'; + owait := '1'; + when OP_MUL_L64 => if e_in.is_32bit = '1' then v.se.mult_32s := '1'; @@ -1321,6 +1448,11 @@ begin owait := '1'; end if; + when OP_WAIT => + if e_in.insn(22 downto 21) = "00" then + v.se.enter_wait := '1'; + end if; + when OP_FETCH_FAILED => -- Handling an ITLB miss doesn't count as having executed an instruction v.do_trace := '0'; @@ -1331,7 +1463,25 @@ begin end if; end case; - if misaligned = '1' then + if ex1.msr(MSR_PR) = '1' and e_in.prefixed = '1' and + (ctrl.hfscr_pref = '0' or ctrl.fscr_pref = '0') then + -- [Hypervisor] facility unavailable for prefixed instructions, + -- which has higher priority than the alignment interrupt for + -- misaligned prefixed instructions, which has higher priority than + -- other [hypervisor] facility unavailable interrupts (e.g. for + -- plfs with HFSCR[FP] = 0). + v.exception := '1'; + v.ic := x"b"; + if ctrl.hfscr_pref = '0' then + v.e.hv_intr := '1'; + v.e.intr_vec := 16#f80#; + v.se.write_hic := '1'; + else + v.e.intr_vec := 16#f60#; + v.se.write_ic := '1'; + end if; + + elsif misaligned = '1' then -- generate an alignment interrupt -- This is higher priority than illegal because a misaligned -- prefix will come down as an OP_ILLEGAL instruction. 
@@ -1354,15 +1504,62 @@ begin end if; elsif illegal = '1' then + -- generate hypervisor emulation assistance interrupt (HEAI) + -- and write the offending instruction into HEIR v.exception := '1'; v.e.srr1(47 - 34) := e_in.prefixed; - -- Since we aren't doing Hypervisor emulation assist (0xe40) we - -- set bit 44 to indicate we have an illegal - v.e.srr1(47 - 44) := '1'; + v.e.intr_vec := 16#e40#; + v.e.hv_intr := '1'; + v.se.set_heir := '1'; if e_in.valid = '1' then report "illegal instruction"; end if; + elsif ex1.msr(MSR_PR) = '1' and v.se.scv_trap = '1' and + ctrl.fscr_scv = '0' then + -- Facility unavailable for scv instruction + v.exception := '1'; + v.ic := x"c"; + v.e.intr_vec := 16#f60#; + v.se.write_ic := '1'; + + elsif ex1.msr(MSR_PR) = '1' and e_in.uses_tar = '1' and + (ctrl.hfscr_tar = '0' or ctrl.fscr_tar = '0') then + -- [Hypervisor] facility unavailable for TAR access + v.exception := '1'; + v.ic := x"8"; + if ctrl.hfscr_tar = '0' then + v.e.hv_intr := '1'; + v.e.intr_vec := 16#f80#; + v.se.write_hic := '1'; + else + v.e.intr_vec := 16#f60#; + v.se.write_ic := '1'; + end if; + + elsif ex1.msr(MSR_PR) = '1' and e_in.uses_dscr = '1' and + (ctrl.hfscr_dscr = '0' or ctrl.fscr_dscr = '0') then + -- [Hypervisor] facility unavailable for DSCR access + v.exception := '1'; + v.ic := x"2"; + if ctrl.hfscr_dscr = '0' then + v.e.hv_intr := '1'; + v.e.intr_vec := 16#f80#; + v.se.write_hic := '1'; + else + v.e.intr_vec := 16#f60#; + v.se.write_ic := '1'; + end if; + + elsif HAS_FPU and ex1.msr(MSR_PR) = '1' and e_in.fac = FPU and + ctrl.hfscr_fp = '0' then + -- Hypervisor facility unavailable for FP instructions + v.exception := '1'; + v.ic := x"0"; + v.e.hv_intr := '1'; + v.e.intr_vec := 16#f80#; + v.se.write_hic := '1'; + elsif HAS_FPU and ex1.msr(MSR_FP) = '0' and e_in.fac = FPU then -- generate a floating-point unavailable interrupt v.exception := '1'; @@ -1391,19 +1588,24 @@ begin variable fv : Execute1ToFPUType; variable go : std_ulogic; variable 
bypass_valid : std_ulogic; + variable is_scv : std_ulogic; begin v := ex1; - if (ex1.busy or l_in.busy or fp_in.busy) = '0' then + if busy_out = '0' then v.e := actions.e; v.e.valid := '0'; v.oe := e_in.oe; v.spr_select := e_in.spr_select; v.pmu_spr_num := e_in.insn(20 downto 16); - v.mul_select := e_in.sub_select(1 downto 0); + v.mul_select := e_in.sub_select; v.se := side_effect_init; v.ramspr_wraddr := e_in.ramspr_wraddr; v.lr_from_next := e_in.lr; v.ramspr_odd_data := actions.ramspr_odd_data; + v.ic := actions.ic; + v.prefixed := e_in.prefixed; + v.insn := e_in.insn; + v.prefix := e_in.prefix; end if; lv := Execute1ToLoadstore1Init; @@ -1426,7 +1628,7 @@ begin rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; - do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0'; + do_popcnt <= '1' when e_in.insn_type = OP_COUNTB and e_in.insn(7 downto 6) = "11" else '0'; if valid_in = '1' then v.prev_op := e_in.insn_type; @@ -1454,10 +1656,9 @@ begin v.e.srr1(47 - 33) := '1'; v.e.srr1(47 - 34) := ex1.prev_prefixed; if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or ex1.prev_op = OP_ICBT or - ex1.prev_op = OP_DCBT or ex1.prev_op = OP_DCBST or ex1.prev_op = OP_DCBF then + ex1.prev_op = OP_DCBF then v.e.srr1(47 - 35) := '1'; - elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ or - ex1.prev_op = OP_DCBTST then + elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ then v.e.srr1(47 - 36) := '1'; end if; @@ -1474,6 +1675,7 @@ begin v.e.intr_vec := 16#500#; report "IRQ valid: External"; v.ext_interrupt := '1'; + v.e.hv_intr := '1'; end if; v.e.srr1 := (others => '0'); exception := '1'; @@ -1500,6 +1702,7 @@ begin v.mul_in_progress := actions.start_mul; x_to_divider.valid <= actions.start_div; v.div_in_progress := actions.start_div; + v.bsort_in_progress := actions.start_bsort; v.br_mispredict := v.e.redirect and actions.direct_branch; v.advance_nia := 
actions.advance_nia; v.redir_to_next := actions.redir_to_next; @@ -1510,7 +1713,7 @@ begin -- multiply is happening in order to stop following -- instructions from using the wrong XER value -- (and for simplicity in the OE=0 case). - v.busy := actions.start_div or actions.start_mul; + v.busy := actions.start_div or actions.start_mul or actions.start_bsort; -- instruction for other units, i.e. LDST if e_in.unit = LDST then @@ -1520,6 +1723,8 @@ begin fv.valid := '1'; end if; end if; + is_scv := go and actions.se.scv_trap; + bsort_start <= go and actions.start_bsort; if not HAS_FPU and ex1.div_in_progress = '1' then v.div_in_progress := not divider_to_x.valid; @@ -1552,6 +1757,13 @@ begin end if; v.e.valid := '1'; end if; + if ex1.bsort_in_progress = '1' then + v.bsort_in_progress := not bsort_done; + v.e.valid := bsort_done; + v.busy := not bsort_done; + v.e.write_data := alu_result; + bypass_valid := bsort_done; + end if; if v.e.write_xerc_enable = '1' and v.e.valid = '1' then v.xerc := v.e.xerc; @@ -1560,6 +1772,7 @@ begin if (ex1.busy or l_in.busy or fp_in.busy) = '0' then v.e.interrupt := exception; + v.e.is_scv := is_scv; end if; if v.e.valid = '0' then v.e.redirect := '0'; @@ -1658,6 +1871,12 @@ begin log_wr_addr & ex2.log_addr_spr when SPRSEL_LOGA, log_rd_data when SPRSEL_LOGD, ctrl.cfar when SPRSEL_CFAR, + assemble_fscr(ctrl) when SPRSEL_FSCR, + assemble_hfscr(ctrl) when SPRSEL_HFSCR, + ctrl.heir when SPRSEL_HEIR, + assemble_ctrl(ctrl, ex1.msr(MSR_PR)) when SPRSEL_CTRL, + 39x"0" & ctrl.dscr when SPRSEL_DSCR, + 56x"0" & std_ulogic_vector(to_unsigned(CPU_INDEX, 8)) when SPRSEL_PIR, assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; stage2_stall <= l_in.l2stall or fp_in.f2stall; @@ -1673,6 +1892,7 @@ begin variable cr_mask : std_ulogic_vector(7 downto 0); variable sign, zero : std_ulogic; variable rcnz_hi, rcnz_lo : std_ulogic; + variable irq_exc : std_ulogic; begin -- Next insn adder used in a couple of places next_nia <= 
std_ulogic_vector(unsigned(ex1.e.last_nia) + 4); @@ -1792,6 +2012,8 @@ begin ctrl_tmp.dec <= ex1.e.write_data; end if; if ex1.se.write_cfar = '1' then + ctrl_tmp.cfar <= ex1.e.write_data; + elsif ex1.se.set_cfar = '1' then ctrl_tmp.cfar <= ex1.e.last_nia; end if; if ex1.se.write_loga = '1' then @@ -1800,11 +2022,56 @@ begin v.log_addr_spr := std_ulogic_vector(unsigned(ex2.log_addr_spr) + 1); end if; x_to_pmu.mtspr <= ex1.se.write_pmuspr; + if ex1.se.write_hfscr = '1' then + ctrl_tmp.hfscr_ic <= ex1.e.write_data(59 downto 56); + ctrl_tmp.hfscr_pref <= ex1.e.write_data(HFSCR_PREFIX); + ctrl_tmp.hfscr_tar <= ex1.e.write_data(HFSCR_TAR); + ctrl_tmp.hfscr_dscr <= ex1.e.write_data(HFSCR_DSCR); + ctrl_tmp.hfscr_fp <= ex1.e.write_data(HFSCR_FP); + elsif ex1.se.write_hic = '1' then + ctrl_tmp.hfscr_ic <= ex1.ic; + end if; + if ex1.se.write_fscr = '1' then + ctrl_tmp.fscr_ic <= ex1.e.write_data(59 downto 56); + ctrl_tmp.fscr_pref <= ex1.e.write_data(FSCR_PREFIX); + ctrl_tmp.fscr_scv <= ex1.e.write_data(FSCR_SCV); + ctrl_tmp.fscr_tar <= ex1.e.write_data(FSCR_TAR); + ctrl_tmp.fscr_dscr <= ex1.e.write_data(FSCR_DSCR); + elsif ex1.se.write_ic = '1' then + ctrl_tmp.fscr_ic <= ex1.ic; + end if; + if ex1.se.write_heir = '1' then + ctrl_tmp.heir <= ex1.e.write_data; + elsif ex1.se.set_heir = '1' then + ctrl_tmp.heir(31 downto 0) <= ex1.insn; + if ex1.prefixed = '1' then + ctrl_tmp.heir(63 downto 58) <= 6x"01"; + ctrl_tmp.heir(57 downto 32) <= ex1.prefix; + else + ctrl_tmp.heir(63 downto 32) <= (others => '0'); + end if; + end if; + if ex1.se.write_ctrl = '1' then + ctrl_tmp.run <= ex1.e.write_data(0); + end if; + if ex1.se.write_dscr = '1' then + ctrl_tmp.dscr <= ex1.e.write_data(24 downto 0); + end if; + if ex1.se.enter_wait = '1' then + ctrl_tmp.wait_state <= '1'; + end if; + end if; + + -- pending exceptions clear any wait state + -- ex1.fp_exception_next is not tested because it is not possible to + -- get into wait state with a pending FP exception. 
+ irq_exc := pmu_to_x.intr or ctrl.dec(63) or ext_irq_in; + if ex1.trace_next = '1' or irq_exc = '1' or interrupt_in.intr = '1' then + ctrl_tmp.wait_state <= '0'; end if; if interrupt_in.intr = '1' then ctrl_tmp.msr(MSR_SF) <= '1'; - ctrl_tmp.msr(MSR_EE) <= '0'; ctrl_tmp.msr(MSR_PR) <= '0'; ctrl_tmp.msr(MSR_SE) <= '0'; ctrl_tmp.msr(MSR_BE) <= '0'; @@ -1813,8 +2080,11 @@ begin ctrl_tmp.msr(MSR_FE1) <= '0'; ctrl_tmp.msr(MSR_IR) <= '0'; ctrl_tmp.msr(MSR_DR) <= '0'; - ctrl_tmp.msr(MSR_RI) <= '0'; ctrl_tmp.msr(MSR_LE) <= '1'; + if interrupt_in.scv_int = '0' then + ctrl_tmp.msr(MSR_EE) <= '0'; + ctrl_tmp.msr(MSR_RI) <= '0'; + end if; end if; bypass_valid := ex1.e.valid; @@ -1838,6 +2108,7 @@ begin e_out <= ex2.e; e_out.msr <= msr_copy(ctrl.msr); + run_out <= ctrl.run; terminate_out <= ex2.se.terminate; icache_inval <= ex2.se.icache_inval; diff --git a/fetch1.vhdl b/fetch1.vhdl index 96c16fb..f07188d 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -391,7 +391,7 @@ begin v_int.next_nia := RESET_ADDRESS; end if; elsif w_in.interrupt = '1' then - v_int.next_nia := 52x"0" & w_in.intr_vec(11 downto 2) & "00"; + v_int.next_nia := 47x"0" & w_in.intr_vec(16 downto 2) & "00"; end if; if rst /= '0' or w_in.interrupt = '1' then v.req := '0'; diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index 0980667..c3be9d9 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -142,6 +142,9 @@ end entity toplevel; architecture behaviour of toplevel is + -- Status + signal run_out : std_ulogic; + -- Reset signals: signal soc_rst : std_ulogic; signal pll_rst : std_ulogic; @@ -263,6 +266,7 @@ begin system_clk => system_clk, rst => soc_rst, sw_soc_reset => sw_rst, + run_out => run_out, -- UART signals uart0_txd => uart_main_tx, @@ -742,6 +746,7 @@ begin led4 <= system_clk_locked; led5 <= eth_clk_locked; led6 <= not soc_rst; + led7 <= run_out; -- GPIO gpio_in(10) <= btn0; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index fc8c158..485947b 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl 
@@ -61,6 +61,9 @@ architecture behave of loadstore1 is dc_req : std_ulogic; load : std_ulogic; store : std_ulogic; + flush : std_ulogic; + touch : std_ulogic; + sync : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; read_spr : std_ulogic; @@ -84,6 +87,9 @@ architecture behave of loadstore1 is update : std_ulogic; xerc : xer_common_t; reserve : std_ulogic; + atomic_qw : std_ulogic; + atomic_first : std_ulogic; + atomic_last : std_ulogic; rc : std_ulogic; nc : std_ulogic; -- non-cacheable access virt_mode : std_ulogic; @@ -97,7 +103,8 @@ architecture behave of loadstore1 is two_dwords : std_ulogic; incomplete : std_ulogic; end record; - constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0', + constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', + flush => '0', touch => '0', sync => '0', tlbie => '0', dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0', instr_fault => '0', do_update => '0', mode_32bit => '0', prefixed => '0', @@ -108,6 +115,7 @@ architecture behave of loadstore1 is elt_length => x"0", byte_reverse => '0', brev_mask => "000", sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', + atomic_qw => '0', atomic_first => '0', atomic_last => '0', rc => '0', nc => '0', virt_mode => '0', priv_mode => '0', load_sp => '0', sprsel => "00", ric => "00", is_slbia => '0', align_intr => '0', @@ -447,7 +455,10 @@ begin if l_in.second = '1' then -- for an update-form load, use the previous address -- as the value to write back to RA. - addr := r1.addr0; + -- for a quadword load or store, use the previous + -- address + 8.
+ addr := std_ulogic_vector(unsigned(r1.addr0(63 downto 3)) + not l_in.update) & + r1.addr0(2 downto 0); end if; if l_in.mode_32bit = '1' then addr(63 downto 32) := (others => '0'); @@ -463,7 +474,7 @@ begin addr_mask := std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1); -- Do length_to_sel and work out if we are doing 2 dwords - long_sel := xfer_data_sel(v.length, addr(2 downto 0)); + long_sel := xfer_data_sel(l_in.length, addr(2 downto 0)); v.byte_sel := long_sel(7 downto 0); v.second_bytes := long_sel(15 downto 8); if long_sel(15 downto 8) /= "00000000" then @@ -472,23 +483,54 @@ begin -- check alignment for larx/stcx misaligned := or (addr_mask and addr(2 downto 0)); + if l_in.repeat = '1' and l_in.update = '0' and addr(3) /= l_in.second then + misaligned := '1'; + end if; v.align_intr := l_in.reserve and misaligned; + v.atomic_first := not misaligned and not l_in.second; + v.atomic_last := not misaligned and (l_in.second or not l_in.repeat); + + -- is this a quadword load or store? i.e. lq plq stq pstq lqarx stqcx. + if l_in.repeat = '1' and l_in.update = '0' then + if misaligned = '0' then + -- Since the access is aligned we have to do it atomically + v.atomic_qw := '1'; + else + -- We require non-prefixed lq in LE mode to be aligned in order + -- to avoid the case where RA = RT+1 and the second access faults + -- after the first has overwritten RA. 
+ if l_in.op = OP_LOAD and l_in.byte_reverse = '0' and l_in.prefixed = '0' then + v.align_intr := '1'; + end if; + end if; + end if; + case l_in.op is + when OP_SYNC => + v.sync := '1'; when OP_STORE => v.store := '1'; + if l_in.length = "0000" then + v.touch := '1'; + end if; when OP_LOAD => - -- Note: only RA updates have l_in.second = 1 - if l_in.second = '0' then + if l_in.update = '0' or l_in.second = '0' then v.load := '1'; if HAS_FPU and l_in.is_32bit = '1' then -- Allow an extra cycle for SP->DP precision conversion v.load_sp := '1'; end if; + if l_in.length = "0000" then + v.touch := '1'; + end if; else -- write back address to RA v.do_update := '1'; end if; + when OP_DCBF => + v.load := '1'; + v.flush := '1'; when OP_DCBZ => v.dcbz := '1'; v.align_intr := v.nc; @@ -508,13 +550,13 @@ begin v.mmu_op := '1'; when others => end case; - v.dc_req := l_in.valid and (v.load or v.store or v.dcbz) and not v.align_intr; + v.dc_req := l_in.valid and (v.load or v.store or v.sync or v.dcbz) and not v.align_intr; v.incomplete := v.dc_req and v.two_dwords; -- Work out controls for load and store formatting brev_lenm1 := "000"; if v.byte_reverse = '1' then - brev_lenm1 := unsigned(v.length(2 downto 0)) - 1; + brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; end if; v.brev_mask := brev_lenm1; @@ -699,7 +741,8 @@ begin end if; interrupt := (r2.req.valid and r2.req.align_intr) or - (d_in.error and d_in.cache_paradox) or m_in.err; + (d_in.error and (d_in.cache_paradox or d_in.reserve_nc)) or + m_in.err; if interrupt = '1' then v.req.valid := '0'; v.busy := '0'; @@ -855,7 +898,8 @@ begin if d_in.valid = '1' then if r2.req.incomplete = '0' then - write_enable := r2.req.load and not r2.req.load_sp; + write_enable := r2.req.load and not r2.req.load_sp and + not r2.req.flush and not r2.req.touch; -- stores write back rA update do_update := r2.req.update and r2.req.store; end if; @@ -865,6 +909,7 @@ begin -- signal an interrupt straight away exception := '1'; dsisr(63 - 38) := 
not r2.req.load; + dsisr(63 - 37) := d_in.reserve_nc; -- XXX there is no architected bit for this -- (probably should be a machine check in fact) dsisr(63 - 35) := d_in.cache_paradox; @@ -950,8 +995,14 @@ begin d_out.valid <= stage1_dcreq; d_out.load <= stage1_req.load; d_out.dcbz <= stage1_req.dcbz; + d_out.flush <= stage1_req.flush; + d_out.touch <= stage1_req.touch; + d_out.sync <= stage1_req.sync; d_out.nc <= stage1_req.nc; d_out.reserve <= stage1_req.reserve; + d_out.atomic_qw <= stage1_req.atomic_qw; + d_out.atomic_first <= stage1_req.atomic_first; + d_out.atomic_last <= stage1_req.atomic_last; d_out.addr <= stage1_req.addr; d_out.byte_sel <= stage1_req.byte_sel; d_out.virt_mode <= stage1_req.virt_mode; @@ -960,8 +1011,14 @@ begin d_out.valid <= req; d_out.load <= r2.req.load; d_out.dcbz <= r2.req.dcbz; + d_out.flush <= r2.req.flush; + d_out.touch <= r2.req.touch; + d_out.sync <= r2.req.sync; d_out.nc <= r2.req.nc; d_out.reserve <= r2.req.reserve; + d_out.atomic_qw <= r2.req.atomic_qw; + d_out.atomic_first <= r2.req.atomic_first; + d_out.atomic_last <= r2.req.atomic_last; d_out.addr <= r2.req.addr; d_out.byte_sel <= r2.req.byte_sel; d_out.virt_mode <= r2.req.virt_mode; diff --git a/microwatt.core b/microwatt.core index dad180f..f56bee0 100644 --- a/microwatt.core +++ b/microwatt.core @@ -20,6 +20,7 @@ filesets: - sim_console.vhdl - logical.vhdl - countbits.vhdl + - bitsort.vhdl - control.vhdl - execute1.vhdl - fpu.vhdl diff --git a/predecode.vhdl b/predecode.vhdl index d3ca015..e8689ef 100644 --- a/predecode.vhdl +++ b/predecode.vhdl @@ -121,6 +121,8 @@ architecture behaviour of predecoder is 2#011110_01110# to 2#011110_01111# => INSN_rldimi, 2#011110_10000# to 2#011110_10001# => INSN_rldcl, 2#011110_10010# to 2#011110_10011# => INSN_rldcr, + -- major opcode 56 + 2#111000_00000# to 2#111000_11111# => INSN_lq, -- major opcode 58 2#111010_00000# => INSN_ld, 2#111010_00001# => INSN_ldu, @@ -161,20 +163,28 @@ architecture behaviour of predecoder is -- major 
opcode 62 2#111110_00000# => INSN_std, 2#111110_00001# => INSN_stdu, + 2#111110_00010# => INSN_stq, 2#111110_00100# => INSN_std, 2#111110_00101# => INSN_stdu, + 2#111110_00110# => INSN_stq, 2#111110_01000# => INSN_std, 2#111110_01001# => INSN_stdu, + 2#111110_01010# => INSN_stq, 2#111110_01100# => INSN_std, 2#111110_01101# => INSN_stdu, + 2#111110_01110# => INSN_stq, 2#111110_10000# => INSN_std, 2#111110_10001# => INSN_stdu, + 2#111110_10010# => INSN_stq, 2#111110_10100# => INSN_std, 2#111110_10101# => INSN_stdu, + 2#111110_10110# => INSN_stq, 2#111110_11000# => INSN_std, 2#111110_11001# => INSN_stdu, + 2#111110_11010# => INSN_stq, 2#111110_11100# => INSN_std, 2#111110_11101# => INSN_stdu, + 2#111110_11110# => INSN_stq, -- major opcode 63 2#111111_00100# to 2#111111_00101# => INSN_fdiv, 2#111111_01000# to 2#111111_01001# => INSN_fsub, @@ -190,8 +200,9 @@ architecture behaviour of predecoder is 2#111111_11110# to 2#111111_11111# => INSN_fnmadd, -- prefix word, PO1 2#000001_00000# to 2#000001_11111# => INSN_prefix, - -- Major opcodes 57 and 61 are SFFS load/store instructions when prefixed + -- Major opcodes 57, 60 and 61 are SFFS load/store instructions when prefixed 2#111001_00000# to 2#111001_11111# => INSN_op57, + 2#111100_00000# to 2#111100_11111# => INSN_op60, 2#111101_00000# to 2#111101_11111# => INSN_op61, others => INSN_illegal ); @@ -219,6 +230,7 @@ architecture behaviour of predecoder is 2#0_00101_11011# => INSN_brd, 2#0_01001_11010# => INSN_cbcdtd, 2#0_01000_11010# => INSN_cdtbcd, + 2#0_00110_11100# => INSN_cfuged, 2#0_00000_00000# => INSN_cmp, 2#0_01111_11100# => INSN_cmpb, 2#0_00111_00000# => INSN_cmpeqb, @@ -316,6 +328,7 @@ architecture behaviour of predecoder is 2#0_11001_10101# => INSN_lhzcix, 2#0_01001_10111# => INSN_lhzux, 2#0_01000_10111# => INSN_lhzx, + 2#0_01000_10100# => INSN_lqarx, 2#0_00000_10100# => INSN_lwarx, 2#0_01011_10101# => INSN_lwaux, 2#0_01010_10101# => INSN_lwax, @@ -363,6 +376,8 @@ architecture behaviour of predecoder is 
2#0_00011_11100# => INSN_nor, 2#0_01101_11100# => INSN_or, 2#0_01100_11100# => INSN_orc, + 2#0_00100_11100# => INSN_pdepd, + 2#0_00101_11100# => INSN_pextd, 2#0_00011_11010# => INSN_popcntb, 2#0_01111_11010# => INSN_popcntd, 2#0_01011_11010# => INSN_popcntw, @@ -402,6 +417,7 @@ architecture behaviour of predecoder is 2#0_10110_10110# => INSN_sthcx, 2#0_01101_10111# => INSN_sthux, 2#0_01100_10111# => INSN_sthx, + 2#0_00101_10110# => INSN_stqcx, 2#0_10100_10110# => INSN_stwbrx, 2#0_11100_10101# => INSN_stwcix, 2#0_00100_10110# => INSN_stwcx, @@ -447,6 +463,8 @@ architecture behaviour of predecoder is 2#1_00100_11110# => INSN_isync, 2#1_00000_10000# => INSN_mcrf, 2#1_00000_11010# => INSN_rfid, + 2#1_00010_11010# => INSN_rfscv, + 2#1_01000_11010# => INSN_rfid, -- hrfid -- Major opcode 59 -- Address bits are 1, insn(10..6), 1, 0, insn(3..1) diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c index e15d42d..aa0573a 100644 --- a/scripts/fmt_log/fmt_log.c +++ b/scripts/fmt_log/fmt_log.c @@ -87,13 +87,13 @@ const char *units[4] = { "al", "ls", "fp", "3?" 
}; const char *ops[64] = { "illegal", "nop ", "add ", "attn ", "b ", "bc ", "bcreg ", "bcd ", - "bperm ", "brev ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", - "darn ", "dcbf ", "dcbst ", "dcbt ", "dcbtst ", "dcbz ", "icbi ", "icbt ", - "fpcmp ", "fparith", "fpmove ", "fpmisc ", "div ", "dive ", "mod ", "exts ", - "extswsl", "isel ", "isync ", "logic ", "ld ", "st ", "mcrxrx ", "mfcr ", - "mfmsr ", "mfspr ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", - "popcnt ", "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", - "shl ", "shr ", "sync ", "tlbie ", "trap ", "xor ", "addg6s ", "ffail ", + "bperm ", "brev ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "countb ", "crop ", + "darn ", "dcbf ", "dcbst ", "xcbt ", "dcbtst ", "dcbz ", "icbi ", "fpcmp ", + "fparith", "fpmove ", "fpmisc ", "div ", "dive ", "mod ", "exts ", "extswsl", + "isel ", "isync ", "logic ", "ld ", "st ", "mcrxrx ", "mfcr ", "mfmsr ", + "mfspr ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "bsort ", + "prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", + "shr ", "sync ", "tlbie ", "trap ", "xor ", "addg6s ", "wait ", "ffail ", }; const char *spr_names[13] = diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index 07c1056..d7966d9 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -550,6 +550,7 @@ static const char *fast_spr_names[] = "lr", "ctr", "srr0", "srr1", "hsrr0", "hsrr1", "sprg0", "sprg1", "sprg2", "sprg3", "hsprg0", "hsprg1", "xer", "tar", + "fscr", "hfscr", "heir", "cfar", }; static const char *ldst_spr_names[] = { diff --git a/soc.vhdl b/soc.vhdl index 942da63..3e3b438 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -99,6 +99,8 @@ entity soc is rst : in std_ulogic; system_clk : in std_ulogic; + run_out : out std_ulogic; + -- "Large" (64-bit) DRAM wishbone wb_dram_in : out wishbone_master_out; wb_dram_out : in wishbone_slave_out := wishbone_slave_out_init; @@ -349,6 +351,7 @@ begin 
processor: entity work.core generic map( SIM => SIM, + CPU_INDEX => 0, HAS_FPU => HAS_FPU, HAS_BTC => HAS_BTC, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE, @@ -366,6 +369,7 @@ begin clk => system_clk, rst => rst_core, alt_reset => alt_reset_d, + run_out => run_out, wishbone_insn_in => wishbone_icore_in, wishbone_insn_out => wishbone_icore_out, wishbone_data_in => wishbone_dcore_in, diff --git a/tests/illegal/head.S b/tests/illegal/head.S index 5446d68..2f7e3d2 100644 --- a/tests/illegal/head.S +++ b/tests/illegal/head.S @@ -74,25 +74,15 @@ ill_test_1: EXCEPTION(0x500) EXCEPTION(0x600) + // We shouldn't get a Program interrupt at 700, so fail . = 0x700 mtsprg0 %r3 - mtsprg1 %r4 - - // test for bit 44 being set for ILL - mfsrr1 %r3 - li %r4, 1 - sldi %r4, %r4, (63-44) - and. %r4, %r4, %r3 - li %r4, 8 // PASS so skip 2 instructions - bne 1f - li %r4, 4 // FAIL so only skip 1 instruction. Return will catch -1: + mfsrr0 %r3 - add %r3, %r3, %r4 // skip some instructions + addi %r3, %r3, 4 // skip one instruction, causing a fail mtsrr0 %r3 mfsprg0 %r3 - mfsprg1 %r4 rfid EXCEPTION(0x800) @@ -104,7 +94,18 @@ ill_test_1: EXCEPTION(0xd00) EXCEPTION(0xe00) EXCEPTION(0xe20) - EXCEPTION(0xe40) + + // We now expect a HEAI at e40 for illegal instructions + . 
= 0xe40 + mthsprg0 %r3 + + mfhsrr0 %r3 + addi %r3, %r3, 8 // skip one instruction, causing success + mthsrr0 %r3 + + mfhsprg0 %r3 + hrfid + EXCEPTION(0xe60) EXCEPTION(0xe80) EXCEPTION(0xf00) diff --git a/tests/mmu/mmu.c b/tests/mmu/mmu.c index 64afa44..ff6a582 100644 --- a/tests/mmu/mmu.c +++ b/tests/mmu/mmu.c @@ -7,6 +7,7 @@ #define MSR_LE 0x1 #define MSR_DR 0x10 #define MSR_IR 0x20 +#define MSR_HV 0x1000000000000000ul #define MSR_SF 0x8000000000000000ul extern int test_read(long *addr, long *ret, long init); @@ -450,11 +451,11 @@ int mmu_test_11(void) unsigned long ptr = 0x523000; /* this should fail */ - if (test_exec(0, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(0, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != (long) ptr || - mfspr(SRR1) != (MSR_SF | 0x40000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x40000000 | MSR_IR | MSR_LE)) return 2; return 0; } @@ -468,12 +469,12 @@ int mmu_test_12(void) /* create PTE */ map((void *)ptr, (void *)mem, PERM_EX | REF); /* this should succeed and be a cache miss */ - if (!test_exec(0, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (!test_exec(0, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* create a second PTE */ map((void *)ptr2, (void *)mem, PERM_EX | REF); /* this should succeed and be a cache hit */ - if (!test_exec(0, ptr2, MSR_SF | MSR_IR | MSR_LE)) + if (!test_exec(0, ptr2, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 2; return 0; } @@ -487,18 +488,18 @@ int mmu_test_13(void) /* create a PTE */ map((void *)ptr, (void *)mem, PERM_EX | REF); /* this should succeed */ - if (!test_exec(1, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (!test_exec(1, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* invalidate the PTE */ unmap((void *)ptr); /* install a second PTE */ map((void *)ptr2, (void *)mem, PERM_EX | REF); /* this should fail */ - if (test_exec(1, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(1, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) 
return 2; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != (long) ptr || - mfspr(SRR1) != (MSR_SF | 0x40000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x40000000 | MSR_IR | MSR_LE)) return 3; return 0; } @@ -513,16 +514,16 @@ int mmu_test_14(void) /* create a PTE */ map((void *)ptr, (void *)mem, PERM_EX | REF); /* this should fail due to second page not being mapped */ - if (test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != ptr2 || - mfspr(SRR1) != (MSR_SF | 0x40000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x40000000 | MSR_IR | MSR_LE)) return 2; /* create a PTE for the second page */ map((void *)ptr2, (void *)mem2, PERM_EX | REF); /* this should succeed */ - if (!test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (!test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 3; return 0; } @@ -535,11 +536,11 @@ int mmu_test_15(void) /* create a PTE without execute permission */ map((void *)ptr, (void *)mem, DFLT_PERM); /* this should fail */ - if (test_exec(0, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(0, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != ptr || - mfspr(SRR1) != (MSR_SF | 0x10000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x10000000 | MSR_IR | MSR_LE)) return 2; return 0; } @@ -556,16 +557,16 @@ int mmu_test_16(void) /* create a PTE for the second page without execute permission */ map((void *)ptr2, (void *)mem2, PERM_RD | REF); /* this should fail due to second page being no-execute */ - if (test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != ptr2 || - mfspr(SRR1) != (MSR_SF | 0x10000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x10000000 | MSR_IR | 
MSR_LE)) return 2; /* create a PTE for the second page with execute permission */ map((void *)ptr2, (void *)mem2, PERM_RD | PERM_EX | REF); /* this should succeed */ - if (!test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (!test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 3; return 0; } @@ -578,22 +579,22 @@ int mmu_test_17(void) /* create a PTE without the ref bit set */ map((void *)ptr, (void *)mem, PERM_EX); /* this should fail */ - if (test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ if (mfspr(SRR0) != (long) ptr || - mfspr(SRR1) != (MSR_SF | 0x00040000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x00040000 | MSR_IR | MSR_LE)) return 2; /* create a PTE without ref or execute permission */ unmap((void *)ptr); map((void *)ptr, (void *)mem, 0); /* this should fail */ - if (test_exec(2, ptr, MSR_SF | MSR_IR | MSR_LE)) + if (test_exec(2, ptr, MSR_SF | MSR_HV | MSR_IR | MSR_LE)) return 1; /* SRR0 and SRR1 should be set correctly */ /* RC update fail bit should not be set */ if (mfspr(SRR0) != (long) ptr || - mfspr(SRR1) != (MSR_SF | 0x10000000 | MSR_IR | MSR_LE)) + mfspr(SRR1) != (MSR_SF | MSR_HV | 0x10000000 | MSR_IR | MSR_LE)) return 2; return 0; } diff --git a/tests/modes/head.S b/tests/modes/head.S index d9e69dc..8b00bdd 100644 --- a/tests/modes/head.S +++ b/tests/modes/head.S @@ -230,3 +230,63 @@ restore: ld %r0,16(%r1) mtlr %r0 blr + + .global do_lq +do_lq: + lq %r6,0(%r3) + std %r6,0(%r4) + std %r7,8(%r4) + li %r3,0 + blr + + .global do_lq_np /* "non-preferred" form of lq */ +do_lq_np: + mr %r7,%r3 + lq %r6,0(%r7) + std %r6,0(%r4) + std %r7,8(%r4) + li %r3,0 + blr + + .global do_lq_bad /* illegal form of lq */ +do_lq_bad: + mr %r6,%r3 + .long 0xe0c60000 /* lq %r6,0(%r6) */ + std %r6,0(%r4) + std %r7,8(%r4) + li %r3,0 + blr + + .global do_stq +do_stq: + ld %r8,0(%r4) + ld %r9,8(%r4) + stq %r8,0(%r3) + li %r3,0 + blr + + /* 
big-endian versions of the above */ + .global do_lq_be +do_lq_be: + .long 0x0000c3e0 + .long 0x0000c4f8 + .long 0x0800e4f8 + .long 0x00006038 + .long 0x2000804e + + .global do_lq_np_be /* "non-preferred" form of lq */ +do_lq_np_be: + .long 0x781b677c + .long 0x0000c7e0 + .long 0x0000c4f8 + .long 0x0800e4f8 + .long 0x00006038 + .long 0x2000804e + + .global do_stq_be +do_stq_be: + .long 0x000004e9 + .long 0x080024e9 + .long 0x020003f9 + .long 0x00006038 + .long 0x2000804e diff --git a/tests/modes/modes.c b/tests/modes/modes.c index fa4872c..f37e70b 100644 --- a/tests/modes/modes.c +++ b/tests/modes/modes.c @@ -12,6 +12,14 @@ extern unsigned long callit(unsigned long arg1, unsigned long arg2, unsigned long fn, unsigned long msr); +extern void do_lq(void *src, unsigned long *regs); +extern void do_lq_np(void *src, unsigned long *regs); +extern void do_lq_bad(void *src, unsigned long *regs); +extern void do_stq(void *dst, unsigned long *regs); +extern void do_lq_be(void *src, unsigned long *regs); +extern void do_lq_np_be(void *src, unsigned long *regs); +extern void do_stq_be(void *dst, unsigned long *regs); + static inline void do_tlbie(unsigned long rb, unsigned long rs) { __asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory"); @@ -25,6 +33,8 @@ static inline void do_tlbie(unsigned long rb, unsigned long rs) #define SPRG0 272 #define SPRG1 273 #define SPRG3 275 +#define HSRR0 314 +#define HSRR1 315 #define PTCR 464 static inline unsigned long mfspr(int sprnum) @@ -294,6 +304,166 @@ int mode_test_6(void) return 0; } +int mode_test_7(void) +{ + unsigned long quad[4] __attribute__((__aligned__(16))); + unsigned long regs[2]; + unsigned long ret, msr; + + /* + * Test lq/stq in LE mode + */ + msr = MSR_SF | MSR_LE; + quad[0] = 0x123456789abcdef0ul; + quad[1] = 0xfafa5959bcbc3434ul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_lq, msr); + if (ret) + return ret | 1; + if (regs[0] != quad[1] || regs[1] != quad[0]) + return 2; + /* 
unaligned may give alignment interrupt */ + quad[2] = 0x0011223344556677ul; + ret = callit((unsigned long)&quad[1], (unsigned long)regs, + (unsigned long)&do_lq, msr); + if (ret == 0) { + if (regs[0] != quad[2] || regs[1] != quad[1]) + return 3; + } else if (ret == 0x600) { + if (mfspr(SPRG0) != (unsigned long) &do_lq || + mfspr(DAR) != (unsigned long) &quad[1]) + return ret | 4; + } else + return ret | 5; + + /* try stq */ + regs[0] = 0x5238523852385238ul; + regs[1] = 0x5239523952395239ul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_stq, msr); + if (ret) + return ret | 5; + if (quad[0] != regs[1] || quad[1] != regs[0]) + return 6; + regs[0] = 0x0172686966746564ul; + regs[1] = 0xfe8d0badd00dabcdul; + ret = callit((unsigned long)quad + 1, (unsigned long)regs, + (unsigned long)&do_stq, msr); + if (ret) + return ret | 7; + if (((quad[0] >> 8) | (quad[1] << 56)) != regs[1] || + ((quad[1] >> 8) | (quad[2] << 56)) != regs[0]) + return 8; + + /* try lq non-preferred form */ + quad[0] = 0x56789abcdef01234ul; + quad[1] = 0x5959bcbc3434fafaul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_lq_np, msr); + if (ret) + return ret | 9; + if (regs[0] != quad[1] || regs[1] != quad[0]) + return 10; + /* unaligned should give alignment interrupt in uW implementation */ + quad[2] = 0x6677001122334455ul; + ret = callit((unsigned long)&quad[1], (unsigned long)regs, + (unsigned long)&do_lq_np, msr); + if (ret == 0x600) { + if (mfspr(SPRG0) != (unsigned long) &do_lq_np + 4 || + mfspr(DAR) != (unsigned long) &quad[1]) + return ret | 11; + } else + return 12; + + /* make sure lq with rt = ra causes a HEAI interrupt */ + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_lq_bad, msr); + if (ret != 0xe40) + return 13; + if (mfspr(HSRR0) != (unsigned long)&do_lq_bad + 4) + return 14; + return 0; +} + +int mode_test_8(void) +{ + unsigned long quad[4] __attribute__((__aligned__(16))); + unsigned long 
regs[2]; + unsigned long ret, msr; + + /* + * Test lq/stq in BE mode + */ + msr = MSR_SF; + quad[0] = 0x123456789abcdef0ul; + quad[1] = 0xfafa5959bcbc3434ul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_lq_be, msr); + if (ret) + return ret | 1; + if (regs[0] != quad[0] || regs[1] != quad[1]) { + print_hex(regs[0], 16); + print_string(" "); + print_hex(regs[1], 16); + print_string(" "); + return 2; + } + /* don't expect alignment interrupt */ + quad[2] = 0x0011223344556677ul; + ret = callit((unsigned long)&quad[1], (unsigned long)regs, + (unsigned long)&do_lq_be, msr); + if (ret == 0) { + if (regs[0] != quad[1] || regs[1] != quad[2]) + return 3; + } else + return ret | 5; + + /* try stq */ + regs[0] = 0x5238523852385238ul; + regs[1] = 0x5239523952395239ul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_stq_be, msr); + if (ret) + return ret | 5; + if (quad[0] != regs[0] || quad[1] != regs[1]) + return 6; + regs[0] = 0x0172686966746564ul; + regs[1] = 0xfe8d0badd00dabcdul; + ret = callit((unsigned long)quad + 1, (unsigned long)regs, + (unsigned long)&do_stq_be, msr); + if (ret) + return ret | 7; + if (((quad[0] >> 8) | (quad[1] << 56)) != regs[0] || + ((quad[1] >> 8) | (quad[2] << 56)) != regs[1]) { + print_hex(quad[0], 16); + print_string(" "); + print_hex(quad[1], 16); + print_string(" "); + print_hex(quad[2], 16); + print_string(" "); + return 8; + } + + /* try lq non-preferred form */ + quad[0] = 0x56789abcdef01234ul; + quad[1] = 0x5959bcbc3434fafaul; + ret = callit((unsigned long)quad, (unsigned long)regs, + (unsigned long)&do_lq_np_be, msr); + if (ret) + return ret | 9; + if (regs[0] != quad[0] || regs[1] != quad[1]) + return 10; + /* unaligned should not give alignment interrupt in uW implementation */ + quad[2] = 0x6677001122334455ul; + ret = callit((unsigned long)&quad[1], (unsigned long)regs, + (unsigned long)&do_lq_np_be, msr); + if (ret) + return ret | 11; + if (regs[0] != quad[1] || regs[1] != 
quad[2]) + return 12; + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -338,6 +508,8 @@ int main(void) do_test(4, mode_test_4); do_test(5, mode_test_5); do_test(6, mode_test_6); + do_test(7, mode_test_7); + do_test(8, mode_test_8); return fail; } diff --git a/tests/prefix/head.S b/tests/prefix/head.S index 961c2a9..9a46e4b 100644 --- a/tests/prefix/head.S +++ b/tests/prefix/head.S @@ -245,3 +245,23 @@ test_pstw: pstw %r3,wvar(0) li %r3,0 blr + + .globl test_plq +test_plq: + nop + nop + plq %r4,qvar(0) + std %r4,0(%r3) + std %r5,8(%r3) + li %r3,0 + blr + + .globl test_pstq +test_pstq: + nop + nop + ld %r4,0(%r3) + ld %r5,8(%r3) + pstq %r4,qvar(0) + li %r3,0 + blr diff --git a/tests/prefix/prefix.c b/tests/prefix/prefix.c index 94ac500..8cc117d 100644 --- a/tests/prefix/prefix.c +++ b/tests/prefix/prefix.c @@ -7,6 +7,7 @@ #define MSR_LE 0x1 #define MSR_DR 0x10 #define MSR_IR 0x20 +#define MSR_HV 0x1000000000000000ul #define MSR_SF 0x8000000000000000ul #define DSISR 18 @@ -32,6 +33,8 @@ extern long test_pstd(long arg); extern long test_psth(long arg); extern long test_pstw(long arg); extern long test_plfd(long arg); +extern long test_plq(long arg); +extern long test_pstq(long arg); static inline unsigned long mfspr(int sprnum) { @@ -103,7 +106,7 @@ long int prefix_test_2(void) return 1; if (mfspr(SRR0) != (unsigned long)&test_paddi_mis + 8) return 2; - if (mfspr(SRR1) != (MSR_SF | MSR_LE | (1ul << (63 - 35)) | (1ul << (63 - 34)))) + if (mfspr(SRR1) != (MSR_SF | MSR_HV | MSR_LE | (1ul << (63 - 35)) | (1ul << (63 - 34)))) return 3; ret = trapit((long)&x, test_plfd); @@ -111,7 +114,7 @@ long int prefix_test_2(void) return ret; if (mfspr(SRR0) != (unsigned long)&test_plfd + 8) return 6; - if (mfspr(SRR1) != (MSR_SF | MSR_LE | (1ul << (63 - 34)))) + if (mfspr(SRR1) != (MSR_SF | MSR_HV | MSR_LE | (1ul << (63 - 34)))) return 7; return 0; } @@ -182,6 +185,39 @@ long int prefix_test_3(void) return 0; } +unsigned long qvar[2] 
__attribute__((__aligned__(16))); +#define V1 0x678912345a5a2b2bull +#define V2 0xa0549922bbccddeeull + +/* test plq and pstq */ +long int prefix_test_4(void) +{ + long int ret; + unsigned long x[2]; + + qvar[0] = V1; + qvar[1] = V2; + ret = trapit((long)&x, test_plq); + if (ret) + return ret | 1; + if (x[0] != V1 || x[1] != V2) { + print_hex(x[0], 16, " "); + print_hex(x[1], 16, " "); + return 2; + } + x[0] = ~V2; + x[1] = ~V1; + ret = trapit((long)&x, test_pstq); + if (ret) + return ret | 3; + if (qvar[0] != ~V2 || qvar[1] != ~V1) { + print_hex(qvar[0], 16, " "); + print_hex(qvar[1], 16, " "); + return 4; + } + return 0; +} + int fail = 0; void do_test(int num, long int (*test)(void)) @@ -209,6 +245,7 @@ int main(void) do_test(1, prefix_test_1); do_test(2, prefix_test_2); do_test(3, prefix_test_3); + do_test(4, prefix_test_4); return fail; } diff --git a/tests/reservation/head.S b/tests/reservation/head.S index ce258b5..4ff85ce 100644 --- a/tests/reservation/head.S +++ b/tests/reservation/head.S @@ -155,3 +155,31 @@ call_ret: ld %r31,248(%r1) addi %r1,%r1,256 blr + + .global do_lqarx +do_lqarx: + /* r3 = src, r4 = regs */ + lqarx %r10,0,%r3 + std %r10,0(%r4) + std %r11,8(%r4) + li %r3,0 + blr + + .global do_lqarx_bad +do_lqarx_bad: + /* r3 = src, r4 = regs */ + .long 0x7d405228 /* lqarx %r10,0,%r10 */ + std %r10,0(%r4) + std %r11,8(%r4) + li %r3,0 + blr + + .global do_stqcx +do_stqcx: + /* r3 = dest, r4 = regs, return CR */ + ld %r10,0(%r4) + ld %r11,8(%r4) + stqcx. 
%r10,0,%r3 + mfcr %r3 + oris %r3,%r3,1 /* to distinguish from trap number */ + blr diff --git a/tests/reservation/reservation.c b/tests/reservation/reservation.c index 79bbc1f..502b285 100644 --- a/tests/reservation/reservation.c +++ b/tests/reservation/reservation.c @@ -7,6 +7,10 @@ extern unsigned long callit(unsigned long arg1, unsigned long arg2, unsigned long (*fn)(unsigned long, unsigned long)); +extern unsigned long do_lqarx(unsigned long src, unsigned long regs); +extern unsigned long do_lqarx_bad(unsigned long src, unsigned long regs); +extern unsigned long do_stqcx(unsigned long dst, unsigned long regs); + #define DSISR 18 #define DAR 19 #define SRR0 26 @@ -180,6 +184,63 @@ int resv_test_2(void) return 0; } +/* test lqarx/stqcx */ +int resv_test_3(void) +{ + unsigned long x[4] __attribute__((__aligned__(16))); + unsigned long y[2], regs[2]; + unsigned long ret, offset; + int count; + + x[0] = 0x7766554433221100ul; + x[1] = 0xffeeddccbbaa9988ul; + y[0] = 0x0badcafef00dd00dul; + y[1] = 0xdeadbeef07070707ul; + for (count = 0; count < 1000; ++count) { + ret = callit((unsigned long)x, (unsigned long)regs, do_lqarx); + if (ret) + return ret | 1; + ret = callit((unsigned long)x, (unsigned long)y, do_stqcx); + if (ret < 0x10000) + return ret | 2; + if (ret & 0x20000000) + break; + } + if (count == 1000) + return 3; + if (x[0] != y[1] || x[1] != y[0]) + return 4; + if (regs[1] != 0x7766554433221100ul || regs[0] != 0xffeeddccbbaa9988ul) + return 5; + ret = callit((unsigned long)x, (unsigned long)regs, do_stqcx); + if (ret < 0x10000 || (ret & 0x20000000)) + return ret | 12; + /* test alignment interrupts */ + for (offset = 0; offset < 16; ++offset) { + ret = callit((unsigned long)x + offset, (unsigned long)regs, do_lqarx); + if (ret == 0 && (offset & 15) != 0) + return 6; + if (ret == 0x600) { + if ((offset & 15) == 0) + return ret + 7; + } else if (ret) + return ret; + ret = callit((unsigned long)x + offset, (unsigned long)y, do_stqcx); + if (ret >= 0x10000 && 
(offset & 15) != 0) + return 8; + if (ret == 0x600) { + if ((offset & 15) == 0) + return ret + 9; + } else if (ret < 0x10000) + return ret; + } + /* test illegal interrupt for bad lqarx case */ + ret = callit((unsigned long)x, (unsigned long)regs, do_lqarx_bad); + if (ret != 0xe40) + return ret + 10; + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -204,6 +265,7 @@ int main(void) do_test(1, resv_test_1); do_test(2, resv_test_2); + do_test(3, resv_test_3); return fail; } diff --git a/tests/test_illegal.bin b/tests/test_illegal.bin index 727876c..22bbbc1 100755 Binary files a/tests/test_illegal.bin and b/tests/test_illegal.bin differ diff --git a/tests/test_mmu.bin b/tests/test_mmu.bin index 1ade44e..6e352a7 100755 Binary files a/tests/test_mmu.bin and b/tests/test_mmu.bin differ diff --git a/tests/test_modes.bin b/tests/test_modes.bin index 24e3981..d0c24d7 100755 Binary files a/tests/test_modes.bin and b/tests/test_modes.bin differ diff --git a/tests/test_modes.console_out b/tests/test_modes.console_out index a49bb9b..25e791c 100644 --- a/tests/test_modes.console_out +++ b/tests/test_modes.console_out @@ -4,3 +4,5 @@ test 03:PASS test 04:PASS test 05:PASS test 06:PASS +test 07:PASS +test 08:PASS diff --git a/tests/test_prefix.bin b/tests/test_prefix.bin index a5f9ff7..e84e70e 100755 Binary files a/tests/test_prefix.bin and b/tests/test_prefix.bin differ diff --git a/tests/test_prefix.console_out b/tests/test_prefix.console_out index 623335d..90cfed1 100644 --- a/tests/test_prefix.console_out +++ b/tests/test_prefix.console_out @@ -1,3 +1,4 @@ test 01:PASS test 02:PASS test 03:PASS +test 04:PASS diff --git a/tests/test_reservation.bin b/tests/test_reservation.bin index 1cb6250..9c9ad8f 100755 Binary files a/tests/test_reservation.bin and b/tests/test_reservation.bin differ diff --git a/tests/test_reservation.console_out b/tests/test_reservation.console_out index 0c39ae3..623335d 100644 --- a/tests/test_reservation.console_out +++ 
b/tests/test_reservation.console_out @@ -1,2 +1,3 @@ test 01:PASS test 02:PASS +test 03:PASS diff --git a/tests/test_xics.bin b/tests/test_xics.bin index 80d1508..41876c8 100755 Binary files a/tests/test_xics.bin and b/tests/test_xics.bin differ diff --git a/tests/xics/head.S b/tests/xics/head.S index c513a02..4de3e29 100644 --- a/tests/xics/head.S +++ b/tests/xics/head.S @@ -115,7 +115,7 @@ __isr: std %r29, 29*8(%r1) std %r30, 30*8(%r1) std %r31, 31*8(%r1) - mfsrr0 %r0 + mfhsrr0 %r0 std %r0, SAVE_NIA*8(%r1) mflr %r0 std %r0, SAVE_LR*8(%r1) @@ -123,7 +123,7 @@ __isr: std %r0, SAVE_CTR*8(%r1) mfcr %r0 std %r0, SAVE_CR*8(%r1) - mfsrr1 %r0 + mfhsrr1 %r0 std %r0, SAVE_SRR1*8(%r1) stdu %r1,-STACK_FRAME_C_MINIMAL(%r1) diff --git a/writeback.vhdl b/writeback.vhdl index 6a86fb7..d7690a5 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -72,11 +72,15 @@ begin variable vec : integer range 0 to 16#fff#; variable srr1 : std_ulogic_vector(15 downto 0); variable intr : std_ulogic; + variable hvi : std_ulogic; + variable scv : std_ulogic; + variable intr_page : std_ulogic_vector(4 downto 0); begin w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; f := WritebackToFetch1Init; vec := 0; + hvi := '0'; complete_out <= instr_tag_init; if e_in.valid = '1' then @@ -93,9 +97,16 @@ begin interrupt_out.intr <= intr; srr1 := (others => '0'); + intr_page := 5x"0"; + scv := '0'; if e_in.interrupt = '1' then vec := e_in.intr_vec; srr1 := e_in.srr1; + hvi := e_in.hv_intr; + scv := e_in.is_scv; + if e_in.is_scv = '1' then + intr_page := 5x"17"; + end if; elsif l_in.interrupt = '1' then vec := l_in.intr_vec; srr1 := l_in.srr1; @@ -103,7 +114,9 @@ begin vec := fp_in.intr_vec; srr1 := fp_in.srr1; end if; + interrupt_out.hv_intr <= hvi; interrupt_out.srr1 <= srr1; + interrupt_out.scv_int <= scv; if intr = '0' then if e_in.write_enable = '1' then @@ -161,7 +174,7 @@ begin -- Outputs to fetch1 f.interrupt := intr; - f.intr_vec := std_ulogic_vector(to_unsigned(vec, 12)); + 
f.intr_vec := intr_page & std_ulogic_vector(to_unsigned(vec, 12)); f.redirect := e_in.redirect; f.redirect_nia := e_in.write_data; f.br_nia := e_in.last_nia;