diff --git a/Makefile b/Makefile index 2ee5d57..678bbfa 100644 --- a/Makefile +++ b/Makefile @@ -43,8 +43,8 @@ all: $(all) core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ - decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \ - cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \ + decode1.vhdl helpers.vhdl insn_helpers.vhdl \ + control.vhdl decode2.vhdl register_file.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \ loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \ diff --git a/common.vhdl b/common.vhdl index 7bf8277..69dde30 100644 --- a/common.vhdl +++ b/common.vhdl @@ -3,6 +3,7 @@ use ieee.std_logic_1164.all; use ieee.numeric_std.all; library work; +use work.utils.all; use work.decode_types.all; package common is @@ -126,7 +127,19 @@ package common is constant FPSCR_NI : integer := 63 - 61; constant FPSCR_RN : integer := 63 - 63; - type irq_state_t is (WRITE_SRR0, WRITE_SRR1); + -- Used for tracking instruction completion and pending register writes + constant TAG_COUNT : positive := 4; + constant TAG_NUMBER_BITS : natural := log2(TAG_COUNT); + subtype tag_number_t is integer range 0 to TAG_COUNT - 1; + subtype tag_index_t is unsigned(TAG_NUMBER_BITS - 1 downto 0); + type instr_tag_t is record + tag : tag_number_t; + valid : std_ulogic; + end record; + constant instr_tag_init : instr_tag_t := (tag => 0, valid => '0'); + function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean; + + subtype intr_vector_t is integer range 0 to 16#fff#; -- For now, fixed 16 sources, make this either a parametric -- package of some sort or an unconstrainted array. @@ -144,8 +157,6 @@ package common is dec: std_ulogic_vector(63 downto 0); msr: std_ulogic_vector(63 downto 0); cfar: std_ulogic_vector(63 downto 0); - irq_state : irq_state_t; - srr1: std_ulogic_vector(63 downto 0); end record; type Fetch1ToIcacheType is record @@ -176,26 +187,40 @@ package common is insn: std_ulogic_vector(31 downto 0); ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) + ispro: gspr_index_t; -- (G)SPR written with LR or CTR decode: decode_rom_t; br_pred: std_ulogic; -- Branch was predicted to be taken big_endian: std_ulogic; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), - ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init, - br_pred => '0', big_endian => '0'); + ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'), + decode => decode_rom_init, br_pred => '0', big_endian => '0'); type Decode1ToFetch1Type is record redirect : std_ulogic; redirect_nia : std_ulogic_vector(63 downto 0); end record; + type bypass_data_t is record + tag : instr_tag_t; + data : std_ulogic_vector(63 downto 0); + end record; + constant bypass_data_init : bypass_data_t := (tag => instr_tag_init, data => (others => '0')); + + type cr_bypass_data_t is record + tag : instr_tag_t; + data : std_ulogic_vector(31 downto 0); + end record; + constant cr_bypass_data_init : cr_bypass_data_t := (tag => instr_tag_init, data => (others => '0')); + type Decode2ToExecute1Type is record valid: std_ulogic; unit : unit_t; fac : facility_t; insn_type: insn_type_t; nia: std_ulogic_vector(63 downto 0); + instr_tag : instr_tag_t; write_reg: gspr_index_t; write_reg_enable: std_ulogic; read_reg1: gspr_index_t; @@ -203,13 +228,10 @@ package common is read_data1: std_ulogic_vector(63 downto 0); read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); - bypass_data1: std_ulogic; - bypass_data2: std_ulogic; - bypass_data3: std_ulogic; cr: std_ulogic_vector(31 downto 0); - bypass_cr : std_ulogic; xerc: xer_common_t; lr: std_ulogic; + br_abs: std_ulogic; rc: std_ulogic; oe: std_ulogic; invert_a: std_ulogic; @@ -219,6 +241,7 @@ package common is output_carry: std_ulogic; input_cr: std_ulogic; output_cr: std_ulogic; + output_xer: std_ulogic; is_32bit: std_ulogic; is_signed: std_ulogic; insn: std_ulogic_vector(31 downto 0); @@ -234,10 +257,11 @@ package common is second : std_ulogic; -- set if this is the second op end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := - (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, - write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', - bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', - invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', + (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, + write_reg_enable => '0', + lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', + invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', + output_cr => '0', output_xer => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), @@ -289,9 +313,9 @@ package common is end record; type RegisterFileToDecode2Type is record - read1_data : std_ulogic_vector(63 downto 0); - read2_data : std_ulogic_vector(63 downto 0); - read3_data : std_ulogic_vector(63 downto 0); + read1_data : std_ulogic_vector(63 downto 0); + read2_data : std_ulogic_vector(63 downto 0); + read3_data : std_ulogic_vector(63 downto 0); end record; type Decode2ToCrFileType is record @@ -303,27 +327,12 @@ package common is read_xerc_data : xer_common_t; end record; - type Execute1ToFetch1Type is record - redirect: std_ulogic; - virt_mode: std_ulogic; - priv_mode: std_ulogic; - big_endian: std_ulogic; - mode_32bit: std_ulogic; - redirect_nia: std_ulogic_vector(63 downto 0); - br_nia : std_ulogic_vector(63 downto 0); - br_last : std_ulogic; - br_taken : std_ulogic; - end record; - constant Execute1ToFetch1Init : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0', - priv_mode => '0', big_endian => '0', - mode_32bit => '0', br_taken => '0', - br_last => '0', others => (others => '0')); - type Execute1ToLoadstore1Type is record valid : std_ulogic; op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do nia : std_ulogic_vector(63 downto 0); insn : std_ulogic_vector(31 downto 0); + instr_tag : instr_tag_t; addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read @@ -333,7 +342,6 @@ package common is byte_reverse : std_ulogic; sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? - update_reg : gpr_index_t; -- if so, the register to update xerc : xer_common_t; reserve : std_ulogic; -- set for larx/stcx. rc : std_ulogic; -- set for stcx. @@ -343,30 +351,29 @@ package common is is_32bit : std_ulogic; repeat : std_ulogic; second : std_ulogic; - end record; - constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', - sign_extend => '0', update => '0', xerc => xerc_init, - reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0', - nia => (others => '0'), insn => (others => '0'), - addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), - write_reg => (others => '0'), length => (others => '0'), - mode_32bit => '0', is_32bit => '0', - repeat => '0', second => '0', others => (others => '0')); + msr : std_ulogic_vector(63 downto 0); + end record; + constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := + (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', + sign_extend => '0', update => '0', xerc => xerc_init, + reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0', + nia => (others => '0'), insn => (others => '0'), + instr_tag => instr_tag_init, + addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), + write_reg => (others => '0'), + length => (others => '0'), + mode_32bit => '0', is_32bit => '0', + repeat => '0', second => '0', + msr => (others => '0')); type Loadstore1ToExecute1Type is record busy : std_ulogic; - exception : std_ulogic; - alignment : std_ulogic; - invalid : std_ulogic; - perm_error : std_ulogic; - rc_error : std_ulogic; - badtree : std_ulogic; - segment_fault : std_ulogic; - instr_fault : std_ulogic; + in_progress : std_ulogic; end record; type Loadstore1ToDcacheType is record valid : std_ulogic; + hold : std_ulogic; load : std_ulogic; -- is this a load dcbz : std_ulogic; nc : std_ulogic; @@ -438,18 +445,28 @@ package common is type Loadstore1ToWritebackType is record valid : std_ulogic; + instr_tag : instr_tag_t; write_enable: std_ulogic; write_reg : gspr_index_t; write_data : std_ulogic_vector(63 downto 0); xerc : xer_common_t; rc : std_ulogic; store_done : std_ulogic; + interrupt : std_ulogic; + intr_vec : intr_vector_t; + srr0: std_ulogic_vector(63 downto 0); + srr1: std_ulogic_vector(15 downto 0); end record; - constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', xerc => xerc_init, - rc => '0', store_done => '0', write_data => (others => '0'), others => (others => '0')); + constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := + (valid => '0', instr_tag => instr_tag_init, write_enable => '0', + write_reg => (others => '0'), write_data => (others => '0'), + xerc => xerc_init, rc => '0', store_done => '0', + interrupt => '0', intr_vec => 0, + srr0 => (others => '0'), srr1 => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; + instr_tag : instr_tag_t; rc : std_ulogic; mode_32bit : std_ulogic; write_enable : std_ulogic; @@ -460,21 +477,34 @@ package common is write_cr_data : std_ulogic_vector(31 downto 0); write_xerc_enable : std_ulogic; xerc : xer_common_t; - exc_write_enable : std_ulogic; - exc_write_reg : gspr_index_t; - exc_write_data : std_ulogic_vector(63 downto 0); - end record; - constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', mode_32bit => '0', write_enable => '0', - write_cr_enable => '0', exc_write_enable => '0', - write_xerc_enable => '0', xerc => xerc_init, - write_data => (others => '0'), write_cr_mask => (others => '0'), - write_cr_data => (others => '0'), write_reg => (others => '0'), - exc_write_reg => (others => '0'), exc_write_data => (others => '0')); + interrupt : std_ulogic; + intr_vec : intr_vector_t; + redirect: std_ulogic; + redir_mode: std_ulogic_vector(3 downto 0); + last_nia: std_ulogic_vector(63 downto 0); + br_offset: std_ulogic_vector(63 downto 0); + br_last: std_ulogic; + br_taken: std_ulogic; + abs_br: std_ulogic; + srr1: std_ulogic_vector(15 downto 0); + msr: std_ulogic_vector(63 downto 0); + end record; + constant Execute1ToWritebackInit : Execute1ToWritebackType := + (valid => '0', instr_tag => instr_tag_init, rc => '0', mode_32bit => '0', + write_enable => '0', write_cr_enable => '0', + write_xerc_enable => '0', xerc => xerc_init, + write_data => (others => '0'), write_cr_mask => (others => '0'), + write_cr_data => (others => '0'), write_reg => (others => '0'), + interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000", + last_nia => (others => '0'), br_offset => (others => '0'), + br_last => '0', br_taken => '0', abs_br => '0', + srr1 => (others => '0'), msr => (others => '0')); type Execute1ToFPUType is record valid : std_ulogic; op : insn_type_t; nia : std_ulogic_vector(63 downto 0); + itag : instr_tag_t; insn : std_ulogic_vector(31 downto 0); single : std_ulogic; fe_mode : std_ulogic_vector(1 downto 0); @@ -486,6 +516,7 @@ package common is out_cr : std_ulogic; end record; constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'), + itag => instr_tag_init, insn => (others => '0'), fe_mode => "00", rc => '0', fra => (others => '0'), frb => (others => '0'), frc => (others => '0'), frt => (others => '0'), @@ -494,21 +525,30 @@ package common is type FPUToExecute1Type is record busy : std_ulogic; exception : std_ulogic; - interrupt : std_ulogic; - illegal : std_ulogic; end record; constant FPUToExecute1Init : FPUToExecute1Type := (others => '0'); type FPUToWritebackType is record valid : std_ulogic; + interrupt : std_ulogic; + instr_tag : instr_tag_t; write_enable : std_ulogic; write_reg : gspr_index_t; write_data : std_ulogic_vector(63 downto 0); write_cr_enable : std_ulogic; write_cr_mask : std_ulogic_vector(7 downto 0); write_cr_data : std_ulogic_vector(31 downto 0); - end record; - constant FPUToWritebackInit : FPUToWritebackType := (valid => '0', write_enable => '0', write_cr_enable => '0', others => (others => '0')); + intr_vec : intr_vector_t; + srr0 : std_ulogic_vector(63 downto 0); + srr1 : std_ulogic_vector(15 downto 0); + end record; + constant FPUToWritebackInit : FPUToWritebackType := + (valid => '0', interrupt => '0', instr_tag => instr_tag_init, + write_enable => '0', write_reg => (others => '0'), + write_cr_enable => '0', write_cr_mask => (others => '0'), + write_cr_data => (others => '0'), + intr_vec => 0, srr1 => (others => '0'), + others => (others => '0')); type DividerToExecute1Type is record valid: std_ulogic; @@ -518,12 +558,29 @@ package common is constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', overflow => '0', others => (others => '0')); + type WritebackToFetch1Type is record + redirect: std_ulogic; + virt_mode: std_ulogic; + priv_mode: std_ulogic; + big_endian: std_ulogic; + mode_32bit: std_ulogic; + redirect_nia: std_ulogic_vector(63 downto 0); + br_nia : std_ulogic_vector(63 downto 0); + br_last : std_ulogic; + br_taken : std_ulogic; + end record; + constant WritebackToFetch1Init : WritebackToFetch1Type := + (redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0', + mode_32bit => '0', redirect_nia => (others => '0'), + br_last => '0', br_taken => '0', br_nia => (others => '0')); + type WritebackToRegisterFileType is record write_reg : gspr_index_t; write_data : std_ulogic_vector(63 downto 0); write_enable : std_ulogic; end record; - constant WritebackToRegisterFileInit : WritebackToRegisterFileType := (write_enable => '0', write_data => (others => '0'), others => (others => '0')); + constant WritebackToRegisterFileInit : WritebackToRegisterFileType := + (write_enable => '0', write_data => (others => '0'), others => (others => '0')); type WritebackToCrFileType is record write_cr_enable : std_ulogic; @@ -553,9 +610,9 @@ package body common is begin case spr is when SPR_LR => - n := 0; + n := 0; -- N.B. decode2 relies on this specific value when SPR_CTR => - n:= 1; + n := 1; -- N.B. decode2 relies on this specific value when SPR_SRR0 => n := 2; when SPR_SRR1 => @@ -616,4 +673,9 @@ package body common is begin return "10" & f; end; + + function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean is + begin + return tag1.valid = '1' and tag2.valid = '1' and tag1.tag = tag2.tag; + end; end common; diff --git a/control.vhdl b/control.vhdl index a89dab8..34c35e2 100644 --- a/control.vhdl +++ b/control.vhdl @@ -6,13 +6,14 @@ use work.common.all; entity control is generic ( - PIPELINE_DEPTH : natural := 2 + EX1_BYPASS : boolean := true; + PIPELINE_DEPTH : natural := 3 ); port ( clk : in std_ulogic; rst : in std_ulogic; - complete_in : in std_ulogic; + complete_in : in instr_tag_t; valid_in : in std_ulogic; repeated : in std_ulogic; flush_in : in std_ulogic; @@ -23,10 +24,6 @@ entity control is gpr_write_valid_in : in std_ulogic; gpr_write_in : in gspr_index_t; - gpr_bypassable : in std_ulogic; - - update_gpr_write_valid : in std_ulogic; - update_gpr_write_reg : in gspr_index_t; gpr_a_read_valid_in : in std_ulogic; gpr_a_read_in : in gspr_index_t; @@ -37,9 +34,11 @@ entity control is gpr_c_read_valid_in : in std_ulogic; gpr_c_read_in : in gspr_index_t; + execute_next_tag : in instr_tag_t; + execute_next_cr_tag : in instr_tag_t; + cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; - cr_bypassable : in std_ulogic; valid_out : out std_ulogic; stall_out : out std_ulogic; @@ -48,7 +47,9 @@ entity control is gpr_bypass_a : out std_ulogic; gpr_bypass_b : out std_ulogic; gpr_bypass_c : out std_ulogic; - cr_bypass : out std_ulogic + cr_bypass : out std_ulogic; + + instr_tag_out : out instr_tag_t ); end entity control; @@ -63,122 +64,165 @@ architecture rtl of control is signal r_int, rin_int : reg_internal_type := reg_internal_init; - signal stall_a_out : std_ulogic; - signal stall_b_out : std_ulogic; - signal stall_c_out : std_ulogic; - signal cr_stall_out : std_ulogic; - signal gpr_write_valid : std_ulogic := '0'; signal cr_write_valid : std_ulogic := '0'; -begin - gpr_hazard0: entity work.gpr_hazard - generic map ( - PIPELINE_DEPTH => PIPELINE_DEPTH - ) - port map ( - clk => clk, - busy_in => busy_in, - deferred => deferred, - complete_in => complete_in, - flush_in => flush_in, - issuing => valid_out, - repeated => repeated, - - gpr_write_valid_in => gpr_write_valid, - gpr_write_in => gpr_write_in, - bypass_avail => gpr_bypassable, - gpr_read_valid_in => gpr_a_read_valid_in, - gpr_read_in => gpr_a_read_in, - - ugpr_write_valid => update_gpr_write_valid, - ugpr_write_reg => update_gpr_write_reg, - - stall_out => stall_a_out, - use_bypass => gpr_bypass_a - ); - - gpr_hazard1: entity work.gpr_hazard - generic map ( - PIPELINE_DEPTH => PIPELINE_DEPTH - ) - port map ( - clk => clk, - busy_in => busy_in, - deferred => deferred, - complete_in => complete_in, - flush_in => flush_in, - issuing => valid_out, - repeated => repeated, - - gpr_write_valid_in => gpr_write_valid, - gpr_write_in => gpr_write_in, - bypass_avail => gpr_bypassable, - gpr_read_valid_in => gpr_b_read_valid_in, - gpr_read_in => gpr_b_read_in, - - ugpr_write_valid => update_gpr_write_valid, - ugpr_write_reg => update_gpr_write_reg, - - stall_out => stall_b_out, - use_bypass => gpr_bypass_b - ); - - gpr_hazard2: entity work.gpr_hazard - generic map ( - PIPELINE_DEPTH => PIPELINE_DEPTH - ) - port map ( - clk => clk, - busy_in => busy_in, - deferred => deferred, - complete_in => complete_in, - flush_in => flush_in, - issuing => valid_out, - repeated => repeated, - - gpr_write_valid_in => gpr_write_valid, - gpr_write_in => gpr_write_in, - bypass_avail => gpr_bypassable, - gpr_read_valid_in => gpr_c_read_valid_in, - gpr_read_in => gpr_c_read_in, - - ugpr_write_valid => update_gpr_write_valid, - ugpr_write_reg => update_gpr_write_reg, - - stall_out => stall_c_out, - use_bypass => gpr_bypass_c - ); - - cr_hazard0: entity work.cr_hazard - generic map ( - PIPELINE_DEPTH => PIPELINE_DEPTH - ) - port map ( - clk => clk, - busy_in => busy_in, - deferred => deferred, - complete_in => complete_in, - flush_in => flush_in, - issuing => valid_out, - - cr_read_in => cr_read_in, - cr_write_in => cr_write_valid, - bypassable => cr_bypassable, - - stall_out => cr_stall_out, - use_bypass => cr_bypass - ); + type tag_register is record + wr_gpr : std_ulogic; + reg : gspr_index_t; + recent : std_ulogic; + wr_cr : std_ulogic; + end record; + type tag_regs_array is array(tag_number_t) of tag_register; + signal tag_regs : tag_regs_array; + + signal instr_tag : instr_tag_t; + + signal gpr_tag_stall : std_ulogic; + signal cr_tag_stall : std_ulogic; + + signal curr_tag : tag_number_t; + signal next_tag : tag_number_t; + + signal curr_cr_tag : tag_number_t; + +begin control0: process(clk) begin if rising_edge(clk) then assert rin_int.outstanding >= 0 and rin_int.outstanding <= (PIPELINE_DEPTH+1) report "Outstanding bad " & integer'image(rin_int.outstanding) severity failure; r_int <= rin_int; + for i in tag_number_t loop + if rst = '1' or flush_in = '1' then + tag_regs(i).wr_gpr <= '0'; + tag_regs(i).wr_cr <= '0'; + else + if complete_in.valid = '1' and i = complete_in.tag then + tag_regs(i).wr_gpr <= '0'; + tag_regs(i).wr_cr <= '0'; + report "tag " & integer'image(i) & " not valid"; + end if; + if gpr_write_valid = '1' and tag_regs(i).reg = gpr_write_in then + tag_regs(i).recent <= '0'; + if tag_regs(i).recent = '1' and tag_regs(i).wr_gpr = '1' then + report "tag " & integer'image(i) & " not recent"; + end if; + end if; + if instr_tag.valid = '1' and i = instr_tag.tag then + tag_regs(i).wr_gpr <= gpr_write_valid; + tag_regs(i).reg <= gpr_write_in; + tag_regs(i).recent <= gpr_write_valid; + tag_regs(i).wr_cr <= cr_write_valid; + if gpr_write_valid = '1' then + report "tag " & integer'image(i) & " valid for gpr " & to_hstring(gpr_write_in); + end if; + end if; + end if; + end loop; + if rst = '1' then + curr_tag <= 0; + curr_cr_tag <= 0; + else + curr_tag <= next_tag; + if cr_write_valid = '1' then + curr_cr_tag <= instr_tag.tag; + end if; + end if; end if; end process; + control_hazards : process(all) + variable gpr_stall : std_ulogic; + variable tag_a : instr_tag_t; + variable tag_b : instr_tag_t; + variable tag_c : instr_tag_t; + variable tag_s : instr_tag_t; + variable tag_t : instr_tag_t; + variable incr_tag : tag_number_t; + variable byp_a : std_ulogic; + variable byp_b : std_ulogic; + variable byp_c : std_ulogic; + variable tag_cr : instr_tag_t; + variable byp_cr : std_ulogic; + begin + tag_a := instr_tag_init; + for i in tag_number_t loop + if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_a_read_in then + tag_a.valid := gpr_a_read_valid_in; + tag_a.tag := i; + end if; + end loop; + if tag_match(tag_a, complete_in) then + tag_a.valid := '0'; + end if; + tag_b := instr_tag_init; + for i in tag_number_t loop + if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_b_read_in then + tag_b.valid := gpr_b_read_valid_in; + tag_b.tag := i; + end if; + end loop; + if tag_match(tag_b, complete_in) then + tag_b.valid := '0'; + end if; + tag_c := instr_tag_init; + for i in tag_number_t loop + if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then + tag_c.valid := gpr_c_read_valid_in; + tag_c.tag := i; + end if; + end loop; + if tag_match(tag_c, complete_in) then + tag_c.valid := '0'; + end if; + + byp_a := '0'; + if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then + byp_a := '1'; + end if; + byp_b := '0'; + if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then + byp_b := '1'; + end if; + byp_c := '0'; + if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then + byp_c := '1'; + end if; + + gpr_bypass_a <= byp_a; + gpr_bypass_b <= byp_b; + gpr_bypass_c <= byp_c; + + gpr_tag_stall <= (tag_a.valid and not byp_a) or + (tag_b.valid and not byp_b) or + (tag_c.valid and not byp_c); + + incr_tag := curr_tag; + instr_tag.tag <= curr_tag; + instr_tag.valid <= valid_out and not deferred; + if instr_tag.valid = '1' then + incr_tag := (curr_tag + 1) mod TAG_COUNT; + end if; + next_tag <= incr_tag; + instr_tag_out <= instr_tag; + + -- CR hazards + tag_cr.tag := curr_cr_tag; + tag_cr.valid := cr_read_in and tag_regs(curr_cr_tag).wr_cr; + if tag_match(tag_cr, complete_in) then + tag_cr.valid := '0'; + end if; + byp_cr := '0'; + if EX1_BYPASS and tag_match(execute_next_cr_tag, tag_cr) then + byp_cr := '1'; + end if; + + cr_bypass <= byp_cr; + cr_tag_stall <= tag_cr.valid and not byp_cr; + end process; + control1 : process(all) variable v_int : reg_internal_type; variable valid_tmp : std_ulogic; @@ -191,11 +235,14 @@ begin stall_tmp := '0'; if flush_in = '1' then - -- expect to see complete_in next cycle - v_int.outstanding := 1; - elsif complete_in = '1' then + v_int.outstanding := 0; + elsif complete_in.valid = '1' then v_int.outstanding := r_int.outstanding - 1; end if; + if r_int.outstanding >= PIPELINE_DEPTH + 1 then + valid_tmp := '0'; + stall_tmp := '1'; + end if; if rst = '1' then v_int := reg_internal_init; @@ -222,8 +269,8 @@ begin v_int.state := WAIT_FOR_CURR_TO_COMPLETE; end if; else - -- let it go out if there are no GPR hazards - stall_tmp := stall_a_out or stall_b_out or stall_c_out or cr_stall_out; + -- let it go out if there are no GPR or CR hazards + stall_tmp := gpr_tag_stall or cr_tag_stall; end if; end if; @@ -249,8 +296,8 @@ begin v_int.state := WAIT_FOR_CURR_TO_COMPLETE; end if; else - -- let it go out if there are no GPR hazards - stall_tmp := stall_a_out or stall_b_out or stall_c_out or cr_stall_out; + -- let it go out if there are no GPR or CR hazards + stall_tmp := gpr_tag_stall or cr_tag_stall; end if; end if; else @@ -262,15 +309,11 @@ begin valid_tmp := '0'; end if; - if valid_tmp = '1' then - if deferred = '0' then - v_int.outstanding := v_int.outstanding + 1; - end if; - gpr_write_valid <= gpr_write_valid_in; - cr_write_valid <= cr_write_in; - else - gpr_write_valid <= '0'; - cr_write_valid <= '0'; + gpr_write_valid <= gpr_write_valid_in and valid_tmp; + cr_write_valid <= cr_write_in and valid_tmp; + + if valid_tmp = '1' and deferred = '0' then + v_int.outstanding := v_int.outstanding + 1; end if; -- update outputs diff --git a/core.vhdl b/core.vhdl index 3948b86..e2a93b9 100644 --- a/core.vhdl +++ b/core.vhdl @@ -46,6 +46,7 @@ end core; architecture behave of core is -- icache signals signal fetch1_to_icache : Fetch1ToIcacheType; + signal writeback_to_fetch1: WritebackToFetch1Type; signal icache_to_decode1 : IcacheToDecode1Type; signal mmu_to_icache : MmuToIcacheType; @@ -66,7 +67,8 @@ architecture behave of core is -- execute signals signal execute1_to_writeback: Execute1ToWritebackType; - signal execute1_to_fetch1: Execute1ToFetch1Type; + signal execute1_bypass: bypass_data_t; + signal execute1_cr_bypass: cr_bypass_data_t; -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; @@ -102,10 +104,11 @@ architecture behave of core is signal decode1_flush: std_ulogic; signal fetch1_flush: std_ulogic; - signal complete: std_ulogic; + signal complete: instr_tag_t; signal terminate: std_ulogic; signal core_rst: std_ulogic; signal icache_inv: std_ulogic; + signal do_interrupt: std_ulogic; -- Delayed/Latched resets and alt_reset signal rst_fetch1 : std_ulogic := '1'; @@ -117,6 +120,7 @@ architecture behave of core is signal rst_ex1 : std_ulogic := '1'; signal rst_fpu : std_ulogic := '1'; signal rst_ls1 : std_ulogic := '1'; + signal rst_wback : std_ulogic := '1'; signal rst_dbg : std_ulogic := '1'; signal alt_reset_d : std_ulogic; @@ -180,6 +184,7 @@ begin rst_ex1 <= core_rst; rst_fpu <= core_rst; rst_ls1 <= core_rst; + rst_wback <= core_rst; rst_dbg <= rst; alt_reset_d <= alt_reset; end if; @@ -200,7 +205,7 @@ begin inval_btc => ex1_icache_inval or mmu_to_icache.tlbie, stop_in => dbg_core_stop, d_in => decode1_to_fetch1, - e_in => execute1_to_fetch1, + w_in => writeback_to_fetch1, i_out => fetch1_to_icache, log_out => log_data(42 downto 0) ); @@ -273,6 +278,8 @@ begin r_out => decode2_to_register_file, c_in => cr_file_to_decode2, c_out => decode2_to_cr_file, + execute_bypass => execute1_bypass, + execute_cr_bypass => execute1_cr_bypass, log_out => log_data(119 downto 110) ); decode2_busy_in <= ex1_busy_out; @@ -320,16 +327,18 @@ begin port map ( clk => clk, rst => rst_ex1, - flush_out => flush, + flush_in => flush, busy_out => ex1_busy_out, e_in => decode2_to_execute1, l_in => loadstore1_to_execute1, fp_in => fpu_to_execute1, ext_irq_in => ext_irq, + interrupt_in => do_interrupt, l_out => execute1_to_loadstore1, - f_out => execute1_to_fetch1, fp_out => execute1_to_fpu, e_out => execute1_to_writeback, + bypass_data => execute1_bypass, + bypass_cr_data => execute1_cr_bypass, icache_inval => ex1_icache_inval, dbg_msr_out => msr, terminate_out => terminate, @@ -410,11 +419,15 @@ begin writeback_0: entity work.writeback port map ( clk => clk, + rst => rst_wback, + flush_out => flush, e_in => execute1_to_writeback, l_in => loadstore1_to_writeback, fp_in => fpu_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, + f_out => writeback_to_fetch1, + interrupt_out => do_interrupt, complete_out => complete ); diff --git a/cr_hazard.vhdl b/cr_hazard.vhdl deleted file mode 100644 index a6203a8..0000000 --- a/cr_hazard.vhdl +++ /dev/null @@ -1,86 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -entity cr_hazard is - generic ( - PIPELINE_DEPTH : natural := 1 - ); - port( - clk : in std_ulogic; - busy_in : in std_ulogic; - deferred : in std_ulogic; - complete_in : in std_ulogic; - flush_in : in std_ulogic; - issuing : in std_ulogic; - - cr_read_in : in std_ulogic; - cr_write_in : in std_ulogic; - bypassable : in std_ulogic; - - stall_out : out std_ulogic; - use_bypass : out std_ulogic - ); -end entity cr_hazard; -architecture behaviour of cr_hazard is - type pipeline_entry_type is record - valid : std_ulogic; - bypass : std_ulogic; - end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0'); - - type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; - constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); - - signal r, rin : pipeline_t := pipeline_t_init; -begin - cr_hazard0: process(clk) - begin - if rising_edge(clk) then - r <= rin; - end if; - end process; - - cr_hazard1: process(all) - variable v : pipeline_t; - begin - v := r; - - -- XXX assumes PIPELINE_DEPTH = 1 - if complete_in = '1' then - v(1).valid := '0'; - end if; - - use_bypass <= '0'; - stall_out <= '0'; - if cr_read_in = '1' then - loop_0: for i in 0 to PIPELINE_DEPTH loop - if v(i).valid = '1' then - if r(i).bypass = '1' then - use_bypass <= '1'; - else - stall_out <= '1'; - end if; - end if; - end loop; - end if; - - -- XXX assumes PIPELINE_DEPTH = 1 - if busy_in = '0' then - v(1) := r(0); - v(0).valid := '0'; - end if; - if deferred = '0' and issuing = '1' then - v(0).valid := cr_write_in; - v(0).bypass := bypassable; - end if; - if flush_in = '1' then - v(0).valid := '0'; - v(1).valid := '0'; - end if; - - -- update registers - rin <= v; - - end process; -end; diff --git a/dcache.vhdl b/dcache.vhdl index 7da67e1..bb93148 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -275,6 +275,7 @@ architecture rtl of dcache is doall : std_ulogic; -- with tlbie, indicates flush whole TLB tlbld : std_ulogic; -- indicates a TLB load request (from MMU) mmu_req : std_ulogic; -- indicates source of request + d_valid : std_ulogic; -- indicates req.data is valid now end record; signal r0 : reg_stage_0_t; @@ -564,17 +565,27 @@ begin r.mmu_req := '1'; else r.req := d_in; + r.req.data := (others => '0'); r.tlbie := '0'; r.doall := '0'; r.tlbld := '0'; r.mmu_req := '0'; end if; + r.d_valid := '0'; if rst = '1' then r0_full <= '0'; - elsif r1.full = '0' or r0_full = '0' then + elsif (r1.full = '0' and d_in.hold = '0') or r0_full = '0' then r0 <= r; r0_full <= r.req.valid; end if; + -- Sample data the cycle after a request comes in from loadstore1. + -- If another request has come in already then the data will get + -- put directly into req.data below. + if r0.req.valid = '1' and r.req.valid = '0' and r0.d_valid = '0' and + r0.mmu_req = '0' then + r0.req.data <= d_in.data; + r0.d_valid <= '1'; + end if; end if; end process; @@ -582,8 +593,8 @@ begin m_out.stall <= '0'; -- Hold off the request in r0 when r1 has an uncompleted request - r0_stall <= r0_full and r1.full; - r0_valid <= r0_full and not r1.full; + r0_stall <= r0_full and (r1.full or d_in.hold); + r0_valid <= r0_full and not r1.full and not d_in.hold; stall_out <= r0_stall; -- TLB @@ -1305,10 +1316,12 @@ begin req.dcbz := r0.req.dcbz; req.real_addr := ra; -- Force data to 0 for dcbz - if r0.req.dcbz = '0' then - req.data := d_in.data; - else + if r0.req.dcbz = '1' then req.data := (others => '0'); + elsif r0.d_valid = '1' then + req.data := r0.req.data; + else + req.data := d_in.data; end if; -- Select all bytes for dcbz and for cacheable loads if r0.req.dcbz = '1' or (r0.req.load = '1' and r0.req.nc = '0') then @@ -1438,10 +1451,10 @@ begin -- complete the request next cycle. -- Compare the whole address in case the request in -- r1.req is not the one that started this refill. - if r1.full = '1' and r1.req.same_tag = '1' and - ((r1.dcbz = '1' and r1.req.dcbz = '1') or - (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS)) and - r1.store_row = get_row(r1.req.real_addr) then + if req.valid = '1' and req.same_tag = '1' and + ((r1.dcbz = '1' and req.dcbz = '1') or + (r1.dcbz = '0' and req.op = OP_LOAD_MISS)) and + r1.store_row = get_row(req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; if r1.mmu_req = '0' then diff --git a/decode1.vhdl b/decode1.vhdl index ebe59be..2869c39 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -79,23 +79,23 @@ architecture behaviour of decode1 is 28 => (ALU, NONE, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andi. 29 => (ALU, NONE, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andis. 0 => (ALU, NONE, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- attn - 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b + 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b 16 => (ALU, NONE, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc 11 => (ALU, NONE, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmpi 10 => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpli 34 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbz - 35 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lbzu + 35 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lbzu 50 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfd - 51 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lfdu + 51 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lfdu 48 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- lfs - 49 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- lfsu + 49 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', DUPD), -- lfsu 42 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lha - 43 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lhau + 43 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhau 40 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhz - 41 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lhzu + 41 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzu 56 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DQ, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRTE), -- lq 32 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwz - 33 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lwzu + 33 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwzu 7 => (ALU, NONE, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- mulli 24 => (ALU, NONE, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ori 25 => (ALU, NONE, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- oris @@ -104,15 +104,15 @@ architecture behaviour of decode1 is 23 => (ALU, NONE, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- rlwnm 17 => (ALU, NONE, OP_SC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sc 38 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stb - 39 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stbu + 39 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stbu 54 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stfd - 55 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stfdu + 55 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stfdu 52 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- stfs - 53 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- stfsu + 53 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- stfsu 44 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sth - 45 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthu + 45 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthu 36 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stw - 37 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwu + 37 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwu 8 => (ALU, NONE, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- subfic 2 => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- tdi 3 => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1', NONE), -- twi @@ -214,7 +214,7 @@ architecture behaviour of decode1 is 2#0100111010# => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cbcdtd 2#0100011010# => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cdtbcd 2#0000000000# => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmp - 2#0111111100# => (ALU, NONE, OP_CMPB, NONE, RB, RS, RA, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpb + 2#0111111100# => (ALU, NONE, OP_CMPB, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpb 2#0011100000# => (ALU, NONE, OP_CMPEQB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpeqb 2#0000100000# => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpl 2#0011000000# => (ALU, NONE, OP_CMPRB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmprb @@ -290,33 +290,33 @@ architecture behaviour of decode1 is 2#1111101111# => (ALU, NONE, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel 2#0000110100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lbarx 2#1101010101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbzcix - 2#0001110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lbzux + 2#0001110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lbzux 2#0001010111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbzx 2#0001010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- ldarx 2#1000010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ldbrx 2#1101110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ldcix - 2#0000110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- ldux + 2#0000110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- ldux 2#0000010101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ldx 2#1001010111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfdx - 2#1001110111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lfdux + 2#1001110111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lfdux 2#1101010111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfiwax 2#1101110111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfiwzx 2#1000010111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- lfsx - 2#1000110111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- lfsux + 2#1000110111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', DUPD), -- lfsux 2#0001110100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lharx - 2#0101110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lhaux + 2#0101110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhaux 2#0101010111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhax 2#1100010110# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhbrx 2#1100110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzcix - 2#0100110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lhzux + 2#0100110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzux 2#0100010111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzx 2#0100010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', DRTE), -- lqarx 2#0000010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lwarx - 2#0101110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lwaux + 2#0101110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwaux 2#0101010101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwax 2#1000010110# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwbrx 2#1100010101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwzcix - 2#0000110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lwzux + 2#0000110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwzux 2#0000010111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwzx 2#1001000000# => (ALU, NONE, OP_MCRXRX, NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mcrxrx 2#0000010011# => (ALU, NONE, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfcr/mfocrf @@ -376,28 +376,28 @@ architecture behaviour of decode1 is 2#1000011000# => (ALU, NONE, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- srw 2#1111010101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stbcix 2#1010110110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- stbcx - 2#0011110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stbux + 2#0011110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stbux 2#0011010111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stbx 2#1010010100# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stdbrx 2#1111110101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stdcix 2#0011010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- stdcx - 2#0010110101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdux + 2#0010110101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdux 2#0010010101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stdx 2#1011010111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stfdx - 2#1011110111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stfdux + 2#1011110111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stfdux 2#1111010111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stfiwx 2#1010010111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- stfsx - 2#1010110111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- stfsux + 2#1010110111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- stfsux 2#1110010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthbrx 2#1110110101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthcix 2#1011010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- sthcx - 2#0110110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthux + 2#0110110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthux 2#0110010111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthx 2#0010110110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', DRSE), -- stqcx 2#1010010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwbrx 2#1110010101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwcix 2#0010010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- stwcx - 2#0010110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwux + 2#0010110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwux 2#0010010111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwx 2#0000101000# => (ALU, NONE, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subf 2#1000101000# => (ALU, NONE, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subfo @@ -424,7 +424,7 @@ architecture behaviour of decode1 is -- unit fac internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl rpt -- op in out A out in out len ext pipe 0 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DS, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ld - 1 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DS, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- ldu + 1 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DS, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- ldu 2 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DS, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwa others => decode_rom_init ); @@ -451,7 +451,7 @@ architecture behaviour of decode1 is -- unit fac internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl rpt -- op in out A out in out len ext pipe 0 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- std - 1 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdu + 1 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdu 2 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRSE), -- stq others => decode_rom_init ); @@ -597,9 +597,10 @@ begin -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); - -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path + -- Work out ispr1/ispro independent of v.decode since they seem to be critical path sprn := decode_spr_num(f_in.insn); v.ispr1 := fast_spr_num(sprn); + v.ispro := fast_spr_num(sprn); if std_match(f_in.insn(10 downto 1), "01-1010011") then -- mfspr or mtspr @@ -627,6 +628,9 @@ begin -- CTR may be needed as input to bc if f_in.insn(23) = '0' then v.ispr1 := fast_spr_num(SPR_CTR); + v.ispro := fast_spr_num(SPR_CTR); + elsif f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); end if; -- Predict backward branches as taken, forward as untaken v.br_pred := f_in.insn(15); @@ -636,6 +640,9 @@ begin -- Unconditional branches are always taken v.br_pred := '1'; br_offset := signed(f_in.insn(25 downto 2)); + if f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); + end if; when 19 => vi.override := not decode_op_19_valid(to_integer(unsigned(f_in.insn(5 downto 1) & f_in.insn(10 downto 6)))); @@ -648,8 +655,12 @@ begin -- Branch uses CTR as condition when BO(2) is 0. This is -- also used to indicate that CTR is modified (they go -- together). - if f_in.insn(23) = '0' then + -- bcctr doesn't update CTR or use it in the branch condition + if f_in.insn(23) = '0' and (f_in.insn(10) = '0' or f_in.insn(6) = '1') then v.ispr1 := fast_spr_num(SPR_CTR); + v.ispro := fast_spr_num(SPR_CTR); + elsif f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); end if; if f_in.insn(10) = '0' then v.ispr2 := fast_spr_num(SPR_LR); diff --git a/decode2.vhdl b/decode2.vhdl index e00a05d..f9fa541 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -19,7 +19,7 @@ entity decode2 is clk : in std_ulogic; rst : in std_ulogic; - complete_in : in std_ulogic; + complete_in : in instr_tag_t; busy_in : in std_ulogic; stall_out : out std_ulogic; @@ -37,6 +37,9 @@ entity decode2 is c_in : in CrFileToDecode2Type; c_out : out Decode2ToCrFileType; + execute_bypass : in bypass_data_t; + execute_cr_bypass : in cr_bypass_data_t; + log_out : out std_ulogic_vector(9 downto 0) ); end entity decode2; @@ -205,22 +208,6 @@ architecture behaviour of decode2 is end case; end; - -- For now, use "rc" in the decode table to decide whether oe exists. - -- This is not entirely correct architecturally: For mulhd and - -- mulhdu, the OE field is reserved. It remains to be seen what an - -- actual POWER9 does if we set it on those instructions, for now we - -- test that further down when assigning to the multiplier oe input. - -- - function decode_oe (t : rc_t; insn_in : std_ulogic_vector(31 downto 0)) return std_ulogic is - begin - case t is - when RC => - return insn_oe(insn_in); - when OTHERS => - return '0'; - end case; - end; - -- control signals that are derived from insn_type type mux_select_array_t is array(insn_type_t) of std_ulogic_vector(2 downto 0); @@ -249,6 +236,9 @@ architecture behaviour of decode2 is OP_MOD => "011", OP_CNTZ => "100", -- countzero_result OP_MFSPR => "101", -- spr_result + OP_B => "110", -- next_nia + OP_BC => "110", + OP_BCREG => "110", OP_ADDG6S => "111", -- misc_result OP_ISEL => "111", OP_DARN => "111", @@ -271,6 +261,12 @@ architecture behaviour of decode2 is OP_MFMSR => "100", OP_MFCR => "101", OP_SETB => "110", + OP_CMP => "000", -- cr_result + OP_CMPRB => "001", + OP_CMPEQB => "010", + OP_CROP => "011", + OP_MCRXRX => "100", + OP_MTCRF => "101", others => "000" ); @@ -282,31 +278,29 @@ architecture behaviour of decode2 is signal gpr_write_valid : std_ulogic; signal gpr_write : gspr_index_t; - signal gpr_bypassable : std_ulogic; - - signal update_gpr_write_valid : std_ulogic; - signal update_gpr_write_reg : gspr_index_t; signal gpr_a_read_valid : std_ulogic; - signal gpr_a_read :gspr_index_t; - signal gpr_a_bypass : std_ulogic; + signal gpr_a_read : gspr_index_t; + signal gpr_a_bypass : std_ulogic; signal gpr_b_read_valid : std_ulogic; - signal gpr_b_read : gspr_index_t; - signal gpr_b_bypass : std_ulogic; + signal gpr_b_read : gspr_index_t; + signal gpr_b_bypass : std_ulogic; signal gpr_c_read_valid : std_ulogic; - signal gpr_c_read : gspr_index_t; - signal gpr_c_bypass : std_ulogic; + signal gpr_c_read : gspr_index_t; + signal gpr_c_bypass : std_ulogic; + signal cr_read_valid : std_ulogic; signal cr_write_valid : std_ulogic; signal cr_bypass : std_ulogic; - signal cr_bypass_avail : std_ulogic; + + signal instr_tag : instr_tag_t; begin control_0: entity work.control generic map ( - PIPELINE_DEPTH => 1 + EX1_BYPASS => EX1_BYPASS ) port map ( clk => clk, @@ -323,10 +317,6 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write, - gpr_bypassable => gpr_bypassable, - - update_gpr_write_valid => update_gpr_write_valid, - update_gpr_write_reg => update_gpr_write_reg, gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -337,10 +327,12 @@ begin gpr_c_read_valid_in => gpr_c_read_valid, gpr_c_read_in => gpr_c_read, - cr_read_in => d_in.decode.input_cr, + execute_next_tag => execute_bypass.tag, + execute_next_cr_tag => execute_cr_bypass.tag, + + cr_read_in => cr_read_valid, cr_write_in => cr_write_valid, cr_bypass => cr_bypass, - cr_bypassable => cr_bypass_avail, valid_out => control_valid_out, stall_out => control_stall_out, @@ -348,7 +340,9 @@ begin gpr_bypass_a => gpr_a_bypass, gpr_bypass_b => gpr_b_bypass, - gpr_bypass_c => gpr_c_bypass + gpr_bypass_c => gpr_c_bypass, + + instr_tag_out => instr_tag ); deferred <= r.e.valid and busy_in; @@ -376,6 +370,7 @@ begin variable decoded_reg_c : decode_input_reg_t; variable decoded_reg_o : decode_output_reg_t; variable length : std_ulogic_vector(3 downto 0); + variable op : insn_type_t; begin v := r; @@ -386,12 +381,35 @@ begin --v.e.input_cr := d_in.decode.input_cr; v.e.output_cr := d_in.decode.output_cr; - + + -- Work out whether XER common bits are set + v.e.output_xer := d_in.decode.output_carry; + case d_in.decode.insn_type is + when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE => + -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only + if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then + v.e.oe := '1'; + v.e.output_xer := '1'; + end if; + when OP_MTSPR => + if decode_spr_num(d_in.insn) = SPR_XER then + v.e.output_xer := '1'; + end if; + when others => + end case; + decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1, d_in.nia); decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); - decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1); + decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro); + + if d_in.decode.lr = '1' then + v.e.lr := insn_lk(d_in.insn); + -- b and bc have even major opcodes; bcreg is considered absolute + v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26); + end if; + op := d_in.decode.insn_type; if d_in.decode.repeat /= NONE then v.e.repeat := '1'; @@ -407,8 +425,19 @@ begin if r.repeat = d_in.big_endian then decoded_reg_o.reg(0) := '1'; end if; + when DUPD => + -- update-form loads, 2nd instruction writes RA + if r.repeat = '1' then + decoded_reg_o.reg := decoded_reg_a.reg; + end if; when others => end case; + elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then + -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled + v.e.repeat := '1'; + v.e.second := r.repeat; + -- first one does CTR, second does LR + decoded_reg_o.reg(0) := not r.repeat; end if; r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid; @@ -435,43 +464,21 @@ begin v.e.nia := d_in.nia; v.e.unit := d_in.decode.unit; v.e.fac := d_in.decode.facility; - v.e.insn_type := d_in.decode.insn_type; + v.e.instr_tag := instr_tag; v.e.read_reg1 := decoded_reg_a.reg; - v.e.read_data1 := decoded_reg_a.data; - v.e.bypass_data1 := gpr_a_bypass; v.e.read_reg2 := decoded_reg_b.reg; - v.e.read_data2 := decoded_reg_b.data; - v.e.bypass_data2 := gpr_b_bypass; - v.e.read_data3 := decoded_reg_c.data; - v.e.bypass_data3 := gpr_c_bypass; v.e.write_reg := decoded_reg_o.reg; v.e.write_reg_enable := decoded_reg_o.reg_valid; v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); - if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then - v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); - end if; - v.e.cr := c_in.read_cr_data; - v.e.bypass_cr := cr_bypass; v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.addm1 := '0'; - if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then - -- add -1 to CTR - v.e.addm1 := '1'; - if d_in.insn(23) = '1' or - (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then - -- don't write decremented CTR if BO(2) = 1 or bcctr - v.e.write_reg_enable := '0'; - end if; - end if; + v.e.insn_type := op; v.e.invert_out := d_in.decode.invert_out; v.e.input_carry := d_in.decode.input_carry; v.e.output_carry := d_in.decode.output_carry; v.e.is_32bit := d_in.decode.is_32bit; v.e.is_signed := d_in.decode.is_signed; - if d_in.decode.lr = '1' then - v.e.lr := insn_lk(d_in.insn); - end if; v.e.insn := d_in.insn; v.e.data_len := length; v.e.byte_reverse := d_in.decode.byte_reverse; @@ -479,8 +486,41 @@ begin v.e.update := d_in.decode.update; v.e.reserve := d_in.decode.reserve; v.e.br_pred := d_in.br_pred; - v.e.result_sel := result_select(d_in.decode.insn_type); - v.e.sub_select := subresult_select(d_in.decode.insn_type); + v.e.result_sel := result_select(op); + v.e.sub_select := subresult_select(op); + if op = OP_BC or op = OP_BCREG then + if d_in.insn(23) = '0' and r.repeat = '0' and + not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then + -- decrement CTR if BO(2) = 0 and not bcctr + v.e.addm1 := '1'; + v.e.result_sel := "000"; -- select adder output + end if; + end if; + + -- See if any of the operands can get their value via the bypass path. + case gpr_a_bypass is + when '1' => + v.e.read_data1 := execute_bypass.data; + when others => + v.e.read_data1 := decoded_reg_a.data; + end case; + case gpr_b_bypass is + when '1' => + v.e.read_data2 := execute_bypass.data; + when others => + v.e.read_data2 := decoded_reg_b.data; + end case; + case gpr_c_bypass is + when '1' => + v.e.read_data3 := execute_bypass.data; + when others => + v.e.read_data3 := decoded_reg_c.data; + end case; + + v.e.cr := c_in.read_cr_data; + if cr_bypass = '1' then + v.e.cr := execute_cr_bypass.data; + end if; -- issue control control_valid_in <= d_in.valid; @@ -488,17 +528,6 @@ begin gpr_write_valid <= v.e.write_reg_enable; gpr_write <= decoded_reg_o.reg; - gpr_bypassable <= '0'; - if EX1_BYPASS and d_in.decode.unit = ALU then - gpr_bypassable <= '1'; - end if; - update_gpr_write_valid <= d_in.decode.update; - update_gpr_write_reg <= decoded_reg_a.reg; - if v.e.lr = '1' then - -- there are no instructions that have both update=1 and lr=1 - update_gpr_write_valid <= '1'; - update_gpr_write_reg <= fast_spr_num(SPR_LR); - end if; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; @@ -510,10 +539,9 @@ begin gpr_c_read <= decoded_reg_c.reg; cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); - cr_bypass_avail <= '0'; - if EX1_BYPASS and d_in.decode.unit = ALU then - cr_bypass_avail <= d_in.decode.output_cr; - end if; + -- Since ops that write CR only write some of the fields, + -- any op that writes CR effectively also reads it. + cr_read_valid <= cr_write_valid or d_in.decode.input_cr; v.e.valid := control_valid_out; if control_valid_out = '1' then @@ -544,9 +572,9 @@ begin r.e.valid & stopped_out & stall_out & - r.e.bypass_data3 & - r.e.bypass_data2 & - r.e.bypass_data1; + gpr_a_bypass & + gpr_b_bypass & + gpr_c_bypass; end if; end process; log_out <= log_data; diff --git a/decode_types.vhdl b/decode_types.vhdl index 02790a6..885cc91 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -54,7 +54,8 @@ package decode_types is type repeat_t is (NONE, -- instruction is not repeated DRSE, -- double RS, endian twist - DRTE); -- double RT, endian twist + DRTE, -- double RT, endian twist + DUPD); -- update-form load type decode_rom_t is record unit : unit_t; diff --git a/execute1.vhdl b/execute1.vhdl index 25b1dc7..c0434a0 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -22,7 +22,7 @@ entity execute1 is rst : in std_ulogic; -- asynchronous - flush_out : out std_ulogic; + flush_in : in std_ulogic; busy_out : out std_ulogic; e_in : in Decode2ToExecute1Type; @@ -30,13 +30,15 @@ entity execute1 is fp_in : in FPUToExecute1Type; ext_irq_in : std_ulogic; + interrupt_in : std_ulogic; -- asynchronous l_out : out Execute1ToLoadstore1Type; - f_out : out Execute1ToFetch1Type; fp_out : out Execute1ToFPUType; e_out : out Execute1ToWritebackType; + bypass_data : out bypass_data_t; + bypass_cr_data : out cr_bypass_data_t; dbg_msr_out : out std_ulogic_vector(63 downto 0); @@ -59,42 +61,30 @@ architecture behaviour of execute1 is fp_exception_next : std_ulogic; trace_next : std_ulogic; prev_op : insn_type_t; - lr_update : std_ulogic; - next_lr : std_ulogic_vector(63 downto 0); + br_taken : std_ulogic; mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; cntz_in_progress : std_ulogic; - last_nia : std_ulogic_vector(63 downto 0); - redirect : std_ulogic; - abs_br : std_ulogic; - taken_br : std_ulogic; - br_last : std_ulogic; - do_intr : std_ulogic; - vector : integer range 0 to 16#fff#; - br_offset : std_ulogic_vector(63 downto 0); - redir_mode : std_ulogic_vector(3 downto 0); log_addr_spr : std_ulogic_vector(31 downto 0); end record; constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, cur_instr => Decode2ToExecute1Init, - busy => '0', lr_update => '0', terminate => '0', - fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, + busy => '0', terminate => '0', + fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', - next_lr => (others => '0'), last_nia => (others => '0'), - redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0, - br_offset => (others => '0'), redir_mode => "0000", others => (others => '0')); signal r, rin : reg_type; signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); signal cr_in : std_ulogic_vector(31 downto 0); + signal xerc_in : xer_common_t; signal valid_in : std_ulogic; - signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); - signal ctrl_tmp: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); + signal ctrl: ctrl_t := (others => (others => '0')); + signal ctrl_tmp: ctrl_t := (others => (others => '0')); signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; signal rot_sign_ext: std_ulogic; signal rotator_result: std_ulogic_vector(63 downto 0); @@ -108,8 +98,19 @@ architecture behaviour of execute1 is signal spr_result: std_ulogic_vector(63 downto 0); signal result_mux_sel: std_ulogic_vector(2 downto 0); signal sub_mux_sel: std_ulogic_vector(2 downto 0); + signal next_nia : std_ulogic_vector(63 downto 0); signal current: Decode2ToExecute1Type; + signal carry_32 : std_ulogic; + signal carry_64 : std_ulogic; + signal overflow_32 : std_ulogic; + signal overflow_64 : std_ulogic; + + signal trapval : std_ulogic_vector(4 downto 0); + + signal write_cr_mask : std_ulogic_vector(7 downto 0); + signal write_cr_data : std_ulogic_vector(31 downto 0); + -- multiply signals signal x_to_multiply: MultiplyInputType; signal multiply_to_x: MultiplyOutputType; @@ -156,7 +157,6 @@ architecture behaviour of execute1 is begin e.xerc.ca32 := carry32; e.xerc.ca := carry; - e.write_xerc_enable := '1'; end; procedure set_ov(e: inout Execute1ToWritebackType; @@ -168,7 +168,6 @@ architecture behaviour of execute1 is if ov = '1' then e.xerc.so := '1'; end if; - e.write_xerc_enable := '1'; end; function calc_ov(msb_a : std_ulogic; msb_b: std_ulogic; @@ -282,12 +281,23 @@ begin dbg_msr_out <= ctrl.msr; log_rd_addr <= r.log_addr_spr; - a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1; - b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; - c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3; + a_in <= e_in.read_data1; + b_in <= e_in.read_data2; + c_in <= e_in.read_data3; + cr_in <= e_in.cr; + + -- XER forwarding. To avoid having to track XER hazards, we use + -- the previously latched value. Since the XER common bits + -- (SO, OV[32] and CA[32]) are only modified by instructions that are + -- handled here, we can just forward the result being sent to + -- writeback. + xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc; - busy_out <= l_in.busy or r.busy or fp_in.busy; - valid_in <= e_in.valid and not busy_out; + with e_in.unit select busy_out <= + l_in.busy or r.busy or fp_in.busy when LDST, + l_in.busy or l_in.in_progress or r.busy or fp_in.busy when others; + + valid_in <= e_in.valid and not busy_out and not flush_in; terminate_out <= r.terminate; @@ -301,6 +311,7 @@ begin muldiv_result when "011", countzero_result when "100", spr_result when "101", + next_nia when "110", misc_result when others; execute1_0: process(clk) @@ -311,125 +322,50 @@ begin ctrl.tb <= (others => '0'); ctrl.dec <= (others => '0'); ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0'); - ctrl.irq_state <= WRITE_SRR0; else r <= rin; ctrl <= ctrl_tmp; - assert not (r.lr_update = '1' and valid_in = '1') - report "LR update collision with valid in EX1" - severity failure; - if r.lr_update = '1' then - report "LR update to " & to_hstring(r.next_lr); + if valid_in = '1' then + report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & + " wr=" & to_hstring(rin.e.write_reg) & " we=" & std_ulogic'image(rin.e.write_enable) & + " tag=" & integer'image(rin.e.instr_tag.tag) & std_ulogic'image(rin.e.instr_tag.valid); end if; end if; end if; end process; - execute1_1: process(all) - variable v : reg_type; + -- Data path for integer instructions + execute1_dp: process(all) variable a_inv : std_ulogic_vector(63 downto 0); variable b_or_m1 : std_ulogic_vector(63 downto 0); + variable sum_with_carry : std_ulogic_vector(64 downto 0); + variable sign1, sign2 : std_ulogic; + variable abs1, abs2 : signed(63 downto 0); + variable addend : std_ulogic_vector(127 downto 0); variable addg6s : std_ulogic_vector(63 downto 0); + variable crbit : integer range 0 to 31; variable isel_result : std_ulogic_vector(63 downto 0); variable darn : std_ulogic_vector(63 downto 0); - variable mfcr_result : std_ulogic_vector(63 downto 0); variable setb_result : std_ulogic_vector(63 downto 0); - variable newcrf : std_ulogic_vector(3 downto 0); - variable sum_with_carry : std_ulogic_vector(64 downto 0); - variable crnum : crnum_t; - variable crbit : integer range 0 to 31; - variable scrnum : crnum_t; + variable mfcr_result : std_ulogic_vector(63 downto 0); variable lo, hi : integer; - variable sh, mb, me : std_ulogic_vector(5 downto 0); - variable sh32, mb32, me32 : std_ulogic_vector(4 downto 0); - variable bo, bi : std_ulogic_vector(4 downto 0); - variable bf, bfa : std_ulogic_vector(2 downto 0); - variable cr_op : std_ulogic_vector(9 downto 0); - variable cr_operands : std_ulogic_vector(1 downto 0); - variable bt, ba, bb : std_ulogic_vector(4 downto 0); - variable btnum, banum, bbnum : integer range 0 to 31; - variable crresult : std_ulogic; variable l : std_ulogic; - variable next_nia : std_ulogic_vector(63 downto 0); - variable carry_32, carry_64 : std_ulogic; - variable sign1, sign2 : std_ulogic; - variable abs1, abs2 : signed(63 downto 0); - variable overflow : std_ulogic; variable zerohi, zerolo : std_ulogic; variable msb_a, msb_b : std_ulogic; variable a_lt : std_ulogic; variable a_lt_lo : std_ulogic; variable a_lt_hi : std_ulogic; - variable lv : Execute1ToLoadstore1Type; - variable irq_valid : std_ulogic; - variable exception : std_ulogic; - variable exception_nextpc : std_ulogic; - variable trapval : std_ulogic_vector(4 downto 0); - variable illegal : std_ulogic; - variable is_branch : std_ulogic; - variable is_direct_branch : std_ulogic; - variable taken_branch : std_ulogic; - variable abs_branch : std_ulogic; - variable spr_val : std_ulogic_vector(63 downto 0); - variable addend : std_ulogic_vector(127 downto 0); - variable do_trace : std_ulogic; - variable hold_wr_data : std_ulogic; - variable f : Execute1ToFetch1Type; - variable fv : Execute1ToFPUType; + variable newcrf : std_ulogic_vector(3 downto 0); + variable bf, bfa : std_ulogic_vector(2 downto 0); + variable crnum : crnum_t; + variable scrnum : crnum_t; + variable cr_operands : std_ulogic_vector(1 downto 0); + variable crresult : std_ulogic; + variable bt, ba, bb : std_ulogic_vector(4 downto 0); + variable btnum : integer range 0 to 3; + variable banum, bbnum : integer range 0 to 31; + variable j : integer; begin - sum_with_carry := (others => '0'); - newcrf := (others => '0'); - is_branch := '0'; - is_direct_branch := '0'; - taken_branch := '0'; - abs_branch := '0'; - hold_wr_data := '0'; - - v := r; - v.e := Execute1ToWritebackInit; - v.redirect := '0'; - v.abs_br := '0'; - v.do_intr := '0'; - v.vector := 0; - v.br_offset := (others => '0'); - v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & - not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); - v.taken_br := '0'; - v.br_last := '0'; - - lv := Execute1ToLoadstore1Init; - fv := Execute1ToFPUInit; - - -- XER forwarding. To avoid having to track XER hazards, we use - -- the previously latched value. Since the XER common bits - -- (SO, OV[32] and CA[32]) are only modified by instructions that are - -- handled here, we can just forward the result being sent to - -- writeback. - if r.e.write_xerc_enable = '1' or r.busy = '1' then - v.e.xerc := r.e.xerc; - else - v.e.xerc := e_in.xerc; - end if; - - -- CR forwarding - cr_in <= e_in.cr; - if EX1_BYPASS and e_in.bypass_cr = '1' and r.e.write_cr_enable = '1' then - for i in 0 to 7 loop - if r.e.write_cr_mask(i) = '1' then - cr_in(i * 4 + 3 downto i * 4) <= r.e.write_cr_data(i * 4 + 3 downto i * 4); - end if; - end loop; - end if; - - v.lr_update := '0'; - v.mul_in_progress := '0'; - v.div_in_progress := '0'; - v.cntz_in_progress := '0'; - v.mul_finish := '0'; - - spr_result <= (others => '0'); - spr_val := (others => '0'); - -- Main adder if e_in.invert_a = '0' then a_inv := a_in; @@ -442,10 +378,12 @@ begin b_or_m1 := (others => '1'); end if; sum_with_carry := ppc_adde(a_inv, b_or_m1, - decode_input_carry(e_in.input_carry, v.e.xerc)); + decode_input_carry(e_in.input_carry, xerc_in)); adder_result <= sum_with_carry(63 downto 0); - carry_32 := sum_with_carry(32) xor a_inv(32) xor b_in(32); - carry_64 := sum_with_carry(64); + carry_32 <= sum_with_carry(32) xor a_inv(32) xor b_in(32); + carry_64 <= sum_with_carry(64); + overflow_32 <= calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31)); + overflow_64 <= calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)); -- signals to multiply and divide units sign1 := '0'; @@ -472,12 +410,10 @@ begin end if; -- Interface to multiply and divide units - x_to_multiply <= MultiplyInputInit; - x_to_multiply.is_32bit <= e_in.is_32bit; - - x_to_divider <= Execute1ToDividerInit; x_to_divider.is_signed <= e_in.is_signed; x_to_divider.is_32bit <= e_in.is_32bit; + x_to_divider.is_extended <= '0'; + x_to_divider.is_modulus <= '0'; if e_in.insn_type = OP_MOD then x_to_divider.is_modulus <= '1'; end if; @@ -494,6 +430,7 @@ begin addend := not addend; end if; + x_to_multiply.is_32bit <= e_in.is_32bit; x_to_multiply.not_result <= sign1 xor sign2; x_to_multiply.addend <= addend; x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus); @@ -618,7 +555,7 @@ begin zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32))); if zerolo = '1' and (l = '0' or zerohi = '1') then -- values are equal - trapval := "00100"; + trapval <= "00100"; else a_lt_lo := '0'; a_lt_hi := '0'; @@ -642,14 +579,124 @@ begin if msb_a /= msb_b then -- Comparison is clear from MSB difference. -- for signed, 0 is greater; for unsigned, 1 is greater - trapval := msb_a & msb_b & '0' & msb_b & msb_a; + trapval <= msb_a & msb_b & '0' & msb_b & msb_a; else -- MSBs are equal, so signed and unsigned comparisons give the -- same answer. - trapval := a_lt & not a_lt & '0' & a_lt & not a_lt; + trapval <= a_lt & not a_lt & '0' & a_lt & not a_lt; end if; end if; + -- CR result mux + bf := insn_bf(e_in.insn); + crnum := to_integer(unsigned(bf)); + newcrf := (others => '0'); + case current.sub_select is + when "000" => + -- CMP and CMPL instructions + if e_in.is_signed = '1' then + newcrf := trapval(4 downto 2) & xerc_in.so; + else + newcrf := trapval(1 downto 0) & trapval(2) & xerc_in.so; + end if; + when "001" => + newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); + when "010" => + newcrf := ppc_cmpeqb(a_in, b_in); + when "011" => + if current.insn(1) = '1' then + -- CR logical instructions + j := (7 - crnum) * 4; + newcrf := cr_in(j + 3 downto j); + bt := insn_bt(e_in.insn); + ba := insn_ba(e_in.insn); + bb := insn_bb(e_in.insn); + btnum := 3 - to_integer(unsigned(bt(1 downto 0))); + banum := 31 - to_integer(unsigned(ba)); + bbnum := 31 - to_integer(unsigned(bb)); + -- Bits 6-9 of the instruction word give the truth table + -- of the requested logical operation + cr_operands := cr_in(banum) & cr_in(bbnum); + crresult := e_in.insn(6 + to_integer(unsigned(cr_operands))); + for i in 0 to 3 loop + if i = btnum then + newcrf(i) := crresult; + end if; + end loop; + else + -- MCRF + bfa := insn_bfa(e_in.insn); + scrnum := to_integer(unsigned(bfa)); + j := (7 - scrnum) * 4; + newcrf := cr_in(j + 3 downto j); + end if; + when "100" => + -- MCRXRX + newcrf := xerc_in.ov & xerc_in.ca & xerc_in.ov32 & xerc_in.ca32; + when others => + end case; + if current.insn_type = OP_MTCRF then + if e_in.insn(20) = '0' then + -- mtcrf + write_cr_mask <= insn_fxm(e_in.insn); + else + -- mtocrf: We require one hot priority encoding here + crnum := fxm_to_num(insn_fxm(e_in.insn)); + write_cr_mask <= num_to_fxm(crnum); + end if; + write_cr_data <= c_in(31 downto 0); + else + write_cr_mask <= num_to_fxm(crnum); + write_cr_data <= newcrf & newcrf & newcrf & newcrf & + newcrf & newcrf & newcrf & newcrf; + end if; + + end process; + + execute1_1: process(all) + variable v : reg_type; + variable lo, hi : integer; + variable sh, mb, me : std_ulogic_vector(5 downto 0); + variable bo, bi : std_ulogic_vector(4 downto 0); + variable overflow : std_ulogic; + variable lv : Execute1ToLoadstore1Type; + variable irq_valid : std_ulogic; + variable exception : std_ulogic; + variable illegal : std_ulogic; + variable is_branch : std_ulogic; + variable is_direct_branch : std_ulogic; + variable taken_branch : std_ulogic; + variable abs_branch : std_ulogic; + variable spr_val : std_ulogic_vector(63 downto 0); + variable do_trace : std_ulogic; + variable hold_wr_data : std_ulogic; + variable fv : Execute1ToFPUType; + begin + is_branch := '0'; + is_direct_branch := '0'; + taken_branch := '0'; + abs_branch := '0'; + hold_wr_data := '0'; + + v := r; + v.e := Execute1ToWritebackInit; + v.e.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & + not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); + v.e.xerc := xerc_in; + + lv := Execute1ToLoadstore1Init; + fv := Execute1ToFPUInit; + + x_to_multiply.valid <= '0'; + x_to_divider.valid <= '0'; + v.mul_in_progress := '0'; + v.div_in_progress := '0'; + v.cntz_in_progress := '0'; + v.mul_finish := '0'; + + spr_result <= (others => '0'); + spr_val := (others => '0'); + ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); @@ -658,11 +705,11 @@ begin irq_valid := '0'; if ctrl.msr(MSR_EE) = '1' then if ctrl.dec(63) = '1' then - v.vector := 16#900#; + v.e.intr_vec := 16#900#; report "IRQ valid: DEC"; irq_valid := '1'; elsif ext_irq_in = '1' then - v.vector := 16#500#; + v.e.intr_vec := 16#500#; report "IRQ valid: External"; irq_valid := '1'; end if; @@ -673,7 +720,7 @@ begin v.busy := '0'; -- Next insn adder used in a couple of places - next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4); + next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); -- rotator control signals right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; @@ -681,21 +728,17 @@ begin rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; - ctrl_tmp.srr1 <= msr_copy(ctrl.msr); - ctrl_tmp.irq_state <= WRITE_SRR0; + v.e.srr1 := (others => '0'); exception := '0'; illegal := '0'; - exception_nextpc := '0'; - v.e.exc_write_enable := '0'; - v.e.exc_write_reg := fast_spr_num(SPR_SRR0); if valid_in = '1' then - v.e.exc_write_data := e_in.nia; - v.last_nia := e_in.nia; + v.e.last_nia := e_in.nia; else - v.e.exc_write_data := r.last_nia; + v.e.last_nia := r.e.last_nia; end if; v.e.mode_32bit := not ctrl.msr(MSR_SF); + v.e.instr_tag := current.instr_tag; do_trace := valid_in and ctrl.msr(MSR_SE); if valid_in = '1' then @@ -704,25 +747,25 @@ begin -- Determine if there is any exception to be taken -- before/instead of executing this instruction - if valid_in = '1' and e_in.second = '0' then + if valid_in = '1' and e_in.second = '0' and l_in.in_progress = '0' then if HAS_FPU and r.fp_exception_next = '1' then -- This is used for FP-type program interrupts that -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. exception := '1'; - v.vector := 16#700#; - ctrl_tmp.srr1(63 - 43) <= '1'; - ctrl_tmp.srr1(63 - 47) <= '1'; + v.e.intr_vec := 16#700#; + v.e.srr1(47 - 43) := '1'; + v.e.srr1(47 - 47) := '1'; elsif r.trace_next = '1' then -- Generate a trace interrupt rather than executing the next instruction -- or taking any asynchronous interrupt exception := '1'; - v.vector := 16#d00#; - ctrl_tmp.srr1(63 - 33) <= '1'; + v.e.intr_vec := 16#d00#; + v.e.srr1(47 - 33) := '1'; if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then - ctrl_tmp.srr1(63 - 35) <= '1'; + v.e.srr1(47 - 35) := '1'; elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then - ctrl_tmp.srr1(63 - 36) <= '1'; + v.e.srr1(47 - 36) := '1'; end if; elsif irq_valid = '1' then @@ -733,9 +776,9 @@ begin elsif ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then -- generate a program interrupt exception := '1'; - v.vector := 16#700#; + v.e.intr_vec := 16#700#; -- set bit 45 to indicate privileged instruction type interrupt - ctrl_tmp.srr1(63 - 45) <= '1'; + v.e.srr1(47 - 45) := '1'; report "privileged instruction"; elsif not HAS_FPU and e_in.fac = FPU then @@ -745,16 +788,13 @@ begin elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then -- generate a floating-point unavailable interrupt exception := '1'; - v.vector := 16#800#; + v.e.intr_vec := 16#800#; report "FP unavailable interrupt"; end if; end if; if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then - report "execute nia " & to_hstring(e_in.nia); - v.cur_instr := e_in; - v.next_lr := next_nia; v.e.valid := '1'; case_0: case e_in.insn_type is @@ -769,8 +809,8 @@ begin -- we need two cycles to write srr0 and 1 if e_in.insn(1) = '1' then exception := '1'; - exception_nextpc := '1'; - v.vector := 16#C00#; + v.e.intr_vec := 16#C00#; + v.e.last_nia := next_nia; report "sc"; else illegal := '1'; @@ -793,35 +833,17 @@ begin else v.e.xerc.ov := carry_64; v.e.xerc.ov32 := carry_32; - v.e.write_xerc_enable := '1'; end if; end if; if e_in.oe = '1' then - set_ov(v.e, - calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)), - calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31))); + set_ov(v.e, overflow_64, overflow_32); end if; when OP_CMP => - -- CMP and CMPL instructions - if e_in.is_signed = '1' then - newcrf := trapval(4 downto 2) & v.e.xerc.so; - else - newcrf := trapval(1 downto 0) & trapval(2) & v.e.xerc.so; - end if; - bf := insn_bf(e_in.insn); - crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; - v.e.write_cr_mask := num_to_fxm(crnum); - for i in 0 to 7 loop - lo := i*4; - hi := lo + 3; - v.e.write_cr_data(hi downto lo) := newcrf; - end loop; when OP_TRAP => -- trap instructions (tw, twi, td, tdi) - v.vector := 16#700#; + v.e.intr_vec := 16#700#; -- set bit 46 to say trap occurred - ctrl_tmp.srr1(63 - 46) <= '1'; + v.e.srr1(47 - 46) := '1'; if or (trapval and insn_to(e_in.insn)) = '1' then -- generate trap-type program interrupt exception := '1'; @@ -829,57 +851,47 @@ begin end if; when OP_ADDG6S => when OP_CMPRB => - newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); - bf := insn_bf(e_in.insn); - crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; - v.e.write_cr_mask := num_to_fxm(crnum); - v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & - newcrf & newcrf & newcrf & newcrf; when OP_CMPEQB => - newcrf := ppc_cmpeqb(a_in, b_in); - bf := insn_bf(e_in.insn); - crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; - v.e.write_cr_mask := num_to_fxm(crnum); - v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & - newcrf & newcrf & newcrf & newcrf; when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM | OP_BCD => + when OP_B => is_branch := '1'; taken_branch := '1'; is_direct_branch := '1'; - abs_branch := insn_aa(e_in.insn); + abs_branch := e_in.br_abs; if ctrl.msr(MSR_BE) = '1' then do_trace := '1'; end if; - when OP_BC => - -- read_data1 is CTR + when OP_BC | OP_BCREG => + -- read_data1 is CTR + -- for OP_BCREG, read_data2 is target register (CTR, LR or TAR) + -- If this instruction updates both CTR and LR, then it is + -- doubled; the first instruction decrements CTR and determines + -- whether the branch is taken, and the second does the + -- redirect and the LR update. bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - is_branch := '1'; - is_direct_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - abs_branch := insn_aa(e_in.insn); - if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + if e_in.second = '0' then + taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); + else + taken_branch := r.br_taken; end if; - when OP_BCREG => - -- read_data1 is CTR - -- read_data2 is target register (CTR, LR or TAR) - bo := insn_bo(e_in.insn); - bi := insn_bi(e_in.insn); - is_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - abs_branch := '1'; - if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + v.br_taken := taken_branch; + abs_branch := e_in.br_abs; + if e_in.repeat = '0' or e_in.second = '1' then + is_branch := '1'; + if e_in.insn_type = OP_BC then + is_direct_branch := '1'; + end if; + if ctrl.msr(MSR_BE) = '1' then + do_trace := '1'; + end if; end if; when OP_RFID => - v.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not a_in(MSR_PR) & - not a_in(MSR_LE) & not a_in(MSR_SF); + v.e.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not a_in(MSR_PR) & + not a_in(MSR_LE) & not a_in(MSR_SF); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31); @@ -905,57 +917,8 @@ begin v.cntz_in_progress := '1'; v.busy := '1'; when OP_ISEL => - when OP_CROP => - cr_op := insn_cr(e_in.insn); - report "CR OP " & to_hstring(cr_op); - if cr_op(0) = '0' then -- MCRF - bf := insn_bf(e_in.insn); - bfa := insn_bfa(e_in.insn); - v.e.write_cr_enable := '1'; - crnum := to_integer(unsigned(bf)); - scrnum := to_integer(unsigned(bfa)); - v.e.write_cr_mask := num_to_fxm(crnum); - for i in 0 to 7 loop - lo := (7-i)*4; - hi := lo + 3; - if i = scrnum then - newcrf := cr_in(hi downto lo); - end if; - end loop; - for i in 0 to 7 loop - lo := i*4; - hi := lo + 3; - v.e.write_cr_data(hi downto lo) := newcrf; - end loop; - else - v.e.write_cr_enable := '1'; - bt := insn_bt(e_in.insn); - ba := insn_ba(e_in.insn); - bb := insn_bb(e_in.insn); - btnum := 31 - to_integer(unsigned(bt)); - banum := 31 - to_integer(unsigned(ba)); - bbnum := 31 - to_integer(unsigned(bb)); - -- Bits 5-8 of cr_op give the truth table of the requested - -- logical operation - cr_operands := cr_in(banum) & cr_in(bbnum); - crresult := cr_op(5 + to_integer(unsigned(cr_operands))); - v.e.write_cr_mask := num_to_fxm((31-btnum) / 4); - for i in 0 to 31 loop - if i = btnum then - v.e.write_cr_data(i) := crresult; - else - v.e.write_cr_data(i) := cr_in(i); - end if; - end loop; - end if; + when OP_CROP => when OP_MCRXRX => - newcrf := v.e.xerc.ov & v.e.xerc.ca & v.e.xerc.ov32 & v.e.xerc.ca32; - bf := insn_bf(e_in.insn); - crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; - v.e.write_cr_mask := num_to_fxm(crnum); - v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & - newcrf & newcrf & newcrf & newcrf; when OP_DARN => when OP_MFMSR => when OP_MFSPR => @@ -966,12 +929,12 @@ begin if decode_spr_num(e_in.insn) = SPR_XER then -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer spr_val(63 downto 32) := (others => '0'); - spr_val(63-32) := v.e.xerc.so; - spr_val(63-33) := v.e.xerc.ov; - spr_val(63-34) := v.e.xerc.ca; + spr_val(63-32) := xerc_in.so; + spr_val(63-33) := xerc_in.ov; + spr_val(63-34) := xerc_in.ca; spr_val(63-35 downto 63-43) := "000000000"; - spr_val(63-44) := v.e.xerc.ov32; - spr_val(63-45) := v.e.xerc.ca32; + spr_val(63-44) := xerc_in.ov32; + spr_val(63-45) := xerc_in.ca32; end if; else spr_val := c_in; @@ -1005,16 +968,6 @@ begin when OP_MFCR => when OP_MTCRF => - v.e.write_cr_enable := '1'; - if e_in.insn(20) = '0' then - -- mtcrf - v.e.write_cr_mask := insn_fxm(e_in.insn); - else - -- mtocrf: We require one hot priority encoding here - crnum := fxm_to_num(insn_fxm(e_in.insn)); - v.e.write_cr_mask := num_to_fxm(crnum); - end if; - v.e.write_cr_data := c_in(31 downto 0); when OP_MTMSRD => if e_in.insn(16) = '1' then -- just update EE and RI @@ -1049,7 +1002,6 @@ begin v.e.xerc.ca := c_in(63-34); v.e.xerc.ov32 := c_in(63-44); v.e.xerc.ca32 := c_in(63-45); - v.e.write_xerc_enable := '1'; end if; else -- slow spr @@ -1073,8 +1025,8 @@ begin when OP_SETB => when OP_ISYNC => - v.redirect := '1'; - v.br_offset := std_ulogic_vector(to_unsigned(4, 64)); + v.e.redirect := '1'; + v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64)); when OP_ICBI => icache_inval <= '1'; @@ -1102,16 +1054,16 @@ begin ctrl_tmp.cfar <= e_in.nia; end if; if taken_branch = '1' then - v.br_offset := b_in; - v.abs_br := abs_branch; + v.e.br_offset := b_in; + v.e.abs_br := abs_branch; else - v.br_offset := std_ulogic_vector(to_unsigned(4, 64)); + v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64)); end if; if taken_branch /= e_in.br_pred then - v.redirect := '1'; + v.e.redirect := '1'; end if; - v.br_last := is_direct_branch; - v.taken_br := taken_branch; + v.e.br_last := is_direct_branch; + v.e.br_taken := taken_branch; end if; elsif valid_in = '1' and exception = '0' and illegal = '0' then @@ -1132,28 +1084,7 @@ begin -- The following cases all occur when r.busy = 1 and therefore -- valid_in = 0. Hence they don't happen in the same cycle as any of -- the cases above which depend on valid_in = 1. - - if ctrl.irq_state = WRITE_SRR1 then - v.e.exc_write_reg := fast_spr_num(SPR_SRR1); - v.e.exc_write_data := ctrl.srr1; - v.e.exc_write_enable := '1'; - ctrl_tmp.msr(MSR_SF) <= '1'; - ctrl_tmp.msr(MSR_EE) <= '0'; - ctrl_tmp.msr(MSR_PR) <= '0'; - ctrl_tmp.msr(MSR_SE) <= '0'; - ctrl_tmp.msr(MSR_BE) <= '0'; - ctrl_tmp.msr(MSR_FP) <= '0'; - ctrl_tmp.msr(MSR_FE0) <= '0'; - ctrl_tmp.msr(MSR_FE1) <= '0'; - ctrl_tmp.msr(MSR_IR) <= '0'; - ctrl_tmp.msr(MSR_DR) <= '0'; - ctrl_tmp.msr(MSR_RI) <= '0'; - ctrl_tmp.msr(MSR_LE) <= '1'; - v.trace_next := '0'; - v.fp_exception_next := '0'; - report "Writing SRR1: " & to_hstring(ctrl.srr1); - - elsif r.cntz_in_progress = '1' then + if r.cntz_in_progress = '1' then -- cnt[lt]z always takes two cycles v.e.valid := '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then @@ -1169,7 +1100,6 @@ begin v.mul_finish := '1'; v.busy := '1'; else - v.e.write_xerc_enable := current.oe; -- We must test oe because the RC update code in writeback -- will use the xerc value to set CR0:SO so we must not clobber -- xerc if OE wasn't set. @@ -1189,7 +1119,6 @@ begin end if; elsif r.mul_finish = '1' then hold_wr_data := '1'; - v.e.write_xerc_enable := current.oe; v.e.xerc.ov := multiply_to_x.overflow; v.e.xerc.ov32 := multiply_to_x.overflow; if multiply_to_x.overflow = '1' then @@ -1197,74 +1126,39 @@ begin end if; v.e.valid := '1'; end if; - -- When doing delayed LR update, keep r.e.write_data unchanged - -- next cycle in case it is needed for a forwarded result (e.g. CTR). - if r.lr_update = '1' then - hold_wr_data := '1'; - end if; - - -- Generate FP-type program interrupt. fp_in.interrupt will only - -- be set during the execution of a FP instruction. - -- The case where MSR[FE0,FE1] goes from zero to non-zero is - -- handled above by mtmsrd and rfid setting v.fp_exception_next. - if HAS_FPU and fp_in.interrupt = '1' then - v.vector := 16#700#; - ctrl_tmp.srr1(63 - 43) <= '1'; - exception := '1'; - end if; - if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then + if illegal = '1' then exception := '1'; - v.vector := 16#700#; + v.e.intr_vec := 16#700#; -- Since we aren't doing Hypervisor emulation assist (0xe40) we -- set bit 44 to indicate we have an illegal - ctrl_tmp.srr1(63 - 44) <= '1'; + v.e.srr1(47 - 44) := '1'; report "illegal"; end if; - if exception = '1' then - v.e.exc_write_enable := '1'; - if exception_nextpc = '1' then - v.e.exc_write_data := next_nia; - end if; - end if; - -- generate DSI or DSegI for load/store exceptions - -- or ISI or ISegI for instruction fetch exceptions - if l_in.exception = '1' then - if l_in.alignment = '1' then - v.vector := 16#600#; - elsif l_in.instr_fault = '0' then - if l_in.segment_fault = '0' then - v.vector := 16#300#; - else - v.vector := 16#380#; - end if; - else - if l_in.segment_fault = '0' then - ctrl_tmp.srr1(63 - 33) <= l_in.invalid; - ctrl_tmp.srr1(63 - 35) <= l_in.perm_error; -- noexec fault - ctrl_tmp.srr1(63 - 44) <= l_in.badtree; - ctrl_tmp.srr1(63 - 45) <= l_in.rc_error; - v.vector := 16#400#; - else - v.vector := 16#480#; - end if; - end if; - v.e.exc_write_enable := '1'; - v.e.exc_write_reg := fast_spr_num(SPR_SRR0); - report "ldst exception writing srr0=" & to_hstring(r.last_nia); - end if; - - if exception = '1' or l_in.exception = '1' then - ctrl_tmp.irq_state <= WRITE_SRR1; - v.redirect := '1'; - v.do_intr := '1'; - end if; + v.e.interrupt := exception; if do_trace = '1' then v.trace_next := '1'; end if; + if interrupt_in = '1' then + ctrl_tmp.msr(MSR_SF) <= '1'; + ctrl_tmp.msr(MSR_EE) <= '0'; + ctrl_tmp.msr(MSR_PR) <= '0'; + ctrl_tmp.msr(MSR_SE) <= '0'; + ctrl_tmp.msr(MSR_BE) <= '0'; + ctrl_tmp.msr(MSR_FP) <= '0'; + ctrl_tmp.msr(MSR_FE0) <= '0'; + ctrl_tmp.msr(MSR_FE1) <= '0'; + ctrl_tmp.msr(MSR_IR) <= '0'; + ctrl_tmp.msr(MSR_DR) <= '0'; + ctrl_tmp.msr(MSR_RI) <= '0'; + ctrl_tmp.msr(MSR_LE) <= '1'; + v.trace_next := '0'; + v.fp_exception_next := '0'; + end if; + if hold_wr_data = '0' then v.e.write_data := alu_result; else @@ -1273,69 +1167,29 @@ begin v.e.write_reg := current.write_reg; v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; v.e.rc := current.rc and v.e.valid and not exception; - - -- Update LR on the next cycle after a branch link - -- If we're not writing back anything else, we can write back LR - -- this cycle, otherwise we take an extra cycle. We use the - -- exc_write path since next_nia is written through that path - -- in other places. - if v.e.valid = '1' and exception = '0' and current.lr = '1' then - if current.write_reg_enable = '0' then - v.e.exc_write_enable := '1'; - v.e.exc_write_data := next_nia; - v.e.exc_write_reg := fast_spr_num(SPR_LR); + v.e.write_cr_data := write_cr_data; + v.e.write_cr_mask := write_cr_mask; + v.e.write_cr_enable := current.output_cr and v.e.valid and not exception; + v.e.write_xerc_enable := current.output_xer and v.e.valid and not exception; + + bypass_data.tag.valid <= current.instr_tag.valid and current.write_reg_enable and v.e.valid; + bypass_data.tag.tag <= current.instr_tag.tag; + bypass_data.data <= v.e.write_data; + + bypass_cr_data.tag.valid <= current.instr_tag.valid and current.output_cr and v.e.valid; + bypass_cr_data.tag.tag <= current.instr_tag.tag; + for i in 0 to 7 loop + if v.e.write_cr_mask(i) = '1' then + bypass_cr_data.data(i*4 + 3 downto i*4) <= v.e.write_cr_data(i*4 + 3 downto i*4); else - v.lr_update := '1'; - v.e.valid := '0'; - report "Delayed LR update to " & to_hstring(next_nia); - v.busy := '1'; - end if; - end if; - if r.lr_update = '1' then - v.e.exc_write_enable := '1'; - v.e.exc_write_data := r.next_lr; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - v.e.valid := '1'; - end if; - - -- Defer completion for one cycle when redirecting. - -- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1 - if v.redirect = '1' then - v.busy := '1'; - v.e.valid := '0'; - end if; - if r.redirect = '1' then - v.e.valid := '1'; - end if; - - -- Outputs to fetch1 - f.redirect := r.redirect; - f.br_nia := r.last_nia; - f.br_last := r.br_last and not r.do_intr; - f.br_taken := r.taken_br; - if r.do_intr = '1' then - f.redirect_nia := std_ulogic_vector(to_unsigned(r.vector, 64)); - f.virt_mode := '0'; - f.priv_mode := '1'; - -- XXX need an interrupt LE bit here, e.g. from LPCR - f.big_endian := '0'; - f.mode_32bit := '0'; - else - if r.abs_br = '1' then - f.redirect_nia := r.br_offset; - else - f.redirect_nia := std_ulogic_vector(unsigned(r.last_nia) + unsigned(r.br_offset)); + bypass_cr_data.data(i*4 + 3 downto i*4) <= cr_in(i*4 + 3 downto i*4); end if; - -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 - f.virt_mode := r.redir_mode(3); - f.priv_mode := r.redir_mode(2); - f.big_endian := r.redir_mode(1); - f.mode_32bit := r.redir_mode(0); - end if; + end loop; -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; lv.nia := e_in.nia; + lv.instr_tag := e_in.instr_tag; lv.addr1 := a_in; lv.addr2 := b_in; lv.data := c_in; @@ -1344,8 +1198,7 @@ begin lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE); lv.sign_extend := e_in.sign_extend; lv.update := e_in.update; - lv.update_reg := gspr_to_gpr(e_in.read_reg1); - lv.xerc := v.e.xerc; + lv.xerc := xerc_in; lv.reserve := e_in.reserve; lv.rc := e_in.rc; lv.insn := e_in.insn; @@ -1365,6 +1218,7 @@ begin fv.op := e_in.insn_type; fv.nia := e_in.nia; fv.insn := e_in.insn; + fv.itag := e_in.instr_tag; fv.single := e_in.is_32bit; fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1); fv.fra := a_in; @@ -1378,11 +1232,10 @@ begin rin <= v; -- update outputs - f_out <= f; l_out <= lv; e_out <= r.e; + e_out.msr <= msr_copy(ctrl.msr); fp_out <= fv; - flush_out <= f_out.redirect; exception_log <= exception; irq_valid_log <= irq_valid; @@ -1398,13 +1251,13 @@ begin ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & exception_log & irq_valid_log & - std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) & + interrupt_in & "000" & r.e.write_enable & r.e.valid & - f_out.redirect & + (r.e.redirect or r.e.interrupt) & r.busy & - flush_out; + flush_in; end if; end process; log_out <= log_data; diff --git a/fetch1.vhdl b/fetch1.vhdl index 8ca7e57..788a76d 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -22,8 +22,8 @@ entity fetch1 is stop_in : in std_ulogic; alt_reset_in : in std_ulogic; - -- redirect from execution unit - e_in : in Execute1ToFetch1Type; + -- redirect from writeback unit + w_in : in WritebackToFetch1Type; -- redirect from decode1 d_in : in Decode1ToFetch1Type; @@ -70,12 +70,12 @@ begin " P:" & std_ulogic'image(r_next.priv_mode) & " E:" & std_ulogic'image(r_next.big_endian) & " 32:" & std_ulogic'image(r_next_int.mode_32bit) & - " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) & + " R:" & std_ulogic'image(w_in.redirect) & std_ulogic'image(d_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & " nia:" & to_hstring(r_next.nia); end if; - if rst = '1' or e_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then + if rst = '1' or w_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then r.virt_mode <= r_next.virt_mode; r.priv_mode <= r_next.priv_mode; r.big_endian <= r_next.big_endian; @@ -109,11 +109,11 @@ begin signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0); signal btc_wr_v : std_ulogic; begin - btc_wr_data <= e_in.br_nia(63 downto BTC_ADDR_BITS + 2) & - e_in.redirect_nia(63 downto 2); - btc_wr_addr <= e_in.br_nia(BTC_ADDR_BITS + 1 downto 2); - btc_wr <= e_in.br_last; - btc_wr_v <= e_in.br_taken; + btc_wr_data <= w_in.br_nia(63 downto BTC_ADDR_BITS + 2) & + w_in.redirect_nia(63 downto 2); + btc_wr_addr <= w_in.br_nia(BTC_ADDR_BITS + 1 downto 2); + btc_wr <= w_in.br_last; + btc_wr_v <= w_in.br_taken; btc_ram : process(clk) variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0); @@ -158,15 +158,15 @@ begin v.big_endian := '0'; v_int.mode_32bit := '0'; v_int.predicted_nia := (others => '0'); - elsif e_in.redirect = '1' then - v.nia := e_in.redirect_nia(63 downto 2) & "00"; - if e_in.mode_32bit = '1' then + elsif w_in.redirect = '1' then + v.nia := w_in.redirect_nia(63 downto 2) & "00"; + if w_in.mode_32bit = '1' then v.nia(63 downto 32) := (others => '0'); end if; - v.virt_mode := e_in.virt_mode; - v.priv_mode := e_in.priv_mode; - v.big_endian := e_in.big_endian; - v_int.mode_32bit := e_in.mode_32bit; + v.virt_mode := w_in.virt_mode; + v.priv_mode := w_in.priv_mode; + v.big_endian := w_in.big_endian; + v_int.mode_32bit := w_in.mode_32bit; elsif d_in.redirect = '1' then v.nia := d_in.redirect_nia(63 downto 2) & "00"; if r_int.mode_32bit = '1' then @@ -191,7 +191,7 @@ begin -- If the last NIA value went down with a stop mark, it didn't get -- executed, and hence we shouldn't increment NIA. - advance_nia <= rst or e_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in); + advance_nia <= rst or w_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in); r_next <= v; r_next_int <= v_int; diff --git a/fpu.vhdl b/fpu.vhdl index 2e8096a..93fa9d6 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -73,8 +73,11 @@ architecture behaviour of fpu is busy : std_ulogic; instr_done : std_ulogic; do_intr : std_ulogic; + illegal : std_ulogic; op : insn_type_t; insn : std_ulogic_vector(31 downto 0); + nia : std_ulogic_vector(63 downto 0); + instr_tag : instr_tag_t; dest_fpr : gspr_index_t; fe_mode : std_ulogic; rc : std_ulogic; @@ -571,9 +574,9 @@ begin e_out.busy <= r.busy; e_out.exception <= r.fpscr(FPSCR_FEX); - e_out.interrupt <= r.do_intr; w_out.valid <= r.instr_done and not r.do_intr; + w_out.instr_tag <= r.instr_tag; w_out.write_enable <= r.writing_back; w_out.write_reg <= r.dest_fpr; w_out.write_data <= fp_result; @@ -581,6 +584,10 @@ begin w_out.write_cr_mask <= r.cr_mask; w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result; + w_out.interrupt <= r.do_intr; + w_out.intr_vec <= 16#700#; + w_out.srr0 <= r.nia; + w_out.srr1 <= (47-44 => r.illegal, 47-43 => not r.illegal, others => '0'); fpu_1: process(all) variable v : reg_type; @@ -642,7 +649,9 @@ begin -- capture incoming instruction if e_in.valid = '1' then v.insn := e_in.insn; + v.nia := e_in.nia; v.op := e_in.op; + v.instr_tag := e_in.itag; v.fe_mode := or (e_in.fe_mode); v.dest_fpr := e_in.frt; v.single_prec := e_in.single; @@ -2540,9 +2549,10 @@ begin v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); end if; + v.illegal := illegal; if illegal = '1' then v.instr_done := '0'; - v.do_intr := '0'; + v.do_intr := '1'; v.writing_back := '0'; v.busy := '0'; v.state := IDLE; @@ -2554,7 +2564,6 @@ begin end if; rin <= v; - e_out.illegal <= illegal; end process; end architecture behaviour; diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl deleted file mode 100644 index 6b00994..0000000 --- a/gpr_hazard.vhdl +++ /dev/null @@ -1,112 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -library work; -use work.common.all; - -entity gpr_hazard is - generic ( - PIPELINE_DEPTH : natural := 1 - ); - port( - clk : in std_ulogic; - busy_in : in std_ulogic; - deferred : in std_ulogic; - complete_in : in std_ulogic; - flush_in : in std_ulogic; - issuing : in std_ulogic; - repeated : in std_ulogic; - - gpr_write_valid_in : in std_ulogic; - gpr_write_in : in gspr_index_t; - bypass_avail : in std_ulogic; - gpr_read_valid_in : in std_ulogic; - gpr_read_in : in gspr_index_t; - - ugpr_write_valid : in std_ulogic; - ugpr_write_reg : in gspr_index_t; - - stall_out : out std_ulogic; - use_bypass : out std_ulogic - ); -end entity gpr_hazard; -architecture behaviour of gpr_hazard is - type pipeline_entry_type is record - valid : std_ulogic; - bypass : std_ulogic; - gpr : gspr_index_t; - ugpr_valid : std_ulogic; - ugpr : gspr_index_t; - end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'), - ugpr_valid => '0', ugpr => (others => '0')); - - type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; - constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); - - signal r, rin : pipeline_t := pipeline_t_init; -begin - gpr_hazard0: process(clk) - begin - if rising_edge(clk) then - r <= rin; - end if; - end process; - - gpr_hazard1: process(all) - variable v : pipeline_t; - begin - v := r; - - if complete_in = '1' then - v(PIPELINE_DEPTH).valid := '0'; - v(PIPELINE_DEPTH).ugpr_valid := '0'; - end if; - - stall_out <= '0'; - use_bypass <= '0'; - if repeated = '0' and gpr_read_valid_in = '1' then - loop_0: for i in 0 to PIPELINE_DEPTH loop - -- The second half of a split instruction never has GPR - -- dependencies on the first half's output GPR, - -- so ignore matches when i = 0 for the second half. - if v(i).valid = '1' and r(i).gpr = gpr_read_in and - not (i = 0 and repeated = '1') then - if r(i).bypass = '1' then - use_bypass <= '1'; - else - stall_out <= '1'; - end if; - end if; - if v(i).ugpr_valid = '1' and r(i).ugpr = gpr_read_in then - stall_out <= '1'; - end if; - end loop; - end if; - - -- XXX assumes PIPELINE_DEPTH = 1 - if busy_in = '0' then - v(1) := v(0); - v(0).valid := '0'; - v(0).ugpr_valid := '0'; - end if; - if deferred = '0' and issuing = '1' then - v(0).valid := gpr_write_valid_in; - v(0).bypass := bypass_avail; - v(0).gpr := gpr_write_in; - v(0).ugpr_valid := ugpr_write_valid; - v(0).ugpr := ugpr_write_reg; - end if; - if flush_in = '1' then - v(0).valid := '0'; - v(0).ugpr_valid := '0'; - v(1).valid := '0'; - v(1).ugpr_valid := '0'; - end if; - - -- update registers - rin <= v; - - end process; -end; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index b83eed6..ee4507b 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -37,42 +37,47 @@ entity loadstore1 is ); end loadstore1; --- Note, we don't currently use the stall output from the dcache because --- we know it can take two requests without stalling when idle, we are --- its only user, and we know it never stalls when idle. - architecture behave of loadstore1 is -- State machine for unaligned loads/stores type state_t is (IDLE, -- ready for instruction - SECOND_REQ, -- send 2nd request of unaligned xfer - ACK_WAIT, -- waiting for ack from dcache MMU_LOOKUP, -- waiting for MMU to look up translation TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie - FINISH_LFS, -- write back converted SP data for lfs* - COMPLETE -- extra cycle to complete an operation + FINISH_LFS -- write back converted SP data for lfs* ); type byte_index_t is array(0 to 7) of unsigned(2 downto 0); subtype byte_trim_t is std_ulogic_vector(1 downto 0); type trim_ctl_t is array(0 to 7) of byte_trim_t; - type reg_stage_t is record - -- latch most of the input request + type request_t is record + valid : std_ulogic; + dc_req : std_ulogic; load : std_ulogic; + store : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; + read_spr : std_ulogic; + write_spr : std_ulogic; + mmu_op : std_ulogic; + instr_fault : std_ulogic; + load_zero : std_ulogic; + do_update : std_ulogic; + noop : std_ulogic; + mode_32bit : std_ulogic; addr : std_ulogic_vector(63 downto 0); + addr0 : std_ulogic_vector(63 downto 0); + byte_sel : std_ulogic_vector(7 downto 0); + second_bytes : std_ulogic_vector(7 downto 0); store_data : std_ulogic_vector(63 downto 0); - load_data : std_ulogic_vector(63 downto 0); + instr_tag : instr_tag_t; write_reg : gspr_index_t; length : std_ulogic_vector(3 downto 0); + elt_length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; - byte_offset : unsigned(2 downto 0); brev_mask : unsigned(2 downto 0); sign_extend : std_ulogic; update : std_ulogic; - update_reg : gpr_index_t; xerc : xer_common_t; reserve : std_ulogic; atomic : std_ulogic; @@ -81,37 +86,88 @@ architecture behave of loadstore1 is nc : std_ulogic; -- non-cacheable access virt_mode : std_ulogic; priv_mode : std_ulogic; + load_sp : std_ulogic; + sprn : std_ulogic_vector(9 downto 0); + is_slbia : std_ulogic; + align_intr : std_ulogic; + dword_index : std_ulogic; + two_dwords : std_ulogic; + nia : std_ulogic_vector(63 downto 0); + end record; + constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0', + dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0', + instr_fault => '0', load_zero => '0', do_update => '0', noop => '0', + mode_32bit => '0', addr => (others => '0'), addr0 => (others => '0'), + byte_sel => x"00", second_bytes => x"00", + store_data => (others => '0'), instr_tag => instr_tag_init, + write_reg => 7x"00", length => x"0", + elt_length => x"0", byte_reverse => '0', brev_mask => "000", + sign_extend => '0', update => '0', + xerc => xerc_init, reserve => '0', + atomic => '0', atomic_last => '0', rc => '0', nc => '0', + virt_mode => '0', priv_mode => '0', load_sp => '0', + sprn => 10x"0", is_slbia => '0', align_intr => '0', + dword_index => '0', two_dwords => '0', + nia => (others => '0')); + + type reg_stage1_t is record + req : request_t; + issued : std_ulogic; + end record; + + type reg_stage2_t is record + req : request_t; + byte_index : byte_index_t; + use_second : std_ulogic_vector(7 downto 0); + wait_dc : std_ulogic; + wait_mmu : std_ulogic; + one_cycle : std_ulogic; + wr_sel : std_ulogic_vector(1 downto 0); + end record; + + type reg_stage3_t is record state : state_t; - dwords_done : std_ulogic; - last_dword : std_ulogic; - first_bytes : std_ulogic_vector(7 downto 0); - second_bytes : std_ulogic_vector(7 downto 0); + instr_tag : instr_tag_t; + write_enable : std_ulogic; + write_reg : gspr_index_t; + write_data : std_ulogic_vector(63 downto 0); + rc : std_ulogic; + xerc : xer_common_t; + store_done : std_ulogic; + convert_lfs : std_ulogic; + load_data : std_ulogic_vector(63 downto 0); dar : std_ulogic_vector(63 downto 0); dsisr : std_ulogic_vector(31 downto 0); - instr_fault : std_ulogic; - align_intr : std_ulogic; - sprval : std_ulogic_vector(63 downto 0); - busy : std_ulogic; - wait_dcache : std_ulogic; - wait_mmu : std_ulogic; - do_update : std_ulogic; - extra_cycle : std_ulogic; - mode_32bit : std_ulogic; - byte_index : byte_index_t; - use_second : std_ulogic_vector(7 downto 0); - trim_ctl : trim_ctl_t; - load_sp : std_ulogic; ld_sp_data : std_ulogic_vector(31 downto 0); ld_sp_nz : std_ulogic; ld_sp_lz : std_ulogic_vector(5 downto 0); - wr_sel : std_ulogic_vector(1 downto 0); + stage1_en : std_ulogic; + interrupt : std_ulogic; + intr_vec : integer range 0 to 16#fff#; + nia : std_ulogic_vector(63 downto 0); + srr1 : std_ulogic_vector(15 downto 0); end record; - signal r, rin : reg_stage_t; - signal lsu_sum : std_ulogic_vector(63 downto 0); + signal req_in : request_t; + signal r1, r1in : reg_stage1_t; + signal r2, r2in : reg_stage2_t; + signal r3, r3in : reg_stage3_t; + + signal busy : std_ulogic; + signal complete : std_ulogic; + signal in_progress : std_ulogic; + signal flushing : std_ulogic; signal store_sp_data : std_ulogic_vector(31 downto 0); signal load_dp_data : std_ulogic_vector(63 downto 0); + signal store_data : std_ulogic_vector(63 downto 0); + + signal stage1_issue_enable : std_ulogic; + signal stage1_req : request_t; + signal stage1_dcreq : std_ulogic; + signal stage1_dreq : std_ulogic; + signal stage2_busy_next : std_ulogic; + signal stage3_busy_next : std_ulogic; -- Generate byte enables from sizes function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is @@ -210,18 +266,37 @@ architecture behave of loadstore1 is end; begin - -- Calculate the address in the first cycle - lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0'); - - loadstore1_0: process(clk) + loadstore1_reg: process(clk) begin if rising_edge(clk) then if rst = '1' then - r.state <= IDLE; - r.busy <= '0'; - r.do_update <= '0'; + r1.req.valid <= '0'; + r2.req.valid <= '0'; + r2.wait_dc <= '0'; + r2.wait_mmu <= '0'; + r2.one_cycle <= '0'; + r3.state <= IDLE; + r3.write_enable <= '0'; + r3.interrupt <= '0'; + r3.stage1_en <= '1'; + r3.convert_lfs <= '0'; + flushing <= '0'; else - r <= rin; + r1 <= r1in; + r2 <= r2in; + r3 <= r3in; + flushing <= (flushing or (r1in.req.valid and r1in.req.align_intr)) and + not r3in.interrupt; + end if; + stage1_dreq <= stage1_dcreq; + if d_in.valid = '1' then + assert r2.req.valid = '1' and r2.req.dc_req = '1' and r3.state = IDLE severity failure; + end if; + if d_in.error = '1' then + assert r2.req.valid = '1' and r2.req.dc_req = '1' and r3.state = IDLE severity failure; + end if; + if m_in.done = '1' or m_in.err = '1' then + assert r2.req.valid = '1' and (r3.state = MMU_LOOKUP or r3.state = TLBIE_WAIT) severity failure; end if; end if; end process; @@ -256,79 +331,346 @@ begin variable frac : std_ulogic_vector(22 downto 0); variable frac_shift : unsigned(4 downto 0); begin - frac := r.ld_sp_data(22 downto 0); - exp := unsigned(r.ld_sp_data(30 downto 23)); - exp_nz := or (r.ld_sp_data(30 downto 23)); - exp_ao := and (r.ld_sp_data(30 downto 23)); + frac := r3.ld_sp_data(22 downto 0); + exp := unsigned(r3.ld_sp_data(30 downto 23)); + exp_nz := or (r3.ld_sp_data(30 downto 23)); + exp_ao := and (r3.ld_sp_data(30 downto 23)); frac_shift := (others => '0'); if exp_ao = '1' then exp_dp := to_unsigned(2047, 11); -- infinity or NaN elsif exp_nz = '1' then exp_dp := 896 + resize(exp, 11); -- finite normalized value - elsif r.ld_sp_nz = '0' then + elsif r3.ld_sp_nz = '0' then exp_dp := to_unsigned(0, 11); -- zero else -- denormalized SP operand, need to normalize - exp_dp := 896 - resize(unsigned(r.ld_sp_lz), 11); - frac_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1; + exp_dp := 896 - resize(unsigned(r3.ld_sp_lz), 11); + frac_shift := unsigned(r3.ld_sp_lz(4 downto 0)) + 1; end if; - load_dp_data(63) <= r.ld_sp_data(31); + load_dp_data(63) <= r3.ld_sp_data(31); load_dp_data(62 downto 52) <= std_ulogic_vector(exp_dp); load_dp_data(51 downto 29) <= shifter_23l(frac, frac_shift); load_dp_data(28 downto 0) <= (others => '0'); end process; end generate; - loadstore1_1: process(all) - variable v : reg_stage_t; + -- Translate a load/store instruction into the internal request format + -- XXX this should only depend on l_in, but actually depends on + -- r1.req.addr0 as well (in the l_in.second = 1 case). + loadstore1_in: process(all) + variable v : request_t; + variable lsu_sum : std_ulogic_vector(63 downto 0); variable brev_lenm1 : unsigned(2 downto 0); - variable byte_offset : unsigned(2 downto 0); - variable j : integer; - variable k : unsigned(2 downto 0); - variable kk : unsigned(3 downto 0); variable long_sel : std_ulogic_vector(15 downto 0); - variable byte_sel : std_ulogic_vector(7 downto 0); - variable req : std_ulogic; - variable busy : std_ulogic; variable addr : std_ulogic_vector(63 downto 0); - variable maddr : std_ulogic_vector(63 downto 0); - variable wdata : std_ulogic_vector(63 downto 0); - variable write_enable : std_ulogic; - variable do_update : std_ulogic; - variable done : std_ulogic; - variable data_permuted : std_ulogic_vector(63 downto 0); - variable data_trimmed : std_ulogic_vector(63 downto 0); - variable store_data : std_ulogic_vector(63 downto 0); - variable byte_rev : std_ulogic; - variable length : std_ulogic_vector(3 downto 0); - variable negative : std_ulogic; variable sprn : std_ulogic_vector(9 downto 0); - variable exception : std_ulogic; - variable next_addr : std_ulogic_vector(63 downto 0); - variable mmureq : std_ulogic; - variable dsisr : std_ulogic_vector(31 downto 0); - variable mmu_mtspr : std_ulogic; - variable itlb_fault : std_ulogic; variable misaligned : std_ulogic; + variable addr_mask : std_ulogic_vector(2 downto 0); + begin + v := request_init; + sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); + + v.valid := l_in.valid; + v.instr_tag := l_in.instr_tag; + v.mode_32bit := l_in.mode_32bit; + v.write_reg := l_in.write_reg; + v.length := l_in.length; + v.elt_length := l_in.length; + v.byte_reverse := l_in.byte_reverse; + v.sign_extend := l_in.sign_extend; + v.update := l_in.update; + v.xerc := l_in.xerc; + v.reserve := l_in.reserve; + v.rc := l_in.rc; + v.nc := l_in.ci; + v.virt_mode := l_in.virt_mode; + v.priv_mode := l_in.priv_mode; + v.sprn := sprn; + v.nia := l_in.nia; + + lsu_sum := std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)); + + if HAS_FPU and l_in.is_32bit = '1' then + v.store_data := x"00000000" & store_sp_data; + else + v.store_data := l_in.data; + end if; + + addr := lsu_sum; + + if l_in.second = '1' then + if l_in.update = '0' then + -- for the second half of a 16-byte transfer, + -- use the previous address plus 8. + addr := std_ulogic_vector(unsigned(r1.req.addr0(63 downto 3)) + 1) & r1.req.addr0(2 downto 0); + else + -- for an update-form load, use the previous address + -- as the value to write back to RA. + addr := r1.req.addr0; + end if; + end if; + if l_in.mode_32bit = '1' then + addr(63 downto 32) := (others => '0'); + end if; + v.addr := addr; + v.addr0 := addr; + + -- XXX Temporary hack. Mark the op as non-cachable if the address + -- is the form 0xc------- for a real-mode access. + if addr(31 downto 28) = "1100" and l_in.virt_mode = '0' then + v.nc := '1'; + end if; + + addr_mask := std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1); + + -- Do length_to_sel and work out if we are doing 2 dwords + long_sel := xfer_data_sel(v.length, addr(2 downto 0)); + v.byte_sel := long_sel(7 downto 0); + v.second_bytes := long_sel(15 downto 8); + if long_sel(15 downto 8) /= "00000000" then + v.two_dwords := '1'; + end if; + + -- check alignment for larx/stcx + misaligned := or (addr_mask and addr(2 downto 0)); + v.align_intr := l_in.reserve and misaligned; + if l_in.repeat = '1' and l_in.second = '0' and l_in.update = '0' and addr(3) = '1' then + -- length is really 16 not 8 + -- Make misaligned lq cause an alignment interrupt in LE mode, + -- in order to avoid the case with RA = RT + 1 where the second half + -- faults but the first doesn't (and updates RT+1, destroying RA). + -- The equivalent BE case doesn't occur because RA = RT is illegal. + misaligned := '1'; + if l_in.reserve = '1' or (l_in.op = OP_LOAD and l_in.byte_reverse = '0') then + v.align_intr := '1'; + end if; + end if; + + v.atomic := not misaligned; + v.atomic_last := not misaligned and (l_in.second or not l_in.repeat); + + case l_in.op is + when OP_STORE => + v.store := '1'; + when OP_LOAD => + if l_in.update = '0' or l_in.second = '0' then + v.load := '1'; + if HAS_FPU and l_in.is_32bit = '1' then + -- Allow an extra cycle for SP->DP precision conversion + v.load_sp := '1'; + end if; + else + -- write back address to RA + v.do_update := '1'; + end if; + when OP_DCBZ => + v.dcbz := '1'; + v.align_intr := v.nc; + when OP_TLBIE => + v.tlbie := '1'; + v.addr := l_in.addr2; -- address from RB for tlbie + v.is_slbia := l_in.insn(7); + v.mmu_op := '1'; + when OP_MFSPR => + v.read_spr := '1'; + when OP_MTSPR => + v.write_spr := '1'; + v.mmu_op := sprn(9) or sprn(5); + when OP_FETCH_FAILED => + -- send it to the MMU to do the radix walk + v.instr_fault := '1'; + v.addr := l_in.nia; + v.mmu_op := '1'; + when others => + end case; + v.dc_req := l_in.valid and (v.load or v.store or v.dcbz) and not v.align_intr; + + -- Work out controls for load and store formatting + brev_lenm1 := "000"; + if v.byte_reverse = '1' then + brev_lenm1 := unsigned(v.length(2 downto 0)) - 1; + end if; + v.brev_mask := brev_lenm1; + + req_in <= v; + end process; + + busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or + (r1.issued and d_in.error) or + stage2_busy_next or + (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index)); + complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or + (r2.wait_mmu and m_in.done) or r3.convert_lfs; + in_progress <= r1.req.valid or (r2.req.valid and not complete); + + stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and + not (r2.req.valid and r2.req.mmu_op); + + -- Processing done in the first cycle of a load/store instruction + loadstore1_1: process(all) + variable v : reg_stage1_t; + variable req : request_t; + variable dcreq : std_ulogic; + variable addr : std_ulogic_vector(63 downto 0); + begin + v := r1; + dcreq := '0'; + req := req_in; + if flushing = '1' then + -- Make this a no-op request rather than simply invalid. + -- It will never get to stage 3 since there is a request ahead of + -- it with align_intr = 1. + req.dc_req := '0'; + end if; + + -- Note that l_in.valid is gated with busy inside execute1 + if l_in.valid = '1' then + dcreq := req.dc_req and stage1_issue_enable and not d_in.error and not dc_stall; + v.req := req; + v.issued := dcreq; + elsif r1.req.valid = '1' then + if r1.req.dc_req = '1' and r1.issued = '0' then + req := r1.req; + dcreq := stage1_issue_enable and not dc_stall and not d_in.error; + v.issued := dcreq; + elsif r1.issued = '1' and d_in.error = '1' then + v.issued := '0'; + elsif stage2_busy_next = '0' then + -- we can change what's in r1 next cycle because the current thing + -- in r1 will go into r2 + if r1.req.dc_req = '1' and r1.req.two_dwords = '1' and r1.req.dword_index = '0' then + -- construct the second request for a misaligned access + v.req.dword_index := '1'; + v.req.addr := std_ulogic_vector(unsigned(r1.req.addr(63 downto 3)) + 1) & "000"; + if r1.req.mode_32bit = '1' then + v.req.addr(32) := '0'; + end if; + v.req.byte_sel := r1.req.second_bytes; + v.issued := stage1_issue_enable and not dc_stall; + dcreq := stage1_issue_enable and not dc_stall; + req := v.req; + else + v.req.valid := '0'; + end if; + end if; + end if; + if r3in.interrupt = '1' then + v.req.valid := '0'; + dcreq := '0'; + end if; + + stage1_req <= req; + stage1_dcreq <= dcreq; + r1in <= v; + end process; + + -- Processing done in the second cycle of a load/store instruction. + -- Store data is formatted here and sent to the dcache. + -- The request in r1 is sent to stage 3 if stage 3 will not be busy next cycle. + loadstore1_2: process(all) + variable v : reg_stage2_t; + variable j : integer; + variable k : unsigned(2 downto 0); + variable kk : unsigned(3 downto 0); + variable idx : unsigned(2 downto 0); + variable byte_offset : unsigned(2 downto 0); begin - v := r; + v := r2; + + -- Byte reversing and rotating for stores. + -- Done in the second cycle (the cycle after l_in.valid = 1). + byte_offset := unsigned(r1.req.addr0(2 downto 0)); + for i in 0 to 7 loop + k := (to_unsigned(i, 3) - byte_offset) xor r1.req.brev_mask; + j := to_integer(k) * 8; + store_data(i * 8 + 7 downto i * 8) <= r1.req.store_data(j + 7 downto j); + end loop; + + if stage3_busy_next = '0' and + (r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0') then + v.req := r1.req; + v.req.store_data := store_data; + v.wait_dc := r1.req.valid and r1.req.dc_req and not r1.req.load_sp and + not (r1.req.two_dwords and not r1.req.dword_index); + v.wait_mmu := r1.req.valid and r1.req.mmu_op; + v.one_cycle := r1.req.valid and (r1.req.noop or r1.req.read_spr or + (r1.req.write_spr and not r1.req.mmu_op) or + r1.req.load_zero or r1.req.do_update); + if r1.req.read_spr = '1' then + v.wr_sel := "00"; + elsif r1.req.do_update = '1' or r1.req.store = '1' then + v.wr_sel := "01"; + elsif r1.req.load_sp = '1' then + v.wr_sel := "10"; + else + v.wr_sel := "11"; + end if; + + -- Work out load formatter controls for next cycle + for i in 0 to 7 loop + idx := to_unsigned(i, 3) xor r1.req.brev_mask; + kk := ('0' & idx) + ('0' & byte_offset); + v.use_second(i) := kk(3); + v.byte_index(i) := kk(2 downto 0); + end loop; + elsif stage3_busy_next = '0' then + v.req.valid := '0'; + v.wait_dc := '0'; + v.wait_mmu := '0'; + end if; + + stage2_busy_next <= r1.req.valid and stage3_busy_next; + + if r3in.interrupt = '1' then + v.req.valid := '0'; + end if; + + r2in <= v; + end process; + + -- Processing done in the third cycle of a load/store instruction. + -- At this stage we can do things that have side effects without + -- fear of the instruction getting flushed. This is the point at + -- which requests get sent to the MMU. + loadstore1_3: process(all) + variable v : reg_stage3_t; + variable j : integer; + variable req : std_ulogic; + variable mmureq : std_ulogic; + variable mmu_mtspr : std_ulogic; + variable write_enable : std_ulogic; + variable write_data : std_ulogic_vector(63 downto 0); + variable do_update : std_ulogic; + variable done : std_ulogic; + variable part_done : std_ulogic; + variable exception : std_ulogic; + variable data_permuted : std_ulogic_vector(63 downto 0); + variable data_trimmed : std_ulogic_vector(63 downto 0); + variable sprval : std_ulogic_vector(63 downto 0); + variable negative : std_ulogic; + variable dsisr : std_ulogic_vector(31 downto 0); + variable itlb_fault : std_ulogic; + variable trim_ctl : trim_ctl_t; + begin + v := r3; + req := '0'; + mmureq := '0'; mmu_mtspr := '0'; - itlb_fault := '0'; - sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); + done := '0'; + part_done := '0'; + exception := '0'; dsisr := (others => '0'); - mmureq := '0'; - v.wr_sel := "11"; - write_enable := '0'; - - do_update := r.do_update; - v.do_update := '0'; + sprval := (others => '0'); + do_update := '0'; + v.convert_lfs := '0'; + v.srr1 := (others => '0'); -- load data formatting -- shift and byte-reverse data bytes for i in 0 to 7 loop - j := to_integer(r.byte_index(i)) * 8; + j := to_integer(r2.byte_index(i)) * 8; data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j); end loop; @@ -336,29 +678,39 @@ begin -- For unaligned loads crossing two dwords, the sign bit is in the -- first dword for big-endian (byte_reverse = 1), or the second dword -- for little-endian. - if r.dwords_done = '1' and r.byte_reverse = '1' then - negative := (r.length(3) and r.load_data(63)) or - (r.length(2) and r.load_data(31)) or - (r.length(1) and r.load_data(15)) or - (r.length(0) and r.load_data(7)); + if r2.req.dword_index = '1' and r2.req.byte_reverse = '1' then + negative := (r2.req.length(3) and r3.load_data(63)) or + (r2.req.length(2) and r3.load_data(31)) or + (r2.req.length(1) and r3.load_data(15)) or + (r2.req.length(0) and r3.load_data(7)); else - negative := (r.length(3) and data_permuted(63)) or - (r.length(2) and data_permuted(31)) or - (r.length(1) and data_permuted(15)) or - (r.length(0) and data_permuted(7)); + negative := (r2.req.length(3) and data_permuted(63)) or + (r2.req.length(2) and data_permuted(31)) or + (r2.req.length(1) and data_permuted(15)) or + (r2.req.length(0) and data_permuted(7)); end if; -- trim and sign-extend for i in 0 to 7 loop - case r.trim_ctl(i) is + if i < to_integer(unsigned(r2.req.length)) then + if r2.req.dword_index = '1' then + trim_ctl(i) := '1' & not r2.use_second(i); + else + trim_ctl(i) := "10"; + end if; + else + trim_ctl(i) := "00"; + end if; + end loop; + + for i in 0 to 7 loop + case trim_ctl(i) is when "11" => - data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8); + data_trimmed(i * 8 + 7 downto i * 8) := r3.load_data(i * 8 + 7 downto i * 8); when "10" => data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8); - when "01" => - data_trimmed(i * 8 + 7 downto i * 8) := (others => negative); when others => - data_trimmed(i * 8 + 7 downto i * 8) := x"00"; + data_trimmed(i * 8 + 7 downto i * 8) := (others => negative and r2.req.sign_extend); end case; end loop; @@ -369,63 +721,62 @@ begin v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0)); end if; - -- Byte reversing and rotating for stores. - -- Done in the second cycle (the cycle after l_in.valid = 1). - for i in 0 to 7 loop - k := (to_unsigned(i, 3) - r.byte_offset) xor r.brev_mask; - j := to_integer(k) * 8; - store_data(i * 8 + 7 downto i * 8) := r.store_data(j + 7 downto j); - end loop; - - -- compute (addr + 8) & ~7 for the second doubleword when unaligned - next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; - - -- Busy calculation. - -- We need to minimize the delay from clock to busy valid because it - -- gates the start of execution of the next instruction. - busy := r.busy and not ((r.wait_dcache and d_in.valid) or (r.wait_mmu and m_in.done)); - v.busy := busy; - - done := '0'; - if r.state /= IDLE and busy = '0' then - done := '1'; + if d_in.valid = '1' and r2.req.load = '1' then + v.load_data := data_permuted; end if; - exception := '0'; - if r.dwords_done = '1' or r.state = SECOND_REQ then - addr := next_addr; - byte_sel := r.second_bytes; - else - addr := r.addr; - byte_sel := r.first_bytes; - end if; - if r.mode_32bit = '1' then - addr(63 downto 32) := (others => '0'); + if r2.req.valid = '1' then + if r2.req.read_spr = '1' then + write_enable := '1'; + -- partial decode on SPR number should be adequate given + -- the restricted set that get sent down this path + if r2.req.sprn(9) = '0' and r2.req.sprn(5) = '0' then + if r2.req.sprn(0) = '0' then + sprval := x"00000000" & r3.dsisr; + else + sprval := r3.dar; + end if; + else + -- reading one of the SPRs in the MMU + sprval := m_in.sprval; + end if; + end if; + if r2.req.align_intr = '1' then + -- generate alignment interrupt + exception := '1'; + end if; + if r2.req.load_zero = '1' then + write_enable := '1'; + end if; + if r2.req.do_update = '1' then + do_update := '1'; + end if; end if; - maddr := addr; - case r.state is + case r3.state is when IDLE => - - when SECOND_REQ => - req := '1'; - v.state := ACK_WAIT; - v.last_dword := '0'; - - when ACK_WAIT => - -- r.wr_sel gets set one cycle after we come into ACK_WAIT state, - -- which is OK because the dcache always takes at least two cycles. - if r.update = '1' and (r.load = '0' or (HAS_FPU and r.load_sp = '1')) then - v.wr_sel := "01"; + if d_in.valid = '1' then + if r2.req.two_dwords = '0' or r2.req.dword_index = '1' then + write_enable := r2.req.load and not r2.req.load_sp; + if HAS_FPU and r2.req.load_sp = '1' then + -- SP to DP conversion takes a cycle + v.state := FINISH_LFS; + v.convert_lfs := '1'; + else + -- stores write back rA update + do_update := r2.req.update and r2.req.store; + end if; + else + part_done := '1'; + end if; end if; if d_in.error = '1' then - -- dcache will discard the second request if it - -- gets an error on the 1st of two requests if d_in.cache_paradox = '1' then -- signal an interrupt straight away exception := '1'; - dsisr(63 - 38) := not r.load; + dsisr(63 - 38) := not r2.req.load; -- XXX there is no architected bit for this + -- (probably should be a machine check in fact) dsisr(63 - 35) := d_in.cache_paradox; else -- Look up the translation for TLB miss @@ -433,56 +784,42 @@ begin -- in case the PTE has been updated. mmureq := '1'; v.state := MMU_LOOKUP; + v.stage1_en := '0'; end if; end if; - if d_in.valid = '1' then - if r.last_dword = '0' then - v.dwords_done := '1'; - v.last_dword := '1'; - if r.load = '1' then - v.load_data := data_permuted; + if r2.req.valid = '1' then + if r2.req.mmu_op = '1' then + -- send request (tlbie, mtspr, itlb miss) to MMU + mmureq := not r2.req.write_spr; + mmu_mtspr := r2.req.write_spr; + if r2.req.instr_fault = '1' then + v.state := MMU_LOOKUP; + else + v.state := TLBIE_WAIT; end if; - else - write_enable := r.load and not r.load_sp; - if HAS_FPU and r.load_sp = '1' then - -- SP to DP conversion takes a cycle - -- Write back rA update in this cycle if needed - do_update := r.update; - v.wr_sel := "10"; - v.state := FINISH_LFS; - elsif r.extra_cycle = '1' then - -- loads with rA update need an extra cycle - v.wr_sel := "01"; - v.state := COMPLETE; - v.do_update := r.update; + elsif r2.req.write_spr = '1' then + if r2.req.sprn(0) = '0' then + v.dsisr := r2.req.store_data(31 downto 0); else - -- stores write back rA update in this cycle - do_update := r.update; + v.dar := r2.req.store_data; end if; - v.busy := '0'; end if; end if; - -- r.wait_dcache gets set one cycle after we come into ACK_WAIT state, - -- which is OK because the dcache always takes at least two cycles. - v.wait_dcache := r.last_dword and not r.extra_cycle; when MMU_LOOKUP => if m_in.done = '1' then - if r.instr_fault = '0' then + if r2.req.instr_fault = '0' then -- retry the request now that the MMU has installed a TLB entry req := '1'; - if r.last_dword = '0' then - v.state := SECOND_REQ; - else - v.state := ACK_WAIT; - end if; + v.stage1_en := '1'; + v.state := IDLE; end if; end if; if m_in.err = '1' then exception := '1'; dsisr(63 - 33) := m_in.invalid; dsisr(63 - 36) := m_in.perm_error; - dsisr(63 - 38) := not r.load; + dsisr(63 - 38) := r2.req.store or r2.req.dcbz; dsisr(63 - 44) := m_in.badtree; dsisr(63 - 45) := m_in.rc_error; end if; @@ -490,281 +827,127 @@ begin when TLBIE_WAIT => when FINISH_LFS => - - when COMPLETE => - exception := r.align_intr; + write_enable := '1'; end case; - if done = '1' or exception = '1' then + if complete = '1' or exception = '1' then + v.stage1_en := '1'; v.state := IDLE; - v.busy := '0'; end if; - -- Note that l_in.valid is gated with busy inside execute1 - if l_in.valid = '1' then - v.mode_32bit := l_in.mode_32bit; - v.load := '0'; - v.dcbz := '0'; - v.tlbie := '0'; - v.instr_fault := '0'; - v.align_intr := '0'; - v.dwords_done := '0'; - v.last_dword := '1'; - v.write_reg := l_in.write_reg; - v.length := l_in.length; - v.byte_reverse := l_in.byte_reverse; - v.sign_extend := l_in.sign_extend; - v.update := l_in.update; - v.update_reg := l_in.update_reg; - v.xerc := l_in.xerc; - v.reserve := l_in.reserve; - v.rc := l_in.rc; - v.nc := l_in.ci; - v.virt_mode := l_in.virt_mode; - v.priv_mode := l_in.priv_mode; - v.load_sp := '0'; - v.wait_dcache := '0'; - v.wait_mmu := '0'; - v.do_update := '0'; - v.extra_cycle := '0'; - - if HAS_FPU and l_in.is_32bit = '1' then - v.store_data := x"00000000" & store_sp_data; - else - v.store_data := l_in.data; - end if; - - addr := lsu_sum; - if l_in.second = '1' then - -- for the second half of a 16-byte transfer, use next_addr - addr := next_addr; - end if; - if l_in.mode_32bit = '1' then - addr(63 downto 32) := (others => '0'); - end if; - v.addr := addr; - maddr := l_in.addr2; -- address from RB for tlbie - - -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- for a real-mode access. - if addr(31 downto 28) = "1100" and l_in.virt_mode = '0' then - v.nc := '1'; - end if; - - if l_in.second = '0' then - -- Do length_to_sel and work out if we are doing 2 dwords - long_sel := xfer_data_sel(l_in.length, lsu_sum(2 downto 0)); - byte_sel := long_sel(7 downto 0); - v.first_bytes := byte_sel; - v.second_bytes := long_sel(15 downto 8); - else - byte_sel := r.first_bytes; - long_sel := r.second_bytes & r.first_bytes; - end if; - - -- check alignment for larx/stcx - misaligned := or (std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1) and addr(2 downto 0)); - v.align_intr := l_in.reserve and misaligned; - if l_in.repeat = '1' and l_in.second = '0' and addr(3) = '1' then - -- length is really 16 not 8 - -- Make misaligned lq cause an alignment interrupt in LE mode, - -- in order to avoid the case with RA = RT + 1 where the second half - -- faults but the first doesn't (and updates RT+1, destroying RA). - -- The equivalent BE case doesn't occur because RA = RT is illegal. - misaligned := '1'; - if l_in.reserve = '1' or (l_in.op = OP_LOAD and l_in.byte_reverse = '0') then - v.align_intr := '1'; + -- generate DSI or DSegI for load/store exceptions + -- or ISI or ISegI for instruction fetch exceptions + v.interrupt := exception; + if exception = '1' then + v.nia := r2.req.nia; + if r2.req.align_intr = '1' then + v.intr_vec := 16#600#; + v.dar := r2.req.addr; + elsif r2.req.instr_fault = '0' then + v.dar := r2.req.addr; + if m_in.segerr = '0' then + v.intr_vec := 16#300#; + v.dsisr := dsisr; + else + v.intr_vec := 16#380#; end if; - end if; - - v.atomic := not misaligned; - v.atomic_last := not misaligned and (l_in.second or not l_in.repeat); - - case l_in.op is - when OP_STORE => - req := '1'; - when OP_LOAD => - req := '1'; - v.load := '1'; - -- Allow an extra cycle for RA update on loads - v.extra_cycle := l_in.update; - if HAS_FPU and l_in.is_32bit = '1' then - -- Allow an extra cycle for SP->DP precision conversion - v.load_sp := '1'; - v.extra_cycle := '1'; - end if; - when OP_DCBZ => - v.align_intr := v.nc; - req := '1'; - v.dcbz := '1'; - when OP_TLBIE => - mmureq := '1'; - v.tlbie := '1'; - v.state := TLBIE_WAIT; - v.wait_mmu := '1'; - when OP_MFSPR => - v.wr_sel := "00"; - -- partial decode on SPR number should be adequate given - -- the restricted set that get sent down this path - if sprn(9) = '0' and sprn(5) = '0' then - if sprn(0) = '0' then - v.sprval := x"00000000" & r.dsisr; - else - v.sprval := r.dar; - end if; - else - -- reading one of the SPRs in the MMU - v.sprval := m_in.sprval; - end if; - v.state := COMPLETE; - when OP_MTSPR => - if sprn(9) = '0' and sprn(5) = '0' then - if sprn(0) = '0' then - v.dsisr := l_in.data(31 downto 0); - else - v.dar := l_in.data; - end if; - v.state := COMPLETE; - else - -- writing one of the SPRs in the MMU - mmu_mtspr := '1'; - v.state := TLBIE_WAIT; - v.wait_mmu := '1'; - end if; - when OP_FETCH_FAILED => - -- send it to the MMU to do the radix walk - maddr := l_in.nia; - v.instr_fault := '1'; - mmureq := '1'; - v.state := MMU_LOOKUP; - v.wait_mmu := '1'; - when others => - assert false report "unknown op sent to loadstore1"; - end case; - - if req = '1' then - if v.align_intr = '1' then - v.state := COMPLETE; - elsif long_sel(15 downto 8) = "00000000" then - v.state := ACK_WAIT; + else + if m_in.segerr = '0' then + v.srr1(47 - 33) := m_in.invalid; + v.srr1(47 - 35) := m_in.perm_error; -- noexec fault + v.srr1(47 - 44) := m_in.badtree; + v.srr1(47 - 45) := m_in.rc_error; + v.intr_vec := 16#400#; else - v.state := SECOND_REQ; + v.intr_vec := 16#480#; end if; end if; - - v.busy := req or mmureq or mmu_mtspr; - end if; - - -- Work out controls for store formatting - if l_in.valid = '1' then - byte_offset := unsigned(lsu_sum(2 downto 0)); - byte_rev := l_in.byte_reverse; - length := l_in.length; - brev_lenm1 := "000"; - if byte_rev = '1' then - brev_lenm1 := unsigned(length(2 downto 0)) - 1; - end if; - v.byte_offset := byte_offset; - v.brev_mask := brev_lenm1; - end if; - - -- Work out load formatter controls for next cycle - byte_offset := unsigned(v.addr(2 downto 0)); - brev_lenm1 := "000"; - if v.byte_reverse = '1' then - brev_lenm1 := unsigned(v.length(2 downto 0)) - 1; end if; - for i in 0 to 7 loop - kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); - v.use_second(i) := kk(3); - v.byte_index(i) := kk(2 downto 0); - end loop; - - for i in 0 to 7 loop - if i < to_integer(unsigned(v.length)) then - if v.dwords_done = '1' then - v.trim_ctl(i) := '1' & not v.use_second(i); - else - v.trim_ctl(i) := "10"; - end if; - else - v.trim_ctl(i) := '0' & v.sign_extend; - end if; - end loop; + case r2.wr_sel is + when "00" => + -- mfspr result + write_data := sprval; + when "01" => + -- update reg + write_data := r2.req.addr0; + when "10" => + -- lfs result + write_data := load_dp_data; + when others => + -- load data + write_data := data_trimmed; + end case; -- Update outputs to dcache - d_out.valid <= req and not v.align_intr; - d_out.load <= v.load; - d_out.dcbz <= v.dcbz; - d_out.nc <= v.nc; - d_out.reserve <= v.reserve; - d_out.atomic <= v.atomic; - d_out.atomic_last <= v.atomic_last; - d_out.addr <= addr; - d_out.data <= store_data; - d_out.byte_sel <= byte_sel; - d_out.virt_mode <= v.virt_mode; - d_out.priv_mode <= v.priv_mode; + if stage1_issue_enable = '1' then + d_out.valid <= stage1_dcreq; + d_out.load <= stage1_req.load; + d_out.dcbz <= stage1_req.dcbz; + d_out.nc <= stage1_req.nc; + d_out.reserve <= stage1_req.reserve; + d_out.atomic <= stage1_req.atomic; + d_out.atomic_last <= stage1_req.atomic_last; + d_out.addr <= stage1_req.addr; + d_out.byte_sel <= stage1_req.byte_sel; + d_out.virt_mode <= stage1_req.virt_mode; + d_out.priv_mode <= stage1_req.priv_mode; + else + d_out.valid <= req; + d_out.load <= r2.req.load; + d_out.dcbz <= r2.req.dcbz; + d_out.nc <= r2.req.nc; + d_out.reserve <= r2.req.reserve; + d_out.atomic <= r2.req.atomic; + d_out.atomic_last <= r2.req.atomic_last; + d_out.addr <= r2.req.addr; + d_out.byte_sel <= r2.req.byte_sel; + d_out.virt_mode <= r2.req.virt_mode; + d_out.priv_mode <= r2.req.priv_mode; + end if; + if stage1_dreq = '1' then + d_out.data <= store_data; + else + d_out.data <= r2.req.store_data; + end if; + d_out.hold <= r2.req.valid and r2.req.load_sp and d_in.valid; -- Update outputs to MMU m_out.valid <= mmureq; - m_out.iside <= v.instr_fault; - m_out.load <= r.load; - m_out.priv <= r.priv_mode; - m_out.tlbie <= v.tlbie; + m_out.iside <= r2.req.instr_fault; + m_out.load <= r2.req.load; + m_out.priv <= r2.req.priv_mode; + m_out.tlbie <= r2.req.tlbie; m_out.mtspr <= mmu_mtspr; - m_out.sprn <= sprn; - m_out.addr <= maddr; - m_out.slbia <= l_in.insn(7); - m_out.rs <= l_in.data; + m_out.sprn <= r2.req.sprn; + m_out.addr <= r2.req.addr; + m_out.slbia <= r2.req.is_slbia; + m_out.rs <= r2.req.store_data; -- Update outputs to writeback - -- Multiplex either cache data to the destination GPR or - -- the address for the rA update. - l_out.valid <= done; - case r.wr_sel is - when "00" => - l_out.write_enable <= '1'; - l_out.write_reg <= r.write_reg; - l_out.write_data <= r.sprval; - when "01" => - l_out.write_enable <= do_update; - l_out.write_reg <= gpr_to_gspr(r.update_reg); - l_out.write_data <= r.addr; - when "10" => - l_out.write_enable <= '1'; - l_out.write_reg <= r.write_reg; - l_out.write_data <= load_dp_data; - when others => - l_out.write_enable <= write_enable; - l_out.write_reg <= r.write_reg; - l_out.write_data <= data_trimmed; - end case; - l_out.xerc <= r.xerc; - l_out.rc <= r.rc and done; + l_out.valid <= complete; + l_out.instr_tag <= r2.req.instr_tag; + l_out.write_enable <= write_enable or do_update; + l_out.write_reg <= r2.req.write_reg; + l_out.write_data <= write_data; + l_out.xerc <= r2.req.xerc; + l_out.rc <= r2.req.rc and complete; l_out.store_done <= d_in.store_done; + l_out.interrupt <= r3.interrupt; + l_out.intr_vec <= r3.intr_vec; + l_out.srr0 <= r3.nia; + l_out.srr1 <= r3.srr1; - -- update exception info back to execute1 + -- update busy signal back to execute1 e_out.busy <= busy; - e_out.exception <= exception; - e_out.alignment <= r.align_intr; - e_out.instr_fault <= r.instr_fault; - e_out.invalid <= m_in.invalid; - e_out.badtree <= m_in.badtree; - e_out.perm_error <= m_in.perm_error; - e_out.rc_error <= m_in.rc_error; - e_out.segment_fault <= m_in.segerr; - if exception = '1' and r.instr_fault = '0' then - v.dar := addr; - if m_in.segerr = '0' and r.align_intr = '0' then - v.dsisr := dsisr; - end if; - end if; + e_out.in_progress <= in_progress; + + -- Busy calculation. + stage3_busy_next <= r2.req.valid and not (complete or part_done or exception); -- Update registers - rin <= v; + r3in <= v; end process; @@ -775,13 +958,13 @@ begin begin if rising_edge(clk) then log_data <= e_out.busy & - e_out.exception & + l_out.interrupt & l_out.valid & m_out.valid & d_out.valid & m_in.done & - r.dwords_done & - std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3)); + r2.req.dword_index & + std_ulogic_vector(to_unsigned(state_t'pos(r3.state), 3)); end if; end process; log_out <= log_data; diff --git a/microwatt.core b/microwatt.core index 41b6230..79af3c1 100644 --- a/microwatt.core +++ b/microwatt.core @@ -19,8 +19,6 @@ filesets: - sim_console.vhdl - logical.vhdl - countzero.vhdl - - gpr_hazard.vhdl - - cr_hazard.vhdl - control.vhdl - execute1.vhdl - fpu.vhdl diff --git a/writeback.vhdl b/writeback.vhdl index 95de0ec..65da537 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -9,6 +9,7 @@ use work.crhelpers.all; entity writeback is port ( clk : in std_ulogic; + rst : in std_ulogic; e_in : in Execute1ToWritebackType; l_in : in Loadstore1ToWritebackType; @@ -16,12 +17,24 @@ entity writeback is w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType; + f_out : out WritebackToFetch1Type; - complete_out : out std_ulogic + flush_out : out std_ulogic; + interrupt_out: out std_ulogic; + complete_out : out instr_tag_t ); end entity writeback; architecture behaviour of writeback is + type irq_state_t is (WRITE_SRR0, WRITE_SRR1); + + type reg_type is record + state : irq_state_t; + srr1 : std_ulogic_vector(63 downto 0); + end record; + + signal r, rin : reg_type; + begin writeback_0: process(clk) variable x : std_ulogic_vector(0 downto 0); @@ -29,6 +42,13 @@ begin variable w : std_ulogic_vector(0 downto 0); begin if rising_edge(clk) then + if rst = '1' then + r.state <= WRITE_SRR0; + r.srr1 <= (others => '0'); + else + r <= rin; + end if; + -- Do consistency checks only on the clock edge x(0) := e_in.valid; y(0) := l_in.valid; @@ -36,7 +56,7 @@ begin assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(w))) <= 1 severity failure; - x(0) := e_in.write_enable or e_in.exc_write_enable; + x(0) := e_in.write_enable; y(0) := l_in.write_enable; w(0) := fp_in.write_enable; assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + @@ -47,27 +67,73 @@ begin y(0) := fp_in.write_cr_enable; assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; + + assert not (e_in.valid = '1' and e_in.instr_tag.valid = '0') severity failure; + assert not (l_in.valid = '1' and l_in.instr_tag.valid = '0') severity failure; + assert not (fp_in.valid = '1' and fp_in.instr_tag.valid = '0') severity failure; end if; end process; writeback_1: process(all) + variable v : reg_type; + variable f : WritebackToFetch1Type; variable cf: std_ulogic_vector(3 downto 0); variable zero : std_ulogic; variable sign : std_ulogic; variable scf : std_ulogic_vector(3 downto 0); + variable vec : integer range 0 to 16#fff#; + variable srr1 : std_ulogic_vector(15 downto 0); + variable intr : std_ulogic; begin w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; - - complete_out <= '0'; - if e_in.valid = '1' or l_in.valid = '1' or fp_in.valid = '1' then - complete_out <= '1'; + f := WritebackToFetch1Init; + interrupt_out <= '0'; + vec := 0; + v := r; + + complete_out <= instr_tag_init; + if e_in.valid = '1' then + complete_out <= e_in.instr_tag; + elsif l_in.valid = '1' then + complete_out <= l_in.instr_tag; + elsif fp_in.valid = '1' then + complete_out <= fp_in.instr_tag; end if; - if e_in.exc_write_enable = '1' then - w_out.write_reg <= e_in.exc_write_reg; - w_out.write_data <= e_in.exc_write_data; + intr := e_in.interrupt or l_in.interrupt or fp_in.interrupt; + + if r.state = WRITE_SRR1 then + w_out.write_reg <= fast_spr_num(SPR_SRR1); + w_out.write_data <= r.srr1; + w_out.write_enable <= '1'; + interrupt_out <= '1'; + v.state := WRITE_SRR0; + + elsif intr = '1' then + w_out.write_reg <= fast_spr_num(SPR_SRR0); w_out.write_enable <= '1'; + v.state := WRITE_SRR1; + srr1 := (others => '0'); + if e_in.interrupt = '1' then + vec := e_in.intr_vec; + w_out.write_data <= e_in.last_nia; + srr1 := e_in.srr1; + elsif l_in.interrupt = '1' then + vec := l_in.intr_vec; + w_out.write_data <= l_in.srr0; + srr1 := l_in.srr1; + elsif fp_in.interrupt = '1' then + vec := fp_in.intr_vec; + w_out.write_data <= fp_in.srr0; + srr1 := fp_in.srr1; + end if; + v.srr1(63 downto 31) := e_in.msr(63 downto 31); + v.srr1(30 downto 27) := srr1(14 downto 11); + v.srr1(26 downto 22) := e_in.msr(26 downto 22); + v.srr1(21 downto 16) := srr1(5 downto 0); + v.srr1(15 downto 0) := e_in.msr(15 downto 0); + else if e_in.write_enable = '1' then w_out.write_reg <= e_in.write_reg; @@ -134,5 +200,37 @@ begin c_out.write_cr_data(31 downto 28) <= cf; end if; end if; + + -- Outputs to fetch1 + f.redirect := e_in.redirect; + f.br_nia := e_in.last_nia; + f.br_last := e_in.br_last; + f.br_taken := e_in.br_taken; + if intr = '1' then + f.redirect := '1'; + f.br_last := '0'; + f.redirect_nia := std_ulogic_vector(to_unsigned(vec, 64)); + f.virt_mode := '0'; + f.priv_mode := '1'; + -- XXX need an interrupt LE bit here, e.g. from LPCR + f.big_endian := '0'; + f.mode_32bit := '0'; + else + if e_in.abs_br = '1' then + f.redirect_nia := e_in.br_offset; + else + f.redirect_nia := std_ulogic_vector(unsigned(e_in.last_nia) + unsigned(e_in.br_offset)); + end if; + -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 + f.virt_mode := e_in.redir_mode(3); + f.priv_mode := e_in.redir_mode(2); + f.big_endian := e_in.redir_mode(1); + f.mode_32bit := e_in.redir_mode(0); + end if; + + f_out <= f; + flush_out <= f_out.redirect; + + rin <= v; end process; end;