diff --git a/common.vhdl b/common.vhdl index bfc0db2..7bf8277 100644 --- a/common.vhdl +++ b/common.vhdl @@ -155,6 +155,7 @@ package common is big_endian : std_ulogic; stop_mark: std_ulogic; sequential: std_ulogic; + predicted : std_ulogic; nia: std_ulogic_vector(63 downto 0); end record; @@ -165,6 +166,7 @@ package common is nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); big_endian: std_ulogic; + next_predicted: std_ulogic; end record; type Decode1ToDecode2Type is record @@ -195,6 +197,7 @@ package common is insn_type: insn_type_t; nia: std_ulogic_vector(63 downto 0); write_reg: gspr_index_t; + write_reg_enable: std_ulogic; read_reg1: gspr_index_t; read_reg2: gspr_index_t; read_data1: std_ulogic_vector(63 downto 0); @@ -210,6 +213,7 @@ package common is rc: std_ulogic; oe: std_ulogic; invert_a: std_ulogic; + addm1 : std_ulogic; invert_out: std_ulogic; input_carry: carry_in_t; output_carry: std_ulogic; @@ -224,18 +228,21 @@ package common is update : std_ulogic; -- is this an update instruction? reserve : std_ulogic; -- set for larx/stcx br_pred : std_ulogic; + result_sel : std_ulogic_vector(2 downto 0); -- select source of result + sub_select : std_ulogic_vector(2 downto 0); -- sub-result selection repeat : std_ulogic; -- set if instruction is cracked into two ops second : std_ulogic; -- set if this is the second op end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, - bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', - bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', + write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', + bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), + result_sel => "000", sub_select => "000", repeat => '0', second => '0', others => (others => '0')); type MultiplyInputType is record @@ -303,10 +310,14 @@ package common is big_endian: std_ulogic; mode_32bit: std_ulogic; redirect_nia: std_ulogic_vector(63 downto 0); + br_nia : std_ulogic_vector(63 downto 0); + br_last : std_ulogic; + br_taken : std_ulogic; end record; constant Execute1ToFetch1Init : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0', - mode_32bit => '0', others => (others => '0')); + mode_32bit => '0', br_taken => '0', + br_last => '0', others => (others => '0')); type Execute1ToLoadstore1Type is record valid : std_ulogic; @@ -365,7 +376,7 @@ package common is virt_mode : std_ulogic; priv_mode : std_ulogic; addr : std_ulogic_vector(63 downto 0); - data : std_ulogic_vector(63 downto 0); + data : std_ulogic_vector(63 downto 0); -- valid the cycle after .valid = 1 byte_sel : std_ulogic_vector(7 downto 0); end record; diff --git a/core.vhdl b/core.vhdl index bc32a8c..3948b86 100644 --- a/core.vhdl +++ b/core.vhdl @@ -12,6 +12,7 @@ entity core is DISABLE_FLATTEN : boolean := false; EX1_BYPASS : boolean := true; HAS_FPU : boolean := true; + HAS_BTC : boolean := true; ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0'); LOG_LENGTH : natural := 512 ); @@ -187,7 +188,8 @@ begin fetch1_0: entity work.fetch1 generic map ( RESET_ADDRESS => (others => '0'), - ALT_RESET_ADDRESS => ALT_RESET_ADDRESS + ALT_RESET_ADDRESS => ALT_RESET_ADDRESS, + HAS_BTC => HAS_BTC ) port map ( clk => clk, @@ -195,6 +197,7 @@ begin alt_reset_in => alt_reset_d, stall_in => fetch1_stall_in, flush_in => fetch1_flush, + inval_btc => ex1_icache_inval or mmu_to_icache.tlbie, stop_in => dbg_core_stop, d_in => decode1_to_fetch1, e_in => execute1_to_fetch1, diff --git a/dcache.vhdl b/dcache.vhdl index 1e58e1f..7da67e1 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -1306,7 +1306,7 @@ begin req.real_addr := ra; -- Force data to 0 for dcbz if r0.req.dcbz = '0' then - req.data := r0.req.data; + req.data := d_in.data; else req.data := (others => '0'); end if; diff --git a/decode1.vhdl b/decode1.vhdl index 086083e..ebe59be 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -31,6 +31,7 @@ end entity decode1; architecture behaviour of decode1 is signal r, rin : Decode1ToDecode2Type; signal s : Decode1ToDecode2Type; + signal f, fin : Decode1ToFetch1Type; constant illegal_inst : decode_rom_t := (NONE, NONE, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE); @@ -47,6 +48,14 @@ architecture behaviour of decode1 is signal ri, ri_in : reg_internal_t; signal si : reg_internal_t; + type br_predictor_t is record + br_nia : std_ulogic_vector(61 downto 0); + br_offset : signed(23 downto 0); + predict : std_ulogic; + end record; + + signal br, br_in : br_predictor_t; + subtype major_opcode_t is unsigned(5 downto 0); type major_rom_array_t is array(0 to 63) of decode_rom_t; type minor_valid_array_t is array(0 to 1023) of std_ulogic; @@ -537,6 +546,13 @@ begin ri <= ri_in; end if; end if; + if rst = '1' then + br.br_nia <= (others => '0'); + br.br_offset <= (others => '0'); + br.predict <= '0'; + else + br <= br_in; + end if; end if; end process; busy_out <= s.valid; @@ -544,14 +560,13 @@ begin decode1_1: process(all) variable v : Decode1ToDecode2Type; variable vi : reg_internal_t; - variable f : Decode1ToFetch1Type; variable majorop : major_opcode_t; variable minor4op : std_ulogic_vector(10 downto 0); variable op_19_bits: std_ulogic_vector(2 downto 0); variable sprn : spr_num_t; - variable br_nia : std_ulogic_vector(61 downto 0); variable br_target : std_ulogic_vector(61 downto 0); variable br_offset : signed(23 downto 0); + variable bv : br_predictor_t; begin v := Decode1ToDecode2Init; vi := reg_internal_t_init; @@ -707,17 +722,22 @@ begin -- Branch predictor -- Note bclr, bcctr and bctar are predicted not taken as we have no -- count cache or link stack. - br_nia := f_in.nia(63 downto 2); + bv.br_nia := f_in.nia(63 downto 2); if f_in.insn(1) = '1' then - br_nia := (others => '0'); + bv.br_nia := (others => '0'); + end if; + bv.br_offset := br_offset; + if f_in.next_predicted = '1' then + v.br_pred := '1'; end if; - br_target := std_ulogic_vector(signed(br_nia) + br_offset); - f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid; - f.redirect_nia := br_target & "00"; + bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted; + -- after a clock edge... + br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset); -- Update registers rin <= v; ri_in <= vi; + br_in <= bv; -- Update outputs d_out <= r; @@ -729,8 +749,9 @@ begin if ri.force_single = '1' then d_out.decode.sgl_pipe <= '1'; end if; - f_out <= f; - flush_out <= f.redirect; + f_out.redirect <= br.predict; + f_out.redirect_nia <= br_target & "00"; + flush_out <= bv.predict or br.predict; end process; d1_log: if LOG_LENGTH > 0 generate diff --git a/decode2.vhdl b/decode2.vhdl index 8b4633a..e00a05d 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -221,6 +221,59 @@ architecture behaviour of decode2 is end case; end; + -- control signals that are derived from insn_type + type mux_select_array_t is array(insn_type_t) of std_ulogic_vector(2 downto 0); + + constant result_select : mux_select_array_t := ( + OP_AND => "001", -- logical_result + OP_OR => "001", + OP_XOR => "001", + OP_POPCNT => "001", + OP_PRTY => "001", + OP_CMPB => "001", + OP_EXTS => "001", + OP_BPERM => "001", + OP_BCD => "001", + OP_MTSPR => "001", + OP_RLC => "010", -- rotator_result + OP_RLCL => "010", + OP_RLCR => "010", + OP_SHL => "010", + OP_SHR => "010", + OP_EXTSWSLI => "010", + OP_MUL_L64 => "011", -- muldiv_result + OP_MUL_H64 => "011", + OP_MUL_H32 => "011", + OP_DIV => "011", + OP_DIVE => "011", + OP_MOD => "011", + OP_CNTZ => "100", -- countzero_result + OP_MFSPR => "101", -- spr_result + OP_ADDG6S => "111", -- misc_result + OP_ISEL => "111", + OP_DARN => "111", + OP_MFMSR => "111", + OP_MFCR => "111", + OP_SETB => "111", + others => "000" -- default to adder_result + ); + + constant subresult_select : mux_select_array_t := ( + OP_MUL_L64 => "000", -- muldiv_result + OP_MUL_H64 => "001", + OP_MUL_H32 => "010", + OP_DIV => "011", + OP_DIVE => "011", + OP_MOD => "011", + OP_ADDG6S => "001", -- misc_result + OP_ISEL => "010", + OP_DARN => "011", + OP_MFMSR => "100", + OP_MFCR => "101", + OP_SETB => "110", + others => "000" + ); + -- issue control signals signal control_valid_in : std_ulogic; signal control_valid_out : std_ulogic; @@ -392,6 +445,7 @@ begin v.e.read_data3 := decoded_reg_c.data; v.e.bypass_data3 := gpr_c_bypass; v.e.write_reg := decoded_reg_o.reg; + v.e.write_reg_enable := decoded_reg_o.reg_valid; v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); @@ -400,6 +454,16 @@ begin v.e.bypass_cr := cr_bypass; v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; + v.e.addm1 := '0'; + if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then + -- add -1 to CTR + v.e.addm1 := '1'; + if d_in.insn(23) = '1' or + (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then + -- don't write decremented CTR if BO(2) = 1 or bcctr + v.e.write_reg_enable := '0'; + end if; + end if; v.e.invert_out := d_in.decode.invert_out; v.e.input_carry := d_in.decode.input_carry; v.e.output_carry := d_in.decode.output_carry; @@ -415,12 +479,14 @@ begin v.e.update := d_in.decode.update; v.e.reserve := d_in.decode.reserve; v.e.br_pred := d_in.br_pred; + v.e.result_sel := result_select(d_in.decode.insn_type); + v.e.sub_select := subresult_select(d_in.decode.insn_type); -- issue control control_valid_in <= d_in.valid; control_sgl_pipe <= d_in.decode.sgl_pipe; - gpr_write_valid <= decoded_reg_o.reg_valid; + gpr_write_valid <= v.e.write_reg_enable; gpr_write <= decoded_reg_o.reg; gpr_bypassable <= '0'; if EX1_BYPASS and d_in.decode.unit = ALU then diff --git a/execute1.vhdl b/execute1.vhdl index 11d81ed..25b1dc7 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -53,7 +53,7 @@ end entity execute1; architecture behaviour of execute1 is type reg_type is record e : Execute1ToWritebackType; - f : Execute1ToFetch1Type; + cur_instr : Decode2ToExecute1Type; busy: std_ulogic; terminate: std_ulogic; fp_exception_next : std_ulogic; @@ -65,21 +65,27 @@ architecture behaviour of execute1 is mul_finish : std_ulogic; div_in_progress : std_ulogic; cntz_in_progress : std_ulogic; - slow_op_insn : insn_type_t; - slow_op_dest : gpr_index_t; - slow_op_rc : std_ulogic; - slow_op_oe : std_ulogic; - slow_op_xerc : xer_common_t; last_nia : std_ulogic_vector(63 downto 0); + redirect : std_ulogic; + abs_br : std_ulogic; + taken_br : std_ulogic; + br_last : std_ulogic; + do_intr : std_ulogic; + vector : integer range 0 to 16#fff#; + br_offset : std_ulogic_vector(63 downto 0); + redir_mode : std_ulogic_vector(3 downto 0); log_addr_spr : std_ulogic_vector(31 downto 0); end record; constant reg_type_init : reg_type := - (e => Execute1ToWritebackInit, f => Execute1ToFetch1Init, + (e => Execute1ToWritebackInit, + cur_instr => Decode2ToExecute1Init, busy => '0', lr_update => '0', terminate => '0', fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', - slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, - next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0')); + next_lr => (others => '0'), last_nia => (others => '0'), + redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0, + br_offset => (others => '0'), redir_mode => "0000", + others => (others => '0')); signal r, rin : reg_type; @@ -95,6 +101,14 @@ architecture behaviour of execute1 is signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); signal countzero_result: std_ulogic_vector(63 downto 0); + signal alu_result: std_ulogic_vector(63 downto 0); + signal adder_result: std_ulogic_vector(63 downto 0); + signal misc_result: std_ulogic_vector(63 downto 0); + signal muldiv_result: std_ulogic_vector(63 downto 0); + signal spr_result: std_ulogic_vector(63 downto 0); + signal result_mux_sel: std_ulogic_vector(2 downto 0); + signal sub_mux_sel: std_ulogic_vector(2 downto 0); + signal current: Decode2ToExecute1Type; -- multiply signals signal x_to_multiply: MultiplyInputType; @@ -277,6 +291,18 @@ begin terminate_out <= r.terminate; + current <= e_in when r.busy = '0' else r.cur_instr; + + -- Result mux + with current.result_sel select alu_result <= + adder_result when "000", + logical_result when "001", + rotator_result when "010", + muldiv_result when "011", + countzero_result when "100", + spr_result when "101", + misc_result when others; + execute1_0: process(clk) begin if rising_edge(clk) then @@ -302,10 +328,14 @@ begin execute1_1: process(all) variable v : reg_type; variable a_inv : std_ulogic_vector(63 downto 0); - variable result : std_ulogic_vector(63 downto 0); + variable b_or_m1 : std_ulogic_vector(63 downto 0); + variable addg6s : std_ulogic_vector(63 downto 0); + variable isel_result : std_ulogic_vector(63 downto 0); + variable darn : std_ulogic_vector(63 downto 0); + variable mfcr_result : std_ulogic_vector(63 downto 0); + variable setb_result : std_ulogic_vector(63 downto 0); variable newcrf : std_ulogic_vector(3 downto 0); variable sum_with_carry : std_ulogic_vector(64 downto 0); - variable result_en : std_ulogic; variable crnum : crnum_t; variable crbit : integer range 0 to 31; variable scrnum : crnum_t; @@ -328,6 +358,8 @@ begin variable zerohi, zerolo : std_ulogic; variable msb_a, msb_b : std_ulogic; variable a_lt : std_ulogic; + variable a_lt_lo : std_ulogic; + variable a_lt_hi : std_ulogic; variable lv : Execute1ToLoadstore1Type; variable irq_valid : std_ulogic; variable exception : std_ulogic; @@ -335,25 +367,37 @@ begin variable trapval : std_ulogic_vector(4 downto 0); variable illegal : std_ulogic; variable is_branch : std_ulogic; + variable is_direct_branch : std_ulogic; variable taken_branch : std_ulogic; variable abs_branch : std_ulogic; variable spr_val : std_ulogic_vector(63 downto 0); variable addend : std_ulogic_vector(127 downto 0); variable do_trace : std_ulogic; + variable hold_wr_data : std_ulogic; + variable f : Execute1ToFetch1Type; variable fv : Execute1ToFPUType; begin - result := (others => '0'); sum_with_carry := (others => '0'); - result_en := '0'; newcrf := (others => '0'); is_branch := '0'; + is_direct_branch := '0'; taken_branch := '0'; abs_branch := '0'; + hold_wr_data := '0'; v := r; v.e := Execute1ToWritebackInit; + v.redirect := '0'; + v.abs_br := '0'; + v.do_intr := '0'; + v.vector := 0; + v.br_offset := (others => '0'); + v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & + not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); + v.taken_br := '0'; + v.br_last := '0'; + lv := Execute1ToLoadstore1Init; - v.f.redirect := '0'; fv := Execute1ToFPUInit; -- XER forwarding. To avoid having to track XER hazards, we use @@ -361,7 +405,7 @@ begin -- (SO, OV[32] and CA[32]) are only modified by instructions that are -- handled here, we can just forward the result being sent to -- writeback. - if r.e.write_xerc_enable = '1' then + if r.e.write_xerc_enable = '1' or r.busy = '1' then v.e.xerc := r.e.xerc; else v.e.xerc := e_in.xerc; @@ -383,14 +427,25 @@ begin v.cntz_in_progress := '0'; v.mul_finish := '0'; + spr_result <= (others => '0'); + spr_val := (others => '0'); + -- Main adder if e_in.invert_a = '0' then a_inv := a_in; else a_inv := not a_in; end if; - sum_with_carry := ppc_adde(a_inv, b_in, + if e_in.addm1 = '0' then + b_or_m1 := b_in; + else + b_or_m1 := (others => '1'); + end if; + sum_with_carry := ppc_adde(a_inv, b_or_m1, decode_input_carry(e_in.input_carry, v.e.xerc)); + adder_result <= sum_with_carry(63 downto 0); + carry_32 := sum_with_carry(32) xor a_inv(32) xor b_in(32); + carry_64 := sum_with_carry(64); -- signals to multiply and divide units sign1 := '0'; @@ -416,6 +471,7 @@ begin abs2 := - signed(b_in); end if; + -- Interface to multiply and divide units x_to_multiply <= MultiplyInputInit; x_to_multiply.is_32bit <= e_in.is_32bit; @@ -463,6 +519,137 @@ begin x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0)); end if; + case current.sub_select(1 downto 0) is + when "00" => + muldiv_result <= multiply_to_x.result(63 downto 0); + when "01" => + muldiv_result <= multiply_to_x.result(127 downto 64); + when "10" => + muldiv_result <= multiply_to_x.result(63 downto 32) & + multiply_to_x.result(63 downto 32); + when others => + muldiv_result <= divider_to_x.write_reg_data; + end case; + + -- Compute misc_result + case current.sub_select is + when "000" => + misc_result <= (others => '0'); + when "001" => + -- addg6s + addg6s := (others => '0'); + for i in 0 to 14 loop + lo := i * 4; + hi := (i + 1) * 4; + if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then + addg6s(lo + 3 downto lo) := "0110"; + end if; + end loop; + if sum_with_carry(64) = '0' then + addg6s(63 downto 60) := "0110"; + end if; + misc_result <= addg6s; + when "010" => + -- isel + crbit := to_integer(unsigned(insn_bc(e_in.insn))); + if cr_in(31-crbit) = '1' then + isel_result := a_in; + else + isel_result := b_in; + end if; + misc_result <= isel_result; + when "011" => + -- darn + darn := (others => '1'); + if random_err = '0' then + case e_in.insn(17 downto 16) is + when "00" => + darn := x"00000000" & random_cond(31 downto 0); + when "10" => + darn := random_raw; + when others => + darn := random_cond; + end case; + end if; + misc_result <= darn; + when "100" => + -- mfmsr + misc_result <= ctrl.msr; + when "101" => + if e_in.insn(20) = '0' then + -- mfcr + mfcr_result := x"00000000" & cr_in; + else + -- mfocrf + crnum := fxm_to_num(insn_fxm(e_in.insn)); + mfcr_result := (others => '0'); + for i in 0 to 7 loop + lo := (7-i)*4; + hi := lo + 3; + if crnum = i then + mfcr_result(hi downto lo) := cr_in(hi downto lo); + end if; + end loop; + end if; + misc_result <= mfcr_result; + when "110" => + -- setb + bfa := insn_bfa(e_in.insn); + crbit := to_integer(unsigned(bfa)) * 4; + setb_result := (others => '0'); + if cr_in(31 - crbit) = '1' then + setb_result := (others => '1'); + elsif cr_in(30 - crbit) = '1' then + setb_result(0) := '1'; + end if; + misc_result <= setb_result; + when others => + misc_result <= (others => '0'); + end case; + + -- compute comparison results + -- Note, we have done RB - RA, not RA - RB + if e_in.insn_type = OP_CMP then + l := insn_l(e_in.insn); + else + l := not e_in.is_32bit; + end if; + zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0))); + zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32))); + if zerolo = '1' and (l = '0' or zerohi = '1') then + -- values are equal + trapval := "00100"; + else + a_lt_lo := '0'; + a_lt_hi := '0'; + if unsigned(a_in(30 downto 0)) < unsigned(b_in(30 downto 0)) then + a_lt_lo := '1'; + end if; + if unsigned(a_in(62 downto 31)) < unsigned(b_in(62 downto 31)) then + a_lt_hi := '1'; + end if; + if l = '1' then + -- 64-bit comparison + msb_a := a_in(63); + msb_b := b_in(63); + a_lt := a_lt_hi or (zerohi and (a_in(31) xnor b_in(31)) and a_lt_lo); + else + -- 32-bit comparison + msb_a := a_in(31); + msb_b := b_in(31); + a_lt := a_lt_lo; + end if; + if msb_a /= msb_b then + -- Comparison is clear from MSB difference. + -- for signed, 0 is greater; for unsigned, 1 is greater + trapval := msb_a & msb_b & '0' & msb_b & msb_a; + else + -- MSBs are equal, so signed and unsigned comparisons give the + -- same answer. + trapval := a_lt & not a_lt & '0' & a_lt & not a_lt; + end if; + end if; + ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); @@ -471,11 +658,11 @@ begin irq_valid := '0'; if ctrl.msr(MSR_EE) = '1' then if ctrl.dec(63) = '1' then - v.f.redirect_nia := std_logic_vector(to_unsigned(16#900#, 64)); + v.vector := 16#900#; report "IRQ valid: DEC"; irq_valid := '1'; elsif ext_irq_in = '1' then - v.f.redirect_nia := std_logic_vector(to_unsigned(16#500#, 64)); + v.vector := 16#500#; report "IRQ valid: External"; irq_valid := '1'; end if; @@ -484,11 +671,6 @@ begin v.terminate := '0'; icache_inval <= '0'; v.busy := '0'; - -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 - v.f.virt_mode := ctrl.msr(MSR_IR); - v.f.priv_mode := not ctrl.msr(MSR_PR); - v.f.big_endian := not ctrl.msr(MSR_LE); - v.f.mode_32bit := not ctrl.msr(MSR_SF); -- Next insn adder used in a couple of places next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4); @@ -520,39 +702,21 @@ begin v.prev_op := e_in.insn_type; end if; - if ctrl.irq_state = WRITE_SRR1 then - v.e.exc_write_reg := fast_spr_num(SPR_SRR1); - v.e.exc_write_data := ctrl.srr1; - v.e.exc_write_enable := '1'; - ctrl_tmp.msr(MSR_SF) <= '1'; - ctrl_tmp.msr(MSR_EE) <= '0'; - ctrl_tmp.msr(MSR_PR) <= '0'; - ctrl_tmp.msr(MSR_SE) <= '0'; - ctrl_tmp.msr(MSR_BE) <= '0'; - ctrl_tmp.msr(MSR_FP) <= '0'; - ctrl_tmp.msr(MSR_FE0) <= '0'; - ctrl_tmp.msr(MSR_FE1) <= '0'; - ctrl_tmp.msr(MSR_IR) <= '0'; - ctrl_tmp.msr(MSR_DR) <= '0'; - ctrl_tmp.msr(MSR_RI) <= '0'; - ctrl_tmp.msr(MSR_LE) <= '1'; - v.e.valid := '1'; - v.trace_next := '0'; - v.fp_exception_next := '0'; - report "Writing SRR1: " & to_hstring(ctrl.srr1); - - elsif valid_in = '1' and e_in.second = '0' and - ((HAS_FPU and r.fp_exception_next = '1') or r.trace_next = '1') then + -- Determine if there is any exception to be taken + -- before/instead of executing this instruction + if valid_in = '1' and e_in.second = '0' then if HAS_FPU and r.fp_exception_next = '1' then -- This is used for FP-type program interrupts that -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + exception := '1'; + v.vector := 16#700#; ctrl_tmp.srr1(63 - 43) <= '1'; ctrl_tmp.srr1(63 - 47) <= '1'; - else + elsif r.trace_next = '1' then -- Generate a trace interrupt rather than executing the next instruction -- or taking any asynchronous interrupt - v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64)); + exception := '1'; + v.vector := 16#d00#; ctrl_tmp.srr1(63 - 33) <= '1'; if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then @@ -560,46 +724,38 @@ begin elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then ctrl_tmp.srr1(63 - 36) <= '1'; end if; - end if; - exception := '1'; - elsif irq_valid = '1' and valid_in = '1' and e_in.second = '0' then - -- we need two cycles to write srr0 and 1 - -- will need more when we have to write HEIR - -- Don't deliver the interrupt until we have a valid instruction - -- coming in, so we have a valid NIA to put in SRR0. - exception := '1'; - - elsif valid_in = '1' and ctrl.msr(MSR_PR) = '1' and - instr_is_privileged(e_in.insn_type, e_in.insn) then - -- generate a program interrupt - exception := '1'; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); - -- set bit 45 to indicate privileged instruction type interrupt - ctrl_tmp.srr1(63 - 45) <= '1'; - report "privileged instruction"; - - elsif not HAS_FPU and valid_in = '1' and e_in.fac = FPU then - -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations - illegal := '1'; - - elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then - -- generate a floating-point unavailable interrupt - exception := '1'; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#800#, 64)); - report "FP unavailable interrupt"; + elsif irq_valid = '1' then + -- Don't deliver the interrupt until we have a valid instruction + -- coming in, so we have a valid NIA to put in SRR0. + exception := '1'; + + elsif ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then + -- generate a program interrupt + exception := '1'; + v.vector := 16#700#; + -- set bit 45 to indicate privileged instruction type interrupt + ctrl_tmp.srr1(63 - 45) <= '1'; + report "privileged instruction"; + + elsif not HAS_FPU and e_in.fac = FPU then + -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations + illegal := '1'; - elsif valid_in = '1' and e_in.unit = ALU then + elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then + -- generate a floating-point unavailable interrupt + exception := '1'; + v.vector := 16#800#; + report "FP unavailable interrupt"; + end if; + end if; + if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then report "execute nia " & to_hstring(e_in.nia); + v.cur_instr := e_in; + v.next_lr := next_nia; v.e.valid := '1'; - v.e.write_reg := e_in.write_reg; - v.slow_op_insn := e_in.insn_type; - v.slow_op_dest := gspr_to_gpr(e_in.write_reg); - v.slow_op_rc := e_in.rc; - v.slow_op_oe := e_in.oe; - v.slow_op_xerc := v.e.xerc; case_0: case e_in.insn_type is @@ -614,7 +770,7 @@ begin if e_in.insn(1) = '1' then exception := '1'; exception_nextpc := '1'; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#C00#, 64)); + v.vector := 16#C00#; report "sc"; else illegal := '1'; @@ -630,101 +786,48 @@ begin end if; when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT => -- Do nothing - when OP_ADD | OP_CMP | OP_TRAP => - result := sum_with_carry(63 downto 0); - carry_32 := result(32) xor a_inv(32) xor b_in(32); - carry_64 := sum_with_carry(64); - if e_in.insn_type = OP_ADD then - if e_in.output_carry = '1' then - if e_in.input_carry /= OV then - set_carry(v.e, carry_32, carry_64); - else - v.e.xerc.ov := carry_64; - v.e.xerc.ov32 := carry_32; - v.e.write_xerc_enable := '1'; - end if; - end if; - if e_in.oe = '1' then - set_ov(v.e, - calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)), - calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31))); - end if; - result_en := '1'; - else - -- trap, CMP and CMPL instructions - -- Note, we have done RB - RA, not RA - RB - if e_in.insn_type = OP_CMP then - l := insn_l(e_in.insn); - else - l := not e_in.is_32bit; - end if; - zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0))); - zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32))); - if zerolo = '1' and (l = '0' or zerohi = '1') then - -- values are equal - trapval := "00100"; + when OP_ADD => + if e_in.output_carry = '1' then + if e_in.input_carry /= OV then + set_carry(v.e, carry_32, carry_64); else - if l = '1' then - -- 64-bit comparison - msb_a := a_in(63); - msb_b := b_in(63); - else - -- 32-bit comparison - msb_a := a_in(31); - msb_b := b_in(31); - end if; - if msb_a /= msb_b then - -- Subtraction might overflow, but - -- comparison is clear from MSB difference. - -- for signed, 0 is greater; for unsigned, 1 is greater - trapval := msb_a & msb_b & '0' & msb_b & msb_a; - else - -- Subtraction cannot overflow since MSBs are equal. - -- carry = 1 indicates RA is smaller (signed or unsigned) - a_lt := (not l and carry_32) or (l and carry_64); - trapval := a_lt & not a_lt & '0' & a_lt & not a_lt; - end if; - end if; - if e_in.insn_type = OP_CMP then - if e_in.is_signed = '1' then - newcrf := trapval(4 downto 2) & v.e.xerc.so; - else - newcrf := trapval(1 downto 0) & trapval(2) & v.e.xerc.so; - end if; - bf := insn_bf(e_in.insn); - crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; - v.e.write_cr_mask := num_to_fxm(crnum); - for i in 0 to 7 loop - lo := i*4; - hi := lo + 3; - v.e.write_cr_data(hi downto lo) := newcrf; - end loop; - else - -- trap instructions (tw, twi, td, tdi) - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); - -- set bit 46 to say trap occurred - ctrl_tmp.srr1(63 - 46) <= '1'; - if or (trapval and insn_to(e_in.insn)) = '1' then - -- generate trap-type program interrupt - exception := '1'; - report "trap"; - end if; + v.e.xerc.ov := carry_64; + v.e.xerc.ov32 := carry_32; + v.e.write_xerc_enable := '1'; end if; end if; - when OP_ADDG6S => - result := (others => '0'); - for i in 0 to 14 loop - lo := i * 4; - hi := (i + 1) * 4; - if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then - result(lo + 3 downto lo) := "0110"; - end if; + if e_in.oe = '1' then + set_ov(v.e, + calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)), + calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31))); + end if; + when OP_CMP => + -- CMP and CMPL instructions + if e_in.is_signed = '1' then + newcrf := trapval(4 downto 2) & v.e.xerc.so; + else + newcrf := trapval(1 downto 0) & trapval(2) & v.e.xerc.so; + end if; + bf := insn_bf(e_in.insn); + crnum := to_integer(unsigned(bf)); + v.e.write_cr_enable := '1'; + v.e.write_cr_mask := num_to_fxm(crnum); + for i in 0 to 7 loop + lo := i*4; + hi := lo + 3; + v.e.write_cr_data(hi downto lo) := newcrf; end loop; - if sum_with_carry(64) = '0' then - result(63 downto 60) := "0110"; + when OP_TRAP => + -- trap instructions (tw, twi, td, tdi) + v.vector := 16#700#; + -- set bit 46 to say trap occurred + ctrl_tmp.srr1(63 - 46) <= '1'; + if or (trapval and insn_to(e_in.insn)) = '1' then + -- generate trap-type program interrupt + exception := '1'; + report "trap"; end if; - result_en := '1'; + when OP_ADDG6S => when OP_CMPRB => newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); bf := insn_bf(e_in.insn); @@ -743,11 +846,10 @@ begin newcrf & newcrf & newcrf & newcrf; when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM | OP_BCD => - result := logical_result; - result_en := '1'; when OP_B => is_branch := '1'; taken_branch := '1'; + is_direct_branch := '1'; abs_branch := insn_aa(e_in.insn); if ctrl.msr(MSR_BE) = '1' then do_trace := '1'; @@ -756,12 +858,8 @@ begin -- read_data1 is CTR bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - if bo(4-2) = '0' then - result := std_ulogic_vector(unsigned(a_in) - 1); - result_en := '1'; - v.e.write_reg := fast_spr_num(SPR_CTR); - end if; is_branch := '1'; + is_direct_branch := '1'; taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); abs_branch := insn_aa(e_in.insn); if ctrl.msr(MSR_BE) = '1' then @@ -772,11 +870,6 @@ begin -- read_data2 is target register (CTR, LR or TAR) bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - if bo(4-2) = '0' and e_in.insn(10) = '0' then - result := std_ulogic_vector(unsigned(a_in) - 1); - result_en := '1'; - v.e.write_reg := fast_spr_num(SPR_CTR); - end if; is_branch := '1'; taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); abs_branch := '1'; @@ -785,10 +878,8 @@ begin end if; when OP_RFID => - v.f.virt_mode := a_in(MSR_IR) or a_in(MSR_PR); - v.f.priv_mode := not a_in(MSR_PR); - v.f.big_endian := not a_in(MSR_LE); - v.f.mode_32bit := not a_in(MSR_SF); + v.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not a_in(MSR_PR) & + not a_in(MSR_LE) & not a_in(MSR_SF); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31); @@ -814,13 +905,6 @@ begin v.cntz_in_progress := '1'; v.busy := '1'; when OP_ISEL => - crbit := to_integer(unsigned(insn_bc(e_in.insn))); - if cr_in(31-crbit) = '1' then - result := a_in; - else - result := b_in; - end if; - result_en := '1'; when OP_CROP => cr_op := insn_cr(e_in.insn); report "CR OP " & to_hstring(cr_op); @@ -873,41 +957,25 @@ begin v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; when OP_DARN => - if random_err = '0' then - case e_in.insn(17 downto 16) is - when "00" => - result := x"00000000" & random_cond(31 downto 0); - when "10" => - result := random_raw; - when others => - result := random_cond; - end case; - else - result := (others => '1'); - end if; - result_en := '1'; when OP_MFMSR => - result := ctrl.msr; - result_en := '1'; when OP_MFSPR => report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(a_in); - result_en := '1'; - if is_fast_spr(e_in.read_reg1) then - result := a_in; - if decode_spr_num(e_in.insn) = SPR_XER then + if is_fast_spr(e_in.read_reg1) = '1' then + spr_val := a_in; + if decode_spr_num(e_in.insn) = SPR_XER then -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer - result(63 downto 32) := (others => '0'); - result(63-32) := v.e.xerc.so; - result(63-33) := v.e.xerc.ov; - result(63-34) := v.e.xerc.ca; - result(63-35 downto 63-43) := "000000000"; - result(63-44) := v.e.xerc.ov32; - result(63-45) := v.e.xerc.ca32; - end if; + spr_val(63 downto 32) := (others => '0'); + spr_val(63-32) := v.e.xerc.so; + spr_val(63-33) := v.e.xerc.ov; + spr_val(63-34) := v.e.xerc.ca; + spr_val(63-35 downto 63-43) := "000000000"; + spr_val(63-44) := v.e.xerc.ov32; + spr_val(63-45) := v.e.xerc.ca32; + end if; else spr_val := c_in; - case decode_spr_num(e_in.insn) is + case decode_spr_num(e_in.insn) is when SPR_TB => spr_val := ctrl.tb; when SPR_TBU => @@ -928,29 +996,14 @@ begin when others => -- mfspr from unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode - if ctrl.msr(MSR_PR) = '1' then + if is_fast_spr(e_in.read_reg1) = '0' and ctrl.msr(MSR_PR) = '1' then illegal := '1'; end if; - end case; - result := spr_val; - end if; + end case; + end if; + spr_result <= spr_val; + when OP_MFCR => - if e_in.insn(20) = '0' then - -- mfcr - result := x"00000000" & cr_in; - else - -- mfocrf - crnum := fxm_to_num(insn_fxm(e_in.insn)); - result := (others => '0'); - for i in 0 to 7 loop - lo := (7-i)*4; - hi := lo + 3; - if crnum = i then - result(hi downto lo) := cr_in(hi downto lo); - end if; - end loop; - end if; - result_en := '1'; when OP_MTCRF => v.e.write_cr_enable := '1'; if e_in.insn(20) = '0' then @@ -990,8 +1043,6 @@ begin report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(c_in); if is_fast_spr(e_in.write_reg) then - result := c_in; - result_en := '1'; if decode_spr_num(e_in.insn) = SPR_XER then v.e.xerc.so := c_in(63-32); v.e.xerc.ov := c_in(63-33); @@ -1016,24 +1067,14 @@ begin end case; end if; when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI => - result := rotator_result; if e_in.output_carry = '1' then set_carry(v.e, rotator_carry, rotator_carry); end if; - result_en := '1'; when OP_SETB => - bfa := insn_bfa(e_in.insn); - crbit := to_integer(unsigned(bfa)) * 4; - result := (others => '0'); - if cr_in(31 - crbit) = '1' then - result := (others => '1'); - elsif cr_in(30 - crbit) = '1' then - result(0) := '1'; - end if; when OP_ISYNC => - v.f.redirect := '1'; - v.f.redirect_nia := next_nia; + v.redirect := '1'; + v.br_offset := std_ulogic_vector(to_unsigned(4, 64)); when OP_ICBI => icache_inval <= '1'; @@ -1055,47 +1096,25 @@ begin report "illegal"; end case; - v.e.rc := e_in.rc and valid_in; - -- Mispredicted branches cause a redirect if is_branch = '1' then if taken_branch = '1' then ctrl_tmp.cfar <= e_in.nia; end if; - if e_in.br_pred = '0' then - if abs_branch = '1' then - v.f.redirect_nia := b_in; - else - v.f.redirect_nia := std_ulogic_vector(signed(e_in.nia) + signed(b_in)); - end if; + if taken_branch = '1' then + v.br_offset := b_in; + v.abs_br := abs_branch; else - v.f.redirect_nia := next_nia; + v.br_offset := std_ulogic_vector(to_unsigned(4, 64)); end if; if taken_branch /= e_in.br_pred then - v.f.redirect := '1'; + v.redirect := '1'; end if; + v.br_last := is_direct_branch; + v.taken_br := taken_branch; end if; - -- Update LR on the next cycle after a branch link - -- If we're not writing back anything else, we can write back LR - -- this cycle, otherwise we take an extra cycle. We use the - -- exc_write path since next_nia is written through that path - -- in other places. - if e_in.lr = '1' then - if result_en = '0' then - v.e.exc_write_enable := '1'; - v.e.exc_write_data := next_nia; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - else - v.lr_update := '1'; - v.next_lr := next_nia; - v.e.valid := '0'; - report "Delayed LR update to " & to_hstring(next_nia); - v.busy := '1'; - end if; - end if; - - elsif valid_in = '1' then + elsif valid_in = '1' and exception = '0' and illegal = '0' then -- instruction for other units, i.e. LDST if e_in.unit = LDST then lv.valid := '1'; @@ -1114,61 +1133,52 @@ begin -- valid_in = 0. Hence they don't happen in the same cycle as any of -- the cases above which depend on valid_in = 1. - if r.f.redirect = '1' then - v.e.valid := '1'; - end if; - if r.lr_update = '1' then + if ctrl.irq_state = WRITE_SRR1 then + v.e.exc_write_reg := fast_spr_num(SPR_SRR1); + v.e.exc_write_data := ctrl.srr1; v.e.exc_write_enable := '1'; - v.e.exc_write_data := r.next_lr; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - v.e.valid := '1'; - -- Keep r.e.write_data unchanged next cycle in case it is needed - -- for a forwarded result (e.g. for CTR). - result := r.e.write_data; + ctrl_tmp.msr(MSR_SF) <= '1'; + ctrl_tmp.msr(MSR_EE) <= '0'; + ctrl_tmp.msr(MSR_PR) <= '0'; + ctrl_tmp.msr(MSR_SE) <= '0'; + ctrl_tmp.msr(MSR_BE) <= '0'; + ctrl_tmp.msr(MSR_FP) <= '0'; + ctrl_tmp.msr(MSR_FE0) <= '0'; + ctrl_tmp.msr(MSR_FE1) <= '0'; + ctrl_tmp.msr(MSR_IR) <= '0'; + ctrl_tmp.msr(MSR_DR) <= '0'; + ctrl_tmp.msr(MSR_RI) <= '0'; + ctrl_tmp.msr(MSR_LE) <= '1'; + v.trace_next := '0'; + v.fp_exception_next := '0'; + report "Writing SRR1: " & to_hstring(ctrl.srr1); + elsif r.cntz_in_progress = '1' then -- cnt[lt]z always takes two cycles - result := countzero_result; - result_en := '1'; - v.e.write_reg := gpr_to_gspr(r.slow_op_dest); - v.e.rc := r.slow_op_rc; - v.e.xerc := r.slow_op_xerc; v.e.valid := '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or (r.div_in_progress = '1' and divider_to_x.valid = '1') then if r.mul_in_progress = '1' then overflow := '0'; - case r.slow_op_insn is - when OP_MUL_H32 => - result := multiply_to_x.result(63 downto 32) & - multiply_to_x.result(63 downto 32); - when OP_MUL_H64 => - result := multiply_to_x.result(127 downto 64); - when others => - -- i.e. OP_MUL_L64 - result := multiply_to_x.result(63 downto 0); - end case; else - result := divider_to_x.write_reg_data; overflow := divider_to_x.overflow; end if; - if r.mul_in_progress = '1' and r.slow_op_oe = '1' then + if r.mul_in_progress = '1' and current.oe = '1' then -- have to wait until next cycle for overflow indication v.mul_finish := '1'; v.busy := '1'; else - result_en := '1'; - v.e.write_reg := gpr_to_gspr(r.slow_op_dest); - v.e.rc := r.slow_op_rc; - v.e.xerc := r.slow_op_xerc; - v.e.write_xerc_enable := r.slow_op_oe; + v.e.write_xerc_enable := current.oe; -- We must test oe because the RC update code in writeback -- will use the xerc value to set CR0:SO so we must not clobber -- xerc if OE wasn't set. - if r.slow_op_oe = '1' then + if current.oe = '1' then v.e.xerc.ov := overflow; v.e.xerc.ov32 := overflow; - v.e.xerc.so := r.slow_op_xerc.so or overflow; + if overflow = '1' then + v.e.xerc.so := '1'; + end if; end if; v.e.valid := '1'; end if; @@ -1178,31 +1188,34 @@ begin v.div_in_progress := r.div_in_progress; end if; elsif r.mul_finish = '1' then - result := r.e.write_data; - result_en := '1'; - v.e.write_reg := gpr_to_gspr(r.slow_op_dest); - v.e.rc := r.slow_op_rc; - v.e.xerc := r.slow_op_xerc; - v.e.write_xerc_enable := r.slow_op_oe; + hold_wr_data := '1'; + v.e.write_xerc_enable := current.oe; v.e.xerc.ov := multiply_to_x.overflow; v.e.xerc.ov32 := multiply_to_x.overflow; - v.e.xerc.so := r.slow_op_xerc.so or multiply_to_x.overflow; + if multiply_to_x.overflow = '1' then + v.e.xerc.so := '1'; + end if; v.e.valid := '1'; end if; + -- When doing delayed LR update, keep r.e.write_data unchanged + -- next cycle in case it is needed for a forwarded result (e.g. CTR). + if r.lr_update = '1' then + hold_wr_data := '1'; + end if; -- Generate FP-type program interrupt. fp_in.interrupt will only -- be set during the execution of a FP instruction. -- The case where MSR[FE0,FE1] goes from zero to non-zero is -- handled above by mtmsrd and rfid setting v.fp_exception_next. if HAS_FPU and fp_in.interrupt = '1' then - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + v.vector := 16#700#; ctrl_tmp.srr1(63 - 43) <= '1'; exception := '1'; end if; if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then exception := '1'; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + v.vector := 16#700#; -- Since we aren't doing Hypervisor emulation assist (0xe40) we -- set bit 44 to indicate we have an illegal ctrl_tmp.srr1(63 - 44) <= '1'; @@ -1215,23 +1228,16 @@ begin end if; end if; - if do_trace = '1' then - v.trace_next := '1'; - end if; - - v.e.write_data := result; - v.e.write_enable := result_en and not exception; - -- generate DSI or DSegI for load/store exceptions -- or ISI or ISegI for instruction fetch exceptions if l_in.exception = '1' then if l_in.alignment = '1' then - v.f.redirect_nia := std_logic_vector(to_unsigned(16#600#, 64)); + v.vector := 16#600#; elsif l_in.instr_fault = '0' then if l_in.segment_fault = '0' then - v.f.redirect_nia := std_logic_vector(to_unsigned(16#300#, 64)); + v.vector := 16#300#; else - v.f.redirect_nia := std_logic_vector(to_unsigned(16#380#, 64)); + v.vector := 16#380#; end if; else if l_in.segment_fault = '0' then @@ -1239,9 +1245,9 @@ begin ctrl_tmp.srr1(63 - 35) <= l_in.perm_error; -- noexec fault ctrl_tmp.srr1(63 - 44) <= l_in.badtree; ctrl_tmp.srr1(63 - 45) <= l_in.rc_error; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#400#, 64)); + v.vector := 16#400#; else - v.f.redirect_nia := std_logic_vector(to_unsigned(16#480#, 64)); + v.vector := 16#480#; end if; end if; v.e.exc_write_enable := '1'; @@ -1251,18 +1257,81 @@ begin if exception = '1' or l_in.exception = '1' then ctrl_tmp.irq_state <= WRITE_SRR1; - v.f.redirect := '1'; - v.f.virt_mode := '0'; - v.f.priv_mode := '1'; - -- XXX need an interrupt LE bit here, e.g. from LPCR - v.f.big_endian := '0'; - v.f.mode_32bit := '0'; + v.redirect := '1'; + v.do_intr := '1'; + end if; + + if do_trace = '1' then + v.trace_next := '1'; + end if; + + if hold_wr_data = '0' then + v.e.write_data := alu_result; + else + v.e.write_data := r.e.write_data; + end if; + v.e.write_reg := current.write_reg; + v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; + v.e.rc := current.rc and v.e.valid and not exception; + + -- Update LR on the next cycle after a branch link + -- If we're not writing back anything else, we can write back LR + -- this cycle, otherwise we take an extra cycle. We use the + -- exc_write path since next_nia is written through that path + -- in other places. + if v.e.valid = '1' and exception = '0' and current.lr = '1' then + if current.write_reg_enable = '0' then + v.e.exc_write_enable := '1'; + v.e.exc_write_data := next_nia; + v.e.exc_write_reg := fast_spr_num(SPR_LR); + else + v.lr_update := '1'; + v.e.valid := '0'; + report "Delayed LR update to " & to_hstring(next_nia); + v.busy := '1'; + end if; + end if; + if r.lr_update = '1' then + v.e.exc_write_enable := '1'; + v.e.exc_write_data := r.next_lr; + v.e.exc_write_reg := fast_spr_num(SPR_LR); + v.e.valid := '1'; end if; - if v.f.redirect = '1' then + -- Defer completion for one cycle when redirecting. + -- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1 + if v.redirect = '1' then v.busy := '1'; v.e.valid := '0'; end if; + if r.redirect = '1' then + v.e.valid := '1'; + end if; + + -- Outputs to fetch1 + f.redirect := r.redirect; + f.br_nia := r.last_nia; + f.br_last := r.br_last and not r.do_intr; + f.br_taken := r.taken_br; + if r.do_intr = '1' then + f.redirect_nia := std_ulogic_vector(to_unsigned(r.vector, 64)); + f.virt_mode := '0'; + f.priv_mode := '1'; + -- XXX need an interrupt LE bit here, e.g. from LPCR + f.big_endian := '0'; + f.mode_32bit := '0'; + else + if r.abs_br = '1' then + f.redirect_nia := r.br_offset; + else + f.redirect_nia := std_ulogic_vector(unsigned(r.last_nia) + unsigned(r.br_offset)); + end if; + -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 + f.virt_mode := r.redir_mode(3); + f.priv_mode := r.redir_mode(2); + f.big_endian := r.redir_mode(1); + f.mode_32bit := r.redir_mode(0); + end if; -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; @@ -1309,7 +1378,7 @@ begin rin <= v; -- update outputs - f_out <= r.f; + f_out <= f; l_out <= lv; e_out <= r.e; fp_out <= fv; diff --git a/fetch1.vhdl b/fetch1.vhdl index 3c9d946..8ca7e57 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -8,7 +8,8 @@ use work.common.all; entity fetch1 is generic( RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0'); - ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0') + ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0'); + HAS_BTC : boolean := true ); port( clk : in std_ulogic; @@ -17,6 +18,7 @@ entity fetch1 is -- Control inputs: stall_in : in std_ulogic; flush_in : in std_ulogic; + inval_btc : in std_ulogic; stop_in : in std_ulogic; alt_reset_in : in std_ulogic; @@ -37,10 +39,25 @@ end entity fetch1; architecture behaviour of fetch1 is type reg_internal_t is record mode_32bit: std_ulogic; + rd_is_niap4: std_ulogic; + predicted: std_ulogic; + predicted_nia: std_ulogic_vector(63 downto 0); end record; signal r, r_next : Fetch1ToIcacheType; signal r_int, r_next_int : reg_internal_t; + signal advance_nia : std_ulogic; signal log_nia : std_ulogic_vector(42 downto 0); + + constant BTC_ADDR_BITS : integer := 10; + constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS; + constant BTC_TARGET_BITS : integer := 62; + constant BTC_SIZE : integer := 2 ** BTC_ADDR_BITS; + constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS; + type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0); + + signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0'); + signal btc_rd_valid : std_ulogic := '0'; + begin regs : process(clk) @@ -56,15 +73,70 @@ begin " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & - " nia:" & to_hstring(r_next.nia) & - " SM:" & std_ulogic'image(r_next.stop_mark); + " nia:" & to_hstring(r_next.nia); end if; - r <= r_next; - r_int <= r_next_int; + if rst = '1' or e_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then + r.virt_mode <= r_next.virt_mode; + r.priv_mode <= r_next.priv_mode; + r.big_endian <= r_next.big_endian; + r_int.mode_32bit <= r_next_int.mode_32bit; + end if; + if advance_nia = '1' then + r.predicted <= r_next.predicted; + r.nia <= r_next.nia; + r_int.predicted <= r_next_int.predicted; + r_int.predicted_nia <= r_next_int.predicted_nia; + r_int.rd_is_niap4 <= r_next.sequential; + end if; + r.sequential <= r_next.sequential and advance_nia; + -- always send the up-to-date stop mark and req + r.stop_mark <= stop_in; + r.req <= not rst; end if; end process; log_out <= log_nia; + btc : if HAS_BTC generate + signal btc_memory : btc_mem_type; + attribute ram_style : string; + attribute ram_style of btc_memory : signal is "block"; + + signal btc_valids : std_ulogic_vector(BTC_SIZE - 1 downto 0); + attribute ram_style of btc_valids : signal is "distributed"; + + signal btc_wr : std_ulogic; + signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0); + signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0); + signal btc_wr_v : std_ulogic; + begin + btc_wr_data <= e_in.br_nia(63 downto BTC_ADDR_BITS + 2) & + e_in.redirect_nia(63 downto 2); + btc_wr_addr <= e_in.br_nia(BTC_ADDR_BITS + 1 downto 2); + btc_wr <= e_in.br_last; + btc_wr_v <= e_in.br_taken; + + btc_ram : process(clk) + variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0); + begin + if rising_edge(clk) then + raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) + + to_unsigned(2, BTC_ADDR_BITS); + if advance_nia = '1' then + btc_rd_data <= btc_memory(to_integer(raddr)); + btc_rd_valid <= btc_valids(to_integer(raddr)); + end if; + if btc_wr = '1' then + btc_memory(to_integer(unsigned(btc_wr_addr))) <= btc_wr_data; + end if; + if inval_btc = '1' or rst = '1' then + btc_valids <= (others => '0'); + elsif btc_wr = '1' then + btc_valids(to_integer(unsigned(btc_wr_addr))) <= btc_wr_v; + end if; + end if; + end process; + end generate; + comb : process(all) variable v : Fetch1ToIcacheType; variable v_int : reg_internal_t; @@ -72,6 +144,8 @@ begin v := r; v_int := r_int; v.sequential := '0'; + v.predicted := '0'; + v_int.predicted := '0'; if rst = '1' then if alt_reset_in = '1' then @@ -83,6 +157,7 @@ begin v.priv_mode := '1'; v.big_endian := '0'; v_int.mode_32bit := '0'; + v_int.predicted_nia := (others => '0'); elsif e_in.redirect = '1' then v.nia := e_in.redirect_nia(63 downto 2) & "00"; if e_in.mode_32bit = '1' then @@ -97,22 +172,26 @@ begin if r_int.mode_32bit = '1' then v.nia(63 downto 32) := (others => '0'); end if; - elsif stall_in = '0' then - - -- If the last NIA value went down with a stop mark, it didn't get - -- executed, and hence we shouldn't increment NIA. - if r.stop_mark = '0' then - if r_int.mode_32bit = '0' then - v.nia := std_ulogic_vector(unsigned(r.nia) + 4); - else - v.nia := x"00000000" & std_ulogic_vector(unsigned(r.nia(31 downto 0)) + 4); - end if; - v.sequential := '1'; - end if; - end if; + elsif r_int.predicted = '1' then + v.nia := r_int.predicted_nia; + v.predicted := '1'; + else + v.sequential := '1'; + v.nia := std_ulogic_vector(unsigned(r.nia) + 4); + if r_int.mode_32bit = '1' then + v.nia(63 downto 32) := x"00000000"; + end if; + if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and + btc_rd_data(BTC_WIDTH - 1 downto BTC_TARGET_BITS) + = v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then + v_int.predicted := '1'; + end if; + end if; + v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00"; - v.req := not rst and not stop_in; - v.stop_mark := stop_in; + -- If the last NIA value went down with a stop mark, it didn't get + -- executed, and hence we shouldn't increment NIA. + advance_nia <= rst or e_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in); r_next <= v; r_next_int <= v_int; diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index 8a3dc7a..68d1e89 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -15,6 +15,7 @@ entity toplevel is RESET_LOW : boolean := true; CLK_FREQUENCY : positive := 100000000; HAS_FPU : boolean := true; + HAS_BTC : boolean := true; USE_LITEDRAM : boolean := false; NO_BRAM : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; @@ -170,6 +171,7 @@ begin SIM => false, CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, + HAS_BTC => HAS_BTC, HAS_DRAM => USE_LITEDRAM, DRAM_SIZE => 256 * 1024 * 1024, DRAM_INIT_SIZE => PAYLOAD_SIZE, diff --git a/fpga/top-generic.vhdl b/fpga/top-generic.vhdl index d5219ff..8bff5bb 100644 --- a/fpga/top-generic.vhdl +++ b/fpga/top-generic.vhdl @@ -12,6 +12,7 @@ entity toplevel is CLK_INPUT : positive := 100000000; CLK_FREQUENCY : positive := 100000000; HAS_FPU : boolean := true; + HAS_BTC : boolean := false; LOG_LENGTH : natural := 512; DISABLE_FLATTEN_CORE : boolean := false; UART_IS_16550 : boolean := true @@ -71,6 +72,7 @@ begin SIM => false, CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, + HAS_BTC => HAS_BTC, LOG_LENGTH => LOG_LENGTH, DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE, UART0_IS_16550 => UART_IS_16550 diff --git a/fpga/top-nexys-video.vhdl b/fpga/top-nexys-video.vhdl index 1942b10..86bdd11 100644 --- a/fpga/top-nexys-video.vhdl +++ b/fpga/top-nexys-video.vhdl @@ -15,6 +15,7 @@ entity toplevel is RESET_LOW : boolean := true; CLK_FREQUENCY : positive := 100000000; HAS_FPU : boolean := true; + HAS_BTC : boolean := true; USE_LITEDRAM : boolean := false; NO_BRAM : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; @@ -122,6 +123,7 @@ begin SIM => false, CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, + HAS_BTC => HAS_BTC, HAS_DRAM => USE_LITEDRAM, DRAM_SIZE => 512 * 1024 * 1024, DRAM_INIT_SIZE => PAYLOAD_SIZE, diff --git a/icache.vhdl b/icache.vhdl index 37a230d..a658783 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -565,6 +565,7 @@ begin i_out.stop_mark <= r.hit_smark; i_out.fetch_failed <= r.fetch_failed; i_out.big_endian <= r.big_endian; + i_out.next_predicted <= i_in.predicted; -- Stall fetch1 if we have a miss on cache or TLB or a protection fault stall_out <= not (is_hit and access_ok); diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 33c8694..b83eed6 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -45,7 +45,6 @@ architecture behave of loadstore1 is -- State machine for unaligned loads/stores type state_t is (IDLE, -- ready for instruction - FPR_CONV, -- converting double to float for store SECOND_REQ, -- send 2nd request of unaligned xfer ACK_WAIT, -- waiting for ack from dcache MMU_LOOKUP, -- waiting for MMU to look up translation @@ -54,18 +53,23 @@ architecture behave of loadstore1 is COMPLETE -- extra cycle to complete an operation ); + type byte_index_t is array(0 to 7) of unsigned(2 downto 0); + subtype byte_trim_t is std_ulogic_vector(1 downto 0); + type trim_ctl_t is array(0 to 7) of byte_trim_t; + type reg_stage_t is record -- latch most of the input request load : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; - mfspr : std_ulogic; addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); load_data : std_ulogic_vector(63 downto 0); write_reg : gspr_index_t; length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; + byte_offset : unsigned(2 downto 0); + brev_mask : unsigned(2 downto 0); sign_extend : std_ulogic; update : std_ulogic; update_reg : gpr_index_t; @@ -93,17 +97,16 @@ architecture behave of loadstore1 is do_update : std_ulogic; extra_cycle : std_ulogic; mode_32bit : std_ulogic; + byte_index : byte_index_t; + use_second : std_ulogic_vector(7 downto 0); + trim_ctl : trim_ctl_t; load_sp : std_ulogic; ld_sp_data : std_ulogic_vector(31 downto 0); ld_sp_nz : std_ulogic; ld_sp_lz : std_ulogic_vector(5 downto 0); - st_sp_data : std_ulogic_vector(31 downto 0); + wr_sel : std_ulogic_vector(1 downto 0); end record; - type byte_sel_t is array(0 to 7) of std_ulogic; - subtype byte_trim_t is std_ulogic_vector(1 downto 0); - type trim_ctl_t is array(0 to 7) of byte_trim_t; - signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); @@ -296,11 +299,8 @@ begin variable data_permuted : std_ulogic_vector(63 downto 0); variable data_trimmed : std_ulogic_vector(63 downto 0); variable store_data : std_ulogic_vector(63 downto 0); - variable data_in : std_ulogic_vector(63 downto 0); variable byte_rev : std_ulogic; variable length : std_ulogic_vector(3 downto 0); - variable use_second : byte_sel_t; - variable trim_ctl : trim_ctl_t; variable negative : std_ulogic; variable sprn : std_ulogic_vector(9 downto 0); variable exception : std_ulogic; @@ -310,37 +310,25 @@ begin variable mmu_mtspr : std_ulogic; variable itlb_fault : std_ulogic; variable misaligned : std_ulogic; - variable fp_reg_conv : std_ulogic; - variable lfs_done : std_ulogic; begin v := r; req := '0'; - v.mfspr := '0'; mmu_mtspr := '0'; itlb_fault := '0'; sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); dsisr := (others => '0'); mmureq := '0'; - fp_reg_conv := '0'; + v.wr_sel := "11"; write_enable := '0'; - lfs_done := '0'; do_update := r.do_update; v.do_update := '0'; -- load data formatting - byte_offset := unsigned(r.addr(2 downto 0)); - brev_lenm1 := "000"; - if r.byte_reverse = '1' then - brev_lenm1 := unsigned(r.length(2 downto 0)) - 1; - end if; - -- shift and byte-reverse data bytes for i in 0 to 7 loop - kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); - use_second(i) := kk(3); - j := to_integer(kk(2 downto 0)) * 8; + j := to_integer(r.byte_index(i)) * 8; data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j); end loop; @@ -362,62 +350,32 @@ begin -- trim and sign-extend for i in 0 to 7 loop - if i < to_integer(unsigned(r.length)) then - if r.dwords_done = '1' then - trim_ctl(i) := '1' & not use_second(i); - else - trim_ctl(i) := "10"; - end if; - else - trim_ctl(i) := '0' & (negative and r.sign_extend); - end if; - case trim_ctl(i) is + case r.trim_ctl(i) is when "11" => data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8); when "10" => data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8); when "01" => - data_trimmed(i * 8 + 7 downto i * 8) := x"FF"; + data_trimmed(i * 8 + 7 downto i * 8) := (others => negative); when others => data_trimmed(i * 8 + 7 downto i * 8) := x"00"; end case; end loop; if HAS_FPU then - -- Single-precision FP conversion - v.st_sp_data := store_sp_data; + -- Single-precision FP conversion for loads v.ld_sp_data := data_trimmed(31 downto 0); v.ld_sp_nz := or (data_trimmed(22 downto 0)); v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0)); end if; -- Byte reversing and rotating for stores. - -- Done in the first cycle (when l_in.valid = 1) for integer stores - -- and DP float stores, and in the second cycle for SP float stores. - store_data := r.store_data; - if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then - if HAS_FPU and r.state = FPR_CONV then - data_in := x"00000000" & r.st_sp_data; - byte_offset := unsigned(r.addr(2 downto 0)); - byte_rev := r.byte_reverse; - length := r.length; - else - data_in := l_in.data; - byte_offset := unsigned(lsu_sum(2 downto 0)); - byte_rev := l_in.byte_reverse; - length := l_in.length; - end if; - brev_lenm1 := "000"; - if byte_rev = '1' then - brev_lenm1 := unsigned(length(2 downto 0)) - 1; - end if; - for i in 0 to 7 loop - k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1; - j := to_integer(k) * 8; - store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j); - end loop; - end if; - v.store_data := store_data; + -- Done in the second cycle (the cycle after l_in.valid = 1). + for i in 0 to 7 loop + k := (to_unsigned(i, 3) - r.byte_offset) xor r.brev_mask; + j := to_integer(k) * 8; + store_data(i * 8 + 7 downto i * 8) := r.store_data(j + 7 downto j); + end loop; -- compute (addr + 8) & ~7 for the second doubleword when unaligned next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; @@ -449,20 +407,17 @@ begin case r.state is when IDLE => - when FPR_CONV => - req := '1'; - if r.second_bytes /= "00000000" then - v.state := SECOND_REQ; - else - v.state := ACK_WAIT; - end if; - when SECOND_REQ => req := '1'; v.state := ACK_WAIT; v.last_dword := '0'; when ACK_WAIT => + -- r.wr_sel gets set one cycle after we come into ACK_WAIT state, + -- which is OK because the dcache always takes at least two cycles. + if r.update = '1' and (r.load = '0' or (HAS_FPU and r.load_sp = '1')) then + v.wr_sel := "01"; + end if; if d_in.error = '1' then -- dcache will discard the second request if it -- gets an error on the 1st of two requests @@ -493,9 +448,11 @@ begin -- SP to DP conversion takes a cycle -- Write back rA update in this cycle if needed do_update := r.update; + v.wr_sel := "10"; v.state := FINISH_LFS; elsif r.extra_cycle = '1' then -- loads with rA update need an extra cycle + v.wr_sel := "01"; v.state := COMPLETE; v.do_update := r.update; else @@ -533,7 +490,6 @@ begin when TLBIE_WAIT => when FINISH_LFS => - lfs_done := '1'; when COMPLETE => exception := r.align_intr; @@ -573,6 +529,12 @@ begin v.do_update := '0'; v.extra_cycle := '0'; + if HAS_FPU and l_in.is_32bit = '1' then + v.store_data := x"00000000" & store_sp_data; + else + v.store_data := l_in.data; + end if; + addr := lsu_sum; if l_in.second = '1' then -- for the second half of a 16-byte transfer, use next_addr @@ -621,12 +583,7 @@ begin case l_in.op is when OP_STORE => - if HAS_FPU and l_in.is_32bit = '1' then - v.state := FPR_CONV; - fp_reg_conv := '1'; - else - req := '1'; - end if; + req := '1'; when OP_LOAD => req := '1'; v.load := '1'; @@ -647,7 +604,7 @@ begin v.state := TLBIE_WAIT; v.wait_mmu := '1'; when OP_MFSPR => - v.mfspr := '1'; + v.wr_sel := "00"; -- partial decode on SPR number should be adequate given -- the restricted set that get sent down this path if sprn(9) = '0' and sprn(5) = '0' then @@ -696,9 +653,47 @@ begin end if; end if; - v.busy := req or mmureq or mmu_mtspr or fp_reg_conv; + v.busy := req or mmureq or mmu_mtspr; + end if; + + -- Work out controls for store formatting + if l_in.valid = '1' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + byte_rev := l_in.byte_reverse; + length := l_in.length; + brev_lenm1 := "000"; + if byte_rev = '1' then + brev_lenm1 := unsigned(length(2 downto 0)) - 1; + end if; + v.byte_offset := byte_offset; + v.brev_mask := brev_lenm1; + end if; + + -- Work out load formatter controls for next cycle + byte_offset := unsigned(v.addr(2 downto 0)); + brev_lenm1 := "000"; + if v.byte_reverse = '1' then + brev_lenm1 := unsigned(v.length(2 downto 0)) - 1; end if; + for i in 0 to 7 loop + kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); + v.use_second(i) := kk(3); + v.byte_index(i) := kk(2 downto 0); + end loop; + + for i in 0 to 7 loop + if i < to_integer(unsigned(v.length)) then + if v.dwords_done = '1' then + v.trim_ctl(i) := '1' & not v.use_second(i); + else + v.trim_ctl(i) := "10"; + end if; + else + v.trim_ctl(i) := '0' & v.sign_extend; + end if; + end loop; + -- Update outputs to dcache d_out.valid <= req and not v.align_intr; d_out.load <= v.load; @@ -729,23 +724,24 @@ begin -- Multiplex either cache data to the destination GPR or -- the address for the rA update. l_out.valid <= done; - if r.mfspr = '1' then + case r.wr_sel is + when "00" => l_out.write_enable <= '1'; l_out.write_reg <= r.write_reg; l_out.write_data <= r.sprval; - elsif do_update = '1' then - l_out.write_enable <= '1'; + when "01" => + l_out.write_enable <= do_update; l_out.write_reg <= gpr_to_gspr(r.update_reg); l_out.write_data <= r.addr; - elsif lfs_done = '1' then + when "10" => l_out.write_enable <= '1'; l_out.write_reg <= r.write_reg; l_out.write_data <= load_dp_data; - else + when others => l_out.write_enable <= write_enable; l_out.write_reg <= r.write_reg; l_out.write_data <= data_trimmed; - end if; + end case; l_out.xerc <= r.xerc; l_out.rc <= r.rc and done; l_out.store_done <= d_in.store_done; diff --git a/logical.vhdl b/logical.vhdl index d008e47..6b6f202 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -197,8 +197,7 @@ begin tmp := x"00" & dpd_to_bcd(rs(51 downto 42)) & dpd_to_bcd(rs(41 downto 32)) & x"00" & dpd_to_bcd(rs(19 downto 10)) & dpd_to_bcd(rs(9 downto 0)); end if; - when others => - -- EXTS + when OP_EXTS => -- note datalen is a 1-hot encoding negative := (datalen(0) and rs(7)) or (datalen(1) and rs(15)) or @@ -211,6 +210,9 @@ begin tmp(15 downto 8) := rs(15 downto 8); end if; tmp(7 downto 0) := rs(7 downto 0); + when others => + -- e.g. OP_MTSPR + tmp := rs; end case; result <= tmp; diff --git a/microwatt.core b/microwatt.core index 7f2068d..41b6230 100644 --- a/microwatt.core +++ b/microwatt.core @@ -134,6 +134,7 @@ targets: - log_length=2048 - uart_is_16550 - has_fpu + - has_btc tools: vivado: {part : xc7a100tcsg324-1} toplevel : toplevel @@ -218,6 +219,7 @@ targets: - log_length=2048 - uart_is_16550 - has_fpu + - has_btc tools: vivado: {part : xc7a200tsbg484-1} toplevel : toplevel @@ -235,6 +237,7 @@ targets: - log_length=2048 - uart_is_16550 - has_fpu + - has_btc generate: [litedram_nexys_video] tools: vivado: {part : xc7a200tsbg484-1} @@ -254,6 +257,7 @@ targets: - uart_is_16550 - has_uart1 - has_fpu=false + - has_btc=false tools: vivado: {part : xc7a35ticsg324-1L} toplevel : toplevel @@ -273,6 +277,7 @@ targets: - uart_is_16550 - has_uart1 - has_fpu=false + - has_btc=false generate: [litedram_arty, liteeth_arty] tools: vivado: {part : xc7a35ticsg324-1L} @@ -292,6 +297,7 @@ targets: - uart_is_16550 - has_uart1 - has_fpu + - has_btc tools: vivado: {part : xc7a100ticsg324-1L} toplevel : toplevel @@ -311,6 +317,7 @@ targets: - uart_is_16550 - has_uart1 - has_fpu + - has_btc generate: [litedram_arty, liteeth_arty] tools: vivado: {part : xc7a100ticsg324-1L} @@ -329,6 +336,7 @@ targets: - log_length=512 - uart_is_16550 - has_fpu=false + - has_btc=false tools: vivado: {part : xc7a35tcpg236-1} toplevel : toplevel @@ -395,6 +403,12 @@ parameters: paramtype : generic default : true + has_btc: + datatype : bool + description : Include a branch target cache in the core + paramtype : generic + default : true + disable_flatten_core: datatype : bool description : Prevent Vivado from flattening the main core components diff --git a/soc.vhdl b/soc.vhdl index 9feb1e4..88249bf 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -53,6 +53,7 @@ entity soc is CLK_FREQ : positive; SIM : boolean; HAS_FPU : boolean := true; + HAS_BTC : boolean := true; DISABLE_FLATTEN_CORE : boolean := false; HAS_DRAM : boolean := false; DRAM_SIZE : integer := 0; @@ -255,6 +256,7 @@ begin generic map( SIM => SIM, HAS_FPU => HAS_FPU, + HAS_BTC => HAS_BTC, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE, ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'), LOG_LENGTH => LOG_LENGTH