From c2da82764f746745ab00d75bba9dd66b4c40c98d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Jun 2020 17:45:55 +1000 Subject: [PATCH 1/4] core: Implement CFAR register This implements the CFAR SPR as a slow SPR stored in 'ctrl'. Taken branches and rfid update it to the address of the branch or rfid instruction. To simplify the logic, this makes rfid use the branch logic to generate its redirect (requiring SRR0 to come in to execute1 on the B input and SRR1 on the A input), and the masking of the bottom 2 bits of NIA is moved to fetch1. Signed-off-by: Paul Mackerras --- common.vhdl | 2 ++ decode1.vhdl | 4 ++-- execute1.vhdl | 34 ++++++++++++++++++++-------------- fetch1.vhdl | 4 ++-- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/common.vhdl b/common.vhdl index d376ac3..15c5c2a 100644 --- a/common.vhdl +++ b/common.vhdl @@ -31,6 +31,7 @@ package common is constant SPR_DEC : spr_num_t := 22; constant SPR_SRR0 : spr_num_t := 26; constant SPR_SRR1 : spr_num_t := 27; + constant SPR_CFAR : spr_num_t := 28; constant SPR_HSRR0 : spr_num_t := 314; constant SPR_HSRR1 : spr_num_t := 315; constant SPR_SPRG0 : spr_num_t := 272; @@ -94,6 +95,7 @@ package common is tb: std_ulogic_vector(63 downto 0); dec: std_ulogic_vector(63 downto 0); msr: std_ulogic_vector(63 downto 0); + cfar: std_ulogic_vector(63 downto 0); irq_state : irq_state_t; irq_nia: std_ulogic_vector(63 downto 0); srr1: std_ulogic_vector(63 downto 0); diff --git a/decode1.vhdl b/decode1.vhdl index 2060e64..d215e7e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -473,8 +473,8 @@ begin end if; else -- Could be OP_RFID - v.ispr1 := fast_spr_num(SPR_SRR0); - v.ispr2 := fast_spr_num(SPR_SRR1); + v.ispr1 := fast_spr_num(SPR_SRR1); + v.ispr2 := fast_spr_num(SPR_SRR0); end if; elsif majorop = "011110" then diff --git a/execute1.vhdl b/execute1.vhdl index c585f78..0da059a 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -652,26 +652,27 @@ begin result_en := '1'; v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, a_in) = '1' then - f_out.redirect <= '1'; - f_out.redirect_nia <= b_in(63 downto 2) & "00"; - end if; + is_branch := '1'; + taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in); + abs_branch := '1'; when OP_RFID => - f_out.redirect <= '1'; - f_out.virt_mode <= b_in(MSR_IR) or b_in(MSR_PR); - f_out.priv_mode <= not b_in(MSR_PR); - f_out.redirect_nia <= a_in(63 downto 2) & "00"; -- srr0 + f_out.virt_mode <= a_in(MSR_IR) or a_in(MSR_PR); + f_out.priv_mode <= not a_in(MSR_PR); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. - ctrl_tmp.msr(63 downto 31) <= b_in(63 downto 31); - ctrl_tmp.msr(26 downto 22) <= b_in(26 downto 22); - ctrl_tmp.msr(15 downto 0) <= b_in(15 downto 0); - if b_in(MSR_PR) = '1' then + ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31); + ctrl_tmp.msr(26 downto 22) <= a_in(26 downto 22); + ctrl_tmp.msr(15 downto 0) <= a_in(15 downto 0); + if a_in(MSR_PR) = '1' then ctrl_tmp.msr(MSR_EE) <= '1'; ctrl_tmp.msr(MSR_IR) <= '1'; ctrl_tmp.msr(MSR_DR) <= '1'; end if; + -- mark this as a branch so CFAR gets updated + is_branch := '1'; + taken_branch := '1'; + abs_branch := '1'; when OP_CNTZ => v.e.valid := '0'; @@ -757,6 +758,8 @@ begin spr_val(31 downto 0) := ctrl.tb(63 downto 32); when SPR_DEC => spr_val := ctrl.dec; + when SPR_CFAR => + spr_val := ctrl.cfar; when 724 => -- LOG_ADDR SPR spr_val := log_wr_addr & r.log_addr_spr; when 725 => -- LOG_DATA SPR @@ -879,9 +882,9 @@ begin v.e.rc := e_in.rc and valid_in; -- Mispredicted branches cause a redirect - if is_branch = '1' and taken_branch /= e_in.br_pred then - f_out.redirect <= '1'; + if is_branch = '1' then if taken_branch = '1' then + ctrl_tmp.cfar <= e_in.nia; if abs_branch = '1' then f_out.redirect_nia <= b_in; else @@ -890,6 +893,9 @@ begin else f_out.redirect_nia <= next_nia; end if; + if taken_branch /= e_in.br_pred then + f_out.redirect <= '1'; + end if; end if; -- Update LR on the next cycle after a branch link diff --git a/fetch1.vhdl b/fetch1.vhdl index 0d9c6f7..a56f33d 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -83,11 +83,11 @@ begin v.priv_mode := '1'; v_int.stop_state := RUNNING; elsif e_in.redirect = '1' then - v.nia := e_in.redirect_nia; + v.nia := e_in.redirect_nia(63 downto 2) & "00"; v.virt_mode := e_in.virt_mode; v.priv_mode := e_in.priv_mode; elsif d_in.redirect = '1' then - v.nia := d_in.redirect_nia; + v.nia := d_in.redirect_nia(63 downto 2) & "00"; elsif stall_in = '0' then -- For debug stop/step to work properly we need a little bit of From 9b40b5a77b2ecd2d6a6317e624fc7b4aff7bb7c5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 19 Jun 2020 17:13:06 +1000 Subject: [PATCH 2/4] logical: Only do output inversion for OP_AND, OP_OR and OP_XOR It's not needed for the other ops (popcnt, parity, etc.) and the logical unit shows up as a critical path from time to time. Signed-off-by: Paul Mackerras --- logical.vhdl | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/logical.vhdl b/logical.vhdl index 5e6abfa..0f53544 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -87,12 +87,19 @@ begin end if; case op is - when OP_AND => - tmp := rs and rb_adj; - when OP_OR => - tmp := rs or rb_adj; - when OP_XOR => - tmp := rs xor rb_adj; + when OP_AND | OP_OR | OP_XOR => + case op is + when OP_AND => + tmp := rs and rb_adj; + when OP_OR => + tmp := rs or rb_adj; + when others => + tmp := rs xor rb_adj; + end case; + if invert_out = '1' then + tmp := not tmp; + end if; + when OP_POPCNT => tmp := popcnt; when OP_PRTY => @@ -115,9 +122,6 @@ begin tmp(7 downto 0) := rs(7 downto 0); end case; - if invert_out = '1' then - tmp := not tmp; - end if; result <= tmp; end process; From 0f0573903b16618b1c61ea1029b4ab3f1006ec9c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 19 Jun 2020 18:00:37 +1000 Subject: [PATCH 3/4] execute1: Add latch to redirect path This latches the redirect signal inside execute1, so that it is sent a cycle later to fetch1 (and to decode/icache as flush). This breaks a long combinatorial chain from the branch and interrupt detection in execute1 through the redirect/flush signals all the way back to fetch1, icache and decode. Signed-off-by: Paul Mackerras --- common.vhdl | 5 ++-- execute1.vhdl | 79 +++++++++++++++++++++++++++------------------------ 2 files changed, 44 insertions(+), 40 deletions(-) diff --git a/common.vhdl b/common.vhdl index 15c5c2a..16d38c5 100644 --- a/common.vhdl +++ b/common.vhdl @@ -97,7 +97,6 @@ package common is msr: std_ulogic_vector(63 downto 0); cfar: std_ulogic_vector(63 downto 0); irq_state : irq_state_t; - irq_nia: std_ulogic_vector(63 downto 0); srr1: std_ulogic_vector(63 downto 0); end record; @@ -234,8 +233,8 @@ package common is priv_mode: std_ulogic; redirect_nia: std_ulogic_vector(63 downto 0); end record; - constant Execute1ToFetch1TypeInit : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0', - priv_mode => '0', others => (others => '0')); + constant Execute1ToFetch1Init : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0', + priv_mode => '0', others => (others => '0')); type Execute1ToLoadstore1Type is record valid : std_ulogic; diff --git a/execute1.vhdl b/execute1.vhdl index 0da059a..c68857e 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -48,6 +48,7 @@ end entity execute1; architecture behaviour of execute1 is type reg_type is record e : Execute1ToWritebackType; + f : Execute1ToFetch1Type; busy: std_ulogic; terminate: std_ulogic; lr_update : std_ulogic; @@ -64,7 +65,8 @@ architecture behaviour of execute1 is log_addr_spr : std_ulogic_vector(31 downto 0); end record; constant reg_type_init : reg_type := - (e => Execute1ToWritebackInit, busy => '0', lr_update => '0', terminate => '0', + (e => Execute1ToWritebackInit, f => Execute1ToFetch1Init, + busy => '0', lr_update => '0', terminate => '0', mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0', slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0')); @@ -316,6 +318,7 @@ begin v := r; v.e := Execute1ToWritebackInit; lv := Execute1ToLoadstore1Init; + v.f.redirect := '0'; -- XER forwarding. To avoid having to track XER hazards, we -- use the previously latched value. @@ -423,11 +426,11 @@ begin irq_valid := '0'; if ctrl.msr(MSR_EE) = '1' then if ctrl.dec(63) = '1' then - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#900#, 64)); + v.f.redirect_nia := std_logic_vector(to_unsigned(16#900#, 64)); report "IRQ valid: DEC"; irq_valid := '1'; elsif ext_irq_in = '1' then - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#500#, 64)); + v.f.redirect_nia := std_logic_vector(to_unsigned(16#500#, 64)); report "IRQ valid: External"; irq_valid := '1'; end if; @@ -436,10 +439,9 @@ begin v.terminate := '0'; icache_inval <= '0'; v.busy := '0'; - f_out <= Execute1ToFetch1TypeInit; -- send MSR[IR] and ~MSR[PR] up to fetch1 - f_out.virt_mode <= ctrl.msr(MSR_IR); - f_out.priv_mode <= not ctrl.msr(MSR_PR); + v.f.virt_mode := ctrl.msr(MSR_IR); + v.f.priv_mode := not ctrl.msr(MSR_PR); -- Next insn adder used in a couple of places next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4); @@ -450,6 +452,7 @@ begin rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; + ctrl_tmp.srr1 <= msr_copy(ctrl.msr); ctrl_tmp.irq_state <= WRITE_SRR0; exception := '0'; illegal := '0'; @@ -472,10 +475,6 @@ begin ctrl_tmp.msr(MSR_DR) <= '0'; ctrl_tmp.msr(MSR_RI) <= '0'; ctrl_tmp.msr(MSR_LE) <= '1'; - f_out.redirect <= '1'; - f_out.virt_mode <= '0'; - f_out.priv_mode <= '1'; - f_out.redirect_nia <= ctrl.irq_nia; v.e.valid := '1'; report "Writing SRR1: " & to_hstring(ctrl.srr1); @@ -485,14 +484,12 @@ begin -- Don't deliver the interrupt until we have a valid instruction -- coming in, so we have a valid NIA to put in SRR0. exception := '1'; - ctrl_tmp.srr1 <= msr_copy(ctrl.msr); elsif valid_in = '1' and ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then -- generate a program interrupt exception := '1'; - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#700#, 64)); - ctrl_tmp.srr1 <= msr_copy(ctrl.msr); + v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); -- set bit 45 to indicate privileged instruction type interrupt ctrl_tmp.srr1(63 - 45) <= '1'; report "privileged instruction"; @@ -522,8 +519,7 @@ begin if e_in.insn(1) = '1' then exception := '1'; exception_nextpc := '1'; - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#C00#, 64)); - ctrl_tmp.srr1 <= msr_copy(ctrl.msr); + v.f.redirect_nia := std_logic_vector(to_unsigned(16#C00#, 64)); report "sc"; else illegal := '1'; @@ -615,8 +611,7 @@ begin if or (trapval and insn_to(e_in.insn)) = '1' then -- generate trap-type program interrupt exception := '1'; - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#700#, 64)); - ctrl_tmp.srr1 <= msr_copy(ctrl.msr); + v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); -- set bit 46 to say trap occurred ctrl_tmp.srr1(63 - 46) <= '1'; report "trap"; @@ -657,8 +652,8 @@ begin abs_branch := '1'; when OP_RFID => - f_out.virt_mode <= a_in(MSR_IR) or a_in(MSR_PR); - f_out.priv_mode <= not a_in(MSR_PR); + v.f.virt_mode := a_in(MSR_IR) or a_in(MSR_PR); + v.f.priv_mode := not a_in(MSR_PR); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31); @@ -856,8 +851,8 @@ begin result_en := '1'; when OP_ISYNC => - f_out.redirect <= '1'; - f_out.redirect_nia <= next_nia; + v.f.redirect := '1'; + v.f.redirect_nia := next_nia; when OP_ICBI => icache_inval <= '1'; @@ -885,16 +880,18 @@ begin if is_branch = '1' then if taken_branch = '1' then ctrl_tmp.cfar <= e_in.nia; + end if; + if e_in.br_pred = '0' then if abs_branch = '1' then - f_out.redirect_nia <= b_in; + v.f.redirect_nia := b_in; else - f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in)); + v.f.redirect_nia := std_ulogic_vector(signed(e_in.nia) + signed(b_in)); end if; else - f_out.redirect_nia <= next_nia; + v.f.redirect_nia := next_nia; end if; if taken_branch /= e_in.br_pred then - f_out.redirect <= '1'; + v.f.redirect := '1'; end if; end if; @@ -923,6 +920,8 @@ begin lv.valid := '1'; end if; + elsif r.f.redirect = '1' then + v.e.valid := '1'; elsif r.lr_update = '1' then v.e.exc_write_enable := '1'; v.e.exc_write_data := r.next_lr; @@ -979,8 +978,7 @@ begin if illegal = '1' then exception := '1'; - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#700#, 64)); - ctrl_tmp.srr1 <= msr_copy(ctrl.msr); + v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); -- Since we aren't doing Hypervisor emulation assist (0xe40) we -- set bit 44 to indicate we have an illegal ctrl_tmp.srr1(63 - 44) <= '1'; @@ -991,23 +989,19 @@ begin if exception_nextpc = '1' then v.e.exc_write_data := next_nia; end if; - ctrl_tmp.irq_state <= WRITE_SRR1; - v.busy := '1'; - v.e.valid := '0'; end if; v.e.write_data := result; - v.e.write_enable := result_en; + v.e.write_enable := result_en and not exception; -- generate DSI or DSegI for load/store exceptions -- or ISI or ISegI for instruction fetch exceptions if l_in.exception = '1' then - ctrl_tmp.srr1 <= msr_copy(ctrl.msr); if l_in.instr_fault = '0' then if l_in.segment_fault = '0' then - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#300#, 64)); + v.f.redirect_nia := std_logic_vector(to_unsigned(16#300#, 64)); else - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#380#, 64)); + v.f.redirect_nia := std_logic_vector(to_unsigned(16#380#, 64)); end if; else if l_in.segment_fault = '0' then @@ -1015,16 +1009,27 @@ begin ctrl_tmp.srr1(63 - 35) <= l_in.perm_error; -- noexec fault ctrl_tmp.srr1(63 - 44) <= l_in.badtree; ctrl_tmp.srr1(63 - 45) <= l_in.rc_error; - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#400#, 64)); + v.f.redirect_nia := std_logic_vector(to_unsigned(16#400#, 64)); else - ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#480#, 64)); + v.f.redirect_nia := std_logic_vector(to_unsigned(16#480#, 64)); end if; end if; v.e.exc_write_enable := '1'; v.e.exc_write_reg := fast_spr_num(SPR_SRR0); v.e.exc_write_data := r.last_nia; report "ldst exception writing srr0=" & to_hstring(r.last_nia); + end if; + + if exception = '1' or l_in.exception = '1' then ctrl_tmp.irq_state <= WRITE_SRR1; + v.f.redirect := '1'; + v.f.virt_mode := '0'; + v.f.priv_mode := '1'; + end if; + + if v.f.redirect = '1' then + v.busy := '1'; + v.e.valid := '0'; end if; -- Outputs to loadstore1 (async) @@ -1055,7 +1060,7 @@ begin rin <= v; -- update outputs - --f_out <= r.f; + f_out <= r.f; l_out <= lv; e_out <= r.e; flush_out <= f_out.redirect; From 74062195ca9cb74119c81e5978315ac149fe515d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 19 Jun 2020 20:00:16 +1000 Subject: [PATCH 4/4] execute1: Do forwarding of the CR result to the next instruction This adds a path to allow the CR result of one instruction to be forwarded to the next instruction, so that sequences such as cmp; bc can avoid having a 1-cycle bubble. Forwarding is not available for dot-form (Rc=1) instructions, since the CR result for them is calculated in writeback. The decode.output_cr field is used to identify those instructions that compute the CR result in execute1. For some reason, the multiply instructions incorrectly had output_cr = 1 in the decode tables. This fixes that. Signed-off-by: Paul Mackerras --- common.vhdl | 3 ++- control.vhdl | 8 ++++++-- cr_hazard.vhdl | 25 +++++++++++++++++++++---- decode1.vhdl | 26 +++++++++++++------------- decode2.vhdl | 14 ++++++++++++-- execute1.vhdl | 27 +++++++++++++++++++-------- 6 files changed, 73 insertions(+), 30 deletions(-) diff --git a/common.vhdl b/common.vhdl index 16d38c5..18378d5 100644 --- a/common.vhdl +++ b/common.vhdl @@ -151,6 +151,7 @@ package common is bypass_data2: std_ulogic; bypass_data3: std_ulogic; cr: std_ulogic_vector(31 downto 0); + bypass_cr : std_ulogic; xerc: xer_common_t; lr: std_ulogic; rc: std_ulogic; @@ -173,7 +174,7 @@ package common is end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', - lr => '0', rc => '0', oe => '0', invert_a => '0', + bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0')); diff --git a/control.vhdl b/control.vhdl index 5e557c4..d04576a 100644 --- a/control.vhdl +++ b/control.vhdl @@ -38,6 +38,7 @@ entity control is cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; + cr_bypassable : in std_ulogic; valid_out : out std_ulogic; stall_out : out std_ulogic; @@ -45,7 +46,8 @@ entity control is gpr_bypass_a : out std_ulogic; gpr_bypass_b : out std_ulogic; - gpr_bypass_c : out std_ulogic + gpr_bypass_c : out std_ulogic; + cr_bypass : out std_ulogic ); end entity control; @@ -161,8 +163,10 @@ begin cr_read_in => cr_read_in, cr_write_in => cr_write_valid, + bypassable => cr_bypassable, - stall_out => cr_stall_out + stall_out => cr_stall_out, + use_bypass => cr_bypass ); control0: process(clk) diff --git a/cr_hazard.vhdl b/cr_hazard.vhdl index 4b79020..a6203a8 100644 --- a/cr_hazard.vhdl +++ b/cr_hazard.vhdl @@ -16,15 +16,18 @@ entity cr_hazard is cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; + bypassable : in std_ulogic; - stall_out : out std_ulogic + stall_out : out std_ulogic; + use_bypass : out std_ulogic ); end entity cr_hazard; architecture behaviour of cr_hazard is type pipeline_entry_type is record - valid : std_ulogic; + valid : std_ulogic; + bypass : std_ulogic; end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0'); + constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0'); type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); @@ -47,7 +50,20 @@ begin if complete_in = '1' then v(1).valid := '0'; end if; - stall_out <= cr_read_in and (v(0).valid or v(1).valid); + + use_bypass <= '0'; + stall_out <= '0'; + if cr_read_in = '1' then + loop_0: for i in 0 to PIPELINE_DEPTH loop + if v(i).valid = '1' then + if r(i).bypass = '1' then + use_bypass <= '1'; + else + stall_out <= '1'; + end if; + end if; + end loop; + end if; -- XXX assumes PIPELINE_DEPTH = 1 if busy_in = '0' then @@ -56,6 +72,7 @@ begin end if; if deferred = '0' and issuing = '1' then v(0).valid := cr_write_in; + v(0).bypass := bypassable; end if; if flush_in = '1' then v(0).valid := '0'; diff --git a/decode1.vhdl b/decode1.vhdl index d215e7e..29b7a05 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -60,7 +60,7 @@ architecture behaviour of decode1 is 41 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzu 32 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwz 33 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzu - 7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli + 7 => (ALU, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli 24 => (ALU, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori 25 => (ALU, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris 20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi @@ -262,19 +262,19 @@ architecture behaviour of decode1 is 2#0010010000# => (ALU, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf 2#0010110010# => (ALU, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mtmsrd # ignore top bits and d 2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr - 2#0001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd - 2#0000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu - 2#0001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw - 2#0000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu + 2#0001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd + 2#0000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu + 2#0001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw + 2#0000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu -- next 4 have reserved bit set - 2#1001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd - 2#1000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu - 2#1001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw - 2#1000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu - 2#0011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulld - 2#1011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulldo - 2#0011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullw - 2#1011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullwo + 2#1001001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulhd + 2#1000001001# => (ALU, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- mulhdu + 2#1001001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mulhw + 2#1000001011# => (ALU, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- mulhwu + 2#0011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulld + 2#1011101001# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0'), -- mulldo + 2#0011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullw + 2#1011101011# => (ALU, OP_MUL_L64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- mullwo 2#0111011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nand 2#0001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- neg 2#1001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nego diff --git a/decode2.vhdl b/decode2.vhdl index 80687a0..d724874 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -213,7 +213,10 @@ architecture behaviour of decode2 is signal gpr_c_read : gpr_index_t; signal gpr_c_bypass : std_ulogic; - signal cr_write_valid : std_ulogic; + signal cr_write_valid : std_ulogic; + signal cr_bypass : std_ulogic; + signal cr_bypass_avail : std_ulogic; + begin control_0: entity work.control generic map ( @@ -248,7 +251,9 @@ begin gpr_c_read_in => gpr_c_read, cr_read_in => d_in.decode.input_cr, - cr_write_in => cr_write_valid, + cr_write_in => cr_write_valid, + cr_bypass => cr_bypass, + cr_bypassable => cr_bypass_avail, valid_out => control_valid_out, stall_out => stall_out, @@ -342,6 +347,7 @@ begin v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); end if; v.e.cr := c_in.read_cr_data; + v.e.bypass_cr := cr_bypass; v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.invert_out := d_in.decode.invert_out; @@ -388,6 +394,10 @@ begin gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg); cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); + cr_bypass_avail <= '0'; + if EX1_BYPASS then + cr_bypass_avail <= d_in.decode.output_cr; + end if; v.e.valid := control_valid_out; if d_in.decode.unit = NONE then diff --git a/execute1.vhdl b/execute1.vhdl index c68857e..a1cd008 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -74,6 +74,7 @@ architecture behaviour of execute1 is signal r, rin : reg_type; signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); + signal cr_in : std_ulogic_vector(31 downto 0); signal valid_in : std_ulogic; signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); @@ -355,6 +356,16 @@ begin v.e.xerc := e_in.xerc; end if; + -- CR forwarding + cr_in <= e_in.cr; + if EX1_BYPASS and e_in.bypass_cr = '1' and r.e.write_cr_enable = '1' then + for i in 0 to 7 loop + if r.e.write_cr_mask(i) = '1' then + cr_in(i * 4 + 3 downto i * 4) <= r.e.write_cr_data(i * 4 + 3 downto i * 4); + end if; + end loop; + end if; + v.lr_update := '0'; v.mul_in_progress := '0'; v.div_in_progress := '0'; @@ -635,7 +646,7 @@ begin v.e.write_reg := fast_spr_num(SPR_CTR); end if; is_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in); + taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); abs_branch := insn_aa(e_in.insn); when OP_BCREG => -- read_data1 is CTR @@ -648,7 +659,7 @@ begin v.e.write_reg := fast_spr_num(SPR_CTR); end if; is_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in); + taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); abs_branch := '1'; when OP_RFID => @@ -675,7 +686,7 @@ begin v.busy := '1'; when OP_ISEL => crbit := to_integer(unsigned(insn_bc(e_in.insn))); - if e_in.cr(31-crbit) = '1' then + if cr_in(31-crbit) = '1' then result := a_in; else result := b_in; @@ -695,7 +706,7 @@ begin lo := (7-i)*4; hi := lo + 3; if i = scrnum then - newcrf := e_in.cr(hi downto lo); + newcrf := cr_in(hi downto lo); end if; end loop; for i in 0 to 7 loop @@ -713,14 +724,14 @@ begin bbnum := 31 - to_integer(unsigned(bb)); -- Bits 5-8 of cr_op give the truth table of the requested -- logical operation - cr_operands := e_in.cr(banum) & e_in.cr(bbnum); + cr_operands := cr_in(banum) & cr_in(bbnum); crresult := cr_op(5 + to_integer(unsigned(cr_operands))); v.e.write_cr_mask := num_to_fxm((31-btnum) / 4); for i in 0 to 31 loop if i = btnum then v.e.write_cr_data(i) := crresult; else - v.e.write_cr_data(i) := e_in.cr(i); + v.e.write_cr_data(i) := cr_in(i); end if; end loop; end if; @@ -772,7 +783,7 @@ begin when OP_MFCR => if e_in.insn(20) = '0' then -- mfcr - result := x"00000000" & e_in.cr; + result := x"00000000" & cr_in; else -- mfocrf crnum := fxm_to_num(insn_fxm(e_in.insn)); @@ -781,7 +792,7 @@ begin lo := (7-i)*4; hi := lo + 3; if crnum = i then - result(hi downto lo) := e_in.cr(hi downto lo); + result(hi downto lo) := cr_in(hi downto lo); end if; end loop; end if;