From 54f89afab7bc2b58dc48759a68cc8c56954a6b6d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 21 Sep 2020 11:41:46 +1000 Subject: [PATCH 1/9] loadstore1: Decide on load formatting controls a cycle earlier This helps timing. Signed-off-by: Paul Mackerras --- loadstore1.vhdl | 61 ++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 33c8694..f1b98dc 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -54,6 +54,10 @@ architecture behave of loadstore1 is COMPLETE -- extra cycle to complete an operation ); + type byte_index_t is array(0 to 7) of unsigned(2 downto 0); + subtype byte_trim_t is std_ulogic_vector(1 downto 0); + type trim_ctl_t is array(0 to 7) of byte_trim_t; + type reg_stage_t is record -- latch most of the input request load : std_ulogic; @@ -93,6 +97,9 @@ architecture behave of loadstore1 is do_update : std_ulogic; extra_cycle : std_ulogic; mode_32bit : std_ulogic; + byte_index : byte_index_t; + use_second : std_ulogic_vector(7 downto 0); + trim_ctl : trim_ctl_t; load_sp : std_ulogic; ld_sp_data : std_ulogic_vector(31 downto 0); ld_sp_nz : std_ulogic; @@ -100,10 +107,6 @@ architecture behave of loadstore1 is st_sp_data : std_ulogic_vector(31 downto 0); end record; - type byte_sel_t is array(0 to 7) of std_ulogic; - subtype byte_trim_t is std_ulogic_vector(1 downto 0); - type trim_ctl_t is array(0 to 7) of byte_trim_t; - signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); @@ -299,8 +302,6 @@ begin variable data_in : std_ulogic_vector(63 downto 0); variable byte_rev : std_ulogic; variable length : std_ulogic_vector(3 downto 0); - variable use_second : byte_sel_t; - variable trim_ctl : trim_ctl_t; variable negative : std_ulogic; variable sprn : std_ulogic_vector(9 downto 0); variable exception : std_ulogic; @@ -330,17 +331,9 @@ begin v.do_update := '0'; -- load data formatting - byte_offset := unsigned(r.addr(2 downto 0)); 
- brev_lenm1 := "000"; - if r.byte_reverse = '1' then - brev_lenm1 := unsigned(r.length(2 downto 0)) - 1; - end if; - -- shift and byte-reverse data bytes for i in 0 to 7 loop - kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); - use_second(i) := kk(3); - j := to_integer(kk(2 downto 0)) * 8; + j := to_integer(r.byte_index(i)) * 8; data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j); end loop; @@ -362,22 +355,13 @@ begin -- trim and sign-extend for i in 0 to 7 loop - if i < to_integer(unsigned(r.length)) then - if r.dwords_done = '1' then - trim_ctl(i) := '1' & not use_second(i); - else - trim_ctl(i) := "10"; - end if; - else - trim_ctl(i) := '0' & (negative and r.sign_extend); - end if; - case trim_ctl(i) is + case r.trim_ctl(i) is when "11" => data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8); when "10" => data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8); when "01" => - data_trimmed(i * 8 + 7 downto i * 8) := x"FF"; + data_trimmed(i * 8 + 7 downto i * 8) := (others => negative); when others => data_trimmed(i * 8 + 7 downto i * 8) := x"00"; end case; @@ -699,6 +683,31 @@ begin v.busy := req or mmureq or mmu_mtspr or fp_reg_conv; end if; + -- Work out load formatter controls for next cycle + byte_offset := unsigned(v.addr(2 downto 0)); + brev_lenm1 := "000"; + if v.byte_reverse = '1' then + brev_lenm1 := unsigned(v.length(2 downto 0)) - 1; + end if; + + for i in 0 to 7 loop + kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); + v.use_second(i) := kk(3); + v.byte_index(i) := kk(2 downto 0); + end loop; + + for i in 0 to 7 loop + if i < to_integer(unsigned(v.length)) then + if v.dwords_done = '1' then + v.trim_ctl(i) := '1' & not v.use_second(i); + else + v.trim_ctl(i) := "10"; + end if; + else + v.trim_ctl(i) := '0' & v.sign_extend; + end if; + end loop; + -- Update outputs to dcache d_out.valid <= req and not v.align_intr; d_out.load <= v.load; From 
d1f35705c07d4468b3943467683ca2501731e41c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 28 Sep 2020 14:02:03 +1000 Subject: [PATCH 2/9] loadstore1: Improve timing of data path from cache RAM to writeback Work out select inputs for writeback mux a cycle earlier. Signed-off-by: Paul Mackerras --- loadstore1.vhdl | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/loadstore1.vhdl b/loadstore1.vhdl index f1b98dc..e83d642 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -63,7 +63,6 @@ architecture behave of loadstore1 is load : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; - mfspr : std_ulogic; addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); load_data : std_ulogic_vector(63 downto 0); @@ -105,6 +104,7 @@ architecture behave of loadstore1 is ld_sp_nz : std_ulogic; ld_sp_lz : std_ulogic_vector(5 downto 0); st_sp_data : std_ulogic_vector(31 downto 0); + wr_sel : std_ulogic_vector(1 downto 0); end record; signal r, rin : reg_stage_t; @@ -312,20 +312,18 @@ begin variable itlb_fault : std_ulogic; variable misaligned : std_ulogic; variable fp_reg_conv : std_ulogic; - variable lfs_done : std_ulogic; begin v := r; req := '0'; - v.mfspr := '0'; mmu_mtspr := '0'; itlb_fault := '0'; sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); dsisr := (others => '0'); mmureq := '0'; fp_reg_conv := '0'; + v.wr_sel := "11"; write_enable := '0'; - lfs_done := '0'; do_update := r.do_update; v.do_update := '0'; @@ -447,6 +445,11 @@ begin v.last_dword := '0'; when ACK_WAIT => + -- r.wr_sel gets set one cycle after we come into ACK_WAIT state, + -- which is OK because the dcache always takes at least two cycles. 
+ if r.update = '1' and (r.load = '0' or (HAS_FPU and r.load_sp = '1')) then + v.wr_sel := "01"; + end if; if d_in.error = '1' then -- dcache will discard the second request if it -- gets an error on the 1st of two requests @@ -477,9 +480,11 @@ begin -- SP to DP conversion takes a cycle -- Write back rA update in this cycle if needed do_update := r.update; + v.wr_sel := "10"; v.state := FINISH_LFS; elsif r.extra_cycle = '1' then -- loads with rA update need an extra cycle + v.wr_sel := "01"; v.state := COMPLETE; v.do_update := r.update; else @@ -517,7 +522,6 @@ begin when TLBIE_WAIT => when FINISH_LFS => - lfs_done := '1'; when COMPLETE => exception := r.align_intr; @@ -631,7 +635,7 @@ begin v.state := TLBIE_WAIT; v.wait_mmu := '1'; when OP_MFSPR => - v.mfspr := '1'; + v.wr_sel := "00"; -- partial decode on SPR number should be adequate given -- the restricted set that get sent down this path if sprn(9) = '0' and sprn(5) = '0' then @@ -738,23 +742,24 @@ begin -- Multiplex either cache data to the destination GPR or -- the address for the rA update. 
l_out.valid <= done; - if r.mfspr = '1' then + case r.wr_sel is + when "00" => l_out.write_enable <= '1'; l_out.write_reg <= r.write_reg; l_out.write_data <= r.sprval; - elsif do_update = '1' then - l_out.write_enable <= '1'; + when "01" => + l_out.write_enable <= do_update; l_out.write_reg <= gpr_to_gspr(r.update_reg); l_out.write_data <= r.addr; - elsif lfs_done = '1' then + when "10" => l_out.write_enable <= '1'; l_out.write_reg <= r.write_reg; l_out.write_data <= load_dp_data; - else + when others => l_out.write_enable <= write_enable; l_out.write_reg <= r.write_reg; l_out.write_data <= data_trimmed; - end if; + end case; l_out.xerc <= r.xerc; l_out.rc <= r.rc and done; l_out.store_done <= d_in.store_done; From 6427cab46fe7f37074505e18a1957414023c2708 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 31 Oct 2020 13:48:58 +1100 Subject: [PATCH 3/9] loadstore1/dcache: Send store data one cycle later This makes timing easier and also means that store floating-point single precision instructions no longer need to take an extra cycle. 
Signed-off-by: Paul Mackerras --- common.vhdl | 2 +- dcache.vhdl | 2 +- loadstore1.vhdl | 78 +++++++++++++++++++------------------------------ 3 files changed, 32 insertions(+), 50 deletions(-) diff --git a/common.vhdl b/common.vhdl index bfc0db2..8b9380c 100644 --- a/common.vhdl +++ b/common.vhdl @@ -365,7 +365,7 @@ package common is virt_mode : std_ulogic; priv_mode : std_ulogic; addr : std_ulogic_vector(63 downto 0); - data : std_ulogic_vector(63 downto 0); + data : std_ulogic_vector(63 downto 0); -- valid the cycle after .valid = 1 byte_sel : std_ulogic_vector(7 downto 0); end record; diff --git a/dcache.vhdl b/dcache.vhdl index 1e58e1f..7da67e1 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -1306,7 +1306,7 @@ begin req.real_addr := ra; -- Force data to 0 for dcbz if r0.req.dcbz = '0' then - req.data := r0.req.data; + req.data := d_in.data; else req.data := (others => '0'); end if; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index e83d642..b83eed6 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -45,7 +45,6 @@ architecture behave of loadstore1 is -- State machine for unaligned loads/stores type state_t is (IDLE, -- ready for instruction - FPR_CONV, -- converting double to float for store SECOND_REQ, -- send 2nd request of unaligned xfer ACK_WAIT, -- waiting for ack from dcache MMU_LOOKUP, -- waiting for MMU to look up translation @@ -69,6 +68,8 @@ architecture behave of loadstore1 is write_reg : gspr_index_t; length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; + byte_offset : unsigned(2 downto 0); + brev_mask : unsigned(2 downto 0); sign_extend : std_ulogic; update : std_ulogic; update_reg : gpr_index_t; @@ -103,7 +104,6 @@ architecture behave of loadstore1 is ld_sp_data : std_ulogic_vector(31 downto 0); ld_sp_nz : std_ulogic; ld_sp_lz : std_ulogic_vector(5 downto 0); - st_sp_data : std_ulogic_vector(31 downto 0); wr_sel : std_ulogic_vector(1 downto 0); end record; @@ -299,7 +299,6 @@ begin variable data_permuted : std_ulogic_vector(63 
downto 0); variable data_trimmed : std_ulogic_vector(63 downto 0); variable store_data : std_ulogic_vector(63 downto 0); - variable data_in : std_ulogic_vector(63 downto 0); variable byte_rev : std_ulogic; variable length : std_ulogic_vector(3 downto 0); variable negative : std_ulogic; @@ -311,7 +310,6 @@ begin variable mmu_mtspr : std_ulogic; variable itlb_fault : std_ulogic; variable misaligned : std_ulogic; - variable fp_reg_conv : std_ulogic; begin v := r; req := '0'; @@ -320,7 +318,6 @@ begin sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); dsisr := (others => '0'); mmureq := '0'; - fp_reg_conv := '0'; v.wr_sel := "11"; write_enable := '0'; @@ -366,40 +363,19 @@ begin end loop; if HAS_FPU then - -- Single-precision FP conversion - v.st_sp_data := store_sp_data; + -- Single-precision FP conversion for loads v.ld_sp_data := data_trimmed(31 downto 0); v.ld_sp_nz := or (data_trimmed(22 downto 0)); v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0)); end if; -- Byte reversing and rotating for stores. - -- Done in the first cycle (when l_in.valid = 1) for integer stores - -- and DP float stores, and in the second cycle for SP float stores. 
- store_data := r.store_data; - if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then - if HAS_FPU and r.state = FPR_CONV then - data_in := x"00000000" & r.st_sp_data; - byte_offset := unsigned(r.addr(2 downto 0)); - byte_rev := r.byte_reverse; - length := r.length; - else - data_in := l_in.data; - byte_offset := unsigned(lsu_sum(2 downto 0)); - byte_rev := l_in.byte_reverse; - length := l_in.length; - end if; - brev_lenm1 := "000"; - if byte_rev = '1' then - brev_lenm1 := unsigned(length(2 downto 0)) - 1; - end if; - for i in 0 to 7 loop - k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1; - j := to_integer(k) * 8; - store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j); - end loop; - end if; - v.store_data := store_data; + -- Done in the second cycle (the cycle after l_in.valid = 1). + for i in 0 to 7 loop + k := (to_unsigned(i, 3) - r.byte_offset) xor r.brev_mask; + j := to_integer(k) * 8; + store_data(i * 8 + 7 downto i * 8) := r.store_data(j + 7 downto j); + end loop; -- compute (addr + 8) & ~7 for the second doubleword when unaligned next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; @@ -431,14 +407,6 @@ begin case r.state is when IDLE => - when FPR_CONV => - req := '1'; - if r.second_bytes /= "00000000" then - v.state := SECOND_REQ; - else - v.state := ACK_WAIT; - end if; - when SECOND_REQ => req := '1'; v.state := ACK_WAIT; @@ -561,6 +529,12 @@ begin v.do_update := '0'; v.extra_cycle := '0'; + if HAS_FPU and l_in.is_32bit = '1' then + v.store_data := x"00000000" & store_sp_data; + else + v.store_data := l_in.data; + end if; + addr := lsu_sum; if l_in.second = '1' then -- for the second half of a 16-byte transfer, use next_addr @@ -609,12 +583,7 @@ begin case l_in.op is when OP_STORE => - if HAS_FPU and l_in.is_32bit = '1' then - v.state := FPR_CONV; - fp_reg_conv := '1'; - else - req := '1'; - end if; + req := '1'; when OP_LOAD => req := '1'; v.load := '1'; @@ -684,7 +653,20 @@ begin end if; end if; - v.busy := req 
or mmureq or mmu_mtspr or fp_reg_conv; + v.busy := req or mmureq or mmu_mtspr; + end if; + + -- Work out controls for store formatting + if l_in.valid = '1' then + byte_offset := unsigned(lsu_sum(2 downto 0)); + byte_rev := l_in.byte_reverse; + length := l_in.length; + brev_lenm1 := "000"; + if byte_rev = '1' then + brev_lenm1 := unsigned(length(2 downto 0)) - 1; + end if; + v.byte_offset := byte_offset; + v.brev_mask := brev_lenm1; end if; -- Work out load formatter controls for next cycle From cb1e3f6d705c6b1808e96ef6e5873c18e9d33a36 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 16 Dec 2020 19:32:07 +1100 Subject: [PATCH 4/9] decode1: Take an extra cycle for predicted branch redirects This does the addition of NIA plus the branch offset from the instruction after a clock edge, in order to ease timing, as the path from the icache RAM through the adder in decode1 to the NIA register in fetch1 was showing up as a critical path. This adds one extra cycle of latency when redirecting fetch because of a predicted-taken branch. 
Signed-off-by: Paul Mackerras --- decode1.vhdl | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/decode1.vhdl b/decode1.vhdl index 086083e..2edacd3 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -31,6 +31,7 @@ end entity decode1; architecture behaviour of decode1 is signal r, rin : Decode1ToDecode2Type; signal s : Decode1ToDecode2Type; + signal f, fin : Decode1ToFetch1Type; constant illegal_inst : decode_rom_t := (NONE, NONE, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE); @@ -47,6 +48,14 @@ architecture behaviour of decode1 is signal ri, ri_in : reg_internal_t; signal si : reg_internal_t; + type br_predictor_t is record + br_nia : std_ulogic_vector(61 downto 0); + br_offset : signed(23 downto 0); + predict : std_ulogic; + end record; + + signal br, br_in : br_predictor_t; + subtype major_opcode_t is unsigned(5 downto 0); type major_rom_array_t is array(0 to 63) of decode_rom_t; type minor_valid_array_t is array(0 to 1023) of std_ulogic; @@ -537,6 +546,13 @@ begin ri <= ri_in; end if; end if; + if rst = '1' then + br.br_nia <= (others => '0'); + br.br_offset <= (others => '0'); + br.predict <= '0'; + else + br <= br_in; + end if; end if; end process; busy_out <= s.valid; @@ -544,14 +560,13 @@ begin decode1_1: process(all) variable v : Decode1ToDecode2Type; variable vi : reg_internal_t; - variable f : Decode1ToFetch1Type; variable majorop : major_opcode_t; variable minor4op : std_ulogic_vector(10 downto 0); variable op_19_bits: std_ulogic_vector(2 downto 0); variable sprn : spr_num_t; - variable br_nia : std_ulogic_vector(61 downto 0); variable br_target : std_ulogic_vector(61 downto 0); variable br_offset : signed(23 downto 0); + variable bv : br_predictor_t; begin v := Decode1ToDecode2Init; vi := reg_internal_t_init; @@ -707,17 +722,19 @@ begin -- Branch predictor -- Note bclr, bcctr and bctar are predicted not taken as we have no -- 
count cache or link stack. - br_nia := f_in.nia(63 downto 2); + bv.br_nia := f_in.nia(63 downto 2); if f_in.insn(1) = '1' then - br_nia := (others => '0'); + bv.br_nia := (others => '0'); end if; - br_target := std_ulogic_vector(signed(br_nia) + br_offset); - f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid; - f.redirect_nia := br_target & "00"; + bv.br_offset := br_offset; + bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out; + -- after a clock edge... + br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset); -- Update registers rin <= v; ri_in <= vi; + br_in <= bv; -- Update outputs d_out <= r; @@ -729,8 +746,9 @@ begin if ri.force_single = '1' then d_out.decode.sgl_pipe <= '1'; end if; - f_out <= f; - flush_out <= f.redirect; + f_out.redirect <= br.predict; + f_out.redirect_nia <= br_target & "00"; + flush_out <= bv.predict or br.predict; end process; d1_log: if LOG_LENGTH > 0 generate From 9ea1ab0215111bb3d87bf2f9d030f630aea5f952 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 16 Dec 2020 20:41:08 +1100 Subject: [PATCH 5/9] execute1: Move branch adder after register This does the addition of the instruction NIA and the branch offset after the register at the output of execute1 rather than before. The propagation through the adder was showing up as a critical path on the A7-100. Performance is unaffected and now it makes timing. 
Signed-off-by: Paul Mackerras --- execute1.vhdl | 116 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 46 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index 11d81ed..4ea2680 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -53,7 +53,6 @@ end entity execute1; architecture behaviour of execute1 is type reg_type is record e : Execute1ToWritebackType; - f : Execute1ToFetch1Type; busy: std_ulogic; terminate: std_ulogic; fp_exception_next : std_ulogic; @@ -71,15 +70,24 @@ architecture behaviour of execute1 is slow_op_oe : std_ulogic; slow_op_xerc : xer_common_t; last_nia : std_ulogic_vector(63 downto 0); + redirect : std_ulogic; + abs_br : std_ulogic; + do_intr : std_ulogic; + vector : integer range 0 to 16#fff#; + br_offset : std_ulogic_vector(63 downto 0); + redir_mode : std_ulogic_vector(3 downto 0); log_addr_spr : std_ulogic_vector(31 downto 0); end record; constant reg_type_init : reg_type := - (e => Execute1ToWritebackInit, f => Execute1ToFetch1Init, + (e => Execute1ToWritebackInit, busy => '0', lr_update => '0', terminate => '0', fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, - next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0')); + next_lr => (others => '0'), last_nia => (others => '0'), + redirect => '0', abs_br => '0', do_intr => '0', vector => 0, + br_offset => (others => '0'), redir_mode => "0000", + others => (others => '0')); signal r, rin : reg_type; @@ -340,6 +348,7 @@ begin variable spr_val : std_ulogic_vector(63 downto 0); variable addend : std_ulogic_vector(127 downto 0); variable do_trace : std_ulogic; + variable f : Execute1ToFetch1Type; variable fv : Execute1ToFPUType; begin result := (others => '0'); @@ -352,8 +361,15 @@ begin v := r; v.e := Execute1ToWritebackInit; + 
v.redirect := '0'; + v.abs_br := '0'; + v.do_intr := '0'; + v.vector := 0; + v.br_offset := (others => '0'); + v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & + not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); + lv := Execute1ToLoadstore1Init; - v.f.redirect := '0'; fv := Execute1ToFPUInit; -- XER forwarding. To avoid having to track XER hazards, we use @@ -471,11 +487,11 @@ begin irq_valid := '0'; if ctrl.msr(MSR_EE) = '1' then if ctrl.dec(63) = '1' then - v.f.redirect_nia := std_logic_vector(to_unsigned(16#900#, 64)); + v.vector := 16#900#; report "IRQ valid: DEC"; irq_valid := '1'; elsif ext_irq_in = '1' then - v.f.redirect_nia := std_logic_vector(to_unsigned(16#500#, 64)); + v.vector := 16#500#; report "IRQ valid: External"; irq_valid := '1'; end if; @@ -484,11 +500,6 @@ begin v.terminate := '0'; icache_inval <= '0'; v.busy := '0'; - -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 - v.f.virt_mode := ctrl.msr(MSR_IR); - v.f.priv_mode := not ctrl.msr(MSR_PR); - v.f.big_endian := not ctrl.msr(MSR_LE); - v.f.mode_32bit := not ctrl.msr(MSR_SF); -- Next insn adder used in a couple of places next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4); @@ -546,13 +557,13 @@ begin if HAS_FPU and r.fp_exception_next = '1' then -- This is used for FP-type program interrupts that -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. 
- v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + v.vector := 16#700#; ctrl_tmp.srr1(63 - 43) <= '1'; ctrl_tmp.srr1(63 - 47) <= '1'; else -- Generate a trace interrupt rather than executing the next instruction -- or taking any asynchronous interrupt - v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64)); + v.vector := 16#d00#; ctrl_tmp.srr1(63 - 33) <= '1'; if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then @@ -574,7 +585,7 @@ begin instr_is_privileged(e_in.insn_type, e_in.insn) then -- generate a program interrupt exception := '1'; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + v.vector := 16#700#; -- set bit 45 to indicate privileged instruction type interrupt ctrl_tmp.srr1(63 - 45) <= '1'; report "privileged instruction"; @@ -586,7 +597,7 @@ begin elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then -- generate a floating-point unavailable interrupt exception := '1'; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#800#, 64)); + v.vector := 16#800#; report "FP unavailable interrupt"; elsif valid_in = '1' and e_in.unit = ALU then @@ -614,7 +625,7 @@ begin if e_in.insn(1) = '1' then exception := '1'; exception_nextpc := '1'; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#C00#, 64)); + v.vector := 16#C00#; report "sc"; else illegal := '1'; @@ -702,7 +713,7 @@ begin end loop; else -- trap instructions (tw, twi, td, tdi) - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + v.vector := 16#700#; -- set bit 46 to say trap occurred ctrl_tmp.srr1(63 - 46) <= '1'; if or (trapval and insn_to(e_in.insn)) = '1' then @@ -785,10 +796,8 @@ begin end if; when OP_RFID => - v.f.virt_mode := a_in(MSR_IR) or a_in(MSR_PR); - v.f.priv_mode := not a_in(MSR_PR); - v.f.big_endian := not a_in(MSR_LE); - v.f.mode_32bit := not a_in(MSR_SF); + v.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not 
a_in(MSR_PR) & + not a_in(MSR_LE) & not a_in(MSR_SF); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31); @@ -1032,8 +1041,8 @@ begin end if; when OP_ISYNC => - v.f.redirect := '1'; - v.f.redirect_nia := next_nia; + v.redirect := '1'; + v.br_offset := std_ulogic_vector(to_unsigned(4, 64)); when OP_ICBI => icache_inval <= '1'; @@ -1063,16 +1072,13 @@ begin ctrl_tmp.cfar <= e_in.nia; end if; if e_in.br_pred = '0' then - if abs_branch = '1' then - v.f.redirect_nia := b_in; - else - v.f.redirect_nia := std_ulogic_vector(signed(e_in.nia) + signed(b_in)); - end if; + v.br_offset := b_in; + v.abs_br := abs_branch; else - v.f.redirect_nia := next_nia; + v.br_offset := std_ulogic_vector(to_unsigned(4, 64)); end if; if taken_branch /= e_in.br_pred then - v.f.redirect := '1'; + v.redirect := '1'; end if; end if; @@ -1114,7 +1120,7 @@ begin -- valid_in = 0. Hence they don't happen in the same cycle as any of -- the cases above which depend on valid_in = 1. - if r.f.redirect = '1' then + if r.redirect = '1' then v.e.valid := '1'; end if; if r.lr_update = '1' then @@ -1195,14 +1201,14 @@ begin -- The case where MSR[FE0,FE1] goes from zero to non-zero is -- handled above by mtmsrd and rfid setting v.fp_exception_next. 
if HAS_FPU and fp_in.interrupt = '1' then - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + v.vector := 16#700#; ctrl_tmp.srr1(63 - 43) <= '1'; exception := '1'; end if; if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then exception := '1'; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64)); + v.vector := 16#700#; -- Since we aren't doing Hypervisor emulation assist (0xe40) we -- set bit 44 to indicate we have an illegal ctrl_tmp.srr1(63 - 44) <= '1'; @@ -1226,12 +1232,12 @@ begin -- or ISI or ISegI for instruction fetch exceptions if l_in.exception = '1' then if l_in.alignment = '1' then - v.f.redirect_nia := std_logic_vector(to_unsigned(16#600#, 64)); + v.vector := 16#600#; elsif l_in.instr_fault = '0' then if l_in.segment_fault = '0' then - v.f.redirect_nia := std_logic_vector(to_unsigned(16#300#, 64)); + v.vector := 16#300#; else - v.f.redirect_nia := std_logic_vector(to_unsigned(16#380#, 64)); + v.vector := 16#380#; end if; else if l_in.segment_fault = '0' then @@ -1239,9 +1245,9 @@ begin ctrl_tmp.srr1(63 - 35) <= l_in.perm_error; -- noexec fault ctrl_tmp.srr1(63 - 44) <= l_in.badtree; ctrl_tmp.srr1(63 - 45) <= l_in.rc_error; - v.f.redirect_nia := std_logic_vector(to_unsigned(16#400#, 64)); + v.vector := 16#400#; else - v.f.redirect_nia := std_logic_vector(to_unsigned(16#480#, 64)); + v.vector := 16#480#; end if; end if; v.e.exc_write_enable := '1'; @@ -1251,19 +1257,37 @@ begin if exception = '1' or l_in.exception = '1' then ctrl_tmp.irq_state <= WRITE_SRR1; - v.f.redirect := '1'; - v.f.virt_mode := '0'; - v.f.priv_mode := '1'; - -- XXX need an interrupt LE bit here, e.g. 
from LPCR - v.f.big_endian := '0'; - v.f.mode_32bit := '0'; + v.redirect := '1'; + v.do_intr := '1'; end if; - if v.f.redirect = '1' then + if v.redirect = '1' then v.busy := '1'; v.e.valid := '0'; end if; + -- Outputs to fetch1 + f.redirect := r.redirect; + if r.do_intr = '1' then + f.redirect_nia := std_ulogic_vector(to_unsigned(r.vector, 64)); + f.virt_mode := '0'; + f.priv_mode := '1'; + -- XXX need an interrupt LE bit here, e.g. from LPCR + f.big_endian := '0'; + f.mode_32bit := '0'; + else + if r.abs_br = '1' then + f.redirect_nia := r.br_offset; + else + f.redirect_nia := std_ulogic_vector(unsigned(r.last_nia) + unsigned(r.br_offset)); + end if; + -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 + f.virt_mode := r.redir_mode(3); + f.priv_mode := r.redir_mode(2); + f.big_endian := r.redir_mode(1); + f.mode_32bit := r.redir_mode(0); + end if; + -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; lv.nia := e_in.nia; @@ -1309,7 +1333,7 @@ begin rin <= v; -- update outputs - f_out <= r.f; + f_out <= f; l_out <= lv; e_out <= r.e; fp_out <= fv; From 658feabfd40fa4d4e3048334d11036fc1c1c959b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 26 Sep 2020 17:19:57 +1000 Subject: [PATCH 6/9] core: Make result multiplexing explicit This adds an explicit multiplexer feeding v.e.write_data in execute1, with the select lines determined in the previous cycle based on the insn_type. Similarly, for multiply and divide instructions, there is now an explicit multiplexer. 
Signed-off-by: Paul Mackerras --- common.vhdl | 6 +- decode2.vhdl | 52 +++++++++++++++++ execute1.vhdl | 150 +++++++++++++++++++++++++++++++------------------- logical.vhdl | 6 +- 4 files changed, 154 insertions(+), 60 deletions(-) diff --git a/common.vhdl b/common.vhdl index 8b9380c..44f63bd 100644 --- a/common.vhdl +++ b/common.vhdl @@ -210,6 +210,7 @@ package common is rc: std_ulogic; oe: std_ulogic; invert_a: std_ulogic; + addm1 : std_ulogic; invert_out: std_ulogic; input_carry: carry_in_t; output_carry: std_ulogic; @@ -224,18 +225,21 @@ package common is update : std_ulogic; -- is this an update instruction? reserve : std_ulogic; -- set for larx/stcx br_pred : std_ulogic; + result_sel : std_ulogic_vector(2 downto 0); -- select source of result + sub_select : std_ulogic_vector(2 downto 0); -- sub-result selection repeat : std_ulogic; -- set if instruction is cracked into two ops second : std_ulogic; -- set if this is the second op end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', - bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', + bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), + result_sel => "000", sub_select => "000", repeat => '0', second => '0', others => (others => '0')); type MultiplyInputType is record diff --git a/decode2.vhdl b/decode2.vhdl index 8b4633a..561fd79 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -221,6 +221,52 @@ 
architecture behaviour of decode2 is end case; end; + -- control signals that are derived from insn_type + type mux_select_array_t is array(insn_type_t) of std_ulogic_vector(2 downto 0); + + constant result_select : mux_select_array_t := ( + OP_AND => "001", -- logical_result + OP_OR => "001", + OP_XOR => "001", + OP_POPCNT => "001", + OP_PRTY => "001", + OP_CMPB => "001", + OP_EXTS => "001", + OP_BPERM => "001", + OP_BCD => "001", + OP_MTSPR => "001", + OP_RLC => "010", -- rotator_result + OP_RLCL => "010", + OP_RLCR => "010", + OP_SHL => "010", + OP_SHR => "010", + OP_EXTSWSLI => "010", + OP_MUL_L64 => "011", -- muldiv_result + OP_MUL_H64 => "011", + OP_MUL_H32 => "011", + OP_DIV => "011", + OP_DIVE => "011", + OP_MOD => "011", + OP_CNTZ => "100", -- countzero_result + OP_MFSPR => "101", -- spr_result + OP_ISEL => "111", -- misc_result + OP_DARN => "111", + OP_MFMSR => "111", + OP_MFCR => "111", + OP_SETB => "111", + others => "000" -- default to adder_result + ); + + constant subresult_select : mux_select_array_t := ( + OP_MUL_L64 => "000", -- muldiv_result + OP_MUL_H64 => "001", + OP_MUL_H32 => "010", + OP_DIV => "011", + OP_DIVE => "011", + OP_MOD => "011", + others => "000" + ); + -- issue control signals signal control_valid_in : std_ulogic; signal control_valid_out : std_ulogic; @@ -400,6 +446,10 @@ begin v.e.bypass_cr := cr_bypass; v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; + v.e.addm1 := '0'; + if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then + v.e.addm1 := '1'; + end if; v.e.invert_out := d_in.decode.invert_out; v.e.input_carry := d_in.decode.input_carry; v.e.output_carry := d_in.decode.output_carry; @@ -415,6 +465,8 @@ begin v.e.update := d_in.decode.update; v.e.reserve := d_in.decode.reserve; v.e.br_pred := d_in.br_pred; + v.e.result_sel := result_select(d_in.decode.insn_type); + v.e.sub_select := subresult_select(d_in.decode.insn_type); -- issue control control_valid_in <= d_in.valid; diff --git 
a/execute1.vhdl b/execute1.vhdl index 4ea2680..6d2eb04 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -60,6 +60,8 @@ architecture behaviour of execute1 is prev_op : insn_type_t; lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); + resmux : std_ulogic_vector(2 downto 0); + submux : std_ulogic_vector(2 downto 0); mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; @@ -103,6 +105,13 @@ architecture behaviour of execute1 is signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); signal countzero_result: std_ulogic_vector(63 downto 0); + signal alu_result: std_ulogic_vector(63 downto 0); + signal adder_result: std_ulogic_vector(63 downto 0); + signal misc_result: std_ulogic_vector(63 downto 0); + signal muldiv_result: std_ulogic_vector(63 downto 0); + signal spr_result: std_ulogic_vector(63 downto 0); + signal result_mux_sel: std_ulogic_vector(2 downto 0); + signal sub_mux_sel: std_ulogic_vector(2 downto 0); -- multiply signals signal x_to_multiply: MultiplyInputType; @@ -285,6 +294,18 @@ begin terminate_out <= r.terminate; + -- Result mux + result_mux_sel <= e_in.result_sel when r.busy = '0' else r.resmux; + sub_mux_sel <= e_in.sub_select when r.busy = '0' else r.submux; + with result_mux_sel select alu_result <= + adder_result when "000", + logical_result when "001", + rotator_result when "010", + muldiv_result when "011", + countzero_result when "100", + spr_result when "101", + misc_result when others; + execute1_0: process(clk) begin if rising_edge(clk) then @@ -310,7 +331,8 @@ begin execute1_1: process(all) variable v : reg_type; variable a_inv : std_ulogic_vector(63 downto 0); - variable result : std_ulogic_vector(63 downto 0); + variable b_or_m1 : std_ulogic_vector(63 downto 0); + variable addg6s : std_ulogic_vector(63 downto 0); variable newcrf : std_ulogic_vector(3 downto 0); variable sum_with_carry : std_ulogic_vector(64 downto 0); variable result_en : std_ulogic; @@ -348,16 
+370,17 @@ begin variable spr_val : std_ulogic_vector(63 downto 0); variable addend : std_ulogic_vector(127 downto 0); variable do_trace : std_ulogic; + variable hold_wr_data : std_ulogic; variable f : Execute1ToFetch1Type; variable fv : Execute1ToFPUType; begin - result := (others => '0'); sum_with_carry := (others => '0'); result_en := '0'; newcrf := (others => '0'); is_branch := '0'; taken_branch := '0'; abs_branch := '0'; + hold_wr_data := '0'; v := r; v.e := Execute1ToWritebackInit; @@ -399,14 +422,24 @@ begin v.cntz_in_progress := '0'; v.mul_finish := '0'; + misc_result <= (others => '0'); + spr_result <= (others => '0'); + spr_val := (others => '0'); + -- Main adder if e_in.invert_a = '0' then a_inv := a_in; else a_inv := not a_in; end if; - sum_with_carry := ppc_adde(a_inv, b_in, + if e_in.addm1 = '0' then + b_or_m1 := b_in; + else + b_or_m1 := (others => '1'); + end if; + sum_with_carry := ppc_adde(a_inv, b_or_m1, decode_input_carry(e_in.input_carry, v.e.xerc)); + adder_result <= sum_with_carry(63 downto 0); -- signals to multiply and divide units sign1 := '0'; @@ -432,6 +465,7 @@ begin abs2 := - signed(b_in); end if; + -- Interface to multiply and divide units x_to_multiply <= MultiplyInputInit; x_to_multiply.is_32bit <= e_in.is_32bit; @@ -479,6 +513,18 @@ begin x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0)); end if; + case sub_mux_sel(1 downto 0) is + when "00" => + muldiv_result <= multiply_to_x.result(63 downto 0); + when "01" => + muldiv_result <= multiply_to_x.result(127 downto 64); + when "10" => + muldiv_result <= multiply_to_x.result(63 downto 32) & + multiply_to_x.result(63 downto 32); + when others => + muldiv_result <= divider_to_x.write_reg_data; + end case; + ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); @@ -611,6 +657,8 @@ begin v.slow_op_rc := e_in.rc; v.slow_op_oe := e_in.oe; v.slow_op_xerc := v.e.xerc; + v.resmux := e_in.result_sel; + v.submux 
:= e_in.sub_select; case_0: case e_in.insn_type is @@ -642,8 +690,7 @@ begin when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT => -- Do nothing when OP_ADD | OP_CMP | OP_TRAP => - result := sum_with_carry(63 downto 0); - carry_32 := result(32) xor a_inv(32) xor b_in(32); + carry_32 := sum_with_carry(32) xor a_inv(32) xor b_in(32); carry_64 := sum_with_carry(64); if e_in.insn_type = OP_ADD then if e_in.output_carry = '1' then @@ -724,17 +771,18 @@ begin end if; end if; when OP_ADDG6S => - result := (others => '0'); + addg6s := (others => '0'); for i in 0 to 14 loop lo := i * 4; hi := (i + 1) * 4; if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then - result(lo + 3 downto lo) := "0110"; + addg6s(lo + 3 downto lo) := "0110"; end if; end loop; if sum_with_carry(64) = '0' then - result(63 downto 60) := "0110"; + addg6s(63 downto 60) := "0110"; end if; + misc_result <= addg6s; result_en := '1'; when OP_CMPRB => newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); @@ -754,7 +802,6 @@ begin newcrf & newcrf & newcrf & newcrf; when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM | OP_BCD => - result := logical_result; result_en := '1'; when OP_B => is_branch := '1'; @@ -765,12 +812,11 @@ begin end if; when OP_BC => -- read_data1 is CTR + v.e.write_reg := fast_spr_num(SPR_CTR); bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' then - result := std_ulogic_vector(unsigned(a_in) - 1); result_en := '1'; - v.e.write_reg := fast_spr_num(SPR_CTR); end if; is_branch := '1'; taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); @@ -781,12 +827,11 @@ begin when OP_BCREG => -- read_data1 is CTR -- read_data2 is target register (CTR, LR or TAR) + v.e.write_reg := fast_spr_num(SPR_CTR); bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' and e_in.insn(10) = '0' then - result := std_ulogic_vector(unsigned(a_in) - 1); result_en := '1'; - v.e.write_reg := fast_spr_num(SPR_CTR); end if; is_branch := '1'; 
taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); @@ -825,9 +870,9 @@ begin when OP_ISEL => crbit := to_integer(unsigned(insn_bc(e_in.insn))); if cr_in(31-crbit) = '1' then - result := a_in; + misc_result <= a_in; else - result := b_in; + misc_result <= b_in; end if; result_en := '1'; when OP_CROP => @@ -885,38 +930,38 @@ begin if random_err = '0' then case e_in.insn(17 downto 16) is when "00" => - result := x"00000000" & random_cond(31 downto 0); + misc_result <= x"00000000" & random_cond(31 downto 0); when "10" => - result := random_raw; + misc_result <= random_raw; when others => - result := random_cond; + misc_result <= random_cond; end case; else - result := (others => '1'); + misc_result <= (others => '1'); end if; result_en := '1'; when OP_MFMSR => - result := ctrl.msr; + misc_result <= ctrl.msr; result_en := '1'; when OP_MFSPR => report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(a_in); result_en := '1'; if is_fast_spr(e_in.read_reg1) then - result := a_in; - if decode_spr_num(e_in.insn) = SPR_XER then + spr_val := a_in; + if decode_spr_num(e_in.insn) = SPR_XER then -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer - result(63 downto 32) := (others => '0'); - result(63-32) := v.e.xerc.so; - result(63-33) := v.e.xerc.ov; - result(63-34) := v.e.xerc.ca; - result(63-35 downto 63-43) := "000000000"; - result(63-44) := v.e.xerc.ov32; - result(63-45) := v.e.xerc.ca32; - end if; + spr_val(63 downto 32) := (others => '0'); + spr_val(63-32) := v.e.xerc.so; + spr_val(63-33) := v.e.xerc.ov; + spr_val(63-34) := v.e.xerc.ca; + spr_val(63-35 downto 63-43) := "000000000"; + spr_val(63-44) := v.e.xerc.ov32; + spr_val(63-45) := v.e.xerc.ca32; + end if; else spr_val := c_in; - case decode_spr_num(e_in.insn) is + case decode_spr_num(e_in.insn) is when SPR_TB => spr_val := ctrl.tb; when SPR_TBU => @@ -940,22 +985,23 @@ begin if ctrl.msr(MSR_PR) = '1' then illegal := '1'; end if; - end case; - result := 
spr_val; - end if; + end case; + end if; + spr_result <= spr_val; + when OP_MFCR => if e_in.insn(20) = '0' then -- mfcr - result := x"00000000" & cr_in; + misc_result <= x"00000000" & cr_in; else -- mfocrf crnum := fxm_to_num(insn_fxm(e_in.insn)); - result := (others => '0'); + misc_result <= (others => '0'); for i in 0 to 7 loop lo := (7-i)*4; hi := lo + 3; if crnum = i then - result(hi downto lo) := cr_in(hi downto lo); + misc_result(hi downto lo) <= cr_in(hi downto lo); end if; end loop; end if; @@ -999,7 +1045,6 @@ begin report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(c_in); if is_fast_spr(e_in.write_reg) then - result := c_in; result_en := '1'; if decode_spr_num(e_in.insn) = SPR_XER then v.e.xerc.so := c_in(63-32); @@ -1025,7 +1070,6 @@ begin end case; end if; when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI => - result := rotator_result; if e_in.output_carry = '1' then set_carry(v.e, rotator_carry, rotator_carry); end if; @@ -1033,11 +1077,11 @@ begin when OP_SETB => bfa := insn_bfa(e_in.insn); crbit := to_integer(unsigned(bfa)) * 4; - result := (others => '0'); + misc_result <= (others => '0'); if cr_in(31 - crbit) = '1' then - result := (others => '1'); + misc_result <= (others => '1'); elsif cr_in(30 - crbit) = '1' then - result(0) := '1'; + misc_result(0) <= '1'; end if; when OP_ISYNC => @@ -1130,10 +1174,9 @@ begin v.e.valid := '1'; -- Keep r.e.write_data unchanged next cycle in case it is needed -- for a forwarded result (e.g. for CTR). 
- result := r.e.write_data; + hold_wr_data := '1'; elsif r.cntz_in_progress = '1' then -- cnt[lt]z always takes two cycles - result := countzero_result; result_en := '1'; v.e.write_reg := gpr_to_gspr(r.slow_op_dest); v.e.rc := r.slow_op_rc; @@ -1144,18 +1187,7 @@ begin (r.div_in_progress = '1' and divider_to_x.valid = '1') then if r.mul_in_progress = '1' then overflow := '0'; - case r.slow_op_insn is - when OP_MUL_H32 => - result := multiply_to_x.result(63 downto 32) & - multiply_to_x.result(63 downto 32); - when OP_MUL_H64 => - result := multiply_to_x.result(127 downto 64); - when others => - -- i.e. OP_MUL_L64 - result := multiply_to_x.result(63 downto 0); - end case; else - result := divider_to_x.write_reg_data; overflow := divider_to_x.overflow; end if; if r.mul_in_progress = '1' and r.slow_op_oe = '1' then @@ -1184,7 +1216,7 @@ begin v.div_in_progress := r.div_in_progress; end if; elsif r.mul_finish = '1' then - result := r.e.write_data; + hold_wr_data := '1'; result_en := '1'; v.e.write_reg := gpr_to_gspr(r.slow_op_dest); v.e.rc := r.slow_op_rc; @@ -1225,7 +1257,11 @@ begin v.trace_next := '1'; end if; - v.e.write_data := result; + if hold_wr_data = '0' then + v.e.write_data := alu_result; + else + v.e.write_data := r.e.write_data; + end if; v.e.write_enable := result_en and not exception; -- generate DSI or DSegI for load/store exceptions diff --git a/logical.vhdl b/logical.vhdl index d008e47..6b6f202 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -197,8 +197,7 @@ begin tmp := x"00" & dpd_to_bcd(rs(51 downto 42)) & dpd_to_bcd(rs(41 downto 32)) & x"00" & dpd_to_bcd(rs(19 downto 10)) & dpd_to_bcd(rs(9 downto 0)); end if; - when others => - -- EXTS + when OP_EXTS => -- note datalen is a 1-hot encoding negative := (datalen(0) and rs(7)) or (datalen(1) and rs(15)) or @@ -211,6 +210,9 @@ begin tmp(15 downto 8) := rs(15 downto 8); end if; tmp(7 downto 0) := rs(7 downto 0); + when others => + -- e.g. 
OP_MTSPR + tmp := rs; end case; result <= tmp; From b0510fd1bbfe50ab7f61e6be4a4643c9d5dd87b1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 26 Sep 2020 19:58:46 +1000 Subject: [PATCH 7/9] core: Reorganize execute1 This breaks up the enormous if .. elsif .. case .. elsif statement in execute1 in order to try to make it simpler and more understandable. We now have decode2 deciding whether the instruction has a value to be written back to a register (GPR, GSPR, FPR, etc.) rather than individual cases in execute1 setting result_en. The computation of the data to be written back is now independent of detection of various exception conditions. We now have an if block determining if any exception condition exists which prevents the next instruction from being executed, then the case statement which performs actions such as setting carry/overflow bits, determining if a trap exception exists, doing branches, etc., then an if statement for all the r.busy = 1 cases (continuing execution of an instruction which was started in a previous cycle, or writing SRR1 for an interrupt). 
Signed-off-by: Paul Mackerras --- common.vhdl | 3 +- decode2.vhdl | 18 +- execute1.vhdl | 556 ++++++++++++++++++++++++-------------------------- 3 files changed, 289 insertions(+), 288 deletions(-) diff --git a/common.vhdl b/common.vhdl index 44f63bd..d085199 100644 --- a/common.vhdl +++ b/common.vhdl @@ -195,6 +195,7 @@ package common is insn_type: insn_type_t; nia: std_ulogic_vector(63 downto 0); write_reg: gspr_index_t; + write_reg_enable: std_ulogic; read_reg1: gspr_index_t; read_reg2: gspr_index_t; read_data1: std_ulogic_vector(63 downto 0); @@ -232,7 +233,7 @@ package common is end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, - bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', + write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', diff --git a/decode2.vhdl b/decode2.vhdl index 561fd79..e00a05d 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -249,7 +249,8 @@ architecture behaviour of decode2 is OP_MOD => "011", OP_CNTZ => "100", -- countzero_result OP_MFSPR => "101", -- spr_result - OP_ISEL => "111", -- misc_result + OP_ADDG6S => "111", -- misc_result + OP_ISEL => "111", OP_DARN => "111", OP_MFMSR => "111", OP_MFCR => "111", @@ -264,6 +265,12 @@ architecture behaviour of decode2 is OP_DIV => "011", OP_DIVE => "011", OP_MOD => "011", + OP_ADDG6S => "001", -- misc_result + OP_ISEL => "010", + OP_DARN => "011", + OP_MFMSR => "100", + OP_MFCR => "101", + OP_SETB => "110", others => "000" ); @@ -438,6 +445,7 @@ begin v.e.read_data3 := decoded_reg_c.data; v.e.bypass_data3 := gpr_c_bypass; v.e.write_reg := decoded_reg_o.reg; + v.e.write_reg_enable := decoded_reg_o.reg_valid; 
v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); @@ -448,7 +456,13 @@ begin v.e.invert_a := d_in.decode.invert_a; v.e.addm1 := '0'; if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then + -- add -1 to CTR v.e.addm1 := '1'; + if d_in.insn(23) = '1' or + (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '1') then + -- don't write decremented CTR if BO(2) = 1 or bcctr/bctar (insn(10) = 1) + v.e.write_reg_enable := '0'; + end if; end if; v.e.invert_out := d_in.decode.invert_out; v.e.input_carry := d_in.decode.input_carry; @@ -472,7 +486,7 @@ begin control_valid_in <= d_in.valid; control_sgl_pipe <= d_in.decode.sgl_pipe; - gpr_write_valid <= decoded_reg_o.reg_valid; + gpr_write_valid <= v.e.write_reg_enable; gpr_write <= decoded_reg_o.reg; gpr_bypassable <= '0'; if EX1_BYPASS and d_in.decode.unit = ALU then diff --git a/execute1.vhdl b/execute1.vhdl index 6d2eb04..6a27ee8 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -53,6 +53,7 @@ end entity execute1; architecture behaviour of execute1 is type reg_type is record e : Execute1ToWritebackType; + cur_instr : Decode2ToExecute1Type; busy: std_ulogic; terminate: std_ulogic; fp_exception_next : std_ulogic; @@ -60,17 +61,10 @@ architecture behaviour of execute1 is prev_op : insn_type_t; lr_update : std_ulogic; next_lr : std_ulogic_vector(63 downto 0); - resmux : std_ulogic_vector(2 downto 0); - submux : std_ulogic_vector(2 downto 0); mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; cntz_in_progress : std_ulogic; - slow_op_insn : insn_type_t; - slow_op_dest : gpr_index_t; - slow_op_rc : std_ulogic; - slow_op_oe : std_ulogic; - slow_op_xerc : xer_common_t; last_nia : std_ulogic_vector(63 downto 0); redirect : std_ulogic; abs_br : std_ulogic; @@ -82,10 +76,10 @@ architecture behaviour of execute1 is end record; constant reg_type_init : reg_type := (e => 
Execute1ToWritebackInit, + cur_instr => Decode2ToExecute1Init, busy => '0', lr_update => '0', terminate => '0', fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', - slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init, next_lr => (others => '0'), last_nia => (others => '0'), redirect => '0', abs_br => '0', do_intr => '0', vector => 0, br_offset => (others => '0'), redir_mode => "0000", @@ -112,6 +106,7 @@ architecture behaviour of execute1 is signal spr_result: std_ulogic_vector(63 downto 0); signal result_mux_sel: std_ulogic_vector(2 downto 0); signal sub_mux_sel: std_ulogic_vector(2 downto 0); + signal current: Decode2ToExecute1Type; -- multiply signals signal x_to_multiply: MultiplyInputType; @@ -294,10 +289,10 @@ begin terminate_out <= r.terminate; + current <= e_in when r.busy = '0' else r.cur_instr; + -- Result mux - result_mux_sel <= e_in.result_sel when r.busy = '0' else r.resmux; - sub_mux_sel <= e_in.sub_select when r.busy = '0' else r.submux; - with result_mux_sel select alu_result <= + with current.result_sel select alu_result <= adder_result when "000", logical_result when "001", rotator_result when "010", @@ -333,9 +328,12 @@ begin variable a_inv : std_ulogic_vector(63 downto 0); variable b_or_m1 : std_ulogic_vector(63 downto 0); variable addg6s : std_ulogic_vector(63 downto 0); + variable isel_result : std_ulogic_vector(63 downto 0); + variable darn : std_ulogic_vector(63 downto 0); + variable mfcr_result : std_ulogic_vector(63 downto 0); + variable setb_result : std_ulogic_vector(63 downto 0); variable newcrf : std_ulogic_vector(3 downto 0); variable sum_with_carry : std_ulogic_vector(64 downto 0); - variable result_en : std_ulogic; variable crnum : crnum_t; variable crbit : integer range 0 to 31; variable scrnum : crnum_t; @@ -375,7 +373,6 @@ begin variable fv : Execute1ToFPUType; begin sum_with_carry 
:= (others => '0'); - result_en := '0'; newcrf := (others => '0'); is_branch := '0'; taken_branch := '0'; @@ -400,7 +397,7 @@ begin -- (SO, OV[32] and CA[32]) are only modified by instructions that are -- handled here, we can just forward the result being sent to -- writeback. - if r.e.write_xerc_enable = '1' then + if r.e.write_xerc_enable = '1' or r.busy = '1' then v.e.xerc := r.e.xerc; else v.e.xerc := e_in.xerc; @@ -422,7 +419,6 @@ begin v.cntz_in_progress := '0'; v.mul_finish := '0'; - misc_result <= (others => '0'); spr_result <= (others => '0'); spr_val := (others => '0'); @@ -440,6 +436,8 @@ begin sum_with_carry := ppc_adde(a_inv, b_or_m1, decode_input_carry(e_in.input_carry, v.e.xerc)); adder_result <= sum_with_carry(63 downto 0); + carry_32 := sum_with_carry(32) xor a_inv(32) xor b_in(32); + carry_64 := sum_with_carry(64); -- signals to multiply and divide units sign1 := '0'; @@ -513,7 +511,7 @@ begin x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0)); end if; - case sub_mux_sel(1 downto 0) is + case current.sub_select(1 downto 0) is when "00" => muldiv_result <= multiply_to_x.result(63 downto 0); when "01" => @@ -525,6 +523,117 @@ begin muldiv_result <= divider_to_x.write_reg_data; end case; + -- Compute misc_result + case current.sub_select is + when "000" => + misc_result <= (others => '0'); + when "001" => + -- addg6s + addg6s := (others => '0'); + for i in 0 to 14 loop + lo := i * 4; + hi := (i + 1) * 4; + if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then + addg6s(lo + 3 downto lo) := "0110"; + end if; + end loop; + if sum_with_carry(64) = '0' then + addg6s(63 downto 60) := "0110"; + end if; + misc_result <= addg6s; + when "010" => + -- isel + crbit := to_integer(unsigned(insn_bc(e_in.insn))); + if cr_in(31-crbit) = '1' then + isel_result := a_in; + else + isel_result := b_in; + end if; + misc_result <= isel_result; + when "011" => + -- darn + darn := (others => '1'); + if random_err = '0' then + case e_in.insn(17 
downto 16) is + when "00" => + darn := x"00000000" & random_cond(31 downto 0); + when "10" => + darn := random_raw; + when others => + darn := random_cond; + end case; + end if; + misc_result <= darn; + when "100" => + -- mfmsr + misc_result <= ctrl.msr; + when "101" => + if e_in.insn(20) = '0' then + -- mfcr + mfcr_result := x"00000000" & cr_in; + else + -- mfocrf + crnum := fxm_to_num(insn_fxm(e_in.insn)); + mfcr_result := (others => '0'); + for i in 0 to 7 loop + lo := (7-i)*4; + hi := lo + 3; + if crnum = i then + mfcr_result(hi downto lo) := cr_in(hi downto lo); + end if; + end loop; + end if; + misc_result <= mfcr_result; + when "110" => + -- setb + bfa := insn_bfa(e_in.insn); + crbit := to_integer(unsigned(bfa)) * 4; + setb_result := (others => '0'); + if cr_in(31 - crbit) = '1' then + setb_result := (others => '1'); + elsif cr_in(30 - crbit) = '1' then + setb_result(0) := '1'; + end if; + misc_result <= setb_result; + when others => + misc_result <= (others => '0'); + end case; + + -- compute comparison results + -- Note, we have done RB - RA, not RA - RB + if e_in.insn_type = OP_CMP then + l := insn_l(e_in.insn); + else + l := not e_in.is_32bit; + end if; + zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0))); + zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32))); + if zerolo = '1' and (l = '0' or zerohi = '1') then + -- values are equal + trapval := "00100"; + else + if l = '1' then + -- 64-bit comparison + msb_a := a_in(63); + msb_b := b_in(63); + else + -- 32-bit comparison + msb_a := a_in(31); + msb_b := b_in(31); + end if; + if msb_a /= msb_b then + -- Subtraction might overflow, but + -- comparison is clear from MSB difference. + -- for signed, 0 is greater; for unsigned, 1 is greater + trapval := msb_a & msb_b & '0' & msb_b & msb_a; + else + -- Subtraction cannot overflow since MSBs are equal. 
+ -- carry = 1 indicates RA is smaller (signed or unsigned) + a_lt := (not l and carry_32) or (l and carry_64); + trapval := a_lt & not a_lt & '0' & a_lt & not a_lt; + end if; + end if; + ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); @@ -577,38 +686,20 @@ begin v.prev_op := e_in.insn_type; end if; - if ctrl.irq_state = WRITE_SRR1 then - v.e.exc_write_reg := fast_spr_num(SPR_SRR1); - v.e.exc_write_data := ctrl.srr1; - v.e.exc_write_enable := '1'; - ctrl_tmp.msr(MSR_SF) <= '1'; - ctrl_tmp.msr(MSR_EE) <= '0'; - ctrl_tmp.msr(MSR_PR) <= '0'; - ctrl_tmp.msr(MSR_SE) <= '0'; - ctrl_tmp.msr(MSR_BE) <= '0'; - ctrl_tmp.msr(MSR_FP) <= '0'; - ctrl_tmp.msr(MSR_FE0) <= '0'; - ctrl_tmp.msr(MSR_FE1) <= '0'; - ctrl_tmp.msr(MSR_IR) <= '0'; - ctrl_tmp.msr(MSR_DR) <= '0'; - ctrl_tmp.msr(MSR_RI) <= '0'; - ctrl_tmp.msr(MSR_LE) <= '1'; - v.e.valid := '1'; - v.trace_next := '0'; - v.fp_exception_next := '0'; - report "Writing SRR1: " & to_hstring(ctrl.srr1); - - elsif valid_in = '1' and e_in.second = '0' and - ((HAS_FPU and r.fp_exception_next = '1') or r.trace_next = '1') then + -- Determine if there is any exception to be taken + -- before/instead of executing this instruction + if valid_in = '1' and e_in.second = '0' then if HAS_FPU and r.fp_exception_next = '1' then -- This is used for FP-type program interrupts that -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. 
+ exception := '1'; v.vector := 16#700#; ctrl_tmp.srr1(63 - 43) <= '1'; ctrl_tmp.srr1(63 - 47) <= '1'; - else + elsif r.trace_next = '1' then -- Generate a trace interrupt rather than executing the next instruction -- or taking any asynchronous interrupt + exception := '1'; v.vector := 16#d00#; ctrl_tmp.srr1(63 - 33) <= '1'; if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or @@ -617,48 +708,38 @@ begin elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then ctrl_tmp.srr1(63 - 36) <= '1'; end if; - end if; - exception := '1'; - - elsif irq_valid = '1' and valid_in = '1' and e_in.second = '0' then - -- we need two cycles to write srr0 and 1 - -- will need more when we have to write HEIR - -- Don't deliver the interrupt until we have a valid instruction - -- coming in, so we have a valid NIA to put in SRR0. - exception := '1'; - elsif valid_in = '1' and ctrl.msr(MSR_PR) = '1' and - instr_is_privileged(e_in.insn_type, e_in.insn) then - -- generate a program interrupt - exception := '1'; - v.vector := 16#700#; - -- set bit 45 to indicate privileged instruction type interrupt - ctrl_tmp.srr1(63 - 45) <= '1'; - report "privileged instruction"; + elsif irq_valid = '1' then + -- Don't deliver the interrupt until we have a valid instruction + -- coming in, so we have a valid NIA to put in SRR0. + exception := '1'; - elsif not HAS_FPU and valid_in = '1' and e_in.fac = FPU then - -- make lfd/stfd/lfs/stfs etc. 
illegal in no-FPU implementations - illegal := '1'; + elsif ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then + -- generate a program interrupt + exception := '1'; + v.vector := 16#700#; + -- set bit 45 to indicate privileged instruction type interrupt + ctrl_tmp.srr1(63 - 45) <= '1'; + report "privileged instruction"; - elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then - -- generate a floating-point unavailable interrupt - exception := '1'; - v.vector := 16#800#; - report "FP unavailable interrupt"; + elsif not HAS_FPU and e_in.fac = FPU then + -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations + illegal := '1'; - elsif valid_in = '1' and e_in.unit = ALU then + elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then + -- generate a floating-point unavailable interrupt + exception := '1'; + v.vector := 16#800#; + report "FP unavailable interrupt"; + end if; + end if; + if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then report "execute nia " & to_hstring(e_in.nia); + v.cur_instr := e_in; + v.next_lr := next_nia; v.e.valid := '1'; - v.e.write_reg := e_in.write_reg; - v.slow_op_insn := e_in.insn_type; - v.slow_op_dest := gspr_to_gpr(e_in.write_reg); - v.slow_op_rc := e_in.rc; - v.slow_op_oe := e_in.oe; - v.slow_op_xerc := v.e.xerc; - v.resmux := e_in.result_sel; - v.submux := e_in.sub_select; case_0: case e_in.insn_type is @@ -689,101 +770,48 @@ begin end if; when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT => -- Do nothing - when OP_ADD | OP_CMP | OP_TRAP => - carry_32 := sum_with_carry(32) xor a_inv(32) xor b_in(32); - carry_64 := sum_with_carry(64); - if e_in.insn_type = OP_ADD then - if e_in.output_carry = '1' then - if e_in.input_carry /= OV then - set_carry(v.e, carry_32, carry_64); - else - v.e.xerc.ov := carry_64; - v.e.xerc.ov32 := carry_32; - v.e.write_xerc_enable := '1'; - end if; - end if; - if e_in.oe = '1' then - set_ov(v.e, 
- calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)), - calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31))); - end if; - result_en := '1'; - else - -- trap, CMP and CMPL instructions - -- Note, we have done RB - RA, not RA - RB - if e_in.insn_type = OP_CMP then - l := insn_l(e_in.insn); - else - l := not e_in.is_32bit; - end if; - zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0))); - zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32))); - if zerolo = '1' and (l = '0' or zerohi = '1') then - -- values are equal - trapval := "00100"; + when OP_ADD => + if e_in.output_carry = '1' then + if e_in.input_carry /= OV then + set_carry(v.e, carry_32, carry_64); else - if l = '1' then - -- 64-bit comparison - msb_a := a_in(63); - msb_b := b_in(63); - else - -- 32-bit comparison - msb_a := a_in(31); - msb_b := b_in(31); - end if; - if msb_a /= msb_b then - -- Subtraction might overflow, but - -- comparison is clear from MSB difference. - -- for signed, 0 is greater; for unsigned, 1 is greater - trapval := msb_a & msb_b & '0' & msb_b & msb_a; - else - -- Subtraction cannot overflow since MSBs are equal. 
- -- carry = 1 indicates RA is smaller (signed or unsigned) - a_lt := (not l and carry_32) or (l and carry_64); - trapval := a_lt & not a_lt & '0' & a_lt & not a_lt; - end if; - end if; - if e_in.insn_type = OP_CMP then - if e_in.is_signed = '1' then - newcrf := trapval(4 downto 2) & v.e.xerc.so; - else - newcrf := trapval(1 downto 0) & trapval(2) & v.e.xerc.so; - end if; - bf := insn_bf(e_in.insn); - crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; - v.e.write_cr_mask := num_to_fxm(crnum); - for i in 0 to 7 loop - lo := i*4; - hi := lo + 3; - v.e.write_cr_data(hi downto lo) := newcrf; - end loop; - else - -- trap instructions (tw, twi, td, tdi) - v.vector := 16#700#; - -- set bit 46 to say trap occurred - ctrl_tmp.srr1(63 - 46) <= '1'; - if or (trapval and insn_to(e_in.insn)) = '1' then - -- generate trap-type program interrupt - exception := '1'; - report "trap"; - end if; + v.e.xerc.ov := carry_64; + v.e.xerc.ov32 := carry_32; + v.e.write_xerc_enable := '1'; end if; end if; - when OP_ADDG6S => - addg6s := (others => '0'); - for i in 0 to 14 loop - lo := i * 4; - hi := (i + 1) * 4; - if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then - addg6s(lo + 3 downto lo) := "0110"; - end if; + if e_in.oe = '1' then + set_ov(v.e, + calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)), + calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31))); + end if; + when OP_CMP => + -- CMP and CMPL instructions + if e_in.is_signed = '1' then + newcrf := trapval(4 downto 2) & v.e.xerc.so; + else + newcrf := trapval(1 downto 0) & trapval(2) & v.e.xerc.so; + end if; + bf := insn_bf(e_in.insn); + crnum := to_integer(unsigned(bf)); + v.e.write_cr_enable := '1'; + v.e.write_cr_mask := num_to_fxm(crnum); + for i in 0 to 7 loop + lo := i*4; + hi := lo + 3; + v.e.write_cr_data(hi downto lo) := newcrf; end loop; - if sum_with_carry(64) = '0' then - addg6s(63 downto 60) := "0110"; + when OP_TRAP => + -- trap instructions (tw, twi, td, tdi) + v.vector := 16#700#; 
+ -- set bit 46 to say trap occurred + ctrl_tmp.srr1(63 - 46) <= '1'; + if or (trapval and insn_to(e_in.insn)) = '1' then + -- generate trap-type program interrupt + exception := '1'; + report "trap"; end if; - misc_result <= addg6s; - result_en := '1'; + when OP_ADDG6S => when OP_CMPRB => newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); bf := insn_bf(e_in.insn); @@ -802,7 +830,6 @@ begin newcrf & newcrf & newcrf & newcrf; when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM | OP_BCD => - result_en := '1'; when OP_B => is_branch := '1'; taken_branch := '1'; @@ -812,12 +839,8 @@ begin end if; when OP_BC => -- read_data1 is CTR - v.e.write_reg := fast_spr_num(SPR_CTR); bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - if bo(4-2) = '0' then - result_en := '1'; - end if; is_branch := '1'; taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); abs_branch := insn_aa(e_in.insn); @@ -827,12 +850,8 @@ begin when OP_BCREG => -- read_data1 is CTR -- read_data2 is target register (CTR, LR or TAR) - v.e.write_reg := fast_spr_num(SPR_CTR); bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - if bo(4-2) = '0' and e_in.insn(10) = '0' then - result_en := '1'; - end if; is_branch := '1'; taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); abs_branch := '1'; @@ -868,13 +887,6 @@ begin v.cntz_in_progress := '1'; v.busy := '1'; when OP_ISEL => - crbit := to_integer(unsigned(insn_bc(e_in.insn))); - if cr_in(31-crbit) = '1' then - misc_result <= a_in; - else - misc_result <= b_in; - end if; - result_en := '1'; when OP_CROP => cr_op := insn_cr(e_in.insn); report "CR OP " & to_hstring(cr_op); @@ -927,27 +939,11 @@ begin v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; when OP_DARN => - if random_err = '0' then - case e_in.insn(17 downto 16) is - when "00" => - misc_result <= x"00000000" & random_cond(31 downto 0); - when "10" => - misc_result <= random_raw; - when others => - misc_result <= random_cond; - end case; 
- else - misc_result <= (others => '1'); - end if; - result_en := '1'; when OP_MFMSR => - misc_result <= ctrl.msr; - result_en := '1'; when OP_MFSPR => report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(a_in); - result_en := '1'; - if is_fast_spr(e_in.read_reg1) then + if is_fast_spr(e_in.read_reg1) = '1' then spr_val := a_in; if decode_spr_num(e_in.insn) = SPR_XER then -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer @@ -982,7 +978,7 @@ begin when others => -- mfspr from unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode - if ctrl.msr(MSR_PR) = '1' then + if is_fast_spr(e_in.read_reg1) = '0' and ctrl.msr(MSR_PR) = '1' then illegal := '1'; end if; end case; @@ -990,22 +986,6 @@ begin spr_result <= spr_val; when OP_MFCR => - if e_in.insn(20) = '0' then - -- mfcr - misc_result <= x"00000000" & cr_in; - else - -- mfocrf - crnum := fxm_to_num(insn_fxm(e_in.insn)); - misc_result <= (others => '0'); - for i in 0 to 7 loop - lo := (7-i)*4; - hi := lo + 3; - if crnum = i then - misc_result(hi downto lo) <= cr_in(hi downto lo); - end if; - end loop; - end if; - result_en := '1'; when OP_MTCRF => v.e.write_cr_enable := '1'; if e_in.insn(20) = '0' then @@ -1045,7 +1025,6 @@ begin report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & "=" & to_hstring(c_in); if is_fast_spr(e_in.write_reg) then - result_en := '1'; if decode_spr_num(e_in.insn) = SPR_XER then v.e.xerc.so := c_in(63-32); v.e.xerc.ov := c_in(63-33); @@ -1073,16 +1052,7 @@ begin if e_in.output_carry = '1' then set_carry(v.e, rotator_carry, rotator_carry); end if; - result_en := '1'; when OP_SETB => - bfa := insn_bfa(e_in.insn); - crbit := to_integer(unsigned(bfa)) * 4; - misc_result <= (others => '0'); - if cr_in(31 - crbit) = '1' then - misc_result <= (others => '1'); - elsif cr_in(30 - crbit) = '1' then - misc_result(0) <= '1'; - end if; when OP_ISYNC => v.redirect := '1'; @@ -1108,8 +1078,6 
@@ begin report "illegal"; end case; - v.e.rc := e_in.rc and valid_in; - -- Mispredicted branches cause a redirect if is_branch = '1' then if taken_branch = '1' then @@ -1126,26 +1094,7 @@ begin end if; end if; - -- Update LR on the next cycle after a branch link - -- If we're not writing back anything else, we can write back LR - -- this cycle, otherwise we take an extra cycle. We use the - -- exc_write path since next_nia is written through that path - -- in other places. - if e_in.lr = '1' then - if result_en = '0' then - v.e.exc_write_enable := '1'; - v.e.exc_write_data := next_nia; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - else - v.lr_update := '1'; - v.next_lr := next_nia; - v.e.valid := '0'; - report "Delayed LR update to " & to_hstring(next_nia); - v.busy := '1'; - end if; - end if; - - elsif valid_in = '1' then + elsif valid_in = '1' and exception = '0' and illegal = '0' then -- instruction for other units, i.e. LDST if e_in.unit = LDST then lv.valid := '1'; @@ -1164,23 +1113,28 @@ begin -- valid_in = 0. Hence they don't happen in the same cycle as any of -- the cases above which depend on valid_in = 1. - if r.redirect = '1' then - v.e.valid := '1'; - end if; - if r.lr_update = '1' then + if ctrl.irq_state = WRITE_SRR1 then + v.e.exc_write_reg := fast_spr_num(SPR_SRR1); + v.e.exc_write_data := ctrl.srr1; v.e.exc_write_enable := '1'; - v.e.exc_write_data := r.next_lr; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - v.e.valid := '1'; - -- Keep r.e.write_data unchanged next cycle in case it is needed - -- for a forwarded result (e.g. for CTR). 
- hold_wr_data := '1'; + ctrl_tmp.msr(MSR_SF) <= '1'; + ctrl_tmp.msr(MSR_EE) <= '0'; + ctrl_tmp.msr(MSR_PR) <= '0'; + ctrl_tmp.msr(MSR_SE) <= '0'; + ctrl_tmp.msr(MSR_BE) <= '0'; + ctrl_tmp.msr(MSR_FP) <= '0'; + ctrl_tmp.msr(MSR_FE0) <= '0'; + ctrl_tmp.msr(MSR_FE1) <= '0'; + ctrl_tmp.msr(MSR_IR) <= '0'; + ctrl_tmp.msr(MSR_DR) <= '0'; + ctrl_tmp.msr(MSR_RI) <= '0'; + ctrl_tmp.msr(MSR_LE) <= '1'; + v.trace_next := '0'; + v.fp_exception_next := '0'; + report "Writing SRR1: " & to_hstring(ctrl.srr1); + elsif r.cntz_in_progress = '1' then -- cnt[lt]z always takes two cycles - result_en := '1'; - v.e.write_reg := gpr_to_gspr(r.slow_op_dest); - v.e.rc := r.slow_op_rc; - v.e.xerc := r.slow_op_xerc; v.e.valid := '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or @@ -1190,23 +1144,21 @@ begin else overflow := divider_to_x.overflow; end if; - if r.mul_in_progress = '1' and r.slow_op_oe = '1' then + if r.mul_in_progress = '1' and current.oe = '1' then -- have to wait until next cycle for overflow indication v.mul_finish := '1'; v.busy := '1'; else - result_en := '1'; - v.e.write_reg := gpr_to_gspr(r.slow_op_dest); - v.e.rc := r.slow_op_rc; - v.e.xerc := r.slow_op_xerc; - v.e.write_xerc_enable := r.slow_op_oe; + v.e.write_xerc_enable := current.oe; -- We must test oe because the RC update code in writeback -- will use the xerc value to set CR0:SO so we must not clobber -- xerc if OE wasn't set. 
- if r.slow_op_oe = '1' then + if current.oe = '1' then v.e.xerc.ov := overflow; v.e.xerc.ov32 := overflow; - v.e.xerc.so := r.slow_op_xerc.so or overflow; + if overflow = '1' then + v.e.xerc.so := '1'; + end if; end if; v.e.valid := '1'; end if; @@ -1217,16 +1169,19 @@ begin end if; elsif r.mul_finish = '1' then hold_wr_data := '1'; - result_en := '1'; - v.e.write_reg := gpr_to_gspr(r.slow_op_dest); - v.e.rc := r.slow_op_rc; - v.e.xerc := r.slow_op_xerc; - v.e.write_xerc_enable := r.slow_op_oe; + v.e.write_xerc_enable := current.oe; v.e.xerc.ov := multiply_to_x.overflow; v.e.xerc.ov32 := multiply_to_x.overflow; - v.e.xerc.so := r.slow_op_xerc.so or multiply_to_x.overflow; + if multiply_to_x.overflow = '1' then + v.e.xerc.so := '1'; + end if; v.e.valid := '1'; end if; + -- When doing delayed LR update, keep r.e.write_data unchanged + -- next cycle in case it is needed for a forwarded result (e.g. CTR). + if r.lr_update = '1' then + hold_wr_data := '1'; + end if; -- Generate FP-type program interrupt. fp_in.interrupt will only -- be set during the execution of a FP instruction. 
@@ -1253,17 +1208,6 @@ begin end if; end if; - if do_trace = '1' then - v.trace_next := '1'; - end if; - - if hold_wr_data = '0' then - v.e.write_data := alu_result; - else - v.e.write_data := r.e.write_data; - end if; - v.e.write_enable := result_en and not exception; - -- generate DSI or DSegI for load/store exceptions -- or ISI or ISegI for instruction fetch exceptions if l_in.exception = '1' then @@ -1297,10 +1241,52 @@ begin v.do_intr := '1'; end if; + if do_trace = '1' then + v.trace_next := '1'; + end if; + + if hold_wr_data = '0' then + v.e.write_data := alu_result; + else + v.e.write_data := r.e.write_data; + end if; + v.e.write_reg := current.write_reg; + v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; + v.e.rc := current.rc and v.e.valid and not exception; + + -- Update LR on the next cycle after a branch link + -- If we're not writing back anything else, we can write back LR + -- this cycle, otherwise we take an extra cycle. We use the + -- exc_write path since next_nia is written through that path + -- in other places. + if v.e.valid = '1' and exception = '0' and current.lr = '1' then + if current.write_reg_enable = '0' then + v.e.exc_write_enable := '1'; + v.e.exc_write_data := next_nia; + v.e.exc_write_reg := fast_spr_num(SPR_LR); + else + v.lr_update := '1'; + v.e.valid := '0'; + report "Delayed LR update to " & to_hstring(next_nia); + v.busy := '1'; + end if; + end if; + if r.lr_update = '1' then + v.e.exc_write_enable := '1'; + v.e.exc_write_data := r.next_lr; + v.e.exc_write_reg := fast_spr_num(SPR_LR); + v.e.valid := '1'; + end if; + + -- Defer completion for one cycle when redirecting. 
+ -- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1 if v.redirect = '1' then v.busy := '1'; v.e.valid := '0'; end if; + if r.redirect = '1' then + v.e.valid := '1'; + end if; -- Outputs to fetch1 f.redirect := r.redirect; From f7b855dfc36cd1d916e019ab31edbcc679077255 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 28 Sep 2020 14:04:08 +1000 Subject: [PATCH 8/9] execute1: Improve timing on comparisons Using the main adder for comparisons has the disadvantage of creating a long path from the CA/OV bit forwarding to v.busy via the carry input of the adder, the comparison result, and determining whether a trap instruction would trap. Instead we now have dedicated comparators for the high and low words of a_in vs. b_in, and combine their results to get the signed and unsigned comparison results. Signed-off-by: Paul Mackerras --- execute1.vhdl | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index 6a27ee8..3385455 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -356,6 +356,8 @@ begin variable zerohi, zerolo : std_ulogic; variable msb_a, msb_b : std_ulogic; variable a_lt : std_ulogic; + variable a_lt_lo : std_ulogic; + variable a_lt_hi : std_ulogic; variable lv : Execute1ToLoadstore1Type; variable irq_valid : std_ulogic; variable exception : std_ulogic; @@ -612,24 +614,32 @@ begin -- values are equal trapval := "00100"; else + a_lt_lo := '0'; + a_lt_hi := '0'; + if unsigned(a_in(30 downto 0)) < unsigned(b_in(30 downto 0)) then + a_lt_lo := '1'; + end if; + if unsigned(a_in(62 downto 31)) < unsigned(b_in(62 downto 31)) then + a_lt_hi := '1'; + end if; if l = '1' then -- 64-bit comparison msb_a := a_in(63); msb_b := b_in(63); + a_lt := a_lt_hi or (zerohi and (a_in(31) xnor b_in(31)) and a_lt_lo); else -- 32-bit comparison msb_a := a_in(31); msb_b := b_in(31); + a_lt := a_lt_lo; end if; if msb_a /= msb_b then - -- Subtraction might overflow, but - -- comparison is clear from MSB 
difference. + -- Comparison is clear from MSB difference. -- for signed, 0 is greater; for unsigned, 1 is greater trapval := msb_a & msb_b & '0' & msb_b & msb_a; else - -- Subtraction cannot overflow since MSBs are equal. - -- carry = 1 indicates RA is smaller (signed or unsigned) - a_lt := (not l and carry_32) or (l and carry_64); + -- MSBs are equal, so signed and unsigned comparisons give the + -- same answer. trapval := a_lt & not a_lt & '0' & a_lt & not a_lt; end if; end if; From 0fb207be606969e7fb8b55241461596c2792c3dc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 19 Dec 2020 09:25:04 +1100 Subject: [PATCH 9/9] fetch1: Implement a simple branch target cache This implements a cache in fetch1, where each entry stores the address of a simple branch instruction (b or bc) and the target of the branch. When fetching sequentially, if the address being fetched matches the cache entry, then fetching will be redirected to the branch target. The cache has 1024 entries and is direct-mapped, i.e. indexed by bits 11..2 of the NIA. The bus from execute1 now carries information about taken and not-taken simple branches, which fetch1 uses to update the cache. The cache entry is updated for both taken and not-taken branches, with the valid bit being set if the branch was taken and cleared if the branch was not taken. If fetching is redirected to the branch target then that goes down the pipe as a predicted-taken branch, and decode1 does not do any static branch prediction. If fetching is not redirected, then the next instruction goes down the pipe as normal and decode1 does its static branch prediction. In order to make timing, the lookup of the cache is pipelined, so on each cycle the cache entry for the current NIA + 8 is read. This means that after a redirect (from decode1 or execute1), only the third and subsequent sequentially-fetched instructions will be able to be predicted. 
This improves the coremark value on the Arty A7-100 from about 180 to about 190 (more than 5%). The BTC is optional. Builds for the Artix 7 35-T part have it off by default because the extra ~1420 LUTs it takes mean that the design doesn't fit on the Arty A7-35 board. Signed-off-by: Paul Mackerras --- common.vhdl | 8 ++- core.vhdl | 5 +- decode1.vhdl | 5 +- execute1.vhdl | 17 +++++- fetch1.vhdl | 119 +++++++++++++++++++++++++++++++------- fpga/top-arty.vhdl | 2 + fpga/top-generic.vhdl | 2 + fpga/top-nexys-video.vhdl | 2 + icache.vhdl | 1 + microwatt.core | 14 +++++ soc.vhdl | 2 + 11 files changed, 152 insertions(+), 25 deletions(-) diff --git a/common.vhdl b/common.vhdl index d085199..7bf8277 100644 --- a/common.vhdl +++ b/common.vhdl @@ -155,6 +155,7 @@ package common is big_endian : std_ulogic; stop_mark: std_ulogic; sequential: std_ulogic; + predicted : std_ulogic; nia: std_ulogic_vector(63 downto 0); end record; @@ -165,6 +166,7 @@ package common is nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); big_endian: std_ulogic; + next_predicted: std_ulogic; end record; type Decode1ToDecode2Type is record @@ -308,10 +310,14 @@ package common is big_endian: std_ulogic; mode_32bit: std_ulogic; redirect_nia: std_ulogic_vector(63 downto 0); + br_nia : std_ulogic_vector(63 downto 0); + br_last : std_ulogic; + br_taken : std_ulogic; end record; constant Execute1ToFetch1Init : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0', - mode_32bit => '0', others => (others => '0')); + mode_32bit => '0', br_taken => '0', + br_last => '0', others => (others => '0')); type Execute1ToLoadstore1Type is record valid : std_ulogic; diff --git a/core.vhdl b/core.vhdl index bc32a8c..3948b86 100644 --- a/core.vhdl +++ b/core.vhdl @@ -12,6 +12,7 @@ entity core is DISABLE_FLATTEN : boolean := false; EX1_BYPASS : boolean := true; HAS_FPU : boolean := true; + HAS_BTC : boolean := true; ALT_RESET_ADDRESS : std_ulogic_vector(63 
downto 0) := (others => '0'); LOG_LENGTH : natural := 512 ); @@ -187,7 +188,8 @@ begin fetch1_0: entity work.fetch1 generic map ( RESET_ADDRESS => (others => '0'), - ALT_RESET_ADDRESS => ALT_RESET_ADDRESS + ALT_RESET_ADDRESS => ALT_RESET_ADDRESS, + HAS_BTC => HAS_BTC ) port map ( clk => clk, @@ -195,6 +197,7 @@ begin alt_reset_in => alt_reset_d, stall_in => fetch1_stall_in, flush_in => fetch1_flush, + inval_btc => ex1_icache_inval or mmu_to_icache.tlbie, stop_in => dbg_core_stop, d_in => decode1_to_fetch1, e_in => execute1_to_fetch1, diff --git a/decode1.vhdl b/decode1.vhdl index 2edacd3..ebe59be 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -727,7 +727,10 @@ begin bv.br_nia := (others => '0'); end if; bv.br_offset := br_offset; - bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out; + if f_in.next_predicted = '1' then + v.br_pred := '1'; + end if; + bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted; -- after a clock edge... 
br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset); diff --git a/execute1.vhdl b/execute1.vhdl index 3385455..25b1dc7 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -68,6 +68,8 @@ architecture behaviour of execute1 is last_nia : std_ulogic_vector(63 downto 0); redirect : std_ulogic; abs_br : std_ulogic; + taken_br : std_ulogic; + br_last : std_ulogic; do_intr : std_ulogic; vector : integer range 0 to 16#fff#; br_offset : std_ulogic_vector(63 downto 0); @@ -81,7 +83,7 @@ architecture behaviour of execute1 is fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', next_lr => (others => '0'), last_nia => (others => '0'), - redirect => '0', abs_br => '0', do_intr => '0', vector => 0, + redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0, br_offset => (others => '0'), redir_mode => "0000", others => (others => '0')); @@ -365,6 +367,7 @@ begin variable trapval : std_ulogic_vector(4 downto 0); variable illegal : std_ulogic; variable is_branch : std_ulogic; + variable is_direct_branch : std_ulogic; variable taken_branch : std_ulogic; variable abs_branch : std_ulogic; variable spr_val : std_ulogic_vector(63 downto 0); @@ -377,6 +380,7 @@ begin sum_with_carry := (others => '0'); newcrf := (others => '0'); is_branch := '0'; + is_direct_branch := '0'; taken_branch := '0'; abs_branch := '0'; hold_wr_data := '0'; @@ -390,6 +394,8 @@ begin v.br_offset := (others => '0'); v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); + v.taken_br := '0'; + v.br_last := '0'; lv := Execute1ToLoadstore1Init; fv := Execute1ToFPUInit; @@ -843,6 +849,7 @@ begin when OP_B => is_branch := '1'; taken_branch := '1'; + is_direct_branch := '1'; abs_branch := insn_aa(e_in.insn); if ctrl.msr(MSR_BE) = '1' then do_trace := '1'; @@ -852,6 +859,7 @@ begin bo := insn_bo(e_in.insn); bi := 
insn_bi(e_in.insn); is_branch := '1'; + is_direct_branch := '1'; taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); abs_branch := insn_aa(e_in.insn); if ctrl.msr(MSR_BE) = '1' then @@ -1093,7 +1101,7 @@ begin if taken_branch = '1' then ctrl_tmp.cfar <= e_in.nia; end if; - if e_in.br_pred = '0' then + if taken_branch = '1' then v.br_offset := b_in; v.abs_br := abs_branch; else @@ -1102,6 +1110,8 @@ begin if taken_branch /= e_in.br_pred then v.redirect := '1'; end if; + v.br_last := is_direct_branch; + v.taken_br := taken_branch; end if; elsif valid_in = '1' and exception = '0' and illegal = '0' then @@ -1300,6 +1310,9 @@ begin -- Outputs to fetch1 f.redirect := r.redirect; + f.br_nia := r.last_nia; + f.br_last := r.br_last and not r.do_intr; + f.br_taken := r.taken_br; if r.do_intr = '1' then f.redirect_nia := std_ulogic_vector(to_unsigned(r.vector, 64)); f.virt_mode := '0'; diff --git a/fetch1.vhdl b/fetch1.vhdl index 3c9d946..8ca7e57 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -8,7 +8,8 @@ use work.common.all; entity fetch1 is generic( RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0'); - ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0') + ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0'); + HAS_BTC : boolean := true ); port( clk : in std_ulogic; @@ -17,6 +18,7 @@ entity fetch1 is -- Control inputs: stall_in : in std_ulogic; flush_in : in std_ulogic; + inval_btc : in std_ulogic; stop_in : in std_ulogic; alt_reset_in : in std_ulogic; @@ -37,10 +39,25 @@ end entity fetch1; architecture behaviour of fetch1 is type reg_internal_t is record mode_32bit: std_ulogic; + rd_is_niap4: std_ulogic; + predicted: std_ulogic; + predicted_nia: std_ulogic_vector(63 downto 0); end record; signal r, r_next : Fetch1ToIcacheType; signal r_int, r_next_int : reg_internal_t; + signal advance_nia : std_ulogic; signal log_nia : std_ulogic_vector(42 downto 0); + + constant BTC_ADDR_BITS : integer := 10; + constant BTC_TAG_BITS : integer 
:= 62 - BTC_ADDR_BITS; + constant BTC_TARGET_BITS : integer := 62; + constant BTC_SIZE : integer := 2 ** BTC_ADDR_BITS; + constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS; + type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0); + + signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0'); + signal btc_rd_valid : std_ulogic := '0'; + begin regs : process(clk) @@ -56,15 +73,70 @@ begin " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & - " nia:" & to_hstring(r_next.nia) & - " SM:" & std_ulogic'image(r_next.stop_mark); + " nia:" & to_hstring(r_next.nia); end if; - r <= r_next; - r_int <= r_next_int; + if rst = '1' or e_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then + r.virt_mode <= r_next.virt_mode; + r.priv_mode <= r_next.priv_mode; + r.big_endian <= r_next.big_endian; + r_int.mode_32bit <= r_next_int.mode_32bit; + end if; + if advance_nia = '1' then + r.predicted <= r_next.predicted; + r.nia <= r_next.nia; + r_int.predicted <= r_next_int.predicted; + r_int.predicted_nia <= r_next_int.predicted_nia; + r_int.rd_is_niap4 <= r_next.sequential; + end if; + r.sequential <= r_next.sequential and advance_nia; + -- always send the up-to-date stop mark and req + r.stop_mark <= stop_in; + r.req <= not rst; end if; end process; log_out <= log_nia; + btc : if HAS_BTC generate + signal btc_memory : btc_mem_type; + attribute ram_style : string; + attribute ram_style of btc_memory : signal is "block"; + + signal btc_valids : std_ulogic_vector(BTC_SIZE - 1 downto 0); + attribute ram_style of btc_valids : signal is "distributed"; + + signal btc_wr : std_ulogic; + signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0); + signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0); + signal btc_wr_v : std_ulogic; + begin + btc_wr_data <= e_in.br_nia(63 downto BTC_ADDR_BITS + 2) & + 
e_in.redirect_nia(63 downto 2); + btc_wr_addr <= e_in.br_nia(BTC_ADDR_BITS + 1 downto 2); + btc_wr <= e_in.br_last; + btc_wr_v <= e_in.br_taken; + + btc_ram : process(clk) + variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0); + begin + if rising_edge(clk) then + raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) + + to_unsigned(2, BTC_ADDR_BITS); + if advance_nia = '1' then + btc_rd_data <= btc_memory(to_integer(raddr)); + btc_rd_valid <= btc_valids(to_integer(raddr)); + end if; + if btc_wr = '1' then + btc_memory(to_integer(unsigned(btc_wr_addr))) <= btc_wr_data; + end if; + if inval_btc = '1' or rst = '1' then + btc_valids <= (others => '0'); + elsif btc_wr = '1' then + btc_valids(to_integer(unsigned(btc_wr_addr))) <= btc_wr_v; + end if; + end if; + end process; + end generate; + comb : process(all) variable v : Fetch1ToIcacheType; variable v_int : reg_internal_t; @@ -72,6 +144,8 @@ begin v := r; v_int := r_int; v.sequential := '0'; + v.predicted := '0'; + v_int.predicted := '0'; if rst = '1' then if alt_reset_in = '1' then @@ -83,6 +157,7 @@ begin v.priv_mode := '1'; v.big_endian := '0'; v_int.mode_32bit := '0'; + v_int.predicted_nia := (others => '0'); elsif e_in.redirect = '1' then v.nia := e_in.redirect_nia(63 downto 2) & "00"; if e_in.mode_32bit = '1' then @@ -97,22 +172,26 @@ begin if r_int.mode_32bit = '1' then v.nia(63 downto 32) := (others => '0'); end if; - elsif stall_in = '0' then - - -- If the last NIA value went down with a stop mark, it didn't get - -- executed, and hence we shouldn't increment NIA. 
- if r.stop_mark = '0' then - if r_int.mode_32bit = '0' then - v.nia := std_ulogic_vector(unsigned(r.nia) + 4); - else - v.nia := x"00000000" & std_ulogic_vector(unsigned(r.nia(31 downto 0)) + 4); - end if; - v.sequential := '1'; - end if; - end if; + elsif r_int.predicted = '1' then + v.nia := r_int.predicted_nia; + v.predicted := '1'; + else + v.sequential := '1'; + v.nia := std_ulogic_vector(unsigned(r.nia) + 4); + if r_int.mode_32bit = '1' then + v.nia(63 downto 32) := x"00000000"; + end if; + if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and + btc_rd_data(BTC_WIDTH - 1 downto BTC_TARGET_BITS) + = v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then + v_int.predicted := '1'; + end if; + end if; + v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00"; - v.req := not rst and not stop_in; - v.stop_mark := stop_in; + -- If the last NIA value went down with a stop mark, it didn't get + -- executed, and hence we shouldn't increment NIA. + advance_nia <= rst or e_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in); r_next <= v; r_next_int <= v_int; diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index 8a3dc7a..68d1e89 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -15,6 +15,7 @@ entity toplevel is RESET_LOW : boolean := true; CLK_FREQUENCY : positive := 100000000; HAS_FPU : boolean := true; + HAS_BTC : boolean := true; USE_LITEDRAM : boolean := false; NO_BRAM : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; @@ -170,6 +171,7 @@ begin SIM => false, CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, + HAS_BTC => HAS_BTC, HAS_DRAM => USE_LITEDRAM, DRAM_SIZE => 256 * 1024 * 1024, DRAM_INIT_SIZE => PAYLOAD_SIZE, diff --git a/fpga/top-generic.vhdl b/fpga/top-generic.vhdl index d5219ff..8bff5bb 100644 --- a/fpga/top-generic.vhdl +++ b/fpga/top-generic.vhdl @@ -12,6 +12,7 @@ entity toplevel is CLK_INPUT : positive := 100000000; CLK_FREQUENCY : positive := 100000000; HAS_FPU : boolean := true; + 
HAS_BTC : boolean := false; LOG_LENGTH : natural := 512; DISABLE_FLATTEN_CORE : boolean := false; UART_IS_16550 : boolean := true @@ -71,6 +72,7 @@ begin SIM => false, CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, + HAS_BTC => HAS_BTC, LOG_LENGTH => LOG_LENGTH, DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE, UART0_IS_16550 => UART_IS_16550 diff --git a/fpga/top-nexys-video.vhdl b/fpga/top-nexys-video.vhdl index 1942b10..86bdd11 100644 --- a/fpga/top-nexys-video.vhdl +++ b/fpga/top-nexys-video.vhdl @@ -15,6 +15,7 @@ entity toplevel is RESET_LOW : boolean := true; CLK_FREQUENCY : positive := 100000000; HAS_FPU : boolean := true; + HAS_BTC : boolean := true; USE_LITEDRAM : boolean := false; NO_BRAM : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; @@ -122,6 +123,7 @@ begin SIM => false, CLK_FREQ => CLK_FREQUENCY, HAS_FPU => HAS_FPU, + HAS_BTC => HAS_BTC, HAS_DRAM => USE_LITEDRAM, DRAM_SIZE => 512 * 1024 * 1024, DRAM_INIT_SIZE => PAYLOAD_SIZE, diff --git a/icache.vhdl b/icache.vhdl index 37a230d..a658783 100644 --- a/icache.vhdl +++ b/icache.vhdl @@ -565,6 +565,7 @@ begin i_out.stop_mark <= r.hit_smark; i_out.fetch_failed <= r.fetch_failed; i_out.big_endian <= r.big_endian; + i_out.next_predicted <= i_in.predicted; -- Stall fetch1 if we have a miss on cache or TLB or a protection fault stall_out <= not (is_hit and access_ok); diff --git a/microwatt.core b/microwatt.core index 7f2068d..41b6230 100644 --- a/microwatt.core +++ b/microwatt.core @@ -134,6 +134,7 @@ targets: - log_length=2048 - uart_is_16550 - has_fpu + - has_btc tools: vivado: {part : xc7a100tcsg324-1} toplevel : toplevel @@ -218,6 +219,7 @@ targets: - log_length=2048 - uart_is_16550 - has_fpu + - has_btc tools: vivado: {part : xc7a200tsbg484-1} toplevel : toplevel @@ -235,6 +237,7 @@ targets: - log_length=2048 - uart_is_16550 - has_fpu + - has_btc generate: [litedram_nexys_video] tools: vivado: {part : xc7a200tsbg484-1} @@ -254,6 +257,7 @@ targets: - uart_is_16550 - has_uart1 - has_fpu=false + - 
has_btc=false tools: vivado: {part : xc7a35ticsg324-1L} toplevel : toplevel @@ -273,6 +277,7 @@ targets: - uart_is_16550 - has_uart1 - has_fpu=false + - has_btc=false generate: [litedram_arty, liteeth_arty] tools: vivado: {part : xc7a35ticsg324-1L} @@ -292,6 +297,7 @@ targets: - uart_is_16550 - has_uart1 - has_fpu + - has_btc tools: vivado: {part : xc7a100ticsg324-1L} toplevel : toplevel @@ -311,6 +317,7 @@ targets: - uart_is_16550 - has_uart1 - has_fpu + - has_btc generate: [litedram_arty, liteeth_arty] tools: vivado: {part : xc7a100ticsg324-1L} @@ -329,6 +336,7 @@ targets: - log_length=512 - uart_is_16550 - has_fpu=false + - has_btc=false tools: vivado: {part : xc7a35tcpg236-1} toplevel : toplevel @@ -395,6 +403,12 @@ parameters: paramtype : generic default : true + has_btc: + datatype : bool + description : Include a branch target cache in the core + paramtype : generic + default : true + disable_flatten_core: datatype : bool description : Prevent Vivado from flattening the main core components diff --git a/soc.vhdl b/soc.vhdl index e4a7895..77f229e 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -53,6 +53,7 @@ entity soc is CLK_FREQ : positive; SIM : boolean; HAS_FPU : boolean := true; + HAS_BTC : boolean := true; DISABLE_FLATTEN_CORE : boolean := false; HAS_DRAM : boolean := false; DRAM_SIZE : integer := 0; @@ -255,6 +256,7 @@ begin generic map( SIM => SIM, HAS_FPU => HAS_FPU, + HAS_BTC => HAS_BTC, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE, ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'), LOG_LENGTH => LOG_LENGTH