From 54f89afab7bc2b58dc48759a68cc8c56954a6b6d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 21 Sep 2020 11:41:46 +1000
Subject: [PATCH 1/9] loadstore1: Decide on load formatting controls a cycle
 earlier

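This helps timing.  The byte index, use_second and trim_ctl values for
the load formatter are now computed from the next-state (v) values and
held in registers, rather than being derived from r.addr and r.length
in the same cycle that the load data is formatted.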

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 loadstore1.vhdl | 61 ++++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 26 deletions(-)

diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 33c8694..f1b98dc 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -54,6 +54,10 @@ architecture behave of loadstore1 is
                      COMPLETE           -- extra cycle to complete an operation
                      );
 
+    type byte_index_t is array(0 to 7) of unsigned(2 downto 0);
+    subtype byte_trim_t is std_ulogic_vector(1 downto 0);
+    type trim_ctl_t is array(0 to 7) of byte_trim_t;
+
     type reg_stage_t is record
         -- latch most of the input request
         load         : std_ulogic;
@@ -93,6 +97,9 @@ architecture behave of loadstore1 is
         do_update    : std_ulogic;
         extra_cycle  : std_ulogic;
         mode_32bit   : std_ulogic;
+        byte_index   : byte_index_t;
+        use_second   : std_ulogic_vector(7 downto 0);
+        trim_ctl     : trim_ctl_t;
         load_sp      : std_ulogic;
         ld_sp_data   : std_ulogic_vector(31 downto 0);
         ld_sp_nz     : std_ulogic;
@@ -100,10 +107,6 @@ architecture behave of loadstore1 is
         st_sp_data   : std_ulogic_vector(31 downto 0);
     end record;
 
-    type byte_sel_t is array(0 to 7) of std_ulogic;
-    subtype byte_trim_t is std_ulogic_vector(1 downto 0);
-    type trim_ctl_t is array(0 to 7) of byte_trim_t;
-
     signal r, rin : reg_stage_t;
     signal lsu_sum : std_ulogic_vector(63 downto 0);
 
@@ -299,8 +302,6 @@ begin
         variable data_in : std_ulogic_vector(63 downto 0);
         variable byte_rev : std_ulogic;
         variable length : std_ulogic_vector(3 downto 0);
-        variable use_second : byte_sel_t;
-        variable trim_ctl : trim_ctl_t;
         variable negative : std_ulogic;
         variable sprn : std_ulogic_vector(9 downto 0);
         variable exception : std_ulogic;
@@ -330,17 +331,9 @@ begin
         v.do_update := '0';
 
         -- load data formatting
-        byte_offset := unsigned(r.addr(2 downto 0));
-        brev_lenm1 := "000";
-        if r.byte_reverse = '1' then
-            brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
-        end if;
-
         -- shift and byte-reverse data bytes
         for i in 0 to 7 loop
-            kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
-            use_second(i) := kk(3);
-            j := to_integer(kk(2 downto 0)) * 8;
+            j := to_integer(r.byte_index(i)) * 8;
             data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
         end loop;
 
@@ -362,22 +355,13 @@ begin
 
         -- trim and sign-extend
         for i in 0 to 7 loop
-            if i < to_integer(unsigned(r.length)) then
-                if r.dwords_done = '1' then
-                    trim_ctl(i) := '1' & not use_second(i);
-                else
-                    trim_ctl(i) := "10";
-                end if;
-            else
-                trim_ctl(i) := '0' & (negative and r.sign_extend);
-            end if;
-            case trim_ctl(i) is
+            case r.trim_ctl(i) is
                 when "11" =>
                     data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
                 when "10" =>
                     data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
                 when "01" =>
-                    data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
+                    data_trimmed(i * 8 + 7 downto i * 8) := (others => negative);
                 when others =>
                     data_trimmed(i * 8 + 7 downto i * 8) := x"00";
             end case;
@@ -699,6 +683,31 @@ begin
             v.busy := req or mmureq or mmu_mtspr or fp_reg_conv;
         end if;
 
+        -- Work out load formatter controls for next cycle
+        byte_offset := unsigned(v.addr(2 downto 0));
+        brev_lenm1 := "000";
+        if v.byte_reverse = '1' then
+            brev_lenm1 := unsigned(v.length(2 downto 0)) - 1;
+        end if;
+
+        for i in 0 to 7 loop
+            kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
+            v.use_second(i) := kk(3);
+            v.byte_index(i) := kk(2 downto 0);
+        end loop;
+
+        for i in 0 to 7 loop
+            if i < to_integer(unsigned(v.length)) then
+                if v.dwords_done = '1' then
+                    v.trim_ctl(i) := '1' & not v.use_second(i);
+                else
+                    v.trim_ctl(i) := "10";
+                end if;
+            else
+                v.trim_ctl(i) := '0' & v.sign_extend;
+            end if;
+        end loop;
+
         -- Update outputs to dcache
         d_out.valid <= req and not v.align_intr;
         d_out.load <= v.load;

From d1f35705c07d4468b3943467683ca2501731e41c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 28 Sep 2020 14:02:03 +1000
Subject: [PATCH 2/9] loadstore1: Improve timing of data path from cache RAM to
 writeback

Work out select inputs for writeback mux a cycle earlier.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 loadstore1.vhdl | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index f1b98dc..e83d642 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -63,7 +63,6 @@ architecture behave of loadstore1 is
         load         : std_ulogic;
         tlbie        : std_ulogic;
         dcbz         : std_ulogic;
-        mfspr        : std_ulogic;
 	addr         : std_ulogic_vector(63 downto 0);
 	store_data   : std_ulogic_vector(63 downto 0);
 	load_data    : std_ulogic_vector(63 downto 0);
@@ -105,6 +104,7 @@ architecture behave of loadstore1 is
         ld_sp_nz     : std_ulogic;
         ld_sp_lz     : std_ulogic_vector(5 downto 0);
         st_sp_data   : std_ulogic_vector(31 downto 0);
+        wr_sel       : std_ulogic_vector(1 downto 0);
     end record;
 
     signal r, rin : reg_stage_t;
@@ -312,20 +312,18 @@ begin
         variable itlb_fault : std_ulogic;
         variable misaligned : std_ulogic;
         variable fp_reg_conv : std_ulogic;
-        variable lfs_done : std_ulogic;
     begin
         v := r;
         req := '0';
-        v.mfspr := '0';
         mmu_mtspr := '0';
         itlb_fault := '0';
         sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
         dsisr := (others => '0');
         mmureq := '0';
         fp_reg_conv := '0';
+        v.wr_sel := "11";
 
         write_enable := '0';
-        lfs_done := '0';
 
         do_update := r.do_update;
         v.do_update := '0';
@@ -447,6 +445,11 @@ begin
             v.last_dword := '0';
 
         when ACK_WAIT =>
+            -- r.wr_sel gets set one cycle after we come into ACK_WAIT state,
+            -- which is OK because the dcache always takes at least two cycles.
+            if r.update = '1' and (r.load = '0' or (HAS_FPU and r.load_sp = '1')) then
+                v.wr_sel := "01";
+            end if;
             if d_in.error = '1' then
                 -- dcache will discard the second request if it
                 -- gets an error on the 1st of two requests
@@ -477,9 +480,11 @@ begin
                         -- SP to DP conversion takes a cycle
                         -- Write back rA update in this cycle if needed
                         do_update := r.update;
+                        v.wr_sel := "10";
                         v.state := FINISH_LFS;
                     elsif r.extra_cycle = '1' then
                         -- loads with rA update need an extra cycle
+                        v.wr_sel := "01";
                         v.state := COMPLETE;
                         v.do_update := r.update;
                     else
@@ -517,7 +522,6 @@ begin
         when TLBIE_WAIT =>
 
         when FINISH_LFS =>
-            lfs_done := '1';
 
         when COMPLETE =>
             exception := r.align_intr;
@@ -631,7 +635,7 @@ begin
                     v.state := TLBIE_WAIT;
                     v.wait_mmu := '1';
                 when OP_MFSPR =>
-                    v.mfspr := '1';
+                    v.wr_sel := "00";
                     -- partial decode on SPR number should be adequate given
                     -- the restricted set that get sent down this path
                     if sprn(9) = '0' and sprn(5) = '0' then
@@ -738,23 +742,24 @@ begin
         -- Multiplex either cache data to the destination GPR or
         -- the address for the rA update.
         l_out.valid <= done;
-        if r.mfspr = '1' then
+        case r.wr_sel is
+        when "00" =>
             l_out.write_enable <= '1';
             l_out.write_reg <= r.write_reg;
             l_out.write_data <= r.sprval;
-        elsif do_update = '1' then
-            l_out.write_enable <= '1';
+        when "01" =>
+            l_out.write_enable <= do_update;
             l_out.write_reg <= gpr_to_gspr(r.update_reg);
             l_out.write_data <= r.addr;
-        elsif lfs_done = '1' then
+        when "10" =>
             l_out.write_enable <= '1';
             l_out.write_reg <= r.write_reg;
             l_out.write_data <= load_dp_data;
-        else
+        when others =>
             l_out.write_enable <= write_enable;
             l_out.write_reg <= r.write_reg;
             l_out.write_data <= data_trimmed;
-        end if;
+        end case;
         l_out.xerc <= r.xerc;
         l_out.rc <= r.rc and done;
         l_out.store_done <= d_in.store_done;

From 6427cab46fe7f37074505e18a1957414023c2708 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 31 Oct 2020 13:48:58 +1100
Subject: [PATCH 3/9] loadstore1/dcache: Send store data one cycle later

This makes timing easier and also means that store floating-point
single precision instructions no longer need to take an extra cycle.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl     |  2 +-
 dcache.vhdl     |  2 +-
 loadstore1.vhdl | 78 +++++++++++++++++++------------------------------
 3 files changed, 32 insertions(+), 50 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index bfc0db2..8b9380c 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -365,7 +365,7 @@ package common is
         virt_mode : std_ulogic;
         priv_mode : std_ulogic;
 	addr : std_ulogic_vector(63 downto 0);
-	data : std_ulogic_vector(63 downto 0);
+	data : std_ulogic_vector(63 downto 0);          -- valid the cycle after .valid = 1
         byte_sel : std_ulogic_vector(7 downto 0);
     end record;
 
diff --git a/dcache.vhdl b/dcache.vhdl
index 1e58e1f..7da67e1 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -1306,7 +1306,7 @@ begin
                     req.real_addr := ra;
                     -- Force data to 0 for dcbz
                     if r0.req.dcbz = '0' then
-                        req.data := r0.req.data;
+                        req.data := d_in.data;
                     else
                         req.data := (others => '0');
                     end if;
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index e83d642..b83eed6 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -45,7 +45,6 @@ architecture behave of loadstore1 is
 
     -- State machine for unaligned loads/stores
     type state_t is (IDLE,              -- ready for instruction
-                     FPR_CONV,          -- converting double to float for store
                      SECOND_REQ,        -- send 2nd request of unaligned xfer
                      ACK_WAIT,          -- waiting for ack from dcache
                      MMU_LOOKUP,        -- waiting for MMU to look up translation
@@ -69,6 +68,8 @@ architecture behave of loadstore1 is
 	write_reg    : gspr_index_t;
 	length       : std_ulogic_vector(3 downto 0);
 	byte_reverse : std_ulogic;
+        byte_offset  : unsigned(2 downto 0);
+        brev_mask    : unsigned(2 downto 0);
 	sign_extend  : std_ulogic;
 	update       : std_ulogic;
 	update_reg   : gpr_index_t;
@@ -103,7 +104,6 @@ architecture behave of loadstore1 is
         ld_sp_data   : std_ulogic_vector(31 downto 0);
         ld_sp_nz     : std_ulogic;
         ld_sp_lz     : std_ulogic_vector(5 downto 0);
-        st_sp_data   : std_ulogic_vector(31 downto 0);
         wr_sel       : std_ulogic_vector(1 downto 0);
     end record;
 
@@ -299,7 +299,6 @@ begin
         variable data_permuted : std_ulogic_vector(63 downto 0);
         variable data_trimmed : std_ulogic_vector(63 downto 0);
         variable store_data : std_ulogic_vector(63 downto 0);
-        variable data_in : std_ulogic_vector(63 downto 0);
         variable byte_rev : std_ulogic;
         variable length : std_ulogic_vector(3 downto 0);
         variable negative : std_ulogic;
@@ -311,7 +310,6 @@ begin
         variable mmu_mtspr : std_ulogic;
         variable itlb_fault : std_ulogic;
         variable misaligned : std_ulogic;
-        variable fp_reg_conv : std_ulogic;
     begin
         v := r;
         req := '0';
@@ -320,7 +318,6 @@ begin
         sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
         dsisr := (others => '0');
         mmureq := '0';
-        fp_reg_conv := '0';
         v.wr_sel := "11";
 
         write_enable := '0';
@@ -366,40 +363,19 @@ begin
         end loop;
 
         if HAS_FPU then
-            -- Single-precision FP conversion
-            v.st_sp_data := store_sp_data;
+            -- Single-precision FP conversion for loads
             v.ld_sp_data := data_trimmed(31 downto 0);
             v.ld_sp_nz := or (data_trimmed(22 downto 0));
             v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
         end if;
 
         -- Byte reversing and rotating for stores.
-        -- Done in the first cycle (when l_in.valid = 1) for integer stores
-        -- and DP float stores, and in the second cycle for SP float stores.
-        store_data := r.store_data;
-        if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then
-            if HAS_FPU and r.state = FPR_CONV then
-                data_in := x"00000000" & r.st_sp_data;
-                byte_offset := unsigned(r.addr(2 downto 0));
-                byte_rev := r.byte_reverse;
-                length := r.length;
-            else
-                data_in := l_in.data;
-                byte_offset := unsigned(lsu_sum(2 downto 0));
-                byte_rev := l_in.byte_reverse;
-                length := l_in.length;
-            end if;
-            brev_lenm1 := "000";
-            if byte_rev = '1' then
-                brev_lenm1 := unsigned(length(2 downto 0)) - 1;
-            end if;
-            for i in 0 to 7 loop
-                k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1;
-                j := to_integer(k) * 8;
-                store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j);
-            end loop;
-        end if;
-        v.store_data := store_data;
+        -- Done in the second cycle (the cycle after l_in.valid = 1).
+        for i in 0 to 7 loop
+            k := (to_unsigned(i, 3) - r.byte_offset) xor r.brev_mask;
+            j := to_integer(k) * 8;
+            store_data(i * 8 + 7 downto i * 8) := r.store_data(j + 7 downto j);
+        end loop;
 
         -- compute (addr + 8) & ~7 for the second doubleword when unaligned
         next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
@@ -431,14 +407,6 @@ begin
         case r.state is
         when IDLE =>
 
-        when FPR_CONV =>
-            req := '1';
-            if r.second_bytes /= "00000000" then
-                v.state := SECOND_REQ;
-            else
-                v.state := ACK_WAIT;
-            end if;
-
         when SECOND_REQ =>
             req := '1';
             v.state := ACK_WAIT;
@@ -561,6 +529,12 @@ begin
             v.do_update := '0';
             v.extra_cycle := '0';
 
+            if HAS_FPU and l_in.is_32bit = '1' then
+                v.store_data := x"00000000" & store_sp_data;
+            else
+                v.store_data := l_in.data;
+            end if;
+
             addr := lsu_sum;
             if l_in.second = '1' then
                 -- for the second half of a 16-byte transfer, use next_addr
@@ -609,12 +583,7 @@ begin
 
             case l_in.op is
                 when OP_STORE =>
-                    if HAS_FPU and l_in.is_32bit = '1' then
-                        v.state := FPR_CONV;
-                        fp_reg_conv := '1';
-                    else
-                        req := '1';
-                    end if;
+                    req := '1';
                 when OP_LOAD =>
                     req := '1';
                     v.load := '1';
@@ -684,7 +653,20 @@ begin
                 end if;
             end if;
 
-            v.busy := req or mmureq or mmu_mtspr or fp_reg_conv;
+            v.busy := req or mmureq or mmu_mtspr;
+        end if;
+
+        -- Work out controls for store formatting
+        if l_in.valid = '1' then
+            byte_offset := unsigned(lsu_sum(2 downto 0));
+            byte_rev := l_in.byte_reverse;
+            length := l_in.length;
+            brev_lenm1 := "000";
+            if byte_rev = '1' then
+                brev_lenm1 := unsigned(length(2 downto 0)) - 1;
+            end if;
+            v.byte_offset := byte_offset;
+            v.brev_mask := brev_lenm1;
         end if;
 
         -- Work out load formatter controls for next cycle

From cb1e3f6d705c6b1808e96ef6e5873c18e9d33a36 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 16 Dec 2020 19:32:07 +1100
Subject: [PATCH 4/9] decode1: Take an extra cycle for predicted branch
 redirects

This does the addition of NIA plus the branch offset from the
instruction after a clock edge, in order to ease timing, as the path
from the icache RAM through the adder in decode1 to the NIA register
in fetch1 was showing up as a critical path.

This adds one extra cycle of latency when redirecting fetch because of
a predicted-taken branch.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 086083e..2edacd3 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -31,6 +31,7 @@ end entity decode1;
 architecture behaviour of decode1 is
     signal r, rin : Decode1ToDecode2Type;
     signal s      : Decode1ToDecode2Type;
+    signal f, fin : Decode1ToFetch1Type;
 
     constant illegal_inst : decode_rom_t :=
         (NONE, NONE, OP_ILLEGAL,   NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);
@@ -47,6 +48,14 @@ architecture behaviour of decode1 is
     signal ri, ri_in : reg_internal_t;
     signal si        : reg_internal_t;
 
+    type br_predictor_t is record
+        br_nia    : std_ulogic_vector(61 downto 0);
+        br_offset : signed(23 downto 0);
+        predict   : std_ulogic;
+    end record;
+
+    signal br, br_in : br_predictor_t;
+
     subtype major_opcode_t is unsigned(5 downto 0);
     type major_rom_array_t is array(0 to 63) of decode_rom_t;
     type minor_valid_array_t is array(0 to 1023) of std_ulogic;
@@ -537,6 +546,13 @@ begin
                     ri <= ri_in;
                 end if;
             end if;
+            if rst = '1' then
+                br.br_nia <= (others => '0');
+                br.br_offset <= (others => '0');
+                br.predict <= '0';
+            else
+                br <= br_in;
+            end if;
         end if;
     end process;
     busy_out <= s.valid;
@@ -544,14 +560,13 @@ begin
     decode1_1: process(all)
         variable v : Decode1ToDecode2Type;
         variable vi : reg_internal_t;
-        variable f : Decode1ToFetch1Type;
         variable majorop : major_opcode_t;
         variable minor4op : std_ulogic_vector(10 downto 0);
         variable op_19_bits: std_ulogic_vector(2 downto 0);
         variable sprn : spr_num_t;
-        variable br_nia    : std_ulogic_vector(61 downto 0);
         variable br_target : std_ulogic_vector(61 downto 0);
         variable br_offset : signed(23 downto 0);
+        variable bv : br_predictor_t;
     begin
         v := Decode1ToDecode2Init;
         vi := reg_internal_t_init;
@@ -707,17 +722,19 @@ begin
         -- Branch predictor
         -- Note bclr, bcctr and bctar are predicted not taken as we have no
         -- count cache or link stack.
-        br_nia := f_in.nia(63 downto 2);
+        bv.br_nia := f_in.nia(63 downto 2);
         if f_in.insn(1) = '1' then
-            br_nia := (others => '0');
+            bv.br_nia := (others => '0');
         end if;
-        br_target := std_ulogic_vector(signed(br_nia) + br_offset);
-        f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid;
-        f.redirect_nia := br_target & "00";
+        bv.br_offset := br_offset;
+        bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out;
+        -- after a clock edge, add the registered NIA and branch offset
+        br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset);
 
         -- Update registers
         rin <= v;
         ri_in <= vi;
+        br_in <= bv;
 
         -- Update outputs
         d_out <= r;
@@ -729,8 +746,9 @@ begin
         if ri.force_single = '1' then
             d_out.decode.sgl_pipe <= '1';
         end if;
-        f_out <= f;
-        flush_out <= f.redirect;
+        f_out.redirect <= br.predict;
+        f_out.redirect_nia <= br_target & "00";
+        flush_out <= bv.predict or br.predict;
     end process;
 
     d1_log: if LOG_LENGTH > 0 generate

From 9ea1ab0215111bb3d87bf2f9d030f630aea5f952 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 16 Dec 2020 20:41:08 +1100
Subject: [PATCH 5/9] execute1: Move branch adder after register

This does the addition of the instruction NIA and the branch offset
after the register at the output of execute1 rather than before.
The propagation through the adder was showing up as a critical path
on the A7-100.  Performance is unaffected and the design now meets timing.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 execute1.vhdl | 116 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 70 insertions(+), 46 deletions(-)

diff --git a/execute1.vhdl b/execute1.vhdl
index 11d81ed..4ea2680 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -53,7 +53,6 @@ end entity execute1;
 architecture behaviour of execute1 is
     type reg_type is record
 	e : Execute1ToWritebackType;
-        f : Execute1ToFetch1Type;
         busy: std_ulogic;
         terminate: std_ulogic;
         fp_exception_next : std_ulogic;
@@ -71,15 +70,24 @@ architecture behaviour of execute1 is
 	slow_op_oe : std_ulogic;
 	slow_op_xerc : xer_common_t;
         last_nia : std_ulogic_vector(63 downto 0);
+        redirect : std_ulogic;
+        abs_br : std_ulogic;
+        do_intr : std_ulogic;
+        vector : integer range 0 to 16#fff#;
+        br_offset : std_ulogic_vector(63 downto 0);
+        redir_mode : std_ulogic_vector(3 downto 0);
         log_addr_spr : std_ulogic_vector(31 downto 0);
     end record;
     constant reg_type_init : reg_type :=
-        (e => Execute1ToWritebackInit, f => Execute1ToFetch1Init,
+        (e => Execute1ToWritebackInit,
          busy => '0', lr_update => '0', terminate => '0',
          fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
          mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
          slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
-         next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0'));
+         next_lr => (others => '0'), last_nia => (others => '0'),
+         redirect => '0', abs_br => '0', do_intr => '0', vector => 0,
+         br_offset => (others => '0'), redir_mode => "0000",
+         others => (others => '0'));
 
     signal r, rin : reg_type;
 
@@ -340,6 +348,7 @@ begin
         variable spr_val : std_ulogic_vector(63 downto 0);
         variable addend : std_ulogic_vector(127 downto 0);
         variable do_trace : std_ulogic;
+        variable f : Execute1ToFetch1Type;
         variable fv : Execute1ToFPUType;
     begin
 	result := (others => '0');
@@ -352,8 +361,15 @@ begin
 
 	v := r;
 	v.e := Execute1ToWritebackInit;
+        v.redirect := '0';
+        v.abs_br := '0';
+        v.do_intr := '0';
+        v.vector := 0;
+        v.br_offset := (others => '0');
+        v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) &
+                        not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF);
+
         lv := Execute1ToLoadstore1Init;
-        v.f.redirect := '0';
         fv := Execute1ToFPUInit;
 
 	-- XER forwarding. To avoid having to track XER hazards, we use
@@ -471,11 +487,11 @@ begin
 	irq_valid := '0';
 	if ctrl.msr(MSR_EE) = '1' then
 	    if ctrl.dec(63) = '1' then
-		v.f.redirect_nia := std_logic_vector(to_unsigned(16#900#, 64));
+		v.vector := 16#900#;
 		report "IRQ valid: DEC";
 		irq_valid := '1';
 	    elsif ext_irq_in = '1' then
-		v.f.redirect_nia := std_logic_vector(to_unsigned(16#500#, 64));
+		v.vector := 16#500#;
 		report "IRQ valid: External";
 		irq_valid := '1';
 	    end if;
@@ -484,11 +500,6 @@ begin
 	v.terminate := '0';
 	icache_inval <= '0';
 	v.busy := '0';
-        -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1
-        v.f.virt_mode := ctrl.msr(MSR_IR);
-        v.f.priv_mode := not ctrl.msr(MSR_PR);
-        v.f.big_endian := not ctrl.msr(MSR_LE);
-        v.f.mode_32bit := not ctrl.msr(MSR_SF);
 
 	-- Next insn adder used in a couple of places
 	next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4);
@@ -546,13 +557,13 @@ begin
             if HAS_FPU and r.fp_exception_next = '1' then
                 -- This is used for FP-type program interrupts that
                 -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
-                v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+                v.vector := 16#700#;
                 ctrl_tmp.srr1(63 - 43) <= '1';
                 ctrl_tmp.srr1(63 - 47) <= '1';
             else
                 -- Generate a trace interrupt rather than executing the next instruction
                 -- or taking any asynchronous interrupt
-                v.f.redirect_nia := std_logic_vector(to_unsigned(16#d00#, 64));
+                v.vector := 16#d00#;
                 ctrl_tmp.srr1(63 - 33) <= '1';
                 if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
                     r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
@@ -574,7 +585,7 @@ begin
             instr_is_privileged(e_in.insn_type, e_in.insn) then
             -- generate a program interrupt
             exception := '1';
-            v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+            v.vector := 16#700#;
             -- set bit 45 to indicate privileged instruction type interrupt
             ctrl_tmp.srr1(63 - 45) <= '1';
             report "privileged instruction";
@@ -586,7 +597,7 @@ begin
         elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then
             -- generate a floating-point unavailable interrupt
             exception := '1';
-            v.f.redirect_nia := std_logic_vector(to_unsigned(16#800#, 64));
+            v.vector := 16#800#;
             report "FP unavailable interrupt";
 
 	elsif valid_in = '1' and e_in.unit = ALU then
@@ -614,7 +625,7 @@ begin
                 if e_in.insn(1) = '1' then
                     exception := '1';
                     exception_nextpc := '1';
-                    v.f.redirect_nia := std_logic_vector(to_unsigned(16#C00#, 64));
+                    v.vector := 16#C00#;
                     report "sc";
                 else
                     illegal := '1';
@@ -702,7 +713,7 @@ begin
                         end loop;
                     else
                         -- trap instructions (tw, twi, td, tdi)
-                        v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+                        v.vector := 16#700#;
                         -- set bit 46 to say trap occurred
                         ctrl_tmp.srr1(63 - 46) <= '1';
                         if or (trapval and insn_to(e_in.insn)) = '1' then
@@ -785,10 +796,8 @@ begin
                 end if;
 
 	    when OP_RFID =>
-                v.f.virt_mode := a_in(MSR_IR) or a_in(MSR_PR);
-                v.f.priv_mode := not a_in(MSR_PR);
-                v.f.big_endian := not a_in(MSR_LE);
-                v.f.mode_32bit := not a_in(MSR_SF);
+                v.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not a_in(MSR_PR) &
+                                not a_in(MSR_LE) & not a_in(MSR_SF);
                 -- Can't use msr_copy here because the partial function MSR
                 -- bits should be left unchanged, not zeroed.
                 ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31);
@@ -1032,8 +1041,8 @@ begin
                 end if;
 
 	    when OP_ISYNC =>
-		v.f.redirect := '1';
-		v.f.redirect_nia := next_nia;
+		v.redirect := '1';
+                v.br_offset := std_ulogic_vector(to_unsigned(4, 64));
 
 	    when OP_ICBI =>
 		icache_inval <= '1';
@@ -1063,16 +1072,13 @@ begin
                     ctrl_tmp.cfar <= e_in.nia;
                 end if;
                 if e_in.br_pred = '0' then
-                    if abs_branch = '1' then
-                        v.f.redirect_nia := b_in;
-                    else
-                        v.f.redirect_nia := std_ulogic_vector(signed(e_in.nia) + signed(b_in));
-                    end if;
+                    v.br_offset := b_in;
+                    v.abs_br := abs_branch;
                 else
-                    v.f.redirect_nia := next_nia;
+                    v.br_offset := std_ulogic_vector(to_unsigned(4, 64));
                 end if;
                 if taken_branch /= e_in.br_pred then
-                    v.f.redirect := '1';
+                    v.redirect := '1';
                 end if;
             end if;
 
@@ -1114,7 +1120,7 @@ begin
         -- valid_in = 0.  Hence they don't happen in the same cycle as any of
         -- the cases above which depend on valid_in = 1.
 
-        if r.f.redirect = '1' then
+        if r.redirect = '1' then
             v.e.valid := '1';
         end if;
 	if r.lr_update = '1' then
@@ -1195,14 +1201,14 @@ begin
         -- The case where MSR[FE0,FE1] goes from zero to non-zero is
         -- handled above by mtmsrd and rfid setting v.fp_exception_next.
         if HAS_FPU and fp_in.interrupt = '1' then
-            v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+            v.vector := 16#700#;
             ctrl_tmp.srr1(63 - 43) <= '1';
             exception := '1';
         end if;
 
         if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then
             exception := '1';
-            v.f.redirect_nia := std_logic_vector(to_unsigned(16#700#, 64));
+            v.vector := 16#700#;
             -- Since we aren't doing Hypervisor emulation assist (0xe40) we
             -- set bit 44 to indicate we have an illegal
             ctrl_tmp.srr1(63 - 44) <= '1';
@@ -1226,12 +1232,12 @@ begin
         -- or ISI or ISegI for instruction fetch exceptions
         if l_in.exception = '1' then
             if l_in.alignment = '1' then
-                v.f.redirect_nia := std_logic_vector(to_unsigned(16#600#, 64));
+                v.vector := 16#600#;
             elsif l_in.instr_fault = '0' then
                 if l_in.segment_fault = '0' then
-                    v.f.redirect_nia := std_logic_vector(to_unsigned(16#300#, 64));
+                    v.vector := 16#300#;
                 else
-                    v.f.redirect_nia := std_logic_vector(to_unsigned(16#380#, 64));
+                    v.vector := 16#380#;
                 end if;
             else
                 if l_in.segment_fault = '0' then
@@ -1239,9 +1245,9 @@ begin
                     ctrl_tmp.srr1(63 - 35) <= l_in.perm_error; -- noexec fault
                     ctrl_tmp.srr1(63 - 44) <= l_in.badtree;
                     ctrl_tmp.srr1(63 - 45) <= l_in.rc_error;
-                    v.f.redirect_nia := std_logic_vector(to_unsigned(16#400#, 64));
+                    v.vector := 16#400#;
                 else
-                    v.f.redirect_nia := std_logic_vector(to_unsigned(16#480#, 64));
+                    v.vector := 16#480#;
                 end if;
             end if;
             v.e.exc_write_enable := '1';
@@ -1251,19 +1257,37 @@ begin
 
         if exception = '1' or l_in.exception = '1' then
             ctrl_tmp.irq_state <= WRITE_SRR1;
-            v.f.redirect := '1';
-            v.f.virt_mode := '0';
-            v.f.priv_mode := '1';
-            -- XXX need an interrupt LE bit here, e.g. from LPCR
-            v.f.big_endian := '0';
-            v.f.mode_32bit := '0';
+            v.redirect := '1';
+            v.do_intr := '1';
         end if;
 
-        if v.f.redirect = '1' then
+        if v.redirect = '1' then
             v.busy := '1';
             v.e.valid := '0';
         end if;
 
+        -- Outputs to fetch1
+        f.redirect := r.redirect;
+        if r.do_intr = '1' then
+            f.redirect_nia := std_ulogic_vector(to_unsigned(r.vector, 64));
+            f.virt_mode := '0';
+            f.priv_mode := '1';
+            -- XXX need an interrupt LE bit here, e.g. from LPCR
+            f.big_endian := '0';
+            f.mode_32bit := '0';
+        else
+            if r.abs_br = '1' then
+                f.redirect_nia := r.br_offset;
+            else
+                f.redirect_nia := std_ulogic_vector(unsigned(r.last_nia) + unsigned(r.br_offset));
+            end if;
+            -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1
+            f.virt_mode := r.redir_mode(3);
+            f.priv_mode := r.redir_mode(2);
+            f.big_endian := r.redir_mode(1);
+            f.mode_32bit := r.redir_mode(0);
+        end if;
+
         -- Outputs to loadstore1 (async)
         lv.op := e_in.insn_type;
         lv.nia := e_in.nia;
@@ -1309,7 +1333,7 @@ begin
 	rin <= v;
 
 	-- update outputs
-	f_out <= r.f;
+	f_out <= f;
         l_out <= lv;
 	e_out <= r.e;
         fp_out <= fv;

From 658feabfd40fa4d4e3048334d11036fc1c1c959b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 26 Sep 2020 17:19:57 +1000
Subject: [PATCH 6/9] core: Make result multiplexing explicit

This adds an explicit multiplexer feeding v.e.write_data in execute1,
with the select lines determined in the previous cycle based on the
insn_type.  Similarly, for multiply and divide instructions, there is
now an explicit multiplexer.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl   |   6 +-
 decode2.vhdl  |  52 +++++++++++++++++
 execute1.vhdl | 150 +++++++++++++++++++++++++++++++-------------------
 logical.vhdl  |   6 +-
 4 files changed, 154 insertions(+), 60 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 8b9380c..44f63bd 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -210,6 +210,7 @@ package common is
 	rc: std_ulogic;
 	oe: std_ulogic;
 	invert_a: std_ulogic;
+        addm1 : std_ulogic;
 	invert_out: std_ulogic;
 	input_carry: carry_in_t;
 	output_carry: std_ulogic;
@@ -224,18 +225,21 @@ package common is
 	update : std_ulogic;				-- is this an update instruction?
         reserve : std_ulogic;                           -- set for larx/stcx
         br_pred : std_ulogic;
+        result_sel : std_ulogic_vector(2 downto 0);     -- select source of result
+        sub_select : std_ulogic_vector(2 downto 0);     -- sub-result selection
         repeat : std_ulogic;                            -- set if instruction is cracked into two ops
         second : std_ulogic;                            -- set if this is the second op
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
 	(valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL,
          bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
-         bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0',
+         bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0',
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
 	 is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
          byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'),
          read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'),
          cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'),
+         result_sel => "000", sub_select => "000",
          repeat => '0', second => '0', others => (others => '0'));
 
     type MultiplyInputType is record
diff --git a/decode2.vhdl b/decode2.vhdl
index 8b4633a..561fd79 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -221,6 +221,52 @@ architecture behaviour of decode2 is
         end case;
     end;
 
+    -- control signals that are derived from insn_type
+    type mux_select_array_t is array(insn_type_t) of std_ulogic_vector(2 downto 0);
+
+    constant result_select : mux_select_array_t := (
+        OP_AND      => "001",           -- logical_result
+        OP_OR       => "001",
+        OP_XOR      => "001",
+        OP_POPCNT   => "001",
+        OP_PRTY     => "001",
+        OP_CMPB     => "001",
+        OP_EXTS     => "001",
+        OP_BPERM    => "001",
+        OP_BCD      => "001",
+        OP_MTSPR    => "001",
+        OP_RLC      => "010",           -- rotator_result
+        OP_RLCL     => "010",
+        OP_RLCR     => "010",
+        OP_SHL      => "010",
+        OP_SHR      => "010",
+        OP_EXTSWSLI => "010",
+        OP_MUL_L64  => "011",           -- muldiv_result
+        OP_MUL_H64  => "011",
+        OP_MUL_H32  => "011",
+        OP_DIV      => "011",
+        OP_DIVE     => "011",
+        OP_MOD      => "011",
+        OP_CNTZ     => "100",           -- countzero_result
+        OP_MFSPR    => "101",           -- spr_result
+        OP_ISEL     => "111",           -- misc_result
+        OP_DARN     => "111",
+        OP_MFMSR    => "111",
+        OP_MFCR     => "111",
+        OP_SETB     => "111",
+        others      => "000"            -- default to adder_result
+        );
+
+    constant subresult_select : mux_select_array_t := (
+        OP_MUL_L64 => "000",            -- muldiv_result
+        OP_MUL_H64 => "001",
+        OP_MUL_H32 => "010",
+        OP_DIV     => "011",
+        OP_DIVE    => "011",
+        OP_MOD     => "011",
+        others     => "000"
+        );
+
     -- issue control signals
     signal control_valid_in : std_ulogic;
     signal control_valid_out : std_ulogic;
@@ -400,6 +446,10 @@ begin
         v.e.bypass_cr := cr_bypass;
         v.e.xerc := c_in.read_xerc_data;
         v.e.invert_a := d_in.decode.invert_a;
+        v.e.addm1 := '0';
+        if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then
+            v.e.addm1 := '1';
+        end if;
         v.e.invert_out := d_in.decode.invert_out;
         v.e.input_carry := d_in.decode.input_carry;
         v.e.output_carry := d_in.decode.output_carry;
@@ -415,6 +465,8 @@ begin
         v.e.update := d_in.decode.update;
         v.e.reserve := d_in.decode.reserve;
         v.e.br_pred := d_in.br_pred;
+        v.e.result_sel := result_select(d_in.decode.insn_type);
+        v.e.sub_select := subresult_select(d_in.decode.insn_type);
 
         -- issue control
         control_valid_in <= d_in.valid;
diff --git a/execute1.vhdl b/execute1.vhdl
index 4ea2680..6d2eb04 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -60,6 +60,8 @@ architecture behaviour of execute1 is
         prev_op : insn_type_t;
 	lr_update : std_ulogic;
 	next_lr : std_ulogic_vector(63 downto 0);
+        resmux : std_ulogic_vector(2 downto 0);
+        submux : std_ulogic_vector(2 downto 0);
 	mul_in_progress : std_ulogic;
         mul_finish : std_ulogic;
         div_in_progress : std_ulogic;
@@ -103,6 +105,13 @@ architecture behaviour of execute1 is
     signal rotator_carry: std_ulogic;
     signal logical_result: std_ulogic_vector(63 downto 0);
     signal countzero_result: std_ulogic_vector(63 downto 0);
+    signal alu_result: std_ulogic_vector(63 downto 0);
+    signal adder_result: std_ulogic_vector(63 downto 0);
+    signal misc_result: std_ulogic_vector(63 downto 0);
+    signal muldiv_result: std_ulogic_vector(63 downto 0);
+    signal spr_result: std_ulogic_vector(63 downto 0);
+    signal result_mux_sel: std_ulogic_vector(2 downto 0);
+    signal sub_mux_sel: std_ulogic_vector(2 downto 0);
 
     -- multiply signals
     signal x_to_multiply: MultiplyInputType;
@@ -285,6 +294,18 @@ begin
 
     terminate_out <= r.terminate;
 
+    -- Result mux
+    result_mux_sel <= e_in.result_sel when r.busy = '0' else r.resmux;
+    sub_mux_sel <= e_in.sub_select when r.busy = '0' else r.submux;
+    with result_mux_sel select alu_result <=
+        adder_result       when "000",
+        logical_result     when "001",
+        rotator_result     when "010",
+        muldiv_result      when "011",
+        countzero_result   when "100",
+        spr_result         when "101",
+        misc_result        when others;
+
     execute1_0: process(clk)
     begin
 	if rising_edge(clk) then
@@ -310,7 +331,8 @@ begin
     execute1_1: process(all)
 	variable v : reg_type;
 	variable a_inv : std_ulogic_vector(63 downto 0);
-	variable result : std_ulogic_vector(63 downto 0);
+	variable b_or_m1 : std_ulogic_vector(63 downto 0);
+	variable addg6s : std_ulogic_vector(63 downto 0);
 	variable newcrf : std_ulogic_vector(3 downto 0);
 	variable sum_with_carry : std_ulogic_vector(64 downto 0);
 	variable result_en : std_ulogic;
@@ -348,16 +370,17 @@ begin
         variable spr_val : std_ulogic_vector(63 downto 0);
         variable addend : std_ulogic_vector(127 downto 0);
         variable do_trace : std_ulogic;
+        variable hold_wr_data : std_ulogic;
         variable f : Execute1ToFetch1Type;
         variable fv : Execute1ToFPUType;
     begin
-	result := (others => '0');
 	sum_with_carry := (others => '0');
 	result_en := '0';
 	newcrf := (others => '0');
         is_branch := '0';
         taken_branch := '0';
         abs_branch := '0';
+        hold_wr_data := '0';
 
 	v := r;
 	v.e := Execute1ToWritebackInit;
@@ -399,14 +422,24 @@ begin
         v.cntz_in_progress := '0';
         v.mul_finish := '0';
 
+        misc_result <= (others => '0');
+        spr_result <= (others => '0');
+        spr_val := (others => '0');
+
         -- Main adder
         if e_in.invert_a = '0' then
             a_inv := a_in;
         else
             a_inv := not a_in;
         end if;
-        sum_with_carry := ppc_adde(a_inv, b_in,
+        if e_in.addm1 = '0' then
+            b_or_m1 := b_in;
+        else
+            b_or_m1 := (others => '1');
+        end if;
+        sum_with_carry := ppc_adde(a_inv, b_or_m1,
                                    decode_input_carry(e_in.input_carry, v.e.xerc));
+        adder_result <= sum_with_carry(63 downto 0);
 
         -- signals to multiply and divide units
         sign1 := '0';
@@ -432,6 +465,7 @@ begin
             abs2 := - signed(b_in);
         end if;
 
+        -- Interface to multiply and divide units
 	x_to_multiply <= MultiplyInputInit;
 	x_to_multiply.is_32bit <= e_in.is_32bit;
 
@@ -479,6 +513,18 @@ begin
             x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
         end if;
 
+        case sub_mux_sel(1 downto 0) is
+            when "00" =>
+                muldiv_result <= multiply_to_x.result(63 downto 0);
+            when "01" =>
+                muldiv_result <= multiply_to_x.result(127 downto 64);
+            when "10" =>
+                muldiv_result <= multiply_to_x.result(63 downto 32) &
+                                 multiply_to_x.result(63 downto 32);
+            when others =>
+                muldiv_result <= divider_to_x.write_reg_data;
+        end case;
+
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
 	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
@@ -611,6 +657,8 @@ begin
 	    v.slow_op_rc := e_in.rc;
 	    v.slow_op_oe := e_in.oe;
 	    v.slow_op_xerc := v.e.xerc;
+            v.resmux := e_in.result_sel;
+            v.submux := e_in.sub_select;
 
 	    case_0: case e_in.insn_type is
 
@@ -642,8 +690,7 @@ begin
 	    when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT =>
 		-- Do nothing
 	    when OP_ADD | OP_CMP | OP_TRAP =>
-		result := sum_with_carry(63 downto 0);
-                carry_32 := result(32) xor a_inv(32) xor b_in(32);
+                carry_32 := sum_with_carry(32) xor a_inv(32) xor b_in(32);
                 carry_64 := sum_with_carry(64);
                 if e_in.insn_type = OP_ADD then
                     if e_in.output_carry = '1' then
@@ -724,17 +771,18 @@ begin
                     end if;
                 end if;
             when OP_ADDG6S =>
-                result := (others => '0');
+                addg6s := (others => '0');
                 for i in 0 to 14 loop
                     lo := i * 4;
                     hi := (i + 1) * 4;
                     if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then
-                        result(lo + 3 downto lo) := "0110";
+                        addg6s(lo + 3 downto lo) := "0110";
                     end if;
                 end loop;
                 if sum_with_carry(64) = '0' then
-                    result(63 downto 60) := "0110";
+                    addg6s(63 downto 60) := "0110";
                 end if;
+                misc_result <= addg6s;
                 result_en := '1';
             when OP_CMPRB =>
                 newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn));
@@ -754,7 +802,6 @@ begin
                                      newcrf & newcrf & newcrf & newcrf;
             when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS |
                     OP_BPERM | OP_BCD =>
-		result := logical_result;
 		result_en := '1';
 	    when OP_B =>
                 is_branch := '1';
@@ -765,12 +812,11 @@ begin
                 end if;
 	    when OP_BC =>
 		-- read_data1 is CTR
+                v.e.write_reg := fast_spr_num(SPR_CTR);
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
 		if bo(4-2) = '0' then
-		    result := std_ulogic_vector(unsigned(a_in) - 1);
 		    result_en := '1';
-		    v.e.write_reg := fast_spr_num(SPR_CTR);
 		end if;
                 is_branch := '1';
 		taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
@@ -781,12 +827,11 @@ begin
 	    when OP_BCREG =>
 		-- read_data1 is CTR
 		-- read_data2 is target register (CTR, LR or TAR)
+                v.e.write_reg := fast_spr_num(SPR_CTR);
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
 		if bo(4-2) = '0' and e_in.insn(10) = '0' then
-		    result := std_ulogic_vector(unsigned(a_in) - 1);
 		    result_en := '1';
-		    v.e.write_reg := fast_spr_num(SPR_CTR);
 		end if;
                 is_branch := '1';
 		taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
@@ -825,9 +870,9 @@ begin
 	    when OP_ISEL =>
 		crbit := to_integer(unsigned(insn_bc(e_in.insn)));
 		if cr_in(31-crbit) = '1' then
-		    result := a_in;
+		    misc_result <= a_in;
 		else
-		    result := b_in;
+		    misc_result <= b_in;
 		end if;
 		result_en := '1';
 	    when OP_CROP =>
@@ -885,38 +930,38 @@ begin
                 if random_err = '0' then
                     case e_in.insn(17 downto 16) is
                         when "00" =>
-                            result := x"00000000" & random_cond(31 downto 0);
+                            misc_result <= x"00000000" & random_cond(31 downto 0);
                         when "10" =>
-                            result := random_raw;
+                            misc_result <= random_raw;
                         when others =>
-                            result := random_cond;
+                            misc_result <= random_cond;
                     end case;
                 else
-                    result := (others => '1');
+                    misc_result <= (others => '1');
                 end if;
                 result_en := '1';
 	    when OP_MFMSR =>
-		result := ctrl.msr;
+		misc_result <= ctrl.msr;
 		result_en := '1';
 	    when OP_MFSPR =>
 		report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
 		    "=" & to_hstring(a_in);
 		result_en := '1';
 		if is_fast_spr(e_in.read_reg1) then
-		    result := a_in;
-		    if decode_spr_num(e_in.insn) = SPR_XER then
+		    spr_val := a_in;
+                    if decode_spr_num(e_in.insn) = SPR_XER then
 			-- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer
-			result(63 downto 32) := (others => '0');
-			result(63-32) := v.e.xerc.so;
-			result(63-33) := v.e.xerc.ov;
-			result(63-34) := v.e.xerc.ca;
-			result(63-35 downto 63-43) := "000000000";
-			result(63-44) := v.e.xerc.ov32;
-			result(63-45) := v.e.xerc.ca32;
-		    end if;
+			spr_val(63 downto 32) := (others => '0');
+			spr_val(63-32) := v.e.xerc.so;
+			spr_val(63-33) := v.e.xerc.ov;
+			spr_val(63-34) := v.e.xerc.ca;
+			spr_val(63-35 downto 63-43) := "000000000";
+			spr_val(63-44) := v.e.xerc.ov32;
+			spr_val(63-45) := v.e.xerc.ca32;
+                    end if;
 		else
                     spr_val := c_in;
-		    case decode_spr_num(e_in.insn) is
+                    case decode_spr_num(e_in.insn) is
 		    when SPR_TB =>
 			spr_val := ctrl.tb;
 		    when SPR_TBU =>
@@ -940,22 +985,23 @@ begin
                         if ctrl.msr(MSR_PR) = '1' then
                             illegal := '1';
                         end if;
-		    end case;
-                    result := spr_val;
-		end if;
+                    end case;
+                end if;
+                spr_result <= spr_val;
+
 	    when OP_MFCR =>
 		if e_in.insn(20) = '0' then
 		    -- mfcr
-		    result := x"00000000" & cr_in;
+		    misc_result <= x"00000000" & cr_in;
 		else
 		    -- mfocrf
 		    crnum := fxm_to_num(insn_fxm(e_in.insn));
-		    result := (others => '0');
+		    misc_result <= (others => '0');
 		    for i in 0 to 7 loop
 			lo := (7-i)*4;
 			hi := lo + 3;
 			if crnum = i then
-			    result(hi downto lo) := cr_in(hi downto lo);
+			    misc_result(hi downto lo) <= cr_in(hi downto lo);
 			end if;
 		    end loop;
 		end if;
@@ -999,7 +1045,6 @@ begin
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
 		    "=" & to_hstring(c_in);
 		if is_fast_spr(e_in.write_reg) then
-		    result := c_in;
 		    result_en := '1';
 		    if decode_spr_num(e_in.insn) = SPR_XER then
 			v.e.xerc.so := c_in(63-32);
@@ -1025,7 +1070,6 @@ begin
 		    end case;
 		end if;
 	    when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI =>
-		result := rotator_result;
 		if e_in.output_carry = '1' then
 		    set_carry(v.e, rotator_carry, rotator_carry);
 		end if;
@@ -1033,11 +1077,11 @@ begin
             when OP_SETB =>
                 bfa := insn_bfa(e_in.insn);
                 crbit := to_integer(unsigned(bfa)) * 4;
-                result := (others => '0');
+                misc_result <= (others => '0');
                 if cr_in(31 - crbit) = '1' then
-                    result := (others => '1');
+                    misc_result <= (others => '1');
                 elsif cr_in(30 - crbit) = '1' then
-                    result(0) := '1';
+                    misc_result(0) <= '1';
                 end if;
 
 	    when OP_ISYNC =>
@@ -1130,10 +1174,9 @@ begin
 	    v.e.valid := '1';
             -- Keep r.e.write_data unchanged next cycle in case it is needed
             -- for a forwarded result (e.g. for CTR).
-            result := r.e.write_data;
+            hold_wr_data := '1';
         elsif r.cntz_in_progress = '1' then
             -- cnt[lt]z always takes two cycles
-            result := countzero_result;
             result_en := '1';
             v.e.write_reg := gpr_to_gspr(r.slow_op_dest);
             v.e.rc := r.slow_op_rc;
@@ -1144,18 +1187,7 @@ begin
 	       (r.div_in_progress = '1' and divider_to_x.valid = '1') then
 		if r.mul_in_progress = '1' then
                     overflow := '0';
-                    case r.slow_op_insn is
-                        when OP_MUL_H32 =>
-                            result := multiply_to_x.result(63 downto 32) &
-                                      multiply_to_x.result(63 downto 32);
-                        when OP_MUL_H64 =>
-                            result := multiply_to_x.result(127 downto 64);
-                        when others =>
-                            -- i.e. OP_MUL_L64
-                            result := multiply_to_x.result(63 downto 0);
-                    end case;
 		else
-		    result := divider_to_x.write_reg_data;
 		    overflow := divider_to_x.overflow;
 		end if;
                 if r.mul_in_progress = '1' and r.slow_op_oe = '1' then
@@ -1184,7 +1216,7 @@ begin
 		v.div_in_progress := r.div_in_progress;
 	    end if;
         elsif r.mul_finish = '1' then
-            result := r.e.write_data;
+            hold_wr_data := '1';
             result_en := '1';
             v.e.write_reg := gpr_to_gspr(r.slow_op_dest);
             v.e.rc := r.slow_op_rc;
@@ -1225,7 +1257,11 @@ begin
             v.trace_next := '1';
         end if;
 
-	v.e.write_data := result;
+        if hold_wr_data = '0' then
+            v.e.write_data := alu_result;
+        else
+            v.e.write_data := r.e.write_data;
+        end if;
 	v.e.write_enable := result_en and not exception;
 
         -- generate DSI or DSegI for load/store exceptions
diff --git a/logical.vhdl b/logical.vhdl
index d008e47..6b6f202 100644
--- a/logical.vhdl
+++ b/logical.vhdl
@@ -197,8 +197,7 @@ begin
                     tmp := x"00" & dpd_to_bcd(rs(51 downto 42)) & dpd_to_bcd(rs(41 downto 32)) &
                            x"00" & dpd_to_bcd(rs(19 downto 10)) & dpd_to_bcd(rs(9 downto 0));
                 end if;
-            when others =>
-                -- EXTS
+            when OP_EXTS =>
                 -- note datalen is a 1-hot encoding
 		negative := (datalen(0) and rs(7)) or
 			    (datalen(1) and rs(15)) or
@@ -211,6 +210,9 @@ begin
 		    tmp(15 downto 8) := rs(15 downto 8);
 		end if;
 		tmp(7 downto 0) := rs(7 downto 0);
+            when others =>
+                -- e.g. OP_MTSPR
+                tmp := rs;
         end case;
 
         result <= tmp;

From b0510fd1bbfe50ab7f61e6be4a4643c9d5dd87b1 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 26 Sep 2020 19:58:46 +1000
Subject: [PATCH 7/9] core: Reorganize execute1

This breaks up the enormous if .. elsif .. case .. elsif statement in
execute1 in order to try to make it simpler and more understandable.
We now have decode2 deciding whether the instruction has a value to be
written back to a register (GPR, GSPR, FPR, etc.) rather than
individual cases in execute1 setting result_en.  The computation of
the data to be written back is now independent of detection of various
exception conditions.  We now have an if block determining if any
exception condition exists which prevents the next instruction from
being executed, then the case statement which performs actions such as
setting carry/overflow bits, determining if a trap exception exists,
doing branches, etc., then an if statement for all the r.busy = 1
cases (continuing execution of an instruction which was started in a
previous cycle, or writing SRR1 for an interrupt).

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl   |   3 +-
 decode2.vhdl  |  18 +-
 execute1.vhdl | 556 ++++++++++++++++++++++++--------------------------
 3 files changed, 289 insertions(+), 288 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 44f63bd..d085199 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -195,6 +195,7 @@ package common is
 	insn_type: insn_type_t;
 	nia: std_ulogic_vector(63 downto 0);
 	write_reg: gspr_index_t;
+        write_reg_enable: std_ulogic;
 	read_reg1: gspr_index_t;
 	read_reg2: gspr_index_t;
 	read_data1: std_ulogic_vector(63 downto 0);
@@ -232,7 +233,7 @@ package common is
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
 	(valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL,
-         bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
+         write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
          bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0',
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
 	 is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
diff --git a/decode2.vhdl b/decode2.vhdl
index 561fd79..e00a05d 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -249,7 +249,8 @@ architecture behaviour of decode2 is
         OP_MOD      => "011",
         OP_CNTZ     => "100",           -- countzero_result
         OP_MFSPR    => "101",           -- spr_result
-        OP_ISEL     => "111",           -- misc_result
+        OP_ADDG6S   => "111",           -- misc_result
+        OP_ISEL     => "111",
         OP_DARN     => "111",
         OP_MFMSR    => "111",
         OP_MFCR     => "111",
@@ -264,6 +265,12 @@ architecture behaviour of decode2 is
         OP_DIV     => "011",
         OP_DIVE    => "011",
         OP_MOD     => "011",
+        OP_ADDG6S  => "001",            -- misc_result
+        OP_ISEL    => "010",
+        OP_DARN    => "011",
+        OP_MFMSR   => "100",
+        OP_MFCR    => "101",
+        OP_SETB    => "110",
         others     => "000"
         );
 
@@ -438,6 +445,7 @@ begin
         v.e.read_data3 := decoded_reg_c.data;
         v.e.bypass_data3 := gpr_c_bypass;
         v.e.write_reg := decoded_reg_o.reg;
+        v.e.write_reg_enable := decoded_reg_o.reg_valid;
         v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
         if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then
             v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
@@ -448,7 +456,13 @@ begin
         v.e.invert_a := d_in.decode.invert_a;
         v.e.addm1 := '0';
         if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then
+            -- add -1 to CTR
             v.e.addm1 := '1';
+            if d_in.insn(23) = '1' or
+                (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '1') then
+                -- don't write decremented CTR if BO(2) = 1 or bcctr (insn(10) = 1)
+                v.e.write_reg_enable := '0';
+            end if;
         end if;
         v.e.invert_out := d_in.decode.invert_out;
         v.e.input_carry := d_in.decode.input_carry;
@@ -472,7 +486,7 @@ begin
         control_valid_in <= d_in.valid;
         control_sgl_pipe <= d_in.decode.sgl_pipe;
 
-        gpr_write_valid <= decoded_reg_o.reg_valid;
+        gpr_write_valid <= v.e.write_reg_enable;
         gpr_write <= decoded_reg_o.reg;
         gpr_bypassable <= '0';
         if EX1_BYPASS and d_in.decode.unit = ALU then
diff --git a/execute1.vhdl b/execute1.vhdl
index 6d2eb04..6a27ee8 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -53,6 +53,7 @@ end entity execute1;
 architecture behaviour of execute1 is
     type reg_type is record
 	e : Execute1ToWritebackType;
+        cur_instr : Decode2ToExecute1Type;
         busy: std_ulogic;
         terminate: std_ulogic;
         fp_exception_next : std_ulogic;
@@ -60,17 +61,10 @@ architecture behaviour of execute1 is
         prev_op : insn_type_t;
 	lr_update : std_ulogic;
 	next_lr : std_ulogic_vector(63 downto 0);
-        resmux : std_ulogic_vector(2 downto 0);
-        submux : std_ulogic_vector(2 downto 0);
 	mul_in_progress : std_ulogic;
         mul_finish : std_ulogic;
         div_in_progress : std_ulogic;
         cntz_in_progress : std_ulogic;
-        slow_op_insn : insn_type_t;
-	slow_op_dest : gpr_index_t;
-	slow_op_rc : std_ulogic;
-	slow_op_oe : std_ulogic;
-	slow_op_xerc : xer_common_t;
         last_nia : std_ulogic_vector(63 downto 0);
         redirect : std_ulogic;
         abs_br : std_ulogic;
@@ -82,10 +76,10 @@ architecture behaviour of execute1 is
     end record;
     constant reg_type_init : reg_type :=
         (e => Execute1ToWritebackInit,
+         cur_instr => Decode2ToExecute1Init,
          busy => '0', lr_update => '0', terminate => '0',
          fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
          mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
-         slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
          next_lr => (others => '0'), last_nia => (others => '0'),
          redirect => '0', abs_br => '0', do_intr => '0', vector => 0,
          br_offset => (others => '0'), redir_mode => "0000",
@@ -112,6 +106,7 @@ architecture behaviour of execute1 is
     signal spr_result: std_ulogic_vector(63 downto 0);
     signal result_mux_sel: std_ulogic_vector(2 downto 0);
     signal sub_mux_sel: std_ulogic_vector(2 downto 0);
+    signal current: Decode2ToExecute1Type;
 
     -- multiply signals
     signal x_to_multiply: MultiplyInputType;
@@ -294,10 +289,10 @@ begin
 
     terminate_out <= r.terminate;
 
+    current <= e_in when r.busy = '0' else r.cur_instr;
+
     -- Result mux
-    result_mux_sel <= e_in.result_sel when r.busy = '0' else r.resmux;
-    sub_mux_sel <= e_in.sub_select when r.busy = '0' else r.submux;
-    with result_mux_sel select alu_result <=
+    with current.result_sel select alu_result <=
         adder_result       when "000",
         logical_result     when "001",
         rotator_result     when "010",
@@ -333,9 +328,12 @@ begin
 	variable a_inv : std_ulogic_vector(63 downto 0);
 	variable b_or_m1 : std_ulogic_vector(63 downto 0);
 	variable addg6s : std_ulogic_vector(63 downto 0);
+	variable isel_result : std_ulogic_vector(63 downto 0);
+	variable darn : std_ulogic_vector(63 downto 0);
+	variable mfcr_result : std_ulogic_vector(63 downto 0);
+	variable setb_result : std_ulogic_vector(63 downto 0);
 	variable newcrf : std_ulogic_vector(3 downto 0);
 	variable sum_with_carry : std_ulogic_vector(64 downto 0);
-	variable result_en : std_ulogic;
 	variable crnum : crnum_t;
 	variable crbit : integer range 0 to 31;
 	variable scrnum : crnum_t;
@@ -375,7 +373,6 @@ begin
         variable fv : Execute1ToFPUType;
     begin
 	sum_with_carry := (others => '0');
-	result_en := '0';
 	newcrf := (others => '0');
         is_branch := '0';
         taken_branch := '0';
@@ -400,7 +397,7 @@ begin
 	-- (SO, OV[32] and CA[32]) are only modified by instructions that are
         -- handled here, we can just forward the result being sent to
         -- writeback.
-	if r.e.write_xerc_enable = '1' then
+	if r.e.write_xerc_enable = '1' or r.busy = '1' then
 	    v.e.xerc := r.e.xerc;
 	else
 	    v.e.xerc := e_in.xerc;
@@ -422,7 +419,6 @@ begin
         v.cntz_in_progress := '0';
         v.mul_finish := '0';
 
-        misc_result <= (others => '0');
         spr_result <= (others => '0');
         spr_val := (others => '0');
 
@@ -440,6 +436,8 @@ begin
         sum_with_carry := ppc_adde(a_inv, b_or_m1,
                                    decode_input_carry(e_in.input_carry, v.e.xerc));
         adder_result <= sum_with_carry(63 downto 0);
+        carry_32 := sum_with_carry(32) xor a_inv(32) xor b_in(32);
+        carry_64 := sum_with_carry(64);
 
         -- signals to multiply and divide units
         sign1 := '0';
@@ -513,7 +511,7 @@ begin
             x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
         end if;
 
-        case sub_mux_sel(1 downto 0) is
+        case current.sub_select(1 downto 0) is
             when "00" =>
                 muldiv_result <= multiply_to_x.result(63 downto 0);
             when "01" =>
@@ -525,6 +523,117 @@ begin
                 muldiv_result <= divider_to_x.write_reg_data;
         end case;
 
+        -- Compute misc_result
+        case current.sub_select is
+            when "000" =>
+                misc_result <= (others => '0');
+            when "001" =>
+                -- addg6s
+                addg6s := (others => '0');
+                for i in 0 to 14 loop
+                    lo := i * 4;
+                    hi := (i + 1) * 4;
+                    if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then
+                        addg6s(lo + 3 downto lo) := "0110";
+                    end if;
+                end loop;
+                if sum_with_carry(64) = '0' then
+                    addg6s(63 downto 60) := "0110";
+                end if;
+                misc_result <= addg6s;
+            when "010" =>
+                -- isel
+		crbit := to_integer(unsigned(insn_bc(e_in.insn)));
+		if cr_in(31-crbit) = '1' then
+		    isel_result := a_in;
+		else
+		    isel_result := b_in;
+		end if;
+                misc_result <= isel_result;
+            when "011" =>
+                -- darn
+                darn := (others => '1');
+                if random_err = '0' then
+                    case e_in.insn(17 downto 16) is
+                        when "00" =>
+                            darn := x"00000000" & random_cond(31 downto 0);
+                        when "10" =>
+                            darn := random_raw;
+                        when others =>
+                            darn := random_cond;
+                    end case;
+                end if;
+                misc_result <= darn;
+            when "100" =>
+                -- mfmsr
+		misc_result <= ctrl.msr;
+            when "101" =>
+		if e_in.insn(20) = '0' then
+		    -- mfcr
+		    mfcr_result := x"00000000" & cr_in;
+		else
+		    -- mfocrf
+		    crnum := fxm_to_num(insn_fxm(e_in.insn));
+		    mfcr_result := (others => '0');
+		    for i in 0 to 7 loop
+			lo := (7-i)*4;
+			hi := lo + 3;
+			if crnum = i then
+			    mfcr_result(hi downto lo) := cr_in(hi downto lo);
+			end if;
+		    end loop;
+		end if;
+                misc_result <= mfcr_result;
+            when "110" =>
+                -- setb
+                bfa := insn_bfa(e_in.insn);
+                crbit := to_integer(unsigned(bfa)) * 4;
+                setb_result := (others => '0');
+                if cr_in(31 - crbit) = '1' then
+                    setb_result := (others => '1');
+                elsif cr_in(30 - crbit) = '1' then
+                    setb_result(0) := '1';
+                end if;
+                misc_result <= setb_result;
+            when others =>
+                misc_result <= (others => '0');
+        end case;
+
+        -- compute comparison results
+        -- Note, we have done RB - RA, not RA - RB
+        if e_in.insn_type = OP_CMP then
+            l := insn_l(e_in.insn);
+        else
+            l := not e_in.is_32bit;
+        end if;
+        zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0)));
+        zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32)));
+        if zerolo = '1' and (l = '0' or zerohi = '1') then
+            -- values are equal
+            trapval := "00100";
+        else
+            if l = '1' then
+                -- 64-bit comparison
+                msb_a := a_in(63);
+                msb_b := b_in(63);
+            else
+                -- 32-bit comparison
+                msb_a := a_in(31);
+                msb_b := b_in(31);
+            end if;
+            if msb_a /= msb_b then
+                -- Subtraction might overflow, but
+                -- comparison is clear from MSB difference.
+                -- for signed, 0 is greater; for unsigned, 1 is greater
+                trapval := msb_a & msb_b & '0' & msb_b & msb_a;
+            else
+                -- Subtraction cannot overflow since MSBs are equal.
+                -- carry = 1 indicates RA is smaller (signed or unsigned)
+                a_lt := (not l and carry_32) or (l and carry_64);
+                trapval := a_lt & not a_lt & '0' & a_lt & not a_lt;
+            end if;
+        end if;
+
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
 	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
@@ -577,38 +686,20 @@ begin
             v.prev_op := e_in.insn_type;
         end if;
 
- 	if ctrl.irq_state = WRITE_SRR1 then
- 	    v.e.exc_write_reg := fast_spr_num(SPR_SRR1);
- 	    v.e.exc_write_data := ctrl.srr1;
-            v.e.exc_write_enable := '1';
-            ctrl_tmp.msr(MSR_SF) <= '1';
-            ctrl_tmp.msr(MSR_EE) <= '0';
-            ctrl_tmp.msr(MSR_PR) <= '0';
-            ctrl_tmp.msr(MSR_SE) <= '0';
-            ctrl_tmp.msr(MSR_BE) <= '0';
-            ctrl_tmp.msr(MSR_FP) <= '0';
-            ctrl_tmp.msr(MSR_FE0) <= '0';
-            ctrl_tmp.msr(MSR_FE1) <= '0';
-            ctrl_tmp.msr(MSR_IR) <= '0';
-            ctrl_tmp.msr(MSR_DR) <= '0';
-            ctrl_tmp.msr(MSR_RI) <= '0';
-            ctrl_tmp.msr(MSR_LE) <= '1';
-            v.e.valid := '1';
-            v.trace_next := '0';
-            v.fp_exception_next := '0';
-	    report "Writing SRR1: " & to_hstring(ctrl.srr1);
-
-        elsif valid_in = '1' and e_in.second = '0' and
-            ((HAS_FPU and r.fp_exception_next = '1') or r.trace_next = '1') then
+        -- Determine if there is any exception to be taken
+        -- before/instead of executing this instruction
+        if valid_in = '1' and e_in.second = '0' then
             if HAS_FPU and r.fp_exception_next = '1' then
                 -- This is used for FP-type program interrupts that
                 -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
+                exception := '1';
                 v.vector := 16#700#;
                 ctrl_tmp.srr1(63 - 43) <= '1';
                 ctrl_tmp.srr1(63 - 47) <= '1';
-            else
+            elsif r.trace_next = '1' then
                 -- Generate a trace interrupt rather than executing the next instruction
                 -- or taking any asynchronous interrupt
+                exception := '1';
                 v.vector := 16#d00#;
                 ctrl_tmp.srr1(63 - 33) <= '1';
                 if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
@@ -617,48 +708,38 @@ begin
                 elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
                     ctrl_tmp.srr1(63 - 36) <= '1';
                 end if;
-            end if;
-            exception := '1';
-
-	elsif irq_valid = '1' and valid_in = '1' and e_in.second = '0' then
-	    -- we need two cycles to write srr0 and 1
-	    -- will need more when we have to write HEIR
-            -- Don't deliver the interrupt until we have a valid instruction
-            -- coming in, so we have a valid NIA to put in SRR0.
-	    exception := '1';
 
-        elsif valid_in = '1' and ctrl.msr(MSR_PR) = '1' and
-            instr_is_privileged(e_in.insn_type, e_in.insn) then
-            -- generate a program interrupt
-            exception := '1';
-            v.vector := 16#700#;
-            -- set bit 45 to indicate privileged instruction type interrupt
-            ctrl_tmp.srr1(63 - 45) <= '1';
-            report "privileged instruction";
+            elsif irq_valid = '1' then
+                -- Don't deliver the interrupt until we have a valid instruction
+                -- coming in, so we have a valid NIA to put in SRR0.
+                exception := '1';
 
-        elsif not HAS_FPU and valid_in = '1' and e_in.fac = FPU then
-            -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations
-            illegal := '1';
+            elsif ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then
+                -- generate a program interrupt
+                exception := '1';
+                v.vector := 16#700#;
+                -- set bit 45 to indicate privileged instruction type interrupt
+                ctrl_tmp.srr1(63 - 45) <= '1';
+                report "privileged instruction";
 
-        elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then
-            -- generate a floating-point unavailable interrupt
-            exception := '1';
-            v.vector := 16#800#;
-            report "FP unavailable interrupt";
+            elsif not HAS_FPU and e_in.fac = FPU then
+                -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations
+                illegal := '1';
 
-	elsif valid_in = '1' and e_in.unit = ALU then
+            elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then
+                -- generate a floating-point unavailable interrupt
+                exception := '1';
+                v.vector := 16#800#;
+                report "FP unavailable interrupt";
+            end if;
+        end if;
 
+	if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then
 	    report "execute nia " & to_hstring(e_in.nia);
 
+            v.cur_instr := e_in;
+            v.next_lr := next_nia;
 	    v.e.valid := '1';
-	    v.e.write_reg := e_in.write_reg;
-            v.slow_op_insn := e_in.insn_type;
-	    v.slow_op_dest := gspr_to_gpr(e_in.write_reg);
-	    v.slow_op_rc := e_in.rc;
-	    v.slow_op_oe := e_in.oe;
-	    v.slow_op_xerc := v.e.xerc;
-            v.resmux := e_in.result_sel;
-            v.submux := e_in.sub_select;
 
 	    case_0: case e_in.insn_type is
 
@@ -689,101 +770,48 @@ begin
                 end if;
 	    when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT =>
 		-- Do nothing
-	    when OP_ADD | OP_CMP | OP_TRAP =>
-                carry_32 := sum_with_carry(32) xor a_inv(32) xor b_in(32);
-                carry_64 := sum_with_carry(64);
-                if e_in.insn_type = OP_ADD then
-                    if e_in.output_carry = '1' then
-                        if e_in.input_carry /= OV then
-                            set_carry(v.e, carry_32, carry_64);
-                        else
-                            v.e.xerc.ov := carry_64;
-                            v.e.xerc.ov32 := carry_32;
-                            v.e.write_xerc_enable := '1';
-                        end if;
-                    end if;
-                    if e_in.oe = '1' then
-                        set_ov(v.e,
-                               calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)),
-                               calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31)));
-                    end if;
-                    result_en := '1';
-                else
-                    -- trap, CMP and CMPL instructions
-                    -- Note, we have done RB - RA, not RA - RB
-                    if e_in.insn_type = OP_CMP then
-                        l := insn_l(e_in.insn);
-                    else
-                        l := not e_in.is_32bit;
-                    end if;
-                    zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0)));
-                    zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32)));
-                    if zerolo = '1' and (l = '0' or zerohi = '1') then
-                        -- values are equal
-                        trapval := "00100";
+	    when OP_ADD =>
+                if e_in.output_carry = '1' then
+                    if e_in.input_carry /= OV then
+                        set_carry(v.e, carry_32, carry_64);
                     else
-                        if l = '1' then
-                            -- 64-bit comparison
-                            msb_a := a_in(63);
-                            msb_b := b_in(63);
-                        else
-                            -- 32-bit comparison
-                            msb_a := a_in(31);
-                            msb_b := b_in(31);
-                        end if;
-                        if msb_a /= msb_b then
-                            -- Subtraction might overflow, but
-                            -- comparison is clear from MSB difference.
-                            -- for signed, 0 is greater; for unsigned, 1 is greater
-                            trapval := msb_a & msb_b & '0' & msb_b & msb_a;
-                        else
-                            -- Subtraction cannot overflow since MSBs are equal.
-                            -- carry = 1 indicates RA is smaller (signed or unsigned)
-                            a_lt := (not l and carry_32) or (l and carry_64);
-                            trapval := a_lt & not a_lt & '0' & a_lt & not a_lt;
-                        end if;
-                    end if;
-                    if e_in.insn_type = OP_CMP then
-                        if e_in.is_signed = '1' then
-                            newcrf := trapval(4 downto 2) & v.e.xerc.so;
-                        else
-                            newcrf := trapval(1 downto 0) & trapval(2) & v.e.xerc.so;
-                        end if;
-                        bf := insn_bf(e_in.insn);
-                        crnum := to_integer(unsigned(bf));
-                        v.e.write_cr_enable := '1';
-                        v.e.write_cr_mask := num_to_fxm(crnum);
-                        for i in 0 to 7 loop
-                            lo := i*4;
-                            hi := lo + 3;
-                            v.e.write_cr_data(hi downto lo) := newcrf;
-                        end loop;
-                    else
-                        -- trap instructions (tw, twi, td, tdi)
-                        v.vector := 16#700#;
-                        -- set bit 46 to say trap occurred
-                        ctrl_tmp.srr1(63 - 46) <= '1';
-                        if or (trapval and insn_to(e_in.insn)) = '1' then
-                            -- generate trap-type program interrupt
-                            exception := '1';
-                            report "trap";
-                        end if;
+                        v.e.xerc.ov := carry_64;
+                        v.e.xerc.ov32 := carry_32;
+                        v.e.write_xerc_enable := '1';
                     end if;
                 end if;
-            when OP_ADDG6S =>
-                addg6s := (others => '0');
-                for i in 0 to 14 loop
-                    lo := i * 4;
-                    hi := (i + 1) * 4;
-                    if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then
-                        addg6s(lo + 3 downto lo) := "0110";
-                    end if;
+                if e_in.oe = '1' then
+                    set_ov(v.e,
+                           calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)),
+                           calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31)));
+                end if;
+            when OP_CMP =>
+                -- CMP and CMPL instructions
+                if e_in.is_signed = '1' then
+                    newcrf := trapval(4 downto 2) & v.e.xerc.so;
+                else
+                    newcrf := trapval(1 downto 0) & trapval(2) & v.e.xerc.so;
+                end if;
+                bf := insn_bf(e_in.insn);
+                crnum := to_integer(unsigned(bf));
+                v.e.write_cr_enable := '1';
+                v.e.write_cr_mask := num_to_fxm(crnum);
+                for i in 0 to 7 loop
+                    lo := i*4;
+                    hi := lo + 3;
+                    v.e.write_cr_data(hi downto lo) := newcrf;
                 end loop;
-                if sum_with_carry(64) = '0' then
-                    addg6s(63 downto 60) := "0110";
+            when OP_TRAP =>
+                -- trap instructions (tw, twi, td, tdi)
+                v.vector := 16#700#;
+                -- set bit 46 to say trap occurred
+                ctrl_tmp.srr1(63 - 46) <= '1';
+                if or (trapval and insn_to(e_in.insn)) = '1' then
+                    -- generate trap-type program interrupt
+                    exception := '1';
+                    report "trap";
                 end if;
-                misc_result <= addg6s;
-                result_en := '1';
+            when OP_ADDG6S =>
             when OP_CMPRB =>
                 newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn));
                 bf := insn_bf(e_in.insn);
@@ -802,7 +830,6 @@ begin
                                      newcrf & newcrf & newcrf & newcrf;
             when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS |
                     OP_BPERM | OP_BCD =>
-		result_en := '1';
 	    when OP_B =>
                 is_branch := '1';
                 taken_branch := '1';
@@ -812,12 +839,8 @@ begin
                 end if;
 	    when OP_BC =>
 		-- read_data1 is CTR
-                v.e.write_reg := fast_spr_num(SPR_CTR);
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
-		if bo(4-2) = '0' then
-		    result_en := '1';
-		end if;
                 is_branch := '1';
 		taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
                 abs_branch := insn_aa(e_in.insn);
@@ -827,12 +850,8 @@ begin
 	    when OP_BCREG =>
 		-- read_data1 is CTR
 		-- read_data2 is target register (CTR, LR or TAR)
-                v.e.write_reg := fast_spr_num(SPR_CTR);
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
-		if bo(4-2) = '0' and e_in.insn(10) = '0' then
-		    result_en := '1';
-		end if;
                 is_branch := '1';
 		taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
                 abs_branch := '1';
@@ -868,13 +887,6 @@ begin
                 v.cntz_in_progress := '1';
                 v.busy := '1';
 	    when OP_ISEL =>
-		crbit := to_integer(unsigned(insn_bc(e_in.insn)));
-		if cr_in(31-crbit) = '1' then
-		    misc_result <= a_in;
-		else
-		    misc_result <= b_in;
-		end if;
-		result_en := '1';
 	    when OP_CROP =>
 		cr_op := insn_cr(e_in.insn);
 		report "CR OP " & to_hstring(cr_op);
@@ -927,27 +939,11 @@ begin
                 v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf &
                                      newcrf & newcrf & newcrf & newcrf;
             when OP_DARN =>
-                if random_err = '0' then
-                    case e_in.insn(17 downto 16) is
-                        when "00" =>
-                            misc_result <= x"00000000" & random_cond(31 downto 0);
-                        when "10" =>
-                            misc_result <= random_raw;
-                        when others =>
-                            misc_result <= random_cond;
-                    end case;
-                else
-                    misc_result <= (others => '1');
-                end if;
-                result_en := '1';
 	    when OP_MFMSR =>
-		misc_result <= ctrl.msr;
-		result_en := '1';
 	    when OP_MFSPR =>
 		report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
 		    "=" & to_hstring(a_in);
-		result_en := '1';
-		if is_fast_spr(e_in.read_reg1) then
+		if is_fast_spr(e_in.read_reg1) = '1' then
 		    spr_val := a_in;
                     if decode_spr_num(e_in.insn) = SPR_XER then
 			-- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer
@@ -982,7 +978,7 @@ begin
                     when others =>
                         -- mfspr from unimplemented SPRs should be a nop in
                         -- supervisor mode and a program interrupt for user mode
-                        if ctrl.msr(MSR_PR) = '1' then
+                        if is_fast_spr(e_in.read_reg1) = '0' and ctrl.msr(MSR_PR) = '1' then
                             illegal := '1';
                         end if;
                     end case;
@@ -990,22 +986,6 @@ begin
                 spr_result <= spr_val;
 
 	    when OP_MFCR =>
-		if e_in.insn(20) = '0' then
-		    -- mfcr
-		    misc_result <= x"00000000" & cr_in;
-		else
-		    -- mfocrf
-		    crnum := fxm_to_num(insn_fxm(e_in.insn));
-		    misc_result <= (others => '0');
-		    for i in 0 to 7 loop
-			lo := (7-i)*4;
-			hi := lo + 3;
-			if crnum = i then
-			    misc_result(hi downto lo) <= cr_in(hi downto lo);
-			end if;
-		    end loop;
-		end if;
-		result_en := '1';
 	    when OP_MTCRF =>
 		v.e.write_cr_enable := '1';
 		if e_in.insn(20) = '0' then
@@ -1045,7 +1025,6 @@ begin
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
 		    "=" & to_hstring(c_in);
 		if is_fast_spr(e_in.write_reg) then
-		    result_en := '1';
 		    if decode_spr_num(e_in.insn) = SPR_XER then
 			v.e.xerc.so := c_in(63-32);
 			v.e.xerc.ov := c_in(63-33);
@@ -1073,16 +1052,7 @@ begin
 		if e_in.output_carry = '1' then
 		    set_carry(v.e, rotator_carry, rotator_carry);
 		end if;
-		result_en := '1';
             when OP_SETB =>
-                bfa := insn_bfa(e_in.insn);
-                crbit := to_integer(unsigned(bfa)) * 4;
-                misc_result <= (others => '0');
-                if cr_in(31 - crbit) = '1' then
-                    misc_result <= (others => '1');
-                elsif cr_in(30 - crbit) = '1' then
-                    misc_result(0) <= '1';
-                end if;
 
 	    when OP_ISYNC =>
 		v.redirect := '1';
@@ -1108,8 +1078,6 @@ begin
 		report "illegal";
 	    end case;
 
-	    v.e.rc := e_in.rc and valid_in;
-
             -- Mispredicted branches cause a redirect
             if is_branch = '1' then
                 if taken_branch = '1' then
@@ -1126,26 +1094,7 @@ begin
                 end if;
             end if;
 
-	    -- Update LR on the next cycle after a branch link
-	    -- If we're not writing back anything else, we can write back LR
-            -- this cycle, otherwise we take an extra cycle.  We use the
-            -- exc_write path since next_nia is written through that path
-            -- in other places.
-	    if e_in.lr = '1' then
-                if result_en = '0' then
-                    v.e.exc_write_enable := '1';
-                    v.e.exc_write_data := next_nia;
-                    v.e.exc_write_reg := fast_spr_num(SPR_LR);
-                else
-                    v.lr_update := '1';
-                    v.next_lr := next_nia;
-                    v.e.valid := '0';
-                    report "Delayed LR update to " & to_hstring(next_nia);
-                    v.busy := '1';
-                end if;
-	    end if;
-
-        elsif valid_in = '1' then
+        elsif valid_in = '1' and exception = '0' and illegal = '0' then
             -- instruction for other units, i.e. LDST
             if e_in.unit = LDST then
                 lv.valid := '1';
@@ -1164,23 +1113,28 @@ begin
         -- valid_in = 0.  Hence they don't happen in the same cycle as any of
         -- the cases above which depend on valid_in = 1.
 
-        if r.redirect = '1' then
-            v.e.valid := '1';
-        end if;
-	if r.lr_update = '1' then
+ 	if ctrl.irq_state = WRITE_SRR1 then
+ 	    v.e.exc_write_reg := fast_spr_num(SPR_SRR1);
+ 	    v.e.exc_write_data := ctrl.srr1;
             v.e.exc_write_enable := '1';
-	    v.e.exc_write_data := r.next_lr;
-	    v.e.exc_write_reg := fast_spr_num(SPR_LR);
-	    v.e.valid := '1';
-            -- Keep r.e.write_data unchanged next cycle in case it is needed
-            -- for a forwarded result (e.g. for CTR).
-            hold_wr_data := '1';
+            ctrl_tmp.msr(MSR_SF) <= '1';
+            ctrl_tmp.msr(MSR_EE) <= '0';
+            ctrl_tmp.msr(MSR_PR) <= '0';
+            ctrl_tmp.msr(MSR_SE) <= '0';
+            ctrl_tmp.msr(MSR_BE) <= '0';
+            ctrl_tmp.msr(MSR_FP) <= '0';
+            ctrl_tmp.msr(MSR_FE0) <= '0';
+            ctrl_tmp.msr(MSR_FE1) <= '0';
+            ctrl_tmp.msr(MSR_IR) <= '0';
+            ctrl_tmp.msr(MSR_DR) <= '0';
+            ctrl_tmp.msr(MSR_RI) <= '0';
+            ctrl_tmp.msr(MSR_LE) <= '1';
+            v.trace_next := '0';
+            v.fp_exception_next := '0';
+	    report "Writing SRR1: " & to_hstring(ctrl.srr1);
+
         elsif r.cntz_in_progress = '1' then
             -- cnt[lt]z always takes two cycles
-            result_en := '1';
-            v.e.write_reg := gpr_to_gspr(r.slow_op_dest);
-            v.e.rc := r.slow_op_rc;
-            v.e.xerc := r.slow_op_xerc;
             v.e.valid := '1';
 	elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
 	    if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
@@ -1190,23 +1144,21 @@ begin
 		else
 		    overflow := divider_to_x.overflow;
 		end if;
-                if r.mul_in_progress = '1' and r.slow_op_oe = '1' then
+                if r.mul_in_progress = '1' and current.oe = '1' then
                     -- have to wait until next cycle for overflow indication
                     v.mul_finish := '1';
                     v.busy := '1';
                 else
-                    result_en := '1';
-                    v.e.write_reg := gpr_to_gspr(r.slow_op_dest);
-                    v.e.rc := r.slow_op_rc;
-                    v.e.xerc := r.slow_op_xerc;
-                    v.e.write_xerc_enable := r.slow_op_oe;
+                    v.e.write_xerc_enable := current.oe;
                     -- We must test oe because the RC update code in writeback
                     -- will use the xerc value to set CR0:SO so we must not clobber
                     -- xerc if OE wasn't set.
-                    if r.slow_op_oe = '1' then
+                    if current.oe = '1' then
                         v.e.xerc.ov := overflow;
                         v.e.xerc.ov32 := overflow;
-                        v.e.xerc.so := r.slow_op_xerc.so or overflow;
+                        if overflow = '1' then
+                            v.e.xerc.so := '1';
+                        end if;
                     end if;
                     v.e.valid := '1';
                 end if;
@@ -1217,16 +1169,19 @@ begin
 	    end if;
         elsif r.mul_finish = '1' then
             hold_wr_data := '1';
-            result_en := '1';
-            v.e.write_reg := gpr_to_gspr(r.slow_op_dest);
-            v.e.rc := r.slow_op_rc;
-            v.e.xerc := r.slow_op_xerc;
-            v.e.write_xerc_enable := r.slow_op_oe;
+            v.e.write_xerc_enable := current.oe;
             v.e.xerc.ov := multiply_to_x.overflow;
             v.e.xerc.ov32 := multiply_to_x.overflow;
-            v.e.xerc.so := r.slow_op_xerc.so or multiply_to_x.overflow;
+            if multiply_to_x.overflow = '1' then
+                v.e.xerc.so := '1';
+            end if;
             v.e.valid := '1';
 	end if;
+        -- When doing delayed LR update, keep r.e.write_data unchanged
+        -- next cycle in case it is needed for a forwarded result (e.g. CTR).
+        if r.lr_update = '1' then
+            hold_wr_data := '1';
+        end if;
 
         -- Generate FP-type program interrupt.  fp_in.interrupt will only
         -- be set during the execution of a FP instruction.
@@ -1253,17 +1208,6 @@ begin
             end if;
 	end if;
 
-        if do_trace = '1' then
-            v.trace_next := '1';
-        end if;
-
-        if hold_wr_data = '0' then
-            v.e.write_data := alu_result;
-        else
-            v.e.write_data := r.e.write_data;
-        end if;
-	v.e.write_enable := result_en and not exception;
-
         -- generate DSI or DSegI for load/store exceptions
         -- or ISI or ISegI for instruction fetch exceptions
         if l_in.exception = '1' then
@@ -1297,10 +1241,52 @@ begin
             v.do_intr := '1';
         end if;
 
+        if do_trace = '1' then
+            v.trace_next := '1';
+        end if;
+
+        if hold_wr_data = '0' then
+            v.e.write_data := alu_result;
+        else
+            v.e.write_data := r.e.write_data;
+        end if;
+        v.e.write_reg := current.write_reg;
+	v.e.write_enable := current.write_reg_enable and v.e.valid and not exception;
+        v.e.rc := current.rc and v.e.valid and not exception;
+
+        -- Update LR on the next cycle after a branch link
+        -- If we're not writing back anything else, we can write back LR
+        -- this cycle, otherwise we take an extra cycle.  We use the
+        -- exc_write path since next_nia is written through that path
+        -- in other places.
+        if v.e.valid = '1' and exception = '0' and current.lr = '1' then
+            if current.write_reg_enable = '0' then
+                v.e.exc_write_enable := '1';
+                v.e.exc_write_data := next_nia;
+                v.e.exc_write_reg := fast_spr_num(SPR_LR);
+            else
+                v.lr_update := '1';
+                v.e.valid := '0';
+                report "Delayed LR update to " & to_hstring(next_nia);
+                v.busy := '1';
+            end if;
+        end if;
+	if r.lr_update = '1' then
+            v.e.exc_write_enable := '1';
+	    v.e.exc_write_data := r.next_lr;
+	    v.e.exc_write_reg := fast_spr_num(SPR_LR);
+	    v.e.valid := '1';
+        end if;
+
+        -- Defer completion for one cycle when redirecting.
+        -- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1
         if v.redirect = '1' then
             v.busy := '1';
             v.e.valid := '0';
         end if;
+        if r.redirect = '1' then
+            v.e.valid := '1';
+        end if;
 
         -- Outputs to fetch1
         f.redirect := r.redirect;

From f7b855dfc36cd1d916e019ab31edbcc679077255 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 28 Sep 2020 14:04:08 +1000
Subject: [PATCH 8/9] execute1: Improve timing on comparisons

Using the main adder for comparisons has the disadvantage of creating
a long path from the CA/OV bit forwarding to v.busy via the carry
input of the adder, the comparison result, and the logic that
determines whether a trap instruction would trap.  Instead we now have
dedicated comparators for the upper and lower parts of a_in vs. b_in
(bits 62:31 and bits 30:0, with the MSB of the comparison handled
separately), and combine their results to get the signed and unsigned
comparison results.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 execute1.vhdl | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/execute1.vhdl b/execute1.vhdl
index 6a27ee8..3385455 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -356,6 +356,8 @@ begin
         variable zerohi, zerolo : std_ulogic;
         variable msb_a, msb_b : std_ulogic;
         variable a_lt : std_ulogic;
+        variable a_lt_lo : std_ulogic;
+        variable a_lt_hi : std_ulogic;
         variable lv : Execute1ToLoadstore1Type;
 	variable irq_valid : std_ulogic;
 	variable exception : std_ulogic;
@@ -612,24 +614,32 @@ begin
             -- values are equal
             trapval := "00100";
         else
+            a_lt_lo := '0';
+            a_lt_hi := '0';
+            if unsigned(a_in(30 downto 0)) < unsigned(b_in(30 downto 0)) then
+                a_lt_lo := '1';
+            end if;
+            if unsigned(a_in(62 downto 31)) < unsigned(b_in(62 downto 31)) then
+                a_lt_hi := '1';
+            end if;
             if l = '1' then
                 -- 64-bit comparison
                 msb_a := a_in(63);
                 msb_b := b_in(63);
+                a_lt := a_lt_hi or (zerohi and (a_in(31) xnor b_in(31)) and a_lt_lo);
             else
                 -- 32-bit comparison
                 msb_a := a_in(31);
                 msb_b := b_in(31);
+                a_lt := a_lt_lo;
             end if;
             if msb_a /= msb_b then
-                -- Subtraction might overflow, but
-                -- comparison is clear from MSB difference.
+                -- Comparison is clear from MSB difference.
                 -- for signed, 0 is greater; for unsigned, 1 is greater
                 trapval := msb_a & msb_b & '0' & msb_b & msb_a;
             else
-                -- Subtraction cannot overflow since MSBs are equal.
-                -- carry = 1 indicates RA is smaller (signed or unsigned)
-                a_lt := (not l and carry_32) or (l and carry_64);
+                -- MSBs are equal, so signed and unsigned comparisons give the
+                -- same answer.
                 trapval := a_lt & not a_lt & '0' & a_lt & not a_lt;
             end if;
         end if;

From 0fb207be606969e7fb8b55241461596c2792c3dc Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 19 Dec 2020 09:25:04 +1100
Subject: [PATCH 9/9] fetch1: Implement a simple branch target cache

This implements a cache in fetch1, where each entry stores the address
of a simple branch instruction (b or bc) and the target of the branch.
When fetching sequentially, if the address being fetched matches the
cache entry, then fetching will be redirected to the branch target.
The cache has 1024 entries and is direct-mapped, i.e. indexed by bits
11..2 of the NIA.

The bus from execute1 now carries information about taken and
not-taken simple branches, which fetch1 uses to update the cache.
The cache entry is updated for both taken and not-taken branches, with
the valid bit being set if the branch was taken and cleared if the
branch was not taken.

If fetching is redirected to the branch target then that goes down the
pipe as a predicted-taken branch, and decode1 does not do any static
branch prediction.  If fetching is not redirected, then the next
instruction goes down the pipe as normal and decode1 does its static
branch prediction.

In order to meet timing, the lookup of the cache is pipelined, so on
each cycle the cache entry for the current NIA + 8 is read.  This
means that after a redirect (from decode1 or execute1), only the third
and subsequent sequentially-fetched instructions can be predicted.

This improves the coremark value on the Arty A7-100 from about 180 to
about 190 (more than 5%).

The BTC is optional.  Builds for the Artix 7 35-T part have it off by
default because the extra ~1420 LUTs it takes mean that the design
doesn't fit on the Arty A7-35 board.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl               |   8 ++-
 core.vhdl                 |   5 +-
 decode1.vhdl              |   5 +-
 execute1.vhdl             |  17 +++++-
 fetch1.vhdl               | 119 +++++++++++++++++++++++++++++++-------
 fpga/top-arty.vhdl        |   2 +
 fpga/top-generic.vhdl     |   2 +
 fpga/top-nexys-video.vhdl |   2 +
 icache.vhdl               |   1 +
 microwatt.core            |  14 +++++
 soc.vhdl                  |   2 +
 11 files changed, 152 insertions(+), 25 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index d085199..7bf8277 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -155,6 +155,7 @@ package common is
         big_endian : std_ulogic;
 	stop_mark: std_ulogic;
         sequential: std_ulogic;
+        predicted : std_ulogic;
 	nia: std_ulogic_vector(63 downto 0);
     end record;
 
@@ -165,6 +166,7 @@ package common is
 	nia: std_ulogic_vector(63 downto 0);
 	insn: std_ulogic_vector(31 downto 0);
         big_endian: std_ulogic;
+        next_predicted: std_ulogic;
     end record;
 
     type Decode1ToDecode2Type is record
@@ -308,10 +310,14 @@ package common is
         big_endian: std_ulogic;
         mode_32bit: std_ulogic;
 	redirect_nia: std_ulogic_vector(63 downto 0);
+        br_nia : std_ulogic_vector(63 downto 0);
+        br_last : std_ulogic;
+        br_taken : std_ulogic;
     end record;
     constant Execute1ToFetch1Init : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0',
                                                              priv_mode => '0', big_endian => '0',
-                                                             mode_32bit => '0', others => (others => '0'));
+                                                             mode_32bit => '0', br_taken => '0',
+                                                             br_last => '0', others => (others => '0'));
 
     type Execute1ToLoadstore1Type is record
 	valid : std_ulogic;
diff --git a/core.vhdl b/core.vhdl
index bc32a8c..3948b86 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -12,6 +12,7 @@ entity core is
 	DISABLE_FLATTEN : boolean := false;
         EX1_BYPASS : boolean := true;
         HAS_FPU : boolean := true;
+        HAS_BTC : boolean := true;
 	ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
         LOG_LENGTH : natural := 512
         );
@@ -187,7 +188,8 @@ begin
     fetch1_0: entity work.fetch1
         generic map (
             RESET_ADDRESS => (others => '0'),
-	    ALT_RESET_ADDRESS => ALT_RESET_ADDRESS
+	    ALT_RESET_ADDRESS => ALT_RESET_ADDRESS,
+            HAS_BTC => HAS_BTC
             )
         port map (
             clk => clk,
@@ -195,6 +197,7 @@ begin
 	    alt_reset_in => alt_reset_d,
             stall_in => fetch1_stall_in,
             flush_in => fetch1_flush,
+            inval_btc => ex1_icache_inval or mmu_to_icache.tlbie,
 	    stop_in => dbg_core_stop,
             d_in => decode1_to_fetch1,
             e_in => execute1_to_fetch1,
diff --git a/decode1.vhdl b/decode1.vhdl
index 2edacd3..ebe59be 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -727,7 +727,10 @@ begin
             bv.br_nia := (others => '0');
         end if;
         bv.br_offset := br_offset;
-        bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out;
+        if f_in.next_predicted = '1' then
+            v.br_pred := '1';
+        end if;
+        bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted;
         -- after a clock edge...
         br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset);
 
diff --git a/execute1.vhdl b/execute1.vhdl
index 3385455..25b1dc7 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -68,6 +68,8 @@ architecture behaviour of execute1 is
         last_nia : std_ulogic_vector(63 downto 0);
         redirect : std_ulogic;
         abs_br : std_ulogic;
+        taken_br : std_ulogic;
+        br_last : std_ulogic;
         do_intr : std_ulogic;
         vector : integer range 0 to 16#fff#;
         br_offset : std_ulogic_vector(63 downto 0);
@@ -81,7 +83,7 @@ architecture behaviour of execute1 is
          fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
          mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
          next_lr => (others => '0'), last_nia => (others => '0'),
-         redirect => '0', abs_br => '0', do_intr => '0', vector => 0,
+         redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0,
          br_offset => (others => '0'), redir_mode => "0000",
          others => (others => '0'));
 
@@ -365,6 +367,7 @@ begin
         variable trapval : std_ulogic_vector(4 downto 0);
         variable illegal : std_ulogic;
         variable is_branch : std_ulogic;
+        variable is_direct_branch : std_ulogic;
         variable taken_branch : std_ulogic;
         variable abs_branch : std_ulogic;
         variable spr_val : std_ulogic_vector(63 downto 0);
@@ -377,6 +380,7 @@ begin
 	sum_with_carry := (others => '0');
 	newcrf := (others => '0');
         is_branch := '0';
+        is_direct_branch := '0';
         taken_branch := '0';
         abs_branch := '0';
         hold_wr_data := '0';
@@ -390,6 +394,8 @@ begin
         v.br_offset := (others => '0');
         v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) &
                         not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF);
+        v.taken_br := '0';
+        v.br_last := '0';
 
         lv := Execute1ToLoadstore1Init;
         fv := Execute1ToFPUInit;
@@ -843,6 +849,7 @@ begin
 	    when OP_B =>
                 is_branch := '1';
                 taken_branch := '1';
+                is_direct_branch := '1';
                 abs_branch := insn_aa(e_in.insn);
                 if ctrl.msr(MSR_BE) = '1' then
                     do_trace := '1';
@@ -852,6 +859,7 @@ begin
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
                 is_branch := '1';
+                is_direct_branch := '1';
 		taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
                 abs_branch := insn_aa(e_in.insn);
                 if ctrl.msr(MSR_BE) = '1' then
@@ -1093,7 +1101,7 @@ begin
                 if taken_branch = '1' then
                     ctrl_tmp.cfar <= e_in.nia;
                 end if;
-                if e_in.br_pred = '0' then
+                if taken_branch = '1' then
                     v.br_offset := b_in;
                     v.abs_br := abs_branch;
                 else
@@ -1102,6 +1110,8 @@ begin
                 if taken_branch /= e_in.br_pred then
                     v.redirect := '1';
                 end if;
+                v.br_last := is_direct_branch;
+                v.taken_br := taken_branch;
             end if;
 
         elsif valid_in = '1' and exception = '0' and illegal = '0' then
@@ -1300,6 +1310,9 @@ begin
 
         -- Outputs to fetch1
         f.redirect := r.redirect;
+        f.br_nia := r.last_nia;
+        f.br_last := r.br_last and not r.do_intr;
+        f.br_taken := r.taken_br;
         if r.do_intr = '1' then
             f.redirect_nia := std_ulogic_vector(to_unsigned(r.vector, 64));
             f.virt_mode := '0';
diff --git a/fetch1.vhdl b/fetch1.vhdl
index 3c9d946..8ca7e57 100644
--- a/fetch1.vhdl
+++ b/fetch1.vhdl
@@ -8,7 +8,8 @@ use work.common.all;
 entity fetch1 is
     generic(
 	RESET_ADDRESS     : std_logic_vector(63 downto 0) := (others => '0');
-	ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0')
+	ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0');
+        HAS_BTC           : boolean := true
 	);
     port(
 	clk           : in std_ulogic;
@@ -17,6 +18,7 @@ entity fetch1 is
 	-- Control inputs:
 	stall_in      : in std_ulogic;
 	flush_in      : in std_ulogic;
+        inval_btc     : in std_ulogic;
 	stop_in       : in std_ulogic;
 	alt_reset_in  : in std_ulogic;
 
@@ -37,10 +39,25 @@ end entity fetch1;
 architecture behaviour of fetch1 is
     type reg_internal_t is record
         mode_32bit: std_ulogic;
+        rd_is_niap4: std_ulogic;
+        predicted: std_ulogic;
+        predicted_nia: std_ulogic_vector(63 downto 0);
     end record;
     signal r, r_next : Fetch1ToIcacheType;
     signal r_int, r_next_int : reg_internal_t;
+    signal advance_nia : std_ulogic;
     signal log_nia : std_ulogic_vector(42 downto 0);
+
+    constant BTC_ADDR_BITS : integer := 10;
+    constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS;
+    constant BTC_TARGET_BITS : integer := 62;
+    constant BTC_SIZE : integer := 2 ** BTC_ADDR_BITS;
+    constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS;
+    type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0);
+
+    signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0');
+    signal btc_rd_valid : std_ulogic := '0';
+
 begin
 
     regs : process(clk)
@@ -56,15 +73,70 @@ begin
 		    " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) &
 		    " S:" & std_ulogic'image(stall_in) &
 		    " T:" & std_ulogic'image(stop_in) &
-		    " nia:" & to_hstring(r_next.nia) &
-		    " SM:" & std_ulogic'image(r_next.stop_mark);
+		    " nia:" & to_hstring(r_next.nia);
 	    end if;
-	    r <= r_next;
-	    r_int <= r_next_int;
+            if rst = '1' or e_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then
+                r.virt_mode <= r_next.virt_mode;
+                r.priv_mode <= r_next.priv_mode;
+                r.big_endian <= r_next.big_endian;
+                r_int.mode_32bit <= r_next_int.mode_32bit;
+            end if;
+            if advance_nia = '1' then
+                r.predicted <= r_next.predicted;
+                r.nia <= r_next.nia;
+                r_int.predicted <= r_next_int.predicted;
+                r_int.predicted_nia <= r_next_int.predicted_nia;
+                r_int.rd_is_niap4 <= r_next.sequential;
+            end if;
+            r.sequential <= r_next.sequential and advance_nia;
+            -- always send the up-to-date stop mark and req
+            r.stop_mark <= stop_in;
+            r.req <= not rst;
 	end if;
     end process;
     log_out <= log_nia;
 
+    btc : if HAS_BTC generate
+        signal btc_memory : btc_mem_type;
+        attribute ram_style : string;
+        attribute ram_style of btc_memory : signal is "block";
+
+        signal btc_valids : std_ulogic_vector(BTC_SIZE - 1 downto 0);
+        attribute ram_style of btc_valids : signal is "distributed";
+
+        signal btc_wr : std_ulogic;
+        signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0);
+        signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0);
+        signal btc_wr_v : std_ulogic;
+    begin
+        btc_wr_data <= e_in.br_nia(63 downto BTC_ADDR_BITS + 2) &
+                       e_in.redirect_nia(63 downto 2);
+        btc_wr_addr <= e_in.br_nia(BTC_ADDR_BITS + 1 downto 2);
+        btc_wr <= e_in.br_last;
+        btc_wr_v <= e_in.br_taken;
+
+        btc_ram : process(clk)
+            variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0);
+        begin
+            if rising_edge(clk) then
+                raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) +
+                         to_unsigned(2, BTC_ADDR_BITS);
+                if advance_nia = '1' then
+                    btc_rd_data <= btc_memory(to_integer(raddr));
+                    btc_rd_valid <= btc_valids(to_integer(raddr));
+                end if;
+                if btc_wr = '1' then
+                    btc_memory(to_integer(unsigned(btc_wr_addr))) <= btc_wr_data;
+                end if;
+                if inval_btc = '1' or rst = '1' then
+                    btc_valids <= (others => '0');
+                elsif btc_wr = '1' then
+                    btc_valids(to_integer(unsigned(btc_wr_addr))) <= btc_wr_v;
+                end if;
+            end if;
+        end process;
+    end generate;
+
     comb : process(all)
 	variable v : Fetch1ToIcacheType;
 	variable v_int : reg_internal_t;
@@ -72,6 +144,8 @@ begin
 	v := r;
 	v_int := r_int;
         v.sequential := '0';
+        v.predicted := '0';
+        v_int.predicted := '0';
 
 	if rst = '1' then
 	    if alt_reset_in = '1' then
@@ -83,6 +157,7 @@ begin
             v.priv_mode := '1';
             v.big_endian := '0';
             v_int.mode_32bit := '0';
+            v_int.predicted_nia := (others => '0');
 	elsif e_in.redirect = '1' then
 	    v.nia := e_in.redirect_nia(63 downto 2) & "00";
             if e_in.mode_32bit = '1' then
@@ -97,22 +172,26 @@ begin
             if r_int.mode_32bit = '1' then
                 v.nia(63 downto 32) := (others => '0');
             end if;
-	elsif stall_in = '0' then
-
-            -- If the last NIA value went down with a stop mark, it didn't get
-            -- executed, and hence we shouldn't increment NIA.
-	    if r.stop_mark = '0' then
-                if r_int.mode_32bit = '0' then
-                    v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
-                else
-                    v.nia := x"00000000" & std_ulogic_vector(unsigned(r.nia(31 downto 0)) + 4);
-                end if;
-                v.sequential := '1';
-	    end if;
-	end if;
+        elsif r_int.predicted = '1' then
+            v.nia := r_int.predicted_nia;
+            v.predicted := '1';
+        else
+            v.sequential := '1';
+            v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
+            if r_int.mode_32bit = '1' then
+                v.nia(63 downto 32) := x"00000000";
+            end if;
+            if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and
+                btc_rd_data(BTC_WIDTH - 1 downto BTC_TARGET_BITS)
+                = v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then
+                v_int.predicted := '1';
+            end if;
+        end if;
+        v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00";
 
-	v.req := not rst and not stop_in;
-	v.stop_mark := stop_in;
+        -- If the last NIA value went down with a stop mark, it didn't get
+        -- executed, and hence we shouldn't increment NIA.
+        advance_nia <= rst or e_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in);
 
 	r_next <= v;
 	r_next_int <= v_int;
diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl
index 8a3dc7a..68d1e89 100644
--- a/fpga/top-arty.vhdl
+++ b/fpga/top-arty.vhdl
@@ -15,6 +15,7 @@ entity toplevel is
         RESET_LOW          : boolean  := true;
         CLK_FREQUENCY      : positive := 100000000;
         HAS_FPU            : boolean  := true;
+        HAS_BTC            : boolean  := true;
         USE_LITEDRAM       : boolean  := false;
         NO_BRAM            : boolean  := false;
         DISABLE_FLATTEN_CORE : boolean := false;
@@ -170,6 +171,7 @@ begin
             SIM                => false,
             CLK_FREQ           => CLK_FREQUENCY,
             HAS_FPU            => HAS_FPU,
+            HAS_BTC            => HAS_BTC,
             HAS_DRAM           => USE_LITEDRAM,
             DRAM_SIZE          => 256 * 1024 * 1024,
             DRAM_INIT_SIZE     => PAYLOAD_SIZE,
diff --git a/fpga/top-generic.vhdl b/fpga/top-generic.vhdl
index d5219ff..8bff5bb 100644
--- a/fpga/top-generic.vhdl
+++ b/fpga/top-generic.vhdl
@@ -12,6 +12,7 @@ entity toplevel is
 	CLK_INPUT     : positive := 100000000;
 	CLK_FREQUENCY : positive := 100000000;
         HAS_FPU       : boolean  := true;
+        HAS_BTC       : boolean  := false;
         LOG_LENGTH    : natural := 512;
 	DISABLE_FLATTEN_CORE : boolean := false;
         UART_IS_16550 : boolean  := true
@@ -71,6 +72,7 @@ begin
 	    SIM           => false,
 	    CLK_FREQ      => CLK_FREQUENCY,
             HAS_FPU       => HAS_FPU,
+            HAS_BTC       => HAS_BTC,
             LOG_LENGTH    => LOG_LENGTH,
 	    DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
             UART0_IS_16550     => UART_IS_16550
diff --git a/fpga/top-nexys-video.vhdl b/fpga/top-nexys-video.vhdl
index 1942b10..86bdd11 100644
--- a/fpga/top-nexys-video.vhdl
+++ b/fpga/top-nexys-video.vhdl
@@ -15,6 +15,7 @@ entity toplevel is
 	RESET_LOW     : boolean  := true;
 	CLK_FREQUENCY : positive := 100000000;
         HAS_FPU       : boolean  := true;
+        HAS_BTC       : boolean  := true;
 	USE_LITEDRAM  : boolean  := false;
 	NO_BRAM       : boolean  := false;
 	DISABLE_FLATTEN_CORE : boolean := false;
@@ -122,6 +123,7 @@ begin
 	    SIM           => false,
 	    CLK_FREQ      => CLK_FREQUENCY,
             HAS_FPU       => HAS_FPU,
+            HAS_BTC       => HAS_BTC,
 	    HAS_DRAM      => USE_LITEDRAM,
 	    DRAM_SIZE     => 512 * 1024 * 1024,
             DRAM_INIT_SIZE => PAYLOAD_SIZE,
diff --git a/icache.vhdl b/icache.vhdl
index 37a230d..a658783 100644
--- a/icache.vhdl
+++ b/icache.vhdl
@@ -565,6 +565,7 @@ begin
 	i_out.stop_mark <= r.hit_smark;
         i_out.fetch_failed <= r.fetch_failed;
         i_out.big_endian <= r.big_endian;
+        i_out.next_predicted <= i_in.predicted;
 
 	-- Stall fetch1 if we have a miss on cache or TLB or a protection fault
 	stall_out <= not (is_hit and access_ok);
diff --git a/microwatt.core b/microwatt.core
index 7f2068d..41b6230 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -134,6 +134,7 @@ targets:
       - log_length=2048
       - uart_is_16550
       - has_fpu
+      - has_btc
     tools:
       vivado: {part : xc7a100tcsg324-1}
     toplevel : toplevel
@@ -218,6 +219,7 @@ targets:
       - log_length=2048
       - uart_is_16550
       - has_fpu
+      - has_btc
     tools:
       vivado: {part : xc7a200tsbg484-1}
     toplevel : toplevel
@@ -235,6 +237,7 @@ targets:
       - log_length=2048
       - uart_is_16550
       - has_fpu
+      - has_btc
     generate: [litedram_nexys_video]
     tools:
       vivado: {part : xc7a200tsbg484-1}
@@ -254,6 +257,7 @@ targets:
       - uart_is_16550
       - has_uart1
       - has_fpu=false
+      - has_btc=false
     tools:
       vivado: {part : xc7a35ticsg324-1L}
     toplevel : toplevel
@@ -273,6 +277,7 @@ targets:
       - uart_is_16550
       - has_uart1
       - has_fpu=false
+      - has_btc=false
     generate: [litedram_arty, liteeth_arty]
     tools:
       vivado: {part : xc7a35ticsg324-1L}
@@ -292,6 +297,7 @@ targets:
       - uart_is_16550
       - has_uart1
       - has_fpu
+      - has_btc
     tools:
       vivado: {part : xc7a100ticsg324-1L}
     toplevel : toplevel
@@ -311,6 +317,7 @@ targets:
       - uart_is_16550
       - has_uart1
       - has_fpu
+      - has_btc
     generate: [litedram_arty, liteeth_arty]
     tools:
       vivado: {part : xc7a100ticsg324-1L}
@@ -329,6 +336,7 @@ targets:
       - log_length=512
       - uart_is_16550
       - has_fpu=false
+      - has_btc=false
     tools:
       vivado: {part : xc7a35tcpg236-1}
     toplevel : toplevel
@@ -395,6 +403,12 @@ parameters:
     paramtype   : generic
     default     : true
 
+  has_btc:
+    datatype    : bool
+    description : Include a branch target cache in the core
+    paramtype   : generic
+    default     : true
+
   disable_flatten_core:
     datatype    : bool
     description : Prevent Vivado from flattening the main core components
diff --git a/soc.vhdl b/soc.vhdl
index e4a7895..77f229e 100644
--- a/soc.vhdl
+++ b/soc.vhdl
@@ -53,6 +53,7 @@ entity soc is
 	CLK_FREQ           : positive;
 	SIM                : boolean;
         HAS_FPU            : boolean := true;
+        HAS_BTC            : boolean := true;
 	DISABLE_FLATTEN_CORE : boolean := false;
 	HAS_DRAM           : boolean  := false;
 	DRAM_SIZE          : integer := 0;
@@ -255,6 +256,7 @@ begin
 	generic map(
 	    SIM => SIM,
             HAS_FPU => HAS_FPU,
+            HAS_BTC => HAS_BTC,
 	    DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
 	    ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'),
             LOG_LENGTH => LOG_LENGTH