From 1249a11349af27a7b013713a17da6b823d1c7951 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 30 Oct 2019 13:26:43 +1100
Subject: [PATCH 1/9] cr_file: Check write_cr_enable

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 cr_file.vhdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cr_file.vhdl b/cr_file.vhdl
index fa56dd9..d8ce230 100644
--- a/cr_file.vhdl
+++ b/cr_file.vhdl
@@ -43,8 +43,8 @@ begin
         if rising_edge(clk) then
             if w_in.write_cr_enable = '1' then
                 report "Writing " & to_hstring(w_in.write_cr_data) & " to CR mask " & to_hstring(w_in.write_cr_mask);
+		crs <= crs_updated;
             end if;
-            crs <= crs_updated;
         end if;
     end process;
 

From f291efa26690d28c043a9de1a3d6c538d8278d79 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 14 Nov 2019 15:25:28 +1100
Subject: [PATCH 2/9] decode1: Mark ALU ops using carry as pipelined

There is no reason not to that I can think of

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/decode1.vhdl b/decode1.vhdl
index 1a7bc0b..23b02fe 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -36,8 +36,8 @@ architecture behaviour of decode1 is
         constant major_decode_rom_array : major_rom_array_t := (
 		--          unit     internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
 		--                        op                                            in   out   A   out  in    out  len        ext                                 pipe
-		12 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- addic
-		13 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '1'), -- addic.
+		12 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addic
+		13 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- addic.
                 14 =>       (ALU,    OP_ADD,       RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addi
 		15 =>       (ALU,    OP_ADD,       RA_OR_ZERO, CONST_SI_HI, NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addis
 		28 =>       (ALU,    OP_AND,       NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- andi.
@@ -133,10 +133,10 @@ architecture behaviour of decode1 is
 		--                       unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
 		--                                    op                                            in   out   A   out  in    out  len        ext                                 pipe
 		2#0100001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- add
-		2#0000001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- addc
-		2#0010001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- adde
-		2#0011101010#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- addme
-		2#0011001010#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- addze
+		2#0000001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addc
+		2#0010001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- adde
+		2#0011101010#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addme
+		2#0011001010#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addze
 		2#0000011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- and
 		2#0000111100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- andc
 		-- 2#0011111100# bperm
@@ -278,10 +278,10 @@ architecture behaviour of decode1 is
 		2#0010110111#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwux
 		2#0010010111#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwx
 		2#0000101000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subf
-		2#0000001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- subfc
-		2#0010001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- subfe
-		2#0011101000#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- subfme
-		2#0011001000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- subfze
+		2#0000001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfc
+		2#0010001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfe
+		2#0011101000#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfme
+		2#0011001000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfze
 		2#1001010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sync
 		-- 2#0001000100# td
 		2#0000000100#  =>       (ALU,    OP_TW,        RA,         RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tw

From 501b6daf9b9dbc43f3278fa5885efa9216686899 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 30 Oct 2019 13:53:23 +1100
Subject: [PATCH 3/9] Add basic XER support

The carry is currently internal to execute1. We don't handle any of
the other XER fields.

This creates a type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).

The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in later commits.

We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all types of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).

To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.

One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.

I doubt there exists any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remains single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.

Missing some obscure XER affecting instructions like addex or mcrxrx.

[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
 arguments to set_ov]

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl       |  75 ++++++++++++++++++++++++++------
 cr_file.vhdl      |  16 ++++++-
 dcache.vhdl       |   4 ++
 decode2.vhdl      |  26 +++++++++++
 divider.vhdl      |  23 ++++++++--
 execute1.vhdl     | 107 +++++++++++++++++++++++++++++++++++++++++-----
 insn_helpers.vhdl |   6 +++
 loadstore1.vhdl   |   1 +
 multiply.vhdl     |  35 ++++++++++++++-
 writeback.vhdl    |  45 ++++++++++++++-----
 10 files changed, 298 insertions(+), 40 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 44198b0..ea0ec1d 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -12,15 +12,28 @@ package common is
 
     function decode_spr_num(insn: std_ulogic_vector(31 downto 0)) return spr_num_t;
 
+    constant SPR_XER   : spr_num_t := 1;
     constant SPR_LR    : spr_num_t := 8;
     constant SPR_CTR   : spr_num_t := 9;
     constant SPR_TB    : spr_num_t := 268;
 
+    -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are
+    -- in the CR file as a kind of CR extension (with a separate write
+    -- control). The rest is stored as a fast SPR.
+    type xer_common_t is record
+	ca : std_ulogic;
+	ca32 : std_ulogic;
+	ov : std_ulogic;
+	ov32 : std_ulogic;
+	so : std_ulogic;
+    end record;
+    constant xerc_init : xer_common_t := (others => '0');
+
+    -- This needs to die...
     type ctrl_t is record
 	lr: std_ulogic_vector(63 downto 0);
 	ctr: std_ulogic_vector(63 downto 0);
 	tb: std_ulogic_vector(63 downto 0);
-	carry: std_ulogic;
     end record;
 
     type Fetch1ToIcacheType is record
@@ -64,8 +77,10 @@ package common is
 	read_data2: std_ulogic_vector(63 downto 0);
 	read_data3: std_ulogic_vector(63 downto 0);
 	cr: std_ulogic_vector(31 downto 0);
+	xerc: xer_common_t;
 	lr: std_ulogic;
 	rc: std_ulogic;
+	oe: std_ulogic;
 	invert_a: std_ulogic;
 	invert_out: std_ulogic;
 	input_carry: carry_in_t;
@@ -78,9 +93,9 @@ package common is
 	data_len: std_ulogic_vector(3 downto 0);
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
-	(valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', invert_a => '0',
+	(valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', oe => '0', invert_a => '0',
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
-	 is_32bit => '0', is_signed => '0', others => (others => '0'));
+	 is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0'));
 
     type Decode2ToMultiplyType is record
 	valid: std_ulogic;
@@ -89,8 +104,13 @@ package common is
 	data1: std_ulogic_vector(64 downto 0);
 	data2: std_ulogic_vector(64 downto 0);
 	rc: std_ulogic;
+	oe: std_ulogic;
+	is_32bit: std_ulogic;
+	xerc: xer_common_t;
     end record;
-    constant Decode2ToMultiplyInit : Decode2ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0', others => (others => '0'));
+    constant Decode2ToMultiplyInit : Decode2ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0',
+							       oe => '0', is_32bit => '0', xerc => xerc_init,
+							       others => (others => '0'));
 
     type Decode2ToDividerType is record
 	valid: std_ulogic;
@@ -102,8 +122,13 @@ package common is
 	is_extended: std_ulogic;
 	is_modulus: std_ulogic;
 	rc: std_ulogic;
+	oe: std_ulogic;
+	xerc: xer_common_t;
     end record;
-    constant Decode2ToDividerInit: Decode2ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', is_extended => '0', is_modulus => '0', rc => '0', others => (others => '0'));
+    constant Decode2ToDividerInit: Decode2ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0',
+							    is_extended => '0', is_modulus => '0',
+							    rc => '0', oe => '0', xerc => xerc_init,
+							    others => (others => '0'));
 
     type Decode2ToRegisterFileType is record
 	read1_enable : std_ulogic;
@@ -126,6 +151,7 @@ package common is
 
     type CrFileToDecode2Type is record
 	read_cr_data   : std_ulogic_vector(31 downto 0);
+	read_xerc_data : xer_common_t;
     end record;
 
     type Execute1ToFetch1Type is record
@@ -146,8 +172,11 @@ package common is
 	sign_extend : std_ulogic;			-- do we need to sign extend?
 	update : std_ulogic;				-- is this an update instruction?
 	update_reg : std_ulogic_vector(4 downto 0);	-- if so, the register to update
+	xerc : xer_common_t;
     end record;
-    constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0'));
+    constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0',
+								   sign_extend => '0', update => '0', xerc => xerc_init,
+								   others => (others => '0'));
 
     type Loadstore1ToDcacheType is record
 	valid : std_ulogic;
@@ -161,6 +190,7 @@ package common is
 	sign_extend : std_ulogic;
 	update : std_ulogic;
 	update_reg : std_ulogic_vector(4 downto 0);
+	xerc : xer_common_t;
     end record;
 
     type DcacheToWritebackType is record
@@ -173,8 +203,11 @@ package common is
 	sign_extend : std_ulogic;
 	byte_reverse : std_ulogic;
 	second_word : std_ulogic;
+	xerc : xer_common_t;
     end record;
-    constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', byte_reverse => '0', second_word => '0', others => (others => '0'));
+    constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0',
+							       byte_reverse => '0', second_word => '0', xerc => xerc_init,
+							       others => (others => '0'));
 
     type Execute1ToWritebackType is record
 	valid: std_ulogic;
@@ -186,9 +219,14 @@ package common is
 	write_cr_enable : std_ulogic;
 	write_cr_mask : std_ulogic_vector(7 downto 0);
 	write_cr_data : std_ulogic_vector(31 downto 0);
+	write_xerc_enable : std_ulogic;
+	xerc : xer_common_t;
 	sign_extend: std_ulogic;
     end record;
-    constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', write_enable => '0', write_cr_enable => '0', sign_extend => '0', others => (others => '0'));
+    constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', write_enable => '0',
+								   write_cr_enable => '0', sign_extend => '0',
+								   write_xerc_enable => '0', xerc => xerc_init,
+								   others => (others => '0'));
 
     type MultiplyToWritebackType is record
 	valid: std_ulogic;
@@ -196,9 +234,14 @@ package common is
 	write_reg_enable : std_ulogic;
 	write_reg_nr: std_ulogic_vector(4 downto 0);
 	write_reg_data: std_ulogic_vector(63 downto 0);
+	write_xerc_enable : std_ulogic;
+	xerc : xer_common_t;
 	rc: std_ulogic;
     end record;
-    constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0', rc => '0', others => (others => '0'));
+    constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0',
+								   rc => '0', write_xerc_enable => '0',
+								   xerc => xerc_init,
+								   others => (others => '0'));
 
     type DividerToWritebackType is record
 	valid: std_ulogic;
@@ -206,9 +249,14 @@ package common is
 	write_reg_enable : std_ulogic;
 	write_reg_nr: std_ulogic_vector(4 downto 0);
 	write_reg_data: std_ulogic_vector(63 downto 0);
+	write_xerc_enable : std_ulogic;
+	xerc : xer_common_t;
 	rc: std_ulogic;
     end record;
-    constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0', rc => '0', others => (others => '0'));
+    constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0',
+								 rc => '0', write_xerc_enable => '0',
+								 xerc => xerc_init,
+								 others => (others => '0'));
 
     type WritebackToRegisterFileType is record
 	write_reg : std_ulogic_vector(4 downto 0);
@@ -221,9 +269,12 @@ package common is
 	write_cr_enable : std_ulogic;
 	write_cr_mask : std_ulogic_vector(7 downto 0);
 	write_cr_data : std_ulogic_vector(31 downto 0);
+	write_xerc_enable : std_ulogic;
+	write_xerc_data : xer_common_t;
     end record;
-    constant WritebackToCrFileInit : WritebackToCrFileType := (write_cr_enable => '0', others => (others => '0'));
-
+    constant WritebackToCrFileInit : WritebackToCrFileType := (write_cr_enable => '0', write_xerc_enable => '0',
+							       write_xerc_data => xerc_init,
+							       others => (others => '0'));
 end common;
 
 package body common is
diff --git a/cr_file.vhdl b/cr_file.vhdl
index d8ce230..dcd21be 100644
--- a/cr_file.vhdl
+++ b/cr_file.vhdl
@@ -18,7 +18,9 @@ end entity cr_file;
 
 architecture behaviour of cr_file is
     signal crs : std_ulogic_vector(31 downto 0) := (others => '0');
-    signal crs_updated : std_ulogic_vector(31 downto 0) := (others => '0');
+    signal crs_updated : std_ulogic_vector(31 downto 0);
+    signal xerc : xer_common_t := xerc_init;
+    signal xerc_updated : xer_common_t;
 begin
     cr_create_0: process(all)
         variable hi, lo : integer := 0;
@@ -35,6 +37,13 @@ begin
         end loop;
 
         crs_updated <= cr_tmp;
+
+	if w_in.write_xerc_enable = '1' then
+	    xerc_updated <= w_in.write_xerc_data;
+	else
+	    xerc_updated <= xerc;
+	end if;
+
     end process;
 
     -- synchronous writes
@@ -45,6 +54,10 @@ begin
                 report "Writing " & to_hstring(w_in.write_cr_data) & " to CR mask " & to_hstring(w_in.write_cr_mask);
 		crs <= crs_updated;
             end if;
+	    if w_in.write_xerc_enable = '1' then
+                report "Writing XERC";
+		xerc <= xerc_updated;
+	    end if;
         end if;
     end process;
 
@@ -56,5 +69,6 @@ begin
             report "Reading CR " & to_hstring(crs_updated);
         end if;
         d_out.read_cr_data <= crs_updated;
+        d_out.read_xerc_data <= xerc_updated;
     end process;
 end architecture behaviour;
diff --git a/dcache.vhdl b/dcache.vhdl
index 7d6e74c..df54c95 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -185,6 +185,7 @@ architecture rtl of dcache is
 	length         : std_ulogic_vector(3 downto 0);
 	sign_extend    : std_ulogic;
 	byte_reverse   : std_ulogic;
+	xerc           : xer_common_t;
     end record;
 
     signal r2 : reg_stage_2_t;
@@ -469,6 +470,7 @@ begin
 	d_out.sign_extend <= r2.sign_extend;
 	d_out.byte_reverse <= r2.byte_reverse;
 	d_out.second_word <= '0';
+	d_out.xerc <= r2.xerc;
 
 	-- We have a valid load or store hit or we just completed a slow
 	-- op such as a load miss, a NC load or a store
@@ -518,6 +520,7 @@ begin
 		d_out.sign_extend <= r1.req.sign_extend;
 		d_out.byte_reverse <= r1.req.byte_reverse;
 		d_out.write_len <= r1.req.length;
+		d_out.xerc <= r1.req.xerc;
 	    end if;
 
 	    -- If it's a store or a non-update load form, complete now
@@ -539,6 +542,7 @@ begin
 	    d_out.write_len <= "1000";
 	    d_out.sign_extend <= '0';
 	    d_out.byte_reverse <= '0';
+	    d_out.xerc <= r1.req.xerc;
 
 	    -- If it was a load, this completes the operation (load with
 	    -- update case).
diff --git a/decode2.vhdl b/decode2.vhdl
index 1307e7d..e9c71ba 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -131,6 +131,22 @@ architecture behaviour of decode2 is
 		end case;
 	end;
 
+	-- For now, use "rc" in the decode table to decide whether oe exists.
+	-- This is not entirely correct architecturally: For mulhd and
+	-- mulhdu, the OE field is reserved. It remains to be seen what an
+	-- actual POWER9 does if we set it on those instructions, for now we
+	-- test that further down when assigning to the multiplier oe input.
+	--
+	function decode_oe (t : rc_t; insn_in : std_ulogic_vector(31 downto 0)) return std_ulogic is
+	begin
+		case t is
+		when RC => -- decode table entry is RC: take OE from the instruction word
+			return insn_oe(insn_in);
+		when OTHERS => -- any other rc_t value: OE is reported as '0'
+			return '0';
+		end case;
+	end;
+
 	-- issue control signals
 	signal control_valid_in : std_ulogic;
 	signal control_valid_out : std_ulogic;
@@ -255,7 +271,9 @@ begin
                 v.e.read_data3 := decoded_reg_c.data;
 		v.e.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn);
 		v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
+		v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
 		v.e.cr := c_in.read_cr_data;
+		v.e.xerc := c_in.read_xerc_data;
                 v.e.invert_a := d_in.decode.invert_a;
                 v.e.invert_out := d_in.decode.invert_out;
 		v.e.input_carry := d_in.decode.input_carry;
@@ -274,6 +292,11 @@ begin
 		mul_b := decoded_reg_b.data;
 		v.m.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn);
 		v.m.rc := decode_rc(d_in.decode.rc, d_in.insn);
+		v.m.xerc := c_in.read_xerc_data;
+		if v.m.insn_type = OP_MUL_L64 then
+		  v.m.oe := decode_oe(d_in.decode.rc, d_in.insn);
+		end if;
+		v.m.is_32bit := d_in.decode.is_32bit;
 
 		if d_in.decode.is_32bit = '1' then
 			if d_in.decode.is_signed = '1' then
@@ -337,6 +360,8 @@ begin
                         end if;
                 end if;
                 v.d.rc := decode_rc(d_in.decode.rc, d_in.insn);
+		v.d.xerc := c_in.read_xerc_data;
+		v.d.oe := decode_oe(d_in.decode.rc, d_in.insn);
 
 		-- load/store unit
 		v.l.update_reg := decoded_reg_a.reg;
@@ -355,6 +380,7 @@ begin
 		v.l.byte_reverse := d_in.decode.byte_reverse;
 		v.l.sign_extend := d_in.decode.sign_extend;
 		v.l.update := d_in.decode.update;
+		v.l.xerc := c_in.read_xerc_data;
 
 		-- issue control
 		control_valid_in <= d_in.valid;
diff --git a/divider.vhdl b/divider.vhdl
index 20d4600..d632e90 100644
--- a/divider.vhdl
+++ b/divider.vhdl
@@ -36,7 +36,8 @@ architecture behaviour of divider is
     signal overflow   : std_ulogic;
     signal ovf32      : std_ulogic;
     signal did_ovf    : std_ulogic;
-
+    signal oe         : std_ulogic;
+    signal xerc       : xer_common_t;
 begin
     divider_0: process(clk)
     begin
@@ -62,6 +63,8 @@ begin
                 is_32bit <= d_in.is_32bit;
                 is_signed <= d_in.is_signed;
                 rc <= d_in.rc;
+                oe <= d_in.oe;
+		xerc <= d_in.xerc;
                 count <= "1111111";
                 running <= '1';
                 overflow <= '0';
@@ -147,13 +150,25 @@ begin
     divider_out: process(clk)
     begin
         if rising_edge(clk) then
+	    d_out.valid <= '0';
             d_out.write_reg_data <= oresult;
+	    d_out.write_reg_enable <= '0';
+	    d_out.write_xerc_enable <= '0';
+	    d_out.xerc <= xerc;
             if count = "1000000" then
                 d_out.valid <= '1';
                 d_out.write_reg_enable <= '1';
-            else
-                d_out.valid <= '0';
-                d_out.write_reg_enable <= '0';
+		d_out.write_xerc_enable <= oe;
+
+		-- We must test oe because the RC update code in writeback
+		-- will use the xerc value to set CR0:SO so we must not clobber
+		-- xerc if OE wasn't set.
+		--
+		if oe = '1' then
+		    d_out.xerc.ov <= did_ovf;
+		    d_out.xerc.ov32 <= did_ovf;
+		    d_out.xerc.so <= xerc.so or did_ovf;
+		end if;
             end if;
         end if;
     end process;
diff --git a/execute1.vhdl b/execute1.vhdl
index 862c631..2391ba2 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -31,14 +31,13 @@ end entity execute1;
 
 architecture behaviour of execute1 is
     type reg_type is record
-	--f : Execute1ToFetch1Type;
 	e : Execute1ToWritebackType;
     end record;
 
     signal r, rin : reg_type;
 
-    signal ctrl: ctrl_t := (carry => '0', others => (others => '0'));
-    signal ctrl_tmp: ctrl_t := (carry => '0', others => (others => '0'));
+    signal ctrl: ctrl_t := (others => (others => '0'));
+    signal ctrl_tmp: ctrl_t := (others => (others => '0'));
 
     signal right_shift, rot_clear_left, rot_clear_right: std_ulogic;
     signal rotator_result: std_ulogic_vector(63 downto 0);
@@ -46,17 +45,46 @@ architecture behaviour of execute1 is
     signal logical_result: std_ulogic_vector(63 downto 0);
     signal countzero_result: std_ulogic_vector(63 downto 0);
 
-    function decode_input_carry (carry_sel : carry_in_t; ca_in : std_ulogic) return std_ulogic is
+    procedure set_carry(e: inout Execute1ToWritebackType;
+			carry32 : in std_ulogic;
+			carry : in std_ulogic) is
     begin
-	case carry_sel is
+	e.xerc.ca32 := carry32;
+	e.xerc.ca := carry;
+	e.write_xerc_enable := '1';
+    end;
+
+    procedure set_ov(e: inout Execute1ToWritebackType; -- record updated in place
+		     ov   : in std_ulogic; -- value stored to e.xerc.ov
+		     ov32 : in std_ulogic) is -- value stored to e.xerc.ov32
+    begin
+	e.xerc.ov32 := ov32;
+	e.xerc.ov := ov;
+	if ov = '1' then -- SO is only ever set here, never cleared; ov32 alone does not set it
+	    e.xerc.so := '1';
+	end if;
+	e.write_xerc_enable := '1'; -- raised unconditionally, even when ov and ov32 are both '0'
+    end;
+
+    function calc_ov(msb_a : std_ulogic; msb_b: std_ulogic; -- top bits of the two addends
+		     ca: std_ulogic; msb_r: std_ulogic) return std_ulogic is -- carry out and top bit of the sum
+    begin
+	return (ca xor msb_r) and not (msb_a xor msb_b); -- '1' when msb_a = msb_b and ca differs from msb_r
+    end;
+
+    function decode_input_carry(ic : carry_in_t;
+				xerc : xer_common_t) return std_ulogic is
+    begin
+	case ic is
 	when ZERO =>
 	    return '0';
 	when CA =>
-	    return ca_in;
+	    return xerc.ca;
 	when ONE =>
 	    return '1';
 	end case;
     end;
+
 begin
 
     rotator_0: entity work.rotator
@@ -117,6 +145,7 @@ begin
 	variable bf, bfa : std_ulogic_vector(2 downto 0);
 	variable l : std_ulogic;
 	variable next_nia : std_ulogic_vector(63 downto 0);
+        variable carry_32, carry_64 : std_ulogic;
     begin
 	result := (others => '0');
 	result_with_carry := (others => '0');
@@ -125,7 +154,41 @@ begin
 
 	v := r;
 	v.e := Execute1ToWritebackInit;
-	--v.f := Execute1ToFetch1TypeInit;
+
+	-- XER forwarding. To avoid having to track XER hazards, we
+	-- use the previously latched value.
+	--
+	-- If the XER was modified by a multiply or a divide, those are
+	-- single issue, we'll get the up to date value from decode2 from
+	-- the register file.
+	--
+	-- If it was modified by an instruction older than the previous
+	-- one in EX1, it will have also hit writeback and will be up
+	-- to date in decode2.
+	--
+	-- That leaves us with the case where it was updated by the previous
+	-- instruction in EX1. In that case, we can forward it back here.
+	--
+	-- This will break if we allow pipelining of multiply and divide,
+	-- but ideally, those should go via EX1 anyway and run as a state
+	-- machine from here.
+	--
+	-- One additional hazard to beware of is an XER:SO modifying instruction
+	-- in EX1 followed immediately by a store conditional. Due to our
+	-- writeback latency, the store will go down the LSU with the previous
+	-- XER value, thus the stcx. will set CR0:SO using an obsolete SO value.
+	--
+	-- We will need to handle that if we ever make stcx. not single issue
+	--
+	-- We always pass a valid XER value down to writeback even when
+	-- we aren't updating it, in order for XER:SO -> CR0:SO transfer
+	-- to work for RC instructions.
+	--
+	if r.e.write_xerc_enable = '1' then
+	    v.e.xerc := r.e.xerc;
+	else
+	    v.e.xerc := e_in.xerc;
+	end if;
 
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
@@ -163,10 +226,18 @@ begin
 		else
 		    a_inv := not e_in.read_data1;
 		end if;
-		result_with_carry := ppc_adde(a_inv, e_in.read_data2, decode_input_carry(e_in.input_carry, ctrl.carry));
+		result_with_carry := ppc_adde(a_inv, e_in.read_data2,
+					      decode_input_carry(e_in.input_carry, v.e.xerc));
 		result := result_with_carry(63 downto 0);
-		if e_in.output_carry then
-		    ctrl_tmp.carry <= result_with_carry(64);
+                carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32);
+                carry_64 := result_with_carry(64);
+		if e_in.output_carry = '1' then
+		    set_carry(v.e, carry_32, carry_64);
+		end if;
+		if e_in.oe = '1' then
+		    set_ov(v.e,
+			   calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)),
+			   calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31)));
 		end if;
 		result_en := '1';
 	    when OP_AND | OP_OR | OP_XOR =>
@@ -270,6 +341,13 @@ begin
 		end loop;
 	    when OP_MFSPR =>
 		case decode_spr_num(e_in.insn) is
+		when SPR_XER =>
+		    result := ( 63-32 => v.e.xerc.so,
+				63-33 => v.e.xerc.ov,
+				63-34 => v.e.xerc.ca,
+				63-44 => v.e.xerc.ov32,
+				63-45 => v.e.xerc.ca32,
+				others => '0');
 		when SPR_CTR =>
 		    result := ctrl.ctr;
 		when SPR_LR =>
@@ -310,6 +388,13 @@ begin
 		v.e.write_cr_data := e_in.read_data3(31 downto 0);
 	    when OP_MTSPR =>
 		case decode_spr_num(e_in.insn) is
+		when SPR_XER =>
+		    v.e.xerc.so := e_in.read_data3(63-32);
+		    v.e.xerc.ov := e_in.read_data3(63-33);
+		    v.e.xerc.ca := e_in.read_data3(63-34);
+		    v.e.xerc.ov32 := e_in.read_data3(63-44);
+		    v.e.xerc.ca32 := e_in.read_data3(63-45);
+		    v.e.write_xerc_enable := '1';
 		when SPR_CTR =>
 		    ctrl_tmp.ctr <= e_in.read_data3;
 		when SPR_LR =>
@@ -334,7 +419,7 @@ begin
 	    when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR =>
 		result := rotator_result;
 		if e_in.output_carry = '1' then
-		    ctrl_tmp.carry <= rotator_carry;
+		    set_carry(v.e, rotator_carry, rotator_carry);
 		end if;
 		result_en := '1';
 	    when OP_SIM_CONFIG =>
diff --git a/insn_helpers.vhdl b/insn_helpers.vhdl
index d3ddcca..f58dacd 100644
--- a/insn_helpers.vhdl
+++ b/insn_helpers.vhdl
@@ -16,6 +16,7 @@ package insn_helpers is
     function insn_lk (insn_in : std_ulogic_vector) return std_ulogic;
     function insn_aa (insn_in : std_ulogic_vector) return std_ulogic;
     function insn_rc (insn_in : std_ulogic_vector) return std_ulogic;
+    function insn_oe (insn_in : std_ulogic_vector) return std_ulogic;
     function insn_bd (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_bf (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_bfa (insn_in : std_ulogic_vector) return std_ulogic_vector;
@@ -103,6 +104,11 @@ package body insn_helpers is
         return insn_in(0);
     end;
 
+    function insn_oe (insn_in : std_ulogic_vector) return std_ulogic is
+    begin
+        return insn_in(10); -- OE flag is read from index 10 of the instruction vector
+    end;
+
     function insn_bd (insn_in : std_ulogic_vector) return std_ulogic_vector is
     begin
         return insn_in(15 downto 2);
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 7fa8a42..1c16c46 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -47,6 +47,7 @@ begin
         v.sign_extend := l_in.sign_extend;
         v.update := l_in.update;
         v.update_reg := l_in.update_reg;
+	v.xerc := l_in.xerc;
 
 	-- XXX Temporary hack. Mark the op as non-cachable if the address
 	-- is the form 0xc-------
diff --git a/multiply.vhdl b/multiply.vhdl
index 94fa792..23339b5 100644
--- a/multiply.vhdl
+++ b/multiply.vhdl
@@ -27,8 +27,17 @@ architecture behaviour of multiply is
         data      : signed(129 downto 0);
         write_reg : std_ulogic_vector(4 downto 0);
         rc        : std_ulogic;
+	oe        : std_ulogic;
+	is_32bit  : std_ulogic;
+	xerc      : xer_common_t;
     end record;
-    constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0', insn_type => OP_ILLEGAL, rc => '0', data => (others => '0'), others => (others => '0'));
+    constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
+								     insn_type => OP_ILLEGAL,
+								     rc => '0', oe => '0',
+								     is_32bit => '0',
+								     xerc => xerc_init,
+								     data => (others => '0'),
+								     others => (others => '0'));
 
     type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
     constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit);
@@ -51,6 +60,7 @@ begin
         variable v : reg_type;
         variable d : std_ulogic_vector(129 downto 0);
         variable d2 : std_ulogic_vector(63 downto 0);
+	variable ov : std_ulogic;
     begin
         v := r;
 
@@ -61,16 +71,26 @@ begin
         v.multiply_pipeline(0).data := signed(m.data1) * signed(m.data2);
         v.multiply_pipeline(0).write_reg := m.write_reg;
         v.multiply_pipeline(0).rc := m.rc;
+        v.multiply_pipeline(0).oe := m.oe;
+        v.multiply_pipeline(0).is_32bit := m.is_32bit;
+        v.multiply_pipeline(0).xerc := m.xerc;
 
         loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
             v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
         end loop;
 
         d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data);
+	ov := '0';
 
+	-- TODO: Handle overflows
         case_0: case v.multiply_pipeline(PIPELINE_DEPTH-1).insn_type is
             when OP_MUL_L64 =>
                 d2 := d(63 downto 0);
+		if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then
+		    ov := (or d(63 downto 31)) and not (and d(63 downto 31));
+		else
+		    ov := (or d(127 downto 63)) and not (and d(127 downto 63));
+		end if;
             when OP_MUL_H32 =>
                 d2 := d(63 downto 32) & d(63 downto 32);
             when OP_MUL_H64 =>
@@ -82,11 +102,24 @@ begin
 
         m_out.write_reg_data <= d2;
         m_out.write_reg_nr <= v.multiply_pipeline(PIPELINE_DEPTH-1).write_reg;
+	m_out.xerc <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc;
 
+	-- Generate OV/OV32/SO when OE=1
         if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then
             m_out.valid <= '1';
             m_out.write_reg_enable <= '1';
             m_out.rc <= v.multiply_pipeline(PIPELINE_DEPTH-1).rc;
+            m_out.write_xerc_enable <= v.multiply_pipeline(PIPELINE_DEPTH-1).oe;
+
+	    -- We must test oe because the RC update code in writeback
+	    -- will use the xerc value to set CR0:SO so we must not clobber
+	    -- xerc if OE wasn't set.
+	    --
+	    if v.multiply_pipeline(PIPELINE_DEPTH-1).oe = '1' then
+		m_out.xerc.ov <= ov;
+		m_out.xerc.ov32 <= ov;
+		m_out.xerc.so <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc.so or ov;
+	    end if;
         end if;
 
         rin <= v;
diff --git a/writeback.vhdl b/writeback.vhdl
index e2b74f8..545e931 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -62,6 +62,8 @@ begin
         variable w : std_ulogic_vector(0 downto 0);
         variable j : integer;
         variable k : unsigned(3 downto 0);
+	variable cf: std_ulogic_vector(3 downto 0);
+	variable xe: xer_common_t;
     begin
         x := "" & e_in.valid;
         y := "" & l_in.valid;
@@ -81,6 +83,11 @@ begin
         z := "" & (d_in.valid and d_in.rc);
         assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
 
+        x := "" & e_in.write_xerc_enable;
+        y := "" & m_in.write_xerc_enable;
+        z := "" & D_in.write_xerc_enable;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
+
         w_out <= WritebackToRegisterFileInit;
         c_out <= WritebackToCrFileInit;
 
@@ -96,12 +103,12 @@ begin
         partial_write <= '0';
         sign_extend <= '0';
         second_word <= '0';
-        data_in <= e_in.write_data;
+	xe := e_in.xerc;
 
         if e_in.write_enable = '1' then
             w_out.write_reg <= e_in.write_reg;
-            data_in <= e_in.write_data;
             w_out.write_enable <= '1';
+	    data_in <= e_in.write_data;
             data_len <= unsigned(e_in.write_len);
             sign_extend <= e_in.sign_extend;
             rc <= e_in.rc;
@@ -113,7 +120,12 @@ begin
             c_out.write_cr_data <= e_in.write_cr_data;
         end if;
 
-        if l_in.write_enable = '1' then
+	if e_in.write_xerc_enable = '1' then
+            c_out.write_xerc_enable <= '1';
+            c_out.write_xerc_data <= e_in.xerc;
+	end if;
+
+	if l_in.write_enable = '1' then
             w_out.write_reg <= l_in.write_reg;
             data_in <= l_in.write_data;
             data_len <= unsigned(l_in.write_len);
@@ -127,6 +139,7 @@ begin
             if l_in.valid = '0' and (data_len + byte_offset > 8) then
                 partial_write <= '1';
             end if;
+	    xe := l_in.xerc;
         end if;
 
         if m_in.write_reg_enable = '1' then
@@ -134,15 +147,27 @@ begin
             w_out.write_reg <= m_in.write_reg_nr;
             data_in <= m_in.write_reg_data;
             rc <= m_in.rc;
+	    xe := m_in.xerc;
         end if;
 
+	if m_in.write_xerc_enable = '1' then
+            c_out.write_xerc_enable <= '1';
+            c_out.write_xerc_data <= m_in.xerc;
+	end if;
+
         if d_in.write_reg_enable = '1' then
             w_out.write_enable <= '1';
             w_out.write_reg <= d_in.write_reg_nr;
             data_in <= d_in.write_reg_data;
             rc <= d_in.rc;
+	    xe := d_in.xerc;
         end if;
 
+	if d_in.write_xerc_enable = '1' then
+            c_out.write_xerc_enable <= '1';
+            c_out.write_xerc_data <= d_in.xerc;
+	end if;
+
         -- shift and byte-reverse data bytes
         for i in 0 to 7 loop
             k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
@@ -193,17 +218,15 @@ begin
         -- deliver to regfile
         w_out.write_data <= data_trimmed;
 
-        -- test value against 0 and set CR0 if requested
+        -- Perform CR0 update for RC forms
         if rc = '1' then
             c_out.write_cr_enable <= '1';
             c_out.write_cr_mask <= num_to_fxm(0);
-            if negative = '1' then
-                c_out.write_cr_data <= x"80000000";
-            elsif zero = '0' then
-                c_out.write_cr_data <= x"40000000";
-            else
-                c_out.write_cr_data <= x"20000000";
-            end if;
+	    cf(3) := negative;
+	    cf(2) := not negative and not zero;
+	    cf(1) := zero;
+	    cf(0) := xe.so;
+	    c_out.write_cr_data(31 downto 28) <= cf;
         end if;
     end process;
 end;

From ec9b27660fde71f4098037759d970019db59db2e Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 7 Dec 2019 14:31:33 +1100
Subject: [PATCH 4/9] execute: Copy XER[SO] to CR for cmp[i] and cmpl[i]
 instructions

We were copying in XER[SO] for the dot-form instructions but not the
explicit compare instructions.  Fix this.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 execute1.vhdl     |  4 ++--
 helpers.vhdl      | 28 ++++++++++++++--------------
 ppc_fx_insns.vhdl | 32 ++++++++++++++++++++------------
 3 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/execute1.vhdl b/execute1.vhdl
index 2391ba2..e1ca950 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -291,7 +291,7 @@ begin
 		for i in 0 to 7 loop
 		    lo := i*4;
 		    hi := lo + 3;
-		    v.e.write_cr_data(hi downto lo) := ppc_cmp(l, e_in.read_data1, e_in.read_data2);
+		    v.e.write_cr_data(hi downto lo) := ppc_cmp(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so);
 		end loop;
 	    when OP_CMPL =>
 		bf := insn_bf(e_in.insn);
@@ -302,7 +302,7 @@ begin
 		for i in 0 to 7 loop
 		    lo := i*4;
 		    hi := lo + 3;
-		    v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2);
+		    v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so);
 		end loop;
 	    when OP_CNTZ =>
 		result := countzero_result;
diff --git a/helpers.vhdl b/helpers.vhdl
index 3961332..fe91938 100644
--- a/helpers.vhdl
+++ b/helpers.vhdl
@@ -17,8 +17,8 @@ package helpers is
 
     function cmp_one_byte(a, b: std_ulogic_vector(7 downto 0)) return std_ulogic_vector;
 
-    function ppc_signed_compare(a, b: signed(63 downto 0)) return std_ulogic_vector;
-    function ppc_unsigned_compare(a, b: unsigned(63 downto 0)) return std_ulogic_vector;
+    function ppc_signed_compare(a, b: signed(63 downto 0); so: std_ulogic) return std_ulogic_vector;
+    function ppc_unsigned_compare(a, b: unsigned(63 downto 0); so: std_ulogic) return std_ulogic_vector;
 
     function ra_or_zero(ra: std_ulogic_vector(63 downto 0); reg: std_ulogic_vector(4 downto 0)) return std_ulogic_vector;
 
@@ -126,32 +126,32 @@ package body helpers is
         return ret;
     end;
 
-    function ppc_signed_compare(a, b: signed(63 downto 0)) return std_ulogic_vector is
-        variable ret: std_ulogic_vector(3 downto 0);
+    function ppc_signed_compare(a, b: signed(63 downto 0); so: std_ulogic) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(2 downto 0);
     begin
         if a < b then
-            ret := "1000";
+            ret := "100";
         elsif a > b then
-            ret := "0100";
+            ret := "010";
         else
-            ret := "0010";
+            ret := "001";
         end if;
 
-        return ret;
+        return ret & so;
     end;
 
-    function ppc_unsigned_compare(a, b: unsigned(63 downto 0)) return std_ulogic_vector is
-        variable ret: std_ulogic_vector(3 downto 0);
+    function ppc_unsigned_compare(a, b: unsigned(63 downto 0); so: std_ulogic) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(2 downto 0);
     begin
         if a < b then
-            ret := "1000";
+            ret := "100";
         elsif a > b then
-            ret := "0100";
+            ret := "010";
         else
-            ret := "0010";
+            ret := "001";
         end if;
 
-        return ret;
+        return ret & so;
     end;
 
     function ra_or_zero(ra: std_ulogic_vector(63 downto 0); reg: std_ulogic_vector(4 downto 0)) return std_ulogic_vector is
diff --git a/ppc_fx_insns.vhdl b/ppc_fx_insns.vhdl
index 407881f..3b03dc2 100644
--- a/ppc_fx_insns.vhdl
+++ b/ppc_fx_insns.vhdl
@@ -77,10 +77,14 @@ package ppc_fx_insns is
 	function ppc_mulhw (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
 	function ppc_mulhwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
 
-	function ppc_cmpi (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0)) return std_ulogic_vector;
-	function ppc_cmp (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
-	function ppc_cmpli (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0)) return std_ulogic_vector;
-	function ppc_cmpl (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
+	function ppc_cmpi (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0);
+                           so: std_ulogic) return std_ulogic_vector;
+	function ppc_cmp (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0);
+                           so: std_ulogic) return std_ulogic_vector;
+	function ppc_cmpli (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0);
+                           so: std_ulogic) return std_ulogic_vector;
+	function ppc_cmpl (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0);
+                           so: std_ulogic) return std_ulogic_vector;
 
 	function ppc_cmpb (rs, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
 
@@ -677,7 +681,8 @@ package body ppc_fx_insns is
 		return std_ulogic_vector(tmp(63 downto 32)) & std_ulogic_vector(tmp(63 downto 32));
 	end;
 
-	function ppc_cmpi (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0)) return std_ulogic_vector is
+	function ppc_cmpi (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0);
+                           so: std_ulogic) return std_ulogic_vector is
 		variable tmp: signed(ra'range);
 	begin
 		tmp := signed(ra);
@@ -685,10 +690,11 @@ package body ppc_fx_insns is
 			tmp := resize(signed(ra(31 downto 0)), tmp'length);
 		end if;
 
-		return ppc_signed_compare(tmp, resize(signed(si), tmp'length));
+		return ppc_signed_compare(tmp, resize(signed(si), tmp'length), so);
 	end;
 
-	function ppc_cmp (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
+	function ppc_cmp (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0);
+                          so: std_ulogic) return std_ulogic_vector is
 		variable tmpa, tmpb: signed(ra'range);
 	begin
 		tmpa := signed(ra);
@@ -698,10 +704,11 @@ package body ppc_fx_insns is
 			tmpb := resize(signed(rb(31 downto 0)), ra'length);
 		end if;
 
-		return ppc_signed_compare(tmpa, tmpb);
+		return ppc_signed_compare(tmpa, tmpb, so);
 	end;
 
-	function ppc_cmpli (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0)) return std_ulogic_vector is
+	function ppc_cmpli (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0);
+                            so: std_ulogic) return std_ulogic_vector is
 		variable tmp: unsigned(ra'range);
 	begin
 		tmp := unsigned(ra);
@@ -709,10 +716,11 @@ package body ppc_fx_insns is
 			tmp := resize(unsigned(ra(31 downto 0)), tmp'length);
 		end if;
 
-		return ppc_unsigned_compare(tmp, resize(unsigned(si), tmp'length));
+		return ppc_unsigned_compare(tmp, resize(unsigned(si), tmp'length), so);
 	end;
 
-	function ppc_cmpl (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
+	function ppc_cmpl (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0);
+                           so: std_ulogic) return std_ulogic_vector is
 		variable tmpa, tmpb: unsigned(ra'range);
 	begin
 		tmpa := unsigned(ra);
@@ -722,7 +730,7 @@ package body ppc_fx_insns is
 			tmpb := resize(unsigned(rb(31 downto 0)), ra'length);
 		end if;
 
-		return ppc_unsigned_compare(tmpa, tmpb);
+		return ppc_unsigned_compare(tmpa, tmpb, so);
 	end;
 
 	function ppc_cmpb (rs, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is

From d04887fdcd3cc550bbc8b2383ff6f4331bde5f9d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 6 Dec 2019 08:25:28 +1100
Subject: [PATCH 5/9] decode1: Add OE=1 forms of add/sub, mul and div
 instructions

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode1.vhdl | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/decode1.vhdl b/decode1.vhdl
index 23b02fe..3138480 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -133,10 +133,15 @@ architecture behaviour of decode1 is
 		--                       unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
 		--                                    op                                            in   out   A   out  in    out  len        ext                                 pipe
 		2#0100001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- add
+		2#1100001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addo
 		2#0000001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addc
+		2#1000001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addco
 		2#0010001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- adde
+		2#1010001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addeo
 		2#0011101010#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addme
+		2#1011101010#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addmeo
 		2#0011001010#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addze
+		2#1011001010#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addzeo
 		2#0000011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- and
 		2#0000111100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- andc
 		-- 2#0011111100# bperm
@@ -156,13 +161,21 @@ architecture behaviour of decode1 is
 		2#0011110110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst
 		-- 2#1111110110# dcbz
 		2#0110001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeu
+		2#1110001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeuo
 		2#0110001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweu
+		2#1110001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweuo
 		2#0110101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divde
+		2#1110101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdeo
 		2#0110101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwe
+		2#1110101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divweo
 		2#0111001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdu
+		2#1111001001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divduo
 		2#0111001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwu
+		2#1111001011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwuo
 		2#0111101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divd
+		2#1111101001#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divdo
 		2#0111101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divw
+		2#1111101011#  =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- divwo
 		2#0100011100#  =>       (ALU,    OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- eqv
 		2#1110111010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsb
 		2#1110011010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsh
@@ -241,9 +254,12 @@ architecture behaviour of decode1 is
 		2#1001001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mulhw
 		2#1000001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '1'), -- mulhwu
 		2#0011101001#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulld
+		2#1011101001#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulldo
 		2#0011101011#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mullw
+		2#1011101011#  =>       (MUL,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mullwo
 		2#0111011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nand
 		2#0001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- neg
+		2#1001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nego
 		2#0001111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nor
 		2#0110111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- or
 		2#0110011100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- orc
@@ -278,10 +294,15 @@ architecture behaviour of decode1 is
 		2#0010110111#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwux
 		2#0010010111#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwx
 		2#0000101000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subf
+		2#1000101000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfo
 		2#0000001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfc
+		2#1000001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfco
 		2#0010001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfe
+		2#1010001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfeo
 		2#0011101000#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfme
+		2#1011101000#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfmeo
 		2#0011001000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfze
+		2#1011001000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfzeo
 		2#1001010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sync
 		-- 2#0001000100# td
 		2#0000000100#  =>       (ALU,    OP_TW,        RA,         RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tw

From 5a0458dec165d8d52f5b91b6f73c28b2768371ac Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 7 Dec 2019 15:26:25 +1100
Subject: [PATCH 6/9] divider: Fix overflow calculation

We were signalling overflow when neg_result=1 but the result was zero.
Fix this.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 divider.vhdl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/divider.vhdl b/divider.vhdl
index d632e90..39893a8 100644
--- a/divider.vhdl
+++ b/divider.vhdl
@@ -20,7 +20,7 @@ architecture behaviour of divider is
     signal div        : unsigned(63 downto 0);
     signal quot       : std_ulogic_vector(63 downto 0);
     signal result     : std_ulogic_vector(63 downto 0);
-    signal sresult    : std_ulogic_vector(63 downto 0);
+    signal sresult    : std_ulogic_vector(64 downto 0);
     signal oresult    : std_ulogic_vector(63 downto 0);
     signal qbit       : std_ulogic;
     signal running    : std_ulogic;
@@ -123,13 +123,13 @@ begin
             result <= quot;
         end if;
         if neg_result = '1' then
-            sresult <= std_ulogic_vector(- signed(result));
+            sresult <= std_ulogic_vector(- signed('0' & result));
         else
-            sresult <= result;
+            sresult <= '0' & result;
         end if;
         did_ovf <= '0';
         if is_32bit = '0' then
-            did_ovf <= overflow or (is_signed and (sresult(63) xor neg_result));
+            did_ovf <= overflow or (is_signed and (sresult(64) xor sresult(63)));
         elsif is_signed = '1' then
             if ovf32 = '1' or sresult(32) /= sresult(31) then
                 did_ovf <= '1';
@@ -143,7 +143,7 @@ begin
             -- 32-bit divisions set the top 32 bits of the result to 0
             oresult <= x"00000000" & sresult(31 downto 0);
         else
-            oresult <= sresult;
+            oresult <= sresult(63 downto 0);
         end if;
     end process;
 

From afdd59350270e8d5d8e503321024a3d545ec3796 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 31 Oct 2019 12:09:14 +1100
Subject: [PATCH 7/9] spr: Add translation from SPR to special GPR number

We will want to store some SPRs in the register file using
a set of "extra" registers. This provides a function for
doing the translation along with some SPR definitions.

This isn't used yet.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl | 64 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 60 insertions(+), 4 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index ea0ec1d..1ccbf08 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -12,10 +12,31 @@ package common is
 
     function decode_spr_num(insn: std_ulogic_vector(31 downto 0)) return spr_num_t;
 
-    constant SPR_XER   : spr_num_t := 1;
-    constant SPR_LR    : spr_num_t := 8;
-    constant SPR_CTR   : spr_num_t := 9;
-    constant SPR_TB    : spr_num_t := 268;
+    constant SPR_XER    : spr_num_t := 1;
+    constant SPR_LR     : spr_num_t := 8;
+    constant SPR_CTR    : spr_num_t := 9;
+    constant SPR_TB     : spr_num_t := 268;
+    constant SPR_SRR0   : spr_num_t := 26;
+    constant SPR_SRR1   : spr_num_t := 27;
+    constant SPR_HSRR0  : spr_num_t := 314;
+    constant SPR_HSRR1  : spr_num_t := 315;
+    constant SPR_SPRG0  : spr_num_t := 272;
+    constant SPR_SPRG1  : spr_num_t := 273;
+    constant SPR_SPRG2  : spr_num_t := 274;
+    constant SPR_SPRG3  : spr_num_t := 275;
+    constant SPR_SPRG3U : spr_num_t := 259;
+    constant SPR_HSPRG0 : spr_num_t := 304;
+    constant SPR_HSPRG1 : spr_num_t := 305;
+
+    -- Some SPRs are stored in the register file, they use the magic
+    -- GPR numbers above 31.
+    --
+    -- The function fast_spr_num() returns the corresponding fast
+    -- pseudo-GPR number for a given SPR number. The result MSB
+    -- indicates if this is indeed a fast SPR. If clear, then
+    -- the SPR is not stored in the GPR file.
+    --
+    function fast_spr_num(spr: spr_num_t) return std_ulogic_vector;
 
     -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are
     -- in the CR file as a kind of CR extension (with a separate write
@@ -282,4 +303,39 @@ package body common is
     begin
 	return to_integer(unsigned(insn(15 downto 11) & insn(20 downto 16)));
     end;
+    function fast_spr_num(spr: spr_num_t) return std_ulogic_vector is
+       variable n : integer range 0 to 31;
+    begin
+       case spr is
+       when SPR_LR =>
+           n := 0;
+       when SPR_CTR =>
+           n:= 1;
+       when SPR_SRR0 =>
+           n := 2;
+       when SPR_SRR1 =>
+           n := 3;
+       when SPR_HSRR0 =>
+           n := 4;
+       when SPR_HSRR1 =>
+           n := 5;
+       when SPR_SPRG0 =>
+           n := 6;
+       when SPR_SPRG1 =>
+           n := 7;
+       when SPR_SPRG2 =>
+           n := 8;
+       when SPR_SPRG3 | SPR_SPRG3U =>
+           n := 9;
+       when SPR_HSPRG0 =>
+           n := 10;
+       when SPR_HSPRG1 =>
+           n := 11;
+       when SPR_XER =>
+           n := 12;
+       when others =>
+           return "000000";
+       end case;
+       return "1" & std_ulogic_vector(to_unsigned(n, 5));
+    end;
 end common;

From e4f475e17f8dc121350d30bb0815aebe53d4b3af Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 31 Oct 2019 13:48:43 +1100
Subject: [PATCH 8/9] sprs: Store common SPRs in register file

This stores the most common SPRs in the register file.

This includes CTR and LR and a not yet final list of others.

The register file is set to 64 entries for now. Specific types
are defined that can represent a GPR index (gpr_index_t) or
a GPR/SPR index (gspr_index_t) along with conversion functions
between the two.

In order to deal with some forms of branch that update both LR and
CTR, we introduce a delayed update of LR after a branch link.

Note: We currently stall the pipeline on such a delayed branch,
but we could avoid stalling fetch in that specific case as we
know we have a branch delay. We could also limit that to the
specific case where we need to update both CTR and LR.

This allows us to make bcreg, mtspr and mfspr pipelined. decode1
will automatically force the single issue flag on mfspr/mtspr to
a "slow" SPR.

[paulus@ozlabs.org - fix direction of decode2.stall_in]

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 Makefile           |   2 +-
 common.vhdl        |  78 +++++++++++++++++++--------
 control.vhdl       |  22 +++++---
 core.vhdl          |   5 ++
 cr_hazard.vhdl     |   7 ++-
 decode1.vhdl       |  37 +++++++++++--
 decode2.vhdl       |  96 +++++++++++++++++++++++----------
 decode_types.vhdl  |   6 +--
 execute1.vhdl      | 131 +++++++++++++++++++++++++++++----------------
 gpr_hazard.vhdl    |  13 +++--
 ppc_fx_insns.vhdl  |  18 -------
 register_file.vhdl |  12 +++--
 writeback.vhdl     |   6 +--
 13 files changed, 292 insertions(+), 141 deletions(-)

diff --git a/Makefile b/Makefile
index 85a0262..e2398c0 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ all: $(all)
 	$(GHDL) -a $(GHDLFLAGS) $<
 
 common.o: decode_types.o
-control.o: gpr_hazard.o cr_hazard.o
+control.o: gpr_hazard.o cr_hazard.o common.o
 sim_jtag.o: sim_jtag_socket.o
 core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o
 core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o multiply.o writeback.o core_debug.o divider.o
diff --git a/common.vhdl b/common.vhdl
index 1ccbf08..8e24ab9 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -28,6 +28,12 @@ package common is
     constant SPR_HSPRG0 : spr_num_t := 304;
     constant SPR_HSPRG1 : spr_num_t := 305;
 
+    -- GPR indices in the register file (GPR only)
+    subtype gpr_index_t is std_ulogic_vector(4 downto 0);
+
+    -- Extended GPR indice (can hold an SPR)
+    subtype gspr_index_t is std_ulogic_vector(5 downto 0);
+
     -- Some SPRs are stored in the register file, they use the magic
     -- GPR numbers above 31.
     --
@@ -36,7 +42,13 @@ package common is
     -- indicates if this is indeed a fast SPR. If clear, then
     -- the SPR is not stored in the GPR file.
     --
-    function fast_spr_num(spr: spr_num_t) return std_ulogic_vector;
+    function fast_spr_num(spr: spr_num_t) return gspr_index_t;
+
+    -- Indices conversion functions
+    function gspr_to_gpr(i: gspr_index_t) return gpr_index_t;
+    function gpr_to_gspr(i: gpr_index_t) return gspr_index_t;
+    function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t;
+    function is_fast_spr(s: gspr_index_t) return std_ulogic;
 
     -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are
     -- in the CR file as a kind of CR extension (with a separate write
@@ -52,8 +64,6 @@ package common is
 
     -- This needs to die...
     type ctrl_t is record
-	lr: std_ulogic_vector(63 downto 0);
-	ctr: std_ulogic_vector(63 downto 0);
 	tb: std_ulogic_vector(63 downto 0);
     end record;
 
@@ -83,6 +93,8 @@ package common is
 	stop_mark : std_ulogic;
 	nia: std_ulogic_vector(63 downto 0);
 	insn: std_ulogic_vector(31 downto 0);
+	ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr
+	ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR)
 	decode: decode_rom_t;
     end record;
     constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', decode => decode_rom_init, others => (others => '0'));
@@ -91,9 +103,9 @@ package common is
 	valid: std_ulogic;
 	insn_type: insn_type_t;
 	nia: std_ulogic_vector(63 downto 0);
-	write_reg: std_ulogic_vector(4 downto 0);
-	read_reg1: std_ulogic_vector(4 downto 0);
-	read_reg2: std_ulogic_vector(4 downto 0);
+	write_reg: gspr_index_t;
+	read_reg1: gspr_index_t;
+	read_reg2: gspr_index_t;
 	read_data1: std_ulogic_vector(63 downto 0);
 	read_data2: std_ulogic_vector(63 downto 0);
 	read_data3: std_ulogic_vector(63 downto 0);
@@ -121,7 +133,7 @@ package common is
     type Decode2ToMultiplyType is record
 	valid: std_ulogic;
 	insn_type: insn_type_t;
-	write_reg: std_ulogic_vector(4 downto 0);
+	write_reg: gpr_index_t;
 	data1: std_ulogic_vector(64 downto 0);
 	data2: std_ulogic_vector(64 downto 0);
 	rc: std_ulogic;
@@ -135,7 +147,7 @@ package common is
 
     type Decode2ToDividerType is record
 	valid: std_ulogic;
-	write_reg: std_ulogic_vector(4 downto 0);
+	write_reg: gpr_index_t;
 	dividend: std_ulogic_vector(63 downto 0);
 	divisor: std_ulogic_vector(63 downto 0);
 	is_signed: std_ulogic;
@@ -153,11 +165,11 @@ package common is
 
     type Decode2ToRegisterFileType is record
 	read1_enable : std_ulogic;
-	read1_reg : std_ulogic_vector(4 downto 0);
+	read1_reg : gspr_index_t;
 	read2_enable : std_ulogic;
-	read2_reg : std_ulogic_vector(4 downto 0);
+	read2_reg : gspr_index_t;
 	read3_enable : std_ulogic;
-	read3_reg : std_ulogic_vector(4 downto 0);
+	read3_reg : gpr_index_t;
     end record;
 
     type RegisterFileToDecode2Type is record
@@ -187,12 +199,12 @@ package common is
 	addr1 : std_ulogic_vector(63 downto 0);
 	addr2 : std_ulogic_vector(63 downto 0);
 	data : std_ulogic_vector(63 downto 0);		-- data to write, unused for read
-	write_reg : std_ulogic_vector(4 downto 0);	-- read data goes to this register
+	write_reg : gpr_index_t;
 	length : std_ulogic_vector(3 downto 0);
 	byte_reverse : std_ulogic;
 	sign_extend : std_ulogic;			-- do we need to sign extend?
 	update : std_ulogic;				-- is this an update instruction?
-	update_reg : std_ulogic_vector(4 downto 0);	-- if so, the register to update
+	update_reg : gpr_index_t;                      	-- if so, the register to update
 	xerc : xer_common_t;
     end record;
     constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0',
@@ -205,19 +217,19 @@ package common is
 	nc : std_ulogic;
 	addr : std_ulogic_vector(63 downto 0);
 	data : std_ulogic_vector(63 downto 0);
-	write_reg : std_ulogic_vector(4 downto 0);
+	write_reg : gpr_index_t;
 	length : std_ulogic_vector(3 downto 0);
 	byte_reverse : std_ulogic;
 	sign_extend : std_ulogic;
 	update : std_ulogic;
-	update_reg : std_ulogic_vector(4 downto 0);
+	update_reg : gpr_index_t;
 	xerc : xer_common_t;
     end record;
 
     type DcacheToWritebackType is record
 	valid : std_ulogic;
 	write_enable: std_ulogic;
-	write_reg : std_ulogic_vector(4 downto 0);
+	write_reg : gpr_index_t;
 	write_data : std_ulogic_vector(63 downto 0);
 	write_len : std_ulogic_vector(3 downto 0);
 	write_shift : std_ulogic_vector(2 downto 0);
@@ -234,7 +246,7 @@ package common is
 	valid: std_ulogic;
 	rc : std_ulogic;
 	write_enable : std_ulogic;
-	write_reg: std_ulogic_vector(4 downto 0);
+	write_reg: gspr_index_t;
 	write_data: std_ulogic_vector(63 downto 0);
 	write_len : std_ulogic_vector(3 downto 0);
 	write_cr_enable : std_ulogic;
@@ -253,7 +265,7 @@ package common is
 	valid: std_ulogic;
 
 	write_reg_enable : std_ulogic;
-	write_reg_nr: std_ulogic_vector(4 downto 0);
+	write_reg_nr: gpr_index_t;
 	write_reg_data: std_ulogic_vector(63 downto 0);
 	write_xerc_enable : std_ulogic;
 	xerc : xer_common_t;
@@ -268,7 +280,7 @@ package common is
 	valid: std_ulogic;
 
 	write_reg_enable : std_ulogic;
-	write_reg_nr: std_ulogic_vector(4 downto 0);
+	write_reg_nr: gpr_index_t;
 	write_reg_data: std_ulogic_vector(63 downto 0);
 	write_xerc_enable : std_ulogic;
 	xerc : xer_common_t;
@@ -280,7 +292,7 @@ package common is
 								 others => (others => '0'));
 
     type WritebackToRegisterFileType is record
-	write_reg : std_ulogic_vector(4 downto 0);
+	write_reg : gspr_index_t;
 	write_data : std_ulogic_vector(63 downto 0);
 	write_enable : std_ulogic;
     end record;
@@ -303,7 +315,7 @@ package body common is
     begin
 	return to_integer(unsigned(insn(15 downto 11) & insn(20 downto 16)));
     end;
-    function fast_spr_num(spr: spr_num_t) return std_ulogic_vector is
+    function fast_spr_num(spr: spr_num_t) return gspr_index_t is
        variable n : integer range 0 to 31;
     begin
        case spr is
@@ -338,4 +350,28 @@ package body common is
        end case;
        return "1" & std_ulogic_vector(to_unsigned(n, 5));
     end;
+
+    function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is
+    begin
+	return i(4 downto 0);
+    end;
+
+    function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is
+    begin
+	return "0" & i;
+    end;
+
+    function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is
+    begin
+	if s(5) = '1' then
+	    return s;
+	else
+	    return gpr_to_gspr(g);
+	end if;
+    end;
+
+    function is_fast_spr(s: gspr_index_t) return std_ulogic is
+    begin
+	return s(5);
+    end;
 end common;
diff --git a/control.vhdl b/control.vhdl
index 0555b06..fed5618 100644
--- a/control.vhdl
+++ b/control.vhdl
@@ -1,6 +1,9 @@
 library ieee;
 use ieee.std_logic_1164.all;
 
+library work;
+use work.common.all;
+
 entity control is
     generic (
         PIPELINE_DEPTH : natural := 2
@@ -12,20 +15,21 @@ entity control is
         complete_in         : in std_ulogic;
         valid_in            : in std_ulogic;
         flush_in            : in std_ulogic;
+	stall_in            : in std_ulogic;
         sgl_pipe_in         : in std_ulogic;
         stop_mark_in        : in std_ulogic;
 
         gpr_write_valid_in  : in std_ulogic;
-        gpr_write_in        : in std_ulogic_vector(4 downto 0);
+        gpr_write_in        : in gspr_index_t;
 
         gpr_a_read_valid_in : in std_ulogic;
-        gpr_a_read_in       : in std_ulogic_vector(4 downto 0);
+        gpr_a_read_in       : in gspr_index_t;
 
         gpr_b_read_valid_in : in std_ulogic;
-        gpr_b_read_in       : in std_ulogic_vector(4 downto 0);
+        gpr_b_read_in       : in gspr_index_t;
 
         gpr_c_read_valid_in : in std_ulogic;
-        gpr_c_read_in       : in std_ulogic_vector(4 downto 0);
+        gpr_c_read_in       : in gpr_index_t;
 
         cr_read_in          : in std_ulogic;
         cr_write_in         : in std_ulogic;
@@ -61,6 +65,7 @@ begin
             )
         port map (
             clk                => clk,
+	    stall_in           => stall_in,
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
@@ -76,6 +81,7 @@ begin
             )
         port map (
             clk                => clk,
+	    stall_in           => stall_in,
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
@@ -91,11 +97,12 @@ begin
             )
         port map (
             clk                => clk,
+	    stall_in           => stall_in,
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
             gpr_read_valid_in  => gpr_c_read_valid_in,
-            gpr_read_in        => gpr_c_read_in,
+            gpr_read_in        => "0" & gpr_c_read_in,
 
             stall_out          => stall_c_out
             );
@@ -106,6 +113,7 @@ begin
             )
         port map (
             clk                => clk,
+	    stall_in           => stall_in,
 
             cr_read_in         => cr_read_in,
             cr_write_in        => cr_write_valid,
@@ -129,8 +137,8 @@ begin
         v_int := r_int;
 
         -- asynchronous
-        valid_tmp := valid_in and not flush_in;
-        stall_tmp := '0';
+        valid_tmp := valid_in and not flush_in and not stall_in;
+        stall_tmp := stall_in;
 
         if complete_in = '1' then
             v_int.outstanding := r_int.outstanding - 1;
diff --git a/core.vhdl b/core.vhdl
index 22f7dca..f95a1af 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -76,8 +76,10 @@ architecture behave of core is
     signal icache_stall_out : std_ulogic;
     signal fetch2_stall_in : std_ulogic;
     signal decode1_stall_in : std_ulogic;
+    signal decode2_stall_in : std_ulogic;
     signal decode2_stall_out : std_ulogic;
     signal ex1_icache_inval: std_ulogic;
+    signal ex1_stall_out: std_ulogic;
 
     signal flush: std_ulogic;
 
@@ -184,6 +186,7 @@ begin
         port map (
             clk => clk,
             rst => core_rst,
+	    stall_in => decode2_stall_in,
             stall_out => decode2_stall_out,
             flush_in => flush,
             complete_in => complete,
@@ -198,6 +201,7 @@ begin
             c_in => cr_file_to_decode2,
             c_out => decode2_to_cr_file
             );
+    decode2_stall_in <= ex1_stall_out;
 
     register_file_0: entity work.register_file
         generic map (
@@ -223,6 +227,7 @@ begin
         port map (
             clk => clk,
             flush_out => flush,
+	    stall_out => ex1_stall_out,
             e_in => decode2_to_execute1,
             f_out => execute1_to_fetch1,
             e_out => execute1_to_writeback,
diff --git a/cr_hazard.vhdl b/cr_hazard.vhdl
index 2a434ac..f6c5f3f 100644
--- a/cr_hazard.vhdl
+++ b/cr_hazard.vhdl
@@ -7,7 +7,8 @@ entity cr_hazard is
         PIPELINE_DEPTH : natural := 2
         );
     port(
-        clk         : in std_logic;
+        clk         : in std_ulogic;
+	stall_in    : in std_ulogic;
 
         cr_read_in  : in std_ulogic;
         cr_write_in : in std_ulogic;
@@ -29,7 +30,9 @@ begin
     cr_hazard0: process(clk)
     begin
         if rising_edge(clk) then
-            r <= rin;
+	    if stall_in = '0' then
+		r <= rin;
+	    end if;
         end if;
     end process;
 
diff --git a/decode1.vhdl b/decode1.vhdl
index 3138480..b4e7d26 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -43,7 +43,7 @@ architecture behaviour of decode1 is
 		28 =>       (ALU,    OP_AND,       NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- andi.
 		29 =>       (ALU,    OP_AND,       NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- andis.
 		18 =>       (ALU,    OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b
-		16 =>       (ALU,    OP_BC,        NONE,       CONST_BD,    NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc
+		16 =>       (ALU,    OP_BC,        SPR,        CONST_BD,    NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc
 		11 =>       (ALU,    OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpi
 		10 =>       (ALU,    OP_CMPL,      RA,         CONST_UI,    NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
 		34 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz
@@ -106,7 +106,7 @@ architecture behaviour of decode1 is
 		-- addpcis not implemented yet
 		2#001#    =>       (ALU,    OP_ILLEGAL,   NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'),
                 -- bclr, bcctr, bctar
-		2#100#    =>       (ALU,    OP_BCREG,     NONE,       NONE,        NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '1'),
+		2#100#    =>       (ALU,    OP_BCREG,     SPR,        SPR,         NONE, SPR,  '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'),
                 -- isync
 		2#111#    =>       (ALU,    OP_ISYNC,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'),
 		others   => illegal_inst
@@ -237,13 +237,13 @@ architecture behaviour of decode1 is
 		-- 2#1000000000# mcrxr
 		-- 2#1001000000# mcrxrx
 		2#0000010011#  =>       (ALU,    OP_MFCR,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf
-		2#0101010011#  =>       (ALU,    OP_MFSPR,     NONE,       NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mfspr
+		2#0101010011#  =>       (ALU,    OP_MFSPR,     SPR,        NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr
 		2#0100001001#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modud
 		2#0100001011#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- moduw
 		2#1100001001#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsd
 		2#1100001011#  =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw
 		2#0010010000#  =>       (ALU,    OP_MTCRF,     NONE,       NONE,        RS,   NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf
-		2#0111010011#  =>       (ALU,    OP_MTSPR,     NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mtspr
+		2#0111010011#  =>       (ALU,    OP_MTSPR,     NONE,       NONE,        RS,   SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr
 		2#0001001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '1'), -- mulhd
 		2#0000001001#  =>       (MUL,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- mulhdu
 		2#0001001011#  =>       (MUL,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '1'), -- mulhw
@@ -355,6 +355,8 @@ begin
 		v.nia  := f_in.nia;
 		v.insn := f_in.insn;
 		v.stop_mark := f_in.stop_mark;
+		v.ispr1 := (others => '0');
+		v.ispr2 := (others => '0');
 
 		if f_in.valid = '1' then
 			report "Decode insn " & to_hstring(f_in.insn) & " at " & to_hstring(f_in.nia);
@@ -398,6 +400,33 @@ begin
                         v.decode := major_decode_rom_array(to_integer(majorop));
 		end if;
 
+		-- Set ISPR1/ISPR2 when needed
+		if v.decode.insn_type = OP_BC or v.decode.insn_type = OP_BCREG then
+		    -- Branch uses CTR as condition when BO(2) is 0. This is
+		    -- also used to indicate that CTR is modified (they go
+		    -- together).
+		    --
+		    if f_in.insn(23) = '0' then
+			v.ispr1 := fast_spr_num(SPR_CTR);
+		    end if;
+
+		    -- Branch source register is an SPR
+		    if v.decode.insn_type = OP_BCREG then
+			-- TODO: Add TAR
+			if f_in.insn(10) = '0' then
+			    v.ispr2 := fast_spr_num(SPR_LR);
+			else
+			    v.ispr2 := fast_spr_num(SPR_CTR);
+			end if;
+		    end if;
+		elsif v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then
+		    v.ispr1 := fast_spr_num(decode_spr_num(f_in.insn));
+		    -- Make slow SPRs single issue
+		    if is_fast_spr(v.ispr1) = '0' then
+			v.decode.sgl_pipe := '1';
+		    end if;
+		end if;
+
 		if flush_in = '1' then
 			v.valid := '0';
 		end if;
diff --git a/decode2.vhdl b/decode2.vhdl
index e9c71ba..8a2d970 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -14,6 +14,7 @@ entity decode2 is
 		rst   : in std_ulogic;
 
 		complete_in : in std_ulogic;
+		stall_in : in std_ulogic;
 		stall_out : out std_ulogic;
 
 		stopped_out : out std_ulogic;
@@ -47,30 +48,49 @@ architecture behaviour of decode2 is
 
 	type decode_input_reg_t is record
 		reg_valid : std_ulogic;
-		reg       : std_ulogic_vector(4 downto 0);
+		reg       : gspr_index_t;
 		data      : std_ulogic_vector(63 downto 0);
 	end record;
 
+	type decode_output_reg_t is record
+		reg_valid : std_ulogic;
+		reg       : gspr_index_t;
+	end record;
+
 	function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0);
-				     reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is
+				     reg_data : std_ulogic_vector(63 downto 0);
+				     ispr : gspr_index_t) return decode_input_reg_t is
 		variable is_reg : std_ulogic;
 	begin
 		is_reg := '0' when insn_ra(insn_in) = "00000" else '1';
 
 		if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then
-			--return (is_reg, insn_ra(insn_in), reg_data);
-			return ('1', insn_ra(insn_in), reg_data);
+		    assert is_fast_spr(ispr) = '0' report "Decode A says GPR but ISPR says SPR:" &
+			to_hstring(ispr) severity failure;
+		    return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data);
+		elsif t = SPR then
+		    -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
+		    -- If it's all 0, we don't treat it as a dependency as slow SPRs
+		    -- operations are single issue.
+		    --
+		    assert is_fast_spr(ispr) =  '1' or ispr = "000000"
+			report "Decode A says SPR but ISPR is invalid:" &
+			to_hstring(ispr) severity failure;
+		    return (is_fast_spr(ispr), ispr, reg_data);
 		else
 			return ('0', (others => '0'), (others => '0'));
 		end if;
 	end;
 
 	function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0);
-				     reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is
+				     reg_data : std_ulogic_vector(63 downto 0);
+				     ispr : gspr_index_t) return decode_input_reg_t is
 	begin
 		case t is
 		when RB =>
-			return ('1', insn_rb(insn_in), reg_data);
+		    assert is_fast_spr(ispr) = '0' report "Decode B says GPR but ISPR says SPR:" &
+			to_hstring(ispr) severity failure;
+		    return ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data);
 		when CONST_UI =>
 			return ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64)));
 		when CONST_SI =>
@@ -91,6 +111,14 @@ architecture behaviour of decode2 is
 			return ('0', (others => '0'), x"00000000000000" & "00" & insn_in(1) & insn_in(15 downto 11));
 		when CONST_SH32 =>
 			return ('0', (others => '0'), x"00000000000000" & "000" & insn_in(15 downto 11));
+		when SPR =>
+		    -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
+		    -- If it's all 0, we don't treat it as a dependency as slow SPRs
+		    -- operations are single issue.
+		    assert is_fast_spr(ispr) = '1' or ispr = "000000"
+			report "Decode B says SPR but ISPR is invalid:" &
+			to_hstring(ispr) severity failure;
+		    return (is_fast_spr(ispr), ispr, reg_data);
 		when NONE =>
 			return ('0', (others => '0'), (others => '0'));
 		end case;
@@ -101,21 +129,30 @@ architecture behaviour of decode2 is
 	begin
 		case t is
 		when RS =>
-			return ('1', insn_rs(insn_in), reg_data);
+			return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data);
 		when NONE =>
 			return ('0', (others => '0'), (others => '0'));
 		end case;
 	end;
 
-	function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0)) return std_ulogic_vector is
+	function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0);
+				    ispr : gspr_index_t) return decode_output_reg_t is
 	begin
 		case t is
 		when RT =>
-			return insn_rt(insn_in);
+		    return ('1', gpr_to_gspr(insn_rt(insn_in)));
 		when RA =>
-			return insn_ra(insn_in);
+		    return ('1', gpr_to_gspr(insn_ra(insn_in)));
+		when SPR =>
+		    -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
+		    -- If it's all 0, we don't treat it as a dependency as slow SPRs
+		    -- operations are single issue.
+		    assert is_fast_spr(ispr) = '1' or ispr = "000000"
+			report "Decode B says SPR but ISPR is invalid:" &
+			to_hstring(ispr) severity failure;
+		    return (is_fast_spr(ispr), ispr);
 		when NONE =>
-			return "00000";
+		    return ('0', "000000");
 		end case;
 	end;
 
@@ -153,16 +190,16 @@ architecture behaviour of decode2 is
 	signal control_sgl_pipe : std_logic;
 
 	signal gpr_write_valid : std_ulogic;
-	signal gpr_write : std_ulogic_vector(4 downto 0);
+	signal gpr_write : gspr_index_t;
 
 	signal gpr_a_read_valid : std_ulogic;
-	signal gpr_a_read : std_ulogic_vector(4 downto 0);
+	signal gpr_a_read :gspr_index_t;
 
 	signal gpr_b_read_valid : std_ulogic;
-	signal gpr_b_read : std_ulogic_vector(4 downto 0);
+	signal gpr_b_read : gspr_index_t;
 
 	signal gpr_c_read_valid : std_ulogic;
-	signal gpr_c_read : std_ulogic_vector(4 downto 0);
+	signal gpr_c_read : gpr_index_t;
 
 	signal cr_write_valid : std_ulogic;
 begin
@@ -176,6 +213,7 @@ begin
 
 		complete_in => complete_in,
 		valid_in    => control_valid_in,
+		stall_in    => stall_in,
 		flush_in    => flush_in,
 		sgl_pipe_in => control_sgl_pipe,
 		stop_mark_in => d_in.stop_mark,
@@ -210,8 +248,8 @@ begin
 		end if;
 	end process;
 
-	r_out.read1_reg <= insn_ra(d_in.insn);
-	r_out.read2_reg <= insn_rb(d_in.insn);
+	r_out.read1_reg <= gpr_or_spr_to_gspr(insn_ra(d_in.insn), d_in.ispr1);
+	r_out.read2_reg <= gpr_or_spr_to_gspr(insn_rb(d_in.insn), d_in.ispr2);
 	r_out.read3_reg <= insn_rs(d_in.insn);
 
 	c_out.read <= d_in.decode.input_cr;
@@ -223,6 +261,7 @@ begin
 		variable decoded_reg_a : decode_input_reg_t;
 		variable decoded_reg_b : decode_input_reg_t;
 		variable decoded_reg_c : decode_input_reg_t;
+		variable decoded_reg_o : decode_output_reg_t;
                 variable signed_division: std_ulogic;
                 variable length : std_ulogic_vector(3 downto 0);
 	begin
@@ -239,10 +278,11 @@ begin
 		--v.e.input_cr := d_in.decode.input_cr;
 		--v.m.input_cr := d_in.decode.input_cr;
 		--v.e.output_cr := d_in.decode.output_cr;
-
-		decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data);
-		decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data);
+    
+		decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1);
+		decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2);
 		decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data);
+		decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1);
 
 		r_out.read1_enable <= decoded_reg_a.reg_valid;
 		r_out.read2_enable <= decoded_reg_b.reg_valid;
@@ -269,7 +309,7 @@ begin
 		v.e.read_reg2 := decoded_reg_b.reg;
 		v.e.read_data2 := decoded_reg_b.data;
                 v.e.read_data3 := decoded_reg_c.data;
-		v.e.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn);
+		v.e.write_reg := decoded_reg_o.reg;
 		v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
 		v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
 		v.e.cr := c_in.read_cr_data;
@@ -290,7 +330,7 @@ begin
 		v.m.insn_type := d_in.decode.insn_type;
 		mul_a := decoded_reg_a.data;
 		mul_b := decoded_reg_b.data;
-		v.m.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn);
+		v.m.write_reg := gspr_to_gpr(decoded_reg_o.reg);
 		v.m.rc := decode_rc(d_in.decode.rc, d_in.insn);
 		v.m.xerc := c_in.read_xerc_data;
 		if v.m.insn_type = OP_MUL_L64 then
@@ -327,7 +367,7 @@ begin
                 --       s = 1 for signed, 0 for unsigned (for div*)
                 --       t = 1 for 32-bit, 0 for 64-bit
                 --       r = RC bit (record condition code)
-		v.d.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn);
+		v.d.write_reg := gspr_to_gpr(decoded_reg_o.reg);
                 v.d.is_modulus := not d_in.insn(8);
                 v.d.is_32bit := d_in.insn(2);
                 if d_in.insn(8) = '1' then
@@ -364,11 +404,11 @@ begin
 		v.d.oe := decode_oe(d_in.decode.rc, d_in.insn);
 
 		-- load/store unit
-		v.l.update_reg := decoded_reg_a.reg;
+		v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg);
 		v.l.addr1 := decoded_reg_a.data;
 		v.l.addr2 := decoded_reg_b.data;
 		v.l.data := decoded_reg_c.data;
-		v.l.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn);
+		v.l.write_reg := gspr_to_gpr(decoded_reg_o.reg);
 
 		if d_in.decode.insn_type = OP_LOAD then
 			v.l.load := '1';
@@ -386,8 +426,8 @@ begin
 		control_valid_in <= d_in.valid;
 		control_sgl_pipe <= d_in.decode.sgl_pipe;
 
-		gpr_write_valid <= '1' when d_in.decode.output_reg_a /= NONE else '0';
-		gpr_write <= decode_output_reg(d_in.decode.output_reg_a, d_in.insn);
+		gpr_write_valid <= decoded_reg_o.reg_valid;
+		gpr_write <= decoded_reg_o.reg;
 
 		gpr_a_read_valid <= decoded_reg_a.reg_valid;
 		gpr_a_read <= decoded_reg_a.reg;
@@ -396,7 +436,7 @@ begin
 		gpr_b_read <= decoded_reg_b.reg;
 
 		gpr_c_read_valid <= decoded_reg_c.reg_valid;
-		gpr_c_read <= decoded_reg_c.reg;
+		gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg);
 
                 cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
 
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 9736f58..e847fcf 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -21,10 +21,10 @@ package decode_types is
 			 OP_TWI, OP_XOR, OP_SIM_CONFIG
 			 );
 
-    type input_reg_a_t is (NONE, RA, RA_OR_ZERO);
-    type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DS, CONST_M1, CONST_SH, CONST_SH32);
+    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR);
+    type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR);
     type input_reg_c_t is (NONE, RS);
-    type output_reg_a_t is (NONE, RT, RA);
+    type output_reg_a_t is (NONE, RT, RA, SPR);
     type rc_t is (NONE, ONE, RC);
     type carry_in_t is (ZERO, CA, ONE);
 
diff --git a/execute1.vhdl b/execute1.vhdl
index e1ca950..9b14088 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -12,10 +12,11 @@ use work.ppc_fx_insns.all;
 
 entity execute1 is
     port (
-	clk   : in std_logic;
+	clk   : in std_ulogic;
 
 	-- asynchronous
 	flush_out : out std_ulogic;
+	stall_out : out std_ulogic;
 
 	e_in  : in Decode2ToExecute1Type;
 
@@ -32,6 +33,8 @@ end entity execute1;
 architecture behaviour of execute1 is
     type reg_type is record
 	e : Execute1ToWritebackType;
+	lr_update : std_ulogic;
+	next_lr : std_ulogic_vector(63 downto 0);
     end record;
 
     signal r, rin : reg_type;
@@ -125,6 +128,12 @@ begin
 	if rising_edge(clk) then
 	    r <= rin;
 	    ctrl <= ctrl_tmp;
+	    assert not (r.lr_update = '1' and e_in.valid = '1')
+		report "LR update collision with valid in EX1"
+		severity failure;
+	    if r.lr_update = '1' then
+		report "LR update to " & to_hstring(r.next_lr);
+	    end if;
 	end if;
     end process;
 
@@ -190,12 +199,15 @@ begin
 	    v.e.xerc := e_in.xerc;
 	end if;
 
+	v.lr_update := '0';
+
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
 	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
 
 	terminate_out <= '0';
 	icache_inval <= '0';
+	stall_out <= '0';
 	f_out <= Execute1ToFetch1TypeInit;
 
 	-- Next insn adder used in a couple of places
@@ -251,12 +263,15 @@ begin
 		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2));
 		end if;
 	    when OP_BC =>
+		-- read_data1 is CTR
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
 		if bo(4-2) = '0' then
-		    ctrl_tmp.ctr <= std_ulogic_vector(unsigned(ctrl.ctr) - 1);
+		    result := std_ulogic_vector(unsigned(e_in.read_data1) - 1);
+		    result_en := '1';
+		    v.e.write_reg := fast_spr_num(SPR_CTR);
 		end if;
-		if ppc_bc_taken(bo, bi, e_in.cr, ctrl.ctr) = 1 then
+		if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then
 		    f_out.redirect <= '1';
 		    if (insn_aa(e_in.insn)) then
 			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2));
@@ -265,19 +280,18 @@ begin
 		    end if;
 		end if;
 	    when OP_BCREG =>
-                -- bits 10 and 6 distinguish between bclr, bcctr and bctar
+		-- read_data1 is CTR
+		-- read_data2 is target register (CTR, LR or TAR)
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
 		if bo(4-2) = '0' and e_in.insn(10) = '0' then
-		    ctrl_tmp.ctr <= std_ulogic_vector(unsigned(ctrl.ctr) - 1);
+		    result := std_ulogic_vector(unsigned(e_in.read_data1) - 1);
+		    result_en := '1';
+		    v.e.write_reg := fast_spr_num(SPR_CTR);
 		end if;
-		if ppc_bc_taken(bo, bi, e_in.cr, ctrl.ctr) = 1 then
+		if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then
 		    f_out.redirect <= '1';
-		    if e_in.insn(10) = '0' then
-			f_out.redirect_nia <= ctrl.lr(63 downto 2) & "00";
-		    else
-			f_out.redirect_nia <= ctrl.ctr(63 downto 2) & "00";
-		    end if;
+		    f_out.redirect_nia <= e_in.read_data2(63 downto 2) & "00";
 		end if;
 	    when OP_CMPB =>
 		result := ppc_cmpb(e_in.read_data3, e_in.read_data2);
@@ -340,23 +354,24 @@ begin
 		    v.e.write_cr_data(hi downto lo) := newcrf;
 		end loop;
 	    when OP_MFSPR =>
-		case decode_spr_num(e_in.insn) is
-		when SPR_XER =>
-		    result := ( 63-32 => v.e.xerc.so,
-				63-33 => v.e.xerc.ov,
-				63-34 => v.e.xerc.ca,
-				63-44 => v.e.xerc.ov32,
-				63-45 => v.e.xerc.ca32,
-				others => '0');
-		when SPR_CTR =>
-		    result := ctrl.ctr;
-		when SPR_LR =>
-		    result := ctrl.lr;
-		when SPR_TB =>
-		    result := ctrl.tb;
-		when others =>
-		    result := (others => '0');
-		end case;
+		if is_fast_spr(e_in.read_reg1) then
+		    result := e_in.read_data1;
+		    if decode_spr_num(e_in.insn) = SPR_XER then
+			result(63-32) := v.e.xerc.so;
+			result(63-33) := v.e.xerc.ov;
+			result(63-34) := v.e.xerc.ca;
+			result(63-35 downto 63-43) := "000000000";
+			result(63-44) := v.e.xerc.ov32;
+			result(63-45) := v.e.xerc.ca32;
+		    end if;
+		else
+		    case decode_spr_num(e_in.insn) is
+		    when SPR_TB =>
+			result := ctrl.tb;
+		    when others =>
+			result := (others => '0');
+		    end case;
+		end if;
 		result_en := '1';
 	    when OP_MFCR =>
 		if e_in.insn(20) = '0' then
@@ -387,20 +402,25 @@ begin
 		end if;
 		v.e.write_cr_data := e_in.read_data3(31 downto 0);
 	    when OP_MTSPR =>
-		case decode_spr_num(e_in.insn) is
-		when SPR_XER =>
-		    v.e.xerc.so := e_in.read_data3(63-32);
-		    v.e.xerc.ov := e_in.read_data3(63-33);
-		    v.e.xerc.ca := e_in.read_data3(63-34);
-		    v.e.xerc.ov32 := e_in.read_data3(63-44);
-		    v.e.xerc.ca32 := e_in.read_data3(63-45);
-		    v.e.write_xerc_enable := '1';
-		when SPR_CTR =>
-		    ctrl_tmp.ctr <= e_in.read_data3;
-		when SPR_LR =>
-		    ctrl_tmp.lr <= e_in.read_data3;
-		when others =>
-		end case;
+		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
+		    "=" & to_hstring(e_in.read_data3);
+		if is_fast_spr(e_in.write_reg) then
+		    result := e_in.read_data3;
+		    result_en := '1';
+		    if decode_spr_num(e_in.insn) = SPR_XER then
+			v.e.xerc.so := e_in.read_data3(63-32);
+			v.e.xerc.ov := e_in.read_data3(63-33);
+			v.e.xerc.ca := e_in.read_data3(63-34);
+			v.e.xerc.ov32 := e_in.read_data3(63-44);
+			v.e.xerc.ca32 := e_in.read_data3(63-45);
+			v.e.write_xerc_enable := '1';
+		    end if;
+		else
+-- TODO: Implement slow SPRs (this branch is currently empty)
+--		    case decode_spr_num(e_in.insn) is
+--		    when others =>
+--		    end case;
+		end if;
 	    when OP_POPCNTB =>
 		result := ppc_popcntb(e_in.read_data3);
 		result_en := '1';
@@ -444,15 +464,36 @@ begin
 		report "illegal";
 	    end case;
 
+	    -- Update LR on the next cycle after a branch link
+	    --
+	    -- WARNING: The LR update isn't tracked by our hazard tracker. This
+	    --          is expected to work (not yet proven) because it only
+	    --          happens on branches, which will flush all decoded
+	    --          instructions: by the time fetch catches up, we'll have
+	    --          the new LR. It will *not* work properly, however, if we
+	    --          have a branch predictor. In that case the solution
+	    --          would probably be to keep a local cache of the updated
+	    --          LR in execute1 (flushed on exceptions) that is used
+	    --          instead of the value from decode when its content is valid.
 	    if e_in.lr = '1' then
-		ctrl_tmp.lr <= next_nia;
+		v.lr_update := '1';
+		v.next_lr := next_nia;
+		v.e.valid := '0';
+		report "Delayed LR update to " & to_hstring(next_nia);
+		stall_out <= '1';
 	    end if;
-
+	elsif r.lr_update = '1' then
+	    result_en := '1';
+	    result := r.next_lr;
+	    v.e.write_reg := fast_spr_num(SPR_LR);
+	    v.e.write_len := x"8";
+	    v.e.sign_extend := '0';
+	    v.e.valid := '1';
 	end if;
 
 	v.e.write_data := result;
 	v.e.write_enable := result_en;
-	v.e.rc := e_in.rc;
+	v.e.rc := e_in.rc and e_in.valid;
 
 	-- Update registers
 	rin <= v;
diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl
index 6c8614b..705e69d 100644
--- a/gpr_hazard.vhdl
+++ b/gpr_hazard.vhdl
@@ -7,12 +7,13 @@ entity gpr_hazard is
         PIPELINE_DEPTH : natural := 2
         );
     port(
-        clk                : in std_logic;
+        clk                : in std_ulogic;
+	stall_in           : in std_ulogic;
 
         gpr_write_valid_in : in std_ulogic;
-        gpr_write_in       : in std_ulogic_vector(4 downto 0);
+        gpr_write_in       : in std_ulogic_vector(5 downto 0);
         gpr_read_valid_in  : in std_ulogic;
-        gpr_read_in        : in std_ulogic_vector(4 downto 0);
+        gpr_read_in        : in std_ulogic_vector(5 downto 0);
 
         stall_out          : out std_ulogic
         );
@@ -20,7 +21,7 @@ end entity gpr_hazard;
 architecture behaviour of gpr_hazard is
     type pipeline_entry_type is record
         valid : std_ulogic;
-        gpr   : std_ulogic_vector(4 downto 0);
+        gpr   : std_ulogic_vector(5 downto 0);
     end record;
     constant pipeline_entry_init : pipeline_entry_type := (valid => '0', gpr => (others => '0'));
 
@@ -32,7 +33,9 @@ begin
     gpr_hazard0: process(clk)
     begin
         if rising_edge(clk) then
-            r <= rin;
+	    if stall_in = '0' then
+		r <= rin;
+	    end if;
         end if;
     end process;
 
diff --git a/ppc_fx_insns.vhdl b/ppc_fx_insns.vhdl
index 3b03dc2..0bf011d 100644
--- a/ppc_fx_insns.vhdl
+++ b/ppc_fx_insns.vhdl
@@ -94,7 +94,6 @@ package ppc_fx_insns is
 	function ppc_divwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
 
 	function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer;
-	function ppc_bcctr_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0)) return integer;
 end package ppc_fx_insns;
 
 package body ppc_fx_insns is
@@ -809,21 +808,4 @@ package body ppc_fx_insns is
 		return ret;
 	end;
 
-	function ppc_bcctr_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0)) return integer is
-		variable crfield: integer;
-		variable crbit_match: std_ulogic;
-		variable cond_ok: std_ulogic;
-		variable ret: integer;
-	begin
-		crfield := to_integer(unsigned(bi));
-		-- BE bit numbering
-		crbit_match := '1' when cr(31-crfield) = bo(4-1) else '0';
-		cond_ok := bo(4-0) or crbit_match;
-		if cond_ok = '1' then
-			ret := 1;
-		else
-			ret := 0;
-		end if;
-		return ret;
-	end;
 end package body ppc_fx_insns;
diff --git a/register_file.vhdl b/register_file.vhdl
index 669093b..952d9fc 100644
--- a/register_file.vhdl
+++ b/register_file.vhdl
@@ -23,7 +23,7 @@ entity register_file is
 end entity register_file;
 
 architecture behaviour of register_file is
-    type regfile is array(0 to 31) of std_ulogic_vector(63 downto 0);
+    type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0);
     signal registers : regfile := (others => (others => '0'));
 begin
     -- synchronous writes
@@ -32,7 +32,11 @@ begin
         if rising_edge(clk) then
             if w_in.write_enable = '1' then
                 assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
-                report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
+		if w_in.write_reg(5) = '0' then
+		    report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
+		else
+		    report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
+		end if;
                 registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data;
             end if;
         end if;
@@ -52,7 +56,7 @@ begin
         end if;
         d_out.read1_data <= registers(to_integer(unsigned(d_in.read1_reg)));
         d_out.read2_data <= registers(to_integer(unsigned(d_in.read2_reg)));
-        d_out.read3_data <= registers(to_integer(unsigned(d_in.read3_reg)));
+        d_out.read3_data <= registers(to_integer(unsigned(gpr_to_gspr(d_in.read3_reg))));
 
         -- Forward any written data
         if w_in.write_enable = '1' then
@@ -62,7 +66,7 @@ begin
             if d_in.read2_reg = w_in.write_reg then
                 d_out.read2_data <= w_in.write_data;
             end if;
-            if d_in.read3_reg = w_in.write_reg then
+            if gpr_to_gspr(d_in.read3_reg) = w_in.write_reg then
                 d_out.read3_data <= w_in.write_data;
             end if;
         end if;
diff --git a/writeback.vhdl b/writeback.vhdl
index 545e931..b88277e 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -126,7 +126,7 @@ begin
 	end if;
 
 	if l_in.write_enable = '1' then
-            w_out.write_reg <= l_in.write_reg;
+            w_out.write_reg <= gpr_to_gspr(l_in.write_reg);
             data_in <= l_in.write_data;
             data_len <= unsigned(l_in.write_len);
             byte_offset <= unsigned(l_in.write_shift);
@@ -144,7 +144,7 @@ begin
 
         if m_in.write_reg_enable = '1' then
             w_out.write_enable <= '1';
-            w_out.write_reg <= m_in.write_reg_nr;
+            w_out.write_reg <= gpr_to_gspr(m_in.write_reg_nr);
             data_in <= m_in.write_reg_data;
             rc <= m_in.rc;
 	    xe := m_in.xerc;
@@ -157,7 +157,7 @@ begin
 
         if d_in.write_reg_enable = '1' then
             w_out.write_enable <= '1';
-            w_out.write_reg <= d_in.write_reg_nr;
+            w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr);
             data_in <= d_in.write_reg_data;
             rc <= d_in.rc;
 	    xe := d_in.xerc;

From 23ade0b1c3bf2cb53d3ae7dd80099e5ec90a5efc Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 5 Dec 2019 12:42:31 +1100
Subject: [PATCH 9/9] decode2: Minor cleanup

Remove unused variable is_reg in decode_input_reg_a.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 decode2.vhdl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/decode2.vhdl b/decode2.vhdl
index 8a2d970..f6f7101 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -60,10 +60,7 @@ architecture behaviour of decode2 is
 	function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0);
 				     reg_data : std_ulogic_vector(63 downto 0);
 				     ispr : gspr_index_t) return decode_input_reg_t is
-		variable is_reg : std_ulogic;
 	begin
-		is_reg := '0' when insn_ra(insn_in) = "00000" else '1';
-
 		if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then
 		    assert is_fast_spr(ispr) = '0' report "Decode A says GPR but ISPR says SPR:" &
 			to_hstring(ispr) severity failure;