From b55c9cc2987d30974adb06d2130ad774944252fd Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 28 Apr 2020 20:28:20 +1000
Subject: [PATCH 01/10] execute1: Improve architecture compliance of MSR and
 related instructions

This makes our treatment of the MSR conform better with the ISA.

- On reset, initialize the MSR to have the SF and LE bits set and
  all the others reset.  For good measure initialize r properly too.

- Fix the bit numbering in msr_copy (the code was using big-endian
  bit numbers, not little-endian).

- Use constants like MSR_EE to index MSR bits instead of expressions
  like '63 - 48', for readability.

- Set MSR[SF, LE] and clear MSR[PR, IR, DR, RI] on interrupts.

- Copy the relevant fields for rfid instead of using msr_copy, because
  the partial function fields of the MSR should be left unchanged,
  not zeroed.  Our implementation of rfid is like the architecture
  description of hrfid, because we don't implement hypervisor mode.

- Return the whole MSR for mfmsr.

- Implement the L field for mtmsrd (L=1 copies just EE and RI).

- For mtmsrd with L=0, leave out the HV, ME and LE bits as per the arch.

- For mtmsrd and rfid, if PR ends up set, then also set EE, IR and DR
  as per the arch.

- A few other minor tidy-ups (no semantic change).

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl   |  9 ++++++
 execute1.vhdl | 83 ++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 71 insertions(+), 21 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 9f6e96d..9041d32 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -7,6 +7,15 @@ use work.decode_types.all;
 
 package common is
 
+    -- MSR bit numbers
+    constant MSR_SF  : integer := (63 - 0);     -- Sixty-Four bit mode
+    constant MSR_EE  : integer := (63 - 48);    -- External interrupt Enable
+    constant MSR_PR  : integer := (63 - 49);    -- PRoblem state
+    constant MSR_IR  : integer := (63 - 58);    -- Instruction Relocation
+    constant MSR_DR  : integer := (63 - 59);    -- Data Relocation
+    constant MSR_RI  : integer := (63 - 62);    -- Recoverable Interrupt
+    constant MSR_LE  : integer := (63 - 63);    -- Little Endian
+
     -- SPR numbers
     subtype spr_num_t is integer range 0 to 1023;
 
diff --git a/execute1.vhdl b/execute1.vhdl
index 2c0a558..0f4eea9 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -48,6 +48,11 @@ architecture behaviour of execute1 is
 	slow_op_oe : std_ulogic;
 	slow_op_xerc : xer_common_t;
     end record;
+    constant reg_type_init : reg_type :=
+        (e => Execute1ToWritebackInit, lr_update => '0',
+         mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0',
+         slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
+         others => (others => '0'));
 
     signal r, rin : reg_type;
 
@@ -124,11 +129,11 @@ architecture behaviour of execute1 is
 	--  tion MSR bits are not saved or restored.
 	--  Full function MSR bits lie in the range 0:32, 37:41, and
 	--  48:63, and partial function MSR bits lie in the range
-	--  33:36 and 42:47.
+	--  33:36 and 42:47. (Note this is IBM bit numbering).
 	msr_out := (others => '0');
-	msr_out(32 downto 0) := msr(32 downto 0);
-	msr_out(41 downto 37) := msr(41 downto 37);
-	msr_out(63 downto 48) := msr(63 downto 48);
+	msr_out(63 downto 31) := msr(63 downto 31);
+	msr_out(26 downto 22) := msr(26 downto 22);
+	msr_out(15 downto 0)  := msr(15 downto 0);
 	return msr_out;
     end;
 
@@ -193,14 +198,20 @@ begin
     execute1_0: process(clk)
     begin
 	if rising_edge(clk) then
-	    r <= rin;
-	    ctrl <= ctrl_tmp;
-	    assert not (r.lr_update = '1' and e_in.valid = '1')
-		report "LR update collision with valid in EX1"
-		severity failure;
-	    if r.lr_update = '1' then
-		report "LR update to " & to_hstring(r.next_lr);
-	    end if;
+            if rst = '1' then
+                r <= reg_type_init;
+                ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0');
+                ctrl.irq_state <= WRITE_SRR0;
+            else
+                r <= rin;
+                ctrl <= ctrl_tmp;
+                assert not (r.lr_update = '1' and e_in.valid = '1')
+                    report "LR update collision with valid in EX1"
+                    severity failure;
+                if r.lr_update = '1' then
+                    report "LR update to " & to_hstring(r.next_lr);
+                end if;
+            end if;
 	end if;
     end process;
 
@@ -370,7 +381,7 @@ begin
 	ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1);
 
 	irq_valid := '0';
-	if ctrl.msr(63 - 48) = '1' and ctrl.dec(63) = '1' then
+	if ctrl.msr(MSR_EE) = '1' and ctrl.dec(63) = '1' then
 	    report "IRQ valid";
 	    irq_valid := '1';
 	end if;
@@ -400,7 +411,13 @@ begin
 	    v.e.exc_write_reg := fast_spr_num(SPR_SRR1);
 	    v.e.exc_write_data := ctrl.srr1;
             v.e.exc_write_enable := '1';
-	    ctrl_tmp.msr(63 - 48) <= '0'; -- clear EE
+            ctrl_tmp.msr(MSR_SF) <= '1';
+            ctrl_tmp.msr(MSR_EE) <= '0';
+            ctrl_tmp.msr(MSR_PR) <= '0';
+            ctrl_tmp.msr(MSR_IR) <= '0';
+            ctrl_tmp.msr(MSR_DR) <= '0';
+            ctrl_tmp.msr(MSR_RI) <= '0';
+            ctrl_tmp.msr(MSR_LE) <= '1';
 	    f_out.redirect <= '1';
 	    f_out.redirect_nia <= ctrl.irq_nia;
 	    v.e.valid := e_in.valid;
@@ -545,7 +562,7 @@ begin
 	    when OP_B =>
 		f_out.redirect <= '1';
 		if (insn_aa(e_in.insn)) then
-		    f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
+		    f_out.redirect_nia <= b_in;
 		else
 		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
 		end if;
@@ -561,7 +578,7 @@ begin
 		if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
 		    f_out.redirect <= '1';
 		    if (insn_aa(e_in.insn)) then
-			f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
+			f_out.redirect_nia <= b_in;
 		    else
 			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
 		    end if;
@@ -584,7 +601,17 @@ begin
 	    when OP_RFID =>
 		f_out.redirect <= '1';
 		f_out.redirect_nia <= a_in(63 downto 2) & "00"; -- srr0
-		ctrl_tmp.msr <= msr_copy(std_ulogic_vector(signed(b_in))); -- srr1
+                -- Can't use msr_copy here because the partial function MSR
+                -- bits should be left unchanged, not zeroed.
+                ctrl_tmp.msr(63 downto 31) <= b_in(63 downto 31);
+                ctrl_tmp.msr(26 downto 22) <= b_in(26 downto 22);
+                ctrl_tmp.msr(15 downto 0)  <= b_in(15 downto 0);
+                if b_in(MSR_PR) = '1' then
+                    ctrl_tmp.msr(MSR_EE) <= '1';
+                    ctrl_tmp.msr(MSR_IR) <= '1';
+                    ctrl_tmp.msr(MSR_DR) <= '1';
+                end if;
+
 	    when OP_CMPB =>
 		result := ppc_cmpb(c_in, b_in);
 		result_en := '1';
@@ -658,7 +685,7 @@ begin
 		    end loop;
 		end if;
 	    when OP_MFMSR =>
-		result := msr_copy(ctrl.msr);
+		result := ctrl.msr;
 		result_en := '1';
 	    when OP_MFSPR =>
 		report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
@@ -714,9 +741,23 @@ begin
 		    v.e.write_cr_mask := num_to_fxm(crnum);
 		end if;
 		v.e.write_cr_data := c_in(31 downto 0);
-	    when OP_MTMSRD =>
-		-- FIXME handle just the bits we need to.
-		ctrl_tmp.msr <= msr_copy(c_in);
+            when OP_MTMSRD =>
+                if e_in.insn(16) = '1' then
+                    -- just update EE and RI
+                    ctrl_tmp.msr(MSR_EE) <= c_in(MSR_EE);
+                    ctrl_tmp.msr(MSR_RI) <= c_in(MSR_RI);
+                else
+                    -- Architecture says to leave out bits 3 (HV), 51 (ME)
+                    -- and 63 (LE) (IBM bit numbering)
+                    ctrl_tmp.msr(63 downto 61) <= c_in(63 downto 61);
+                    ctrl_tmp.msr(59 downto 13) <= c_in(59 downto 13);
+                    ctrl_tmp.msr(11 downto 1)  <= c_in(11 downto 1);
+                    if c_in(MSR_PR) = '1' then
+                        ctrl_tmp.msr(MSR_EE) <= '1';
+                        ctrl_tmp.msr(MSR_IR) <= '1';
+                        ctrl_tmp.msr(MSR_DR) <= '1';
+                    end if;
+                end if;
 	    when OP_MTSPR =>
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
 		    "=" & to_hstring(c_in);

From 74db0710678d4871843a783edfa602ed621c91d1 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 28 Apr 2020 19:38:58 +1000
Subject: [PATCH 02/10] execute1: Generate privileged instruction interrupts
 when MSR[PR] = 1

This adds logic to execute1 to check, when MSR[PR] = 1, whether each
instruction arriving to be executed is a privileged instruction.
If it is, a privileged-instruction type program interrupt is generated.
For the mtspr and mfspr instructions, we need to look at bit 20 of the
instruction (bit 4 of the SPR number) to determine if the SPR is
privileged.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 execute1.vhdl | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/execute1.vhdl b/execute1.vhdl
index 0f4eea9..9153b37 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -76,6 +76,28 @@ architecture behaviour of execute1 is
     signal x_to_divider: Execute1ToDividerType;
     signal divider_to_x: DividerToExecute1Type;
 
+    type privilege_level is (USER, SUPER);
+    type op_privilege_array is array(insn_type_t) of privilege_level;
+    constant op_privilege: op_privilege_array := (
+        OP_ATTN => SUPER,
+        OP_MFMSR => SUPER,
+        OP_MTMSRD => SUPER,
+        OP_RFID => SUPER,
+        others => USER
+        );
+
+    function instr_is_privileged(op: insn_type_t; insn: std_ulogic_vector(31 downto 0))
+        return boolean is
+    begin
+        if op_privilege(op) = SUPER then
+            return true;
+        elsif op = OP_MFSPR or op = OP_MTSPR then
+            return insn(20) = '1';
+        else
+            return false;
+        end if;
+    end;
+
     procedure set_carry(e: inout Execute1ToWritebackType;
 			carry32 : in std_ulogic;
 			carry : in std_ulogic) is
@@ -432,6 +454,16 @@ begin
 	    ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#900#, 64));
 	    ctrl_tmp.srr1 <= msr_copy(ctrl.msr);
 
+        elsif e_in.valid = '1' and ctrl.msr(MSR_PR) = '1' and
+            instr_is_privileged(e_in.insn_type, e_in.insn) then
+            -- generate a program interrupt
+            exception := '1';
+            ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#700#, 64));
+            ctrl_tmp.srr1 <= msr_copy(ctrl.msr);
+            -- set SRR1 bit 45 (IBM numbering): privileged instruction interrupt
+            ctrl_tmp.srr1(63 - 45) <= '1';
+            report "privileged instruction";
+            
 	elsif e_in.valid = '1' then
 
 	    v.e.valid := '1';

From 167e37d6675136d26acdb6f7aba0a7f7ad1e60d8 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 3 Apr 2020 14:50:17 +1100
Subject: [PATCH 03/10] Plumb insn_type through to loadstore1

In preparation for adding a TLB to the dcache, this plumbs the
insn_type from execute1 through to loadstore1, so that we can have
other operations besides loads and stores (e.g. tlbie) going to
loadstore1 and thence to the dcache.  This also plumbs the unit field
of the decode ROM from decode2 through to execute1 to simplify the
logic around which ops need to go to loadstore1.

The load and store data formatting is no longer conditional on the
op being OP_LOAD or OP_STORE.  This eliminates the inferred latches
clocked by each of the bits of r.op that we were getting previously.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 Makefile        |   2 +-
 common.vhdl     |   7 +--
 decode2.vhdl    |   1 +
 execute1.vhdl   |  19 ++++----
 loadstore1.vhdl | 116 ++++++++++++++++++++++++------------------------
 5 files changed, 74 insertions(+), 71 deletions(-)

diff --git a/Makefile b/Makefile
index 8c3133d..c09696a 100644
--- a/Makefile
+++ b/Makefile
@@ -58,7 +58,7 @@ icache_tb.o: common.o wishbone_types.o icache.o wishbone_bram_wrapper.o
 dcache.o: utils.o common.o wishbone_types.o plru.o cache_ram.o utils.o
 dcache_tb.o: common.o wishbone_types.o dcache.o wishbone_bram_wrapper.o
 insn_helpers.o:
-loadstore1.o: common.o helpers.o
+loadstore1.o: common.o helpers.o decode_types.o
 logical.o: decode_types.o
 multiply_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o multiply.o
 multiply.o: common.o decode_types.o
diff --git a/common.vhdl b/common.vhdl
index 9041d32..65e40c1 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -118,6 +118,7 @@ package common is
 
     type Decode2ToExecute1Type is record
 	valid: std_ulogic;
+        unit : unit_t;
 	insn_type: insn_type_t;
 	nia: std_ulogic_vector(63 downto 0);
 	write_reg: gspr_index_t;
@@ -150,7 +151,7 @@ package common is
         reserve : std_ulogic;                           -- set for larx/stcx
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
-	(valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
+	(valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
          lr => '0', rc => '0', oe => '0', invert_a => '0',
 	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
 	 is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0',
@@ -213,7 +214,7 @@ package common is
 
     type Execute1ToLoadstore1Type is record
 	valid : std_ulogic;
-	load : std_ulogic;				-- is this a load or store
+        op : insn_type_t;                               -- what ld/st op to do
 	addr1 : std_ulogic_vector(63 downto 0);
 	addr2 : std_ulogic_vector(63 downto 0);
 	data : std_ulogic_vector(63 downto 0);		-- data to write, unused for read
@@ -228,7 +229,7 @@ package common is
         reserve : std_ulogic;                           -- set for larx/stcx.
         rc : std_ulogic;                                -- set for stcx.
     end record;
-    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', ci => '0', byte_reverse => '0',
+    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
                                                                      sign_extend => '0', update => '0', xerc => xerc_init,
                                                                      reserve => '0', rc => '0', others => (others => '0'));
 
diff --git a/decode2.vhdl b/decode2.vhdl
index ff773aa..edcc50c 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -304,6 +304,7 @@ begin
 
 		-- execute unit
 		v.e.nia := d_in.nia;
+                v.e.unit := d_in.decode.unit;
 		v.e.insn_type := d_in.decode.insn_type;
 		v.e.read_reg1 := decoded_reg_a.reg;
 		v.e.read_data1 := decoded_reg_a.data;
diff --git a/execute1.vhdl b/execute1.vhdl
index 9153b37..abd4a18 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -464,7 +464,7 @@ begin
             ctrl_tmp.srr1(63 - 45) <= '1';
             report "privileged instruction";
             
-	elsif e_in.valid = '1' then
+	elsif e_in.valid = '1' and e_in.unit = ALU then
 
 	    v.e.valid := '1';
 	    v.e.write_reg := e_in.write_reg;
@@ -844,11 +844,6 @@ begin
 		stall_out <= '1';
 		x_to_divider.valid <= '1';
 
-            when OP_LOAD | OP_STORE =>
-                -- loadstore/dcache has its own port to writeback
-                v.e.valid := '0';
-                lv.valid := '1';
-
             when others =>
 		terminate_out <= '1';
 		report "illegal";
@@ -874,6 +869,14 @@ begin
 		report "Delayed LR update to " & to_hstring(next_nia);
 		stall_out <= '1';
 	    end if;
+
+        elsif e_in.valid = '1' then
+            -- instruction for other units, i.e. LDST
+            v.e.valid := '0';
+            if e_in.unit = LDST then
+                lv.valid := '1';
+            end if;
+
 	elsif r.lr_update = '1' then
 	    result_en := '1';
 	    result := r.next_lr;
@@ -940,9 +943,7 @@ begin
 	v.e.write_enable := result_en;
 
         -- Outputs to loadstore1 (async)
-        if e_in.insn_type = OP_LOAD then
-            lv.load := '1';
-        end if;
+        lv.op := e_in.insn_type;
         lv.addr1 := a_in;
         lv.addr2 := b_in;
         lv.data := c_in;
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 518feee..664e396 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -3,6 +3,7 @@ use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
 
 library work;
+use work.decode_types.all;
 use work.common.all;
 use work.helpers.all;
 
@@ -41,7 +42,7 @@ architecture behave of loadstore1 is
 
     type reg_stage_t is record
         -- latch most of the input request
-	load         : std_ulogic;
+        load         : std_ulogic;
 	addr         : std_ulogic_vector(63 downto 0);
 	store_data   : std_ulogic_vector(63 downto 0);
 	load_data    : std_ulogic_vector(63 downto 0);
@@ -146,59 +147,60 @@ begin
         two_dwords := or (r.second_bytes);
 
         -- load data formatting
-        if r.load = '1' then
-            byte_offset := unsigned(r.addr(2 downto 0));
-            brev_lenm1 := "000";
-            if r.byte_reverse = '1' then
-                brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
-            end if;
+        byte_offset := unsigned(r.addr(2 downto 0));
+        brev_lenm1 := "000";
+        if r.byte_reverse = '1' then
+            brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
+        end if;
 
-            -- shift and byte-reverse data bytes
-            for i in 0 to 7 loop
-                kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
-                use_second(i) := kk(3);
-                j := to_integer(kk(2 downto 0)) * 8;
-                data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
-            end loop;
-
-            -- Work out the sign bit for sign extension.
-            -- Assumes we are not doing both sign extension and byte reversal,
-            -- in that for unaligned loads crossing two dwords we end up
-            -- using a bit from the second dword, whereas for a byte-reversed
-            -- (i.e. big-endian) load the sign bit would be in the first dword.
-            negative := (r.length(3) and data_permuted(63)) or
-                        (r.length(2) and data_permuted(31)) or
-                        (r.length(1) and data_permuted(15)) or
-                        (r.length(0) and data_permuted(7));
-
-            -- trim and sign-extend
-            for i in 0 to 7 loop
-                if i < to_integer(unsigned(r.length)) then
-                    if two_dwords = '1' then
-                        trim_ctl(i) := '1' & not use_second(i);
-                    else
-                        trim_ctl(i) := not use_second(i) & '0';
-                    end if;
+        -- shift and byte-reverse data bytes
+        for i in 0 to 7 loop
+            kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
+            use_second(i) := kk(3);
+            j := to_integer(kk(2 downto 0)) * 8;
+            data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
+        end loop;
+
+        -- Work out the sign bit for sign extension.
+        -- Assumes we are not doing both sign extension and byte reversal,
+        -- in that for unaligned loads crossing two dwords we end up
+        -- using a bit from the second dword, whereas for a byte-reversed
+        -- (i.e. big-endian) load the sign bit would be in the first dword.
+        negative := (r.length(3) and data_permuted(63)) or
+                    (r.length(2) and data_permuted(31)) or
+                    (r.length(1) and data_permuted(15)) or
+                    (r.length(0) and data_permuted(7));
+
+        -- trim and sign-extend
+        for i in 0 to 7 loop
+            if i < to_integer(unsigned(r.length)) then
+                if two_dwords = '1' then
+                    trim_ctl(i) := '1' & not use_second(i);
                 else
-                    trim_ctl(i) := '0' & (negative and r.sign_extend);
+                    trim_ctl(i) := not use_second(i) & '0';
                 end if;
-                case trim_ctl(i) is
-                    when "11" =>
-                        data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
-                    when "10" =>
-                        data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
-                    when "01" =>
-                        data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
-                    when others =>
-                        data_trimmed(i * 8 + 7 downto i * 8) := x"00";
-                end case;
-            end loop;
-        end if;
+            else
+                trim_ctl(i) := '0' & (negative and r.sign_extend);
+            end if;
+            case trim_ctl(i) is
+                when "11" =>
+                    data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
+                when "10" =>
+                    data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
+                when "01" =>
+                    data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
+                when others =>
+                    data_trimmed(i * 8 + 7 downto i * 8) := x"00";
+            end case;
+        end loop;
 
         case r.state is
         when IDLE =>
             if l_in.valid = '1' then
-                v.load := l_in.load;
+                v.load := '0';
+                if l_in.op = OP_LOAD then
+                    v.load := '1';
+                end if;
                 v.addr := lsu_sum;
                 v.write_reg := l_in.write_reg;
                 v.length := l_in.length;
@@ -229,18 +231,16 @@ begin
                 v.addr := lsu_sum;
 
                 -- Do byte reversing and rotating for stores in the first cycle
-                if v.load = '0' then
-                    byte_offset := unsigned(lsu_sum(2 downto 0));
-                    brev_lenm1 := "000";
-                    if l_in.byte_reverse = '1' then
-                        brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
-                    end if;
-                    for i in 0 to 7 loop
-                        k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
-                        j := to_integer(k) * 8;
-                        v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
-                    end loop;
+                byte_offset := unsigned(lsu_sum(2 downto 0));
+                brev_lenm1 := "000";
+                if l_in.byte_reverse = '1' then
+                    brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
                 end if;
+                for i in 0 to 7 loop
+                    k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
+                    j := to_integer(k) * 8;
+                    v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
+                end loop;
 
                 req := '1';
                 stall := '1';

From 041d6bef60956849364c1540e7eecb6fdca77497 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 28 Apr 2020 18:11:52 +1000
Subject: [PATCH 04/10] dcache: Implement the dcbz instruction

This adds logic to dcache and loadstore1 to implement dcbz.  For now
it zeroes a single cache line (by default 64 bytes), not 128 bytes
like IBM Power processors do.

The dcbz operation is performed much like a load miss, except that
we are writing zeroes to memory instead of reading.  As each ack
comes back, we write zeroes to the BRAM instead of data from memory.
In this way we zero the line in memory and also zero the line of
cache memory, establishing the line in the cache if it wasn't already
resident.  If it was already resident then we overwrite the existing
line in the cache.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 common.vhdl     |  1 +
 dcache.vhdl     | 66 +++++++++++++++++++++++++++++++++++++++----------
 decode1.vhdl    |  2 +-
 loadstore1.vhdl |  5 ++++
 4 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/common.vhdl b/common.vhdl
index 65e40c1..61252bd 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -236,6 +236,7 @@ package common is
     type Loadstore1ToDcacheType is record
 	valid : std_ulogic;
 	load : std_ulogic;
+        dcbz : std_ulogic;
 	nc : std_ulogic;
         reserve : std_ulogic;
 	addr : std_ulogic_vector(63 downto 0);
diff --git a/dcache.vhdl b/dcache.vhdl
index 7e553bf..550298b 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -581,8 +581,12 @@ begin
 		wr_data  <= r0.data;
 		wr_sel   <= r0.byte_sel;
 	    else
-		-- Otherwise, we might be doing a reload
-		wr_data <= wishbone_in.dat;
+		-- Otherwise, we might be doing a reload or a DCBZ
+                if r1.req.dcbz = '1' then
+                    wr_data <= (others => '0');
+                else
+                    wr_data <= wishbone_in.dat;
+                end if;
 		wr_sel  <= (others => '1');
 		wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));
 	    end if;
@@ -718,18 +722,54 @@ begin
 			r1.wb.we <= '0';
 			r1.state <= NC_LOAD_WAIT_ACK;
 
-		    when OP_STORE_HIT | OP_STORE_MISS =>
-                        r1.wb.sel <= r0.byte_sel;
-                        r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
-			r1.wb.dat <= r0.data;
-                        if cancel_store = '0' then
+                    when OP_STORE_HIT | OP_STORE_MISS =>
+                        if r0.dcbz = '0' then
+                            r1.wb.sel <= r0.byte_sel;
+                            r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000";
+                            r1.wb.dat <= r0.data;
+                            if cancel_store = '0' then
+                                r1.wb.cyc <= '1';
+                                r1.wb.stb <= '1';
+                                r1.wb.we <= '1';
+                                r1.state <= STORE_WAIT_ACK;
+                            else
+                                r1.stcx_fail <= '1';
+                                r1.state <= IDLE;
+                            end if;
+                        else
+                            -- dcbz is handled much like a load miss except
+                            -- that we are writing to memory instead of reading
+                            r1.store_index <= req_index;
+                            r1.store_row <= get_row(req_laddr);
+
+                            if req_op = OP_STORE_HIT then
+                                r1.store_way <= req_hit_way;
+                            else
+                                r1.store_way <= replace_way;
+
+                                -- Force misses on the victim way while zeroing
+                                cache_valids(req_index)(replace_way) <= '0';
+
+                                -- Store new tag in selected way
+                                for i in 0 to NUM_WAYS-1 loop
+                                    if i = replace_way then
+                                        tagset := cache_tags(req_index);
+                                        write_tag(i, tagset, req_tag);
+                                        cache_tags(req_index) <= tagset;
+                                    end if;
+                                end loop;
+                            end if;
+
+                            -- Set up for wishbone writes
+                            r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0);
+                            r1.wb.sel <= (others => '1');
+                            r1.wb.we <= '1';
+                            r1.wb.dat <= (others => '0');
                             r1.wb.cyc <= '1';
                             r1.wb.stb <= '1';
-                            r1.wb.we <= '1';
-                            r1.state <= STORE_WAIT_ACK;
-                        else
-                            r1.stcx_fail <= '1';
-                            r1.state <= IDLE;
+
+                            -- Handle the rest like a load miss
+                            r1.state <= RELOAD_WAIT_ACK;
                         end if;
 
 		    -- OP_NONE and OP_BAD do nothing
@@ -766,7 +806,7 @@ begin
 			-- not idle, which we don't currently know how to deal
 			-- with.
 			--
-			if r1.store_row = get_row(r1.req.addr) then
+			if r1.store_row = get_row(r1.req.addr) and r1.req.dcbz = '0' then
 			    r1.slow_data <= wishbone_in.dat;
 			end if;
 
diff --git a/decode1.vhdl b/decode1.vhdl
index 8c7d5f2..785b669 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -164,7 +164,7 @@ architecture behaviour of decode1 is
 		2#0000110110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst
 		2#0100010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt
 		2#0011110110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst
-		-- 2#1111110110# dcbz
+		2#1111110110#  =>       (LDST,   OP_DCBZ,      RA_OR_ZERO, RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- dcbz
 		2#0110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeu
 		2#1110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeuo
 		2#0110001011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divweu
diff --git a/loadstore1.vhdl b/loadstore1.vhdl
index 664e396..90650db 100644
--- a/loadstore1.vhdl
+++ b/loadstore1.vhdl
@@ -43,6 +43,7 @@ architecture behave of loadstore1 is
     type reg_stage_t is record
         -- latch most of the input request
         load         : std_ulogic;
+        dcbz         : std_ulogic;
 	addr         : std_ulogic_vector(63 downto 0);
 	store_data   : std_ulogic_vector(63 downto 0);
 	load_data    : std_ulogic_vector(63 downto 0);
@@ -198,8 +199,11 @@ begin
         when IDLE =>
             if l_in.valid = '1' then
                 v.load := '0';
+                v.dcbz := '0';
                 if l_in.op = OP_LOAD then
                     v.load := '1';
+                elsif l_in.op = OP_DCBZ then
+                    v.dcbz := '1';
                 end if;
                 v.addr := lsu_sum;
                 v.write_reg := l_in.write_reg;
@@ -293,6 +297,7 @@ begin
         -- Update outputs to dcache
         d_out.valid <= req;
         d_out.load <= v.load;
+        d_out.dcbz <= v.dcbz;
         d_out.nc <= v.nc;
         d_out.reserve <= v.reserve;
         d_out.addr <= addr;

From 10f4be4309667ba5fa42c52edbe5132607cbdcbb Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 29 Apr 2020 09:09:23 +1000
Subject: [PATCH 05/10] tests: Add a test for privileged instruction interrupts

This adds a test that tries to execute various privileged instructions
with MSR[PR] = 1.  This also incidentally tests some of the MSR bit
manipulations.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 tests/privileged/Makefile         |   3 +
 tests/privileged/head.S           |  91 ++++++++++++++++++
 tests/privileged/powerpc.lds      |  13 +++
 tests/privileged/privileged.c     | 152 ++++++++++++++++++++++++++++++
 tests/test_privileged.bin         | Bin 0 -> 9900 bytes
 tests/test_privileged.console_out |   6 ++
 tests/update_console_tests        |   2 +-
 7 files changed, 266 insertions(+), 1 deletion(-)
 create mode 100644 tests/privileged/Makefile
 create mode 100644 tests/privileged/head.S
 create mode 100644 tests/privileged/powerpc.lds
 create mode 100644 tests/privileged/privileged.c
 create mode 100755 tests/test_privileged.bin
 create mode 100644 tests/test_privileged.console_out

diff --git a/tests/privileged/Makefile b/tests/privileged/Makefile
new file mode 100644
index 0000000..7c24998
--- /dev/null
+++ b/tests/privileged/Makefile
@@ -0,0 +1,3 @@
+TEST=privileged
+
+include ../Makefile.test
diff --git a/tests/privileged/head.S b/tests/privileged/head.S
new file mode 100644
index 0000000..9b76234
--- /dev/null
+++ b/tests/privileged/head.S
@@ -0,0 +1,91 @@
+/* Copyright 2013-2014 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define STACK_TOP 0x8000
+
+/* Load an immediate 64-bit value into a register */
+#define LOAD_IMM64(r, e)			\
+	lis     r,(e)@highest;			\
+	ori     r,r,(e)@higher;			\
+	rldicr  r,r, 32, 31;			\
+	oris    r,r, (e)@h;			\
+	ori     r,r, (e)@l;
+
+	.section ".head","ax"
+
+	/*
+	 * Microwatt currently starts executing in LE mode at address 0x0,
+	 * so no endian fixups are needed here.
+	 */
+	. = 0
+.global _start
+_start:
+	b	boot_entry
+
+.global boot_entry
+boot_entry:
+	/* setup stack */
+	LOAD_IMM64(%r1, STACK_TOP - 0x100)
+	LOAD_IMM64(%r12, main)
+	mtctr	%r12
+	bctrl
+	attn // terminate on exit
+	b .
+
+	/* call_with_msr(arg, fn, msr): enter fn (r4) via rfid with SRR1 = msr (r5); arg stays in r3 */
+	.global	call_with_msr
+call_with_msr:
+	mtsrr0	%r4
+	mr	%r12,%r4
+	mtsrr1	%r5
+	rfid
+
+#define EXCEPTION(nr)		\
+	.= nr			;\
+	li	%r3,nr		;\
+	blr
+
+	EXCEPTION(0x300)
+	EXCEPTION(0x380)
+	EXCEPTION(0x400)
+	EXCEPTION(0x480)
+	EXCEPTION(0x500)
+	EXCEPTION(0x600)
+	EXCEPTION(0x700)
+	EXCEPTION(0x800)
+	EXCEPTION(0x900)
+	EXCEPTION(0x980)
+	EXCEPTION(0xa00)
+	EXCEPTION(0xb00)
+
+	/*
+	 * System call - used to exit from tests where MSR[PR]
+	 * may have been set.
+	 */
+	. = 0xc00
+	blr
+
+	EXCEPTION(0xd00)
+	EXCEPTION(0xe00)
+	EXCEPTION(0xe20)
+	EXCEPTION(0xe40)
+	EXCEPTION(0xe60)
+	EXCEPTION(0xe80)
+	EXCEPTION(0xf00)
+	EXCEPTION(0xf20)
+	EXCEPTION(0xf40)
+	EXCEPTION(0xf60)
+	EXCEPTION(0xf80)
diff --git a/tests/privileged/powerpc.lds b/tests/privileged/powerpc.lds
new file mode 100644
index 0000000..8c8c65b
--- /dev/null
+++ b/tests/privileged/powerpc.lds
@@ -0,0 +1,13 @@
+SECTIONS
+{
+	_start = .;
+	. = 0;
+	.head : {
+		KEEP(*(.head))
+ 	}
+	. = 0x2000;
+	.text : { *(.text) }
+	. = 0x4000;
+	.data : { *(.data) }
+	.bss : { *(.bss) }
+}
diff --git a/tests/privileged/privileged.c b/tests/privileged/privileged.c
new file mode 100644
index 0000000..073dc07
--- /dev/null
+++ b/tests/privileged/privileged.c
@@ -0,0 +1,152 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "console.h"
+
+#define MSR_EE	0x8000
+#define MSR_PR	0x4000
+#define MSR_IR	0x0020
+#define MSR_DR	0x0010
+
+extern int call_with_msr(unsigned long arg, int (*fn)(unsigned long), unsigned long msr);
+
+#define SRR0	26
+#define SRR1	27
+
+static inline unsigned long mfspr(int sprnum)
+{
+	long val;
+
+	__asm__ volatile("mfspr %0,%1" : "=r" (val) : "i" (sprnum));
+	return val;
+}
+
+static inline void mtspr(int sprnum, unsigned long val)
+{
+	__asm__ volatile("mtspr %0,%1" : : "i" (sprnum), "r" (val));
+}
+
+void print_string(const char *str)
+{
+	for (; *str; ++str)
+		putchar(*str);
+}
+
+void print_hex(unsigned long val, int ndigits)
+{
+	int i, x;
+
+	for (i = (ndigits - 1) * 4; i >= 0; i -= 4) {
+		x = (val >> i) & 0xf;
+		if (x >= 10)
+			putchar(x + 'a' - 10);
+		else
+			putchar(x + '0');
+	}
+}
+
+// Prints "test NN:" using exactly two decimal digits, so i must satisfy 0 <= i < 100
+void print_test_number(int i)
+{
+	print_string("test ");
+	putchar(48 + i/10);
+	putchar(48 + i%10);
+	putchar(':');
+}
+
+int priv_fn_1(unsigned long x)
+{
+	__asm__ volatile("attn");
+	__asm__ volatile("li 3,0; sc");
+	return 0;
+}
+
+int priv_fn_2(unsigned long x)
+{
+	__asm__ volatile("mfmsr 3");
+	__asm__ volatile("sc");
+	return 0;
+}
+
+int priv_fn_3(unsigned long x)
+{
+	__asm__ volatile("mtmsrd 3");
+	__asm__ volatile("li 3,0; sc");
+	return 0;
+}
+
+int priv_fn_4(unsigned long x)
+{
+	__asm__ volatile("rfid");
+	__asm__ volatile("li 3,0; sc");
+	return 0;
+}
+
+int priv_fn_5(unsigned long x)
+{
+	__asm__ volatile("mfsrr0 3");
+	__asm__ volatile("sc");
+	return 0;
+}
+
+int priv_fn_6(unsigned long x)
+{
+	__asm__ volatile("mtsrr0 3");
+	__asm__ volatile("sc");
+	return 0;
+}
+
+int priv_test(int (*fn)(unsigned long))
+{
+	unsigned long msr;
+	int vec;
+
+	__asm__ volatile ("mtdec %0" : : "r" (0x7fffffff));
+	__asm__ volatile ("mfmsr %0" : "=r" (msr));
+	/* this should fail */
+	vec = call_with_msr(0, fn, msr | MSR_PR);
+	if (vec != 0x700)
+		return vec | 1;
+	/* SRR1 should be set correctly */
+	msr |= MSR_PR | MSR_EE | MSR_IR | MSR_DR;
+	if (mfspr(SRR1) != (msr | 0x40000))
+		return 2;
+	return 0;
+}
+
+int fail = 0;
+
+void do_test(int num, int (*fn)(unsigned long))
+{
+	int ret;
+
+	print_test_number(num);
+	ret = priv_test(fn);
+	if (ret == 0) {
+		print_string("PASS\r\n");
+	} else {
+		fail = 1;
+		print_string("FAIL ");
+		print_hex(ret, 4);
+		print_string(" SRR0=");
+		print_hex(mfspr(SRR0), 16);
+		print_string(" SRR1=");
+		print_hex(mfspr(SRR1), 16);
+		print_string("\r\n");
+	}
+}
+
+int main(void)
+{
+	potato_uart_init();
+
+	do_test(1, priv_fn_1);
+	do_test(2, priv_fn_2);
+	do_test(3, priv_fn_3);
+	do_test(4, priv_fn_4);
+	do_test(5, priv_fn_5);
+	do_test(6, priv_fn_6);
+
+	return fail;
+}
diff --git a/tests/test_privileged.bin b/tests/test_privileged.bin
new file mode 100755
index 0000000000000000000000000000000000000000..5b8ce63ab5e843bf8ff40c84d36833ad52abd0ca
GIT binary patch
literal 9900
zcmeHNUuauZ9RA%TP13B38T6sr>24aqWfY7c<7Ua~z3ZQ)bZBWeCirr*w7B`CxQDdf
z^j^bWlst3~z6h>9U3?JnVGjfU5D^A~FM<k!Y%Q(Obq*`|klMQAcTSQv%}nz&E|PoU
z=A7T3^ZU;CopTQ%xjv!{k(?%q>bC>YJlYvDqdJjwny8e$9VpGAolVC|S=;+wA?mYU
zCJ*r|V|za|^0AU(d%rNAB|0a(?uV<u|49L_uE>Nv?a*a8j88s?`8$kHUC#1%Xxr_<
z`a7I|pu_nGJDh*Fv-~ZG@hRji|Gk6Aq28g+=I`WpABR?aKkVVoUVjQ7%wKL3SKGw(
zHgT&>eE(uS2J4A7{M*FUHgUa8+-ehl42<29t^%$Ct^%$Ct^%$Ct^%$Ct^%$Ct^%$C
zt^%$C9j3r0`{sUxr1c70xo>-ehWI^D{J*a&Xhw)0{e8qH-Dnk?k#vKPX%(-&C(t@>
zd*)l#w!N)m5~*n_pHXa2&;XZh##U3{Rr!HMD-xL{qS#;$71abql?Y=KSucO@f%G->
zYfCO!5>aCvXtQ0dwU)~FFBqpH>|(v<%>Xlrj-jR{YTkOtCQj~rXAT8UT#y6txo!##
zCg_UNL+`2=DV`ki$1XhZR55oLOO+dOA=ZE4a!kf9r9-rw3eM(~I`-2T`}6igGeM>j
zB6e)ZpF4lOR(o>OG!m3kxUW$2ox*y&RqJ#9J(%k*=O%1s%BMN!eE$rv<Jmf<?LAqr
zX&Ib#h1B12U3WwcodVv$UMgx5<@B3@9I&Ix=l`$?+mkBzh`rA96J?g8%uB`^D{3Ct
zzczAj(p+H2mW(>?pO|ZYd)1u9n7Muv`?IKWO(M`ktYyRA^&wXxImy?R=ky_mtbywf
zFdrXT3a=AK4v)!*{Uqi&4=hVLQu7~&|105tNBFnU=lKcv7Y~>>pYICx+6Ur&8jfE~
zOW@k8UsK{3$8FC2Mc7Z)Y%lCd*xfbT2m4LfTVgK7bQkPJ*!OC-ANHrPe}OGSXU@}O
zU0MhB`tGw^U0BINTi;VumhmnR=_JRqeMCu_;&VZ3VCyMTlN)Ep3(fxl?2f}2<lT#4
z%*NPwVI^>OuEL6H5cAl0=8D1h*ui7Z)W)7Dtp4$w7)w~6jaS~No-@_CiqASGe}4C#
zyK4$#-==y8R?Yf;P%B@fe*^n%q4W`S+r-WHfwH}SPjj5lGQ$O;_n+?(`SB#r%}>N0
z;map*b@uJ6GTKRPdirpPXiOVF$8C9fYAONsc>4lc9^)4@5tlSSMZUZy_4i1^2Ysph
zH)C=&EPR9Le-Gc$2A_)lk4<^!;ahI<U54-Xrm;9+>8}PK&Qw~&y#pUV)M%tzqa|rL
zoH(K_jD*6Z-SumSh>jz43-?i+j(MIpio6@1C26!F^NvU*UULoiW#n?a-P|#eJIRY8
zm1KlKoxVh#g+7go_>wfShs$dvkkf~QkK^x~Uj})@e(f$M$n|*@d7mKfW1cq}mcih)
JFpU6-=wFGzEoA@z

literal 0
HcmV?d00001

diff --git a/tests/test_privileged.console_out b/tests/test_privileged.console_out
new file mode 100644
index 0000000..a49bb9b
--- /dev/null
+++ b/tests/test_privileged.console_out
@@ -0,0 +1,6 @@
+test 01:PASS
+test 02:PASS
+test 03:PASS
+test 04:PASS
+test 05:PASS
+test 06:PASS
diff --git a/tests/update_console_tests b/tests/update_console_tests
index c17c12b..bd012d9 100755
--- a/tests/update_console_tests
+++ b/tests/update_console_tests
@@ -3,7 +3,7 @@
 # Script to update console related tests from source
 #
 
-for i in sc illegal decrementer ; do
+for i in sc illegal decrementer privileged ; do
     cd $i
     make
     cd -

From a05ee9fc7f3da2a61a358a172a2c1c44cc03a1c5 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 29 Apr 2020 11:11:22 +1000
Subject: [PATCH 06/10] Makefile: fix typo

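Fix a typo in the prerequisites of the 'check' target: it referenced
$(test_console) rather than $(tests_console), which meant that the
console tests weren't getting executed by 'make check'.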

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index c09696a..a13fdcc 100644
--- a/Makefile
+++ b/Makefile
@@ -130,7 +130,7 @@ dmi_dtm_tb: dmi_dtm_tb.o sim_vhpi_c.o sim_bram_helpers_c.o
 tests = $(sort $(patsubst tests/%.out,%,$(wildcard tests/*.out)))
 tests_console = $(sort $(patsubst tests/%.console_out,%,$(wildcard tests/*.console_out)))
 
-check: $(tests) $(test_console) test_micropython test_micropython_long
+check: $(tests) $(tests_console) test_micropython test_micropython_long
 
 check_light: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 test_micropython test_micropython_long $(tests_console)
 

From cf4dfeca3645fb3f43785536deaa2cc8643b0e48 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 29 Apr 2020 11:37:02 +1000
Subject: [PATCH 07/10] Change the default cross compiler prefix to
 powerpc64le-linux-gnu-

That is what is used by the packaged cross-compilers on (at least)
Fedora and Ubuntu.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 README.md              | 5 ++++-
 hello_world/Makefile   | 2 +-
 rust_lib_demo/Makefile | 2 +-
 tests/Makefile.test    | 2 +-
 4 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 8bf4622..98f2140 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,10 @@ You can try out Microwatt/Micropython without hardware by using the ghdl simulat
 
 - Build micropython. If you aren't building on a ppc64le box you
   will need a cross compiler. If it isn't available on your distro
-  grab the powerpc64le-power8 toolchain from https://toolchains.bootlin.com
+  grab the powerpc64le-power8 toolchain from https://toolchains.bootlin.com.
+  You may need to set the CROSS_COMPILE environment variable
+  to the prefix used for your cross compilers.  The default is
+  powerpc64le-linux-gnu-.
 
 ```
 git clone https://github.com/micropython/micropython.git
diff --git a/hello_world/Makefile b/hello_world/Makefile
index 674095e..a609199 100644
--- a/hello_world/Makefile
+++ b/hello_world/Makefile
@@ -1,7 +1,7 @@
 ARCH = $(shell uname -m)
 ifneq ("$(ARCH)", "ppc64")
 ifneq ("$(ARCH)", "ppc64le")
-	CROSS_COMPILE ?= powerpc64le-linux-
+	CROSS_COMPILE ?= powerpc64le-linux-gnu-
 endif
 endif
 
diff --git a/rust_lib_demo/Makefile b/rust_lib_demo/Makefile
index 26aebf8..fdbb18b 100644
--- a/rust_lib_demo/Makefile
+++ b/rust_lib_demo/Makefile
@@ -1,7 +1,7 @@
 ARCH = $(shell uname -m)
 ifneq ("$(ARCH)", "ppc64")
 ifneq ("$(ARCH)", "ppc64le")
-	CROSS_COMPILE ?= powerpc64le-linux-
+	CROSS_COMPILE ?= powerpc64le-linux-gnu-
 endif
 endif
 
diff --git a/tests/Makefile.test b/tests/Makefile.test
index 9676370..250135d 100644
--- a/tests/Makefile.test
+++ b/tests/Makefile.test
@@ -1,7 +1,7 @@
 ARCH = $(shell uname -m)
 ifneq ("$(ARCH)", "ppc64")
 ifneq ("$(ARCH)", "ppc64le")
-        CROSS_COMPILE ?= powerpc64le-linux-
+        CROSS_COMPILE ?= powerpc64le-linux-gnu-
         endif
         endif
 

From 4db1676ef8b37fe7f36abe14b3255e4b92fbc5bd Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 4 May 2020 08:31:18 +1000
Subject: [PATCH 08/10] dcache: Don't assert on dcbz cache hit

We can hit the assert for req_op = OP_STORE_HIT and reloading in the
case of dcbz, since it looks like a store.  Therefore we need to
exclude that case from the assert.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 dcache.vhdl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dcache.vhdl b/dcache.vhdl
index 550298b..7d61a85 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -597,7 +597,8 @@ begin
 	    if reloading and wishbone_in.ack = '1' and r1.store_way = i then
 		do_write <= '1';
 	    end if;
-	    if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' then
+	    if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' and
+                r1.req.dcbz = '0' then
 		assert not reloading report "Store hit while in state:" &
 		    state_t'image(r1.state)
 		    severity FAILURE;

From fe789190e40fe160d129f0504d1f69fec54cf4d9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 1 May 2020 09:00:21 +1000
Subject: [PATCH 09/10] wishbone_debug_master: Fix address auto-increment for
 memory writes

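The address auto-increment was in an elsif branch following the
"dmi_req and dmi_wr" test used for register writes.  A memory write
is itself a DMI write request, so that test was true for the whole
wishbone cycle and the auto-increment branch was never reached.

Fix this by having the state machine set a do_inc flag from
reg_ctrl(8) when it finishes the wishbone cycle and moves to DMI_WAIT,
and by giving the auto-increment priority over ordinary register
writes.  do_inc is cleared again in DMI_WAIT, so the address is
incremented only once per access.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>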
---
 wishbone_debug_master.vhdl | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/wishbone_debug_master.vhdl b/wishbone_debug_master.vhdl
index 11b9ee3..ddf6923 100644
--- a/wishbone_debug_master.vhdl
+++ b/wishbone_debug_master.vhdl
@@ -49,6 +49,7 @@ architecture behaviour of wishbone_debug_master is
     
     type state_t is (IDLE, WB_CYCLE, DMI_WAIT);
     signal state : state_t;
+    signal do_inc : std_ulogic;
 
 begin
 
@@ -84,16 +85,16 @@ begin
 		reg_addr <= (others => '0');
 		reg_ctrl <= (others => '0');
 	    else 	    -- Standard register writes
-		if dmi_req and dmi_wr then
+                if do_inc = '1' then
+		    -- Address register auto-increment
+		    reg_addr <= std_ulogic_vector(unsigned(reg_addr) +
+						  decode_autoinc(reg_ctrl(10 downto 9)));
+                elsif dmi_req and dmi_wr then
 		    if dmi_addr = DBG_WB_ADDR then
 			reg_addr <= dmi_din;
 		    elsif dmi_addr = DBG_WB_CTRL then
 			reg_ctrl <= dmi_din(10 downto 0);
 		    end if;
-                elsif state = WB_CYCLE and (wb_in.ack and reg_ctrl(8))= '1'  then
-		    -- Address register auto-increment
-		    reg_addr <= std_ulogic_vector(unsigned(reg_addr) +
-						  decode_autoinc(reg_ctrl(10 downto 9)));
 		end if;
 	    end if;
 	end if;
@@ -145,6 +146,7 @@ begin
 	    if (rst) then
 		state <= IDLE;
 		wb_out.stb <= '0';
+                do_inc <= '0';
 	    else
 		case state is
 		when IDLE =>
@@ -162,11 +164,13 @@ begin
 			--
 			wb_out.stb <= '0';
 			state <= DMI_WAIT;
+                        do_inc <= reg_ctrl(8);
 		    end if;
 		when DMI_WAIT =>
 		    if dmi_req = '0' then
 			state <= IDLE;
 		    end if;
+                    do_inc <= '0';
 		end case;
 	    end if;
 	end if;

From 102fbcfe9a3d8e054fdb0ad050512944051e4844 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 4 May 2020 15:17:04 +1000
Subject: [PATCH 10/10] execute1: Fix interrupt delivery during slow
 instructions

During slow instructions such as multiply or divide, if a decrementer
(or other asynchronous) interrupt becomes pending, it disrupts the
logic that keeps stall asserted until the end of the slow
instruction, and the interrupt logic starts trying to deliver the
interrupt before the slow instruction has finished.

To fix that, make the interrupt logic wait until it sees e_in.valid
set before setting exception to 1.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 execute1.vhdl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/execute1.vhdl b/execute1.vhdl
index 1846488..8286d30 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -454,12 +454,12 @@ begin
 	    v.e.valid := e_in.valid;
 	    report "Writing SRR1: " & to_hstring(ctrl.srr1);
 
-	elsif irq_valid = '1' then
+	elsif irq_valid = '1' and e_in.valid = '1' then
 	    -- we need two cycles to write srr0 and 1
 	    -- will need more when we have to write DSISR, DAR and HIER
             -- Don't deliver the interrupt until we have a valid instruction
             -- coming in, so we have a valid NIA to put in SRR0.
-	    exception := e_in.valid;
+	    exception := '1';
 	    ctrl_tmp.srr1 <= msr_copy(ctrl.msr);
 
         elsif e_in.valid = '1' and ctrl.msr(MSR_PR) = '1' and