From 4c61a71a62e7c217699ff0597ee10cf3bfd18c1d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Nov 2020 18:11:04 +1100 Subject: [PATCH 01/13] core: Crack update-form loads into two internal ops This uses the instruction-doubling machinery to send load with update instructions down to loadstore1 as two separate ops, rather than one op with two destinations. This will help to simplify the value tracking mechanisms. Signed-off-by: Paul Mackerras --- common.vhdl | 3 +-- decode1.vhdl | 54 +++++++++++++++++++++++------------------------ decode2.vhdl | 15 +++++++------ decode_types.vhdl | 3 ++- execute1.vhdl | 1 - loadstore1.vhdl | 52 ++++++++++++++++++++++----------------------- 6 files changed, 63 insertions(+), 65 deletions(-) diff --git a/common.vhdl b/common.vhdl index 7bf8277..893127f 100644 --- a/common.vhdl +++ b/common.vhdl @@ -333,7 +333,6 @@ package common is byte_reverse : std_ulogic; sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? - update_reg : gpr_index_t; -- if so, the register to update xerc : xer_common_t; reserve : std_ulogic; -- set for larx/stcx. rc : std_ulogic; -- set for stcx. 
@@ -351,7 +350,7 @@ package common is addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), write_reg => (others => '0'), length => (others => '0'), mode_32bit => '0', is_32bit => '0', - repeat => '0', second => '0', others => (others => '0')); + repeat => '0', second => '0'); type Loadstore1ToExecute1Type is record busy : std_ulogic; diff --git a/decode1.vhdl b/decode1.vhdl index ebe59be..0f3410d 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -84,18 +84,18 @@ architecture behaviour of decode1 is 11 => (ALU, NONE, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmpi 10 => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpli 34 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbz - 35 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lbzu + 35 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lbzu 50 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfd - 51 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lfdu + 51 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lfdu 48 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- lfs - 49 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, 
'0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- lfsu + 49 => (LDST, FPU, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', DUPD), -- lfsu 42 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lha - 43 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lhau + 43 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhau 40 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhz - 41 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lhzu + 41 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzu 56 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DQ, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRTE), -- lq 32 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwz - 33 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lwzu + 33 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwzu 7 => (ALU, NONE, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- mulli 24 
=> (ALU, NONE, OP_OR, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ori 25 => (ALU, NONE, OP_OR, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- oris @@ -104,15 +104,15 @@ architecture behaviour of decode1 is 23 => (ALU, NONE, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- rlwnm 17 => (ALU, NONE, OP_SC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sc 38 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stb - 39 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stbu + 39 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stbu 54 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stfd - 55 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stfdu + 55 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stfdu 52 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- stfs - 53 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- stfsu + 53 => (LDST, FPU, OP_STORE, RA_OR_ZERO, CONST_SI, FRS, RA, '0', '0', '0', '0', 
ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- stfsu 44 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sth - 45 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthu + 45 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthu 36 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stw - 37 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwu + 37 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_SI, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwu 8 => (ALU, NONE, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- subfic 2 => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- tdi 3 => (ALU, NONE, OP_TRAP, RA, CONST_SI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1', NONE), -- twi @@ -290,33 +290,33 @@ architecture behaviour of decode1 is 2#1111101111# => (ALU, NONE, OP_ISEL, RA_OR_ZERO, RB, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel 2#0000110100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lbarx 2#1101010101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', 
NONE, '0', '0', NONE), -- lbzcix - 2#0001110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lbzux + 2#0001110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lbzux 2#0001010111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbzx 2#0001010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- ldarx 2#1000010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ldbrx 2#1101110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ldcix - 2#0000110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- ldux + 2#0000110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- ldux 2#0000010101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ldx 2#1001010111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfdx - 2#1001110111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lfdux + 2#1001110111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), 
-- lfdux 2#1101010111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfiwax 2#1101110111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfiwzx 2#1000010111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- lfsx - 2#1000110111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- lfsux + 2#1000110111# => (LDST, FPU, OP_LOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', DUPD), -- lfsux 2#0001110100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lharx - 2#0101110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lhaux + 2#0101110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhaux 2#0101010111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhax 2#1100010110# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhbrx 2#1100110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzcix - 2#0100110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lhzux + 
2#0100110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzux 2#0100010111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzx 2#0100010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', DRTE), -- lqarx 2#0000010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lwarx - 2#0101110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lwaux + 2#0101110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwaux 2#0101010101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwax 2#1000010110# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwbrx 2#1100010101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwzcix - 2#0000110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- lwzux + 2#0000110111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwzux 2#0000010111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwzx 2#1001000000# => (ALU, 
NONE, OP_MCRXRX, NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mcrxrx 2#0000010011# => (ALU, NONE, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfcr/mfocrf @@ -376,28 +376,28 @@ architecture behaviour of decode1 is 2#1000011000# => (ALU, NONE, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- srw 2#1111010101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stbcix 2#1010110110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- stbcx - 2#0011110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stbux + 2#0011110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stbux 2#0011010111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stbx 2#1010010100# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stdbrx 2#1111110101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stdcix 2#0011010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- stdcx - 2#0010110101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- 
stdux + 2#0010110101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdux 2#0010010101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stdx 2#1011010111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stfdx - 2#1011110111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stfdux + 2#1011110111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stfdux 2#1111010111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stfiwx 2#1010010111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- stfsx - 2#1010110111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- stfsux + 2#1010110111# => (LDST, FPU, OP_STORE, RA_OR_ZERO, RB, FRS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- stfsux 2#1110010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthbrx 2#1110110101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthcix 2#1011010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- sthcx - 
2#0110110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthux + 2#0110110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthux 2#0110010111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthx 2#0010110110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', DRSE), -- stqcx 2#1010010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwbrx 2#1110010101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwcix 2#0010010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- stwcx - 2#0010110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwux + 2#0010110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwux 2#0010010111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwx 2#0000101000# => (ALU, NONE, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subf 2#1000101000# => (ALU, NONE, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subfo @@ -424,7 +424,7 @@ architecture behaviour of 
decode1 is -- unit fac internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl rpt -- op in out A out in out len ext pipe 0 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DS, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ld - 1 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DS, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- ldu + 1 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DS, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- ldu 2 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DS, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwa others => decode_rom_init ); @@ -451,7 +451,7 @@ architecture behaviour of decode1 is -- unit fac internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl rpt -- op in out A out in out len ext pipe 0 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- std - 1 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdu + 1 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdu 2 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRSE), -- stq others => decode_rom_init ); diff --git a/decode2.vhdl b/decode2.vhdl index e00a05d..0336057 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -407,6 +407,11 @@ begin if r.repeat = d_in.big_endian then decoded_reg_o.reg(0) := '1'; end if; + when DUPD => + -- update-form loads, 2nd instruction writes RA + if r.repeat = '1' then + decoded_reg_o.reg := 
decoded_reg_a.reg; + end if; when others => end case; end if; @@ -492,13 +497,9 @@ begin if EX1_BYPASS and d_in.decode.unit = ALU then gpr_bypassable <= '1'; end if; - update_gpr_write_valid <= d_in.decode.update; - update_gpr_write_reg <= decoded_reg_a.reg; - if v.e.lr = '1' then - -- there are no instructions that have both update=1 and lr=1 - update_gpr_write_valid <= '1'; - update_gpr_write_reg <= fast_spr_num(SPR_LR); - end if; + + update_gpr_write_valid <= v.e.lr; + update_gpr_write_reg <= fast_spr_num(SPR_LR); gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; diff --git a/decode_types.vhdl b/decode_types.vhdl index 02790a6..885cc91 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -54,7 +54,8 @@ package decode_types is type repeat_t is (NONE, -- instruction is not repeated DRSE, -- double RS, endian twist - DRTE); -- double RT, endian twist + DRTE, -- double RT, endian twist + DUPD); -- update-form load type decode_rom_t is record unit : unit_t; diff --git a/execute1.vhdl b/execute1.vhdl index 25b1dc7..559f34f 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1344,7 +1344,6 @@ begin lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE); lv.sign_extend := e_in.sign_extend; lv.update := e_in.update; - lv.update_reg := gspr_to_gpr(e_in.read_reg1); lv.xerc := v.e.xerc; lv.reserve := e_in.reserve; lv.rc := e_in.rc; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index b83eed6..fc5bcf9 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -72,7 +72,6 @@ architecture behave of loadstore1 is brev_mask : unsigned(2 downto 0); sign_extend : std_ulogic; update : std_ulogic; - update_reg : gpr_index_t; xerc : xer_common_t; reserve : std_ulogic; atomic : std_ulogic; @@ -415,7 +414,7 @@ begin when ACK_WAIT => -- r.wr_sel gets set one cycle after we come into ACK_WAIT state, -- which is OK because the dcache always takes at least two cycles. 
- if r.update = '1' and (r.load = '0' or (HAS_FPU and r.load_sp = '1')) then + if r.update = '1' and r.load = '0' then v.wr_sel := "01"; end if; if d_in.error = '1' then @@ -446,16 +445,9 @@ begin write_enable := r.load and not r.load_sp; if HAS_FPU and r.load_sp = '1' then -- SP to DP conversion takes a cycle - -- Write back rA update in this cycle if needed - do_update := r.update; v.wr_sel := "10"; v.state := FINISH_LFS; - elsif r.extra_cycle = '1' then - -- loads with rA update need an extra cycle - v.wr_sel := "01"; - v.state := COMPLETE; - v.do_update := r.update; - else + elsif r.load = '0' then -- stores write back rA update in this cycle do_update := r.update; end if; @@ -516,7 +508,6 @@ begin v.byte_reverse := l_in.byte_reverse; v.sign_extend := l_in.sign_extend; v.update := l_in.update; - v.update_reg := l_in.update_reg; v.xerc := l_in.xerc; v.reserve := l_in.reserve; v.rc := l_in.rc; @@ -526,7 +517,6 @@ begin v.load_sp := '0'; v.wait_dcache := '0'; v.wait_mmu := '0'; - v.do_update := '0'; v.extra_cycle := '0'; if HAS_FPU and l_in.is_32bit = '1' then @@ -537,13 +527,21 @@ begin addr := lsu_sum; if l_in.second = '1' then - -- for the second half of a 16-byte transfer, use next_addr - addr := next_addr; + -- second half of load with update does the update + if l_in.op = OP_LOAD and l_in.update = '1' then + v.do_update := '1'; + else + -- for the second half of a 16-byte transfer, use next_addr + addr := next_addr; + end if; end if; if l_in.mode_32bit = '1' then addr(63 downto 32) := (others => '0'); end if; - v.addr := addr; + if v.do_update = '0' then + -- preserve previous r.addr for load with update + v.addr := addr; + end if; maddr := l_in.addr2; -- address from RB for tlbie -- XXX Temporary hack. 
Mark the op as non-cachable if the address @@ -566,7 +564,7 @@ begin -- check alignment for larx/stcx misaligned := or (std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1) and addr(2 downto 0)); v.align_intr := l_in.reserve and misaligned; - if l_in.repeat = '1' and l_in.second = '0' and addr(3) = '1' then + if l_in.repeat = '1' and l_in.second = '0' and l_in.update = '0' and addr(3) = '1' then -- length is really 16 not 8 -- Make misaligned lq cause an alignment interrupt in LE mode, -- in order to avoid the case with RA = RT + 1 where the second half @@ -585,14 +583,17 @@ begin when OP_STORE => req := '1'; when OP_LOAD => - req := '1'; v.load := '1'; - -- Allow an extra cycle for RA update on loads - v.extra_cycle := l_in.update; - if HAS_FPU and l_in.is_32bit = '1' then - -- Allow an extra cycle for SP->DP precision conversion - v.load_sp := '1'; - v.extra_cycle := '1'; + if l_in.second = '1' and l_in.update = '1' then + v.wr_sel := "01"; + v.state := COMPLETE; + else + req := '1'; + if HAS_FPU and l_in.is_32bit = '1' then + -- Allow an extra cycle for SP->DP precision conversion + v.load_sp := '1'; + v.extra_cycle := '1'; + end if; end if; when OP_DCBZ => v.align_intr := v.nc; @@ -724,22 +725,19 @@ begin -- Multiplex either cache data to the destination GPR or -- the address for the rA update. 
l_out.valid <= done; + l_out.write_reg <= r.write_reg; case r.wr_sel is when "00" => l_out.write_enable <= '1'; - l_out.write_reg <= r.write_reg; l_out.write_data <= r.sprval; when "01" => l_out.write_enable <= do_update; - l_out.write_reg <= gpr_to_gspr(r.update_reg); l_out.write_data <= r.addr; when "10" => l_out.write_enable <= '1'; - l_out.write_reg <= r.write_reg; l_out.write_data <= load_dp_data; when others => l_out.write_enable <= write_enable; - l_out.write_reg <= r.write_reg; l_out.write_data <= data_trimmed; end case; l_out.xerc <= r.xerc; From a1d7b54f76859e7270c48a56536b901f3c05c641 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Nov 2020 22:10:38 +1100 Subject: [PATCH 02/13] core: Crack branches that update both CTR and LR This uses the instruction doubling machinery to convert conditional branch instructions that update both CTR and LR (e.g., bdnzl, bdnzlrl) into two instructions, of which the first updates CTR and determines whether the branch is taken, and the second updates LR and does the redirect if necessary. 
Signed-off-by: Paul Mackerras --- common.vhdl | 12 ++++--- decode1.vhdl | 17 ++++++++-- decode2.vhdl | 55 +++++++++++++++++-------------- execute1.vhdl | 91 ++++++++++++++++++--------------------------------- 4 files changed, 84 insertions(+), 91 deletions(-) diff --git a/common.vhdl b/common.vhdl index 893127f..686e414 100644 --- a/common.vhdl +++ b/common.vhdl @@ -176,14 +176,15 @@ package common is insn: std_ulogic_vector(31 downto 0); ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) + ispro: gspr_index_t; -- (G)SPR written with LR or CTR decode: decode_rom_t; br_pred: std_ulogic; -- Branch was predicted to be taken big_endian: std_ulogic; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), - ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init, - br_pred => '0', big_endian => '0'); + ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'), + decode => decode_rom_init, br_pred => '0', big_endian => '0'); type Decode1ToFetch1Type is record redirect : std_ulogic; @@ -210,6 +211,7 @@ package common is bypass_cr : std_ulogic; xerc: xer_common_t; lr: std_ulogic; + br_abs: std_ulogic; rc: std_ulogic; oe: std_ulogic; invert_a: std_ulogic; @@ -236,7 +238,7 @@ package common is constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', - bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', + bypass_cr => '0', lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => 
'0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), @@ -552,9 +554,9 @@ package body common is begin case spr is when SPR_LR => - n := 0; + n := 0; -- N.B. decode2 relies on this specific value when SPR_CTR => - n:= 1; + n := 1; -- N.B. decode2 relies on this specific value when SPR_SRR0 => n := 2; when SPR_SRR1 => diff --git a/decode1.vhdl b/decode1.vhdl index 0f3410d..f62594b 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -79,7 +79,7 @@ architecture behaviour of decode1 is 28 => (ALU, NONE, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andi. 29 => (ALU, NONE, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andis. 0 => (ALU, NONE, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- attn - 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b + 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b 16 => (ALU, NONE, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc 11 => (ALU, NONE, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmpi 10 => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpli @@ -597,9 +597,10 @@ begin -- major opcode 31, lots of things v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); - -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path + -- Work out ispr1/ispro independent of v.decode since they seem 
to be critical path sprn := decode_spr_num(f_in.insn); v.ispr1 := fast_spr_num(sprn); + v.ispro := fast_spr_num(sprn); if std_match(f_in.insn(10 downto 1), "01-1010011") then -- mfspr or mtspr @@ -627,6 +628,9 @@ begin -- CTR may be needed as input to bc if f_in.insn(23) = '0' then v.ispr1 := fast_spr_num(SPR_CTR); + v.ispro := fast_spr_num(SPR_CTR); + elsif f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); end if; -- Predict backward branches as taken, forward as untaken v.br_pred := f_in.insn(15); @@ -636,6 +640,9 @@ begin -- Unconditional branches are always taken v.br_pred := '1'; br_offset := signed(f_in.insn(25 downto 2)); + if f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); + end if; when 19 => vi.override := not decode_op_19_valid(to_integer(unsigned(f_in.insn(5 downto 1) & f_in.insn(10 downto 6)))); @@ -648,8 +655,12 @@ begin -- Branch uses CTR as condition when BO(2) is 0. This is -- also used to indicate that CTR is modified (they go -- together). - if f_in.insn(23) = '0' then + -- bcctr doesn't update CTR or use it in the branch condition + if f_in.insn(23) = '0' and (f_in.insn(10) = '0' or f_in.insn(6) = '1') then v.ispr1 := fast_spr_num(SPR_CTR); + v.ispro := fast_spr_num(SPR_CTR); + elsif f_in.insn(0) = '1' then + v.ispro := fast_spr_num(SPR_LR); end if; if f_in.insn(10) = '0' then v.ispr2 := fast_spr_num(SPR_LR); diff --git a/decode2.vhdl b/decode2.vhdl index 0336057..274a241 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -249,6 +249,9 @@ architecture behaviour of decode2 is OP_MOD => "011", OP_CNTZ => "100", -- countzero_result OP_MFSPR => "101", -- spr_result + OP_B => "110", -- next_nia + OP_BC => "110", + OP_BCREG => "110", OP_ADDG6S => "111", -- misc_result OP_ISEL => "111", OP_DARN => "111", @@ -284,9 +287,6 @@ architecture behaviour of decode2 is signal gpr_write : gspr_index_t; signal gpr_bypassable : std_ulogic; - signal update_gpr_write_valid : std_ulogic; - signal update_gpr_write_reg : gspr_index_t; - signal 
gpr_a_read_valid : std_ulogic; signal gpr_a_read :gspr_index_t; signal gpr_a_bypass : std_ulogic; @@ -325,8 +325,8 @@ begin gpr_write_in => gpr_write, gpr_bypassable => gpr_bypassable, - update_gpr_write_valid => update_gpr_write_valid, - update_gpr_write_reg => update_gpr_write_reg, + update_gpr_write_valid => '0', + update_gpr_write_reg => 7x"00", gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -376,6 +376,7 @@ begin variable decoded_reg_c : decode_input_reg_t; variable decoded_reg_o : decode_output_reg_t; variable length : std_ulogic_vector(3 downto 0); + variable op : insn_type_t; begin v := r; @@ -391,7 +392,14 @@ begin d_in.nia); decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); - decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1); + decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro); + + if d_in.decode.lr = '1' then + v.e.lr := insn_lk(d_in.insn); + -- b and bc have even major opcodes; bcreg is considered absolute + v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26); + end if; + op := d_in.decode.insn_type; if d_in.decode.repeat /= NONE then v.e.repeat := '1'; @@ -414,6 +422,12 @@ begin end if; when others => end case; + elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then + -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled + v.e.repeat := '1'; + v.e.second := r.repeat; + -- first one does CTR, second does LR + decoded_reg_o.reg(0) := not r.repeat; end if; r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid; @@ -440,7 +454,6 @@ begin v.e.nia := d_in.nia; v.e.unit := d_in.decode.unit; v.e.fac := d_in.decode.facility; - v.e.insn_type := d_in.decode.insn_type; v.e.read_reg1 := decoded_reg_a.reg; v.e.read_data1 := decoded_reg_a.data; v.e.bypass_data1 := gpr_a_bypass; @@ -460,23 +473,12 @@ 
begin v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.addm1 := '0'; - if d_in.decode.insn_type = OP_BC or d_in.decode.insn_type = OP_BCREG then - -- add -1 to CTR - v.e.addm1 := '1'; - if d_in.insn(23) = '1' or - (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then - -- don't write decremented CTR if BO(2) = 1 or bcctr - v.e.write_reg_enable := '0'; - end if; - end if; + v.e.insn_type := op; v.e.invert_out := d_in.decode.invert_out; v.e.input_carry := d_in.decode.input_carry; v.e.output_carry := d_in.decode.output_carry; v.e.is_32bit := d_in.decode.is_32bit; v.e.is_signed := d_in.decode.is_signed; - if d_in.decode.lr = '1' then - v.e.lr := insn_lk(d_in.insn); - end if; v.e.insn := d_in.insn; v.e.data_len := length; v.e.byte_reverse := d_in.decode.byte_reverse; @@ -484,8 +486,16 @@ begin v.e.update := d_in.decode.update; v.e.reserve := d_in.decode.reserve; v.e.br_pred := d_in.br_pred; - v.e.result_sel := result_select(d_in.decode.insn_type); - v.e.sub_select := subresult_select(d_in.decode.insn_type); + v.e.result_sel := result_select(op); + v.e.sub_select := subresult_select(op); + if op = OP_BC or op = OP_BCREG then + if d_in.insn(23) = '0' and r.repeat = '0' and + not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then + -- decrement CTR if BO(2) = 0 and not bcctr + v.e.addm1 := '1'; + v.e.result_sel := "000"; -- select adder output + end if; + end if; -- issue control control_valid_in <= d_in.valid; @@ -498,9 +508,6 @@ begin gpr_bypassable <= '1'; end if; - update_gpr_write_valid <= v.e.lr; - update_gpr_write_reg <= fast_spr_num(SPR_LR); - gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; diff --git a/execute1.vhdl b/execute1.vhdl index 559f34f..2690424 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -59,8 +59,8 @@ architecture behaviour of execute1 is fp_exception_next : std_ulogic; trace_next : std_ulogic; prev_op : insn_type_t; - lr_update : std_ulogic; next_lr : 
std_ulogic_vector(63 downto 0); + br_taken : std_ulogic; mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; @@ -79,8 +79,8 @@ architecture behaviour of execute1 is constant reg_type_init : reg_type := (e => Execute1ToWritebackInit, cur_instr => Decode2ToExecute1Init, - busy => '0', lr_update => '0', terminate => '0', - fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, + busy => '0', terminate => '0', + fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', next_lr => (others => '0'), last_nia => (others => '0'), redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0, @@ -108,6 +108,7 @@ architecture behaviour of execute1 is signal spr_result: std_ulogic_vector(63 downto 0); signal result_mux_sel: std_ulogic_vector(2 downto 0); signal sub_mux_sel: std_ulogic_vector(2 downto 0); + signal next_nia : std_ulogic_vector(63 downto 0); signal current: Decode2ToExecute1Type; -- multiply signals @@ -301,6 +302,7 @@ begin muldiv_result when "011", countzero_result when "100", spr_result when "101", + next_nia when "110", misc_result when others; execute1_0: process(clk) @@ -315,11 +317,9 @@ begin else r <= rin; ctrl <= ctrl_tmp; - assert not (r.lr_update = '1' and valid_in = '1') - report "LR update collision with valid in EX1" - severity failure; - if r.lr_update = '1' then - report "LR update to " & to_hstring(r.next_lr); + if valid_in = '1' then + report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & + " wr=" & to_hstring(rin.e.write_reg); end if; end if; end if; @@ -350,7 +350,6 @@ begin variable btnum, banum, bbnum : integer range 0 to 31; variable crresult : std_ulogic; variable l : std_ulogic; - variable next_nia : std_ulogic_vector(63 downto 0); variable carry_32, carry_64 : std_ulogic; variable sign1, sign2 : std_ulogic; 
variable abs1, abs2 : signed(63 downto 0); @@ -421,7 +420,6 @@ begin end loop; end if; - v.lr_update := '0'; v.mul_in_progress := '0'; v.div_in_progress := '0'; v.cntz_in_progress := '0'; @@ -673,7 +671,7 @@ begin v.busy := '0'; -- Next insn adder used in a couple of places - next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4); + next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); -- rotator control signals right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; @@ -846,35 +844,39 @@ begin newcrf & newcrf & newcrf & newcrf; when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM | OP_BCD => + when OP_B => is_branch := '1'; taken_branch := '1'; is_direct_branch := '1'; - abs_branch := insn_aa(e_in.insn); + abs_branch := e_in.br_abs; if ctrl.msr(MSR_BE) = '1' then do_trace := '1'; end if; - when OP_BC => - -- read_data1 is CTR + when OP_BC | OP_BCREG => + -- read_data1 is CTR + -- for OP_BCREG, read_data2 is target register (CTR, LR or TAR) + -- If this instruction updates both CTR and LR, then it is + -- doubled; the first instruction decrements CTR and determines + -- whether the branch is taken, and the second does the + -- redirect and the LR update. 
bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - is_branch := '1'; - is_direct_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - abs_branch := insn_aa(e_in.insn); - if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + if e_in.second = '0' then + taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); + else + taken_branch := r.br_taken; end if; - when OP_BCREG => - -- read_data1 is CTR - -- read_data2 is target register (CTR, LR or TAR) - bo := insn_bo(e_in.insn); - bi := insn_bi(e_in.insn); - is_branch := '1'; - taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - abs_branch := '1'; - if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + v.br_taken := taken_branch; + abs_branch := e_in.br_abs; + if e_in.repeat = '0' or e_in.second = '1' then + is_branch := '1'; + if e_in.insn_type = OP_BC then + is_direct_branch := '1'; + end if; + if ctrl.msr(MSR_BE) = '1' then + do_trace := '1'; + end if; end if; when OP_RFID => @@ -1197,11 +1199,6 @@ begin end if; v.e.valid := '1'; end if; - -- When doing delayed LR update, keep r.e.write_data unchanged - -- next cycle in case it is needed for a forwarded result (e.g. CTR). - if r.lr_update = '1' then - hold_wr_data := '1'; - end if; -- Generate FP-type program interrupt. fp_in.interrupt will only -- be set during the execution of a FP instruction. @@ -1274,30 +1271,6 @@ begin v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; v.e.rc := current.rc and v.e.valid and not exception; - -- Update LR on the next cycle after a branch link - -- If we're not writing back anything else, we can write back LR - -- this cycle, otherwise we take an extra cycle. We use the - -- exc_write path since next_nia is written through that path - -- in other places. 
- if v.e.valid = '1' and exception = '0' and current.lr = '1' then - if current.write_reg_enable = '0' then - v.e.exc_write_enable := '1'; - v.e.exc_write_data := next_nia; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - else - v.lr_update := '1'; - v.e.valid := '0'; - report "Delayed LR update to " & to_hstring(next_nia); - v.busy := '1'; - end if; - end if; - if r.lr_update = '1' then - v.e.exc_write_enable := '1'; - v.e.exc_write_data := r.next_lr; - v.e.exc_write_reg := fast_spr_num(SPR_LR); - v.e.valid := '1'; - end if; - -- Defer completion for one cycle when redirecting. -- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1 if v.redirect = '1' then From c0b45e153b39ddd7fe062b575d136979cdcec076 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 10 Nov 2020 20:04:00 +1100 Subject: [PATCH 03/13] core: Track GPR hazards using tags that propagate through the pipelines This changes the way GPR hazards are detected and tracked. Instead of having a model of the pipeline in gpr_hazard.vhdl, which has to mirror the behaviour of the real pipeline exactly, we now assign a 2-bit tag to each instruction and record which GSPR the instruction writes. Subsequent instructions that need to use the GSPR get the tag number and stall until the value with that tag is being written back to the register file. For now, the forwarding paths are disabled. That gives about an 8% reduction in coremark performance.
Signed-off-by: Paul Mackerras --- Makefile | 2 +- common.vhdl | 71 +++++++++++----- control.vhdl | 214 +++++++++++++++++++++++++++--------------------- core.vhdl | 2 +- decode2.vhdl | 12 +-- execute1.vhdl | 9 +- fpu.vhdl | 3 + gpr_hazard.vhdl | 112 ------------------------- loadstore1.vhdl | 3 + microwatt.core | 1 - writeback.vhdl | 16 +++- 11 files changed, 202 insertions(+), 243 deletions(-) delete mode 100644 gpr_hazard.vhdl diff --git a/Makefile b/Makefile index 2ee5d57..bb39007 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ all: $(all) core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ - decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \ + decode1.vhdl helpers.vhdl insn_helpers.vhdl \ cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \ diff --git a/common.vhdl b/common.vhdl index 686e414..8792944 100644 --- a/common.vhdl +++ b/common.vhdl @@ -3,6 +3,7 @@ use ieee.std_logic_1164.all; use ieee.numeric_std.all; library work; +use work.utils.all; use work.decode_types.all; package common is @@ -126,6 +127,17 @@ package common is constant FPSCR_NI : integer := 63 - 61; constant FPSCR_RN : integer := 63 - 63; + -- Used for tracking instruction completion and pending register writes + constant TAG_COUNT : positive := 4; + constant TAG_NUMBER_BITS : natural := log2(TAG_COUNT); + subtype tag_number_t is integer range 0 to TAG_COUNT - 1; + subtype tag_index_t is unsigned(TAG_NUMBER_BITS - 1 downto 0); + type instr_tag_t is record + tag : tag_number_t; + valid : std_ulogic; + end record; + constant instr_tag_init : instr_tag_t := (tag => 0, valid => '0'); + type irq_state_t is (WRITE_SRR0, WRITE_SRR1); -- For now, fixed 16 sources, make this either a parametric @@ -197,6 +209,7 @@ package common is fac : facility_t; insn_type: 
insn_type_t; nia: std_ulogic_vector(63 downto 0); + instr_tag : instr_tag_t; write_reg: gspr_index_t; write_reg_enable: std_ulogic; read_reg1: gspr_index_t; @@ -236,7 +249,7 @@ package common is second : std_ulogic; -- set if this is the second op end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := - (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, + (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', bypass_cr => '0', lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', @@ -291,9 +304,9 @@ package common is end record; type RegisterFileToDecode2Type is record - read1_data : std_ulogic_vector(63 downto 0); - read2_data : std_ulogic_vector(63 downto 0); - read3_data : std_ulogic_vector(63 downto 0); + read1_data : std_ulogic_vector(63 downto 0); + read2_data : std_ulogic_vector(63 downto 0); + read3_data : std_ulogic_vector(63 downto 0); end record; type Decode2ToCrFileType is record @@ -326,6 +339,7 @@ package common is op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do nia : std_ulogic_vector(63 downto 0); insn : std_ulogic_vector(31 downto 0); + instr_tag : instr_tag_t; addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read @@ -345,14 +359,17 @@ package common is repeat : std_ulogic; second : std_ulogic; end record; - constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', - sign_extend => '0', update => '0', xerc => xerc_init, - reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0', - nia => (others => '0'), insn => (others => '0'), - addr1 => (others => '0'), addr2 => (others => '0'), data => 
(others => '0'), - write_reg => (others => '0'), length => (others => '0'), - mode_32bit => '0', is_32bit => '0', - repeat => '0', second => '0'); + constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := + (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', + sign_extend => '0', update => '0', xerc => xerc_init, + reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0', + nia => (others => '0'), insn => (others => '0'), + instr_tag => instr_tag_init, + addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), + write_reg => (others => '0'), + length => (others => '0'), + mode_32bit => '0', is_32bit => '0', + repeat => '0', second => '0'); type Loadstore1ToExecute1Type is record busy : std_ulogic; @@ -439,6 +456,7 @@ package common is type Loadstore1ToWritebackType is record valid : std_ulogic; + instr_tag : instr_tag_t; write_enable: std_ulogic; write_reg : gspr_index_t; write_data : std_ulogic_vector(63 downto 0); @@ -446,11 +464,13 @@ package common is rc : std_ulogic; store_done : std_ulogic; end record; - constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', xerc => xerc_init, - rc => '0', store_done => '0', write_data => (others => '0'), others => (others => '0')); + constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := + (valid => '0', instr_tag => instr_tag_init, write_enable => '0', xerc => xerc_init, + rc => '0', store_done => '0', write_data => (others => '0'), others => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; + instr_tag : instr_tag_t; rc : std_ulogic; mode_32bit : std_ulogic; write_enable : std_ulogic; @@ -465,17 +485,19 @@ package common is exc_write_reg : gspr_index_t; exc_write_data : std_ulogic_vector(63 downto 0); end record; - constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', mode_32bit => '0', write_enable => '0', - write_cr_enable => '0', exc_write_enable => '0', - 
write_xerc_enable => '0', xerc => xerc_init, - write_data => (others => '0'), write_cr_mask => (others => '0'), - write_cr_data => (others => '0'), write_reg => (others => '0'), - exc_write_reg => (others => '0'), exc_write_data => (others => '0')); + constant Execute1ToWritebackInit : Execute1ToWritebackType := + (valid => '0', instr_tag => instr_tag_init, rc => '0', mode_32bit => '0', + write_enable => '0', write_cr_enable => '0', exc_write_enable => '0', + write_xerc_enable => '0', xerc => xerc_init, + write_data => (others => '0'), write_cr_mask => (others => '0'), + write_cr_data => (others => '0'), write_reg => (others => '0'), + exc_write_reg => (others => '0'), exc_write_data => (others => '0')); type Execute1ToFPUType is record valid : std_ulogic; op : insn_type_t; nia : std_ulogic_vector(63 downto 0); + itag : instr_tag_t; insn : std_ulogic_vector(31 downto 0); single : std_ulogic; fe_mode : std_ulogic_vector(1 downto 0); @@ -487,6 +509,7 @@ package common is out_cr : std_ulogic; end record; constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'), + itag => instr_tag_init, insn => (others => '0'), fe_mode => "00", rc => '0', fra => (others => '0'), frb => (others => '0'), frc => (others => '0'), frt => (others => '0'), @@ -502,6 +525,7 @@ package common is type FPUToWritebackType is record valid : std_ulogic; + instr_tag : instr_tag_t; write_enable : std_ulogic; write_reg : gspr_index_t; write_data : std_ulogic_vector(63 downto 0); @@ -509,7 +533,9 @@ package common is write_cr_mask : std_ulogic_vector(7 downto 0); write_cr_data : std_ulogic_vector(31 downto 0); end record; - constant FPUToWritebackInit : FPUToWritebackType := (valid => '0', write_enable => '0', write_cr_enable => '0', others => (others => '0')); + constant FPUToWritebackInit : FPUToWritebackType := (valid => '0', instr_tag => instr_tag_init, + write_enable => '0', write_cr_enable => '0', + others => (others => '0')); type 
DividerToExecute1Type is record valid: std_ulogic; @@ -524,7 +550,8 @@ package common is write_data : std_ulogic_vector(63 downto 0); write_enable : std_ulogic; end record; - constant WritebackToRegisterFileInit : WritebackToRegisterFileType := (write_enable => '0', write_data => (others => '0'), others => (others => '0')); + constant WritebackToRegisterFileInit : WritebackToRegisterFileType := + (write_enable => '0', write_data => (others => '0'), others => (others => '0')); type WritebackToCrFileType is record write_cr_enable : std_ulogic; diff --git a/control.vhdl b/control.vhdl index a89dab8..576627b 100644 --- a/control.vhdl +++ b/control.vhdl @@ -12,7 +12,7 @@ entity control is clk : in std_ulogic; rst : in std_ulogic; - complete_in : in std_ulogic; + complete_in : in instr_tag_t; valid_in : in std_ulogic; repeated : in std_ulogic; flush_in : in std_ulogic; @@ -25,9 +25,6 @@ entity control is gpr_write_in : in gspr_index_t; gpr_bypassable : in std_ulogic; - update_gpr_write_valid : in std_ulogic; - update_gpr_write_reg : in gspr_index_t; - gpr_a_read_valid_in : in std_ulogic; gpr_a_read_in : in gspr_index_t; @@ -48,7 +45,9 @@ entity control is gpr_bypass_a : out std_ulogic; gpr_bypass_b : out std_ulogic; gpr_bypass_c : out std_ulogic; - cr_bypass : out std_ulogic + cr_bypass : out std_ulogic; + + instr_tag_out : out instr_tag_t ); end entity control; @@ -71,85 +70,31 @@ architecture rtl of control is signal gpr_write_valid : std_ulogic := '0'; signal cr_write_valid : std_ulogic := '0'; -begin - gpr_hazard0: entity work.gpr_hazard - generic map ( - PIPELINE_DEPTH => PIPELINE_DEPTH - ) - port map ( - clk => clk, - busy_in => busy_in, - deferred => deferred, - complete_in => complete_in, - flush_in => flush_in, - issuing => valid_out, - repeated => repeated, - - gpr_write_valid_in => gpr_write_valid, - gpr_write_in => gpr_write_in, - bypass_avail => gpr_bypassable, - gpr_read_valid_in => gpr_a_read_valid_in, - gpr_read_in => gpr_a_read_in, - - ugpr_write_valid 
=> update_gpr_write_valid, - ugpr_write_reg => update_gpr_write_reg, - - stall_out => stall_a_out, - use_bypass => gpr_bypass_a - ); + type tag_register is record + wr_gpr : std_ulogic; + reg : gspr_index_t; + recent : std_ulogic; + end record; - gpr_hazard1: entity work.gpr_hazard - generic map ( - PIPELINE_DEPTH => PIPELINE_DEPTH - ) - port map ( - clk => clk, - busy_in => busy_in, - deferred => deferred, - complete_in => complete_in, - flush_in => flush_in, - issuing => valid_out, - repeated => repeated, + type tag_regs_array is array(tag_number_t) of tag_register; + signal tag_regs : tag_regs_array; - gpr_write_valid_in => gpr_write_valid, - gpr_write_in => gpr_write_in, - bypass_avail => gpr_bypassable, - gpr_read_valid_in => gpr_b_read_valid_in, - gpr_read_in => gpr_b_read_in, + signal instr_tag : instr_tag_t; - ugpr_write_valid => update_gpr_write_valid, - ugpr_write_reg => update_gpr_write_reg, + signal gpr_tag_a : instr_tag_t; + signal gpr_tag_b : instr_tag_t; + signal gpr_tag_c : instr_tag_t; + signal gpr_tag_stall : std_ulogic; - stall_out => stall_b_out, - use_bypass => gpr_bypass_b - ); + signal curr_tag : tag_number_t; + signal next_tag : tag_number_t; - gpr_hazard2: entity work.gpr_hazard - generic map ( - PIPELINE_DEPTH => PIPELINE_DEPTH - ) - port map ( - clk => clk, - busy_in => busy_in, - deferred => deferred, - complete_in => complete_in, - flush_in => flush_in, - issuing => valid_out, - repeated => repeated, - - gpr_write_valid_in => gpr_write_valid, - gpr_write_in => gpr_write_in, - bypass_avail => gpr_bypassable, - gpr_read_valid_in => gpr_c_read_valid_in, - gpr_read_in => gpr_c_read_in, - - ugpr_write_valid => update_gpr_write_valid, - ugpr_write_reg => update_gpr_write_reg, - - stall_out => stall_c_out, - use_bypass => gpr_bypass_c - ); + function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean is + begin + return tag1.valid = '1' and tag2.valid = '1' and tag1.tag = tag2.tag; + end; +begin cr_hazard0: entity work.cr_hazard 
generic map ( PIPELINE_DEPTH => PIPELINE_DEPTH @@ -158,7 +103,7 @@ begin clk => clk, busy_in => busy_in, deferred => deferred, - complete_in => complete_in, + complete_in => complete_in.valid, flush_in => flush_in, issuing => valid_out, @@ -170,15 +115,102 @@ begin use_bypass => cr_bypass ); + gpr_bypass_a <= '0'; + gpr_bypass_b <= '0'; + gpr_bypass_c <= '0'; + control0: process(clk) begin if rising_edge(clk) then assert rin_int.outstanding >= 0 and rin_int.outstanding <= (PIPELINE_DEPTH+1) report "Outstanding bad " & integer'image(rin_int.outstanding) severity failure; r_int <= rin_int; + for i in tag_number_t loop + if rst = '1' or flush_in = '1' then + tag_regs(i).wr_gpr <= '0'; + else + if complete_in.valid = '1' and i = complete_in.tag then + tag_regs(i).wr_gpr <= '0'; + report "tag " & integer'image(i) & " not valid"; + end if; + if gpr_write_valid = '1' and tag_regs(i).reg = gpr_write_in then + tag_regs(i).recent <= '0'; + if tag_regs(i).recent = '1' and tag_regs(i).wr_gpr = '1' then + report "tag " & integer'image(i) & " not recent"; + end if; + end if; + if instr_tag.valid = '1' and i = instr_tag.tag then + tag_regs(i).wr_gpr <= gpr_write_valid; + tag_regs(i).reg <= gpr_write_in; + tag_regs(i).recent <= gpr_write_valid; + if gpr_write_valid = '1' then + report "tag " & integer'image(i) & " valid for gpr " & to_hstring(gpr_write_in); + end if; + end if; + end if; + end loop; + if rst = '1' then + curr_tag <= 0; + else + curr_tag <= next_tag; + end if; end if; end process; + control_hazards : process(all) + variable gpr_stall : std_ulogic; + variable tag_a : instr_tag_t; + variable tag_b : instr_tag_t; + variable tag_c : instr_tag_t; + variable tag_s : instr_tag_t; + variable tag_t : instr_tag_t; + variable incr_tag : tag_number_t; + begin + tag_a := instr_tag_init; + for i in tag_number_t loop + if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_a_read_in then + tag_a.valid := gpr_a_read_valid_in; + tag_a.tag := i; + end if; 
+ end loop; + if tag_match(tag_a, complete_in) then + tag_a.valid := '0'; + end if; + tag_b := instr_tag_init; + for i in tag_number_t loop + if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_b_read_in then + tag_b.valid := gpr_b_read_valid_in; + tag_b.tag := i; + end if; + end loop; + if tag_match(tag_b, complete_in) then + tag_b.valid := '0'; + end if; + tag_c := instr_tag_init; + for i in tag_number_t loop + if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then + tag_c.valid := gpr_c_read_valid_in; + tag_c.tag := i; + end if; + end loop; + if tag_match(tag_c, complete_in) then + tag_c.valid := '0'; + end if; + gpr_tag_a <= tag_a; + gpr_tag_b <= tag_b; + gpr_tag_c <= tag_c; + gpr_tag_stall <= tag_a.valid or tag_b.valid or tag_c.valid; + + incr_tag := curr_tag; + instr_tag.tag <= curr_tag; + instr_tag.valid <= valid_out and not deferred; + if instr_tag.valid = '1' then + incr_tag := (curr_tag + 1) mod TAG_COUNT; + end if; + next_tag <= incr_tag; + instr_tag_out <= instr_tag; + end process; + control1 : process(all) variable v_int : reg_internal_type; variable valid_tmp : std_ulogic; @@ -193,7 +225,7 @@ begin if flush_in = '1' then -- expect to see complete_in next cycle v_int.outstanding := 1; - elsif complete_in = '1' then + elsif complete_in.valid = '1' then v_int.outstanding := r_int.outstanding - 1; end if; @@ -222,8 +254,8 @@ begin v_int.state := WAIT_FOR_CURR_TO_COMPLETE; end if; else - -- let it go out if there are no GPR hazards - stall_tmp := stall_a_out or stall_b_out or stall_c_out or cr_stall_out; + -- let it go out if there are no GPR or CR hazards + stall_tmp := gpr_tag_stall or cr_stall_out; end if; end if; @@ -249,8 +281,8 @@ begin v_int.state := WAIT_FOR_CURR_TO_COMPLETE; end if; else - -- let it go out if there are no GPR hazards - stall_tmp := stall_a_out or stall_b_out or stall_c_out or cr_stall_out; + -- let it go out if there are no GPR or CR hazards + stall_tmp := 
gpr_tag_stall or cr_stall_out; end if; end if; else @@ -262,15 +294,11 @@ begin valid_tmp := '0'; end if; - if valid_tmp = '1' then - if deferred = '0' then - v_int.outstanding := v_int.outstanding + 1; - end if; - gpr_write_valid <= gpr_write_valid_in; - cr_write_valid <= cr_write_in; - else - gpr_write_valid <= '0'; - cr_write_valid <= '0'; + gpr_write_valid <= gpr_write_valid_in and valid_tmp; + cr_write_valid <= cr_write_in and valid_tmp; + + if valid_tmp = '1' and deferred = '0' then + v_int.outstanding := v_int.outstanding + 1; end if; -- update outputs diff --git a/core.vhdl b/core.vhdl index 3948b86..2ac2ece 100644 --- a/core.vhdl +++ b/core.vhdl @@ -102,7 +102,7 @@ architecture behave of core is signal decode1_flush: std_ulogic; signal fetch1_flush: std_ulogic; - signal complete: std_ulogic; + signal complete: instr_tag_t; signal terminate: std_ulogic; signal core_rst: std_ulogic; signal icache_inv: std_ulogic; diff --git a/decode2.vhdl b/decode2.vhdl index 274a241..a5d7f67 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -19,7 +19,7 @@ entity decode2 is clk : in std_ulogic; rst : in std_ulogic; - complete_in : in std_ulogic; + complete_in : in instr_tag_t; busy_in : in std_ulogic; stall_out : out std_ulogic; @@ -303,6 +303,8 @@ architecture behaviour of decode2 is signal cr_bypass : std_ulogic; signal cr_bypass_avail : std_ulogic; + signal instr_tag : instr_tag_t; + begin control_0: entity work.control generic map ( @@ -325,9 +327,6 @@ begin gpr_write_in => gpr_write, gpr_bypassable => gpr_bypassable, - update_gpr_write_valid => '0', - update_gpr_write_reg => 7x"00", - gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -348,7 +347,9 @@ begin gpr_bypass_a => gpr_a_bypass, gpr_bypass_b => gpr_b_bypass, - gpr_bypass_c => gpr_c_bypass + gpr_bypass_c => gpr_c_bypass, + + instr_tag_out => instr_tag ); deferred <= r.e.valid and busy_in; @@ -454,6 +455,7 @@ begin v.e.nia := d_in.nia; v.e.unit := d_in.decode.unit; v.e.fac := 
d_in.decode.facility; + v.e.instr_tag := instr_tag; v.e.read_reg1 := decoded_reg_a.reg; v.e.read_data1 := decoded_reg_a.data; v.e.bypass_data1 := gpr_a_bypass; diff --git a/execute1.vhdl b/execute1.vhdl index 2690424..e1fc240 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -319,7 +319,8 @@ begin ctrl <= ctrl_tmp; if valid_in = '1' then report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & - " wr=" & to_hstring(rin.e.write_reg); + " wr=" & to_hstring(rin.e.write_reg) & " we=" & std_ulogic'image(rin.e.write_enable) & + " tag=" & integer'image(rin.e.instr_tag.tag) & std_ulogic'image(rin.e.instr_tag.valid); end if; end if; end if; @@ -694,6 +695,7 @@ begin end if; v.e.mode_32bit := not ctrl.msr(MSR_SF); + v.e.instr_tag := current.instr_tag; do_trace := valid_in and ctrl.msr(MSR_SE); if valid_in = '1' then @@ -749,8 +751,6 @@ begin end if; if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then - report "execute nia " & to_hstring(e_in.nia); - v.cur_instr := e_in; v.next_lr := next_nia; v.e.valid := '1'; @@ -909,7 +909,6 @@ begin when OP_ISEL => when OP_CROP => cr_op := insn_cr(e_in.insn); - report "CR OP " & to_hstring(cr_op); if cr_op(0) = '0' then -- MCRF bf := insn_bf(e_in.insn); bfa := insn_bfa(e_in.insn); @@ -1309,6 +1308,7 @@ begin -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; lv.nia := e_in.nia; + lv.instr_tag := e_in.instr_tag; lv.addr1 := a_in; lv.addr2 := b_in; lv.data := c_in; @@ -1337,6 +1337,7 @@ begin fv.op := e_in.insn_type; fv.nia := e_in.nia; fv.insn := e_in.insn; + fv.itag := e_in.instr_tag; fv.single := e_in.is_32bit; fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1); fv.fra := a_in; diff --git a/fpu.vhdl b/fpu.vhdl index 2e8096a..5e5c7d6 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -75,6 +75,7 @@ architecture behaviour of fpu is do_intr : std_ulogic; op : insn_type_t; insn : std_ulogic_vector(31 downto 0); + instr_tag : instr_tag_t; dest_fpr : gspr_index_t; fe_mode : std_ulogic; 
rc : std_ulogic; @@ -574,6 +575,7 @@ begin e_out.interrupt <= r.do_intr; w_out.valid <= r.instr_done and not r.do_intr; + w_out.instr_tag <= r.instr_tag; w_out.write_enable <= r.writing_back; w_out.write_reg <= r.dest_fpr; w_out.write_data <= fp_result; @@ -643,6 +645,7 @@ begin if e_in.valid = '1' then v.insn := e_in.insn; v.op := e_in.op; + v.instr_tag := e_in.itag; v.fe_mode := or (e_in.fe_mode); v.dest_fpr := e_in.frt; v.single_prec := e_in.single; diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl deleted file mode 100644 index 6b00994..0000000 --- a/gpr_hazard.vhdl +++ /dev/null @@ -1,112 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -library work; -use work.common.all; - -entity gpr_hazard is - generic ( - PIPELINE_DEPTH : natural := 1 - ); - port( - clk : in std_ulogic; - busy_in : in std_ulogic; - deferred : in std_ulogic; - complete_in : in std_ulogic; - flush_in : in std_ulogic; - issuing : in std_ulogic; - repeated : in std_ulogic; - - gpr_write_valid_in : in std_ulogic; - gpr_write_in : in gspr_index_t; - bypass_avail : in std_ulogic; - gpr_read_valid_in : in std_ulogic; - gpr_read_in : in gspr_index_t; - - ugpr_write_valid : in std_ulogic; - ugpr_write_reg : in gspr_index_t; - - stall_out : out std_ulogic; - use_bypass : out std_ulogic - ); -end entity gpr_hazard; -architecture behaviour of gpr_hazard is - type pipeline_entry_type is record - valid : std_ulogic; - bypass : std_ulogic; - gpr : gspr_index_t; - ugpr_valid : std_ulogic; - ugpr : gspr_index_t; - end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'), - ugpr_valid => '0', ugpr => (others => '0')); - - type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; - constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); - - signal r, rin : pipeline_t := pipeline_t_init; -begin - gpr_hazard0: process(clk) - begin - if rising_edge(clk) then - r <= rin; - end if; - end 
process; - - gpr_hazard1: process(all) - variable v : pipeline_t; - begin - v := r; - - if complete_in = '1' then - v(PIPELINE_DEPTH).valid := '0'; - v(PIPELINE_DEPTH).ugpr_valid := '0'; - end if; - - stall_out <= '0'; - use_bypass <= '0'; - if repeated = '0' and gpr_read_valid_in = '1' then - loop_0: for i in 0 to PIPELINE_DEPTH loop - -- The second half of a split instruction never has GPR - -- dependencies on the first half's output GPR, - -- so ignore matches when i = 0 for the second half. - if v(i).valid = '1' and r(i).gpr = gpr_read_in and - not (i = 0 and repeated = '1') then - if r(i).bypass = '1' then - use_bypass <= '1'; - else - stall_out <= '1'; - end if; - end if; - if v(i).ugpr_valid = '1' and r(i).ugpr = gpr_read_in then - stall_out <= '1'; - end if; - end loop; - end if; - - -- XXX assumes PIPELINE_DEPTH = 1 - if busy_in = '0' then - v(1) := v(0); - v(0).valid := '0'; - v(0).ugpr_valid := '0'; - end if; - if deferred = '0' and issuing = '1' then - v(0).valid := gpr_write_valid_in; - v(0).bypass := bypass_avail; - v(0).gpr := gpr_write_in; - v(0).ugpr_valid := ugpr_write_valid; - v(0).ugpr := ugpr_write_reg; - end if; - if flush_in = '1' then - v(0).valid := '0'; - v(0).ugpr_valid := '0'; - v(1).valid := '0'; - v(1).ugpr_valid := '0'; - end if; - - -- update registers - rin <= v; - - end process; -end; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index fc5bcf9..935ce5f 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -65,6 +65,7 @@ architecture behave of loadstore1 is addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); load_data : std_ulogic_vector(63 downto 0); + instr_tag : instr_tag_t; write_reg : gspr_index_t; length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; @@ -503,6 +504,7 @@ begin v.align_intr := '0'; v.dwords_done := '0'; v.last_dword := '1'; + v.instr_tag := l_in.instr_tag; v.write_reg := l_in.write_reg; v.length := l_in.length; v.byte_reverse := l_in.byte_reverse; @@ -725,6 +727,7 @@ 
begin -- Multiplex either cache data to the destination GPR or -- the address for the rA update. l_out.valid <= done; + l_out.instr_tag <= r.instr_tag; l_out.write_reg <= r.write_reg; case r.wr_sel is when "00" => diff --git a/microwatt.core b/microwatt.core index 41b6230..0f77fba 100644 --- a/microwatt.core +++ b/microwatt.core @@ -19,7 +19,6 @@ filesets: - sim_console.vhdl - logical.vhdl - countzero.vhdl - - gpr_hazard.vhdl - cr_hazard.vhdl - control.vhdl - execute1.vhdl diff --git a/writeback.vhdl b/writeback.vhdl index 95de0ec..044b1fb 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -17,7 +17,7 @@ entity writeback is w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType; - complete_out : out std_ulogic + complete_out : out instr_tag_t ); end entity writeback; @@ -47,6 +47,10 @@ begin y(0) := fp_in.write_cr_enable; assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; + + assert not (e_in.valid = '1' and e_in.instr_tag.valid = '0') severity failure; + assert not (l_in.valid = '1' and l_in.instr_tag.valid = '0') severity failure; + assert not (fp_in.valid = '1' and fp_in.instr_tag.valid = '0') severity failure; end if; end process; @@ -59,9 +63,13 @@ begin w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; - complete_out <= '0'; - if e_in.valid = '1' or l_in.valid = '1' or fp_in.valid = '1' then - complete_out <= '1'; + complete_out <= instr_tag_init; + if e_in.valid = '1' then + complete_out <= e_in.instr_tag; + elsif l_in.valid = '1' then + complete_out <= l_in.instr_tag; + elsif fp_in.valid = '1' then + complete_out <= fp_in.instr_tag; end if; if e_in.exc_write_enable = '1' then From d290d2a9bbddcfe52faa9427088bf6c4f225a711 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Nov 2020 09:42:17 +1100 Subject: [PATCH 04/13] core: Restore bypass path from execute1 This changes the bypass path. 
Previously it went from after execute1's output to after decode2's output. Now it goes from before execute1's output register to before decode2's output register. The reason is that the new path will be simpler to manage when there are possibly multiple instructions in flight. This means that the bypassing can be managed inside decode2 and control. Signed-off-by: Paul Mackerras --- common.vhdl | 17 ++++++++++++---- control.vhdl | 44 +++++++++++++++++++++++++---------------- core.vhdl | 3 +++ decode2.vhdl | 55 +++++++++++++++++++++++++++++++-------------------- execute1.vhdl | 11 ++++++++--- 5 files changed, 85 insertions(+), 45 deletions(-) diff --git a/common.vhdl b/common.vhdl index 8792944..8d1ca29 100644 --- a/common.vhdl +++ b/common.vhdl @@ -137,6 +137,7 @@ package common is valid : std_ulogic; end record; constant instr_tag_init : instr_tag_t := (tag => 0, valid => '0'); + function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean; type irq_state_t is (WRITE_SRR0, WRITE_SRR1); @@ -203,6 +204,12 @@ package common is redirect_nia : std_ulogic_vector(63 downto 0); end record; + type bypass_data_t is record + tag : instr_tag_t; + data : std_ulogic_vector(63 downto 0); + end record; + constant bypass_data_init : bypass_data_t := (tag => instr_tag_init, data => (others => '0')); + type Decode2ToExecute1Type is record valid: std_ulogic; unit : unit_t; @@ -217,9 +224,6 @@ package common is read_data1: std_ulogic_vector(63 downto 0); read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); - bypass_data1: std_ulogic; - bypass_data2: std_ulogic; - bypass_data3: std_ulogic; cr: std_ulogic_vector(31 downto 0); bypass_cr : std_ulogic; xerc: xer_common_t; @@ -250,7 +254,7 @@ package common is end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, - write_reg_enable => '0', bypass_data1 => '0', bypass_data2 => '0', 
bypass_data3 => '0', + write_reg_enable => '0', bypass_cr => '0', lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', @@ -644,4 +648,9 @@ package body common is begin return "10" & f; end; + + function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean is + begin + return tag1.valid = '1' and tag2.valid = '1' and tag1.tag = tag2.tag; + end; end common; diff --git a/control.vhdl b/control.vhdl index 576627b..c4b8d4e 100644 --- a/control.vhdl +++ b/control.vhdl @@ -6,6 +6,7 @@ use work.common.all; entity control is generic ( + EX1_BYPASS : boolean := true; PIPELINE_DEPTH : natural := 2 ); port ( @@ -23,7 +24,6 @@ entity control is gpr_write_valid_in : in std_ulogic; gpr_write_in : in gspr_index_t; - gpr_bypassable : in std_ulogic; gpr_a_read_valid_in : in std_ulogic; gpr_a_read_in : in gspr_index_t; @@ -34,6 +34,8 @@ entity control is gpr_c_read_valid_in : in std_ulogic; gpr_c_read_in : in gspr_index_t; + execute_next_tag : in instr_tag_t; + cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; cr_bypassable : in std_ulogic; @@ -81,19 +83,11 @@ architecture rtl of control is signal instr_tag : instr_tag_t; - signal gpr_tag_a : instr_tag_t; - signal gpr_tag_b : instr_tag_t; - signal gpr_tag_c : instr_tag_t; signal gpr_tag_stall : std_ulogic; signal curr_tag : tag_number_t; signal next_tag : tag_number_t; - function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean is - begin - return tag1.valid = '1' and tag2.valid = '1' and tag1.tag = tag2.tag; - end; - begin cr_hazard0: entity work.cr_hazard generic map ( @@ -115,10 +109,6 @@ begin use_bypass => cr_bypass ); - gpr_bypass_a <= '0'; - gpr_bypass_b <= '0'; - gpr_bypass_c <= '0'; - control0: process(clk) begin if rising_edge(clk) then @@ -165,6 +155,9 @@ begin variable tag_s : 
instr_tag_t; variable tag_t : instr_tag_t; variable incr_tag : tag_number_t; + variable byp_a : std_ulogic; + variable byp_b : std_ulogic; + variable byp_c : std_ulogic; begin tag_a := instr_tag_init; for i in tag_number_t loop @@ -196,10 +189,27 @@ begin if tag_match(tag_c, complete_in) then tag_c.valid := '0'; end if; - gpr_tag_a <= tag_a; - gpr_tag_b <= tag_b; - gpr_tag_c <= tag_c; - gpr_tag_stall <= tag_a.valid or tag_b.valid or tag_c.valid; + + byp_a := '0'; + if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then + byp_a := '1'; + end if; + byp_b := '0'; + if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then + byp_b := '1'; + end if; + byp_c := '0'; + if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then + byp_c := '1'; + end if; + + gpr_bypass_a <= byp_a; + gpr_bypass_b <= byp_b; + gpr_bypass_c <= byp_c; + + gpr_tag_stall <= (tag_a.valid and not byp_a) or + (tag_b.valid and not byp_b) or + (tag_c.valid and not byp_c); incr_tag := curr_tag; instr_tag.tag <= curr_tag; diff --git a/core.vhdl b/core.vhdl index 2ac2ece..71bf2c8 100644 --- a/core.vhdl +++ b/core.vhdl @@ -67,6 +67,7 @@ architecture behave of core is -- execute signals signal execute1_to_writeback: Execute1ToWritebackType; signal execute1_to_fetch1: Execute1ToFetch1Type; + signal execute1_bypass: bypass_data_t; -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; @@ -273,6 +274,7 @@ begin r_out => decode2_to_register_file, c_in => cr_file_to_decode2, c_out => decode2_to_cr_file, + execute_bypass => execute1_bypass, log_out => log_data(119 downto 110) ); decode2_busy_in <= ex1_busy_out; @@ -330,6 +332,7 @@ begin f_out => execute1_to_fetch1, fp_out => execute1_to_fpu, e_out => execute1_to_writeback, + bypass_data => execute1_bypass, icache_inval => ex1_icache_inval, dbg_msr_out => msr, terminate_out => terminate, diff --git a/decode2.vhdl b/decode2.vhdl index a5d7f67..51c8ef1 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -37,6 +37,8 @@ entity decode2 is c_in : 
in CrFileToDecode2Type; c_out : out Decode2ToCrFileType; + execute_bypass : in bypass_data_t; + log_out : out std_ulogic_vector(9 downto 0) ); end entity decode2; @@ -285,19 +287,18 @@ architecture behaviour of decode2 is signal gpr_write_valid : std_ulogic; signal gpr_write : gspr_index_t; - signal gpr_bypassable : std_ulogic; signal gpr_a_read_valid : std_ulogic; - signal gpr_a_read :gspr_index_t; - signal gpr_a_bypass : std_ulogic; + signal gpr_a_read : gspr_index_t; + signal gpr_a_bypass : std_ulogic; signal gpr_b_read_valid : std_ulogic; - signal gpr_b_read : gspr_index_t; - signal gpr_b_bypass : std_ulogic; + signal gpr_b_read : gspr_index_t; + signal gpr_b_bypass : std_ulogic; signal gpr_c_read_valid : std_ulogic; - signal gpr_c_read : gspr_index_t; - signal gpr_c_bypass : std_ulogic; + signal gpr_c_read : gspr_index_t; + signal gpr_c_bypass : std_ulogic; signal cr_write_valid : std_ulogic; signal cr_bypass : std_ulogic; @@ -308,6 +309,7 @@ architecture behaviour of decode2 is begin control_0: entity work.control generic map ( + EX1_BYPASS => EX1_BYPASS, PIPELINE_DEPTH => 1 ) port map ( @@ -325,7 +327,6 @@ begin gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write, - gpr_bypassable => gpr_bypassable, gpr_a_read_valid_in => gpr_a_read_valid, gpr_a_read_in => gpr_a_read, @@ -336,6 +337,8 @@ begin gpr_c_read_valid_in => gpr_c_read_valid, gpr_c_read_in => gpr_c_read, + execute_next_tag => execute_bypass.tag, + cr_read_in => d_in.decode.input_cr, cr_write_in => cr_write_valid, cr_bypass => cr_bypass, @@ -457,13 +460,7 @@ begin v.e.fac := d_in.decode.facility; v.e.instr_tag := instr_tag; v.e.read_reg1 := decoded_reg_a.reg; - v.e.read_data1 := decoded_reg_a.data; - v.e.bypass_data1 := gpr_a_bypass; v.e.read_reg2 := decoded_reg_b.reg; - v.e.read_data2 := decoded_reg_b.data; - v.e.bypass_data2 := gpr_b_bypass; - v.e.read_data3 := decoded_reg_c.data; - v.e.bypass_data3 := gpr_c_bypass; v.e.write_reg := decoded_reg_o.reg; v.e.write_reg_enable := 
decoded_reg_o.reg_valid; v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); @@ -499,16 +496,32 @@ begin end if; end if; + -- See if any of the operands can get their value via the bypass path. + case gpr_a_bypass is + when '1' => + v.e.read_data1 := execute_bypass.data; + when others => + v.e.read_data1 := decoded_reg_a.data; + end case; + case gpr_b_bypass is + when '1' => + v.e.read_data2 := execute_bypass.data; + when others => + v.e.read_data2 := decoded_reg_b.data; + end case; + case gpr_c_bypass is + when '1' => + v.e.read_data3 := execute_bypass.data; + when others => + v.e.read_data3 := decoded_reg_c.data; + end case; + -- issue control control_valid_in <= d_in.valid; control_sgl_pipe <= d_in.decode.sgl_pipe; gpr_write_valid <= v.e.write_reg_enable; gpr_write <= decoded_reg_o.reg; - gpr_bypassable <= '0'; - if EX1_BYPASS and d_in.decode.unit = ALU then - gpr_bypassable <= '1'; - end if; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; @@ -554,9 +567,9 @@ begin r.e.valid & stopped_out & stall_out & - r.e.bypass_data3 & - r.e.bypass_data2 & - r.e.bypass_data1; + gpr_a_bypass & + gpr_b_bypass & + gpr_c_bypass; end if; end process; log_out <= log_data; diff --git a/execute1.vhdl b/execute1.vhdl index e1fc240..c0cc32f 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -37,6 +37,7 @@ entity execute1 is fp_out : out Execute1ToFPUType; e_out : out Execute1ToWritebackType; + bypass_data : out bypass_data_t; dbg_msr_out : out std_ulogic_vector(63 downto 0); @@ -283,9 +284,9 @@ begin dbg_msr_out <= ctrl.msr; log_rd_addr <= r.log_addr_spr; - a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1; - b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2; - c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3; + a_in <= e_in.read_data1; + b_in <= e_in.read_data2; + c_in <= e_in.read_data3; busy_out <= l_in.busy or r.busy or fp_in.busy; valid_in <= 
e_in.valid and not busy_out; @@ -1270,6 +1271,10 @@ begin v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; v.e.rc := current.rc and v.e.valid and not exception; + bypass_data.tag.valid <= current.instr_tag.valid and current.write_reg_enable and v.e.valid; + bypass_data.tag.tag <= current.instr_tag.tag; + bypass_data.data <= v.e.write_data; + -- Defer completion for one cycle when redirecting. -- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1 if v.redirect = '1' then From ae2afeca5c7ba04eebb79e671534c3431006fe13 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 12 Nov 2020 22:07:33 +1100 Subject: [PATCH 05/13] core: Track CR hazards and bypasses using tags Signed-off-by: Paul Mackerras --- Makefile | 2 +- common.vhdl | 9 ++++-- control.vhdl | 58 ++++++++++++++++++---------------- core.vhdl | 3 ++ cr_hazard.vhdl | 86 -------------------------------------------------- decode1.vhdl | 2 +- decode2.vhdl | 25 ++++++++------- execute1.vhdl | 27 +++++++--------- microwatt.core | 1 - 9 files changed, 68 insertions(+), 145 deletions(-) delete mode 100644 cr_hazard.vhdl diff --git a/Makefile b/Makefile index bb39007..678bbfa 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ all: $(all) core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \ decode1.vhdl helpers.vhdl insn_helpers.vhdl \ - cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \ + control.vhdl decode2.vhdl register_file.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \ loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \ diff --git a/common.vhdl b/common.vhdl index 8d1ca29..0151595 100644 --- a/common.vhdl +++ b/common.vhdl @@ -210,6 +210,12 @@ package common is end record; constant bypass_data_init : bypass_data_t := (tag => instr_tag_init, data => (others => '0')); + type 
cr_bypass_data_t is record + tag : instr_tag_t; + data : std_ulogic_vector(31 downto 0); + end record; + constant cr_bypass_data_init : cr_bypass_data_t := (tag => instr_tag_init, data => (others => '0')); + type Decode2ToExecute1Type is record valid: std_ulogic; unit : unit_t; @@ -225,7 +231,6 @@ package common is read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); cr: std_ulogic_vector(31 downto 0); - bypass_cr : std_ulogic; xerc: xer_common_t; lr: std_ulogic; br_abs: std_ulogic; @@ -255,7 +260,7 @@ package common is constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, write_reg_enable => '0', - bypass_cr => '0', lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', + lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), diff --git a/control.vhdl b/control.vhdl index c4b8d4e..5c83f78 100644 --- a/control.vhdl +++ b/control.vhdl @@ -35,10 +35,10 @@ entity control is gpr_c_read_in : in gspr_index_t; execute_next_tag : in instr_tag_t; + execute_next_cr_tag : in instr_tag_t; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; - cr_bypassable : in std_ulogic; valid_out : out std_ulogic; stall_out : out std_ulogic; @@ -64,11 +64,6 @@ architecture rtl of control is signal r_int, rin_int : reg_internal_type := reg_internal_init; - signal stall_a_out : std_ulogic; - signal stall_b_out : std_ulogic; - signal stall_c_out : std_ulogic; - signal cr_stall_out : std_ulogic; - signal gpr_write_valid : std_ulogic := '0'; signal cr_write_valid : std_ulogic := '0'; @@ -76,6 +71,7 @@ architecture rtl of control is wr_gpr : std_ulogic; 
reg : gspr_index_t; recent : std_ulogic; + wr_cr : std_ulogic; end record; type tag_regs_array is array(tag_number_t) of tag_register; @@ -84,31 +80,14 @@ architecture rtl of control is signal instr_tag : instr_tag_t; signal gpr_tag_stall : std_ulogic; + signal cr_tag_stall : std_ulogic; signal curr_tag : tag_number_t; signal next_tag : tag_number_t; -begin - cr_hazard0: entity work.cr_hazard - generic map ( - PIPELINE_DEPTH => PIPELINE_DEPTH - ) - port map ( - clk => clk, - busy_in => busy_in, - deferred => deferred, - complete_in => complete_in.valid, - flush_in => flush_in, - issuing => valid_out, - - cr_read_in => cr_read_in, - cr_write_in => cr_write_valid, - bypassable => cr_bypassable, - - stall_out => cr_stall_out, - use_bypass => cr_bypass - ); + signal curr_cr_tag : tag_number_t; +begin control0: process(clk) begin if rising_edge(clk) then @@ -118,9 +97,11 @@ begin for i in tag_number_t loop if rst = '1' or flush_in = '1' then tag_regs(i).wr_gpr <= '0'; + tag_regs(i).wr_cr <= '0'; else if complete_in.valid = '1' and i = complete_in.tag then tag_regs(i).wr_gpr <= '0'; + tag_regs(i).wr_cr <= '0'; report "tag " & integer'image(i) & " not valid"; end if; if gpr_write_valid = '1' and tag_regs(i).reg = gpr_write_in then @@ -133,6 +114,7 @@ begin tag_regs(i).wr_gpr <= gpr_write_valid; tag_regs(i).reg <= gpr_write_in; tag_regs(i).recent <= gpr_write_valid; + tag_regs(i).wr_cr <= cr_write_valid; if gpr_write_valid = '1' then report "tag " & integer'image(i) & " valid for gpr " & to_hstring(gpr_write_in); end if; @@ -141,8 +123,12 @@ begin end loop; if rst = '1' then curr_tag <= 0; + curr_cr_tag <= 0; else curr_tag <= next_tag; + if cr_write_valid = '1' then + curr_cr_tag <= instr_tag.tag; + end if; end if; end if; end process; @@ -158,6 +144,8 @@ begin variable byp_a : std_ulogic; variable byp_b : std_ulogic; variable byp_c : std_ulogic; + variable tag_cr : instr_tag_t; + variable byp_cr : std_ulogic; begin tag_a := instr_tag_init; for i in tag_number_t loop @@ 
-219,6 +207,20 @@ begin end if; next_tag <= incr_tag; instr_tag_out <= instr_tag; + + -- CR hazards + tag_cr.tag := curr_cr_tag; + tag_cr.valid := cr_read_in and tag_regs(curr_cr_tag).wr_cr; + if tag_match(tag_cr, complete_in) then + tag_cr.valid := '0'; + end if; + byp_cr := '0'; + if EX1_BYPASS and tag_match(execute_next_cr_tag, tag_cr) then + byp_cr := '1'; + end if; + + cr_bypass <= byp_cr; + cr_tag_stall <= tag_cr.valid and not byp_cr; end process; control1 : process(all) @@ -265,7 +267,7 @@ begin end if; else -- let it go out if there are no GPR or CR hazards - stall_tmp := gpr_tag_stall or cr_stall_out; + stall_tmp := gpr_tag_stall or cr_tag_stall; end if; end if; @@ -292,7 +294,7 @@ begin end if; else -- let it go out if there are no GPR or CR hazards - stall_tmp := gpr_tag_stall or cr_stall_out; + stall_tmp := gpr_tag_stall or cr_tag_stall; end if; end if; else diff --git a/core.vhdl b/core.vhdl index 71bf2c8..7dafd1c 100644 --- a/core.vhdl +++ b/core.vhdl @@ -68,6 +68,7 @@ architecture behave of core is signal execute1_to_writeback: Execute1ToWritebackType; signal execute1_to_fetch1: Execute1ToFetch1Type; signal execute1_bypass: bypass_data_t; + signal execute1_cr_bypass: cr_bypass_data_t; -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; @@ -275,6 +276,7 @@ begin c_in => cr_file_to_decode2, c_out => decode2_to_cr_file, execute_bypass => execute1_bypass, + execute_cr_bypass => execute1_cr_bypass, log_out => log_data(119 downto 110) ); decode2_busy_in <= ex1_busy_out; @@ -333,6 +335,7 @@ begin fp_out => execute1_to_fpu, e_out => execute1_to_writeback, bypass_data => execute1_bypass, + bypass_cr_data => execute1_cr_bypass, icache_inval => ex1_icache_inval, dbg_msr_out => msr, terminate_out => terminate, diff --git a/cr_hazard.vhdl b/cr_hazard.vhdl deleted file mode 100644 index a6203a8..0000000 --- a/cr_hazard.vhdl +++ /dev/null @@ -1,86 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -entity 
cr_hazard is - generic ( - PIPELINE_DEPTH : natural := 1 - ); - port( - clk : in std_ulogic; - busy_in : in std_ulogic; - deferred : in std_ulogic; - complete_in : in std_ulogic; - flush_in : in std_ulogic; - issuing : in std_ulogic; - - cr_read_in : in std_ulogic; - cr_write_in : in std_ulogic; - bypassable : in std_ulogic; - - stall_out : out std_ulogic; - use_bypass : out std_ulogic - ); -end entity cr_hazard; -architecture behaviour of cr_hazard is - type pipeline_entry_type is record - valid : std_ulogic; - bypass : std_ulogic; - end record; - constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0'); - - type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type; - constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init); - - signal r, rin : pipeline_t := pipeline_t_init; -begin - cr_hazard0: process(clk) - begin - if rising_edge(clk) then - r <= rin; - end if; - end process; - - cr_hazard1: process(all) - variable v : pipeline_t; - begin - v := r; - - -- XXX assumes PIPELINE_DEPTH = 1 - if complete_in = '1' then - v(1).valid := '0'; - end if; - - use_bypass <= '0'; - stall_out <= '0'; - if cr_read_in = '1' then - loop_0: for i in 0 to PIPELINE_DEPTH loop - if v(i).valid = '1' then - if r(i).bypass = '1' then - use_bypass <= '1'; - else - stall_out <= '1'; - end if; - end if; - end loop; - end if; - - -- XXX assumes PIPELINE_DEPTH = 1 - if busy_in = '0' then - v(1) := r(0); - v(0).valid := '0'; - end if; - if deferred = '0' and issuing = '1' then - v(0).valid := cr_write_in; - v(0).bypass := bypassable; - end if; - if flush_in = '1' then - v(0).valid := '0'; - v(1).valid := '0'; - end if; - - -- update registers - rin <= v; - - end process; -end; diff --git a/decode1.vhdl b/decode1.vhdl index f62594b..2869c39 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -214,7 +214,7 @@ architecture behaviour of decode1 is 2#0100111010# => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', 
'0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cbcdtd 2#0100011010# => (ALU, NONE, OP_BCD, NONE, NONE, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cdtbcd 2#0000000000# => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmp - 2#0111111100# => (ALU, NONE, OP_CMPB, NONE, RB, RS, RA, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpb + 2#0111111100# => (ALU, NONE, OP_CMPB, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpb 2#0011100000# => (ALU, NONE, OP_CMPEQB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpeqb 2#0000100000# => (ALU, NONE, OP_CMP, RA, RB, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpl 2#0011000000# => (ALU, NONE, OP_CMPRB, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmprb diff --git a/decode2.vhdl b/decode2.vhdl index 51c8ef1..748edb9 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -37,7 +37,8 @@ entity decode2 is c_in : in CrFileToDecode2Type; c_out : out Decode2ToCrFileType; - execute_bypass : in bypass_data_t; + execute_bypass : in bypass_data_t; + execute_cr_bypass : in cr_bypass_data_t; log_out : out std_ulogic_vector(9 downto 0) ); @@ -300,9 +301,9 @@ architecture behaviour of decode2 is signal gpr_c_read : gspr_index_t; signal gpr_c_bypass : std_ulogic; + signal cr_read_valid : std_ulogic; signal cr_write_valid : std_ulogic; signal cr_bypass : std_ulogic; - signal cr_bypass_avail : std_ulogic; signal instr_tag : instr_tag_t; @@ -338,11 +339,11 @@ begin gpr_c_read_in => gpr_c_read, execute_next_tag => execute_bypass.tag, + execute_next_cr_tag => execute_cr_bypass.tag, - cr_read_in => 
d_in.decode.input_cr, + cr_read_in => cr_read_valid, cr_write_in => cr_write_valid, cr_bypass => cr_bypass, - cr_bypassable => cr_bypass_avail, valid_out => control_valid_out, stall_out => control_stall_out, @@ -391,7 +392,7 @@ begin --v.e.input_cr := d_in.decode.input_cr; v.e.output_cr := d_in.decode.output_cr; - + decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1, d_in.nia); decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); @@ -467,8 +468,6 @@ begin if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); end if; - v.e.cr := c_in.read_cr_data; - v.e.bypass_cr := cr_bypass; v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.addm1 := '0'; @@ -516,6 +515,11 @@ begin v.e.read_data3 := decoded_reg_c.data; end case; + v.e.cr := c_in.read_cr_data; + if cr_bypass = '1' then + v.e.cr := execute_cr_bypass.data; + end if; + -- issue control control_valid_in <= d_in.valid; control_sgl_pipe <= d_in.decode.sgl_pipe; @@ -533,10 +537,9 @@ begin gpr_c_read <= decoded_reg_c.reg; cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); - cr_bypass_avail <= '0'; - if EX1_BYPASS and d_in.decode.unit = ALU then - cr_bypass_avail <= d_in.decode.output_cr; - end if; + -- Since ops that write CR only write some of the fields, + -- any op that writes CR effectively also reads it. 
+ cr_read_valid <= cr_write_valid or d_in.decode.input_cr; v.e.valid := control_valid_out; if control_valid_out = '1' then diff --git a/execute1.vhdl b/execute1.vhdl index c0cc32f..c859689 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -38,6 +38,7 @@ entity execute1 is e_out : out Execute1ToWritebackType; bypass_data : out bypass_data_t; + bypass_cr_data : out cr_bypass_data_t; dbg_msr_out : out std_ulogic_vector(63 downto 0); @@ -412,15 +413,7 @@ begin v.e.xerc := e_in.xerc; end if; - -- CR forwarding cr_in <= e_in.cr; - if EX1_BYPASS and e_in.bypass_cr = '1' and r.e.write_cr_enable = '1' then - for i in 0 to 7 loop - if r.e.write_cr_mask(i) = '1' then - cr_in(i * 4 + 3 downto i * 4) <= r.e.write_cr_data(i * 4 + 3 downto i * 4); - end if; - end loop; - end if; v.mul_in_progress := '0'; v.div_in_progress := '0'; @@ -809,7 +802,6 @@ begin end if; bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; v.e.write_cr_mask := num_to_fxm(crnum); for i in 0 to 7 loop lo := i*4; @@ -831,7 +823,6 @@ begin newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; v.e.write_cr_mask := num_to_fxm(crnum); v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; @@ -839,7 +830,6 @@ begin newcrf := ppc_cmpeqb(a_in, b_in); bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; v.e.write_cr_mask := num_to_fxm(crnum); v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; @@ -913,7 +903,6 @@ begin if cr_op(0) = '0' then -- MCRF bf := insn_bf(e_in.insn); bfa := insn_bfa(e_in.insn); - v.e.write_cr_enable := '1'; crnum := to_integer(unsigned(bf)); scrnum := to_integer(unsigned(bfa)); v.e.write_cr_mask := num_to_fxm(crnum); @@ -930,7 +919,6 @@ begin v.e.write_cr_data(hi downto lo) := newcrf; end loop; else - v.e.write_cr_enable := '1'; bt := 
insn_bt(e_in.insn); ba := insn_ba(e_in.insn); bb := insn_bb(e_in.insn); @@ -954,7 +942,6 @@ begin newcrf := v.e.xerc.ov & v.e.xerc.ca & v.e.xerc.ov32 & v.e.xerc.ca32; bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); - v.e.write_cr_enable := '1'; v.e.write_cr_mask := num_to_fxm(crnum); v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; @@ -1007,7 +994,6 @@ begin when OP_MFCR => when OP_MTCRF => - v.e.write_cr_enable := '1'; if e_in.insn(20) = '0' then -- mtcrf v.e.write_cr_mask := insn_fxm(e_in.insn); @@ -1269,12 +1255,23 @@ begin end if; v.e.write_reg := current.write_reg; v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; + v.e.write_cr_enable := current.output_cr and v.e.valid and not exception; v.e.rc := current.rc and v.e.valid and not exception; bypass_data.tag.valid <= current.instr_tag.valid and current.write_reg_enable and v.e.valid; bypass_data.tag.tag <= current.instr_tag.tag; bypass_data.data <= v.e.write_data; + bypass_cr_data.tag.valid <= current.instr_tag.valid and current.output_cr and v.e.valid; + bypass_cr_data.tag.tag <= current.instr_tag.tag; + for i in 0 to 7 loop + if v.e.write_cr_mask(i) = '1' then + bypass_cr_data.data(i*4 + 3 downto i*4) <= v.e.write_cr_data(i*4 + 3 downto i*4); + else + bypass_cr_data.data(i*4 + 3 downto i*4) <= cr_in(i*4 + 3 downto i*4); + end if; + end loop; + -- Defer completion for one cycle when redirecting. 
-- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1 if v.redirect = '1' then diff --git a/microwatt.core b/microwatt.core index 0f77fba..79af3c1 100644 --- a/microwatt.core +++ b/microwatt.core @@ -19,7 +19,6 @@ filesets: - sim_console.vhdl - logical.vhdl - countzero.vhdl - - cr_hazard.vhdl - control.vhdl - execute1.vhdl - fpu.vhdl From d6ac43251ac3edf9001678e18eca0f9431ec8166 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Nov 2020 22:10:30 +1100 Subject: [PATCH 06/13] execute1: Move data-path logic out to a separate process Signed-off-by: Paul Mackerras --- execute1.vhdl | 221 ++++++++++++++++++++++++++------------------------ 1 file changed, 117 insertions(+), 104 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index c859689..cfbb278 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -93,6 +93,7 @@ architecture behaviour of execute1 is signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); signal cr_in : std_ulogic_vector(31 downto 0); + signal xerc_in : xer_common_t; signal valid_in : std_ulogic; signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); @@ -113,6 +114,15 @@ architecture behaviour of execute1 is signal next_nia : std_ulogic_vector(63 downto 0); signal current: Decode2ToExecute1Type; + signal carry_32 : std_ulogic; + signal carry_64 : std_ulogic; + signal overflow_32 : std_ulogic; + signal overflow_64 : std_ulogic; + + signal cmprb_result : std_ulogic_vector(3 downto 0); + signal cmpeqb_result : std_ulogic_vector(3 downto 0); + signal trapval : std_ulogic_vector(4 downto 0); + -- multiply signals signal x_to_multiply: MultiplyInputType; signal multiply_to_x: MultiplyOutputType; @@ -288,6 +298,14 @@ begin a_in <= e_in.read_data1; b_in <= e_in.read_data2; c_in <= e_in.read_data3; + cr_in <= e_in.cr; + + -- XER forwarding. To avoid having to track XER hazards, we use + -- the previously latched value. 
Since the XER common bits + -- (SO, OV[32] and CA[32]) are only modified by instructions that are + -- handled here, we can just forward the result being sent to + -- writeback. + xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc; busy_out <= l_in.busy or r.busy or fp_in.busy; valid_in <= e_in.valid and not busy_out; @@ -328,101 +346,30 @@ begin end if; end process; - execute1_1: process(all) - variable v : reg_type; + -- Data path for integer instructions + execute1_dp: process(all) variable a_inv : std_ulogic_vector(63 downto 0); variable b_or_m1 : std_ulogic_vector(63 downto 0); + variable sum_with_carry : std_ulogic_vector(64 downto 0); + variable sign1, sign2 : std_ulogic; + variable abs1, abs2 : signed(63 downto 0); + variable addend : std_ulogic_vector(127 downto 0); variable addg6s : std_ulogic_vector(63 downto 0); + variable crbit : integer range 0 to 31; variable isel_result : std_ulogic_vector(63 downto 0); variable darn : std_ulogic_vector(63 downto 0); - variable mfcr_result : std_ulogic_vector(63 downto 0); variable setb_result : std_ulogic_vector(63 downto 0); - variable newcrf : std_ulogic_vector(3 downto 0); - variable sum_with_carry : std_ulogic_vector(64 downto 0); + variable mfcr_result : std_ulogic_vector(63 downto 0); variable crnum : crnum_t; - variable crbit : integer range 0 to 31; - variable scrnum : crnum_t; variable lo, hi : integer; - variable sh, mb, me : std_ulogic_vector(5 downto 0); - variable sh32, mb32, me32 : std_ulogic_vector(4 downto 0); - variable bo, bi : std_ulogic_vector(4 downto 0); - variable bf, bfa : std_ulogic_vector(2 downto 0); - variable cr_op : std_ulogic_vector(9 downto 0); - variable cr_operands : std_ulogic_vector(1 downto 0); - variable bt, ba, bb : std_ulogic_vector(4 downto 0); - variable btnum, banum, bbnum : integer range 0 to 31; - variable crresult : std_ulogic; variable l : std_ulogic; - variable carry_32, carry_64 : std_ulogic; - variable sign1, sign2 : std_ulogic; - 
variable abs1, abs2 : signed(63 downto 0); - variable overflow : std_ulogic; variable zerohi, zerolo : std_ulogic; variable msb_a, msb_b : std_ulogic; variable a_lt : std_ulogic; variable a_lt_lo : std_ulogic; variable a_lt_hi : std_ulogic; - variable lv : Execute1ToLoadstore1Type; - variable irq_valid : std_ulogic; - variable exception : std_ulogic; - variable exception_nextpc : std_ulogic; - variable trapval : std_ulogic_vector(4 downto 0); - variable illegal : std_ulogic; - variable is_branch : std_ulogic; - variable is_direct_branch : std_ulogic; - variable taken_branch : std_ulogic; - variable abs_branch : std_ulogic; - variable spr_val : std_ulogic_vector(63 downto 0); - variable addend : std_ulogic_vector(127 downto 0); - variable do_trace : std_ulogic; - variable hold_wr_data : std_ulogic; - variable f : Execute1ToFetch1Type; - variable fv : Execute1ToFPUType; + variable bfa : std_ulogic_vector(2 downto 0); begin - sum_with_carry := (others => '0'); - newcrf := (others => '0'); - is_branch := '0'; - is_direct_branch := '0'; - taken_branch := '0'; - abs_branch := '0'; - hold_wr_data := '0'; - - v := r; - v.e := Execute1ToWritebackInit; - v.redirect := '0'; - v.abs_br := '0'; - v.do_intr := '0'; - v.vector := 0; - v.br_offset := (others => '0'); - v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & - not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); - v.taken_br := '0'; - v.br_last := '0'; - - lv := Execute1ToLoadstore1Init; - fv := Execute1ToFPUInit; - - -- XER forwarding. To avoid having to track XER hazards, we use - -- the previously latched value. Since the XER common bits - -- (SO, OV[32] and CA[32]) are only modified by instructions that are - -- handled here, we can just forward the result being sent to - -- writeback. 
- if r.e.write_xerc_enable = '1' or r.busy = '1' then - v.e.xerc := r.e.xerc; - else - v.e.xerc := e_in.xerc; - end if; - - cr_in <= e_in.cr; - - v.mul_in_progress := '0'; - v.div_in_progress := '0'; - v.cntz_in_progress := '0'; - v.mul_finish := '0'; - - spr_result <= (others => '0'); - spr_val := (others => '0'); - -- Main adder if e_in.invert_a = '0' then a_inv := a_in; @@ -435,10 +382,12 @@ begin b_or_m1 := (others => '1'); end if; sum_with_carry := ppc_adde(a_inv, b_or_m1, - decode_input_carry(e_in.input_carry, v.e.xerc)); + decode_input_carry(e_in.input_carry, xerc_in)); adder_result <= sum_with_carry(63 downto 0); - carry_32 := sum_with_carry(32) xor a_inv(32) xor b_in(32); - carry_64 := sum_with_carry(64); + carry_32 <= sum_with_carry(32) xor a_inv(32) xor b_in(32); + carry_64 <= sum_with_carry(64); + overflow_32 <= calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31)); + overflow_64 <= calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)); -- signals to multiply and divide units sign1 := '0'; @@ -465,12 +414,10 @@ begin end if; -- Interface to multiply and divide units - x_to_multiply <= MultiplyInputInit; - x_to_multiply.is_32bit <= e_in.is_32bit; - - x_to_divider <= Execute1ToDividerInit; x_to_divider.is_signed <= e_in.is_signed; x_to_divider.is_32bit <= e_in.is_32bit; + x_to_divider.is_extended <= '0'; + x_to_divider.is_modulus <= '0'; if e_in.insn_type = OP_MOD then x_to_divider.is_modulus <= '1'; end if; @@ -487,6 +434,7 @@ begin addend := not addend; end if; + x_to_multiply.is_32bit <= e_in.is_32bit; x_to_multiply.not_result <= sign1 xor sign2; x_to_multiply.addend <= addend; x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus); @@ -611,7 +559,7 @@ begin zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32))); if zerolo = '1' and (l = '0' or zerohi = '1') then -- values are equal - trapval := "00100"; + trapval <= "00100"; else a_lt_lo := '0'; a_lt_hi := '0'; @@ -635,14 +583,81 @@ begin if msb_a /= msb_b then 
-- Comparison is clear from MSB difference. -- for signed, 0 is greater; for unsigned, 1 is greater - trapval := msb_a & msb_b & '0' & msb_b & msb_a; + trapval <= msb_a & msb_b & '0' & msb_b & msb_a; else -- MSBs are equal, so signed and unsigned comparisons give the -- same answer. - trapval := a_lt & not a_lt & '0' & a_lt & not a_lt; + trapval <= a_lt & not a_lt & '0' & a_lt & not a_lt; end if; end if; + cmprb_result <= ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); + cmpeqb_result <= ppc_cmpeqb(a_in, b_in); + end process; + + execute1_1: process(all) + variable v : reg_type; + variable newcrf : std_ulogic_vector(3 downto 0); + variable crnum : crnum_t; + variable scrnum : crnum_t; + variable lo, hi : integer; + variable sh, mb, me : std_ulogic_vector(5 downto 0); + variable bo, bi : std_ulogic_vector(4 downto 0); + variable bf, bfa : std_ulogic_vector(2 downto 0); + variable cr_op : std_ulogic_vector(9 downto 0); + variable cr_operands : std_ulogic_vector(1 downto 0); + variable bt, ba, bb : std_ulogic_vector(4 downto 0); + variable btnum, banum, bbnum : integer range 0 to 31; + variable crresult : std_ulogic; + variable overflow : std_ulogic; + variable lv : Execute1ToLoadstore1Type; + variable irq_valid : std_ulogic; + variable exception : std_ulogic; + variable exception_nextpc : std_ulogic; + variable illegal : std_ulogic; + variable is_branch : std_ulogic; + variable is_direct_branch : std_ulogic; + variable taken_branch : std_ulogic; + variable abs_branch : std_ulogic; + variable spr_val : std_ulogic_vector(63 downto 0); + variable do_trace : std_ulogic; + variable hold_wr_data : std_ulogic; + variable f : Execute1ToFetch1Type; + variable fv : Execute1ToFPUType; + begin + newcrf := (others => '0'); + is_branch := '0'; + is_direct_branch := '0'; + taken_branch := '0'; + abs_branch := '0'; + hold_wr_data := '0'; + + v := r; + v.e := Execute1ToWritebackInit; + v.redirect := '0'; + v.abs_br := '0'; + v.do_intr := '0'; + v.vector := 0; + v.br_offset := (others => 
'0'); + v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & + not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); + v.taken_br := '0'; + v.br_last := '0'; + v.e.xerc := xerc_in; + + lv := Execute1ToLoadstore1Init; + fv := Execute1ToFPUInit; + + x_to_multiply.valid <= '0'; + x_to_divider.valid <= '0'; + v.mul_in_progress := '0'; + v.div_in_progress := '0'; + v.cntz_in_progress := '0'; + v.mul_finish := '0'; + + spr_result <= (others => '0'); + spr_val := (others => '0'); + ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); @@ -789,16 +804,14 @@ begin end if; end if; if e_in.oe = '1' then - set_ov(v.e, - calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)), - calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31))); + set_ov(v.e, overflow_64, overflow_32); end if; when OP_CMP => -- CMP and CMPL instructions if e_in.is_signed = '1' then - newcrf := trapval(4 downto 2) & v.e.xerc.so; + newcrf := trapval(4 downto 2) & xerc_in.so; else - newcrf := trapval(1 downto 0) & trapval(2) & v.e.xerc.so; + newcrf := trapval(1 downto 0) & trapval(2) & xerc_in.so; end if; bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); @@ -820,14 +833,14 @@ begin end if; when OP_ADDG6S => when OP_CMPRB => - newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); + newcrf := cmprb_result; bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); v.e.write_cr_mask := num_to_fxm(crnum); v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf & newcrf; when OP_CMPEQB => - newcrf := ppc_cmpeqb(a_in, b_in); + newcrf := cmpeqb_result; bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); v.e.write_cr_mask := num_to_fxm(crnum); @@ -939,7 +952,7 @@ begin end loop; end if; when OP_MCRXRX => - newcrf := v.e.xerc.ov & v.e.xerc.ca & v.e.xerc.ov32 & v.e.xerc.ca32; + newcrf := xerc_in.ov & xerc_in.ca & xerc_in.ov32 & xerc_in.ca32; bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); 
v.e.write_cr_mask := num_to_fxm(crnum); @@ -955,12 +968,12 @@ begin if decode_spr_num(e_in.insn) = SPR_XER then -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer spr_val(63 downto 32) := (others => '0'); - spr_val(63-32) := v.e.xerc.so; - spr_val(63-33) := v.e.xerc.ov; - spr_val(63-34) := v.e.xerc.ca; + spr_val(63-32) := xerc_in.so; + spr_val(63-33) := xerc_in.ov; + spr_val(63-34) := xerc_in.ca; spr_val(63-35 downto 63-43) := "000000000"; - spr_val(63-44) := v.e.xerc.ov32; - spr_val(63-45) := v.e.xerc.ca32; + spr_val(63-44) := xerc_in.ov32; + spr_val(63-45) := xerc_in.ca32; end if; else spr_val := c_in; @@ -1319,7 +1332,7 @@ begin lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE); lv.sign_extend := e_in.sign_extend; lv.update := e_in.update; - lv.xerc := v.e.xerc; + lv.xerc := xerc_in; lv.reserve := e_in.reserve; lv.rc := e_in.rc; lv.insn := e_in.insn; From 4fd8d9509c3fed511f3b17c62a4038fc0c525e67 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 27 Nov 2020 17:41:39 +1100 Subject: [PATCH 07/13] execute1: Move CR result to data path process Also work out in decode2 whether the instruction sets the XER common bits. 
Signed-off-by: Paul Mackerras --- common.vhdl | 4 +- decode2.vhdl | 41 +++++------ execute1.vhdl | 186 ++++++++++++++++++++++---------------------------- 3 files changed, 107 insertions(+), 124 deletions(-) diff --git a/common.vhdl b/common.vhdl index 0151595..a3a95f4 100644 --- a/common.vhdl +++ b/common.vhdl @@ -243,6 +243,7 @@ package common is output_carry: std_ulogic; input_cr: std_ulogic; output_cr: std_ulogic; + output_xer: std_ulogic; is_32bit: std_ulogic; is_signed: std_ulogic; insn: std_ulogic_vector(31 downto 0); @@ -261,7 +262,8 @@ package common is (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, write_reg_enable => '0', lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', - invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', + invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', + output_cr => '0', output_xer => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), diff --git a/decode2.vhdl b/decode2.vhdl index 748edb9..732cfe0 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -208,22 +208,6 @@ architecture behaviour of decode2 is end case; end; - -- For now, use "rc" in the decode table to decide whether oe exists. - -- This is not entirely correct architecturally: For mulhd and - -- mulhdu, the OE field is reserved. It remains to be seen what an - -- actual POWER9 does if we set it on those instructions, for now we - -- test that further down when assigning to the multiplier oe input. 
- -- - function decode_oe (t : rc_t; insn_in : std_ulogic_vector(31 downto 0)) return std_ulogic is - begin - case t is - when RC => - return insn_oe(insn_in); - when OTHERS => - return '0'; - end case; - end; - -- control signals that are derived from insn_type type mux_select_array_t is array(insn_type_t) of std_ulogic_vector(2 downto 0); @@ -277,6 +261,12 @@ architecture behaviour of decode2 is OP_MFMSR => "100", OP_MFCR => "101", OP_SETB => "110", + OP_CMP => "000", -- cr_result + OP_CMPRB => "001", + OP_CMPEQB => "010", + OP_CROP => "011", + OP_MCRXRX => "100", + OP_MTCRF => "101", others => "000" ); @@ -393,6 +383,22 @@ begin --v.e.input_cr := d_in.decode.input_cr; v.e.output_cr := d_in.decode.output_cr; + -- Work out whether XER common bits are set + v.e.output_xer := d_in.decode.output_carry; + case d_in.decode.insn_type is + when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE => + -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only + if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then + v.e.oe := '1'; + v.e.output_xer := '1'; + end if; + when OP_MTSPR => + if decode_spr_num(d_in.insn) = SPR_XER then + v.e.output_xer := '1'; + end if; + when others => + end case; + decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1, d_in.nia); decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); @@ -465,9 +471,6 @@ begin v.e.write_reg := decoded_reg_o.reg; v.e.write_reg_enable := decoded_reg_o.reg_valid; v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); - if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then - v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); - end if; v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.addm1 := '0'; diff --git a/execute1.vhdl b/execute1.vhdl index cfbb278..0b9ba0e 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -119,10 +119,11 @@ architecture behaviour of 
execute1 is signal overflow_32 : std_ulogic; signal overflow_64 : std_ulogic; - signal cmprb_result : std_ulogic_vector(3 downto 0); - signal cmpeqb_result : std_ulogic_vector(3 downto 0); signal trapval : std_ulogic_vector(4 downto 0); + signal write_cr_mask : std_ulogic_vector(7 downto 0); + signal write_cr_data : std_ulogic_vector(31 downto 0); + -- multiply signals signal x_to_multiply: MultiplyInputType; signal multiply_to_x: MultiplyOutputType; @@ -169,7 +170,6 @@ architecture behaviour of execute1 is begin e.xerc.ca32 := carry32; e.xerc.ca := carry; - e.write_xerc_enable := '1'; end; procedure set_ov(e: inout Execute1ToWritebackType; @@ -181,7 +181,6 @@ architecture behaviour of execute1 is if ov = '1' then e.xerc.so := '1'; end if; - e.write_xerc_enable := '1'; end; function calc_ov(msb_a : std_ulogic; msb_b: std_ulogic; @@ -360,7 +359,6 @@ begin variable darn : std_ulogic_vector(63 downto 0); variable setb_result : std_ulogic_vector(63 downto 0); variable mfcr_result : std_ulogic_vector(63 downto 0); - variable crnum : crnum_t; variable lo, hi : integer; variable l : std_ulogic; variable zerohi, zerolo : std_ulogic; @@ -368,7 +366,16 @@ begin variable a_lt : std_ulogic; variable a_lt_lo : std_ulogic; variable a_lt_hi : std_ulogic; - variable bfa : std_ulogic_vector(2 downto 0); + variable newcrf : std_ulogic_vector(3 downto 0); + variable bf, bfa : std_ulogic_vector(2 downto 0); + variable crnum : crnum_t; + variable scrnum : crnum_t; + variable cr_operands : std_ulogic_vector(1 downto 0); + variable crresult : std_ulogic; + variable bt, ba, bb : std_ulogic_vector(4 downto 0); + variable btnum : integer range 0 to 3; + variable banum, bbnum : integer range 0 to 31; + variable j : integer; begin -- Main adder if e_in.invert_a = '0' then @@ -591,24 +598,77 @@ begin end if; end if; - cmprb_result <= ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); - cmpeqb_result <= ppc_cmpeqb(a_in, b_in); + -- CR result mux + bf := insn_bf(e_in.insn); + crnum := 
to_integer(unsigned(bf)); + newcrf := (others => '0'); + case current.sub_select is + when "000" => + -- CMP and CMPL instructions + if e_in.is_signed = '1' then + newcrf := trapval(4 downto 2) & xerc_in.so; + else + newcrf := trapval(1 downto 0) & trapval(2) & xerc_in.so; + end if; + when "001" => + newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn)); + when "010" => + newcrf := ppc_cmpeqb(a_in, b_in); + when "011" => + if current.insn(1) = '1' then + -- CR logical instructions + j := (7 - crnum) * 4; + newcrf := cr_in(j + 3 downto j); + bt := insn_bt(e_in.insn); + ba := insn_ba(e_in.insn); + bb := insn_bb(e_in.insn); + btnum := 3 - to_integer(unsigned(bt(1 downto 0))); + banum := 31 - to_integer(unsigned(ba)); + bbnum := 31 - to_integer(unsigned(bb)); + -- Bits 6-9 of the instruction word give the truth table + -- of the requested logical operation + cr_operands := cr_in(banum) & cr_in(bbnum); + crresult := e_in.insn(6 + to_integer(unsigned(cr_operands))); + for i in 0 to 3 loop + if i = btnum then + newcrf(i) := crresult; + end if; + end loop; + else + -- MCRF + bfa := insn_bfa(e_in.insn); + scrnum := to_integer(unsigned(bfa)); + j := (7 - scrnum) * 4; + newcrf := cr_in(j + 3 downto j); + end if; + when "100" => + -- MCRXRX + newcrf := xerc_in.ov & xerc_in.ca & xerc_in.ov32 & xerc_in.ca32; + when others => + end case; + if current.insn_type = OP_MTCRF then + if e_in.insn(20) = '0' then + -- mtcrf + write_cr_mask <= insn_fxm(e_in.insn); + else + -- mtocrf: We require one hot priority encoding here + crnum := fxm_to_num(insn_fxm(e_in.insn)); + write_cr_mask <= num_to_fxm(crnum); + end if; + write_cr_data <= c_in(31 downto 0); + else + write_cr_mask <= num_to_fxm(crnum); + write_cr_data <= newcrf & newcrf & newcrf & newcrf & + newcrf & newcrf & newcrf & newcrf; + end if; + end process; execute1_1: process(all) variable v : reg_type; - variable newcrf : std_ulogic_vector(3 downto 0); - variable crnum : crnum_t; - variable scrnum : crnum_t; variable lo, hi : integer; 
variable sh, mb, me : std_ulogic_vector(5 downto 0); variable bo, bi : std_ulogic_vector(4 downto 0); - variable bf, bfa : std_ulogic_vector(2 downto 0); - variable cr_op : std_ulogic_vector(9 downto 0); - variable cr_operands : std_ulogic_vector(1 downto 0); - variable bt, ba, bb : std_ulogic_vector(4 downto 0); - variable btnum, banum, bbnum : integer range 0 to 31; - variable crresult : std_ulogic; variable overflow : std_ulogic; variable lv : Execute1ToLoadstore1Type; variable irq_valid : std_ulogic; @@ -625,7 +685,6 @@ begin variable f : Execute1ToFetch1Type; variable fv : Execute1ToFPUType; begin - newcrf := (others => '0'); is_branch := '0'; is_direct_branch := '0'; taken_branch := '0'; @@ -800,27 +859,12 @@ begin else v.e.xerc.ov := carry_64; v.e.xerc.ov32 := carry_32; - v.e.write_xerc_enable := '1'; end if; end if; if e_in.oe = '1' then set_ov(v.e, overflow_64, overflow_32); end if; when OP_CMP => - -- CMP and CMPL instructions - if e_in.is_signed = '1' then - newcrf := trapval(4 downto 2) & xerc_in.so; - else - newcrf := trapval(1 downto 0) & trapval(2) & xerc_in.so; - end if; - bf := insn_bf(e_in.insn); - crnum := to_integer(unsigned(bf)); - v.e.write_cr_mask := num_to_fxm(crnum); - for i in 0 to 7 loop - lo := i*4; - hi := lo + 3; - v.e.write_cr_data(hi downto lo) := newcrf; - end loop; when OP_TRAP => -- trap instructions (tw, twi, td, tdi) v.vector := 16#700#; @@ -833,19 +877,7 @@ begin end if; when OP_ADDG6S => when OP_CMPRB => - newcrf := cmprb_result; - bf := insn_bf(e_in.insn); - crnum := to_integer(unsigned(bf)); - v.e.write_cr_mask := num_to_fxm(crnum); - v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & - newcrf & newcrf & newcrf & newcrf; when OP_CMPEQB => - newcrf := cmpeqb_result; - bf := insn_bf(e_in.insn); - crnum := to_integer(unsigned(bf)); - v.e.write_cr_mask := num_to_fxm(crnum); - v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & - newcrf & newcrf & newcrf & newcrf; when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | 
OP_CMPB | OP_EXTS | OP_BPERM | OP_BCD => @@ -911,53 +943,8 @@ begin v.cntz_in_progress := '1'; v.busy := '1'; when OP_ISEL => - when OP_CROP => - cr_op := insn_cr(e_in.insn); - if cr_op(0) = '0' then -- MCRF - bf := insn_bf(e_in.insn); - bfa := insn_bfa(e_in.insn); - crnum := to_integer(unsigned(bf)); - scrnum := to_integer(unsigned(bfa)); - v.e.write_cr_mask := num_to_fxm(crnum); - for i in 0 to 7 loop - lo := (7-i)*4; - hi := lo + 3; - if i = scrnum then - newcrf := cr_in(hi downto lo); - end if; - end loop; - for i in 0 to 7 loop - lo := i*4; - hi := lo + 3; - v.e.write_cr_data(hi downto lo) := newcrf; - end loop; - else - bt := insn_bt(e_in.insn); - ba := insn_ba(e_in.insn); - bb := insn_bb(e_in.insn); - btnum := 31 - to_integer(unsigned(bt)); - banum := 31 - to_integer(unsigned(ba)); - bbnum := 31 - to_integer(unsigned(bb)); - -- Bits 5-8 of cr_op give the truth table of the requested - -- logical operation - cr_operands := cr_in(banum) & cr_in(bbnum); - crresult := cr_op(5 + to_integer(unsigned(cr_operands))); - v.e.write_cr_mask := num_to_fxm((31-btnum) / 4); - for i in 0 to 31 loop - if i = btnum then - v.e.write_cr_data(i) := crresult; - else - v.e.write_cr_data(i) := cr_in(i); - end if; - end loop; - end if; + when OP_CROP => when OP_MCRXRX => - newcrf := xerc_in.ov & xerc_in.ca & xerc_in.ov32 & xerc_in.ca32; - bf := insn_bf(e_in.insn); - crnum := to_integer(unsigned(bf)); - v.e.write_cr_mask := num_to_fxm(crnum); - v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf & - newcrf & newcrf & newcrf & newcrf; when OP_DARN => when OP_MFMSR => when OP_MFSPR => @@ -1007,15 +994,6 @@ begin when OP_MFCR => when OP_MTCRF => - if e_in.insn(20) = '0' then - -- mtcrf - v.e.write_cr_mask := insn_fxm(e_in.insn); - else - -- mtocrf: We require one hot priority encoding here - crnum := fxm_to_num(insn_fxm(e_in.insn)); - v.e.write_cr_mask := num_to_fxm(crnum); - end if; - v.e.write_cr_data := c_in(31 downto 0); when OP_MTMSRD => if e_in.insn(16) = '1' then -- just 
update EE and RI @@ -1050,7 +1028,6 @@ begin v.e.xerc.ca := c_in(63-34); v.e.xerc.ov32 := c_in(63-44); v.e.xerc.ca32 := c_in(63-45); - v.e.write_xerc_enable := '1'; end if; else -- slow spr @@ -1170,7 +1147,6 @@ begin v.mul_finish := '1'; v.busy := '1'; else - v.e.write_xerc_enable := current.oe; -- We must test oe because the RC update code in writeback -- will use the xerc value to set CR0:SO so we must not clobber -- xerc if OE wasn't set. @@ -1190,7 +1166,6 @@ begin end if; elsif r.mul_finish = '1' then hold_wr_data := '1'; - v.e.write_xerc_enable := current.oe; v.e.xerc.ov := multiply_to_x.overflow; v.e.xerc.ov32 := multiply_to_x.overflow; if multiply_to_x.overflow = '1' then @@ -1268,8 +1243,11 @@ begin end if; v.e.write_reg := current.write_reg; v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; - v.e.write_cr_enable := current.output_cr and v.e.valid and not exception; v.e.rc := current.rc and v.e.valid and not exception; + v.e.write_cr_data := write_cr_data; + v.e.write_cr_mask := write_cr_mask; + v.e.write_cr_enable := current.output_cr and v.e.valid and not exception; + v.e.write_xerc_enable := current.output_xer and v.e.valid and not exception; bypass_data.tag.valid <= current.instr_tag.valid and current.write_reg_enable and v.e.valid; bypass_data.tag.tag <= current.instr_tag.tag; From 3cd3449b4b88e025ff9412f82737747b0c6d938a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 23 Dec 2020 11:13:21 +1100 Subject: [PATCH 08/13] core: Move redirect and interrupt delivery logic to writeback This moves the logic for redirecting fetching and writing SRR0 and SRR1 to writeback. The aim is that ultimately units other than execute1 can send their interrupts to writeback along with their instruction completions, so that there can be multiple instructions in flight without needing execute1 to keep track of the address of each outstanding instruction. 
Signed-off-by: Paul Mackerras --- common.vhdl | 55 ++++++------ control.vhdl | 3 +- core.vhdl | 15 +++- execute1.vhdl | 225 ++++++++++++++++--------------------------------- fetch1.vhdl | 34 ++++---- writeback.vhdl | 72 +++++++++++++++- 6 files changed, 198 insertions(+), 206 deletions(-) diff --git a/common.vhdl b/common.vhdl index a3a95f4..b2d6b13 100644 --- a/common.vhdl +++ b/common.vhdl @@ -139,8 +139,6 @@ package common is constant instr_tag_init : instr_tag_t := (tag => 0, valid => '0'); function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean; - type irq_state_t is (WRITE_SRR0, WRITE_SRR1); - -- For now, fixed 16 sources, make this either a parametric -- package of some sort or an unconstrainted array. type ics_to_icp_t is record @@ -157,8 +155,6 @@ package common is dec: std_ulogic_vector(63 downto 0); msr: std_ulogic_vector(63 downto 0); cfar: std_ulogic_vector(63 downto 0); - irq_state : irq_state_t; - srr1: std_ulogic_vector(63 downto 0); end record; type Fetch1ToIcacheType is record @@ -329,22 +325,6 @@ package common is read_xerc_data : xer_common_t; end record; - type Execute1ToFetch1Type is record - redirect: std_ulogic; - virt_mode: std_ulogic; - priv_mode: std_ulogic; - big_endian: std_ulogic; - mode_32bit: std_ulogic; - redirect_nia: std_ulogic_vector(63 downto 0); - br_nia : std_ulogic_vector(63 downto 0); - br_last : std_ulogic; - br_taken : std_ulogic; - end record; - constant Execute1ToFetch1Init : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0', - priv_mode => '0', big_endian => '0', - mode_32bit => '0', br_taken => '0', - br_last => '0', others => (others => '0')); - type Execute1ToLoadstore1Type is record valid : std_ulogic; op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do @@ -492,17 +472,26 @@ package common is write_cr_data : std_ulogic_vector(31 downto 0); write_xerc_enable : std_ulogic; xerc : xer_common_t; - exc_write_enable : std_ulogic; - exc_write_reg : gspr_index_t; - exc_write_data : 
std_ulogic_vector(63 downto 0); + interrupt : std_ulogic; + intr_vec : integer range 0 to 16#fff#; + redirect: std_ulogic; + redir_mode: std_ulogic_vector(3 downto 0); + last_nia: std_ulogic_vector(63 downto 0); + br_offset: std_ulogic_vector(63 downto 0); + br_last: std_ulogic; + br_taken: std_ulogic; + abs_br: std_ulogic; + srr1: std_ulogic_vector(63 downto 0); end record; constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', instr_tag => instr_tag_init, rc => '0', mode_32bit => '0', - write_enable => '0', write_cr_enable => '0', exc_write_enable => '0', + write_enable => '0', write_cr_enable => '0', write_xerc_enable => '0', xerc => xerc_init, write_data => (others => '0'), write_cr_mask => (others => '0'), write_cr_data => (others => '0'), write_reg => (others => '0'), - exc_write_reg => (others => '0'), exc_write_data => (others => '0')); + interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000", + last_nia => (others => '0'), br_offset => (others => '0'), + br_last => '0', br_taken => '0', abs_br => '0', srr1 => (others => '0')); type Execute1ToFPUType is record valid : std_ulogic; @@ -556,6 +545,22 @@ package common is constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', overflow => '0', others => (others => '0')); + type WritebackToFetch1Type is record + redirect: std_ulogic; + virt_mode: std_ulogic; + priv_mode: std_ulogic; + big_endian: std_ulogic; + mode_32bit: std_ulogic; + redirect_nia: std_ulogic_vector(63 downto 0); + br_nia : std_ulogic_vector(63 downto 0); + br_last : std_ulogic; + br_taken : std_ulogic; + end record; + constant WritebackToFetch1Init : WritebackToFetch1Type := + (redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0', + mode_32bit => '0', redirect_nia => (others => '0'), + br_last => '0', br_taken => '0', br_nia => (others => '0')); + type WritebackToRegisterFileType is record write_reg : gspr_index_t; write_data : std_ulogic_vector(63 downto 0); diff --git 
a/control.vhdl b/control.vhdl index 5c83f78..f14e350 100644 --- a/control.vhdl +++ b/control.vhdl @@ -235,8 +235,7 @@ begin stall_tmp := '0'; if flush_in = '1' then - -- expect to see complete_in next cycle - v_int.outstanding := 1; + v_int.outstanding := 0; elsif complete_in.valid = '1' then v_int.outstanding := r_int.outstanding - 1; end if; diff --git a/core.vhdl b/core.vhdl index 7dafd1c..e2a93b9 100644 --- a/core.vhdl +++ b/core.vhdl @@ -46,6 +46,7 @@ end core; architecture behave of core is -- icache signals signal fetch1_to_icache : Fetch1ToIcacheType; + signal writeback_to_fetch1: WritebackToFetch1Type; signal icache_to_decode1 : IcacheToDecode1Type; signal mmu_to_icache : MmuToIcacheType; @@ -66,7 +67,6 @@ architecture behave of core is -- execute signals signal execute1_to_writeback: Execute1ToWritebackType; - signal execute1_to_fetch1: Execute1ToFetch1Type; signal execute1_bypass: bypass_data_t; signal execute1_cr_bypass: cr_bypass_data_t; @@ -108,6 +108,7 @@ architecture behave of core is signal terminate: std_ulogic; signal core_rst: std_ulogic; signal icache_inv: std_ulogic; + signal do_interrupt: std_ulogic; -- Delayed/Latched resets and alt_reset signal rst_fetch1 : std_ulogic := '1'; @@ -119,6 +120,7 @@ architecture behave of core is signal rst_ex1 : std_ulogic := '1'; signal rst_fpu : std_ulogic := '1'; signal rst_ls1 : std_ulogic := '1'; + signal rst_wback : std_ulogic := '1'; signal rst_dbg : std_ulogic := '1'; signal alt_reset_d : std_ulogic; @@ -182,6 +184,7 @@ begin rst_ex1 <= core_rst; rst_fpu <= core_rst; rst_ls1 <= core_rst; + rst_wback <= core_rst; rst_dbg <= rst; alt_reset_d <= alt_reset; end if; @@ -202,7 +205,7 @@ begin inval_btc => ex1_icache_inval or mmu_to_icache.tlbie, stop_in => dbg_core_stop, d_in => decode1_to_fetch1, - e_in => execute1_to_fetch1, + w_in => writeback_to_fetch1, i_out => fetch1_to_icache, log_out => log_data(42 downto 0) ); @@ -324,14 +327,14 @@ begin port map ( clk => clk, rst => rst_ex1, - flush_out => flush, + 
flush_in => flush, busy_out => ex1_busy_out, e_in => decode2_to_execute1, l_in => loadstore1_to_execute1, fp_in => fpu_to_execute1, ext_irq_in => ext_irq, + interrupt_in => do_interrupt, l_out => execute1_to_loadstore1, - f_out => execute1_to_fetch1, fp_out => execute1_to_fpu, e_out => execute1_to_writeback, bypass_data => execute1_bypass, @@ -416,11 +419,15 @@ begin writeback_0: entity work.writeback port map ( clk => clk, + rst => rst_wback, + flush_out => flush, e_in => execute1_to_writeback, l_in => loadstore1_to_writeback, fp_in => fpu_to_writeback, w_out => writeback_to_register_file, c_out => writeback_to_cr_file, + f_out => writeback_to_fetch1, + interrupt_out => do_interrupt, complete_out => complete ); diff --git a/execute1.vhdl b/execute1.vhdl index 0b9ba0e..875e22c 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -22,7 +22,7 @@ entity execute1 is rst : in std_ulogic; -- asynchronous - flush_out : out std_ulogic; + flush_in : in std_ulogic; busy_out : out std_ulogic; e_in : in Decode2ToExecute1Type; @@ -30,10 +30,10 @@ entity execute1 is fp_in : in FPUToExecute1Type; ext_irq_in : std_ulogic; + interrupt_in : std_ulogic; -- asynchronous l_out : out Execute1ToLoadstore1Type; - f_out : out Execute1ToFetch1Type; fp_out : out Execute1ToFPUType; e_out : out Execute1ToWritebackType; @@ -61,21 +61,11 @@ architecture behaviour of execute1 is fp_exception_next : std_ulogic; trace_next : std_ulogic; prev_op : insn_type_t; - next_lr : std_ulogic_vector(63 downto 0); br_taken : std_ulogic; mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; cntz_in_progress : std_ulogic; - last_nia : std_ulogic_vector(63 downto 0); - redirect : std_ulogic; - abs_br : std_ulogic; - taken_br : std_ulogic; - br_last : std_ulogic; - do_intr : std_ulogic; - vector : integer range 0 to 16#fff#; - br_offset : std_ulogic_vector(63 downto 0); - redir_mode : std_ulogic_vector(3 downto 0); log_addr_spr : std_ulogic_vector(31 downto 0); end record; constant 
reg_type_init : reg_type := @@ -84,9 +74,6 @@ architecture behaviour of execute1 is busy => '0', terminate => '0', fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0', mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', - next_lr => (others => '0'), last_nia => (others => '0'), - redirect => '0', abs_br => '0', taken_br => '0', br_last => '0', do_intr => '0', vector => 0, - br_offset => (others => '0'), redir_mode => "0000", others => (others => '0')); signal r, rin : reg_type; @@ -96,8 +83,8 @@ architecture behaviour of execute1 is signal xerc_in : xer_common_t; signal valid_in : std_ulogic; - signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); - signal ctrl_tmp: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0')); + signal ctrl: ctrl_t := (others => (others => '0')); + signal ctrl_tmp: ctrl_t := (others => (others => '0')); signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; signal rot_sign_ext: std_ulogic; signal rotator_result: std_ulogic_vector(63 downto 0); @@ -307,7 +294,7 @@ begin xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc; busy_out <= l_in.busy or r.busy or fp_in.busy; - valid_in <= e_in.valid and not busy_out; + valid_in <= e_in.valid and not busy_out and not flush_in; terminate_out <= r.terminate; @@ -332,7 +319,6 @@ begin ctrl.tb <= (others => '0'); ctrl.dec <= (others => '0'); ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0'); - ctrl.irq_state <= WRITE_SRR0; else r <= rin; ctrl <= ctrl_tmp; @@ -673,7 +659,6 @@ begin variable lv : Execute1ToLoadstore1Type; variable irq_valid : std_ulogic; variable exception : std_ulogic; - variable exception_nextpc : std_ulogic; variable illegal : std_ulogic; variable is_branch : std_ulogic; variable is_direct_branch : std_ulogic; @@ -682,7 +667,6 @@ begin variable spr_val : std_ulogic_vector(63 downto 0); variable do_trace : std_ulogic; variable 
hold_wr_data : std_ulogic; - variable f : Execute1ToFetch1Type; variable fv : Execute1ToFPUType; begin is_branch := '0'; @@ -693,15 +677,8 @@ begin v := r; v.e := Execute1ToWritebackInit; - v.redirect := '0'; - v.abs_br := '0'; - v.do_intr := '0'; - v.vector := 0; - v.br_offset := (others => '0'); - v.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & - not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); - v.taken_br := '0'; - v.br_last := '0'; + v.e.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & + not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); v.e.xerc := xerc_in; lv := Execute1ToLoadstore1Init; @@ -725,11 +702,11 @@ begin irq_valid := '0'; if ctrl.msr(MSR_EE) = '1' then if ctrl.dec(63) = '1' then - v.vector := 16#900#; + v.e.intr_vec := 16#900#; report "IRQ valid: DEC"; irq_valid := '1'; elsif ext_irq_in = '1' then - v.vector := 16#500#; + v.e.intr_vec := 16#500#; report "IRQ valid: External"; irq_valid := '1'; end if; @@ -748,18 +725,13 @@ begin rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; - ctrl_tmp.srr1 <= msr_copy(ctrl.msr); - ctrl_tmp.irq_state <= WRITE_SRR0; + v.e.srr1 := msr_copy(ctrl.msr); exception := '0'; illegal := '0'; - exception_nextpc := '0'; - v.e.exc_write_enable := '0'; - v.e.exc_write_reg := fast_spr_num(SPR_SRR0); if valid_in = '1' then - v.e.exc_write_data := e_in.nia; - v.last_nia := e_in.nia; + v.e.last_nia := e_in.nia; else - v.e.exc_write_data := r.last_nia; + v.e.last_nia := r.e.last_nia; end if; v.e.mode_32bit := not ctrl.msr(MSR_SF); @@ -777,20 +749,20 @@ begin -- This is used for FP-type program interrupts that -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. 
exception := '1'; - v.vector := 16#700#; - ctrl_tmp.srr1(63 - 43) <= '1'; - ctrl_tmp.srr1(63 - 47) <= '1'; + v.e.intr_vec := 16#700#; + v.e.srr1(63 - 43) := '1'; + v.e.srr1(63 - 47) := '1'; elsif r.trace_next = '1' then -- Generate a trace interrupt rather than executing the next instruction -- or taking any asynchronous interrupt exception := '1'; - v.vector := 16#d00#; - ctrl_tmp.srr1(63 - 33) <= '1'; + v.e.intr_vec := 16#d00#; + v.e.srr1(63 - 33) := '1'; if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then - ctrl_tmp.srr1(63 - 35) <= '1'; + v.e.srr1(63 - 35) := '1'; elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then - ctrl_tmp.srr1(63 - 36) <= '1'; + v.e.srr1(63 - 36) := '1'; end if; elsif irq_valid = '1' then @@ -801,9 +773,9 @@ begin elsif ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then -- generate a program interrupt exception := '1'; - v.vector := 16#700#; + v.e.intr_vec := 16#700#; -- set bit 45 to indicate privileged instruction type interrupt - ctrl_tmp.srr1(63 - 45) <= '1'; + v.e.srr1(63 - 45) := '1'; report "privileged instruction"; elsif not HAS_FPU and e_in.fac = FPU then @@ -813,14 +785,13 @@ begin elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then -- generate a floating-point unavailable interrupt exception := '1'; - v.vector := 16#800#; + v.e.intr_vec := 16#800#; report "FP unavailable interrupt"; end if; end if; if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then v.cur_instr := e_in; - v.next_lr := next_nia; v.e.valid := '1'; case_0: case e_in.insn_type is @@ -835,8 +806,8 @@ begin -- we need two cycles to write srr0 and 1 if e_in.insn(1) = '1' then exception := '1'; - exception_nextpc := '1'; - v.vector := 16#C00#; + v.e.intr_vec := 16#C00#; + v.e.last_nia := next_nia; report "sc"; else illegal := '1'; @@ -867,9 +838,9 @@ begin when OP_CMP => when OP_TRAP 
=> -- trap instructions (tw, twi, td, tdi) - v.vector := 16#700#; + v.e.intr_vec := 16#700#; -- set bit 46 to say trap occurred - ctrl_tmp.srr1(63 - 46) <= '1'; + v.e.srr1(63 - 46) := '1'; if or (trapval and insn_to(e_in.insn)) = '1' then -- generate trap-type program interrupt exception := '1'; @@ -916,8 +887,8 @@ begin end if; when OP_RFID => - v.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not a_in(MSR_PR) & - not a_in(MSR_LE) & not a_in(MSR_SF); + v.e.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not a_in(MSR_PR) & + not a_in(MSR_LE) & not a_in(MSR_SF); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31); @@ -1051,8 +1022,8 @@ begin when OP_SETB => when OP_ISYNC => - v.redirect := '1'; - v.br_offset := std_ulogic_vector(to_unsigned(4, 64)); + v.e.redirect := '1'; + v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64)); when OP_ICBI => icache_inval <= '1'; @@ -1080,16 +1051,16 @@ begin ctrl_tmp.cfar <= e_in.nia; end if; if taken_branch = '1' then - v.br_offset := b_in; - v.abs_br := abs_branch; + v.e.br_offset := b_in; + v.e.abs_br := abs_branch; else - v.br_offset := std_ulogic_vector(to_unsigned(4, 64)); + v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64)); end if; if taken_branch /= e_in.br_pred then - v.redirect := '1'; + v.e.redirect := '1'; end if; - v.br_last := is_direct_branch; - v.taken_br := taken_branch; + v.e.br_last := is_direct_branch; + v.e.br_taken := taken_branch; end if; elsif valid_in = '1' and exception = '0' and illegal = '0' then @@ -1110,28 +1081,7 @@ begin -- The following cases all occur when r.busy = 1 and therefore -- valid_in = 0. Hence they don't happen in the same cycle as any of -- the cases above which depend on valid_in = 1. 
- - if ctrl.irq_state = WRITE_SRR1 then - v.e.exc_write_reg := fast_spr_num(SPR_SRR1); - v.e.exc_write_data := ctrl.srr1; - v.e.exc_write_enable := '1'; - ctrl_tmp.msr(MSR_SF) <= '1'; - ctrl_tmp.msr(MSR_EE) <= '0'; - ctrl_tmp.msr(MSR_PR) <= '0'; - ctrl_tmp.msr(MSR_SE) <= '0'; - ctrl_tmp.msr(MSR_BE) <= '0'; - ctrl_tmp.msr(MSR_FP) <= '0'; - ctrl_tmp.msr(MSR_FE0) <= '0'; - ctrl_tmp.msr(MSR_FE1) <= '0'; - ctrl_tmp.msr(MSR_IR) <= '0'; - ctrl_tmp.msr(MSR_DR) <= '0'; - ctrl_tmp.msr(MSR_RI) <= '0'; - ctrl_tmp.msr(MSR_LE) <= '1'; - v.trace_next := '0'; - v.fp_exception_next := '0'; - report "Writing SRR1: " & to_hstring(ctrl.srr1); - - elsif r.cntz_in_progress = '1' then + if r.cntz_in_progress = '1' then -- cnt[lt]z always takes two cycles v.e.valid := '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then @@ -1179,63 +1129,67 @@ begin -- The case where MSR[FE0,FE1] goes from zero to non-zero is -- handled above by mtmsrd and rfid setting v.fp_exception_next. if HAS_FPU and fp_in.interrupt = '1' then - v.vector := 16#700#; - ctrl_tmp.srr1(63 - 43) <= '1'; + v.e.intr_vec := 16#700#; + v.e.srr1(63 - 43) := '1'; exception := '1'; end if; if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then exception := '1'; - v.vector := 16#700#; + v.e.intr_vec := 16#700#; -- Since we aren't doing Hypervisor emulation assist (0xe40) we -- set bit 44 to indicate we have an illegal - ctrl_tmp.srr1(63 - 44) <= '1'; + v.e.srr1(63 - 44) := '1'; report "illegal"; end if; - if exception = '1' then - v.e.exc_write_enable := '1'; - if exception_nextpc = '1' then - v.e.exc_write_data := next_nia; - end if; - end if; -- generate DSI or DSegI for load/store exceptions -- or ISI or ISegI for instruction fetch exceptions if l_in.exception = '1' then if l_in.alignment = '1' then - v.vector := 16#600#; + v.e.intr_vec := 16#600#; elsif l_in.instr_fault = '0' then if l_in.segment_fault = '0' then - v.vector := 16#300#; + v.e.intr_vec := 16#300#; else - v.vector := 16#380#; + v.e.intr_vec := 
16#380#; end if; else if l_in.segment_fault = '0' then - ctrl_tmp.srr1(63 - 33) <= l_in.invalid; - ctrl_tmp.srr1(63 - 35) <= l_in.perm_error; -- noexec fault - ctrl_tmp.srr1(63 - 44) <= l_in.badtree; - ctrl_tmp.srr1(63 - 45) <= l_in.rc_error; - v.vector := 16#400#; + v.e.srr1(63 - 33) := l_in.invalid; + v.e.srr1(63 - 35) := l_in.perm_error; -- noexec fault + v.e.srr1(63 - 44) := l_in.badtree; + v.e.srr1(63 - 45) := l_in.rc_error; + v.e.intr_vec := 16#400#; else - v.vector := 16#480#; + v.e.intr_vec := 16#480#; end if; end if; - v.e.exc_write_enable := '1'; - v.e.exc_write_reg := fast_spr_num(SPR_SRR0); - report "ldst exception writing srr0=" & to_hstring(r.last_nia); end if; - if exception = '1' or l_in.exception = '1' then - ctrl_tmp.irq_state <= WRITE_SRR1; - v.redirect := '1'; - v.do_intr := '1'; - end if; + v.e.interrupt := exception or l_in.exception; if do_trace = '1' then v.trace_next := '1'; end if; + if interrupt_in = '1' then + ctrl_tmp.msr(MSR_SF) <= '1'; + ctrl_tmp.msr(MSR_EE) <= '0'; + ctrl_tmp.msr(MSR_PR) <= '0'; + ctrl_tmp.msr(MSR_SE) <= '0'; + ctrl_tmp.msr(MSR_BE) <= '0'; + ctrl_tmp.msr(MSR_FP) <= '0'; + ctrl_tmp.msr(MSR_FE0) <= '0'; + ctrl_tmp.msr(MSR_FE1) <= '0'; + ctrl_tmp.msr(MSR_IR) <= '0'; + ctrl_tmp.msr(MSR_DR) <= '0'; + ctrl_tmp.msr(MSR_RI) <= '0'; + ctrl_tmp.msr(MSR_LE) <= '1'; + v.trace_next := '0'; + v.fp_exception_next := '0'; + end if; + if hold_wr_data = '0' then v.e.write_data := alu_result; else @@ -1263,41 +1217,6 @@ begin end if; end loop; - -- Defer completion for one cycle when redirecting. 
- -- This also ensures r.busy = 1 when ctrl.irq_state = WRITE_SRR1 - if v.redirect = '1' then - v.busy := '1'; - v.e.valid := '0'; - end if; - if r.redirect = '1' then - v.e.valid := '1'; - end if; - - -- Outputs to fetch1 - f.redirect := r.redirect; - f.br_nia := r.last_nia; - f.br_last := r.br_last and not r.do_intr; - f.br_taken := r.taken_br; - if r.do_intr = '1' then - f.redirect_nia := std_ulogic_vector(to_unsigned(r.vector, 64)); - f.virt_mode := '0'; - f.priv_mode := '1'; - -- XXX need an interrupt LE bit here, e.g. from LPCR - f.big_endian := '0'; - f.mode_32bit := '0'; - else - if r.abs_br = '1' then - f.redirect_nia := r.br_offset; - else - f.redirect_nia := std_ulogic_vector(unsigned(r.last_nia) + unsigned(r.br_offset)); - end if; - -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 - f.virt_mode := r.redir_mode(3); - f.priv_mode := r.redir_mode(2); - f.big_endian := r.redir_mode(1); - f.mode_32bit := r.redir_mode(0); - end if; - -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; lv.nia := e_in.nia; @@ -1344,11 +1263,9 @@ begin rin <= v; -- update outputs - f_out <= f; l_out <= lv; e_out <= r.e; fp_out <= fv; - flush_out <= f_out.redirect; exception_log <= exception; irq_valid_log <= irq_valid; @@ -1364,13 +1281,13 @@ begin ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & exception_log & irq_valid_log & - std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) & + interrupt_in & "000" & r.e.write_enable & r.e.valid & - f_out.redirect & + (r.e.redirect or r.e.interrupt) & r.busy & - flush_out; + flush_in; end if; end process; log_out <= log_data; diff --git a/fetch1.vhdl b/fetch1.vhdl index 8ca7e57..788a76d 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -22,8 +22,8 @@ entity fetch1 is stop_in : in std_ulogic; alt_reset_in : in std_ulogic; - -- redirect from execution unit - e_in : in Execute1ToFetch1Type; + -- redirect from writeback unit + w_in : in WritebackToFetch1Type; -- redirect from decode1 d_in : in Decode1ToFetch1Type; @@ -70,12 
+70,12 @@ begin " P:" & std_ulogic'image(r_next.priv_mode) & " E:" & std_ulogic'image(r_next.big_endian) & " 32:" & std_ulogic'image(r_next_int.mode_32bit) & - " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) & + " R:" & std_ulogic'image(w_in.redirect) & std_ulogic'image(d_in.redirect) & " S:" & std_ulogic'image(stall_in) & " T:" & std_ulogic'image(stop_in) & " nia:" & to_hstring(r_next.nia); end if; - if rst = '1' or e_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then + if rst = '1' or w_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then r.virt_mode <= r_next.virt_mode; r.priv_mode <= r_next.priv_mode; r.big_endian <= r_next.big_endian; @@ -109,11 +109,11 @@ begin signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0); signal btc_wr_v : std_ulogic; begin - btc_wr_data <= e_in.br_nia(63 downto BTC_ADDR_BITS + 2) & - e_in.redirect_nia(63 downto 2); - btc_wr_addr <= e_in.br_nia(BTC_ADDR_BITS + 1 downto 2); - btc_wr <= e_in.br_last; - btc_wr_v <= e_in.br_taken; + btc_wr_data <= w_in.br_nia(63 downto BTC_ADDR_BITS + 2) & + w_in.redirect_nia(63 downto 2); + btc_wr_addr <= w_in.br_nia(BTC_ADDR_BITS + 1 downto 2); + btc_wr <= w_in.br_last; + btc_wr_v <= w_in.br_taken; btc_ram : process(clk) variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0); @@ -158,15 +158,15 @@ begin v.big_endian := '0'; v_int.mode_32bit := '0'; v_int.predicted_nia := (others => '0'); - elsif e_in.redirect = '1' then - v.nia := e_in.redirect_nia(63 downto 2) & "00"; - if e_in.mode_32bit = '1' then + elsif w_in.redirect = '1' then + v.nia := w_in.redirect_nia(63 downto 2) & "00"; + if w_in.mode_32bit = '1' then v.nia(63 downto 32) := (others => '0'); end if; - v.virt_mode := e_in.virt_mode; - v.priv_mode := e_in.priv_mode; - v.big_endian := e_in.big_endian; - v_int.mode_32bit := e_in.mode_32bit; + v.virt_mode := w_in.virt_mode; + v.priv_mode := w_in.priv_mode; + v.big_endian := w_in.big_endian; + v_int.mode_32bit := w_in.mode_32bit; elsif 
d_in.redirect = '1' then v.nia := d_in.redirect_nia(63 downto 2) & "00"; if r_int.mode_32bit = '1' then @@ -191,7 +191,7 @@ begin -- If the last NIA value went down with a stop mark, it didn't get -- executed, and hence we shouldn't increment NIA. - advance_nia <= rst or e_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in); + advance_nia <= rst or w_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in); r_next <= v; r_next_int <= v_int; diff --git a/writeback.vhdl b/writeback.vhdl index 044b1fb..c7632ea 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -9,6 +9,7 @@ use work.crhelpers.all; entity writeback is port ( clk : in std_ulogic; + rst : in std_ulogic; e_in : in Execute1ToWritebackType; l_in : in Loadstore1ToWritebackType; @@ -16,12 +17,24 @@ entity writeback is w_out : out WritebackToRegisterFileType; c_out : out WritebackToCrFileType; + f_out : out WritebackToFetch1Type; + flush_out : out std_ulogic; + interrupt_out: out std_ulogic; complete_out : out instr_tag_t ); end entity writeback; architecture behaviour of writeback is + type irq_state_t is (WRITE_SRR0, WRITE_SRR1); + + type reg_type is record + state : irq_state_t; + srr1 : std_ulogic_vector(63 downto 0); + end record; + + signal r, rin : reg_type; + begin writeback_0: process(clk) variable x : std_ulogic_vector(0 downto 0); @@ -29,6 +42,13 @@ begin variable w : std_ulogic_vector(0 downto 0); begin if rising_edge(clk) then + if rst = '1' then + r.state <= WRITE_SRR0; + r.srr1 <= (others => '0'); + else + r <= rin; + end if; + -- Do consistency checks only on the clock edge x(0) := e_in.valid; y(0) := l_in.valid; @@ -36,7 +56,7 @@ begin assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(w))) <= 1 severity failure; - x(0) := e_in.write_enable or e_in.exc_write_enable; + x(0) := e_in.write_enable; y(0) := l_in.write_enable; w(0) := fp_in.write_enable; assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + @@ -55,6 +75,8 @@ begin end 
process; writeback_1: process(all) + variable v : reg_type; + variable f : WritebackToFetch1Type; variable cf: std_ulogic_vector(3 downto 0); variable zero : std_ulogic; variable sign : std_ulogic; @@ -62,6 +84,9 @@ begin begin w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; + f := WritebackToFetch1Init; + interrupt_out <= '0'; + v := r; complete_out <= instr_tag_init; if e_in.valid = '1' then @@ -72,10 +97,19 @@ begin complete_out <= fp_in.instr_tag; end if; - if e_in.exc_write_enable = '1' then - w_out.write_reg <= e_in.exc_write_reg; - w_out.write_data <= e_in.exc_write_data; + if r.state = WRITE_SRR1 then + w_out.write_reg <= fast_spr_num(SPR_SRR1); + w_out.write_data <= r.srr1; + w_out.write_enable <= '1'; + interrupt_out <= '1'; + v.state := WRITE_SRR0; + + elsif e_in.interrupt = '1' then + w_out.write_reg <= fast_spr_num(SPR_SRR0); + w_out.write_data <= e_in.last_nia; w_out.write_enable <= '1'; + v.state := WRITE_SRR1; + v.srr1 := e_in.srr1; else if e_in.write_enable = '1' then w_out.write_reg <= e_in.write_reg; @@ -142,5 +176,35 @@ begin c_out.write_cr_data(31 downto 28) <= cf; end if; end if; + + -- Outputs to fetch1 + f.redirect := e_in.redirect or e_in.interrupt; + f.br_nia := e_in.last_nia; + f.br_last := e_in.br_last and not e_in.interrupt; + f.br_taken := e_in.br_taken; + if e_in.interrupt = '1' then + f.redirect_nia := std_ulogic_vector(to_unsigned(e_in.intr_vec, 64)); + f.virt_mode := '0'; + f.priv_mode := '1'; + -- XXX need an interrupt LE bit here, e.g. 
from LPCR + f.big_endian := '0'; + f.mode_32bit := '0'; + else + if e_in.abs_br = '1' then + f.redirect_nia := e_in.br_offset; + else + f.redirect_nia := std_ulogic_vector(unsigned(e_in.last_nia) + unsigned(e_in.br_offset)); + end if; + -- send MSR[IR], ~MSR[PR], ~MSR[LE] and ~MSR[SF] up to fetch1 + f.virt_mode := e_in.redir_mode(3); + f.priv_mode := e_in.redir_mode(2); + f.big_endian := e_in.redir_mode(1); + f.mode_32bit := e_in.redir_mode(0); + end if; + + f_out <= f; + flush_out <= f_out.redirect; + + rin <= v; end process; end; From 29221315e90120cd5bb134d8035803fa2d829e32 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 23 Dec 2020 12:27:22 +1100 Subject: [PATCH 09/13] core: Send loadstore1 interrupts to writeback rather than execute1 Signed-off-by: Paul Mackerras --- common.vhdl | 29 ++++++++++++++----------- execute1.vhdl | 29 +++---------------------- loadstore1.vhdl | 57 +++++++++++++++++++++++++++++++++++-------------- writeback.vhdl | 26 +++++++++++++++++----- 4 files changed, 81 insertions(+), 60 deletions(-) diff --git a/common.vhdl b/common.vhdl index b2d6b13..48ba46f 100644 --- a/common.vhdl +++ b/common.vhdl @@ -349,6 +349,7 @@ package common is is_32bit : std_ulogic; repeat : std_ulogic; second : std_ulogic; + msr : std_ulogic_vector(63 downto 0); end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', @@ -360,18 +361,11 @@ package common is write_reg => (others => '0'), length => (others => '0'), mode_32bit => '0', is_32bit => '0', - repeat => '0', second => '0'); + repeat => '0', second => '0', + msr => (others => '0')); type Loadstore1ToExecute1Type is record busy : std_ulogic; - exception : std_ulogic; - alignment : std_ulogic; - invalid : std_ulogic; - perm_error : std_ulogic; - rc_error : std_ulogic; - badtree : std_ulogic; - segment_fault : std_ulogic; - instr_fault : std_ulogic; end record; type Loadstore1ToDcacheType is record @@ -454,10 +448,17 @@ 
package common is xerc : xer_common_t; rc : std_ulogic; store_done : std_ulogic; + interrupt : std_ulogic; + intr_vec : integer range 0 to 16#fff#; + srr0: std_ulogic_vector(63 downto 0); + srr1: std_ulogic_vector(31 downto 0); end record; constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := - (valid => '0', instr_tag => instr_tag_init, write_enable => '0', xerc => xerc_init, - rc => '0', store_done => '0', write_data => (others => '0'), others => (others => '0')); + (valid => '0', instr_tag => instr_tag_init, write_enable => '0', + write_reg => (others => '0'), write_data => (others => '0'), + xerc => xerc_init, rc => '0', store_done => '0', + interrupt => '0', intr_vec => 0, + srr0 => (others => '0'), srr1 => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; @@ -481,7 +482,8 @@ package common is br_last: std_ulogic; br_taken: std_ulogic; abs_br: std_ulogic; - srr1: std_ulogic_vector(63 downto 0); + srr1: std_ulogic_vector(31 downto 0); + msr: std_ulogic_vector(63 downto 0); end record; constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', instr_tag => instr_tag_init, rc => '0', mode_32bit => '0', @@ -491,7 +493,8 @@ package common is write_cr_data => (others => '0'), write_reg => (others => '0'), interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000", last_nia => (others => '0'), br_offset => (others => '0'), - br_last => '0', br_taken => '0', abs_br => '0', srr1 => (others => '0')); + br_last => '0', br_taken => '0', abs_br => '0', + srr1 => (others => '0'), msr => (others => '0')); type Execute1ToFPUType is record valid : std_ulogic; diff --git a/execute1.vhdl b/execute1.vhdl index 875e22c..f8507bb 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -725,7 +725,7 @@ begin rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; - v.e.srr1 := msr_copy(ctrl.msr); + v.e.srr1 := (others => '0'); 
exception := '0'; illegal := '0'; if valid_in = '1' then @@ -1143,31 +1143,7 @@ begin report "illegal"; end if; - -- generate DSI or DSegI for load/store exceptions - -- or ISI or ISegI for instruction fetch exceptions - if l_in.exception = '1' then - if l_in.alignment = '1' then - v.e.intr_vec := 16#600#; - elsif l_in.instr_fault = '0' then - if l_in.segment_fault = '0' then - v.e.intr_vec := 16#300#; - else - v.e.intr_vec := 16#380#; - end if; - else - if l_in.segment_fault = '0' then - v.e.srr1(63 - 33) := l_in.invalid; - v.e.srr1(63 - 35) := l_in.perm_error; -- noexec fault - v.e.srr1(63 - 44) := l_in.badtree; - v.e.srr1(63 - 45) := l_in.rc_error; - v.e.intr_vec := 16#400#; - else - v.e.intr_vec := 16#480#; - end if; - end if; - end if; - - v.e.interrupt := exception or l_in.exception; + v.e.interrupt := exception; if do_trace = '1' then v.trace_next := '1'; @@ -1265,6 +1241,7 @@ begin -- update outputs l_out <= lv; e_out <= r.e; + e_out.msr <= msr_copy(ctrl.msr); fp_out <= fv; exception_log <= exception; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 935ce5f..f4f4f4a 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -105,6 +105,10 @@ architecture behave of loadstore1 is ld_sp_nz : std_ulogic; ld_sp_lz : std_ulogic_vector(5 downto 0); wr_sel : std_ulogic_vector(1 downto 0); + interrupt : std_ulogic; + intr_vec : integer range 0 to 16#fff#; + nia : std_ulogic_vector(63 downto 0); + srr1 : std_ulogic_vector(31 downto 0); end record; signal r, rin : reg_stage_t; @@ -220,6 +224,7 @@ begin r.state <= IDLE; r.busy <= '0'; r.do_update <= '0'; + r.interrupt <= '0'; else r <= rin; end if; @@ -520,6 +525,8 @@ begin v.wait_dcache := '0'; v.wait_mmu := '0'; v.extra_cycle := '0'; + v.nia := l_in.nia; + v.srr1 := (others => '0'); if HAS_FPU and l_in.is_32bit = '1' then v.store_data := x"00000000" & store_sp_data; @@ -697,6 +704,34 @@ begin end if; end loop; + -- generate DSI or DSegI for load/store exceptions + -- or ISI or ISegI for instruction fetch exceptions + 
v.interrupt := exception; + if exception = '1' then + if r.align_intr = '1' then + v.intr_vec := 16#600#; + v.dar := addr; + elsif r.instr_fault = '0' then + v.dar := addr; + if m_in.segerr = '0' then + v.intr_vec := 16#300#; + v.dsisr := dsisr; + else + v.intr_vec := 16#380#; + end if; + else + if m_in.segerr = '0' then + v.srr1(63 - 33) := m_in.invalid; + v.srr1(63 - 35) := m_in.perm_error; -- noexec fault + v.srr1(63 - 44) := m_in.badtree; + v.srr1(63 - 45) := m_in.rc_error; + v.intr_vec := 16#400#; + else + v.intr_vec := 16#480#; + end if; + end if; + end if; + -- Update outputs to dcache d_out.valid <= req and not v.align_intr; d_out.load <= v.load; @@ -746,23 +781,13 @@ begin l_out.xerc <= r.xerc; l_out.rc <= r.rc and done; l_out.store_done <= d_in.store_done; + l_out.interrupt <= r.interrupt; + l_out.intr_vec <= r.intr_vec; + l_out.srr0 <= r.nia; + l_out.srr1 <= r.srr1; - -- update exception info back to execute1 + -- update busy signal back to execute1 e_out.busy <= busy; - e_out.exception <= exception; - e_out.alignment <= r.align_intr; - e_out.instr_fault <= r.instr_fault; - e_out.invalid <= m_in.invalid; - e_out.badtree <= m_in.badtree; - e_out.perm_error <= m_in.perm_error; - e_out.rc_error <= m_in.rc_error; - e_out.segment_fault <= m_in.segerr; - if exception = '1' and r.instr_fault = '0' then - v.dar := addr; - if m_in.segerr = '0' and r.align_intr = '0' then - v.dsisr := dsisr; - end if; - end if; -- Update registers rin <= v; @@ -776,7 +801,7 @@ begin begin if rising_edge(clk) then log_data <= e_out.busy & - e_out.exception & + l_out.interrupt & l_out.valid & m_out.valid & d_out.valid & diff --git a/writeback.vhdl b/writeback.vhdl index c7632ea..40cd5b4 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -81,11 +81,13 @@ begin variable zero : std_ulogic; variable sign : std_ulogic; variable scf : std_ulogic_vector(3 downto 0); + variable vec : integer range 0 to 16#fff#; begin w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; f 
:= WritebackToFetch1Init; interrupt_out <= '0'; + vec := 0; v := r; complete_out <= instr_tag_init; @@ -109,7 +111,19 @@ begin w_out.write_data <= e_in.last_nia; w_out.write_enable <= '1'; v.state := WRITE_SRR1; - v.srr1 := e_in.srr1; + v.srr1(63 downto 32) := e_in.msr(63 downto 32); + v.srr1(31 downto 0) := e_in.msr(31 downto 0) or e_in.srr1; + vec := e_in.intr_vec; + + elsif l_in.interrupt = '1' then + w_out.write_reg <= fast_spr_num(SPR_SRR0); + w_out.write_data <= l_in.srr0; + w_out.write_enable <= '1'; + v.state := WRITE_SRR1; + v.srr1(63 downto 32) := e_in.msr(63 downto 32); + v.srr1(31 downto 0) := e_in.msr(31 downto 0) or l_in.srr1; + vec := l_in.intr_vec; + else if e_in.write_enable = '1' then w_out.write_reg <= e_in.write_reg; @@ -178,12 +192,14 @@ begin end if; -- Outputs to fetch1 - f.redirect := e_in.redirect or e_in.interrupt; + f.redirect := e_in.redirect; f.br_nia := e_in.last_nia; - f.br_last := e_in.br_last and not e_in.interrupt; + f.br_last := e_in.br_last; f.br_taken := e_in.br_taken; - if e_in.interrupt = '1' then - f.redirect_nia := std_ulogic_vector(to_unsigned(e_in.intr_vec, 64)); + if e_in.interrupt = '1' or l_in.interrupt = '1' then + f.redirect := '1'; + f.br_last := '0'; + f.redirect_nia := std_ulogic_vector(to_unsigned(vec, 64)); f.virt_mode := '0'; f.priv_mode := '1'; -- XXX need an interrupt LE bit here, e.g. 
from LPCR From acb3d2d7455dfb9b1813f406f45cb314fba2e34e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 23 Dec 2020 13:57:40 +1100 Subject: [PATCH 10/13] core: Send FPU interrupts to writeback rather than execute1 Signed-off-by: Paul Mackerras --- common.vhdl | 28 ++++++++++++++++++---------- execute1.vhdl | 28 +++++++++------------------- fpu.vhdl | 12 +++++++++--- loadstore1.vhdl | 10 +++++----- writeback.vhdl | 40 +++++++++++++++++++++++++--------------- 5 files changed, 66 insertions(+), 52 deletions(-) diff --git a/common.vhdl b/common.vhdl index 48ba46f..e79bcb5 100644 --- a/common.vhdl +++ b/common.vhdl @@ -139,6 +139,8 @@ package common is constant instr_tag_init : instr_tag_t := (tag => 0, valid => '0'); function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean; + subtype intr_vector_t is integer range 0 to 16#fff#; + -- For now, fixed 16 sources, make this either a parametric -- package of some sort or an unconstrainted array. type ics_to_icp_t is record @@ -449,9 +451,9 @@ package common is rc : std_ulogic; store_done : std_ulogic; interrupt : std_ulogic; - intr_vec : integer range 0 to 16#fff#; + intr_vec : intr_vector_t; srr0: std_ulogic_vector(63 downto 0); - srr1: std_ulogic_vector(31 downto 0); + srr1: std_ulogic_vector(15 downto 0); end record; constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', instr_tag => instr_tag_init, write_enable => '0', @@ -474,7 +476,7 @@ package common is write_xerc_enable : std_ulogic; xerc : xer_common_t; interrupt : std_ulogic; - intr_vec : integer range 0 to 16#fff#; + intr_vec : intr_vector_t; redirect: std_ulogic; redir_mode: std_ulogic_vector(3 downto 0); last_nia: std_ulogic_vector(63 downto 0); @@ -482,7 +484,7 @@ package common is br_last: std_ulogic; br_taken: std_ulogic; abs_br: std_ulogic; - srr1: std_ulogic_vector(31 downto 0); + srr1: std_ulogic_vector(15 downto 0); msr: std_ulogic_vector(63 downto 0); end record; constant Execute1ToWritebackInit : 
Execute1ToWritebackType := @@ -521,13 +523,12 @@ package common is type FPUToExecute1Type is record busy : std_ulogic; exception : std_ulogic; - interrupt : std_ulogic; - illegal : std_ulogic; end record; constant FPUToExecute1Init : FPUToExecute1Type := (others => '0'); type FPUToWritebackType is record valid : std_ulogic; + interrupt : std_ulogic; instr_tag : instr_tag_t; write_enable : std_ulogic; write_reg : gspr_index_t; @@ -535,10 +536,17 @@ package common is write_cr_enable : std_ulogic; write_cr_mask : std_ulogic_vector(7 downto 0); write_cr_data : std_ulogic_vector(31 downto 0); - end record; - constant FPUToWritebackInit : FPUToWritebackType := (valid => '0', instr_tag => instr_tag_init, - write_enable => '0', write_cr_enable => '0', - others => (others => '0')); + intr_vec : intr_vector_t; + srr0 : std_ulogic_vector(63 downto 0); + srr1 : std_ulogic_vector(15 downto 0); + end record; + constant FPUToWritebackInit : FPUToWritebackType := + (valid => '0', interrupt => '0', instr_tag => instr_tag_init, + write_enable => '0', write_reg => (others => '0'), + write_cr_enable => '0', write_cr_mask => (others => '0'), + write_cr_data => (others => '0'), + intr_vec => 0, srr1 => (others => '0'), + others => (others => '0')); type DividerToExecute1Type is record valid: std_ulogic; diff --git a/execute1.vhdl b/execute1.vhdl index f8507bb..0eaf55a 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -750,19 +750,19 @@ begin -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. 
exception := '1'; v.e.intr_vec := 16#700#; - v.e.srr1(63 - 43) := '1'; - v.e.srr1(63 - 47) := '1'; + v.e.srr1(47 - 43) := '1'; + v.e.srr1(47 - 47) := '1'; elsif r.trace_next = '1' then -- Generate a trace interrupt rather than executing the next instruction -- or taking any asynchronous interrupt exception := '1'; v.e.intr_vec := 16#d00#; - v.e.srr1(63 - 33) := '1'; + v.e.srr1(47 - 33) := '1'; if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then - v.e.srr1(63 - 35) := '1'; + v.e.srr1(47 - 35) := '1'; elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then - v.e.srr1(63 - 36) := '1'; + v.e.srr1(47 - 36) := '1'; end if; elsif irq_valid = '1' then @@ -775,7 +775,7 @@ begin exception := '1'; v.e.intr_vec := 16#700#; -- set bit 45 to indicate privileged instruction type interrupt - v.e.srr1(63 - 45) := '1'; + v.e.srr1(47 - 45) := '1'; report "privileged instruction"; elsif not HAS_FPU and e_in.fac = FPU then @@ -840,7 +840,7 @@ begin -- trap instructions (tw, twi, td, tdi) v.e.intr_vec := 16#700#; -- set bit 46 to say trap occurred - v.e.srr1(63 - 46) := '1'; + v.e.srr1(47 - 46) := '1'; if or (trapval and insn_to(e_in.insn)) = '1' then -- generate trap-type program interrupt exception := '1'; @@ -1124,22 +1124,12 @@ begin v.e.valid := '1'; end if; - -- Generate FP-type program interrupt. fp_in.interrupt will only - -- be set during the execution of a FP instruction. - -- The case where MSR[FE0,FE1] goes from zero to non-zero is - -- handled above by mtmsrd and rfid setting v.fp_exception_next. 
- if HAS_FPU and fp_in.interrupt = '1' then - v.e.intr_vec := 16#700#; - v.e.srr1(63 - 43) := '1'; - exception := '1'; - end if; - - if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then + if illegal = '1' then exception := '1'; v.e.intr_vec := 16#700#; -- Since we aren't doing Hypervisor emulation assist (0xe40) we -- set bit 44 to indicate we have an illegal - v.e.srr1(63 - 44) := '1'; + v.e.srr1(47 - 44) := '1'; report "illegal"; end if; diff --git a/fpu.vhdl b/fpu.vhdl index 5e5c7d6..93fa9d6 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -73,8 +73,10 @@ architecture behaviour of fpu is busy : std_ulogic; instr_done : std_ulogic; do_intr : std_ulogic; + illegal : std_ulogic; op : insn_type_t; insn : std_ulogic_vector(31 downto 0); + nia : std_ulogic_vector(63 downto 0); instr_tag : instr_tag_t; dest_fpr : gspr_index_t; fe_mode : std_ulogic; @@ -572,7 +574,6 @@ begin e_out.busy <= r.busy; e_out.exception <= r.fpscr(FPSCR_FEX); - e_out.interrupt <= r.do_intr; w_out.valid <= r.instr_done and not r.do_intr; w_out.instr_tag <= r.instr_tag; @@ -583,6 +584,10 @@ begin w_out.write_cr_mask <= r.cr_mask; w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result; + w_out.interrupt <= r.do_intr; + w_out.intr_vec <= 16#700#; + w_out.srr0 <= r.nia; + w_out.srr1 <= (47-44 => r.illegal, 47-43 => not r.illegal, others => '0'); fpu_1: process(all) variable v : reg_type; @@ -644,6 +649,7 @@ begin -- capture incoming instruction if e_in.valid = '1' then v.insn := e_in.insn; + v.nia := e_in.nia; v.op := e_in.op; v.instr_tag := e_in.itag; v.fe_mode := or (e_in.fe_mode); @@ -2543,9 +2549,10 @@ begin v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); end if; + v.illegal := illegal; if illegal = '1' then v.instr_done := '0'; - v.do_intr := '0'; + v.do_intr := '1'; v.writing_back := '0'; v.busy := '0'; v.state := IDLE; @@ -2557,7 +2564,6 @@ begin end if; rin <= v; - e_out.illegal <= illegal; end process; end 
architecture behaviour; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index f4f4f4a..a754cc4 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -108,7 +108,7 @@ architecture behave of loadstore1 is interrupt : std_ulogic; intr_vec : integer range 0 to 16#fff#; nia : std_ulogic_vector(63 downto 0); - srr1 : std_ulogic_vector(31 downto 0); + srr1 : std_ulogic_vector(15 downto 0); end record; signal r, rin : reg_stage_t; @@ -721,10 +721,10 @@ begin end if; else if m_in.segerr = '0' then - v.srr1(63 - 33) := m_in.invalid; - v.srr1(63 - 35) := m_in.perm_error; -- noexec fault - v.srr1(63 - 44) := m_in.badtree; - v.srr1(63 - 45) := m_in.rc_error; + v.srr1(47 - 33) := m_in.invalid; + v.srr1(47 - 35) := m_in.perm_error; -- noexec fault + v.srr1(47 - 44) := m_in.badtree; + v.srr1(47 - 45) := m_in.rc_error; v.intr_vec := 16#400#; else v.intr_vec := 16#480#; diff --git a/writeback.vhdl b/writeback.vhdl index 40cd5b4..65da537 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -82,6 +82,8 @@ begin variable sign : std_ulogic; variable scf : std_ulogic_vector(3 downto 0); variable vec : integer range 0 to 16#fff#; + variable srr1 : std_ulogic_vector(15 downto 0); + variable intr : std_ulogic; begin w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; @@ -99,6 +101,8 @@ begin complete_out <= fp_in.instr_tag; end if; + intr := e_in.interrupt or l_in.interrupt or fp_in.interrupt; + if r.state = WRITE_SRR1 then w_out.write_reg <= fast_spr_num(SPR_SRR1); w_out.write_data <= r.srr1; @@ -106,23 +110,29 @@ begin interrupt_out <= '1'; v.state := WRITE_SRR0; - elsif e_in.interrupt = '1' then - w_out.write_reg <= fast_spr_num(SPR_SRR0); - w_out.write_data <= e_in.last_nia; - w_out.write_enable <= '1'; - v.state := WRITE_SRR1; - v.srr1(63 downto 32) := e_in.msr(63 downto 32); - v.srr1(31 downto 0) := e_in.msr(31 downto 0) or e_in.srr1; - vec := e_in.intr_vec; - - elsif l_in.interrupt = '1' then + elsif intr = '1' then w_out.write_reg <= fast_spr_num(SPR_SRR0); - 
w_out.write_data <= l_in.srr0; w_out.write_enable <= '1'; v.state := WRITE_SRR1; - v.srr1(63 downto 32) := e_in.msr(63 downto 32); - v.srr1(31 downto 0) := e_in.msr(31 downto 0) or l_in.srr1; - vec := l_in.intr_vec; + srr1 := (others => '0'); + if e_in.interrupt = '1' then + vec := e_in.intr_vec; + w_out.write_data <= e_in.last_nia; + srr1 := e_in.srr1; + elsif l_in.interrupt = '1' then + vec := l_in.intr_vec; + w_out.write_data <= l_in.srr0; + srr1 := l_in.srr1; + elsif fp_in.interrupt = '1' then + vec := fp_in.intr_vec; + w_out.write_data <= fp_in.srr0; + srr1 := fp_in.srr1; + end if; + v.srr1(63 downto 31) := e_in.msr(63 downto 31); + v.srr1(30 downto 27) := srr1(14 downto 11); + v.srr1(26 downto 22) := e_in.msr(26 downto 22); + v.srr1(21 downto 16) := srr1(5 downto 0); + v.srr1(15 downto 0) := e_in.msr(15 downto 0); else if e_in.write_enable = '1' then @@ -196,7 +206,7 @@ begin f.br_nia := e_in.last_nia; f.br_last := e_in.br_last; f.br_taken := e_in.br_taken; - if e_in.interrupt = '1' or l_in.interrupt = '1' then + if intr = '1' then f.redirect := '1'; f.br_last := '0'; f.redirect_nia := std_ulogic_vector(to_unsigned(vec, 64)); From f636bb7c3999d9326a2bd1c6131fc128be2cae24 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 18 Jan 2021 08:55:56 +1100 Subject: [PATCH 11/13] dcache: Fix bugs in pipelined operation This fixes two bugs which show up when multiple operations are in flight in the dcache, and adds a 'hold' input which will be needed when loadstore1 is pipelined. The first bug is that dcache needs to sample the data for a store on the cycle after the store request comes in even if the store request is held up because of a previous request (e.g. if the previous request is a load miss or a dcbz). The second bug is that a load request coming in for a cache line being refilled needs to be handled immediately in the case where it is for the row whose data arrives on the same cycle. 
If it is not, then it will be handled as a separate cache miss and the cache line will be refilled again into a different way, leading to two ways both being valid for the same tag. This can lead to data corruption, in the scenario where subsequent writes go to one of the ways and then that way gets displaced but the other way doesn't. This bug could in principle show up even without having multiple operations in flight in the dcache. Signed-off-by: Paul Mackerras --- common.vhdl | 1 + dcache.vhdl | 33 +++++++++++++++++++++++---------- loadstore1.vhdl | 1 + 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/common.vhdl b/common.vhdl index e79bcb5..35f782b 100644 --- a/common.vhdl +++ b/common.vhdl @@ -372,6 +372,7 @@ package common is type Loadstore1ToDcacheType is record valid : std_ulogic; + hold : std_ulogic; load : std_ulogic; -- is this a load dcbz : std_ulogic; nc : std_ulogic; diff --git a/dcache.vhdl b/dcache.vhdl index 7da67e1..bb93148 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -275,6 +275,7 @@ architecture rtl of dcache is doall : std_ulogic; -- with tlbie, indicates flush whole TLB tlbld : std_ulogic; -- indicates a TLB load request (from MMU) mmu_req : std_ulogic; -- indicates source of request + d_valid : std_ulogic; -- indicates req.data is valid now end record; signal r0 : reg_stage_0_t; @@ -564,17 +565,27 @@ begin r.mmu_req := '1'; else r.req := d_in; + r.req.data := (others => '0'); r.tlbie := '0'; r.doall := '0'; r.tlbld := '0'; r.mmu_req := '0'; end if; + r.d_valid := '0'; if rst = '1' then r0_full <= '0'; - elsif r1.full = '0' or r0_full = '0' then + elsif (r1.full = '0' and d_in.hold = '0') or r0_full = '0' then r0 <= r; r0_full <= r.req.valid; end if; + -- Sample data the cycle after a request comes in from loadstore1. + -- If another request has come in already then the data will get + -- put directly into req.data below. 
+ if r0.req.valid = '1' and r.req.valid = '0' and r0.d_valid = '0' and + r0.mmu_req = '0' then + r0.req.data <= d_in.data; + r0.d_valid <= '1'; + end if; end if; end process; @@ -582,8 +593,8 @@ begin m_out.stall <= '0'; -- Hold off the request in r0 when r1 has an uncompleted request - r0_stall <= r0_full and r1.full; - r0_valid <= r0_full and not r1.full; + r0_stall <= r0_full and (r1.full or d_in.hold); + r0_valid <= r0_full and not r1.full and not d_in.hold; stall_out <= r0_stall; -- TLB @@ -1305,10 +1316,12 @@ begin req.dcbz := r0.req.dcbz; req.real_addr := ra; -- Force data to 0 for dcbz - if r0.req.dcbz = '0' then - req.data := d_in.data; - else + if r0.req.dcbz = '1' then req.data := (others => '0'); + elsif r0.d_valid = '1' then + req.data := r0.req.data; + else + req.data := d_in.data; end if; -- Select all bytes for dcbz and for cacheable loads if r0.req.dcbz = '1' or (r0.req.load = '1' and r0.req.nc = '0') then @@ -1438,10 +1451,10 @@ begin -- complete the request next cycle. -- Compare the whole address in case the request in -- r1.req is not the one that started this refill. 
- if r1.full = '1' and r1.req.same_tag = '1' and - ((r1.dcbz = '1' and r1.req.dcbz = '1') or - (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS)) and - r1.store_row = get_row(r1.req.real_addr) then + if req.valid = '1' and req.same_tag = '1' and + ((r1.dcbz = '1' and req.dcbz = '1') or + (r1.dcbz = '0' and req.op = OP_LOAD_MISS)) and + r1.store_row = get_row(req.real_addr) then r1.full <= '0'; r1.slow_valid <= '1'; if r1.mmu_req = '0' then diff --git a/loadstore1.vhdl b/loadstore1.vhdl index a754cc4..8e6c7be 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -745,6 +745,7 @@ begin d_out.byte_sel <= byte_sel; d_out.virt_mode <= v.virt_mode; d_out.priv_mode <= v.priv_mode; + d_out.hold <= '0'; -- Update outputs to MMU m_out.valid <= mmureq; From f583d088b7e6bf38839449c8101beb64b23b12ed Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 7 Nov 2020 20:50:58 +1100 Subject: [PATCH 12/13] loadstore: Convert to 3-stage pipeline This makes loadstore use a 3-stage pipeline. For now, only one instruction goes through the pipe at a time. Completion and writeback are still combinatorial off the valid signal back from the dcache, so performance should be the same as before. In future it should be able to sustain one load or store per cycle provided they hit in the dcache. Signed-off-by: Paul Mackerras --- loadstore1.vhdl | 1022 +++++++++++++++++++++++++++-------------------- 1 file changed, 588 insertions(+), 434 deletions(-) diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 8e6c7be..66700e8 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -37,39 +37,44 @@ entity loadstore1 is ); end loadstore1; --- Note, we don't currently use the stall output from the dcache because --- we know it can take two requests without stalling when idle, we are --- its only user, and we know it never stalls when idle. 
- architecture behave of loadstore1 is -- State machine for unaligned loads/stores type state_t is (IDLE, -- ready for instruction - SECOND_REQ, -- send 2nd request of unaligned xfer - ACK_WAIT, -- waiting for ack from dcache MMU_LOOKUP, -- waiting for MMU to look up translation TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie - FINISH_LFS, -- write back converted SP data for lfs* - COMPLETE -- extra cycle to complete an operation + FINISH_LFS -- write back converted SP data for lfs* ); type byte_index_t is array(0 to 7) of unsigned(2 downto 0); subtype byte_trim_t is std_ulogic_vector(1 downto 0); type trim_ctl_t is array(0 to 7) of byte_trim_t; - type reg_stage_t is record - -- latch most of the input request + type request_t is record + valid : std_ulogic; + dc_req : std_ulogic; load : std_ulogic; + store : std_ulogic; tlbie : std_ulogic; dcbz : std_ulogic; + read_spr : std_ulogic; + write_spr : std_ulogic; + mmu_op : std_ulogic; + instr_fault : std_ulogic; + load_zero : std_ulogic; + do_update : std_ulogic; + noop : std_ulogic; + mode_32bit : std_ulogic; addr : std_ulogic_vector(63 downto 0); + addr0 : std_ulogic_vector(63 downto 0); + byte_sel : std_ulogic_vector(7 downto 0); + second_bytes : std_ulogic_vector(7 downto 0); store_data : std_ulogic_vector(63 downto 0); - load_data : std_ulogic_vector(63 downto 0); instr_tag : instr_tag_t; write_reg : gspr_index_t; length : std_ulogic_vector(3 downto 0); + elt_length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; - byte_offset : unsigned(2 downto 0); brev_mask : unsigned(2 downto 0); sign_extend : std_ulogic; update : std_ulogic; @@ -81,41 +86,87 @@ architecture behave of loadstore1 is nc : std_ulogic; -- non-cacheable access virt_mode : std_ulogic; priv_mode : std_ulogic; + load_sp : std_ulogic; + sprn : std_ulogic_vector(9 downto 0); + is_slbia : std_ulogic; + align_intr : std_ulogic; + dword_index : std_ulogic; + two_dwords : std_ulogic; + nia : std_ulogic_vector(63 downto 0); + end record; 
+ constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0', + dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0', + instr_fault => '0', load_zero => '0', do_update => '0', noop => '0', + mode_32bit => '0', addr => (others => '0'), addr0 => (others => '0'), + byte_sel => x"00", second_bytes => x"00", + store_data => (others => '0'), instr_tag => instr_tag_init, + write_reg => 7x"00", length => x"0", + elt_length => x"0", byte_reverse => '0', brev_mask => "000", + sign_extend => '0', update => '0', + xerc => xerc_init, reserve => '0', + atomic => '0', atomic_last => '0', rc => '0', nc => '0', + virt_mode => '0', priv_mode => '0', load_sp => '0', + sprn => 10x"0", is_slbia => '0', align_intr => '0', + dword_index => '0', two_dwords => '0', + nia => (others => '0')); + + type reg_stage1_t is record + req : request_t; + issued : std_ulogic; + end record; + + type reg_stage2_t is record + req : request_t; + byte_index : byte_index_t; + use_second : std_ulogic_vector(7 downto 0); + wait_dc : std_ulogic; + wait_mmu : std_ulogic; + one_cycle : std_ulogic; + wr_sel : std_ulogic_vector(1 downto 0); + end record; + + type reg_stage3_t is record state : state_t; - dwords_done : std_ulogic; - last_dword : std_ulogic; - first_bytes : std_ulogic_vector(7 downto 0); - second_bytes : std_ulogic_vector(7 downto 0); + instr_tag : instr_tag_t; + write_enable : std_ulogic; + write_reg : gspr_index_t; + write_data : std_ulogic_vector(63 downto 0); + rc : std_ulogic; + xerc : xer_common_t; + store_done : std_ulogic; + convert_lfs : std_ulogic; + load_data : std_ulogic_vector(63 downto 0); dar : std_ulogic_vector(63 downto 0); dsisr : std_ulogic_vector(31 downto 0); - instr_fault : std_ulogic; - align_intr : std_ulogic; - sprval : std_ulogic_vector(63 downto 0); - busy : std_ulogic; - wait_dcache : std_ulogic; - wait_mmu : std_ulogic; - do_update : std_ulogic; - extra_cycle : std_ulogic; - mode_32bit : std_ulogic; - byte_index : 
byte_index_t; - use_second : std_ulogic_vector(7 downto 0); - trim_ctl : trim_ctl_t; - load_sp : std_ulogic; ld_sp_data : std_ulogic_vector(31 downto 0); ld_sp_nz : std_ulogic; ld_sp_lz : std_ulogic_vector(5 downto 0); - wr_sel : std_ulogic_vector(1 downto 0); + stage1_en : std_ulogic; interrupt : std_ulogic; intr_vec : integer range 0 to 16#fff#; nia : std_ulogic_vector(63 downto 0); srr1 : std_ulogic_vector(15 downto 0); end record; - signal r, rin : reg_stage_t; - signal lsu_sum : std_ulogic_vector(63 downto 0); + signal req_in : request_t; + signal r1, r1in : reg_stage1_t; + signal r2, r2in : reg_stage2_t; + signal r3, r3in : reg_stage3_t; + + signal busy : std_ulogic; + signal complete : std_ulogic; + signal flushing : std_ulogic; signal store_sp_data : std_ulogic_vector(31 downto 0); signal load_dp_data : std_ulogic_vector(63 downto 0); + signal store_data : std_ulogic_vector(63 downto 0); + + signal stage1_issue_enable : std_ulogic; + signal stage1_req : request_t; + signal stage1_dcreq : std_ulogic; + signal stage1_dreq : std_ulogic; + signal stage2_busy_next : std_ulogic; + signal stage3_busy_next : std_ulogic; -- Generate byte enables from sizes function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is @@ -214,19 +265,37 @@ architecture behave of loadstore1 is end; begin - -- Calculate the address in the first cycle - lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0'); - - loadstore1_0: process(clk) + loadstore1_reg: process(clk) begin if rising_edge(clk) then if rst = '1' then - r.state <= IDLE; - r.busy <= '0'; - r.do_update <= '0'; - r.interrupt <= '0'; + r1.req.valid <= '0'; + r2.req.valid <= '0'; + r2.wait_dc <= '0'; + r2.wait_mmu <= '0'; + r2.one_cycle <= '0'; + r3.state <= IDLE; + r3.write_enable <= '0'; + r3.interrupt <= '0'; + r3.stage1_en <= '1'; + r3.convert_lfs <= '0'; + flushing <= '0'; else - r <= rin; + r1 <= r1in; + r2 <= r2in; + r3 <= 
r3in; + flushing <= (flushing or (r1in.req.valid and r1in.req.align_intr)) and + not r3in.interrupt; + end if; + stage1_dreq <= stage1_dcreq; + if d_in.valid = '1' then + assert r2.req.valid = '1' and r2.req.dc_req = '1' and r3.state = IDLE severity failure; + end if; + if d_in.error = '1' then + assert r2.req.valid = '1' and r2.req.dc_req = '1' and r3.state = IDLE severity failure; + end if; + if m_in.done = '1' or m_in.err = '1' then + assert r2.req.valid = '1' and (r3.state = MMU_LOOKUP or r3.state = TLBIE_WAIT) severity failure; end if; end if; end process; @@ -261,79 +330,346 @@ begin variable frac : std_ulogic_vector(22 downto 0); variable frac_shift : unsigned(4 downto 0); begin - frac := r.ld_sp_data(22 downto 0); - exp := unsigned(r.ld_sp_data(30 downto 23)); - exp_nz := or (r.ld_sp_data(30 downto 23)); - exp_ao := and (r.ld_sp_data(30 downto 23)); + frac := r3.ld_sp_data(22 downto 0); + exp := unsigned(r3.ld_sp_data(30 downto 23)); + exp_nz := or (r3.ld_sp_data(30 downto 23)); + exp_ao := and (r3.ld_sp_data(30 downto 23)); frac_shift := (others => '0'); if exp_ao = '1' then exp_dp := to_unsigned(2047, 11); -- infinity or NaN elsif exp_nz = '1' then exp_dp := 896 + resize(exp, 11); -- finite normalized value - elsif r.ld_sp_nz = '0' then + elsif r3.ld_sp_nz = '0' then exp_dp := to_unsigned(0, 11); -- zero else -- denormalized SP operand, need to normalize - exp_dp := 896 - resize(unsigned(r.ld_sp_lz), 11); - frac_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1; + exp_dp := 896 - resize(unsigned(r3.ld_sp_lz), 11); + frac_shift := unsigned(r3.ld_sp_lz(4 downto 0)) + 1; end if; - load_dp_data(63) <= r.ld_sp_data(31); + load_dp_data(63) <= r3.ld_sp_data(31); load_dp_data(62 downto 52) <= std_ulogic_vector(exp_dp); load_dp_data(51 downto 29) <= shifter_23l(frac, frac_shift); load_dp_data(28 downto 0) <= (others => '0'); end process; end generate; - loadstore1_1: process(all) - variable v : reg_stage_t; + -- Translate a load/store instruction into the internal 
request format + -- XXX this should only depend on l_in, but actually depends on + -- r1.req.addr0 as well (in the l_in.second = 1 case). + loadstore1_in: process(all) + variable v : request_t; + variable lsu_sum : std_ulogic_vector(63 downto 0); variable brev_lenm1 : unsigned(2 downto 0); - variable byte_offset : unsigned(2 downto 0); - variable j : integer; - variable k : unsigned(2 downto 0); - variable kk : unsigned(3 downto 0); variable long_sel : std_ulogic_vector(15 downto 0); - variable byte_sel : std_ulogic_vector(7 downto 0); - variable req : std_ulogic; - variable busy : std_ulogic; variable addr : std_ulogic_vector(63 downto 0); - variable maddr : std_ulogic_vector(63 downto 0); - variable wdata : std_ulogic_vector(63 downto 0); - variable write_enable : std_ulogic; - variable do_update : std_ulogic; - variable done : std_ulogic; - variable data_permuted : std_ulogic_vector(63 downto 0); - variable data_trimmed : std_ulogic_vector(63 downto 0); - variable store_data : std_ulogic_vector(63 downto 0); - variable byte_rev : std_ulogic; - variable length : std_ulogic_vector(3 downto 0); - variable negative : std_ulogic; variable sprn : std_ulogic_vector(9 downto 0); - variable exception : std_ulogic; - variable next_addr : std_ulogic_vector(63 downto 0); - variable mmureq : std_ulogic; - variable dsisr : std_ulogic_vector(31 downto 0); - variable mmu_mtspr : std_ulogic; - variable itlb_fault : std_ulogic; variable misaligned : std_ulogic; + variable addr_mask : std_ulogic_vector(2 downto 0); begin - v := r; + v := request_init; + sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); + + v.valid := l_in.valid; + v.instr_tag := l_in.instr_tag; + v.mode_32bit := l_in.mode_32bit; + v.write_reg := l_in.write_reg; + v.length := l_in.length; + v.elt_length := l_in.length; + v.byte_reverse := l_in.byte_reverse; + v.sign_extend := l_in.sign_extend; + v.update := l_in.update; + v.xerc := l_in.xerc; + v.reserve := l_in.reserve; + v.rc := l_in.rc; + 
v.nc := l_in.ci; + v.virt_mode := l_in.virt_mode; + v.priv_mode := l_in.priv_mode; + v.sprn := sprn; + v.nia := l_in.nia; + + lsu_sum := std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)); + + if HAS_FPU and l_in.is_32bit = '1' then + v.store_data := x"00000000" & store_sp_data; + else + v.store_data := l_in.data; + end if; + + addr := lsu_sum; + + if l_in.second = '1' then + if l_in.update = '0' then + -- for the second half of a 16-byte transfer, + -- use the previous address plus 8. + addr := std_ulogic_vector(unsigned(r1.req.addr0(63 downto 3)) + 1) & r1.req.addr0(2 downto 0); + else + -- for an update-form load, use the previous address + -- as the value to write back to RA. + addr := r1.req.addr0; + end if; + end if; + if l_in.mode_32bit = '1' then + addr(63 downto 32) := (others => '0'); + end if; + v.addr := addr; + v.addr0 := addr; + + -- XXX Temporary hack. Mark the op as non-cachable if the address + -- is the form 0xc------- for a real-mode access. + if addr(31 downto 28) = "1100" and l_in.virt_mode = '0' then + v.nc := '1'; + end if; + + addr_mask := std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1); + + -- Do length_to_sel and work out if we are doing 2 dwords + long_sel := xfer_data_sel(v.length, addr(2 downto 0)); + v.byte_sel := long_sel(7 downto 0); + v.second_bytes := long_sel(15 downto 8); + if long_sel(15 downto 8) /= "00000000" then + v.two_dwords := '1'; + end if; + + -- check alignment for larx/stcx + misaligned := or (addr_mask and addr(2 downto 0)); + v.align_intr := l_in.reserve and misaligned; + if l_in.repeat = '1' and l_in.second = '0' and l_in.update = '0' and addr(3) = '1' then + -- length is really 16 not 8 + -- Make misaligned lq cause an alignment interrupt in LE mode, + -- in order to avoid the case with RA = RT + 1 where the second half + -- faults but the first doesn't (and updates RT+1, destroying RA). + -- The equivalent BE case doesn't occur because RA = RT is illegal. 
+ misaligned := '1'; + if l_in.reserve = '1' or (l_in.op = OP_LOAD and l_in.byte_reverse = '0') then + v.align_intr := '1'; + end if; + end if; + + v.atomic := not misaligned; + v.atomic_last := not misaligned and (l_in.second or not l_in.repeat); + + case l_in.op is + when OP_STORE => + v.store := '1'; + when OP_LOAD => + if l_in.update = '0' or l_in.second = '0' then + v.load := '1'; + if HAS_FPU and l_in.is_32bit = '1' then + -- Allow an extra cycle for SP->DP precision conversion + v.load_sp := '1'; + end if; + else + -- write back address to RA + v.do_update := '1'; + end if; + when OP_DCBZ => + v.dcbz := '1'; + v.align_intr := v.nc; + when OP_TLBIE => + v.tlbie := '1'; + v.addr := l_in.addr2; -- address from RB for tlbie + v.is_slbia := l_in.insn(7); + v.mmu_op := '1'; + when OP_MFSPR => + v.read_spr := '1'; + when OP_MTSPR => + v.write_spr := '1'; + v.mmu_op := sprn(9) or sprn(5); + when OP_FETCH_FAILED => + -- send it to the MMU to do the radix walk + v.instr_fault := '1'; + v.addr := l_in.nia; + v.mmu_op := '1'; + when others => + end case; + v.dc_req := l_in.valid and (v.load or v.store or v.dcbz) and not v.align_intr; + + -- Work out controls for load and store formatting + brev_lenm1 := "000"; + if v.byte_reverse = '1' then + brev_lenm1 := unsigned(v.length(2 downto 0)) - 1; + end if; + v.brev_mask := brev_lenm1; + + req_in <= v; + end process; + + --busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or + -- (r1.issued and d_in.error) or + -- stage2_busy_next or + -- (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index)); + complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or + (r2.wait_mmu and m_in.done) or r3.convert_lfs; + busy <= r1.req.valid or (r2.req.valid and not complete); + + stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and + not (r2.req.valid and r2.req.mmu_op); + + -- Processing done in the first cycle of a load/store instruction + loadstore1_1: process(all) + variable v : 
reg_stage1_t; + variable req : request_t; + variable dcreq : std_ulogic; + variable addr : std_ulogic_vector(63 downto 0); + begin + v := r1; + dcreq := '0'; + req := req_in; + if flushing = '1' then + -- Make this a no-op request rather than simply invalid. + -- It will never get to stage 3 since there is a request ahead of + -- it with align_intr = 1. + req.dc_req := '0'; + end if; + + -- Note that l_in.valid is gated with busy inside execute1 + if l_in.valid = '1' then + dcreq := req.dc_req and stage1_issue_enable and not d_in.error and not dc_stall; + v.req := req; + v.issued := dcreq; + elsif r1.req.valid = '1' then + if r1.req.dc_req = '1' and r1.issued = '0' then + req := r1.req; + dcreq := stage1_issue_enable and not dc_stall and not d_in.error; + v.issued := dcreq; + elsif r1.issued = '1' and d_in.error = '1' then + v.issued := '0'; + elsif stage2_busy_next = '0' then + -- we can change what's in r1 next cycle because the current thing + -- in r1 will go into r2 + if r1.req.dc_req = '1' and r1.req.two_dwords = '1' and r1.req.dword_index = '0' then + -- construct the second request for a misaligned access + v.req.dword_index := '1'; + v.req.addr := std_ulogic_vector(unsigned(r1.req.addr(63 downto 3)) + 1) & "000"; + if r1.req.mode_32bit = '1' then + v.req.addr(32) := '0'; + end if; + v.req.byte_sel := r1.req.second_bytes; + v.issued := stage1_issue_enable and not dc_stall; + dcreq := stage1_issue_enable and not dc_stall; + req := v.req; + else + v.req.valid := '0'; + end if; + end if; + end if; + if r3in.interrupt = '1' then + v.req.valid := '0'; + dcreq := '0'; + end if; + + stage1_req <= req; + stage1_dcreq <= dcreq; + r1in <= v; + end process; + + -- Processing done in the second cycle of a load/store instruction. + -- Store data is formatted here and sent to the dcache. + -- The request in r1 is sent to stage 3 if stage 3 will not be busy next cycle. 
+ loadstore1_2: process(all) + variable v : reg_stage2_t; + variable j : integer; + variable k : unsigned(2 downto 0); + variable kk : unsigned(3 downto 0); + variable idx : unsigned(2 downto 0); + variable byte_offset : unsigned(2 downto 0); + begin + v := r2; + + -- Byte reversing and rotating for stores. + -- Done in the second cycle (the cycle after l_in.valid = 1). + byte_offset := unsigned(r1.req.addr0(2 downto 0)); + for i in 0 to 7 loop + k := (to_unsigned(i, 3) - byte_offset) xor r1.req.brev_mask; + j := to_integer(k) * 8; + store_data(i * 8 + 7 downto i * 8) <= r1.req.store_data(j + 7 downto j); + end loop; + + if stage3_busy_next = '0' and + (r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0') then + v.req := r1.req; + v.req.store_data := store_data; + v.wait_dc := r1.req.valid and r1.req.dc_req and not r1.req.load_sp and + not (r1.req.two_dwords and not r1.req.dword_index); + v.wait_mmu := r1.req.valid and r1.req.mmu_op; + v.one_cycle := r1.req.valid and (r1.req.noop or r1.req.read_spr or + (r1.req.write_spr and not r1.req.mmu_op) or + r1.req.load_zero or r1.req.do_update); + if r1.req.read_spr = '1' then + v.wr_sel := "00"; + elsif r1.req.do_update = '1' or r1.req.store = '1' then + v.wr_sel := "01"; + elsif r1.req.load_sp = '1' then + v.wr_sel := "10"; + else + v.wr_sel := "11"; + end if; + + -- Work out load formatter controls for next cycle + for i in 0 to 7 loop + idx := to_unsigned(i, 3) xor r1.req.brev_mask; + kk := ('0' & idx) + ('0' & byte_offset); + v.use_second(i) := kk(3); + v.byte_index(i) := kk(2 downto 0); + end loop; + elsif stage3_busy_next = '0' then + v.req.valid := '0'; + v.wait_dc := '0'; + v.wait_mmu := '0'; + end if; + + stage2_busy_next <= r1.req.valid and stage3_busy_next; + + if r3in.interrupt = '1' then + v.req.valid := '0'; + end if; + + r2in <= v; + end process; + + -- Processing done in the third cycle of a load/store instruction. 
+ -- At this stage we can do things that have side effects without + -- fear of the instruction getting flushed. This is the point at + -- which requests get sent to the MMU. + loadstore1_3: process(all) + variable v : reg_stage3_t; + variable j : integer; + variable req : std_ulogic; + variable mmureq : std_ulogic; + variable mmu_mtspr : std_ulogic; + variable write_enable : std_ulogic; + variable write_data : std_ulogic_vector(63 downto 0); + variable do_update : std_ulogic; + variable done : std_ulogic; + variable part_done : std_ulogic; + variable exception : std_ulogic; + variable data_permuted : std_ulogic_vector(63 downto 0); + variable data_trimmed : std_ulogic_vector(63 downto 0); + variable sprval : std_ulogic_vector(63 downto 0); + variable negative : std_ulogic; + variable dsisr : std_ulogic_vector(31 downto 0); + variable itlb_fault : std_ulogic; + variable trim_ctl : trim_ctl_t; + begin + v := r3; + req := '0'; + mmureq := '0'; mmu_mtspr := '0'; - itlb_fault := '0'; - sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); + done := '0'; + part_done := '0'; + exception := '0'; dsisr := (others => '0'); - mmureq := '0'; - v.wr_sel := "11"; - write_enable := '0'; - - do_update := r.do_update; - v.do_update := '0'; + sprval := (others => '0'); + do_update := '0'; + v.convert_lfs := '0'; + v.srr1 := (others => '0'); -- load data formatting -- shift and byte-reverse data bytes for i in 0 to 7 loop - j := to_integer(r.byte_index(i)) * 8; + j := to_integer(r2.byte_index(i)) * 8; data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j); end loop; @@ -341,29 +677,39 @@ begin -- For unaligned loads crossing two dwords, the sign bit is in the -- first dword for big-endian (byte_reverse = 1), or the second dword -- for little-endian. 
- if r.dwords_done = '1' and r.byte_reverse = '1' then - negative := (r.length(3) and r.load_data(63)) or - (r.length(2) and r.load_data(31)) or - (r.length(1) and r.load_data(15)) or - (r.length(0) and r.load_data(7)); + if r2.req.dword_index = '1' and r2.req.byte_reverse = '1' then + negative := (r2.req.length(3) and r3.load_data(63)) or + (r2.req.length(2) and r3.load_data(31)) or + (r2.req.length(1) and r3.load_data(15)) or + (r2.req.length(0) and r3.load_data(7)); else - negative := (r.length(3) and data_permuted(63)) or - (r.length(2) and data_permuted(31)) or - (r.length(1) and data_permuted(15)) or - (r.length(0) and data_permuted(7)); + negative := (r2.req.length(3) and data_permuted(63)) or + (r2.req.length(2) and data_permuted(31)) or + (r2.req.length(1) and data_permuted(15)) or + (r2.req.length(0) and data_permuted(7)); end if; -- trim and sign-extend for i in 0 to 7 loop - case r.trim_ctl(i) is + if i < to_integer(unsigned(r2.req.length)) then + if r2.req.dword_index = '1' then + trim_ctl(i) := '1' & not r2.use_second(i); + else + trim_ctl(i) := "10"; + end if; + else + trim_ctl(i) := "00"; + end if; + end loop; + + for i in 0 to 7 loop + case trim_ctl(i) is when "11" => - data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8); + data_trimmed(i * 8 + 7 downto i * 8) := r3.load_data(i * 8 + 7 downto i * 8); when "10" => data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8); - when "01" => - data_trimmed(i * 8 + 7 downto i * 8) := (others => negative); when others => - data_trimmed(i * 8 + 7 downto i * 8) := x"00"; + data_trimmed(i * 8 + 7 downto i * 8) := (others => negative and r2.req.sign_extend); end case; end loop; @@ -374,63 +720,62 @@ begin v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0)); end if; - -- Byte reversing and rotating for stores. - -- Done in the second cycle (the cycle after l_in.valid = 1). 
- for i in 0 to 7 loop - k := (to_unsigned(i, 3) - r.byte_offset) xor r.brev_mask; - j := to_integer(k) * 8; - store_data(i * 8 + 7 downto i * 8) := r.store_data(j + 7 downto j); - end loop; - - -- compute (addr + 8) & ~7 for the second doubleword when unaligned - next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000"; - - -- Busy calculation. - -- We need to minimize the delay from clock to busy valid because it - -- gates the start of execution of the next instruction. - busy := r.busy and not ((r.wait_dcache and d_in.valid) or (r.wait_mmu and m_in.done)); - v.busy := busy; - - done := '0'; - if r.state /= IDLE and busy = '0' then - done := '1'; + if d_in.valid = '1' and r2.req.load = '1' then + v.load_data := data_permuted; end if; - exception := '0'; - if r.dwords_done = '1' or r.state = SECOND_REQ then - addr := next_addr; - byte_sel := r.second_bytes; - else - addr := r.addr; - byte_sel := r.first_bytes; - end if; - if r.mode_32bit = '1' then - addr(63 downto 32) := (others => '0'); + if r2.req.valid = '1' then + if r2.req.read_spr = '1' then + write_enable := '1'; + -- partial decode on SPR number should be adequate given + -- the restricted set that get sent down this path + if r2.req.sprn(9) = '0' and r2.req.sprn(5) = '0' then + if r2.req.sprn(0) = '0' then + sprval := x"00000000" & r3.dsisr; + else + sprval := r3.dar; + end if; + else + -- reading one of the SPRs in the MMU + sprval := m_in.sprval; + end if; + end if; + if r2.req.align_intr = '1' then + -- generate alignment interrupt + exception := '1'; + end if; + if r2.req.load_zero = '1' then + write_enable := '1'; + end if; + if r2.req.do_update = '1' then + do_update := '1'; + end if; end if; - maddr := addr; - case r.state is + case r3.state is when IDLE => - - when SECOND_REQ => - req := '1'; - v.state := ACK_WAIT; - v.last_dword := '0'; - - when ACK_WAIT => - -- r.wr_sel gets set one cycle after we come into ACK_WAIT state, - -- which is OK because the dcache always takes at 
least two cycles. - if r.update = '1' and r.load = '0' then - v.wr_sel := "01"; + if d_in.valid = '1' then + if r2.req.two_dwords = '0' or r2.req.dword_index = '1' then + write_enable := r2.req.load and not r2.req.load_sp; + if HAS_FPU and r2.req.load_sp = '1' then + -- SP to DP conversion takes a cycle + v.state := FINISH_LFS; + v.convert_lfs := '1'; + else + -- stores write back rA update + do_update := r2.req.update and r2.req.store; + end if; + else + part_done := '1'; + end if; end if; if d_in.error = '1' then - -- dcache will discard the second request if it - -- gets an error on the 1st of two requests if d_in.cache_paradox = '1' then -- signal an interrupt straight away exception := '1'; - dsisr(63 - 38) := not r.load; + dsisr(63 - 38) := not r2.req.load; -- XXX there is no architected bit for this + -- (probably should be a machine check in fact) dsisr(63 - 35) := d_in.cache_paradox; else -- Look up the translation for TLB miss @@ -438,49 +783,42 @@ begin -- in case the PTE has been updated. 
mmureq := '1'; v.state := MMU_LOOKUP; + v.stage1_en := '0'; end if; end if; - if d_in.valid = '1' then - if r.last_dword = '0' then - v.dwords_done := '1'; - v.last_dword := '1'; - if r.load = '1' then - v.load_data := data_permuted; + if r2.req.valid = '1' then + if r2.req.mmu_op = '1' then + -- send request (tlbie, mtspr, itlb miss) to MMU + mmureq := not r2.req.write_spr; + mmu_mtspr := r2.req.write_spr; + if r2.req.instr_fault = '1' then + v.state := MMU_LOOKUP; + else + v.state := TLBIE_WAIT; end if; - else - write_enable := r.load and not r.load_sp; - if HAS_FPU and r.load_sp = '1' then - -- SP to DP conversion takes a cycle - v.wr_sel := "10"; - v.state := FINISH_LFS; - elsif r.load = '0' then - -- stores write back rA update in this cycle - do_update := r.update; + elsif r2.req.write_spr = '1' then + if r2.req.sprn(0) = '0' then + v.dsisr := r2.req.store_data(31 downto 0); + else + v.dar := r2.req.store_data; end if; - v.busy := '0'; end if; end if; - -- r.wait_dcache gets set one cycle after we come into ACK_WAIT state, - -- which is OK because the dcache always takes at least two cycles. 
- v.wait_dcache := r.last_dword and not r.extra_cycle; when MMU_LOOKUP => if m_in.done = '1' then - if r.instr_fault = '0' then + if r2.req.instr_fault = '0' then -- retry the request now that the MMU has installed a TLB entry req := '1'; - if r.last_dword = '0' then - v.state := SECOND_REQ; - else - v.state := ACK_WAIT; - end if; + v.stage1_en := '1'; + v.state := IDLE; end if; end if; if m_in.err = '1' then exception := '1'; dsisr(63 - 33) := m_in.invalid; dsisr(63 - 36) := m_in.perm_error; - dsisr(63 - 38) := not r.load; + dsisr(63 - 38) := r2.req.store or r2.req.dcbz; dsisr(63 - 44) := m_in.badtree; dsisr(63 - 45) := m_in.rc_error; end if; @@ -488,231 +826,25 @@ begin when TLBIE_WAIT => when FINISH_LFS => - - when COMPLETE => - exception := r.align_intr; + write_enable := '1'; end case; - if done = '1' or exception = '1' then + if complete = '1' or exception = '1' then + v.stage1_en := '1'; v.state := IDLE; - v.busy := '0'; end if; - -- Note that l_in.valid is gated with busy inside execute1 - if l_in.valid = '1' then - v.mode_32bit := l_in.mode_32bit; - v.load := '0'; - v.dcbz := '0'; - v.tlbie := '0'; - v.instr_fault := '0'; - v.align_intr := '0'; - v.dwords_done := '0'; - v.last_dword := '1'; - v.instr_tag := l_in.instr_tag; - v.write_reg := l_in.write_reg; - v.length := l_in.length; - v.byte_reverse := l_in.byte_reverse; - v.sign_extend := l_in.sign_extend; - v.update := l_in.update; - v.xerc := l_in.xerc; - v.reserve := l_in.reserve; - v.rc := l_in.rc; - v.nc := l_in.ci; - v.virt_mode := l_in.virt_mode; - v.priv_mode := l_in.priv_mode; - v.load_sp := '0'; - v.wait_dcache := '0'; - v.wait_mmu := '0'; - v.extra_cycle := '0'; - v.nia := l_in.nia; - v.srr1 := (others => '0'); - - if HAS_FPU and l_in.is_32bit = '1' then - v.store_data := x"00000000" & store_sp_data; - else - v.store_data := l_in.data; - end if; - - addr := lsu_sum; - if l_in.second = '1' then - -- second half of load with update does the update - if l_in.op = OP_LOAD and l_in.update = '1' then 
- v.do_update := '1'; - else - -- for the second half of a 16-byte transfer, use next_addr - addr := next_addr; - end if; - end if; - if l_in.mode_32bit = '1' then - addr(63 downto 32) := (others => '0'); - end if; - if v.do_update = '0' then - -- preserve previous r.addr for load with update - v.addr := addr; - end if; - maddr := l_in.addr2; -- address from RB for tlbie - - -- XXX Temporary hack. Mark the op as non-cachable if the address - -- is the form 0xc------- for a real-mode access. - if addr(31 downto 28) = "1100" and l_in.virt_mode = '0' then - v.nc := '1'; - end if; - - if l_in.second = '0' then - -- Do length_to_sel and work out if we are doing 2 dwords - long_sel := xfer_data_sel(l_in.length, lsu_sum(2 downto 0)); - byte_sel := long_sel(7 downto 0); - v.first_bytes := byte_sel; - v.second_bytes := long_sel(15 downto 8); - else - byte_sel := r.first_bytes; - long_sel := r.second_bytes & r.first_bytes; - end if; - - -- check alignment for larx/stcx - misaligned := or (std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1) and addr(2 downto 0)); - v.align_intr := l_in.reserve and misaligned; - if l_in.repeat = '1' and l_in.second = '0' and l_in.update = '0' and addr(3) = '1' then - -- length is really 16 not 8 - -- Make misaligned lq cause an alignment interrupt in LE mode, - -- in order to avoid the case with RA = RT + 1 where the second half - -- faults but the first doesn't (and updates RT+1, destroying RA). - -- The equivalent BE case doesn't occur because RA = RT is illegal. 
- misaligned := '1'; - if l_in.reserve = '1' or (l_in.op = OP_LOAD and l_in.byte_reverse = '0') then - v.align_intr := '1'; - end if; - end if; - - v.atomic := not misaligned; - v.atomic_last := not misaligned and (l_in.second or not l_in.repeat); - - case l_in.op is - when OP_STORE => - req := '1'; - when OP_LOAD => - v.load := '1'; - if l_in.second = '1' and l_in.update = '1' then - v.wr_sel := "01"; - v.state := COMPLETE; - else - req := '1'; - if HAS_FPU and l_in.is_32bit = '1' then - -- Allow an extra cycle for SP->DP precision conversion - v.load_sp := '1'; - v.extra_cycle := '1'; - end if; - end if; - when OP_DCBZ => - v.align_intr := v.nc; - req := '1'; - v.dcbz := '1'; - when OP_TLBIE => - mmureq := '1'; - v.tlbie := '1'; - v.state := TLBIE_WAIT; - v.wait_mmu := '1'; - when OP_MFSPR => - v.wr_sel := "00"; - -- partial decode on SPR number should be adequate given - -- the restricted set that get sent down this path - if sprn(9) = '0' and sprn(5) = '0' then - if sprn(0) = '0' then - v.sprval := x"00000000" & r.dsisr; - else - v.sprval := r.dar; - end if; - else - -- reading one of the SPRs in the MMU - v.sprval := m_in.sprval; - end if; - v.state := COMPLETE; - when OP_MTSPR => - if sprn(9) = '0' and sprn(5) = '0' then - if sprn(0) = '0' then - v.dsisr := l_in.data(31 downto 0); - else - v.dar := l_in.data; - end if; - v.state := COMPLETE; - else - -- writing one of the SPRs in the MMU - mmu_mtspr := '1'; - v.state := TLBIE_WAIT; - v.wait_mmu := '1'; - end if; - when OP_FETCH_FAILED => - -- send it to the MMU to do the radix walk - maddr := l_in.nia; - v.instr_fault := '1'; - mmureq := '1'; - v.state := MMU_LOOKUP; - v.wait_mmu := '1'; - when others => - assert false report "unknown op sent to loadstore1"; - end case; - - if req = '1' then - if v.align_intr = '1' then - v.state := COMPLETE; - elsif long_sel(15 downto 8) = "00000000" then - v.state := ACK_WAIT; - else - v.state := SECOND_REQ; - end if; - end if; - - v.busy := req or mmureq or mmu_mtspr; - 
end if; - - -- Work out controls for store formatting - if l_in.valid = '1' then - byte_offset := unsigned(lsu_sum(2 downto 0)); - byte_rev := l_in.byte_reverse; - length := l_in.length; - brev_lenm1 := "000"; - if byte_rev = '1' then - brev_lenm1 := unsigned(length(2 downto 0)) - 1; - end if; - v.byte_offset := byte_offset; - v.brev_mask := brev_lenm1; - end if; - - -- Work out load formatter controls for next cycle - byte_offset := unsigned(v.addr(2 downto 0)); - brev_lenm1 := "000"; - if v.byte_reverse = '1' then - brev_lenm1 := unsigned(v.length(2 downto 0)) - 1; - end if; - - for i in 0 to 7 loop - kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); - v.use_second(i) := kk(3); - v.byte_index(i) := kk(2 downto 0); - end loop; - - for i in 0 to 7 loop - if i < to_integer(unsigned(v.length)) then - if v.dwords_done = '1' then - v.trim_ctl(i) := '1' & not v.use_second(i); - else - v.trim_ctl(i) := "10"; - end if; - else - v.trim_ctl(i) := '0' & v.sign_extend; - end if; - end loop; - -- generate DSI or DSegI for load/store exceptions -- or ISI or ISegI for instruction fetch exceptions v.interrupt := exception; if exception = '1' then - if r.align_intr = '1' then + v.nia := r2.req.nia; + if r2.req.align_intr = '1' then v.intr_vec := 16#600#; - v.dar := addr; - elsif r.instr_fault = '0' then - v.dar := addr; + v.dar := r2.req.addr; + elsif r2.req.instr_fault = '0' then + v.dar := r2.req.addr; if m_in.segerr = '0' then v.intr_vec := 16#300#; v.dsisr := dsisr; @@ -732,66 +864,88 @@ begin end if; end if; + case r2.wr_sel is + when "00" => + -- mfspr result + write_data := sprval; + when "01" => + -- update reg + write_data := r2.req.addr0; + when "10" => + -- lfs result + write_data := load_dp_data; + when others => + -- load data + write_data := data_trimmed; + end case; + -- Update outputs to dcache - d_out.valid <= req and not v.align_intr; - d_out.load <= v.load; - d_out.dcbz <= v.dcbz; - d_out.nc <= v.nc; - d_out.reserve <= v.reserve; - 
d_out.atomic <= v.atomic; - d_out.atomic_last <= v.atomic_last; - d_out.addr <= addr; - d_out.data <= store_data; - d_out.byte_sel <= byte_sel; - d_out.virt_mode <= v.virt_mode; - d_out.priv_mode <= v.priv_mode; - d_out.hold <= '0'; + if stage1_issue_enable = '1' then + d_out.valid <= stage1_dcreq; + d_out.load <= stage1_req.load; + d_out.dcbz <= stage1_req.dcbz; + d_out.nc <= stage1_req.nc; + d_out.reserve <= stage1_req.reserve; + d_out.atomic <= stage1_req.atomic; + d_out.atomic_last <= stage1_req.atomic_last; + d_out.addr <= stage1_req.addr; + d_out.byte_sel <= stage1_req.byte_sel; + d_out.virt_mode <= stage1_req.virt_mode; + d_out.priv_mode <= stage1_req.priv_mode; + else + d_out.valid <= req; + d_out.load <= r2.req.load; + d_out.dcbz <= r2.req.dcbz; + d_out.nc <= r2.req.nc; + d_out.reserve <= r2.req.reserve; + d_out.atomic <= r2.req.atomic; + d_out.atomic_last <= r2.req.atomic_last; + d_out.addr <= r2.req.addr; + d_out.byte_sel <= r2.req.byte_sel; + d_out.virt_mode <= r2.req.virt_mode; + d_out.priv_mode <= r2.req.priv_mode; + end if; + if stage1_dreq = '1' then + d_out.data <= store_data; + else + d_out.data <= r2.req.store_data; + end if; + d_out.hold <= r2.req.valid and r2.req.load_sp and d_in.valid; -- Update outputs to MMU m_out.valid <= mmureq; - m_out.iside <= v.instr_fault; - m_out.load <= r.load; - m_out.priv <= r.priv_mode; - m_out.tlbie <= v.tlbie; + m_out.iside <= r2.req.instr_fault; + m_out.load <= r2.req.load; + m_out.priv <= r2.req.priv_mode; + m_out.tlbie <= r2.req.tlbie; m_out.mtspr <= mmu_mtspr; - m_out.sprn <= sprn; - m_out.addr <= maddr; - m_out.slbia <= l_in.insn(7); - m_out.rs <= l_in.data; + m_out.sprn <= r2.req.sprn; + m_out.addr <= r2.req.addr; + m_out.slbia <= r2.req.is_slbia; + m_out.rs <= r2.req.store_data; -- Update outputs to writeback - -- Multiplex either cache data to the destination GPR or - -- the address for the rA update. 
- l_out.valid <= done; - l_out.instr_tag <= r.instr_tag; - l_out.write_reg <= r.write_reg; - case r.wr_sel is - when "00" => - l_out.write_enable <= '1'; - l_out.write_data <= r.sprval; - when "01" => - l_out.write_enable <= do_update; - l_out.write_data <= r.addr; - when "10" => - l_out.write_enable <= '1'; - l_out.write_data <= load_dp_data; - when others => - l_out.write_enable <= write_enable; - l_out.write_data <= data_trimmed; - end case; - l_out.xerc <= r.xerc; - l_out.rc <= r.rc and done; + l_out.valid <= complete; + l_out.instr_tag <= r2.req.instr_tag; + l_out.write_enable <= write_enable or do_update; + l_out.write_reg <= r2.req.write_reg; + l_out.write_data <= write_data; + l_out.xerc <= r2.req.xerc; + l_out.rc <= r2.req.rc and complete; l_out.store_done <= d_in.store_done; - l_out.interrupt <= r.interrupt; - l_out.intr_vec <= r.intr_vec; - l_out.srr0 <= r.nia; - l_out.srr1 <= r.srr1; + l_out.interrupt <= r3.interrupt; + l_out.intr_vec <= r3.intr_vec; + l_out.srr0 <= r3.nia; + l_out.srr1 <= r3.srr1; -- update busy signal back to execute1 e_out.busy <= busy; + -- Busy calculation. + stage3_busy_next <= r2.req.valid and not (complete or part_done or exception); + -- Update registers - rin <= v; + r3in <= v; end process; @@ -807,8 +961,8 @@ begin m_out.valid & d_out.valid & m_in.done & - r.dwords_done & - std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3)); + r2.req.dword_index & + std_ulogic_vector(to_unsigned(state_t'pos(r3.state), 3)); end if; end process; log_out <= log_data; From 17fd069640c240054db07746543252c89322407f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 28 Dec 2020 15:15:30 +1100 Subject: [PATCH 13/13] core: Allow multiple loadstore instructions to be in flight The idea here is that we can have multiple instructions in progress at the same time as long as they all go to the same unit, because that unit will keep them in order. 
If we get an instruction for a different unit, we wait for all the previous instructions to finish before executing it. Since the loadstore unit is the only one that is currently pipelined, this boils down to saying that loadstore instructions can go ahead while l_in.in_progress = 1 but other instructions have to wait until it is 0. This gives a 2% increase on coremark performance on the Arty A7-100 (from ~190 to ~194). Signed-off-by: Paul Mackerras --- common.vhdl | 1 + control.vhdl | 6 +++++- decode2.vhdl | 3 +-- execute1.vhdl | 7 +++++-- loadstore1.vhdl | 12 +++++++----- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/common.vhdl b/common.vhdl index 35f782b..69dde30 100644 --- a/common.vhdl +++ b/common.vhdl @@ -368,6 +368,7 @@ package common is type Loadstore1ToExecute1Type is record busy : std_ulogic; + in_progress : std_ulogic; end record; type Loadstore1ToDcacheType is record diff --git a/control.vhdl b/control.vhdl index f14e350..34c35e2 100644 --- a/control.vhdl +++ b/control.vhdl @@ -7,7 +7,7 @@ use work.common.all; entity control is generic ( EX1_BYPASS : boolean := true; - PIPELINE_DEPTH : natural := 2 + PIPELINE_DEPTH : natural := 3 ); port ( clk : in std_ulogic; @@ -239,6 +239,10 @@ begin elsif complete_in.valid = '1' then v_int.outstanding := r_int.outstanding - 1; end if; + if r_int.outstanding >= PIPELINE_DEPTH + 1 then + valid_tmp := '0'; + stall_tmp := '1'; + end if; if rst = '1' then v_int := reg_internal_init; diff --git a/decode2.vhdl b/decode2.vhdl index 732cfe0..f9fa541 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -300,8 +300,7 @@ architecture behaviour of decode2 is begin control_0: entity work.control generic map ( - EX1_BYPASS => EX1_BYPASS, - PIPELINE_DEPTH => 1 + EX1_BYPASS => EX1_BYPASS ) port map ( clk => clk, diff --git a/execute1.vhdl b/execute1.vhdl index 0eaf55a..c0434a0 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -293,7 +293,10 @@ begin -- writeback. 
xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc; - busy_out <= l_in.busy or r.busy or fp_in.busy; + with e_in.unit select busy_out <= + l_in.busy or r.busy or fp_in.busy when LDST, + l_in.busy or l_in.in_progress or r.busy or fp_in.busy when others; + valid_in <= e_in.valid and not busy_out and not flush_in; terminate_out <= r.terminate; @@ -744,7 +747,7 @@ begin -- Determine if there is any exception to be taken -- before/instead of executing this instruction - if valid_in = '1' and e_in.second = '0' then + if valid_in = '1' and e_in.second = '0' and l_in.in_progress = '0' then if HAS_FPU and r.fp_exception_next = '1' then -- This is used for FP-type program interrupts that -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 66700e8..ee4507b 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -155,6 +155,7 @@ architecture behave of loadstore1 is signal busy : std_ulogic; signal complete : std_ulogic; + signal in_progress : std_ulogic; signal flushing : std_ulogic; signal store_sp_data : std_ulogic_vector(31 downto 0); @@ -494,13 +495,13 @@ begin req_in <= v; end process; - --busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or - -- (r1.issued and d_in.error) or - -- stage2_busy_next or - -- (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index)); + busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or + (r1.issued and d_in.error) or + stage2_busy_next or + (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index)); complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or (r2.wait_mmu and m_in.done) or r3.convert_lfs; - busy <= r1.req.valid or (r2.req.valid and not complete); + in_progress <= r1.req.valid or (r2.req.valid and not complete); stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and not (r2.req.valid and r2.req.mmu_op); @@ -940,6 +941,7 @@ begin -- update busy signal back to execute1 
e_out.busy <= busy; + e_out.in_progress <= in_progress; -- Busy calculation. stage3_busy_next <= r2.req.valid and not (complete or part_done or exception);