|
|
|
library ieee;
|
|
|
|
use ieee.std_logic_1164.all;
|
|
|
|
use ieee.numeric_std.all;
|
|
|
|
|
|
|
|
library work;
|
|
|
|
use work.utils.all;
|
|
|
|
use work.decode_types.all;
|
|
|
|
|
|
|
|
package common is
|
|
|
|
-- Processor Version Number
|
|
|
|
constant PVR_MICROWATT : std_ulogic_vector(31 downto 0) := x"00630000";
|
|
|
|
|
execute1: Improve architecture compliance of MSR and related instructions
This makes our treatment of the MSR conform better with the ISA.
- On reset, initialize the MSR to have the SF and LE bits set and
all the others reset. For good measure initialize r properly too.
- Fix the bit numbering in msr_copy (the code was using big-endian
bit numbers, not little-endian).
- Use constants like MSR_EE to index MSR bits instead of expressions
like '63 - 48', for readability.
- Set MSR[SF, LE] and clear MSR[PR, IR, DR, RI] on interrupts.
- Copy the relevant fields for rfid instead of using msr_copy, because
the partial function fields of the MSR should be left unchanged,
not zeroed. Our implementation of rfid is like the architecture
description of hrfid, because we don't implement hypervisor mode.
- Return the whole MSR for mfmsr.
- Implement the L field for mtmsrd (L=1 copies just EE and RI).
- For mtmsrd with L=0, leave out the HV, ME and LE bits as per the arch.
- For mtmsrd and rfid, if PR ends up set, then also set EE, IR and DR
as per the arch.
- A few other minor tidyups (no semantic change).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
-- MSR bit numbers
|
|
|
|
constant MSR_SF : integer := (63 - 0); -- Sixty-Four bit mode
|
|
|
|
constant MSR_HV : integer := (63 - 3); -- Hypervisor mode (always 1)
|
execute1: Improve architecture compliance of MSR and related instructions
This makes our treatment of the MSR conform better with the ISA.
- On reset, initialize the MSR to have the SF and LE bits set and
all the others reset. For good measure initialize r properly too.
- Fix the bit numbering in msr_copy (the code was using big-endian
bit numbers, not little-endian).
- Use constants like MSR_EE to index MSR bits instead of expressions
like '63 - 48', for readability.
- Set MSR[SF, LE] and clear MSR[PR, IR, DR, RI] on interrupts.
- Copy the relevant fields for rfid instead of using msr_copy, because
the partial function fields of the MSR should be left unchanged,
not zeroed. Our implementation of rfid is like the architecture
description of hrfid, because we don't implement hypervisor mode.
- Return the whole MSR for mfmsr.
- Implement the L field for mtmsrd (L=1 copies just EE and RI).
- For mtmsrd with L=0, leave out the HV, ME and LE bits as per the arch.
- For mtmsrd and rfid, if PR ends up set, then also set EE, IR and DR
as per the arch.
- A few other minor tidyups (no semantic change).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
constant MSR_EE : integer := (63 - 48); -- External interrupt Enable
|
|
|
|
constant MSR_PR : integer := (63 - 49); -- PRoblem state
|
|
|
|
constant MSR_FP : integer := (63 - 50); -- Floating Point available
|
|
|
|
constant MSR_FE0 : integer := (63 - 52); -- Floating Exception mode
|
|
|
|
constant MSR_SE : integer := (63 - 53); -- Single-step bit of TE field
|
|
|
|
constant MSR_BE : integer := (63 - 54); -- Branch trace bit of TE field
|
|
|
|
constant MSR_FE1 : integer := (63 - 55); -- Floating Exception mode
|
execute1: Improve architecture compliance of MSR and related instructions
This makes our treatment of the MSR conform better with the ISA.
- On reset, initialize the MSR to have the SF and LE bits set and
all the others reset. For good measure initialize r properly too.
- Fix the bit numbering in msr_copy (the code was using big-endian
bit numbers, not little-endian).
- Use constants like MSR_EE to index MSR bits instead of expressions
like '63 - 48', for readability.
- Set MSR[SF, LE] and clear MSR[PR, IR, DR, RI] on interrupts.
- Copy the relevant fields for rfid instead of using msr_copy, because
the partial function fields of the MSR should be left unchanged,
not zeroed. Our implementation of rfid is like the architecture
description of hrfid, because we don't implement hypervisor mode.
- Return the whole MSR for mfmsr.
- Implement the L field for mtmsrd (L=1 copies just EE and RI).
- For mtmsrd with L=0, leave out the HV, ME and LE bits as per the arch.
- For mtmsrd and rfid, if PR ends up set, then also set EE, IR and DR
as per the arch.
- A few other minor tidyups (no semantic change).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
constant MSR_IR : integer := (63 - 58); -- Instruction Relocation
|
|
|
|
constant MSR_DR : integer := (63 - 59); -- Data Relocation
|
|
|
|
constant MSR_PMM : integer := (63 - 61); -- Performance Monitor Mark
|
execute1: Improve architecture compliance of MSR and related instructions
This makes our treatment of the MSR conform better with the ISA.
- On reset, initialize the MSR to have the SF and LE bits set and
all the others reset. For good measure initialize r properly too.
- Fix the bit numbering in msr_copy (the code was using big-endian
bit numbers, not little-endian).
- Use constants like MSR_EE to index MSR bits instead of expressions
like '63 - 48', for readability.
- Set MSR[SF, LE] and clear MSR[PR, IR, DR, RI] on interrupts.
- Copy the relevant fields for rfid instead of using msr_copy, because
the partial function fields of the MSR should be left unchanged,
not zeroed. Our implementation of rfid is like the architecture
description of hrfid, because we don't implement hypervisor mode.
- Return the whole MSR for mfmsr.
- Implement the L field for mtmsrd (L=1 copies just EE and RI).
- For mtmsrd with L=0, leave out the HV, ME and LE bits as per the arch.
- For mtmsrd and rfid, if PR ends up set, then also set EE, IR and DR
as per the arch.
- A few other minor tidyups (no semantic change).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
constant MSR_RI : integer := (63 - 62); -- Recoverable Interrupt
|
|
|
|
constant MSR_LE : integer := (63 - 63); -- Little Endian
|
|
|
|
|
|
|
|
-- SPR numbers
|
|
|
|
subtype spr_num_t is integer range 0 to 1023;
|
|
|
|
|
|
|
|
function decode_spr_num(insn: std_ulogic_vector(31 downto 0)) return spr_num_t;
|
|
|
|
|
|
|
|
constant SPR_XER : spr_num_t := 1;
|
|
|
|
constant SPR_LR : spr_num_t := 8;
|
|
|
|
constant SPR_CTR : spr_num_t := 9;
|
|
|
|
constant SPR_TAR : spr_num_t := 815;
|
|
|
|
constant SPR_DSISR : spr_num_t := 18;
|
|
|
|
constant SPR_DAR : spr_num_t := 19;
|
|
|
|
constant SPR_TB : spr_num_t := 268;
|
|
|
|
constant SPR_TBU : spr_num_t := 269;
|
|
|
|
constant SPR_DEC : spr_num_t := 22;
|
|
|
|
constant SPR_SRR0 : spr_num_t := 26;
|
|
|
|
constant SPR_SRR1 : spr_num_t := 27;
|
|
|
|
constant SPR_CFAR : spr_num_t := 28;
|
|
|
|
constant SPR_HSRR0 : spr_num_t := 314;
|
|
|
|
constant SPR_HSRR1 : spr_num_t := 315;
|
|
|
|
constant SPR_SPRG0 : spr_num_t := 272;
|
|
|
|
constant SPR_SPRG1 : spr_num_t := 273;
|
|
|
|
constant SPR_SPRG2 : spr_num_t := 274;
|
|
|
|
constant SPR_SPRG3 : spr_num_t := 275;
|
|
|
|
constant SPR_SPRG3U : spr_num_t := 259;
|
|
|
|
constant SPR_HSPRG0 : spr_num_t := 304;
|
|
|
|
constant SPR_HSPRG1 : spr_num_t := 305;
|
|
|
|
constant SPR_PID : spr_num_t := 48;
|
|
|
|
constant SPR_PTCR : spr_num_t := 464;
|
|
|
|
constant SPR_PVR : spr_num_t := 287;
|
|
|
|
constant SPR_FSCR : spr_num_t := 153;
|
|
|
|
constant SPR_HFSCR : spr_num_t := 190;
|
|
|
|
constant SPR_HEIR : spr_num_t := 339;
|
|
|
|
constant SPR_CTRL : spr_num_t := 136;
|
|
|
|
constant SPR_CTRLW : spr_num_t := 152;
|
|
|
|
constant SPR_UDSCR : spr_num_t := 3;
|
|
|
|
constant SPR_DSCR : spr_num_t := 17;
|
|
|
|
constant SPR_VRSAVE : spr_num_t := 256;
|
|
|
|
|
|
|
|
-- PMU registers
|
|
|
|
constant SPR_UPMC1 : spr_num_t := 771;
|
|
|
|
constant SPR_UPMC2 : spr_num_t := 772;
|
|
|
|
constant SPR_UPMC3 : spr_num_t := 773;
|
|
|
|
constant SPR_UPMC4 : spr_num_t := 774;
|
|
|
|
constant SPR_UPMC5 : spr_num_t := 775;
|
|
|
|
constant SPR_UPMC6 : spr_num_t := 776;
|
|
|
|
constant SPR_UMMCR0 : spr_num_t := 779;
|
|
|
|
constant SPR_UMMCR1 : spr_num_t := 782;
|
|
|
|
constant SPR_UMMCR2 : spr_num_t := 769;
|
|
|
|
constant SPR_UMMCRA : spr_num_t := 770;
|
|
|
|
constant SPR_USIER : spr_num_t := 768;
|
|
|
|
constant SPR_USIAR : spr_num_t := 780;
|
|
|
|
constant SPR_USDAR : spr_num_t := 781;
|
|
|
|
constant SPR_PMC1 : spr_num_t := 787;
|
|
|
|
constant SPR_PMC2 : spr_num_t := 788;
|
|
|
|
constant SPR_PMC3 : spr_num_t := 789;
|
|
|
|
constant SPR_PMC4 : spr_num_t := 790;
|
|
|
|
constant SPR_PMC5 : spr_num_t := 791;
|
|
|
|
constant SPR_PMC6 : spr_num_t := 792;
|
|
|
|
constant SPR_MMCR0 : spr_num_t := 795;
|
|
|
|
constant SPR_MMCR1 : spr_num_t := 798;
|
|
|
|
constant SPR_MMCR2 : spr_num_t := 785;
|
|
|
|
constant SPR_MMCRA : spr_num_t := 786;
|
|
|
|
constant SPR_SIER : spr_num_t := 784;
|
|
|
|
constant SPR_SIAR : spr_num_t := 796;
|
|
|
|
constant SPR_SDAR : spr_num_t := 797;
|
|
|
|
|
|
|
|
-- GPR indices in the register file (GPR only)
|
|
|
|
subtype gpr_index_t is std_ulogic_vector(4 downto 0);
|
|
|
|
|
|
|
|
-- Extended GPR index (can hold a GPR or a FPR)
|
|
|
|
subtype gspr_index_t is std_ulogic_vector(5 downto 0);
|
|
|
|
|
|
|
|
-- FPR indices
|
|
|
|
subtype fpr_index_t is std_ulogic_vector(4 downto 0);
|
|
|
|
|
|
|
|
-- FPRs are stored in the register file, using GSPR
|
|
|
|
-- numbers from 32 to 63.
|
|
|
|
--
|
|
|
|
|
|
|
|
-- Indices conversion functions
|
|
|
|
function gspr_to_gpr(i: gspr_index_t) return gpr_index_t;
|
|
|
|
function gpr_to_gspr(i: gpr_index_t) return gspr_index_t;
|
|
|
|
function fpr_to_gspr(f: fpr_index_t) return gspr_index_t;
|
|
|
|
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
-- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are
|
|
|
|
-- in the CR file as a kind of CR extension (with a separate write
|
|
|
|
-- control). The rest is stored in ctrl_t (effectively in execute1).
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
type xer_common_t is record
|
|
|
|
ca : std_ulogic;
|
|
|
|
ca32 : std_ulogic;
|
|
|
|
ov : std_ulogic;
|
|
|
|
ov32 : std_ulogic;
|
|
|
|
so : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant xerc_init : xer_common_t := (others => '0');
|
|
|
|
|
|
|
|
-- Some SPRs are stored in a pair of small RAMs in execute1
|
|
|
|
-- Even half:
|
|
|
|
subtype ramspr_index_range is natural range 0 to 7;
|
|
|
|
subtype ramspr_index is unsigned(2 downto 0);
|
|
|
|
constant RAMSPR_SRR0 : ramspr_index := to_unsigned(0,3);
|
|
|
|
constant RAMSPR_HSRR0 : ramspr_index := to_unsigned(1,3);
|
|
|
|
constant RAMSPR_SPRG0 : ramspr_index := to_unsigned(2,3);
|
|
|
|
constant RAMSPR_SPRG2 : ramspr_index := to_unsigned(3,3);
|
|
|
|
constant RAMSPR_HSPRG0 : ramspr_index := to_unsigned(4,3);
|
|
|
|
constant RAMSPR_LR : ramspr_index := to_unsigned(5,3); -- must equal RAMSPR_CTR
|
|
|
|
constant RAMSPR_TAR : ramspr_index := to_unsigned(6,3);
|
|
|
|
-- Odd half:
|
|
|
|
constant RAMSPR_SRR1 : ramspr_index := to_unsigned(0,3);
|
|
|
|
constant RAMSPR_HSRR1 : ramspr_index := to_unsigned(1,3);
|
|
|
|
constant RAMSPR_SPRG1 : ramspr_index := to_unsigned(2,3);
|
|
|
|
constant RAMSPR_SPRG3 : ramspr_index := to_unsigned(3,3);
|
|
|
|
constant RAMSPR_HSPRG1 : ramspr_index := to_unsigned(4,3);
|
|
|
|
constant RAMSPR_CTR : ramspr_index := to_unsigned(5,3); -- must equal RAMSPR_LR
|
|
|
|
constant RAMSPR_VRSAVE : ramspr_index := to_unsigned(6,3);
|
|
|
|
|
|
|
|
type ram_spr_info is record
|
|
|
|
index : ramspr_index;
|
|
|
|
isodd : std_ulogic;
|
|
|
|
is32b : std_ulogic;
|
|
|
|
valid : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant ram_spr_info_init: ram_spr_info := (index => to_unsigned(0,3), others => '0');
|
|
|
|
|
|
|
|
subtype spr_selector is std_ulogic_vector(3 downto 0);
|
|
|
|
type spr_id is record
|
|
|
|
sel : spr_selector;
|
|
|
|
valid : std_ulogic;
|
|
|
|
ispmu : std_ulogic;
|
|
|
|
ronly : std_ulogic;
|
|
|
|
wonly : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant spr_id_init : spr_id := (sel => "0000", others => '0');
|
|
|
|
|
|
|
|
constant SPRSEL_TB : spr_selector := 4x"0";
|
|
|
|
constant SPRSEL_TBU : spr_selector := 4x"1";
|
|
|
|
constant SPRSEL_DEC : spr_selector := 4x"2";
|
|
|
|
constant SPRSEL_PVR : spr_selector := 4x"3";
|
|
|
|
constant SPRSEL_LOGA : spr_selector := 4x"4";
|
|
|
|
constant SPRSEL_LOGD : spr_selector := 4x"5";
|
|
|
|
constant SPRSEL_CFAR : spr_selector := 4x"6";
|
|
|
|
constant SPRSEL_FSCR : spr_selector := 4x"7";
|
|
|
|
constant SPRSEL_HFSCR : spr_selector := 4x"8";
|
|
|
|
constant SPRSEL_HEIR : spr_selector := 4x"9";
|
|
|
|
constant SPRSEL_CTRL : spr_selector := 4x"a";
|
|
|
|
constant SPRSEL_DSCR : spr_selector := 4x"b";
|
|
|
|
constant SPRSEL_XER : spr_selector := 4x"f";
|
|
|
|
|
|
|
|
-- FSCR and HFSCR bit numbers
|
|
|
|
constant FSCR_PREFIX : integer := 63 - 50;
|
|
|
|
constant FSCR_SCV : integer := 63 - 51;
|
|
|
|
constant FSCR_TAR : integer := 63 - 55;
|
|
|
|
constant FSCR_DSCR : integer := 63 - 61;
|
|
|
|
constant HFSCR_PREFIX : integer := 63 - 50;
|
|
|
|
constant HFSCR_MSG : integer := 63 - 53;
|
|
|
|
constant HFSCR_TAR : integer := 63 - 55;
|
|
|
|
constant HFSCR_PMUSPR : integer := 63 - 60;
|
|
|
|
constant HFSCR_DSCR : integer := 63 - 61;
|
|
|
|
constant HFSCR_FP : integer := 63 - 63;
|
|
|
|
|
|
|
|
-- FPSCR bit numbers
|
|
|
|
constant FPSCR_FX : integer := 63 - 32;
|
|
|
|
constant FPSCR_FEX : integer := 63 - 33;
|
|
|
|
constant FPSCR_VX : integer := 63 - 34;
|
|
|
|
constant FPSCR_OX : integer := 63 - 35;
|
|
|
|
constant FPSCR_UX : integer := 63 - 36;
|
|
|
|
constant FPSCR_ZX : integer := 63 - 37;
|
|
|
|
constant FPSCR_XX : integer := 63 - 38;
|
|
|
|
constant FPSCR_VXSNAN : integer := 63 - 39;
|
|
|
|
constant FPSCR_VXISI : integer := 63 - 40;
|
|
|
|
constant FPSCR_VXIDI : integer := 63 - 41;
|
|
|
|
constant FPSCR_VXZDZ : integer := 63 - 42;
|
|
|
|
constant FPSCR_VXIMZ : integer := 63 - 43;
|
|
|
|
constant FPSCR_VXVC : integer := 63 - 44;
|
|
|
|
constant FPSCR_FR : integer := 63 - 45;
|
|
|
|
constant FPSCR_FI : integer := 63 - 46;
|
|
|
|
constant FPSCR_C : integer := 63 - 47;
|
|
|
|
constant FPSCR_FL : integer := 63 - 48;
|
|
|
|
constant FPSCR_FG : integer := 63 - 49;
|
|
|
|
constant FPSCR_FE : integer := 63 - 50;
|
|
|
|
constant FPSCR_FU : integer := 63 - 51;
|
|
|
|
constant FPSCR_VXSOFT : integer := 63 - 53;
|
|
|
|
constant FPSCR_VXSQRT : integer := 63 - 54;
|
|
|
|
constant FPSCR_VXCVI : integer := 63 - 55;
|
|
|
|
constant FPSCR_VE : integer := 63 - 56;
|
|
|
|
constant FPSCR_OE : integer := 63 - 57;
|
|
|
|
constant FPSCR_UE : integer := 63 - 58;
|
|
|
|
constant FPSCR_ZE : integer := 63 - 59;
|
|
|
|
constant FPSCR_XE : integer := 63 - 60;
|
|
|
|
constant FPSCR_NI : integer := 63 - 61;
|
|
|
|
constant FPSCR_RN : integer := 63 - 63;
|
|
|
|
|
|
|
|
-- Real addresses
|
|
|
|
-- REAL_ADDR_BITS is the number of real address bits that we store
|
|
|
|
constant REAL_ADDR_BITS : positive := 56;
|
|
|
|
subtype real_addr_t is std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
|
|
|
|
function addr_to_real(addr: std_ulogic_vector(63 downto 0)) return real_addr_t;
|
|
|
|
|
Move iTLB from icache to fetch1
This moves the address translation step for instruction fetches one
cycle earlier, so that it now happens in the fetch1 stage. There is
now a 2-entry mini translation cache ("ERAT", or effective to real
address translation cache) which operates on the output of the
multiplexer that selects the instruction address for the next cycle.
The ERAT consists of two effective address registers and two
corresponding real address registers. They store the page number part
of the addresses for a 4kB page size, which is the smallest page size
supported by the architecture.
If the effective address doesn't match either of the EA registers, and
address translation is enabled, then i_out.req goes low for two cycles
while the iTLB is looked up. Experimentally, this delay results in a
0.1% drop in coremark performance; allowing two cycles for the lookup
results in better timing. The result from the iTLB is placed into the
least recently used ERAT entry and then used to translate the address
as normal. If address translation is not enabled then the EA is used
directly as the real address.
The iTLB structure is the same as it was before; direct mapped,
indexed using a hashed EA.
The "fetch failed" signal, which indicates a TLB miss or protection
violation, is now generated in fetch1 and passed through icache.
When it is asserted, fetch1 goes into a stalled state until a PTE
arrives from the MMU (which gets put into both the iTLB and the ERAT),
or an interrupt or redirect occurs.
Any TLB invalidations from the MMU invalidate the whole ERAT.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
1 year ago
|
|
|
-- Minimum page size
|
|
|
|
constant MIN_LG_PGSZ : positive := 12;
|
|
|
|
constant MIN_PAGESZ : positive := 2 ** MIN_LG_PGSZ;
|
|
|
|
|
|
|
|
-- Used for tracking instruction completion and pending register writes
|
|
|
|
constant TAG_COUNT : positive := 4;
|
|
|
|
constant TAG_NUMBER_BITS : natural := log2(TAG_COUNT);
|
|
|
|
subtype tag_number_t is integer range 0 to TAG_COUNT - 1;
|
|
|
|
subtype tag_index_t is unsigned(TAG_NUMBER_BITS - 1 downto 0);
|
|
|
|
type instr_tag_t is record
|
|
|
|
tag : tag_number_t;
|
|
|
|
valid : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant instr_tag_init : instr_tag_t := (tag => 0, valid => '0');
|
|
|
|
function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean;
|
|
|
|
|
|
|
|
subtype intr_vector_t is integer range 0 to 16#fff#;
|
|
|
|
|
|
|
|
-- For now, fixed 16 sources, make this either a parametric
|
|
|
|
-- package of some sort or an unconstrainted array.
|
|
|
|
type ics_to_icp_t is record
|
|
|
|
-- Level interrupts only, ICS just keeps prsenting the
|
|
|
|
-- highest priority interrupt. Once handling edge, something
|
|
|
|
-- smarter involving handshake & reject support will be needed
|
|
|
|
src : std_ulogic_vector(3 downto 0);
|
|
|
|
pri : std_ulogic_vector(7 downto 0);
|
|
|
|
end record;
|
|
|
|
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
-- This needs to die...
|
|
|
|
type ctrl_t is record
|
|
|
|
wait_state: std_ulogic;
|
|
|
|
run: std_ulogic;
|
|
|
|
tb: std_ulogic_vector(63 downto 0);
|
|
|
|
dec: std_ulogic_vector(63 downto 0);
|
|
|
|
msr: std_ulogic_vector(63 downto 0);
|
|
|
|
cfar: std_ulogic_vector(63 downto 0);
|
|
|
|
xer_low: std_ulogic_vector(17 downto 0);
|
|
|
|
fscr_ic: std_ulogic_vector(3 downto 0);
|
|
|
|
fscr_pref: std_ulogic;
|
|
|
|
fscr_scv: std_ulogic;
|
|
|
|
fscr_tar: std_ulogic;
|
|
|
|
fscr_dscr: std_ulogic;
|
|
|
|
hfscr_ic: std_ulogic_vector(3 downto 0);
|
|
|
|
hfscr_pref: std_ulogic;
|
|
|
|
hfscr_tar: std_ulogic;
|
|
|
|
hfscr_dscr: std_ulogic;
|
|
|
|
hfscr_fp: std_ulogic;
|
|
|
|
heir: std_ulogic_vector(63 downto 0);
|
|
|
|
dscr: std_ulogic_vector(24 downto 0);
|
|
|
|
end record;
|
|
|
|
constant ctrl_t_init : ctrl_t :=
|
|
|
|
(wait_state => '0', run => '1', xer_low => 18x"0",
|
|
|
|
fscr_ic => x"0", fscr_pref => '1', fscr_scv => '1', fscr_tar => '1', fscr_dscr => '1',
|
|
|
|
hfscr_ic => x"0", hfscr_pref => '1', hfscr_tar => '1', hfscr_dscr => '1', hfscr_fp => '1',
|
|
|
|
dscr => (others => '0'),
|
|
|
|
others => (others => '0'));
|
|
|
|
|
|
|
|
type Fetch1ToIcacheType is record
|
|
|
|
req: std_ulogic;
|
Move iTLB from icache to fetch1
This moves the address translation step for instruction fetches one
cycle earlier, so that it now happens in the fetch1 stage. There is
now a 2-entry mini translation cache ("ERAT", or effective to real
address translation cache) which operates on the output of the
multiplexer that selects the instruction address for the next cycle.
The ERAT consists of two effective address registers and two
corresponding real address registers. They store the page number part
of the addresses for a 4kB page size, which is the smallest page size
supported by the architecture.
If the effective address doesn't match either of the EA registers, and
address translation is enabled, then i_out.req goes low for two cycles
while the iTLB is looked up. Experimentally, this delay results in a
0.1% drop in coremark performance; allowing two cycles for the lookup
results in better timing. The result from the iTLB is placed into the
least recently used ERAT entry and then used to translate the address
as normal. If address translation is not enabled then the EA is used
directly as the real address.
The iTLB structure is the same as it was before; direct mapped,
indexed using a hashed EA.
The "fetch failed" signal, which indicates a TLB miss or protection
violation, is now generated in fetch1 and passed through icache.
When it is asserted, fetch1 goes into a stalled state until a PTE
arrives from the MMU (which gets put into both the iTLB and the ERAT),
or an interrupt or redirect occurs.
Any TLB invalidations from the MMU invalidate the whole ERAT.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
1 year ago
|
|
|
fetch_fail : std_ulogic;
|
Add TLB to icache
This adds a direct-mapped TLB to the icache, with 64 entries by default.
Execute1 now sends a "virt_mode" signal from MSR[IR] to fetch1 along
with redirects to indicate whether instruction addresses should be
translated through the TLB, and fetch1 sends that on to icache.
Similarly a "priv_mode" signal is sent to indicate the privilege
mode for instruction fetches. This means that changes to MSR[IR]
or MSR[PR] don't take effect until the next redirect, meaning an
isync, rfid, branch, etc.
The icache uses a hash of the effective address (i.e. next instruction
address) to index the TLB. The hash is an XOR of three fields of the
address; with a 64-entry TLB, the fields are bits 12--17, 18--23 and
24--29 of the address. TLB invalidations simply invalidate the
indexed TLB entry without checking the contents.
If the icache detects a TLB miss with virt_mode=1, it will send a
fetch_failed indication through fetch2 to decode1, which will turn it
into a special OP_FETCH_FAILED opcode with unit=LDST. That will get
sent down to loadstore1 which will currently just raise a Instruction
Storage Interrupt (0x400) exception.
One bit in the PTE obtained from the TLB is used to check whether an
instruction access is allowed -- the privilege bit (bit 3). If bit 3
is 1 and priv_mode=0, then a fetch_failed indication is sent down to
fetch2 and to decode1, which generates an OP_FETCH_FAILED. Any PTEs
with PTE bit 0 (EAA[3]) clear or bit 8 (R) clear should not be put
into the iTLB since such PTEs would not allow execution by any
context.
Tlbie operations get sent from mmu to icache over a new connection.
Unfortunately the privileged instruction tests are broken for now.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
virt_mode : std_ulogic;
|
|
|
|
priv_mode : std_ulogic;
|
|
|
|
big_endian : std_ulogic;
|
|
|
|
stop_mark: std_ulogic;
|
fetch1: Implement a simple branch target cache
This implements a cache in fetch1, where each entry stores the address
of a simple branch instruction (b or bc) and the target of the branch.
When fetching sequentially, if the address being fetched matches the
cache entry, then fetching will be redirected to the branch target.
The cache has 1024 entries and is direct-mapped, i.e. indexed by bits
11..2 of the NIA.
The bus from execute1 now carries information about taken and
not-taken simple branches, which fetch1 uses to update the cache.
The cache entry is updated for both taken and not-taken branches, with
the valid bit being set if the branch was taken and cleared if the
branch was not taken.
If fetching is redirected to the branch target then that goes down the
pipe as a predicted-taken branch, and decode1 does not do any static
branch prediction. If fetching is not redirected, then the next
instruction goes down the pipe as normal and decode1 does its static
branch prediction.
In order to make timing, the lookup of the cache is pipelined, so on
each cycle the cache entry for the current NIA + 8 is read. This
means that after a redirect (from decode1 or execute1), only the third
and subsequent sequentially-fetched instructions will be able to be
predicted.
This improves the coremark value on the Arty A7-100 from about 180 to
about 190 (more than 5%).
The BTC is optional. Builds for the Artix 7 35-T part have it off by
default because the extra ~1420 LUTs it takes mean that the design
doesn't fit on the Arty A7-35 board.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
predicted : std_ulogic;
|
|
|
|
pred_ntaken : std_ulogic;
|
|
|
|
nia: std_ulogic_vector(63 downto 0);
|
|
|
|
next_nia: std_ulogic_vector(63 downto 0);
|
Move iTLB from icache to fetch1
This moves the address translation step for instruction fetches one
cycle earlier, so that it now happens in the fetch1 stage. There is
now a 2-entry mini translation cache ("ERAT", or effective to real
address translation cache) which operates on the output of the
multiplexer that selects the instruction address for the next cycle.
The ERAT consists of two effective address registers and two
corresponding real address registers. They store the page number part
of the addresses for a 4kB page size, which is the smallest page size
supported by the architecture.
If the effective address doesn't match either of the EA registers, and
address translation is enabled, then i_out.req goes low for two cycles
while the iTLB is looked up. Experimentally, this delay results in a
0.1% drop in coremark performance; allowing two cycles for the lookup
results in better timing. The result from the iTLB is placed into the
least recently used ERAT entry and then used to translate the address
as normal. If address translation is not enabled then the EA is used
directly as the real address.
The iTLB structure is the same as it was before; direct mapped,
indexed using a hashed EA.
The "fetch failed" signal, which indicates a TLB miss or protection
violation, is now generated in fetch1 and passed through icache.
When it is asserted, fetch1 goes into a stalled state until a PTE
arrives from the MMU (which gets put into both the iTLB and the ERAT),
or an interrupt or redirect occurs.
Any TLB invalidations from the MMU invalidate the whole ERAT.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
1 year ago
|
|
|
rpn: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0);
|
|
|
|
next_rpn: std_ulogic_vector(REAL_ADDR_BITS - MIN_LG_PGSZ - 1 downto 0);
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type IcacheToDecode1Type is record
|
|
|
|
valid: std_ulogic;
|
|
|
|
stop_mark: std_ulogic;
|
Add TLB to icache
This adds a direct-mapped TLB to the icache, with 64 entries by default.
Execute1 now sends a "virt_mode" signal from MSR[IR] to fetch1 along
with redirects to indicate whether instruction addresses should be
translated through the TLB, and fetch1 sends that on to icache.
Similarly a "priv_mode" signal is sent to indicate the privilege
mode for instruction fetches. This means that changes to MSR[IR]
or MSR[PR] don't take effect until the next redirect, meaning an
isync, rfid, branch, etc.
The icache uses a hash of the effective address (i.e. next instruction
address) to index the TLB. The hash is an XOR of three fields of the
address; with a 64-entry TLB, the fields are bits 12--17, 18--23 and
24--29 of the address. TLB invalidations simply invalidate the
indexed TLB entry without checking the contents.
If the icache detects a TLB miss with virt_mode=1, it will send a
fetch_failed indication through fetch2 to decode1, which will turn it
into a special OP_FETCH_FAILED opcode with unit=LDST. That will get
sent down to loadstore1 which will currently just raise a Instruction
Storage Interrupt (0x400) exception.
One bit in the PTE obtained from the TLB is used to check whether an
instruction access is allowed -- the privilege bit (bit 3). If bit 3
is 1 and priv_mode=0, then a fetch_failed indication is sent down to
fetch2 and to decode1, which generates an OP_FETCH_FAILED. Any PTEs
with PTE bit 0 (EAA[3]) clear or bit 8 (R) clear should not be put
into the iTLB since such PTEs would not allow execution by any
context.
Tlbie operations get sent from mmu to icache over a new connection.
Unfortunately the privileged instruction tests are broken for now.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
fetch_failed: std_ulogic;
|
|
|
|
nia: std_ulogic_vector(63 downto 0);
|
|
|
|
insn: std_ulogic_vector(31 downto 0);
|
|
|
|
icode: insn_code;
|
core: Implement quadword loads and stores
This implements the lq, stq, lqarx and stqcx. instructions.
These instructions all access two consecutive GPRs; for example the
"lq %r6,0(%r3)" instruction will load the doubleword at the address
in R3 into R7 and the doubleword at address R3 + 8 into R6. To cope
with having two GPR sources or destinations, the instruction gets
repeated at the decode2 stage, that is, for each lq/stq/lqarx/stqcx.
coming in from decode1, two instructions get sent out to execute1.
For these instructions, the RS or RT register gets modified on one
of the iterations by setting the LSB of the register number. In LE
mode, the first iteration uses RS|1 or RT|1 and the second iteration
uses RS or RT. In BE mode, this is done the other way around. In
order for decode2 to know what endianness is currently in use, we
pass the big_endian flag down from icache through decode1 to decode2.
This is always in sync with what execute1 is using because only rfid
or an interrupt can change MSR[LE], and those operations all cause
a flush and redirect.
There is now an extra column in the decode tables in decode1 to
indicate whether the instruction needs to be repeated. Decode1 also
enforces the rule that lq with RT = RT and lqarx with RA = RT or
RB = RT are illegal.
Decode2 now passes a 'repeat' flag and a 'second' flag to execute1,
and execute1 passes them on to loadstore1. The 'repeat' flag is set
for both iterations of a repeated instruction, and 'second' is set
on the second iteration. Execute1 does not take asynchronous or
trace interrupts on the second iteration of a repeated instruction.
Loadstore1 uses 'next_addr' for the second iteration of a repeated
load/store so that we access the second doubleword of the memory
operand. Thus loadstore1 accesses the doublewords in increasing
memory order. For 16-byte loads this means that the first iteration
writes GPR RT|1. It is possible that RA = RT|1 (this is a legal
but non-preferred form), meaning that if the memory operand was
misaligned, the first iteration would overwrite RA but then the
second iteration might take a page fault, leading to corrupted state.
To avoid that possibility, 16-byte loads in LE mode take an
alignment interrupt if the operand is not 16-byte aligned. (This
is the case anyway for lqarx, and we enforce it for lq as well.)
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
big_endian: std_ulogic;
|
fetch1: Implement a simple branch target cache
This implements a cache in fetch1, where each entry stores the address
of a simple branch instruction (b or bc) and the target of the branch.
When fetching sequentially, if the address being fetched matches the
cache entry, then fetching will be redirected to the branch target.
The cache has 1024 entries and is direct-mapped, i.e. indexed by bits
11..2 of the NIA.
The bus from execute1 now carries information about taken and
not-taken simple branches, which fetch1 uses to update the cache.
The cache entry is updated for both taken and not-taken branches, with
the valid bit being set if the branch was taken and cleared if the
branch was not taken.
If fetching is redirected to the branch target then that goes down the
pipe as a predicted-taken branch, and decode1 does not do any static
branch prediction. If fetching is not redirected, then the next
instruction goes down the pipe as normal and decode1 does its static
branch prediction.
In order to make timing, the lookup of the cache is pipelined, so on
each cycle the cache entry for the current NIA + 8 is read. This
means that after a redirect (from decode1 or execute1), only the third
and subsequent sequentially-fetched instructions will be able to be
predicted.
This improves the coremark value on the Arty A7-100 from about 180 to
about 190 (more than 5%).
The BTC is optional. Builds for the Artix 7 35-T part have it off by
default because the extra ~1420 LUTs it takes mean that the design
doesn't fit on the Arty A7-35 board.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
next_predicted: std_ulogic;
|
|
|
|
next_pred_ntaken: std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant IcacheToDecode1Init : IcacheToDecode1Type :=
|
|
|
|
(nia => (others => '0'), insn => (others => '0'), icode => INSN_illegal, others => '0');
|
|
|
|
|
|
|
|
type IcacheEventType is record
|
|
|
|
icache_miss : std_ulogic;
|
|
|
|
itlb_miss_resolved : std_ulogic;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type Decode1ToDecode2Type is record
|
|
|
|
valid: std_ulogic;
|
|
|
|
stop_mark : std_ulogic;
|
|
|
|
nia: std_ulogic_vector(63 downto 0);
|
Decode prefixed instructions
This adds logic to do basic decoding of the prefixed instructions
defined in PowerISA v3.1B which are in the SFFS (Scalar Fixed plus
Floating-Point Subset) compliancy subset. In PowerISA v3.1B SFFS,
there are 14 prefixed load/store instructions plus the prefixed no-op
instruction (pnop). The prefixed load/store instructions all use an
extended version of D-form, which has an extra 18 bits of displacement
in the prefix, plus an 'R' bit which enables PC-relative addressing.
When decode1 sees an instruction word where the insn_code is
INSN_prefix (i.e. the primary opcode was 1), it stores the prefix word
and sends nothing down to decode2 in that cycle. When the next valid
instruction word arrives, it is interpreted as a suffix, meaning that
its insn_code gets modified before being used to look up the decode
table.
The insn_code values are rearranged so that the values for
instructions which are the suffix of a valid prefixed instruction are
all at even indexes, and the corresponding prefixed instructions
follow immediately, so that an insn_code value can be converted to the
corresponding prefixed value by setting the LSB of the insn_code
value. There are two prefixed instructions, pld and pstd, for which
the suffix is not a valid SFFS instruction by itself, so these have
been given dummy insn_code values which decode as illegal (INSN_op57
and INSN_op61).
For a prefixed instruction, decode1 examines the type and subtype
fields of the prefix and checks that the suffix is valid for the type
and subtype. This check doesn't affect which entry of the decode
table is used; the result is passed down to decode2, and will in
future be acted upon in execute1.
The instruction address passed down to decode2 is the address of the
prefix. To enable this, part of the instruction address is saved when
the prefix is seen, and then the instruction address received from
icache is partly overlaid by the saved prefix address. Because
prefixed instructions are not permitted to cross 64-byte boundaries,
we only need to save bits 5:2 of the instruction to do this. If the
alignment restriction ever gets relaxed, we will then need to save
more bits of the address.
Decode2 has been extended to handle the R bit of the prefix (in 8LS
and MLS forms) and to be able to generate the 34-bit immediate value
from the prefix and suffix.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2 years ago
|
|
|
prefixed: std_ulogic;
|
|
|
|
prefix: std_ulogic_vector(25 downto 0);
|
|
|
|
illegal_suffix: std_ulogic;
|
Implement interrupts for prefixed instructions
This arranges to generate an illegal instruction type program
interrupt for illegal prefixed instructions, that is, those where the
suffix is not a legal value given the prefix, or the prefix has a
reserved value in the subtype field. This implementation doesn't
generate an interrupt for the invalid 8LS:D and MLS:D instruction
forms where R = 1 and RA != 0. (In those cases it uses (RA) as the
addend, i.e. it ignores the R bit.)
This detects the case where the address of an instruction prefix is
equal mod 64 to 60, and generates an alignment interrupt in that case.
This also arranges to set bit 34 of SRR1 when an interrupt occurs due
to a prefixed instruction, for those interrupts where that is required
(i.e. trace, alignment, floating-point unavailable, data storage, data
segment, and most cases of program interrupt).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2 years ago
|
|
|
misaligned_prefix: std_ulogic;
|
|
|
|
insn: std_ulogic_vector(31 downto 0);
|
|
|
|
decode: decode_rom_t;
|
|
|
|
br_pred: std_ulogic; -- Branch was predicted to be taken
|
core: Implement quadword loads and stores
This implements the lq, stq, lqarx and stqcx. instructions.
These instructions all access two consecutive GPRs; for example the
"lq %r6,0(%r3)" instruction will load the doubleword at the address
in R3 into R7 and the doubleword at address R3 + 8 into R6. To cope
with having two GPR sources or destinations, the instruction gets
repeated at the decode2 stage, that is, for each lq/stq/lqarx/stqcx.
coming in from decode1, two instructions get sent out to execute1.
For these instructions, the RS or RT register gets modified on one
of the iterations by setting the LSB of the register number. In LE
mode, the first iteration uses RS|1 or RT|1 and the second iteration
uses RS or RT. In BE mode, this is done the other way around. In
order for decode2 to know what endianness is currently in use, we
pass the big_endian flag down from icache through decode1 to decode2.
This is always in sync with what execute1 is using because only rfid
or an interrupt can change MSR[LE], and those operations all cause
a flush and redirect.
There is now an extra column in the decode tables in decode1 to
indicate whether the instruction needs to be repeated. Decode1 also
enforces the rule that lq with RT = RT and lqarx with RA = RT or
RB = RT are illegal.
Decode2 now passes a 'repeat' flag and a 'second' flag to execute1,
and execute1 passes them on to loadstore1. The 'repeat' flag is set
for both iterations of a repeated instruction, and 'second' is set
on the second iteration. Execute1 does not take asynchronous or
trace interrupts on the second iteration of a repeated instruction.
Loadstore1 uses 'next_addr' for the second iteration of a repeated
load/store so that we access the second doubleword of the memory
operand. Thus loadstore1 accesses the doublewords in increasing
memory order. For 16-byte loads this means that the first iteration
writes GPR RT|1. It is possible that RA = RT|1 (this is a legal
but non-preferred form), meaning that if the memory operand was
misaligned, the first iteration would overwrite RA but then the
second iteration might take a page fault, leading to corrupted state.
To avoid that possibility, 16-byte loads in LE mode take an
alignment interrupt if the operand is not 16-byte aligned. (This
is the case anyway for lqarx, and we enforce it for lq as well.)
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
big_endian: std_ulogic;
|
|
|
|
spr_info : spr_id;
|
|
|
|
ram_spr : ram_spr_info;
|
|
|
|
reg_a : gspr_index_t;
|
|
|
|
reg_b : gspr_index_t;
|
|
|
|
reg_c : gspr_index_t;
|
|
|
|
end record;
|
|
|
|
constant Decode1ToDecode2Init : Decode1ToDecode2Type :=
|
Decode prefixed instructions
This adds logic to do basic decoding of the prefixed instructions
defined in PowerISA v3.1B which are in the SFFS (Scalar Fixed plus
Floating-Point Subset) compliancy subset. In PowerISA v3.1B SFFS,
there are 14 prefixed load/store instructions plus the prefixed no-op
instruction (pnop). The prefixed load/store instructions all use an
extended version of D-form, which has an extra 18 bits of displacement
in the prefix, plus an 'R' bit which enables PC-relative addressing.
When decode1 sees an instruction word where the insn_code is
INSN_prefix (i.e. the primary opcode was 1), it stores the prefix word
and sends nothing down to decode2 in that cycle. When the next valid
instruction word arrives, it is interpreted as a suffix, meaning that
its insn_code gets modified before being used to look up the decode
table.
The insn_code values are rearranged so that the values for
instructions which are the suffix of a valid prefixed instruction are
all at even indexes, and the corresponding prefixed instructions
follow immediately, so that an insn_code value can be converted to the
corresponding prefixed value by setting the LSB of the insn_code
value. There are two prefixed instructions, pld and pstd, for which
the suffix is not a valid SFFS instruction by itself, so these have
been given dummy insn_code values which decode as illegal (INSN_op57
and INSN_op61).
For a prefixed instruction, decode1 examines the type and subtype
fields of the prefix and checks that the suffix is valid for the type
and subtype. This check doesn't affect which entry of the decode
table is used; the result is passed down to decode2, and will in
future be acted upon in execute1.
The instruction address passed down to decode2 is the address of the
prefix. To enable this, part of the instruction address is saved when
the prefix is seen, and then the instruction address received from
icache is partly overlaid by the saved prefix address. Because
prefixed instructions are not permitted to cross 64-byte boundaries,
we only need to save bits 5:2 of the instruction to do this. If the
alignment restriction ever gets relaxed, we will then need to save
more bits of the address.
Decode2 has been extended to handle the R bit of the prefix (in 8LS
and MLS forms) and to be able to generate the 34-bit immediate value
from the prefix and suffix.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2 years ago
|
|
|
(valid => '0', stop_mark => '0', nia => (others => '0'),
|
|
|
|
prefixed => '0', prefix => (others => '0'), insn => (others => '0'),
|
Implement interrupts for prefixed instructions
This arranges to generate an illegal instruction type program
interrupt for illegal prefixed instructions, that is, those where the
suffix is not a legal value given the prefix, or the prefix has a
reserved value in the subtype field. This implementation doesn't
generate an interrupt for the invalid 8LS:D and MLS:D instruction
forms where R = 1 and RA != 0. (In those cases it uses (RA) as the
addend, i.e. it ignores the R bit.)
This detects the case where the address of an instruction prefix is
equal mod 64 to 60, and generates an alignment interrupt in that case.
This also arranges to set bit 34 of SRR1 when an interrupt occurs due
to a prefixed instruction, for those interrupts where that is required
(i.e. trace, alignment, floating-point unavailable, data storage, data
segment, and most cases of program interrupt).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2 years ago
|
|
|
illegal_suffix => '0', misaligned_prefix => '0',
|
|
|
|
decode => decode_rom_init, br_pred => '0', big_endian => '0',
|
|
|
|
spr_info => spr_id_init, ram_spr => ram_spr_info_init,
|
|
|
|
reg_a => (others => '0'), reg_b => (others => '0'), reg_c => (others => '0'));
|
|
|
|
|
|
|
|
type Decode1ToFetch1Type is record
|
|
|
|
redirect : std_ulogic;
|
|
|
|
redirect_nia : std_ulogic_vector(63 downto 0);
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type Decode1ToRegisterFileType is record
|
|
|
|
reg_1_addr : gspr_index_t;
|
|
|
|
reg_2_addr : gspr_index_t;
|
|
|
|
reg_3_addr : gspr_index_t;
|
|
|
|
read_1_enable : std_ulogic;
|
|
|
|
read_2_enable : std_ulogic;
|
|
|
|
read_3_enable : std_ulogic;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type bypass_data_t is record
|
|
|
|
tag : instr_tag_t;
|
|
|
|
data : std_ulogic_vector(63 downto 0);
|
|
|
|
end record;
|
|
|
|
constant bypass_data_init : bypass_data_t := (tag => instr_tag_init, data => (others => '0'));
|
|
|
|
|
|
|
|
type cr_bypass_data_t is record
|
|
|
|
tag : instr_tag_t;
|
|
|
|
data : std_ulogic_vector(31 downto 0);
|
|
|
|
end record;
|
|
|
|
constant cr_bypass_data_init : cr_bypass_data_t := (tag => instr_tag_init, data => (others => '0'));
|
|
|
|
|
|
|
|
type Decode2ToExecute1Type is record
|
|
|
|
valid: std_ulogic;
|
|
|
|
unit : unit_t;
|
|
|
|
fac : facility_t;
|
|
|
|
insn_type: insn_type_t;
|
|
|
|
nia: std_ulogic_vector(63 downto 0);
|
|
|
|
instr_tag : instr_tag_t;
|
|
|
|
write_reg: gspr_index_t;
|
|
|
|
write_reg_enable: std_ulogic;
|
|
|
|
read_reg1: gspr_index_t;
|
|
|
|
read_reg2: gspr_index_t;
|
|
|
|
read_reg3: gspr_index_t;
|
|
|
|
read_data1: std_ulogic_vector(63 downto 0);
|
|
|
|
read_data2: std_ulogic_vector(63 downto 0);
|
|
|
|
read_data3: std_ulogic_vector(63 downto 0);
|
|
|
|
reg_valid1: std_ulogic;
|
|
|
|
reg_valid2: std_ulogic;
|
|
|
|
reg_valid3: std_ulogic;
|
|
|
|
cr: std_ulogic_vector(31 downto 0);
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
xerc: xer_common_t;
|
|
|
|
lr: std_ulogic;
|
|
|
|
br_abs: std_ulogic;
|
|
|
|
rc: std_ulogic;
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
oe: std_ulogic;
|
|
|
|
invert_a: std_ulogic;
|
|
|
|
invert_out: std_ulogic;
|
|
|
|
input_carry: carry_in_t;
|
|
|
|
output_carry: std_ulogic;
|
|
|
|
input_cr: std_ulogic;
|
|
|
|
output_cr: std_ulogic;
|
|
|
|
output_xer: std_ulogic;
|
|
|
|
is_32bit: std_ulogic;
|
|
|
|
is_signed: std_ulogic;
|
|
|
|
insn: std_ulogic_vector(31 downto 0);
|
|
|
|
data_len: std_ulogic_vector(3 downto 0);
|
|
|
|
byte_reverse : std_ulogic;
|
|
|
|
sign_extend : std_ulogic; -- do we need to sign extend?
|
|
|
|
update : std_ulogic; -- is this an update instruction?
|
|
|
|
reserve : std_ulogic; -- set for larx/stcx
|
|
|
|
br_pred : std_ulogic;
|
|
|
|
result_sel : std_ulogic_vector(2 downto 0); -- select source of result
|
|
|
|
sub_select : std_ulogic_vector(2 downto 0); -- sub-result selection
|
core: Implement quadword loads and stores
This implements the lq, stq, lqarx and stqcx. instructions.
These instructions all access two consecutive GPRs; for example the
"lq %r6,0(%r3)" instruction will load the doubleword at the address
in R3 into R7 and the doubleword at address R3 + 8 into R6. To cope
with having two GPR sources or destinations, the instruction gets
repeated at the decode2 stage, that is, for each lq/stq/lqarx/stqcx.
coming in from decode1, two instructions get sent out to execute1.
For these instructions, the RS or RT register gets modified on one
of the iterations by setting the LSB of the register number. In LE
mode, the first iteration uses RS|1 or RT|1 and the second iteration
uses RS or RT. In BE mode, this is done the other way around. In
order for decode2 to know what endianness is currently in use, we
pass the big_endian flag down from icache through decode1 to decode2.
This is always in sync with what execute1 is using because only rfid
or an interrupt can change MSR[LE], and those operations all cause
a flush and redirect.
There is now an extra column in the decode tables in decode1 to
indicate whether the instruction needs to be repeated. Decode1 also
enforces the rule that lq with RT = RT and lqarx with RA = RT or
RB = RT are illegal.
Decode2 now passes a 'repeat' flag and a 'second' flag to execute1,
and execute1 passes them on to loadstore1. The 'repeat' flag is set
for both iterations of a repeated instruction, and 'second' is set
on the second iteration. Execute1 does not take asynchronous or
trace interrupts on the second iteration of a repeated instruction.
Loadstore1 uses 'next_addr' for the second iteration of a repeated
load/store so that we access the second doubleword of the memory
operand. Thus loadstore1 accesses the doublewords in increasing
memory order. For 16-byte loads this means that the first iteration
writes GPR RT|1. It is possible that RA = RT|1 (this is a legal
but non-preferred form), meaning that if the memory operand was
misaligned, the first iteration would overwrite RA but then the
second iteration might take a page fault, leading to corrupted state.
To avoid that possibility, 16-byte loads in LE mode take an
alignment interrupt if the operand is not 16-byte aligned. (This
is the case anyway for lqarx, and we enforce it for lq as well.)
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
repeat : std_ulogic; -- set if instruction is cracked into two ops
|
|
|
|
second : std_ulogic; -- set if this is the second op
|
|
|
|
spr_select : spr_id;
|
|
|
|
spr_is_ram : std_ulogic;
|
|
|
|
ramspr_even_rdaddr : ramspr_index;
|
|
|
|
ramspr_odd_rdaddr : ramspr_index;
|
|
|
|
ramspr_rd_odd : std_ulogic;
|
|
|
|
ramspr_wraddr : ramspr_index;
|
|
|
|
ramspr_write_even : std_ulogic;
|
|
|
|
ramspr_write_odd : std_ulogic;
|
|
|
|
ramspr_32bit : std_ulogic;
|
|
|
|
dbg_spr_access : std_ulogic;
|
|
|
|
dec_ctr : std_ulogic;
|
Implement interrupts for prefixed instructions
This arranges to generate an illegal instruction type program
interrupt for illegal prefixed instructions, that is, those where the
suffix is not a legal value given the prefix, or the prefix has a
reserved value in the subtype field. This implementation doesn't
generate an interrupt for the invalid 8LS:D and MLS:D instruction
forms where R = 1 and RA != 0. (In those cases it uses (RA) as the
addend, i.e. it ignores the R bit.)
This detects the case where the address of an instruction prefix is
equal mod 64 to 60, and generates an alignment interrupt in that case.
This also arranges to set bit 34 of SRR1 when an interrupt occurs due
to a prefixed instruction, for those interrupts where that is required
(i.e. trace, alignment, floating-point unavailable, data storage, data
segment, and most cases of program interrupt).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2 years ago
|
|
|
prefixed : std_ulogic;
|
|
|
|
prefix : std_ulogic_vector(25 downto 0);
|
Implement interrupts for prefixed instructions
This arranges to generate an illegal instruction type program
interrupt for illegal prefixed instructions, that is, those where the
suffix is not a legal value given the prefix, or the prefix has a
reserved value in the subtype field. This implementation doesn't
generate an interrupt for the invalid 8LS:D and MLS:D instruction
forms where R = 1 and RA != 0. (In those cases it uses (RA) as the
addend, i.e. it ignores the R bit.)
This detects the case where the address of an instruction prefix is
equal mod 64 to 60, and generates an alignment interrupt in that case.
This also arranges to set bit 34 of SRR1 when an interrupt occurs due
to a prefixed instruction, for those interrupts where that is required
(i.e. trace, alignment, floating-point unavailable, data storage, data
segment, and most cases of program interrupt).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2 years ago
|
|
|
illegal_suffix : std_ulogic;
|
|
|
|
misaligned_prefix : std_ulogic;
|
|
|
|
uses_tar : std_ulogic;
|
|
|
|
uses_dscr : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
|
|
|
|
(valid => '0', unit => ALU, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init,
|
|
|
|
write_reg_enable => '0',
|
|
|
|
lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0',
|
|
|
|
invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0',
|
|
|
|
output_cr => '0', output_xer => '0',
|
|
|
|
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
|
core: Implement quadword loads and stores
This implements the lq, stq, lqarx and stqcx. instructions.
These instructions all access two consecutive GPRs; for example the
"lq %r6,0(%r3)" instruction will load the doubleword at the address
in R3 into R7 and the doubleword at address R3 + 8 into R6. To cope
with having two GPR sources or destinations, the instruction gets
repeated at the decode2 stage, that is, for each lq/stq/lqarx/stqcx.
coming in from decode1, two instructions get sent out to execute1.
For these instructions, the RS or RT register gets modified on one
of the iterations by setting the LSB of the register number. In LE
mode, the first iteration uses RS|1 or RT|1 and the second iteration
uses RS or RT. In BE mode, this is done the other way around. In
order for decode2 to know what endianness is currently in use, we
pass the big_endian flag down from icache through decode1 to decode2.
This is always in sync with what execute1 is using because only rfid
or an interrupt can change MSR[LE], and those operations all cause
a flush and redirect.
There is now an extra column in the decode tables in decode1 to
indicate whether the instruction needs to be repeated. Decode1 also
enforces the rule that lq with RT = RT and lqarx with RA = RT or
RB = RT are illegal.
Decode2 now passes a 'repeat' flag and a 'second' flag to execute1,
and execute1 passes them on to loadstore1. The 'repeat' flag is set
for both iterations of a repeated instruction, and 'second' is set
on the second iteration. Execute1 does not take asynchronous or
trace interrupts on the second iteration of a repeated instruction.
Loadstore1 uses 'next_addr' for the second iteration of a repeated
load/store so that we access the second doubleword of the memory
operand. Thus loadstore1 accesses the doublewords in increasing
memory order. For 16-byte loads this means that the first iteration
writes GPR RT|1. It is possible that RA = RT|1 (this is a legal
but non-preferred form), meaning that if the memory operand was
misaligned, the first iteration would overwrite RA but then the
second iteration might take a page fault, leading to corrupted state.
To avoid that possibility, 16-byte loads in LE mode take an
alignment interrupt if the operand is not 16-byte aligned. (This
is the case anyway for lqarx, and we enforce it for lq as well.)
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'),
|
|
|
|
read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'),
|
|
|
|
reg_valid1 => '0', reg_valid2 => '0', reg_valid3 => '0',
|
core: Implement quadword loads and stores
This implements the lq, stq, lqarx and stqcx. instructions.
These instructions all access two consecutive GPRs; for example the
"lq %r6,0(%r3)" instruction will load the doubleword at the address
in R3 into R7 and the doubleword at address R3 + 8 into R6. To cope
with having two GPR sources or destinations, the instruction gets
repeated at the decode2 stage, that is, for each lq/stq/lqarx/stqcx.
coming in from decode1, two instructions get sent out to execute1.
For these instructions, the RS or RT register gets modified on one
of the iterations by setting the LSB of the register number. In LE
mode, the first iteration uses RS|1 or RT|1 and the second iteration
uses RS or RT. In BE mode, this is done the other way around. In
order for decode2 to know what endianness is currently in use, we
pass the big_endian flag down from icache through decode1 to decode2.
This is always in sync with what execute1 is using because only rfid
or an interrupt can change MSR[LE], and those operations all cause
a flush and redirect.
There is now an extra column in the decode tables in decode1 to
indicate whether the instruction needs to be repeated. Decode1 also
enforces the rule that lq with RT = RT and lqarx with RA = RT or
RB = RT are illegal.
Decode2 now passes a 'repeat' flag and a 'second' flag to execute1,
and execute1 passes them on to loadstore1. The 'repeat' flag is set
for both iterations of a repeated instruction, and 'second' is set
on the second iteration. Execute1 does not take asynchronous or
trace interrupts on the second iteration of a repeated instruction.
Loadstore1 uses 'next_addr' for the second iteration of a repeated
load/store so that we access the second doubleword of the memory
operand. Thus loadstore1 accesses the doublewords in increasing
memory order. For 16-byte loads this means that the first iteration
writes GPR RT|1. It is possible that RA = RT|1 (this is a legal
but non-preferred form), meaning that if the memory operand was
misaligned, the first iteration would overwrite RA but then the
second iteration might take a page fault, leading to corrupted state.
To avoid that possibility, 16-byte loads in LE mode take an
alignment interrupt if the operand is not 16-byte aligned. (This
is the case anyway for lqarx, and we enforce it for lq as well.)
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'),
|
|
|
|
result_sel => "000", sub_select => "000",
|
|
|
|
repeat => '0', second => '0', spr_select => spr_id_init,
|
|
|
|
spr_is_ram => '0',
|
|
|
|
ramspr_even_rdaddr => (others => '0'), ramspr_odd_rdaddr => (others => '0'), ramspr_rd_odd => '0',
|
|
|
|
ramspr_wraddr => (others => '0'), ramspr_write_even => '0', ramspr_write_odd => '0',
|
|
|
|
ramspr_32bit => '0',
|
|
|
|
dbg_spr_access => '0',
|
|
|
|
dec_ctr => '0',
|
|
|
|
prefixed => '0', prefix => (others => '0'), illegal_suffix => '0',
|
|
|
|
misaligned_prefix => '0', uses_tar => '0', uses_dscr => '0',
|
|
|
|
others => (others => '0'));
|
|
|
|
|
|
|
|
type MultiplyInputType is record
|
|
|
|
valid: std_ulogic;
|
|
|
|
data1: std_ulogic_vector(63 downto 0);
|
|
|
|
data2: std_ulogic_vector(63 downto 0);
|
|
|
|
addend: std_ulogic_vector(127 downto 0);
|
|
|
|
is_signed: std_ulogic;
|
|
|
|
subtract: std_ulogic; -- 0 => addend + data1 * data2, 1 => addend - data1 * data2
|
|
|
|
end record;
|
|
|
|
constant MultiplyInputInit : MultiplyInputType := (data1 => 64x"0", data2 => 64x"0",
|
|
|
|
addend => 128x"0", others => '0');
|
|
|
|
|
|
|
|
type MultiplyOutputType is record
|
|
|
|
valid: std_ulogic;
|
|
|
|
result: std_ulogic_vector(127 downto 0);
|
|
|
|
overflow : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant MultiplyOutputInit : MultiplyOutputType := (valid => '0', overflow => '0',
|
|
|
|
others => (others => '0'));
|
|
|
|
|
|
|
|
type Execute1ToDividerType is record
|
|
|
|
valid: std_ulogic;
|
Add a second execute stage to the pipeline
This adds a second execute stage to the pipeline, in order to match up
the length of the pipeline through loadstore and dcache with the
length through execute1. This will ultimately enable us to get rid of
the 1-cycle bubble that we currently have when issuing ALU
instructions after one or more LSU instructions.
Most ALU instructions execute in the first stage, except for
count-zeroes and popcount instructions (which take two cycles and do
some of their work in the second stage) and mfspr/mtspr to "slow" SPRs
(TB, DEC, PVR, LOGA/LOGD, CFAR). Multiply and divide/mod instructions
take several cycles but the instruction stays in the first stage (ex1)
and ex1.busy is asserted until the operation is complete.
There is currently a bypass from the first stage but not the second
stage. Performance is down somewhat because of that and because this
doesn't yet eliminate the bubble between LSU and ALU instructions.
The forwarding of XER common bits has been changed somewhat because
now there is another pipeline stage between ex1 and the committed
state in cr_file. The simplest thing for now is to record the last
value written and use that, unless there has been a flush, in which
case the committed state (obtained via e_in.xerc) is used.
Note that this fixes what was previously a benign bug in control.vhdl,
where it was possible for control to forget an instructions dependency
on a value from a previous instruction (a GPR or the CR) if this
instruction writes the value and the instruction gets to the point
where it could issue but is blocked by the busy signal from execute1.
In that situation, control may incorrectly not indicate that a bypass
should be used. That didn't matter previously because, for ALU and
FPU instructions, there was only one previous instruction in flight
and once the current instruction could issue, the previous instruction
was completing and the correct value would be obtained from
register_file or cr_file. For loadstore instructions there could be
two being executed, but because there are no bypass paths, failing to
indicate use of a bypass path is fine.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
3 years ago
|
|
|
flush: std_ulogic;
|
|
|
|
dividend: std_ulogic_vector(63 downto 0);
|
|
|
|
divisor: std_ulogic_vector(63 downto 0);
|
|
|
|
is_signed: std_ulogic;
|
|
|
|
is_32bit: std_ulogic;
|
|
|
|
is_extended: std_ulogic;
|
|
|
|
is_modulus: std_ulogic;
|
|
|
|
neg_result: std_ulogic;
|
|
|
|
end record;
|
Add a second execute stage to the pipeline
This adds a second execute stage to the pipeline, in order to match up
the length of the pipeline through loadstore and dcache with the
length through execute1. This will ultimately enable us to get rid of
the 1-cycle bubble that we currently have when issuing ALU
instructions after one or more LSU instructions.
Most ALU instructions execute in the first stage, except for
count-zeroes and popcount instructions (which take two cycles and do
some of their work in the second stage) and mfspr/mtspr to "slow" SPRs
(TB, DEC, PVR, LOGA/LOGD, CFAR). Multiply and divide/mod instructions
take several cycles but the instruction stays in the first stage (ex1)
and ex1.busy is asserted until the operation is complete.
There is currently a bypass from the first stage but not the second
stage. Performance is down somewhat because of that and because this
doesn't yet eliminate the bubble between LSU and ALU instructions.
The forwarding of XER common bits has been changed somewhat because
now there is another pipeline stage between ex1 and the committed
state in cr_file. The simplest thing for now is to record the last
value written and use that, unless there has been a flush, in which
case the committed state (obtained via e_in.xerc) is used.
Note that this fixes what was previously a benign bug in control.vhdl,
where it was possible for control to forget an instructions dependency
on a value from a previous instruction (a GPR or the CR) if this
instruction writes the value and the instruction gets to the point
where it could issue but is blocked by the busy signal from execute1.
In that situation, control may incorrectly not indicate that a bypass
should be used. That didn't matter previously because, for ALU and
FPU instructions, there was only one previous instruction in flight
and once the current instruction could issue, the previous instruction
was completing and the correct value would be obtained from
register_file or cr_file. For loadstore instructions there could be
two being executed, but because there are no bypass paths, failing to
indicate use of a bypass path is fine.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
3 years ago
|
|
|
constant Execute1ToDividerInit: Execute1ToDividerType := (
|
|
|
|
dividend => 64x"0", divisor => 64x"0", others => '0');
|
|
|
|
|
|
|
|
type PMUEventType is record
|
|
|
|
no_instr_avail : std_ulogic;
|
|
|
|
dispatch : std_ulogic;
|
|
|
|
ext_interrupt : std_ulogic;
|
|
|
|
instr_complete : std_ulogic;
|
|
|
|
fp_complete : std_ulogic;
|
|
|
|
ld_complete : std_ulogic;
|
|
|
|
st_complete : std_ulogic;
|
|
|
|
br_taken_complete : std_ulogic;
|
|
|
|
br_mispredict : std_ulogic;
|
|
|
|
ipref_discard : std_ulogic;
|
|
|
|
itlb_miss : std_ulogic;
|
|
|
|
itlb_miss_resolved : std_ulogic;
|
|
|
|
icache_miss : std_ulogic;
|
|
|
|
dc_miss_resolved : std_ulogic;
|
|
|
|
dc_load_miss : std_ulogic;
|
|
|
|
dc_ld_miss_resolved : std_ulogic;
|
|
|
|
dc_store_miss : std_ulogic;
|
|
|
|
dtlb_miss : std_ulogic;
|
|
|
|
dtlb_miss_resolved : std_ulogic;
|
|
|
|
ld_miss_nocache : std_ulogic;
|
|
|
|
ld_fill_nocache : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant PMUEventInit : PMUEventType := (others => '0');
|
|
|
|
|
|
|
|
type Execute1ToPMUType is record
|
|
|
|
mfspr : std_ulogic;
|
|
|
|
mtspr : std_ulogic;
|
|
|
|
spr_num : std_ulogic_vector(4 downto 0);
|
|
|
|
spr_val : std_ulogic_vector(63 downto 0);
|
|
|
|
tbbits : std_ulogic_vector(3 downto 0); -- event bits from timebase
|
|
|
|
pmm_msr : std_ulogic; -- PMM bit from MSR
|
|
|
|
pr_msr : std_ulogic; -- PR bit from MSR
|
|
|
|
run : std_ulogic;
|
|
|
|
nia : std_ulogic_vector(63 downto 0);
|
|
|
|
addr : std_ulogic_vector(63 downto 0);
|
|
|
|
addr_v : std_ulogic;
|
|
|
|
occur : PMUEventType;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type PMUToExecute1Type is record
|
|
|
|
spr_val : std_ulogic_vector(63 downto 0);
|
|
|
|
intr : std_ulogic;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type Decode2ToRegisterFileType is record
|
|
|
|
read1_enable : std_ulogic;
|
|
|
|
read2_enable : std_ulogic;
|
|
|
|
read3_enable : std_ulogic;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type RegisterFileToDecode2Type is record
|
|
|
|
read1_data : std_ulogic_vector(63 downto 0);
|
|
|
|
read2_data : std_ulogic_vector(63 downto 0);
|
|
|
|
read3_data : std_ulogic_vector(63 downto 0);
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type Decode2ToCrFileType is record
|
|
|
|
read : std_ulogic;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type CrFileToDecode2Type is record
|
|
|
|
read_cr_data : std_ulogic_vector(31 downto 0);
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
read_xerc_data : xer_common_t;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type Execute1ToLoadstore1Type is record
|
|
|
|
valid : std_ulogic;
|
|
|
|
op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do
|
|
|
|
insn : std_ulogic_vector(31 downto 0);
|
|
|
|
instr_tag : instr_tag_t;
|
|
|
|
addr1 : std_ulogic_vector(63 downto 0);
|
|
|
|
addr2 : std_ulogic_vector(63 downto 0);
|
|
|
|
data : std_ulogic_vector(63 downto 0); -- data to write, unused for read
|
|
|
|
write_reg : gspr_index_t;
|
|
|
|
length : std_ulogic_vector(3 downto 0);
|
|
|
|
ci : std_ulogic; -- cache-inhibited load/store
|
|
|
|
byte_reverse : std_ulogic;
|
|
|
|
sign_extend : std_ulogic; -- do we need to sign extend?
|
|
|
|
update : std_ulogic; -- is this an update instruction?
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
xerc : xer_common_t;
|
|
|
|
reserve : std_ulogic; -- set for larx/stcx.
|
|
|
|
rc : std_ulogic; -- set for stcx.
|
|
|
|
virt_mode : std_ulogic; -- do translation through TLB
|
|
|
|
priv_mode : std_ulogic; -- privileged mode (MSR[PR] = 0)
|
|
|
|
mode_32bit : std_ulogic; -- trim addresses to 32 bits
|
|
|
|
is_32bit : std_ulogic;
|
Implement interrupts for prefixed instructions
This arranges to generate an illegal instruction type program
interrupt for illegal prefixed instructions, that is, those where the
suffix is not a legal value given the prefix, or the prefix has a
reserved value in the subtype field. This implementation doesn't
generate an interrupt for the invalid 8LS:D and MLS:D instruction
forms where R = 1 and RA != 0. (In those cases it uses (RA) as the
addend, i.e. it ignores the R bit.)
This detects the case where the address of an instruction prefix is
equal mod 64 to 60, and generates an alignment interrupt in that case.
This also arranges to set bit 34 of SRR1 when an interrupt occurs due
to a prefixed instruction, for those interrupts where that is required
(i.e. trace, alignment, floating-point unavailable, data storage, data
segment, and most cases of program interrupt).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2 years ago
|
|
|
prefixed : std_ulogic;
|
core: Implement quadword loads and stores
This implements the lq, stq, lqarx and stqcx. instructions.
These instructions all access two consecutive GPRs; for example the
"lq %r6,0(%r3)" instruction will load the doubleword at the address
in R3 into R7 and the doubleword at address R3 + 8 into R6. To cope
with having two GPR sources or destinations, the instruction gets
repeated at the decode2 stage, that is, for each lq/stq/lqarx/stqcx.
coming in from decode1, two instructions get sent out to execute1.
For these instructions, the RS or RT register gets modified on one
of the iterations by setting the LSB of the register number. In LE
mode, the first iteration uses RS|1 or RT|1 and the second iteration
uses RS or RT. In BE mode, this is done the other way around. In
order for decode2 to know what endianness is currently in use, we
pass the big_endian flag down from icache through decode1 to decode2.
This is always in sync with what execute1 is using because only rfid
or an interrupt can change MSR[LE], and those operations all cause
a flush and redirect.
There is now an extra column in the decode tables in decode1 to
indicate whether the instruction needs to be repeated. Decode1 also
enforces the rule that lq with RT = RT and lqarx with RA = RT or
RB = RT are illegal.
Decode2 now passes a 'repeat' flag and a 'second' flag to execute1,
and execute1 passes them on to loadstore1. The 'repeat' flag is set
for both iterations of a repeated instruction, and 'second' is set
on the second iteration. Execute1 does not take asynchronous or
trace interrupts on the second iteration of a repeated instruction.
Loadstore1 uses 'next_addr' for the second iteration of a repeated
load/store so that we access the second doubleword of the memory
operand. Thus loadstore1 accesses the doublewords in increasing
memory order. For 16-byte loads this means that the first iteration
writes GPR RT|1. It is possible that RA = RT|1 (this is a legal
but non-preferred form), meaning that if the memory operand was
misaligned, the first iteration would overwrite RA but then the
second iteration might take a page fault, leading to corrupted state.
To avoid that possibility, 16-byte loads in LE mode take an
alignment interrupt if the operand is not 16-byte aligned. (This
is the case anyway for lqarx, and we enforce it for lq as well.)
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
4 years ago
|
|
|
repeat : std_ulogic;
|
|
|
|
second : std_ulogic;
|
|
|
|
e2stall : std_ulogic;
|
|
|
|
msr : std_ulogic_vector(63 downto 0);
|
|
|
|
end record;
|
|
|
|
constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type :=
|
|
|
|
(valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
|
|
|
|
sign_extend => '0', update => '0', xerc => xerc_init,
|
|
|
|
reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0',
|
|
|
|
insn => (others => '0'),
|
|
|
|
instr_tag => instr_tag_init,
|
|
|
|
addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
|
|
|
|
write_reg => (others => '0'),
|
|
|
|
length => (others => '0'),
|
Implement interrupts for prefixed instructions
This arranges to generate an illegal instruction type program
interrupt for illegal prefixed instructions, that is, those where the
suffix is not a legal value given the prefix, or the prefix has a
reserved value in the subtype field. This implementation doesn't
generate an interrupt for the invalid 8LS:D and MLS:D instruction
forms where R = 1 and RA != 0. (In those cases it uses (RA) as the
addend, i.e. it ignores the R bit.)
This detects the case where the address of an instruction prefix is
equal mod 64 to 60, and generates an alignment interrupt in that case.
This also arranges to set bit 34 of SRR1 when an interrupt occurs due
to a prefixed instruction, for those interrupts where that is required
(i.e. trace, alignment, floating-point unavailable, data storage, data
segment, and most cases of program interrupt).
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
2 years ago
|
|
|
mode_32bit => '0', is_32bit => '0', prefixed => '0',
|
|
|
|
repeat => '0', second => '0', e2stall => '0',
|
|
|
|
msr => (others => '0'));
|
|
|
|
|
|
|
|
type Loadstore1ToExecute1Type is record
|
|
|
|
busy : std_ulogic;
|
|
|
|
l2stall : std_ulogic;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type Loadstore1ToDcacheType is record
|
|
|
|
valid : std_ulogic;
|
|
|
|
hold : std_ulogic;
|
|
|
|
load : std_ulogic; -- is this a load
|
|
|
|
dcbz : std_ulogic;
|
|
|
|
nc : std_ulogic;
|
|
|
|
reserve : std_ulogic;
|
|
|
|
virt_mode : std_ulogic;
|
|
|
|
priv_mode : std_ulogic;
|
|
|
|
addr : std_ulogic_vector(63 downto 0);
|
|
|
|
data : std_ulogic_vector(63 downto 0); -- valid the cycle after .valid = 1
|
|
|
|
byte_sel : std_ulogic_vector(7 downto 0);
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type DcacheToLoadstore1Type is record
|
|
|
|
valid : std_ulogic;
|
|
|
|
data : std_ulogic_vector(63 downto 0);
|
|
|
|
store_done : std_ulogic;
|
|
|
|
error : std_ulogic;
|
|
|
|
cache_paradox : std_ulogic;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type DcacheEventType is record
|
|
|
|
load_miss : std_ulogic;
|
|
|
|
store_miss : std_ulogic;
|
|
|
|
dcache_refill : std_ulogic;
|
|
|
|
dtlb_miss : std_ulogic;
|
|
|
|
dtlb_miss_resolved : std_ulogic;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type Loadstore1ToMmuType is record
|
|
|
|
valid : std_ulogic;
|
|
|
|
tlbie : std_ulogic;
|
|
|
|
slbia : std_ulogic;
|
MMU: Implement radix page table machinery
This adds the necessary machinery to the MMU for it to do radix page
table walks. The core elements are a shifter that can shift the
address right by between 0 and 47 bits, a mask generator that can
generate a mask of between 5 and 16 bits, a final mask generator,
and new states in the state machine.
(The final mask generator is used for transferring bits of the
original address into the resulting TLB entry when the leaf PTE
corresponds to a page size larger than 4kB.)
The hardware does not implement a partition table or a process table.
Software is expected to load the appropriate process table entry
into a new SPR called PGTBL0, SPR 720. The contents should be
formatted as described in Book III section 5.7.6.2 of the Power ISA
v3.0B. PGTBL0 is set to 0 on hard reset. At present, the top two bits
of the address (the quadrant) are ignored.
There is currently no caching of any step in the translation process
or of the final result, other than the entry created in the dTLB.
That entry is a 4k page entry even if the leaf PTE found in the walk
corresponds to a larger page size.
This implementation can handle almost any page table layout and any
page size. The RTS field (in PGTBL0) can have any value between 0
and 31, corresponding to a total address space size between 2^31
and 2^62 bytes. The RPDS field of PGTBL0 can be any value between
5 and 16, except that a value of 0 is taken to disable radix page
table walking (for use when one is using software loading of TLB
entries). The NLS field of the page directory entries can have any
value between 5 and 16. The minimum page size is 4kB, meaning that
the sum of RPDS and the NLS values of the PDEs found on the path to
a leaf PTE must be less than or equal to RTS + 31 - 12.
The PGTBL0 SPR is in the mmu module; thus this adds a path for
loadstore1 to read and write SPRs in mmu. This adds code in dcache
to service doubleword read requests from the MMU, as well as requests
to write dTLB entries.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
mtspr : std_ulogic;
|
Add TLB to icache
This adds a direct-mapped TLB to the icache, with 64 entries by default.
Execute1 now sends a "virt_mode" signal from MSR[IR] to fetch1 along
with redirects to indicate whether instruction addresses should be
translated through the TLB, and fetch1 sends that on to icache.
Similarly a "priv_mode" signal is sent to indicate the privilege
mode for instruction fetches. This means that changes to MSR[IR]
or MSR[PR] don't take effect until the next redirect, meaning an
isync, rfid, branch, etc.
The icache uses a hash of the effective address (i.e. next instruction
address) to index the TLB. The hash is an XOR of three fields of the
address; with a 64-entry TLB, the fields are bits 12--17, 18--23 and
24--29 of the address. TLB invalidations simply invalidate the
indexed TLB entry without checking the contents.
If the icache detects a TLB miss with virt_mode=1, it will send a
fetch_failed indication through fetch2 to decode1, which will turn it
into a special OP_FETCH_FAILED opcode with unit=LDST. That will get
sent down to loadstore1 which will currently just raise a Instruction
Storage Interrupt (0x400) exception.
One bit in the PTE obtained from the TLB is used to check whether an
instruction access is allowed -- the privilege bit (bit 3). If bit 3
is 1 and priv_mode=0, then a fetch_failed indication is sent down to
fetch2 and to decode1, which generates an OP_FETCH_FAILED. Any PTEs
with PTE bit 0 (EAA[3]) clear or bit 8 (R) clear should not be put
into the iTLB since such PTEs would not allow execution by any
context.
Tlbie operations get sent from mmu to icache over a new connection.
Unfortunately the privileged instruction tests are broken for now.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
iside : std_ulogic;
|
|
|
|
load : std_ulogic;
|
|
|
|
priv : std_ulogic;
|
|
|
|
ric : std_ulogic_vector(1 downto 0);
|
|
|
|
sprnf : std_ulogic;
|
|
|
|
sprnt : std_ulogic;
|
|
|
|
addr : std_ulogic_vector(63 downto 0);
|
|
|
|
rs : std_ulogic_vector(63 downto 0);
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type MmuToLoadstore1Type is record
|
|
|
|
done : std_ulogic;
|
|
|
|
err : std_ulogic;
|
|
|
|
invalid : std_ulogic;
|
|
|
|
badtree : std_ulogic;
|
|
|
|
segerr : std_ulogic;
|
|
|
|
perm_error : std_ulogic;
|
|
|
|
rc_error : std_ulogic;
|
|
|
|
sprval : std_ulogic_vector(63 downto 0);
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type MmuToDcacheType is record
|
|
|
|
valid : std_ulogic;
|
|
|
|
tlbie : std_ulogic;
|
|
|
|
doall : std_ulogic;
|
MMU: Implement radix page table machinery
This adds the necessary machinery to the MMU for it to do radix page
table walks. The core elements are a shifter that can shift the
address right by between 0 and 47 bits, a mask generator that can
generate a mask of between 5 and 16 bits, a final mask generator,
and new states in the state machine.
(The final mask generator is used for transferring bits of the
original address into the resulting TLB entry when the leaf PTE
corresponds to a page size larger than 4kB.)
The hardware does not implement a partition table or a process table.
Software is expected to load the appropriate process table entry
into a new SPR called PGTBL0, SPR 720. The contents should be
formatted as described in Book III section 5.7.6.2 of the Power ISA
v3.0B. PGTBL0 is set to 0 on hard reset. At present, the top two bits
of the address (the quadrant) are ignored.
There is currently no caching of any step in the translation process
or of the final result, other than the entry created in the dTLB.
That entry is a 4k page entry even if the leaf PTE found in the walk
corresponds to a larger page size.
This implementation can handle almost any page table layout and any
page size. The RTS field (in PGTBL0) can have any value between 0
and 31, corresponding to a total address space size between 2^31
and 2^62 bytes. The RPDS field of PGTBL0 can be any value between
5 and 16, except that a value of 0 is taken to disable radix page
table walking (for use when one is using software loading of TLB
entries). The NLS field of the page directory entries can have any
value between 5 and 16. The minimum page size is 4kB, meaning that
the sum of RPDS and the NLS values of the PDEs found on the path to
a leaf PTE must be less than or equal to RTS + 31 - 12.
The PGTBL0 SPR is in the mmu module; thus this adds a path for
loadstore1 to read and write SPRs in mmu. This adds code in dcache
to service doubleword read requests from the MMU, as well as requests
to write dTLB entries.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
tlbld : std_ulogic;
|
|
|
|
addr : std_ulogic_vector(63 downto 0);
|
|
|
|
pte : std_ulogic_vector(63 downto 0);
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type DcacheToMmuType is record
|
|
|
|
stall : std_ulogic;
|
|
|
|
done : std_ulogic;
|
MMU: Implement radix page table machinery
This adds the necessary machinery to the MMU for it to do radix page
table walks. The core elements are a shifter that can shift the
address right by between 0 and 47 bits, a mask generator that can
generate a mask of between 5 and 16 bits, a final mask generator,
and new states in the state machine.
(The final mask generator is used for transferring bits of the
original address into the resulting TLB entry when the leaf PTE
corresponds to a page size larger than 4kB.)
The hardware does not implement a partition table or a process table.
Software is expected to load the appropriate process table entry
into a new SPR called PGTBL0, SPR 720. The contents should be
formatted as described in Book III section 5.7.6.2 of the Power ISA
v3.0B. PGTBL0 is set to 0 on hard reset. At present, the top two bits
of the address (the quadrant) are ignored.
There is currently no caching of any step in the translation process
or of the final result, other than the entry created in the dTLB.
That entry is a 4k page entry even if the leaf PTE found in the walk
corresponds to a larger page size.
This implementation can handle almost any page table layout and any
page size. The RTS field (in PGTBL0) can have any value between 0
and 31, corresponding to a total address space size between 2^31
and 2^62 bytes. The RPDS field of PGTBL0 can be any value between
5 and 16, except that a value of 0 is taken to disable radix page
table walking (for use when one is using software loading of TLB
entries). The NLS field of the page directory entries can have any
value between 5 and 16. The minimum page size is 4kB, meaning that
the sum of RPDS and the NLS values of the PDEs found on the path to
a leaf PTE must be less than or equal to RTS + 31 - 12.
The PGTBL0 SPR is in the mmu module; thus this adds a path for
loadstore1 to read and write SPRs in mmu. This adds code in dcache
to service doubleword read requests from the MMU, as well as requests
to write dTLB entries.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
err : std_ulogic;
|
|
|
|
data : std_ulogic_vector(63 downto 0);
|
|
|
|
end record;
|
|
|
|
|
Move iTLB from icache to fetch1
This moves the address translation step for instruction fetches one
cycle earlier, so that it now happens in the fetch1 stage. There is
now a 2-entry mini translation cache ("ERAT", or effective to real
address translation cache) which operates on the output of the
multiplexer that selects the instruction address for the next cycle.
The ERAT consists of two effective address registers and two
corresponding real address registers. They store the page number part
of the addresses for a 4kB page size, which is the smallest page size
supported by the architecture.
If the effective address doesn't match either of the EA registers, and
address translation is enabled, then i_out.req goes low for two cycles
while the iTLB is looked up. Experimentally, this delay results in a
0.1% drop in coremark performance; allowing two cycles for the lookup
results in better timing. The result from the iTLB is placed into the
least recently used ERAT entry and then used to translate the address
as normal. If address translation is not enabled then the EA is used
directly as the real address.
The iTLB structure is the same as it was before; direct mapped,
indexed using a hashed EA.
The "fetch failed" signal, which indicates a TLB miss or protection
violation, is now generated in fetch1 and passed through icache.
When it is asserted, fetch1 goes into a stalled state until a PTE
arrives from the MMU (which gets put into both the iTLB and the ERAT),
or an interrupt or redirect occurs.
Any TLB invalidations from the MMU invalidate the whole ERAT.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
1 year ago
|
|
|
type MmuToITLBType is record
|
Add TLB to icache
This adds a direct-mapped TLB to the icache, with 64 entries by default.
Execute1 now sends a "virt_mode" signal from MSR[IR] to fetch1 along
with redirects to indicate whether instruction addresses should be
translated through the TLB, and fetch1 sends that on to icache.
Similarly a "priv_mode" signal is sent to indicate the privilege
mode for instruction fetches. This means that changes to MSR[IR]
or MSR[PR] don't take effect until the next redirect, meaning an
isync, rfid, branch, etc.
The icache uses a hash of the effective address (i.e. next instruction
address) to index the TLB. The hash is an XOR of three fields of the
address; with a 64-entry TLB, the fields are bits 12--17, 18--23 and
24--29 of the address. TLB invalidations simply invalidate the
indexed TLB entry without checking the contents.
If the icache detects a TLB miss with virt_mode=1, it will send a
fetch_failed indication through fetch2 to decode1, which will turn it
into a special OP_FETCH_FAILED opcode with unit=LDST. That will get
sent down to loadstore1 which will currently just raise a Instruction
Storage Interrupt (0x400) exception.
One bit in the PTE obtained from the TLB is used to check whether an
instruction access is allowed -- the privilege bit (bit 3). If bit 3
is 1 and priv_mode=0, then a fetch_failed indication is sent down to
fetch2 and to decode1, which generates an OP_FETCH_FAILED. Any PTEs
with PTE bit 0 (EAA[3]) clear or bit 8 (R) clear should not be put
into the iTLB since such PTEs would not allow execution by any
context.
Tlbie operations get sent from mmu to icache over a new connection.
Unfortunately the privileged instruction tests are broken for now.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
tlbld : std_ulogic;
|
|
|
|
tlbie : std_ulogic;
|
|
|
|
doall : std_ulogic;
|
Add TLB to icache
This adds a direct-mapped TLB to the icache, with 64 entries by default.
Execute1 now sends a "virt_mode" signal from MSR[IR] to fetch1 along
with redirects to indicate whether instruction addresses should be
translated through the TLB, and fetch1 sends that on to icache.
Similarly a "priv_mode" signal is sent to indicate the privilege
mode for instruction fetches. This means that changes to MSR[IR]
or MSR[PR] don't take effect until the next redirect, meaning an
isync, rfid, branch, etc.
The icache uses a hash of the effective address (i.e. next instruction
address) to index the TLB. The hash is an XOR of three fields of the
address; with a 64-entry TLB, the fields are bits 12--17, 18--23 and
24--29 of the address. TLB invalidations simply invalidate the
indexed TLB entry without checking the contents.
If the icache detects a TLB miss with virt_mode=1, it will send a
fetch_failed indication through fetch2 to decode1, which will turn it
into a special OP_FETCH_FAILED opcode with unit=LDST. That will get
sent down to loadstore1 which will currently just raise a Instruction
Storage Interrupt (0x400) exception.
One bit in the PTE obtained from the TLB is used to check whether an
instruction access is allowed -- the privilege bit (bit 3). If bit 3
is 1 and priv_mode=0, then a fetch_failed indication is sent down to
fetch2 and to decode1, which generates an OP_FETCH_FAILED. Any PTEs
with PTE bit 0 (EAA[3]) clear or bit 8 (R) clear should not be put
into the iTLB since such PTEs would not allow execution by any
context.
Tlbie operations get sent from mmu to icache over a new connection.
Unfortunately the privileged instruction tests are broken for now.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
addr : std_ulogic_vector(63 downto 0);
|
|
|
|
pte : std_ulogic_vector(63 downto 0);
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type Loadstore1ToWritebackType is record
|
|
|
|
valid : std_ulogic;
|
|
|
|
instr_tag : instr_tag_t;
|
|
|
|
write_enable: std_ulogic;
|
|
|
|
write_reg : gspr_index_t;
|
|
|
|
write_data : std_ulogic_vector(63 downto 0);
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
xerc : xer_common_t;
|
|
|
|
rc : std_ulogic;
|
|
|
|
store_done : std_ulogic;
|
|
|
|
interrupt : std_ulogic;
|
|
|
|
intr_vec : intr_vector_t;
|
|
|
|
srr1: std_ulogic_vector(15 downto 0);
|
|
|
|
end record;
|
|
|
|
constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType :=
|
|
|
|
(valid => '0', instr_tag => instr_tag_init, write_enable => '0',
|
|
|
|
write_reg => (others => '0'), write_data => (others => '0'),
|
|
|
|
xerc => xerc_init, rc => '0', store_done => '0',
|
|
|
|
interrupt => '0', intr_vec => 0,
|
|
|
|
srr1 => (others => '0'));
|
|
|
|
|
|
|
|
type Loadstore1EventType is record
|
|
|
|
load_complete : std_ulogic;
|
|
|
|
store_complete : std_ulogic;
|
|
|
|
itlb_miss : std_ulogic;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type Execute1ToWritebackType is record
|
|
|
|
valid: std_ulogic;
|
|
|
|
instr_tag : instr_tag_t;
|
|
|
|
rc : std_ulogic;
|
|
|
|
mode_32bit : std_ulogic;
|
|
|
|
write_enable : std_ulogic;
|
|
|
|
write_reg: gspr_index_t;
|
|
|
|
write_data: std_ulogic_vector(63 downto 0);
|
|
|
|
write_cr_enable : std_ulogic;
|
|
|
|
write_cr_mask : std_ulogic_vector(7 downto 0);
|
|
|
|
write_cr_data : std_ulogic_vector(31 downto 0);
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
write_xerc_enable : std_ulogic;
|
|
|
|
xerc : xer_common_t;
|
|
|
|
interrupt : std_ulogic;
|
|
|
|
hv_intr : std_ulogic;
|
|
|
|
is_scv : std_ulogic;
|
|
|
|
intr_vec : intr_vector_t;
|
|
|
|
redirect: std_ulogic;
|
|
|
|
redir_mode: std_ulogic_vector(3 downto 0);
|
|
|
|
last_nia: std_ulogic_vector(63 downto 0);
|
|
|
|
br_last: std_ulogic;
|
|
|
|
br_taken: std_ulogic;
|
|
|
|
abs_br: std_ulogic;
|
|
|
|
srr1: std_ulogic_vector(15 downto 0);
|
|
|
|
msr: std_ulogic_vector(63 downto 0);
|
|
|
|
end record;
|
|
|
|
constant Execute1ToWritebackInit : Execute1ToWritebackType :=
|
|
|
|
(valid => '0', instr_tag => instr_tag_init, rc => '0', mode_32bit => '0',
|
|
|
|
write_enable => '0', write_cr_enable => '0',
|
|
|
|
write_xerc_enable => '0', xerc => xerc_init,
|
|
|
|
write_data => (others => '0'), write_cr_mask => (others => '0'),
|
|
|
|
write_cr_data => (others => '0'), write_reg => (others => '0'),
|
|
|
|
interrupt => '0', hv_intr => '0', is_scv => '0', intr_vec => 0,
|
|
|
|
redirect => '0', redir_mode => "0000",
|
|
|
|
last_nia => (others => '0'),
|
|
|
|
br_last => '0', br_taken => '0', abs_br => '0',
|
|
|
|
srr1 => (others => '0'), msr => (others => '0'));
|
|
|
|
|
|
|
|
type Execute1ToFPUType is record
|
|
|
|
valid : std_ulogic;
|
|
|
|
op : insn_type_t;
|
|
|
|
nia : std_ulogic_vector(63 downto 0);
|
|
|
|
itag : instr_tag_t;
|
|
|
|
insn : std_ulogic_vector(31 downto 0);
|
|
|
|
single : std_ulogic;
|
|
|
|
is_signed : std_ulogic;
|
|
|
|
fe_mode : std_ulogic_vector(1 downto 0);
|
|
|
|
fra : std_ulogic_vector(63 downto 0);
|
|
|
|
frb : std_ulogic_vector(63 downto 0);
|
|
|
|
frc : std_ulogic_vector(63 downto 0);
|
|
|
|
valid_a : std_ulogic;
|
|
|
|
valid_b : std_ulogic;
|
|
|
|
valid_c : std_ulogic;
|
|
|
|
frt : gspr_index_t;
|
|
|
|
rc : std_ulogic;
|
|
|
|
m32b : std_ulogic;
|
|
|
|
out_cr : std_ulogic;
|
|
|
|
oe : std_ulogic;
|
|
|
|
xerc : xer_common_t;
|
|
|
|
stall : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'),
|
|
|
|
itag => instr_tag_init,
|
|
|
|
insn => (others => '0'), fe_mode => "00", rc => '0',
|
|
|
|
fra => (others => '0'), frb => (others => '0'),
|
|
|
|
frc => (others => '0'), frt => (others => '0'),
|
|
|
|
valid_a => '0', valid_b => '0', valid_c => '0',
|
|
|
|
single => '0', is_signed => '0', out_cr => '0',
|
|
|
|
m32b => '0', oe => '0', xerc => xerc_init,
|
|
|
|
stall => '0');
|
|
|
|
|
|
|
|
type FPUToExecute1Type is record
|
|
|
|
busy : std_ulogic;
|
|
|
|
f2stall : std_ulogic;
|
|
|
|
exception : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant FPUToExecute1Init : FPUToExecute1Type := (others => '0');
|
|
|
|
|
|
|
|
type FPUToWritebackType is record
|
|
|
|
valid : std_ulogic;
|
|
|
|
interrupt : std_ulogic;
|
|
|
|
instr_tag : instr_tag_t;
|
|
|
|
write_enable : std_ulogic;
|
|
|
|
write_reg : gspr_index_t;
|
|
|
|
write_data : std_ulogic_vector(63 downto 0);
|
|
|
|
write_cr_enable : std_ulogic;
|
|
|
|
write_cr_mask : std_ulogic_vector(7 downto 0);
|
|
|
|
write_cr_data : std_ulogic_vector(31 downto 0);
|
|
|
|
write_xerc : std_ulogic;
|
|
|
|
xerc : xer_common_t;
|
|
|
|
intr_vec : intr_vector_t;
|
|
|
|
srr1 : std_ulogic_vector(15 downto 0);
|
|
|
|
end record;
|
|
|
|
constant FPUToWritebackInit : FPUToWritebackType :=
|
|
|
|
(valid => '0', interrupt => '0', instr_tag => instr_tag_init,
|
|
|
|
write_enable => '0', write_reg => (others => '0'),
|
|
|
|
write_cr_enable => '0', write_cr_mask => (others => '0'),
|
|
|
|
write_cr_data => (others => '0'),
|
|
|
|
write_xerc => '0', xerc => xerc_init,
|
|
|
|
intr_vec => 0, srr1 => (others => '0'),
|
|
|
|
others => (others => '0'));
|
|
|
|
|
|
|
|
type DividerToExecute1Type is record
|
|
|
|
valid: std_ulogic;
|
|
|
|
write_reg_data: std_ulogic_vector(63 downto 0);
|
|
|
|
overflow : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', overflow => '0',
|
|
|
|
others => (others => '0'));
|
|
|
|
|
|
|
|
type WritebackToFetch1Type is record
|
|
|
|
redirect: std_ulogic;
|
|
|
|
virt_mode: std_ulogic;
|
|
|
|
priv_mode: std_ulogic;
|
|
|
|
big_endian: std_ulogic;
|
|
|
|
mode_32bit: std_ulogic;
|
|
|
|
redirect_nia: std_ulogic_vector(63 downto 0);
|
|
|
|
br_nia : std_ulogic_vector(63 downto 0);
|
|
|
|
br_last : std_ulogic;
|
|
|
|
br_taken : std_ulogic;
|
|
|
|
interrupt : std_ulogic;
|
|
|
|
intr_vec : std_ulogic_vector(16 downto 0);
|
|
|
|
end record;
|
|
|
|
constant WritebackToFetch1Init : WritebackToFetch1Type :=
|
|
|
|
(redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0',
|
|
|
|
mode_32bit => '0', redirect_nia => (others => '0'),
|
|
|
|
br_last => '0', br_taken => '0', br_nia => (others => '0'),
|
|
|
|
interrupt => '0', intr_vec => 17x"0");
|
|
|
|
|
|
|
|
type WritebackToRegisterFileType is record
|
|
|
|
write_reg : gspr_index_t;
|
|
|
|
write_data : std_ulogic_vector(63 downto 0);
|
|
|
|
write_enable : std_ulogic;
|
|
|
|
end record;
|
|
|
|
constant WritebackToRegisterFileInit : WritebackToRegisterFileType :=
|
|
|
|
(write_enable => '0', write_data => (others => '0'), others => (others => '0'));
|
|
|
|
|
|
|
|
type WritebackToCrFileType is record
|
|
|
|
write_cr_enable : std_ulogic;
|
|
|
|
write_cr_mask : std_ulogic_vector(7 downto 0);
|
|
|
|
write_cr_data : std_ulogic_vector(31 downto 0);
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
write_xerc_enable : std_ulogic;
|
|
|
|
write_xerc_data : xer_common_t;
|
|
|
|
end record;
|
Add basic XER support
The carry is currently internal to execute1. We don't handle any of
the other XER fields.
This creates type called "xer_common_t" that contains the commonly
used XER bits (CA, CA32, SO, OV, OV32).
The value is stored in the CR file (though it could be a separate
module). The rest of the bits will be implemented as a separate
SPR and the two parts reconciled in mfspr/mtspr in latter commits.
We always read XER in decode2 (there is little point not to)
and send it down all pipeline branches as it will be needed in
writeback for all type of instructions when CR0:SO needs to be
updated (such forms exist for all pipeline branches even if we don't
yet implement them).
To avoid having to track XER hazards, we forward it back in EX1. This
assumes that other pipeline branches that can modify it (mult and div)
are running single issue for now.
One additional hazard to beware of is an XER:SO modifying instruction
in EX1 followed immediately by a store conditional. Due to our writeback
latency, the store will go down the LSU with the previous XER value,
thus the stcx. will set CR0:SO using an obsolete SO value.
I doubt there exist any code relying on this behaviour being correct
but we should account for it regardless, possibly by ensuring that
stcx. remain single issue initially, or later by adding some minimal
tracking or moving the LSU into the same pipeline as execute.
Missing some obscure XER affecting instructions like addex or mcrxrx.
[paulus@ozlabs.org - fix CA32 and OV32 for OP_ADD, fix order of
arguments to set_ov]
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
5 years ago
|
|
|
constant WritebackToCrFileInit : WritebackToCrFileType := (write_cr_enable => '0', write_xerc_enable => '0',
|
|
|
|
write_xerc_data => xerc_init,
|
|
|
|
write_cr_mask => (others => '0'),
|
|
|
|
write_cr_data => (others => '0'));
|
|
|
|
|
|
|
|
type WritebackToExecute1Type is record
|
|
|
|
intr : std_ulogic;
|
|
|
|
hv_intr : std_ulogic;
|
|
|
|
scv_int : std_ulogic;
|
|
|
|
srr1 : std_ulogic_vector(15 downto 0);
|
|
|
|
end record;
|
|
|
|
|
|
|
|
type WritebackEventType is record
|
|
|
|
instr_complete : std_ulogic;
|
|
|
|
fp_complete : std_ulogic;
|
|
|
|
end record;
|
|
|
|
|
|
|
|
end common;
|
|
|
|
|
|
|
|
package body common is
|
|
|
|
function decode_spr_num(insn: std_ulogic_vector(31 downto 0)) return spr_num_t is
|
|
|
|
begin
|
|
|
|
return to_integer(unsigned(insn(15 downto 11) & insn(20 downto 16)));
|
|
|
|
end;
|
|
|
|
|
|
|
|
function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is
|
|
|
|
begin
|
|
|
|
return i(4 downto 0);
|
|
|
|
end;
|
|
|
|
|
|
|
|
function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is
|
|
|
|
begin
|
|
|
|
return "0" & i;
|
|
|
|
end;
|
|
|
|
|
|
|
|
function fpr_to_gspr(f: fpr_index_t) return gspr_index_t is
|
|
|
|
begin
|
|
|
|
return "1" & f;
|
|
|
|
end;
|
|
|
|
|
|
|
|
function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean is
|
|
|
|
begin
|
|
|
|
return tag1.valid = '1' and tag2.valid = '1' and tag1.tag = tag2.tag;
|
|
|
|
end;
|
|
|
|
|
|
|
|
function addr_to_real(addr: std_ulogic_vector(63 downto 0)) return real_addr_t is
|
|
|
|
begin
|
|
|
|
return addr(real_addr_t'range);
|
|
|
|
end;
|
|
|
|
end common;
|