|
|
|
library ieee;
|
|
|
|
use ieee.std_logic_1164.all;
|
|
|
|
use ieee.numeric_std.all;
|
|
|
|
|
|
|
|
library work;
|
|
|
|
use work.common.all;
|
|
|
|
use work.wishbone_types.all;
|
|
|
|
|
|
|
|
-- Top-level processor core: build-time configuration generics plus the
-- clock/reset, Wishbone, DMI debug, interrupt and status ports.
entity core is
    generic (
        SIM                 : boolean := false;
        -- When true, every sub-unit instance in the architecture is tagged
        -- keep_hierarchy = "yes" (see keep_h in the architecture).
        DISABLE_FLATTEN     : boolean := false;
        EX1_BYPASS          : boolean := true;
        HAS_FPU             : boolean := true;
        -- Optional branch target cache in fetch1.  Per the change that
        -- introduced it: 1024 entries, direct-mapped, indexed by bits 11..2
        -- of the NIA; it costs roughly 1420 LUTs, so builds for the
        -- Artix-7 35T part turn it off.
        HAS_BTC             : boolean := true;
        ALT_RESET_ADDRESS   : std_ulogic_vector(63 downto 0) := (others => '0');
        LOG_LENGTH          : natural := 512;
        ICACHE_NUM_LINES    : natural := 64;
        ICACHE_NUM_WAYS     : natural := 2;
        ICACHE_TLB_SIZE     : natural := 64;
        DCACHE_NUM_LINES    : natural := 64;
        DCACHE_NUM_WAYS     : natural := 2;
        DCACHE_TLB_SET_SIZE : natural := 64;
        DCACHE_TLB_NUM_WAYS : natural := 2
        );
    port (
        clk               : in  std_ulogic;
        rst               : in  std_ulogic;

        -- Alternate reset (0xffff0000), used by the DRAM init firmware
        alt_reset         : in  std_ulogic;

        -- Wishbone: instruction-side master
        wishbone_insn_in  : in  wishbone_slave_out;
        wishbone_insn_out : out wishbone_master_out;

        -- Wishbone: data-side master
        wishbone_data_in  : in  wishbone_slave_out;
        wishbone_data_out : out wishbone_master_out;

        -- Wishbone traffic observed by the caches and the debug module
        wb_snoop_in       : in  wishbone_master_out;

        -- DMI (debug) access port
        dmi_addr          : in  std_ulogic_vector(3 downto 0);
        dmi_din           : in  std_ulogic_vector(63 downto 0);
        dmi_dout          : out std_ulogic_vector(63 downto 0);
        dmi_req           : in  std_ulogic;
        dmi_wr            : in  std_ulogic;
        dmi_ack           : out std_ulogic;

        -- External interrupt request
        ext_irq           : in  std_ulogic;

        -- Status outputs
        run_out           : out std_ulogic;
        terminated_out    : out std_logic
        );
end entity core;
|
|
|
|
|
|
|
|
architecture behave of core is
    -- Fetch / icache path
    signal fetch1_to_icache           : Fetch1ToIcacheType;
    signal writeback_to_fetch1        : WritebackToFetch1Type;
    signal icache_to_decode1          : IcacheToDecode1Type;
    -- MMU -> instruction TLB.  The iTLB (plus a 2-entry ERAT in front of
    -- it) was moved from icache into fetch1, so this bus feeds fetch1.
    signal mmu_to_itlb                : MmuToITLBType;

    -- Decode stages
    signal decode1_to_decode2         : Decode1ToDecode2Type;
    signal decode1_to_fetch1          : Decode1ToFetch1Type;
    signal decode1_to_register_file   : Decode1ToRegisterFileType;
    signal decode2_to_execute1        : Decode2ToExecute1Type;

    -- Register file
    signal register_file_to_decode2   : RegisterFileToDecode2Type;
    signal decode2_to_register_file   : Decode2ToRegisterFileType;
    signal writeback_to_register_file : WritebackToRegisterFileType;

    -- Condition register file
    signal decode2_to_cr_file         : Decode2ToCrFileType;
    signal cr_file_to_decode2         : CrFileToDecode2Type;
    signal writeback_to_cr_file       : WritebackToCrFileType;

    -- Execute stage outputs and bypass buses
    signal execute1_to_writeback      : Execute1ToWritebackType;
    signal execute1_bypass            : bypass_data_t;
    signal execute1_cr_bypass         : cr_bypass_data_t;
    signal execute2_bypass            : bypass_data_t;
    signal execute2_cr_bypass         : cr_bypass_data_t;

    -- Load/store unit
    signal execute1_to_loadstore1     : Execute1ToLoadstore1Type;
    signal loadstore1_to_execute1     : Loadstore1ToExecute1Type;
    signal loadstore1_to_writeback    : Loadstore1ToWritebackType;
    signal loadstore1_to_mmu          : Loadstore1ToMmuType;
    signal mmu_to_loadstore1          : MmuToLoadstore1Type;

    -- Data cache
    signal loadstore1_to_dcache       : Loadstore1ToDcacheType;
    signal dcache_to_loadstore1       : DcacheToLoadstore1Type;
    signal mmu_to_dcache              : MmuToDcacheType;
    signal dcache_to_mmu              : DcacheToMmuType;

    -- Floating-point unit
    signal execute1_to_fpu            : Execute1ToFPUType;
    signal fpu_to_execute1            : FPUToExecute1Type;
    signal fpu_to_writeback           : FPUToWritebackType;

    -- Writeback
    signal writeback_bypass           : bypass_data_t;
    signal wb_interrupt               : WritebackToExecute1Type;

    -- Pipeline stall / busy handshakes
    signal fetch1_stall_in            : std_ulogic;
    signal icache_stall_out           : std_ulogic;
    signal icache_stall_in            : std_ulogic;
    signal decode1_stall_in           : std_ulogic;
    signal decode1_busy               : std_ulogic;
    signal decode2_busy_in            : std_ulogic;
    signal decode2_stall_out          : std_ulogic;
    signal ex1_icache_inval           : std_ulogic;
    signal ex1_busy_out               : std_ulogic;
    signal dcache_stall_out           : std_ulogic;

    -- Pipeline flushes
    signal flush                      : std_ulogic;
    signal decode1_flush              : std_ulogic;
    signal fetch1_flush               : std_ulogic;

    signal complete                   : instr_tag_t;
    signal terminate                  : std_ulogic;
    signal core_rst                   : std_ulogic;

    -- Registered per-unit copies of the reset, and of alt_reset.
    -- NOTE(review): rst_fetch2 is driven by the resets process but is not
    -- connected to any instance in this architecture.
    signal rst_fetch1                 : std_ulogic;
    signal rst_fetch2                 : std_ulogic;
    signal rst_icache                 : std_ulogic;
    signal rst_dcache                 : std_ulogic;
    signal rst_dec1                   : std_ulogic;
    signal rst_dec2                   : std_ulogic;
    signal rst_ex1                    : std_ulogic;
    signal rst_fpu                    : std_ulogic;
    signal rst_ls1                    : std_ulogic;
    signal rst_wback                  : std_ulogic;
    signal rst_dbg                    : std_ulogic;
    signal alt_reset_d                : std_ulogic;

    -- Simulation register-dump chaining
    signal sim_ex_dump                : std_ulogic;
    signal sim_cr_dump                : std_ulogic;

    -- Requests issued by the debug module
    signal dbg_core_stop              : std_ulogic;
    signal dbg_core_rst               : std_ulogic;
    signal dbg_icache_rst             : std_ulogic;

    -- Debug access to GPRs (register file)
    signal dbg_gpr_req                : std_ulogic;
    signal dbg_gpr_ack                : std_ulogic;
    signal dbg_gpr_addr               : gspr_index_t;
    signal dbg_gpr_data               : std_ulogic_vector(63 downto 0);

    -- Debug access to SPRs (decode2 / execute1)
    signal dbg_spr_req                : std_ulogic;
    signal dbg_spr_ack                : std_ulogic;
    signal dbg_spr_addr               : std_ulogic_vector(7 downto 0);
    signal dbg_spr_data               : std_ulogic_vector(63 downto 0);

    -- Debug access to SPRs held in loadstore1
    signal dbg_ls_spr_req             : std_ulogic;
    signal dbg_ls_spr_ack             : std_ulogic;
    signal dbg_ls_spr_addr            : std_ulogic_vector(1 downto 0);
    signal dbg_ls_spr_data            : std_ulogic_vector(63 downto 0);

    signal ctrl_debug                 : ctrl_t;

    -- PMU event buses
    signal icache_events              : IcacheEventType;
    signal loadstore_events           : Loadstore1EventType;
    signal dcache_events              : DcacheEventType;
    signal writeback_events           : WritebackEventType;

    -- Status reported to the debug module
    signal dbg_core_is_stopped        : std_ulogic;

    -- Logging: 256-bit trace word assembled from every unit's log_out
    signal log_data                   : std_ulogic_vector(255 downto 0);
    signal log_rd_addr                : std_ulogic_vector(31 downto 0);
    signal log_wr_addr                : std_ulogic_vector(31 downto 0);
    signal log_rd_data                : std_ulogic_vector(63 downto 0);
|
|
|
|
|
|
|
|
-- Map the DISABLE_FLATTEN-style boolean onto the string value expected by
-- the keep_hierarchy attribute: true -> "yes", false -> "no".
function keep_h(disable : boolean) return string is
begin
    if not disable then
        return "no";
    end if;
    return "yes";
end function keep_h;
|
|
|
|
-- Tag every sub-unit instance with keep_hierarchy ("yes" when
-- DISABLE_FLATTEN is true, "no" otherwise).
attribute keep_hierarchy : string;
attribute keep_hierarchy of fetch1_0        : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of icache_0        : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of decode1_0       : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of decode2_0       : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of cr_file_0       : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of execute1_0      : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of loadstore1_0    : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of mmu_0           : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of dcache_0        : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of writeback_0     : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of debug_0         : label is keep_h(DISABLE_FLATTEN);
|
|
|
|
begin
|
|
|
|
|
|
|
|
-- The core is reset by the external reset or by a debug-module request.
core_rst <= rst or dbg_core_rst;

-- Register one copy of the reset per unit (and of alt_reset) on each
-- rising clock edge.
resets: process(clk)
begin
    if rising_edge(clk) then
        -- Front end
        rst_fetch1  <= core_rst;
        rst_fetch2  <= core_rst;
        rst_icache  <= core_rst;
        rst_dec1    <= core_rst;
        rst_dec2    <= core_rst;
        -- Back end
        rst_ex1     <= core_rst;
        rst_fpu     <= core_rst;
        rst_ls1     <= core_rst;
        rst_dcache  <= core_rst;
        rst_wback   <= core_rst;
        -- The debug module follows only the external reset, not
        -- dbg_core_rst (which it generates itself).
        rst_dbg     <= rst;
        alt_reset_d <= alt_reset;
    end if;
end process resets;
|
|
|
|
|
|
|
|
-- Fetch stage 1.  Per the change history embedded in this file, fetch1
-- hosts the optional branch target cache (HAS_BTC) and the instruction
-- TLB with its 2-entry ERAT, which is why it receives the MMU bus (m_in)
-- and the TLB size generic.
fetch1_0: entity work.fetch1
    generic map (
        RESET_ADDRESS     => (others => '0'),
        ALT_RESET_ADDRESS => ALT_RESET_ADDRESS,
        TLB_SIZE          => ICACHE_TLB_SIZE,
        HAS_BTC           => HAS_BTC
        )
    port map (
        clk          => clk,
        rst          => rst_fetch1,
        alt_reset_in => alt_reset_d,
        stall_in     => fetch1_stall_in,
        flush_in     => fetch1_flush,
        -- Invalidate the BTC on an icache invalidate from execute1 or on
        -- a tlbie from the MMU.
        inval_btc    => ex1_icache_inval or mmu_to_itlb.tlbie,
        stop_in      => dbg_core_stop,
        m_in         => mmu_to_itlb,
        d_in         => decode1_to_fetch1,
        w_in         => writeback_to_fetch1,
        i_out        => fetch1_to_icache,
        log_out      => log_data(42 downto 0)
        );

-- fetch1 stalls when either the icache or decode1 cannot accept more.
fetch1_stall_in <= decode1_busy or icache_stall_out;
-- The front end is flushed by the global flush or by decode1.
fetch1_flush    <= decode1_flush or flush;
|
|
|
|
|
|
|
|
-- Instruction cache: sits between fetch1 and decode1 and masters the
-- instruction-side Wishbone bus.
icache_0: entity work.icache
    generic map (
        SIM        => SIM,
        HAS_FPU    => HAS_FPU,
        LINE_SIZE  => 64,
        NUM_LINES  => ICACHE_NUM_LINES,
        NUM_WAYS   => ICACHE_NUM_WAYS,
        LOG_LENGTH => LOG_LENGTH
        )
    port map (
        clk          => clk,
        rst          => rst_icache,
        i_in         => fetch1_to_icache,
        i_out        => icache_to_decode1,
        flush_in     => fetch1_flush,
        -- Invalidate on a debug-module request or a request from execute1.
        inval_in     => dbg_icache_rst or ex1_icache_inval,
        stall_in     => icache_stall_in,
        stall_out    => icache_stall_out,
        wishbone_out => wishbone_insn_out,
        wishbone_in  => wishbone_insn_in,
        wb_snoop_in  => wb_snoop_in,
        events       => icache_events,
        log_out      => log_data(100 downto 43)
        );

-- The icache is held off while decode1 is busy.
icache_stall_in <= decode1_busy;
|
|
|
|
|
|
|
|
-- Decode stage 1: consumes icache output; feeds decode2, the register
-- file, and a path back to fetch1.
decode1_0: entity work.decode1
    generic map (
        HAS_FPU    => HAS_FPU,
        LOG_LENGTH => LOG_LENGTH
        )
    port map (
        clk       => clk,
        rst       => rst_dec1,
        stall_in  => decode1_stall_in,
        flush_in  => flush,
        flush_out => decode1_flush,
        busy_out  => decode1_busy,
        f_in      => icache_to_decode1,
        d_out     => decode1_to_decode2,
        f_out     => decode1_to_fetch1,
        r_out     => decode1_to_register_file,
        log_out   => log_data(113 downto 101)
        );

-- decode1 stalls whenever decode2 asserts its stall output.
decode1_stall_in <= decode2_stall_out;
|
|
|
|
|
|
|
|
-- Decode stage 2: connects to the register file and CR file, receives
-- the bypass buses, and issues to execute1.
decode2_0: entity work.decode2
    generic map (
        EX1_BYPASS => EX1_BYPASS,
        HAS_FPU    => HAS_FPU,
        LOG_LENGTH => LOG_LENGTH
        )
    port map (
        clk                => clk,
        rst                => rst_dec2,
        busy_in            => decode2_busy_in,
        stall_out          => decode2_stall_out,
        flush_in           => flush,
        complete_in        => complete,
        stopped_out        => dbg_core_is_stopped,
        -- Pipeline data
        d_in               => decode1_to_decode2,
        e_out              => decode2_to_execute1,
        -- Register file and CR file
        r_in               => register_file_to_decode2,
        r_out              => decode2_to_register_file,
        c_in               => cr_file_to_decode2,
        c_out              => decode2_to_cr_file,
        -- Bypass buses from execute1 (both stages) and writeback
        execute_bypass     => execute1_bypass,
        execute_cr_bypass  => execute1_cr_bypass,
        execute2_bypass    => execute2_bypass,
        execute2_cr_bypass => execute2_cr_bypass,
        writeback_bypass   => writeback_bypass,
        -- Debug SPR access
        dbg_spr_req        => dbg_spr_req,
        dbg_spr_addr       => dbg_spr_addr,
        log_out            => log_data(123 downto 114)
        );

-- decode2 sees execute1's busy output directly.
decode2_busy_in <= ex1_busy_out;
|
|
|
|
|
|
|
|
-- General-purpose register file, with a debug access port.  Its
-- simulation dump is triggered by `terminate`, and its sim_dump_done
-- output (sim_ex_dump) triggers the execute1 dump.
register_file_0: entity work.register_file
    generic map (
        SIM        => SIM,
        HAS_FPU    => HAS_FPU,
        LOG_LENGTH => LOG_LENGTH
        )
    port map (
        clk           => clk,
        stall         => decode2_stall_out,
        d1_in         => decode1_to_register_file,
        d_in          => decode2_to_register_file,
        d_out         => register_file_to_decode2,
        w_in          => writeback_to_register_file,
        -- Debug GPR access
        dbg_gpr_req   => dbg_gpr_req,
        dbg_gpr_ack   => dbg_gpr_ack,
        dbg_gpr_addr  => dbg_gpr_addr,
        dbg_gpr_data  => dbg_gpr_data,
        -- Simulation dump chain
        sim_dump      => terminate,
        sim_dump_done => sim_ex_dump,
        log_out       => log_data(255 downto 184)
        );
|
|
|
|
|
|
|
|
-- Condition register file.  Its simulation dump is triggered by
-- sim_cr_dump, which is execute1's sim_dump_done output.
cr_file_0: entity work.cr_file
    generic map (
        SIM        => SIM,
        LOG_LENGTH => LOG_LENGTH
        )
    port map (
        clk      => clk,
        d_in     => decode2_to_cr_file,
        d_out    => cr_file_to_decode2,
        w_in     => writeback_to_cr_file,
        sim_dump => sim_cr_dump,
        ctrl     => ctrl_debug,
        log_out  => log_data(183 downto 171)
        );
|
|
|
|
|
|
|
|
-- Execute unit: receives decoded instructions from decode2 and connects
-- to loadstore1, the FPU, writeback, the bypass network, the PMU event
-- buses and the debug/logging ports.
execute1_0: entity work.execute1
    generic map (
        SIM        => SIM,
        EX1_BYPASS => EX1_BYPASS,
        HAS_FPU    => HAS_FPU,
        LOG_LENGTH => LOG_LENGTH
        )
    port map (
        clk             => clk,
        rst             => rst_ex1,
        flush_in        => flush,
        busy_out        => ex1_busy_out,
        -- Inputs from other units
        e_in            => decode2_to_execute1,
        l_in            => loadstore1_to_execute1,
        fp_in           => fpu_to_execute1,
        ext_irq_in      => ext_irq,
        interrupt_in    => wb_interrupt,
        -- Outputs to other units
        l_out           => execute1_to_loadstore1,
        fp_out          => execute1_to_fpu,
        e_out           => execute1_to_writeback,
        -- Bypass network
        bypass_data     => execute1_bypass,
        bypass_cr_data  => execute1_cr_bypass,
        bypass2_data    => execute2_bypass,
        bypass2_cr_data => execute2_cr_bypass,
        icache_inval    => ex1_icache_inval,
        dbg_ctrl_out    => ctrl_debug,
        -- PMU event buses
        wb_events       => writeback_events,
        ls_events       => loadstore_events,
        dc_events       => dcache_events,
        ic_events       => icache_events,
        -- Status
        run_out         => run_out,
        terminate_out   => terminate,
        -- Debug SPR access
        dbg_spr_req     => dbg_spr_req,
        dbg_spr_ack     => dbg_spr_ack,
        dbg_spr_addr    => dbg_spr_addr,
        dbg_spr_data    => dbg_spr_data,
        -- Simulation dump chain: runs after the register file's dump and
        -- then triggers the CR file's dump.
        sim_dump        => sim_ex_dump,
        sim_dump_done   => sim_cr_dump,
        -- Logging
        log_out         => log_data(135 downto 124),
        log_rd_addr     => log_rd_addr,
        log_rd_data     => log_rd_data,
        log_wr_addr     => log_wr_addr
        );
|
|
|
|
|
|
|
|
-- Instantiate the FPU only when HAS_FPU is set.
with_fpu: if HAS_FPU generate
begin
    fpu_0: entity work.fpu
        port map (
            clk      => clk,
            rst      => rst_fpu,
            flush_in => flush,
            e_in     => execute1_to_fpu,
            e_out    => fpu_to_execute1,
            w_out    => fpu_to_writeback
            );
end generate with_fpu;

-- Without an FPU, hold its two output buses at their init constants so
-- execute1 and writeback still have driven inputs.
no_fpu: if not HAS_FPU generate
begin
    fpu_to_execute1  <= FPUToExecute1Init;
    fpu_to_writeback <= FPUToWritebackInit;
end generate no_fpu;
|
|
|
|
|
|
|
|
-- Load/store unit: sits between execute1, the dcache, the MMU and
-- writeback, and has its own debug SPR access port.
loadstore1_0: entity work.loadstore1
    generic map (
        HAS_FPU    => HAS_FPU,
        LOG_LENGTH => LOG_LENGTH
        )
    port map (
        clk          => clk,
        rst          => rst_ls1,
        -- execute1 / writeback
        l_in         => execute1_to_loadstore1,
        e_out        => loadstore1_to_execute1,
        l_out        => loadstore1_to_writeback,
        -- dcache
        d_out        => loadstore1_to_dcache,
        d_in         => dcache_to_loadstore1,
        -- MMU
        m_out        => loadstore1_to_mmu,
        m_in         => mmu_to_loadstore1,
        dc_stall     => dcache_stall_out,
        events       => loadstore_events,
        -- Debug access to the SPRs held in this unit
        dbg_spr_req  => dbg_ls_spr_req,
        dbg_spr_ack  => dbg_ls_spr_ack,
        dbg_spr_addr => dbg_ls_spr_addr,
        dbg_spr_data => dbg_ls_spr_data,
        log_out      => log_data(149 downto 140)
        );
|
|
|
|
|
|
|
|
-- MMU: connects to loadstore1 and the dcache, and drives the instruction
-- TLB bus.  Per the change history embedded in this file, tlbie
-- operations reach the instruction side over i_out, and the iTLB was
-- later moved from icache to fetch1, so mmu_to_itlb feeds fetch1.
-- NOTE(review): unlike the other units, mmu takes core_rst directly
-- rather than a registered per-unit copy; presumably intentional, confirm.
mmu_0: entity work.mmu
    port map (
        clk   => clk,
        rst   => core_rst,
        l_in  => loadstore1_to_mmu,
        l_out => mmu_to_loadstore1,
        d_out => mmu_to_dcache,
        d_in  => dcache_to_mmu,
        i_out => mmu_to_itlb
        );
|
|
|
|
|
|
|
|
-- Data cache: serves loadstore1 and the MMU and masters the data-side
-- Wishbone bus.
dcache_0: entity work.dcache
    generic map (
        LINE_SIZE    => 64,
        NUM_LINES    => DCACHE_NUM_LINES,
        NUM_WAYS     => DCACHE_NUM_WAYS,
        TLB_SET_SIZE => DCACHE_TLB_SET_SIZE,
        TLB_NUM_WAYS => DCACHE_TLB_NUM_WAYS,
        LOG_LENGTH   => LOG_LENGTH
        )
    port map (
        clk          => clk,
        rst          => rst_dcache,
        -- loadstore1
        d_in         => loadstore1_to_dcache,
        d_out        => dcache_to_loadstore1,
        -- MMU
        m_in         => mmu_to_dcache,
        m_out        => dcache_to_mmu,
        stall_out    => dcache_stall_out,
        -- Wishbone
        wishbone_in  => wishbone_data_in,
        wishbone_out => wishbone_data_out,
        snoop_in     => wb_snoop_in,
        events       => dcache_events,
        log_out      => log_data(170 downto 151)
        );
|
|
|
|
|
|
|
|
-- Writeback: takes results from execute1, loadstore1 and the FPU; drives
-- the register-file and CR-file write ports, the path back to fetch1,
-- and the global flush / completion / interrupt signals.
writeback_0: entity work.writeback
    port map (
        clk           => clk,
        rst           => rst_wback,
        flush_out     => flush,
        -- Result sources
        e_in          => execute1_to_writeback,
        l_in          => loadstore1_to_writeback,
        fp_in         => fpu_to_writeback,
        -- Result sinks
        w_out         => writeback_to_register_file,
        c_out         => writeback_to_cr_file,
        f_out         => writeback_to_fetch1,
        wb_bypass     => writeback_bypass,
        events        => writeback_events,
        interrupt_out => wb_interrupt,
        complete_out  => complete
        );
|
|
|
|
|
|
|
|
-- Tie off the log_data bits that no unit's log_out drives, so the whole
-- 256-bit word is defined.
log_data(139 downto 136) <= "0000";
log_data(150)            <= '0';
|
|
|
|
|
|
|
|
-- Debug module: bridges the DMI port to core stop/reset/icache-reset
-- requests, to GPR/SPR access, and to the log buffer.
debug_0: entity work.core_debug
    generic map (
        LOG_LENGTH => LOG_LENGTH
        )
    port map (
        clk             => clk,
        rst             => rst_dbg,
        -- DMI port
        dmi_addr        => dmi_addr,
        dmi_din         => dmi_din,
        dmi_dout        => dmi_dout,
        dmi_req         => dmi_req,
        dmi_wr          => dmi_wr,
        dmi_ack         => dmi_ack,
        -- Requests to the core
        core_stop       => dbg_core_stop,
        core_rst        => dbg_core_rst,
        icache_rst      => dbg_icache_rst,
        -- Core status
        terminate       => terminate,
        core_stopped    => dbg_core_is_stopped,
        nia             => fetch1_to_icache.nia,
        msr             => ctrl_debug.msr,
        wb_snoop_in     => wb_snoop_in,
        -- GPR access (register file)
        dbg_gpr_req     => dbg_gpr_req,
        dbg_gpr_ack     => dbg_gpr_ack,
        dbg_gpr_addr    => dbg_gpr_addr,
        dbg_gpr_data    => dbg_gpr_data,
        -- SPR access (decode2 / execute1)
        dbg_spr_req     => dbg_spr_req,
        dbg_spr_ack     => dbg_spr_ack,
        dbg_spr_addr    => dbg_spr_addr,
        dbg_spr_data    => dbg_spr_data,
        -- SPR access (loadstore1)
        dbg_ls_spr_req  => dbg_ls_spr_req,
        dbg_ls_spr_ack  => dbg_ls_spr_ack,
        dbg_ls_spr_addr => dbg_ls_spr_addr,
        dbg_ls_spr_data => dbg_ls_spr_data,
        -- Log buffer
        log_data        => log_data,
        log_read_addr   => log_rd_addr,
        log_read_data   => log_rd_data,
        log_write_addr  => log_wr_addr,
        terminated_out  => terminated_out
        );
|
|
|
|
|
|
|
|
end behave;
|