Merge pull request #382 from paulusmack/master

Decode in block RAM and other improvements
pull/397/head
Michael Neuling 10 months ago committed by GitHub
commit ff63ffdbfd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -56,13 +56,13 @@ all = core_tb icache_tb dcache_tb dmi_dtm_tb \
all: $(all)

core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl predecode.vhdl \
decode1.vhdl helpers.vhdl insn_helpers.vhdl \
control.vhdl decode2.vhdl register_file.vhdl \
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
core.vhdl fpu.vhdl pmu.vhdl
logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \
execute1.vhdl loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl \
core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl

soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \

@ -7,6 +7,7 @@ entity cache_ram is
generic(
ROW_BITS : integer := 16;
WIDTH : integer := 64;
BYTEWID : integer := 8;
TRACE : boolean := false;
ADD_BUF : boolean := false
);
@ -16,7 +17,7 @@ entity cache_ram is
rd_en : in std_logic;
rd_addr : in std_logic_vector(ROW_BITS - 1 downto 0);
rd_data : out std_logic_vector(WIDTH - 1 downto 0);
wr_sel : in std_logic_vector(WIDTH/8 - 1 downto 0);
wr_sel : in std_logic_vector(WIDTH/BYTEWID - 1 downto 0);
wr_addr : in std_logic_vector(ROW_BITS - 1 downto 0);
wr_data : in std_logic_vector(WIDTH - 1 downto 0)
);
@ -38,7 +39,7 @@ begin
variable lbit : integer range 0 to WIDTH - 1;
variable mbit : integer range 0 to WIDTH - 1;
variable widx : integer range 0 to SIZE - 1;
constant sel0 : std_logic_vector(WIDTH/8 - 1 downto 0)
constant sel0 : std_logic_vector(WIDTH/BYTEWID - 1 downto 0)
:= (others => '0');
begin
if rising_edge(clk) then
@ -49,9 +50,9 @@ begin
" dat:" & to_hstring(wr_data);
end if;
end if;
for i in 0 to WIDTH/8-1 loop
lbit := i * 8;
mbit := lbit + 7;
for i in 0 to WIDTH/BYTEWID-1 loop
lbit := i * BYTEWID;
mbit := lbit + BYTEWID - 1;
widx := to_integer(unsigned(wr_addr));
if wr_sel(i) = '1' then
ram(widx)(mbit downto lbit) <= wr_data(mbit downto lbit);

@ -246,10 +246,13 @@ package common is
fetch_failed: std_ulogic;
nia: std_ulogic_vector(63 downto 0);
insn: std_ulogic_vector(31 downto 0);
icode: insn_code;
big_endian: std_ulogic;
next_predicted: std_ulogic;
next_pred_ntaken: std_ulogic;
end record;
constant IcacheToDecode1Init : IcacheToDecode1Type :=
(nia => (others => '0'), insn => (others => '0'), icode => INSN_illegal, others => '0');

type IcacheEventType is record
icache_miss : std_ulogic;
@ -317,6 +320,9 @@ package common is
read_data1: std_ulogic_vector(63 downto 0);
read_data2: std_ulogic_vector(63 downto 0);
read_data3: std_ulogic_vector(63 downto 0);
reg_valid1: std_ulogic;
reg_valid2: std_ulogic;
reg_valid3: std_ulogic;
cr: std_ulogic_vector(31 downto 0);
xerc: xer_common_t;
lr: std_ulogic;
@ -363,6 +369,7 @@ package common is
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'),
read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'),
reg_valid1 => '0', reg_valid2 => '0', reg_valid3 => '0',
cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'),
result_sel => "000", sub_select => "000",
repeat => '0', second => '0', spr_select => spr_id_init,
@ -378,12 +385,11 @@ package common is
data1: std_ulogic_vector(63 downto 0);
data2: std_ulogic_vector(63 downto 0);
addend: std_ulogic_vector(127 downto 0);
is_32bit: std_ulogic;
not_result: std_ulogic;
is_signed: std_ulogic;
subtract: std_ulogic; -- 0 => addend + data1 * data2, 1 => addend - data1 * data2
end record;
constant MultiplyInputInit : MultiplyInputType := (valid => '0',
is_32bit => '0', not_result => '0',
others => (others => '0'));
constant MultiplyInputInit : MultiplyInputType := (data1 => 64x"0", data2 => 64x"0",
addend => 128x"0", others => '0');

type MultiplyOutputType is record
valid: std_ulogic;
@ -476,7 +482,6 @@ package common is
type Execute1ToLoadstore1Type is record
valid : std_ulogic;
op : insn_type_t; -- what ld/st or m[tf]spr or TLB op to do
nia : std_ulogic_vector(63 downto 0);
insn : std_ulogic_vector(31 downto 0);
instr_tag : instr_tag_t;
addr1 : std_ulogic_vector(63 downto 0);
@ -504,7 +509,7 @@ package common is
(valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
sign_extend => '0', update => '0', xerc => xerc_init,
reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0',
nia => (others => '0'), insn => (others => '0'),
insn => (others => '0'),
instr_tag => instr_tag_init,
addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
write_reg => (others => '0'),
@ -525,8 +530,6 @@ package common is
dcbz : std_ulogic;
nc : std_ulogic;
reserve : std_ulogic;
atomic : std_ulogic; -- part of a multi-transfer atomic op
atomic_last : std_ulogic;
virt_mode : std_ulogic;
priv_mode : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
@ -674,6 +677,9 @@ package common is
fra : std_ulogic_vector(63 downto 0);
frb : std_ulogic_vector(63 downto 0);
frc : std_ulogic_vector(63 downto 0);
valid_a : std_ulogic;
valid_b : std_ulogic;
valid_c : std_ulogic;
frt : gspr_index_t;
rc : std_ulogic;
m32b : std_ulogic;
@ -687,6 +693,7 @@ package common is
insn => (others => '0'), fe_mode => "00", rc => '0',
fra => (others => '0'), frb => (others => '0'),
frc => (others => '0'), frt => (others => '0'),
valid_a => '0', valid_b => '0', valid_c => '0',
single => '0', is_signed => '0', out_cr => '0',
m32b => '0', oe => '0', xerc => xerc_init,
stall => '0');

@ -13,7 +13,6 @@ entity core is
EX1_BYPASS : boolean := true;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
HAS_SHORT_MULT : boolean := false;
ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
LOG_LENGTH : natural := 512;
ICACHE_NUM_LINES : natural := 64;
@ -246,6 +245,7 @@ begin
icache_0: entity work.icache
generic map(
SIM => SIM,
HAS_FPU => HAS_FPU,
LINE_SIZE => 64,
NUM_LINES => ICACHE_NUM_LINES,
NUM_WAYS => ICACHE_NUM_WAYS,
@ -266,7 +266,7 @@ begin
wishbone_in => wishbone_insn_in,
wb_snoop_in => wb_snoop_in,
events => icache_events,
log_out => log_data(96 downto 43)
log_out => log_data(100 downto 43)
);

icache_stall_in <= decode1_busy;
@ -287,7 +287,7 @@ begin
d_out => decode1_to_decode2,
f_out => decode1_to_fetch1,
r_out => decode1_to_register_file,
log_out => log_data(109 downto 97)
log_out => log_data(113 downto 101)
);

decode1_stall_in <= decode2_stall_out;
@ -319,7 +319,7 @@ begin
writeback_bypass => writeback_bypass,
dbg_spr_req => dbg_spr_req,
dbg_spr_addr => dbg_spr_addr,
log_out => log_data(119 downto 110)
log_out => log_data(123 downto 114)
);
decode2_busy_in <= ex1_busy_out;

@ -365,7 +365,6 @@ begin
SIM => SIM,
EX1_BYPASS => EX1_BYPASS,
HAS_FPU => HAS_FPU,
HAS_SHORT_MULT => HAS_SHORT_MULT,
LOG_LENGTH => LOG_LENGTH
)
port map (
@ -398,7 +397,7 @@ begin
dbg_spr_data => dbg_spr_data,
sim_dump => sim_ex_dump,
sim_dump_done => sim_cr_dump,
log_out => log_data(134 downto 120),
log_out => log_data(135 downto 124),
log_rd_addr => log_rd_addr,
log_rd_data => log_rd_data,
log_wr_addr => log_wr_addr
@ -500,7 +499,7 @@ begin
);

log_data(150) <= '0';
log_data(139 downto 135) <= "00000";
log_data(139 downto 136) <= "0000";

debug_0: entity work.core_debug
generic map (

@ -175,7 +175,8 @@ begin
gspr_index <= (others => '0');
else
if do_log_trigger = '1' or log_trigger_delay /= 0 then
if log_trigger_delay = 255 then
if log_trigger_delay = 255 or
(LOG_LENGTH < 1024 and log_trigger_delay = LOG_LENGTH / 4) then
log_dmi_trigger(1) <= '1';
log_trigger_delay <= 0;
else

@ -1004,10 +1004,10 @@ begin
-- XXX or if r0.req.nc = '1'
if r0.req.load = '1' then
-- load with reservation
set_rsrv <= r0.req.atomic_last;
set_rsrv <= '1';
else
-- store conditional
clear_rsrv <= r0.req.atomic_last;
clear_rsrv <= '1';
if reservation.valid = '0' or
r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
cancel_store <= '1';

File diff suppressed because it is too large Load Diff

@ -58,10 +58,6 @@ architecture behaviour of decode2 is
busy : std_ulogic;
sgl_pipe : std_ulogic;
prev_sgl : std_ulogic;
reg_a_valid : std_ulogic;
reg_b_valid : std_ulogic;
reg_c_valid : std_ulogic;
reg_o_valid : std_ulogic;
input_ov : std_ulogic;
output_ov : std_ulogic;
read_rspr : std_ulogic;
@ -192,7 +188,7 @@ architecture behaviour of decode2 is
function decode_rc (t : rc_t; insn_in : std_ulogic_vector(31 downto 0)) return std_ulogic is
begin
case t is
when RC =>
when RC | RCOE =>
return insn_rc(insn_in);
when ONE =>
return '1';
@ -393,6 +389,7 @@ begin
variable v : reg_type;
variable length : std_ulogic_vector(3 downto 0);
variable op : insn_type_t;
variable unit : unit_t;
variable valid_in : std_ulogic;
variable decctr : std_ulogic;
variable sprs_busy : std_ulogic;
@ -405,6 +402,7 @@ begin
v.e := Decode2ToExecute1Init;

sprs_busy := '0';
unit := d_in.decode.unit;

if d_in.valid = '1' then
v.prev_sgl := dc2.sgl_pipe;
@ -429,32 +427,40 @@ begin
end if;
case d_in.decode.insn_type is
when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE =>
-- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only
if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then
if d_in.decode.rc = RCOE and insn_oe(d_in.insn) = '1' then
v.e.oe := '1';
v.e.output_xer := '1';
v.output_ov := '1';
v.input_ov := '1'; -- need SO state if setting OV to 0
end if;
when OP_MFSPR =>
if decode_spr_num(d_in.insn) = SPR_XER then
v.input_ov := '1';
end if;
case decode_spr_num(d_in.insn) is
when SPR_XER =>
v.input_ov := '1';
when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR =>
unit := LDST;
when others =>
end case;
when OP_MTSPR =>
if decode_spr_num(d_in.insn) = SPR_XER then
v.e.output_xer := '1';
v.output_ov := '1';
case decode_spr_num(d_in.insn) is
when SPR_XER =>
v.e.output_xer := '1';
v.output_ov := '1';
when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR =>
unit := LDST;
if d_in.valid = '1' then
v.sgl_pipe := '1';
end if;
when others =>
end case;
if d_in.spr_info.valid = '1' and d_in.valid = '1' then
v.sgl_pipe := '1';
end if;
when OP_CMP | OP_MCRXRX =>
v.input_ov := '1';
when others =>
end case;

v.reg_a_valid := decoded_reg_a.reg_valid;
v.reg_b_valid := decoded_reg_b.reg_valid;
v.reg_c_valid := decoded_reg_c.reg_valid;
v.reg_o_valid := decoded_reg_o.reg_valid;

if d_in.decode.lr = '1' then
v.e.lr := insn_lk(d_in.insn);
-- b and bc have even major opcodes; bcreg is considered absolute
@ -537,11 +543,14 @@ begin

-- execute unit
v.e.nia := d_in.nia;
v.e.unit := d_in.decode.unit;
v.e.unit := unit;
v.e.fac := d_in.decode.facility;
v.e.read_reg1 := d_in.reg_a;
v.e.read_reg2 := d_in.reg_b;
v.e.read_reg3 := d_in.reg_c;
v.e.reg_valid1 := decoded_reg_a.reg_valid;
v.e.reg_valid2 := decoded_reg_b.reg_valid;
v.e.reg_valid3 := decoded_reg_c.reg_valid;
v.e.write_reg := decoded_reg_o.reg;
v.e.write_reg_enable := decoded_reg_o.reg_valid;
v.e.invert_a := d_in.decode.invert_a;
@ -583,16 +592,16 @@ begin
control_valid_in <= valid_in;
control_serialize <= v.sgl_pipe or v.prev_sgl;

gpr_write_valid <= v.reg_o_valid;
gpr_write_valid <= v.e.write_reg_enable;
gpr_write <= v.e.write_reg;

gpr_a_read_valid <= v.reg_a_valid;
gpr_a_read_valid <= v.e.reg_valid1;
gpr_a_read <= v.e.read_reg1;

gpr_b_read_valid <= v.reg_b_valid;
gpr_b_read_valid <= v.e.reg_valid2;
gpr_b_read <= v.e.read_reg2;

gpr_c_read_valid <= v.reg_c_valid;
gpr_c_read_valid <= v.e.reg_valid3;
gpr_c_read <= v.e.read_reg3;

cr_write_valid <= v.e.output_cr or v.e.rc;

@ -4,14 +4,16 @@ use ieee.std_logic_1164.all;
package decode_types is
type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD,
OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG,
OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
OP_BCD, OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
OP_CNTZ, OP_CROP,
OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI,
OP_FPOP, OP_FPOP_I,
OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
OP_DCBZ, OP_ICBI, OP_ICBT,
OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC,
OP_DIV, OP_DIVE, OP_MOD,
OP_EXTS, OP_EXTSWSLI,
OP_ISEL, OP_ISYNC,
OP_LOAD, OP_STORE,
OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD,
OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR,
OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64,
OP_MUL_H64, OP_MUL_H32, OP_OR,
OP_POPCNT, OP_PRTY, OP_RFID,
@ -19,15 +21,349 @@ package decode_types is
OP_SHL, OP_SHR,
OP_SYNC, OP_TLBIE, OP_TRAP,
OP_XOR,
OP_BCD, OP_ADDG6S,
OP_ADDG6S,
OP_FETCH_FAILED
);

-- The following list is ordered in such a way that we can know some
-- things about which registers are accessed by an instruction by its place
-- in the list. In other words we can decide whether an instruction
-- accesses FPRs and whether it has an RB operand by doing simple
-- comparisons of the insn_code for the instruction with a few constants.
type insn_code is (
-- The following instructions don't have an RB operand or access FPRs
INSN_illegal, -- 0
INSN_fetch_fail,
INSN_addi,
INSN_addic,
INSN_addic_dot,
INSN_addis,
INSN_addme,
INSN_addpcis,
INSN_addze,
INSN_andi_dot,
INSN_andis_dot, -- 10
INSN_attn,
INSN_b,
INSN_bc,
INSN_bcctr,
INSN_bclr,
INSN_bctar,
INSN_cbcdtd,
INSN_cdtbcd,
INSN_cmpi,
INSN_cmpli, -- 20
INSN_cntlzw,
INSN_cntlzd,
INSN_cnttzw,
INSN_cnttzd,
INSN_crand,
INSN_crandc,
INSN_creqv,
INSN_crnand,
INSN_crnor,
INSN_cror, -- 30
INSN_crorc,
INSN_crxor,
INSN_darn,
INSN_eieio,
INSN_extsb,
INSN_extsh,
INSN_extsw,
INSN_extswsli,
INSN_isync,
INSN_lbz, -- 40
INSN_lbzu,
INSN_ld,
INSN_ldu,
INSN_lha,
INSN_lhau,
INSN_lhz,
INSN_lhzu,
INSN_lwa,
INSN_lwz,
INSN_lwzu, -- 50
INSN_mcrf,
INSN_mcrfs,
INSN_mcrxrx,
INSN_mfcr,
INSN_mfmsr,
INSN_mfspr,
INSN_mtcrf,
INSN_mtfsb,
INSN_mtfsfi,
INSN_mtmsr, -- 60
INSN_mtmsrd,
INSN_mtspr,
INSN_mulli,
INSN_neg,
INSN_nop,
INSN_ori,
INSN_oris,
INSN_popcntb,
INSN_popcntw,
INSN_popcntd, -- 70
INSN_prtyw,
INSN_prtyd,
INSN_rfid,
INSN_rldic,
INSN_rldicl,
INSN_rldicr,
INSN_rldimi,
INSN_rlwimi,
INSN_rlwinm,
INSN_sc, -- 80
INSN_setb,
INSN_slbia,
INSN_sradi,
INSN_srawi,
INSN_stb,
INSN_stbu,
INSN_std,
INSN_stdu,
INSN_sth,
INSN_sthu, -- 90
INSN_stw,
INSN_stwu,
INSN_subfic,
INSN_subfme,
INSN_subfze,
INSN_sync,
INSN_tdi,
INSN_tlbsync,
INSN_twi,
INSN_wait, -- 100
INSN_xori,
INSN_xoris,

-- pad to 112 to simplify comparison logic
INSN_103,
INSN_104, INSN_105, INSN_106, INSN_107,
INSN_108, INSN_109, INSN_110, INSN_111,

-- The following instructions have an RB operand but don't access FPRs
INSN_add,
INSN_addc,
INSN_adde,
INSN_addex,
INSN_addg6s,
INSN_and,
INSN_andc,
INSN_bperm,
INSN_cmp, -- 120
INSN_cmpb,
INSN_cmpeqb,
INSN_cmpl,
INSN_cmprb,
INSN_dcbf,
INSN_dcbst,
INSN_dcbt,
INSN_dcbtst,
INSN_dcbz,
INSN_divd, -- 130
INSN_divdu,
INSN_divde,
INSN_divdeu,
INSN_divw,
INSN_divwu,
INSN_divwe,
INSN_divweu,
INSN_eqv,
INSN_icbi,
INSN_icbt, -- 140
INSN_isel,
INSN_lbarx,
INSN_lbzcix,
INSN_lbzux,
INSN_lbzx,
INSN_ldarx,
INSN_ldbrx,
INSN_ldcix,
INSN_ldx,
INSN_ldux, -- 150
INSN_lharx,
INSN_lhax,
INSN_lhaux,
INSN_lhbrx,
INSN_lhzcix,
INSN_lhzx,
INSN_lhzux,
INSN_lwarx,
INSN_lwax,
INSN_lwaux, -- 160
INSN_lwbrx,
INSN_lwzcix,
INSN_lwzx,
INSN_lwzux,
INSN_modsd,
INSN_modsw,
INSN_moduw,
INSN_modud,
INSN_mulhw,
INSN_mulhwu, -- 170
INSN_mulhd,
INSN_mulhdu,
INSN_mullw,
INSN_mulld,
INSN_nand,
INSN_nor,
INSN_or,
INSN_orc,
INSN_rldcl,
INSN_rldcr, -- 180
INSN_rlwnm,
INSN_slw,
INSN_sld,
INSN_sraw,
INSN_srad,
INSN_srw,
INSN_srd,
INSN_stbcix,
INSN_stbcx,
INSN_stbx, -- 190
INSN_stbux,
INSN_stdbrx,
INSN_stdcix,
INSN_stdcx,
INSN_stdx,
INSN_stdux,
INSN_sthbrx,
INSN_sthcix,
INSN_sthcx,
INSN_sthx, -- 200
INSN_sthux,
INSN_stwbrx,
INSN_stwcix,
INSN_stwcx,
INSN_stwx,
INSN_stwux,
INSN_subf,
INSN_subfc,
INSN_subfe,
INSN_td, -- 210
INSN_tlbie,
INSN_tlbiel,
INSN_tw,
INSN_xor,

-- pad to 224 to simplify comparison logic
INSN_215,
INSN_216, INSN_217, INSN_218, INSN_219,
INSN_220, INSN_221, INSN_222, INSN_223,

-- The following instructions have a third input addressed by RC
INSN_maddld,
INSN_maddhd,
INSN_maddhdu,

-- pad to 256 to simplify comparison logic
INSN_227,
INSN_228, INSN_229, INSN_230, INSN_231,
INSN_232, INSN_233, INSN_234, INSN_235,
INSN_236, INSN_237, INSN_238, INSN_239,
INSN_240, INSN_241, INSN_242, INSN_243,
INSN_244, INSN_245, INSN_246, INSN_247,
INSN_248, INSN_249, INSN_250, INSN_251,
INSN_252, INSN_253, INSN_254, INSN_255,

-- The following instructions access floating-point registers
-- These ones have an FRS operand, but RA/RB are GPRs
INSN_stfd,
INSN_stfdu,
INSN_stfs,
INSN_stfsu,
INSN_stfdux, -- 260
INSN_stfdx,
INSN_stfiwx,
INSN_stfsux,
INSN_stfsx,
-- These ones don't actually have an FRS operand (rather an FRT destination)
-- but are here so that all FP instructions are >= INST_first_frs.
INSN_lfd,
INSN_lfdu,
INSN_lfs,
INSN_lfsu,
INSN_lfdx,
INSN_lfdux, -- 270
INSN_lfiwax,
INSN_lfiwzx,
INSN_lfsx,
INSN_lfsux,
INSN_275, -- padding

-- The following instructions access FRA and/or FRB operands
INSN_fabs,
INSN_fadd,
INSN_fadds,
INSN_fcfid,
INSN_fcfids, -- 280
INSN_fcfidu,
INSN_fcfidus,
INSN_fcmpo,
INSN_fcmpu,
INSN_fcpsgn,
INSN_fctid,
INSN_fctidz,
INSN_fctidu,
INSN_fctiduz,
INSN_fctiw, -- 290
INSN_fctiwz,
INSN_fctiwu,
INSN_fctiwuz,
INSN_fdiv,
INSN_fdivs,
INSN_fmr,
INSN_fmrgew,
INSN_fmrgow,
INSN_fnabs,
INSN_fneg, -- 300
INSN_fre,
INSN_fres,
INSN_frim,
INSN_frin,
INSN_frip,
INSN_friz,
INSN_frsp,
INSN_frsqrte,
INSN_frsqrtes,
INSN_fsqrt, -- 310
INSN_fsqrts,
INSN_fsub,
INSN_fsubs,
INSN_ftdiv,
INSN_ftsqrt,
INSN_mffs,
INSN_mtfsf,

-- pad to 320
INSN_318, INSN_319,

-- The following instructions access FRA, FRB (possibly) and FRC operands
INSN_fmul, -- 320
INSN_fmuls,
INSN_fmadd,
INSN_fmadds,
INSN_fmsub,
INSN_fmsubs,
INSN_fnmadd,
INSN_fnmadds,
INSN_fnmsub,
INSN_fnmsubs,
INSN_fsel -- 330
);

constant INSN_first_rb : insn_code := INSN_add;
constant INSN_first_rc : insn_code := INSN_maddld;
constant INSN_first_frs : insn_code := INSN_stfd;
constant INSN_first_frab : insn_code := INSN_fabs;
constant INSN_first_frabc : insn_code := INSN_fmul;

type input_reg_a_t is (NONE, RA, RA_OR_ZERO, CIA, FRA);
type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
CONST_DXHI4, CONST_DS, CONST_DQ, CONST_M1, CONST_SH, CONST_SH32, FRB);
type input_reg_c_t is (NONE, RS, RCR, FRC, FRS);
type output_reg_a_t is (NONE, RT, RA, FRT);
type rc_t is (NONE, ONE, RC);
type rc_t is (NONE, ONE, RC, RCOE);
type carry_in_t is (ZERO, CA, OV, ONE);

constant SH_OFFSET : integer := 0;

@ -15,7 +15,6 @@ entity execute1 is
SIM : boolean := false;
EX1_BYPASS : boolean := true;
HAS_FPU : boolean := true;
HAS_SHORT_MULT : boolean := false;
-- Non-zero to enable log data collection
LOG_LENGTH : natural := 0
);
@ -65,7 +64,7 @@ entity execute1 is
sim_dump : in std_ulogic;
sim_dump_done : out std_ulogic;

log_out : out std_ulogic_vector(14 downto 0);
log_out : out std_ulogic_vector(11 downto 0);
log_rd_addr : out std_ulogic_vector(31 downto 0);
log_rd_data : in std_ulogic_vector(63 downto 0);
log_wr_addr : in std_ulogic_vector(31 downto 0)
@ -85,6 +84,7 @@ architecture behaviour of execute1 is
write_pmuspr : std_ulogic;
ramspr_write_even : std_ulogic;
ramspr_write_odd : std_ulogic;
mult_32s : std_ulogic;
end record;
constant side_effect_init : side_effect_type := (others => '0');

@ -203,6 +203,8 @@ architecture behaviour of execute1 is
-- multiply signals
signal x_to_multiply: MultiplyInputType;
signal multiply_to_x: MultiplyOutputType;
signal x_to_mult_32s: MultiplyInputType;
signal mult_32s_to_x: MultiplyOutputType;

-- divider signals
signal x_to_divider: Execute1ToDividerType;
@ -411,6 +413,14 @@ begin
m_out => multiply_to_x
);

mult_32s_0: entity work.multiply_32s
port map (
clk => clk,
stall => stage2_stall,
m_in => x_to_mult_32s,
m_out => mult_32s_to_x
);

divider_0: if not HAS_FPU generate
div_0: entity work.divider
port map (
@ -437,17 +447,6 @@ begin
p_out => pmu_to_x
);

short_mult_0: if HAS_SHORT_MULT generate
begin
short_mult: entity work.short_multiply
port map (
clk => clk,
a_in => a_in(15 downto 0),
b_in => b_in(15 downto 0),
m_out => mshort_p
);
end generate;

dbg_ctrl_out <= ctrl;
log_rd_addr <= ex2.log_addr_spr;

@ -684,78 +683,82 @@ begin
overflow_32 <= calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31));
overflow_64 <= calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63));

-- signals to multiply and divide units
sign1 := '0';
sign2 := '0';
if e_in.is_signed = '1' then
if e_in.is_32bit = '1' then
sign1 := a_in(31);
sign2 := b_in(31);
else
sign1 := a_in(63);
sign2 := b_in(63);
end if;
end if;
-- take absolute values
if sign1 = '0' then
abs1 := signed(a_in);
else
abs1 := - signed(a_in);
end if;
if sign2 = '0' then
abs2 := signed(b_in);
else
abs2 := - signed(b_in);
end if;

-- Interface to multiply and divide units
x_to_divider.is_signed <= e_in.is_signed;
x_to_divider.is_32bit <= e_in.is_32bit;
x_to_divider.is_extended <= '0';
x_to_divider.is_modulus <= '0';
if e_in.insn_type = OP_MOD then
x_to_divider.is_modulus <= '1';
end if;
x_to_divider.flush <= flush_in;

-- signals to multiplier
addend := (others => '0');
if e_in.insn(26) = '0' then
if e_in.reg_valid3 = '1' then
-- integer multiply-add, major op 4 (if it is a multiply)
addend(63 downto 0) := c_in;
if e_in.is_signed = '1' then
addend(127 downto 64) := (others => c_in(63));
end if;
end if;
if (sign1 xor sign2) = '1' then
addend := not addend;
end if;

x_to_multiply.is_32bit <= e_in.is_32bit;
x_to_multiply.not_result <= sign1 xor sign2;
x_to_multiply.data1 <= std_ulogic_vector(a_in);
x_to_multiply.data2 <= std_ulogic_vector(b_in);
x_to_multiply.is_signed <= e_in.is_signed;
x_to_multiply.subtract <= '0';
x_to_multiply.addend <= addend;
x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
if e_in.is_32bit = '0' then
-- 64-bit forms
x_to_multiply.data1 <= std_ulogic_vector(abs1);
x_to_multiply.data2 <= std_ulogic_vector(abs2);
if e_in.insn_type = OP_DIVE then
x_to_divider.is_extended <= '1';

-- Interface to divide unit
if not HAS_FPU then
sign1 := '0';
sign2 := '0';
if e_in.is_signed = '1' then
if e_in.is_32bit = '1' then
sign1 := a_in(31);
sign2 := b_in(31);
else
sign1 := a_in(63);
sign2 := b_in(63);
end if;
end if;
x_to_divider.dividend <= std_ulogic_vector(abs1);
x_to_divider.divisor <= std_ulogic_vector(abs2);
else
-- 32-bit forms
x_to_multiply.data1 <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
-- take absolute values
if sign1 = '0' then
abs1 := signed(a_in);
else
abs1 := - signed(a_in);
end if;
if sign2 = '0' then
abs2 := signed(b_in);
else
abs2 := - signed(b_in);
end if;

x_to_divider.is_signed <= e_in.is_signed;
x_to_divider.is_32bit <= e_in.is_32bit;
x_to_divider.is_extended <= '0';
if e_in.insn_type = OP_DIVE then -- extended forms
x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
x_to_divider.is_modulus <= '0';
if e_in.insn_type = OP_MOD then
x_to_divider.is_modulus <= '1';
end if;
x_to_divider.flush <= flush_in;
x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
if e_in.is_32bit = '0' then
-- 64-bit forms
if e_in.insn_type = OP_DIVE then
x_to_divider.is_extended <= '1';
end if;
x_to_divider.dividend <= std_ulogic_vector(abs1);
x_to_divider.divisor <= std_ulogic_vector(abs2);
else
x_to_divider.dividend <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
-- 32-bit forms
x_to_divider.is_extended <= '0';
if e_in.insn_type = OP_DIVE then -- extended forms
x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
else
x_to_divider.dividend <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
end if;
x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
end if;
x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
end if;

-- signals to 32-bit multiplier
x_to_mult_32s.data1 <= 32x"0" & a_in(31 downto 0);
x_to_mult_32s.data2 <= 32x"0" & b_in(31 downto 0);
x_to_mult_32s.is_signed <= e_in.is_signed;
-- The following are unused, but set here to avoid X states
x_to_mult_32s.subtract <= '0';
x_to_mult_32s.addend <= (others => '0');

shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
case ex1.mul_select is
when "00" =>
@ -1271,13 +1274,10 @@ begin
v.se.icache_inval := '1';

when OP_MUL_L64 =>
if HAS_SHORT_MULT and e_in.insn(26) = '1' and
fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then
-- Operands fit into 16 bits, so use short multiplier
if e_in.oe = '1' then
-- Note 16x16 multiply can't overflow, even for mullwo
set_ov(v.e, '0', '0');
end if;
if e_in.is_32bit = '1' then
v.se.mult_32s := '1';
v.res2_sel := "00";
slow_op := '1';
else
-- Use standard multiplier
v.start_mul := '1';
@ -1285,11 +1285,16 @@ begin
owait := '1';
end if;

when OP_MUL_H64 | OP_MUL_H32 =>
when OP_MUL_H64 =>
v.start_mul := '1';
slow_op := '1';
owait := '1';

when OP_MUL_H32 =>
v.se.mult_32s := '1';
v.res2_sel := "01";
slow_op := '1';

when OP_DIV | OP_DIVE | OP_MOD =>
if not HAS_FPU then
v.start_div := '1';
@ -1370,6 +1375,7 @@ begin
fv := Execute1ToFPUInit;

x_to_multiply.valid <= '0';
x_to_mult_32s.valid <= '0';
x_to_divider.valid <= '0';
v.ext_interrupt := '0';
v.taken_branch_event := '0';
@ -1456,6 +1462,7 @@ begin
v.res2_sel := actions.res2_sel;
v.msr := actions.new_msr;
x_to_multiply.valid <= actions.start_mul;
x_to_mult_32s.valid <= actions.se.mult_32s;
v.mul_in_progress := actions.start_mul;
x_to_divider.valid <= actions.start_div;
v.div_in_progress := actions.start_div;
@ -1481,7 +1488,7 @@ begin
end if;
end if;

if ex1.div_in_progress = '1' then
if not HAS_FPU and ex1.div_in_progress = '1' then
v.div_in_progress := not divider_to_x.valid;
v.busy := not divider_to_x.valid;
if divider_to_x.valid = '1' and ex1.oe = '1' then
@ -1554,7 +1561,6 @@ begin

-- Outputs to loadstore1 (async)
lv.op := e_in.insn_type;
lv.nia := e_in.nia;
lv.instr_tag := e_in.instr_tag;
lv.addr1 := a_in;
lv.addr2 := b_in;
@ -1568,11 +1574,9 @@ begin
lv.reserve := e_in.reserve;
lv.rc := e_in.rc;
lv.insn := e_in.insn;
-- decode l*cix and st*cix instructions here
if e_in.insn(31 downto 26) = "011111" and e_in.insn(10 downto 9) = "11" and
e_in.insn(5 downto 1) = "10101" then
lv.ci := '1';
end if;
-- invert_a field is overloaded for load/store instructions
-- to mark l*cix and st*cix
lv.ci := e_in.invert_a;
lv.virt_mode := ex1.msr(MSR_DR);
lv.priv_mode := not ex1.msr(MSR_PR);
lv.mode_32bit := not ex1.msr(MSR_SF);
@ -1591,6 +1595,9 @@ begin
fv.fra := a_in;
fv.frb := b_in;
fv.frc := c_in;
fv.valid_a := e_in.reg_valid1;
fv.valid_b := e_in.reg_valid2;
fv.valid_c := e_in.reg_valid3;
fv.frt := e_in.write_reg;
fv.rc := e_in.rc;
fv.out_cr := e_in.output_cr;
@ -1624,11 +1631,6 @@ begin
-- Second execute stage control
execute2_1: process(all)
variable v : reg_stage2_type;
variable overflow : std_ulogic;
variable lv : Execute1ToLoadstore1Type;
variable fv : Execute1ToFPUType;
variable k : integer;
variable go : std_ulogic;
variable bypass_valid : std_ulogic;
variable rcresult : std_ulogic_vector(63 downto 0);
variable sprres : std_ulogic_vector(63 downto 0);
@ -1647,6 +1649,14 @@ begin
v.br_mispredict := ex1.br_mispredict;
end if;

if ex1.se.mult_32s = '1' and ex1.oe = '1' then
v.e.xerc.ov := mult_32s_to_x.overflow;
v.e.xerc.ov32 := mult_32s_to_x.overflow;
if mult_32s_to_x.overflow = '1' then
v.e.xerc.so := '1';
end if;
end if;

ctrl_tmp <= ctrl;
-- FIXME: run at 512MHz not core freq
ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
@ -1667,24 +1677,34 @@ begin
v.e.write_xerc_enable := '0';
v.e.redirect := '0';
v.e.br_last := '0';
v.se := side_effect_init;
v.taken_branch_event := '0';
v.br_mispredict := '0';
end if;
if flush_in = '1' then
v.e.valid := '0';
v.e.interrupt := '0';
v.se := side_effect_init;
v.ext_interrupt := '0';
end if;

-- This is split like this because mfspr doesn't have an Rc bit,
-- and we don't want the zero-detect logic to be after the
-- SPR mux for timing reasons.
if ex1.res2_sel(0) = '0' then
if ex1.se.mult_32s = '1' then
if ex1.res2_sel(0) = '0' then
rcresult := mult_32s_to_x.result(63 downto 0);
else
rcresult := mult_32s_to_x.result(63 downto 32) &
mult_32s_to_x.result(63 downto 32);
end if;
elsif ex1.res2_sel(0) = '0' then
rcresult := ex1.e.write_data;
sprres := spr_result;
else
rcresult := countbits_result;
end if;
if ex1.res2_sel(0) = '0' then
sprres := spr_result;
else
sprres := pmu_to_x.spr_val;
end if;
if ex1.res2_sel(1) = '0' then
@ -1708,7 +1728,7 @@ begin
cr_res(31) := sign;
cr_res(30) := not (sign or zero);
cr_res(29) := zero;
cr_res(28) := ex1.e.xerc.so;
cr_res(28) := v.e.xerc.so;
cr_mask(7) := '1';
end if;

@ -1802,7 +1822,7 @@ begin
end generate;

e1_log: if LOG_LENGTH > 0 generate
signal log_data : std_ulogic_vector(14 downto 0);
signal log_data : std_ulogic_vector(11 downto 0);
begin
ex1_log : process(clk)
begin
@ -1812,7 +1832,6 @@ begin
exception_log &
irq_valid_log &
interrupt_in.intr &
"000" &
ex2.e.write_enable &
ex2.e.valid &
(ex2.e.redirect or ex2.e.interrupt) &

@ -16,7 +16,6 @@ entity toplevel is
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
HAS_SHORT_MULT : boolean := false;
USE_LITEDRAM : boolean := false;
NO_BRAM : boolean := false;
DISABLE_FLATTEN_CORE : boolean := false;
@ -199,7 +198,6 @@ begin
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
HAS_SHORT_MULT => HAS_SHORT_MULT,
HAS_DRAM => USE_LITEDRAM,
DRAM_SIZE => 256 * 1024 * 1024,
DRAM_INIT_SIZE => PAYLOAD_SIZE,

@ -13,7 +13,6 @@ entity toplevel is
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := false;
HAS_SHORT_MULT: boolean := false;
ICACHE_NUM_LINES : natural := 64;
LOG_LENGTH : natural := 512;
DISABLE_FLATTEN_CORE : boolean := false;
@ -75,7 +74,6 @@ begin
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
HAS_SHORT_MULT => HAS_SHORT_MULT,
ICACHE_NUM_LINES => ICACHE_NUM_LINES,
LOG_LENGTH => LOG_LENGTH,
DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,

@ -16,7 +16,6 @@ entity toplevel is
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
HAS_SHORT_MULT: boolean := false;
USE_LITEDRAM : boolean := false;
NO_BRAM : boolean := false;
DISABLE_FLATTEN_CORE : boolean := false;
@ -175,7 +174,6 @@ begin
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
HAS_SHORT_MULT=> HAS_SHORT_MULT,
HAS_DRAM => USE_LITEDRAM,
DRAM_SIZE => 512 * 1024 * 1024,
DRAM_INIT_SIZE => PAYLOAD_SIZE,

@ -188,7 +188,6 @@ begin
HAS_UART1 => HAS_UART1,
HAS_SD_CARD => USE_LITESDCARD,
ICACHE_NUM_LINES => ICACHE_NUM_LINES,
HAS_SHORT_MULT => true,
NGPIO => NGPIO
)
port map (

@ -16,7 +16,6 @@ entity toplevel is
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := true;
HAS_SHORT_MULT : boolean := false;
USE_LITEDRAM : boolean := false;
NO_BRAM : boolean := false;
DISABLE_FLATTEN_CORE : boolean := false;
@ -175,7 +174,6 @@ begin
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
HAS_SHORT_MULT => HAS_SHORT_MULT,
HAS_DRAM => USE_LITEDRAM,
DRAM_SIZE => 256 * 1024 * 1024,
DRAM_INIT_SIZE => PAYLOAD_SIZE,

File diff suppressed because it is too large Load Diff

@ -23,6 +23,7 @@ use ieee.numeric_std.all;
library work;
use work.utils.all;
use work.common.all;
use work.decode_types.all;
use work.wishbone_types.all;

-- 64 bit direct mapped icache. All instructions are 4B aligned.
@ -30,6 +31,7 @@ use work.wishbone_types.all;
entity icache is
generic (
SIM : boolean := false;
HAS_FPU : boolean := true;
-- Line size in bytes
LINE_SIZE : positive := 64;
-- BRAM organisation: We never access more than wishbone_data_bits at
@ -69,7 +71,7 @@ entity icache is
wb_snoop_in : in wishbone_master_out := wishbone_master_out_init;

events : out IcacheEventType;
log_out : out std_ulogic_vector(53 downto 0)
log_out : out std_ulogic_vector(57 downto 0)
);
end entity icache;

@ -122,8 +124,20 @@ architecture rtl of icache is
subtype way_t is integer range 0 to NUM_WAYS-1;
subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);

-- We store a pre-decoded 10-bit insn_code along with the bottom 26 bits of
-- each instruction, giving a total of 36 bits per instruction, which
-- fits neatly into the block RAMs available on FPGAs.
-- For illegal instructions, the top 4 bits are ones and the bottom 6 bits
-- are the instruction's primary opcode, so we have the whole instruction
-- word available (e.g. to put in HEIR). For other instructions, the
-- primary opcode is not stored but could be determined from the insn_code.
constant PREDECODE_BITS : natural := 10;
constant INSN_IMAGE_BITS : natural := 26;
constant ICWORDLEN : natural := PREDECODE_BITS + INSN_IMAGE_BITS;
constant ROW_WIDTH : natural := INSN_PER_ROW * ICWORDLEN;

-- The cache data BRAM organized as described above for each way
subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
subtype cache_row_t is std_ulogic_vector(ROW_WIDTH-1 downto 0);

-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-- not handle a clean (commented) definition of the cache tags as a 3d
@ -184,6 +198,8 @@ architecture rtl of icache is
wb : wishbone_master_out;
store_way : way_t;
store_index : index_t;
recv_row : row_t;
recv_valid : std_ulogic;
store_row : row_t;
store_tag : cache_tag_t;
store_valid : std_ulogic;
@ -214,7 +230,9 @@ architecture rtl of icache is

-- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t;
signal cache_out : cache_ram_out_t;
signal cache_out : cache_ram_out_t;
signal cache_wr_data : std_ulogic_vector(ROW_WIDTH - 1 downto 0);
signal wb_rd_data : std_ulogic_vector(ROW_SIZE_BITS - 1 downto 0);

-- PLRU output interface
type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
@ -226,6 +244,8 @@ architecture rtl of icache is
signal snoop_index : index_t;
signal snoop_hits : cache_way_valids_t;

signal log_insn : std_ulogic_vector(35 downto 0);

-- Return the cache line index (tag index) for an address
function get_index(addr: std_ulogic_vector) return index_t is
begin
@ -293,7 +313,7 @@ architecture rtl of icache is
variable word: integer range 0 to INSN_PER_ROW-1;
begin
word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
return data(31+word*32 downto word*32);
return data(word * ICWORDLEN + ICWORDLEN - 1 downto word * ICWORDLEN);
end;

-- Get the tag value from the address
@ -327,6 +347,34 @@ architecture rtl of icache is

begin

-- byte-swap read data if big endian
process(all)
variable j: integer;
begin
if r.store_tag(TAG_BITS - 1) = '0' then
wb_rd_data <= wishbone_in.dat;
else
for ii in 0 to (wishbone_in.dat'length / 8) - 1 loop
j := ((ii / 4) * 4) + (3 - (ii mod 4));
wb_rd_data(ii * 8 + 7 downto ii * 8) <= wishbone_in.dat(j * 8 + 7 downto j * 8);
end loop;
end if;
end process;

predecoder_0: entity work.predecoder
generic map (
HAS_FPU => HAS_FPU,
WIDTH => INSN_PER_ROW,
ICODE_LEN => PREDECODE_BITS,
IMAGE_LEN => INSN_IMAGE_BITS
)
port map (
clk => clk,
valid_in => wishbone_in.ack,
insns_in => wb_rd_data,
icodes_out => cache_wr_data
);

assert LINE_SIZE mod ROW_SIZE = 0;
assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
@ -367,13 +415,13 @@ begin
signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
signal dout : cache_row_t;
signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
signal wr_dat : std_ulogic_vector(wishbone_in.dat'left downto 0);
signal wr_sel : std_ulogic_vector(0 downto 0);
begin
way: entity work.cache_ram
generic map (
ROW_BITS => ROW_BITS,
WIDTH => ROW_SIZE_BITS
WIDTH => ROW_WIDTH,
BYTEWID => ROW_WIDTH
)
port map (
clk => clk,
@ -382,31 +430,19 @@ begin
rd_data => dout,
wr_sel => wr_sel,
wr_addr => wr_addr,
wr_data => wr_dat
wr_data => cache_wr_data
);
process(all)
variable j: integer;
begin
-- byte-swap read data if big endian
if r.store_tag(TAG_BITS - 1) = '0' then
wr_dat <= wishbone_in.dat;
else
for ii in 0 to (wishbone_in.dat'length / 8) - 1 loop
j := ((ii / 4) * 4) + (3 - (ii mod 4));
wr_dat(ii * 8 + 7 downto ii * 8) <= wishbone_in.dat(j * 8 + 7 downto j * 8);
end loop;
end if;
do_read <= not stall_in;
do_write <= '0';
if wishbone_in.ack = '1' and replace_way = i then
if r.recv_valid = '1' and r.store_way = i then
do_write <= '1';
end if;
cache_out(i) <= dout;
rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
for ii in 0 to ROW_SIZE-1 loop
wr_sel(ii) <= do_write;
end loop;
wr_sel(0) <= do_write;
end process;
end generate;
@ -515,6 +551,8 @@ begin
icache_comb : process(all)
variable is_hit : std_ulogic;
variable hit_way : way_t;
variable insn : std_ulogic_vector(ICWORDLEN - 1 downto 0);
variable icode : insn_code;
begin
-- Extract line, row and tag from request
if not is_X(i_in.nia) then
@ -575,11 +613,19 @@ begin
-- I prefer not to do just yet as it would force fetch2 to know about
-- some of the cache geometry information.
--
insn := (others => '0');
icode := INSN_illegal;
if r.hit_valid = '1' then
i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
else
i_out.insn <= (others => '0');
insn := read_insn_word(r.hit_nia, cache_out(r.hit_way));
-- Currently we use only the top bit for indicating illegal
-- instructions because we know that insn_codes fit into 9 bits.
if insn(ICWORDLEN - 1) = '0' then
icode := insn_code'val(to_integer(unsigned(insn(ICWORDLEN-1 downto INSN_IMAGE_BITS))));
end if;
end if;
i_out.insn <= insn(31 downto 0);
i_out.icode <= icode;
log_insn <= cache_wr_data(35 downto 0);
i_out.valid <= r.hit_valid;
i_out.nia <= r.hit_nia;
i_out.stop_mark <= r.hit_smark;
@ -640,9 +686,11 @@ begin
variable snoop_addr : real_addr_t;
variable snoop_tag : cache_tag_t;
variable snoop_cache_tags : cache_tags_set_t;
variable replace_way : way_t;
begin
if rising_edge(clk) then
ev.icache_miss <= '0';
r.recv_valid <= '0';
-- On reset, clear all valid bits to force misses
if rst = '1' then
for i in index_t loop
@ -714,13 +762,13 @@ begin
" IR:" & std_ulogic'image(i_in.virt_mode) &
" SM:" & std_ulogic'image(i_in.stop_mark) &
" idx:" & integer'image(req_index) &
" way:" & integer'image(replace_way) &
" tag:" & to_hstring(req_tag) &
" RA:" & to_hstring(real_addr);
ev.icache_miss <= '1';

-- Keep track of our index and way for subsequent stores
r.store_index <= req_index;
r.recv_row <= get_row(req_raddr);
r.store_row <= get_row(req_raddr);
r.store_tag <= req_tag;
r.store_valid <= '1';
@ -740,6 +788,7 @@ begin
when CLR_TAG | WAIT_ACK =>
if r.state = CLR_TAG then
-- Get victim way from plru
replace_way := to_integer(unsigned(plru_victim(r.store_index)));
r.store_way <= replace_way;

-- Force misses on that way while reloading that line
@ -757,6 +806,19 @@ begin
r.state <= WAIT_ACK;
end if;

-- If we are writing in this cycle, mark row valid and see if we are done
if r.recv_valid = '1' then
r.rows_valid(r.store_row mod ROW_PER_LINE) <= not inval_in;
if is_last_row(r.store_row, r.end_row_ix) then
-- Cache line is now valid
cache_valids(r.store_index)(r.store_way) <= r.store_valid and not inval_in;
-- We are done
r.state <= IDLE;
end if;
-- Increment store row counter
r.store_row <= r.recv_row;
end if;

-- If we are still sending requests, was one accepted ?
if wishbone_in.stall = '0' and r.wb.stb = '1' then
-- That was the last word ? We are done sending. Clear stb.
@ -777,33 +839,27 @@ begin

-- Incoming acks processing
if wishbone_in.ack = '1' then
r.rows_valid(r.store_row mod ROW_PER_LINE) <= not inval_in;
-- Check for completion
if is_last_row(r.store_row, r.end_row_ix) then
if is_last_row(r.recv_row, r.end_row_ix) then
-- Complete wishbone cycle
r.wb.cyc <= '0';

-- Cache line is now valid
cache_valids(r.store_index)(replace_way) <= r.store_valid and not inval_in;

-- We are done
r.state <= IDLE;
end if;
r.recv_valid <= '1';

-- Increment store row counter
r.store_row <= next_row(r.store_row);
-- Increment receive row counter
r.recv_row <= next_row(r.recv_row);
end if;

when STOP_RELOAD =>
-- Wait for all outstanding requests to be satisfied, then
-- go to IDLE state.
if get_row_of_line(r.store_row) = get_row_of_line(get_row(wb_to_addr(r.wb.adr))) then
if get_row_of_line(r.recv_row) = get_row_of_line(get_row(wb_to_addr(r.wb.adr))) then
r.wb.cyc <= '0';
r.state <= IDLE;
end if;
if wishbone_in.ack = '1' then
-- Increment store row counter
r.store_row <= next_row(r.store_row);
r.recv_row <= next_row(r.recv_row);
end if;
end case;
end if;
@ -819,7 +875,7 @@ begin

icache_log: if LOG_LENGTH > 0 generate
-- Output data to logger
signal log_data : std_ulogic_vector(53 downto 0);
signal log_data : std_ulogic_vector(57 downto 0);
begin