Merge pull request #208 from paulusmack/faster

Make the core go faster

Several major improvements in here:
- Simple branch predictor
- Reduced latency for mispredicted branches and interrupts by removing fetch2 stage
- Cache improvements
  o Request critical dword first on refill
  o Handle hits while refilling, including on line being refilled
  o Sizes doubled for both D and I
- Loadstore improvements: can now do one load or store every two cycles in most cases
- Optimized 2-cycle multiplier for Xilinx 7-series parts using DSP slices
- Timing improvements, including:
  o Stash buffer in decode1
  o Reduced width of execute1 result mux
  o Improved SPR decode in decode1
  o Some non-critical operations take a cycle longer so we can break some long combinatorial chains
- Core logging: logs 256 bits of info every cycle into a ring buffer, to help with debugging and performance analysis

This increases the LUT usage for the "synth" + A35 target from 9182 to 10297, i.e. by about 12%.
pull/211/head
Michael Neuling 4 years ago committed by GitHub
commit b90a0a2139
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -42,7 +42,7 @@ all = core_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \
all: $(all)

core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
fetch2.vhdl utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \
cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \

@ -93,10 +93,11 @@ package common is
virt_mode : std_ulogic;
priv_mode : std_ulogic;
stop_mark: std_ulogic;
sequential: std_ulogic;
nia: std_ulogic_vector(63 downto 0);
end record;

type IcacheToFetch2Type is record
type IcacheToDecode1Type is record
valid: std_ulogic;
stop_mark: std_ulogic;
fetch_failed: std_ulogic;
@ -104,16 +105,6 @@ package common is
insn: std_ulogic_vector(31 downto 0);
end record;

type Fetch2ToDecode1Type is record
valid: std_ulogic;
stop_mark : std_ulogic;
fetch_failed: std_ulogic;
nia: std_ulogic_vector(63 downto 0);
insn: std_ulogic_vector(31 downto 0);
end record;
constant Fetch2ToDecode1Init : Fetch2ToDecode1Type := (valid => '0', stop_mark => '0', fetch_failed => '0',
nia => (others => '0'), insn => (others => '0'));

type Decode1ToDecode2Type is record
valid: std_ulogic;
stop_mark : std_ulogic;
@ -122,8 +113,16 @@ package common is
ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr
ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR)
decode: decode_rom_t;
br_pred: std_ulogic; -- Branch was predicted to be taken
end record;
constant Decode1ToDecode2Init : Decode1ToDecode2Type :=
(valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'),
ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init, br_pred => '0');

-- Redirect request from decode1 back to fetch1: decode1 drives it on its
-- f_out port and core.vhdl wires it to fetch1's d_in port.
type Decode1ToFetch1Type is record
-- Set by decode1's branch predictor when a valid incoming instruction is
-- predicted taken and decode1 is neither being flushed nor holding a
-- stashed instruction.
redirect : std_ulogic;
-- Predicted branch target; decode1 builds it as a 62-bit word address
-- with "00" appended, so it is always 4-byte aligned.
redirect_nia : std_ulogic_vector(63 downto 0);
end record;
constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init);

type Decode2ToExecute1Type is record
valid: std_ulogic;
@ -158,23 +157,24 @@ package common is
sign_extend : std_ulogic; -- do we need to sign extend?
update : std_ulogic; -- is this an update instruction?
reserve : std_ulogic; -- set for larx/stcx
br_pred : std_ulogic;
end record;
constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
(valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
lr => '0', rc => '0', oe => '0', invert_a => '0',
invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0',
is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0'));

type Execute1ToMultiplyType is record
valid: std_ulogic;
insn_type: insn_type_t;
data1: std_ulogic_vector(64 downto 0);
data2: std_ulogic_vector(64 downto 0);
data1: std_ulogic_vector(63 downto 0);
data2: std_ulogic_vector(63 downto 0);
is_32bit: std_ulogic;
neg_result: std_ulogic;
end record;
constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL,
is_32bit => '0',
constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0',
is_32bit => '0', neg_result => '0',
others => (others => '0'));

type Execute1ToDividerType is record
@ -253,6 +253,7 @@ package common is
others => (others => '0'));

type Loadstore1ToExecute1Type is record
busy : std_ulogic;
exception : std_ulogic;
invalid : std_ulogic;
perm_error : std_ulogic;
@ -366,7 +367,7 @@ package common is

type MultiplyToExecute1Type is record
valid: std_ulogic;
write_reg_data: std_ulogic_vector(63 downto 0);
result: std_ulogic_vector(127 downto 0);
overflow : std_ulogic;
end record;
constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0',

@ -15,7 +15,8 @@ entity control is
complete_in : in std_ulogic;
valid_in : in std_ulogic;
flush_in : in std_ulogic;
stall_in : in std_ulogic;
busy_in : in std_ulogic;
deferred : in std_ulogic;
sgl_pipe_in : in std_ulogic;
stop_mark_in : in std_ulogic;

@ -23,6 +24,9 @@ entity control is
gpr_write_in : in gspr_index_t;
gpr_bypassable : in std_ulogic;

update_gpr_write_valid : in std_ulogic;
update_gpr_write_reg : in gspr_index_t;

gpr_a_read_valid_in : in std_ulogic;
gpr_a_read_in : in gspr_index_t;

@ -72,7 +76,11 @@ begin
)
port map (
clk => clk,
stall_in => stall_in,
busy_in => busy_in,
deferred => deferred,
complete_in => complete_in,
flush_in => flush_in,
issuing => valid_out,

gpr_write_valid_in => gpr_write_valid,
gpr_write_in => gpr_write_in,
@ -80,6 +88,9 @@ begin
gpr_read_valid_in => gpr_a_read_valid_in,
gpr_read_in => gpr_a_read_in,

ugpr_write_valid => update_gpr_write_valid,
ugpr_write_reg => update_gpr_write_reg,

stall_out => stall_a_out,
use_bypass => gpr_bypass_a
);
@ -90,7 +101,11 @@ begin
)
port map (
clk => clk,
stall_in => stall_in,
busy_in => busy_in,
deferred => deferred,
complete_in => complete_in,
flush_in => flush_in,
issuing => valid_out,

gpr_write_valid_in => gpr_write_valid,
gpr_write_in => gpr_write_in,
@ -98,6 +113,9 @@ begin
gpr_read_valid_in => gpr_b_read_valid_in,
gpr_read_in => gpr_b_read_in,

ugpr_write_valid => update_gpr_write_valid,
ugpr_write_reg => update_gpr_write_reg,

stall_out => stall_b_out,
use_bypass => gpr_bypass_b
);
@ -110,7 +128,11 @@ begin
)
port map (
clk => clk,
stall_in => stall_in,
busy_in => busy_in,
deferred => deferred,
complete_in => complete_in,
flush_in => flush_in,
issuing => valid_out,

gpr_write_valid_in => gpr_write_valid,
gpr_write_in => gpr_write_in,
@ -118,6 +140,9 @@ begin
gpr_read_valid_in => gpr_c_read_valid_in,
gpr_read_in => gpr_c_read_in_fmt,

ugpr_write_valid => update_gpr_write_valid,
ugpr_write_reg => update_gpr_write_reg,

stall_out => stall_c_out,
use_bypass => gpr_bypass_c
);
@ -128,7 +153,11 @@ begin
)
port map (
clk => clk,
stall_in => stall_in,
busy_in => busy_in,
deferred => deferred,
complete_in => complete_in,
flush_in => flush_in,
issuing => valid_out,

cr_read_in => cr_read_in,
cr_write_in => cr_write_valid,
@ -139,7 +168,8 @@ begin
control0: process(clk)
begin
if rising_edge(clk) then
assert r_int.outstanding >= 0 and r_int.outstanding <= (PIPELINE_DEPTH+1) report "Outstanding bad " & integer'image(r_int.outstanding) severity failure;
assert rin_int.outstanding >= 0 and rin_int.outstanding <= (PIPELINE_DEPTH+1)
report "Outstanding bad " & integer'image(rin_int.outstanding) severity failure;
r_int <= rin_int;
end if;
end process;
@ -152,17 +182,18 @@ begin
v_int := r_int;

-- asynchronous
valid_tmp := valid_in and not flush_in and not stall_in;
stall_tmp := stall_in;
valid_tmp := valid_in and not flush_in;
stall_tmp := '0';

if complete_in = '1' then
if flush_in = '1' then
-- expect to see complete_in next cycle
v_int.outstanding := 1;
elsif complete_in = '1' then
v_int.outstanding := r_int.outstanding - 1;
end if;

if rst = '1' then
v_int.state := IDLE;
v_int.outstanding := 0;
stall_tmp := '0';
v_int := reg_internal_init;
valid_tmp := '0';
end if;

@ -227,7 +258,9 @@ begin
end if;

if valid_tmp = '1' then
if deferred = '0' then
v_int.outstanding := v_int.outstanding + 1;
end if;
gpr_write_valid <= gpr_write_valid_in;
cr_write_valid <= cr_write_in;
else
@ -237,7 +270,7 @@ begin

-- update outputs
valid_out <= valid_tmp;
stall_out <= stall_tmp;
stall_out <= stall_tmp or deferred;

-- update registers
rin_int <= v_int;

@ -11,7 +11,8 @@ entity core is
SIM : boolean := false;
DISABLE_FLATTEN : boolean := false;
EX1_BYPASS : boolean := true;
ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0')
ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
LOG_LENGTH : natural := 512
);
port (
clk : in std_ulogic;
@ -41,16 +42,14 @@ entity core is
end core;

architecture behave of core is
-- fetch signals
signal fetch2_to_decode1: Fetch2ToDecode1Type;

-- icache signals
signal fetch1_to_icache : Fetch1ToIcacheType;
signal icache_to_fetch2 : IcacheToFetch2Type;
signal icache_to_decode1 : IcacheToDecode1Type;
signal mmu_to_icache : MmuToIcacheType;

-- decode signals
signal decode1_to_decode2: Decode1ToDecode2Type;
signal decode1_to_fetch1: Decode1ToFetch1Type;
signal decode2_to_execute1: Decode2ToExecute1Type;

-- register file signals
@ -83,16 +82,18 @@ architecture behave of core is
-- local signals
signal fetch1_stall_in : std_ulogic;
signal icache_stall_out : std_ulogic;
signal fetch2_stall_in : std_ulogic;
signal icache_stall_in : std_ulogic;
signal decode1_stall_in : std_ulogic;
signal decode2_stall_in : std_ulogic;
signal decode1_busy : std_ulogic;
signal decode2_busy_in : std_ulogic;
signal decode2_stall_out : std_ulogic;
signal ex1_icache_inval: std_ulogic;
signal ex1_stall_out: std_ulogic;
signal ls1_stall_out: std_ulogic;
signal ex1_busy_out: std_ulogic;
signal dcache_stall_out: std_ulogic;

signal flush: std_ulogic;
signal decode1_flush: std_ulogic;
signal fetch1_flush: std_ulogic;

signal complete: std_ulogic;
signal terminate: std_ulogic;
@ -128,6 +129,12 @@ architecture behave of core is
-- Debug status
signal dbg_core_is_stopped: std_ulogic;

-- Logging signals
signal log_data : std_ulogic_vector(255 downto 0);
signal log_rd_addr : std_ulogic_vector(31 downto 0);
signal log_wr_addr : std_ulogic_vector(31 downto 0);
signal log_rd_data : std_ulogic_vector(63 downto 0);

function keep_h(disable : boolean) return string is
begin
if disable then
@ -139,7 +146,6 @@ architecture behave of core is
attribute keep_hierarchy : string;
attribute keep_hierarchy of fetch1_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of icache_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of fetch2_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of decode1_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of decode2_0 : label is keep_h(DISABLE_FLATTEN);
attribute keep_hierarchy of register_file_0 : label is keep_h(DISABLE_FLATTEN);
@ -180,45 +186,40 @@ begin
rst => rst_fetch1,
alt_reset_in => alt_reset_d,
stall_in => fetch1_stall_in,
flush_in => flush,
flush_in => fetch1_flush,
stop_in => dbg_core_stop,
d_in => decode1_to_fetch1,
e_in => execute1_to_fetch1,
i_out => fetch1_to_icache
i_out => fetch1_to_icache,
log_out => log_data(42 downto 0)
);

fetch1_stall_in <= icache_stall_out or decode2_stall_out;
fetch1_stall_in <= icache_stall_out or decode1_busy;
fetch1_flush <= flush or decode1_flush;

icache_0: entity work.icache
generic map(
SIM => SIM,
LINE_SIZE => 64,
NUM_LINES => 32,
NUM_LINES => 64,
NUM_WAYS => 2
)
port map(
clk => clk,
rst => rst_icache,
i_in => fetch1_to_icache,
i_out => icache_to_fetch2,
i_out => icache_to_decode1,
m_in => mmu_to_icache,
flush_in => flush,
flush_in => fetch1_flush,
inval_in => dbg_icache_rst or ex1_icache_inval,
stall_in => icache_stall_in,
stall_out => icache_stall_out,
wishbone_out => wishbone_insn_out,
wishbone_in => wishbone_insn_in
);

fetch2_0: entity work.fetch2
port map (
clk => clk,
rst => rst_fetch2,
stall_in => fetch2_stall_in,
flush_in => flush,
i_in => icache_to_fetch2,
f_out => fetch2_to_decode1
wishbone_in => wishbone_insn_in,
log_out => log_data(96 downto 43)
);

fetch2_stall_in <= decode2_stall_out;
icache_stall_in <= decode1_busy;

decode1_0: entity work.decode1
port map (
@ -226,8 +227,12 @@ begin
rst => rst_dec1,
stall_in => decode1_stall_in,
flush_in => flush,
f_in => fetch2_to_decode1,
d_out => decode1_to_decode2
flush_out => decode1_flush,
busy_out => decode1_busy,
f_in => icache_to_decode1,
d_out => decode1_to_decode2,
f_out => decode1_to_fetch1,
log_out => log_data(109 downto 97)
);

decode1_stall_in <= decode2_stall_out;
@ -239,7 +244,7 @@ begin
port map (
clk => clk,
rst => rst_dec2,
stall_in => decode2_stall_in,
busy_in => decode2_busy_in,
stall_out => decode2_stall_out,
flush_in => flush,
complete_in => complete,
@ -249,9 +254,10 @@ begin
r_in => register_file_to_decode2,
r_out => decode2_to_register_file,
c_in => cr_file_to_decode2,
c_out => decode2_to_cr_file
c_out => decode2_to_cr_file,
log_out => log_data(119 downto 110)
);
decode2_stall_in <= ex1_stall_out or ls1_stall_out;
decode2_busy_in <= ex1_busy_out;

register_file_0: entity work.register_file
generic map (
@ -267,7 +273,8 @@ begin
dbg_gpr_addr => dbg_gpr_addr,
dbg_gpr_data => dbg_gpr_data,
sim_dump => terminate,
sim_dump_done => sim_cr_dump
sim_dump_done => sim_cr_dump,
log_out => log_data(255 downto 185)
);

cr_file_0: entity work.cr_file
@ -279,7 +286,8 @@ begin
d_in => decode2_to_cr_file,
d_out => cr_file_to_decode2,
w_in => writeback_to_cr_file,
sim_dump => sim_cr_dump
sim_dump => sim_cr_dump,
log_out => log_data(184 downto 172)
);

execute1_0: entity work.execute1
@ -290,7 +298,7 @@ begin
clk => clk,
rst => rst_ex1,
flush_out => flush,
stall_out => ex1_stall_out,
busy_out => ex1_busy_out,
e_in => decode2_to_execute1,
l_in => loadstore1_to_execute1,
ext_irq_in => ext_irq,
@ -299,7 +307,11 @@ begin
e_out => execute1_to_writeback,
icache_inval => ex1_icache_inval,
dbg_msr_out => msr,
terminate_out => terminate
terminate_out => terminate,
log_out => log_data(134 downto 120),
log_rd_addr => log_rd_addr,
log_rd_data => log_rd_data,
log_wr_addr => log_wr_addr
);

loadstore1_0: entity work.loadstore1
@ -314,7 +326,7 @@ begin
m_out => loadstore1_to_mmu,
m_in => mmu_to_loadstore1,
dc_stall => dcache_stall_out,
stall_out => ls1_stall_out
log_out => log_data(149 downto 140)
);

mmu_0: entity work.mmu
@ -331,7 +343,7 @@ begin
dcache_0: entity work.dcache
generic map(
LINE_SIZE => 64,
NUM_LINES => 32,
NUM_LINES => 64,
NUM_WAYS => 2
)
port map (
@ -343,7 +355,8 @@ begin
m_out => dcache_to_mmu,
stall_out => dcache_stall_out,
wishbone_in => wishbone_data_in,
wishbone_out => wishbone_data_out
wishbone_out => wishbone_data_out,
log_out => log_data(171 downto 152)
);

writeback_0: entity work.writeback
@ -356,7 +369,13 @@ begin
complete_out => complete
);

log_data(151 downto 150) <= "00";
log_data(139 downto 135) <= "00000";

debug_0: entity work.core_debug
generic map (
LOG_LENGTH => LOG_LENGTH
)
port map (
clk => clk,
rst => rst_dbg,
@ -377,6 +396,10 @@ begin
dbg_gpr_ack => dbg_gpr_ack,
dbg_gpr_addr => dbg_gpr_addr,
dbg_gpr_data => dbg_gpr_data,
log_data => log_data,
log_read_addr => log_rd_addr,
log_read_data => log_rd_data,
log_write_addr => log_wr_addr,
terminated_out => terminated_out
);


@ -3,9 +3,14 @@ use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.utils.all;
use work.common.all;

entity core_debug is
generic (
-- Length of log buffer
LOG_LENGTH : natural := 512
);
port (
clk : in std_logic;
rst : in std_logic;
@ -34,6 +39,12 @@ entity core_debug is
dbg_gpr_addr : out gspr_index_t;
dbg_gpr_data : in std_ulogic_vector(63 downto 0);

-- Core logging data
log_data : in std_ulogic_vector(255 downto 0);
log_read_addr : in std_ulogic_vector(31 downto 0);
log_read_data : out std_ulogic_vector(63 downto 0);
log_write_addr : out std_ulogic_vector(31 downto 0);

-- Misc
terminated_out : out std_ulogic
);
@ -77,6 +88,12 @@ architecture behave of core_debug is
-- GSPR register data
constant DBG_CORE_GSPR_DATA : std_ulogic_vector(3 downto 0) := "0101";

-- Log buffer address and data registers
constant DBG_CORE_LOG_ADDR : std_ulogic_vector(3 downto 0) := "0110";
constant DBG_CORE_LOG_DATA : std_ulogic_vector(3 downto 0) := "0111";

constant LOG_INDEX_BITS : natural := log2(LOG_LENGTH);

-- Some internal wires
signal stat_reg : std_ulogic_vector(63 downto 0);

@ -89,6 +106,12 @@ architecture behave of core_debug is
signal do_gspr_rd : std_ulogic;
signal gspr_index : gspr_index_t;

signal log_dmi_addr : std_ulogic_vector(31 downto 0) := (others => '0');
signal log_dmi_data : std_ulogic_vector(63 downto 0) := (others => '0');
signal do_dmi_log_rd : std_ulogic;
signal dmi_read_log_data : std_ulogic;
signal dmi_read_log_data_1 : std_ulogic;

begin
-- Single cycle register accesses on DMI except for GSPR data
dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA
@ -108,6 +131,8 @@ begin
nia when DBG_CORE_NIA,
msr when DBG_CORE_MSR,
dbg_gpr_data when DBG_CORE_GSPR_DATA,
log_write_addr & log_dmi_addr when DBG_CORE_LOG_ADDR,
log_dmi_data when DBG_CORE_LOG_DATA,
(others => '0') when others;

-- DMI writes
@ -118,6 +143,7 @@ begin
do_step <= '0';
do_reset <= '0';
do_icreset <= '0';
do_dmi_log_rd <= '0';

if (rst) then
stopping <= '0';
@ -151,10 +177,25 @@ begin
end if;
elsif dmi_addr = DBG_CORE_GSPR_INDEX then
gspr_index <= dmi_din(gspr_index_t'left downto 0);
elsif dmi_addr = DBG_CORE_LOG_ADDR then
log_dmi_addr <= dmi_din(31 downto 0);
do_dmi_log_rd <= '1';
end if;
else
report("DMI read from " & to_string(dmi_addr));
end if;

elsif dmi_read_log_data = '0' and dmi_read_log_data_1 = '1' then
-- Increment log_dmi_addr after the end of a read from DBG_CORE_LOG_DATA
log_dmi_addr(LOG_INDEX_BITS + 1 downto 0) <=
std_ulogic_vector(unsigned(log_dmi_addr(LOG_INDEX_BITS+1 downto 0)) + 1);
do_dmi_log_rd <= '1';
end if;
dmi_read_log_data_1 <= dmi_read_log_data;
if dmi_req = '1' and dmi_addr = DBG_CORE_LOG_DATA then
dmi_read_log_data <= '1';
else
dmi_read_log_data <= '0';
end if;

-- Set core stop on terminate. We'll be stopping some time *after*
@ -175,5 +216,87 @@ begin
core_rst <= do_reset;
icache_rst <= do_icreset;
terminated_out <= terminated;

-- Logging RAM
maybe_log: if LOG_LENGTH > 0 generate
subtype log_ptr_t is unsigned(LOG_INDEX_BITS - 1 downto 0);
type log_array_t is array(0 to LOG_LENGTH - 1) of std_ulogic_vector(255 downto 0);
signal log_array : log_array_t;
signal log_rd_ptr : log_ptr_t;
signal log_wr_ptr : log_ptr_t;
signal log_toggle : std_ulogic;
signal log_wr_enable : std_ulogic;
signal log_rd_ptr_latched : log_ptr_t;
signal log_rd : std_ulogic_vector(255 downto 0);
signal log_dmi_reading : std_ulogic;
signal log_dmi_read_done : std_ulogic;

-- Pick one 64-bit doubleword out of a 256-bit log entry.
-- The two least-significant bits of addr choose the doubleword:
-- 0 selects data(63 downto 0), 3 selects data(255 downto 192).
-- The upper bits of addr are ignored here.
function select_dword(data : std_ulogic_vector(255 downto 0);
                      addr : std_ulogic_vector(31 downto 0)) return std_ulogic_vector is
    variable dword_idx : natural range 0 to 3;
begin
    dword_idx := to_integer(unsigned(addr(1 downto 0)));
    return data(64 * dword_idx + 63 downto 64 * dword_idx);
end;

attribute ram_style : string;
attribute ram_style of log_array : signal is "block";
attribute ram_decomp : string;
attribute ram_decomp of log_array : signal is "power";

begin
-- Use MSB of read addresses to stop the logging
log_wr_enable <= not (log_read_addr(31) or log_dmi_addr(31));

-- Log storage: one synchronous write port and one synchronous read port
-- on log_array (which is tagged with ram_style "block").
-- The statement order is kept as-is so that tools can map it to a RAM.
log_ram: process(clk)
begin
if rising_edge(clk) then
-- Write side: store this cycle's 256-bit log_data at the write pointer,
-- unless logging has been stopped (log_wr_enable = '0').
if log_wr_enable = '1' then
log_array(to_integer(log_wr_ptr)) <= log_data;
end if;
-- Read side: registered read of the entry selected by the latched read
-- pointer; log_rd is therefore valid one cycle after the pointer changes.
log_rd <= log_array(to_integer(log_rd_ptr_latched));
end if;
end process;


-- Log pointer management and read-data capture.
--
-- Write side: log_wr_ptr advances every cycle while log_wr_enable is set;
-- log_toggle flips each time the pointer passes LOG_LENGTH - 1.
-- NOTE(review): log_toggle is not read in the visible code; confirm it is
-- still needed.
--
-- Read side: there are two readers, the DMI (log_dmi_addr) and the core
-- (log_read_addr).  Address bits LOG_INDEX_BITS+1 downto 2 select the
-- 256-bit entry and bits 1 downto 0 select the 64-bit doubleword within it.
-- A DMI read is tracked through the RAM latency by a two-stage delay line:
--   do_dmi_log_rd     -> read pointer latched from log_dmi_addr
--   log_dmi_reading   -> log_ram registers the entry into log_rd
--   log_dmi_read_done -> the selected doubleword is captured in log_dmi_data
-- In every cycle where no DMI read is completing, log_read_data is
-- refreshed for the core-side reader instead.
--
-- The previously declared process variables (b, data) were never used and
-- have been removed.
log_buffer: process(clk)
begin
    if rising_edge(clk) then
        -- Write pointer and wrap indicator
        if rst = '1' then
            log_wr_ptr <= (others => '0');
            log_toggle <= '0';
        elsif log_wr_enable = '1' then
            if log_wr_ptr = to_unsigned(LOG_LENGTH - 1, LOG_INDEX_BITS) then
                log_toggle <= not log_toggle;
            end if;
            log_wr_ptr <= log_wr_ptr + 1;
        end if;

        -- Read pointer: a DMI read takes the RAM read port for the cycle in
        -- which it is requested, otherwise the port follows log_read_addr.
        if do_dmi_log_rd = '1' then
            log_rd_ptr_latched <= unsigned(log_dmi_addr(LOG_INDEX_BITS + 1 downto 2));
        else
            log_rd_ptr_latched <= unsigned(log_read_addr(LOG_INDEX_BITS + 1 downto 2));
        end if;

        -- Route the RAM output to whichever reader it belongs to.
        if log_dmi_read_done = '1' then
            log_dmi_data <= select_dword(log_rd, log_dmi_addr);
        else
            log_read_data <= select_dword(log_rd, log_read_addr);
        end if;

        -- Delay line matching the two-cycle latency of a DMI read.
        log_dmi_read_done <= log_dmi_reading;
        log_dmi_reading <= do_dmi_log_rd;
    end if;
end process;
log_write_addr(LOG_INDEX_BITS - 1 downto 0) <= std_ulogic_vector(log_wr_ptr);
log_write_addr(LOG_INDEX_BITS) <= '1';
log_write_addr(31 downto LOG_INDEX_BITS + 1) <= (others => '0');
end generate;

-- With logging compiled out (LOG_LENGTH = 0) there is no buffer: drive the
-- log outputs with constants.  The write address reads back as 1 and the
-- read data as all zeros.
no_log: if LOG_LENGTH = 0 generate
begin
    log_write_addr <= (0 => '1', others => '0');
    log_read_data <= x"0000000000000000";
end generate;

end behave;


@ -18,7 +18,9 @@ entity cr_file is
w_in : in WritebackToCrFileType;

-- debug
sim_dump : in std_ulogic
sim_dump : in std_ulogic;

log_out : out std_ulogic_vector(12 downto 0)
);
end entity cr_file;

@ -27,6 +29,7 @@ architecture behaviour of cr_file is
signal crs_updated : std_ulogic_vector(31 downto 0);
signal xerc : xer_common_t := xerc_init;
signal xerc_updated : xer_common_t;
signal log_data : std_ulogic_vector(12 downto 0);
begin
cr_create_0: process(all)
variable hi, lo : integer := 0;
@ -88,4 +91,14 @@ begin
end process;
end generate;

-- Registered snapshot of the CR write port for the core log.
-- Layout, MSB first: write_cr_enable, write_cr_data(31 downto 28),
-- then write_cr_mask in the remaining low bits.
cr_log: process(clk)
    variable snapshot : std_ulogic_vector(12 downto 0);
begin
    if rising_edge(clk) then
        snapshot := w_in.write_cr_enable &
                    w_in.write_cr_data(31 downto 28) &
                    w_in.write_cr_mask;
        log_data <= snapshot;
    end if;
end process;

-- Export the registered log word.
log_out <= log_data;

end architecture behaviour;

@ -4,11 +4,15 @@ use ieee.numeric_std.all;

entity cr_hazard is
generic (
PIPELINE_DEPTH : natural := 2
PIPELINE_DEPTH : natural := 1
);
port(
clk : in std_ulogic;
stall_in : in std_ulogic;
busy_in : in std_ulogic;
deferred : in std_ulogic;
complete_in : in std_ulogic;
flush_in : in std_ulogic;
issuing : in std_ulogic;

cr_read_in : in std_ulogic;
cr_write_in : in std_ulogic;
@ -22,7 +26,7 @@ architecture behaviour of cr_hazard is
end record;
constant pipeline_entry_init : pipeline_entry_type := (valid => '0');

type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type;
type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type;
constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init);

signal r, rin : pipeline_t := pipeline_t_init;
@ -30,10 +34,8 @@ begin
cr_hazard0: process(clk)
begin
if rising_edge(clk) then
if stall_in = '0' then
r <= rin;
end if;
end if;
end process;

cr_hazard1: process(all)
@ -41,22 +43,23 @@ begin
begin
v := r;

stall_out <= '0';
loop_0: for i in 0 to PIPELINE_DEPTH-1 loop
if (r(i).valid = cr_read_in) then
stall_out <= '1';
-- XXX assumes PIPELINE_DEPTH = 1
if complete_in = '1' then
v(1).valid := '0';
end if;
end loop;
stall_out <= cr_read_in and (v(0).valid or v(1).valid);

-- XXX assumes PIPELINE_DEPTH = 1
if busy_in = '0' then
v(1) := r(0);
v(0).valid := '0';
end if;
if deferred = '0' and issuing = '1' then
v(0).valid := cr_write_in;
loop_1: for i in 0 to PIPELINE_DEPTH-2 loop
-- propagate to next slot
v(i+1) := r(i);
end loop;

-- asynchronous output
if cr_read_in = '0' then
stall_out <= '0';
end if;
if flush_in = '1' then
v(0).valid := '0';
v(1).valid := '0';
end if;

-- update registers

File diff suppressed because it is too large Load Diff

@ -13,14 +13,19 @@ entity decode1 is

stall_in : in std_ulogic;
flush_in : in std_ulogic;
busy_out : out std_ulogic;
flush_out : out std_ulogic;

f_in : in Fetch2ToDecode1Type;
d_out : out Decode1ToDecode2Type
f_in : in IcacheToDecode1Type;
f_out : out Decode1ToFetch1Type;
d_out : out Decode1ToDecode2Type;
log_out : out std_ulogic_vector(12 downto 0)
);
end entity decode1;

architecture behaviour of decode1 is
signal r, rin : Decode1ToDecode2Type;
signal s : Decode1ToDecode2Type;

subtype major_opcode_t is unsigned(5 downto 0);
type major_rom_array_t is array(0 to 63) of decode_rom_t;
@ -352,24 +357,45 @@ architecture behaviour of decode1 is
constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');

signal log_data : std_ulogic_vector(12 downto 0);

begin
decode1_0: process(clk)
begin
if rising_edge(clk) then
-- Output state remains unchanged on stall, unless we are flushing
if rst = '1' or flush_in = '1' or stall_in = '0' then
if rst = '1' then
r <= Decode1ToDecode2Init;
s <= Decode1ToDecode2Init;
elsif flush_in = '1' then
r.valid <= '0';
s.valid <= '0';
elsif s.valid = '1' then
if stall_in = '0' then
r <= s;
s.valid <= '0';
end if;
else
s <= rin;
s.valid <= rin.valid and r.valid and stall_in;
if r.valid = '0' or stall_in = '0' then
r <= rin;
end if;
end if;
end if;
end process;
busy_out <= s.valid;

decode1_1: process(all)
variable v : Decode1ToDecode2Type;
variable f : Decode1ToFetch1Type;
variable majorop : major_opcode_t;
variable op_19_bits: std_ulogic_vector(2 downto 0);
variable sprn : spr_num_t;
variable br_nia : std_ulogic_vector(61 downto 0);
variable br_target : std_ulogic_vector(61 downto 0);
variable br_offset : signed(23 downto 0);
begin
v := r;
v := Decode1ToDecode2Init;

v.valid := f_in.valid;
v.nia := f_in.nia;
@ -395,6 +421,31 @@ begin
-- major opcode 31, lots of things
v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1))));

-- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path
sprn := decode_spr_num(f_in.insn);
v.ispr1 := fast_spr_num(sprn);

if std_match(f_in.insn(10 downto 1), "01-1010011") then
-- mfspr or mtspr
-- Make slow SPRs single issue
if is_fast_spr(v.ispr1) = '0' then
v.decode.sgl_pipe := '1';
-- send MMU-related SPRs to loadstore1
case sprn is
when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL =>
v.decode.unit := LDST;
when others =>
end case;
end if;
end if;

elsif majorop = "010000" then
-- CTR may be needed as input to bc
v.decode := major_decode_rom_array(to_integer(majorop));
if f_in.insn(23) = '0' then
v.ispr1 := fast_spr_num(SPR_CTR);
end if;

elsif majorop = "010011" then
if decode_op_19_valid(to_integer(unsigned(f_in.insn(10 downto 1)))) = '0' then
report "op 19 illegal subcode";
@ -405,73 +456,83 @@ begin
report "op 19 sub " & to_hstring(op_19_bits);
end if;

elsif majorop = "011110" then
v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1))));

elsif majorop = "111010" then
v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0))));

elsif majorop = "111110" then
v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0))));

elsif std_match(f_in.insn, "01100000000000000000000000000000") then
report "PPC_nop";
v.decode := nop_instr;

else
v.decode := major_decode_rom_array(to_integer(majorop));
end if;

-- Set ISPR1/ISPR2 when needed
if v.decode.insn_type = OP_BC or v.decode.insn_type = OP_BCREG then
-- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path
if f_in.insn(2) = '0' then
-- Could be OP_BCREG: bclr, bcctr, bctar
-- Branch uses CTR as condition when BO(2) is 0. This is
-- also used to indicate that CTR is modified (they go
-- together).
--
if f_in.insn(23) = '0' then
v.ispr1 := fast_spr_num(SPR_CTR);
end if;

-- Branch source register is an SPR
if v.decode.insn_type = OP_BCREG then
-- TODO: Add TAR
if f_in.insn(10) = '0' then
v.ispr2 := fast_spr_num(SPR_LR);
else
v.ispr2 := fast_spr_num(SPR_CTR);
end if;
end if;
elsif v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then
sprn := decode_spr_num(f_in.insn);
v.ispr1 := fast_spr_num(sprn);
-- Make slow SPRs single issue
if is_fast_spr(v.ispr1) = '0' then
v.decode.sgl_pipe := '1';
-- send MMU-related SPRs to loadstore1
case sprn is
when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL =>
v.decode.unit := LDST;
when others =>
end case;
end if;
elsif v.decode.insn_type = OP_RFID then
report "PPC RFID";
else
-- Could be OP_RFID
v.ispr1 := fast_spr_num(SPR_SRR0);
v.ispr2 := fast_spr_num(SPR_SRR1);
end if;

if flush_in = '1' then
v.valid := '0';
elsif majorop = "011110" then
v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1))));

elsif majorop = "111010" then
v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0))));

elsif majorop = "111110" then
v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0))));

elsif std_match(f_in.insn, "01100000000000000000000000000000") then
report "PPC_nop";
v.decode := nop_instr;

else
v.decode := major_decode_rom_array(to_integer(majorop));
end if;

if rst = '1' then
v := Decode1ToDecode2Init;
-- Branch predictor
-- Note bclr, bcctr and bctar are predicted not taken as we have no
-- count cache or link stack.
br_offset := (others => '0');
if majorop = 18 then
-- Unconditional branches are always taken
v.br_pred := '1';
br_offset := signed(f_in.insn(25 downto 2));
elsif majorop = 16 then
-- Predict backward branches as taken, forward as untaken
v.br_pred := f_in.insn(15);
br_offset := resize(signed(f_in.insn(15 downto 2)), 24);
end if;
br_nia := f_in.nia(63 downto 2);
if f_in.insn(1) = '1' then
br_nia := (others => '0');
end if;
br_target := std_ulogic_vector(signed(br_nia) + br_offset);
f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid;
f.redirect_nia := br_target & "00";

-- Update registers
rin <= v;

-- Update outputs
d_out <= r;
f_out <= f;
flush_out <= f.redirect;
end process;

-- Registered snapshot of decode1's output register for the core log.
-- Layout, MSB first:
--   bits 12..7  position number of r.decode.insn_type (6 bits)
--   bits  6..3  r.nia(5 downto 2)
--   bits  2..1  position number of r.decode.unit (2 bits)
--   bit   0     r.valid
dec1_log : process(clk)
    variable insn_code : std_ulogic_vector(5 downto 0);
    variable unit_code : std_ulogic_vector(1 downto 0);
begin
    if rising_edge(clk) then
        insn_code := std_ulogic_vector(to_unsigned(insn_type_t'pos(r.decode.insn_type), 6));
        unit_code := std_ulogic_vector(to_unsigned(unit_t'pos(r.decode.unit), 2));
        log_data <= insn_code & r.nia(5 downto 2) & unit_code & r.valid;
    end if;
end process;

-- Export the registered log word.
log_out <= log_data;

end architecture behaviour;

@ -17,7 +17,7 @@ entity decode2 is
rst : in std_ulogic;

complete_in : in std_ulogic;
stall_in : in std_ulogic;
busy_in : in std_ulogic;
stall_out : out std_ulogic;

stopped_out : out std_ulogic;
@ -32,7 +32,9 @@ entity decode2 is
r_out : out Decode2ToRegisterFileType;

c_in : in CrFileToDecode2Type;
c_out : out Decode2ToCrFileType
c_out : out Decode2ToCrFileType;

log_out : out std_ulogic_vector(9 downto 0)
);
end entity decode2;

@ -43,6 +45,10 @@ architecture behaviour of decode2 is

signal r, rin : reg_type;

signal deferred : std_ulogic;

signal log_data : std_ulogic_vector(9 downto 0);

type decode_input_reg_t is record
reg_valid : std_ulogic;
reg : gspr_index_t;
@ -61,8 +67,6 @@ architecture behaviour of decode2 is
return decode_input_reg_t is
begin
if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then
assert is_fast_spr(ispr) = '0' report "Decode A says GPR but ISPR says SPR:" &
to_hstring(ispr) severity failure;
return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data);
elsif t = SPR then
-- ISPR must be either a valid fast SPR number or all 0 for a slow SPR.
@ -87,8 +91,6 @@ architecture behaviour of decode2 is
begin
case t is
when RB =>
assert is_fast_spr(ispr) = '0' report "Decode B says GPR but ISPR says SPR:" &
to_hstring(ispr) severity failure;
ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data);
when CONST_UI =>
ret := ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64)));
@ -196,6 +198,9 @@ architecture behaviour of decode2 is
signal gpr_write : gspr_index_t;
signal gpr_bypassable : std_ulogic;

signal update_gpr_write_valid : std_ulogic;
signal update_gpr_write_reg : gspr_index_t;

signal gpr_a_read_valid : std_ulogic;
signal gpr_a_read :gspr_index_t;
signal gpr_a_bypass : std_ulogic;
@ -220,7 +225,8 @@ begin

complete_in => complete_in,
valid_in => control_valid_in,
stall_in => stall_in,
busy_in => busy_in,
deferred => deferred,
flush_in => flush_in,
sgl_pipe_in => control_sgl_pipe,
stop_mark_in => d_in.stop_mark,
@ -229,6 +235,9 @@ begin
gpr_write_in => gpr_write,
gpr_bypassable => gpr_bypassable,

update_gpr_write_valid => update_gpr_write_valid,
update_gpr_write_reg => update_gpr_write_reg,

gpr_a_read_valid_in => gpr_a_read_valid,
gpr_a_read_in => gpr_a_read,

@ -250,18 +259,24 @@ begin
gpr_bypass_c => gpr_c_bypass
);

deferred <= r.e.valid and busy_in;

-- Pipeline register for decode2: copies the next-state record rin into r
-- on the rising edge of clk.
decode2_0: process(clk)
begin
if rising_edge(clk) then
-- Hold the current contents while deferred is set (deferred is driven as
-- r.e.valid and busy_in); reset and flush always let the update through.
if rst = '1' or flush_in = '1' or deferred = '0' then
if rin.e.valid = '1' then
-- Simulation trace of the instruction address being latched.
report "execute " & to_hstring(rin.e.nia);
end if;
r <= rin;
end if;
end if;
end process;

r_out.read1_reg <= gpr_or_spr_to_gspr(insn_ra(d_in.insn), d_in.ispr1);
r_out.read2_reg <= gpr_or_spr_to_gspr(insn_rb(d_in.insn), d_in.ispr2);
r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR
else gpr_to_gspr(insn_ra(d_in.insn));
r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR
else gpr_to_gspr(insn_rb(d_in.insn));
r_out.read3_reg <= insn_rs(d_in.insn);

c_out.read <= d_in.decode.input_cr;
@ -343,6 +358,7 @@ begin
v.e.sign_extend := d_in.decode.sign_extend;
v.e.update := d_in.decode.update;
v.e.reserve := d_in.decode.reserve;
v.e.br_pred := d_in.br_pred;

-- issue control
control_valid_in <= d_in.valid;
@ -354,6 +370,13 @@ begin
if EX1_BYPASS and d_in.decode.unit = ALU then
gpr_bypassable <= '1';
end if;
update_gpr_write_valid <= d_in.decode.update;
update_gpr_write_reg <= decoded_reg_a.reg;
if v.e.lr = '1' then
-- there are no instructions that have both update=1 and lr=1
update_gpr_write_valid <= '1';
update_gpr_write_reg <= fast_spr_num(SPR_LR);
end if;

gpr_a_read_valid <= decoded_reg_a.reg_valid;
gpr_a_read <= decoded_reg_a.reg;
@ -371,7 +394,7 @@ begin
v.e.insn_type := OP_ILLEGAL;
end if;

if rst = '1' then
if rst = '1' or flush_in = '1' then
v.e := Decode2ToExecute1Init;
end if;

@ -381,4 +404,19 @@ begin
-- Update outputs
e_out <= r.e;
end process;

-- Registers a 10-bit snapshot of decode2 state on every rising clock edge
-- for the log output.  Bit assignment within log_data:
--   9..6  r.e.nia(5 downto 2)
--   5     r.e.valid
--   4     stopped_out
--   3     stall_out
--   2     r.e.bypass_data3
--   1     r.e.bypass_data2
--   0     r.e.bypass_data1
dec2_log : process(clk)
begin
    if rising_edge(clk) then
        log_data(9 downto 6) <= r.e.nia(5 downto 2);
        log_data(5) <= r.e.valid;
        log_data(4) <= stopped_out;
        log_data(3) <= stall_out;
        log_data(2) <= r.e.bypass_data3;
        log_data(1) <= r.e.bypass_data2;
        log_data(0) <= r.e.bypass_data1;
    end if;
end process;
log_out <= log_data;

end architecture behaviour;

@ -20,7 +20,7 @@ entity execute1 is

-- asynchronous
flush_out : out std_ulogic;
stall_out : out std_ulogic;
busy_out : out std_ulogic;

e_in : in Decode2ToExecute1Type;
l_in : in Loadstore1ToExecute1Type;
@ -36,34 +36,44 @@ entity execute1 is
dbg_msr_out : out std_ulogic_vector(63 downto 0);

icache_inval : out std_ulogic;
terminate_out : out std_ulogic
terminate_out : out std_ulogic;

log_out : out std_ulogic_vector(14 downto 0);
log_rd_addr : out std_ulogic_vector(31 downto 0);
log_rd_data : in std_ulogic_vector(63 downto 0);
log_wr_addr : in std_ulogic_vector(31 downto 0)
);
end entity execute1;

architecture behaviour of execute1 is
type reg_type is record
e : Execute1ToWritebackType;
busy: std_ulogic;
terminate: std_ulogic;
lr_update : std_ulogic;
next_lr : std_ulogic_vector(63 downto 0);
mul_in_progress : std_ulogic;
div_in_progress : std_ulogic;
cntz_in_progress : std_ulogic;
slow_op_insn : insn_type_t;
slow_op_dest : gpr_index_t;
slow_op_rc : std_ulogic;
slow_op_oe : std_ulogic;
slow_op_xerc : xer_common_t;
ldst_nia : std_ulogic_vector(63 downto 0);
last_nia : std_ulogic_vector(63 downto 0);
log_addr_spr : std_ulogic_vector(31 downto 0);
end record;
constant reg_type_init : reg_type :=
(e => Execute1ToWritebackInit, lr_update => '0',
(e => Execute1ToWritebackInit, busy => '0', lr_update => '0', terminate => '0',
mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0',
slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
next_lr => (others => '0'), ldst_nia => (others => '0'), others => (others => '0'));
slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0'));

signal r, rin : reg_type;

signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);

signal valid_in : std_ulogic;
signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0'));
signal ctrl_tmp: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0'));
signal right_shift, rot_clear_left, rot_clear_right: std_ulogic;
@ -72,8 +82,6 @@ architecture behaviour of execute1 is
signal rotator_carry: std_ulogic;
signal logical_result: std_ulogic_vector(63 downto 0);
signal countzero_result: std_ulogic_vector(63 downto 0);
signal popcnt_result: std_ulogic_vector(63 downto 0);
signal parity_result: std_ulogic_vector(63 downto 0);

-- multiply signals
signal x_to_multiply: Execute1ToMultiplyType;
@ -83,6 +91,11 @@ architecture behaviour of execute1 is
signal x_to_divider: Execute1ToDividerType;
signal divider_to_x: DividerToExecute1Type;

-- signals for logging
signal exception_log : std_ulogic;
signal irq_valid_log : std_ulogic;
signal log_data : std_ulogic_vector(14 downto 0);

type privilege_level is (USER, SUPER);
type op_privilege_array is array(insn_type_t) of privilege_level;
constant op_privilege: op_privilege_array := (
@ -193,9 +206,7 @@ begin
invert_in => e_in.invert_a,
invert_out => e_in.invert_out,
result => logical_result,
datalen => e_in.data_len,
popcnt => popcnt_result,
parity => parity_result
datalen => e_in.data_len
);

countzero_0: entity work.zero_counter
@ -223,11 +234,17 @@ begin
);

dbg_msr_out <= ctrl.msr;
log_rd_addr <= r.log_addr_spr;

a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1;
b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2;
c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3;

busy_out <= l_in.busy or r.busy;
valid_in <= e_in.valid and not busy_out;

terminate_out <= r.terminate;

execute1_0: process(clk)
begin
if rising_edge(clk) then
@ -238,7 +255,7 @@ begin
else
r <= rin;
ctrl <= ctrl_tmp;
assert not (r.lr_update = '1' and e_in.valid = '1')
assert not (r.lr_update = '1' and valid_in = '1')
report "LR update collision with valid in EX1"
severity failure;
if r.lr_update = '1' then
@ -274,7 +291,6 @@ begin
variable sign1, sign2 : std_ulogic;
variable abs1, abs2 : signed(63 downto 0);
variable overflow : std_ulogic;
variable negative : std_ulogic;
variable zerohi, zerolo : std_ulogic;
variable msb_a, msb_b : std_ulogic;
variable a_lt : std_ulogic;
@ -284,11 +300,18 @@ begin
variable exception_nextpc : std_ulogic;
variable trapval : std_ulogic_vector(4 downto 0);
variable illegal : std_ulogic;
variable is_branch : std_ulogic;
variable taken_branch : std_ulogic;
variable abs_branch : std_ulogic;
variable spr_val : std_ulogic_vector(63 downto 0);
begin
result := (others => '0');
result_with_carry := (others => '0');
result_en := '0';
newcrf := (others => '0');
is_branch := '0';
taken_branch := '0';
abs_branch := '0';

v := r;
v.e := Execute1ToWritebackInit;
@ -334,32 +357,7 @@ begin
v.div_in_progress := '0';
v.cntz_in_progress := '0';

-- signals to multiply unit
x_to_multiply <= Execute1ToMultiplyInit;
x_to_multiply.insn_type <= e_in.insn_type;
x_to_multiply.is_32bit <= e_in.is_32bit;

if e_in.is_32bit = '1' then
if e_in.is_signed = '1' then
x_to_multiply.data1 <= (others => a_in(31));
x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0);
x_to_multiply.data2 <= (others => b_in(31));
x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0);
else
x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0);
x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0);
end if;
else
if e_in.is_signed = '1' then
x_to_multiply.data1 <= a_in(63) & a_in;
x_to_multiply.data2 <= b_in(63) & b_in;
else
x_to_multiply.data1 <= '0' & a_in;
x_to_multiply.data2 <= '0' & b_in;
end if;
end if;

-- signals to divide unit
-- signals to multiply and divide units
sign1 := '0';
sign2 := '0';
if e_in.is_signed = '1' then
@ -383,15 +381,22 @@ begin
abs2 := - signed(b_in);
end if;

x_to_multiply <= Execute1ToMultiplyInit;
x_to_multiply.is_32bit <= e_in.is_32bit;

x_to_divider <= Execute1ToDividerInit;
x_to_divider.is_signed <= e_in.is_signed;
x_to_divider.is_32bit <= e_in.is_32bit;
if e_in.insn_type = OP_MOD then
x_to_divider.is_modulus <= '1';
end if;

x_to_multiply.neg_result <= sign1 xor sign2;
x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
if e_in.is_32bit = '0' then
-- 64-bit forms
x_to_multiply.data1 <= std_ulogic_vector(abs1);
x_to_multiply.data2 <= std_ulogic_vector(abs2);
if e_in.insn_type = OP_DIVE then
x_to_divider.is_extended <= '1';
end if;
@ -399,6 +404,8 @@ begin
x_to_divider.divisor <= std_ulogic_vector(abs2);
else
-- 32-bit forms
x_to_multiply.data1 <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
x_to_divider.is_extended <= '0';
if e_in.insn_type = OP_DIVE then -- extended forms
x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
@ -426,9 +433,9 @@ begin
end if;
end if;

terminate_out <= '0';
v.terminate := '0';
icache_inval <= '0';
stall_out <= '0';
v.busy := '0';
f_out <= Execute1ToFetch1TypeInit;
-- send MSR[IR] and ~MSR[PR] up to fetch1
f_out.virt_mode <= ctrl.msr(MSR_IR);
@ -450,6 +457,9 @@ begin
v.e.exc_write_enable := '0';
v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
v.e.exc_write_data := e_in.nia;
if valid_in = '1' then
v.last_nia := e_in.nia;
end if;

if ctrl.irq_state = WRITE_SRR1 then
v.e.exc_write_reg := fast_spr_num(SPR_SRR1);
@ -466,10 +476,10 @@ begin
f_out.virt_mode <= '0';
f_out.priv_mode <= '1';
f_out.redirect_nia <= ctrl.irq_nia;
v.e.valid := e_in.valid;
v.e.valid := '1';
report "Writing SRR1: " & to_hstring(ctrl.srr1);

elsif irq_valid = '1' and e_in.valid = '1' then
elsif irq_valid = '1' and valid_in = '1' then
-- we need two cycles to write srr0 and 1
-- will need more when we have to write HEIR
-- Don't deliver the interrupt until we have a valid instruction
@ -477,7 +487,7 @@ begin
exception := '1';
ctrl_tmp.srr1 <= msr_copy(ctrl.msr);

elsif e_in.valid = '1' and ctrl.msr(MSR_PR) = '1' and
elsif valid_in = '1' and ctrl.msr(MSR_PR) = '1' and
instr_is_privileged(e_in.insn_type, e_in.insn) then
-- generate a program interrupt
exception := '1';
@ -487,12 +497,13 @@ begin
ctrl_tmp.srr1(63 - 45) <= '1';
report "privileged instruction";
elsif e_in.valid = '1' and e_in.unit = ALU then
elsif valid_in = '1' and e_in.unit = ALU then

report "execute nia " & to_hstring(e_in.nia);

v.e.valid := '1';
v.e.write_reg := e_in.write_reg;
v.slow_op_insn := e_in.insn_type;
v.slow_op_dest := gspr_to_gpr(e_in.write_reg);
v.slow_op_rc := e_in.rc;
v.slow_op_oe := e_in.oe;
@ -521,7 +532,7 @@ begin
-- check bits 1-10 of the instruction to make sure it's attn
-- if not then it is illegal
if e_in.insn(10 downto 1) = "0100000000" then
terminate_out <= '1';
v.terminate := '1';
report "ATTN";
else
illegal := '1';
@ -612,16 +623,13 @@ begin
end if;
end if;
end if;
when OP_AND | OP_OR | OP_XOR =>
when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS =>
result := logical_result;
result_en := '1';
when OP_B =>
f_out.redirect <= '1';
if (insn_aa(e_in.insn)) then
f_out.redirect_nia <= b_in;
else
f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
end if;
is_branch := '1';
taken_branch := '1';
abs_branch := insn_aa(e_in.insn);
when OP_BC =>
-- read_data1 is CTR
bo := insn_bo(e_in.insn);
@ -631,14 +639,9 @@ begin
result_en := '1';
v.e.write_reg := fast_spr_num(SPR_CTR);
end if;
if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
f_out.redirect <= '1';
if (insn_aa(e_in.insn)) then
f_out.redirect_nia <= b_in;
else
f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
end if;
end if;
is_branch := '1';
taken_branch := ppc_bc_taken(bo, bi, e_in.cr, a_in);
abs_branch := insn_aa(e_in.insn);
when OP_BCREG =>
-- read_data1 is CTR
-- read_data2 is target register (CTR, LR or TAR)
@ -649,7 +652,7 @@ begin
result_en := '1';
v.e.write_reg := fast_spr_num(SPR_CTR);
end if;
if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
if ppc_bc_taken(bo, bi, e_in.cr, a_in) = '1' then
f_out.redirect <= '1';
f_out.redirect_nia <= b_in(63 downto 2) & "00";
end if;
@ -670,27 +673,10 @@ begin
ctrl_tmp.msr(MSR_DR) <= '1';
end if;

when OP_CMPB =>
result := ppc_cmpb(c_in, b_in);
result_en := '1';
when OP_CNTZ =>
v.e.valid := '0';
v.cntz_in_progress := '1';
stall_out <= '1';
when OP_EXTS =>
-- note data_len is a 1-hot encoding
negative := (e_in.data_len(0) and c_in(7)) or
(e_in.data_len(1) and c_in(15)) or
(e_in.data_len(2) and c_in(31));
result := (others => negative);
if e_in.data_len(2) = '1' then
result(31 downto 16) := c_in(31 downto 16);
end if;
if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then
result(15 downto 8) := c_in(15 downto 8);
end if;
result(7 downto 0) := c_in(7 downto 0);
result_en := '1';
v.busy := '1';
when OP_ISEL =>
crbit := to_integer(unsigned(insn_bc(e_in.insn)));
if e_in.cr(31-crbit) = '1' then
@ -762,19 +748,25 @@ begin
result(63-45) := v.e.xerc.ca32;
end if;
else
spr_val := c_in;
case decode_spr_num(e_in.insn) is
when SPR_TB =>
result := ctrl.tb;
spr_val := ctrl.tb;
when SPR_DEC =>
result := ctrl.dec;
spr_val := ctrl.dec;
when 724 => -- LOG_ADDR SPR
spr_val := log_wr_addr & r.log_addr_spr;
when 725 => -- LOG_DATA SPR
spr_val := log_rd_data;
v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1);
when others =>
-- mfspr from unimplemented SPRs should be a nop in
-- supervisor mode and a program interrupt for user mode
result := c_in;
if ctrl.msr(MSR_PR) = '1' then
illegal := '1';
end if;
end case;
result := spr_val;
end if;
when OP_MFCR =>
if e_in.insn(20) = '0' then
@ -840,6 +832,8 @@ begin
case decode_spr_num(e_in.insn) is
when SPR_DEC =>
ctrl_tmp.dec <= c_in;
when 724 => -- LOG_ADDR SPR
v.log_addr_spr := c_in(31 downto 0);
when others =>
-- mtspr to unimplemented SPRs should be a nop in
-- supervisor mode and a program interrupt for user mode
@ -848,12 +842,6 @@ begin
end if;
end case;
end if;
when OP_POPCNT =>
result := popcnt_result;
result_en := '1';
when OP_PRTY =>
result := parity_result;
result_en := '1';
when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI =>
result := rotator_result;
if e_in.output_carry = '1' then
@ -871,53 +859,65 @@ begin
when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 =>
v.e.valid := '0';
v.mul_in_progress := '1';
stall_out <= '1';
v.busy := '1';
x_to_multiply.valid <= '1';

when OP_DIV | OP_DIVE | OP_MOD =>
v.e.valid := '0';
v.div_in_progress := '1';
stall_out <= '1';
v.busy := '1';
x_to_divider.valid <= '1';

when others =>
terminate_out <= '1';
v.terminate := '1';
report "illegal";
end case;

v.e.rc := e_in.rc and e_in.valid;
v.e.rc := e_in.rc and valid_in;

-- Mispredicted branches cause a redirect
if is_branch = '1' and taken_branch /= e_in.br_pred then
f_out.redirect <= '1';
if taken_branch = '1' then
if abs_branch = '1' then
f_out.redirect_nia <= b_in;
else
f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
end if;
else
f_out.redirect_nia <= next_nia;
end if;
end if;

-- Update LR on the next cycle after a branch link
--
-- WARNING: The LR update isn't tracked by our hazard tracker. This
-- will work (well I hope) because it only happens on branches
-- which will flush all decoded instructions. By the time
-- fetch catches up, we'll have the new LR. This will
-- *not* work properly however if we have a branch predictor,
-- in which case the solution would probably be to keep a
-- local cache of the updated LR in execute1 (flushed on
-- exceptions) that is used instead of the value from
-- decode when its content is valid.
-- If we're not writing back anything else, we can write back LR
-- this cycle, otherwise we take an extra cycle. We use the
-- exc_write path since next_nia is written through that path
-- in other places.
if e_in.lr = '1' then
if result_en = '0' then
v.e.exc_write_enable := '1';
v.e.exc_write_data := next_nia;
v.e.exc_write_reg := fast_spr_num(SPR_LR);
else
v.lr_update := '1';
v.next_lr := next_nia;
v.e.valid := '0';
report "Delayed LR update to " & to_hstring(next_nia);
stall_out <= '1';
v.busy := '1';
end if;
end if;

elsif e_in.valid = '1' then
elsif valid_in = '1' then
-- instruction for other units, i.e. LDST
v.ldst_nia := e_in.nia;
v.e.valid := '0';
if e_in.unit = LDST then
lv.valid := '1';
end if;

elsif r.lr_update = '1' then
result_en := '1';
result := r.next_lr;
v.e.write_reg := fast_spr_num(SPR_LR);
v.e.exc_write_enable := '1';
v.e.exc_write_data := r.next_lr;
v.e.exc_write_reg := fast_spr_num(SPR_LR);
v.e.valid := '1';
elsif r.cntz_in_progress = '1' then
-- cnt[lt]z always takes two cycles
@ -931,8 +931,18 @@ begin
if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
(r.div_in_progress = '1' and divider_to_x.valid = '1') then
if r.mul_in_progress = '1' then
result := multiply_to_x.write_reg_data;
overflow := '0';
case r.slow_op_insn is
when OP_MUL_H32 =>
result := multiply_to_x.result(63 downto 32) &
multiply_to_x.result(63 downto 32);
when OP_MUL_H64 =>
result := multiply_to_x.result(127 downto 64);
when others =>
-- i.e. OP_MUL_L64
result := multiply_to_x.result(63 downto 0);
overflow := multiply_to_x.overflow;
end case;
else
result := divider_to_x.write_reg_data;
overflow := divider_to_x.overflow;
@ -952,7 +962,7 @@ begin
end if;
v.e.valid := '1';
else
stall_out <= '1';
v.busy := '1';
v.mul_in_progress := r.mul_in_progress;
v.div_in_progress := r.div_in_progress;
end if;
@ -973,7 +983,8 @@ begin
v.e.exc_write_data := next_nia;
end if;
ctrl_tmp.irq_state <= WRITE_SRR1;
v.e.valid := '1';
v.busy := '1';
v.e.valid := '0';
end if;

v.e.write_data := result;
@ -1002,10 +1013,9 @@ begin
end if;
v.e.exc_write_enable := '1';
v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
v.e.exc_write_data := r.ldst_nia;
report "ldst exception writing srr0=" & to_hstring(r.ldst_nia);
v.e.exc_write_data := r.last_nia;
report "ldst exception writing srr0=" & to_hstring(r.last_nia);
ctrl_tmp.irq_state <= WRITE_SRR1;
v.e.valid := '1'; -- complete the original load or store
end if;

-- Outputs to loadstore1 (async)
@ -1040,5 +1050,26 @@ begin
l_out <= lv;
e_out <= r.e;
flush_out <= f_out.redirect;

exception_log <= exception;
irq_valid_log <= irq_valid;
end process;

-- Registers a 15-bit snapshot of execute1 state on every rising clock edge
-- for the log output.  Fields, most significant bit first:
--   MSR[EE], MSR[PR], MSR[IR], MSR[DR], exception_log, irq_valid_log,
--   position number of ctrl.irq_state (1 bit), three constant zero bits,
--   r.e.write_enable, r.e.valid, f_out.redirect, r.busy, flush_out.
-- NOTE(review): flush_out is assigned from f_out.redirect at the end of the
-- combinational process, so the last bit looks like a duplicate of the
-- redirect bit two positions above it; confirm whether that is intended.
ex1_log : process(clk)
begin
if rising_edge(clk) then
log_data <= ctrl.msr(MSR_EE) & ctrl.msr(MSR_PR) &
ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) &
exception_log &
irq_valid_log &
std_ulogic_vector(to_unsigned(irq_state_t'pos(ctrl.irq_state), 1)) &
"000" &
r.e.write_enable &
r.e.valid &
f_out.redirect &
r.busy &
flush_out;
end if;
end process;
log_out <= log_data;
end architecture behaviour;

@ -23,8 +23,14 @@ entity fetch1 is
-- redirect from execution unit
e_in : in Execute1ToFetch1Type;

-- redirect from decode1
d_in : in Decode1ToFetch1Type;

-- Request to icache
i_out : out Fetch1ToIcacheType
i_out : out Fetch1ToIcacheType;

-- outputs to logger
log_out : out std_ulogic_vector(42 downto 0)
);
end entity fetch1;

@ -35,16 +41,18 @@ architecture behaviour of fetch1 is
end record;
signal r, r_next : Fetch1ToIcacheType;
signal r_int, r_next_int : reg_internal_t;
signal log_nia : std_ulogic_vector(42 downto 0);
begin

regs : process(clk)
begin
if rising_edge(clk) then
log_nia <= r.nia(63) & r.nia(43 downto 2);
if r /= r_next then
report "fetch1 rst:" & std_ulogic'image(rst) &
" IR:" & std_ulogic'image(e_in.virt_mode) &
" P:" & std_ulogic'image(e_in.priv_mode) &
" R:" & std_ulogic'image(e_in.redirect) &
" R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) &
" S:" & std_ulogic'image(stall_in) &
" T:" & std_ulogic'image(stop_in) &
" nia:" & to_hstring(r_next.nia) &
@ -54,6 +62,7 @@ begin
r_int <= r_next_int;
end if;
end process;
log_out <= log_nia;

comb : process(all)
variable v : Fetch1ToIcacheType;
@ -62,6 +71,7 @@ begin
begin
v := r;
v_int := r_int;
v.sequential := '0';

if rst = '1' then
if alt_reset_in = '1' then
@ -76,6 +86,8 @@ begin
v.nia := e_in.redirect_nia;
v.virt_mode := e_in.virt_mode;
v.priv_mode := e_in.priv_mode;
elsif d_in.redirect = '1' then
v.nia := d_in.redirect_nia;
elsif stall_in = '0' then

-- For debug stop/step to work properly we need a little bit of
@ -122,6 +134,7 @@ begin

if increment then
v.nia := std_logic_vector(unsigned(v.nia) + 4);
v.sequential := '1';
end if;
end if;


@ -1,123 +0,0 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.common.all;
use work.wishbone_types.all;

-- fetch2: stage that takes the icache result and presents it to decode1
-- through a registered output.
entity fetch2 is
port(
clk : in std_ulogic;
rst : in std_ulogic;

-- stall_in holds the registered output; flush_in overrides the stall and
-- marks the output as not valid.
stall_in : in std_ulogic;
flush_in : in std_ulogic;

-- Results from icache
i_in : in IcacheToFetch2Type;

-- Output to decode
f_out : out Fetch2ToDecode1Type
);
end entity fetch2;

architecture behaviour of fetch2 is

-- The icache cannot stall, so we need to stash a cycle
-- of output from it when we stall.
type reg_internal_type is record
stash : IcacheToFetch2Type;
stash_valid : std_ulogic;
stopped : std_ulogic;
end record;

signal r_int, rin_int : reg_internal_type;
signal r, rin : Fetch2ToDecode1Type;

begin
-- Clocked process: updates the output register r and the internal state
-- r_int from the values computed by the combinational process.
regs : process(clk)
begin
if rising_edge(clk) then

-- Simulation trace, emitted only when the output register is about to change.
if (r /= rin) then
report "fetch2 rst:" & std_ulogic'image(rst) &
" S:" & std_ulogic'image(stall_in) &
" F:" & std_ulogic'image(flush_in) &
" T:" & std_ulogic'image(rin.stop_mark) &
" V:" & std_ulogic'image(rin.valid) &
" FF:" & std_ulogic'image(rin.fetch_failed) &
" nia:" & to_hstring(rin.nia);
end if;

-- Output state remains unchanged on stall, unless we are flushing
if rst = '1' or flush_in = '1' or stall_in = '0' then
r <= rin;
end if;

-- Internal state is updated on every clock
r_int <= rin_int;
end if;
end process;

-- Combinational process: computes the next output record (rin) and the next
-- internal stash state (rin_int) from the icache input, the stash and the
-- stall/flush/reset controls, and drives f_out from the output register r.
comb : process(all)
variable v : Fetch2ToDecode1Type;
variable v_int : reg_internal_type;
variable v_i_in : IcacheToFetch2Type;
begin
v := r;
v_int := r_int;

-- If stalling, stash away the current input from the icache
-- (only when the stash is not already occupied).
if stall_in = '1' and v_int.stash_valid = '0' then
v_int.stash := i_in;
v_int.stash_valid := '1';
end if;

-- If unstalling, source input from the stash and invalidate it,
-- otherwise source normally from the icache.
--
v_i_in := i_in;
if v_int.stash_valid = '1' and stall_in = '0' then
v_i_in := v_int.stash;
v_int.stash_valid := '0';
end if;

-- Copy the selected input (stash or icache) into the output record.
v.valid := v_i_in.valid;
v.stop_mark := v_i_in.stop_mark;
v.fetch_failed := v_i_in.fetch_failed;
v.nia := v_i_in.nia;
v.insn := v_i_in.insn;

-- Clear stash internal valid bit on flush. We still mark
-- the stash itself as valid since we still want to override
-- whatever comes from icache when unstalling, but we'll
-- override it with something invalid.
--
if flush_in = '1' then
v_int.stash.valid := '0';
v_int.stash.fetch_failed := '0';
end if;

-- If we are flushing or the instruction comes with a stop mark
-- we tag it as invalid so it doesn't get decoded and executed
if flush_in = '1' or v.stop_mark = '1' then
v.valid := '0';
v.fetch_failed := '0';
end if;

-- Clear stash on reset
if rst = '1' then
v_int.stash_valid := '0';
v.valid := '0';
end if;

-- Update registers
rin <= v;
rin_int <= v_int;

-- Update outputs
f_out <= r;
end process;

end architecture behaviour;

@ -20,7 +20,8 @@ entity toplevel is
SCLK_STARTUPE2 : boolean := false;
SPI_FLASH_OFFSET : integer := 4194304;
SPI_FLASH_DEF_CKDV : natural := 1;
SPI_FLASH_DEF_QUAD : boolean := true
SPI_FLASH_DEF_QUAD : boolean := true;
LOG_LENGTH : natural := 512
);
port(
ext_clk : in std_ulogic;
@ -140,7 +141,8 @@ begin
SPI_FLASH_DLINES => 4,
SPI_FLASH_OFFSET => SPI_FLASH_OFFSET,
SPI_FLASH_DEF_CKDV => SPI_FLASH_DEF_CKDV,
SPI_FLASH_DEF_QUAD => SPI_FLASH_DEF_QUAD
SPI_FLASH_DEF_QUAD => SPI_FLASH_DEF_QUAD,
LOG_LENGTH => LOG_LENGTH
)
port map (
-- System signals

@ -4,11 +4,15 @@ use ieee.numeric_std.all;

entity gpr_hazard is
generic (
PIPELINE_DEPTH : natural := 2
PIPELINE_DEPTH : natural := 1
);
port(
clk : in std_ulogic;
stall_in : in std_ulogic;
busy_in : in std_ulogic;
deferred : in std_ulogic;
complete_in : in std_ulogic;
flush_in : in std_ulogic;
issuing : in std_ulogic;

gpr_write_valid_in : in std_ulogic;
gpr_write_in : in std_ulogic_vector(5 downto 0);
@ -16,6 +20,9 @@ entity gpr_hazard is
gpr_read_valid_in : in std_ulogic;
gpr_read_in : in std_ulogic_vector(5 downto 0);

ugpr_write_valid : in std_ulogic;
ugpr_write_reg : in std_ulogic_vector(5 downto 0);

stall_out : out std_ulogic;
use_bypass : out std_ulogic
);
@ -25,10 +32,13 @@ architecture behaviour of gpr_hazard is
valid : std_ulogic;
bypass : std_ulogic;
gpr : std_ulogic_vector(5 downto 0);
ugpr_valid : std_ulogic;
ugpr : std_ulogic_vector(5 downto 0);
end record;
constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'));
constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'),
ugpr_valid => '0', ugpr => (others => '0'));

type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type;
type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type;
constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init);

signal r, rin : pipeline_t := pipeline_t_init;
@ -45,50 +55,46 @@ begin
begin
v := r;

if complete_in = '1' then
v(PIPELINE_DEPTH).valid := '0';
v(PIPELINE_DEPTH).ugpr_valid := '0';
end if;

stall_out <= '0';
use_bypass <= '0';
if gpr_read_valid_in = '1' then
if r(0).valid = '1' and r(0).gpr = gpr_read_in then
if r(0).bypass = '1' and stall_in = '0' then
loop_0: for i in 0 to PIPELINE_DEPTH loop
if v(i).valid = '1' and r(i).gpr = gpr_read_in then
if r(i).bypass = '1' then
use_bypass <= '1';
else
stall_out <= '1';
end if;
end if;
loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
if r(i).valid = '1' and r(i).gpr = gpr_read_in then
if r(i).bypass = '1' then
use_bypass <= '1';
else
if v(i).ugpr_valid = '1' and r(i).ugpr = gpr_read_in then
stall_out <= '1';
end if;
end if;
end loop;
end if;

if stall_in = '0' then
-- XXX assumes PIPELINE_DEPTH = 1
if busy_in = '0' then
v(1) := v(0);
v(0).valid := '0';
v(0).ugpr_valid := '0';
end if;
if deferred = '0' and issuing = '1' then
v(0).valid := gpr_write_valid_in;
v(0).bypass := bypass_avail;
v(0).gpr := gpr_write_in;
loop_1: for i in 1 to PIPELINE_DEPTH-1 loop
-- propagate to next slot
v(i).valid := r(i-1).valid;
v(i).bypass := r(i-1).bypass;
v(i).gpr := r(i-1).gpr;
end loop;

else
-- stage 0 stalled, so stage 1 becomes empty
loop_1b: for i in 1 to PIPELINE_DEPTH-1 loop
-- propagate to next slot
if i = 1 then
v(i).valid := '0';
else
v(i).valid := r(i-1).valid;
v(i).bypass := r(i-1).bypass;
v(i).gpr := r(i-1).gpr;
v(0).ugpr_valid := ugpr_write_valid;
v(0).ugpr := ugpr_write_reg;
end if;
end loop;
if flush_in = '1' then
v(0).valid := '0';
v(0).ugpr_valid := '0';
v(1).valid := '0';
v(1).ugpr_valid := '0';
end if;

-- update registers

@ -48,16 +48,19 @@ entity icache is
rst : in std_ulogic;

i_in : in Fetch1ToIcacheType;
i_out : out IcacheToFetch2Type;
i_out : out IcacheToDecode1Type;

m_in : in MmuToIcacheType;

stall_in : in std_ulogic;
stall_out : out std_ulogic;
flush_in : in std_ulogic;
inval_in : in std_ulogic;

wishbone_out : out wishbone_master_out;
wishbone_in : in wishbone_slave_out
wishbone_in : in wishbone_slave_out;

log_out : out std_ulogic_vector(53 downto 0)
);
end entity icache;

@ -112,6 +115,7 @@ architecture rtl of icache is
subtype row_t is integer range 0 to BRAM_ROWS-1;
subtype index_t is integer range 0 to NUM_LINES-1;
subtype way_t is integer range 0 to NUM_WAYS-1;
subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);

-- The cache data BRAM organized as described above for each way
subtype cache_row_t is std_ulogic_vector(wishbone_data_bits-1 downto 0);
@ -129,6 +133,7 @@ architecture rtl of icache is
-- The cache valid bits
subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
type cache_valids_t is array(index_t) of cache_way_valids_t;
type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;

-- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
signal cache_tags : cache_tags_array_t;
@ -176,6 +181,8 @@ architecture rtl of icache is
store_row : row_t;
store_tag : cache_tag_t;
store_valid : std_ulogic;
end_row_ix : row_in_line_t;
rows_valid : row_per_line_valid_t;

-- TLB miss state
fetch_failed : std_ulogic;
@ -197,6 +204,10 @@ architecture rtl of icache is
signal ra_valid : std_ulogic;
signal priv_fault : std_ulogic;
signal access_ok : std_ulogic;
signal use_previous : std_ulogic;

-- Output data to logger
signal log_data : std_ulogic_vector(53 downto 0);

-- Cache RAM interface
type cache_ram_out_t is array(way_t) of cache_row_t;
@ -219,20 +230,24 @@ architecture rtl of icache is
return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)));
end;

-- Return the index of a row within a line
function get_row_of_line(row: row_t) return row_in_line_t is
variable row_v : unsigned(ROW_BITS-1 downto 0);
begin
row_v := to_unsigned(row, ROW_BITS);
return row_v(ROW_LINEBITS-1 downto 0);
end;

-- Returns whether this is the last row of a line
function is_last_row_addr(addr: wishbone_addr_type) return boolean is
constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is
begin
return addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS) = ones;
return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
end;

-- Returns whether this is the last row of a line
function is_last_row(row: row_t) return boolean is
variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
constant ones : std_ulogic_vector(ROW_LINEBITS-1 downto 0) := (others => '1');
function is_last_row(row: row_t; last: row_in_line_t) return boolean is
begin
row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
return row_v(ROW_LINEBITS-1 downto 0) = ones;
return get_row_of_line(row) = last;
end;

-- Return the address of the next row in the current cache line
@ -361,7 +376,7 @@ begin
);
process(all)
begin
do_read <= '1';
do_read <= not (stall_in or use_previous);
do_write <= '0';
if wishbone_in.ack = '1' and r.store_way = i then
do_write <= '1';
@ -466,23 +481,38 @@ begin
variable is_hit : std_ulogic;
variable hit_way : way_t;
begin
-- i_in.sequential means that i_in.nia this cycle is 4 more than
-- last cycle. If we read more than 32 bits at a time, had a cache hit
-- last cycle, and we don't want the first 32-bit chunk, then we can
-- keep the data we read last cycle and just use that.
if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
use_previous <= i_in.sequential and r.hit_valid;
else
use_previous <= '0';
end if;

-- Extract line, row and tag from request
req_index <= get_index(i_in.nia);
req_row <= get_row(i_in.nia);
req_tag <= get_tag(real_addr);

-- Calculate address of beginning of cache line, will be
-- Calculate address of beginning of cache row, will be
-- used for cache miss processing if needed
--
req_laddr <= (63 downto REAL_ADDR_BITS => '0') &
real_addr(REAL_ADDR_BITS - 1 downto LINE_OFF_BITS) &
(LINE_OFF_BITS-1 downto 0 => '0');
real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
(ROW_OFF_BITS-1 downto 0 => '0');

-- Test if pending request is a hit on any way
hit_way := 0;
is_hit := '0';
for i in way_t loop
if i_in.req = '1' and cache_valids(req_index)(i) = '1' then
if i_in.req = '1' and
(cache_valids(req_index)(i) = '1' or
(r.state = WAIT_ACK and
req_index = r.store_index and
i = r.store_way and
r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
if read_tag(i, cache_tags(req_index)) = req_tag then
hit_way := i;
is_hit := '1';
@ -528,16 +558,20 @@ begin
icache_hit : process(clk)
begin
if rising_edge(clk) then
-- keep outputs to fetch2 unchanged on a stall
-- except that flush or reset sets valid to 0
-- If use_previous, keep the same data as last cycle and use the second half
if stall_in = '1' or use_previous = '1' then
if rst = '1' or flush_in = '1' then
r.hit_valid <= '0';
end if;
else
-- On a hit, latch the request for the next cycle, when the BRAM data
-- will be available on the cache_out output of the corresponding way
--
r.hit_valid <= req_is_hit;
-- Send stop marks and NIA down regardless of validity
r.hit_smark <= i_in.stop_mark;
r.hit_nia <= i_in.nia;
if req_is_hit = '1' then
r.hit_way <= req_hit_way;
r.hit_smark <= i_in.stop_mark;

report "cache hit nia:" & to_hstring(i_in.nia) &
" IR:" & std_ulogic'image(i_in.virt_mode) &
@ -548,6 +582,12 @@ begin
" RA:" & to_hstring(real_addr);
end if;
end if;
if stall_in = '0' then
-- Send stop marks and NIA down regardless of validity
r.hit_smark <= i_in.stop_mark;
r.hit_nia <= i_in.nia;
end if;
end if;
end process;

-- Cache miss/reload synchronous machine
@ -584,6 +624,11 @@ begin
-- Main state machine
case r.state is
when IDLE =>
-- Reset per-row valid flags, only used in WAIT_ACK
for i in 0 to ROW_PER_LINE - 1 loop
r.rows_valid(i) <= '0';
end loop;

-- We need to read a cache line
if req_is_miss = '1' then
report "cache miss nia:" & to_hstring(i_in.nia) &
@ -600,6 +645,7 @@ begin
r.store_row <= get_row(req_laddr);
r.store_tag <= req_tag;
r.store_valid <= '1';
r.end_row_ix <= get_row_of_line(get_row(req_laddr)) - 1;

-- Prep for first wishbone read. We calculate the address of
-- the start of the cache line and start the WB cycle.
@ -637,7 +683,7 @@ begin
-- stb and set stbs_done so we can handle an eventual last
-- ack on the same cycle.
--
if is_last_row_addr(r.wb.adr) then
if is_last_row_addr(r.wb.adr, r.end_row_ix) then
r.wb.stb <= '0';
stbs_done := true;
end if;
@ -648,8 +694,9 @@ begin

-- Incoming acks processing
if wishbone_in.ack = '1' then
r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1';
-- Check for completion
if stbs_done and is_last_row(r.store_row) then
if stbs_done and is_last_row(r.store_row, r.end_row_ix) then
-- Complete wishbone cycle
r.wb.cyc <= '0';

@ -669,9 +716,41 @@ begin
-- TLB miss and protection fault processing
if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
r.fetch_failed <= '0';
elsif i_in.req = '1' and access_ok = '0' then
elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
r.fetch_failed <= '1';
end if;
end if;
end process;

-- Core logging for the icache: on every rising clock edge, pack a
-- snapshot of the request/hit status, wishbone handshake and state
-- machine activity into log_data.
data_log: process(clk)
    -- Way reported in the log entry for this cycle
    variable log_way  : way_t;
    -- '1' whenever the miss/reload state machine is not in IDLE
    variable fsm_busy : std_ulogic;
begin
    if rising_edge(clk) then
        -- Report the way that hit; when there is no hit, report the
        -- way currently selected for replacement instead.
        log_way := replace_way;
        if req_is_hit then
            log_way := req_hit_way;
        end if;

        -- Collapse the state machine state to a single busy/idle bit
        if r.state = IDLE then
            fsm_busy := '0';
        else
            fsm_busy := '1';
        end if;

        -- Fields are concatenated most-significant first.
        -- NOTE(review): the order appears to mirror the ic_* bit-fields
        -- of struct log_entry in the log formatting tool (which lists
        -- them least-significant first) -- keep the two in sync.
        log_data <= i_out.valid &                               -- instruction output valid
                    i_out.insn &                                -- instruction word presented
                    wishbone_in.ack &                           -- wishbone ack received
                    r.wb.adr(5 downto 3) &                      -- low bits of wishbone address
                    r.wb.stb & r.wb.cyc &                       -- wishbone strobe / cycle
                    wishbone_in.stall &                         -- wishbone stall
                    stall_out &                                 -- stall signalled by the icache
                    r.fetch_failed &                            -- latched fetch failure
                    r.hit_nia(5 downto 2) &                     -- partial NIA of latched request
                    fsm_busy &                                  -- state machine not IDLE
                    std_ulogic_vector(to_unsigned(log_way, 3)) &
                    req_is_hit & req_is_miss &
                    access_ok &
                    ra_valid;
    end if;
end process;
log_out <= log_data;
end;

@ -13,7 +13,7 @@ architecture behave of icache_tb is
signal rst : std_ulogic;

signal i_out : Fetch1ToIcacheType;
signal i_in : IcacheToFetch2Type;
signal i_in : IcacheToDecode1Type;

signal m_out : MmuToIcacheType;

@ -33,6 +33,7 @@ begin
i_in => i_out,
i_out => i_in,
m_in => m_out,
stall_in => '0',
flush_in => '0',
inval_in => '0',
wishbone_out => wb_bram_in,

@ -25,7 +25,8 @@ entity loadstore1 is
m_in : in MmuToLoadstore1Type;

dc_stall : in std_ulogic;
stall_out : out std_ulogic

log_out : out std_ulogic_vector(9 downto 0)
);
end loadstore1;

@ -41,7 +42,8 @@ architecture behave of loadstore1 is
ACK_WAIT, -- waiting for ack from dcache
LD_UPDATE, -- writing rA with computed addr on load
MMU_LOOKUP, -- waiting for MMU to look up translation
TLBIE_WAIT -- waiting for MMU to finish doing a tlbie
TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie
SPR_CMPLT -- complete a mf/tspr operation
);

type reg_stage_t is record
@ -49,6 +51,7 @@ architecture behave of loadstore1 is
load : std_ulogic;
tlbie : std_ulogic;
dcbz : std_ulogic;
mfspr : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
store_data : std_ulogic_vector(63 downto 0);
load_data : std_ulogic_vector(63 downto 0);
@ -71,6 +74,7 @@ architecture behave of loadstore1 is
dar : std_ulogic_vector(63 downto 0);
dsisr : std_ulogic_vector(31 downto 0);
instr_fault : std_ulogic;
sprval : std_ulogic_vector(63 downto 0);
end record;

type byte_sel_t is array(0 to 7) of std_ulogic;
@ -80,6 +84,8 @@ architecture behave of loadstore1 is
signal r, rin : reg_stage_t;
signal lsu_sum : std_ulogic_vector(63 downto 0);

signal log_data : std_ulogic_vector(9 downto 0);

-- Generate byte enables from sizes
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
@ -135,7 +141,7 @@ begin
variable long_sel : std_ulogic_vector(15 downto 0);
variable byte_sel : std_ulogic_vector(7 downto 0);
variable req : std_ulogic;
variable stall : std_ulogic;
variable busy : std_ulogic;
variable addr : std_ulogic_vector(63 downto 0);
variable wdata : std_ulogic_vector(63 downto 0);
variable write_enable : std_ulogic;
@ -147,9 +153,7 @@ begin
variable use_second : byte_sel_t;
variable trim_ctl : trim_ctl_t;
variable negative : std_ulogic;
variable mfspr : std_ulogic;
variable sprn : std_ulogic_vector(9 downto 0);
variable sprval : std_ulogic_vector(63 downto 0);
variable exception : std_ulogic;
variable next_addr : std_ulogic_vector(63 downto 0);
variable mmureq : std_ulogic;
@ -159,16 +163,12 @@ begin
begin
v := r;
req := '0';
stall := '0';
done := '0';
byte_sel := (others => '0');
addr := lsu_sum;
mfspr := '0';
v.mfspr := '0';
mmu_mtspr := '0';
itlb_fault := '0';
sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
sprval := (others => '0'); -- avoid inferred latches
exception := '0';
dsisr := (others => '0');
mmureq := '0';

@ -227,130 +227,18 @@ begin
-- compute (addr + 8) & ~7 for the second doubleword when unaligned
next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";

done := '0';
exception := '0';
case r.state is
when IDLE =>
if l_in.valid = '1' then
v.addr := lsu_sum;
v.load := '0';
v.dcbz := '0';
v.tlbie := '0';
v.instr_fault := '0';
v.dwords_done := '0';
case l_in.op is
when OP_STORE =>
req := '1';
when OP_LOAD =>
req := '1';
v.load := '1';
when OP_DCBZ =>
req := '1';
v.dcbz := '1';
when OP_TLBIE =>
mmureq := '1';
stall := '1';
v.tlbie := '1';
v.state := TLBIE_WAIT;
when OP_MFSPR =>
done := '1';
mfspr := '1';
-- partial decode on SPR number should be adequate given
-- the restricted set that get sent down this path
if sprn(9) = '0' and sprn(5) = '0' then
if sprn(0) = '0' then
sprval := x"00000000" & r.dsisr;
else
sprval := r.dar;
end if;
else
-- reading one of the SPRs in the MMU
sprval := m_in.sprval;
end if;
when OP_MTSPR =>
if sprn(9) = '0' and sprn(5) = '0' then
if sprn(0) = '0' then
v.dsisr := l_in.data(31 downto 0);
else
v.dar := l_in.data;
end if;
done := '1';
else
-- writing one of the SPRs in the MMU
mmu_mtspr := '1';
stall := '1';
v.state := TLBIE_WAIT;
end if;
when OP_FETCH_FAILED =>
-- send it to the MMU to do the radix walk
addr := l_in.nia;
v.addr := l_in.nia;
v.instr_fault := '1';
mmureq := '1';
stall := '1';
v.state := MMU_LOOKUP;
when others =>
assert false report "unknown op sent to loadstore1";
end case;

v.write_reg := l_in.write_reg;
v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse;
v.sign_extend := l_in.sign_extend;
v.update := l_in.update;
v.update_reg := l_in.update_reg;
v.xerc := l_in.xerc;
v.reserve := l_in.reserve;
v.rc := l_in.rc;
v.nc := l_in.ci;
v.virt_mode := l_in.virt_mode;
v.priv_mode := l_in.priv_mode;

-- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc------- for a real-mode access.
--
-- This will have to be replaced by a combination of implementing the
-- proper HV CI load/store instructions and having an MMU to get the I
-- bit otherwise.
if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then
v.nc := '1';
end if;

-- Do length_to_sel and work out if we are doing 2 dwords
long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
byte_sel := long_sel(7 downto 0);
v.first_bytes := byte_sel;
v.second_bytes := long_sel(15 downto 8);

-- Do byte reversing and rotating for stores in the first cycle
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;

if req = '1' then
stall := '1';
if long_sel(15 downto 8) = "00000000" then
v.state := ACK_WAIT;
else
v.state := SECOND_REQ;
end if;
end if;
end if;

when SECOND_REQ =>
addr := next_addr;
byte_sel := r.second_bytes;
req := '1';
stall := '1';
v.state := ACK_WAIT;

when ACK_WAIT =>
stall := '1';
if d_in.valid = '1' then
if d_in.error = '1' then
-- dcache will discard the second request if it
@ -388,7 +276,6 @@ begin
else
-- stores write back rA update in this cycle
do_update := r.update;
stall := '0';
done := '1';
v.state := IDLE;
end if;
@ -397,7 +284,6 @@ begin
end if;

when MMU_LOOKUP =>
stall := '1';
if r.dwords_done = '1' then
addr := next_addr;
byte_sel := r.second_bytes;
@ -418,7 +304,6 @@ begin
end if;
else
-- nothing to do, the icache retries automatically
stall := '0';
done := '1';
v.state := IDLE;
end if;
@ -434,10 +319,8 @@ begin
end if;

when TLBIE_WAIT =>
stall := '1';
if m_in.done = '1' then
-- tlbie is finished
stall := '0';
done := '1';
v.state := IDLE;
end if;
@ -447,8 +330,123 @@ begin
v.state := IDLE;
done := '1';

when SPR_CMPLT =>
done := '1';
v.state := IDLE;

end case;

busy := '1';
if r.state = IDLE or done = '1' then
busy := '0';
end if;

-- Note that l_in.valid is gated with busy inside execute1
if l_in.valid = '1' then
v.addr := lsu_sum;
v.load := '0';
v.dcbz := '0';
v.tlbie := '0';
v.instr_fault := '0';
v.dwords_done := '0';
v.write_reg := l_in.write_reg;
v.length := l_in.length;
v.byte_reverse := l_in.byte_reverse;
v.sign_extend := l_in.sign_extend;
v.update := l_in.update;
v.update_reg := l_in.update_reg;
v.xerc := l_in.xerc;
v.reserve := l_in.reserve;
v.rc := l_in.rc;
v.nc := l_in.ci;
v.virt_mode := l_in.virt_mode;
v.priv_mode := l_in.priv_mode;

-- XXX Temporary hack. Mark the op as non-cachable if the address
-- is the form 0xc------- for a real-mode access.
if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then
v.nc := '1';
end if;

-- Do length_to_sel and work out if we are doing 2 dwords
long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
byte_sel := long_sel(7 downto 0);
v.first_bytes := byte_sel;
v.second_bytes := long_sel(15 downto 8);

-- Do byte reversing and rotating for stores in the first cycle
byte_offset := unsigned(lsu_sum(2 downto 0));
brev_lenm1 := "000";
if l_in.byte_reverse = '1' then
brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
end if;
for i in 0 to 7 loop
k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
j := to_integer(k) * 8;
v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
end loop;

case l_in.op is
when OP_STORE =>
req := '1';
when OP_LOAD =>
req := '1';
v.load := '1';
when OP_DCBZ =>
req := '1';
v.dcbz := '1';
when OP_TLBIE =>
mmureq := '1';
v.tlbie := '1';
v.state := TLBIE_WAIT;
when OP_MFSPR =>
v.mfspr := '1';
-- partial decode on SPR number should be adequate given
-- the restricted set that get sent down this path
if sprn(9) = '0' and sprn(5) = '0' then
if sprn(0) = '0' then
v.sprval := x"00000000" & r.dsisr;
else
v.sprval := r.dar;
end if;
else
-- reading one of the SPRs in the MMU
v.sprval := m_in.sprval;
end if;
v.state := SPR_CMPLT;
when OP_MTSPR =>
if sprn(9) = '0' and sprn(5) = '0' then
if sprn(0) = '0' then
v.dsisr := l_in.data(31 downto 0);
else
v.dar := l_in.data;
end if;
v.state := SPR_CMPLT;
else
-- writing one of the SPRs in the MMU
mmu_mtspr := '1';
v.state := TLBIE_WAIT;
end if;
when OP_FETCH_FAILED =>
-- send it to the MMU to do the radix walk
addr := l_in.nia;
v.addr := l_in.nia;
v.instr_fault := '1';
mmureq := '1';
v.state := MMU_LOOKUP;
when others =>
assert false report "unknown op sent to loadstore1";
end case;

if req = '1' then
if long_sel(15 downto 8) = "00000000" then
v.state := ACK_WAIT;
else
v.state := SECOND_REQ;
end if;
end if;
end if;

-- Update outputs to dcache
d_out.valid <= req;
d_out.load <= v.load;
@ -477,10 +475,10 @@ begin
-- Multiplex either cache data to the destination GPR or
-- the address for the rA update.
l_out.valid <= done;
if mfspr = '1' then
if r.mfspr = '1' then
l_out.write_enable <= '1';
l_out.write_reg <= l_in.write_reg;
l_out.write_data <= sprval;
l_out.write_reg <= r.write_reg;
l_out.write_data <= r.sprval;
elsif do_update = '1' then
l_out.write_enable <= '1';
l_out.write_reg <= r.update_reg;
@ -495,6 +493,7 @@ begin
l_out.store_done <= d_in.store_done;

-- update exception info back to execute1
e_out.busy <= busy;
e_out.exception <= exception;
e_out.instr_fault <= r.instr_fault;
e_out.invalid <= m_in.invalid;
@ -509,11 +508,23 @@ begin
end if;
end if;

stall_out <= stall;

-- Update registers
rin <= v;

end process;

-- Core logging for loadstore1: on every rising clock edge, capture a
-- 10-bit summary of the unit's handshake signals and state.
ls1_log: process(clk)
    -- Position of the current state within state_t, as a 3-bit number.
    -- NOTE(review): 3 bits cover at most 8 state_t literals; widen this
    -- (and log_data) if more states are added.
    variable state_code : std_ulogic_vector(2 downto 0);
begin
    if rising_edge(clk) then
        state_code := std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3));

        -- Fields are concatenated most-significant first
        log_data <= e_out.busy &        -- busy indication to execute1
                    e_out.exception &   -- exception reported to execute1
                    l_out.valid &       -- completion valid
                    m_out.valid &       -- request valid to the MMU
                    d_out.valid &       -- request valid to the dcache
                    m_in.done &         -- MMU reports done
                    r.dwords_done &     -- latched dwords_done flag
                    state_code;
    end if;
end process;
log_out <= log_data;
end;

@ -4,6 +4,7 @@ use ieee.numeric_std.all;

library work;
use work.decode_types.all;
use work.ppc_fx_insns.all;

entity logical is
port (
@ -13,9 +14,7 @@ entity logical is
invert_in : in std_ulogic;
invert_out : in std_ulogic;
result : out std_ulogic_vector(63 downto 0);
datalen : in std_logic_vector(3 downto 0);
popcnt : out std_ulogic_vector(63 downto 0);
parity : out std_ulogic_vector(63 downto 0)
datalen : in std_logic_vector(3 downto 0)
);
end entity logical;

@ -34,30 +33,14 @@ architecture behaviour of logical is
type sixbit2 is array(0 to 1) of sixbit;
signal pc32 : sixbit2;
signal par0, par1 : std_ulogic;
signal popcnt : std_ulogic_vector(63 downto 0);
signal parity : std_ulogic_vector(63 downto 0);

begin
logical_0: process(all)
variable rb_adj, tmp : std_ulogic_vector(63 downto 0);
variable negative : std_ulogic;
begin
rb_adj := rb;
if invert_in = '1' then
rb_adj := not rb;
end if;

case op is
when OP_AND =>
tmp := rs and rb_adj;
when OP_OR =>
tmp := rs or rb_adj;
when others =>
tmp := rs xor rb_adj;
end case;

result <= tmp;
if invert_out = '1' then
result <= not tmp;
end if;

-- population counts
for i in 0 to 31 loop
pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1));
@ -98,5 +81,44 @@ begin
parity(32) <= par1;
end if;

rb_adj := rb;
if invert_in = '1' then
rb_adj := not rb;
end if;

case op is
when OP_AND =>
tmp := rs and rb_adj;
when OP_OR =>
tmp := rs or rb_adj;
when OP_XOR =>
tmp := rs xor rb_adj;
when OP_POPCNT =>
tmp := popcnt;
when OP_PRTY =>
tmp := parity;
when OP_CMPB =>
tmp := ppc_cmpb(rs, rb);
when others =>
-- EXTS
-- note datalen is a 1-hot encoding
negative := (datalen(0) and rs(7)) or
(datalen(1) and rs(15)) or
(datalen(2) and rs(31));
tmp := (others => negative);
if datalen(2) = '1' then
tmp(31 downto 16) := rs(31 downto 16);
end if;
if datalen(2) = '1' or datalen(1) = '1' then
tmp(15 downto 8) := rs(15 downto 8);
end if;
tmp(7 downto 0) := rs(7 downto 0);
end case;

if invert_out = '1' then
tmp := not tmp;
end if;
result <= tmp;

end process;
end behaviour;

@ -9,7 +9,6 @@ filesets:
- wishbone_types.vhdl
- common.vhdl
- fetch1.vhdl
- fetch2.vhdl
- decode1.vhdl
- helpers.vhdl
- decode2.vhdl
@ -27,7 +26,6 @@ filesets:
- loadstore1.vhdl
- mmu.vhdl
- dcache.vhdl
- multiply.vhdl
- divider.vhdl
- rotator.vhdl
- writeback.vhdl
@ -63,6 +61,10 @@ filesets:
- fpga/firmware.hex : {copyto : firmware.hex, file_type : user}
file_type : vhdlSource-2008

xilinx_specific:
files:
- xilinx-mult.vhdl : {file_type : vhdlSource-2008}

debug_xilinx:
files:
- dmi_dtm_xilinx.vhdl : {file_type : vhdlSource-2008}
@ -101,20 +103,21 @@ filesets:
targets:
nexys_a7:
default_tool: vivado
filesets: [core, nexys_a7, soc, fpga, debug_xilinx]
filesets: [core, nexys_a7, soc, fpga, debug_xilinx, xilinx_specific]
parameters :
- memory_size
- ram_init_file
- clk_input
- clk_frequency
- disable_flatten_core
- log_length=2048
tools:
vivado: {part : xc7a100tcsg324-1}
toplevel : toplevel

nexys_video-nodram:
default_tool: vivado
filesets: [core, nexys_video, soc, fpga, debug_xilinx]
filesets: [core, nexys_video, soc, fpga, debug_xilinx, xilinx_specific]
parameters :
- memory_size
- ram_init_file
@ -122,13 +125,14 @@ targets:
- clk_frequency
- disable_flatten_core
- spi_flash_offset=10485760
- log_length=2048
tools:
vivado: {part : xc7a200tsbg484-1}
toplevel : toplevel

nexys_video:
default_tool: vivado
filesets: [core, nexys_video, soc, fpga, debug_xilinx, litedram]
filesets: [core, nexys_video, soc, fpga, debug_xilinx, litedram, xilinx_specific]
parameters:
- memory_size
- ram_init_file
@ -136,6 +140,7 @@ targets:
- disable_flatten_core
- no_bram
- spi_flash_offset=10485760
- log_length=2048
generate: [dram_nexys_video]
tools:
vivado: {part : xc7a200tsbg484-1}
@ -143,7 +148,7 @@ targets:

arty_a7-35-nodram:
default_tool: vivado
filesets: [core, arty_a7, soc, fpga, debug_xilinx]
filesets: [core, arty_a7, soc, fpga, debug_xilinx, xilinx_specific]
parameters :
- memory_size
- ram_init_file
@ -151,13 +156,14 @@ targets:
- clk_frequency
- disable_flatten_core
- spi_flash_offset=3145728
- log_length=512
tools:
vivado: {part : xc7a35ticsg324-1L}
toplevel : toplevel

arty_a7-35:
default_tool: vivado
filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram]
filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram, xilinx_specific]
parameters :
- memory_size
- ram_init_file
@ -165,6 +171,7 @@ targets:
- disable_flatten_core
- no_bram
- spi_flash_offset=3145728
- log_length=512
generate: [dram_arty]
tools:
vivado: {part : xc7a35ticsg324-1L}
@ -172,7 +179,7 @@ targets:

arty_a7-100-nodram:
default_tool: vivado
filesets: [core, arty_a7, soc, fpga, debug_xilinx]
filesets: [core, arty_a7, soc, fpga, debug_xilinx, xilinx_specific]
parameters :
- memory_size
- ram_init_file
@ -180,13 +187,14 @@ targets:
- clk_frequency
- disable_flatten_core
- spi_flash_offset=4194304
- log_length=2048
tools:
vivado: {part : xc7a100ticsg324-1L}
toplevel : toplevel

arty_a7-100:
default_tool: vivado
filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram]
filesets: [core, arty_a7, soc, fpga, debug_xilinx, litedram, xilinx_specific]
parameters:
- memory_size
- ram_init_file
@ -194,6 +202,7 @@ targets:
- disable_flatten_core
- no_bram
- spi_flash_offset=4194304
- log_length=2048
generate: [dram_arty]
tools:
vivado: {part : xc7a100ticsg324-1L}
@ -201,7 +210,7 @@ targets:

cmod_a7-35:
default_tool: vivado
filesets: [core, cmod_a7-35, soc, fpga, debug_xilinx]
filesets: [core, cmod_a7-35, soc, fpga, debug_xilinx, xilinx_specific]
parameters :
- memory_size
- ram_init_file
@ -209,12 +218,13 @@ targets:
- clk_input=12000000
- clk_frequency
- disable_flatten_core
- log_length=512
tools:
vivado: {part : xc7a35tcpg236-1}
toplevel : toplevel

synth:
filesets: [core, soc]
filesets: [core, soc, xilinx_specific]
tools:
vivado: {pnr : none}
toplevel: core
@ -279,3 +289,8 @@ parameters:
datatype : int
description : Offset (in bytes) in the SPI flash of the code payload to run
paramtype : generic

log_length:
datatype : int
description : Length of the core log buffer in entries (32 bytes each)
paramtype : generic

@ -27,6 +27,7 @@ end mmu;
architecture behave of mmu is

type state_t is (IDLE,
DO_TLBIE,
TLB_WAIT,
PROC_TBL_READ,
PROC_TBL_WAIT,
@ -44,6 +45,7 @@ architecture behave of mmu is
store : std_ulogic;
priv : std_ulogic;
addr : std_ulogic_vector(63 downto 0);
inval_all : std_ulogic;
-- config SPRs
prtbl : std_ulogic_vector(63 downto 0);
pid : std_ulogic_vector(31 downto 0);
@ -178,7 +180,6 @@ begin
variable tlb_load : std_ulogic;
variable itlb_load : std_ulogic;
variable tlbie_req : std_ulogic;
variable inval_all : std_ulogic;
variable prtbl_rd : std_ulogic;
variable pt_valid : std_ulogic;
variable effpid : std_ulogic_vector(31 downto 0);
@ -207,7 +208,7 @@ begin
tlb_load := '0';
itlb_load := '0';
tlbie_req := '0';
inval_all := '0';
v.inval_all := '0';
prtbl_rd := '0';

-- Radix tree data structures in memory are big-endian,
@ -240,11 +241,9 @@ begin
v.store := not (l_in.load or l_in.iside);
v.priv := l_in.priv;
if l_in.tlbie = '1' then
dcreq := '1';
tlbie_req := '1';
-- Invalidate all iTLB/dTLB entries for tlbie with
-- RB[IS] != 0 or RB[AP] != 0, or for slbia
inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or
v.inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or
l_in.addr(7) or l_in.addr(6) or l_in.addr(5);
-- The RIC field of the tlbie instruction comes across on the
-- sprn bus as bits 2--3. RIC=2 flushes process table caches.
@ -252,7 +251,7 @@ begin
v.pt0_valid := '0';
v.pt3_valid := '0';
end if;
v.state := TLB_WAIT;
v.state := DO_TLBIE;
else
v.valid := '1';
if pt_valid = '0' then
@ -281,11 +280,14 @@ begin
v.pt3_valid := '0';
end if;
v.pt0_valid := '0';
v.inval_all := '1';
v.state := DO_TLBIE;
end if;

when DO_TLBIE =>
dcreq := '1';
tlbie_req := '1';
inval_all := '1';
v.state := TLB_WAIT;
end if;

when TLB_WAIT =>
if d_in.done = '1' then
@ -436,8 +438,8 @@ begin

-- drive outputs
if tlbie_req = '1' then
addr := l_in.addr;
tlb_data := l_in.rs;
addr := r.addr;
tlb_data := (others => '0');
elsif tlb_load = '1' then
addr := r.addr(63 downto 12) & x"000";
tlb_data := pte;
@ -458,14 +460,14 @@ begin

d_out.valid <= dcreq;
d_out.tlbie <= tlbie_req;
d_out.doall <= inval_all;
d_out.doall <= r.inval_all;
d_out.tlbld <= tlb_load;
d_out.addr <= addr;
d_out.pte <= tlb_data;

i_out.tlbld <= itlb_load;
i_out.tlbie <= tlbie_req;
i_out.doall <= inval_all;
i_out.doall <= r.inval_all;
i_out.addr <= addr;
i_out.pte <= tlb_data;


@ -4,11 +4,10 @@ use ieee.numeric_std.all;

library work;
use work.common.all;
use work.decode_types.all;

entity multiply is
generic (
PIPELINE_DEPTH : natural := 16
PIPELINE_DEPTH : natural := 4
);
port (
clk : in std_logic;
@ -19,17 +18,16 @@ entity multiply is
end entity multiply;

architecture behaviour of multiply is
signal m: Execute1ToMultiplyType;
signal m: Execute1ToMultiplyType := Execute1ToMultiplyInit;

type multiply_pipeline_stage is record
valid : std_ulogic;
insn_type : insn_type_t;
data : signed(129 downto 0);
data : unsigned(127 downto 0);
is_32bit : std_ulogic;
neg_res : std_ulogic;
end record;
constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
insn_type => OP_ILLEGAL,
is_32bit => '0',
is_32bit => '0', neg_res => '0',
data => (others => '0'));

type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
@ -51,50 +49,35 @@ begin

multiply_1: process(all)
variable v : reg_type;
variable d : std_ulogic_vector(129 downto 0);
variable d : std_ulogic_vector(127 downto 0);
variable d2 : std_ulogic_vector(63 downto 0);
variable ov : std_ulogic;
begin
v := r;

m_out <= MultiplyToExecute1Init;

v.multiply_pipeline(0).valid := m.valid;
v.multiply_pipeline(0).insn_type := m.insn_type;
v.multiply_pipeline(0).data := signed(m.data1) * signed(m.data2);
v.multiply_pipeline(0).data := unsigned(m.data1) * unsigned(m.data2);
v.multiply_pipeline(0).is_32bit := m.is_32bit;
v.multiply_pipeline(0).neg_res := m.neg_result;

loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
end loop;

if v.multiply_pipeline(PIPELINE_DEPTH-1).neg_res = '0' then
d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data);
ov := '0';
else
d := std_ulogic_vector(- signed(v.multiply_pipeline(PIPELINE_DEPTH-1).data));
end if;

-- TODO: Handle overflows
case_0: case v.multiply_pipeline(PIPELINE_DEPTH-1).insn_type is
when OP_MUL_L64 =>
d2 := d(63 downto 0);
ov := '0';
if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then
ov := (or d(63 downto 31)) and not (and d(63 downto 31));
else
ov := (or d(127 downto 63)) and not (and d(127 downto 63));
end if;
when OP_MUL_H32 =>
d2 := d(63 downto 32) & d(63 downto 32);
when OP_MUL_H64 =>
d2 := d(127 downto 64);
when others =>
--report "Illegal insn type in multiplier";
d2 := (others => '0');
end case;

m_out.write_reg_data <= d2;
m_out.result <= d;
m_out.overflow <= ov;

if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then
m_out.valid <= '1';
end if;
m_out.valid <= v.multiply_pipeline(PIPELINE_DEPTH-1).valid;

rin <= v;
end process;

@ -17,8 +17,18 @@ architecture behave of multiply_tb is

constant pipeline_depth : integer := 4;

signal m1 : Execute1ToMultiplyType;
signal m1 : Execute1ToMultiplyType := Execute1ToMultiplyInit;
signal m2 : MultiplyToExecute1Type;

-- Return the magnitude of x, where x is interpreted as a two's
-- complement number whose leftmost element is the sign bit.
-- The result has the same length as x, so the most negative value
-- comes back with the same bit pattern it went in with.
function absval(x: std_ulogic_vector) return std_ulogic_vector is
begin
    -- Sign bit clear: the value is already its own magnitude
    if x(x'left) /= '1' then
        return x;
    end if;
    -- Sign bit set: negate in two's complement
    return std_ulogic_vector(- signed(x));
end;

begin
multiply_0: entity work.multiply
generic map (PIPELINE_DEPTH => pipeline_depth)
@ -39,9 +49,8 @@ begin
wait for clk_period;

m1.valid <= '1';
m1.insn_type <= OP_MUL_L64;
m1.data1 <= '0' & x"0000000000001000";
m1.data2 <= '0' & x"0000000000001111";
m1.data1 <= x"0000000000001000";
m1.data2 <= x"0000000000001111";

wait for clk_period;
assert m2.valid = '0';
@ -56,7 +65,7 @@ begin

wait for clk_period;
assert m2.valid = '1';
assert m2.write_reg_data = x"0000000001111000";
assert m2.result = x"00000000000000000000000001111000";

wait for clk_period;
assert m2.valid = '0';
@ -70,7 +79,7 @@ begin

wait for clk_period * (pipeline_depth-1);
assert m2.valid = '1';
assert m2.write_reg_data = x"0000000001111000";
assert m2.result = x"00000000000000000000000001111000";

-- test mulld
mulld_loop : for i in 0 to 1000 loop
@ -79,10 +88,10 @@ begin

behave_rt := ppc_mulld(ra, rb);

m1.data1 <= '0' & ra;
m1.data2 <= '0' & rb;
m1.data1 <= absval(ra);
m1.data2 <= absval(rb);
m1.neg_result <= ra(63) xor rb(63);
m1.valid <= '1';
m1.insn_type <= OP_MUL_L64;

wait for clk_period;

@ -92,8 +101,8 @@ begin

assert m2.valid = '1';

assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data)
report "bad mulld expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data);
assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 0))
report "bad mulld expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(63 downto 0));
end loop;

-- test mulhdu
@ -103,10 +112,10 @@ begin

behave_rt := ppc_mulhdu(ra, rb);

m1.data1 <= '0' & ra;
m1.data2 <= '0' & rb;
m1.data1 <= ra;
m1.data2 <= rb;
m1.neg_result <= '0';
m1.valid <= '1';
m1.insn_type <= OP_MUL_H64;

wait for clk_period;

@ -116,8 +125,8 @@ begin

assert m2.valid = '1';

assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data)
report "bad mulhdu expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data);
assert to_hstring(behave_rt) = to_hstring(m2.result(127 downto 64))
report "bad mulhdu expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(127 downto 64));
end loop;

-- test mulhd
@ -127,10 +136,10 @@ begin

behave_rt := ppc_mulhd(ra, rb);

m1.data1 <= ra(63) & ra;
m1.data2 <= rb(63) & rb;
m1.data1 <= absval(ra);
m1.data2 <= absval(rb);
m1.neg_result <= ra(63) xor rb(63);
m1.valid <= '1';
m1.insn_type <= OP_MUL_H64;

wait for clk_period;

@ -140,8 +149,8 @@ begin

assert m2.valid = '1';

assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data)
report "bad mulhd expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data);
assert to_hstring(behave_rt) = to_hstring(m2.result(127 downto 64))
report "bad mulhd expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(127 downto 64));
end loop;

-- test mullw
@ -151,12 +160,12 @@ begin

behave_rt := ppc_mullw(ra, rb);

m1.data1 <= (others => ra(31));
m1.data1(31 downto 0) <= ra(31 downto 0);
m1.data2 <= (others => rb(31));
m1.data2(31 downto 0) <= rb(31 downto 0);
m1.data1 <= (others => '0');
m1.data1(31 downto 0) <= absval(ra(31 downto 0));
m1.data2 <= (others => '0');
m1.data2(31 downto 0) <= absval(rb(31 downto 0));
m1.neg_result <= ra(31) xor rb(31);
m1.valid <= '1';
m1.insn_type <= OP_MUL_L64;

wait for clk_period;

@ -166,8 +175,8 @@ begin

assert m2.valid = '1';

assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data)
report "bad mullw expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data);
assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 0))
report "bad mullw expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(63 downto 0));
end loop;

-- test mulhw
@ -177,12 +186,12 @@ begin

behave_rt := ppc_mulhw(ra, rb);

m1.data1 <= (others => ra(31));
m1.data1(31 downto 0) <= ra(31 downto 0);
m1.data2 <= (others => rb(31));
m1.data2(31 downto 0) <= rb(31 downto 0);
m1.data1 <= (others => '0');
m1.data1(31 downto 0) <= absval(ra(31 downto 0));
m1.data2 <= (others => '0');
m1.data2(31 downto 0) <= absval(rb(31 downto 0));
m1.neg_result <= ra(31) xor rb(31);
m1.valid <= '1';
m1.insn_type <= OP_MUL_H32;

wait for clk_period;

@ -192,8 +201,9 @@ begin

assert m2.valid = '1';

assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data)
report "bad mulhw expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data);
assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32))
report "bad mulhw expected " & to_hstring(behave_rt) & " got " &
to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32));
end loop;

-- test mulhwu
@ -207,8 +217,8 @@ begin
m1.data1(31 downto 0) <= ra(31 downto 0);
m1.data2 <= (others => '0');
m1.data2(31 downto 0) <= rb(31 downto 0);
m1.neg_result <= '0';
m1.valid <= '1';
m1.insn_type <= OP_MUL_H32;

wait for clk_period;

@ -218,8 +228,9 @@ begin

assert m2.valid = '1';

assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data)
report "bad mulhwu expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data);
assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32))
report "bad mulhwu expected " & to_hstring(behave_rt) & " got " &
to_hstring(m2.result(63 downto 32) & m2.result(63 downto 32));
end loop;

-- test mulli
@ -229,11 +240,11 @@ begin

behave_rt := ppc_mulli(ra, si);

m1.data1 <= ra(63) & ra;
m1.data2 <= (others => si(15));
m1.data2(15 downto 0) <= si;
m1.data1 <= absval(ra);
m1.data2 <= (others => '0');
m1.data2(15 downto 0) <= absval(si);
m1.neg_result <= ra(63) xor si(15);
m1.valid <= '1';
m1.insn_type <= OP_MUL_L64;

wait for clk_period;

@ -243,8 +254,8 @@ begin

assert m2.valid = '1';

assert to_hstring(behave_rt) = to_hstring(m2.write_reg_data)
report "bad mulli expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.write_reg_data);
assert to_hstring(behave_rt) = to_hstring(m2.result(63 downto 0))
report "bad mulli expected " & to_hstring(behave_rt) & " got " & to_hstring(m2.result(63 downto 0));
end loop;

std.env.finish;

@ -93,7 +93,7 @@ package ppc_fx_insns is
function ppc_divd (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
function ppc_divwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;

function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer;
function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return std_ulogic;
end package ppc_fx_insns;

package body ppc_fx_insns is
@ -785,13 +785,12 @@ package body ppc_fx_insns is
return std_ulogic_vector(resize(tmp, ra'length));
end;

function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer is
function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return std_ulogic is
variable crfield: integer;
variable crbit_match: std_ulogic;
variable ctr_not_zero: std_ulogic;
variable ctr_ok: std_ulogic;
variable cond_ok: std_ulogic;
variable ret: integer;
begin
crfield := to_integer(unsigned(bi));
-- BE bit numbering
@ -800,12 +799,7 @@ package body ppc_fx_insns is
ctr_not_zero := '1' when ctr /= x"0000000000000001" else '0';
ctr_ok := bo(4-2) or (ctr_not_zero xor bo(4-3));
cond_ok := bo(4-0) or crbit_match;
if ctr_ok = '1' and cond_ok = '1' then
ret := 1;
else
ret := 0;
end if;
return ret;
return ctr_ok and cond_ok;
end;

end package body ppc_fx_insns;

@ -24,7 +24,9 @@ entity register_file is

-- debug
sim_dump : in std_ulogic;
sim_dump_done : out std_ulogic
sim_dump_done : out std_ulogic;

log_out : out std_ulogic_vector(70 downto 0)
);
end entity register_file;

@ -34,18 +36,19 @@ architecture behaviour of register_file is
signal rd_port_b : std_ulogic_vector(63 downto 0);
signal dbg_data : std_ulogic_vector(63 downto 0);
signal dbg_ack : std_ulogic;
signal log_data : std_ulogic_vector(70 downto 0);
begin
-- synchronous writes
register_write_0: process(clk)
begin
if rising_edge(clk) then
if w_in.write_enable = '1' then
assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
if w_in.write_reg(5) = '0' then
report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
else
report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data);
end if;
assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data;
end if;
end if;
@ -131,4 +134,13 @@ begin
sim_dump_done <= '0';
end generate;

-- Log capture: on every rising clock edge, latch the register-file write
-- port into log_data (declared as 71 bits, 70 downto 0).  The value is the
-- concatenation write_data & write_enable & write_reg, with write_data in
-- the most significant position.  It is captured unconditionally, i.e. also
-- on cycles where write_enable is '0'.
reg_log: process(clk)
begin
if rising_edge(clk) then
log_data <= w_in.write_data &
w_in.write_enable &
w_in.write_reg;
end if;
end process;
log_out <= log_data;
end architecture behaviour;

@ -0,0 +1,12 @@
# Builds fmt_log, the formatter for binary core-log dumps, from fmt_log.c.
CFLAGS = -O2 -g -Wall -std=c99

# These targets name actions rather than files.  Declaring them phony keeps
# them working even if a file called "all", "clean" or "distclean" exists
# in this directory.
.PHONY: all clean distclean

all: fmt_log

fmt_log: fmt_log.c
	$(CC) -o $@ $^ $(CFLAGS)

# Remove the built binary.
clean:
	rm -f fmt_log

# Remove editor backup files.
distclean:
	rm -f *~

@ -0,0 +1,235 @@
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>

typedef unsigned long long u64;

/*
 * One log record as consumed by main(), which fread()s
 * sizeof(struct log_entry) bytes per record.
 *
 * The bit-fields below form three groups whose widths each add up to 64,
 * followed by one plain 64-bit member, i.e. 4 x 64 = 256 bits per record.
 *
 * NOTE(review): the order in which a C compiler allocates bit-fields within
 * a storage unit is implementation-defined.  This layout presumably relies
 * on low-bits-first allocation on a little-endian host -- confirm against
 * the hardware log format before building with a different compiler/ABI.
 */
struct log_entry {
/* Group 0 (64 bits): fetch address and ic_* status bits */
u64 nia_lo: 42;
u64 nia_hi: 1;
u64 ic_ra_valid: 1;
u64 ic_access_ok: 1;
u64 ic_is_miss: 1;
u64 ic_is_hit: 1;
u64 ic_way: 3;
u64 ic_state: 1;
u64 ic_part_nia: 4;
u64 ic_fetch_failed: 1;
u64 ic_stall_out: 1;
u64 ic_wb_stall: 1;
u64 ic_wb_cyc: 1;
u64 ic_wb_stb: 1;
u64 ic_wb_adr: 3;
u64 ic_wb_ack: 1;

/* Group 1 (64 bits): instruction word plus d1_*, d2_* and first e1_* bits */
u64 ic_insn: 32;
u64 ic_valid: 1;
u64 d1_valid: 1;
u64 d1_unit: 2;
u64 d1_part_nia: 4;
u64 d1_insn_type: 6;
u64 d2_bypass_a: 1;
u64 d2_bypass_b: 1;
u64 d2_bypass_c: 1;
u64 d2_stall_out: 1;
u64 d2_stopped_out: 1;
u64 d2_valid: 1;
u64 d2_part_nia: 4;
u64 e1_flush_out: 1;
u64 e1_stall_out: 1;
u64 e1_redirect: 1;
u64 e1_valid: 1;
u64 e1_write_enable: 1;
u64 e1_unused: 3;

/* Group 2 (64 bits): remaining e1_* bits, ls_*, dc_*, CR and register write info */
u64 e1_irq_state: 1;
u64 e1_irq: 1;
u64 e1_exception: 1;
u64 e1_msr_dr: 1;
u64 e1_msr_ir: 1;
u64 e1_msr_pr: 1;
u64 e1_msr_ee: 1;
u64 pad1: 5;
u64 ls_state: 3;
u64 ls_dw_done: 1;
u64 ls_min_done: 1;
u64 ls_do_valid: 1;
u64 ls_mo_valid: 1;
u64 ls_lo_valid: 1;
u64 ls_eo_except: 1;
u64 ls_stall_out: 1;
u64 pad2: 2;
u64 dc_state: 3;
u64 dc_ra_valid: 1;
u64 dc_tlb_way: 3;
u64 dc_stall_out: 1;
u64 dc_op: 3;
u64 dc_do_valid: 1;
u64 dc_do_error: 1;
u64 dc_wb_cyc: 1;
u64 dc_wb_stb: 1;
u64 dc_wb_ack: 1;
u64 dc_wb_stall: 1;
u64 dc_wb_adr: 3;
u64 cr_wr_mask: 8;
u64 cr_wr_data: 4;
u64 cr_wr_enable: 1;
u64 reg_wr_reg: 6;
u64 reg_wr_enable: 1;

/* Final 64 bits: value printed after the register name when reg_wr_enable is set */
u64 reg_wr_data;
};

/*
 * Formatting helpers.  They expand to expressions that use the variables
 * `log` and `full_nia` that must be in scope at the point of use.
 */
/* FLAG: y if field i of `log` is non-zero, otherwise a space. */
#define FLAG(i, y) (log.i? y: ' ')
/* FLGA: y if field i of `log` is non-zero, otherwise z. */
#define FLGA(i, y, z) (log.i? y: z)
/* PNIA: low 8 bits of the full_nia[] entry selected by field f of `log`. */
#define PNIA(f) (full_nia[log.f] & 0xff)

/* Names printed for the 2-bit d1_unit field (4 possible values). */
const char *units[4] = { "--", "al", "ls", "?3" };
/* Names printed for the 6-bit d1_insn_type field (64 possible values). */
const char *ops[64] =
{
"illegal", "nop ", "add ", "and ", "attn ", "b ", "bc ", "bcreg ",
"bperm ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", "darn ",
"dcbf ", "dcbst ", "dcbt ", "dcbtst ", "dcbz ", "div ", "dive ", "exts ",
"extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "maddhd ",
"maddhdu", "maddld ", "mcrxr ", "mcrxrx ", "mfcr ", "mfmsr ", "mfspr ", "mod ",
"mtcrf ", "mtmsr ", "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ",
"prty ", "rfid ", "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ",
"shr ", "sync ", "tlbie ", "trap ", "xor ", "ffail ", "?62 ", "?63 "
};

/*
 * Names printed by main() for reg_wr_reg values 32..44; the table is
 * indexed with reg_wr_reg - 32, so it has exactly 13 entries.
 */
const char *spr_names[13] =
{
"lr ", "ctr", "sr0", "sr1", "hr0", "hr1", "sg0", "sg1",
"sg2", "sg3", "hg0", "hg1", "xer"
};
int main(int ac, char **av)
{
struct log_entry log;
u64 full_nia[16];
long int lineno = 1;
FILE *f;
const char *filename;
int i;
long int ncompl = 0;

if (ac != 1 && ac != 2) {
fprintf(stderr, "Usage: %s [filename]\n", av[0]);
exit(1);
}
f = stdin;
if (ac == 2) {
filename = av[1];
f = fopen(filename, "rb");
if (f == NULL) {
perror(filename);
exit(1);
}
}

for (i = 0; i < 15; ++i)
full_nia[i] = i << 2;

while (fread(&log, sizeof(log), 1, f) == 1) {
full_nia[log.nia_lo & 0xf] = (log.nia_hi? 0xc000000000000000: 0) |
(log.nia_lo << 2);
if (lineno % 20 == 1) {
printf(" fetch1 NIA icache decode1 decode2 execute1 loadstore dcache CR GSPR\n");
printf(" ---------------- TAHW S -WB-- pN --insn-- pN un op pN byp FR IIE MSR WC SD MM CE SRTO DE -WB-- c ms reg val\n");
printf(" LdMy t csnSa IA IA it IA abc le srx EPID em tw rd mx tAwp vr csnSa 0 k\n");
}
printf("%4ld %c0000%.11llx %c ", lineno,
(log.nia_hi? 'c': '0'),
(unsigned long long)log.nia_lo << 2,
FLAG(ic_stall_out, '|'));
printf("%c%c%c%d %c %c%c%d%c%c %.2llx ",
FLGA(ic_ra_valid, ' ', 'T'),
FLGA(ic_access_ok, ' ', 'X'),
FLGA(ic_is_hit, 'H', FLGA(ic_is_miss, 'M', ' ')),
log.ic_way,
FLAG(ic_state, 'W'),
FLAG(ic_wb_cyc, 'c'),
FLAG(ic_wb_stb, 's'),
log.ic_wb_adr,
FLAG(ic_wb_stall, 'S'),
FLAG(ic_wb_ack, 'a'),
PNIA(ic_part_nia));
if (log.ic_valid)
printf("%.8x", log.ic_insn);
else if (log.ic_fetch_failed)
printf("!!!!!!!!");
else
printf("--------");
printf(" %c%c %.2llx ",
FLAG(ic_valid, '>'),
FLAG(d2_stall_out, '|'),
PNIA(d1_part_nia));
if (log.d1_valid)
printf("%s %s",
units[log.d1_unit],
ops[log.d1_insn_type]);
else
printf("-- -------");
printf(" %c%c ",
FLAG(d1_valid, '>'),
FLAG(d2_stall_out, '|'));
printf("%.2llx %c%c%c %c%c ",
PNIA(d2_part_nia),
FLAG(d2_bypass_a, 'a'),
FLAG(d2_bypass_b, 'b'),
FLAG(d2_bypass_c, 'c'),
FLAG(d2_valid, '>'),
FLAG(e1_stall_out, '|'));
printf("%c%c %c%c%c %c%c%c%c %c%c ",
FLAG(e1_flush_out, 'F'),
FLAG(e1_redirect, 'R'),
FLAG(e1_irq_state, 'w'),
FLAG(e1_irq, 'I'),
FLAG(e1_exception, 'X'),
FLAG(e1_msr_ee, 'E'),
FLGA(e1_msr_pr, 'u', 's'),
FLAG(e1_msr_ir, 'I'),
FLAG(e1_msr_dr, 'D'),
FLAG(e1_write_enable, 'W'),
FLAG(e1_valid, 'C'));
printf("%c %d%d %c%c %c%c %c ",
FLAG(ls_stall_out, '|'),
log.ls_state,
log.ls_dw_done,
FLAG(ls_mo_valid, 'M'),
FLAG(ls_min_done, 'm'),
FLAG(ls_lo_valid, 'C'),
FLAG(ls_eo_except, 'X'),
FLAG(ls_do_valid, '>'));
printf("%d%c%d%d %c%c %c%c%d%c%c ",
log.dc_state,
FLAG(dc_ra_valid, 'R'),
log.dc_tlb_way,
log.dc_op,
FLAG(dc_do_valid, 'V'),
FLAG(dc_do_error, 'E'),
FLAG(dc_wb_cyc, 'c'),
FLAG(dc_wb_stb, 's'),
log.dc_wb_adr,
FLAG(dc_wb_stall, 'S'),
FLAG(dc_wb_ack, 'a'));
if (log.cr_wr_enable)
printf("%x>%.2x ", log.cr_wr_data, log.cr_wr_mask);
else
printf(" ");
if (log.reg_wr_enable) {
if (log.reg_wr_reg < 32 || log.reg_wr_reg > 44)
printf("r%02d", log.reg_wr_reg);
else
printf("%s", spr_names[log.reg_wr_reg - 32]);
printf("=%.16llx", log.reg_wr_data);
}
printf("\n");
++lineno;
if (log.ls_lo_valid || log.e1_valid)
++ncompl;
}
printf("%ld instructions completed, %.2f CPI\n", ncompl,
(double)(lineno - 1) / ncompl);
exit(0);
}

@ -42,6 +42,9 @@
#define DBG_CORE_GSPR_INDEX 0x14
#define DBG_CORE_GSPR_DATA 0x15

#define DBG_LOG_ADDR 0x16
#define DBG_LOG_DATA 0x17

static bool debug;

struct backend {
@ -507,8 +510,10 @@ static void load(const char *filename, uint64_t addr)
// if (rc < 8) XXX fixup endian ?
check(dmi_write(DBG_WB_DATA, data), "writing WB_DATA");
count += 8;
if (!(count % 1024))
printf("%x...\n", count);
if (!(count % 1024)) {
printf("%x...\r", count);
fflush(stdout);
}
}
close(fd);
printf("%x done.\n", count);
@ -535,8 +540,10 @@ static void save(const char *filename, uint64_t addr, uint64_t size)
break;
}
count += 8;
if (!(count % 1024))
printf("%x...\n", count);
if (!(count % 1024)) {
printf("%x...\r", count);
fflush(stdout);
}
if (count >= size)
break;
}
@ -544,6 +551,73 @@ static void save(const char *filename, uint64_t addr, uint64_t size)
printf("%x done.\n", count);
}

#define LOG_STOP 0x80000000ull

/*
 * "lstart" command: write 0 to DBG_LOG_ADDR, i.e. a value with the
 * LOG_STOP bit clear.  Exits via check() if the DMI write fails.
 */
static void log_start(void)
{
check(dmi_write(DBG_LOG_ADDR, 0), "writing LOG_ADDR");
}

/*
 * "lstop" command: write LOG_STOP to DBG_LOG_ADDR, read the register back
 * and print the log size and write pointer derived from its upper 32 bits.
 */
static void log_stop(void)
{
	uint64_t laddr, wptr;
	uint64_t nentries = 1;

	check(dmi_write(DBG_LOG_ADDR, LOG_STOP), "writing LOG_ADDR");
	check(dmi_read(DBG_LOG_ADDR, &laddr), "reading LOG_ADDR");

	wptr = laddr >> 32;

	/* smallest power of two strictly greater than wptr / 2 */
	while (nentries != 0 && (wptr >> 1) >= nentries)
		nentries <<= 1;

	/* that bit is not part of the pointer itself, so mask it off */
	wptr &= ~nentries;

	printf("Log size = %" PRIu64 " entries, ", nentries);
	printf("write ptr = %" PRIx64 "\n", wptr);
}

/*
 * "ldump" command: stop logging if it is running, read the whole log
 * through DBG_LOG_DATA and write it to `filename` as raw 64-bit words,
 * then restore the original DBG_LOG_ADDR value.
 *
 * filename: path of the output file; it is created or truncated.
 *
 * Exits with status 1 if the file cannot be created or written; DMI
 * failures are handled by check().
 */
static void log_dump(const char *filename)
{
	FILE *f;
	uint64_t lsize, laddr, waddr;
	uint64_t orig_laddr;
	uint64_t i, ldata;

	/* the dump is raw binary data, so open the stream in binary mode */
	f = fopen(filename, "wb");
	if (f == NULL) {
		fprintf(stderr, "Failed to create '%s': %s\n", filename,
			strerror(errno));
		exit(1);
	}

	check(dmi_read(DBG_LOG_ADDR, &orig_laddr), "reading LOG_ADDR");
	laddr = orig_laddr;
	if (!(orig_laddr & LOG_STOP)) {
		check(dmi_write(DBG_LOG_ADDR, LOG_STOP), "writing LOG_ADDR");
		/*
		 * The value read above was sampled while logging was still
		 * running, so its write pointer is already out of date.
		 * Read it again now that logging is stopped, the same way
		 * log_stop() does.
		 */
		check(dmi_read(DBG_LOG_ADDR, &laddr), "reading LOG_ADDR");
	}

	/* upper 32 bits: write pointer plus one higher bit that encodes the size */
	waddr = laddr >> 32;
	for (lsize = 1; lsize; lsize <<= 1)
		if ((waddr >> 1) < lsize)
			break;
	waddr &= ~lsize;
	printf("Log size = %" PRIu64 " entries\n", lsize);

	/* keep logging stopped and start reading at the write pointer (4 words per entry) */
	laddr = LOG_STOP | (waddr << 2);
	check(dmi_write(DBG_LOG_ADDR, laddr), "writing LOG_ADDR");

	for (i = 0; i < lsize * 4; ++i) {
		check(dmi_read(DBG_LOG_DATA, &ldata), "reading LOG_DATA");
		if (fwrite(&ldata, sizeof(ldata), 1, f) != 1) {
			fprintf(stderr, "Write error on %s\n", filename);
			exit(1);
		}
		if (!(i % 128)) {
			printf("%" PRIu64 "...\r", i * 8);
			fflush(stdout);
		}
	}
	/* buffered data is flushed here; a failure means the dump is incomplete */
	if (fclose(f) != 0) {
		fprintf(stderr, "Write error on %s\n", filename);
		exit(1);
	}
	printf("%" PRIu64 " done\n", lsize * 32);

	check(dmi_write(DBG_LOG_ADDR, orig_laddr), "writing LOG_ADDR");
}

static void usage(const char *cmd)
{
fprintf(stderr, "Usage: %s -b <jtag|sim> <command> <args>\n", cmd);
@ -568,6 +642,12 @@ static void usage(const char *cmd)
fprintf(stderr, " gpr <reg> [count]\n");
fprintf(stderr, " status\n");

fprintf(stderr, "\n");
fprintf(stderr, " Core logging:\n");
fprintf(stderr, " lstart start logging\n");
fprintf(stderr, " lstop stop logging\n");
fprintf(stderr, " ldump <file> dump log to file\n");

fprintf(stderr, "\n");
fprintf(stderr, " JTAG:\n");
fprintf(stderr, " dmiread <hex addr>\n");
@ -706,6 +786,17 @@ int main(int argc, char *argv[])
if (((i+1) < argc) && isdigit(argv[i+1][0]))
count = strtoul(argv[++i], NULL, 10);
gpr_read(reg, count);
} else if (strcmp(argv[i], "lstart") == 0) {
log_start();
} else if (strcmp(argv[i], "lstop") == 0) {
log_stop();
} else if (strcmp(argv[i], "ldump") == 0) {
const char *filename;

if ((i+1) >= argc)
usage(argv[0]);
filename = argv[++i];
log_dump(filename);
} else {
fprintf(stderr, "Unknown command %s\n", argv[i]);
exit(1);

@ -51,7 +51,8 @@ entity soc is
SPI_FLASH_DLINES : positive := 1;
SPI_FLASH_OFFSET : integer := 0;
SPI_FLASH_DEF_CKDV : natural := 2;
SPI_FLASH_DEF_QUAD : boolean := false
SPI_FLASH_DEF_QUAD : boolean := false;
LOG_LENGTH : natural := 512
);
port(
rst : in std_ulogic;
@ -198,7 +199,8 @@ begin
generic map(
SIM => SIM,
DISABLE_FLATTEN => DISABLE_FLATTEN_CORE,
ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1')
ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'),
LOG_LENGTH => LOG_LENGTH
)
port map(
clk => system_clk,

Binary file not shown.

@ -9,6 +9,14 @@
#undef DEBUG
//#define DEBUG 1

/*
 * Short busy-wait: counts a static volatile counter from 0 up to 10.
 * The counter is volatile so that each access is actually performed.
 */
void delay(void)
{
	static volatile int spin;

	spin = 0;
	while (spin < 10)
		++spin;
}

void print_number(unsigned int i) // only for i = 0-999
{
unsigned int j, k, m;
@ -148,14 +156,17 @@ int xics_test_0(void)
xics_write8(XICS_MFRR, 0x05); // cause 0x500 interrupt

// still masked, so shouldn't happen yet
delay();
assert(isrs_run == 0);

// unmask IPI only
xics_write8(XICS_XIRR, 0x40);
delay();
assert(isrs_run == ISR_IPI);

// unmask UART
xics_write8(XICS_XIRR, 0xc0);
delay();
assert(isrs_run == (ISR_IPI | ISR_UART));

// cleanup
@ -174,12 +185,14 @@ int xics_test_1(void)
xics_write8(XICS_XIRR, 0xff); // allow all interrupts

// should be none pending
delay();
assert(isrs_run == 0);

// trigger both
potato_uart_irq_en(); // cause 0x500 interrupt
xics_write8(XICS_MFRR, 0x05); // cause 0x500 interrupt

delay();
assert(isrs_run == (ISR_IPI | ISR_UART));

// cleanup
@ -208,9 +221,11 @@ int xics_test_2(void)
// trigger an IPI
xics_write8(XICS_MFRR, 0x05); // cause 0x500 interrupt

delay();
assert(isrs_run == 0);

mtmsrd(0x9000000000008003); // EE on
delay();
assert(isrs_run == ISR_IPI);

// cleanup

@ -22,15 +22,13 @@ end entity writeback;

architecture behaviour of writeback is
begin
writeback_1: process(all)
writeback_0: process(clk)
variable x : std_ulogic_vector(0 downto 0);
variable y : std_ulogic_vector(0 downto 0);
variable w : std_ulogic_vector(0 downto 0);
variable cf: std_ulogic_vector(3 downto 0);
variable zero : std_ulogic;
variable sign : std_ulogic;
variable scf : std_ulogic_vector(3 downto 0);
begin
if rising_edge(clk) then
-- Do consistency checks only on the clock edge
x(0) := e_in.valid;
y(0) := l_in.valid;
assert (to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure;
@ -42,7 +40,15 @@ begin
w(0) := e_in.write_cr_enable;
x(0) := (e_in.write_enable and e_in.rc);
assert (to_integer(unsigned(w)) + to_integer(unsigned(x))) <= 1 severity failure;
end if;
end process;

writeback_1: process(all)
variable cf: std_ulogic_vector(3 downto 0);
variable zero : std_ulogic;
variable sign : std_ulogic;
variable scf : std_ulogic_vector(3 downto 0);
begin
w_out <= WritebackToRegisterFileInit;
c_out <= WritebackToCrFileInit;


@ -0,0 +1,985 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.common.all;

library unisim;
use unisim.vcomponents.all;

-- Multiplier built from DSP48E1 primitives (see the architecture below).
entity multiply is
port (
-- clock for the DSP slices and the pipeline registers
clk : in std_logic;

-- request from execute1; this file reads data1, data2, valid,
-- is_32bit and neg_result
m_in : in Execute1ToMultiplyType;
-- response to execute1; this file drives result, overflow and valid
m_out : out MultiplyToExecute1Type
);
end entity multiply;

architecture behaviour of multiply is
signal m00_p, m01_p, m02_p, m03_p : std_ulogic_vector(47 downto 0);
signal m00_pc : std_ulogic_vector(47 downto 0);
signal m10_p, m11_p, m12_p, m13_p : std_ulogic_vector(47 downto 0);
signal m11_pc, m12_pc, m13_pc : std_ulogic_vector(47 downto 0);
signal m20_p, m21_p, m22_p, m23_p : std_ulogic_vector(47 downto 0);
signal s0_pc, s1_pc : std_ulogic_vector(47 downto 0);
signal product_lo : std_ulogic_vector(31 downto 0);
signal product : std_ulogic_vector(127 downto 0);
signal addend : std_ulogic_vector(127 downto 0);
signal s0_carry, p0_carry : std_ulogic_vector(3 downto 0);
signal p0_mask : std_ulogic_vector(47 downto 0);
signal p0_pat, p0_patb : std_ulogic;
signal p1_pat, p1_patb : std_ulogic;

signal req_32bit, r32_1 : std_ulogic;
signal req_neg, rneg_1 : std_ulogic;
signal valid_1 : std_ulogic;

begin
addend <= (others => m_in.neg_result);

m00: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "0000000" & m_in.data1(22 downto 0),
ACIN => (others => '0'),
ALUMODE => "0000",
B => '0' & m_in.data2(16 downto 0),
BCIN => (others => '0'),
C => "00000000000000" & addend(33 downto 0),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0110101",
P => m00_p,
PCIN => (others => '0'),
PCOUT => m00_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m01: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "0000000" & m_in.data1(22 downto 0),
ACIN => (others => '0'),
ALUMODE => "0000",
B => '0' & m_in.data2(33 downto 17),
BCIN => (others => '0'),
C => (others => '0'),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "1010101",
P => m01_p,
PCIN => m00_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m02: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "0000000" & m_in.data1(22 downto 0),
ACIN => (others => '0'),
ALUMODE => "0000",
B => '0' & m_in.data2(50 downto 34),
BCIN => (others => '0'),
C => x"0000000" & "000" & addend(50 downto 34),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0110101",
P => m02_p,
PCIN => (others => '0'),
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m03: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "0000000" & m_in.data1(22 downto 0),
ACIN => (others => '0'),
ALUMODE => "0000",
B => "00000" & m_in.data2(63 downto 51),
BCIN => (others => '0'),
C => x"000000" & '0' & addend(73 downto 51),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0110101",
P => m03_p,
PCIN => (others => '0'),
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m10: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
CREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "0000000000000" & m_in.data1(39 downto 23),
ACIN => (others => '0'),
ALUMODE => "0000",
B => '0' & m_in.data2(16 downto 0),
BCIN => (others => '0'),
C => x"000" & "00" & m01_p(39 downto 6),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '0',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0110101",
P => m10_p,
PCIN => (others => '0'),
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m11: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
CREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "0000000000000" & m_in.data1(39 downto 23),
ACIN => (others => '0'),
ALUMODE => "0000",
B => '0' & m_in.data2(33 downto 17),
BCIN => (others => '0'),
C => x"000" & "00" & m02_p(39 downto 6),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '0',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0110101",
P => m11_p,
PCIN => (others => '0'),
PCOUT => m11_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m12: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
CREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "0000000000000" & m_in.data1(39 downto 23),
ACIN => (others => '0'),
ALUMODE => "0000",
B => '0' & m_in.data2(50 downto 34),
BCIN => (others => '0'),
C => x"0000" & '0' & m03_p(36 downto 6),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '0',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0110101",
P => m12_p,
PCIN => (others => '0'),
PCOUT => m12_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m13: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "0000000000000" & m_in.data1(39 downto 23),
ACIN => (others => '0'),
ALUMODE => "0000",
B => "00000" & m_in.data2(63 downto 51),
BCIN => (others => '0'),
C => x"0000000" & "000" & addend(90 downto 74),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0110101",
P => m13_p,
PCIN => (others => '0'),
PCOUT => m13_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m20: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "000000" & m_in.data1(63 downto 40),
ACIN => (others => '0'),
ALUMODE => "0000",
B => '0' & m_in.data2(16 downto 0),
BCIN => (others => '0'),
C => (others => '0'),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0010101",
P => m20_p,
PCIN => m11_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m21: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "000000" & m_in.data1(63 downto 40),
ACIN => (others => '0'),
ALUMODE => "0000",
B => '0' & m_in.data2(33 downto 17),
BCIN => (others => '0'),
C => (others => '0'),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0010101",
P => m21_p,
PCIN => m12_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m22: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "000000" & m_in.data1(63 downto 40),
ACIN => (others => '0'),
ALUMODE => "0000",
B => '0' & m_in.data2(50 downto 34),
BCIN => (others => '0'),
C => (others => '0'),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0010101",
P => m22_p,
PCIN => m13_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

m23: DSP48E1
generic map (
ACASCREG => 0,
ALUMODEREG => 0,
AREG => 0,
BCASCREG => 0,
BREG => 0,
CARRYINREG => 0,
CARRYINSELREG => 0,
INMODEREG => 0,
OPMODEREG => 0,
PREG => 0
)
port map (
A => "000000" & m_in.data1(63 downto 40),
ACIN => (others => '0'),
ALUMODE => "0000",
B => "00000" & m_in.data2(63 downto 51),
BCIN => (others => '0'),
C => x"00" & "000" & addend(127 downto 91),
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '0',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '0',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '1',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0110101",
P => m23_p,
PCIN => (others => '0'),
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

s0: DSP48E1
generic map (
ACASCREG => 1,
ALUMODEREG => 0,
AREG => 1,
BCASCREG => 1,
BREG => 1,
CARRYINREG => 0,
CARRYINSELREG => 0,
CREG => 1,
INMODEREG => 0,
MREG => 0,
OPMODEREG => 0,
PREG => 0,
USE_MULT => "none"
)
port map (
A => m22_p(5 downto 0) & x"0000" & m10_p(34 downto 27),
ACIN => (others => '0'),
ALUMODE => "0000",
B => m10_p(26 downto 9),
BCIN => (others => '0'),
C => m20_p(39 downto 0) & m02_p(5 downto 0) & "00",
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CARRYOUT => s0_carry,
CEA1 => '0',
CEA2 => '1',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '1',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '0',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0001111",
PCIN => (others => '0'),
PCOUT => s0_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

s1: DSP48E1
generic map (
ACASCREG => 1,
ALUMODEREG => 0,
AREG => 1,
BCASCREG => 1,
BREG => 1,
CARRYINREG => 0,
CARRYINSELREG => 0,
CREG => 1,
INMODEREG => 0,
MREG => 0,
OPMODEREG => 0,
PREG => 0,
USE_MULT => "none"
)
port map (
A => x"000" & m22_p(41 downto 24),
ACIN => (others => '0'),
ALUMODE => "0000",
B => m22_p(23 downto 6),
BCIN => (others => '0'),
C => m23_p(36 downto 0) & x"00" & "0" & m20_p(41 downto 40),
CARRYCASCIN => '0',
CARRYIN => s0_carry(3),
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '1',
CEAD => '0',
CEALUMODE => '0',
CEB1 => '0',
CEB2 => '1',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '0',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0001111",
PCIN => (others => '0'),
PCOUT => s1_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

-- mask is 0 for 32-bit ops, 0x00007fffffff (bits 30 downto 0 set) for 64-bit
p0_mask(47 downto 31) <= (others => '0');
p0_mask(30 downto 0) <= (others => not r32_1);

p0: DSP48E1
generic map (
ACASCREG => 1,
ALUMODEREG => 1,
AREG => 1,
BCASCREG => 1,
BREG => 1,
CARRYINREG => 0,
CARRYINSELREG => 0,
CREG => 1,
INMODEREG => 0,
MREG => 0,
OPMODEREG => 0,
PREG => 0,
SEL_MASK => "C",
USE_MULT => "none",
USE_PATTERN_DETECT => "PATDET"
)
port map (
A => m21_p(22 downto 0) & m03_p(5 downto 0) & '0',
ACIN => (others => '0'),
ALUMODE => "00" & rneg_1 & '0',
B => (others => '0'),
BCIN => (others => '0'),
C => p0_mask,
CARRYCASCIN => '0',
CARRYIN => '0',
CARRYINSEL => "000",
CARRYOUT => p0_carry,
CEA1 => '0',
CEA2 => '1',
CEAD => '0',
CEALUMODE => '1',
CEB1 => '0',
CEB2 => '1',
CEC => '1',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '0',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0010011",
P => product(79 downto 32),
PATTERNDETECT => p0_pat,
PATTERNBDETECT => p0_patb,
PCIN => s0_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

p1: DSP48E1
generic map (
ACASCREG => 1,
ALUMODEREG => 1,
AREG => 1,
BCASCREG => 1,
BREG => 1,
CARRYINREG => 0,
CARRYINSELREG => 0,
CREG => 0,
INMODEREG => 0,
MASK => x"000000000000",
MREG => 0,
OPMODEREG => 0,
PREG => 0,
USE_MULT => "none",
USE_PATTERN_DETECT => "PATDET"
)
port map (
A => x"0000000" & '0' & m21_p(41),
ACIN => (others => '0'),
ALUMODE => "00" & rneg_1 & '0',
B => m21_p(40 downto 23),
BCIN => (others => '0'),
C => (others => '0'),
CARRYCASCIN => '0',
CARRYIN => p0_carry(3),
CARRYINSEL => "000",
CEA1 => '0',
CEA2 => '1',
CEAD => '0',
CEALUMODE => '1',
CEB1 => '0',
CEB2 => '1',
CEC => '0',
CECARRYIN => '0',
CECTRL => '0',
CED => '0',
CEINMODE => '0',
CEM => '0',
CEP => '0',
CLK => clk,
D => (others => '0'),
INMODE => "00000",
MULTSIGNIN => '0',
OPMODE => "0010011",
P => product(127 downto 80),
PATTERNDETECT => p1_pat,
PATTERNBDETECT => p1_patb,
PCIN => s1_pc,
RSTA => '0',
RSTALLCARRYIN => '0',
RSTALUMODE => '0',
RSTB => '0',
RSTC => '0',
RSTCTRL => '0',
RSTD => '0',
RSTINMODE => '0',
RSTM => '0',
RSTP => '0'
);

product(31 downto 0) <= product_lo xor (31 downto 0 => req_neg);

-- Combinational output stage: forwards the product and derives the
-- overflow flag from the pattern-detect outputs of p0 and p1.
mult_out: process(all)
    variable pat_hit  : std_ulogic;
    variable patb_hit : std_ulogic;
begin
    -- overflow is set if the high bits are neither all zeroes nor all ones
    pat_hit  := p1_pat and p0_pat;
    patb_hit := p1_patb and p0_patb;
    if req_32bit /= '0' then
        -- 32-bit request: bit 31 of the product has to agree as well
        pat_hit  := pat_hit and not product(31);
        patb_hit := patb_hit and product(31);
    end if;

    m_out.result <= product;
    m_out.overflow <= not (pat_hit or patb_hit);
end process;

-- Pipeline registers.  The valid, is_32bit and neg_result bits of m_in pass
-- through two register stages (the *_1 signals, then m_out.valid /
-- req_32bit / req_neg).  The low 32 product bits are captured on every
-- rising edge.
process(clk)
begin
    if rising_edge(clk) then
        -- first stage: sample the incoming request
        valid_1 <= m_in.valid;
        r32_1 <= m_in.is_32bit;
        rneg_1 <= m_in.neg_result;

        -- second stage: what the output logic sees
        m_out.valid <= valid_1;
        req_32bit <= r32_1;
        req_neg <= rneg_1;

        -- low product bits assembled from m10, m01 and m00
        product_lo <= m10_p(8 downto 0) & m01_p(5 downto 0) & m00_p(16 downto 0);
    end if;
end process;

end architecture behaviour;
Loading…
Cancel
Save