diff --git a/Makefile b/Makefile index 85a0262..e2398c0 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ all: $(all) $(GHDL) -a $(GHDLFLAGS) $< common.o: decode_types.o -control.o: gpr_hazard.o cr_hazard.o +control.o: gpr_hazard.o cr_hazard.o common.o sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o multiply.o writeback.o core_debug.o divider.o diff --git a/common.vhdl b/common.vhdl index 44198b0..8e24ab9 100644 --- a/common.vhdl +++ b/common.vhdl @@ -12,15 +12,59 @@ package common is function decode_spr_num(insn: std_ulogic_vector(31 downto 0)) return spr_num_t; - constant SPR_LR : spr_num_t := 8; - constant SPR_CTR : spr_num_t := 9; - constant SPR_TB : spr_num_t := 268; + constant SPR_XER : spr_num_t := 1; + constant SPR_LR : spr_num_t := 8; + constant SPR_CTR : spr_num_t := 9; + constant SPR_TB : spr_num_t := 268; + constant SPR_SRR0 : spr_num_t := 26; + constant SPR_SRR1 : spr_num_t := 27; + constant SPR_HSRR0 : spr_num_t := 314; + constant SPR_HSRR1 : spr_num_t := 315; + constant SPR_SPRG0 : spr_num_t := 272; + constant SPR_SPRG1 : spr_num_t := 273; + constant SPR_SPRG2 : spr_num_t := 274; + constant SPR_SPRG3 : spr_num_t := 275; + constant SPR_SPRG3U : spr_num_t := 259; + constant SPR_HSPRG0 : spr_num_t := 304; + constant SPR_HSPRG1 : spr_num_t := 305; + -- GPR indices in the register file (GPR only) + subtype gpr_index_t is std_ulogic_vector(4 downto 0); + + -- Extended GPR index (can hold an SPR) + subtype gspr_index_t is std_ulogic_vector(5 downto 0); + + -- Some SPRs are stored in the register file, they use the magic + -- GPR numbers above 31. + -- + -- The function fast_spr_num() returns the corresponding fast + -- pseudo-GPR number for a given SPR number. The result MSB + -- indicates if this is indeed a fast SPR. 
If clear, then + -- the SPR is not stored in the GPR file. + -- + function fast_spr_num(spr: spr_num_t) return gspr_index_t; + + -- Indices conversion functions + function gspr_to_gpr(i: gspr_index_t) return gpr_index_t; + function gpr_to_gspr(i: gpr_index_t) return gspr_index_t; + function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t; + function is_fast_spr(s: gspr_index_t) return std_ulogic; + + -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are + -- in the CR file as a kind of CR extension (with a separate write + -- control). The rest is stored as a fast SPR. + type xer_common_t is record + ca : std_ulogic; + ca32 : std_ulogic; + ov : std_ulogic; + ov32 : std_ulogic; + so : std_ulogic; + end record; + constant xerc_init : xer_common_t := (others => '0'); + + -- This needs to die... type ctrl_t is record - lr: std_ulogic_vector(63 downto 0); - ctr: std_ulogic_vector(63 downto 0); tb: std_ulogic_vector(63 downto 0); - carry: std_ulogic; end record; type Fetch1ToIcacheType is record @@ -49,6 +93,8 @@ package common is stop_mark : std_ulogic; nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); + ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr + ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) decode: decode_rom_t; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', decode => decode_rom_init, others => (others => '0')); @@ -57,15 +103,17 @@ package common is valid: std_ulogic; insn_type: insn_type_t; nia: std_ulogic_vector(63 downto 0); - write_reg: std_ulogic_vector(4 downto 0); - read_reg1: std_ulogic_vector(4 downto 0); - read_reg2: std_ulogic_vector(4 downto 0); + write_reg: gspr_index_t; + read_reg1: gspr_index_t; + read_reg2: gspr_index_t; read_data1: std_ulogic_vector(63 downto 0); read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); cr: std_ulogic_vector(31 downto 
0); + xerc: xer_common_t; lr: std_ulogic; rc: std_ulogic; + oe: std_ulogic; invert_a: std_ulogic; invert_out: std_ulogic; input_carry: carry_in_t; @@ -78,23 +126,28 @@ package common is data_len: std_ulogic_vector(3 downto 0); end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := - (valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', invert_a => '0', + (valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', - is_32bit => '0', is_signed => '0', others => (others => '0')); + is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0')); type Decode2ToMultiplyType is record valid: std_ulogic; insn_type: insn_type_t; - write_reg: std_ulogic_vector(4 downto 0); + write_reg: gpr_index_t; data1: std_ulogic_vector(64 downto 0); data2: std_ulogic_vector(64 downto 0); rc: std_ulogic; + oe: std_ulogic; + is_32bit: std_ulogic; + xerc: xer_common_t; end record; - constant Decode2ToMultiplyInit : Decode2ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0', others => (others => '0')); + constant Decode2ToMultiplyInit : Decode2ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0', + oe => '0', is_32bit => '0', xerc => xerc_init, + others => (others => '0')); type Decode2ToDividerType is record valid: std_ulogic; - write_reg: std_ulogic_vector(4 downto 0); + write_reg: gpr_index_t; dividend: std_ulogic_vector(63 downto 0); divisor: std_ulogic_vector(63 downto 0); is_signed: std_ulogic; @@ -102,16 +155,21 @@ package common is is_extended: std_ulogic; is_modulus: std_ulogic; rc: std_ulogic; + oe: std_ulogic; + xerc: xer_common_t; end record; - constant Decode2ToDividerInit: Decode2ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', is_extended => '0', is_modulus => '0', rc => '0', others => (others => '0')); + constant Decode2ToDividerInit: Decode2ToDividerType := 
(valid => '0', is_signed => '0', is_32bit => '0', + is_extended => '0', is_modulus => '0', + rc => '0', oe => '0', xerc => xerc_init, + others => (others => '0')); type Decode2ToRegisterFileType is record read1_enable : std_ulogic; - read1_reg : std_ulogic_vector(4 downto 0); + read1_reg : gspr_index_t; read2_enable : std_ulogic; - read2_reg : std_ulogic_vector(4 downto 0); + read2_reg : gspr_index_t; read3_enable : std_ulogic; - read3_reg : std_ulogic_vector(4 downto 0); + read3_reg : gpr_index_t; end record; type RegisterFileToDecode2Type is record @@ -126,6 +184,7 @@ package common is type CrFileToDecode2Type is record read_cr_data : std_ulogic_vector(31 downto 0); + read_xerc_data : xer_common_t; end record; type Execute1ToFetch1Type is record @@ -140,14 +199,17 @@ package common is addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read - write_reg : std_ulogic_vector(4 downto 0); -- read data goes to this register + write_reg : gpr_index_t; length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? 
- update_reg : std_ulogic_vector(4 downto 0); -- if so, the register to update + update_reg : gpr_index_t; -- if so, the register to update + xerc : xer_common_t; end record; - constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0')); + constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', + sign_extend => '0', update => '0', xerc => xerc_init, + others => (others => '0')); type Loadstore1ToDcacheType is record valid : std_ulogic; @@ -155,63 +217,82 @@ package common is nc : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); - write_reg : std_ulogic_vector(4 downto 0); + write_reg : gpr_index_t; length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; sign_extend : std_ulogic; update : std_ulogic; - update_reg : std_ulogic_vector(4 downto 0); + update_reg : gpr_index_t; + xerc : xer_common_t; end record; type DcacheToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; - write_reg : std_ulogic_vector(4 downto 0); + write_reg : gpr_index_t; write_data : std_ulogic_vector(63 downto 0); write_len : std_ulogic_vector(3 downto 0); write_shift : std_ulogic_vector(2 downto 0); sign_extend : std_ulogic; byte_reverse : std_ulogic; second_word : std_ulogic; + xerc : xer_common_t; end record; - constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', byte_reverse => '0', second_word => '0', others => (others => '0')); + constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', + byte_reverse => '0', second_word => '0', xerc => xerc_init, + others => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; rc : std_ulogic; write_enable : std_ulogic; - write_reg: std_ulogic_vector(4 downto 0); + write_reg: 
gspr_index_t; write_data: std_ulogic_vector(63 downto 0); write_len : std_ulogic_vector(3 downto 0); write_cr_enable : std_ulogic; write_cr_mask : std_ulogic_vector(7 downto 0); write_cr_data : std_ulogic_vector(31 downto 0); + write_xerc_enable : std_ulogic; + xerc : xer_common_t; sign_extend: std_ulogic; end record; - constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', write_enable => '0', write_cr_enable => '0', sign_extend => '0', others => (others => '0')); + constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', write_enable => '0', + write_cr_enable => '0', sign_extend => '0', + write_xerc_enable => '0', xerc => xerc_init, + others => (others => '0')); type MultiplyToWritebackType is record valid: std_ulogic; write_reg_enable : std_ulogic; - write_reg_nr: std_ulogic_vector(4 downto 0); + write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); + write_xerc_enable : std_ulogic; + xerc : xer_common_t; rc: std_ulogic; end record; - constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0', rc => '0', others => (others => '0')); + constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0', + rc => '0', write_xerc_enable => '0', + xerc => xerc_init, + others => (others => '0')); type DividerToWritebackType is record valid: std_ulogic; write_reg_enable : std_ulogic; - write_reg_nr: std_ulogic_vector(4 downto 0); + write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); + write_xerc_enable : std_ulogic; + xerc : xer_common_t; rc: std_ulogic; end record; - constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0', rc => '0', others => (others => '0')); + constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0', + rc => '0', write_xerc_enable => '0', + xerc => xerc_init, + others => (others => 
'0')); type WritebackToRegisterFileType is record - write_reg : std_ulogic_vector(4 downto 0); + write_reg : gspr_index_t; write_data : std_ulogic_vector(63 downto 0); write_enable : std_ulogic; end record; @@ -221,9 +302,12 @@ package common is write_cr_enable : std_ulogic; write_cr_mask : std_ulogic_vector(7 downto 0); write_cr_data : std_ulogic_vector(31 downto 0); + write_xerc_enable : std_ulogic; + write_xerc_data : xer_common_t; end record; - constant WritebackToCrFileInit : WritebackToCrFileType := (write_cr_enable => '0', others => (others => '0')); - + constant WritebackToCrFileInit : WritebackToCrFileType := (write_cr_enable => '0', write_xerc_enable => '0', + write_xerc_data => xerc_init, + others => (others => '0')); end common; package body common is @@ -231,4 +315,63 @@ package body common is begin return to_integer(unsigned(insn(15 downto 11) & insn(20 downto 16))); end; + function fast_spr_num(spr: spr_num_t) return gspr_index_t is + variable n : integer range 0 to 31; + begin + case spr is + when SPR_LR => + n := 0; + when SPR_CTR => + n:= 1; + when SPR_SRR0 => + n := 2; + when SPR_SRR1 => + n := 3; + when SPR_HSRR0 => + n := 4; + when SPR_HSRR1 => + n := 5; + when SPR_SPRG0 => + n := 6; + when SPR_SPRG1 => + n := 7; + when SPR_SPRG2 => + n := 8; + when SPR_SPRG3 | SPR_SPRG3U => + n := 9; + when SPR_HSPRG0 => + n := 10; + when SPR_HSPRG1 => + n := 11; + when SPR_XER => + n := 12; + when others => + return "000000"; + end case; + return "1" & std_ulogic_vector(to_unsigned(n, 5)); + end; + + function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is + begin + return i(4 downto 0); + end; + + function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is + begin + return "0" & i; + end; + + function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is + begin + if s(5) = '1' then + return s; + else + return gpr_to_gspr(g); + end if; + end; + + function is_fast_spr(s: gspr_index_t) return std_ulogic is + begin + return s(5); + end; 
end common; diff --git a/control.vhdl b/control.vhdl index 0555b06..fed5618 100644 --- a/control.vhdl +++ b/control.vhdl @@ -1,6 +1,9 @@ library ieee; use ieee.std_logic_1164.all; +library work; +use work.common.all; + entity control is generic ( PIPELINE_DEPTH : natural := 2 @@ -12,20 +15,21 @@ entity control is complete_in : in std_ulogic; valid_in : in std_ulogic; flush_in : in std_ulogic; + stall_in : in std_ulogic; sgl_pipe_in : in std_ulogic; stop_mark_in : in std_ulogic; gpr_write_valid_in : in std_ulogic; - gpr_write_in : in std_ulogic_vector(4 downto 0); + gpr_write_in : in gspr_index_t; gpr_a_read_valid_in : in std_ulogic; - gpr_a_read_in : in std_ulogic_vector(4 downto 0); + gpr_a_read_in : in gspr_index_t; gpr_b_read_valid_in : in std_ulogic; - gpr_b_read_in : in std_ulogic_vector(4 downto 0); + gpr_b_read_in : in gspr_index_t; gpr_c_read_valid_in : in std_ulogic; - gpr_c_read_in : in std_ulogic_vector(4 downto 0); + gpr_c_read_in : in gpr_index_t; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; @@ -61,6 +65,7 @@ begin ) port map ( clk => clk, + stall_in => stall_in, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, @@ -76,6 +81,7 @@ begin ) port map ( clk => clk, + stall_in => stall_in, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, @@ -91,11 +97,12 @@ begin ) port map ( clk => clk, + stall_in => stall_in, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, gpr_read_valid_in => gpr_c_read_valid_in, - gpr_read_in => gpr_c_read_in, + gpr_read_in => "0" & gpr_c_read_in, stall_out => stall_c_out ); @@ -106,6 +113,7 @@ begin ) port map ( clk => clk, + stall_in => stall_in, cr_read_in => cr_read_in, cr_write_in => cr_write_valid, @@ -129,8 +137,8 @@ begin v_int := r_int; -- asynchronous - valid_tmp := valid_in and not flush_in; - stall_tmp := '0'; + valid_tmp := valid_in and not flush_in and not stall_in; + stall_tmp := stall_in; if complete_in = '1' then v_int.outstanding := 
r_int.outstanding - 1; diff --git a/core.vhdl b/core.vhdl index 22f7dca..f95a1af 100644 --- a/core.vhdl +++ b/core.vhdl @@ -76,8 +76,10 @@ architecture behave of core is signal icache_stall_out : std_ulogic; signal fetch2_stall_in : std_ulogic; signal decode1_stall_in : std_ulogic; + signal decode2_stall_in : std_ulogic; signal decode2_stall_out : std_ulogic; signal ex1_icache_inval: std_ulogic; + signal ex1_stall_out: std_ulogic; signal flush: std_ulogic; @@ -184,6 +186,7 @@ begin port map ( clk => clk, rst => core_rst, + stall_in => decode2_stall_in, stall_out => decode2_stall_out, flush_in => flush, complete_in => complete, @@ -198,6 +201,7 @@ begin c_in => cr_file_to_decode2, c_out => decode2_to_cr_file ); + decode2_stall_in <= ex1_stall_out; register_file_0: entity work.register_file generic map ( @@ -223,6 +227,7 @@ begin port map ( clk => clk, flush_out => flush, + stall_out => ex1_stall_out, e_in => decode2_to_execute1, f_out => execute1_to_fetch1, e_out => execute1_to_writeback, diff --git a/cr_file.vhdl b/cr_file.vhdl index fa56dd9..dcd21be 100644 --- a/cr_file.vhdl +++ b/cr_file.vhdl @@ -18,7 +18,9 @@ end entity cr_file; architecture behaviour of cr_file is signal crs : std_ulogic_vector(31 downto 0) := (others => '0'); - signal crs_updated : std_ulogic_vector(31 downto 0) := (others => '0'); + signal crs_updated : std_ulogic_vector(31 downto 0); + signal xerc : xer_common_t := xerc_init; + signal xerc_updated : xer_common_t; begin cr_create_0: process(all) variable hi, lo : integer := 0; @@ -35,6 +37,13 @@ begin end loop; crs_updated <= cr_tmp; + + if w_in.write_xerc_enable = '1' then + xerc_updated <= w_in.write_xerc_data; + else + xerc_updated <= xerc; + end if; + end process; -- synchronous writes @@ -43,8 +52,12 @@ begin if rising_edge(clk) then if w_in.write_cr_enable = '1' then report "Writing " & to_hstring(w_in.write_cr_data) & " to CR mask " & to_hstring(w_in.write_cr_mask); + crs <= crs_updated; end if; - crs <= crs_updated; + if 
w_in.write_xerc_enable = '1' then + report "Writing XERC"; + xerc <= xerc_updated; + end if; end if; end process; @@ -56,5 +69,6 @@ begin report "Reading CR " & to_hstring(crs_updated); end if; d_out.read_cr_data <= crs_updated; + d_out.read_xerc_data <= xerc_updated; end process; end architecture behaviour; diff --git a/cr_hazard.vhdl b/cr_hazard.vhdl index 2a434ac..f6c5f3f 100644 --- a/cr_hazard.vhdl +++ b/cr_hazard.vhdl @@ -7,7 +7,8 @@ entity cr_hazard is PIPELINE_DEPTH : natural := 2 ); port( - clk : in std_logic; + clk : in std_ulogic; + stall_in : in std_ulogic; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; @@ -29,7 +30,9 @@ begin cr_hazard0: process(clk) begin if rising_edge(clk) then - r <= rin; + if stall_in = '0' then + r <= rin; + end if; end if; end process; diff --git a/dcache.vhdl b/dcache.vhdl index 7d6e74c..df54c95 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -185,6 +185,7 @@ architecture rtl of dcache is length : std_ulogic_vector(3 downto 0); sign_extend : std_ulogic; byte_reverse : std_ulogic; + xerc : xer_common_t; end record; signal r2 : reg_stage_2_t; @@ -469,6 +470,7 @@ begin d_out.sign_extend <= r2.sign_extend; d_out.byte_reverse <= r2.byte_reverse; d_out.second_word <= '0'; + d_out.xerc <= r2.xerc; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -518,6 +520,7 @@ begin d_out.sign_extend <= r1.req.sign_extend; d_out.byte_reverse <= r1.req.byte_reverse; d_out.write_len <= r1.req.length; + d_out.xerc <= r1.req.xerc; end if; -- If it's a store or a non-update load form, complete now @@ -539,6 +542,7 @@ begin d_out.write_len <= "1000"; d_out.sign_extend <= '0'; d_out.byte_reverse <= '0'; + d_out.xerc <= r1.req.xerc; -- If it was a load, this completes the operation (load with -- update case). 
diff --git a/decode1.vhdl b/decode1.vhdl index 1a7bc0b..b4e7d26 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -36,14 +36,14 @@ architecture behaviour of decode1 is constant major_decode_rom_array : major_rom_array_t := ( -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe - 12 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- addic - 13 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '1'), -- addic. + 12 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addic + 13 => (ALU, OP_ADD, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- addic. 14 => (ALU, OP_ADD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addi 15 => (ALU, OP_ADD, RA_OR_ZERO, CONST_SI_HI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addis 28 => (ALU, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andi. 29 => (ALU, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andis. 
18 => (ALU, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b - 16 => (ALU, OP_BC, NONE, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc + 16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpi 10 => (ALU, OP_CMPL, RA, CONST_UI, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz @@ -106,7 +106,7 @@ architecture behaviour of decode1 is -- addpcis not implemented yet 2#001# => (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- bclr, bcctr, bctar - 2#100# => (ALU, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '1'), + 2#100# => (ALU, OP_BCREG, SPR, SPR, NONE, SPR, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- isync 2#111# => (ALU, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), others => illegal_inst @@ -133,10 +133,15 @@ architecture behaviour of decode1 is -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe 2#0100001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- add - 2#0000001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- addc - 
2#0010001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- adde - 2#0011101010# => (ALU, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- addme - 2#0011001010# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- addze + 2#1100001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addo + 2#0000001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addc + 2#1000001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addco + 2#0010001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- adde + 2#1010001010# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addeo + 2#0011101010# => (ALU, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addme + 2#1011101010# => (ALU, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addmeo + 2#0011001010# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addze + 2#1011001010# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '0', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- addzeo 2#0000011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- and 2#0000111100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- andc -- 2#0011111100# 
bperm @@ -156,13 +161,21 @@ architecture behaviour of decode1 is 2#0011110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst -- 2#1111110110# dcbz 2#0110001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeu + 2#1110001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeuo 2#0110001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweu + 2#1110001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweuo 2#0110101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divde + 2#1110101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdeo 2#0110101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwe + 2#1110101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divweo 2#0111001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdu + 2#1111001001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divduo 2#0111001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwu + 2#1111001011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwuo 2#0111101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', 
'0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divd + 2#1111101001# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divdo 2#0111101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divw + 2#1111101011# => (DIV, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- divwo 2#0100011100# => (ALU, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- eqv 2#1110111010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsb 2#1110011010# => (ALU, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- extsh @@ -224,13 +237,13 @@ architecture behaviour of decode1 is -- 2#1000000000# mcrxr -- 2#1001000000# mcrxrx 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf - 2#0101010011# => (ALU, OP_MFSPR, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mfspr + 2#0101010011# => (ALU, OP_MFSPR, SPR, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr 2#0100001001# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modud 2#0100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- moduw 2#1100001001# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsd 2#1100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, 
'0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw 2#0010010000# => (ALU, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf - 2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mtspr + 2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr 2#0001001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulhd 2#0000001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- mulhdu 2#0001001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mulhw @@ -241,9 +254,12 @@ architecture behaviour of decode1 is 2#1001001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mulhw 2#1000001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '1'), -- mulhwu 2#0011101001# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulld + 2#1011101001# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulldo 2#0011101011# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mullw + 2#1011101011# => (MUL, OP_MUL_L64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mullwo 2#0111011100# => (ALU, OP_AND, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', 
'0'), -- nand 2#0001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- neg + 2#1001101000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nego 2#0001111100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nor 2#0110111100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- or 2#0110011100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- orc @@ -278,10 +294,15 @@ architecture behaviour of decode1 is 2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- stwux 2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- stwx 2#0000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subf - 2#0000001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- subfc - 2#0010001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- subfe - 2#0011101000# => (ALU, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- subfme - 2#0011001000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- subfze + 2#1000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfo + 2#0000001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '1', 
NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfc + 2#1000001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfco + 2#0010001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfe + 2#1010001000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfeo + 2#0011101000# => (ALU, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfme + 2#1011101000# => (ALU, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfmeo + 2#0011001000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfze + 2#1011001000# => (ALU, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subfzeo 2#1001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sync -- 2#0001000100# td 2#0000000100# => (ALU, OP_TW, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tw @@ -334,6 +355,8 @@ begin v.nia := f_in.nia; v.insn := f_in.insn; v.stop_mark := f_in.stop_mark; + v.ispr1 := (others => '0'); + v.ispr2 := (others => '0'); if f_in.valid = '1' then report "Decode insn " & to_hstring(f_in.insn) & " at " & to_hstring(f_in.nia); @@ -377,6 +400,33 @@ begin v.decode := major_decode_rom_array(to_integer(majorop)); end if; + -- Set ISPR1/ISPR2 when needed + if v.decode.insn_type = OP_BC or v.decode.insn_type = OP_BCREG then + -- Branch uses CTR as condition when BO(2) is 0. This is + -- also used to indicate that CTR is modified (they go + -- together). 
+ -- + if f_in.insn(23) = '0' then + v.ispr1 := fast_spr_num(SPR_CTR); + end if; + + -- Branch source register is an SPR + if v.decode.insn_type = OP_BCREG then + -- TODO: Add TAR + if f_in.insn(10) = '0' then + v.ispr2 := fast_spr_num(SPR_LR); + else + v.ispr2 := fast_spr_num(SPR_CTR); + end if; + end if; + elsif v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then + v.ispr1 := fast_spr_num(decode_spr_num(f_in.insn)); + -- Make slow SPRs single issue + if is_fast_spr(v.ispr1) = '0' then + v.decode.sgl_pipe := '1'; + end if; + end if; + if flush_in = '1' then v.valid := '0'; end if; diff --git a/decode2.vhdl b/decode2.vhdl index 1307e7d..f6f7101 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -14,6 +14,7 @@ entity decode2 is rst : in std_ulogic; complete_in : in std_ulogic; + stall_in : in std_ulogic; stall_out : out std_ulogic; stopped_out : out std_ulogic; @@ -47,30 +48,46 @@ architecture behaviour of decode2 is type decode_input_reg_t is record reg_valid : std_ulogic; - reg : std_ulogic_vector(4 downto 0); + reg : gspr_index_t; data : std_ulogic_vector(63 downto 0); end record; + type decode_output_reg_t is record + reg_valid : std_ulogic; + reg : gspr_index_t; + end record; + function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is - variable is_reg : std_ulogic; + reg_data : std_ulogic_vector(63 downto 0); + ispr : gspr_index_t) return decode_input_reg_t is begin - is_reg := '0' when insn_ra(insn_in) = "00000" else '1'; - if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then - --return (is_reg, insn_ra(insn_in), reg_data); - return ('1', insn_ra(insn_in), reg_data); + assert is_fast_spr(ispr) = '0' report "Decode A says GPR but ISPR says SPR:" & + to_hstring(ispr) severity failure; + return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data); + elsif t = SPR then + -- ISPR must be either a valid fast SPR number or all 0 for a slow 
SPR. + -- If it's all 0, we don't treat it as a dependency as slow SPRs + -- operations are single issue. + -- + assert is_fast_spr(ispr) = '1' or ispr = "000000" + report "Decode A says SPR but ISPR is invalid:" & + to_hstring(ispr) severity failure; + return (is_fast_spr(ispr), ispr, reg_data); else return ('0', (others => '0'), (others => '0')); end if; end; function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is + reg_data : std_ulogic_vector(63 downto 0); + ispr : gspr_index_t) return decode_input_reg_t is begin case t is when RB => - return ('1', insn_rb(insn_in), reg_data); + assert is_fast_spr(ispr) = '0' report "Decode B says GPR but ISPR says SPR:" & + to_hstring(ispr) severity failure; + return ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data); when CONST_UI => return ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64))); when CONST_SI => @@ -91,6 +108,14 @@ architecture behaviour of decode2 is return ('0', (others => '0'), x"00000000000000" & "00" & insn_in(1) & insn_in(15 downto 11)); when CONST_SH32 => return ('0', (others => '0'), x"00000000000000" & "000" & insn_in(15 downto 11)); + when SPR => + -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. + -- If it's all 0, we don't treat it as a dependency as slow SPRs + -- operations are single issue. 
+ assert is_fast_spr(ispr) = '1' or ispr = "000000" + report "Decode B says SPR but ISPR is invalid:" & + to_hstring(ispr) severity failure; + return (is_fast_spr(ispr), ispr, reg_data); when NONE => return ('0', (others => '0'), (others => '0')); end case; @@ -101,21 +126,30 @@ architecture behaviour of decode2 is begin case t is when RS => - return ('1', insn_rs(insn_in), reg_data); + return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data); when NONE => return ('0', (others => '0'), (others => '0')); end case; end; - function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0)) return std_ulogic_vector is + function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0); + ispr : gspr_index_t) return decode_output_reg_t is begin case t is when RT => - return insn_rt(insn_in); + return ('1', gpr_to_gspr(insn_rt(insn_in))); when RA => - return insn_ra(insn_in); + return ('1', gpr_to_gspr(insn_ra(insn_in))); + when SPR => + -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. + -- If it's all 0, we don't treat it as a dependency as slow SPRs + -- operations are single issue. + assert is_fast_spr(ispr) = '1' or ispr = "000000" + report "Decode B says SPR but ISPR is invalid:" & + to_hstring(ispr) severity failure; + return (is_fast_spr(ispr), ispr); when NONE => - return "00000"; + return ('0', "000000"); end case; end; @@ -131,22 +165,38 @@ architecture behaviour of decode2 is end case; end; + -- For now, use "rc" in the decode table to decide whether oe exists. + -- This is not entirely correct architecturally: For mulhd and + -- mulhdu, the OE field is reserved. It remains to be seen what an + -- actual POWER9 does if we set it on those instructions, for now we + -- test that further down when assigning to the multiplier oe input. 
+ -- + function decode_oe (t : rc_t; insn_in : std_ulogic_vector(31 downto 0)) return std_ulogic is + begin + case t is + when RC => + return insn_oe(insn_in); + when OTHERS => + return '0'; + end case; + end; + -- issue control signals signal control_valid_in : std_ulogic; signal control_valid_out : std_ulogic; signal control_sgl_pipe : std_logic; signal gpr_write_valid : std_ulogic; - signal gpr_write : std_ulogic_vector(4 downto 0); + signal gpr_write : gspr_index_t; signal gpr_a_read_valid : std_ulogic; - signal gpr_a_read : std_ulogic_vector(4 downto 0); + signal gpr_a_read :gspr_index_t; signal gpr_b_read_valid : std_ulogic; - signal gpr_b_read : std_ulogic_vector(4 downto 0); + signal gpr_b_read : gspr_index_t; signal gpr_c_read_valid : std_ulogic; - signal gpr_c_read : std_ulogic_vector(4 downto 0); + signal gpr_c_read : gpr_index_t; signal cr_write_valid : std_ulogic; begin @@ -160,6 +210,7 @@ begin complete_in => complete_in, valid_in => control_valid_in, + stall_in => stall_in, flush_in => flush_in, sgl_pipe_in => control_sgl_pipe, stop_mark_in => d_in.stop_mark, @@ -194,8 +245,8 @@ begin end if; end process; - r_out.read1_reg <= insn_ra(d_in.insn); - r_out.read2_reg <= insn_rb(d_in.insn); + r_out.read1_reg <= gpr_or_spr_to_gspr(insn_ra(d_in.insn), d_in.ispr1); + r_out.read2_reg <= gpr_or_spr_to_gspr(insn_rb(d_in.insn), d_in.ispr2); r_out.read3_reg <= insn_rs(d_in.insn); c_out.read <= d_in.decode.input_cr; @@ -207,6 +258,7 @@ begin variable decoded_reg_a : decode_input_reg_t; variable decoded_reg_b : decode_input_reg_t; variable decoded_reg_c : decode_input_reg_t; + variable decoded_reg_o : decode_output_reg_t; variable signed_division: std_ulogic; variable length : std_ulogic_vector(3 downto 0); begin @@ -223,10 +275,11 @@ begin --v.e.input_cr := d_in.decode.input_cr; --v.m.input_cr := d_in.decode.input_cr; --v.e.output_cr := d_in.decode.output_cr; - - decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data); - 
decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data); + + decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1); + decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); + decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1); r_out.read1_enable <= decoded_reg_a.reg_valid; r_out.read2_enable <= decoded_reg_b.reg_valid; @@ -253,9 +306,11 @@ begin v.e.read_reg2 := decoded_reg_b.reg; v.e.read_data2 := decoded_reg_b.data; v.e.read_data3 := decoded_reg_c.data; - v.e.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn); + v.e.write_reg := decoded_reg_o.reg; v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); + v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); v.e.cr := c_in.read_cr_data; + v.e.xerc := c_in.read_xerc_data; v.e.invert_a := d_in.decode.invert_a; v.e.invert_out := d_in.decode.invert_out; v.e.input_carry := d_in.decode.input_carry; @@ -272,8 +327,13 @@ begin v.m.insn_type := d_in.decode.insn_type; mul_a := decoded_reg_a.data; mul_b := decoded_reg_b.data; - v.m.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn); + v.m.write_reg := gspr_to_gpr(decoded_reg_o.reg); v.m.rc := decode_rc(d_in.decode.rc, d_in.insn); + v.m.xerc := c_in.read_xerc_data; + if v.m.insn_type = OP_MUL_L64 then + v.m.oe := decode_oe(d_in.decode.rc, d_in.insn); + end if; + v.m.is_32bit := d_in.decode.is_32bit; if d_in.decode.is_32bit = '1' then if d_in.decode.is_signed = '1' then @@ -304,7 +364,7 @@ begin -- s = 1 for signed, 0 for unsigned (for div*) -- t = 1 for 32-bit, 0 for 64-bit -- r = RC bit (record condition code) - v.d.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn); + v.d.write_reg := gspr_to_gpr(decoded_reg_o.reg); v.d.is_modulus := not d_in.insn(8); v.d.is_32bit := d_in.insn(2); if 
d_in.insn(8) = '1' then @@ -337,13 +397,15 @@ begin end if; end if; v.d.rc := decode_rc(d_in.decode.rc, d_in.insn); + v.d.xerc := c_in.read_xerc_data; + v.d.oe := decode_oe(d_in.decode.rc, d_in.insn); -- load/store unit - v.l.update_reg := decoded_reg_a.reg; + v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg); v.l.addr1 := decoded_reg_a.data; v.l.addr2 := decoded_reg_b.data; v.l.data := decoded_reg_c.data; - v.l.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn); + v.l.write_reg := gspr_to_gpr(decoded_reg_o.reg); if d_in.decode.insn_type = OP_LOAD then v.l.load := '1'; @@ -355,13 +417,14 @@ begin v.l.byte_reverse := d_in.decode.byte_reverse; v.l.sign_extend := d_in.decode.sign_extend; v.l.update := d_in.decode.update; + v.l.xerc := c_in.read_xerc_data; -- issue control control_valid_in <= d_in.valid; control_sgl_pipe <= d_in.decode.sgl_pipe; - gpr_write_valid <= '1' when d_in.decode.output_reg_a /= NONE else '0'; - gpr_write <= decode_output_reg(d_in.decode.output_reg_a, d_in.insn); + gpr_write_valid <= decoded_reg_o.reg_valid; + gpr_write <= decoded_reg_o.reg; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; @@ -370,7 +433,7 @@ begin gpr_b_read <= decoded_reg_b.reg; gpr_c_read_valid <= decoded_reg_c.reg_valid; - gpr_c_read <= decoded_reg_c.reg; + gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg); cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); diff --git a/decode_types.vhdl b/decode_types.vhdl index 9736f58..e847fcf 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -21,10 +21,10 @@ package decode_types is OP_TWI, OP_XOR, OP_SIM_CONFIG ); - type input_reg_a_t is (NONE, RA, RA_OR_ZERO); - type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DS, CONST_M1, CONST_SH, CONST_SH32); + type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR); + type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, 
CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR); type input_reg_c_t is (NONE, RS); - type output_reg_a_t is (NONE, RT, RA); + type output_reg_a_t is (NONE, RT, RA, SPR); type rc_t is (NONE, ONE, RC); type carry_in_t is (ZERO, CA, ONE); diff --git a/divider.vhdl b/divider.vhdl index 20d4600..39893a8 100644 --- a/divider.vhdl +++ b/divider.vhdl @@ -20,7 +20,7 @@ architecture behaviour of divider is signal div : unsigned(63 downto 0); signal quot : std_ulogic_vector(63 downto 0); signal result : std_ulogic_vector(63 downto 0); - signal sresult : std_ulogic_vector(63 downto 0); + signal sresult : std_ulogic_vector(64 downto 0); signal oresult : std_ulogic_vector(63 downto 0); signal qbit : std_ulogic; signal running : std_ulogic; @@ -36,7 +36,8 @@ architecture behaviour of divider is signal overflow : std_ulogic; signal ovf32 : std_ulogic; signal did_ovf : std_ulogic; - + signal oe : std_ulogic; + signal xerc : xer_common_t; begin divider_0: process(clk) begin @@ -62,6 +63,8 @@ begin is_32bit <= d_in.is_32bit; is_signed <= d_in.is_signed; rc <= d_in.rc; + oe <= d_in.oe; + xerc <= d_in.xerc; count <= "1111111"; running <= '1'; overflow <= '0'; @@ -120,13 +123,13 @@ begin result <= quot; end if; if neg_result = '1' then - sresult <= std_ulogic_vector(- signed(result)); + sresult <= std_ulogic_vector(- signed('0' & result)); else - sresult <= result; + sresult <= '0' & result; end if; did_ovf <= '0'; if is_32bit = '0' then - did_ovf <= overflow or (is_signed and (sresult(63) xor neg_result)); + did_ovf <= overflow or (is_signed and (sresult(64) xor sresult(63))); elsif is_signed = '1' then if ovf32 = '1' or sresult(32) /= sresult(31) then did_ovf <= '1'; @@ -140,20 +143,32 @@ begin -- 32-bit divisions set the top 32 bits of the result to 0 oresult <= x"00000000" & sresult(31 downto 0); else - oresult <= sresult; + oresult <= sresult(63 downto 0); end if; end process; divider_out: process(clk) begin if rising_edge(clk) then + d_out.valid <= '0'; d_out.write_reg_data <= 
oresult; + d_out.write_reg_enable <= '0'; + d_out.write_xerc_enable <= '0'; + d_out.xerc <= xerc; if count = "1000000" then d_out.valid <= '1'; d_out.write_reg_enable <= '1'; - else - d_out.valid <= '0'; - d_out.write_reg_enable <= '0'; + d_out.write_xerc_enable <= oe; + + -- We must test oe because the RC update code in writeback + -- will use the xerc value to set CR0:SO so we must not clobber + -- xerc if OE wasn't set. + -- + if oe = '1' then + d_out.xerc.ov <= did_ovf; + d_out.xerc.ov32 <= did_ovf; + d_out.xerc.so <= xerc.so or did_ovf; + end if; end if; end if; end process; diff --git a/execute1.vhdl b/execute1.vhdl index 862c631..9b14088 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -12,10 +12,11 @@ use work.ppc_fx_insns.all; entity execute1 is port ( - clk : in std_logic; + clk : in std_ulogic; -- asynchronous flush_out : out std_ulogic; + stall_out : out std_ulogic; e_in : in Decode2ToExecute1Type; @@ -31,14 +32,15 @@ end entity execute1; architecture behaviour of execute1 is type reg_type is record - --f : Execute1ToFetch1Type; e : Execute1ToWritebackType; + lr_update : std_ulogic; + next_lr : std_ulogic_vector(63 downto 0); end record; signal r, rin : reg_type; - signal ctrl: ctrl_t := (carry => '0', others => (others => '0')); - signal ctrl_tmp: ctrl_t := (carry => '0', others => (others => '0')); + signal ctrl: ctrl_t := (others => (others => '0')); + signal ctrl_tmp: ctrl_t := (others => (others => '0')); signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; signal rotator_result: std_ulogic_vector(63 downto 0); @@ -46,17 +48,46 @@ architecture behaviour of execute1 is signal logical_result: std_ulogic_vector(63 downto 0); signal countzero_result: std_ulogic_vector(63 downto 0); - function decode_input_carry (carry_sel : carry_in_t; ca_in : std_ulogic) return std_ulogic is + procedure set_carry(e: inout Execute1ToWritebackType; + carry32 : in std_ulogic; + carry : in std_ulogic) is begin - case carry_sel is + e.xerc.ca32 := carry32; + 
e.xerc.ca := carry; + e.write_xerc_enable := '1'; + end; + + procedure set_ov(e: inout Execute1ToWritebackType; + ov : in std_ulogic; + ov32 : in std_ulogic) is + begin + e.xerc.ov32 := ov32; + e.xerc.ov := ov; + if ov = '1' then + e.xerc.so := '1'; + end if; + e.write_xerc_enable := '1'; + end; + + function calc_ov(msb_a : std_ulogic; msb_b: std_ulogic; + ca: std_ulogic; msb_r: std_ulogic) return std_ulogic is + begin + return (ca xor msb_r) and not (msb_a xor msb_b); + end; + + function decode_input_carry(ic : carry_in_t; + xerc : xer_common_t) return std_ulogic is + begin + case ic is when ZERO => return '0'; when CA => - return ca_in; + return xerc.ca; when ONE => return '1'; end case; end; + begin rotator_0: entity work.rotator @@ -97,6 +128,12 @@ begin if rising_edge(clk) then r <= rin; ctrl <= ctrl_tmp; + assert not (r.lr_update = '1' and e_in.valid = '1') + report "LR update collision with valid in EX1" + severity failure; + if r.lr_update = '1' then + report "LR update to " & to_hstring(r.next_lr); + end if; end if; end process; @@ -117,6 +154,7 @@ begin variable bf, bfa : std_ulogic_vector(2 downto 0); variable l : std_ulogic; variable next_nia : std_ulogic_vector(63 downto 0); + variable carry_32, carry_64 : std_ulogic; begin result := (others => '0'); result_with_carry := (others => '0'); @@ -125,7 +163,43 @@ begin v := r; v.e := Execute1ToWritebackInit; - --v.f := Execute1ToFetch1TypeInit; + + -- XER forwarding. To avoid having to track XER hazards, we + -- use the previously latched value. + -- + -- If the XER was modified by a multiply or a divide, those are + -- single issue, we'll get the up to date value from decode2 from + -- the register file. + -- + -- If it was modified by an instruction older than the previous + -- one in EX1, it will have also hit writeback and will be up + -- to date in decode2. + -- + -- That leaves us with the case where it was updated by the previous + -- instruction in EX1. In that case, we can forward it back here. 
+ -- + -- This will break if we allow pipelining of multiply and divide, + -- but ideally, those should go via EX1 anyway and run as a state + -- machine from here. + -- + -- One additional hazard to beware of is an XER:SO modifying instruction + -- in EX1 followed immediately by a store conditional. Due to our + -- writeback latency, the store will go down the LSU with the previous + -- XER value, thus the stcx. will set CR0:SO using an obsolete SO value. + -- + -- We will need to handle that if we ever make stcx. not single issue + -- + -- We always pass a valid XER value downto writeback even when + -- we aren't updating it, in order for XER:SO -> CR0:SO transfer + -- to work for RC instructions. + -- + if r.e.write_xerc_enable = '1' then + v.e.xerc := r.e.xerc; + else + v.e.xerc := e_in.xerc; + end if; + + v.lr_update := '0'; ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq @@ -133,6 +207,7 @@ begin terminate_out <= '0'; icache_inval <= '0'; + stall_out <= '0'; f_out <= Execute1ToFetch1TypeInit; -- Next insn adder used in a couple of places @@ -163,10 +238,18 @@ begin else a_inv := not e_in.read_data1; end if; - result_with_carry := ppc_adde(a_inv, e_in.read_data2, decode_input_carry(e_in.input_carry, ctrl.carry)); + result_with_carry := ppc_adde(a_inv, e_in.read_data2, + decode_input_carry(e_in.input_carry, v.e.xerc)); result := result_with_carry(63 downto 0); - if e_in.output_carry then - ctrl_tmp.carry <= result_with_carry(64); + carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32); + carry_64 := result_with_carry(64); + if e_in.output_carry = '1' then + set_carry(v.e, carry_32, carry_64); + end if; + if e_in.oe = '1' then + set_ov(v.e, + calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)), + calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31))); end if; result_en := '1'; when OP_AND | OP_OR | OP_XOR => @@ -180,12 +263,15 @@ begin f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + 
signed(e_in.read_data2)); end if; when OP_BC => + -- read_data1 is CTR bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' then - ctrl_tmp.ctr <= std_ulogic_vector(unsigned(ctrl.ctr) - 1); + result := std_ulogic_vector(unsigned(e_in.read_data1) - 1); + result_en := '1'; + v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, ctrl.ctr) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then f_out.redirect <= '1'; if (insn_aa(e_in.insn)) then f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2)); @@ -194,19 +280,18 @@ begin end if; end if; when OP_BCREG => - -- bits 10 and 6 distinguish between bclr, bcctr and bctar + -- read_data1 is CTR + -- read_data2 is target register (CTR, LR or TAR) bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' and e_in.insn(10) = '0' then - ctrl_tmp.ctr <= std_ulogic_vector(unsigned(ctrl.ctr) - 1); + result := std_ulogic_vector(unsigned(e_in.read_data1) - 1); + result_en := '1'; + v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, ctrl.ctr) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then f_out.redirect <= '1'; - if e_in.insn(10) = '0' then - f_out.redirect_nia <= ctrl.lr(63 downto 2) & "00"; - else - f_out.redirect_nia <= ctrl.ctr(63 downto 2) & "00"; - end if; + f_out.redirect_nia <= e_in.read_data2(63 downto 2) & "00"; end if; when OP_CMPB => result := ppc_cmpb(e_in.read_data3, e_in.read_data2); @@ -220,7 +305,7 @@ begin for i in 0 to 7 loop lo := i*4; hi := lo + 3; - v.e.write_cr_data(hi downto lo) := ppc_cmp(l, e_in.read_data1, e_in.read_data2); + v.e.write_cr_data(hi downto lo) := ppc_cmp(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so); end loop; when OP_CMPL => bf := insn_bf(e_in.insn); @@ -231,7 +316,7 @@ begin for i in 0 to 7 loop lo := i*4; hi := lo + 3; - v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2); + v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, 
e_in.read_data1, e_in.read_data2, v.e.xerc.so); end loop; when OP_CNTZ => result := countzero_result; @@ -269,16 +354,24 @@ begin v.e.write_cr_data(hi downto lo) := newcrf; end loop; when OP_MFSPR => - case decode_spr_num(e_in.insn) is - when SPR_CTR => - result := ctrl.ctr; - when SPR_LR => - result := ctrl.lr; - when SPR_TB => - result := ctrl.tb; - when others => - result := (others => '0'); - end case; + if is_fast_spr(e_in.read_reg1) then + result := e_in.read_data1; + if decode_spr_num(e_in.insn) = SPR_XER then + result(63-32) := v.e.xerc.so; + result(63-33) := v.e.xerc.ov; + result(63-34) := v.e.xerc.ca; + result(63-35 downto 63-43) := "000000000"; + result(63-44) := v.e.xerc.ov32; + result(63-45) := v.e.xerc.ca32; + end if; + else + case decode_spr_num(e_in.insn) is + when SPR_TB => + result := ctrl.tb; + when others => + result := (others => '0'); + end case; + end if; result_en := '1'; when OP_MFCR => if e_in.insn(20) = '0' then @@ -309,13 +402,25 @@ begin end if; v.e.write_cr_data := e_in.read_data3(31 downto 0); when OP_MTSPR => - case decode_spr_num(e_in.insn) is - when SPR_CTR => - ctrl_tmp.ctr <= e_in.read_data3; - when SPR_LR => - ctrl_tmp.lr <= e_in.read_data3; - when others => - end case; + report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & + "=" & to_hstring(e_in.read_data3); + if is_fast_spr(e_in.write_reg) then + result := e_in.read_data3; + result_en := '1'; + if decode_spr_num(e_in.insn) = SPR_XER then + v.e.xerc.so := e_in.read_data3(63-32); + v.e.xerc.ov := e_in.read_data3(63-33); + v.e.xerc.ca := e_in.read_data3(63-34); + v.e.xerc.ov32 := e_in.read_data3(63-44); + v.e.xerc.ca32 := e_in.read_data3(63-45); + v.e.write_xerc_enable := '1'; + end if; + else +-- TODO: Implement slow SPRs +-- case decode_spr_num(e_in.insn) is +-- when others => +-- end case; + end if; when OP_POPCNTB => result := ppc_popcntb(e_in.read_data3); result_en := '1'; @@ -334,7 +439,7 @@ begin when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR => result 
:= rotator_result; if e_in.output_carry = '1' then - ctrl_tmp.carry <= rotator_carry; + set_carry(v.e, rotator_carry, rotator_carry); end if; result_en := '1'; when OP_SIM_CONFIG => @@ -359,15 +464,36 @@ begin report "illegal"; end case; + -- Update LR on the next cycle after a branch link + -- + -- WARNING: The LR update isn't tracked by our hazard tracker. This + -- will work (well I hope) because it only happens on branches + -- which will flush all decoded instructions. By the time + -- fetch catches up, we'll have the new LR. This will + -- *not* work properly however if we have a branch predictor, + -- in which case the solution would probably be to keep a + -- local cache of the updated LR in execute1 (flushed on + -- exceptions) that is used instead of the value from + -- decode when its content is valid. if e_in.lr = '1' then - ctrl_tmp.lr <= next_nia; + v.lr_update := '1'; + v.next_lr := next_nia; + v.e.valid := '0'; + report "Delayed LR update to " & to_hstring(next_nia); + stall_out <= '1'; end if; - + elsif r.lr_update = '1' then + result_en := '1'; + result := r.next_lr; + v.e.write_reg := fast_spr_num(SPR_LR); + v.e.write_len := x"8"; + v.e.sign_extend := '0'; + v.e.valid := '1'; end if; v.e.write_data := result; v.e.write_enable := result_en; - v.e.rc := e_in.rc; + v.e.rc := e_in.rc and e_in.valid; -- Update registers rin <= v; diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl index 6c8614b..705e69d 100644 --- a/gpr_hazard.vhdl +++ b/gpr_hazard.vhdl @@ -7,12 +7,13 @@ entity gpr_hazard is PIPELINE_DEPTH : natural := 2 ); port( - clk : in std_logic; + clk : in std_ulogic; + stall_in : in std_ulogic; gpr_write_valid_in : in std_ulogic; - gpr_write_in : in std_ulogic_vector(4 downto 0); + gpr_write_in : in std_ulogic_vector(5 downto 0); gpr_read_valid_in : in std_ulogic; - gpr_read_in : in std_ulogic_vector(4 downto 0); + gpr_read_in : in std_ulogic_vector(5 downto 0); stall_out : out std_ulogic ); @@ -20,7 +21,7 @@ end entity gpr_hazard; architecture 
behaviour of gpr_hazard is type pipeline_entry_type is record valid : std_ulogic; - gpr : std_ulogic_vector(4 downto 0); + gpr : std_ulogic_vector(5 downto 0); end record; constant pipeline_entry_init : pipeline_entry_type := (valid => '0', gpr => (others => '0')); @@ -32,7 +33,9 @@ begin gpr_hazard0: process(clk) begin if rising_edge(clk) then - r <= rin; + if stall_in = '0' then + r <= rin; + end if; end if; end process; diff --git a/helpers.vhdl b/helpers.vhdl index 3961332..fe91938 100644 --- a/helpers.vhdl +++ b/helpers.vhdl @@ -17,8 +17,8 @@ package helpers is function cmp_one_byte(a, b: std_ulogic_vector(7 downto 0)) return std_ulogic_vector; - function ppc_signed_compare(a, b: signed(63 downto 0)) return std_ulogic_vector; - function ppc_unsigned_compare(a, b: unsigned(63 downto 0)) return std_ulogic_vector; + function ppc_signed_compare(a, b: signed(63 downto 0); so: std_ulogic) return std_ulogic_vector; + function ppc_unsigned_compare(a, b: unsigned(63 downto 0); so: std_ulogic) return std_ulogic_vector; function ra_or_zero(ra: std_ulogic_vector(63 downto 0); reg: std_ulogic_vector(4 downto 0)) return std_ulogic_vector; @@ -126,32 +126,32 @@ package body helpers is return ret; end; - function ppc_signed_compare(a, b: signed(63 downto 0)) return std_ulogic_vector is - variable ret: std_ulogic_vector(3 downto 0); + function ppc_signed_compare(a, b: signed(63 downto 0); so: std_ulogic) return std_ulogic_vector is + variable ret: std_ulogic_vector(2 downto 0); begin if a < b then - ret := "1000"; + ret := "100"; elsif a > b then - ret := "0100"; + ret := "010"; else - ret := "0010"; + ret := "001"; end if; - return ret; + return ret & so; end; - function ppc_unsigned_compare(a, b: unsigned(63 downto 0)) return std_ulogic_vector is - variable ret: std_ulogic_vector(3 downto 0); + function ppc_unsigned_compare(a, b: unsigned(63 downto 0); so: std_ulogic) return std_ulogic_vector is + variable ret: std_ulogic_vector(2 downto 0); begin if a < b then - ret := 
"1000"; + ret := "100"; elsif a > b then - ret := "0100"; + ret := "010"; else - ret := "0010"; + ret := "001"; end if; - return ret; + return ret & so; end; function ra_or_zero(ra: std_ulogic_vector(63 downto 0); reg: std_ulogic_vector(4 downto 0)) return std_ulogic_vector is diff --git a/insn_helpers.vhdl b/insn_helpers.vhdl index d3ddcca..f58dacd 100644 --- a/insn_helpers.vhdl +++ b/insn_helpers.vhdl @@ -16,6 +16,7 @@ package insn_helpers is function insn_lk (insn_in : std_ulogic_vector) return std_ulogic; function insn_aa (insn_in : std_ulogic_vector) return std_ulogic; function insn_rc (insn_in : std_ulogic_vector) return std_ulogic; + function insn_oe (insn_in : std_ulogic_vector) return std_ulogic; function insn_bd (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_bf (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_bfa (insn_in : std_ulogic_vector) return std_ulogic_vector; @@ -103,6 +104,11 @@ package body insn_helpers is return insn_in(0); end; + function insn_oe (insn_in : std_ulogic_vector) return std_ulogic is + begin + return insn_in(10); + end; + function insn_bd (insn_in : std_ulogic_vector) return std_ulogic_vector is begin return insn_in(15 downto 2); diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 7fa8a42..1c16c46 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -47,6 +47,7 @@ begin v.sign_extend := l_in.sign_extend; v.update := l_in.update; v.update_reg := l_in.update_reg; + v.xerc := l_in.xerc; -- XXX Temporary hack. 
Mark the op as non-cachable if the address -- is the form 0xc------- diff --git a/multiply.vhdl b/multiply.vhdl index 94fa792..23339b5 100644 --- a/multiply.vhdl +++ b/multiply.vhdl @@ -27,8 +27,17 @@ architecture behaviour of multiply is data : signed(129 downto 0); write_reg : std_ulogic_vector(4 downto 0); rc : std_ulogic; + oe : std_ulogic; + is_32bit : std_ulogic; + xerc : xer_common_t; end record; - constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0', insn_type => OP_ILLEGAL, rc => '0', data => (others => '0'), others => (others => '0')); + constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0', + insn_type => OP_ILLEGAL, + rc => '0', oe => '0', + is_32bit => '0', + xerc => xerc_init, + data => (others => '0'), + others => (others => '0')); type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage; constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit); @@ -51,6 +60,7 @@ begin variable v : reg_type; variable d : std_ulogic_vector(129 downto 0); variable d2 : std_ulogic_vector(63 downto 0); + variable ov : std_ulogic; begin v := r; @@ -61,16 +71,26 @@ begin v.multiply_pipeline(0).data := signed(m.data1) * signed(m.data2); v.multiply_pipeline(0).write_reg := m.write_reg; v.multiply_pipeline(0).rc := m.rc; + v.multiply_pipeline(0).oe := m.oe; + v.multiply_pipeline(0).is_32bit := m.is_32bit; + v.multiply_pipeline(0).xerc := m.xerc; loop_0: for i in 1 to PIPELINE_DEPTH-1 loop v.multiply_pipeline(i) := r.multiply_pipeline(i-1); end loop; d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data); + ov := '0'; + -- TODO: Handle overflows case_0: case v.multiply_pipeline(PIPELINE_DEPTH-1).insn_type is when OP_MUL_L64 => d2 := d(63 downto 0); + if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then + ov := (or d(63 downto 31)) and not (and d(63 downto 31)); + else + ov := (or d(127 downto 63)) and not (and d(127 downto 63)); + end if; 
when OP_MUL_H32 => d2 := d(63 downto 32) & d(63 downto 32); when OP_MUL_H64 => @@ -82,11 +102,24 @@ begin m_out.write_reg_data <= d2; m_out.write_reg_nr <= v.multiply_pipeline(PIPELINE_DEPTH-1).write_reg; + m_out.xerc <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc; + -- Generate OV/OV32/SO when OE=1 if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then m_out.valid <= '1'; m_out.write_reg_enable <= '1'; m_out.rc <= v.multiply_pipeline(PIPELINE_DEPTH-1).rc; + m_out.write_xerc_enable <= v.multiply_pipeline(PIPELINE_DEPTH-1).oe; + + -- We must test oe because the RC update code in writeback + -- will use the xerc value to set CR0:SO so we must not clobber + -- xerc if OE wasn't set. + -- + if v.multiply_pipeline(PIPELINE_DEPTH-1).oe = '1' then + m_out.xerc.ov <= ov; + m_out.xerc.ov32 <= ov; + m_out.xerc.so <= v.multiply_pipeline(PIPELINE_DEPTH-1).xerc.so or ov; + end if; end if; rin <= v; diff --git a/ppc_fx_insns.vhdl b/ppc_fx_insns.vhdl index 407881f..0bf011d 100644 --- a/ppc_fx_insns.vhdl +++ b/ppc_fx_insns.vhdl @@ -77,10 +77,14 @@ package ppc_fx_insns is function ppc_mulhw (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; function ppc_mulhwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; - function ppc_cmpi (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0)) return std_ulogic_vector; - function ppc_cmp (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; - function ppc_cmpli (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0)) return std_ulogic_vector; - function ppc_cmpl (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; + function ppc_cmpi (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0); + so: std_ulogic) return std_ulogic_vector; + function ppc_cmp (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0); + so: std_ulogic) return std_ulogic_vector; + 
function ppc_cmpli (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0); + so: std_ulogic) return std_ulogic_vector; + function ppc_cmpl (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0); + so: std_ulogic) return std_ulogic_vector; function ppc_cmpb (rs, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; @@ -90,7 +94,6 @@ package ppc_fx_insns is function ppc_divwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer; - function ppc_bcctr_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0)) return integer; end package ppc_fx_insns; package body ppc_fx_insns is @@ -677,7 +680,8 @@ package body ppc_fx_insns is return std_ulogic_vector(tmp(63 downto 32)) & std_ulogic_vector(tmp(63 downto 32)); end; - function ppc_cmpi (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0)) return std_ulogic_vector is + function ppc_cmpi (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0); + so: std_ulogic) return std_ulogic_vector is variable tmp: signed(ra'range); begin tmp := signed(ra); @@ -685,10 +689,11 @@ package body ppc_fx_insns is tmp := resize(signed(ra(31 downto 0)), tmp'length); end if; - return ppc_signed_compare(tmp, resize(signed(si), tmp'length)); + return ppc_signed_compare(tmp, resize(signed(si), tmp'length), so); end; - function ppc_cmp (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + function ppc_cmp (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0); + so: std_ulogic) return std_ulogic_vector is variable tmpa, tmpb: signed(ra'range); begin tmpa := signed(ra); @@ -698,10 +703,11 @@ package body ppc_fx_insns is tmpb := resize(signed(rb(31 downto 0)), ra'length); end if; - return ppc_signed_compare(tmpa, tmpb); + return 
ppc_signed_compare(tmpa, tmpb, so); end; - function ppc_cmpli (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0)) return std_ulogic_vector is + function ppc_cmpli (l: std_ulogic; ra: std_ulogic_vector(63 downto 0); si: std_ulogic_vector(15 downto 0); + so: std_ulogic) return std_ulogic_vector is variable tmp: unsigned(ra'range); begin tmp := unsigned(ra); @@ -709,10 +715,11 @@ package body ppc_fx_insns is tmp := resize(unsigned(ra(31 downto 0)), tmp'length); end if; - return ppc_unsigned_compare(tmp, resize(unsigned(si), tmp'length)); + return ppc_unsigned_compare(tmp, resize(unsigned(si), tmp'length), so); end; - function ppc_cmpl (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + function ppc_cmpl (l: std_ulogic; ra, rb: std_ulogic_vector(63 downto 0); + so: std_ulogic) return std_ulogic_vector is variable tmpa, tmpb: unsigned(ra'range); begin tmpa := unsigned(ra); @@ -722,7 +729,7 @@ package body ppc_fx_insns is tmpb := resize(unsigned(rb(31 downto 0)), ra'length); end if; - return ppc_unsigned_compare(tmpa, tmpb); + return ppc_unsigned_compare(tmpa, tmpb, so); end; function ppc_cmpb (rs, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is @@ -801,21 +808,4 @@ package body ppc_fx_insns is return ret; end; - function ppc_bcctr_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0)) return integer is - variable crfield: integer; - variable crbit_match: std_ulogic; - variable cond_ok: std_ulogic; - variable ret: integer; - begin - crfield := to_integer(unsigned(bi)); - -- BE bit numbering - crbit_match := '1' when cr(31-crfield) = bo(4-1) else '0'; - cond_ok := bo(4-0) or crbit_match; - if cond_ok = '1' then - ret := 1; - else - ret := 0; - end if; - return ret; - end; end package body ppc_fx_insns; diff --git a/register_file.vhdl b/register_file.vhdl index 669093b..952d9fc 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -23,7 +23,7 @@ entity 
register_file is end entity register_file; architecture behaviour of register_file is - type regfile is array(0 to 31) of std_ulogic_vector(63 downto 0); + type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0); signal registers : regfile := (others => (others => '0')); begin -- synchronous writes @@ -32,7 +32,11 @@ begin if rising_edge(clk) then if w_in.write_enable = '1' then assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; - report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); + if w_in.write_reg(5) = '0' then + report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); + else + report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); + end if; registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data; end if; end if; @@ -52,7 +56,7 @@ begin end if; d_out.read1_data <= registers(to_integer(unsigned(d_in.read1_reg))); d_out.read2_data <= registers(to_integer(unsigned(d_in.read2_reg))); - d_out.read3_data <= registers(to_integer(unsigned(d_in.read3_reg))); + d_out.read3_data <= registers(to_integer(unsigned(gpr_to_gspr(d_in.read3_reg)))); -- Forward any written data if w_in.write_enable = '1' then @@ -62,7 +66,7 @@ begin if d_in.read2_reg = w_in.write_reg then d_out.read2_data <= w_in.write_data; end if; - if d_in.read3_reg = w_in.write_reg then + if gpr_to_gspr(d_in.read3_reg) = w_in.write_reg then d_out.read3_data <= w_in.write_data; end if; end if; diff --git a/writeback.vhdl b/writeback.vhdl index e2b74f8..b88277e 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -62,6 +62,8 @@ begin variable w : std_ulogic_vector(0 downto 0); variable j : integer; variable k : unsigned(3 downto 0); + variable cf: std_ulogic_vector(3 downto 0); + variable xe: xer_common_t; begin x := "" & e_in.valid; y := "" & l_in.valid; @@ -81,6 +83,11 @@ begin z := "" & (d_in.valid and d_in.rc); assert (to_integer(unsigned(w)) + 
to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; + x := "" & e_in.write_xerc_enable; + y := "" & m_in.write_xerc_enable; + z := "" & D_in.write_xerc_enable; + assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure; + w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; @@ -96,12 +103,12 @@ begin partial_write <= '0'; sign_extend <= '0'; second_word <= '0'; - data_in <= e_in.write_data; + xe := e_in.xerc; if e_in.write_enable = '1' then w_out.write_reg <= e_in.write_reg; - data_in <= e_in.write_data; w_out.write_enable <= '1'; + data_in <= e_in.write_data; data_len <= unsigned(e_in.write_len); sign_extend <= e_in.sign_extend; rc <= e_in.rc; @@ -113,8 +120,13 @@ begin c_out.write_cr_data <= e_in.write_cr_data; end if; - if l_in.write_enable = '1' then - w_out.write_reg <= l_in.write_reg; + if e_in.write_xerc_enable = '1' then + c_out.write_xerc_enable <= '1'; + c_out.write_xerc_data <= e_in.xerc; + end if; + + if l_in.write_enable = '1' then + w_out.write_reg <= gpr_to_gspr(l_in.write_reg); data_in <= l_in.write_data; data_len <= unsigned(l_in.write_len); byte_offset <= unsigned(l_in.write_shift); @@ -127,22 +139,35 @@ begin if l_in.valid = '0' and (data_len + byte_offset > 8) then partial_write <= '1'; end if; + xe := l_in.xerc; end if; if m_in.write_reg_enable = '1' then w_out.write_enable <= '1'; - w_out.write_reg <= m_in.write_reg_nr; + w_out.write_reg <= gpr_to_gspr(m_in.write_reg_nr); data_in <= m_in.write_reg_data; rc <= m_in.rc; + xe := m_in.xerc; end if; + if m_in.write_xerc_enable = '1' then + c_out.write_xerc_enable <= '1'; + c_out.write_xerc_data <= m_in.xerc; + end if; + if d_in.write_reg_enable = '1' then w_out.write_enable <= '1'; - w_out.write_reg <= d_in.write_reg_nr; + w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr); data_in <= d_in.write_reg_data; rc <= d_in.rc; + xe := d_in.xerc; end if; + if d_in.write_xerc_enable = '1' then + 
c_out.write_xerc_enable <= '1'; + c_out.write_xerc_data <= d_in.xerc; + end if; + -- shift and byte-reverse data bytes for i in 0 to 7 loop k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset); @@ -193,17 +218,15 @@ begin -- deliver to regfile w_out.write_data <= data_trimmed; - -- test value against 0 and set CR0 if requested + -- Perform CR0 update for RC forms if rc = '1' then c_out.write_cr_enable <= '1'; c_out.write_cr_mask <= num_to_fxm(0); - if negative = '1' then - c_out.write_cr_data <= x"80000000"; - elsif zero = '0' then - c_out.write_cr_data <= x"40000000"; - else - c_out.write_cr_data <= x"20000000"; - end if; + cf(3) := negative; + cf(2) := not negative and not zero; + cf(1) := zero; + cf(0) := xe.so; + c_out.write_cr_data(31 downto 28) <= cf; end if; end process; end;