diff --git a/Makefile b/Makefile index 85a0262..e2398c0 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ all: $(all) $(GHDL) -a $(GHDLFLAGS) $< common.o: decode_types.o -control.o: gpr_hazard.o cr_hazard.o +control.o: gpr_hazard.o cr_hazard.o common.o sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o loadstore1.o dcache.o multiply.o writeback.o core_debug.o divider.o diff --git a/common.vhdl b/common.vhdl index 1ccbf08..8e24ab9 100644 --- a/common.vhdl +++ b/common.vhdl @@ -28,6 +28,12 @@ package common is constant SPR_HSPRG0 : spr_num_t := 304; constant SPR_HSPRG1 : spr_num_t := 305; + -- GPR indices in the register file (GPR only) + subtype gpr_index_t is std_ulogic_vector(4 downto 0); + + -- Extended GPR indice (can hold an SPR) + subtype gspr_index_t is std_ulogic_vector(5 downto 0); + -- Some SPRs are stored in the register file, they use the magic -- GPR numbers above 31. -- @@ -36,7 +42,13 @@ package common is -- indicates if this is indeed a fast SPR. If clear, then -- the SPR is not stored in the GPR file. -- - function fast_spr_num(spr: spr_num_t) return std_ulogic_vector; + function fast_spr_num(spr: spr_num_t) return gspr_index_t; + + -- Indices conversion functions + function gspr_to_gpr(i: gspr_index_t) return gpr_index_t; + function gpr_to_gspr(i: gpr_index_t) return gspr_index_t; + function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t; + function is_fast_spr(s: gspr_index_t) return std_ulogic; -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are -- in the CR file as a kind of CR extension (with a separate write @@ -52,8 +64,6 @@ package common is -- This needs to die... type ctrl_t is record - lr: std_ulogic_vector(63 downto 0); - ctr: std_ulogic_vector(63 downto 0); tb: std_ulogic_vector(63 downto 0); end record; @@ -83,6 +93,8 @@ package common is stop_mark : std_ulogic; nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); + ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr + ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) decode: decode_rom_t; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', decode => decode_rom_init, others => (others => '0')); @@ -91,9 +103,9 @@ package common is valid: std_ulogic; insn_type: insn_type_t; nia: std_ulogic_vector(63 downto 0); - write_reg: std_ulogic_vector(4 downto 0); - read_reg1: std_ulogic_vector(4 downto 0); - read_reg2: std_ulogic_vector(4 downto 0); + write_reg: gspr_index_t; + read_reg1: gspr_index_t; + read_reg2: gspr_index_t; read_data1: std_ulogic_vector(63 downto 0); read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); @@ -121,7 +133,7 @@ package common is type Decode2ToMultiplyType is record valid: std_ulogic; insn_type: insn_type_t; - write_reg: std_ulogic_vector(4 downto 0); + write_reg: gpr_index_t; data1: std_ulogic_vector(64 downto 0); data2: std_ulogic_vector(64 downto 0); rc: std_ulogic; @@ -135,7 +147,7 @@ package common is type Decode2ToDividerType is record valid: std_ulogic; - write_reg: std_ulogic_vector(4 downto 0); + write_reg: gpr_index_t; dividend: std_ulogic_vector(63 downto 0); divisor: std_ulogic_vector(63 downto 0); is_signed: std_ulogic; @@ -153,11 +165,11 @@ package common is type Decode2ToRegisterFileType is record read1_enable : std_ulogic; - read1_reg : std_ulogic_vector(4 downto 0); + read1_reg : gspr_index_t; read2_enable : std_ulogic; - read2_reg : std_ulogic_vector(4 downto 0); + read2_reg : gspr_index_t; read3_enable : std_ulogic; - read3_reg : std_ulogic_vector(4 downto 0); + read3_reg : gpr_index_t; end record; type RegisterFileToDecode2Type is record @@ -187,12 +199,12 @@ package common is addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read - write_reg : std_ulogic_vector(4 downto 0); -- read data goes to this register + write_reg : gpr_index_t; length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? - update_reg : std_ulogic_vector(4 downto 0); -- if so, the register to update + update_reg : gpr_index_t; -- if so, the register to update xerc : xer_common_t; end record; constant Decode2ToLoadstore1Init : Decode2ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', @@ -205,19 +217,19 @@ package common is nc : std_ulogic; addr : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); - write_reg : std_ulogic_vector(4 downto 0); + write_reg : gpr_index_t; length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; sign_extend : std_ulogic; update : std_ulogic; - update_reg : std_ulogic_vector(4 downto 0); + update_reg : gpr_index_t; xerc : xer_common_t; end record; type DcacheToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; - write_reg : std_ulogic_vector(4 downto 0); + write_reg : gpr_index_t; write_data : std_ulogic_vector(63 downto 0); write_len : std_ulogic_vector(3 downto 0); write_shift : std_ulogic_vector(2 downto 0); @@ -234,7 +246,7 @@ package common is valid: std_ulogic; rc : std_ulogic; write_enable : std_ulogic; - write_reg: std_ulogic_vector(4 downto 0); + write_reg: gspr_index_t; write_data: std_ulogic_vector(63 downto 0); write_len : std_ulogic_vector(3 downto 0); write_cr_enable : std_ulogic; @@ -253,7 +265,7 @@ package common is valid: std_ulogic; write_reg_enable : std_ulogic; - write_reg_nr: std_ulogic_vector(4 downto 0); + write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); write_xerc_enable : std_ulogic; xerc : xer_common_t; @@ -268,7 +280,7 @@ package common is valid: std_ulogic; write_reg_enable : std_ulogic; - write_reg_nr: std_ulogic_vector(4 downto 0); + write_reg_nr: gpr_index_t; write_reg_data: std_ulogic_vector(63 downto 0); write_xerc_enable : std_ulogic; xerc : xer_common_t; @@ -280,7 +292,7 @@ package common is others => (others => '0')); type WritebackToRegisterFileType is record - write_reg : std_ulogic_vector(4 downto 0); + write_reg : gspr_index_t; write_data : std_ulogic_vector(63 downto 0); write_enable : std_ulogic; end record; @@ -303,7 +315,7 @@ package body common is begin return to_integer(unsigned(insn(15 downto 11) & insn(20 downto 16))); end; - function fast_spr_num(spr: spr_num_t) return std_ulogic_vector is + function fast_spr_num(spr: spr_num_t) return gspr_index_t is variable n : integer range 0 to 31; begin case spr is @@ -338,4 +350,28 @@ package body common is end case; return "1" & std_ulogic_vector(to_unsigned(n, 5)); end; + + function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is + begin + return i(4 downto 0); + end; + + function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is + begin + return "0" & i; + end; + + function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is + begin + if s(5) = '1' then + return s; + else + return gpr_to_gspr(g); + end if; + end; + + function is_fast_spr(s: gspr_index_t) return std_ulogic is + begin + return s(5); + end; end common; diff --git a/control.vhdl b/control.vhdl index 0555b06..fed5618 100644 --- a/control.vhdl +++ b/control.vhdl @@ -1,6 +1,9 @@ library ieee; use ieee.std_logic_1164.all; +library work; +use work.common.all; + entity control is generic ( PIPELINE_DEPTH : natural := 2 @@ -12,20 +15,21 @@ entity control is complete_in : in std_ulogic; valid_in : in std_ulogic; flush_in : in std_ulogic; + stall_in : in std_ulogic; sgl_pipe_in : in std_ulogic; stop_mark_in : in std_ulogic; gpr_write_valid_in : in std_ulogic; - gpr_write_in : in std_ulogic_vector(4 downto 0); + gpr_write_in : in gspr_index_t; gpr_a_read_valid_in : in std_ulogic; - gpr_a_read_in : in std_ulogic_vector(4 downto 0); + gpr_a_read_in : in gspr_index_t; gpr_b_read_valid_in : in std_ulogic; - gpr_b_read_in : in std_ulogic_vector(4 downto 0); + gpr_b_read_in : in gspr_index_t; gpr_c_read_valid_in : in std_ulogic; - gpr_c_read_in : in std_ulogic_vector(4 downto 0); + gpr_c_read_in : in gpr_index_t; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; @@ -61,6 +65,7 @@ begin ) port map ( clk => clk, + stall_in => stall_in, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, @@ -76,6 +81,7 @@ begin ) port map ( clk => clk, + stall_in => stall_in, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, @@ -91,11 +97,12 @@ begin ) port map ( clk => clk, + stall_in => stall_in, gpr_write_valid_in => gpr_write_valid, gpr_write_in => gpr_write_in, gpr_read_valid_in => gpr_c_read_valid_in, - gpr_read_in => gpr_c_read_in, + gpr_read_in => "0" & gpr_c_read_in, stall_out => stall_c_out ); @@ -106,6 +113,7 @@ begin ) port map ( clk => clk, + stall_in => stall_in, cr_read_in => cr_read_in, cr_write_in => cr_write_valid, @@ -129,8 +137,8 @@ begin v_int := r_int; -- asynchronous - valid_tmp := valid_in and not flush_in; - stall_tmp := '0'; + valid_tmp := valid_in and not flush_in and not stall_in; + stall_tmp := stall_in; if complete_in = '1' then v_int.outstanding := r_int.outstanding - 1; diff --git a/core.vhdl b/core.vhdl index 22f7dca..f95a1af 100644 --- a/core.vhdl +++ b/core.vhdl @@ -76,8 +76,10 @@ architecture behave of core is signal icache_stall_out : std_ulogic; signal fetch2_stall_in : std_ulogic; signal decode1_stall_in : std_ulogic; + signal decode2_stall_in : std_ulogic; signal decode2_stall_out : std_ulogic; signal ex1_icache_inval: std_ulogic; + signal ex1_stall_out: std_ulogic; signal flush: std_ulogic; @@ -184,6 +186,7 @@ begin port map ( clk => clk, rst => core_rst, + stall_in => decode2_stall_in, stall_out => decode2_stall_out, flush_in => flush, complete_in => complete, @@ -198,6 +201,7 @@ begin c_in => cr_file_to_decode2, c_out => decode2_to_cr_file ); + decode2_stall_in <= ex1_stall_out; register_file_0: entity work.register_file generic map ( @@ -223,6 +227,7 @@ begin port map ( clk => clk, flush_out => flush, + stall_out => ex1_stall_out, e_in => decode2_to_execute1, f_out => execute1_to_fetch1, e_out => execute1_to_writeback, diff --git a/cr_hazard.vhdl b/cr_hazard.vhdl index 2a434ac..f6c5f3f 100644 --- a/cr_hazard.vhdl +++ b/cr_hazard.vhdl @@ -7,7 +7,8 @@ entity cr_hazard is PIPELINE_DEPTH : natural := 2 ); port( - clk : in std_logic; + clk : in std_ulogic; + stall_in : in std_ulogic; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; @@ -29,7 +30,9 @@ begin cr_hazard0: process(clk) begin if rising_edge(clk) then - r <= rin; + if stall_in = '0' then + r <= rin; + end if; end if; end process; diff --git a/decode1.vhdl b/decode1.vhdl index 3138480..b4e7d26 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -43,7 +43,7 @@ architecture behaviour of decode1 is 28 => (ALU, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andi. 29 => (ALU, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0'), -- andis. 18 => (ALU, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b - 16 => (ALU, OP_BC, NONE, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc + 16 => (ALU, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc 11 => (ALU, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpi 10 => (ALU, OP_CMPL, RA, CONST_UI, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz @@ -106,7 +106,7 @@ architecture behaviour of decode1 is -- addpcis not implemented yet 2#001# => (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- bclr, bcctr, bctar - 2#100# => (ALU, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '1'), + 2#100# => (ALU, OP_BCREG, SPR, SPR, NONE, SPR, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- isync 2#111# => (ALU, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), others => illegal_inst @@ -237,13 +237,13 @@ architecture behaviour of decode1 is -- 2#1000000000# mcrxr -- 2#1001000000# mcrxrx 2#0000010011# => (ALU, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf - 2#0101010011# => (ALU, OP_MFSPR, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mfspr + 2#0101010011# => (ALU, OP_MFSPR, SPR, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr 2#0100001001# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modud 2#0100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- moduw 2#1100001001# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsd 2#1100001011# => (DIV, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- modsw 2#0010010000# => (ALU, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf - 2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mtspr + 2#0111010011# => (ALU, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr 2#0001001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '1'), -- mulhd 2#0000001001# => (MUL, OP_MUL_H64, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- mulhdu 2#0001001011# => (MUL, OP_MUL_H32, RA, RB, NONE, RT, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '1'), -- mulhw @@ -355,6 +355,8 @@ begin v.nia := f_in.nia; v.insn := f_in.insn; v.stop_mark := f_in.stop_mark; + v.ispr1 := (others => '0'); + v.ispr2 := (others => '0'); if f_in.valid = '1' then report "Decode insn " & to_hstring(f_in.insn) & " at " & to_hstring(f_in.nia); @@ -398,6 +400,33 @@ begin v.decode := major_decode_rom_array(to_integer(majorop)); end if; + -- Set ISPR1/ISPR2 when needed + if v.decode.insn_type = OP_BC or v.decode.insn_type = OP_BCREG then + -- Branch uses CTR as condition when BO(2) is 0. This is + -- also used to indicate that CTR is modified (they go + -- together). + -- + if f_in.insn(23) = '0' then + v.ispr1 := fast_spr_num(SPR_CTR); + end if; + + -- Branch source register is an SPR + if v.decode.insn_type = OP_BCREG then + -- TODO: Add TAR + if f_in.insn(10) = '0' then + v.ispr2 := fast_spr_num(SPR_LR); + else + v.ispr2 := fast_spr_num(SPR_CTR); + end if; + end if; + elsif v.decode.insn_type = OP_MFSPR or v.decode.insn_type = OP_MTSPR then + v.ispr1 := fast_spr_num(decode_spr_num(f_in.insn)); + -- Make slow SPRs single issue + if is_fast_spr(v.ispr1) = '0' then + v.decode.sgl_pipe := '1'; + end if; + end if; + if flush_in = '1' then v.valid := '0'; end if; diff --git a/decode2.vhdl b/decode2.vhdl index e9c71ba..8a2d970 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -14,6 +14,7 @@ entity decode2 is rst : in std_ulogic; complete_in : in std_ulogic; + stall_in : in std_ulogic; stall_out : out std_ulogic; stopped_out : out std_ulogic; @@ -47,30 +48,49 @@ architecture behaviour of decode2 is type decode_input_reg_t is record reg_valid : std_ulogic; - reg : std_ulogic_vector(4 downto 0); + reg : gspr_index_t; data : std_ulogic_vector(63 downto 0); end record; + type decode_output_reg_t is record + reg_valid : std_ulogic; + reg : gspr_index_t; + end record; + function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is + reg_data : std_ulogic_vector(63 downto 0); + ispr : gspr_index_t) return decode_input_reg_t is variable is_reg : std_ulogic; begin is_reg := '0' when insn_ra(insn_in) = "00000" else '1'; if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then - --return (is_reg, insn_ra(insn_in), reg_data); - return ('1', insn_ra(insn_in), reg_data); + assert is_fast_spr(ispr) = '0' report "Decode A says GPR but ISPR says SPR:" & + to_hstring(ispr) severity failure; + return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data); + elsif t = SPR then + -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. + -- If it's all 0, we don't treat it as a dependency as slow SPRs + -- operations are single issue. + -- + assert is_fast_spr(ispr) = '1' or ispr = "000000" + report "Decode A says SPR but ISPR is invalid:" & + to_hstring(ispr) severity failure; + return (is_fast_spr(ispr), ispr, reg_data); else return ('0', (others => '0'), (others => '0')); end if; end; function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is + reg_data : std_ulogic_vector(63 downto 0); + ispr : gspr_index_t) return decode_input_reg_t is begin case t is when RB => - return ('1', insn_rb(insn_in), reg_data); + assert is_fast_spr(ispr) = '0' report "Decode B says GPR but ISPR says SPR:" & + to_hstring(ispr) severity failure; + return ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data); when CONST_UI => return ('0', (others => '0'), std_ulogic_vector(resize(unsigned(insn_ui(insn_in)), 64))); when CONST_SI => @@ -91,6 +111,14 @@ architecture behaviour of decode2 is return ('0', (others => '0'), x"00000000000000" & "00" & insn_in(1) & insn_in(15 downto 11)); when CONST_SH32 => return ('0', (others => '0'), x"00000000000000" & "000" & insn_in(15 downto 11)); + when SPR => + -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. + -- If it's all 0, we don't treat it as a dependency as slow SPRs + -- operations are single issue. + assert is_fast_spr(ispr) = '1' or ispr = "000000" + report "Decode B says SPR but ISPR is invalid:" & + to_hstring(ispr) severity failure; + return (is_fast_spr(ispr), ispr, reg_data); when NONE => return ('0', (others => '0'), (others => '0')); end case; @@ -101,21 +129,30 @@ architecture behaviour of decode2 is begin case t is when RS => - return ('1', insn_rs(insn_in), reg_data); + return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data); when NONE => return ('0', (others => '0'), (others => '0')); end case; end; - function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0)) return std_ulogic_vector is + function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0); + ispr : gspr_index_t) return decode_output_reg_t is begin case t is when RT => - return insn_rt(insn_in); + return ('1', gpr_to_gspr(insn_rt(insn_in))); when RA => - return insn_ra(insn_in); + return ('1', gpr_to_gspr(insn_ra(insn_in))); + when SPR => + -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. + -- If it's all 0, we don't treat it as a dependency as slow SPRs + -- operations are single issue. + assert is_fast_spr(ispr) = '1' or ispr = "000000" + report "Decode B says SPR but ISPR is invalid:" & + to_hstring(ispr) severity failure; + return (is_fast_spr(ispr), ispr); when NONE => - return "00000"; + return ('0', "000000"); end case; end; @@ -153,16 +190,16 @@ architecture behaviour of decode2 is signal control_sgl_pipe : std_logic; signal gpr_write_valid : std_ulogic; - signal gpr_write : std_ulogic_vector(4 downto 0); + signal gpr_write : gspr_index_t; signal gpr_a_read_valid : std_ulogic; - signal gpr_a_read : std_ulogic_vector(4 downto 0); + signal gpr_a_read :gspr_index_t; signal gpr_b_read_valid : std_ulogic; - signal gpr_b_read : std_ulogic_vector(4 downto 0); + signal gpr_b_read : gspr_index_t; signal gpr_c_read_valid : std_ulogic; - signal gpr_c_read : std_ulogic_vector(4 downto 0); + signal gpr_c_read : gpr_index_t; signal cr_write_valid : std_ulogic; begin @@ -176,6 +213,7 @@ begin complete_in => complete_in, valid_in => control_valid_in, + stall_in => stall_in, flush_in => flush_in, sgl_pipe_in => control_sgl_pipe, stop_mark_in => d_in.stop_mark, @@ -210,8 +248,8 @@ begin end if; end process; - r_out.read1_reg <= insn_ra(d_in.insn); - r_out.read2_reg <= insn_rb(d_in.insn); + r_out.read1_reg <= gpr_or_spr_to_gspr(insn_ra(d_in.insn), d_in.ispr1); + r_out.read2_reg <= gpr_or_spr_to_gspr(insn_rb(d_in.insn), d_in.ispr2); r_out.read3_reg <= insn_rs(d_in.insn); c_out.read <= d_in.decode.input_cr; @@ -223,6 +261,7 @@ begin variable decoded_reg_a : decode_input_reg_t; variable decoded_reg_b : decode_input_reg_t; variable decoded_reg_c : decode_input_reg_t; + variable decoded_reg_o : decode_output_reg_t; variable signed_division: std_ulogic; variable length : std_ulogic_vector(3 downto 0); begin @@ -239,10 +278,11 @@ begin --v.e.input_cr := d_in.decode.input_cr; --v.m.input_cr := d_in.decode.input_cr; --v.e.output_cr := d_in.decode.output_cr; - - decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data); - decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data); + + decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1); + decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); + decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1); r_out.read1_enable <= decoded_reg_a.reg_valid; r_out.read2_enable <= decoded_reg_b.reg_valid; @@ -269,7 +309,7 @@ begin v.e.read_reg2 := decoded_reg_b.reg; v.e.read_data2 := decoded_reg_b.data; v.e.read_data3 := decoded_reg_c.data; - v.e.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn); + v.e.write_reg := decoded_reg_o.reg; v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); v.e.oe := decode_oe(d_in.decode.rc, d_in.insn); v.e.cr := c_in.read_cr_data; @@ -290,7 +330,7 @@ begin v.m.insn_type := d_in.decode.insn_type; mul_a := decoded_reg_a.data; mul_b := decoded_reg_b.data; - v.m.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn); + v.m.write_reg := gspr_to_gpr(decoded_reg_o.reg); v.m.rc := decode_rc(d_in.decode.rc, d_in.insn); v.m.xerc := c_in.read_xerc_data; if v.m.insn_type = OP_MUL_L64 then @@ -327,7 +367,7 @@ begin -- s = 1 for signed, 0 for unsigned (for div*) -- t = 1 for 32-bit, 0 for 64-bit -- r = RC bit (record condition code) - v.d.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn); + v.d.write_reg := gspr_to_gpr(decoded_reg_o.reg); v.d.is_modulus := not d_in.insn(8); v.d.is_32bit := d_in.insn(2); if d_in.insn(8) = '1' then @@ -364,11 +404,11 @@ begin v.d.oe := decode_oe(d_in.decode.rc, d_in.insn); -- load/store unit - v.l.update_reg := decoded_reg_a.reg; + v.l.update_reg := gspr_to_gpr(decoded_reg_a.reg); v.l.addr1 := decoded_reg_a.data; v.l.addr2 := decoded_reg_b.data; v.l.data := decoded_reg_c.data; - v.l.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn); + v.l.write_reg := gspr_to_gpr(decoded_reg_o.reg); if d_in.decode.insn_type = OP_LOAD then v.l.load := '1'; @@ -386,8 +426,8 @@ begin control_valid_in <= d_in.valid; control_sgl_pipe <= d_in.decode.sgl_pipe; - gpr_write_valid <= '1' when d_in.decode.output_reg_a /= NONE else '0'; - gpr_write <= decode_output_reg(d_in.decode.output_reg_a, d_in.insn); + gpr_write_valid <= decoded_reg_o.reg_valid; + gpr_write <= decoded_reg_o.reg; gpr_a_read_valid <= decoded_reg_a.reg_valid; gpr_a_read <= decoded_reg_a.reg; @@ -396,7 +436,7 @@ begin gpr_b_read <= decoded_reg_b.reg; gpr_c_read_valid <= decoded_reg_c.reg_valid; - gpr_c_read <= decoded_reg_c.reg; + gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg); cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); diff --git a/decode_types.vhdl b/decode_types.vhdl index 9736f58..e847fcf 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -21,10 +21,10 @@ package decode_types is OP_TWI, OP_XOR, OP_SIM_CONFIG ); - type input_reg_a_t is (NONE, RA, RA_OR_ZERO); - type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DS, CONST_M1, CONST_SH, CONST_SH32); + type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR); + type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR); type input_reg_c_t is (NONE, RS); - type output_reg_a_t is (NONE, RT, RA); + type output_reg_a_t is (NONE, RT, RA, SPR); type rc_t is (NONE, ONE, RC); type carry_in_t is (ZERO, CA, ONE); diff --git a/execute1.vhdl b/execute1.vhdl index e1ca950..9b14088 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -12,10 +12,11 @@ use work.ppc_fx_insns.all; entity execute1 is port ( - clk : in std_logic; + clk : in std_ulogic; -- asynchronous flush_out : out std_ulogic; + stall_out : out std_ulogic; e_in : in Decode2ToExecute1Type; @@ -32,6 +33,8 @@ end entity execute1; architecture behaviour of execute1 is type reg_type is record e : Execute1ToWritebackType; + lr_update : std_ulogic; + next_lr : std_ulogic_vector(63 downto 0); end record; signal r, rin : reg_type; @@ -125,6 +128,12 @@ begin if rising_edge(clk) then r <= rin; ctrl <= ctrl_tmp; + assert not (r.lr_update = '1' and e_in.valid = '1') + report "LR update collision with valid in EX1" + severity failure; + if r.lr_update = '1' then + report "LR update to " & to_hstring(r.next_lr); + end if; end if; end process; @@ -190,12 +199,15 @@ begin v.e.xerc := e_in.xerc; end if; + v.lr_update := '0'; + ctrl_tmp <= ctrl; -- FIXME: run at 512MHz not core freq ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); terminate_out <= '0'; icache_inval <= '0'; + stall_out <= '0'; f_out <= Execute1ToFetch1TypeInit; -- Next insn adder used in a couple of places @@ -251,12 +263,15 @@ begin f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2)); end if; when OP_BC => + -- read_data1 is CTR bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' then - ctrl_tmp.ctr <= std_ulogic_vector(unsigned(ctrl.ctr) - 1); + result := std_ulogic_vector(unsigned(e_in.read_data1) - 1); + result_en := '1'; + v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, ctrl.ctr) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then f_out.redirect <= '1'; if (insn_aa(e_in.insn)) then f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2)); @@ -265,19 +280,18 @@ begin end if; end if; when OP_BCREG => - -- bits 10 and 6 distinguish between bclr, bcctr and bctar + -- read_data1 is CTR + -- read_data2 is target register (CTR, LR or TAR) bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); if bo(4-2) = '0' and e_in.insn(10) = '0' then - ctrl_tmp.ctr <= std_ulogic_vector(unsigned(ctrl.ctr) - 1); + result := std_ulogic_vector(unsigned(e_in.read_data1) - 1); + result_en := '1'; + v.e.write_reg := fast_spr_num(SPR_CTR); end if; - if ppc_bc_taken(bo, bi, e_in.cr, ctrl.ctr) = 1 then + if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then f_out.redirect <= '1'; - if e_in.insn(10) = '0' then - f_out.redirect_nia <= ctrl.lr(63 downto 2) & "00"; - else - f_out.redirect_nia <= ctrl.ctr(63 downto 2) & "00"; - end if; + f_out.redirect_nia <= e_in.read_data2(63 downto 2) & "00"; end if; when OP_CMPB => result := ppc_cmpb(e_in.read_data3, e_in.read_data2); @@ -340,23 +354,24 @@ begin v.e.write_cr_data(hi downto lo) := newcrf; end loop; when OP_MFSPR => - case decode_spr_num(e_in.insn) is - when SPR_XER => - result := ( 63-32 => v.e.xerc.so, - 63-33 => v.e.xerc.ov, - 63-34 => v.e.xerc.ca, - 63-44 => v.e.xerc.ov32, - 63-45 => v.e.xerc.ca32, - others => '0'); - when SPR_CTR => - result := ctrl.ctr; - when SPR_LR => - result := ctrl.lr; - when SPR_TB => - result := ctrl.tb; - when others => - result := (others => '0'); - end case; + if is_fast_spr(e_in.read_reg1) then + result := e_in.read_data1; + if decode_spr_num(e_in.insn) = SPR_XER then + result(63-32) := v.e.xerc.so; + result(63-33) := v.e.xerc.ov; + result(63-34) := v.e.xerc.ca; + result(63-35 downto 63-43) := "000000000"; + result(63-44) := v.e.xerc.ov32; + result(63-45) := v.e.xerc.ca32; + end if; + else + case decode_spr_num(e_in.insn) is + when SPR_TB => + result := ctrl.tb; + when others => + result := (others => '0'); + end case; + end if; result_en := '1'; when OP_MFCR => if e_in.insn(20) = '0' then @@ -387,20 +402,25 @@ begin end if; v.e.write_cr_data := e_in.read_data3(31 downto 0); when OP_MTSPR => - case decode_spr_num(e_in.insn) is - when SPR_XER => - v.e.xerc.so := e_in.read_data3(63-32); - v.e.xerc.ov := e_in.read_data3(63-33); - v.e.xerc.ca := e_in.read_data3(63-34); - v.e.xerc.ov32 := e_in.read_data3(63-44); - v.e.xerc.ca32 := e_in.read_data3(63-45); - v.e.write_xerc_enable := '1'; - when SPR_CTR => - ctrl_tmp.ctr <= e_in.read_data3; - when SPR_LR => - ctrl_tmp.lr <= e_in.read_data3; - when others => - end case; + report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & + "=" & to_hstring(e_in.read_data3); + if is_fast_spr(e_in.write_reg) then + result := e_in.read_data3; + result_en := '1'; + if decode_spr_num(e_in.insn) = SPR_XER then + v.e.xerc.so := e_in.read_data3(63-32); + v.e.xerc.ov := e_in.read_data3(63-33); + v.e.xerc.ca := e_in.read_data3(63-34); + v.e.xerc.ov32 := e_in.read_data3(63-44); + v.e.xerc.ca32 := e_in.read_data3(63-45); + v.e.write_xerc_enable := '1'; + end if; + else +-- TODO: Implement slow SPRs +-- case decode_spr_num(e_in.insn) is +-- when others => +-- end case; + end if; when OP_POPCNTB => result := ppc_popcntb(e_in.read_data3); result_en := '1'; @@ -444,15 +464,36 @@ begin report "illegal"; end case; + -- Update LR on the next cycle after a branch link + -- + -- WARNING: The LR update isn't tracked by our hazard tracker. This + -- will work (well I hope) because it only happens on branches + -- which will flush all decoded instructions. By the time + -- fetch catches up, we'll have the new LR. This will + -- *not* work properly however if we have a branch predictor, + -- in which case the solution would probably be to keep a + -- local cache of the updated LR in execute1 (flushed on + -- exceptions) that is used instead of the value from + -- decode when its content is valid. if e_in.lr = '1' then - ctrl_tmp.lr <= next_nia; + v.lr_update := '1'; + v.next_lr := next_nia; + v.e.valid := '0'; + report "Delayed LR update to " & to_hstring(next_nia); + stall_out <= '1'; end if; - + elsif r.lr_update = '1' then + result_en := '1'; + result := r.next_lr; + v.e.write_reg := fast_spr_num(SPR_LR); + v.e.write_len := x"8"; + v.e.sign_extend := '0'; + v.e.valid := '1'; end if; v.e.write_data := result; v.e.write_enable := result_en; - v.e.rc := e_in.rc; + v.e.rc := e_in.rc and e_in.valid; -- Update registers rin <= v; diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl index 6c8614b..705e69d 100644 --- a/gpr_hazard.vhdl +++ b/gpr_hazard.vhdl @@ -7,12 +7,13 @@ entity gpr_hazard is PIPELINE_DEPTH : natural := 2 ); port( - clk : in std_logic; + clk : in std_ulogic; + stall_in : in std_ulogic; gpr_write_valid_in : in std_ulogic; - gpr_write_in : in std_ulogic_vector(4 downto 0); + gpr_write_in : in std_ulogic_vector(5 downto 0); gpr_read_valid_in : in std_ulogic; - gpr_read_in : in std_ulogic_vector(4 downto 0); + gpr_read_in : in std_ulogic_vector(5 downto 0); stall_out : out std_ulogic ); @@ -20,7 +21,7 @@ end entity gpr_hazard; architecture behaviour of gpr_hazard is type pipeline_entry_type is record valid : std_ulogic; - gpr : std_ulogic_vector(4 downto 0); + gpr : std_ulogic_vector(5 downto 0); end record; constant pipeline_entry_init : pipeline_entry_type := (valid => '0', gpr => (others => '0')); @@ -32,7 +33,9 @@ begin gpr_hazard0: process(clk) begin if rising_edge(clk) then - r <= rin; + if stall_in = '0' then + r <= rin; + end if; end if; end process; diff --git a/ppc_fx_insns.vhdl b/ppc_fx_insns.vhdl index 3b03dc2..0bf011d 100644 --- a/ppc_fx_insns.vhdl +++ b/ppc_fx_insns.vhdl @@ -94,7 +94,6 @@ package ppc_fx_insns is function ppc_divwu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; function ppc_bc_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0); ctr: std_ulogic_vector(63 downto 0)) return integer; - function ppc_bcctr_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0)) return integer; end package ppc_fx_insns; package body ppc_fx_insns is @@ -809,21 +808,4 @@ package body ppc_fx_insns is return ret; end; - function ppc_bcctr_taken(bo, bi: std_ulogic_vector(4 downto 0); cr: std_ulogic_vector(31 downto 0)) return integer is - variable crfield: integer; - variable crbit_match: std_ulogic; - variable cond_ok: std_ulogic; - variable ret: integer; - begin - crfield := to_integer(unsigned(bi)); - -- BE bit numbering - crbit_match := '1' when cr(31-crfield) = bo(4-1) else '0'; - cond_ok := bo(4-0) or crbit_match; - if cond_ok = '1' then - ret := 1; - else - ret := 0; - end if; - return ret; - end; end package body ppc_fx_insns; diff --git a/register_file.vhdl b/register_file.vhdl index 669093b..952d9fc 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -23,7 +23,7 @@ entity register_file is end entity register_file; architecture behaviour of register_file is - type regfile is array(0 to 31) of std_ulogic_vector(63 downto 0); + type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0); signal registers : regfile := (others => (others => '0')); begin -- synchronous writes @@ -32,7 +32,11 @@ begin if rising_edge(clk) then if w_in.write_enable = '1' then assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; - report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); + if w_in.write_reg(5) = '0' then + report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); + else + report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); + end if; registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data; end if; end if; @@ -52,7 +56,7 @@ begin end if; d_out.read1_data <= registers(to_integer(unsigned(d_in.read1_reg))); d_out.read2_data <= registers(to_integer(unsigned(d_in.read2_reg))); - d_out.read3_data <= registers(to_integer(unsigned(d_in.read3_reg))); + d_out.read3_data <= registers(to_integer(unsigned(gpr_to_gspr(d_in.read3_reg)))); -- Forward any written data if w_in.write_enable = '1' then @@ -62,7 +66,7 @@ begin if d_in.read2_reg = w_in.write_reg then d_out.read2_data <= w_in.write_data; end if; - if d_in.read3_reg = w_in.write_reg then + if gpr_to_gspr(d_in.read3_reg) = w_in.write_reg then d_out.read3_data <= w_in.write_data; end if; end if; diff --git a/writeback.vhdl b/writeback.vhdl index 545e931..b88277e 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -126,7 +126,7 @@ begin end if; if l_in.write_enable = '1' then - w_out.write_reg <= l_in.write_reg; + w_out.write_reg <= gpr_to_gspr(l_in.write_reg); data_in <= l_in.write_data; data_len <= unsigned(l_in.write_len); byte_offset <= unsigned(l_in.write_shift); @@ -144,7 +144,7 @@ begin if m_in.write_reg_enable = '1' then w_out.write_enable <= '1'; - w_out.write_reg <= m_in.write_reg_nr; + w_out.write_reg <= gpr_to_gspr(m_in.write_reg_nr); data_in <= m_in.write_reg_data; rc <= m_in.rc; xe := m_in.xerc; @@ -157,7 +157,7 @@ begin if d_in.write_reg_enable = '1' then w_out.write_enable <= '1'; - w_out.write_reg <= d_in.write_reg_nr; + w_out.write_reg <= gpr_to_gspr(d_in.write_reg_nr); data_in <= d_in.write_reg_data; rc <= d_in.rc; xe := d_in.xerc;